rawmaker 2.40.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- letty/__init__.py +46 -0
- letty/cli.py +63 -0
- letty/optimizer.py +138 -0
- letty/quality/__init__.py +8 -0
- letty/quality/whitespace.py +50 -0
- letty/strategy.py +8 -0
- rawmaker/__init__.py +29 -0
- rawmaker/__main__.py +13 -0
- rawmaker/__patch__.py +36 -0
- rawmaker/cli.py +206 -0
- rawmaker/cli_automate.py +69 -0
- rawmaker/converter/__init__.py +8 -0
- rawmaker/converter/basic.py +174 -0
- rawmaker/converter/images.py +168 -0
- rawmaker/date.py +83 -0
- rawmaker/destination.py +202 -0
- rawmaker/error.py +34 -0
- rawmaker/features/__init__.py +138 -0
- rawmaker/features/annotation.py +254 -0
- rawmaker/features/border.py +172 -0
- rawmaker/features/boxes.py +153 -0
- rawmaker/features/figures.py +24 -0
- rawmaker/features/fonts.py +229 -0
- rawmaker/features/formula.py +16 -0
- rawmaker/features/horizontals.py +132 -0
- rawmaker/features/images.py +155 -0
- rawmaker/features/line.py +337 -0
- rawmaker/features/outlines.py +123 -0
- rawmaker/features/text.py +91 -0
- rawmaker/fonts/__init__.py +8 -0
- rawmaker/fonts/parser.py +354 -0
- rawmaker/images/__init__.py +8 -0
- rawmaker/images/info.py +35 -0
- rawmaker/miner/__init__.py +8 -0
- rawmaker/miner/char.py +42 -0
- rawmaker/miner/colorspace.py +75 -0
- rawmaker/miner/images.py +448 -0
- rawmaker/miner/position.py +121 -0
- rawmaker/miner/rawchar.py +207 -0
- rawmaker/miner/text.py +833 -0
- rawmaker/miner/underline.py +66 -0
- rawmaker/parameter.py +130 -0
- rawmaker/patch/__init__.py +8 -0
- rawmaker/patch/ltchar.py +79 -0
- rawmaker/reader.py +97 -0
- rawmaker/text/__init__.py +8 -0
- rawmaker/text/chars.py +24 -0
- rawmaker/text/data.py +47 -0
- rawmaker/text/superfast.py +91 -0
- rawmaker/text/wordbox.py +95 -0
- rawmaker/utils.py +44 -0
- rawmaker-2.40.3.dist-info/METADATA +51 -0
- rawmaker-2.40.3.dist-info/RECORD +63 -0
- rawmaker-2.40.3.dist-info/WHEEL +5 -0
- rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
- rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
- rawmaker-2.40.3.dist-info/top_level.txt +3 -0
- spacestation/__init__.py +18 -0
- spacestation/cli.py +51 -0
- spacestation/features/__init__.py +8 -0
- spacestation/features/chardist.py +85 -0
- spacestation/features/worddist.py +57 -0
- spacestation/features/wspace.py +130 -0
letty/__init__.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""Layout Estimator Tool t.y.
|
|
10
|
+
==========================
|
|
11
|
+
|
|
12
|
+
Variation Strategy
|
|
13
|
+
------------------
|
|
14
|
+
|
|
15
|
+
Static
|
|
16
|
+
~~~~~~
|
|
17
|
+
|
|
18
|
+
Dynamic
|
|
19
|
+
~~~~~~~
|
|
20
|
+
|
|
21
|
+
Result: Points
|
|
22
|
+
~~~~~~~~~~~~~~
|
|
23
|
+
|
|
24
|
+
Judger
|
|
25
|
+
-------
|
|
26
|
+
|
|
27
|
+
Optimizer
|
|
28
|
+
---------
|
|
29
|
+
|
|
30
|
+
.. code-block:: none
|
|
31
|
+
|
|
32
|
+
Optimizer -> Strategy -> Points -> Tool(Points) -> Result -> Judger(Result)
|
|
33
|
+
! ! |
|
|
34
|
+
! ! |
|
|
35
|
+
< < < < < < < < < < < < < < < < < < < < < < < < < < < < < </
|
|
36
|
+
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
import os
|
|
40
|
+
|
|
41
|
+
import rawmaker
|
|
42
|
+
|
|
43
|
+
__version__ = rawmaker.__version__
|
|
44
|
+
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
|
45
|
+
|
|
46
|
+
PROCESS = 'letty'
|
letty/cli.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import utilo
|
|
11
|
+
import utilo.cli
|
|
12
|
+
|
|
13
|
+
import letty
|
|
14
|
+
import letty.quality.whitespace
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@utilo.saveme
def main() -> int:
    """Run the `letty` command line interface.

    Parses the command line, and when the whitespace flag is set, logs the
    whitespace quality of the selected input.

    Returns:
        `utilo.SUCCESS` after the whitespace quality was logged,
        `utilo.FAILURE` (after printing the help text) when no task was
        requested.
    """
    cli_parser = create_parser()
    arguments = utilo.parse(cli_parser)  # pylint:disable=W0612
    source, selection, want_whitespace = parse_args(arguments)
    if not want_whitespace:
        # nothing to do: show the usage and signal failure to the caller
        cli_parser.print_help()
        return utilo.FAILURE
    utilo.log(letty.quality.whitespace.determine(source, pages=selection))
    return utilo.SUCCESS
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def parse_args(args) -> tuple:
    """Reduce parsed arguments to `(inpath, pages, whitespace)`.

    >>> parse_args({})
    ('...', None, False)
    >>> parse_args(dict(pages=['3:10']))
    ('...', (3, 4, 5, 6, 7, 8, 9), False)

    Args:
        args: mapping of parsed command line arguments.

    Returns:
        Tuple of the first input path, the parsed page selection (`None`
        when no pages were given) and the whitespace flag (default `False`).
    """
    sources, _ = utilo.cli.sources(args, singleinput=True)  # pylint:disable=W0632
    first_source = sources[0]
    requested = args.get('pages', None)
    # page items are merged into one comma separated expression before parsing
    selection = None if requested is None else utilo.parse_pages(','.join(requested))
    return (first_source, selection, args.get('whitespace', False))
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def create_parser():
    """Build the argument parser of the `letty` command line tool.

    The parser offers the `--whitespace` flag on top of the options enabled
    in the parser configuration (input parameter, pages, verbose flag).

    Returns:
        The parser created by `utilo.cli.create_parser`.
    """
    whitespace_flag = utilo.cli.Flag(
        '--whitespace',
        message='evalute number of whitespaces',
    )
    layout = utilo.ParserConfiguration(
        inputparameter=True,
        outputparameter=False,
        pages=True,
        prefix=False,
        verboseflag=True,
    )
    return utilo.cli.create_parser(
        todo=[whitespace_flag],
        config=layout,
        version=letty.__version__,
        prog=letty.PROCESS,
    )
|
letty/optimizer.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import collections
|
|
11
|
+
import concurrent
|
|
12
|
+
import itertools
|
|
13
|
+
import math
|
|
14
|
+
import os
|
|
15
|
+
import sys
|
|
16
|
+
|
|
17
|
+
import configos
|
|
18
|
+
import utilo
|
|
19
|
+
|
|
20
|
+
import letty.quality.whitespace
|
|
21
|
+
|
|
22
|
+
# Result of one optimizer run: the measured quality `value` and the
# `config` that produced it.
OptimizerResult = collections.namedtuple('OptimizerResult', ['value', 'config'])
# Number of worker threads used by the thread pool runner.
WORKER = 12
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def run(
    path: str,
    pages: tuple = None,
    boxes: int = 1,
    chars: int = 10,
    lines: int = 1,
    words: int = 1,
    *,
    multicore: bool = True,
) -> OptimizerResult:
    """Evaluate every configuration of the strategy and return the best one.

    Args:
        path: input document, forwarded unchanged to the runner.
        pages: page selection, forwarded unchanged to the runner.
        boxes: number of `boxes_flow` steps of the strategy.
        chars: number of `char_margin` steps of the strategy.
        lines: number of `line_margin` steps of the strategy.
        words: number of `word_margin` steps of the strategy.
        multicore: evaluate with `threadpool` when true, else `singlecore`.

    Returns:
        The outcome selected by `judge`.
    """
    candidates = strategy(chars=chars, words=words, lines=lines, boxes=boxes)
    if multicore:
        measured = threadpool(candidates, path, pages)
    else:
        measured = singlecore(candidates, path, pages)
    return judge(measured)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def singlecore(todo: list, path: str, pages: tuple):
    """Evaluate all configurations one after another.

    Args:
        todo: configurations to evaluate.
        path: input document passed to every run.
        pages: page selection passed to every run.

    Returns:
        List with one `run_single` result per configuration, in input order.
    """
    return [run_single(path, pages, config) for config in todo]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def threadpool(todo: list, path: str, pages: tuple):
    """Evaluate all configurations concurrently in a thread pool.

    Args:
        todo: configurations to evaluate.
        path: input document passed to every run.
        pages: page selection passed to every run.

    Returns:
        List of `run_single` results in completion order. Runs that raise
        an `Exception` are logged as errors and left out of the list.
    """
    # `import concurrent` alone does not load the `futures` submodule; import
    # it explicitly instead of relying on another module having done so.
    import concurrent.futures  # pylint:disable=import-outside-toplevel

    result = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=WORKER) as executor:
        futures = {
            executor.submit(run_single, path, pages, config): config
            for config in todo
        }
        for future in concurrent.futures.as_completed(futures):
            try:
                quality = future.result()
            except Exception as error:  # pylint:disable=broad-except
                # best effort: a failed run must not abort the other runs
                utilo.error(f'{future} failed.')
                utilo.error(error)
            else:
                result.append(quality)
    return result
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def run_single(path: str, pages: tuple, config: dict):
    """Run `rawmaker` with one configuration and measure the result.

    The configuration is turned into `--key=value` flags, `rawmaker --text`
    is executed in a temporary directory and the whitespace quality of the
    produced output is determined.

    Args:
        path: input document passed via `-i`.
        pages: page selection; `None` runs without a `--pages` filter.
        config: mapping of parameter name to value.

    Returns:
        `OptimizerResult` holding the quality and the flag string used.

    Note:
        Exits the process with `utilo.FAILURE` when `rawmaker` returns a
        non-zero return code.
    """
    flags = ' '.join([f'--{key}={value}' for key, value in config.items()])
    # Build the page filter only when pages were requested; joining over
    # `None` would raise a TypeError.
    if pages is not None:
        pages_raw = '--pages=' + ','.join(str(item) for item in pages)
    else:
        pages_raw = ''
    with utilo.make_tmpdir(root=configos.tmp()) as cwd:
        # NOTE(review): `path` is interpolated unquoted; paths containing
        # blanks may need quoting - confirm how utilo.run splits the command.
        cmd = f'rawmaker -i {path} -o {cwd} {pages_raw} --text {flags}'
        config_outpath = os.path.join(cwd, 'layout.ini')
        # the flag string is stored as `layout.ini` inside the working dir
        utilo.file_create(config_outpath, flags)
        completed = utilo.run(cmd, cwd=cwd)
        if completed.returncode:
            utilo.error(f'could not run: {cmd}')
            utilo.error(completed.stdout)
            utilo.error(completed.stderr)
            sys.exit(utilo.FAILURE)
        quality = letty.quality.whitespace.determine(cwd, pages=pages)
        return OptimizerResult(quality, flags)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def judge(result):
    """Select the outcome with the smallest value.

    Every outcome is logged. On equal values the earliest outcome wins,
    because only a strictly smaller value replaces the current best.

    Args:
        result: non-empty sequence of `(value, config)` pairs.

    Returns:
        Tuple `(value, config)` of the best outcome.
    """
    ratio, best = result[0]
    utilo.log(result[0])
    for candidate in result[1:]:
        utilo.log(candidate)
        if not candidate[0] < ratio:
            continue
        ratio, best = candidate
    return ratio, best
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def strategy(
    chars: int = 10,
    words: int = 1,
    lines: int = 1,
    boxes: int = 1,
):
    """Create the cartesian product of all parameter variations.

    Args:
        chars: number of `char_margin` values (range 0.5 to 20.0).
        words: number of `word_margin` values (range 1.5 to 5.0).
        lines: number of `line_margin` values (range 0.01 to 5.0).
        boxes: number of `boxes_flow` values (range 0.5 to 1.0).

    Returns:
        List of dicts with the keys `boxes_flow`, `char_margin`,
        `word_margin` and `line_margin`; `char_margin` varies slowest and
        `line_margin` fastest.
    """
    boxes_flow = ranges(0.5, 1.0, boxes)
    char_margin = ranges(0.5, 20.0, chars)
    line_margin = ranges(0.01, 5.0, lines)
    word_margin = ranges(1.5, 5.0, words)
    combinations = itertools.product(
        char_margin,
        word_margin,
        boxes_flow,
        line_margin,
    )
    return [
        {
            'boxes_flow': box,
            'char_margin': char,
            'word_margin': word,
            'line_margin': line,
        }
        for char, word, box, line in combinations
    ]
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# TODO: REPLACE WITH utilo CODE
|
|
123
|
+
def ranges(mini: float, maxi: float, steps: int = 15):
    """Compute exponentially spaced parameter values.

    >>> utilo.roundme(ranges(0.1, 100, steps=10))
    [0.1, 0.12, 0.18, 0.34, 0.76, 1.92, 5.06, 13.61, 36.84, 99.99]
    >>> utilo.roundme(ranges(0.1, 20, steps=5))
    [0.1, 0.73, 2.43, 7.06, 19.64]

    Args:
        mini: first value of the series.
        maxi: upper limit; as the examples show, the last value stays
            slightly below it.
        steps: number of values to compute.

    Returns:
        List of `steps` values, each rounded to 5 digits.

    Raises:
        ZeroDivisionError: if `maxi` equals `mini`.
    """
    # scale maps exp(index) - 1 onto the width of the interval
    scale = math.exp(steps - 1) / (maxi - mini)
    return [
        utilo.roundme(  # pylint:disable=R0204
            mini + (math.exp(index) - 1) / scale,
            digits=5,
        )
        for index in range(steps)
    ]
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import collections
|
|
11
|
+
import re
|
|
12
|
+
|
|
13
|
+
import serializeraw
|
|
14
|
+
import utilo
|
|
15
|
+
|
|
16
|
+
# Whitespace statistic of a single page: the `page` identifier and the
# `content` list of collected counts. The typename matches the bound name
# so that repr() does not collide with the list alias below.
PageWhitespace = collections.namedtuple('PageWhitespace', 'page, content')
# Whitespace statistics of several pages.
PageWhitespaces = list[PageWhitespace]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def determine(path: str, pages: tuple = None) -> int:
    """Determine the whitespace quality of the document at `path`.

    Args:
        path: location passed to `serializeraw.ptn_frompath`.
        pages: optional page selection passed to the loader.

    Returns:
        The value computed by `quality` over all analysed pages (an `int`
        sum, not a list of page statistics).
    """
    data = serializeraw.ptn_frompath(
        path,
        pages=pages,
        logging=False,
    )
    analyzed = [analyse_page(item) for item in data]
    return quality(analyzed)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# run of two or more whitespace characters enclosed by word boundaries
INNER_WHITESPACE = r'\b\s{2,}\b'
# newline which directly follows a word character
CONTENT_ENDING = r'\b\n'

# alternation of both patterns as a single capturing group
COMMON = f'({INNER_WHITESPACE}|{CONTENT_ENDING})'
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def analyse_page(page) -> PageWhitespace:
    """Count whitespace matches of a page, grouped by match length.

    Args:
        page: iterable of lines providing `text`; the page itself provides
            `page` and supports `len()`.

    Returns:
        `PageWhitespace` with the page identifier and the
        `(length, count)` pairs ordered from most to least common.
    """
    counter = collections.Counter(
        len(utilo.extract_match(found))
        for line in page
        for found in re.finditer(COMMON, line.text)
    )
    # NOTE(review): assumed to run once per page, after all lines were
    # scanned - confirm against the original indentation.
    counter[2] += len(page)
    return PageWhitespace(page=page.page, content=counter.most_common())
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def quality(pages) -> int:
    """Sum the count of the first `content` entry of every page.

    Pages with empty `content` are skipped.

    Args:
        pages: iterable of items with a `content` list of
            `(length, count)` pairs.

    Returns:
        Sum of the leading counts, 0 when no page contributes.
    """
    total = 0
    for page in pages:
        if not page.content:
            continue
        total = total + page.content[0][1]
    return total
|
letty/strategy.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
rawmaker/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
"""The rawmaker converts pdf to kiwi-internal project format
|
|
10
|
+
|
|
11
|
+
Hint: Pay attention to the public API on this file!
|
|
12
|
+
Breaking changes are breaking!
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
|
|
17
|
+
import configos
|
|
18
|
+
|
|
19
|
+
# pylint:disable=W0613
|
|
20
|
+
import rawmaker.__patch__
|
|
21
|
+
from rawmaker.parameter import LAYOUT
|
|
22
|
+
from rawmaker.parameter import ONELINE
|
|
23
|
+
|
|
24
|
+
# Version reported by the command line tools; it has to match the version
# of the published distribution (this wheel ships as 2.40.3).
__version__ = '2.40.3'

# parent directory of the package
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
PROCESS = 'rawmaker'

configos.cloud_lookup(PROCESS)
|
rawmaker/__main__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
|
|
10
|
+
from rawmaker.cli import main
|
|
11
|
+
|
|
12
|
+
if __name__ == "__main__":
|
|
13
|
+
main()
|
rawmaker/__patch__.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
|
|
12
|
+
import pdfminer.glyphlist
|
|
13
|
+
import pdfminer.pdfpage
|
|
14
|
+
import utilo
|
|
15
|
+
|
|
16
|
+
before = pdfminer.pdfpage.PDFPage.create_pages # pylint:disable=C0103
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def create_pages(document):
    """Yield the pages of `document` and stop the process on parser errors.

    Delegates to the saved pdfminer implementation `before`. An
    `IndexError` or `RecursionError` raised while iterating is logged and
    the process exits with code 1.

    Args:
        document: document passed through to the saved implementation.
    """
    failure = None
    try:
        yield from before(document)
    except IndexError:
        failure = 'pdfminer parsing error: IndexError'
    except RecursionError:
        failure = 'pdfminer parsing error: RecursionError'
    if failure is not None:
        utilo.error(failure)
        sys.exit(1)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
pdfminer.pdfpage.PDFPage.create_pages = create_pages
|
|
31
|
+
|
|
32
|
+
# TODO HACK HACK HACK
|
|
33
|
+
# bachelor090 REGISTERED SIGN
|
|
34
|
+
# circlecopyrt
|
|
35
|
+
# pdfminer.glyphlist.glyphname2unicode['circlecopyrt'] = '\u25CF'
|
|
36
|
+
pdfminer.glyphlist.glyphname2unicode['circlecopyrt'] = '\r'
|
rawmaker/cli.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
"""The `rawmaker` takes PDFs from the input folder or a single file, parses
|
|
10
|
+
the raw structure of each PDF and provides it as YAML files for further
|
|
11
|
+
analysis processes.
|
|
12
|
+
|
|
13
|
+
- toc: tableofcontent
|
|
14
|
+
- text: text content from pdf file
|
|
15
|
+
- border: determine page size and bounding boxes from page content
|
|
16
|
+
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import contextlib
|
|
20
|
+
import os
|
|
21
|
+
import sys
|
|
22
|
+
|
|
23
|
+
import protoerror
|
|
24
|
+
import utilo
|
|
25
|
+
|
|
26
|
+
import rawmaker
|
|
27
|
+
import rawmaker.error
|
|
28
|
+
import rawmaker.features
|
|
29
|
+
|
|
30
|
+
# Input pattern selecting the pdf file(s) a step works on.
PDF = utilo.Pattern('*', 'pdf')

# Float-valued configuration inputs; each declares a default value and a
# minimum of 0.1.
# NOTE(review): the names mirror pdfminer's layout parameters - presumably
# they are forwarded there; confirm in the feature implementations.
CHAR_MARGIN = utilo.Value('char_margin', float, defaultvar=2.0, minimum=0.1)
LINE_OVERLAP = utilo.Value('line_overlap', float, defaultvar=0.5, minimum=0.1)
LINE_MARGIN = utilo.Value('line_margin', float, defaultvar=0.5, minimum=0.1)
WORD_MARGIN = utilo.Value('word_margin', float, defaultvar=0.1, minimum=0.1)
BOXES_FLOW = utilo.Value('boxes_flow', float, defaultvar=0.5, minimum=0.1)
# Boolean configuration inputs.
NOSTRIP = utilo.Bool('nostrip')
DETECT_VERTICAL = utilo.Bool('detect_vertical')

# Inputs of steps which only need the pdf itself.
PDF_INPUT = [PDF]

# All configuration inputs, shared by the `fonts` and `text` steps.
CONFIG_INPUTS = [
    BOXES_FLOW,
    CHAR_MARGIN,
    LINE_MARGIN,
    LINE_OVERLAP,
    WORD_MARGIN,
    NOSTRIP,
    DETECT_VERTICAL,
]
|
|
51
|
+
|
|
52
|
+
# Steps offered by the rawmaker feature pack. Each step declares its name,
# the inputs it requires and the outputs it produces.
WORKPLAN = [
    # annotation: pdf -> annotation
    utilo.create_step(
        'annotation',
        inputs=PDF_INPUT,
        output=('annotation',),
    ),
    # border: pdf -> pages, boundingboxes
    utilo.create_step(
        'border',
        inputs=PDF_INPUT,
        output=(
            'pages',
            'boundingboxes',
        ),
    ),
    # boxes: requires the `line_line` result file
    utilo.create_step(
        'boxes',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='line_line'),
        ],
        output=('boxes',),
    ),
    # figures: requires the pdf, the `boxes_boxes` result file and the yaml
    # files matching `rawmaker__images_images/*`
    utilo.create_step(
        'figures',
        inputs=PDF_INPUT + [
            utilo.ResultFile(producer='rawmaker', name='boxes_boxes'),
            utilo.Pattern('rawmaker__images_images/*', 'yaml'),
        ],
        output=[
            ('figures/{FILEHASH_1}', 'yaml'),
            ('figures/{FILEHASHS}', 'png'),
        ],
    ),
    # horizontals: requires the `line_line` result file
    utilo.create_step(
        'horizontals',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='line_line'),
        ],
        output=('horizontals',),
    ),
    # fonts: pdf plus configuration inputs -> header, content
    utilo.create_step(
        'fonts',
        inputs=[PDF] + CONFIG_INPUTS,
        output=(
            'header',
            'content',
        ),
    ),
    # formula: pdf -> formula
    utilo.create_step(
        'formula',
        inputs=PDF_INPUT,
        output=('formula',),
    ),
    # images: pdf -> yaml description plus image files
    # NOTE(review): '???' looks like a placeholder for a varying file
    # extension - confirm how utilo interprets it.
    utilo.create_step(
        'images',
        inputs=PDF_INPUT,
        output=[
            ('images/{FILEHASH_1}', 'yaml'),
            ('images/{FILEHASHS}', '???'),
        ],
    ),
    # line: requires the pdf and the `annotation_annotation` result file
    utilo.create_step(
        'line',
        inputs=[
            PDF,
            utilo.ResultFile(producer='rawmaker', name='annotation_annotation'),
        ],
        output=('line',),
    ),
    # text: requires the pdf, the `horizontals_horizontals` result file and
    # the configuration inputs -> text, positions
    utilo.create_step(
        'text',
        inputs=[PDF] + [
            utilo.ResultFile(
                producer='rawmaker',
                name='horizontals_horizontals',
            ),
        ] + CONFIG_INPUTS,
        output=(
            'text',
            'positions',
        ),
    ),
    # outlines: pdf -> outlines
    utilo.create_step(
        'outlines',
        inputs=PDF_INPUT,
        output=('outlines',),
    ),
]
|
|
139
|
+
|
|
140
|
+
RAWMAKER_DESCRIPTION = """
|
|
141
|
+
Extract features from pdf document.
|
|
142
|
+
"""
|
|
143
|
+
|
|
144
|
+
LINTER_FLAG = 'linter'
|
|
145
|
+
SUPERFAST_FLAG = 'sf'
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def main():
    """Run the rawmaker feature pack.

    Configures the feature pack (flags, error hook, version) and executes
    the work plan inside the `linter` context.
    """
    pack = utilo.FeaturePackConfig(
        configflag=True,
        description=RAWMAKER_DESCRIPTION,
        errorhook=errorhook,
        flags=[
            (LINTER_FLAG, 'write linter result'),
            (SUPERFAST_FLAG, 'use superfast to fork processes and merge results'),
        ],
        multiprocessed=True,
        name=rawmaker.PROCESS,
        pages=True,
        profileflag=True,
        singleinput=True,
        verboseflag=True,
        version=rawmaker.__version__,
    )
    with linter():
        utilo.featurepack(
            workplan=WORKPLAN,
            config=pack,
            root=rawmaker.ROOT,
            featurepackage='rawmaker.features',
        )
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def errorhook(exception, source):  # pylint:disable=W0613
    """Report an invalid pdf as linter finding `F0000`.

    Exceptions of any other type are ignored.

    Args:
        exception: the raised exception to inspect.
        source: unused.
    """
    # the linter instance is attached to this function by `linter()`
    logger = errorhook.linter

    if not isinstance(exception, rawmaker.error.InvalidPDF):
        return
    logger.add_finding(msgid='F0000', confidence=1.0)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def superfast() -> bool:
    """Tell whether `--sf` was passed on the command line."""
    return any(argument == '--sf' for argument in sys.argv)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
@contextlib.contextmanager
def linter():
    """Write result of linting when using `--linter` parameter.

    Creates a linter with all known solutions and the active message
    `F0000` and attaches it to `errorhook`. When the wrapped code leaves
    with `SystemExit` and `--linter` is part of the command line, the
    report is written to the directory which was the working directory on
    entry. The `SystemExit` is raised again afterwards.
    """
    # remember where to write the error report
    report_root = str(os.getcwd())

    # collect the known solutions
    solver = protoerror.Solver()
    for msg, solution in protoerror.solution.SOLUTION.items():
        solver.add_solution(msg, solution)

    # make the linter reachable for the error hook
    errorhook.linter = protoerror.Linter(
        solver=solver,
        active=[protoerror.MessageStatus(msgid='F0000', active=True)],
    )

    try:
        yield
    except SystemExit:
        if f'--{LINTER_FLAG}' in sys.argv:
            errorhook.linter.write(report_root)
        raise
|
rawmaker/cli_automate.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2021-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import os
|
|
12
|
+
import sys
|
|
13
|
+
|
|
14
|
+
import utilo
|
|
15
|
+
import utilotest
|
|
16
|
+
|
|
17
|
+
DESCRIPTION = """\
|
|
18
|
+
Collect pdf files of defined folders and use them to run rawmaker.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@utilo.saveme
def main():
    """Read the user input, run rawmaker on the collected files and exit.

    Always leaves via `sys.exit(utilo.SUCCESS)` once `run` returned.
    """
    inpath, outpath, cores = user_input()
    run(inpath, outpath, cores)
    sys.exit(utilo.SUCCESS)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def user_input() -> tuple:
    """Parse the command line of the automation tool.

    Options:
        -i: input folder, defaults to the current working directory.
        -o: output folder, defaults to `outpath` below the current working
            directory.
        -n: count of used cores, defaults to 1.

    Returns:
        Tuple `(inpath, outpath, cores)`.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('-i', dest='inpath', default=os.getcwd())
    parser.add_argument(
        '-o',
        dest='outpath',
        default=os.path.join(os.getcwd(), 'outpath'),
    )
    parser.add_argument(
        '-n',
        dest='cores',
        type=int,
        default=1,
        help='count of used cores',
    )
    parsed = parser.parse_args()
    return parsed.inpath, parsed.outpath, parsed.cores
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def run(inpath: str, outpath: str, cores: int = 1):
    """Run rawmaker for every pdf file found in `inpath`.

    Every file gets its own output folder below `outpath`, named after the
    simplified file name. The commands are logged and then executed in
    parallel.

    Args:
        inpath: folder which is searched for pdf files.
        outpath: root of the output folders; created if missing.
        cores: number of parallel workers.
    """
    os.makedirs(outpath, exist_ok=True)
    files = utilo.file_list(inpath, include='pdf', absolute=True)
    for item in files:
        utilo.log(item)
    cmds = []
    for item in files:
        name = os.path.basename(item)
        name = utilotest.simple(name)  # TODO: REPLACE WITH utilo CODE
        out = os.path.join(outpath, name)
        # quote both paths: blanks in the input or the output location
        # would otherwise split the argument
        cmds.append(f'rawmaker -i {_quoted(item)} -o {_quoted(out)} -j4')
    for cmd in cmds:
        utilo.log(cmd, preserve_newlines=False)
    utilo.run_parallel(cmds, worker=cores, verbose=True)


def _quoted(path) -> str:
    """Wrap `path` in quotation marks when it contains a blank."""
    text = str(path)
    return f'"{text}"' if ' ' in text else text
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|