rawmaker 2.40.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- letty/__init__.py +46 -0
- letty/cli.py +63 -0
- letty/optimizer.py +138 -0
- letty/quality/__init__.py +8 -0
- letty/quality/whitespace.py +50 -0
- letty/strategy.py +8 -0
- rawmaker/__init__.py +29 -0
- rawmaker/__main__.py +13 -0
- rawmaker/__patch__.py +36 -0
- rawmaker/cli.py +206 -0
- rawmaker/cli_automate.py +69 -0
- rawmaker/converter/__init__.py +8 -0
- rawmaker/converter/basic.py +174 -0
- rawmaker/converter/images.py +168 -0
- rawmaker/date.py +83 -0
- rawmaker/destination.py +202 -0
- rawmaker/error.py +34 -0
- rawmaker/features/__init__.py +138 -0
- rawmaker/features/annotation.py +254 -0
- rawmaker/features/border.py +172 -0
- rawmaker/features/boxes.py +153 -0
- rawmaker/features/figures.py +24 -0
- rawmaker/features/fonts.py +229 -0
- rawmaker/features/formula.py +16 -0
- rawmaker/features/horizontals.py +132 -0
- rawmaker/features/images.py +155 -0
- rawmaker/features/line.py +337 -0
- rawmaker/features/outlines.py +123 -0
- rawmaker/features/text.py +91 -0
- rawmaker/fonts/__init__.py +8 -0
- rawmaker/fonts/parser.py +354 -0
- rawmaker/images/__init__.py +8 -0
- rawmaker/images/info.py +35 -0
- rawmaker/miner/__init__.py +8 -0
- rawmaker/miner/char.py +42 -0
- rawmaker/miner/colorspace.py +75 -0
- rawmaker/miner/images.py +448 -0
- rawmaker/miner/position.py +121 -0
- rawmaker/miner/rawchar.py +207 -0
- rawmaker/miner/text.py +833 -0
- rawmaker/miner/underline.py +66 -0
- rawmaker/parameter.py +130 -0
- rawmaker/patch/__init__.py +8 -0
- rawmaker/patch/ltchar.py +79 -0
- rawmaker/reader.py +97 -0
- rawmaker/text/__init__.py +8 -0
- rawmaker/text/chars.py +24 -0
- rawmaker/text/data.py +47 -0
- rawmaker/text/superfast.py +91 -0
- rawmaker/text/wordbox.py +95 -0
- rawmaker/utils.py +44 -0
- rawmaker-2.40.3.dist-info/METADATA +51 -0
- rawmaker-2.40.3.dist-info/RECORD +63 -0
- rawmaker-2.40.3.dist-info/WHEEL +5 -0
- rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
- rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
- rawmaker-2.40.3.dist-info/top_level.txt +3 -0
- spacestation/__init__.py +18 -0
- spacestation/cli.py +51 -0
- spacestation/features/__init__.py +8 -0
- spacestation/features/chardist.py +85 -0
- spacestation/features/worddist.py +57 -0
- spacestation/features/wspace.py +130 -0
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""Boxes
|
|
10
|
+
=====
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import functools
|
|
14
|
+
import operator
|
|
15
|
+
|
|
16
|
+
import configos
|
|
17
|
+
import iamraw
|
|
18
|
+
import pdfminer.layout
|
|
19
|
+
import serializeraw
|
|
20
|
+
import utilo
|
|
21
|
+
|
|
22
|
+
# width of box
|
|
23
|
+
RECTANGLE_WIDTH_MIN = configos.HV_FLOAT_PLUS(default=50.0)
|
|
24
|
+
# height of box
|
|
25
|
+
RECTANGLE_HEIGHT_MIN = configos.HV_FLOAT_PLUS(default=50.0)
|
|
26
|
+
# distance of two merging boxes/rectangles
|
|
27
|
+
ENDING_DISTANCE_MAX = configos.HV_FLOAT_PLUS(default=3)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def work(lines: str, pages: tuple) -> str:
    """Load dumped lines and return the serialized content boxes.

    Args:
        lines(str): path to lines, handed to `serializeraw.load_lines`
        pages(tuple): pages to analyze
    Returns:
        dumped parsed boxes
    """
    assert isinstance(lines, str), type(lines)
    loaded = serializeraw.load_lines(lines, pages=pages)
    # cluster the lines page by page and keep the clusters forming a box
    return serializeraw.dump_boxes(
        determine_clusteritem(loaded, determine_pageboxes))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def determine_boxes(
    lines,
    rectangle_width_min=RECTANGLE_WIDTH_MIN,
    rectangle_height_min=RECTANGLE_HEIGHT_MIN,
):
    """Determine the boxes of every page with custom size limits.

    Args:
        lines: paged lines, forwarded to `determine_clusteritem`
        rectangle_width_min: minimal width of an accepted box
        rectangle_height_min: minimal height of an accepted box
    Returns:
        list with the result of `determine_pageboxes` for every page
    """
    return determine_clusteritem(
        lines,
        # bind the size limits, page and cluster are passed per page
        functools.partial(
            determine_pageboxes,
            rectangle_width_min=rectangle_width_min,
            rectangle_height_min=rectangle_height_min,
        ),
    )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def determine_clusteritem(
    lines: iamraw.PageContentLines,
    collector: callable,
):
    """Cluster the lines of every page and pass the clusters to `collector`.

    Args:
        lines: iterable of paged items; `content`, `page` and `rotated`
            are read from every item
        collector(callable): invoked as
            `collector(clusters, page, rotated=...)` once per page
    Returns:
        list with one collector result per page, in input order
    """
    result = []
    for paged in lines:
        content, page = paged.content, paged.page
        # drop lines which are too short and represent a dot
        candidates = [line for line in content if not utilo.isdot(line)]
        # drop duplicated lines
        candidates = utilo.unique_lines(candidates)
        clusters = determine_cluster(candidates)
        result.append(collector(clusters, page, rotated=paged.rotated))
    return result
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def determine_pageboxes(
    clusters: list[pdfminer.layout.LTLine],
    page: int,
    rotated: bool = False,  # pylint:disable=W0613
    rectangle_width_min=RECTANGLE_WIDTH_MIN,
    rectangle_height_min=RECTANGLE_HEIGHT_MIN,
) -> iamraw.PageContentBoxes:
    """Convert every cluster of exactly four lines into a bounding box.

    Args:
        clusters: line clusters of one page; every line is indexed as
            (x0, y0, x1, y1)
        page(int): page number stored in the result
        rotated(bool): accepted for the collector interface, not used
        rectangle_width_min: boxes narrower than this are dropped
        rectangle_height_min: boxes lower than this are dropped
    Returns:
        page content with the accepted boxes sorted by (y0, x0)
    """
    boxes = []
    for cluster in clusters:
        if len(cluster) != 4:
            # only clusters of four lines are considered a box
            continue
        horizontal = [line[0] for line in cluster] + [line[2] for line in cluster]
        vertical = [line[1] for line in cluster] + [line[3] for line in cluster]
        x0, x1 = min(horizontal), max(horizontal)
        y0, y1 = min(vertical), max(vertical)
        # small boxes are mostly a result of badly parsed figures or
        # tables, we do not want them.
        if x1 - x0 < rectangle_width_min or y1 - y0 < rectangle_height_min:
            continue
        boxes.append(iamraw.Box(box=iamraw.BoundingBox(x0, y0, x1, y1)))
    # ascending y0 first, ascending x0 second
    boxes.sort(key=operator.attrgetter('box.y0', 'box.x0'))
    return iamraw.PageContentBoxes(content=boxes, page=page)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def determine_cluster(items: iamraw.BoundingBoxes) -> iamraw.BoundingBoxes:  # pylint:disable=R1260
    """Group `items` into clusters of connected elements.

    Every item starts as its own cluster. Clusters are merged when
    `utilo.intersecting_ending` reports a pair of their members as
    matching within `ENDING_DISTANCE_MAX`. Merge passes are repeated
    until `utilo.Single.contains` accepts the result.

    Args:
        items: elements to cluster
    Returns:
        list of clusters, every cluster is a list of items; an empty
        list if `items` is empty
    """
    # TODO: REPLACE THIS CODE
    if not items:
        return []

    def find_cluster(clusters, candidate):
        # index of the first cluster with a member matching any element
        # of `candidate`, None if there is no such cluster
        for position, members in enumerate(clusters):
            for member in members:
                for probe in candidate:
                    if utilo.intersecting_ending(
                            member,
                            probe,
                            tol=ENDING_DISTANCE_MAX,
                    ):
                        return position
        return None

    def merge_pass(clusters):
        # the first cluster seeds the result, the remaining ones are
        # taken from the end and merged or prepended
        merged, pending = clusters[0], clusters[1:]
        if not isinstance(merged[0], list):
            # seed is a plain cluster of items, wrap it as cluster list
            merged = [merged]
        while pending:  # pylint:disable=W0149
            candidate = pending.pop()
            position = find_cluster(merged, candidate)
            if position is None:
                # No match, create new cluster
                merged.insert(0, candidate)
            else:
                merged[position].extend(candidate)
        return merged

    # a single element is a cluster
    clusters = [[item] for item in items]
    seen = utilo.Single()
    while True:  # pylint:disable=W0149
        # NOTE(review): presumably `contains` reports a result it has
        # already seen, i.e. the pass changed nothing - confirm in utilo
        clusters = merge_pass(clusters)
        if seen.contains(clusters):
            return clusters
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""Figure Extractor
|
|
10
|
+
|
|
11
|
+
Extract figures and convert to images
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
DumpedFigureInformation = list[tuple[str, bytes]]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def work(  # pylint:disable=keyword-arg-before-vararg,W0613
    path: str,
    boxes: str = None,  # pylint:disable=W0613
    *images: list,
    pages: tuple = None,
) -> DumpedFigureInformation:
    """Placeholder of the figure extraction, no argument is evaluated.

    Returns:
        always a new, empty list
    """
    return []
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
"""Extract fonts out of pdf document to gather information
|
|
10
|
+
|
|
11
|
+
Stored format:
|
|
12
|
+
|
|
13
|
+
(
|
|
14
|
+
container,
|
|
15
|
+
line,
|
|
16
|
+
char,
|
|
17
|
+
fontkey
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
Stored item is the first different item.
|
|
21
|
+
|
|
22
|
+
The font container indexing indexes only on text-container, other pages
|
|
23
|
+
objects are ignored.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
import functools
|
|
27
|
+
import math
|
|
28
|
+
|
|
29
|
+
import iamraw
|
|
30
|
+
import serializeraw
|
|
31
|
+
import utilo
|
|
32
|
+
|
|
33
|
+
import rawmaker.features
|
|
34
|
+
import rawmaker.features.text
|
|
35
|
+
import rawmaker.fonts.parser
|
|
36
|
+
import rawmaker.miner.rawchar
|
|
37
|
+
import rawmaker.parameter
|
|
38
|
+
import rawmaker.reader
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def work(  # pylint:disable=W9015,W0613
    document: str,
    boxes_flow: float = 0.5,
    char_margin: float = 2.0,
    line_margin: float = 0.5,
    line_overlap: float = 0.5,
    word_margin: float = 0.1,
    nostrip: bool = not rawmaker.parameter.STRIP,
    detect_vertical: bool = False,
    pages: list = None,
) -> tuple[str, str]:
    """Extract the fonts used in `document` and dump them.

    Args:
        document: pdf-document to run parsing
        char_margin(float): XXX:5.0 why?
        detect_vertical(bool): ignored, the value is forced to True
        pages: limit analyzed area, if None every page is analyzed
    Returns:
        dumped font header and dumped font content
    """
    assert isinstance(document, str), str(document)
    # TODO: CHANGE BEHAVIOR OF --detect_vertical. Convert to PARAMETER
    # with True as default.
    detect_vertical = True
    # The configuration is built out of locals(): do not introduce any
    # local variable above this line, it would be passed to from_dict.
    config = rawmaker.parameter.ParsingConfiguration.from_dict(**locals())
    document = rawmaker.features.text.extract_document(
        document,
        config=config,
        pages=pages,
    )
    header, content = parse_fonts(document)
    return (
        serializeraw.dump_font_header(header),
        serializeraw.dump_font_content(content),
    )
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class FontStore:
    """Collect parsed fonts and address them by the hash of the parsed font.

    `font_key` parses a raw font and registers the result, `font` and
    `fonts` give access to the registered fonts.
    """

    def __init__(self, parser=None):
        """Create an empty store.

        Args:
            parser: callable `(raw_font, scale, flags) -> parsed font`, the
                result must be hashable. Defaults to
                `rawmaker.fonts.parser.font_fromraw`.
        """
        self.parser = parser if parser else rawmaker.fonts.parser.font_fromraw
        self.data = {}
        # Memo of already parsed requests. It is kept on the instance on
        # purpose: a `functools.lru_cache` on the method holds a
        # class-wide cache keyed on `self`, which keeps every store (and
        # all of its parsed fonts) alive after the store is dropped.
        self._keys = {}

    def font_key(self, raw_font: str, scale: float, flags: int) -> int:
        """Parse the font once and return the key of the parsed font.

        Args:
            raw_font(str): raw font passed to the parser
            scale(float): font scale passed to the parser
            flags(int): font flags passed to the parser
        Returns:
            hash of the parsed font, usable with `font`
        """
        request = (raw_font, scale, flags)
        try:
            return self._keys[request]
        except KeyError:
            pass
        parsed = self.parser(raw_font, scale, flags)
        hashed = hash(parsed)
        # the first font registered for a hash wins
        self.data.setdefault(hashed, parsed)
        self._keys[request] = hashed
        return hashed

    def font(self, hashed: int):
        """Return the parsed font registered for `hashed`.

        Raises:
            KeyError: if no font is registered for `hashed`
        """
        return self.data[hashed]

    def fonts(self):
        """Return a list of all registered fonts in registration order."""
        return list(self.data.values())
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def process_page(  # pylint:disable=R0914
    page: iamraw.Page,
    fontstore: FontStore,
) -> iamraw.PageFontContent:
    """Iterate through the text containers and extract the different
    fonts and the positions where they end.

    There are three indexes describing the position where the font,
    font scale or font flags change: the text container, the line in
    the container, and the char in the line. The position of change is
    oriented on python range/indexing: the container and line index of
    the last char of a font run are stored together with its char index
    plus one (see `add_font`). Therefore on line endings, the change is
    noted on a char position which does not exist.

    Args:
        page(Page): current pdf page
        fontstore(FontStore): fontstore to store full information of used font
    Returns:
        Page with font information of the page text content.
    """
    assert isinstance(page, iamraw.Page), type(page)
    position = (0, 0, 0)  # container, line, char
    current_font, current_scale = None, None
    current_flags = None
    textcontainer = utilo.select_type(page.children, iamraw.TextContainer)
    result = []
    for container_index, container in enumerate(textcontainer):
        rotated = isinstance(container, iamraw.VerticalTextContainer)
        for line_index, line in enumerate(container.lines):
            for char_index, char in enumerate(line):
                here = (container_index, line_index, char_index)
                try:
                    font = char.font
                except AttributeError:
                    # Virtual chars have no fonts, but newlines are part
                    # of font definition.
                    position = here
                    continue
                scale = scale_fromchar(char, vertical=rotated)
                flags = flags_fromchar(char)
                # No font type or size is selected
                if current_font is None:
                    current_font, current_scale = font, scale
                    current_flags = flags
                    # The first char of the first run is the last known
                    # index of this run. Without this update the run
                    # ends at a stale position when the first char with
                    # a font is not located at (0, 0, 0).
                    position = here
                    continue
                # Font type, size or flags changed
                if (current_font != font or current_scale != scale
                        or current_flags != flags):
                    # close the previous run at its last known index
                    result.append(
                        add_font(
                            current_font,
                            current_scale,
                            flags=current_flags,
                            position=position,
                            fontstore=fontstore,
                        ))
                    # Reset current font
                    current_font, current_scale = font, scale
                    current_flags = flags
                # update last index of current font
                position = here
    # Add the last run of the page, because there is nothing changing
    # afterwards. Test against None to match the check inside of the
    # loop, a falsy font value still describes a started run.
    if current_font is not None:
        result.append(
            add_font(
                current_font,
                current_scale,
                flags=current_flags,
                position=position,
                fontstore=fontstore,
            ))
    return iamraw.PageFontContent(content=result, page=page.page)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def parse_fonts(document: iamraw.Document):
    """Collect the font content of every page and the used fonts.

    Args:
        document: document whose `pages` are processed in order
    Returns:
        tuple of the fonts registered while processing (header) and the
        list of font contents, one per page
    """
    fontstore = FontStore(rawmaker.fonts.parser.font_fromraw)
    content = []
    for page in document.pages:
        content.append(process_page(page, fontstore))
    # DO NOT CHANGE ORDER: the store is filled while the pages are
    # processed, reading the fonts before that yields an empty header.
    return fontstore.fonts(), content
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def add_font(font, scale, flags, *, fontstore, position):
    """Build the stored font item for a font run ending at `position`.

    Args:
        font: raw font of the run
        scale: font scale of the run
        flags: font flags of the run
        fontstore: store providing `font_key(font, scale, flags)`
        position: (container, line, char) of the last char of the run
    Returns:
        tuple (container, line, char + 1, fontkey)
    """
    container, line, last_char = position
    # store the position after the change happened
    return (
        container,
        line,
        last_char + 1,
        fontstore.font_key(font, scale, flags),
    )
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def flags_fromchar(char) -> tuple:
    """Return the font flags of `char`.

    `char.ltchar.flags` is preferred (LTChar); if that lookup raises an
    AttributeError the flags are read from `char.flags` (Char).
    """
    try:
        return char.ltchar.flags
    except AttributeError:
        return char.flags
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def upright_fromchar(char) -> bool:
    """Return `char.ltchar.upright`, True if the attribute lookup fails."""
    try:
        return char.ltchar.upright
    except AttributeError:
        return True
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def scale_fromchar(char, vertical: bool = False) -> float:
    """Determine the font scale out of `char.size`.

    Args:
        char: char providing `size`
        vertical(bool): treat the char as rotated
    Returns:
        rounded `char.size / 1.34005`. A negative scale is returned as
        absolute value if the char is rotated and the absolute value is
        greater than 4.0, otherwise the negative scale is logged and
        returned unchanged.
    """
    # TODO: INVESTIGATE 1.34??
    # NOTE: This works for POSTSCRIPT_14_DEFAULT's but not for
    # Calibri.
    scale = utilo.roundme(char.size / 1.34005)
    if not scale < 0:
        return scale
    # TODO: THINK ABOUT VERTICAL HACK
    rotated = (not upright_fromchar(char)) | vertical
    absolute = math.fabs(scale)
    if rotated and absolute > 4.0:  # TODO: HOLY VALUE
        # rotated char which is printed top down
        return absolute
    utilo.debug(f'negative font size: {scale} {char}')
    return scale
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import serializeraw
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def work(path: str, pages: tuple = None) -> str:  # pylint:disable=W0613
    """Placeholder of the formula extraction, no argument is evaluated.

    Returns:
        result of `serializeraw.dump_rawformulas` for an empty list
    """
    return serializeraw.dump_rawformulas([])
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""Horizontals
|
|
10
|
+
===========
|
|
11
|
+
|
|
12
|
+
Whats the difference between `boxes_horizontals` and `lines`?
|
|
13
|
+
`boxes_horizontals` contain only horizontal lines. `lines` can contain
|
|
14
|
+
every lines in every direction.
|
|
15
|
+
|
|
16
|
+
Why do we cluster for horizontal lines?
|
|
17
|
+
To ignore lines which are part of a box and can not be a horizontal line.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import functools
|
|
21
|
+
import operator
|
|
22
|
+
|
|
23
|
+
import configos
|
|
24
|
+
import iamraw
|
|
25
|
+
import pdfminer.layout
|
|
26
|
+
import serializeraw
|
|
27
|
+
import utilo
|
|
28
|
+
|
|
29
|
+
import rawmaker.features
|
|
30
|
+
import rawmaker.features.border
|
|
31
|
+
import rawmaker.features.boxes
|
|
32
|
+
import rawmaker.reader
|
|
33
|
+
|
|
34
|
+
# TODO: LTLine - replace with own data structure to reduce dependencies to
|
|
35
|
+
# rawmaker
|
|
36
|
+
LineClusters = list[list[pdfminer.layout.LTLine]]
|
|
37
|
+
|
|
38
|
+
# minimal length of a horizontal line
|
|
39
|
+
HORIZONTAL_WIDTH_MIN = configos.HV_FLOAT(default=0.2)
|
|
40
|
+
# maximal difference in y-component (x-component on rotated pages)
|
|
41
|
+
HORIZONTAL_VERTICAL_DIFF_MAX = configos.HV_FLOAT_PLUS(default=5.0)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def work(lines: str, pages: tuple) -> str:
    """Load dumped lines and return the serialized horizontal lines.

    Args:
        lines(str): path to lines, handed to `serializeraw.load_lines`
        pages(tuple): pages to analyze
    Returns:
        dumped parsed horizontals
    """
    assert isinstance(lines, str), type(lines)
    loaded = serializeraw.load_lines(lines, pages=pages)
    return serializeraw.dump_horizontals(determine_horizontal(loaded))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def determine_horizontal(lines, pagewidth=500):
    """Determine the horizontal lines of every page.

    Args:
        lines: paged lines, forwarded to
            `rawmaker.features.boxes.determine_clusteritem`
        pagewidth: page width passed to `determine_pagehorizontals`
    Returns:
        list with the result of `determine_pagehorizontals` for every page
    """
    return rawmaker.features.boxes.determine_clusteritem(
        lines,
        # bind the page width, page and cluster are passed per page
        functools.partial(determine_pagehorizontals, page_width=pagewidth),
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def determine_pagehorizontals(  # pylint:disable=R0914
    cluster: LineClusters,
    page: int,
    *,
    page_width: float,
    rotated: bool = False,
    vertical_maxerror: float = HORIZONTAL_VERTICAL_DIFF_MAX,
    horizontal_minwidth: float = HORIZONTAL_WIDTH_MIN,
) -> iamraw.PageContentHorizontals:
    """Collect the clusters of a single line which expand horizontally.

    Args:
        cluster: list of line cluster
        page(int): current analyzed page

        page_width(float): width of the page
        rotated(bool): if True pdfpage is rotated, width and height of a
            line are swapped before checking
        vertical_maxerror(float): maximal height of a line, compared
            directly with the rounded coordinates.
        horizontal_minwidth(float): minimal width of a line as factor
            of `page_width`.
    Returns:
        page content with the horizontal lines sorted by (y0, x0)
    """
    # rebinding is intended, the debug message prints the scaled value
    horizontal_minwidth = horizontal_minwidth * page_width
    collected = []
    # clusters with more than one line are boxed lines and ignored
    singles = (merged[0] for merged in cluster if len(merged) == 1)
    for line in singles:
        # convert from BoundingBox
        x0, y0, x1, y1 = utilo.roundme(tuple(line))
        extent_x, extent_y = abs(x1 - x0), abs(y1 - y0)
        if rotated:
            # flip on rotated pages
            width, height = extent_y, extent_x
        else:
            width, height = extent_x, extent_y
        if height > vertical_maxerror:
            utilo.debug(f'no horizontal line {x0} {y0} {x1} {y1}; page: {page}'
                        f' vertical error: {height} > {vertical_maxerror}')
            continue
        if width < horizontal_minwidth:
            utilo.debug(f'no horizontal line {x0} {y0} {x1} {y1}; page: {page}'
                        f' too short: {width} < {horizontal_minwidth}')
            continue
        # straighten the line: collapse the short axis to its mean value
        if rotated:
            x0 = x1 = utilo.roundme((x0 + x1) / 2)
        else:
            y0 = y1 = utilo.roundme((y0 + y1) / 2)
        collected.append(
            iamraw.HorizontalLine(box=iamraw.BoundingBox(x0, y0, x1, y1)))
    # ascending y0 first, ascending x0 second
    collected.sort(key=operator.attrgetter('box.y0', 'box.x0'))
    return iamraw.PageContentHorizontals(
        content=collected,
        page=page,
        rotated=rotated,
    )
|