rawmaker 2.40.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. letty/__init__.py +46 -0
  2. letty/cli.py +63 -0
  3. letty/optimizer.py +138 -0
  4. letty/quality/__init__.py +8 -0
  5. letty/quality/whitespace.py +50 -0
  6. letty/strategy.py +8 -0
  7. rawmaker/__init__.py +29 -0
  8. rawmaker/__main__.py +13 -0
  9. rawmaker/__patch__.py +36 -0
  10. rawmaker/cli.py +206 -0
  11. rawmaker/cli_automate.py +69 -0
  12. rawmaker/converter/__init__.py +8 -0
  13. rawmaker/converter/basic.py +174 -0
  14. rawmaker/converter/images.py +168 -0
  15. rawmaker/date.py +83 -0
  16. rawmaker/destination.py +202 -0
  17. rawmaker/error.py +34 -0
  18. rawmaker/features/__init__.py +138 -0
  19. rawmaker/features/annotation.py +254 -0
  20. rawmaker/features/border.py +172 -0
  21. rawmaker/features/boxes.py +153 -0
  22. rawmaker/features/figures.py +24 -0
  23. rawmaker/features/fonts.py +229 -0
  24. rawmaker/features/formula.py +16 -0
  25. rawmaker/features/horizontals.py +132 -0
  26. rawmaker/features/images.py +155 -0
  27. rawmaker/features/line.py +337 -0
  28. rawmaker/features/outlines.py +123 -0
  29. rawmaker/features/text.py +91 -0
  30. rawmaker/fonts/__init__.py +8 -0
  31. rawmaker/fonts/parser.py +354 -0
  32. rawmaker/images/__init__.py +8 -0
  33. rawmaker/images/info.py +35 -0
  34. rawmaker/miner/__init__.py +8 -0
  35. rawmaker/miner/char.py +42 -0
  36. rawmaker/miner/colorspace.py +75 -0
  37. rawmaker/miner/images.py +448 -0
  38. rawmaker/miner/position.py +121 -0
  39. rawmaker/miner/rawchar.py +207 -0
  40. rawmaker/miner/text.py +833 -0
  41. rawmaker/miner/underline.py +66 -0
  42. rawmaker/parameter.py +130 -0
  43. rawmaker/patch/__init__.py +8 -0
  44. rawmaker/patch/ltchar.py +79 -0
  45. rawmaker/reader.py +97 -0
  46. rawmaker/text/__init__.py +8 -0
  47. rawmaker/text/chars.py +24 -0
  48. rawmaker/text/data.py +47 -0
  49. rawmaker/text/superfast.py +91 -0
  50. rawmaker/text/wordbox.py +95 -0
  51. rawmaker/utils.py +44 -0
  52. rawmaker-2.40.3.dist-info/METADATA +51 -0
  53. rawmaker-2.40.3.dist-info/RECORD +63 -0
  54. rawmaker-2.40.3.dist-info/WHEEL +5 -0
  55. rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
  56. rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
  57. rawmaker-2.40.3.dist-info/top_level.txt +3 -0
  58. spacestation/__init__.py +18 -0
  59. spacestation/cli.py +51 -0
  60. spacestation/features/__init__.py +8 -0
  61. spacestation/features/chardist.py +85 -0
  62. spacestation/features/worddist.py +57 -0
  63. spacestation/features/wspace.py +130 -0
@@ -0,0 +1,153 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """Boxes
10
+ =====
11
+ """
12
+
13
+ import functools
14
+ import operator
15
+
16
+ import configos
17
+ import iamraw
18
+ import pdfminer.layout
19
+ import serializeraw
20
+ import utilo
21
+
22
+ # width of box
23
+ RECTANGLE_WIDTH_MIN = configos.HV_FLOAT_PLUS(default=50.0)
24
+ # height of box
25
+ RECTANGLE_HEIGHT_MIN = configos.HV_FLOAT_PLUS(default=50.0)
26
+ # distance of two merging boxes/rectangles
27
+ ENDING_DISTANCE_MAX = configos.HV_FLOAT_PLUS(default=3)
28
+
29
+
30
def work(lines: str, pages: tuple) -> str:
    """Load dumped lines and return the dumped boxes found on them.

    Args:
        lines(str): path to lines
        pages(tuple): pages to analyze
    Returns:
        dumped parsed boxes
    """
    assert isinstance(lines, str), type(lines)
    loaded = serializeraw.load_lines(lines, pages=pages)
    # cluster the lines of every page and keep the clusters forming a box
    return serializeraw.dump_boxes(
        determine_clusteritem(loaded, determine_pageboxes))
44
+
45
+
46
def determine_boxes(
    lines,
    rectangle_width_min=RECTANGLE_WIDTH_MIN,
    rectangle_height_min=RECTANGLE_HEIGHT_MIN,
):
    """Determine the boxes of every page using custom size limits.

    Args:
        lines: paged lines, passed on to `determine_clusteritem`
        rectangle_width_min: minimal width of an accepted box
        rectangle_height_min: minimal height of an accepted box
    Returns:
        result of `determine_clusteritem`, one entry per page
    """
    limits = {
        'rectangle_width_min': rectangle_width_min,
        'rectangle_height_min': rectangle_height_min,
    }
    return determine_clusteritem(
        lines,
        functools.partial(determine_pageboxes, **limits),
    )
58
+
59
+
60
def determine_clusteritem(
    lines: iamraw.PageContentLines,
    collector: callable,
):
    """Cluster the lines of every page and pass the clusters to `collector`.

    Args:
        lines: iterable of paged lines; every element provides `content`,
            `page` and `rotated`
        collector(callable): called once per page as
            `collector(clusters, page, rotated=...)`
    Returns:
        list with the collector result of every page, in input order
    """

    def collect_page(paged):
        candidates, number = paged.content, paged.page
        # lines representing a dot and duplicated lines are removed before
        # clustering
        candidates = utilo.unique_lines(
            [line for line in candidates if not utilo.isdot(line)])
        return collector(
            determine_cluster(candidates),
            number,
            rotated=paged.rotated,
        )

    return [collect_page(paged) for paged in lines]
81
+
82
+
83
def determine_pageboxes(
    clusters: list[pdfminer.layout.LTLine],
    page: int,
    rotated: bool = False,  # pylint:disable=W0613
    rectangle_width_min=RECTANGLE_WIDTH_MIN,
    rectangle_height_min=RECTANGLE_HEIGHT_MIN,
) -> iamraw.PageContentBoxes:
    """Convert the line clusters of a page which consist of four lines
    into boxes.

    Args:
        clusters: line clusters of the page; a line is indexed as
            (x0, y0, x1, y1)
        page(int): number of the analyzed page
        rotated(bool): accepted for interface compatibility, not used
        rectangle_width_min: boxes with a smaller width are dropped
        rectangle_height_min: boxes with a smaller height are dropped
    Returns:
        boxes of the page sorted by `box.y0`, then by `box.x0`
    """
    boxes = []
    for cluster in clusters:
        # only a cluster with exactly four lines is treated as a box
        if len(cluster) != 4:
            continue
        horizontal = [line[index] for index in (0, 2) for line in cluster]
        vertical = [line[index] for index in (1, 3) for line in cluster]
        x0, x1 = min(horizontal), max(horizontal)
        y0, y1 = min(vertical), max(vertical)
        # small boxes are mostly a result of bad parsed figures or
        # tables, we do not want them.
        if x1 - x0 < rectangle_width_min or y1 - y0 < rectangle_height_min:
            continue
        boxes.append(iamraw.Box(box=iamraw.BoundingBox(x0, y0, x1, y1)))
    # ensure to sort items top to bottom and left to right
    boxes.sort(key=operator.attrgetter('box.y0', 'box.x0'))
    return iamraw.PageContentBoxes(content=boxes, page=page)
111
+
112
+
113
def determine_cluster(items: iamraw.BoundingBoxes) -> iamraw.BoundingBoxes:  # pylint:disable=R1260
    """Group `items` into clusters of items whose endings meet.

    Every item starts as a cluster of its own. Two clusters are merged as
    soon as `utilo.intersecting_ending` reports a hit for any pair of their
    items, using `ENDING_DISTANCE_MAX` as tolerance. The merge pass is
    repeated until `utilo.Single.contains` returns a true value for the
    current result.

    NOTE(review): the value returned is a list of lists of items although
    the return annotation names `iamraw.BoundingBoxes` - confirm.

    Args:
        items: items to cluster
    Returns:
        list of clusters, every cluster is a list of items; an empty list
        if `items` is empty
    """
    # TODO: REPLACE THIS CODE
    if not items:
        return []
    # a single element is a cluster
    result = [[item] for item in items]

    def match(result, current):
        # Return the index of the first cluster in `result` containing an
        # item which meets any item of `current`, None if there is no such
        # cluster.
        for clusterindex, cluster in enumerate(result):
            for clusteritem in cluster:
                for test in current:
                    if utilo.intersecting_ending(
                            clusteritem,
                            test,
                            tol=ENDING_DISTANCE_MAX,
                    ):
                        return clusterindex
        return None

    def cluster(result):
        # One merge pass: the first cluster is the seed, every other
        # cluster is either merged into an already collected cluster or
        # kept as a new one.
        result, todo = result[0], result[1:]
        # `result` is a single cluster here (its first element is an item),
        # wrap it to get a list of clusters.
        # NOTE(review): relies on items never being `list` instances.
        if not isinstance(result[0], list):
            result = [result]
        while todo:  # pylint:disable=W0149
            # clusters are worked off from the end of the list
            current = todo.pop()
            index = match(result, current)
            if index is None:
                # No match, create new cluster
                result.insert(0, current)
            else:
                # merge; the matched cluster list is extended in place
                result[index].extend(current)
        return result

    # presumably remembers the previous result to detect a pass without
    # change - verify against utilo.Single
    single = utilo.Single()
    while True:  # pylint:disable=W0149
        # Break when cluster does not change result Cluster till cluster
        # move does not change the result.
        result = cluster(result)
        if single.contains(result):
            break
    return result
@@ -0,0 +1,24 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """Figure Extractor
10
+
11
+ Extract figures and convert to images
12
+ """
13
+
14
DumpedFigureInformation = list[tuple[str, bytes]]


def work(  # pylint:disable=keyword-arg-before-vararg,W0613
    path: str,
    boxes: str = None,  # pylint:disable=W0613
    *images: list,
    pages: tuple = None,
) -> DumpedFigureInformation:
    """Placeholder of the figure extraction, no figure is extracted.

    Every argument is accepted and ignored.

    Returns:
        a new empty list
    """
    return []
@@ -0,0 +1,229 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+ """Extract fonts out of pdf document to gather information
10
+
11
+ Stored format:
12
+
13
+ (
14
+ container,
15
+ line,
16
+ char,
17
+ fontkey
18
+ )
19
+
20
+ Stored item is the first different item.
21
+
22
+ The font container indexing indexes only on text-container, other pages
23
+ objects are ignored.
24
+ """
25
+
26
+ import functools
27
+ import math
28
+
29
+ import iamraw
30
+ import serializeraw
31
+ import utilo
32
+
33
+ import rawmaker.features
34
+ import rawmaker.features.text
35
+ import rawmaker.fonts.parser
36
+ import rawmaker.miner.rawchar
37
+ import rawmaker.parameter
38
+ import rawmaker.reader
39
+
40
+
41
def work(  # pylint:disable=W9015,W0613
    document: str,
    boxes_flow: float = 0.5,
    char_margin: float = 2.0,
    line_margin: float = 0.5,
    line_overlap: float = 0.5,
    word_margin: float = 0.1,
    nostrip: bool = not rawmaker.parameter.STRIP,
    detect_vertical: bool = False,
    pages: list = None,
) -> tuple[str, str]:
    """Extract the font information of the text of a document.

    All parameters are handed over to
    `rawmaker.parameter.ParsingConfiguration.from_dict` as keyword
    arguments.

    Args:
        document: pdf-document to run parsing
        char_margin(float): XXX:5.0 why?
        detect_vertical(bool): currently without effect, the value is
            overwritten with `True` before the configuration is created
        pages: limit analyzed area, if None every page is analyzed
    Returns:
        dumped font header and dumped font content
    """
    # TODO: CHANGE BEHAVIOR OF --detect_vertical. Convert to PARAMETER
    # with True as default.
    detect_vertical = True
    assert isinstance(document, str), str(document)
    # `locals()` holds every name bound up to this line, which are exactly
    # the parameters. Do not introduce local variables above this call,
    # they would be passed to `from_dict`, too.
    config = rawmaker.parameter.ParsingConfiguration.from_dict(**locals())
    # from here on `document` is the extracted document, not the path
    document = rawmaker.features.text.extract_document(
        document,
        config=config,
        pages=pages,
    )
    header, content = parse_fonts(document)
    header, content = (
        serializeraw.dump_font_header(header),
        serializeraw.dump_font_content(content),
    )
    return header, content
77
+
78
+
79
class FontStore:
    """Collect the parsed fonts of a document.

    A font is addressed by the hash of its parsed representation. The
    parsed fonts are kept in `data`, mapping hash to parsed font.

    The result of `font_key` is memoized per instance. A
    `functools.lru_cache` on the method would store `self` inside of a
    cache living on the class and therefore keep every store, including
    all collected fonts, alive after its last use.
    """

    def __init__(self, parser=None):
        """Create an empty store.

        Args:
            parser: callable `parser(raw_font, scale, flags)` returning a
                hashable parsed font; `rawmaker.fonts.parser.font_fromraw`
                is used if no parser is given
        """
        self.parser = parser if parser else rawmaker.fonts.parser.font_fromraw
        # hash of parsed font -> parsed font
        self.data = {}
        # (raw_font, scale, flags) -> hash of parsed font
        self._keys = {}

    def font_key(self, raw_font: str, scale: float, flags: int) -> int:
        """Parse the font, register it and return its key.

        Args:
            raw_font(str): raw font as given by the char
            scale(float): size of the font
            flags(int): flags of the font
        Returns:
            hash of the parsed font, usable with `font`
        Raises:
            TypeError: if one of the arguments is not hashable
        """
        signature = (raw_font, scale, flags)
        try:
            return self._keys[signature]
        except KeyError:
            # not requested before, parse below
            pass
        parsed = self.parser(raw_font, scale, flags)
        hashed = hash(parsed)
        # keep the font which was registered first for this hash
        self.data.setdefault(hashed, parsed)
        self._keys[signature] = hashed
        return hashed

    def font(self, hashed: int):
        """Return the parsed font registered under `hashed`.

        Raises:
            KeyError: if no font is registered under `hashed`
        """
        return self.data[hashed]

    def fonts(self):
        """Return a list of all registered fonts in order of registration."""
        return list(self.data.values())
100
+
101
+
102
def process_page(  # pylint:disable=R0914
    page: iamraw.Page,
    fontstore: FontStore,
) -> iamraw.PageFontContent:
    """Iterate through the text containers of a page and extract the
    different fonts and the positions where they end.

    There are three indexes describing a position: the text container,
    the line in the container, and the char in the line. A new entry is
    written whenever font, scale or flags of a char differ from the
    current ones, and once more for the last font of the page.

    The position of a change is oriented on python range/indexing. We
    note the change one char after the last char of the old font
    (`add_font` increments the char index). Container and line are those
    of that last char. Therefore on line endings, the change is noted on a
    char position which does not exist.

    Args:
        page(Page): current pdf page
        fontstore(FontStore): fontstore to store full information of used font
    Returns:
        Page with font information of the page text content.
    """
    assert isinstance(page, iamraw.Page), type(page)
    # position of the last processed char of the current font
    position = (0, 0, 0)  # container, line, char
    current_font, current_scale = None, None
    current_flags = None
    # only text containers are indexed, other children of the page are
    # ignored
    textcontainer = utilo.select_type(page.children, iamraw.TextContainer)
    result = []
    for container_index, container in enumerate(textcontainer):
        rotated = isinstance(container, iamraw.VerticalTextContainer)
        for line_index, line in enumerate(container.lines):
            for char_index, char in enumerate(line):
                try:
                    font = char.font
                except AttributeError:
                    # Virtual chars have no fonts, but newlines are part
                    # of font definition.
                    position = (container_index, line_index, char_index)
                    continue
                scale = scale_fromchar(char, vertical=rotated)
                flags = flags_fromchar(char)
                # No font type or size is selected
                if current_font is None:
                    # first char with a font: only initialize the state,
                    # `position` is not updated here
                    current_font, current_scale = (font, scale)
                    current_flags = flags
                    continue
                # Font type, size or flags changed
                if any((
                        current_font != font,
                        current_scale != scale,
                        current_flags != flags,
                )):
                    # close the old font at the position of its last char
                    fontid = add_font(
                        current_font,
                        current_scale,
                        flags=current_flags,
                        position=position,
                        fontstore=fontstore,
                    )
                    result.append(fontid)
                    # Reset current font
                    current_font, current_scale = font, scale
                    current_flags = flags
                # update last index of current font
                position = (container_index, line_index, char_index)
    # add last text line of a page, because there is nothing changing
    # NOTE(review): truthiness test, while the loop tests `is None`; a
    # falsy font value would be dropped here - confirm this is intended.
    if current_font:
        fontid = add_font(
            current_font,
            current_scale,
            flags=current_flags,
            position=position,
            fontstore=fontstore,
        )
        result.append(fontid)
    return iamraw.PageFontContent(content=result, page=page.page)
176
+
177
+
178
def parse_fonts(document: iamraw.Document):
    """Collect the fonts of all pages of `document`.

    Args:
        document: document providing `pages`
    Returns:
        tuple of header (all fonts registered in the font store) and
        content (font content of every page)
    """
    store = FontStore(rawmaker.fonts.parser.font_fromraw)
    pages = []
    for page in document.pages:
        pages.append(process_page(page, store))
    # Reading the header after the pages are processed is important. DO NOT
    # CHANGE ORDER. The store is filled while processing the pages, the
    # header would be empty before.
    return store.fonts(), pages
186
+
187
+
188
def add_font(font, scale, flags, *, fontstore, position):
    """Register a font and create the entry describing where it ends.

    Args:
        font: raw font
        scale: size of the font
        flags: flags of the font
        fontstore: store providing `font_key(font, scale, flags)`
        position: (container, line, char) of the last char of the font
    Returns:
        tuple (container, line, char + 1, fontkey)
    """
    container_index, line_index, char_index = position
    # the position is stored one char behind the last char of the font
    following = char_index + 1
    key = fontstore.font_key(font, scale, flags)
    return (container_index, line_index, following, key)
195
+
196
+
197
def flags_fromchar(char) -> tuple:
    """Return the flags of `char`.

    The flags of `char.ltchar` are preferred (LTChar); if they are not
    available `char.flags` is used (Char).
    """
    try:
        return char.ltchar.flags
    except AttributeError:
        return char.flags
205
+
206
+
207
def upright_fromchar(char) -> bool:
    """Return `char.ltchar.upright`, `True` if it is not available."""
    try:
        return char.ltchar.upright
    except AttributeError:
        return True
213
+
214
+
215
def scale_fromchar(char, vertical: bool = False) -> float:
    """Determine the font scale of `char` out of `char.size`.

    Args:
        char: char providing `size`
        vertical(bool): if True the char is handled as rotated
    Returns:
        rounded `char.size / 1.34005`; for a negative scale of a rotated
        char with an absolute value above 4.0 the absolute value is
        returned
    """
    # TODO: INVESTIGATE 1.34??
    # NOTE: This works for POSTSCRIPT_14_DEFAULT's but not for
    # Calibri.
    scale = utilo.roundme(char.size / 1.34005)
    # TODO: THINK ABOUT VERTICAL HACK
    if scale < 0:
        turned = (not upright_fromchar(char)) | vertical
        magnitude = math.fabs(scale)
        if turned and magnitude > 4.0:  # TODO: HOLY VALUE
            # rotated char which is printed top down
            return magnitude
        utilo.debug(f'negative font size: {scale} {char}')
    return scale
@@ -0,0 +1,16 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import serializeraw
11
+
12
+
13
def work(path: str, pages: tuple = None) -> str:  # pylint:disable=W0613
    """Placeholder of the formula extraction, no formula is extracted.

    Both arguments are accepted and ignored.

    Returns:
        dump of an empty list of formulas
    """
    return serializeraw.dump_rawformulas([])
@@ -0,0 +1,132 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """Horizontals
10
+ ===========
11
+
12
+ What is the difference between `boxes_horizontals` and `lines`?
13
+ `boxes_horizontals` contains only horizontal lines. `lines` can contain
14
+ lines in every direction.
15
+
16
+ Why do we cluster for horizontal lines?
17
+ To ignore lines which are part of a box and can not be a horizontal line.
18
+ """
19
+
20
+ import functools
21
+ import operator
22
+
23
+ import configos
24
+ import iamraw
25
+ import pdfminer.layout
26
+ import serializeraw
27
+ import utilo
28
+
29
+ import rawmaker.features
30
+ import rawmaker.features.border
31
+ import rawmaker.features.boxes
32
+ import rawmaker.reader
33
+
34
+ # TODO: LTLine - replace with own data structure to reduce dependencies to
35
+ # rawmaker
36
+ LineClusters = list[list[pdfminer.layout.LTLine]]
37
+
38
+ # minimal length of a horizontal line
39
+ HORIZONTAL_WIDTH_MIN = configos.HV_FLOAT(default=0.2)
40
+ # maximal difference in x-component
41
+ HORIZONTAL_VERTICAL_DIFF_MAX = configos.HV_FLOAT_PLUS(default=5.0)
42
+
43
+
44
def work(lines: str, pages: tuple) -> str:
    """Load dumped lines and return the dumped horizontal lines found on
    them.

    Args:
        lines(str): path to the dumped lines
        pages(tuple): pages to analyze
    Returns:
        dumped parsed horizontals
    """
    assert isinstance(lines, str), type(lines)
    loaded = serializeraw.load_lines(lines, pages=pages)
    return serializeraw.dump_horizontals(determine_horizontal(loaded))
58
+
59
+
60
def determine_horizontal(lines, pagewidth=500):
    """Determine the horizontal lines of every page.

    Args:
        lines: paged lines, passed on to
            `rawmaker.features.boxes.determine_clusteritem`
        pagewidth: width of a page, used to scale the minimal length of
            a horizontal line
    Returns:
        result of `determine_pagehorizontals` for every page
    """
    return rawmaker.features.boxes.determine_clusteritem(
        lines,
        functools.partial(determine_pagehorizontals, page_width=pagewidth),
    )
68
+
69
+
70
def determine_pagehorizontals(  # pylint:disable=R0914
    cluster: LineClusters,
    page: int,
    *,
    page_width: float,
    rotated: bool = False,
    vertical_maxerror: float = HORIZONTAL_VERTICAL_DIFF_MAX,
    horizontal_minwidth: float = HORIZONTAL_WIDTH_MIN,
) -> iamraw.PageContentHorizontals:
    """Collect the lines of a page which stand alone and run horizontally.

    Args:
        cluster: list of line clusters
        page(int): current analyzed page

        page_width(float): width of the page
        rotated(bool): if True the pdf page is rotated; width and height
            of a line are swapped before checking
        vertical_maxerror(float): maximal difference of the two
            y-components of a line, compared as absolute value
        horizontal_minwidth(float): minimal length of a line as fraction
            of `page_width`
    Returns:
        horizontal lines of the page sorted by `box.y0`, then by `box.x0`
    """
    horizontal_minwidth = horizontal_minwidth * page_width
    collected = []
    for merged in cluster:
        # ignore boxed lines, a horizontal line is a cluster of its own
        if len(merged) != 1:
            continue
        # convert from BoundingBox
        x0, y0, x1, y1 = utilo.roundme(tuple(merged[0]))
        width, height = abs(x1 - x0), abs(y1 - y0)
        if rotated:
            # on a rotated page the extents change their roles
            width, height = height, width
        if height > vertical_maxerror:
            utilo.debug(f'no horizontal line {x0} {y0} {x1} {y1}; page: {page}'
                        f' vertical error: {height} > {vertical_maxerror}')
            continue
        if width < horizontal_minwidth:
            utilo.debug(f'no horizontal line {x0} {y0} {x1} {y1}; page: {page}'
                        f' too short: {width} < {horizontal_minwidth}')
            continue
        # replace the small component by the mean of both ends
        if rotated:
            x0 = x1 = utilo.roundme((x0 + x1) / 2)
        else:
            y0 = y1 = utilo.roundme((y0 + y1) / 2)
        collected.append(
            iamraw.HorizontalLine(box=iamraw.BoundingBox(x0, y0, x1, y1)))
    # ensure to sort items top to bottom and left to right
    collected.sort(key=operator.attrgetter('box.y0', 'box.x0'))
    return iamraw.PageContentHorizontals(
        content=collected,
        page=page,
        rotated=rotated,
    )