rawmaker 2.40.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. letty/__init__.py +46 -0
  2. letty/cli.py +63 -0
  3. letty/optimizer.py +138 -0
  4. letty/quality/__init__.py +8 -0
  5. letty/quality/whitespace.py +50 -0
  6. letty/strategy.py +8 -0
  7. rawmaker/__init__.py +29 -0
  8. rawmaker/__main__.py +13 -0
  9. rawmaker/__patch__.py +36 -0
  10. rawmaker/cli.py +206 -0
  11. rawmaker/cli_automate.py +69 -0
  12. rawmaker/converter/__init__.py +8 -0
  13. rawmaker/converter/basic.py +174 -0
  14. rawmaker/converter/images.py +168 -0
  15. rawmaker/date.py +83 -0
  16. rawmaker/destination.py +202 -0
  17. rawmaker/error.py +34 -0
  18. rawmaker/features/__init__.py +138 -0
  19. rawmaker/features/annotation.py +254 -0
  20. rawmaker/features/border.py +172 -0
  21. rawmaker/features/boxes.py +153 -0
  22. rawmaker/features/figures.py +24 -0
  23. rawmaker/features/fonts.py +229 -0
  24. rawmaker/features/formula.py +16 -0
  25. rawmaker/features/horizontals.py +132 -0
  26. rawmaker/features/images.py +155 -0
  27. rawmaker/features/line.py +337 -0
  28. rawmaker/features/outlines.py +123 -0
  29. rawmaker/features/text.py +91 -0
  30. rawmaker/fonts/__init__.py +8 -0
  31. rawmaker/fonts/parser.py +354 -0
  32. rawmaker/images/__init__.py +8 -0
  33. rawmaker/images/info.py +35 -0
  34. rawmaker/miner/__init__.py +8 -0
  35. rawmaker/miner/char.py +42 -0
  36. rawmaker/miner/colorspace.py +75 -0
  37. rawmaker/miner/images.py +448 -0
  38. rawmaker/miner/position.py +121 -0
  39. rawmaker/miner/rawchar.py +207 -0
  40. rawmaker/miner/text.py +833 -0
  41. rawmaker/miner/underline.py +66 -0
  42. rawmaker/parameter.py +130 -0
  43. rawmaker/patch/__init__.py +8 -0
  44. rawmaker/patch/ltchar.py +79 -0
  45. rawmaker/reader.py +97 -0
  46. rawmaker/text/__init__.py +8 -0
  47. rawmaker/text/chars.py +24 -0
  48. rawmaker/text/data.py +47 -0
  49. rawmaker/text/superfast.py +91 -0
  50. rawmaker/text/wordbox.py +95 -0
  51. rawmaker/utils.py +44 -0
  52. rawmaker-2.40.3.dist-info/METADATA +51 -0
  53. rawmaker-2.40.3.dist-info/RECORD +63 -0
  54. rawmaker-2.40.3.dist-info/WHEEL +5 -0
  55. rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
  56. rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
  57. rawmaker-2.40.3.dist-info/top_level.txt +3 -0
  58. spacestation/__init__.py +18 -0
  59. spacestation/cli.py +51 -0
  60. spacestation/features/__init__.py +8 -0
  61. spacestation/features/chardist.py +85 -0
  62. spacestation/features/worddist.py +57 -0
  63. spacestation/features/wspace.py +130 -0
@@ -0,0 +1,155 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """ImageExtractor
10
+
11
+ The ImageExtractor provides the possibility to extract all images out of
12
+ a pdf file.
13
+
14
+ Support formats:
15
+ - png?
16
+ - jpg?
17
+ """
18
+
19
+ import collections
20
+ import io
21
+ import os
22
+
23
+ import PIL.Image
24
+ import serializeraw
25
+ import ughost
26
+ import utilo
27
+
28
+ import rawmaker
29
+ import rawmaker.images.info
30
+ import rawmaker.miner.images
31
+ import rawmaker.reader
32
+
33
# Images of a single page: `content` holds the per-image entries and `page`
# the page number they belong to.
PageContentImages = collections.namedtuple('PageContentImages', 'content, page')
# Per-page image results of a whole document.
PageContentImagesList = list[PageContentImages]

# Declared return type of `work`.
# NOTE(review): `work` appends `(info, (rawimage, ext))` entries, which does
# not look like `tuple[str, bytes]` - confirm which side is intended.
DumpedImageInformations = list[tuple[str, bytes]]
37
+
38
+
39
def work(document: str, pages: tuple = None) -> DumpedImageInformations:
    """Extract, beautify and serialize all images of `document`.

    Args:
        document: path of the pdf file to process
        pages: optional page selection, forwarded to `extract_images`
    Returns:
        Flat list with one `(dumped info, (raw image, extension))` entry
        per image, in page order.
    """
    beautified = beautify_images(
        extract_images(document, pages=pages),
        document,
    )
    return [
        (serializeraw.dump_image_info(info), (rawimage, ext))
        for page in beautified
        for info, (rawimage, ext) in page.content
    ]
48
+
49
+
50
def extract_images(
    document: str,
    outputfolder: str = None,
    pages=None,
) -> PageContentImagesList:
    """Dump the images of `document` to disk and load them page wise.

    Args:
        document: path of the pdf file to read
        outputfolder: folder the miner writes the images to; a temporary
            location below `rawmaker.ROOT` is requested when omitted
        pages: optional page selection
    Returns:
        Result of `convert_images` for the extracted images.
    """
    # TODO: REPLACE AFTER UPGRADING utilo
    target = (
        utilo.tmpfile(rawmaker.ROOT) if outputfolder is None else outputfolder
    )
    with rawmaker.reader.read(document) as loaded:
        extracted = rawmaker.miner.images.extract_images(
            loaded,
            outputfolder=target,
            pages=pages,
        )
    return convert_images(extracted, target, pages=pages)
70
+
71
+
72
def convert_images(
    extracted: dict,
    outputfolder: str,
    pages: tuple = None,
) -> list:
    """Load the dumped image files and attach their image information.

    Args:
        extracted: mapping of page number to the parsed images of that page
        outputfolder: folder which contains the dumped image files
        pages: optional page selection, used to translate page numbers
    Returns:
        One `PageContentImages` per page with at least one usable image.
        Images whose file is missing or whose info can not be determined
        are reported via `utilo.error` and left out.
    """
    result = []
    for number, images in extracted.items():
        # convert selected pages to global pages
        globalpage = convert_pages(number, pages)
        entries = []
        for parsed in images:
            bounding = parsed.bounding
            path = os.path.join(outputfolder, parsed.filename)
            if not os.path.exists(path):
                # TODO: FIX IMAGE EXTRACTION
                utilo.error(f'missing image: {path}')
                continue
            binary = utilo.file_read_binary(path)
            info = rawmaker.images.info.imageinfo(path, globalpage, bounding)
            if info is None:
                utilo.error(
                    f'could not extract {path}, {globalpage}, {bounding}')
                continue
            entries.append((info, (binary, utilo.file_ext(path))))
        if entries:
            result.append(PageContentImages(page=globalpage, content=entries))
    return result
100
+
101
+
102
def beautify_images(images, path: str):
    """Use ghost to render pdf and crop image area.

    Args:
        images: per-page image results as produced by `extract_images`
        path: path of the pdf file handed to `run_ghost`
    Returns:
        New `PageContentImages` list in which every entry carries the
        rendered raw data tagged as `'png'`.
    """
    result = []
    for page in images:
        # first element of every content entry is passed on as bounding
        boundings = [entry[0] for entry in page.content]
        rendered = run_ghost(path, boundings)
        content = [
            (bounding, (raw, 'png'))
            for raw, bounding in zip(rendered, boundings)
        ]
        result.append(PageContentImages(content=content, page=page.page))
    return result
113
+
114
+
115
def run_ghost(path: str, boundings: list) -> list:
    """Extract images out of pdf.

    If ghost is not installed, an error is logged and one generated
    placeholder image per bounding is returned instead.

    Args:
        path: path of the pdf file
        boundings: areas to extract; `width` and `height` are read for the
            placeholder size
    Returns:
        Result of `ughost.images` or the list of png encoded placeholders.
    """
    if ughost.HAS_GHOST:
        return ughost.images(path, boundings)
    utilo.error('could not beautify images: install ghost')
    placeholders = []
    for bounding in boundings:
        width, height = int(bounding.width), int(bounding.height)
        # NOTE(review): the documentation of this fallback speaks of a white
        # image, but `color=1` on an RGB image presumably yields a nearly
        # black one - confirm the intended color.
        blank = PIL.Image.new('RGB', (width, height), color=1)
        placeholders.append(convert_topng(blank))
    return placeholders
132
+
133
+
134
def convert_topng(image) -> bytes:
    """Encode `image` as png and return the encoded data.

    Args:
        image: object offering `save(fileobject, format=...)`
    Returns:
        Everything `image.save` wrote into the in-memory buffer.
    """
    with io.BytesIO() as buffer:
        image.save(buffer, format='png')
        # getvalue() returns the whole buffer regardless of the position
        return buffer.getvalue()
142
+
143
+
144
def convert_pages(page: int, pages: tuple) -> int:
    """Pdfminer produces directly ascending pages.

    If we select pages=('0:5,28') pdfminer produces 0, 1, 2, 3, 4, 5.
    This method convert this to 0, 1, 2, 3, 4, 28.

    Args:
        page: page number as produced while parsing
        pages: selected global page numbers; `None` or an empty selection
            means no translation is required
    Returns:
        Global page number which belongs to `page`.
    Raises:
        IndexError: if `page` points behind the end of `pages`.
    """
    # TODO: INVESTIGATE HERE
    if not pages:
        # nothing selected: the produced numbering is already global. An
        # empty selection used to fail with an IndexError below.
        return page
    # starting with starting offset
    offset = min(pages)
    return pages[page - offset]
@@ -0,0 +1,337 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """Line Extractor
10
+ ==============
11
+
12
+ This module aims to extract lines out of pdf document.
13
+
14
+ Furthermore the lines are:
15
+ * fixed in x0/x1 and y0/y1
16
+ * sorted from top to bottom and left to right
17
+ * if required merged together.
18
+ """
19
+
20
+ import operator
21
+
22
+ import configos
23
+ import iamraw
24
+ import pdfminer.layout
25
+ import pdfminer.pdfdocument
26
+ import serializeraw
27
+ import utilo
28
+
29
+ import rawmaker.reader
30
+
31
# maximal difference in y-component; `accept_ltline` uses it as default
# limit for bbox[3] - bbox[1]
HORIZONTAL_DIFF_MAX = configos.HV_FLOAT_PLUS(default=2.0)
# maximal difference in x-component; `accept_ltline` uses it as default
# limit for bbox[2] - bbox[0]
VERTICAL_DIFF_MAX = configos.HV_FLOAT_PLUS(default=2.0)
# minimal number of minus signs which build a horizontal line; checked by
# `accept_text_as_line` against the text length and the count of each of
# the symbols '_', '-' and '='
REQUIRED_MINUS_SIGNS = configos.HV_INT_PLUS(default=40)
37
+
38
+
39
def work(document: str, annotations: str, pages: tuple = None) -> str:
    """Extract the lines of `document` and return them serialized.

    Args:
        document: path of the pdf file
        annotations: path of the dumped annotations; when the file does
            not exist no line is skipped because of hyperlinks
        pages: optional page selection
    Returns:
        Lines dumped by `serializeraw.dump_lines`.
    """
    if utilo.exists(annotations):
        loaded = serializeraw.load_annotations(annotations, pages=pages)
    else:
        utilo.debug(f'missing {annotations} could not skip underlines')
        loaded = []
    with rawmaker.reader.read(document) as pdf:
        extracted = determine_lines(pdf, pages=pages)
    return serializeraw.dump_lines(skip_lines(extracted, loaded))
50
+
51
+
52
def determine_lines(
    document: pdfminer.pdfdocument.PDFDocument,
    pages: tuple = None,
) -> iamraw.PageContentLines:
    """Build one `PageContentLine` per page out of the extracted lines.

    Args:
        document: loaded pdf document
        pages: optional page selection
    Returns:
        List of `iamraw.PageContentLine`; `rotated` is set for every page
        reported by `rotated`.
    """
    pagelines = lines(document, pages=pages)
    rotates = rotated(document, pages=pages)
    result = []
    for content, number in pagelines:
        # left point is left above from right down point;
        # order: top down, left right
        ordered = sorted(
            (utilo.rect_ensure_bounding(item) for item in content),
            key=operator.itemgetter(1, 0),
        )
        result.append(
            iamraw.PageContentLine(
                # merge lines which are divided by pdf printer
                content=utilo.merge_lines(ordered),
                page=number,
                rotated=number in rotates,
            ))
    return result
73
+
74
+
75
def rotated(
    document: pdfminer.pdfdocument.PDFDocument,
    pages: tuple = None,
) -> set:
    """Determine rotated pages.

    A page counts as rotated when its width is not smaller than its
    height.

    Args:
        document: loaded pdf document
        pages: optional page selection
    Returns:
        Set with the page numbers of all rotated pages.
    """
    pagecontent = rawmaker.features.process_pagecontent(
        document,
        pages=pages,
    )
    return {
        page.page
        for page in pagecontent
        if not page.content.width < page.content.height
    }
90
+
91
+
92
def skip_lines(linex, annotation) -> list:
    """Remove lines which intersect with hyperlink annotations.

    Args:
        linex: page wise lines as produced by `determine_lines`
        annotation: loaded annotations; pages without annotations are
            passed through untouched
    Returns:
        List with one page entry per input page.
    """
    result = []
    for page in linex:
        anno = utilo.select_page(annotation, page.page)
        if not anno:
            result.append(page)
            continue
        invalid_area = [item.bounds for item in anno.hyperlinks]
        # remove annotated lines. This lines are the underlines of
        # hyperlinks which are produced by cray pdf printer.
        kept = [
            item for item in page.content
            if not utilo.rectangles_intersecting(invalid_area, item)
        ]
        result.append(
            iamraw.PageContentLine(
                content=kept,
                page=page.page,
                # keep the flag set by `determine_lines`; rebuilding the
                # page without it silently reset the rotation information
                rotated=page.rotated,
            ))
    return result
108
+
109
+
110
# do not merge near horizontal: '_______________' to text container below.
# Passed as `layout` to `type_in_document` by `lines`.
LAYOUT_LINES = pdfminer.layout.LAParams(line_margin=0.0000001)
112
+
113
+
114
def lines(
    pdf: pdfminer.pdfdocument.PDFDocument,
    pages: tuple = None,
) -> list:
    """Extract the boundings of all line like elements page wise

    Every layout element is judged by the acceptance function registered
    for its exact type; elements without registered function are reported
    and skipped.

    Args:
        pdf: pdf document to collect lines
        pages: select pages to run analysis on
    Returns:
        list of `(boundings, pagenumber)` tuples, one per processed page
    """
    utilo.asserts(pdf, pdfminer.pdfdocument.PDFDocument)
    candidates = type_in_document(
        pdf,
        datatype=(
            pdfminer.layout.LTTextBoxHorizontal,
            pdfminer.layout.LTLine,
            pdfminer.layout.LTRect,
            pdfminer.layout.LTFigure,
            pdfminer.layout.LTCurve,
        ),
        layout=LAYOUT_LINES,
        pages=pages,
    )
    strategy = {
        pdfminer.layout.LTLine: accept_ltline,
        pdfminer.layout.LTRect: accept_ltrect,
        pdfminer.layout.LTTextBoxHorizontal: accept_text_as_line,
        pdfminer.layout.LTCurve: accept_curve_as_line,
        pdfminer.layout.LTFigure: accept_figure_as_line,
    }
    result = []
    for content, pagenumber in candidates:
        accepted = []
        for item in content:
            # check item against strategy. If no strategy is supported, the
            # element is skipped.
            try:
                if strategy[type(item)](item):
                    accepted.append(item)
            except KeyError:
                utilo.error(f'unsupported strategy {item}')
        boundings = []
        for item in accepted:
            # convert and round bounding
            rounded = utilo.roundme(
                (item.bbox[0], item.bbox[1], item.bbox[2], item.bbox[3]))
            # remove very short lines/dots
            if utilo.isdot(rounded, max_length=5.0):
                continue
            # ensure left, top, right, down bounding
            boundings.append(utilo.rect_ensure_bounding(rounded))
        # sort item top down; left right
        boundings.sort(key=operator.itemgetter(1, 0))
        # merges divided lines
        boundings = utilo.merge_lines(boundings)
        # remove duplicated lines which mainly produces out of bad figure
        # extraction
        # TODO: ADD LINE DENSITY CHECK?
        boundings = utilo.unique_lines(boundings, max_diff=3.0)
        result.append((boundings, pagenumber))
    return result
182
+
183
+
184
def accept_text_as_line(item: pdfminer.layout.LTTextBoxHorizontal):
    """Accept text which consists of enough '_', '-' or '=' symbols.

    On acceptance the bounding box of `item` is collapsed in place to a
    single y position.

    Args:
        item: text box to judge
    Returns:
        True if one symbol occurs at least `REQUIRED_MINUS_SIGNS` times.
    """
    text = item.get_text()
    if len(text) < REQUIRED_MINUS_SIGNS:
        return False
    for symbol in '_-=':
        if text.count(symbol) < REQUIRED_MINUS_SIGNS:
            continue
        y0, y1 = item.bbox[1], item.bbox[3]
        # update bounding to pass vertical error test.
        # TODO: CHECK THIS: Make it symbol dependend?
        if symbol == '_':
            ypos = utilo.roundme(max((y0, y1)))
        else:
            # use vertical centric position
            ypos = utilo.roundme((y0 + y1) / 2)
        # update bounding box
        item.bbox = (item.bbox[0], ypos, item.bbox[2], ypos)
        return True
    return False
202
+
203
+
204
def accept_ltrect(item: pdfminer.layout.LTRect):
    """Judge a rectangle with the rules of `accept_ltline`."""
    accepted = accept_ltline(item)
    return accepted
206
+
207
+
208
def accept_ltline(
    item: pdfminer.layout.LTLine,
    vertical_max_diff=VERTICAL_DIFF_MAX,
    horizontal_max_diff=HORIZONTAL_DIFF_MAX,
) -> bool:
    """Accept horizontal or vertical lines

    The lines must vary only little. A crossing line has vertical
    and horizontal error. We want | or - not / or \\.

    Args:
        item: layout element with a `bbox`
        vertical_max_diff: extent in x from which on the item has a
            vertical error
        horizontal_max_diff: extent in y from which on the item has a
            horizontal error
    Returns:
        False for items with both errors and for blue items with a vertical
        error, True otherwise.
    """
    assert item.bbox[3] >= item.bbox[1], str(item.bbox)
    assert item.bbox[0] <= item.bbox[2], str(item.bbox)

    horizontal_error = item.bbox[3] - item.bbox[1] >= horizontal_max_diff
    vertical_error = item.bbox[2] - item.bbox[0] >= vertical_max_diff

    if horizontal_error and vertical_error:
        return False

    if vertical_error:
        # HACK: WORKAROUND TODO:
        # horizontal lines: There are lines in bachelor028 which are
        try:
            colors = (item.stroking_color, item.non_stroking_color)
        except AttributeError:
            colors = ()
        # A list never compares equal to a tuple, therefore a plain
        # `BLUE in colors` misses colors which are delivered as tuple.
        # Compare the components independent of the sequence type.
        blueline = any(
            isinstance(color, (list, tuple)) and list(color) == BLUE
            for color in colors)
        if blueline:
            utilo.debug('skip horizontal blue line which is may part of a '
                        'hyperlink and destroys footnote detection')
            utilo.debug(item)
            return False
    return True
240
+
241
+
242
# Color `accept_ltline` searches in the stroking and non stroking color of
# an item to skip it.
# NOTE(review): presumably RGB blue with components in 0..1 - confirm.
BLUE = [0, 0, 1]
243
+
244
+
245
def accept_figure_as_line(figure: pdfminer.layout.LTFigure) -> bool:
    """Some pdf renderer converts lines into images.

    Only figures with exactly one child are considered. The child has to
    pass `accept_ltline` or the figure has to pass `figure_special_line`.
    """
    children = figure._objs  # pylint:disable=W0212
    if len(children) != 1:
        return False
    # Do we need a min width? I don't think so because thats the job of
    # later running methods.
    return bool(accept_ltline(children[0]) or figure_special_line(figure))
257
+
258
+
259
def accept_curve_as_line(curve: pdfminer.layout.LTCurve) -> bool:
    """Decide whether a curve can be treated as a line.

    Args:
        curve: curve to judge; `pts`, `linewidth`, `fill`, `stroke`, the
            colors and `height`/`width` are inspected
    Returns:
        True for thin filled curves, for curves with exactly two points and
        for curves whose segments merge into one single line.
    """
    points = curve.pts
    if not (curve.linewidth or curve.fill):
        # invisible line
        return False
    colorless = (curve.stroking_color is None and
                 curve.non_stroking_color is None) if curve.stroke else False
    if colorless:
        # TODO: DONT KNOW WHY
        return False
    # polygon?
    if curve.fill and (curve.height < 5.0 or curve.width < 5.0):
        return True
    if len(points) == 2:
        # start and end point
        return True
    # more than two points in a row, check if point are on a line
    # [(437.04645, 259.38056), (437.04645, 293.26655), (437.04645, 269.60483999999997)]
    segments = [(*start, *end) for start, end in zip(points, points[1:])]
    # exactly one merged line means all lines in a row
    return len(utilo.merge_lines(segments, diff=1.5)) == 1
283
+
284
+
285
# horizontal line which is rendered as a figure: `figure_special_line`
# rejects images which are not wider than this value
HORIZONTAL_FIGURE_LINE_WIDTH_MIN = configos.HV_FLOAT_PLUS(default=350.0)
# the object have to be more width than height with this ratio;
# `figure_special_line` rejects images whose width/height ratio does not
# exceed this value
HORIZONTAL_FIGURE_LINE_RATIO_MIN = configos.HV_FLOAT_PLUS(default=45.0)
289
+
290
+
291
def figure_special_line(figure: pdfminer.layout.LTFigure) -> bool:
    """Detect special line and update figure box if figure is special line.

    The first child of the figure has to be wider than
    `HORIZONTAL_FIGURE_LINE_WIDTH_MIN` and its width/height ratio has to
    exceed `HORIZONTAL_FIGURE_LINE_RATIO_MIN`. On success the bounding box
    of `figure` is collapsed in place to its vertical middle.

    Args:
        figure: figure to inspect
    Returns:
        True if the figure is accepted as horizontal line.
    """
    # TODO: THIS IS ONLY A HORIZONTAL?
    # EXAMPLE: MASTER155
    # 'width': 413.96, 'height': 8.54
    # TODO: ANALYZE IMAGE
    image = figure._objs[0]  # pylint:disable=W0212
    width = utilo.rect_width(image.bbox)
    if width <= HORIZONTAL_FIGURE_LINE_WIDTH_MIN:
        return False
    height = utilo.rect_height(image.bbox)
    if height <= 0:
        # No usable ratio. A negative height was already rejected by the
        # ratio test, a height of zero used to raise ZeroDivisionError.
        return False
    if width / height <= HORIZONTAL_FIGURE_LINE_RATIO_MIN:
        return False
    # adjust bounding of figure to middle line
    # TODO: USE IMAGE INFORMATION
    middle = utilo.roundme((figure.bbox[1] + figure.bbox[3]) / 2)
    figure.bbox = (figure.bbox[0], middle, figure.bbox[2], middle)
    return True
310
+
311
+
312
def type_in_document(
    document: pdfminer.pdfdocument.PDFDocument,
    datatype: object,
    layout=None,
    pages: tuple = None,
) -> list[tuple[pdfminer.layout.LTPage, int]]:
    """Select the items of type `datatype` of every page

    Args:
        document(PDFDocument): pdf document to extract all types
        datatype: type or tuple of types, tested with `isinstance`
        layout(Param): process with different layout
        pages(tuple): select pages
    Returns:
        List of `(selected items, page number)` tuples.
    """
    utilo.asserts(document, pdfminer.pdfdocument.PDFDocument)
    pagecontent = rawmaker.features.process_pagecontent(
        document,
        layout=layout,
        pages=pages,
    )
    return [(
        [item for item in page.content if isinstance(item, datatype)],
        page.page,
    ) for page in pagecontent]
@@ -0,0 +1,123 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+ """Document Outlines
10
+ =================
11
+
12
+ See PDF2008: 12.3.3 Document Outline
13
+
14
+ Basic structure of get_outlines: (level, title, args, children)
15
+
16
+ Entries of outlines dict
17
+ ------------------------
18
+
19
+ Dest(str, list): destination if item was clicked/activated, not present
20
+ if A is present.
21
+ A(dict): Action(launch application, play sound, chaning state) Shall
22
+ not be present if an DEST item is present
23
+ SE(dict): Reference to structure element(see Structural Hierarchy)
24
+ """
25
+
26
+ import iamraw
27
+ import pdfminer.pdfdocument
28
+ import pdfminer.pdfpage
29
+ import serializeraw
30
+ import utilo
31
+
32
+ import rawmaker.destination
33
+ import rawmaker.reader
34
+ import rawmaker.utils
35
+
36
+
37
def work(document: str) -> str:
    """Extract outlines of a pdf document.

    If there are no outlines provided dump empty list.

    Args:
        document: path of the pdf file
    Returns:
        Dumped table of content or None if the toc could not be dumped.
    """
    assert isinstance(document, str), str(document)
    toc = iamraw.create_toc(parse_outlines(document))
    try:
        # toc to yaml
        return serializeraw.dump_toc(toc)
    except TypeError:
        for message in (
                'could not convert toc to YAML.',
                'The toc may contain indirect references, buffer, etc.',
                'Outline implementation seem not complete, yet.',
        ):
            utilo.error(message)
    return None
54
+
55
+
56
def parse_outlines(document: str) -> list:
    """Convert the outlines of `document` into raw sections.

    Entries whose page can not be determined are reported and skipped.

    Args:
        document: path of the pdf file
    Returns:
        List of `iamraw.SectionRaw`, empty if the pdf provides no outlines.
    """
    sections = []
    with rawmaker.reader.read(document) as pdf:
        try:
            # extract all outlines from pdf
            outlines = list(pdf.get_outlines())
            pagelookup = rawmaker.destination.pageids(document)
        except pdfminer.pdfdocument.PDFNoOutlines:
            outlines = []
            utilo.error('could not locatate any outlines')
        for level, title, dest, action, _ in outlines:
            try:
                target = pagenumber(action, dest, pdf)
            except (AttributeError, ValueError) as error:
                utilo.error('PDF NOT FULLY SUPPORTED')
                utilo.print_stacktrace()
                utilo.error(error)
                continue
            if not isinstance(target, int):
                # page reference, translate it via its object id
                try:
                    target = pagelookup[target.objid]
                except KeyError:
                    utilo.error(f'invalid page lookup: {target.objid} pdf is '
                                'maybe an invalid extraction out of an other '
                                f'file: {pagelookup}')
                    continue
            assert isinstance(target, int), f'require convertion: {type(target)}'
            sections.append(
                iamraw.SectionRaw(
                    level,
                    title,
                    page=target,
                    raw='toc outline page',
                    raw_location=-1,
                ))
    return sections
92
+
93
+
94
def pagenumber(action, dest, pdf) -> rawmaker.destination.ExplicitDestination:
    """Determine the page an outline entry points to.

    Args:
        action: action entry of the outline item, may be empty
        dest: destination entry of the outline item, may be empty; wins
            over `action` if both are given
        pdf: loaded pdf document, used to look up named destinations
    Returns:
        `page` of the parsed destination, or -1 for external links and for
        named destinations which are not part of the document.
    Raises:
        ValueError: if neither `action` nor `dest` leads to a destination.
    """
    parsed = None
    if action:
        parsed = rawmaker.destination.parse(action)
        if isinstance(parsed, rawmaker.destination.NamedDestination):
            try:
                resolved = pdf.get_dest(parsed.pdf_reference)
            except pdfminer.pdfdocument.PDFDestinationNotFound:
                utilo.error(f'invald pdf reference: {parsed.pdf_reference}')
                return -1
            resolved = rawmaker.utils.resolve(resolved)
            parsed = rawmaker.destination.parse(resolved)
    if dest:
        dest = rawmaker.utils.resolve(dest)
        if isinstance(dest, list):
            # pdf 1.5: [<PDFObjRef:13>, /'XYZ', 72.0, 769.89, None]
            resolved = dest
        else:
            destname = dest if isinstance(dest, bytes) else dest.name
            try:
                resolved = pdf.get_dest(destname)
            except pdfminer.pdfdocument.PDFDestinationNotFound:
                # same treatment as for named destinations of actions, an
                # unknown name must not abort the whole outline extraction
                utilo.error(f'invald pdf reference: {destname}')
                return -1
            if not isinstance(resolved, list):
                # a list is already the explicit destination
                # pdf 1.4: [<PDFObjRef:4>, /'XYZ', 134.031754, 373.949829, None]
                resolved = rawmaker.utils.resolve(resolved)
        parsed = rawmaker.destination.parse(resolved)
    if not parsed:
        # raise instead of assert: asserts vanish with -O and ValueError is
        # what `parse_outlines` handles to skip a single entry
        raise ValueError('outline entry without usable action or destination')
    if isinstance(parsed, rawmaker.destination.ExternalLinkDestination):
        return -1
    return parsed.page
@@ -0,0 +1,91 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+ """Extract text out of pdf document to gather information."""
10
+
11
+ import os
12
+
13
+ import iamraw
14
+ import serializeraw
15
+
16
+ import rawmaker.cli
17
+ import rawmaker.features
18
+ import rawmaker.miner.position
19
+ import rawmaker.miner.text
20
+ import rawmaker.miner.underline
21
+ import rawmaker.parameter
22
+ import rawmaker.reader
23
+ import rawmaker.text.superfast
24
+
25
+
26
def work(  # pylint:disable=W9015,W0613
    document: str,
    xhorizontals: str = None,
    boxes_flow: float = 0.5,
    char_margin: float = 2.0,
    line_margin: float = 0.5,
    line_overlap: float = 0.5,
    word_margin: float = 0.1,
    nostrip: bool = not rawmaker.parameter.STRIP,
    detect_vertical: bool = False,
    pages: tuple = None,
) -> tuple[str, str]:
    """Extract structured text out of document

    Args:
        document: pdf-document to run parsing
        xhorizontals: passed on to
            `rawmaker.miner.underline.underline_chars`
        boxes_flow: not read directly, reaches the parsing configuration
            via the `locals()` snapshot; the same holds for `char_margin`,
            `line_margin`, `line_overlap`, `word_margin` and `nostrip`
        char_margin(float): XXX Why 5.0?
        detect_vertical: currently without effect, the value is always
            replaced by True
        pages(list): List of processed pages.
    Returns:
        parsed document as yaml output
        parsed positions of text container
    """
    # TODO: CHANGE BEHAVIOR OF --detect_vertical. Convert to PARAMETER
    # with True as default.
    detect_vertical = True  # TODO: REMOVE?
    # Snapshot of all local names at this point, i.e. every parameter of
    # this function. Do not introduce locals above this line, they would
    # end up in the configuration as well.
    config = rawmaker.parameter.ParsingConfiguration.from_dict(**locals())
    if rawmaker.cli.superfast():  # pylint:disable=W0160
        document = rawmaker.text.superfast.superfast(
            document,
            config,
            workdir=os.getcwd(),
            pages=pages,
        )
    else:
        document = extract_document(source=document, config=config, pages=pages)
    # from here on `document` is the extracted content, not the path
    document = rawmaker.miner.underline.underline_chars(
        document,
        xhorizontals,
        pages=pages,
    )
    positions = rawmaker.miner.position.hash_positions(document, pages=pages)
    # dump result
    dumped_text = serializeraw.dump_document(document)
    dumped_positions = serializeraw.dump_textpositions(positions)
    return dumped_text, dumped_positions
71
+
72
+
73
def extract_document(
    source: str,
    config: rawmaker.parameter.ParsingConfiguration = None,
    converter=None,
    pages: tuple = None,
) -> iamraw.Document:
    """Read `source` and extract its content.

    Args:
        source: path of the pdf file
        config: parsing configuration; printed via
            `rawmaker.parameter.print_layout` when given
        converter: converter to use, `PrecisePDFConverter` of
            `rawmaker.miner.text` when omitted
        pages: optional page selection
    Returns:
        Result of `rawmaker.features.extract_content`.
    """
    if config:
        rawmaker.parameter.print_layout(config)
    selected = (rawmaker.miner.text.PrecisePDFConverter
                if converter is None else converter)
    assert isinstance(source, str), str(source)
    with rawmaker.reader.read(source) as pdf:
        return rawmaker.features.extract_content(
            pdf,
            config=config,
            converter=selected,
            pages=pages,
        )