rawmaker 2.40.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. letty/__init__.py +46 -0
  2. letty/cli.py +63 -0
  3. letty/optimizer.py +138 -0
  4. letty/quality/__init__.py +8 -0
  5. letty/quality/whitespace.py +50 -0
  6. letty/strategy.py +8 -0
  7. rawmaker/__init__.py +29 -0
  8. rawmaker/__main__.py +13 -0
  9. rawmaker/__patch__.py +36 -0
  10. rawmaker/cli.py +206 -0
  11. rawmaker/cli_automate.py +69 -0
  12. rawmaker/converter/__init__.py +8 -0
  13. rawmaker/converter/basic.py +174 -0
  14. rawmaker/converter/images.py +168 -0
  15. rawmaker/date.py +83 -0
  16. rawmaker/destination.py +202 -0
  17. rawmaker/error.py +34 -0
  18. rawmaker/features/__init__.py +138 -0
  19. rawmaker/features/annotation.py +254 -0
  20. rawmaker/features/border.py +172 -0
  21. rawmaker/features/boxes.py +153 -0
  22. rawmaker/features/figures.py +24 -0
  23. rawmaker/features/fonts.py +229 -0
  24. rawmaker/features/formula.py +16 -0
  25. rawmaker/features/horizontals.py +132 -0
  26. rawmaker/features/images.py +155 -0
  27. rawmaker/features/line.py +337 -0
  28. rawmaker/features/outlines.py +123 -0
  29. rawmaker/features/text.py +91 -0
  30. rawmaker/fonts/__init__.py +8 -0
  31. rawmaker/fonts/parser.py +354 -0
  32. rawmaker/images/__init__.py +8 -0
  33. rawmaker/images/info.py +35 -0
  34. rawmaker/miner/__init__.py +8 -0
  35. rawmaker/miner/char.py +42 -0
  36. rawmaker/miner/colorspace.py +75 -0
  37. rawmaker/miner/images.py +448 -0
  38. rawmaker/miner/position.py +121 -0
  39. rawmaker/miner/rawchar.py +207 -0
  40. rawmaker/miner/text.py +833 -0
  41. rawmaker/miner/underline.py +66 -0
  42. rawmaker/parameter.py +130 -0
  43. rawmaker/patch/__init__.py +8 -0
  44. rawmaker/patch/ltchar.py +79 -0
  45. rawmaker/reader.py +97 -0
  46. rawmaker/text/__init__.py +8 -0
  47. rawmaker/text/chars.py +24 -0
  48. rawmaker/text/data.py +47 -0
  49. rawmaker/text/superfast.py +91 -0
  50. rawmaker/text/wordbox.py +95 -0
  51. rawmaker/utils.py +44 -0
  52. rawmaker-2.40.3.dist-info/METADATA +51 -0
  53. rawmaker-2.40.3.dist-info/RECORD +63 -0
  54. rawmaker-2.40.3.dist-info/WHEEL +5 -0
  55. rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
  56. rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
  57. rawmaker-2.40.3.dist-info/top_level.txt +3 -0
  58. spacestation/__init__.py +18 -0
  59. spacestation/cli.py +51 -0
  60. spacestation/features/__init__.py +8 -0
  61. spacestation/features/chardist.py +85 -0
  62. spacestation/features/worddist.py +57 -0
  63. spacestation/features/wspace.py +130 -0
rawmaker/miner/text.py ADDED
@@ -0,0 +1,833 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+ """Textminer
10
+ =========
11
+
12
+ Parses a PDF document and extracts its laid-out text components.
13
+ """
14
+
15
+ import contextlib
16
+ import copy
17
+ import math
18
+
19
+ import configos
20
+ import iamraw
21
+ import pdfminer.converter
22
+ import pdfminer.layout
23
+ import pdfminer.pdfinterp
24
+ import pdfminer.utils
25
+ import utilo
26
+
27
+ import rawmaker.converter.basic
28
+ import rawmaker.miner.rawchar
29
+ import rawmaker.parameter
30
+ import rawmaker.patch.ltchar
31
+
32
# Font rises below this threshold are treated as noise and therefore
# handled as zero.
FONT_RISE_MIN = configos.HV_FLOAT_PLUS(default=1.0)

# Table that `fix_fontrise` calls with the number of chars of a line; the
# result is the maximum number of chars with a non-zero font rise that is
# accepted before all rises of the line are reset to zero.
# NOTE(review): presumably every pair is (line length, maximum count) -
# confirm against `configos.HolyTable`.
FIX_FONTRISE_OCCURENCE_MAX = configos.HolyTable(items=[
    (1, 1),
    (2, 2),
    (3, 3),
    (4, 4),
    (5, 5),
    (20, 5),
    (40, 10),
    (60, 15),
])
45
+
46
+
47
class PrecisePDFConverter(rawmaker.converter.basic.FlippedLayoutAnalyzer):
    """Convert the pages of a PDF into an `iamraw.Document`.

    Every finished page is laid out with the configured layout
    parameters (see `run_layout`) and passed to `receive_layout`, which
    converts the `pdfminer` layout tree with `render` and appends the
    resulting page to the current document.

    NOTE(review): the base class is expected to flip the y-coordinate,
    cause pdf counts bottom -> up and we want top -> bottom - confirm
    against `FlippedLayoutAnalyzer`.
    """

    def __init__(
        self,
        config: rawmaker.parameter.ParsingConfiguration = None,
        imagewriter: callable = None,
    ):
        """Create converter instance.

        Args:
            config(ParsingConfiguration): layout to define maximum
                                          spacing between chars, words
                                          and lines.
            imagewriter(callable): listener to receive extracted images
        """
        super().__init__()
        self.laparams, self.second = configure_layout_processor(config)
        self.imagewriter = imagewriter
        if config:
            self.strip = config.strip
        else:
            self.strip = rawmaker.parameter.STRIP
        self.page = 0
        self.document = None

        # TODO: Remove after upgrading pdfminer
        PrecisePDFConverter.render_char = rawmaker.patch.ltchar.render_char
        self.done = utilo.Single()

    def new_document(self):
        """Start a fresh `Document` and forget all rendered strings."""
        self.document = iamraw.Document()
        self.done = utilo.Single()

    def finish_document(self) -> iamraw.Document:
        """Hand out the collected `Document` and detach it from self.

        The document dimension is determined with `page_size` before the
        document is returned.
        """
        finished = self.document
        finished.dimension = page_size(finished)
        self.document = None
        return finished

    def end_page(self, page):
        """Layout the collected page items and forward the result."""
        laid_out = run_layout(self.cur_item, self.laparams, self.second)
        self.cur_item = laid_out  # pylint:disable=attribute-defined-outside-init
        self.pageno += 1
        self.receive_layout(laid_out)

    def receive_layout(self, ltpage):
        """Convert `ltpage` and append it to the current document."""
        super().receive_layout(ltpage)
        converted = render(ltpage, strip=self.strip)
        self.document.pages.append(converted)  # pylint:disable=E1101

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Render a string only once per page and state.

        HACK: pdfminer reads some pdf with images on the page wrong. The
        bug produces duplicated or tripled strings. The extraction does
        not fail but the result is useless.
        """
        matrix = pdfminer.utils.mult_matrix(textstate.matrix, self.ctm)
        fingerprint = hash(f'{self.pageno}{textstate}{matrix}{seq}{ncs}{graphicstate}')  # yapf:disable
        # NOTE(review): presumably `contains` also remembers the value on
        # the first call - confirm against `utilo.Single`.
        if not self.done.contains(fingerprint):
            super().render_string(textstate, seq, ncs, graphicstate)
115
+
116
+
117
def run_layout(page, layout, layout_vertical):
    """Run the layout analysis of `page` in one or two passes.

    Args:
        page: page object with `_objs` and an `analyze` method
        layout: layout parameter of the first pass, skip the analysis
                if not given
        layout_vertical: layout parameter of the second pass; if given,
                         upright and non upright items are analyzed
                         separately
    Returns:
        The passed `page` with updated `_objs`.
    """
    if not layout:
        # no layout analyzation
        return page
    if not layout_vertical:
        page.analyze(layout)
        return page
    upright_items, rotated_items, others = [], [], []
    for obj in page._objs:  # pylint:disable=W0212
        try:
            upright = bool(obj.upright)
        except AttributeError:
            # item without direction, keep as it is
            others.append(obj)
            continue
        # process horizontal and vertical chars separately
        if upright:
            upright_items.append(obj)
        else:
            rotated_items.append(obj)
    # pylint:disable=W0212
    # first pass: horizontal
    page._objs = upright_items
    page.analyze(layout)
    upright_items = page._objs
    # second pass: vertical
    page._objs = rotated_items
    page.analyze(layout_vertical)
    rotated_items = page._objs
    # unite result
    page._objs = upright_items + rotated_items + others
    return page
146
+
147
+
148
def configure_layout_processor(config):
    """Create the layout parameters of both layout passes.

    Detecting horizontal and vertical text container requires to layout
    the objects twice. In further releases of pdfminer this may not be
    required anymore.

    Returns:
        Tuple of first pass and second pass parameters. The second item
        is None if vertical detection is disabled.
    """
    first = rawmaker.parameter.from_config(config)
    if first.detect_vertical:
        second = rawmaker.parameter.from_config(config)
        # disable vertical detection in first layout processing
        first.detect_vertical = False
        return first, second
    return first, None
160
+
161
+
162
def page_size(document: iamraw.Document) -> iamraw.PageSize:
    """Determine the maximum bounding of the document.

    Iterates through the pages and collects the largest width and the
    largest height. A document without pages yields `-utilo.INF` for
    both values.
    """
    # TODO ?support multiple page sizes in document?
    widths, heights = [-utilo.INF], [-utilo.INF]
    for page in document.pages:
        widths.append(page.dimension[2])
        heights.append(page.dimension[3])
    return iamraw.PageSize(max(widths), max(heights))
171
+
172
+
173
def render_char(
    item: pdfminer.layout.LTChar,
    baseline: float,
) -> iamraw.Char:
    """Convert character and determine `fontrise` based on parent `baseline`

    NOTE: Unicode character creates 2 single chars. This can affect
    Bounding-Computation

    Args:
        item(LTChar): single character
        baseline(float): bottom y-coordinate of parent text line
    Returns:
        Converted `iamraw.Char` with `fontsize` and `fontrise`. Items
        without bounding box are returned as `iamraw.VirtualChar`.
    """
    try:
        # layout character due pdfminer changes removes BoundingBox from
        # item, therefore we have to add this again
        bounding = iamraw.BoundingBox(*item.bbox)
    except AttributeError:
        # VirtualChar has no `iamraw.BoundingBox`
        bounding = None
    # receive text
    value = item.get_text()
    # controlling chars
    if not bounding:
        # Example VirtualChar: <LTAnno ' '>
        return iamraw.VirtualChar(value=value)
    # chars with content
    fontsize = utilo.roundme(item.fontsize)
    # distance to bottom y-coordinate
    fontrise = utilo.roundme(baseline - bounding.y1)
    # The threshold belongs to the rise, not to the size of the font:
    # tiny rises are noise and must not be reported as font rise.
    if math.fabs(fontrise) <= FONT_RISE_MIN:
        fontrise = 0.0
    replaced = rawmaker.miner.rawchar.special_char(
        value,
        fontname=item.fontname,
    )
    if replaced is not None:
        # Unicode character: keep original text as `special`
        return rawmaker.miner.rawchar.RawUnicodeChar(
            ltchar=item,
            box=bounding,
            font=item.fontname,
            rise=fontrise,
            size=fontsize,
            special=value,
            value=replaced,
        )
    return rawmaker.miner.rawchar.RawChar(
        ltchar=item,
        box=bounding,
        font=item.fontname,
        rise=fontrise,
        size=fontsize,
        value=value,
    )
235
+
236
+
237
WHITE = (1, 1, 1)
BLACK = (0, 0, 0)


def transparent(char) -> bool:
    """Check if `char` is printed white and therefore not visible.

    A char is transparent if stroking and non stroking color are both
    equal to `WHITE` or if no stroking color is given and the non
    stroking color equals 1. Chars without `ltchar` (VirtualChar) are
    never transparent.
    """
    if not hasattr(char, 'ltchar'):
        # VirtualChar
        return False
    state = char.ltchar.graphicstate
    stroke, fill = state.scolor, state.ncolor
    if stroke == fill and fill == WHITE:
        return True
    return bool(stroke is None and fill == 1)
255
+
256
+
257
def render_textline(
    item: pdfminer.layout.LTTextBox,
    strip: bool = False,
    remove_horizontals: bool = True,
) -> iamraw.Line:
    """Convert a text line and split chars if required, cause the layout
    parser puts two characters together.

    Args:
        item: layout object with the contained LTChar's in `_objs`
        strip: remove white spaces at begin and end of text line
        remove_horizontals: if True remove horizontal lines build out of
                            characters
    Returns:
        iamraw.Line with converted chars or None if the line is a
        removed horizontal.
    """
    result = iamraw.Line(box=item.bbox)
    bottom = item.bbox.y1
    for raw in item._objs:  # pylint: disable=protected-access
        # pylint:disable=E1101
        character = render_char(raw, baseline=bottom)
        if transparent(character):
            # TODO: WRITE TO DEBUG FILE TO INFORM USER ABOUT BAD PRINTED PDF
            utilo.debug(f'white char, skip: {character}')
            result.chars.append(iamraw.VirtualChar(value=' '))
        elif len(character.value) == 1:
            result.chars.append(character)
        else:
            # in some cases the layout parser matches two chars together.
            # Therefore we have to split the character by content and fix
            # the bounding.
            for piece in split_characters(character):
                assert len(piece.value) == 1, piece
                result.chars.append(piece)
    # TODO: CHECK VERTICAL TEXT?
    cleanups = (
        ensure_leftright,  # chars are sorted from left to right
        merge_small_whitespaces,
        merge_special_char,
        fix_fontrise,
    )
    for cleanup in cleanups:
        result.chars = cleanup(result.chars)
    if remove_horizontals and ishorizontal(result.text):
        return None
    if strip:
        result = textline_strip(result)
    return result
304
+
305
+
306
def textline_strip(result):
    """Remove leading and trailing white space chars of the line.

    The horizontal borders of the line box are adjusted to the
    remaining first and last char.
    """
    # remove left
    leading = len(result.text) - len(result.text.lstrip())
    result.chars = result.chars[leading:]
    # remove right
    # +1 would preserve the virtual newline char
    keep = len(result.text.rstrip())
    result.chars = result.chars[:keep]
    if not result.chars:
        return result
    # TODO: ENSURE THAT ONLY A SINGLE LINE IS RENDERED?
    # IF MORE THAN ONE LINE IS RENDERED, LAST CHAR MUST NOT BE THE
    # MOST RIGHT CHAR.
    left = result.chars[0].box.x0
    try:
        right = result.chars[-1].box.x1
    except AttributeError:
        # VirtualChar has no BoundingBox, use one Char before
        # TODO: THIS MAY NOT HAPPEN ANYMORE CAUSE OF THE STRIP ABOVE
        right = result.chars[-2].box.x1
    result.box.x0 = left
    result.box.x1 = right
    # TODO: VERIFY <=
    assert result.box.x0 <= result.box.x1, str(vars(result))
    return result
333
+
334
+
335
def ishorizontal(text: str, mincount=10) -> bool:
    """Check if text line is a char based horizontal line.

    The text must consist of `_`, `-`, `=` and surrounding white space
    only and contain at least `mincount` of these chars.

    >>> ishorizontal('--------------')
    True
    >>> ishorizontal('_______________________')
    True
    >>> ishorizontal('this is a text')
    False
    """
    rulers = '_-='
    leftover = text.translate(str.maketrans('', '', rulers)).strip()
    if leftover:
        # other content than ruler chars
        return False
    total = sum(text.count(ruler) for ruler in rulers)
    return not total < mincount
352
+
353
+
354
def fix_fontrise(items):
    """Workaround for font rise extraction bug.

    In some cases the layout is extracted with font rises which are not
    necessary. There is a single char without font rise and the others
    are layouted with a different y1 position and a font rise. In this
    case the most common rise is removed from the risen chars.

    Returns:
        The passed `items`, chars are modified in place.
    """
    if not items:
        return items
    real = [
        char for char in items if not isinstance(char, iamraw.VirtualChar)
    ]
    if not any(char.rise for char in real):
        # no fix is required
        return items

    def flat(char):
        return utilo.near(char.rise, 0.0, diff=FONT_RISE_MIN)

    zero, non_zero = utilo.partition(key=flat, items=real)
    limit = FIX_FONTRISE_OCCURENCE_MAX(len(items))
    if len(non_zero) > limit:
        # disable font rise for too many false detection?
        # TODO: VERIFY LATER
        for char in items:
            char.rise = 0.0
    if len(zero) != 1 or not non_zero:
        return items
    shift = utilo.mode(char.rise for char in non_zero)
    for char in non_zero:
        char.rise = char.rise - shift
        char.box.y1 = char.box.y1 + shift
    return items
393
+
394
+
395
def ensure_leftright(items):
    """Fix layout parser miss detection.

    Ensure that more left x0 coordinates comes before higher x0
    coordinate. Chars without bounding (virtual chars) are mapped to the
    right border of the char before.

    Args:
        items: list of chars, chars with bounding provide `box`
    Returns:
        New list sorted from left to right or the passed `items` if
        there is nothing to sort.
    """
    # TODO: ENSURE TOP TO DOWN, LOOK AT FONT RISE PROBLEM
    if not items:
        return items
    # Start position for leading virtual chars: right border of the first
    # char with bounding. A linear scan avoids copying the list for every
    # skipped char and can not exceed the recursion limit.
    # TODO: WHY X1 AND NOT X0?
    current = None
    for item in items:
        try:
            current = item.box[2]  # x1
        except AttributeError:
            continue
        break
    if current is None:
        # VirtualChars only
        return items
    # map bounding cause virtual chars has no bounding
    boundings = []
    for item in items:
        try:
            boundings.append((item.box[0], item))  # x0 left border
            current = item.box[2]  # x1 right border
        except AttributeError:
            boundings.append((current, item))
            # more than one virtual char in a row, don't know if possible
            current += 0.1
            current = utilo.roundme(current)
    # sort from left to right, the sort is stable for equal positions
    boundings = sorted(boundings, key=lambda pair: pair[0])
    # remove mapped coordinate
    return [pair[1] for pair in boundings]
434
+
435
+
436
MERGES = {
    'A': 'Ä',
    'a': 'ä',
    'O': 'Ö',
    'o': 'ö',
    'U': 'Ü',
    'u': 'ü',
    # bachelor090:page88 \x0d '\r' R
    'R': '®',
}

# \x0d => \r
SPECIALS = {'¨', '\x0d'}


def merge_special_char(items):  # pylint:disable=R1260
    """Convert `A¨` to `Ä` etc.

    A special char is merged with the following char if the special
    char is stored as `value`, and with the char before if it is stored
    as `special`. See bachelor90 example.

    Returns:
        New list with merged chars, values are modified in place.
    """
    if not items:
        return []
    merged = [items[0]]
    for item in items[1:]:
        previous = merged[-1]
        if previous.value in SPECIALS:
            # try merge with the char after
            if item.value in MERGES:
                merged.pop()
                item.value = MERGES[item.value]
            else:
                utilo.error(f'could not merge with after {item}')
            merged.append(item)
            continue
        if getattr(item, 'special', None) not in SPECIALS:
            merged.append(item)
            continue
        # merge with before
        if previous.value not in MERGES:
            # TODO: REMOVE ERROR LOG LATER
            utilo.debug(f'could not merge with before {item}')
            merged.append(item)
            continue
        previous.value = MERGES[previous.value]
    return merged
488
+
489
+
490
def merge_small_whitespaces(items):
    """Remove unnecessary bad printed white spaces.

    A virtual char is removed if the char after starts inside the
    horizontal range of the char before. First and last item are always
    kept. See bachelor90 example.
    """
    if len(items) < 3:
        return items

    def squeezed(before, after):
        try:
            left, right, start = before.box.x0, before.box.x1, after.box.x0
        except AttributeError:
            # TODO: INVESTIGATE LATER
            # whitespace before or after
            return False
        # ensure to overlap and not merge more than required
        return left <= start <= right

    kept = [items[0]]
    for position in range(1, len(items) - 1):
        current, after = items[position], items[position + 1]
        virtual = isinstance(current, iamraw.VirtualChar)
        if virtual and squeezed(kept[-1], after):
            # remove unnecessary virtual char
            continue
        kept.append(current)
    kept.append(items[-1])
    return kept
519
+
520
+
521
def split_characters(char) -> list:
    """Split character which contains multiple chars. Split given
    BoundingBox and give every splitted character the same space.

    Args:
        char with multiple character in `char.value`
    Returns:
        List of splitted character. The boxes of the splitted chars are
        placed side by side inside the box of `char`.
    """
    result = []
    charbounding = char.box
    charstep = charbounding.x1 - charbounding.x0
    if charstep <= 0.0:
        utilo.error(f'invalid charstep: {charstep}: {charbounding} - {char}')
    assert charstep >= 0.0, f'{charstep}: {charbounding} - {char}'
    count = len(char.value)
    # Every char gets the same share of the common width. Using the
    # whole width as step would place all chars after the first one
    # outside of the common bounding.
    step = charstep / count if count else 0.0
    for index, text in enumerate(char.value):
        copied = copy.deepcopy(char)
        copied.value = text
        # split common BoundingBox of multiple chars to single
        # BoundingBoxes.
        # NOTE: This does not work hundred percent correctly. Imagine if
        # you have the character Z and I together. Z is bigger than I. But
        # that accuracy is fine.
        copied.box = iamraw.BoundingBox.from_list([
            charbounding.x0 + index * step,
            charbounding.y0,
            charbounding.x0 + (index + 1) * step,
            charbounding.y1,
        ])
        result.append(copied)
    return result
553
+
554
+
555
def split_container(
    item: pdfminer.layout.LTTextBox,
    strip: bool = False,
) -> list:
    """Split text box into boxes with a single text direction.

    Args:
        item: text box to iterate lines from
        strip: if True, blank lines separate boxes and are removed
    Returns:
        List of new `LTTextBoxHorizontal` or `LTTextBoxVertical` with
        `index` and `iamraw.BoundingBox` as `bbox`.
    """
    grouped = [[]]
    for line in item:
        if strip and not line.get_text().strip():
            # blank line: start a new group, the line itself is dropped
            grouped.append([])
            continue
        if grouped[-1] and vertical(grouped[-1]) != vertical(line):
            # Direction changes: start a new group and keep the line as
            # first member, otherwise the content of the line is lost.
            grouped.append([])
        grouped[-1].append(line)
    grouped = [group for group in grouped if group]
    # add bounding
    result = []
    for index, group in enumerate(grouped):
        ctor = pdfminer.layout.LTTextBoxHorizontal
        if vertical(group):
            ctor = pdfminer.layout.LTTextBoxVertical
        box = ctor()
        for line in group:
            box.add(line)
        box.index = index
        box.bbox = iamraw.BoundingBox(*box.bbox)
        result.append(box)
    return result
583
+
584
+
585
def render_textcontainer(
    item: pdfminer.layout.LTTextBox,
    strip: bool = False,
) -> iamraw.TextContainer:
    """Convert text box to a list of single line text containers.

    The box is split by `split_container` and every part is rendered
    depending on its text direction. Afterwards every container with
    more or less than one line is replaced by one container per line.
    This prepares to remove the lines concept and handle everything as
    a single line.
    """
    rendered = []
    for part in split_container(item, strip=strip):
        if vertical(part):
            container = render_vertical_textcontainer(part, strip=strip)
        else:
            container = render_horizontal_textcontainer(part, strip=strip)
        rendered.append(container)
    result = []
    for container in rendered:
        if len(container) == 1:
            result.append(container)
        else:
            result.extend(
                container.__class__(box=line.box, lines=[line])
                for line in container)
    return result
607
+
608
+
609
def render_horizontal_textcontainer(
    item: pdfminer.layout.LTTextBox,
    strip: bool = False,
) -> iamraw.TextContainer:
    """Convert horizontal text box to `iamraw.TextContainer`.

    Lines which are rendered to nothing are skipped. The box of the
    container is adjusted to the first line.
    """
    container = iamraw.TextContainer(box=item.bbox)
    for line in item:
        converted = render_textline(line, strip=strip)
        if converted:
            container.append(converted)
    if len(container.lines) == 1:
        # update parent box
        # TODO: ENSURE TO UPDATE MULTILINE BOXES CORRECTLY
        # TODO: COMPUTE BOXES OUT OF MEMBER/CHILDREN/LINES
        container.box = container[0].box
    if container:
        # fix start of container
        # pdfminer extracts the TextContainer bigger than the chars really
        # are. In top(y0) direction, therefore we replace the top boundary
        # with first line boundary.
        container.box.y0 = container[0].box.y0
    return container
631
+
632
+
633
def render_vertical_textcontainer(
    item: pdfminer.layout.LTTextBox,
    strip: bool = False,
) -> iamraw.VerticalTextContainer:
    """Convert vertical text box to `iamraw.VerticalTextContainer`.

    Lines which are rendered to nothing are skipped.
    """
    container = iamraw.VerticalTextContainer(box=item.bbox)
    for line in item:
        converted = render_textline(line, strip=strip)
        if converted:
            container.append(converted)
    return container
644
+
645
+
646
def vertical(item: pdfminer.layout.LTTextBox):
    """Check if any char of the passed line(s) is vertical.

    Args:
        item: iterable of text lines or a single `LTTextLine`
    Returns:
        True if `rawmaker.patch.ltchar.vertical` matches at least one
        char, chars raising AttributeError are ignored.
    """
    # enable checking single lines
    lines = [item] if isinstance(item, pdfminer.layout.LTTextLine) else item
    for line in lines:
        for char in line._objs:  # pylint: disable=protected-access
            try:
                if rawmaker.patch.ltchar.vertical(char):
                    return True
            except AttributeError:
                continue
    return False
657
+
658
+
659
def render(item, strip: bool = False):  # pylint:disable=R1260,too-many-branches
    """Convert layout item to `iamraw` objects.

    Args:
        item: `LTPage` or `LTTextBox`, other items are ignored
        strip: remove white space at begin and end of lines and skip
               lines without content
    Returns:
        `iamraw.Page` for a page, list of text containers for a text box
        and None for every other item.
    """
    if isinstance(item, pdfminer.layout.LTPage):  # pylint:disable=too-many-nested-blocks
        page = iamraw.Page(item.pageid, iamraw.BoundingBox(*item.bbox))
        # TODO: ENSURE ROTATED PAGES?
        for child in item:
            # pylint:disable=E1101
            rendered = render(child, strip=strip)
            if rendered is None:
                continue
            if not isinstance(rendered, list):
                page.append(rendered)
                continue
            for single in rendered:
                # flatten one level of nested lists
                members = single if isinstance(single, list) else [single]
                for member in members:
                    page.append(member)
        return mylayout(page)
    if not isinstance(item, pdfminer.layout.LTTextBox):
        return None
    result = []
    for container in render_textcontainer(item, strip=strip):
        if strip:
            container.lines = [
                line for line in container.lines if line.text.strip()
            ]
            if not container.lines:
                # ignore stripped line
                continue
        result.append(ensure_bounding(container))
    return result
695
+
696
+
697
def ensure_bounding(textcontainer: iamraw.TextContainer):
    """Split container if the lines do not share left and right border.

    Returns:
        The passed container if no split is required or supported,
        otherwise a list of new `iamraw.TextContainer`.
    """
    if len(textcontainer) == 1:
        return textcontainer
    if isinstance(textcontainer, iamraw.VerticalTextContainer):
        # TODO: NOT SUPPORTED YET
        return textcontainer
    # check if splitting bounding container is required or container fits
    # already. Every block collects the indices of lines with the same
    # horizontal borders as the first line of the block.
    blocks = [[0]]
    for index, line in enumerate(textcontainer[1:], start=1):
        anchor = textcontainer[blocks[-1][0]].box
        box = line.box
        if utilo.near(anchor[0], box[0]) and utilo.near(anchor[2], box[2]):
            blocks[-1].append(index)
        else:
            blocks.append([index])
    if len(blocks) == 1:
        # splitting is not required, container fits already
        return textcontainer
    result = []
    for block in blocks:
        # split container into smaller, better fitting containers
        members = [textcontainer[index] for index in block]
        part = iamraw.TextContainer()
        for member in members:
            part.append(member)
        part.box = utilo.rect_max([member.box for member in members])
        result.append(part)
    return result
726
+
727
+
728
def mylayout(page: iamraw.Page) -> iamraw.Page:
    """Merge neighbored text containers of the page.

    Vertical and horizontal containers are merged separately, the
    merged horizontal containers are stored before the vertical ones.
    """
    children = page.children
    if not children:
        return page

    def isvertical(child):
        return isinstance(child, iamraw.VerticalTextContainer)

    verticals, horizontals = utilo.partition(isvertical, children)
    merged_verticals = merge_neighbors(
        verticals,
        horizontal=False,
        ydiff=15.0,
        xdiff=15.0,
    )
    merged_horizontals = merge_neighbors(horizontals)
    page.children = merged_horizontals + merged_verticals
    return page
745
+
746
+
747
def merge_neighbors(
    children: list,
    xdiff: float = 10.0,
    ydiff: float = 5.0,
    horizontal: bool = True,
) -> list:
    """Merge text containers which are located close to each other.

    Args:
        children: text containers to merge
        xdiff: maximum horizontal distance, passed to `require_merge`
        ydiff: maximum vertical distance, passed to `require_merge`
        horizontal: select the sort order of the containers
    Returns:
        List of remaining containers. Merged containers are modified in
        place, containers with duplicated bounding are dropped.
    """
    # TODO: IMPROVE VERTICAL MERGER
    if not children:
        return []
    # ensure to sort items top to bottom and left to right. It is
    # important to connect only neighbored items to avoid conflicts in
    # bounding computation. See: test_mylayout_bounding_extraction_bug
    # Use y1 as lower text line.
    ordered = list(children)
    if horizontal:
        ordered.sort(key=lambda child: child.box[0])  # leftright
        ordered.sort(key=lambda child: child.box[3])  # topdown
    else:
        # vertical
        # bottom up
        ordered.sort(key=lambda child: child.box[1], reverse=True)
        ordered.sort(key=lambda child: child.box[0])  # leftright
    result = [ordered[0]]
    for item in ordered[1:]:
        before = result[-1]
        required = require_merge(
            item.box,
            before=before.box,
            xdiff=xdiff,
            ydiff=ydiff,
        )
        if required is None:
            utilo.error('duplicated bounding, bad printed layout')
            utilo.error(vars(before))
            utilo.error(vars(item))
            continue
        if not required:
            result.append(item)
            continue
        # merge before
        # add virtual char
        tail = before.lines[-1].chars
        tail.append(iamraw.VirtualChar(value=' '))
        tail.extend(item.lines[0].chars)
        before.lines.extend(item.lines[1:])
        # adjust bounding
        if item.box[2] > before.box[2]:
            # ensure that right border is more right than left border.
            # In some cases, formulas for example, it can happen that
            # this constraint is not given.
            before.box = utilo.update_tuple(
                data=tuple(before.box),  # REMOVE TUPLE LATER
                value=item.box[2],
                index=2,
            )
        else:
            utilo.debug('HINT: no bounding box update required')
        before.box = iamraw.BoundingBox(*before.box)
    return result
804
+
805
+
806
def require_merge(
    current: tuple,
    before: tuple,
    xdiff: float,
    ydiff: float,
) -> bool:
    """Should we merge two bounding boxes cause they are very near?

    Problem:
        Some pdf printer produces very bad boundings, sometimes objects
        are completely covered by each other.

    Returns:
        True if `current` starts near the right border of `before` on
        the same height, None if both boxes are nearly equal, otherwise
        False.
    """
    # TODO: MAKE THIS SIZE DEPENDENT
    if not utilo.near(current[3], before[3], diff=ydiff):
        return False
    if utilo.rect_overlapping(current, before) > 0.98:  # TODO: HOLY VALUE
        # nearly equals objects
        # hoverpower.HOME016A_PDF
        # {'box': BoundingBox(x0=292.73, y0=789.45, x1=302.75, y1=799.41), 'lines': [Line(text="16")], 'state': None}
        # {'box': BoundingBox(x0=292.73, y0=789.57, x1=302.75, y1=799.53), 'lines': [Line(text="16")], 'state': None}
        # nearly equal bounding, we skip it. Bad printed pdf.
        # TODO: XXX
        return None
    return bool(utilo.near(current[0], before[2], diff=xdiff))