rawmaker 2.40.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- letty/__init__.py +46 -0
- letty/cli.py +63 -0
- letty/optimizer.py +138 -0
- letty/quality/__init__.py +8 -0
- letty/quality/whitespace.py +50 -0
- letty/strategy.py +8 -0
- rawmaker/__init__.py +29 -0
- rawmaker/__main__.py +13 -0
- rawmaker/__patch__.py +36 -0
- rawmaker/cli.py +206 -0
- rawmaker/cli_automate.py +69 -0
- rawmaker/converter/__init__.py +8 -0
- rawmaker/converter/basic.py +174 -0
- rawmaker/converter/images.py +168 -0
- rawmaker/date.py +83 -0
- rawmaker/destination.py +202 -0
- rawmaker/error.py +34 -0
- rawmaker/features/__init__.py +138 -0
- rawmaker/features/annotation.py +254 -0
- rawmaker/features/border.py +172 -0
- rawmaker/features/boxes.py +153 -0
- rawmaker/features/figures.py +24 -0
- rawmaker/features/fonts.py +229 -0
- rawmaker/features/formula.py +16 -0
- rawmaker/features/horizontals.py +132 -0
- rawmaker/features/images.py +155 -0
- rawmaker/features/line.py +337 -0
- rawmaker/features/outlines.py +123 -0
- rawmaker/features/text.py +91 -0
- rawmaker/fonts/__init__.py +8 -0
- rawmaker/fonts/parser.py +354 -0
- rawmaker/images/__init__.py +8 -0
- rawmaker/images/info.py +35 -0
- rawmaker/miner/__init__.py +8 -0
- rawmaker/miner/char.py +42 -0
- rawmaker/miner/colorspace.py +75 -0
- rawmaker/miner/images.py +448 -0
- rawmaker/miner/position.py +121 -0
- rawmaker/miner/rawchar.py +207 -0
- rawmaker/miner/text.py +833 -0
- rawmaker/miner/underline.py +66 -0
- rawmaker/parameter.py +130 -0
- rawmaker/patch/__init__.py +8 -0
- rawmaker/patch/ltchar.py +79 -0
- rawmaker/reader.py +97 -0
- rawmaker/text/__init__.py +8 -0
- rawmaker/text/chars.py +24 -0
- rawmaker/text/data.py +47 -0
- rawmaker/text/superfast.py +91 -0
- rawmaker/text/wordbox.py +95 -0
- rawmaker/utils.py +44 -0
- rawmaker-2.40.3.dist-info/METADATA +51 -0
- rawmaker-2.40.3.dist-info/RECORD +63 -0
- rawmaker-2.40.3.dist-info/WHEEL +5 -0
- rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
- rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
- rawmaker-2.40.3.dist-info/top_level.txt +3 -0
- spacestation/__init__.py +18 -0
- spacestation/cli.py +51 -0
- spacestation/features/__init__.py +8 -0
- spacestation/features/chardist.py +85 -0
- spacestation/features/worddist.py +57 -0
- spacestation/features/wspace.py +130 -0
rawmaker/miner/text.py
ADDED
|
@@ -0,0 +1,833 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
"""Textminer
|
|
10
|
+
=========
|
|
11
|
+
|
|
12
|
+
Parses pdf document and extracts layouted text components.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import contextlib
|
|
16
|
+
import copy
|
|
17
|
+
import math
|
|
18
|
+
|
|
19
|
+
import configos
|
|
20
|
+
import iamraw
|
|
21
|
+
import pdfminer.converter
|
|
22
|
+
import pdfminer.layout
|
|
23
|
+
import pdfminer.pdfinterp
|
|
24
|
+
import pdfminer.utils
|
|
25
|
+
import utilo
|
|
26
|
+
|
|
27
|
+
import rawmaker.converter.basic
|
|
28
|
+
import rawmaker.miner.rawchar
|
|
29
|
+
import rawmaker.parameter
|
|
30
|
+
import rawmaker.patch.ltchar
|
|
31
|
+
|
|
32
|
+
# Font rises below this threshold are meant to be treated as noise and
# therefore handled as zero. Also used as tolerance when `fix_fontrise`
# decides whether a rise is "near zero".
FONT_RISE_MIN = configos.HV_FLOAT_PLUS(default=1.0)

# Lookup table which `fix_fontrise` calls with the number of chars of a text
# line; the result is the maximum number of chars with a non-zero font rise
# that is accepted before all rises of the line are reset.
# NOTE(review): the pairs look like (line length, allowed occurrences) --
# confirm the interpolation/lookup semantics against `configos.HolyTable`.
FIX_FONTRISE_OCCURENCE_MAX = configos.HolyTable(items=[
    (1, 1),
    (2, 2),
    (3, 3),
    (4, 4),
    (5, 5),
    (20, 5),
    (40, 10),
    (60, 15),
])
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class PrecisePDFConverter(rawmaker.converter.basic.FlippedLayoutAnalyzer):
    """Parse PDF files based on the given layout definition `laparams`.

    The `PrecisePDFConverter` processes every single page and runs the
    `receive_layout` method for each extracted page. Within this method
    every character, text box and text container is converted from
    `pdfminer` to the own format. The y-coordinate is flipped because pdf
    uses bottom -> up and we want to use top -> bottom.
    """

    def __init__(
        self,
        config: rawmaker.parameter.ParsingConfiguration = None,
        imagewriter: callable = None,
    ):
        """Create converter instance.

        Args:
            config(ParsingConfiguration): layout to define maximum
                                          spacing between chars, words
                                          and lines. If omitted, the strip
                                          default `rawmaker.parameter.STRIP`
                                          is used.
            imagewriter(callable): listener to receive extracted images
        """
        super().__init__()
        # `second` is the layout for the vertical pass; it is None when
        # vertical detection is disabled (see `configure_layout_processor`).
        self.laparams, self.second = configure_layout_processor(config)
        self.imagewriter = imagewriter
        self.strip = config.strip if config else rawmaker.parameter.STRIP
        self.page = 0
        self.document = None

        # TODO: Remove after upgrading pdfminer
        # NOTE: this patches the class attribute, so it affects every
        # instance, not only the one being constructed.
        PrecisePDFConverter.render_char = rawmaker.patch.ltchar.render_char
        # Registry consulted by `render_string` to skip duplicated strings.
        # NOTE(review): nothing in this class adds hashes explicitly;
        # presumably `utilo.Single.contains` registers unseen values --
        # confirm against `utilo`.
        self.done = utilo.Single()

    def new_document(self):
        """Clear the current `Document` and initialize a new one"""
        self.document = iamraw.Document()
        self.done = utilo.Single()

    def finish_document(self) -> iamraw.Document:
        """Return the current `Document` and clear the current one

        The document dimension is computed out of the collected pages right
        before the document is handed out. Requires a preceding call of
        `new_document`, otherwise `self.document` is None.
        """
        document = self.document
        document.dimension = page_size(document)
        self.document = None
        return document

    def end_page(self, page):
        """Analyze the layout of the collected page and convert it.

        The `page` argument is not used; the method works on
        `self.cur_item`.
        """
        self.cur_item = run_layout(  # pylint:disable=attribute-defined-outside-init
            self.cur_item,
            self.laparams,
            self.second,
        )
        # NOTE(review): `pageno` is not defined in this class; presumably
        # it is provided by the pdfminer base class -- confirm.
        self.pageno += 1
        self.receive_layout(self.cur_item)

    def receive_layout(self, ltpage):
        """Convert the analyzed `ltpage` and append it to the document."""
        super().receive_layout(ltpage)
        page = render(ltpage, strip=self.strip)
        self.document.pages.append(page)  # pylint:disable=E1101

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Render a string unless an identical one was already rendered.

        Identity is determined by hashing the page number, text state,
        transformed text matrix, sequence, color space and graphic state.
        """
        # HACK: PDFMINER READS SOME PDF WITH IMAGES ON PAGE WRONG
        # THE BUG PRODUCES DUPLICATED OR TRIPLED STRINGS. THE EXTRACTION
        # DOES NOT FAIL BUT THE RESULT IS USELESS.
        matrix = pdfminer.utils.mult_matrix(textstate.matrix, self.ctm)
        hashed = hash(f'{self.pageno}{textstate}{matrix}{seq}{ncs}{graphicstate}')  # yapf:disable
        if self.done.contains(hashed):
            return
        super().render_string(textstate, seq, ncs, graphicstate)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def run_layout(page, layout, layout_vertical):
    """Run the layout analysis on `page`, optionally in two passes.

    Args:
        page: layout container providing `_objs` and `analyze`
        layout: parameters of the (horizontal) analysis; a falsy value
                disables the analysis completely
        layout_vertical: parameters of the second, vertical analysis; a
                         falsy value selects a single pass with `layout`
    Returns:
        The passed `page`, modified in place.
    """
    # pylint:disable=W0212
    if not layout:
        # no layout analysis requested
        return page
    if not layout_vertical:
        page.analyze(layout)
        return page
    # process upright and non-upright chars separately; objects without an
    # `upright` flag take part in neither pass
    upright, rotated, others = [], [], []
    for obj in page._objs:
        try:
            bucket = upright if obj.upright else rotated
        except AttributeError:
            others.append(obj)
        else:
            bucket.append(obj)
    # first pass: horizontal
    page._objs = upright
    page.analyze(layout)
    first_pass = page._objs
    # second pass: vertical
    page._objs = rotated
    page.analyze(layout_vertical)
    # unite both results with the untouched objects
    page._objs = first_pass + page._objs + others
    return page
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def configure_layout_processor(config):
    """Create the layout parameters for the horizontal and vertical pass.

    Detecting horizontal and vertical text containers requires running
    the layout analysis twice. Further releases of pdfminer may not
    require this anymore.

    Returns:
        Tuple `(laparams, layout_vertical)`. `layout_vertical` is None if
        the configuration does not request vertical detection; otherwise
        `laparams` has its `detect_vertical` flag switched off.
    """
    first_pass = rawmaker.parameter.from_config(config)
    if first_pass.detect_vertical:
        second_pass = rawmaker.parameter.from_config(config)
        # the first layout pass runs without vertical detection
        first_pass.detect_vertical = False
        return first_pass, second_pass
    return first_pass, None
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def page_size(document: iamraw.Document) -> iamraw.PageSize:
    """Determine the maximum bounding of the document.

    Iterates through the pages and takes the largest width
    (`dimension[2]`) and the largest height (`dimension[3]`). A document
    without pages yields `-utilo.INF` for both values.
    """
    # TODO ?support multiple page sizes in document?
    extents = [(page.dimension[2], page.dimension[3])
               for page in document.pages]
    widest = max((extent[0] for extent in extents), default=-utilo.INF)
    tallest = max((extent[1] for extent in extents), default=-utilo.INF)
    return iamraw.PageSize(widest, tallest)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def render_char(
    item: pdfminer.layout.LTChar,
    baseline: float,
) -> iamraw.Char:
    """Convert character and determine `fontrise` based on parent `baseline`

    NOTE: Unicode character creates 2 single chars. This can affect
          Bounding-Computation

    Args:
        item(LTChar): single character
        baseline(float): bottom y-coordinate of parent text line
    Returns:
        `iamraw.VirtualChar` for items without bounding (for example
        `<LTAnno ' '>`), otherwise a `RawChar`/`RawUnicodeChar` with
        `fontsize` and `fontrise`.
    """
    try:
        # the layout character loses its BoundingBox due to pdfminer
        # changes, therefore we have to add it again
        bounding = iamraw.BoundingBox(*item.bbox)
    except AttributeError:
        # VirtualChar has no `iamraw.BoundingBox`
        bounding = None
    # receive text
    value = item.get_text()
    # controlling chars
    if not bounding:
        # Example VirtualChar: <LTAnno ' '>
        return iamraw.VirtualChar(value=value)
    # chars with content
    fontsize = utilo.roundme(item.fontsize)
    # distance to bottom y-coordinate
    fontrise = utilo.roundme(baseline - bounding.y1)
    # The threshold suppresses noise in the char font rise, therefore it
    # has to be compared with the rise. The former comparison against
    # `fontsize` never removed noisy rises and dropped the rise of every
    # char with a font size below the threshold.
    if math.fabs(fontrise) <= FONT_RISE_MIN:
        fontrise = 0.0
    replaced = rawmaker.miner.rawchar.special_char(
        value,
        fontname=item.fontname,
    )
    if replaced is not None:  # pylint:disable=W0160
        # Unicode character: keep the original text as `special`
        return rawmaker.miner.rawchar.RawUnicodeChar(
            ltchar=item,
            box=bounding,
            font=item.fontname,
            rise=fontrise,
            size=fontsize,
            special=value,
            value=replaced,
        )
    return rawmaker.miner.rawchar.RawChar(
        ltchar=item,
        box=bounding,
        font=item.fontname,
        rise=fontrise,
        size=fontsize,
        value=value,
    )
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
WHITE = (1, 1, 1)
BLACK = (0, 0, 0)


def transparent(char) -> bool:
    """Check if `char` is printed in white.

    Reads the stroking (`scolor`) and non-stroking (`ncolor`) color of the
    graphic state of `char.ltchar`. The char counts as transparent if both
    colors equal `WHITE`, or if there is no stroking color and the
    non-stroking color equals 1 (presumably gray level white). Chars
    without `ltchar`, e.g. VirtualChar, are never transparent.
    """
    try:
        ltchar = char.ltchar
    except AttributeError:
        # VirtualChar
        return False
    state = ltchar.graphicstate
    stroke, fill = state.scolor, state.ncolor
    if stroke is None:
        return bool(fill == 1)
    return bool(stroke == fill == WHITE)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def render_textline(
    item: pdfminer.layout.LTTextBox,
    strip: bool = False,
    remove_horizontals: bool = True,
) -> iamraw.Line:
    """Convert the chars of a text line and repair common layout defects.

    Chars which carry more than one character are split because the
    layout parser sometimes puts two characters together. White printed
    chars are replaced by a virtual blank.

    Args:
        item: layout item with a list of contained LTChar's (`_objs`)
        strip: remove white spaces at begin and end of text line
        remove_horizontals: if True remove horizontal lines built out of
                            characters
    Returns:
        iamraw.Line with converted chars, or None if the line is a char
        based horizontal line and `remove_horizontals` is set.
    """
    line = iamraw.Line(box=item.bbox)
    bottom = item.bbox.y1
    for raw in item._objs:  # pylint: disable=protected-access
        # pylint:disable=E1101
        converted = render_char(raw, baseline=bottom)
        if transparent(converted):
            # TODO: WRITE TO DEBUG FILE TO INFORM USER ABOUT BAD PRINTED PDF
            utilo.debug(f'white char, skip: {converted}')
            line.chars.append(iamraw.VirtualChar(value=' '))
        elif len(converted.value) == 1:
            line.chars.append(converted)
        else:
            # In some cases the layout parser matches two chars together.
            # Split the character by content and fix the bounding.
            for piece in split_characters(converted):
                assert len(piece.value) == 1, piece
                line.chars.append(piece)
    # TODO: CHECK VERTICAL TEXT?
    repairs = (
        ensure_leftright,  # chars are sorted from left to right
        merge_small_whitespaces,
        merge_special_char,
        fix_fontrise,
    )
    for repair in repairs:
        line.chars = repair(line.chars)
    if remove_horizontals and ishorizontal(line.text):
        return None
    return textline_strip(line) if strip else line
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def textline_strip(result):
    """Remove leading and trailing white space chars of a text line.

    The chars are cut according to the stripped `result.text` and the
    horizontal borders of `result.box` are adjusted to the remaining first
    and last char. `result` is modified in place and returned.
    """
    # cut left side
    leading = len(result.text) - len(result.text.lstrip())
    result.chars = result.chars[leading:]
    # cut right side; the virtual newline char is not preserved
    remaining = len(result.text.rstrip())
    result.chars = result.chars[:remaining]
    if not result.chars:
        return result
    # TODO: ENSURE THAT ONLY A SINGLE LINE IS RENDERED?
    # IF MORE THAN ONE LINE IS RENDERED, LAST CHAR MUST NOT BE THE
    # MOST RIGHT CHAR.
    left = result.chars[0].box.x0
    try:
        right = result.chars[-1].box.x1
    except AttributeError:
        # VirtualChar has no BoundingBox, use one Char before
        # TODO: THIS MAY NOT HAPPEN ANYMORE CAUSE OF THE STRIP ABOVE
        right = result.chars[-2].box.x1
    result.box.x0 = left
    result.box.x1 = right
    # TODO: VERIFY <=
    assert result.box.x0 <= result.box.x1, str(vars(result))
    return result
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def ishorizontal(text: str, mincount=10) -> bool:
    """Check if text line is a char based horizontal line.

    A horizontal line consists only of `_`, `-`, `=` and white space and
    contains at least `mincount` of these line characters.

    >>> ishorizontal('--------------')
    True
    >>> ishorizontal('_______________________')
    True
    >>> ishorizontal('this is a text')
    False
    """
    symbols = '_-='
    remainder = text.translate(str.maketrans('', '', symbols)).strip()
    amount = sum(text.count(symbol) for symbol in symbols)
    return not remainder and amount >= mincount
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def fix_fontrise(items):
    """Workaround for font rise extraction bug.

    In some cases the layout is extracted with font rises which are not
    necessary. There is a single char without font rise and the others
    are laid out with a different y1 position and a font rise.

    Args:
        items: chars of a single text line; may contain VirtualChar
    Returns:
        The passed `items`; the contained chars are modified in place.
    """
    if not items:
        return items
    non_virtual = [
        item for item in items if not isinstance(item, iamraw.VirtualChar)
    ]
    rises = [item for item in non_virtual if item.rise]
    if not rises:
        # no fix is required
        return items
    # NOTE(review): presumably `utilo.partition` returns the items matching
    # `key` first and the others second -- confirm against `utilo`.
    zero, non_zero = utilo.partition(
        key=lambda item: utilo.near(
            item.rise,
            0.0,
            diff=FONT_RISE_MIN,
        ),
        items=non_virtual,
    )
    # the limit is looked up with the count of all chars, virtual included
    fix_fontrise_occurence_max = FIX_FONTRISE_OCCURENCE_MAX(len(items))
    if len(non_zero) > fix_fontrise_occurence_max:
        # disable font rise for too many false detection?
        # TODO: VERIFY LATER
        # NOTE(review): this also assigns `rise` to VirtualChar items and
        # does not return; the code below then works on the reset rises.
        for item in items:
            item.rise = 0.0
    if len(zero) != 1:
        # the workaround only handles exactly one char without font rise
        return items
    if not non_zero:
        return items
    # shift all raised chars by the most common rise
    mode = utilo.mode(item.rise for item in non_zero)
    for item in non_zero:
        item.rise = item.rise - mode
        item.box.y1 = item.box.y1 + mode
    return items
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def ensure_leftright(items):
    """Fix layout parser misdetection.

    Ensure that items with a lower x0 coordinate come before items with a
    higher x0 coordinate. Items without bounding (virtual chars) are
    sorted by the right border of their predecessor.
    """
    # TODO: ENSURE TOP TO DOWN, LOOK AT FONT RISE PROBLEM
    if not items:
        return items

    def first_right_border(candidates):
        # right border (x1) of the first item which owns a bounding
        # TODO: WHY X1 AND NOT X0?
        for candidate in candidates:
            with contextlib.suppress(AttributeError):
                return candidate.box[2]
        return None

    cursor = first_right_border(items)
    if cursor is None:
        # VirtualChars only
        return items
    # map a sort coordinate to every item, virtual chars have no bounding
    keyed = []
    for item in items:
        try:
            keyed.append((item.box[0], item))  # x0 left border
            cursor = item.box[2]  # x1 right border
        except AttributeError:
            keyed.append((cursor, item))
            # more than one virtual char in a row, don't know if possible
            cursor = utilo.roundme(cursor + 0.1)
    # sort from left to right and drop the mapped coordinate
    keyed.sort(key=lambda pair: pair[0])
    return [item for _, item in keyed]
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
# Maps a base letter to the character which replaces it when
# `merge_special_char` combines the letter with a neighbored special char.
MERGES = {
    'A': 'Ä',
    'a': 'ä',
    'O': 'Ö',
    'o': 'ö',
    'U': 'Ü',
    'u': 'ü',
    # bachelor090:page88 \x0d '\r' R
    'R': '®',
}

# Special chars which `merge_special_char` tries to combine with a
# neighbored letter: the diaeresis and \x0d (carriage return, '\r').
SPECIALS = {'¨', '\x0d'}
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def merge_special_char(items):  # pylint:disable=R1260
    """Convert `A¨` to `Ä` etc.

    A special char (see `SPECIALS`) is combined with the letter next to
    it according to `MERGES`. Both orders are supported: a special char
    followed by the letter, and a letter followed by a char whose
    `special` attribute is a special char. If no merge is defined, both
    chars are kept.

    See bachelor90 example.
    """
    if not items:
        return []
    merged = [items[0]]
    for current in items[1:]:
        if merged[-1].value in SPECIALS:
            # special char first: try to merge it into the current char
            combined = MERGES.get(current.value)
            if combined is None:
                utilo.error(f'could not merge with after {current}')
            else:
                merged.pop()
                current.value = combined
            merged.append(current)
            continue
        if getattr(current, 'special', None) not in SPECIALS:
            merged.append(current)
            continue
        # special char second: merge it into the char before
        combined = MERGES.get(merged[-1].value)
        if combined is None:
            # TODO: REMOVE ERROR LOG LATER
            utilo.debug(f'could not merge with before {current}')
            merged.append(current)
        else:
            merged[-1].value = combined
    return merged
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
def merge_small_whitespaces(items):
    """Remove unnecessary, badly printed white spaces.

    A virtual char between two chars is dropped if the left border of the
    following char lies within the horizontal extent of the char kept
    before. The first and the last item are always kept.

    See bachelor90 example.
    """
    if len(items) < 3:
        return items
    kept = [items[0]]
    for current, follower in zip(items[1:], items[2:]):
        if isinstance(current, iamraw.VirtualChar):
            try:
                left = kept[-1].box.x0
                right = kept[-1].box.x1
                follower_left = follower.box.x0
            except AttributeError:
                # TODO: INVESTIGATE LATER
                # white space before or after, keep the virtual char
                pass
            else:
                if left <= follower_left <= right:
                    # neighbors overlap, the virtual char is unnecessary
                    continue
        kept.append(current)
    kept.append(items[-1])
    return kept
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def split_characters(char) -> list:
    """Split character which contains multiple chars. Split given
    BoundingBox and give every split character the same space.

    Args:
        char with multiple character in `char.value`
    Returns:
        List of split characters; every entry is a deep copy of `char`
        with a single character as value and an equally sized slice of
        the original bounding. An empty value yields an empty list.
    Raises:
        AssertionError: if the bounding of `char` has a negative width.
    """
    content = char.value
    charbounding = char.box
    width = charbounding.x1 - charbounding.x0
    if width <= 0.0:
        utilo.error(f'invalid charstep: {width}: {charbounding} - {char}')
    assert width >= 0.0, f'{width}: {charbounding} - {char}'
    if not content:
        return []
    # Every character receives the same share of the common width. The
    # former code used the complete width as step, therefore the first
    # piece covered the whole bounding and the others left it.
    charstep = width / len(content)
    result = []
    for index, text in enumerate(content):
        copied = copy.deepcopy(char)
        copied.value = text
        # NOTE: This does not work a hundred percent correctly. Imagine
        # the characters Z and I together. Z is wider than I. But that
        # accuracy is fine.
        copied.box = iamraw.BoundingBox.from_list([
            charbounding.x0 + index * charstep,
            charbounding.y0,
            charbounding.x0 + (index + 1) * charstep,
            charbounding.y1,
        ])
        result.append(copied)
    return result
|
|
553
|
+
|
|
554
|
+
|
|
555
|
+
def split_container(
    item: pdfminer.layout.LTTextBox,
    strip: bool = False,
) -> list:
    """Split a text box into boxes of uniform orientation.

    A new box is started whenever the orientation (see `vertical`) of a
    line differs from the lines collected before. If `strip` is set,
    blank lines act as separator and are removed.

    Args:
        item: text box to split, iterated line by line
        strip: treat lines without visible content as separator
    Returns:
        List of `LTTextBoxHorizontal`/`LTTextBoxVertical`, numbered via
        `index` and equipped with an `iamraw.BoundingBox` as `bbox`.
    """
    groups = [[]]
    for line in item:
        if strip and not line.get_text().strip():
            # blank line: close the current group and drop the line
            groups.append([])
            continue
        if groups[-1] and vertical(groups[-1]) != vertical(line):
            # Orientation changed: the line opens the next group. The
            # former code opened the group but lost the line itself.
            groups.append([])
        groups[-1].append(line)
    groups = [group for group in groups if group]
    # build text boxes, adding the lines computes the bounding
    result = []
    for index, group in enumerate(groups):
        if vertical(group):
            box = pdfminer.layout.LTTextBoxVertical()
        else:
            box = pdfminer.layout.LTTextBoxHorizontal()
        for line in group:
            box.add(line)
        box.index = index
        box.bbox = iamraw.BoundingBox(*box.bbox)
        result.append(box)
    return result
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def render_textcontainer(
    item: pdfminer.layout.LTTextBox,
    strip: bool = False,
) -> iamraw.TextContainer:
    """Convert a text box into a list of single line text containers.

    The box is split by orientation (see `split_container`) and every
    part is rendered as vertical or horizontal container. Containers
    which do not hold exactly one line are replaced by one container of
    the same class per line; empty containers vanish.

    Returns:
        List of text containers (despite the annotation).
    """
    rendered = []
    for part in split_container(item, strip=strip):
        if vertical(part):
            rendered.append(render_vertical_textcontainer(part, strip=strip))
        else:
            rendered.append(
                render_horizontal_textcontainer(part, strip=strip))
    # Ensure that all TextContainer have only one line. Prepare to remove
    # lines concept and handle everything as a single line.
    result = []
    for container in rendered:
        if len(container) == 1:
            result.append(container)
        else:
            result.extend(
                container.__class__(box=line.box, lines=[line])
                for line in container)
    return result
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
def render_horizontal_textcontainer(
    item: pdfminer.layout.LTTextBox,
    strip: bool = False,
) -> iamraw.TextContainer:
    """Render the lines of `item` into an `iamraw.TextContainer`.

    Lines for which `render_textline` returns nothing are skipped. The
    bounding of the container is adjusted to its content afterwards.
    """
    container = iamraw.TextContainer(box=item.bbox)
    for converted in (render_textline(line, strip=strip) for line in item):
        if converted:
            container.append(converted)
    if len(container.lines) == 1:
        # a single line defines the box of its parent
        # TODO: ENSURE TO UPDATE MULTILINE BOXES CORRECTLY
        # TODO: COMPUTE BOXES OUT OF MEMBER/CHILDREN/LINES
        container.box = container[0].box
    if container:
        # pdfminer extracts the TextContainer bigger than the chars really
        # are, in top (y0) direction. Therefore the top boundary is
        # replaced with the boundary of the first line.
        container.box.y0 = container[0].box.y0
    return container
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
def render_vertical_textcontainer(
    item: pdfminer.layout.LTTextBox,
    strip: bool = False,
) -> iamraw.VerticalTextContainer:
    """Render the lines of `item` into an `iamraw.VerticalTextContainer`.

    Lines for which `render_textline` returns nothing are skipped. The
    bounding of `item` is taken over unchanged.
    """
    container = iamraw.VerticalTextContainer(box=item.bbox)
    for converted in (render_textline(line, strip=strip) for line in item):
        if converted:
            container.append(converted)
    return container
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
def vertical(item: pdfminer.layout.LTTextBox):
    """Check if any char of `item` is vertical.

    `item` is either a single text line or an iterable of text lines.
    Every object of a line is checked with `rawmaker.patch.ltchar.vertical`;
    objects raising AttributeError during the check are ignored.
    """
    # enable checking single lines
    lines = [item] if isinstance(item, (pdfminer.layout.LTTextLine)) else item
    for line in lines:
        for char in line._objs:  # pylint: disable=protected-access
            try:
                found = rawmaker.patch.ltchar.vertical(char)
            except AttributeError:
                continue
            if found:
                return True
    return False
|
|
657
|
+
|
|
658
|
+
|
|
659
|
+
def render(item, strip: bool = False):  # pylint:disable=R1260,too-many-branches
    """Convert a pdfminer layout item into the own format.

    Args:
        item: `LTPage` or `LTTextBox`; other items are ignored
        strip: remove white space of text lines and drop lines without
               visible content
    Returns:
        `iamraw.Page` for a page, a list of text containers for a text
        box, None for every other item.
    """
    if isinstance(item, pdfminer.layout.LTPage):
        page = iamraw.Page(item.pageid, iamraw.BoundingBox(*item.bbox))
        # TODO: ENSURE ROTATED PAGES?
        for child in item:
            # pylint:disable=E1101
            rendered = render(child, strip=strip)
            if rendered is None:
                continue
            if not isinstance(rendered, list):
                page.append(rendered)
                continue
            # flatten up to two levels of lists
            for entry in rendered:
                nested = entry if isinstance(entry, list) else [entry]
                for pageitem in nested:
                    page.append(pageitem)
        return mylayout(page)
    if not isinstance(item, pdfminer.layout.LTTextBox):
        return None
    result = []
    for container in render_textcontainer(item, strip=strip):
        if strip:
            container.lines = [
                line for line in container.lines if line.text.strip()
            ]
        if not container.lines:
            # ignore stripped line
            continue
        result.append(ensure_bounding(container))
    return result
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
def ensure_bounding(textcontainer: iamraw.TextContainer):
    """Split a container whose lines do not share the same horizontal extent.

    Consecutive lines are grouped as long as their left and right border
    are near the borders of the first line of the group. Single line and
    vertical containers are returned unchanged.

    Returns:
        The passed container if no split is required, otherwise a list of
        new `iamraw.TextContainer`, one per group.
    """
    if len(textcontainer) == 1:
        return textcontainer
    if isinstance(textcontainer, iamraw.VerticalTextContainer):
        # TODO: NOT SUPPORTED YET
        return textcontainer
    # collect line indices of blocks with matching left and right border
    blocks = [[0]]
    for index, line in enumerate(textcontainer[1:], start=1):
        anchor = textcontainer[blocks[-1][0]].box
        box = line.box
        aligned = (utilo.near(anchor[0], box[0])
                   and utilo.near(anchor[2], box[2]))
        if aligned:
            blocks[-1].append(index)
        else:
            blocks.append([index])
    if len(blocks) == 1:
        # splitting is not required, container fits already
        return textcontainer
    # split container into smaller, better fitting containers
    result = []
    for block in blocks:
        members = [textcontainer[index] for index in block]
        part = iamraw.TextContainer()
        for member in members:
            part.append(member)
        part.box = utilo.rect_max([member.box for member in members])
        result.append(part)
    return result
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
def mylayout(page: iamraw.Page) -> iamraw.Page:
    """Merge neighbored text containers of the page.

    Vertical and horizontal containers are merged separately (vertical
    ones with a tolerance of 15.0 in both directions) and written back as
    horizontal containers followed by vertical containers. A page without
    children is returned untouched.
    """
    children = page.children
    if not children:
        return page
    upright, flat = utilo.partition(
        lambda x: isinstance(x, iamraw.VerticalTextContainer),
        children,
    )
    merged_upright = merge_neighbors(
        upright,
        horizontal=False,
        ydiff=15.0,
        xdiff=15.0,
    )
    merged_flat = merge_neighbors(flat)
    page.children = merged_flat + merged_upright
    return page
|
|
745
|
+
|
|
746
|
+
|
|
747
|
+
def merge_neighbors(
    children: list,
    xdiff: float = 10.0,
    ydiff: float = 5.0,
    horizontal: bool = True,
) -> list:
    """Merge text containers which are direct neighbors.

    The containers are sorted, afterwards every container is compared with
    the last container of the result (see `require_merge`). A neighbor is
    merged into its predecessor: its first line is appended to the last
    line of the predecessor, separated by a virtual blank, further lines
    are taken over as they are. The merged containers are modified in
    place.

    Args:
        children: text containers of a page
        xdiff: horizontal tolerance passed to `require_merge`
        ydiff: vertical tolerance passed to `require_merge`
        horizontal: selects the sort order, see below
    Returns:
        List of remaining containers; containers with a nearly identical
        bounding as their predecessor are dropped.
    """
    # TODO: IMPROVE VERTICAL MERGER
    if not children:
        return []
    # Ensure to sort items top to bottom and left to right. It is
    # important to connect only neighbored items to avoid conflicts in
    # bounding computation. See: test_mylayout_bounding_extraction_bug
    # Use y1 as lower text line.
    # Both branches rely on the stable sort: the second sort defines the
    # primary order, the first one the order of equal items.
    if horizontal:
        children = sorted(children, key=lambda x: x.box[0])  # leftright
        children = sorted(children, key=lambda x: x.box[3])  # topdown
    else:
        # vertical
        # bottom up
        children = sorted(children, key=lambda x: x.box[1], reverse=True)
        children = sorted(children, key=lambda x: x.box[0])  # leftright
    result = [children[0]]
    for item in children[1:]:
        before = result[-1]
        if required := require_merge(
                item.box,
                before=before.box,
                xdiff=xdiff,
                ydiff=ydiff,
        ):
            # merge before
            # add virtual char
            before.lines[-1].chars.append(iamraw.VirtualChar(value=' '))
            before.lines[-1].chars.extend(item.lines[0].chars)
            if len(item.lines) >= 2:
                before.lines.extend(item.lines[1:])
            # adjust bounding
            if item.box[2] > before.box[2]:
                # ensure that right border is more right than left border.
                # In some cases, formulas for example, it can happen that
                # this constraint is not given.
                before.box = utilo.update_tuple(
                    data=tuple(before.box),  # REMOVE TUPLE LATER
                    value=item.box[2],
                    index=2,
                )
            else:
                utilo.debug('HINT: no bounding box update required')
            # NOTE(review): presumably converts the tuple produced by
            # `utilo.update_tuple` back into a BoundingBox -- confirm.
            before.box = iamraw.BoundingBox(*before.box)
            continue
        # `require_merge` answers None for nearly identical boundings;
        # such an item is reported and dropped.
        if required is None:
            utilo.error('duplicated bounding, bad printed layout')
            utilo.error(vars(before))
            utilo.error(vars(item))
        else:
            result.append(item)
    return result
|
|
804
|
+
|
|
805
|
+
|
|
806
|
+
def require_merge(
    current: tuple,
    before: tuple,
    xdiff: float,
    ydiff: float,
) -> bool:
    """Should we merge two bounding boxes because they are very near?

    Problem:
        Some pdf printers produce very bad boundings, sometimes objects
        are completely covered by each other.

    Args:
        current: bounding (x0, y0, x1, y1) of the item to check
        before: bounding of the preceding item
        xdiff: tolerance between left border of `current` and right
               border of `before`
        ydiff: tolerance between the lower borders (y1) of both items
    Returns:
        True to merge, False to keep both items, None if both boundings
        are nearly identical (overlap above 0.98).
    """
    # TODO: MAKE THIS SIZE DEPENDENT
    if not utilo.near(current[3], before[3], diff=ydiff):
        return False
    if utilo.rect_overlapping(current, before) > 0.98:  # TODO: HOLY VALUE
        # nearly equal objects, bad printed pdf, the caller skips it
        # hoverpower.HOME016A_PDF
        # {'box': BoundingBox(x0=292.73, y0=789.45, x1=302.75, y1=799.41), 'lines': [Line(text="16")], 'state': None}
        # {'box': BoundingBox(x0=292.73, y0=789.57, x1=302.75, y1=799.53), 'lines': [Line(text="16")], 'state': None}
        # TODO: XXX
        return None
    return bool(utilo.near(current[0], before[2], diff=xdiff))
|