groupmes 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groupmes/__init__.py +21 -0
- groupmes/__main__.py +13 -0
- groupmes/__patch__.py +8 -0
- groupmes/border/__init__.py +8 -0
- groupmes/border/leftright.py +315 -0
- groupmes/border/most.py +70 -0
- groupmes/cli.py +81 -0
- groupmes/feature/__init__.py +8 -0
- groupmes/feature/area.py +221 -0
- groupmes/feature/border.py +101 -0
- groupmes/feature/content.py +52 -0
- groupmes/feature/distance.py +237 -0
- groupmes/feature/hefopa.py +126 -0
- groupmes/path.py +34 -0
- groupmes-1.1.0.dist-info/METADATA +28 -0
- groupmes-1.1.0.dist-info/RECORD +19 -0
- groupmes-1.1.0.dist-info/WHEEL +5 -0
- groupmes-1.1.0.dist-info/entry_points.txt +2 -0
- groupmes-1.1.0.dist-info/top_level.txt +1 -0
groupmes/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
|
|
10
|
+
import importlib.metadata
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
import groupmes.__patch__
|
|
14
|
+
import groupmes.path
|
|
15
|
+
|
|
16
|
+
# Distribution name; used below to look up the installed version metadata.
PACKAGE = 'groupmes'
# Process name; `groupmes.cli` uses it as feature pack name and as producer
# of the `area_area` result file.
# NOTE(review): differs from PACKAGE ('groupme' vs. 'groupmes') - confirm
# that this is intended.
PROCESS = 'groupme'

# Version of the installed distribution named by PACKAGE.
__version__ = importlib.metadata.version(PACKAGE)

# Absolute path of the directory one level above this package directory.
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
groupmes/__main__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
|
|
10
|
+
from groupmes.cli import main
|
|
11
|
+
|
|
12
|
+
# Start the command line interface only when this module is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
|
groupmes/__patch__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""left right border detector
|
|
10
|
+
==========================
|
|
11
|
+
|
|
12
|
+
The `left right border detector` (lrbd) separates borders of the left
|
|
13
|
+
and right page especially which are used for books with different border
|
|
14
|
+
width for left and right page.
|
|
15
|
+
|
|
16
|
+
As a result of different left-right borders there are alternating border
|
|
17
|
+
widths which we detect. On single pages there are no alternating borders.
|
|
18
|
+
|
|
19
|
+
Currently there are two strategies to detect different page borders:
|
|
20
|
+
|
|
21
|
+
* simple approach
|
|
22
|
+
* raising edge
|
|
23
|
+
|
|
24
|
+
Simple Approach
|
|
25
|
+
~~~~~~~~~~~~~~~
|
|
26
|
+
|
|
27
|
+
Some documents have exceptions on some pages. We handle this via allowed
|
|
28
|
+
errors defined with the HolyValues `FIRSTSECOND_ERROR_COUNT_MAX` and
|
|
29
|
+
`MIXED_ERROR_MIN`.
|
|
30
|
+
|
|
31
|
+
Raising Edge
|
|
32
|
+
~~~~~~~~~~~~
|
|
33
|
+
|
|
34
|
+
The change of the text feed creates a raising edge between pages. This
|
|
35
|
+
approach detects these edges to determine left and right page border
|
|
36
|
+
width.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
import dataclasses
|
|
40
|
+
import math
|
|
41
|
+
import statistics
|
|
42
|
+
import typing
|
|
43
|
+
|
|
44
|
+
import configos
|
|
45
|
+
import iamraw
|
|
46
|
+
import utilo
|
|
47
|
+
|
|
48
|
+
# Maximum difference for values to be matched into one common group; passed
# as `max_diff` to `utilo.diff_mode` in `simple`.
SIDE_DIFF_MAX = configos.HV_INT_PLUS(default=2)
# Allowed error rate per page side, caused by user defined exceptions. The
# table is called with the number of pages of the first side in `simple`.
# NOTE(review): rows look like (page count, allowed error rate) - confirm
# against `configos.HolyTable`.
FIRSTSECOND_ERROR_COUNT_MAX = configos.HolyTable(
    items=(
        (2, 0.5),
        (3, 0.35),
        (5, 0.26),
        (10, .21),
        (15, 0.05),
        (200, 0.01),
    ),
    right_outranges_none=False,
)
# Errors which result from handling alternating borders as a single border;
# `simple` requires the mixed error rate to be above this value.
MIXED_ERROR_MIN = configos.HV_PERCENT_PLUS(default=15)
# Area of the page width where the left border can be located.
LEFT_PERCENT = configos.HV_PERCENT_PLUS(default=30)
# Area of the page width where the right border can be located.
RIGHT_PERCENT = configos.HV_PERCENT_PLUS(default=30)

# Factor applied to the expected edge in `raising`; smaller differences are
# counted as failures.
RAISING_EDGE_MIN = configos.HV_PERCENT_PLUS(default=75)

# TODO: SHOULD WE DISABLE ALGO ON BIG FAIL COUNT?
# Maximum fail rate accepted by `raising`; the table is called with the
# number of edges.
RAISING_FAILRATE = configos.HolyTable(
    items=(
        (5, 1 / 5),
        (7, 2 / 7),
        (10, 3 / 10),
        (40, 10 / 40),
        (200, 40 / 200),
    ),
    right_outranges_none=False,
)

# Left and right content bounding of one page.
LeftRight = tuple[float, float]

# Border which is either a tuple of values or a single value.
DetectedBorder = typing.TypeVar('DetectedBorder', tuple[float], float)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclasses.dataclass
class LeftRightDetected:
    """Result of the left right border detection.

    `simple` and `raising` store a tuple with two values in `left` and
    `right`, the fallback in `run` stores the result of `utilo.mode`.
    """
    # left border; a tuple if alternating borders were detected
    left: DetectedBorder = None
    # right border; a tuple if alternating borders were detected
    right: DetectedBorder = None
    # every strategy in this module currently sets 1.0
    confidence: float = 0.0

    @property
    def valid(self):
        """Return True if `left` is a tuple."""
        return isinstance(self.left, tuple)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def run(
    textpositions: iamraw.PageContentTextPositions,
    pagesizes: iamraw.PageSizeBorderList,
) -> LeftRightDetected:
    """Run the LeftRight-Strategy to decide whether the document uses
    alternating left/right borders or one common border for every page.

    Args:
        textpositions: text positions of the document pages
        pagesizes: size and border information of the document pages
    Returns:
        Result of the first strategy (`simple`, then `raising`) which
        detects something, otherwise the most common left and right
        border of all pages.
    """
    left, right = handle_emptypage(
        *determine_pageborder(textpositions, pagesizes))

    # try the alternating border strategies in order, the first hit wins
    for strategy in (simple, raising):
        detected = strategy(left, right)
        if detected:
            return detected

    # no alternating border detected, fall back to one common border
    return LeftRightDetected(
        left=utilo.mode(left, minimize=True),
        right=utilo.mode(right, minimize=False),
        confidence=1.0,
    )
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def simple(left: utilo.Numbers, right: utilo.Numbers) -> LeftRightDetected:
    """Determine the LeftRight border based on the changing text feed.

    The entries with an even index are treated as one page side and the
    entries with an odd index as the other side.

    This approach is limited when a page is missing: the sides are mixed
    from there on and therefore wrongly classified.

    Args:
        left: ordered list of left text feed
        right: ending of right text content border
    Returns:
        None if no valid LeftRightDetected was detected
        LeftRightDetected if the failrate is not too high
    """
    if len(left) <= 1:
        # TODO: INVESTIGATE HERE
        utilo.error('could not run simple approach')
        return None

    def error_rate(values):
        # share of values which do not match the most common group
        matched = utilo.diff_mode(values, max_diff=SIDE_DIFF_MAX)
        return 1 - len(matched) / len(values)

    first, second = left[::2], left[1::2]
    mixed_error = error_rate(left)
    first_error = error_rate(first)
    second_error = error_rate(second)

    utilo.debug(f'mixed: {mixed_error}')
    utilo.debug(f'first: {first_error}')
    utilo.debug(f'second: {second_error}')

    # TODO: DEFINE BETTER CONFIDENCE APPROACH
    max_firstsecond_error = FIRSTSECOND_ERROR_COUNT_MAX(len(first))

    # handling all pages as one group must fail often enough
    if not mixed_error > MIXED_ERROR_MIN.value:
        return None
    # every single side must be consistent
    if not (first_error < max_firstsecond_error and
            second_error < max_firstsecond_error):
        return None

    return LeftRightDetected(
        left=(
            utilo.mode(first, minimize=True),
            utilo.mode(second, minimize=True),
        ),
        right=(
            utilo.mode(right[::2]),
            utilo.mode(right[1::2]),
        ),
        confidence=1.0,
    )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def raising(left: utilo.Numbers, right: utilo.Numbers) -> LeftRightDetected:
    """Determine the border depending on the changing text feed of the
    left page border.

    The two largest clusters of `left` and of `right` provide the border
    candidates. The difference of the two left cluster means is the
    expected edge; every value of `utilo.diffs(left)` below
    `RAISING_EDGE_MIN` of this edge counts as failure. The accepted
    amount of failures is defined in the lookup table `RAISING_FAILRATE`.

    Args:
        left: ordered list of left text feed
        right: ending of right text content border
    Returns:
        None if no valid LeftRightDetected was detected
        LeftRightDetected if `failrate` is not too high
    """
    clusters_left = longest_two(left)
    clusters_right = longest_two(right)
    if clusters_left is None or clusters_right is None:
        # single page document which does not contain left-right-pages
        return None

    first_left = statistics.mean(clusters_left[0])
    second_left = statistics.mean(clusters_left[1])
    edge = math.fabs(first_left - second_left)

    first_right = statistics.mean(clusters_right[0])
    second_right = statistics.mean(clusters_right[1])

    edges = utilo.diffs(left)
    threshold = edge * RAISING_EDGE_MIN.value
    failed = sum(1 for item in edges if item < threshold)
    if failed / len(edges) > RAISING_FAILRATE(len(edges)):
        return None

    first_left, second_left = utilo.roundme(first_left, second_left)
    first_right, second_right = utilo.roundme(first_right, second_right)

    # smaller border first, larger border second
    return LeftRightDetected(
        left=(min(first_left, second_left), max(first_left, second_left)),
        right=(
            min(first_right, second_right),
            max(first_right, second_right),
        ),
        confidence=1.0,
    )
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def handle_emptypage(left, right):
    """Replace the `None` entries of empty pages with default borders.

    Args:
        left: left borders, `None` marks an empty page
        right: right borders, `None` marks an empty page
    Returns:
        Tuple of two new lists: `left` with `None` replaced by 0.0 and
        `right` with `None` replaced by the largest known right border
        (0 if there is none).
    """

    def fill(values, replacement):
        return [
            replacement if value is None else value for value in values
        ]

    # TODO: THINK ABOUT IF THIS IS ENOUGH
    # ignore empty pages
    filled_left = fill(left, 0.0)
    # TODO: Is default=0 a good one?
    largest = max(
        (value for value in right if value is not None),
        default=0,
    )
    # NOTE: Empty pages are treated as being as wide as the widest page -
    # is this a problem?
    return filled_left, fill(right, largest)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def determine_pageborder(textpositions, pagesizes):
    """Collect the left and right content border of every page.

    Args:
        textpositions: text positions of the pages
        pagesizes: size information of the pages
    Returns:
        Tuple of two lists (left, right) with one entry per synchronized
        page; the entry is `None` if the page or its size is missing.
    """
    left, right = [], []
    before = -1
    for current, (page, size) in utilo.sync_pages([textpositions, pagesizes]):
        # pages are expected in strictly increasing order
        assert current > before, f'{before} < {current}'
        before = current
        if page and size:
            bounding = [item for item, _ in page.content.values()]
            page_left, page_right = maximize_leftright(bounding, size)
        else:
            page_left = page_right = None
        left.append(page_left)
        right.append(page_right)
    return left, right
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def maximize_leftright(
    boundings: utilo.Rectangles,
    size: iamraw.PageSizeBorder,
) -> LeftRight:
    """Determine the left and right content border of a single page.

    Only boundings which start inside the left area (`LEFT_PERCENT` of
    the page width) are candidates for the left border and only boundings
    which end inside the right area (`RIGHT_PERCENT` of the page width)
    are candidates for the right border. The most common candidate of
    each side is assumed to be the correct border.

    Args:
        boundings: textpositions of defined page
        size: width and height of current page
    Returns:
        tuple with left and right content bounding; 0.0 and the page
        width are used when a side has no candidate
    """
    width = size.size.width
    left_max, right_min = utilo.roundme(
        width * LEFT_PERCENT,
        width * (1 - RIGHT_PERCENT),
    )
    assert left_max <= right_min, 'left and right bounds are flipped'

    left_candidates = [item[0] for item in boundings if item[0] <= left_max]
    right_candidates = [item[2] for item in boundings if item[2] >= right_min]

    # TODO: DO WE REALLY NEED THIS?
    if left_candidates:
        left = utilo.mode(left_candidates, minimize=True)
    else:
        left = 0.0
    if right_candidates:
        right = utilo.mode(right_candidates, minimize=False)
    else:
        right = width
    return left, right
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
# Two values belong to the same cluster in `longest_two` if their absolute
# difference is below this value.
CLUSTER_CANDIAT_DIFF_MAX = configos.HV_FLOAT_PLUS(default=2.0)
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def longest_two(items: utilo.Numbers) -> tuple[float, float]:
    """Cluster `items` and select the two clusters with the most members.

    Args:
        items: numbers to cluster
    Returns:
        Tuple with the largest and the second largest cluster as returned
        by `utilo.determine_cluster`, or None if there are less than two
        clusters.
    """

    def close(candidat, clusteritem) -> bool:
        return math.fabs(candidat - clusteritem) < CLUSTER_CANDIAT_DIFF_MAX

    by_size = sorted(
        utilo.determine_cluster(items, close),
        key=len,
        reverse=True,
    )
    if len(by_size) >= 2:
        return by_size[0], by_size[1]
    return None
|
groupmes/border/most.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import dataclasses
|
|
11
|
+
import math
|
|
12
|
+
|
|
13
|
+
import iamraw
|
|
14
|
+
import utilo
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclasses.dataclass
class MostBoundingDetected:
    """Most common border of every page side as determined by `run`."""
    # None until a border was determined for the side
    left: float = None
    right: float = None
    top: float = None
    bottom: float = None
    # `run` does not set this value, it stays at the default
    confidence: float = 0.0
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def run(sizeandborder: iamraw.PageSizeBorderList) -> MostBoundingDetected:
    """Determine the most common border of every side of the given pages.

    NOTE(review): the comments of this module mention `x0, y0, x1, y1`
    while the values are consumed as left, right, top, bottom - confirm
    the layout of `border`.

    Args:
        sizeandborder: size and border information of the pages
    Returns:
        Detected border; `confidence` keeps its default.
    """
    most = most_boundingbox([item.border for item in sizeandborder])
    left, right, top, bottom = most
    assert left < right
    assert top < bottom
    return MostBoundingDetected(
        left=left,
        right=right,
        top=top,
        bottom=bottom,
    )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def most_boundingbox(
    boxes: utilo.Rectangles,
    roundme: bool = False,
) -> utilo.Rectangle:
    """Extract the bounding box of the most common occurrence for every
    side.

    With `roundme` the values are rounded to full numbers before the most
    common value is selected, which makes the approach more robust. The
    rounding direction alternates between `math.floor` and `math.ceil`
    per side to increase the detected rectangle so that more items fit
    into it.

    Args:
        boxes: rectangles to inspect, `None` values are ignored
        roundme: round the values before selecting the most common one
    Returns:
        Tuple with the most common value of every side.
    """
    # TODO: Think about right and left, maybe search for the top 2 borders?
    # left, right, top, bottom
    rounding = (math.floor, math.ceil, math.floor, math.ceil)

    def side(index, method):
        # remove None items
        values = [item[index] for item in boxes if item[index] is not None]
        if roundme:
            # round to have a more robust grouping
            values = [method(value) for value in values]
        # TODO: REQUIRE A BETTER TIE-BREAKER
        return utilo.mode(values, minimize=method is math.floor)

    return tuple(side(index, method) for index, method in enumerate(rounding))
|
groupmes/cli.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
|
|
10
|
+
import utilo
|
|
11
|
+
|
|
12
|
+
import groupmes
|
|
13
|
+
|
|
14
|
+
# Description handed to `utilo.FeaturePackConfig` in `main`.
# TODO: write a real description
DESCRIPTION = 'TODO'

# Steps of this feature pack with the result files they consume and the
# outputs they produce.
WORKPLAN = [
    # `area`: the `tablero` result is optional
    utilo.create_step(
        'area',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='boxes_boxes'),
            utilo.ResultFile(producer='tablero',
                             name='result_result',
                             optional=True),
            utilo.ResultFile(producer='rawmaker', name='text_text'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
        ],
        output=('area',),
    ),
    # `border`: writes the `leftright` output
    utilo.create_step(
        'border',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='border_pages'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
        ],
        output=('leftright',),
    ),
    # `distance`: consumes the `area_area` result of this process
    utilo.create_step(
        'distance',
        inputs=[
            utilo.ResultFile(producer=groupmes.PROCESS, name='area_area'),
            utilo.ResultFile(producer='rawmaker', name='text_text'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
        ],
        output=('distance',),
    ),
    # `content`: additionally requires the `footnote` result
    utilo.create_step(
        'content',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='text_text'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
            utilo.ResultFile(producer='rawmaker', name='border_pages'),
            utilo.ResultFile(producer='footnote', name='result_result'),
        ],
        output=('content',),
    ),
    # `hefopa`: consumes the headnote, footnote and pagenumber results
    utilo.create_step(
        'hefopa',
        inputs=[
            utilo.ResultFile(producer='headnote', name='result_result'),
            utilo.ResultFile(producer='footnote', name='result_result'),
            utilo.ResultFile(producer='pagenumber', name='result_result'),
            utilo.ResultFile(producer='rawmaker', name='border_pages'),
        ],
        output=('result',),
    )
]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def main():
    """Start the feature pack defined by `WORKPLAN` via
    `utilo.featurepack`."""
    config = utilo.FeaturePackConfig(
        description=DESCRIPTION,
        multiprocessed=True,
        name=groupmes.PROCESS,
        pages=True,
        version=groupmes.__version__,
    )
    utilo.featurepack(
        workplan=WORKPLAN,
        root=groupmes.ROOT,
        featurepackage='groupmes.feature',
        config=config,
    )
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
groupmes/feature/area.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""Textual Area
|
|
10
|
+
============
|
|
11
|
+
|
|
12
|
+
TODO:
|
|
13
|
+
* table of content
|
|
14
|
+
* images
|
|
15
|
+
* reference table
|
|
16
|
+
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import collections
|
|
20
|
+
import os
|
|
21
|
+
|
|
22
|
+
import configos
|
|
23
|
+
import serializeraw
|
|
24
|
+
import utilo
|
|
25
|
+
|
|
26
|
+
# Passed as `max_diff` to every `utilo.RectangleCheck` built in this module.
RECTANGLE_DIFF_MAX = configos.HV_FLOAT_PLUS(default=10.0)

# Input data of one run as returned by `load`.
RequiredResources = collections.namedtuple(
    'RequiredResources',
    'textnavigator, tables, boxes',
)

# Grouping result of a single page as produced by `group_page`.
PageContentTextualArea = collections.namedtuple(
    'PageContentTextualArea',
    'page, textual, outside, border',
)
PageContentTextualAreas = list[PageContentTextualArea]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def work(
    boxes: str,
    tables: str,
    text: str,
    textpositions: str,
    pages: tuple = None,
) -> str:
    """Extract the different areas out of the given data.

    Args:
        boxes(str): path to extracted `rawmaker` content boxes
        tables(str): path to extracted `tablero` tables
        text(str): extracted `rawmaker` text
        textpositions(str): positions of extracted text
        pages(tuple): tuple of pages to process
    Returns:
        Dumped extracted areas.
    """
    resources = load(
        boxes=boxes,
        pages=pages,
        tables=tables,
        text=text,
        textpositions=textpositions,
    )
    return dump_area(group_areas(loaded=resources))
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def group_areas(loaded: RequiredResources) -> PageContentTextualAreas:
    """Group the text of every loaded page into areas.

    Args:
        loaded: text navigators, tables and boxes of the document
    Returns:
        One `PageContentTextualArea` per text navigator.
    """

    def group(navigator):
        # select the tables and boxes which belong to the navigator's page
        page = navigator.page
        tables = utilo.select_page(loaded.tables, page)
        boxes = utilo.select_page(loaded.boxes, page)
        return group_page(
            navigator,
            tables=tables,
            boxes=boxes.content if boxes else None,
        )

    return [group(navigator) for navigator in loaded.textnavigator]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def group_page(navigator, tables, boxes) -> PageContentTextualArea:
    """Split the text boundings of one page into textual, table and box
    areas.

    A bounding which is contained in a table and/or a box is collected in
    the matching `outside` list(s). Only boundings which are contained in
    neither a table nor a box are collected as `textual`. Previously the
    `else` branch was bound to the box check only, so text inside a table
    was reported as `textual` as well.

    Args:
        navigator: iterable text items of the page; provides `page`
        tables: tables of the page or a falsy value
        boxes: boxes of the page or a falsy value
    Returns:
        PageContentTextualArea with the merged rectangles of the page.
    """
    if tables:
        tables = table_checker(tables)

    if boxes:
        boxes = boxed_checker(boxes)

    textual = []
    inside_tables = []
    inside_boxes = []
    for text in navigator:
        bounding = tuple(text.bounding)
        in_table = tables and tables.contains(*bounding)
        in_box = boxes and boxes.contains(*bounding)
        if in_table:
            inside_tables.append(bounding)
        if in_box:
            inside_boxes.append(bounding)
        # textual content is everything outside of tables and boxes
        if not in_table and not in_box:
            textual.append(bounding)

    # optimize rectangles
    textual = utilo.rect_merge(textual)
    inside_tables = utilo.rect_merge(inside_tables)
    inside_boxes = utilo.rect_merge(inside_boxes)
    outside = {
        'boxes': inside_boxes,
        'tables': inside_tables,
    }
    # rectangles of the checkers itself, empty if there is no checker
    border = {
        key: list(value) for key, value in (
            ('boxes', boxes.content if boxes else []),
            ('tables', tables.content if tables else []),
        )
    }
    result = PageContentTextualArea(
        page=navigator.page,
        textual=textual,
        outside=outside,
        border=border,
    )
    return result
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def boxed_checker(items) -> utilo.RectangleCheck:
    """Create a `utilo.RectangleCheck` out of the `box` of every item."""
    checker = utilo.RectangleCheck(max_diff=RECTANGLE_DIFF_MAX)
    for box in (item.box for item in items):
        checker.extend(*box)
    return checker
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def table_checker(items) -> utilo.RectangleCheck:
    """Create a `utilo.RectangleCheck` out of the `bounding` of every
    item."""
    checker = utilo.RectangleCheck(max_diff=RECTANGLE_DIFF_MAX)
    for bounding in (item.bounding for item in items):
        checker.extend(*bounding)
    return checker
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def load(
    boxes: str,
    tables: str,
    text: str,
    textpositions: str,
    pages: tuple = None,
) -> RequiredResources:
    """Load all resources which are required to group the areas.

    Args:
        boxes: path to the content boxes
        tables: path to the tables; an empty list is used if the path
            does not exist
        text: extracted text
        textpositions: positions of the extracted text
        pages: tuple of pages to process
    Returns:
        RequiredResources with text navigators, tables and boxes.
    """
    # TODO: SHOULD WE REMOVE HIDDEN ITEMS?
    textnavigator = serializeraw.ptn_fromfile(
        text=text,
        textpositions=textpositions,
        pages=pages,
        state=None,  # load hidden items
    )
    loaded_boxes = serializeraw.load_boxes(boxes, pages=pages)
    if not os.path.exists(tables):
        utilo.log(f'skip using tablero: {tables}, generation is required')
        loaded_tables = []
    else:
        loaded_tables = serializeraw.load_tables(tables, pages=pages)
    return RequiredResources(
        boxes=loaded_boxes,
        tables=loaded_tables,
        textnavigator=textnavigator,
    )
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def dump_area(items) -> str:
    """Dump the grouped areas of all pages via `utilo.yaml_dump`.

    Args:
        items: iterable of `PageContentTextualArea`
    Returns:
        Dumped list with one mapping (`border`, `outside`, `page`,
        `textual`) per page.
    """

    def dump_rects(rects):
        # falsy values (None or empty) are written unchanged
        if not rects:
            return rects
        return [utilo.from_tuple(item) for item in rects]

    def dump_group(group):
        return {key: dump_rects(value) for key, value in group.items()}

    raw = []
    for page in items:
        outside = dump_group(page.outside)
        border = dump_group(page.border)
        raw.append({
            'border': border,
            'outside': outside,
            'page': page.page,
            'textual': dump_rects(page.textual),
        })
    return utilo.yaml_dump(raw)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def load_area(content: str, pages: tuple = None) -> PageContentTextualAreas:
    """Load areas which were written by `dump_area`.

    Args:
        content: dumped areas
        pages: tuple of pages to load; the decision is made by
            `utilo.should_skip`
    Returns:
        List of `PageContentTextualArea` of the selected pages.
    """

    def parse_rects(raw):
        # falsy values (None or empty) are passed through unchanged
        if not raw:
            return raw
        return [utilo.parse_tuple(item) for item in raw]

    def parse_group(group):
        return {key: parse_rects(values) for key, values in group.items()}

    result = []
    for page in utilo.yaml_load(content):
        pagenumber = int(page['page'])
        if utilo.should_skip(pagenumber, pages):
            continue
        textual = parse_rects(page['textual'])
        outside = parse_group(page['outside'])
        border = parse_group(page['border'])
        result.append(
            PageContentTextualArea(
                border=border,
                outside=outside,
                page=pagenumber,
                textual=textual,
            ))
    return result
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import configos
|
|
11
|
+
import iamraw
|
|
12
|
+
import serializeraw
|
|
13
|
+
import utilo
|
|
14
|
+
|
|
15
|
+
import groupmes.border.leftright
|
|
16
|
+
import groupmes.border.most
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def work(
    sizeandborder: str,
    textpositions: str,
    pages: tuple = None,
) -> tuple[str]:
    """Load the required resources, determine the borders and dump them.

    Args:
        sizeandborder(str): source passed to `serializeraw.load_pageborders`
        textpositions(str): source passed to
                            `serializeraw.load_textpositions`
        pages(tuple): page selection forwarded to both loaders
    Returns:
        result of `serializeraw.dump_leftright_border`
    """
    borders = serializeraw.load_pageborders(sizeandborder, pages=pages)
    positions = serializeraw.load_textpositions(textpositions, pages=pages)
    determined = determine_border(positions, borders)
    return serializeraw.dump_leftright_border(determined)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def determine_border(
    textpositions: iamraw.PageContentTextPositions,
    pagesizes: iamraw.PageSizeBorderList,
):
    """Determine the border of every clustered page.

    The pages are grouped with `pagecluster`, every group is evaluated on
    its own with `cluster_border` and the results are joined afterwards.

    Args:
        textpositions: text positions of the document
        pagesizes: page sizes and borders of the document
    Returns:
        flattened border entries, ordered by their first item which is
        the page number
    """
    per_cluster = [
        cluster_border(textpositions, pagesizes, members)
        for members in pagecluster(pagesizes)
    ]
    flattened = utilo.flat(per_cluster)
    # sort by page number
    return sorted(flattened, key=lambda entry: entry[0])
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def cluster_border(textpositions, pagesizes, pages_incluster):
    """Compute the expected border for all pages of a single cluster.

    Args:
        textpositions: text positions of the whole document
        pagesizes: page sizes of the whole document
        pages_incluster: page numbers which belong to the cluster
    Returns:
        list of `(page, *expected_border(...))` tuples, one per page of
        the cluster
    """
    # restrict both resources to the cluster and drop the `None` entries
    positions = utilo.select_pages(textpositions, pages_incluster)
    sizes = utilo.select_pages(pagesizes, pages_incluster)
    positions = utilo.notnone(positions)
    sizes = utilo.notnone(sizes)

    vertical = groupmes.border.most.run(sizes)
    horizontal = groupmes.border.leftright.run(positions, sizes)

    borders = []
    for number in pages_incluster:
        expected = expected_border(horizontal, vertical, sizes, number)
        borders.append((number, *expected))
    return borders
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def expected_border(leftright, most, pagesizes, page: int):
    """Combine the detected positions to the border of a single page.

    Args:
        leftright: object with `left` and `right`; each is either a
                   single value or a tuple which is indexed by `page % 2`
        most: object with `top` and `bottom`
        pagesizes: page sizes to look up the size of `page`
        page(int): number of the page
    Returns:
        `utilo.roundme` of (left, width - right, top, height - bottom)
    """

    # left, right, top, down
    # TODO: CHECK THAT PAGE CALL IS CORRECT
    def for_page(value):
        # a tuple carries one value per page parity
        if isinstance(value, tuple):
            return value[page % 2]  # pylint:disable=E1136
        return value

    left = for_page(leftright.left)
    right = for_page(leftright.right)

    size = utilo.select_page(pagesizes, page).size
    from_right = size.width - right
    from_bottom = size.height - most.bottom

    return utilo.roundme((left, from_right, most.top, from_bottom))
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# passed as `min_elements` to `utilo.determine_cluster`
PAGE_CLUSTER_SIZE_MIN = configos.HV_INT_PLUS(default=3)

# two items fall into the same cluster if their norm is below this value
PAGE_CLUSTER_DIFF_MAX = configos.HV_FLOAT_PLUS(default=10.0)


def pagecluster(pagesizes) -> list:
    """Cluster the pages and return the page numbers of every cluster.

    Args:
        pagesizes: items to cluster; `utilo.determine_cluster` receives
                   them together with the `equal_size` classifier
    Returns:
        list of clusters, every cluster is a sorted list of page numbers
    """

    def equal_size(candidat, clusteritem) -> bool:
        # compare the first entry of both items
        return utilo.norms(candidat[0], clusteritem[0]) < PAGE_CLUSTER_DIFF_MAX

    clusters = utilo.determine_cluster(
        pagesizes,
        classifier=equal_size,
        min_elements=PAGE_CLUSTER_SIZE_MIN,
    )

    numbers = []
    for members in clusters:
        numbers.append(sorted(member.page for member in members))
    return numbers
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import iamraw
|
|
11
|
+
import serializeraw
|
|
12
|
+
import utilo
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def work(
    text: str,
    textpositions: str,
    sizeandborder: str,
    headerfooter: str,
    pages: tuple | None = None,
) -> str:
    """Determine the top and bottom of the content for every page.

    Args:
        text(str): path to load document
        textpositions(str): path to load document textpositions
        sizeandborder(str): path with page sizes and content border
        headerfooter(str): path with header and footer to determine
                           content border.
        pages(tuple): tuple of selected pages
    Returns:
        dump of extracted content bounding boxes
    """
    navigators = serializeraw.ptcn_fromfile(
        text,
        textpositions,
        sizeandborder=sizeandborder,
        headerfooter=headerfooter,
        pages=pages,
    )
    boxes = []
    for navigator in navigators:
        area = navigator.content
        upper, lower = utilo.roundme((area.top, area.bottom))
        box = iamraw.ContentBoundingBox(
            page=navigator.page,
            top=upper,
            bottom=lower,
        )
        boxes.append(box)
    return serializeraw.dump_contentboundingbox(boxes)
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""Distance detector
|
|
10
|
+
=================
|
|
11
|
+
|
|
12
|
+
Compute the distance between textual and non textual elements.
|
|
13
|
+
|
|
14
|
+
There are two distances for every non textual element. The distance
|
|
15
|
+
before(negative) and the distance after(positive). If the page starts or
|
|
16
|
+
ends with a non textual element, the distance is None.
|
|
17
|
+
|
|
18
|
+
TODO: SUPPORT LEFT RIGHT DISTANCE
|
|
19
|
+
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import collections
|
|
23
|
+
|
|
24
|
+
import serializeraw
|
|
25
|
+
import texmex
|
|
26
|
+
import utilo
|
|
27
|
+
|
|
28
|
+
import groupmes.feature.area
|
|
29
|
+
|
|
30
|
+
# resources which are required to compute the distances
RequiredResources = collections.namedtuple(
    'RequiredResources',
    ['area', 'textnavigator'],
)

# distance before and after the area with the given index
AreaDistance = collections.namedtuple(
    'AreaDistance',
    ['index', 'before', 'after'],
)
AreaDistances = list[AreaDistance]

# all area distances of a single page
PageContentAreaDistance = collections.namedtuple(
    'PageContentAreaDistance',
    ['page', 'content'],
)
PageContentAreaDistances = list[PageContentAreaDistance]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def work(
    areas: str,
    text: str,
    textpositions: str,
    pages: tuple = None,
) -> str:
    """Load the resources, compute the area distances and dump them.

    Args:
        areas(str): source of the area dump, see `load`
        text(str): source of the document text, see `load`
        textpositions(str): source of the text positions, see `load`
        pages(tuple): page selection forwarded to `load`
    Returns:
        dump created by `dump_distance`
    """
    resources = load(areas, text, textpositions, pages=pages)
    return dump_distance(determine_distances(resources))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def determine_distances(loaded: RequiredResources) -> PageContentAreaDistances:
    """Compute the area distances for every page of the text navigator.

    Args:
        loaded(RequiredResources): areas and text navigator of the document
    Returns:
        one `PageContentAreaDistance` per page; pages for which
        `group_page` yields nothing are left out
    """
    collected = []
    for navigator in loaded.textnavigator:
        number = navigator.page
        page_areas = utilo.select_page(loaded.area, number)
        distances = group_page(navigator, page_areas)
        if distances:
            collected.append(
                PageContentAreaDistance(content=distances, page=number))
    return collected
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def group_page(navigator, areas) -> AreaDistances:
    """Determine the closest text before and after every area of a page.

    Args:
        navigator: iterable of text lines, every line has a `bounding`
        areas: areas of the page (its `border` is used) or None
    Returns:
        one `AreaDistance` per area index for which a distance was
        measured; empty list if `areas` is None
    """
    if areas is None:
        return []
    measure = create_distance(areas.border)

    by_area = collections.defaultdict(list)
    for line in navigator:
        measured = measure.distance(line.bounding)
        if not measured:
            continue
        values, index = measured
        if isinstance(values, float):
            by_area[index].append(values)
            continue
        # a pair describes the gap to the area `index` and to its successor
        by_area[index].append(values[0])
        by_area[index + 1].append(values[1])

    result = []
    for index, gaps in by_area.items():
        # closest negative value is the largest, closest positive the smallest
        before = max((gap for gap in gaps if gap < 0), default=None)
        after = min((gap for gap in gaps if gap >= 0), default=None)
        if before is not None:
            before = utilo.roundme(before)
        if after is not None:
            after = utilo.roundme(after)
        result.append(AreaDistance(index=index, before=before, after=after))
    return result
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class Distance:
    """Collection of rectangles to measure the vertical distance to.

    The rectangles are indexed as `rect[1]` (top) and `rect[3]` (bottom)
    and are kept ordered by their top value.
    """

    def __init__(self, diff: float = 10.0):
        self.content = []
        self.sorted = True
        # passed as `diff` to `utilo.rect_inside` in the single item case
        self.diff = diff

    def distance(self, bounding):  # pylint:disable=R1260,R0911
        """Measure the vertical distance of `bounding` to the rectangles.

        Args:
            bounding: rectangle of the text, `bounding[1]` is its top and
                      `bounding[3]` its bottom
        Returns:
            None if there is no rectangle, `bounding` is inside a
            rectangle or no matching gap exists. Otherwise
            `(distance, index)` for a single neighbour or
            `((distance_to_previous, distance_to_next), index)` if
            `bounding` lies between two rectangles.
        """
        if not self:
            return None
        if not self.sorted:
            self.sort()
        top, bottom = bounding[1], bounding[3]
        count = len(self)
        if count == 1:
            only = self[0]
            if utilo.rect_inside(only, bounding, diff=self.diff):
                return None
            only_top, only_bottom = only[1], only[3]
            if bottom <= only_top:
                # content is above
                gap = bottom - only_top
            else:
                # content is below
                gap = top - only_bottom
            return (utilo.roundme(gap), 0)

        # NOTE(review): text above the first of several rectangles is not
        # handled by any branch below - confirm that this is intended
        last_bottom = self[-1][3]
        if last_bottom <= top:
            # after
            return (utilo.roundme(top - last_bottom), count - 1)

        # in the middle
        # NOTE(review): `self.diff` is not passed to `rect_inside` here,
        # unlike the single item case - confirm that this is intended
        for index, (upper, lower) in enumerate(zip(self[:-1], self[1:])):
            gap_start, gap_end = upper[3], lower[1]
            inside = (utilo.rect_inside(upper, bounding) or
                      utilo.rect_inside(lower, bounding))
            if inside:
                return None
            if gap_start <= top <= bottom <= gap_end:
                return (top - gap_start, bottom - gap_end), index
        return None

    def append(self, item):
        """Add a rectangle; the order is restored before the next measure."""
        self.content.append(item)
        self.sorted = False

    def sort(self):
        """Order the rectangles by their top value."""
        self.content = sorted(self.content, key=lambda rect: rect[1])
        self.sorted = True

    def __getitem__(self, index):
        return self.content[index]

    def __len__(self):
        return len(self.content)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def create_distance(items) -> Distance:
    """Collect the rectangles of all groups in a sorted `Distance`.

    Args:
        items: mapping whose values are iterables of rectangles
    Returns:
        sorted `Distance` which contains every rectangle of `items`
    """
    measure = Distance()
    rectangles = (entry for group in items.values() for entry in group)
    for rectangle in rectangles:
        measure.append(rectangle)
    measure.sort()
    return measure
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def load(
    area: str,
    text: str,
    textpositions: str,
    pages: tuple = None,
) -> RequiredResources:
    """Load areas, document and text positions of the selected pages.

    A missing area resource is reported with `utilo.error` and replaced
    by an empty list.

    Args:
        area(str): source for `groupmes.feature.area.load_area`
        text(str): source for `serializeraw.load_document`
        textpositions(str): source for `serializeraw.load_textpositions`
        pages(tuple): page selection forwarded to every loader
    Returns:
        loaded areas together with the text navigator created by
        `texmex.create_ptns`
    """
    try:
        areas = groupmes.feature.area.load_area(area, pages=pages)
    except FileNotFoundError as err:
        areas = []
        utilo.error(err)
    document = serializeraw.load_document(text, pages=pages)
    positions = serializeraw.load_textpositions(textpositions, pages=pages)
    navigator = texmex.create_ptns(
        document,
        textpositions=positions,
        fill_empty=False,
    )
    return RequiredResources(area=areas, textnavigator=navigator)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def dump_distance(items: PageContentAreaDistances) -> str:
    """Serialize the area distances as YAML.

    Every distance becomes a line `index before after`; a missing value
    is written as `None`.

    Args:
        items(PageContentAreaDistances): distances of all pages
    Returns:
        result of `utilo.yaml_dump`
    """

    def render(value):
        return 'None' if value is None else utilo.roundme(value)

    raw = []
    for page in items:
        lines = [
            f'{item.index} {render(item.before)} {render(item.after)}'
            for item in page.content
        ]
        raw.append({'page': page.page, 'content': lines})
    return utilo.yaml_dump(raw)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def load_distance(
    content: str,
    pages: tuple = None,
) -> PageContentAreaDistances:
    """Rebuild the area distances from their YAML dump.

    Args:
        content(str): YAML source which is handed to `utilo.yaml_load`
        pages(tuple): page selection, pages rejected by
                      `utilo.should_skip` are left out
    Returns:
        one `PageContentAreaDistance` per accepted page
    """

    # TODO: MOVE TO SERIALIZERAW
    def optional_float(token: str):
        # anything which is not a number, e.g. `None`, marks a missing value
        try:
            return float(token)
        except ValueError:
            return None

    distances = []
    for record in utilo.yaml_load(content):
        number = int(record['page'])
        if utilo.should_skip(number, pages):
            continue
        entries = []
        for line in record['content']:
            index, before, after = line.split()
            before = optional_float(before)
            after = optional_float(after)
            entries.append(
                AreaDistance(
                    index=int(index),
                    before=before,
                    after=after,
                ))
        distances.append(
            PageContentAreaDistance(
                page=number,
                content=entries,
            ))
    return distances
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2022-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import iamraw
|
|
11
|
+
import serializeraw
|
|
12
|
+
import texmex
|
|
13
|
+
import utilo
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def work(
    headnote: str,
    footnote: str,
    pagenumber: str,
    borders: str,
    pages: int | tuple = None,
) -> str:
    """Merge head notes, foot notes and page numbers to one dump.

    A missing head note or foot note resource is reported with
    `utilo.error` and treated as empty.

    Args:
        headnote(str): source for `serializeraw.load_headerfooter`
        footnote(str): source for `serializeraw.load_headerfooter`
        pagenumber(str): source of the page numbers, see `load_pagenumbers`
        borders(str): source for `serializeraw.load_pageborders`
        pages(int|tuple): page selection forwarded to every loader
    Returns:
        result of `serializeraw.dump_headerfooter`
    """

    def load_notes(source: str):
        try:
            return serializeraw.load_headerfooter(source, pages=pages)
        except FileNotFoundError as err:
            utilo.error(err)
            return []

    headnotes = load_notes(headnote)
    footnotes = load_notes(footnote)

    pageborders = serializeraw.load_pageborders(borders, pages=pages)
    pagenumbers = load_pagenumbers(pagenumber, pageborders, pages=pages)
    merged = merge(headnotes, footnotes, pagenumbers)
    return serializeraw.dump_headerfooter(merged)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def merge(headnotes, footnotes, pagenumbers) -> iamraw.PageContentFooterHeaders:
    """Merge the three header/footer sources page by page.

    Precedence per page: the footer of `footnotes` wins over the footer
    of `headnotes`; the header is taken from `headnotes`. The page number
    result is only used if neither header nor footer was found before.

    Args:
        headnotes: header/footer result of the head note detection
        footnotes: header/footer result of the foot note detection
        pagenumbers: header/footer result derived from the page numbers
    Returns:
        `iamraw.PageContentFooterHeaders` tagged with the strategy
        `hefopa`; pages without any source are left out
    """
    result = iamraw.PageContentFooterHeaders(content=[])
    result.__strategy__ = 'hefopa'
    synced = utilo.sync_pages((headnotes, footnotes, pagenumbers))
    for page, (headnote, footnote, pagenumber) in synced:
        if not any((headnote, footnote, pagenumber)):
            continue
        item = iamraw.PageContentFooterHeader(page=page)
        if footnote and footnote.footer:
            item.footer = footnote.footer
        if headnote:
            if headnote.header:
                item.header = headnote.header
            # do not overwrite the footer of the foot note detection
            if headnote.footer and not item.footer:
                item.footer = headnote.footer
        # NOTE(review): a page number entry which only carries a header is
        # skipped by this condition - confirm that this is intended
        if pagenumber and pagenumber.footer:
            if not item.header and not item.footer:
                item.header = pagenumber.header
                item.footer = pagenumber.footer
        result.content.append(item)
    return result
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def load_pagenumbers(
    pagenumber,
    borders,
    pages: tuple,
) -> iamraw.PageContentFooterHeaders:
    """Convert the detected page numbers to header/footer information.

    Args:
        pagenumber: source for `serializeraw.load_pagenumbers`
        borders: page borders, used to look up the border of every page
        pages(tuple): page selection forwarded to the loader
    Returns:
        `iamraw.PageContentFooterHeaders` tagged with the strategy
        `pagenumber`; a pdf page which occurs more than once is reported
        with `utilo.error` and only used once
    """
    result = iamraw.PageContentFooterHeaders(content=[])
    # was misspelled as `__stategy__`; `merge` tags its result with
    # `__strategy__`
    result.__strategy__ = 'pagenumber'
    loaded = serializeraw.load_pagenumbers(pagenumber, pages=pages)
    # presumably `contains` also remembers the page - TODO confirm
    single = utilo.Single()
    for item in loaded:
        pdfpage = item.pdfpage  # pylint:disable=E1101
        pageborder = utilo.select_page(borders, page=pdfpage)
        # TODO: MAY REMOVE LATER
        if single.contains(pdfpage):
            utilo.error(f'duplicated pagenumber/pdfpage: {item}')
            continue
        page = create(
            item,
            pdfpage,
            pageborder,
        )
        result.content.append(page)
    return result
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def create(item, pdfpage, pageborder) -> iamraw.PageContentFooterHeader:
    """Create the header or footer entry for a detected page number.

    Args:
        item: detected page number with `detected` and `bounding`
        pdfpage: page on which the page number was detected
        pageborder: border of this page, see `head_foot_area`
    Returns:
        `iamraw.PageContentFooterHeader` which carries either a header or
        a footer; it is a header if the area begins at `texmex.START`
    """
    pageinfo = iamraw.PageInformation(value=item.detected)  # pylint:disable=E1101
    header = footer = None
    begin, end = head_foot_area(pageborder, item.bounding)  # pylint:disable=E1101
    if begin == texmex.START:
        header = iamraw.FixedHeaderInfo(page=pageinfo)
        header.begin, header.end = begin, end
    else:
        footer = iamraw.FixedFooterInfo(page=pageinfo)
        footer.begin, footer.end = begin, end
    return iamraw.PageContentFooterHeader(
        page=pdfpage,
        header=header,
        footer=footer,
    )
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def head_foot_area(pageborder, pagenumber_bounding) -> tuple:
    """Determine the vertical area which is covered by the page number.

    Args:
        pageborder: border of the page, `size.height` is required
        pagenumber_bounding: bounding of the page number with `y0`, `y1`
    Returns:
        `(begin, end)`, both relative to the page height. A header starts
        at `texmex.START`, a footer ends at `texmex.END`. If the page
        height is missing `(texmex.END, texmex.END)` is returned.
    """
    pageheight = pageborder.size.height
    if not pageheight:
        utilo.error(f'missing page height: {pageborder} {pagenumber_bounding}')
        # The caller unpacks two values, returning the bare `texmex.END`
        # fails there. Without a height nothing can be computed, therefore
        # hand back an area which begins and ends at `texmex.END`.
        return texmex.END, texmex.END
    pagenumber_y0 = pagenumber_bounding.y0
    pagenumber_y1 = pagenumber_bounding.y1
    # NOTE(review): absolute threshold which ignores the page height -
    # confirm that this is intended
    header = pagenumber_y1 < 350
    if header:
        begin = texmex.START
        end = utilo.roundme(pagenumber_y1 / pageheight + 0.00)  # TOL
    else:
        # footer
        begin = utilo.roundme(pagenumber_y0 / pageheight - 0.01)  # TOL
        end = texmex.END
    return begin, end
|
groupmes/path.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import utilo
|
|
11
|
+
|
|
12
|
+
import groupmes
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def area(path: str, prefix: str = '') -> str:
    """Connect `path` with the `area_area` step of this package."""
    return utilo.pathconnector(
        path,
        groupmes.PACKAGE,
        'area_area',
        prefix,
    )
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def border_leftright(path: str, prefix: str = '') -> str:
    """Connect `path` with the `border_leftright` step of this package."""
    step = 'border_leftright'
    return utilo.pathconnector(path, groupmes.PACKAGE, step, prefix)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def distance(path: str, prefix: str = '') -> str:
    """Connect `path` with the `distance_distance` step of this package."""
    step = 'distance_distance'
    return utilo.pathconnector(path, groupmes.PACKAGE, step, prefix)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: groupmes
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Author-email: Helmut Konrad Schewe <helmutus@outlook.com>
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/anaticulae/groupmes
|
|
7
|
+
Project-URL: Repository, https://github.com/anaticulae/groupmes
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: iamraw<5.0.0,>=4.91.5
|
|
14
|
+
Requires-Dist: utilo<3.0.0,>=2.109.0
|
|
15
|
+
Requires-Dist: configos<2.0.0,>=1.0.4
|
|
16
|
+
Requires-Dist: konradus<2.0.0,>=1.0.1
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: utilotest<2.0.0,>=1.0.4; extra == "dev"
|
|
19
|
+
Requires-Dist: hoverpower==1.4.3; extra == "dev"
|
|
20
|
+
Requires-Dist: gennex==1.0.3; extra == "dev"
|
|
21
|
+
Requires-Dist: rawmaker==2.40.3; extra == "dev"
|
|
22
|
+
Requires-Dist: pagenumber==1.0.0; extra == "dev"
|
|
23
|
+
|
|
24
|
+
# groupmes
|
|
25
|
+
|
|
26
|
+
* chapter: split document text by chapter
|
|
27
|
+
* structure: divide document in head(toc) and tail(rest)
|
|
28
|
+
* toc: create table of content out of raw text data
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
groupmes/__init__.py,sha256=h23h6l8V-tYm0m8M2D21L8ZbZGOOuKymxMJ2OHAtayo,801
|
|
2
|
+
groupmes/__main__.py,sha256=XGxs_W_jWiuac2zJS9cmc_YQ4_06X2-6insjQwP5AwY,622
|
|
3
|
+
groupmes/__patch__.py,sha256=TM9PAYPmQhbW1HY3Vx-bS5rXe132Z02TPBxFMad80Dk,552
|
|
4
|
+
groupmes/cli.py,sha256=QuNRQDf4Kvs6qQhyCLBT2aGWtCH45BzQzYAXJ9vM9cc,2812
|
|
5
|
+
groupmes/path.py,sha256=Nlxte_pPtVXEbFkCpSk8L50GJ_oKDJ7NRlTtKZ9YCms,1064
|
|
6
|
+
groupmes/border/__init__.py,sha256=TM9PAYPmQhbW1HY3Vx-bS5rXe132Z02TPBxFMad80Dk,552
|
|
7
|
+
groupmes/border/leftright.py,sha256=i_iSLvXoWe3HKctvOb_jarDsO202tWrwwCAURE9bM5s,10029
|
|
8
|
+
groupmes/border/most.py,sha256=-82HuzurQC7xrwhOCgCquN-1XtKP1FePb-tweYcvm_o,2411
|
|
9
|
+
groupmes/feature/__init__.py,sha256=kxNXE6nGBAEes4-tV7jYOvjrGAKCZ_bMBKXrB56Hgho,552
|
|
10
|
+
groupmes/feature/area.py,sha256=AslnKHbh9mS_RhScrAKRShuMje3vXAwz0elag9DKqpU,6227
|
|
11
|
+
groupmes/feature/border.py,sha256=qiQhnxid14Rff8osBNFhNDJZNBViqMy4WiVNufVntUY,3227
|
|
12
|
+
groupmes/feature/content.py,sha256=os0Nwycs8EAY8U4UdapqdmIy3BVELNf63To8_NiGsZw,1779
|
|
13
|
+
groupmes/feature/distance.py,sha256=a4gDdrwHRQkKwTzcEZP80UVI9LJuC2-UgqeO3HKJbbM,7267
|
|
14
|
+
groupmes/feature/hefopa.py,sha256=-8QVippCqHvSZhZ7C7eRpD11pn1xlmLMa8lpn0pAjqY,4337
|
|
15
|
+
groupmes-1.1.0.dist-info/METADATA,sha256=e22TPdVWcijKql3FYdiuFEdR8DNdXTBHxB9PfzUOYFk,1057
|
|
16
|
+
groupmes-1.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
17
|
+
groupmes-1.1.0.dist-info/entry_points.txt,sha256=uweQthSsMHckhQ-yFmjsFhJK8sZslWMylWxAeCTOcTs,46
|
|
18
|
+
groupmes-1.1.0.dist-info/top_level.txt,sha256=RqsOvZPqxlp_Pu6Ic-F6Z1H3MnsIIJpw3UEf7VZkJAU,9
|
|
19
|
+
groupmes-1.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
groupmes
|