groupmes 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groupmes-1.1.0/PKG-INFO +28 -0
- groupmes-1.1.0/README +5 -0
- groupmes-1.1.0/groupmes/__init__.py +21 -0
- groupmes-1.1.0/groupmes/__main__.py +13 -0
- groupmes-1.1.0/groupmes/__patch__.py +8 -0
- groupmes-1.1.0/groupmes/border/__init__.py +8 -0
- groupmes-1.1.0/groupmes/border/leftright.py +315 -0
- groupmes-1.1.0/groupmes/border/most.py +70 -0
- groupmes-1.1.0/groupmes/cli.py +81 -0
- groupmes-1.1.0/groupmes/feature/__init__.py +8 -0
- groupmes-1.1.0/groupmes/feature/area.py +221 -0
- groupmes-1.1.0/groupmes/feature/border.py +101 -0
- groupmes-1.1.0/groupmes/feature/content.py +52 -0
- groupmes-1.1.0/groupmes/feature/distance.py +237 -0
- groupmes-1.1.0/groupmes/feature/hefopa.py +126 -0
- groupmes-1.1.0/groupmes/path.py +34 -0
- groupmes-1.1.0/groupmes.egg-info/PKG-INFO +28 -0
- groupmes-1.1.0/groupmes.egg-info/SOURCES.txt +24 -0
- groupmes-1.1.0/groupmes.egg-info/dependency_links.txt +1 -0
- groupmes-1.1.0/groupmes.egg-info/entry_points.txt +2 -0
- groupmes-1.1.0/groupmes.egg-info/requires.txt +11 -0
- groupmes-1.1.0/groupmes.egg-info/top_level.txt +1 -0
- groupmes-1.1.0/pyproject.toml +100 -0
- groupmes-1.1.0/setup.cfg +4 -0
- groupmes-1.1.0/tests/test_cli.py +48 -0
- groupmes-1.1.0/tests/test_huge.py +91 -0
groupmes-1.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: groupmes
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Author-email: Helmut Konrad Schewe <helmutus@outlook.com>
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/anaticulae/groupmes
|
|
7
|
+
Project-URL: Repository, https://github.com/anaticulae/groupmes
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: iamraw<5.0.0,>=4.91.5
|
|
14
|
+
Requires-Dist: utilo<3.0.0,>=2.109.0
|
|
15
|
+
Requires-Dist: configos<2.0.0,>=1.0.4
|
|
16
|
+
Requires-Dist: konradus<2.0.0,>=1.0.1
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: utilotest<2.0.0,>=1.0.4; extra == "dev"
|
|
19
|
+
Requires-Dist: hoverpower==1.4.3; extra == "dev"
|
|
20
|
+
Requires-Dist: gennex==1.0.3; extra == "dev"
|
|
21
|
+
Requires-Dist: rawmaker==2.40.3; extra == "dev"
|
|
22
|
+
Requires-Dist: pagenumber==1.0.0; extra == "dev"
|
|
23
|
+
|
|
24
|
+
# groupmes
|
|
25
|
+
|
|
26
|
+
* chapter: split document text by chapter
|
|
27
|
+
* structure: divide document in head(toc) and tail(rest)
|
|
28
|
+
* toc: create table of content out of raw text data
|
groupmes-1.1.0/README
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
|
|
10
|
+
import importlib.metadata
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
import groupmes.__patch__
|
|
14
|
+
import groupmes.path
|
|
15
|
+
|
|
16
|
+
# Distribution name used to look up the installed package metadata.
PACKAGE = 'groupmes'
# Process name; groupmes.cli uses it as feature pack name and as producer
# of this package's own result files.
PROCESS = 'groupme'

# Version of the installed distribution, resolved at import time.
__version__ = importlib.metadata.version(PACKAGE)

# Absolute path of the directory which contains this package directory.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir))
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
|
|
10
|
+
from groupmes.cli import main

# Start the command line interface only when this module is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""left right border detector
|
|
10
|
+
==========================
|
|
11
|
+
|
|
12
|
+
The `left right border detector` (lrbd) separates borders of the left
|
|
13
|
+
and right page especially which are used for books with different border
|
|
14
|
+
width for left and right page.
|
|
15
|
+
|
|
16
|
+
As a result of different left-right borders there are alternating border
|
|
17
|
+
widths which we detect. On single pages there is no alternating border.
|
|
18
|
+
|
|
19
|
+
Currently there are two strategies to detect different page border:
|
|
20
|
+
|
|
21
|
+
* simple approach
|
|
22
|
+
* raising edge
|
|
23
|
+
|
|
24
|
+
Simple Approach
|
|
25
|
+
~~~~~~~~~~~~~~~
|
|
26
|
+
|
|
27
|
+
Some documents have exceptions on some pages. We handle this via allowed
|
|
28
|
+
errors defined with the HolyValues `FIRSTSECOND_ERROR_COUNT_MAX` and
|
|
29
|
+
`MIXED_ERROR_MIN`.
|
|
30
|
+
|
|
31
|
+
Raising Edge
|
|
32
|
+
~~~~~~~~~~~~
|
|
33
|
+
|
|
34
|
+
The change of the text feed creates a raising edge between pages. This
|
|
35
|
+
approach detects these edges to determine left and right page border
|
|
36
|
+
width.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
import dataclasses
|
|
40
|
+
import math
|
|
41
|
+
import statistics
|
|
42
|
+
import typing
|
|
43
|
+
|
|
44
|
+
import configos
|
|
45
|
+
import iamraw
|
|
46
|
+
import utilo
|
|
47
|
+
|
|
48
|
+
# Largest difference between values which still match a common group.
SIDE_DIFF_MAX = configos.HV_INT_PLUS(default=2)

# Upper limit of the first/second error in `simple`, looked up with the
# number of first-side pages. Allows exceptions caused by user defined
# errors.
FIRSTSECOND_ERROR_COUNT_MAX = configos.HolyTable(
    items=(
        (2, 0.5),
        (3, 0.35),
        (5, 0.26),
        (10, 0.21),
        (15, 0.05),
        (200, 0.01),
    ),
    right_outranges_none=False,
)

# Error which results from handling an alternating border as a single
# border; `simple` requires the mixed error to exceed this value.
MIXED_ERROR_MIN = configos.HV_PERCENT_PLUS(default=15)

# Part of the page width where the left border can be located.
LEFT_PERCENT = configos.HV_PERCENT_PLUS(default=30)

# Part of the page width where the right border can be located.
RIGHT_PERCENT = configos.HV_PERCENT_PLUS(default=30)

# Factor applied to the reference edge in `raising`; smaller edges count
# as failure.
RAISING_EDGE_MIN = configos.HV_PERCENT_PLUS(default=75)

# TODO: SHOULD WE DISABLE ALGO ON BIG FAIL COUNT?
# Upper limit of the failrate in `raising`, looked up with the number of
# edges.
RAISING_FAILRATE = configos.HolyTable(
    items=(
        (5, 1 / 5),
        (7, 2 / 7),
        (10, 3 / 10),
        (40, 10 / 40),
        (200, 40 / 200),
    ),
    right_outranges_none=False,
)
|
|
82
|
+
|
|
83
|
+
LeftRight = tuple[float, float]
|
|
84
|
+
|
|
85
|
+
DetectedBorder = typing.TypeVar('DetectedBorder', tuple[float], float)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclasses.dataclass
|
|
89
|
+
class LeftRightDetected:
|
|
90
|
+
left: DetectedBorder = None
|
|
91
|
+
right: DetectedBorder = None
|
|
92
|
+
confidence: float = 0.0
|
|
93
|
+
|
|
94
|
+
@property
|
|
95
|
+
def valid(self):
|
|
96
|
+
return isinstance(self.left, tuple)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def run(
    textpositions: iamraw.PageContentTextPositions,
    pagesizes: iamraw.PageSizeBorderList,
) -> LeftRightDetected:
    """Decide if the document has a special leftright-border or one equal
    border for every page.

    The `simple` strategy is tried first, `raising` second. If neither
    returns a result, the most common left and right border of all pages
    is returned as single border.

    Args:
        textpositions: text positions of the document pages
        pagesizes: size and border of the document pages
    Returns:
        detected border of the document
    """
    left, right = handle_emptypage(
        *determine_pageborder(textpositions, pagesizes))

    for strategy in (simple, raising):
        detected = strategy(left, right)
        if detected:
            return detected

    # no leftright border detected, use one border for every page
    return LeftRightDetected(
        left=utilo.mode(left, minimize=True),
        right=utilo.mode(right, minimize=False),
        confidence=1.0,
    )
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def simple(left: utilo.Numbers, right: utilo.Numbers) -> LeftRightDetected:
    """Determine LeftRight border based on alternating text feed.

    Items with even index form the first side, items with odd index the
    second side. The document is accepted as leftright document if all
    pages together do not match one common group while each side on its
    own does.

    This approach is limited: when one page is missing, the sides are
    mixed and therefore wrongly classified.

    Args:
        left: ordered list of left text feed
        right: ending of right text content border
    Returns:
        None if no valid LeftRightDetected was detected
        LeftRightDetected if the error is not too high
    """

    def errorrate(matched, items) -> float:
        # share of items which are not part of the matched group
        return 1 - len(matched) / len(items)

    if len(left) < 2:
        # TODO: INVESTIGATE HERE
        utilo.error('could not run simple approach')
        return None

    mixed = utilo.diff_mode(left, max_diff=SIDE_DIFF_MAX)
    first = left[::2]
    first_matched = utilo.diff_mode(first, max_diff=SIDE_DIFF_MAX)
    second = left[1::2]
    second_matched = utilo.diff_mode(second, max_diff=SIDE_DIFF_MAX)

    mixed_error = errorrate(mixed, left)
    first_error = errorrate(first_matched, first)
    second_error = errorrate(second_matched, second)

    utilo.debug(f'mixed: {mixed_error}')
    utilo.debug(f'first: {first_error}')
    utilo.debug(f'second: {second_error}')

    # TODO: DEFINE BETTER CONFIDENCE APPROACH
    limit = FIRSTSECOND_ERROR_COUNT_MAX(len(first))
    alternating = (mixed_error > MIXED_ERROR_MIN.value and
                   first_error < limit and second_error < limit)
    if not alternating:
        return None

    # NOTE(review): `run` passes minimize=False for the right border, here
    # the default of utilo.mode is used - confirm that this is intended.
    return LeftRightDetected(
        left=(
            utilo.mode(first, minimize=True),
            utilo.mode(second, minimize=True),
        ),
        right=(
            utilo.mode(right[::2]),
            utilo.mode(right[1::2]),
        ),
        confidence=1.0,
    )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def raising(left: utilo.Numbers, right: utilo.Numbers) -> LeftRightDetected:
    """Determine border by the edges of the changing left text feed.

    The two largest clusters of `left` define the reference edge. Every
    difference of `left` which is smaller than the reference edge scaled
    by `RAISING_EDGE_MIN` counts as failure. The allowed share of
    failures is defined in the lookup table `RAISING_FAILRATE`.

    Args:
        left: ordered list of left text feed
        right: ending of right text content border
    Returns:
        None if no valid LeftRightDetected was detected
        LeftRightDetected if `failrate` is not too high
    """
    left_clusters = longest_two(left)
    right_clusters = longest_two(right)
    if left_clusters is None or right_clusters is None:
        # single page document which does not contain left-right-pages
        return None

    first_left = statistics.mean(left_clusters[0])
    second_left = statistics.mean(left_clusters[1])
    first_right = statistics.mean(right_clusters[0])
    second_right = statistics.mean(right_clusters[1])

    # distance between the two most common left feeds
    edge = math.fabs(first_left - second_left)
    threshold = edge * RAISING_EDGE_MIN.value

    edges = utilo.diffs(left)
    failed = sum(1 for item in edges if item < threshold)
    if failed / len(edges) > RAISING_FAILRATE(len(edges)):
        return None

    first_left, second_left = utilo.roundme(first_left, second_left)
    first_right, second_right = utilo.roundme(first_right, second_right)

    # smaller border first, larger border second
    return LeftRightDetected(
        left=(
            min(first_left, second_left),
            max(first_left, second_left),
        ),
        right=(
            min(first_right, second_right),
            max(first_right, second_right),
        ),
        confidence=1.0,
    )
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def handle_emptypage(left, right):
    """Replace the `None` entries of empty pages with default borders.

    Missing left borders become 0.0. Missing right borders become the
    largest known right border, or 0 if no right border is known.

    Args:
        left: left border of every page, None for empty pages
        right: right border of every page, None for empty pages
    Returns:
        tuple of the filled left and right list
    """

    def fill(values, placeholder):
        return [placeholder if value is None else value for value in values]

    # TODO: THINK ABOUT IF THIS IS ENOUGH
    filled_left = fill(left, 0.0)
    # TODO: Is default=0 a good one?
    widest = max((value for value in right if value is not None), default=0)
    # NOTE: Empty pages are treated as large as the widest page - is this
    # a problem?
    return filled_left, fill(right, widest)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def determine_pageborder(textpositions, pagesizes):
    """Collect the left and right content border of every page.

    Pages without text positions or without size are marked with `None`
    in both lists.

    Args:
        textpositions: text positions of the document pages
        pagesizes: size and border of the document pages
    Returns:
        tuple of two lists: left borders and right borders
    """
    borders = []
    previous = -1
    for number, (page, size) in utilo.sync_pages([textpositions, pagesizes]):
        # pages must arrive in strictly increasing order
        assert number > previous, f'{previous} < {number}'
        previous = number
        if page and size:
            bounding = [item for item, _ in page.content.values()]
            borders.append(maximize_leftright(bounding, size))
        else:
            borders.append((None, None))
    left = [item[0] for item in borders]
    right = [item[1] for item in borders]
    return left, right
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def maximize_leftright(
    boundings: utilo.Rectangles,
    size: iamraw.PageSizeBorder,
) -> LeftRight:
    """Determine the left and right content border of a single page.

    Only boundings which start inside the left area of the page
    (`LEFT_PERCENT`) are candidates for the left border, only boundings
    which end inside the right area (`RIGHT_PERCENT`) are candidates for
    the right border. The most common candidate is selected for each
    side.

    Args:
        boundings: textpositions of defined page
        size: width and height of current page
    Returns:
        tuple with left and right content bounding
    """
    width = size.size.width
    left_max, right_min = utilo.roundme(
        width * LEFT_PERCENT,
        width * (1 - RIGHT_PERCENT),
    )
    assert left_max <= right_min, 'left and right bounds are flipped'

    left_candidates = [box[0] for box in boundings if box[0] <= left_max]
    right_candidates = [box[2] for box in boundings if box[2] >= right_min]

    # TODO: DO WE REALLY NEED THIS?
    # without candidates the page edge is used as border
    if left_candidates:
        left = utilo.mode(left_candidates, minimize=True)
    else:
        left = 0.0
    if right_candidates:
        right = utilo.mode(right_candidates, minimize=False)
    else:
        right = width
    return left, right
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
# A candidate joins a cluster if its distance to a cluster item is below
# this value.
CLUSTER_CANDIAT_DIFF_MAX = configos.HV_FLOAT_PLUS(default=2.0)


def longest_two(
    items: utilo.Numbers,
) -> tuple[utilo.Numbers, utilo.Numbers] | None:
    """Cluster `items` and return the two clusters with the most items.

    Args:
        items: numbers to cluster
    Returns:
        tuple of the largest and the second largest cluster
        None if `items` form less than two clusters
    """

    def close(candidat, clusteritem) -> bool:
        diff = math.fabs(candidat - clusteritem)
        return diff < CLUSTER_CANDIAT_DIFF_MAX

    clustered = utilo.determine_cluster(items, close)
    # largest cluster first
    result = sorted(clustered, key=len, reverse=True)
    if len(result) < 2:
        return None
    return result[0], result[1]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import dataclasses
|
|
11
|
+
import math
|
|
12
|
+
|
|
13
|
+
import iamraw
|
|
14
|
+
import utilo
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclasses.dataclass
|
|
18
|
+
class MostBoundingDetected:
|
|
19
|
+
left: float = None
|
|
20
|
+
right: float = None
|
|
21
|
+
top: float = None
|
|
22
|
+
bottom: float = None
|
|
23
|
+
confidence: float = 0.0
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def run(sizeandborder: iamraw.PageSizeBorderList) -> MostBoundingDetected:
    """Determine the most common border of all pages.

    Args:
        sizeandborder: size and border of the document pages
    Returns:
        most common left, right, top and bottom border
    """
    most = most_boundingbox([item.border for item in sizeandborder])
    # NOTE(review): the result is read as (left, right, top, bottom) here,
    # while `most_boundingbox` is commented with (x0, y0, x1, y1) -
    # confirm the layout of `item.border`.
    left, right, top, bottom = most
    assert left < right
    assert top < bottom
    return MostBoundingDetected(
        left=left,
        right=right,
        top=top,
        bottom=bottom,
    )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def most_boundingbox(
    boxes: utilo.Rectangles,
    roundme: bool = False,
) -> utilo.Rectangle:
    """Extract the most common value of every side of `boxes`.

    With `roundme` the values are rounded to full numbers before the
    most common one is selected, to make the approach more robust.
    Rounding goes in direction of the border end, to increase the
    detected rectangle so that more items fit in it.

    Args:
        boxes: rectangles to evaluate; None coordinates are skipped
        roundme: round coordinates before selecting the most common one
    Returns:
        tuple with the most common value of each of the four sides
    """
    # TODO: Think about right and left, maybe search for the top 2 borders?
    # rounding method per side: left, right, top, bottom
    sides = (math.floor, math.ceil, math.floor, math.ceil)
    modes = []
    for position, rounder in enumerate(sides):
        values = [box[position] for box in boxes if box[position] is not None]
        if roundme:
            # round to have a more robust grouping
            values = [rounder(value) for value in values]
        # TODO: REQUIRE A BETTER TIE-BREAKER
        # sides which are rounded down prefer the smaller value
        modes.append(utilo.mode(values, minimize=rounder is math.floor))
    # (x0, y0, x1, y1)
    return tuple(modes)
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
|
|
10
|
+
import utilo
|
|
11
|
+
|
|
12
|
+
import groupmes
|
|
13
|
+
|
|
14
|
+
# Description handed over to the feature pack configuration.
DESCRIPTION = 'TODO'

# Steps of this feature pack with their input files and outputs.
WORKPLAN = [
    # area
    utilo.create_step(
        'area',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='boxes_boxes'),
            utilo.ResultFile(
                producer='tablero',
                name='result_result',
                optional=True,
            ),
            utilo.ResultFile(producer='rawmaker', name='text_text'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
        ],
        output=('area',),
    ),
    # border
    utilo.create_step(
        'border',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='border_pages'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
        ],
        output=('leftright',),
    ),
    # distance, consumes the result of the `area` step
    utilo.create_step(
        'distance',
        inputs=[
            utilo.ResultFile(producer=groupmes.PROCESS, name='area_area'),
            utilo.ResultFile(producer='rawmaker', name='text_text'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
        ],
        output=('distance',),
    ),
    # content
    utilo.create_step(
        'content',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='text_text'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
            utilo.ResultFile(producer='rawmaker', name='border_pages'),
            utilo.ResultFile(producer='footnote', name='result_result'),
        ],
        output=('content',),
    ),
    # hefopa
    utilo.create_step(
        'hefopa',
        inputs=[
            utilo.ResultFile(producer='headnote', name='result_result'),
            utilo.ResultFile(producer='footnote', name='result_result'),
            utilo.ResultFile(producer='pagenumber', name='result_result'),
            utilo.ResultFile(producer='rawmaker', name='border_pages'),
        ],
        output=('result',),
    ),
]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def main():
    """Run the groupmes feature pack with the steps of `WORKPLAN`."""
    config = utilo.FeaturePackConfig(
        description=DESCRIPTION,
        multiprocessed=True,
        name=groupmes.PROCESS,
        pages=True,
        version=groupmes.__version__,
    )
    utilo.featurepack(
        workplan=WORKPLAN,
        root=groupmes.ROOT,
        featurepackage='groupmes.feature',
        config=config,
    )
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|