groupmes 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ Metadata-Version: 2.4
2
+ Name: groupmes
3
+ Version: 1.1.0
4
+ Author-email: Helmut Konrad Schewe <helmutus@outlook.com>
5
+ License-Expression: MIT
6
+ Project-URL: Homepage, https://github.com/anaticulae/groupmes
7
+ Project-URL: Repository, https://github.com/anaticulae/groupmes
8
+ Classifier: Programming Language :: Python :: 3.12
9
+ Classifier: Programming Language :: Python :: 3.13
10
+ Classifier: Programming Language :: Python :: 3.14
11
+ Requires-Python: >=3.12
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: iamraw<5.0.0,>=4.91.5
14
+ Requires-Dist: utilo<3.0.0,>=2.109.0
15
+ Requires-Dist: configos<2.0.0,>=1.0.4
16
+ Requires-Dist: konradus<2.0.0,>=1.0.1
17
+ Provides-Extra: dev
18
+ Requires-Dist: utilotest<2.0.0,>=1.0.4; extra == "dev"
19
+ Requires-Dist: hoverpower==1.4.3; extra == "dev"
20
+ Requires-Dist: gennex==1.0.3; extra == "dev"
21
+ Requires-Dist: rawmaker==2.40.3; extra == "dev"
22
+ Requires-Dist: pagenumber==1.0.0; extra == "dev"
23
+
24
+ # groupmes
25
+
26
+ * chapter: split document text by chapter
27
+ * structure: divide document into head (toc) and tail (rest)
28
+ * toc: create table of contents out of raw text data
groupmes-1.1.0/README ADDED
@@ -0,0 +1,5 @@
1
+ # groupmes
2
+
3
+ * chapter: split document text by chapter
4
+ * structure: divide document into head (toc) and tail (rest)
5
+ * toc: create table of contents out of raw text data
@@ -0,0 +1,21 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+
10
+ import importlib.metadata
11
+ import os
12
+
13
+ import groupmes.__patch__
14
+ import groupmes.path
15
+
16
# Name of the distribution; used to look up the installed metadata below.
PACKAGE = 'groupmes'
# Name of the process.
PROCESS = 'groupme'

# The version is read from the metadata of the installed distribution.
__version__ = importlib.metadata.version(PACKAGE)

# Absolute path of the directory which contains this package directory.
_PACKAGE_DIRECTORY = os.path.dirname(__file__)
ROOT = os.path.abspath(os.path.join(_PACKAGE_DIRECTORY, '..'))
@@ -0,0 +1,13 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+
10
+ from groupmes.cli import main
11
+
12
# Start the command line interface only when this module is executed
# directly; importing the module must not trigger a run.
if __name__ == '__main__':
    main()
@@ -0,0 +1,8 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
@@ -0,0 +1,8 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
@@ -0,0 +1,315 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """left right border detector
10
+ ==========================
11
+
12
+ The `left right border detector` (lrbd) separates the borders of left
13
+ and right pages. This is especially useful for books which use a
14
+ different border width on left and right pages.
15
+
16
+ As a result of different left-right borders there are alternating border
17
+ widths which we detect. Single-page documents have no alternating borders.
18
+
19
+ Currently there are two strategies to detect different page border:
20
+
21
+ * simple approach
22
+ * raising edge
23
+
24
+ Simple Approach
25
+ ~~~~~~~~~~~~~~~
26
+
27
+ Some documents have exceptions on some pages. We handle this via allowed
28
+ errors defined with the HolyValues `FIRSTSECOND_ERROR_COUNT_MAX` and
29
+ `MIXED_ERROR_MIN`.
30
+
31
+ Raising Edge
32
+ ~~~~~~~~~~~~
33
+
34
+ The change of the text feed creates a raising edge between pages. This
35
+ approach detects these edges to determine left and right page border
36
+ width.
37
+ """
38
+
39
+ import dataclasses
40
+ import math
41
+ import statistics
42
+ import typing
43
+
44
+ import configos
45
+ import iamraw
46
+ import utilo
47
+
48
+ # max diff to match in common group.
49
# Maximum difference between two values to match them into a common group.
SIDE_DIFF_MAX = configos.HV_INT_PLUS(default=2)

# Exceptions which are allowed because of user defined errors. `simple`
# calls this table with the number of pages of the first side and uses
# the result as upper limit for the error rate of each side.
# NOTE(review): lookup/interpolation between the rows is defined by
# `configos.HolyTable` - confirm there.
FIRSTSECOND_ERROR_COUNT_MAX = configos.HolyTable(
    items=(
        (2, 0.50),
        (3, 0.35),
        (5, 0.26),
        (10, 0.21),
        (15, 0.05),
        (200, 0.01),
    ),
    right_outranges_none=False,
)

# Errors which are a result of handling alternating borders as a single
# border. `simple` requires a mixed error above this value.
MIXED_ERROR_MIN = configos.HV_PERCENT_PLUS(default=15)

# Area where the left border can be located (share of the page width).
LEFT_PERCENT = configos.HV_PERCENT_PLUS(default=30)
# Area where the right border can be located (share of the page width).
RIGHT_PERCENT = configos.HV_PERCENT_PLUS(default=30)

# Share of the detected edge which a difference between two neighbouring
# pages must reach; smaller differences are counted as failure by `raising`.
RAISING_EDGE_MIN = configos.HV_PERCENT_PLUS(default=75)

# TODO: SHOULD WE DISABLE ALGO ON BIG FAIL COUNT?
# Every row is (number of edges, allowed failures / number of edges).
RAISING_FAILRATE = configos.HolyTable(
    items=tuple(
        (count, failures / count) for count, failures in (
            (5, 1),
            (7, 2),
            (10, 3),
            (40, 10),
            (200, 40),
        )),
    right_outranges_none=False,
)

# Left and right content border of a single page.
LeftRight = tuple[float, float]

# Border value of a detection result.
DetectedBorder = typing.TypeVar('DetectedBorder', tuple[float], float)
86
+
87
+
88
+ @dataclasses.dataclass
89
+ class LeftRightDetected:
90
+ left: DetectedBorder = None
91
+ right: DetectedBorder = None
92
+ confidence: float = 0.0
93
+
94
+ @property
95
+ def valid(self):
96
+ return isinstance(self.left, tuple)
97
+
98
+
99
def run(
    textpositions: iamraw.PageContentTextPositions,
    pagesizes: iamraw.PageSizeBorderList,
) -> LeftRightDetected:
    """Detect whether the document uses alternating left/right page
    borders or one common border for every page.

    The strategies `simple` and `raising` are tried in this order and the
    first result wins. If both fail, the common border is determined via
    `utilo.mode` over all pages.

    Args:
        textpositions: text positions of the document pages
        pagesizes: size and border of the document pages
    Returns:
        LeftRightDetected with tuple borders for an alternating border
        or with single-value borders for a common border
    """
    left, right = handle_emptypage(
        *determine_pageborder(textpositions, pagesizes))

    # alternating border: the first successful strategy decides
    for strategy in (simple, raising):
        detected = strategy(left, right)
        if detected:
            return detected

    # fallback: every page shares the same border
    return LeftRightDetected(
        left=utilo.mode(left, minimize=True),
        right=utilo.mode(right, minimize=False),
        confidence=1.0,
    )
123
+
124
+
125
def simple(left: utilo.Numbers, right: utilo.Numbers) -> LeftRightDetected:
    """Determine the LeftRight border based on the changing text feed.

    Entries with an even index are treated as the first side, entries
    with an odd index as the second side.

    This approach is limited when one page is missing: from there on the
    sides are mixed up and therefore wrongly classified.

    Args:
        left: ordered list of left text feed
        right: ending of right text content border
    Returns:
        None if no valid LeftRightDetected was detected
        LeftRightDetected if the failrate is not too high
    """
    if len(left) <= 1:
        # TODO: INVESTIGATE HERE
        utilo.error('could not run simple approach')
        return None

    # split the text feed into the two alternating sides
    first, second = left[::2], left[1::2]

    mixed = utilo.diff_mode(left, max_diff=SIDE_DIFF_MAX)
    first_matched = utilo.diff_mode(first, max_diff=SIDE_DIFF_MAX)
    second_matched = utilo.diff_mode(second, max_diff=SIDE_DIFF_MAX)

    # share of entries which are missing in the `diff_mode` result
    mixed_error = 1 - len(mixed) / len(left)
    first_error = 1 - len(first_matched) / len(first)
    second_error = 1 - len(second_matched) / len(second)

    utilo.debug(f'mixed: {mixed_error}')
    utilo.debug(f'first: {first_error}')
    utilo.debug(f'second: {second_error}')

    # TODO: DEFINE BETTER CONFIDENCE APPROACH
    max_firstsecond_error = FIRSTSECOND_ERROR_COUNT_MAX(len(first))
    # all pages as one group must fail clearly ...
    if not mixed_error > MIXED_ERROR_MIN.value:
        return None
    # ... while every single side must fit below the allowed error
    sides_fit = (first_error < max_firstsecond_error
                 and second_error < max_firstsecond_error)
    if not sides_fit:
        return None

    first_left = utilo.mode(first, minimize=True)
    second_left = utilo.mode(second, minimize=True)
    # NOTE(review): the right side relies on the default `minimize` of
    # `utilo.mode` - confirm that this is intended.
    first_right = utilo.mode(right[::2])
    second_right = utilo.mode(right[1::2])
    return LeftRightDetected(
        left=(first_left, second_left),
        right=(first_right, second_right),
        confidence=1.0,
    )
181
+
182
+
183
def raising(left: utilo.Numbers, right: utilo.Numbers) -> LeftRightDetected:
    """Determine the border depending on the changing text feed of the
    left page border.

    The two largest clusters of `left` and of `right` are taken as the
    two page sides. The distance between the means of the two left
    clusters is the expected edge. Differences of `left` (via
    `utilo.diffs`) below `RAISING_EDGE_MIN` of this edge count as failure.

    This approach has no problems when one or more leftright pages are
    missing. The limit of failures is defined in the lookup table
    `RAISING_FAILRATE`.

    Args:
        left: ordered list of left text feed
        right: ending of right text content border
    Returns:
        None if no valid LeftRightDetected was detected
        LeftRightDetected if `failrate` is not too high
    """
    clusters_left = longest_two(left)
    clusters_right = longest_two(right)
    if clusters_left is None or clusters_right is None:
        # single page document which does not contain left-right-pages
        return None

    first_left = statistics.mean(clusters_left[0])
    second_left = statistics.mean(clusters_left[1])
    edge = math.fabs(first_left - second_left)

    first_right = statistics.mean(clusters_right[0])
    second_right = statistics.mean(clusters_right[1])

    # count page changes whose edge is smaller than required
    edges = utilo.diffs(left)
    required = edge * RAISING_EDGE_MIN.value
    failed = sum(1 for item in edges if item < required)
    failrate = failed / len(edges)
    if failrate > RAISING_FAILRATE(len(edges)):
        return None

    first_left, second_left = utilo.roundme(first_left, second_left)
    first_right, second_right = utilo.roundme(first_right, second_right)

    # smaller value first, larger value second
    return LeftRightDetected(
        left=(
            min(first_left, second_left),
            max(first_left, second_left),
        ),
        right=(
            min(first_right, second_right),
            max(first_right, second_right),
        ),
        confidence=1.0,
    )
240
+
241
+
242
def handle_emptypage(left, right):
    """Replace the `None` entries of empty pages by default borders.

    Args:
        left: left borders, None marks a page without border
        right: right borders, None marks a page without border
    Returns:
        tuple of two new lists: `left` with None replaced by 0.0 and
        `right` with None replaced by the largest known right border
        (0 if there is none)
    """
    # TODO: THINK ABOUT IF THIS IS ENOUGH
    # ignore empty pages
    left_none = 0.0
    filled_left = [left_none if item is None else item for item in left]
    # TODO: Is default=0 a good one?
    known_right = [item for item in right if item is not None]
    right_none = max(known_right, default=0)
    # NOTE: Determine more pages as large than it really are - is this a
    # problem?
    filled_right = [right_none if item is None else item for item in right]
    return filled_left, filled_right
253
+
254
+
255
def determine_pageborder(textpositions, pagesizes):
    """Collect the left and right content border of every page.

    Pages are iterated via `utilo.sync_pages`. A page without text
    positions or without size gets None for both borders.

    Args:
        textpositions: text positions of the document pages
        pagesizes: size and border of the document pages
    Returns:
        tuple of two lists (left, right) with one entry per page
    """
    left, right = [], []
    before = -1
    for current, (page, size) in utilo.sync_pages([textpositions, pagesizes]):
        # pages must be delivered with strictly increasing page number
        assert current > before, f'{before} < {current}'
        before = current
        if page and size:
            # keep the first element of every content value
            bounding = [item for item, _ in page.content.values()]
            page_left, page_right = maximize_leftright(bounding, size)
        else:
            # empty page
            page_left = page_right = None
        left.append(page_left)
        right.append(page_right)
    return left, right
271
+
272
+
273
def maximize_leftright(
    boundings: utilo.Rectangles,
    size: iamraw.PageSizeBorder,
) -> LeftRight:
    """Determine the left and right border of a page via `utilo.mode`.

    Only positions inside the configured border areas are considered:
    index 0 of a bounding must not exceed `LEFT_PERCENT` of the page
    width, index 2 must reach at least `1 - RIGHT_PERCENT` of the page
    width. If no bounding lies in an area, the left border defaults to
    0.0 and the right border to the page width.

    Args:
        boundings: textpositions of defined page
        size: width and height of current page
    Returns:
        tuple with left and right content bounding
    """
    width = size.size.width
    left_max, right_min = utilo.roundme(
        width * LEFT_PERCENT,
        width * (1 - RIGHT_PERCENT),
    )
    assert left_max <= right_min, 'left and right bounds are flipped'

    left_candidates = [box[0] for box in boundings if box[0] <= left_max]
    right_candidates = [box[2] for box in boundings if box[2] >= right_min]

    # TODO: DO WE REALLY NEED THIS?
    if left_candidates:
        left = utilo.mode(left_candidates, minimize=True)
    else:
        left = 0.0
    if right_candidates:
        right = utilo.mode(right_candidates, minimize=False)
    else:
        right = width
    return left, right
300
+
301
+
302
# Maximum distance between a candidate and a cluster item to be treated
# as close to each other.
CLUSTER_CANDIAT_DIFF_MAX = configos.HV_FLOAT_PLUS(default=2.0)


def longest_two(
    items: utilo.Numbers,
) -> tuple[typing.Sequence[float], typing.Sequence[float]] | None:
    """Cluster `items` and return the two clusters with the most members.

    Clustering is done by `utilo.determine_cluster`; two values are
    passed as close when their absolute difference is below
    `CLUSTER_CANDIAT_DIFF_MAX`.

    Args:
        items: numbers to cluster
    Returns:
        tuple with the largest and the second largest cluster
        None if less than two clusters were determined
    """

    def close(candidat, clusteritem) -> bool:
        # NOTE(review): compares against the HolyValue itself, other
        # places read `.value` - confirm that both are equivalent.
        diff = math.fabs(candidat - clusteritem)
        return diff < CLUSTER_CANDIAT_DIFF_MAX

    # NOTE(review): the exact cluster type is defined by
    # `utilo.determine_cluster`; clusters only need to support `len` here.
    clustered = utilo.determine_cluster(items, close)
    result = sorted(clustered, key=len, reverse=True)
    if len(result) < 2:
        return None
    return result[0], result[1]
@@ -0,0 +1,70 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import dataclasses
11
+ import math
12
+
13
+ import iamraw
14
+ import utilo
15
+
16
+
17
+ @dataclasses.dataclass
18
+ class MostBoundingDetected:
19
+ left: float = None
20
+ right: float = None
21
+ top: float = None
22
+ bottom: float = None
23
+ confidence: float = 0.0
24
+
25
+
26
def run(sizeandborder: iamraw.PageSizeBorderList) -> MostBoundingDetected:
    """Determine the most common border of all pages.

    Args:
        sizeandborder: pages with a `border` attribute
    Returns:
        MostBoundingDetected with the most common value of every side
    """
    most = most_boundingbox([item.border for item in sizeandborder])
    # NOTE(review): the result is consumed as (left, right, top, bottom);
    # confirm that this matches the layout of `item.border`.
    left, right, top, bottom = most
    assert left < right
    assert top < bottom
    return MostBoundingDetected(
        left=left,
        right=right,
        top=top,
        bottom=bottom,
    )
41
+
42
+
43
def most_boundingbox(
    boxes: utilo.Rectangles,
    roundme: bool = False,
) -> utilo.Rectangle:
    """Extract the most common value of every side out of `boxes`.

    None entries are skipped. With `roundme` the values are rounded to
    full numbers before the most common value is selected: index 0 and 2
    are rounded down, index 1 and 3 are rounded up. This makes the
    approach more robust and increases the detected rectangle, so that
    more items fit in it.

    Args:
        boxes: rectangles with four entries, entries may be None
        roundme: if True, round values before selecting the mode
    Returns:
        tuple with the most common value for each of the four indexes
    """
    # TODO: Think about right and left, maybe search for the top 2 borders?
    # One (rounding method, minimize) pair per index. `minimize` is passed
    # on to `utilo.mode` and is True wherever values are rounded down.
    # TODO: REQUIRE A BETTER TY-BREAKER
    sides = (
        (math.floor, True),
        (math.ceil, False),
        (math.floor, True),
        (math.ceil, False),
    )
    result = []
    for index, (method, minimize) in enumerate(sides):
        # remove None items
        values = [item[index] for item in boxes if item[index] is not None]
        if roundme:
            # round to have a more robust grouping
            values = [method(value) for value in values]
        # the most common value is taken as the required border
        result.append(utilo.mode(values, minimize=minimize))
    return tuple(result)
@@ -0,0 +1,81 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+
10
+ import utilo
11
+
12
+ import groupmes
13
+
14
DESCRIPTION = 'TODO'


def _rawmaker(name):
    """Create the input description of a result produced by `rawmaker`."""
    return utilo.ResultFile(producer='rawmaker', name=name)


# Steps of this feature pack, each with its input files and output names.
WORKPLAN = [
    utilo.create_step(
        'area',
        inputs=[
            _rawmaker('boxes_boxes'),
            utilo.ResultFile(
                producer='tablero',
                name='result_result',
                optional=True,
            ),
            _rawmaker('text_text'),
            _rawmaker('text_positions'),
        ],
        output=('area',),
    ),
    utilo.create_step(
        'border',
        inputs=[
            _rawmaker('border_pages'),
            _rawmaker('text_positions'),
        ],
        output=('leftright',),
    ),
    utilo.create_step(
        'distance',
        inputs=[
            # result of the `area` step of this process
            utilo.ResultFile(producer=groupmes.PROCESS, name='area_area'),
            _rawmaker('text_text'),
            _rawmaker('text_positions'),
        ],
        output=('distance',),
    ),
    utilo.create_step(
        'content',
        inputs=[
            _rawmaker('text_text'),
            _rawmaker('text_positions'),
            _rawmaker('border_pages'),
            utilo.ResultFile(producer='footnote', name='result_result'),
        ],
        output=('content',),
    ),
    utilo.create_step(
        'hefopa',
        inputs=[
            utilo.ResultFile(producer='headnote', name='result_result'),
            utilo.ResultFile(producer='footnote', name='result_result'),
            utilo.ResultFile(producer='pagenumber', name='result_result'),
            _rawmaker('border_pages'),
        ],
        output=('result',),
    ),
]
67
+
68
+
69
def main():
    """Command line entry point: hand `WORKPLAN` and the configuration of
    this process over to `utilo.featurepack`."""
    config = utilo.FeaturePackConfig(
        description=DESCRIPTION,
        multiprocessed=True,
        name=groupmes.PROCESS,
        pages=True,
        version=groupmes.__version__,
    )
    utilo.featurepack(
        workplan=WORKPLAN,
        root=groupmes.ROOT,
        featurepackage='groupmes.feature',
        config=config,
    )
@@ -0,0 +1,8 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================