groupmes 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
groupmes/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+
10
+ import importlib.metadata
11
+ import os
12
+
13
+ import groupmes.__patch__
14
+ import groupmes.path
15
+
16
# Name of the distribution; used to look up the installed version below.
PACKAGE = 'groupmes'
# Name of the process; the command line interface uses it as feature
# pack name and as producer of the intermediate `area_area` result.
PROCESS = 'groupme'

# Version is read from the metadata of the installed distribution.
__version__ = importlib.metadata.version(PACKAGE)

# Absolute path of the directory which contains this package directory.
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
groupmes/__main__.py ADDED
@@ -0,0 +1,13 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+
10
+ from groupmes.cli import main
11
+
12
# Start the command line interface only when this module is executed as
# a script, not when it is imported.
if __name__ == "__main__":
    main()
groupmes/__patch__.py ADDED
@@ -0,0 +1,8 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
@@ -0,0 +1,8 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
@@ -0,0 +1,315 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """left right border detector
10
+ ==========================
11
+
12
+ The `left right border detector` (lrbd) separates the borders of the left
13
+ and right page. This is especially useful for books which use a different
14
+ border width for left and right pages.
15
+
16
+ As a result of different left-right borders there are alternating border
17
+ widths which we detect. On single pages there are no alternating borders.
18
+
19
+ Currently there are two strategies to detect different page border:
20
+
21
+ * simple approach
22
+ * raising edge
23
+
24
+ Simple Approach
25
+ ~~~~~~~~~~~~~~~
26
+
27
+ Some documents have exceptions on some pages. We handle this via allowed
28
+ errors defined with the HolyValues `FIRSTSECOND_ERROR_COUNT_MAX` and
29
+ `MIXED_ERROR_MIN`.
30
+
31
+ Raising Edge
32
+ ~~~~~~~~~~~~
33
+
34
+ The change of the text feed creates a raising edge between pages. This
35
+ approach detects these edges to determine left and right page border
36
+ width.
37
+ """
38
+
39
+ import dataclasses
40
+ import math
41
+ import statistics
42
+ import typing
43
+
44
+ import configos
45
+ import iamraw
46
+ import utilo
47
+
48
# Max diff to match in a common group; passed as `max_diff` to
# `utilo.diff_mode` in `simple`.
SIDE_DIFF_MAX = configos.HV_INT_PLUS(default=2)
# Exceptions which are allowed because of user defined errors. `simple`
# calls the table with the number of items of the first side and uses
# the result as upper limit for the first/second error rate.
# NOTE(review): items look like (count, max error rate) pairs - confirm
# the lookup semantics against `configos.HolyTable`.
FIRSTSECOND_ERROR_COUNT_MAX = configos.HolyTable(
    items=(
        (2, 0.5),
        (3, 0.35),
        (5, 0.26),
        (10, .21),
        (15, 0.05),
        (200, 0.01),
    ),
    right_outranges_none=False,
)
# Errors which are a result of handling alternating borders as a single
# border. `simple` requires the mixed error rate to exceed this value.
MIXED_ERROR_MIN = configos.HV_PERCENT_PLUS(default=15)
# Area where the left border can be located.
LEFT_PERCENT = configos.HV_PERCENT_PLUS(default=30)
# Area where the right border can be located.
RIGHT_PERCENT = configos.HV_PERCENT_PLUS(default=30)

# Share of the detected edge which a difference between neighbouring
# pages has to reach to not be counted as failure in `raising`.
RAISING_EDGE_MIN = configos.HV_PERCENT_PLUS(default=75)

# TODO: SHOULD WE DISABLE ALGO ON BIG FAIL COUNT?
# `raising` calls the table with the number of edges and rejects the
# detection if the fail rate is above the returned value.
RAISING_FAILRATE = configos.HolyTable(
    items=(
        (5, 1 / 5),
        (7, 2 / 7),
        (10, 3 / 10),
        (40, 10 / 40),
        (200, 40 / 200),
    ),
    right_outranges_none=False,
)

# Left and right content border of a single page.
LeftRight = tuple[float, float]

# Detected border: a tuple for alternating borders or a single number.
DetectedBorder = typing.TypeVar('DetectedBorder', tuple[float], float)
86
+
87
+
88
@dataclasses.dataclass
class LeftRightDetected:
    """Result of the left-right border detection.

    `left` and `right` contain either a single border which is used for
    every page or a tuple with two alternating borders.
    """
    # left border, tuple if alternating borders were detected
    left: DetectedBorder = None
    # right border, tuple if alternating borders were detected
    right: DetectedBorder = None
    # confidence of the detection
    confidence: float = 0.0

    @property
    def valid(self):
        """True if the left border is a tuple of alternating borders."""
        alternating = isinstance(self.left, tuple)
        return alternating
97
+
98
+
99
def run(
    textpositions: iamraw.PageContentTextPositions,
    pagesizes: iamraw.PageSizeBorderList,
) -> LeftRightDetected:
    """Detect whether the document uses alternating left-right borders
    or one common border for every page.

    The strategies `simple` and `raising` are tried in this order. If
    none of them detects alternating borders, the most common left and
    right border is returned as single border.

    Args:
        textpositions: text positions of the document pages
        pagesizes: size and border of the document pages
    Returns:
        detected left and right border
    """
    left, right = determine_pageborder(textpositions, pagesizes)
    left, right = handle_emptypage(left, right)

    # the first strategy which detects something wins
    for strategy in (simple, raising):
        detected = strategy(left, right)
        if detected:
            return detected

    # no alternating border, fall back to the common border
    return LeftRightDetected(
        left=utilo.mode(left, minimize=True),
        right=utilo.mode(right, minimize=False),
        confidence=1.0,
    )
123
+
124
+
125
def simple(left: utilo.Numbers, right: utilo.Numbers) -> LeftRightDetected:
    """Determine the left-right border based on the changing text feed.

    The items on even list positions form the first side, the items on
    odd list positions form the second side. Alternating borders are
    assumed if handling all pages as one group produces many errors
    while each side on its own matches well.

    This approach is limited when a page is missing, because the sides
    are mixed afterwards and therefore wrongly classified.

    Args:
        left: ordered list of left text feed
        right: ending of right text content border
    Returns:
        None if no valid LeftRightDetected was detected
        LeftRightDetected if the fail rate is not too high
    """
    if len(left) <= 1:
        # TODO: INVESTIGATE HERE
        utilo.error('could not run simple approach')
        return None

    def error_rate(values):
        # share of values which do not match the most common group
        matched = utilo.diff_mode(values, max_diff=SIDE_DIFF_MAX)
        return 1 - len(matched) / len(values)

    first, second = left[::2], left[1::2]
    mixed_error = error_rate(left)
    first_error = error_rate(first)
    second_error = error_rate(second)

    utilo.debug(f'mixed: {mixed_error}')
    utilo.debug(f'first: {first_error}')
    utilo.debug(f'second: {second_error}')

    # TODO: DEFINE BETTER CONFIDENCE APPROACH
    max_firstsecond_error = FIRSTSECOND_ERROR_COUNT_MAX(len(first))
    alternating = (mixed_error > MIXED_ERROR_MIN.value and
                   first_error < max_firstsecond_error and
                   second_error < max_firstsecond_error)
    if not alternating:
        return None

    return LeftRightDetected(
        left=(
            utilo.mode(first, minimize=True),
            utilo.mode(second, minimize=True),
        ),
        right=(
            utilo.mode(right[::2]),
            utilo.mode(right[1::2]),
        ),
        confidence=1.0,
    )
181
+
182
+
183
def raising(left: utilo.Numbers, right: utilo.Numbers) -> LeftRightDetected:
    """Determine the border depending on the changing text feed of the
    left page border.

    The two largest clusters of `left` define the expected edge between
    neighbouring pages. Every difference between neighbouring pages
    which is smaller than the required share of this edge counts as
    failure. Missing left-right pages are tolerated up to the limit
    defined in the lookup table `RAISING_FAILRATE`.

    Args:
        left: ordered list of left text feed
        right: ending of right text content border
    Returns:
        None if no valid LeftRightDetected was detected
        LeftRightDetected if `failrate` is not too high
    """
    longest_left = longest_two(left)
    longest_right = longest_two(right)
    if longest_left is None or longest_right is None:
        # single page document which does not contain left-right-pages
        return None

    first_left, second_left = map(statistics.mean, longest_left)
    edge = math.fabs(first_left - second_left)
    first_right, second_right = map(statistics.mean, longest_right)

    edges = utilo.diffs(left)
    edge_min = edge * RAISING_EDGE_MIN.value
    failed = sum(1 for item in edges if item < edge_min)
    failrate = failed / len(edges)
    if failrate > RAISING_FAILRATE(len(edges)):
        return None

    first_left, second_left = utilo.roundme(first_left, second_left)
    first_right, second_right = utilo.roundme(first_right, second_right)

    # the smaller border is stored first
    return LeftRightDetected(
        left=(min(first_left, second_left), max(first_left, second_left)),
        right=(
            min(first_right, second_right),
            max(first_right, second_right),
        ),
        confidence=1.0,
    )
240
+
241
+
242
def handle_emptypage(left, right):
    """Replace the borders of empty pages, marked as None, with defaults.

    Missing left borders become 0.0. Missing right borders become the
    largest known right border, or 0 if no right border is known.

    Args:
        left: left borders, None for empty pages
        right: right borders, None for empty pages
    Returns:
        tuple with the left and the right list without None items
    """
    # TODO: THINK ABOUT IF THIS IS ENOUGH
    left_none = 0.0
    left = [left_none if item is None else item for item in left]
    # TODO: Is default=0 a good one?
    known = [item for item in right if item is not None]
    right_none = max(known, default=0)
    # NOTE: Empty pages are treated as wide as the widest page - is this
    # a problem?
    right = [right_none if item is None else item for item in right]
    return left, right
253
+
254
+
255
def determine_pageborder(textpositions, pagesizes):
    """Collect the left and right content border of every page.

    Pages without text positions or without size are stored as None in
    both lists.

    Args:
        textpositions: text positions of the document pages
        pagesizes: size and border of the document pages
    Returns:
        tuple with the list of left and the list of right borders
    """
    left, right = [], []
    before = -1
    for current, (page, size) in utilo.sync_pages([textpositions, pagesizes]):
        # pages have to be delivered in strictly increasing order
        assert current > before, f'{before} < {current}'
        before = current
        if page and size:
            bounding = [item for item, _ in page.content.values()]
            page_left, page_right = maximize_leftright(bounding, size)
        else:
            page_left = page_right = None
        left.append(page_left)
        right.append(page_right)
    return left, right
271
+
272
+
273
def maximize_leftright(
    boundings: utilo.Rectangles,
    size: iamraw.PageSizeBorder,
) -> LeftRight:
    """Determine the left and right content border of a single page.

    Only boundings which start inside the left area, limited by
    `LEFT_PERCENT`, are candidates for the left border. Only boundings
    which end inside the right area, limited by `RIGHT_PERCENT`, are
    candidates for the right border. The most common candidate is
    selected via `utilo.mode`.

    Args:
        boundings: textpositions of defined page
        size: width and height of current page
    Returns:
        tuple with left and right content bounding; 0.0 and the page
        width are used if there is no candidate
    """
    width = size.size.width
    left_max, right_min = utilo.roundme(
        width * LEFT_PERCENT,
        width * (1 - RIGHT_PERCENT),
    )
    assert left_max <= right_min, 'left and right bounds are flipped'

    left_candidates = [item[0] for item in boundings if item[0] <= left_max]
    right_candidates = [
        item[2] for item in boundings if item[2] >= right_min
    ]

    # TODO: DO WE REALLY NEED THIS?
    if left_candidates:
        left = utilo.mode(left_candidates, minimize=True)
    else:
        left = 0.0
    if right_candidates:
        right = utilo.mode(right_candidates, minimize=False)
    else:
        right = width
    return left, right
300
+
301
+
302
# Absolute difference below which `longest_two` treats a candidate and
# a cluster item as close.
CLUSTER_CANDIAT_DIFF_MAX = configos.HV_FLOAT_PLUS(default=2.0)
303
+
304
+
305
def longest_two(items: utilo.Numbers) -> tuple[float, float]:
    """Cluster `items` and select the two clusters with the most items.

    Args:
        items: numbers to cluster
    Returns:
        None if there are less than two clusters
        tuple with the largest and the second largest cluster
    """

    def close(candidat, clusteritem) -> bool:
        return math.fabs(candidat - clusteritem) < CLUSTER_CANDIAT_DIFF_MAX

    clustered = utilo.determine_cluster(items, close)
    by_size = sorted(clustered, key=len, reverse=True)
    if len(by_size) < 2:
        return None
    largest, second = by_size[0], by_size[1]
    return largest, second
@@ -0,0 +1,70 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import dataclasses
11
+ import math
12
+
13
+ import iamraw
14
+ import utilo
15
+
16
+
17
+ @dataclasses.dataclass
18
+ class MostBoundingDetected:
19
+ left: float = None
20
+ right: float = None
21
+ top: float = None
22
+ bottom: float = None
23
+ confidence: float = 0.0
24
+
25
+
26
def run(sizeandborder: iamraw.PageSizeBorderList) -> MostBoundingDetected:
    """Determine the most common border of the given pages.

    Args:
        sizeandborder: size and border of the document pages
    Returns:
        most common left, right, top and bottom border
    """
    most = most_boundingbox([item.border for item in sizeandborder])
    # the result is ordered as left, right, top, bottom
    assert most[0] < most[1]
    assert most[2] < most[3]
    left, right, top, bottom = most
    return MostBoundingDetected(
        left=left,
        right=right,
        top=top,
        bottom=bottom,
    )
41
+
42
+
43
def most_boundingbox(
    boxes: utilo.Rectangles,
    roundme: bool = False,
) -> utilo.Rectangle:
    """Extract the most common value of every side of `boxes`.

    None entries of a side are ignored. If `roundme` is set, the values
    are rounded to full numbers before the most common one is selected:
    the first and third side are rounded down, the second and fourth
    side are rounded up.

    Args:
        boxes: rectangles to inspect
        roundme: round the values to make the grouping more robust
    Returns:
        tuple with the most common value of every side
    """
    # TODO: Think about right and left, maybe search for the top 2 borders?
    # rounding method and tie direction for left, right, top, bottom
    sides = (
        (math.floor, True),
        (math.ceil, False),
        (math.floor, True),
        (math.ceil, False),
    )
    result = []
    for index, (method, minimize) in enumerate(sides):
        values = [item[index] for item in boxes if item[index] is not None]
        if roundme:
            values = [method(item) for item in values]
        # TODO: REQUIRE A BETTER TIE-BREAKER
        result.append(utilo.mode(values, minimize=minimize))
    return tuple(result)
groupmes/cli.py ADDED
@@ -0,0 +1,81 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+
10
+ import utilo
11
+
12
+ import groupmes
13
+
14
# Description passed to the feature pack configuration (placeholder).
DESCRIPTION = 'TODO'

# Steps of this feature pack. Every step lists the result files which
# it consumes (`inputs`) and the names of the results which it produces
# (`output`).
WORKPLAN = [
    utilo.create_step(
        'area',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='boxes_boxes'),
            # tables are optional, see the existence check in
            # `groupmes.feature.area.load`
            utilo.ResultFile(producer='tablero',
                             name='result_result',
                             optional=True),
            utilo.ResultFile(producer='rawmaker', name='text_text'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
        ],
        output=('area',),
    ),
    utilo.create_step(
        'border',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='border_pages'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
        ],
        output=('leftright',),
    ),
    utilo.create_step(
        'distance',
        inputs=[
            # consumes the result of the `area` step of this process
            utilo.ResultFile(producer=groupmes.PROCESS, name='area_area'),
            utilo.ResultFile(producer='rawmaker', name='text_text'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
        ],
        output=('distance',),
    ),
    utilo.create_step(
        'content',
        inputs=[
            utilo.ResultFile(producer='rawmaker', name='text_text'),
            utilo.ResultFile(producer='rawmaker', name='text_positions'),
            utilo.ResultFile(producer='rawmaker', name='border_pages'),
            utilo.ResultFile(producer='footnote', name='result_result'),
        ],
        output=('content',),
    ),
    utilo.create_step(
        'hefopa',
        inputs=[
            utilo.ResultFile(producer='headnote', name='result_result'),
            utilo.ResultFile(producer='footnote', name='result_result'),
            utilo.ResultFile(producer='pagenumber', name='result_result'),
            utilo.ResultFile(producer='rawmaker', name='border_pages'),
        ],
        output=('result',),
    )
]
67
+
68
+
69
def main():
    """Run the `groupmes` feature pack with the steps of `WORKPLAN`."""
    config = utilo.FeaturePackConfig(
        description=DESCRIPTION,
        multiprocessed=True,
        name=groupmes.PROCESS,
        pages=True,
        version=groupmes.__version__,
    )
    utilo.featurepack(
        workplan=WORKPLAN,
        root=groupmes.ROOT,
        featurepackage='groupmes.feature',
        config=config,
    )
@@ -0,0 +1,8 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
@@ -0,0 +1,221 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """Textual Area
10
+ ============
11
+
12
+ TODO:
13
+ * table of content
14
+ * images
15
+ * reference table
16
+
17
+ """
18
+
19
+ import collections
20
+ import os
21
+
22
+ import configos
23
+ import serializeraw
24
+ import utilo
25
+
26
# Passed as `max_diff` to `utilo.RectangleCheck` when checking if a
# text is located inside a box or a table.
RECTANGLE_DIFF_MAX = configos.HV_FLOAT_PLUS(default=10.0)

# Loaded inputs which are required to group the areas, see `load`.
RequiredResources = collections.namedtuple(
    'RequiredResources',
    'textnavigator, tables, boxes',
)

# Grouped areas of a single page:
#   page: page number
#   textual: rectangles of the textual content
#   outside: rectangles of the text inside of boxes and tables
#   border: rectangles of the boxes and tables
PageContentTextualArea = collections.namedtuple(
    'PageContentTextualArea',
    'page, textual, outside, border',
)
PageContentTextualAreas = list[PageContentTextualArea]
38
+
39
+
40
def work(
    boxes: str,
    tables: str,
    text: str,
    textpositions: str,
    pages: tuple = None,
) -> str:
    """Load the inputs, group the areas of every page and dump them.

    Args:
        boxes(str): path to extracted `rawmaker` content boxes
        tables(str): path to extracted `tablero` tables
        text(str): extracted `rawmaker` text
        textpositions(str): positions of extracted text
        pages(tuple): tuple of pages to process
    Returns:
        Dumped extracted areas.
    """
    resources = load(
        boxes=boxes,
        tables=tables,
        text=text,
        textpositions=textpositions,
        pages=pages,
    )
    return dump_area(group_areas(loaded=resources))
70
+
71
+
72
def group_areas(loaded: RequiredResources) -> PageContentTextualAreas:
    """Group the areas of every page of the loaded text navigators.

    Args:
        loaded: text navigators, tables and boxes of the document
    Returns:
        list with the grouped areas of every page
    """
    areas = []
    for navigator in loaded.textnavigator:
        number = navigator.page
        page_tables = utilo.select_page(loaded.tables, number)
        page_boxes = utilo.select_page(loaded.boxes, number)
        # pages without boxes are grouped with boxes=None
        page_boxes = page_boxes.content if page_boxes else None
        areas.append(
            group_page(navigator, tables=page_tables, boxes=page_boxes))
    return areas
85
+
86
+
87
def group_page(navigator, tables, boxes) -> PageContentTextualArea:
    """Split the text of a page into textual content and text which is
    located inside of tables or boxes.

    A text which is located inside of a table and inside of a box is
    collected for both. Only text which is located in neither of them
    is collected as textual content.

    Args:
        navigator: text navigator of the page; iterating it yields the
                   texts of the page
        tables: tables of the page, may be empty or None
        boxes: boxes of the page, may be empty or None
    Returns:
        grouped areas of the page
    """
    if tables:
        tables = table_checker(tables)

    if boxes:
        boxes = boxed_checker(boxes)

    textual = []
    inside_tables = []
    inside_boxes = []
    for text in navigator:
        bounding = tuple(text.bounding)
        in_table = bool(tables and tables.contains(*bounding))
        in_box = bool(boxes and boxes.contains(*bounding))
        if in_table:
            inside_tables.append(bounding)
        if in_box:
            inside_boxes.append(bounding)
        # text inside of a table must not be counted as textual content,
        # even if it is not located inside of a box
        if not (in_table or in_box):
            textual.append(bounding)

    # optimize rectangles
    textual = utilo.rect_merge(textual)
    inside_tables = utilo.rect_merge(inside_tables)
    inside_boxes = utilo.rect_merge(inside_boxes)
    outside = {
        'boxes': inside_boxes,
        'tables': inside_tables,
    }
    border = {
        'boxes': list(boxes.content) if boxes else [],
        'tables': list(tables.content) if tables else [],
    }
    return PageContentTextualArea(
        page=navigator.page,
        textual=textual,
        outside=outside,
        border=border,
    )
127
+
128
+
129
def boxed_checker(items) -> utilo.RectangleCheck:
    """Create a rectangle check which contains the `box` of every item."""
    checker = utilo.RectangleCheck(max_diff=RECTANGLE_DIFF_MAX)
    for box in (item.box for item in items):
        checker.extend(*box)
    return checker
134
+
135
+
136
def table_checker(items) -> utilo.RectangleCheck:
    """Create a rectangle check which contains the `bounding` of every
    item."""
    checker = utilo.RectangleCheck(max_diff=RECTANGLE_DIFF_MAX)
    for bounding in (item.bounding for item in items):
        checker.extend(*bounding)
    return checker
141
+
142
+
143
def load(
    boxes: str,
    tables: str,
    text: str,
    textpositions: str,
    pages: tuple = None,
) -> RequiredResources:
    """Load the resources which are required to group the areas.

    The tables are optional. If the path `tables` does not exist, this
    is logged and an empty list of tables is used.

    Args:
        boxes(str): path to extracted content boxes
        tables(str): path to extracted tables
        text(str): extracted text
        textpositions(str): positions of extracted text
        pages(tuple): tuple of pages to process
    Returns:
        loaded text navigators, tables and boxes
    """
    # TODO: SHOULD WE REMOVE HIDDEN ITEMS?
    textnavigator = serializeraw.ptn_fromfile(
        text=text,
        textpositions=textpositions,
        pages=pages,
        state=None,  # load hidden items
    )
    boxes = serializeraw.load_boxes(boxes, pages=pages)
    if not os.path.exists(tables):
        utilo.log(f'skip using tablero: {tables}, generation is required')
        tables = []
    else:
        tables = serializeraw.load_tables(tables, pages=pages)
    return RequiredResources(
        boxes=boxes,
        tables=tables,
        textnavigator=textnavigator,
    )
169
+
170
+
171
def dump_area(items) -> str:
    """Dump the grouped areas of every page as yaml.

    Args:
        items: grouped areas, see `PageContentTextualArea`
    Returns:
        yaml dump with one entry per page
    """

    def convert(rectangles):
        # empty or missing rectangles are dumped unchanged
        if not rectangles:
            return rectangles
        return [utilo.from_tuple(item) for item in rectangles]

    raw = []
    for page in items:
        outside = {key: convert(value) for key, value in page.outside.items()}
        border = {key: convert(value) for key, value in page.border.items()}
        textual = convert(page.textual)
        raw.append({
            'border': border,
            'outside': outside,
            'page': page.page,
            'textual': textual,
        })
    return utilo.yaml_dump(raw)
195
+
196
+
197
def load_area(content: str, pages: tuple = None) -> PageContentTextualAreas:
    """Load the grouped areas out of a yaml dump, see `dump_area`.

    Args:
        content: yaml dump of the grouped areas
        pages: pages to load; skipping is decided by `utilo.should_skip`
    Returns:
        list with the grouped areas of the selected pages
    """

    def parse(values):
        # empty or missing rectangles are kept unchanged
        if not values:
            return values
        return [utilo.parse_tuple(item) for item in values]

    result = []
    for page in utilo.yaml_load(content):
        pagenumber = int(page['page'])
        if utilo.should_skip(pagenumber, pages):
            continue
        textual = parse(page['textual'])
        outside = {
            key: parse(values) for key, values in page['outside'].items()
        }
        border = {
            key: parse(values) for key, values in page['border'].items()
        }
        area = PageContentTextualArea(
            border=border,
            outside=outside,
            page=pagenumber,
            textual=textual,
        )
        result.append(area)
    return result
@@ -0,0 +1,101 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import configos
11
+ import iamraw
12
+ import serializeraw
13
+ import utilo
14
+
15
+ import groupmes.border.leftright
16
+ import groupmes.border.most
17
+
18
+
19
def work(
    sizeandborder: str,
    textpositions: str,
    pages: tuple = None,
) -> tuple[str]:
    """Determine the expected border of every page and dump the result.

    Args:
        sizeandborder: path with page sizes and content border
        textpositions: path to load document textpositions
        pages: tuple of selected pages
    Returns:
        dump of the determined left-right border
    """
    loaded_sizes = serializeraw.load_pageborders(sizeandborder, pages=pages)
    loaded_positions = serializeraw.load_textpositions(
        textpositions,
        pages=pages,
    )
    border = determine_border(loaded_positions, loaded_sizes)
    return serializeraw.dump_leftright_border(border)
31
+
32
+
33
def determine_border(
    textpositions: iamraw.PageContentTextPositions,
    pagesizes: iamraw.PageSizeBorderList,
):
    """Determine the expected border for every cluster of pages.

    The pages are clustered with `pagecluster` and the border is
    determined separately for every cluster.

    Args:
        textpositions: text positions of the document pages
        pagesizes: size and border of the document pages
    Returns:
        list of borders, sorted by their first item, the page number
    """
    borders = [
        cluster_border(textpositions, pagesizes, pages_incluster)
        for pages_incluster in pagecluster(pagesizes)
    ]
    flatted = utilo.flat(borders)
    # sort by page number
    return sorted(flatted, key=lambda item: item[0])
46
+
47
+
48
def cluster_border(textpositions, pagesizes, pages_incluster):
    """Determine the expected border for every page of one cluster.

    Args:
        textpositions: text positions of the document pages
        pagesizes: size and border of the document pages
        pages_incluster: page numbers which belong to the cluster
    Returns:
        list with one tuple per page: the page number followed by the
        items of `expected_border`
    """
    selected_positions = utilo.select_pages(textpositions, pages_incluster)
    selected_sizes = utilo.select_pages(pagesizes, pages_incluster)

    selected_positions = utilo.notnone(selected_positions)
    selected_sizes = utilo.notnone(selected_sizes)

    most = groupmes.border.most.run(selected_sizes)
    leftright = groupmes.border.leftright.run(
        selected_positions,
        selected_sizes,
    )

    result = []
    for page in pages_incluster:
        border = expected_border(leftright, most, selected_sizes, page)
        result.append((page, *border))
    return result
61
+
62
+
63
def expected_border(leftright, most, pagesizes, page: int):
    """Compute the expected border of a single page.

    Args:
        leftright: detected left and right border, a tuple for each side
                   if alternating borders were detected
        most: most common bounding, used for top and bottom
        pagesizes: sizes of the pages, used to look up the size of `page`
        page: number of the page
    Returns:
        rounded tuple with left, right, top and bottom border; right and
        bottom are the distances to the page width and page height
    """

    # TODO: CHECK THAT PAGE CALL IS CORRECT
    # NOTE(review): alternating borders are indexed by page parity, but
    # `leftright.raising` orders its tuples as (min, max) - confirm that
    # both conventions agree.
    def select(border):
        if isinstance(border, tuple):
            return border[page % 2]
        return border

    left = select(leftright.left)
    right = select(leftright.right)

    pagesize = utilo.select_page(pagesizes, page).size
    # left, right, top, bottom
    border = (
        left,
        pagesize.width - right,
        most.top,
        pagesize.height - most.bottom,
    )
    return utilo.roundme(border)
81
+
82
+
83
# Passed as `min_elements` to `utilo.determine_cluster` in `pagecluster`.
PAGE_CLUSTER_SIZE_MIN = configos.HV_INT_PLUS(default=3)

# Difference below which two page sizes are treated as equal in
# `pagecluster`.
PAGE_CLUSTER_DIFF_MAX = configos.HV_FLOAT_PLUS(default=10.0)
86
+
87
+
88
def pagecluster(pagesizes) -> list:
    """Group the pages into clusters of nearly equal size.

    Args:
        pagesizes: size and border of the document pages
    Returns:
        list of clusters, every cluster is a sorted list of page numbers
    """

    def equal_size(candidat, clusteritem) -> bool:
        # compares the first item of both entries
        return utilo.norms(candidat[0],
                           clusteritem[0]) < PAGE_CLUSTER_DIFF_MAX

    grouped = utilo.determine_cluster(
        pagesizes,
        classifier=equal_size,
        min_elements=PAGE_CLUSTER_SIZE_MIN,
    )

    clusters = []
    for cluster in grouped:
        clusters.append(sorted(item.page for item in cluster))
    return clusters
@@ -0,0 +1,52 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import iamraw
11
+ import serializeraw
12
+ import utilo
13
+
14
+
15
def work(
    text: str,
    textpositions: str,
    sizeandborder: str,
    headerfooter: str,
    pages: tuple | None = None,
) -> str:
    """Extract the top and bottom content border of every page.

    Args:
        text(str): path to load document
        textpositions(str): path to load document textpositions
        sizeandborder(str): path with page sizes and content border
        headerfooter(str): path with header and footer to determine
                           content border.
        pages(tuple): tuple of selected pages
    Returns:
        dump of extracted content bounding boxes
    """
    navigators = serializeraw.ptcn_fromfile(
        text,
        textpositions,
        sizeandborder=sizeandborder,
        headerfooter=headerfooter,
        pages=pages,
    )
    boxes = []
    for navigator in navigators:
        content = navigator.content
        top, bottom = utilo.roundme((content.top, content.bottom))
        box = iamraw.ContentBoundingBox(
            page=navigator.page,
            top=top,
            bottom=bottom,
        )
        boxes.append(box)
    return serializeraw.dump_contentboundingbox(boxes)
@@ -0,0 +1,237 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """Distance detector
10
+ =================
11
+
12
+ Compute the distance between textual and non textual elements.
13
+
14
+ There are two distances for every non-textual element: the distance
15
+ before (negative) and the distance after (positive). If the page starts or
16
+ ends with a non-textual element, the distance is None.
17
+
18
+ TODO: SUPPORT LEFT RIGHT DISTANCE
19
+
20
+ """
21
+
22
+ import collections
23
+
24
+ import serializeraw
25
+ import texmex
26
+ import utilo
27
+
28
+ import groupmes.feature.area
29
+
30
# Resources required to determine the distances: the loaded areas and the
# text navigators of the document.
RequiredResources = collections.namedtuple(
    'RequiredResources',
    ['area', 'textnavigator'],
)

# Distance of the area with number `index` to the closest text before
# (negative value) and after (positive value). A missing side is None.
AreaDistance = collections.namedtuple(
    'AreaDistance',
    ['index', 'before', 'after'],
)
AreaDistances = list[AreaDistance]

# Area distances (`content`) determined for one `page`.
PageContentAreaDistance = collections.namedtuple(
    'PageContentAreaDistance',
    ['page', 'content'],
)
PageContentAreaDistances = list[PageContentAreaDistance]
46
+
47
+
48
def work(
    areas: str,
    text: str,
    textpositions: str,
    pages: tuple = None,
) -> str:
    """Determine the distances between areas and text and dump them.

    Args:
        areas(str): area resource, forwarded to `load`
        text(str): document text resource, forwarded to `load`
        textpositions(str): text position resource, forwarded to `load`
        pages(tuple): optional selection of pages, forwarded to `load`
    Returns:
        dump created by `dump_distance`
    """
    resources = load(areas, text, textpositions, pages=pages)
    return dump_distance(determine_distances(resources))
58
+
59
+
60
def determine_distances(loaded: RequiredResources) -> PageContentAreaDistances:
    """Determine the area distances of every page in `loaded`.

    Pages for which `group_page` yields no distance are left out.

    Args:
        loaded(RequiredResources): areas and text navigators
    Returns:
        list with one `PageContentAreaDistance` per page with distances
    """
    pages = []
    for navigator in loaded.textnavigator:
        number = navigator.page
        selected = utilo.select_page(loaded.area, number)
        distances = group_page(navigator, selected)
        if distances:
            pages.append(
                PageContentAreaDistance(content=distances, page=number))
    return pages
70
+
71
+
72
def group_page(navigator, areas) -> AreaDistances:
    """Determine the closest text distance before and after every area.

    Every line of `navigator` is measured against the areas in
    `areas.border`. A single value belongs to the area with the returned
    index. A pair belongs to two neighbouring areas: the first value to
    the area `index`, the second value to the area `index + 1`.

    Args:
        navigator: iterable of lines, each providing a `bounding`
        areas: areas of the page with a `border` mapping or None
    Returns:
        list of `AreaDistance`; empty if `areas` is None or nothing matched
    """
    if areas is None:
        return []
    distance = create_distance(areas.border)

    collected = collections.defaultdict(list)
    for line in navigator:
        item = distance.distance(line.bounding)
        if not item:
            continue
        values, index = item
        # A whole-number distance may arrive as int instead of float
        # (depends on `utilo.roundme` - TODO confirm); indexing it like a
        # pair would raise TypeError.
        if isinstance(values, (int, float)):
            collected[index].append(values)
        else:
            collected[index].append(values[0])
            collected[index + 1].append(values[1])

    final = []
    for key, value in collected.items():
        # negative values are located before the area, select the closest
        negative = max((item for item in value if item < 0), default=None)
        negative = utilo.roundme(negative) if negative is not None else None
        # values >= 0 are located after the area, select the closest
        positive = min((item for item in value if item >= 0), default=None)
        positive = utilo.roundme(positive) if positive is not None else None
        final.append(AreaDistance(index=key, before=negative, after=positive))
    return final
97
+
98
+
99
class Distance:
    """Vertical distance between collected areas and a bounding box.

    Areas and bounding boxes are sequences; index 1 is read as top and
    index 3 as bottom. The areas are sorted by their top before a
    distance is determined.
    """

    def __init__(self, diff: float = 10.0):
        self.content = []
        self.sorted = True
        # tolerance passed to `utilo.rect_inside` in the single area case
        self.diff = diff

    def distance(self, bounding):  # pylint:disable=R1260,R0911
        """Determine the vertical distance of `bounding` to the areas.

        Args:
            bounding: sequence with top at index 1 and bottom at index 3
        Returns:
            None if there is no area, `bounding` is inside of an area or
            no distance can be determined. `(value, index)` if `bounding`
            is located before the first or after the last area, negative
            `value` means before the area `index`. `((top, bottom), index)`
            if `bounding` is located between the areas `index` and
            `index + 1`.
        """
        if not self:
            return None
        if not self.sorted:
            self.sort()
        top, bottom = bounding[1], bounding[3]
        if len(self) == 1:
            if utilo.rect_inside(self[0], bounding, diff=self.diff):
                return None
            top_current = self[0][1]
            bottom_current = self[0][3]
            if bottom <= top_current:
                # content is above
                return (utilo.roundme(bottom - top_current), 0)
            # content is below
            return (utilo.roundme(top - bottom_current), 0)

        if self[-1][3] <= top:
            # after
            return (utilo.roundme(top - self[-1][3]), len(self) - 1)

        # in the middle
        for index, (before, after) in enumerate(zip(self[0:-1], self[1:])):
            bottom_before = before[3]
            top_after = after[1]
            if utilo.rect_inside(before, bounding):
                return None
            if utilo.rect_inside(after, bounding):
                return None
            if bottom_before <= top <= bottom <= top_after:
                diff_top = top - bottom_before
                diff_bottom = bottom - top_after
                return (diff_top, diff_bottom), index

        # Before the first area. The single area case handles content above
        # the area, but with more than one area this distance was never
        # determined. Strictly above only: a zero distance would be counted
        # as `after` by the sign based evaluation of the caller.
        if bottom < self[0][1]:
            return (utilo.roundme(bottom - self[0][1]), 0)
        return None

    def append(self, item):
        """Add the area `item` and mark the content as unsorted."""
        self.content.append(item)
        self.sorted = False

    def sort(self):
        """Sort the areas by their top (index 1)."""
        self.content = sorted(self.content, key=lambda item: item[1])
        self.sorted = True

    def __getitem__(self, index):
        return self.content[index]

    def __len__(self):
        return len(self.content)
154
+
155
+
156
def create_distance(items) -> Distance:
    """Collect the entries of all values of `items` in a sorted `Distance`.

    Args:
        items: mapping whose values are iterables of areas
    Returns:
        sorted `Distance` with every entry
    """
    collector = Distance()
    entries = (entry for group in items.values() for entry in group)
    for entry in entries:
        collector.append(entry)
    collector.sort()
    return collector
163
+
164
+
165
def load(
    area: str,
    text: str,
    textpositions: str,
    pages: tuple = None,
) -> RequiredResources:
    """Load the areas and create the text navigators of the document.

    A missing area resource is reported with `utilo.error` and replaced
    by an empty list.

    Args:
        area(str): area resource
        text(str): document text resource
        textpositions(str): text position resource
        pages(tuple): optional selection of pages, forwarded to the loaders
    Returns:
        loaded areas and created text navigators
    """
    try:
        areas = groupmes.feature.area.load_area(area, pages=pages)
    except FileNotFoundError as err:
        areas = []
        utilo.error(err)
    document = serializeraw.load_document(text, pages=pages)
    positions = serializeraw.load_textpositions(textpositions, pages=pages)
    navigators = texmex.create_ptns(
        document,
        textpositions=positions,
        fill_empty=False,
    )
    return RequiredResources(area=areas, textnavigator=navigators)
188
+
189
+
190
def dump_distance(items: PageContentAreaDistances) -> str:
    """Dump the area distances as yaml.

    Every distance is written as line `index before after`, a missing
    distance is written as `None`.

    Args:
        items(PageContentAreaDistances): distances to dump
    Returns:
        yaml dump with `page` and `content` for every page
    """

    def convert(value):
        return 'None' if value is None else utilo.roundme(value)

    raw = []
    for page in items:
        lines = [
            f'{item.index} {convert(item.before)} {convert(item.after)}'
            for item in page.content
        ]
        raw.append({'page': page.page, 'content': lines})
    return utilo.yaml_dump(raw)
201
+
202
+
203
def load_distance(
    content: str,
    pages: tuple = None,
) -> PageContentAreaDistances:
    """Load the area distances out of a yaml dump.

    Every content line is split into `index before after`. A distance
    which can not be converted to float is loaded as None.

    Args:
        content(str): yaml dump to load
        pages(tuple): optional selection of pages, checked with
                      `utilo.should_skip`
    Returns:
        loaded distances of the pages which are not skipped
    """

    def convert(value):
        try:
            return float(value)
        except ValueError:
            return None

    # TODO: MOVE TO SERIALIZERAW
    result = []
    for page in utilo.yaml_load(content):
        pagenumber = int(page['page'])
        if utilo.should_skip(pagenumber, pages):
            continue
        pagecontent = []
        for line in page['content']:
            index, before, after = line.split()
            distance = AreaDistance(
                before=convert(before),
                after=convert(after),
                index=int(index),
            )
            pagecontent.append(distance)
        loaded = PageContentAreaDistance(
            page=pagenumber,
            content=pagecontent,
        )
        result.append(loaded)
    return result
@@ -0,0 +1,126 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2022-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import iamraw
11
+ import serializeraw
12
+ import texmex
13
+ import utilo
14
+
15
+
16
def work(
    headnote: str,
    footnote: str,
    pagenumber: str,
    borders: str,
    pages: int | tuple = None,
) -> str:
    """Merge headnote, footnote and page number information and dump it.

    A missing headnote or footnote resource is reported with
    `utilo.error` and replaced by an empty list.

    Args:
        headnote(str): headnote resource
        footnote(str): footnote resource
        pagenumber(str): page number resource
        borders(str): page border resource
        pages(int|tuple): optional selection of pages, forwarded to the
                          loaders
    Returns:
        dump of the merged header and footer information
    """

    def load_notes(source):
        try:
            return serializeraw.load_headerfooter(source, pages=pages)
        except FileNotFoundError as err:
            utilo.error(err)
            return []

    headnotes = load_notes(headnote)
    footnotes = load_notes(footnote)

    pageborders = serializeraw.load_pageborders(borders, pages=pages)
    pagenumbers = load_pagenumbers(pagenumber, pageborders, pages=pages)
    merged = merge(headnotes, footnotes, pagenumbers)
    return serializeraw.dump_headerfooter(merged)
39
+
40
+
41
def merge(
    headnotes,
    footnotes,
    pagenumbers,
) -> iamraw.PageContentFooterHeaders:
    """Merge headnotes, footnotes and page numbers page by page.

    Priority per page: the footer of the footnote, the header of the
    headnote, the footer of the headnote if there is no footer yet. The
    page number is used only if neither header nor footer is set.

    Args:
        headnotes: page content of the headnote detection
        footnotes: page content of the footnote detection
        pagenumbers: page content of the page number detection
    Returns:
        merged header and footer information with strategy `hefopa`
    """
    merged = iamraw.PageContentFooterHeaders(content=[])
    merged.__strategy__ = 'hefopa'
    synced = utilo.sync_pages((headnotes, footnotes, pagenumbers))
    for page, (headnote, footnote, pagenumber) in synced:
        if not any((headnote, footnote, pagenumber)):
            continue
        item = iamraw.PageContentFooterHeader(page=page)
        if footnote and footnote.footer:
            item.footer = footnote.footer
        if headnote:
            if headnote.header:
                item.header = headnote.header
            if headnote.footer and not item.footer:
                item.footer = headnote.footer
        # NOTE(review): a page number which only provides a header does not
        # pass this condition and is not merged - confirm that this is
        # intended.
        if pagenumber and pagenumber.footer:
            if not item.header and not item.footer:
                item.header = pagenumber.header
                item.footer = pagenumber.footer
        merged.content.append(item)
    return merged
63
+
64
+
65
def load_pagenumbers(
    pagenumber,
    borders,
    pages: tuple,
) -> iamraw.PageContentFooterHeaders:
    """Load the page numbers and convert them to header/footer information.

    A pdfpage which occurs more than once is reported with `utilo.error`;
    only the first occurrence is converted.

    Args:
        pagenumber: page number resource to load
        borders: loaded page borders, selected by pdfpage
        pages(tuple): selection of pages, forwarded to the loader
    Returns:
        header and footer information with strategy `pagenumber`
    """
    result = iamraw.PageContentFooterHeaders(content=[])
    # Was spelled `__stategy__`; `merge` sets `__strategy__`.
    result.__strategy__ = 'pagenumber'
    loaded = serializeraw.load_pagenumbers(pagenumber, pages=pages)
    single = utilo.Single()
    for item in loaded:
        pdfpage = item.pdfpage  # pylint:disable=E1101
        # TODO: MAY REMOVE LATER
        if single.contains(pdfpage):
            utilo.error(f'duplicated pagenumber/pdfpage: {item}')
            continue
        # the border is required for converted pages only
        pageborder = utilo.select_page(borders, page=pdfpage)
        page = create(
            item,
            pdfpage,
            pageborder,
        )
        result.content.append(page)
    return result
88
+
89
+
90
def create(item, pdfpage, pageborder) -> iamraw.PageContentFooterHeader:
    """Create the header or footer information out of a page number.

    The page number becomes a header if the area determined by
    `head_foot_area` begins with `texmex.START`, otherwise a footer.

    Args:
        item: page number with `detected` value and `bounding`
        pdfpage: page of the created information
        pageborder: border of the page, forwarded to `head_foot_area`
    Returns:
        page content with either header or footer set
    """
    pageinfo = iamraw.PageInformation(value=item.detected)  # pylint:disable=E1101
    begin, end = head_foot_area(pageborder, item.bounding)  # pylint:disable=E1101
    header = footer = None
    if begin == texmex.START:
        header = iamraw.FixedHeaderInfo(page=pageinfo)
        header.begin, header.end = begin, end
    else:
        footer = iamraw.FixedFooterInfo(page=pageinfo)
        footer.begin, footer.end = begin, end
    return iamraw.PageContentFooterHeader(
        page=pdfpage,
        header=header,
        footer=footer,
    )
109
+
110
+
111
def head_foot_area(pageborder, pagenumber_bounding) -> tuple:
    """Determine begin and end of the header or footer area.

    The page number is handled as header if its `y1` is smaller than 350.
    This limit is compared with the absolute value, not relative to the
    page height. The computed limit is divided by the page height.

    Args:
        pageborder: border of the page, `size.height` is the page height
        pagenumber_bounding: bounding of the page number with `y0`/`y1`
    Returns:
        tuple `(begin, end)`. A header begins with `texmex.START`, a
        footer ends with `texmex.END`. Without page height both values
        are `texmex.END`.
    """
    pageheight = pageborder.size.height
    if not pageheight:
        utilo.error(f'missing page height: {pageborder} {pagenumber_bounding}')
        # The caller unpacks `begin, end`; a single value would raise there.
        # Return a pair which does not begin with `texmex.START`.
        return texmex.END, texmex.END
    pagenumber_y0 = pagenumber_bounding.y0
    pagenumber_y1 = pagenumber_bounding.y1
    header = pagenumber_y1 < 350
    if header:
        begin = texmex.START
        end = utilo.roundme(pagenumber_y1 / pageheight + 0.00)  # TOL
    else:
        # footer
        begin = utilo.roundme(pagenumber_y0 / pageheight - 0.01)  # TOL
        end = texmex.END
    return begin, end
groupmes/path.py ADDED
@@ -0,0 +1,34 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import utilo
11
+
12
+ import groupmes
13
+
14
+
15
def area(path: str, prefix: str = '') -> str:
    """Connect `path` with the package name and `area_area`."""
    name = 'area_area'
    return utilo.pathconnector(path, groupmes.PACKAGE, name, prefix)
17
+
18
+
19
def border_leftright(path: str, prefix: str = '') -> str:
    """Connect `path` with the package name and `border_leftright`."""
    name = 'border_leftright'
    return utilo.pathconnector(path, groupmes.PACKAGE, name, prefix)
26
+
27
+
28
def distance(path: str, prefix: str = '') -> str:
    """Connect `path` with the package name and `distance_distance`."""
    name = 'distance_distance'
    return utilo.pathconnector(path, groupmes.PACKAGE, name, prefix)
@@ -0,0 +1,28 @@
1
+ Metadata-Version: 2.4
2
+ Name: groupmes
3
+ Version: 1.1.0
4
+ Author-email: Helmut Konrad Schewe <helmutus@outlook.com>
5
+ License-Expression: MIT
6
+ Project-URL: Homepage, https://github.com/anaticulae/groupmes
7
+ Project-URL: Repository, https://github.com/anaticulae/groupmes
8
+ Classifier: Programming Language :: Python :: 3.12
9
+ Classifier: Programming Language :: Python :: 3.13
10
+ Classifier: Programming Language :: Python :: 3.14
11
+ Requires-Python: >=3.12
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: iamraw<5.0.0,>=4.91.5
14
+ Requires-Dist: utilo<3.0.0,>=2.109.0
15
+ Requires-Dist: configos<2.0.0,>=1.0.4
16
+ Requires-Dist: konradus<2.0.0,>=1.0.1
17
+ Provides-Extra: dev
18
+ Requires-Dist: utilotest<2.0.0,>=1.0.4; extra == "dev"
19
+ Requires-Dist: hoverpower==1.4.3; extra == "dev"
20
+ Requires-Dist: gennex==1.0.3; extra == "dev"
21
+ Requires-Dist: rawmaker==2.40.3; extra == "dev"
22
+ Requires-Dist: pagenumber==1.0.0; extra == "dev"
23
+
24
+ # groupmes
25
+
26
+ * chapter: split document text by chapter
27
+ * structure: divide document in head(toc) and tail(rest)
28
+ * toc: create table of content out of raw text data
@@ -0,0 +1,19 @@
1
+ groupmes/__init__.py,sha256=h23h6l8V-tYm0m8M2D21L8ZbZGOOuKymxMJ2OHAtayo,801
2
+ groupmes/__main__.py,sha256=XGxs_W_jWiuac2zJS9cmc_YQ4_06X2-6insjQwP5AwY,622
3
+ groupmes/__patch__.py,sha256=TM9PAYPmQhbW1HY3Vx-bS5rXe132Z02TPBxFMad80Dk,552
4
+ groupmes/cli.py,sha256=QuNRQDf4Kvs6qQhyCLBT2aGWtCH45BzQzYAXJ9vM9cc,2812
5
+ groupmes/path.py,sha256=Nlxte_pPtVXEbFkCpSk8L50GJ_oKDJ7NRlTtKZ9YCms,1064
6
+ groupmes/border/__init__.py,sha256=TM9PAYPmQhbW1HY3Vx-bS5rXe132Z02TPBxFMad80Dk,552
7
+ groupmes/border/leftright.py,sha256=i_iSLvXoWe3HKctvOb_jarDsO202tWrwwCAURE9bM5s,10029
8
+ groupmes/border/most.py,sha256=-82HuzurQC7xrwhOCgCquN-1XtKP1FePb-tweYcvm_o,2411
9
+ groupmes/feature/__init__.py,sha256=kxNXE6nGBAEes4-tV7jYOvjrGAKCZ_bMBKXrB56Hgho,552
10
+ groupmes/feature/area.py,sha256=AslnKHbh9mS_RhScrAKRShuMje3vXAwz0elag9DKqpU,6227
11
+ groupmes/feature/border.py,sha256=qiQhnxid14Rff8osBNFhNDJZNBViqMy4WiVNufVntUY,3227
12
+ groupmes/feature/content.py,sha256=os0Nwycs8EAY8U4UdapqdmIy3BVELNf63To8_NiGsZw,1779
13
+ groupmes/feature/distance.py,sha256=a4gDdrwHRQkKwTzcEZP80UVI9LJuC2-UgqeO3HKJbbM,7267
14
+ groupmes/feature/hefopa.py,sha256=-8QVippCqHvSZhZ7C7eRpD11pn1xlmLMa8lpn0pAjqY,4337
15
+ groupmes-1.1.0.dist-info/METADATA,sha256=e22TPdVWcijKql3FYdiuFEdR8DNdXTBHxB9PfzUOYFk,1057
16
+ groupmes-1.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
17
+ groupmes-1.1.0.dist-info/entry_points.txt,sha256=uweQthSsMHckhQ-yFmjsFhJK8sZslWMylWxAeCTOcTs,46
18
+ groupmes-1.1.0.dist-info/top_level.txt,sha256=RqsOvZPqxlp_Pu6Ic-F6Z1H3MnsIIJpw3UEf7VZkJAU,9
19
+ groupmes-1.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ groupme = groupmes.cli:main
@@ -0,0 +1 @@
1
+ groupmes