rawmaker 2.40.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. letty/__init__.py +46 -0
  2. letty/cli.py +63 -0
  3. letty/optimizer.py +138 -0
  4. letty/quality/__init__.py +8 -0
  5. letty/quality/whitespace.py +50 -0
  6. letty/strategy.py +8 -0
  7. rawmaker/__init__.py +29 -0
  8. rawmaker/__main__.py +13 -0
  9. rawmaker/__patch__.py +36 -0
  10. rawmaker/cli.py +206 -0
  11. rawmaker/cli_automate.py +69 -0
  12. rawmaker/converter/__init__.py +8 -0
  13. rawmaker/converter/basic.py +174 -0
  14. rawmaker/converter/images.py +168 -0
  15. rawmaker/date.py +83 -0
  16. rawmaker/destination.py +202 -0
  17. rawmaker/error.py +34 -0
  18. rawmaker/features/__init__.py +138 -0
  19. rawmaker/features/annotation.py +254 -0
  20. rawmaker/features/border.py +172 -0
  21. rawmaker/features/boxes.py +153 -0
  22. rawmaker/features/figures.py +24 -0
  23. rawmaker/features/fonts.py +229 -0
  24. rawmaker/features/formula.py +16 -0
  25. rawmaker/features/horizontals.py +132 -0
  26. rawmaker/features/images.py +155 -0
  27. rawmaker/features/line.py +337 -0
  28. rawmaker/features/outlines.py +123 -0
  29. rawmaker/features/text.py +91 -0
  30. rawmaker/fonts/__init__.py +8 -0
  31. rawmaker/fonts/parser.py +354 -0
  32. rawmaker/images/__init__.py +8 -0
  33. rawmaker/images/info.py +35 -0
  34. rawmaker/miner/__init__.py +8 -0
  35. rawmaker/miner/char.py +42 -0
  36. rawmaker/miner/colorspace.py +75 -0
  37. rawmaker/miner/images.py +448 -0
  38. rawmaker/miner/position.py +121 -0
  39. rawmaker/miner/rawchar.py +207 -0
  40. rawmaker/miner/text.py +833 -0
  41. rawmaker/miner/underline.py +66 -0
  42. rawmaker/parameter.py +130 -0
  43. rawmaker/patch/__init__.py +8 -0
  44. rawmaker/patch/ltchar.py +79 -0
  45. rawmaker/reader.py +97 -0
  46. rawmaker/text/__init__.py +8 -0
  47. rawmaker/text/chars.py +24 -0
  48. rawmaker/text/data.py +47 -0
  49. rawmaker/text/superfast.py +91 -0
  50. rawmaker/text/wordbox.py +95 -0
  51. rawmaker/utils.py +44 -0
  52. rawmaker-2.40.3.dist-info/METADATA +51 -0
  53. rawmaker-2.40.3.dist-info/RECORD +63 -0
  54. rawmaker-2.40.3.dist-info/WHEEL +5 -0
  55. rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
  56. rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
  57. rawmaker-2.40.3.dist-info/top_level.txt +3 -0
  58. spacestation/__init__.py +18 -0
  59. spacestation/cli.py +51 -0
  60. spacestation/features/__init__.py +8 -0
  61. spacestation/features/chardist.py +85 -0
  62. spacestation/features/worddist.py +57 -0
  63. spacestation/features/wspace.py +130 -0
@@ -0,0 +1,51 @@
1
+ Metadata-Version: 2.4
2
+ Name: rawmaker
3
+ Version: 2.40.3
4
+ Author-email: Helmut Konrad Schewe <helmutus@outlook.com>
5
+ License-Expression: MIT
6
+ Project-URL: Homepage, https://github.com/anaticulae/rawmaker
7
+ Project-URL: Repository, https://github.com/anaticulae/rawmaker
8
+ Classifier: Programming Language :: Python :: 3.12
9
+ Classifier: Programming Language :: Python :: 3.13
10
+ Classifier: Programming Language :: Python :: 3.14
11
+ Requires-Python: >=3.12
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: pdfminer.six<20270000,>=20260107
15
+ Requires-Dist: pillow<13.0.0,>=12.2.0
16
+ Requires-Dist: camelot_py<2.0.0,>=1.0.9
17
+ Requires-Dist: utilo<3.0.0,>=2.105.0
18
+ Requires-Dist: configos<2.0.0,>=1.0.4
19
+ Requires-Dist: iamraw<5.0.0,>=4.91.0
20
+ Requires-Dist: protoerror<4.0.0,>=3.20.2
21
+ Requires-Dist: ughost<2.0.0,>=1.0.1
22
+ Requires-Dist: pdflog<2.0.0,>=1.0.2
23
+ Provides-Extra: dev
24
+ Requires-Dist: hoverpower<2.0.0,>=1.1.0; extra == "dev"
25
+ Requires-Dist: jamer<2.0.0,>=1.0.1; extra == "dev"
26
+ Requires-Dist: utilotest<2.0.0,>=1.0.2; extra == "dev"
27
+ Requires-Dist: gennex<2.0.0,>=1.0.1; extra == "dev"
28
+ Dynamic: license-file
29
+
30
+ # rawmaker - convert PDF to raw data
31
+
32
+ This package extracts information from a pdf-file and writes it to
33
+ yaml-files.
34
+
35
+ ## Introduction
36
+
37
+ Supported functions:
38
+
39
+ * border: get outline of pdf content
40
+ * boxes: determine boxes which are constructed from lines
41
+ * fonts: extract used fonts of text data
42
+ * text: extract text blocks per page
43
+ * toc: determine the table of contents of the pdf file
44
+
45
+ ## Supported PDF version
46
+
47
+ PDF 1.5?
48
+
49
+ ## Glossary
50
+
51
+ * TOC - table of contents
@@ -0,0 +1,63 @@
1
+ letty/__init__.py,sha256=4nIt8u7ywq2emdp8FRjDOb7ALCAV8AnrZBDwUlxydyE,1222
2
+ letty/cli.py,sha256=lwawJtFpXlEn3WFO29E37bCLCgJHEkk18F4OknUQ1Ts,1993
3
+ letty/optimizer.py,sha256=Urz9MSlIb9Rn5d_qbovbHvetjq9XDDWEmESX2J99sPo,4292
4
+ letty/strategy.py,sha256=TM9PAYPmQhbW1HY3Vx-bS5rXe132Z02TPBxFMad80Dk,552
5
+ letty/quality/__init__.py,sha256=TM9PAYPmQhbW1HY3Vx-bS5rXe132Z02TPBxFMad80Dk,552
6
+ letty/quality/whitespace.py,sha256=2diIbyo8AbD8szxfwil16mZJd2MY8o-fbtcQCxGUpgA,1601
7
+ rawmaker/__init__.py,sha256=jDMpCPAn3PAEbXwPxSImmERUlcQ-Pt0mQtRQaSxDv10,1010
8
+ rawmaker/__main__.py,sha256=op6068lkAl5-d0nLcGvOLr5k8bzBk2dL9M7maRq30Qk,622
9
+ rawmaker/__patch__.py,sha256=ZWfuv8xmw9VvXOlth3sXvT44L9iVN9ZmF_x-qYviqwc,1235
10
+ rawmaker/cli.py,sha256=kx6Ofia4gORG1E0WytMjcrs8NojBqGuPzKTPIze4sDQ,5410
11
+ rawmaker/cli_automate.py,sha256=d9g1RVOCLlwRi5qXBK_UmK5rDipuYiHRLU6BTm7Gykk,2120
12
+ rawmaker/date.py,sha256=NMKUXQiwJgqp2zZOKnLEiXD0FGXGOv7eYodmc1A9GMg,2275
13
+ rawmaker/destination.py,sha256=yUEHRHWAKj0KXoXuLn9uO-d4CeMlma2UmwaKM9q4GBk,5868
14
+ rawmaker/error.py,sha256=LopcN1QI6RujHp9rhlEpnkR9UgvQJ71ZWNwA2-Tt63s,1055
15
+ rawmaker/parameter.py,sha256=jrwqsn8rN6tS6XzfnP2FSBCN_VmbMR5QtUxeHpZXoYw,4121
16
+ rawmaker/reader.py,sha256=GSnbarPP03Pk1j9XkyPf7_1z-nUVA3xq1JZkgYvo0xk,3690
17
+ rawmaker/utils.py,sha256=fqdqSU8EziZ-2UKs9r0EZwbJjg7zZkx2YYojViLrJ8A,1337
18
+ rawmaker/converter/__init__.py,sha256=TM9PAYPmQhbW1HY3Vx-bS5rXe132Z02TPBxFMad80Dk,552
19
+ rawmaker/converter/basic.py,sha256=Gq27IcSdHamZ3EcFHaTVWDew0mDvPWAlbtc3jZJfj3E,5546
20
+ rawmaker/converter/images.py,sha256=DEK6cuCe49Cc_MDAUBIcaphc0lSQa0aUMT3k6f2UvHY,5374
21
+ rawmaker/features/__init__.py,sha256=31Ed-ZPyRbMgxsD368lTIZlzIVIm81iNFhugUeVw2yY,5134
22
+ rawmaker/features/annotation.py,sha256=AN-hP9iPLlzPzgqdGukHvJ6fgjkVAgTZoJXqH3OvZhM,8346
23
+ rawmaker/features/border.py,sha256=6hbbaLSikZitOjk_0t7INfmPvzfZWOKTB_gEiy2sCts,5658
24
+ rawmaker/features/boxes.py,sha256=OhaYmnwlohtHPn-DsQXhljqyHVFFqe0MeoVEFmrCbXQ,5171
25
+ rawmaker/features/figures.py,sha256=p3Z23oFgmfE0BtIMLJuzahfkjQB8abuob6BUUc_ME3c,898
26
+ rawmaker/features/fonts.py,sha256=tTwDs0i7-w0OAHpilRUqIjJe8FszUFY7m6aOdc91ujs,7703
27
+ rawmaker/features/formula.py,sha256=pJVjfYxyL0SM4hQWBgzZ2j1pE8KBjIQJGilsfqO8FCc,737
28
+ rawmaker/features/horizontals.py,sha256=9Xoh05lf2SkcdjnoQ63594cy7vbsz_dSlaLOuuTZQ9U,4503
29
+ rawmaker/features/images.py,sha256=Ifm8HWKE-mgbu0mIeO_TNy21wp9s-1U0qOPgZGVQmfM,4761
30
+ rawmaker/features/line.py,sha256=Vzc8H3ktXf3T1g2KyePdkrGFNvNXMCorTVfAYc429-g,11498
31
+ rawmaker/features/outlines.py,sha256=i43rJ6Al1hkl62zDnel5-qAzrxckTwoCXY9i5RiihYA,4626
32
+ rawmaker/features/text.py,sha256=zNamrGTJ-4O6gxF3jIIz1nkM7jST7PJpBAMZgrEIRhA,3052
33
+ rawmaker/fonts/__init__.py,sha256=TM9PAYPmQhbW1HY3Vx-bS5rXe132Z02TPBxFMad80Dk,552
34
+ rawmaker/fonts/parser.py,sha256=IKDj4fO1FJ04SFqe8AKwgG3FRXxDKD3Sk7BizFUHye0,10304
35
+ rawmaker/images/__init__.py,sha256=TM9PAYPmQhbW1HY3Vx-bS5rXe132Z02TPBxFMad80Dk,552
36
+ rawmaker/images/info.py,sha256=REmqL1rqgGFcnfTnI2syMRtVlxtD8PwfPC0KqidPpc4,1221
37
+ rawmaker/miner/__init__.py,sha256=kxNXE6nGBAEes4-tV7jYOvjrGAKCZ_bMBKXrB56Hgho,552
38
+ rawmaker/miner/char.py,sha256=14ULLveN76R5NEZxhv4EwjoqYydp8D_rJ3dxvDDcIOA,1609
39
+ rawmaker/miner/colorspace.py,sha256=yBI0fUNCeHkMxMLo1xSstJWWaN6Be0sl2H4WWsHmSg8,2513
40
+ rawmaker/miner/images.py,sha256=4TdnZQfwKYNhwwm6KPqFDM9bFni7YtG4oILisQ3WPuo,15056
41
+ rawmaker/miner/position.py,sha256=8vQMXo4auqlu0SaJ4JeodNGj59MQmuvhzne9kdmjtl4,3863
42
+ rawmaker/miner/rawchar.py,sha256=KG2HmWrxU0DqFlAD-8LTGKcZEA5t18vWCHwZcCUY_1k,5040
43
+ rawmaker/miner/text.py,sha256=PLWcJKswd98aJsMWu1Kpn2i6eaL8nIiFimILPiRTCvc,27580
44
+ rawmaker/miner/underline.py,sha256=pQtQmSbC94vuK8NhDvatzaabLHKWmG1l0LiTRH9-xoE,2479
45
+ rawmaker/patch/__init__.py,sha256=PasSHyvYtcCN_V2nXstCKKRmvKoZRqDp1KK4C4xG88M,552
46
+ rawmaker/patch/ltchar.py,sha256=tm-kKclisBe9SwacZ4-v5NFvd_y9cJtrzy65DeJEQDw,2702
47
+ rawmaker/text/__init__.py,sha256=TM9PAYPmQhbW1HY3Vx-bS5rXe132Z02TPBxFMad80Dk,552
48
+ rawmaker/text/chars.py,sha256=k68ekWXHMhBYPtFo4s9CZnQhCNZevpiu95O3xzaPQuU,951
49
+ rawmaker/text/data.py,sha256=6XbFmrSXRl0EeOzQYeGbTN_22-2gzNm1uYWdGmRJyQk,1568
50
+ rawmaker/text/superfast.py,sha256=nl1fMvtZE20FV1ELRDdbmQL25GDOg6rkNnxBxzqdBng,3068
51
+ rawmaker/text/wordbox.py,sha256=98zwP1qXbGQEbjc4XTo2aiw_S4Uq3KP4qh7aneKok1I,2822
52
+ rawmaker-2.40.3.dist-info/licenses/LICENSE,sha256=9jV_XivjSpyzpEYIFKZGcjDzX8invw3EHEJsGXmGJbA,1077
53
+ spacestation/__init__.py,sha256=dgU9xAAHcqN7iZ7mDo4NpBM3cq6FrZy-zDyvE9eF2WY,676
54
+ spacestation/cli.py,sha256=Bo3AUv-wzV_uQfaKPxFNn1HkctwG_2NViaSRhRUA-nk,1568
55
+ spacestation/features/__init__.py,sha256=ZnAmZoM8YS7O_aZREOZiaHHCsO1ctWvQVu__JGFNWCc,552
56
+ spacestation/features/chardist.py,sha256=NUgyaRZLFgFYx9YFzS1OVMETwmO4Tm_Xmlt55VvjHP8,2769
57
+ spacestation/features/worddist.py,sha256=xz9J_4xNSngPz9qSyqHyOHCEWkkhddE8eISEFqTRUsg,2052
58
+ spacestation/features/wspace.py,sha256=7Ly_sf_BZwm3mmwQspKJpCeo4yvkVIceZIwWAVTGcNQ,4102
59
+ rawmaker-2.40.3.dist-info/METADATA,sha256=MGq8jjznonPUgwv2hNeEdvNtr3gfi-JmeqFCa09JPR8,1561
60
+ rawmaker-2.40.3.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
61
+ rawmaker-2.40.3.dist-info/entry_points.txt,sha256=RWomFqFfMvzDf4Gb5JBztTIRfbwntcr77OttNSGGhb8,190
62
+ rawmaker-2.40.3.dist-info/top_level.txt,sha256=moTmH7KSrId8qVt4ljZUJ_ukngCG96fb4KcbHp-rpfk,28
63
+ rawmaker-2.40.3.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,6 @@
1
+ [console_scripts]
2
+ letty = letty.cli:main
3
+ rawmaker = rawmaker.cli:main
4
+ rawmaker_automate = rawmaker.cli_automate:main
5
+ rawmaker_cleanup = cleanup.cli:main
6
+ spacestation = spacestation.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Helmut Konrad Schewe
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,3 @@
1
+ letty
2
+ rawmaker
3
+ spacestation
@@ -0,0 +1,18 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2021-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import rawmaker
11
+
12
# Version and root path are taken over from the rawmaker package.
__version__ = rawmaker.__version__
ROOT = rawmaker.ROOT

# Name and (currently empty) description of this process.
PROCESS = 'spacestation'
DESCRIPTION = ''
spacestation/cli.py ADDED
@@ -0,0 +1,51 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2021-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import utilo
11
+
12
+ import spacestation
13
+
14
# Steps handed to `utilo.featurepack` in `main`. `wspace` reads the pdf,
# `chardist` and `worddist` read result files produced by `spacestation`.
# NOTE(review): `wspace_words`/`wspace_wspace` presumably name the two
# outputs of the `wspace` step - confirm against utilo.
WORKPLAN = [
    utilo.create_step(
        name='wspace',
        inputs=[utilo.Pattern('*', 'pdf')],
        output=('wspace', 'words'),
    ),
    utilo.create_step(
        name='chardist',
        inputs=[utilo.ResultFile(producer='spacestation', name='wspace_words')],
        output=('chardist',),
    ),
    utilo.create_step(
        name='worddist',
        inputs=[utilo.ResultFile(producer='spacestation', name='wspace_wspace')],
        output=('worddist',),
    ),
]
35
+
36
+
37
def main():
    """Entry point of the `spacestation` console script.

    Builds the feature pack configuration from the package constants and
    hands it, together with `WORKPLAN`, to `utilo.featurepack`.
    """
    settings = utilo.FeaturePackConfig(
        description=spacestation.DESCRIPTION,
        multiprocessed=True,
        name=spacestation.PROCESS,
        pages=True,
        profileflag=True,
        singleinput=True,
        version=spacestation.__version__,
    )
    utilo.featurepack(
        workplan=WORKPLAN,
        root=spacestation.ROOT,
        featurepackage='spacestation.features',
        config=settings,
    )
@@ -0,0 +1,8 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2021-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
@@ -0,0 +1,85 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2021-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import collections
11
+ import statistics
12
+
13
+ import iamraw
14
+ import serializeraw
15
+ import utilo
16
+
17
+
18
def work(source: str, pages: tuple = None) -> str:
    """Load the words of `source` and dump the document char distances.

    Args:
        source: passed to `serializeraw.load_wwords`.
        pages: optional page selection, forwarded to the loader.
    Returns:
        Result of `serializeraw.dump_document_chardist`.
    """
    loaded = serializeraw.load_wwords(source, pages=pages)
    statistic = group_chardist(pages_chardist(loaded))
    return serializeraw.dump_document_chardist(statistic)
24
+
25
+
26
def group_chardist(pages):
    """Determine the document char distance statistic for multiple pages.

    Args:
        pages: iterable of `(page, content)` where `content` yields
            `(fontsize, distances)` items.
    Returns:
        `iamraw.DocumentCharDist` whose `mode`, `mean`, `median`, `count`,
        `maxx` and `minn` mappings are filled per font size (rounded to
        two digits).
    """
    by_fontsize = collections.defaultdict(list)
    for _, content in pages:
        for fontsize, distances in content:
            key = utilo.roundme(fontsize, digits=2)
            for distance in distances:
                by_fontsize[key].append(distance)
    operations = {
        'mode': statistics.mode,
        'mean': statistics.mean,
        'median': statistics.median,
        'count': len,
        'maxx': max,
        'minn': min,
    }
    result = iamraw.DocumentCharDist()
    for var, operation in operations.items():
        # evaluate all font sizes first, write to the result afterwards
        values = [(fontsize, utilo.roundme(operation(collected), digits=3))
                  for fontsize, collected in by_fontsize.items()]
        for fontsize, value in values:
            getattr(result, var)[fontsize] = value
    return result
50
+
51
+
52
def pages_chardist(pages):
    """Determine the char distances of every word, page by page.

    Args:
        pages: iterable of objects with `content` (words) and `page`.
    Returns:
        List of `(page, dists)`; words for which `chardist` returns a
        falsy value are left out.
    """
    collected = []
    for page in pages:
        candidates = (chardist(word) for word in page.content)
        dists = [dist for dist in candidates if dist]
        collected.append((page.page, dists))
    return collected
64
+
65
+
66
def chardist(word):
    """Determine the char distances for a single word.

    Args:
        word: sequence of 4-item chars; item 0 is the text, item 1 the
            bounding box (index 0 = left, index 2 = right edge) and
            item 2 the font size.
    Returns:
        `None` for an empty word or a word with at most two chars (a
        trailing white space char is not counted), otherwise the tuple
        `(fontsize, distances)` with the most common font size and the
        horizontal gaps between neighbouring chars.
    """
    if not word:
        return None
    # ignore a trailing white space char
    chars = word[:-1] if word[-1][0] == ' ' else word
    if len(chars) <= 2:
        return None
    right = chars[0][1][2]
    sizes, gaps = [], []
    for _, bounding, size, _ in chars[1:]:
        # gap between the right edge of the previous char and this char
        gaps.append(bounding[0] - right)
        right = bounding[2]
        sizes.append(size)
    return utilo.mode(sizes), utilo.roundme(gaps, digits=5, convert=False)
@@ -0,0 +1,57 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2021-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import collections
11
+ import statistics
12
+
13
+ import iamraw
14
+ import serializeraw
15
+ import utilo
16
+
17
+
18
def work(source: str, pages: tuple = None) -> str:
    """Load the word spaces of `source` and dump the document word distances.

    Args:
        source: passed to `serializeraw.load_wspaces`.
        pages: optional page selection, forwarded to the loader.
    Returns:
        Result of `serializeraw.dump_document_worddist`.
    """
    loaded = serializeraw.load_wspaces(source, pages=pages)
    per_page = [wordspace(page) for page in loaded]
    return serializeraw.dump_document_worddist(document_worddist(per_page))
24
+
25
+
26
def document_worddist(pages):
    """Determine the document word distance statistic for multiple pages.

    Args:
        pages: iterable of `(page, content)` where `content` maps a font
            size to a list of distances.
    Returns:
        `iamraw.DocumentWordDist` whose `mode`, `mean`, `median`, `count`,
        `maxx` and `minn` mappings are filled per font size (rounded to
        two digits).
    """
    by_fontsize = collections.defaultdict(list)
    for _, content in pages:
        for fontsize, distance in content.items():
            by_fontsize[utilo.roundme(fontsize, digits=2)].extend(distance)
    operations = {
        'mode': statistics.mode,
        'mean': statistics.mean,
        'median': statistics.median,
        'count': len,
        'maxx': max,
        'minn': min,
    }
    result = iamraw.DocumentWordDist()
    for var, operation in operations.items():
        # evaluate all font sizes first, write to the result afterwards
        values = [(fontsize, utilo.roundme(operation(collected), digits=3))
                  for fontsize, collected in by_fontsize.items()]
        for fontsize, value in values:
            getattr(result, var)[fontsize] = value
    return result
48
+
49
+
50
def wordspace(page) -> tuple:
    """Group the widths of the word spaces of a page.

    Args:
        page: object with `page` and `content`; every item of `content`
            is indexed as a box `[0..3]`.
            NOTE(review): presumably (x0, y0, x1, y1) - confirm.
    Returns:
        Tuple `(page.page, widths)`; `widths` maps the box height
        (index 3 - index 1, rounded to one digit, used as font size) to
        the list of box widths (index 2 - index 0, rounded to two digits).
    """
    widths = collections.defaultdict(list)
    for wspace in page.content:
        fontsize = utilo.roundme(wspace[3] - wspace[1], digits=1)
        width = utilo.roundme(wspace[2] - wspace[0], digits=2)
        widths[fontsize].append(width)
    # hand out a plain dict, not the defaultdict used for collecting
    return page.page, dict(widths)
@@ -0,0 +1,130 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2021-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import configos
11
+ import iamraw
12
+ import serializeraw
13
+ import utilo
14
+
15
+ import rawmaker.features
16
+ import rawmaker.miner.char
17
+ import rawmaker.reader
18
+
19
+ # chars which differ by less than this value are merged into the same line
20
+ CHAR_SAME_LINE_DIFF_MAX = configos.HV_FLOAT_PLUS(default=10.0)
21
+
22
+
23
def work(source: str, pages: tuple = None) -> tuple[str, str]:
    """Extract word spaces and words of `source` and dump both.

    Args:
        source: document passed to `extract`.
        pages: optional page selection, forwarded to `extract`.
    Returns:
        Tuple of the dumped word spaces and the dumped words.
    """
    wordspaces, words = extract(source, pages=pages)
    return (
        serializeraw.dump_wspaces(wordspaces),
        serializeraw.dump_wwords(words),
    )
28
+
29
+
30
def extract(document: str, pages: tuple = None):
    """Read `document` and collect word spaces and char groups per page.

    Args:
        document: source handed to `rawmaker.reader.read`.
        pages: optional page selection; pages for which
            `utilo.should_skip` is true are ignored.
    Returns:
        Tuple `(wordspaces, words)` of two lists of `iamraw.PageContent`;
        pages without any extracted content are left out of both lists.
    """
    with rawmaker.reader.read(document) as pdf:
        content = rawmaker.features.extract_content(
            pdf,
            converter=rawmaker.miner.char.CharPDFConvert,
            config=rawmaker.parameter.ParsingConfiguration(strip=True),
            pages=pages,
        )
    wordspaces, words = [], []
    for page in content:
        if utilo.should_skip(page.page, pages):
            continue
        extracted = extract_page(page)
        if extracted:
            wspace, chargroups = extracted
            wordspaces.append(
                iamraw.PageContent(page=page.page, content=wspace))
            words.append(
                iamraw.PageContent(page=page.page, content=chargroups))
    return wordspaces, words
50
+
51
+
52
def extract_page(chars: list, maxdiff: callable = None) -> list:
    """Split the chars of a page into char groups and collect the word gaps.

    Args:
        chars: iterable of char objects providing `_text`, `bbox`
            (x0, y0, x1, y1) and `fontsize`.
        maxdiff: callable which maps a font size to the tuple
            `(xdiff_max, ydiff_max)`; defaults to `diffme`.
    Returns:
        An empty list if there is no non-blank char, otherwise the tuple
        `(spaces, chargroups)`: `spaces` holds one `iamraw.BoundingBox`
        for every horizontal gap which starts a new word and `chargroups`
        the chars grouped by word. A vertical jump starts a new group
        without producing a space.
    """
    if not maxdiff:
        maxdiff = diffme
    # remove empty chars
    chars = [char for char in chars if char._text.strip()]  # pylint:disable=W0212
    if not chars:
        return []
    # Order the chars before seeding the first group, otherwise the group
    # starts with the first unordered char and the first ordered char is
    # lost.
    chars = sameline(chars)
    result = []
    chargroups = [[chars[0]]]
    last = chars[0].bbox
    for char in chars[1:]:
        bbox = char.bbox
        xdiff_max, ydiff_max = maxdiff(char.fontsize)
        # x0, y0, x1, y1
        xdiff = last[2] - bbox[0]
        ydiff = last[3] - bbox[3]
        if abs(ydiff) > ydiff_max:
            # new line
            chargroups.append([char])
        elif abs(xdiff) > xdiff_max:
            # new word, remember the rectangle between both chars
            result.append(
                iamraw.BoundingBox(
                    min(last[2], bbox[0]),
                    min(last[1], bbox[1]),
                    max(last[2], bbox[0]),
                    max(last[3], bbox[3]),
                ))
            chargroups.append([char])
        else:
            chargroups[-1].append(char)
        # always keep the bounding box, `last` is indexed like a bbox
        last = bbox
    return result, chargroups
91
+
92
+
93
# Lookup table evaluated by `diffme` with a font size.
# NOTE(review): the items look like (font size, maximum x distance) -
# confirm against configos.HolyTable.
MAXDIFF = configos.HolyTable(items=[
    (7.0, 1.4),
    (10.0, 1.4),
    (15.0, 3.0),
    (20.0, 4.0),
    (25.0, 5.0),
])


def diffme(fontsize: float) -> tuple:
    """Return the tuple `(xdiff, ydiff)` for `fontsize`.

    `xdiff` is looked up in `MAXDIFF`, `ydiff` is the constant 10.0.
    """
    return (MAXDIFF(fontsize), 10.0)
107
+
108
+
109
def sameline(
    chars,
    diff_max=CHAR_SAME_LINE_DIFF_MAX,
):
    """Order chars line by line and from left to right inside a line.

    Args:
        chars: iterable of char objects providing `bbox`.
        diff_max: object whose `value` is passed as `max_diff` to
            `utilo.same_line_cluster`.
    Returns:
        Result of `utilo.flat` for the line clusters, which are sorted by
        `bbox[3]` of their center and internally by `bbox[0]`.
    """
    # One stable sort by (bbox[3], bbox[0]) equals sorting by x0 first and
    # by bbox[3] afterwards.
    ordered = sorted(chars, key=lambda item: (item.bbox[3], item.bbox[0]))
    # run cluster
    lines = utilo.same_line_cluster(
        ordered,
        min_elements=1,
        # TODO: FIX ACCESS LATER, SEE RUNTIME PROBLEM IN NEAR
        max_diff=diff_max.value,
        matcher=lambda item: item.bbox[3],
    )
    # sort top down
    # TODO: REMOVE AFTER HAVING STABLE LINE CLUSTER
    lines = sorted(lines, key=lambda line: line.center.bbox[3])
    return utilo.flat(
        [sorted(line, key=lambda item: item.bbox[0]) for line in lines])