rawmaker 2.40.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- letty/__init__.py +46 -0
- letty/cli.py +63 -0
- letty/optimizer.py +138 -0
- letty/quality/__init__.py +8 -0
- letty/quality/whitespace.py +50 -0
- letty/strategy.py +8 -0
- rawmaker/__init__.py +29 -0
- rawmaker/__main__.py +13 -0
- rawmaker/__patch__.py +36 -0
- rawmaker/cli.py +206 -0
- rawmaker/cli_automate.py +69 -0
- rawmaker/converter/__init__.py +8 -0
- rawmaker/converter/basic.py +174 -0
- rawmaker/converter/images.py +168 -0
- rawmaker/date.py +83 -0
- rawmaker/destination.py +202 -0
- rawmaker/error.py +34 -0
- rawmaker/features/__init__.py +138 -0
- rawmaker/features/annotation.py +254 -0
- rawmaker/features/border.py +172 -0
- rawmaker/features/boxes.py +153 -0
- rawmaker/features/figures.py +24 -0
- rawmaker/features/fonts.py +229 -0
- rawmaker/features/formula.py +16 -0
- rawmaker/features/horizontals.py +132 -0
- rawmaker/features/images.py +155 -0
- rawmaker/features/line.py +337 -0
- rawmaker/features/outlines.py +123 -0
- rawmaker/features/text.py +91 -0
- rawmaker/fonts/__init__.py +8 -0
- rawmaker/fonts/parser.py +354 -0
- rawmaker/images/__init__.py +8 -0
- rawmaker/images/info.py +35 -0
- rawmaker/miner/__init__.py +8 -0
- rawmaker/miner/char.py +42 -0
- rawmaker/miner/colorspace.py +75 -0
- rawmaker/miner/images.py +448 -0
- rawmaker/miner/position.py +121 -0
- rawmaker/miner/rawchar.py +207 -0
- rawmaker/miner/text.py +833 -0
- rawmaker/miner/underline.py +66 -0
- rawmaker/parameter.py +130 -0
- rawmaker/patch/__init__.py +8 -0
- rawmaker/patch/ltchar.py +79 -0
- rawmaker/reader.py +97 -0
- rawmaker/text/__init__.py +8 -0
- rawmaker/text/chars.py +24 -0
- rawmaker/text/data.py +47 -0
- rawmaker/text/superfast.py +91 -0
- rawmaker/text/wordbox.py +95 -0
- rawmaker/utils.py +44 -0
- rawmaker-2.40.3.dist-info/METADATA +51 -0
- rawmaker-2.40.3.dist-info/RECORD +63 -0
- rawmaker-2.40.3.dist-info/WHEEL +5 -0
- rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
- rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
- rawmaker-2.40.3.dist-info/top_level.txt +3 -0
- spacestation/__init__.py +18 -0
- spacestation/cli.py +51 -0
- spacestation/features/__init__.py +8 -0
- spacestation/features/chardist.py +85 -0
- spacestation/features/worddist.py +57 -0
- spacestation/features/wspace.py +130 -0
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""RawCharacter
|
|
10
|
+
============
|
|
11
|
+
|
|
12
|
+
The concept of `RawItem`s aims to store the full pdf information by
|
|
13
|
+
current items to use them for further analysis. This information is
|
|
14
|
+
`rawmaker` internal and will be removed before serializing the data.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import contextlib
|
|
18
|
+
|
|
19
|
+
import iamraw
|
|
20
|
+
import pdfminer.layout
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class RawChar(iamraw.Char):
    """`iamraw.Char` that also carries a pdfminer `LTChar`.

    The extra `ltchar` attribute is `rawmaker` internal (see the module
    docstring); everything else is handled by the `iamraw.Char` base.
    """

    def __init__(self, ltchar: pdfminer.layout.LTChar, **kwargs) -> None:
        """Initialize the base class from `kwargs` and store `ltchar`.

        Args:
            ltchar: pdfminer layout character to keep on the instance.
            **kwargs: forwarded unchanged to `iamraw.Char.__init__`.
        """
        # Base initializer runs first; its signature is not visible here,
        # so the keyword arguments are passed through untouched.
        super().__init__(**kwargs)
        self.ltchar = ltchar
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class RawUnicodeChar(iamraw.UnicodeChar):
    """`iamraw.UnicodeChar` that also carries a pdfminer `LTChar`.

    The extra `ltchar` attribute is `rawmaker` internal (see the module
    docstring); everything else is handled by the `iamraw.UnicodeChar` base.
    """

    def __init__(self, ltchar: pdfminer.layout.LTChar, **kwargs) -> None:
        """Initialize the base class from `kwargs` and store `ltchar`.

        Args:
            ltchar: pdfminer layout character to keep on the instance.
            **kwargs: forwarded unchanged to `iamraw.UnicodeChar.__init__`.
        """
        # Base initializer runs first; its signature is not visible here,
        # so the keyword arguments are passed through untouched.
        super().__init__(**kwargs)
        self.ltchar = ltchar
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def special_char(item: str, fontname: str = None) -> str:
    """\
    Translate a single extracted character via the replacement tables.

    Returns `None` for empty input and for characters that no table
    knows. Input whose first code point lies in 27..128 is returned
    unchanged. Otherwise the font specific tables (selected by the
    substrings 'LMMath' and 'NPPQGQ' in `fontname`) are consulted before
    the general table.

    >>> special_char('š')
    's'
    >>> special_char('é')
    'e'
    >>> special_char('∗')
    '*'
    >>> special_char('ff')
    'ff'
    """
    if not item:
        return None
    # Only the first code point decides whether the input passes through.
    if 27 <= ord(item[0]) <= 128:
        return item

    # Lookup order: font specific tables first, general table last.
    tables = []
    if fontname:
        if 'LMMath' in fontname:
            tables.append(SPECIAL_CHARS_LMMath)
        if 'NPPQGQ' in fontname:
            tables.append(SPECIAL_CHARS_NPPQGQ)
    tables.append(SPECIAL_CHARS_TABLE)

    for table in tables:
        try:
            return table[item]
        except KeyError:
            # Not in this table, try the next one.
            continue
    return None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def special_chars(text: str) -> str:
    """\
    Apply `special_char` to every character of `text` and join the results.

    Characters for which `special_char` returns `None` are dropped. No
    font name is passed, so only the general table is consulted.

    >>> special_chars('Řůř')
    'Rur'
    >>> special_chars('öäüÖÄÜ')
    'öäüÖÄÜ'
    """
    translated = (special_char(char) for char in text)
    return ''.join(piece for piece in translated if piece is not None)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def parse_special_chars(table: str) -> dict:
    """Parse a whitespace separated replacement table into a dict.

    Every data line has the form ``<key> <replacement> [ignored ...]``;
    only the first two fields are used, so trailing glyphs or inline
    comments are allowed. Empty lines, whitespace-only lines and lines
    whose first non-blank character is ``#`` are skipped. When a key
    occurs more than once the last occurrence wins.

    Args:
        table: multi-line table text.

    Returns:
        Mapping of key to replacement, in order of first appearance.

    Raises:
        IndexError: if a data line has a key but no replacement.
    """
    result = {}
    for line in table.strip().splitlines():
        fields = line.split()
        # `not fields` covers empty and whitespace-only lines, which would
        # otherwise fail on the index access below.
        if not fields or fields[0].startswith('#'):
            continue
        result[fields[0]] = fields[1]
    return result
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# Replacement table consulted by `special_char` when the font name contains
# 'LMMath'. The literal is not a raw string, so Python resolves the `\u`/`\x`
# escapes before parsing: each active line maps the symbol onto itself, which
# keeps these symbols instead of dropping them. Lines starting with `#` are
# skipped by the parser.
SPECIAL_CHARS_LMMath = parse_special_chars("""
\u03B1 α # alpha
\u03B2 β # beta
# \u2211 − # minus
\u2206 ∆
\u223c ∼
\u2212 − # minus
\u03c0 π
\u03c6 φ
\u03c9 ω
\u25e6 ◦
\u03c4 τ
\u03c1 ρ
\xb7 ·
\xb5 µ
# \u03B1 a # alpha
# \u03B2 b # beta
# \u2212 - # minus
# \u03c0 p
# \u03c6 o
# \u03c9 w
""")
|
|
111
|
+
|
|
112
|
+
# TODO: HC_DISS_171_P9
|
|
113
|
+
# TODO: CHECK CRAZY FONT
|
|
114
|
+
# Replacement table consulted by `special_char` when the font name contains
# 'NPPQGQ'. The parsed text block is empty; the only entry is added below.
SPECIAL_CHARS_NPPQGQ = parse_special_chars("""
""")
# Code point 11 (VT, vertical tab) is set directly on the dict; presumably
# because VT counts as whitespace and could not be written as a key in the
# whitespace separated table text.
SPECIAL_CHARS_NPPQGQ['\x0b'] = 'ff'
|
|
118
|
+
|
|
119
|
+
# TODO: REQUIRE BETTER APPROACH OF REPLACING `LIGATURES`
|
|
120
|
+
# General replacement table, consulted by `special_char` after any font
# specific table. Line format: `<key> <replacement> [glyph or comment]`; only
# the first two fields are used. The literal is not a raw string, so Python
# resolves the `\u`/`\x` escapes before parsing and the keys are the actual
# characters. German umlauts map onto themselves (they are kept), most other
# accented Latin letters are reduced to their base letter. The lowercase
# accented i variants map to 'i', matching the uppercase ones that map to 'I'.
SPECIAL_CHARS_TABLE = parse_special_chars("""
# legiaturen
\uFB00 ff
\uFB01 fi
\uFB02 fl
\uFB03 ffi

\u2217 * # hcdiss171p9
\x03 * # hcdiss171p9

\xA8 ¨

# umlaute
\xC4 Ä
\xD6 Ö
\xDC Ü
\xE4 ä
\xF6 ö
\xFC ü

\u0161 s š
\xE9 e é

\xa1 i ¡
\xc0 A À
\xc1 A Á
\xc2 A Â
\xc3 A Ã
# \xc4 A Ä
\xc5 A Å
\xc6 A Æ
\xc7 C Ç
\xc8 E È
\xc9 E É
\xca E Ê
\xcb E Ë
\xcc I Ì
\xcd I Í
\xce I Î
\xcf I Ï
\xd0 D Ð
\xd1 N Ñ
\xd2 O Ò
\xd3 O Ó
\xd4 O Ô
\xd5 O Õ
# \xd6 O Ö
\xd8 O Ø
\xd9 U Ù
\xda U Ú
\xdb U Û
# \xdc U Ü
\xdd Y Ý
\xe0 a à
\xe1 a á
\xe2 a â
\xe3 a ã
# \xe4 a ä
\xe5 a å
\xe6 a æ
\xe7 c ç
\xe8 e è
\xe9 e é
\xea e ê
\xeb e ë
\xec i ì
\xed i í
\xee i î
\xef i ï
\xf0 o ð
\xf1 n ñ
\xf2 o ò
\xf3 o ó
\xf4 o ô
\xf5 o õ
# \xf6 ö ö
\xf8 o ø
\xf9 u ù
\xfa u ú
\xfb u û
# \xfc ü ü
\xfd y ý
\xff y ÿ
Ř R
ř r
ů u
Ů U
""")
|