rawmaker 2.40.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- letty/__init__.py +46 -0
- letty/cli.py +63 -0
- letty/optimizer.py +138 -0
- letty/quality/__init__.py +8 -0
- letty/quality/whitespace.py +50 -0
- letty/strategy.py +8 -0
- rawmaker/__init__.py +29 -0
- rawmaker/__main__.py +13 -0
- rawmaker/__patch__.py +36 -0
- rawmaker/cli.py +206 -0
- rawmaker/cli_automate.py +69 -0
- rawmaker/converter/__init__.py +8 -0
- rawmaker/converter/basic.py +174 -0
- rawmaker/converter/images.py +168 -0
- rawmaker/date.py +83 -0
- rawmaker/destination.py +202 -0
- rawmaker/error.py +34 -0
- rawmaker/features/__init__.py +138 -0
- rawmaker/features/annotation.py +254 -0
- rawmaker/features/border.py +172 -0
- rawmaker/features/boxes.py +153 -0
- rawmaker/features/figures.py +24 -0
- rawmaker/features/fonts.py +229 -0
- rawmaker/features/formula.py +16 -0
- rawmaker/features/horizontals.py +132 -0
- rawmaker/features/images.py +155 -0
- rawmaker/features/line.py +337 -0
- rawmaker/features/outlines.py +123 -0
- rawmaker/features/text.py +91 -0
- rawmaker/fonts/__init__.py +8 -0
- rawmaker/fonts/parser.py +354 -0
- rawmaker/images/__init__.py +8 -0
- rawmaker/images/info.py +35 -0
- rawmaker/miner/__init__.py +8 -0
- rawmaker/miner/char.py +42 -0
- rawmaker/miner/colorspace.py +75 -0
- rawmaker/miner/images.py +448 -0
- rawmaker/miner/position.py +121 -0
- rawmaker/miner/rawchar.py +207 -0
- rawmaker/miner/text.py +833 -0
- rawmaker/miner/underline.py +66 -0
- rawmaker/parameter.py +130 -0
- rawmaker/patch/__init__.py +8 -0
- rawmaker/patch/ltchar.py +79 -0
- rawmaker/reader.py +97 -0
- rawmaker/text/__init__.py +8 -0
- rawmaker/text/chars.py +24 -0
- rawmaker/text/data.py +47 -0
- rawmaker/text/superfast.py +91 -0
- rawmaker/text/wordbox.py +95 -0
- rawmaker/utils.py +44 -0
- rawmaker-2.40.3.dist-info/METADATA +51 -0
- rawmaker-2.40.3.dist-info/RECORD +63 -0
- rawmaker-2.40.3.dist-info/WHEEL +5 -0
- rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
- rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
- rawmaker-2.40.3.dist-info/top_level.txt +3 -0
- spacestation/__init__.py +18 -0
- spacestation/cli.py +51 -0
- spacestation/features/__init__.py +8 -0
- spacestation/features/chardist.py +85 -0
- spacestation/features/worddist.py +57 -0
- spacestation/features/wspace.py +130 -0
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
rawmaker/fonts/parser.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""Font Parser
|
|
10
|
+
|
|
11
|
+
PDF Font description:
|
|
12
|
+
|
|
13
|
+
9.5. Introduction into Font Data Structures
|
|
14
|
+
|
|
15
|
+
Font types
|
|
16
|
+
|
|
17
|
+
Type0
|
|
18
|
+
Type1 Type1
|
|
19
|
+
MMType1: MultiMaster Font
|
|
20
|
+
Type3 stream of pdf graphic operators
|
|
21
|
+
TrueType Based on TrueType font format
|
|
22
|
+
CIDFont CIDFontType0
|
|
23
|
+
CIDFontType2
|
|
24
|
+
|
|
25
|
+
9.6.2.2 Standard Type 1 Fonts
|
|
26
|
+
|
|
27
|
+
uses compact encoding for glyph description and additional hints to print
|
|
28
|
+
at small sizes and low resolutions well.
|
|
29
|
+
|
|
30
|
+
PostScript 14 standard types:
|
|
31
|
+
Times-Roman, Helvetica, Courier, Symbol, Times-Bold,
|
|
32
|
+
Helvetica-Bold, Courier-Bold, ZapfDingbats, Times-Italic,
|
|
33
|
+
Helvetica-Oblique, Courier-Oblique, Times-BoldItalic,
|
|
34
|
+
Helvetica-BoldOblique, Courier-BoldOblique.
|
|
35
|
+
|
|
36
|
+
9.6.2.3 MultiMasterFonts
|
|
37
|
+
|
|
38
|
+
9.6.3. TrueTypeFonts
|
|
39
|
+
|
|
40
|
+
9.6.4. Font Subsets
|
|
41
|
+
|
|
42
|
+
BaseFont
|
|
43
|
+
FontName
|
|
44
|
+
|
|
45
|
+
Tag(6 chars) +
|
|
46
|
+
|
|
47
|
+
Example: EOODIA+Poetica - name of a subset of Poetica, a Type 1 font.
|
|
48
|
+
|
|
49
|
+
9.6.5 Type 3 Fonts
|
|
50
|
+
|
|
51
|
+
Defined by a stream of pdf graphic commands, no special support or hint
|
|
52
|
+
for very small characters.
|
|
53
|
+
|
|
54
|
+
9.7.4 CIDFonts
|
|
55
|
+
|
|
56
|
+
CIDFont program contains glyph descriptions that are accessed using a CID
|
|
57
|
+
as a character selector.
|
|
58
|
+
|
|
59
|
+
Summary:
|
|
60
|
+
|
|
61
|
+
Font Type 0
|
|
62
|
+
('Helvetica - Bold', 16.70),
|
|
63
|
+
('Times - Roman', 13.40),
|
|
64
|
+
Font Type 1, TrueType Fonts:
|
|
65
|
+
('ZTJCPR + NimbusRomNo9L - MediItal', 11.60),
|
|
66
|
+
('KCXMNX + TeX - feymr10', 10.70),
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
import contextlib
|
|
70
|
+
|
|
71
|
+
import iamraw
|
|
72
|
+
import iamraw.fonts
|
|
73
|
+
import serializeraw
|
|
74
|
+
import utilo
|
|
75
|
+
|
|
76
|
+
# The 14 standard Type 1 font names (PDF specification, 9.6.2.2 Standard
# Type 1 Fonts). `parse_postscript14` uses this set for an exact name match,
# therefore every entry has to be spelled exactly like in the specification.
POSTSCRIPT_14_DEFAULT = {
    'Courier',
    'Courier-Bold',
    'Courier-BoldOblique',
    'Courier-Oblique',
    'Helvetica',
    'Helvetica-Bold',
    'Helvetica-BoldOblique',
    'Helvetica-Oblique',
    'Symbol',
    'Times-Bold',
    'Times-BoldItalic',
    'Times-Italic',
    'Times-Roman',
    'ZapfDingbats',
}
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def font_fromraw(font: str, scale: float = 0.0, flags: int = 0) -> iamraw.Font:
    """Build an `iamraw.Font` out of a raw pdf font name.

    All specialised parsers are run on the name without blanks. The first
    usable result is taken in this order: cid font, simple, styled, base
    font, default.

    Args:
        font(str): pdf standard font definition
        scale(float): size of font, passed through unchanged
        flags(int): raw style flags, decoded by `serializeraw.load_flags`
    Returns:
        internal `Font` object with detected name, style and scale; the
        unmodified input is stored as `pdfref`

    >>> font_fromraw('Times-RomanRegularLight')
    Font(pdfref='Times-RomanRegularLight',...)
    >>> font_fromraw('CFSJNQ+LinBiolinumTB')
    Font(pdfref='CFSJNQ+LinBiolinumTB',...weight=iamraw.fonts.Weight.BOLD,...)
    """
    utilo.call('rawmaker.fonts.parser.font_fromraw')
    utilo.debug('%s %.2f' % (str(font), scale))  # pylint:disable=C0209
    decoded_flags = serializeraw.load_flags(flags)
    # Blanks would hide PostScript 14 names, for example `Times - Roman`
    # has to be treated as `Times-Roman`.
    compact = font.replace(' ', '')
    # every parser is evaluated up front, the selection happens afterwards
    from_base = parse_basefont(compact)
    from_cid = parse_cidfont(compact)
    from_default = parse_default(compact)
    from_styled = parse_font_styled(compact)
    from_simple = parse_font_simple(compact)
    # The cid font comes first because its selector is unambiguous.
    ranked = (
        (from_cid, from_cid is not None),
        (from_simple, bool(from_simple)),
        (from_styled, bool(from_styled)),
        (from_base, from_base is not None),
        (from_default, from_default is not None),
    )
    fontname, parsed_style = next(
        (result for result, usable in ranked if usable),
        (None, None),
    )
    # without a parsed style all three attributes stay undefined
    if parsed_style:
        weight, style, stretch = parsed_style
    else:
        weight, style, stretch = None, None, None
    # a missing name or left over separators indicate a parsing problem
    if fontname is None or '+' in fontname or ',' in fontname:
        utilo.error(f'detected fontname {fontname}; input: {compact}')
    return iamraw.Font(
        name=fontname,
        scale=scale,
        stretch=stretch,
        style=style,
        weight=weight,
        flags=decoded_flags,
        pdfref=font,
    )
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def parse_basefont(font: str):
    """Parse a font name without subset tag, optionally `Name,Style`.

    Returns None for names containing a plus sign and for styles which
    `parse_style` does not accept. Otherwise the name, reduced by the
    tokens of every detected style, and the style triple are returned.

    >>> parse_basefont('Arial,Bold')
    ('Arial',...Weight.BOLD...)
    >>> parse_basefont('WTUVLZ+NimbusRomNo9L-Regu') is None
    True
    >>> parse_basefont('Times-RomanRegularLight')
    ('Times-Roman', (...))
    """
    if '+' in font:
        return None
    # Example: Arial,Bold
    try:
        fontname, raw_style = font.split(',')
    except ValueError:
        # none or more than one comma: use the whole text as name
        fontname, raw_style = font, ''
    style = parse_style(raw_style)
    if not style:
        return None
    detected = set(style)
    for token, *attributes in STYLES:
        # only tokens which belong to a detected attribute are removed
        if detected & set(attributes):
            fontname = utilo.rreplace(fontname, token, '')
    return fontname, style
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def parse_cidfont(font: str):
    """Split the `CIDFont+` marker from a font name.

    Returns None if `font` does not start with `CIDFont+`, otherwise the
    text behind the marker together with None as style.

    >>> parse_cidfont('CIDFont+F1')
    ('F1', None)
    """
    marker = 'CIDFont+'
    if font.startswith(marker):
        # Example: CIDFont+F1 -> F1, the cid tag and plus sign are dropped
        return font[len(marker):], None
    return None
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def parse_postscript14(fontname: str):
    """Parse one of the names listed in `POSTSCRIPT_14_DEFAULT`.

    Returns None for every other name. A listed name without hyphen is
    returned with None as style, for example `Courier`. A listed name with
    hyphen, for example `Courier-Oblique`, is split and the second part is
    passed to `parse_style`, whose result may be None.
    """
    if fontname not in POSTSCRIPT_14_DEFAULT:
        return None
    if '-' in fontname:
        family, raw_style = fontname.split('-')
        return family, parse_style(raw_style)
    return fontname, None
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def parse_default(font: str):
    """Parse a subset font name like `LGAZPG+SegoeUI,Bold`.

    NOTE(review): the first seven characters (base tag and plus sign) are
    removed unconditionally, also if `font` carries no subset tag.

    Returns:
        Result of `parse_postscript14` if the remaining text is a standard
        name, otherwise a tuple of font name and style. The style is None
        if no style could be determined.
    """
    stripped = font[7:]

    standard = parse_postscript14(stripped)
    if standard:
        # 'AIDZQU+Times-Roman' no style parsing is required
        return standard

    fontname, raw_style = stripped, ''
    # a separator which does not split into exactly two parts is ignored;
    # the hyphen is tried last and therefore wins over the comma
    for separator in (',', '-'):
        with contextlib.suppress(ValueError):
            fontname, raw_style = stripped.split(separator)
    style = parse_style(raw_style)
    if not raw_style:
        # no separate style part, search for style tokens inside the name
        simple = parse_font_simple(fontname)
        if simple:
            fontname, style = simple
    # TODO: FONT STYLE PARSER
    return fontname, (style if style else None)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def parse_font_styled(font: str):
    """Parse a font name of the form `Name-Style` without subset tag.

    Returns None if `font` contains a plus sign, does not contain exactly
    one hyphen or if `parse_style` rejects the part behind the hyphen.
    Otherwise the name, cleaned by `named`, and the style are returned.
    """
    if '+' in font or font.count('-') != 1:
        return None
    raw_name, raw_style = font.split('-')
    style = parse_style(raw_style)
    if not style:
        return None
    return named(raw_name), style
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def parse_font_simple(font: str):
    """Parse a font name which contains no `+`, `-` or `,`.

    Every token of `STYLES` found in the name is removed from it and its
    defined attributes replace the defaults MEDIUM, NORMAL and REGULAR. If
    several tokens set the same attribute, the last one wins.

    Returns:
        None if the name contains one of the separators, otherwise the
        reduced name and the tuple (weight, style, stretch).
    """
    for separator in ('+', '-', ','):
        if separator in font:
            return None
    weight, style, stretch = MEDIUM, NORMAL, REGULAR
    for token, new_weight, new_style, new_stretch in STYLES:
        if token not in font:
            continue
        font = font.replace(token, '')
        # attributes which the token does not define keep their value
        weight = new_weight if new_weight else weight
        style = new_style if new_style else style
        stretch = new_stretch if new_stretch else stretch
    return font, (weight, style, stretch)
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def named(font: str):
    """Return `font` with every token listed in `STYLES` removed.

    The tokens are removed one after the other in table order.
    """
    for token, *_ in STYLES:
        font = font.replace(token, '')
    return font
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def font_toraw(font: iamraw.Font) -> str:
    """Convert a `Font` back into its raw text representation.

    A font without any weight, style or stretch is written as
    `CIDFont+<name>`. Otherwise the suffix of every selected attribute is
    appended to the name in the order of the suffix table below.

    >>> font_toraw(iamraw.Font(name='Times-Roman', scale=8.95, weight=LIGHT, stretch=REGULAR))
    'Times-RomanRegularLight'
    """
    selected = {font.weight, font.style, font.stretch}
    if not any(selected):
        # no style given, do not use default style
        return f'CIDFont+{font.name}'
    suffixes = (
        ('Bd', BOLD, None, None),
        ('Italic', None, ITALIC, None),
        ('Medium', MEDIUM, None, None),
        ('Oblique', None, OBLIQUE, None),
        ('Regular', None, None, REGULAR),
        ('Light', LIGHT, None, None),
    )
    result = font.name
    for suffix, *attributes in suffixes:
        # a row matches if one of its defined attributes is selected
        if any(value and value in selected for value in attributes):
            result += suffix
    return result
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def parse_style(raw_style):  # pylint:disable=R1260,R0912
    """Translate a raw style text into (weight, style, stretch).

    Every token of `STYLES` found in the text is removed from it and its
    defined attributes replace the defaults LIGHT, NORMAL and REGULAR.

    Returns:
        The attribute triple if the text was consumed completely. If text
        is left over, an error is reported and None is returned.
    """
    remaining = raw_style
    weight, style, stretch = LIGHT, NORMAL, REGULAR
    for token, new_weight, new_style, new_stretch in STYLES:
        if token in remaining:
            remaining = remaining.replace(token, '')
            if new_weight:
                weight = new_weight  # pylint:disable=R0204
            if new_style:
                style = new_style
            if new_stretch:
                stretch = new_stretch
    if remaining:
        # at the end, everything must be replaced
        # TODO: Remove the error message before going live
        utilo.error(f'unsupported style {raw_style}, maybe a name: {remaining}')
        return None
    return weight, style, stretch
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# Short aliases for the enum members of `iamraw` used by the style table and
# the parsers of this module.
BOLD = iamraw.Weight.BOLD
ITALIC = iamraw.Style.ITALIC
LIGHT = iamraw.Weight.LIGHT
MEDIUM = iamraw.Weight.MEDIUM
NORMAL = iamraw.Style.NORMAL
OBLIQUE = iamraw.Style.OBLIQUE
REGULAR = iamraw.Stretch.REGULAR

# TODO: INVESTIGATE MT
# BOLD = BOLD
# MT = MEDIUM
# HOW TO DEAL WITH BOLD MT?

# Style table, one row per token: (token, weight, style, stretch). None means
# that the token does not define this attribute. The parsers walk the table
# from top to bottom and remove a matched token from the text before the
# next row is tested, therefore the order of the rows is significant: a long
# token like `Italic` has to stay in front of its abbreviation `Ital`.
STYLES = [
    ('Bd', BOLD, None, None),
    ('Italic', None, ITALIC, None),
    ('Ital', None, ITALIC, None),
    # ('MI', MEDIUM, ITALIC, None),
    ('Medium', MEDIUM, None, None),
    ('Medi', MEDIUM, None, None),
    # ('M', MEDIUM, None, None), # HOW TO DEAL WITH ?BOLDMT?
    ('MT', None, None, None),  # HOW TO DEAL WITH ?BOLDMT? TODO: DECIDE LATER
    ('Bold', BOLD, None, None),  # HOW TO DEAL WITH ?BOLDMT?
    ('Oblique', None, OBLIQUE, None),
    ('Obli', None, OBLIQUE, None),
    ('PSMT', MEDIUM, None, None),
    ('PS', MEDIUM, None, None),
    ('Regular', None, None, REGULAR),
    ('Regu', None, None, REGULAR),
    ('Rg', None, None, REGULAR),
    ('Light', LIGHT, None, None),
    ('TB', BOLD, None, None),
]
|
|
351
|
+
# LinBiolinumTB: TODO CHECK TB
|
|
352
|
+
# TODO: Roman converts TimesNewRoman to TimesNew. I could not verify if
|
|
353
|
+
# that is a smart necessary option or it will introduce more problems?
|
|
354
|
+
# ('Roman', None, None, None),
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
rawmaker/images/info.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import iamraw
|
|
11
|
+
import PIL.Image
|
|
12
|
+
import utilo
|
|
13
|
+
|
|
14
|
+
# resolution which is reported if the image file defines no `dpi` itself
DEFAULT_DPI = (96, 96)


def imageinfo(path: str, page: int, bounding: tuple) -> iamraw.ImageInformation:
    """Collect size and resolution of the image file stored at `path`.

    Args:
        path(str): location of the image file, opened with Pillow
        page(int): page number, passed through unchanged
        bounding(tuple): `iamraw.BoundingBox` or plain tuple, passed
                         through unchanged
    Returns:
        `iamraw.ImageInformation` with width, height, dpi, page and
        bounding. None if Pillow raises an `OSError` while opening or
        decoding the file; the error is reported with `utilo.error`.
    """
    assert isinstance(bounding, (iamraw.BoundingBox, tuple)), type(bounding)
    try:
        # The context manager closes the file handle in every case, also
        # if decoding fails or the image keeps its file open after load().
        with PIL.Image.open(path) as image:
            image.load()
            width, height = image.size
            # add default DPI to distinguish images and figures
            dpi = image.info.get('dpi', DEFAULT_DPI)
    except OSError as err:
        utilo.error(err)
        return None
    return iamraw.ImageInformation(
        width=width,
        height=height,
        dpi=dpi,
        page=page,
        bounding=bounding,
    )
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
rawmaker/miner/char.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import iamraw
|
|
11
|
+
|
|
12
|
+
import rawmaker.converter.basic
|
|
13
|
+
import rawmaker.miner.text
|
|
14
|
+
import rawmaker.parameter
|
|
15
|
+
import rawmaker.patch.ltchar
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CharPDFConvert(rawmaker.miner.text.PrecisePDFConverter):
    """Converter which stores the patched characters of every page.

    The layout analysis is switched off. Each received page is reduced to
    its `PatchedLTChar` items, which are appended in sorted order to a new
    `iamraw.Page`.
    """

    def __init__(
        self,
        config: rawmaker.parameter.ParsingConfiguration = None,
        imagewriter: callable = None,
    ):
        """Create the converter; `config` and `imagewriter` are not used."""
        super().__init__()
        self.laparams = None  # disable layout analysis

    def receive_layout(self, ltpage):
        """Append the sorted characters of `ltpage` as page to the document."""
        rawmaker.converter.basic.FlippedLayoutAnalyzer.receive_layout(
            self,
            ltpage,
        )
        patched = rawmaker.patch.ltchar.PatchedLTChar
        chars = [item for item in ltpage if isinstance(item, patched)]
        # Two stable passes: the result is ordered by y1, characters with
        # equal y1 are ordered by x0.
        chars.sort(key=lambda char: char.bbox[0])  # x0 # pylint:disable=C3001
        chars.sort(key=lambda char: char.bbox[3])  # y1 # pylint:disable=C3001

        bounding = iamraw.BoundingBox(*ltpage.bbox)
        page = iamraw.Page(ltpage.pageid, bounding)
        for char in chars:
            page.append(char)
        self.document.pages.append(page)  # pylint:disable=E1101
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2021-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import contextlib
|
|
11
|
+
|
|
12
|
+
import rawmaker.utils
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def parse(colorspace) -> str:  # pylint:disable=R0911
    """Determine the name of a pdf color space definition.

    A definition with exactly one item is replaced by this item. An empty
    definition is reported as `DeviceGray`. Names are returned as they are,
    except for the known abbreviations. Everything else is treated as a
    sequence whose first item carries the type in its `name` attribute:
    `Indexed` and `ICCBased` are delegated, any other type is reported as
    `DeviceRGB`.

    >>> import pdfminer.psparser
    >>> parse([pdfminer.psparser.PSLiteral('CS')]) # verify later
    'CS'
    """
    if len(colorspace) == 1:
        colorspace = colorspace[0]
    if not colorspace:
        # TODO: VERIFY THIS
        return 'DeviceGray'
    resolved = name(colorspace)
    if isinstance(resolved, str):
        # TODO: VERIFY R213!
        aliases = {
            # RGB is an abbreviation of DeviceRGB
            'DeviceRGB': 'DeviceRGB',
            'RGB': 'DeviceRGB',
            'R213': 'DeviceRGB',
            # G is an abbreviation of DeviceGray
            'DeviceGray': 'DeviceGray',
            'G': 'DeviceGray',
        }
        # every unlisted name, for example `CS`, is returned unchanged
        return aliases.get(resolved, resolved)
    kind = resolved[0].name
    if kind == 'Indexed':
        return indexed_space(*resolved[1:])
    if kind == 'ICCBased':
        return iccbased(rawmaker.utils.resolve(resolved[1]))
    return 'DeviceRGB'
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def indexed_space(base, hival, lookup):  # pylint:disable=W0613
    """Determine the color space name of an `Indexed` definition.

    Args:
        base: base color space, either a name or a sequence whose first item
              is the type and whose second item is the referenced stream
        hival: not used
        lookup: not used
    Returns:
        Result of `iccbased` for an `ICCBased` base, `DeviceRGB` for a
        `DeviceRGB` base and None for every other base.
    """
    base = name(base)
    # The type item may be a plain text or an object which carries the text
    # in its `name` attribute, like the items evaluated in `parse`. Comparing
    # such an object directly with a text never matches.
    kind = base[0]
    if getattr(kind, 'name', kind) == 'ICCBased':
        return iccbased(rawmaker.utils.resolve(base[1]))
    if str(base) == 'DeviceRGB':
        return 'DeviceRGB'
    return None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def iccbased(stream) -> str:
    """Determine the color space name of an `ICCBased` stream.

    The number of components `N` selects `DeviceGray` (1), `DeviceRGB` (3)
    or `DeviceCMYK` (4); any other value of `N` is passed through unchanged.
    An `Alternate` entry with a `name` attribute overrides this result.

    Args:
        stream: object whose `attrs` mapping holds the stream dictionary
    Returns:
        Name of the color space, or None if the dictionary defines neither
        `N` nor a usable `Alternate`.
    """
    attributes = stream.attrs
    # rawdata = stream.rawdata # pylint:disable=W0612
    # defined up front, otherwise a dictionary without `N` and `Alternate`
    # ends in an UnboundLocalError
    colorspace = None
    with contextlib.suppress(KeyError):
        colorspace = attributes['N']
        if colorspace == 1:
            colorspace = 'DeviceGray'
        elif colorspace == 3:
            colorspace = 'DeviceRGB'
        elif colorspace == 4:
            colorspace = 'DeviceCMYK'
    # `Alternate` may be an array instead of a name; without a `name`
    # attribute the result derived from `N` is kept
    with contextlib.suppress(KeyError, AttributeError):
        colorspace = attributes['Alternate'].name
    return colorspace
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def name(reference) -> str:
    """Resolve `reference` and return its `name` attribute.

    An object without `name` attribute is returned itself in resolved form.
    """
    resolved = rawmaker.utils.resolve(reference)
    return getattr(resolved, 'name', resolved)
|