rawmaker 2.40.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- letty/__init__.py +46 -0
- letty/cli.py +63 -0
- letty/optimizer.py +138 -0
- letty/quality/__init__.py +8 -0
- letty/quality/whitespace.py +50 -0
- letty/strategy.py +8 -0
- rawmaker/__init__.py +29 -0
- rawmaker/__main__.py +13 -0
- rawmaker/__patch__.py +36 -0
- rawmaker/cli.py +206 -0
- rawmaker/cli_automate.py +69 -0
- rawmaker/converter/__init__.py +8 -0
- rawmaker/converter/basic.py +174 -0
- rawmaker/converter/images.py +168 -0
- rawmaker/date.py +83 -0
- rawmaker/destination.py +202 -0
- rawmaker/error.py +34 -0
- rawmaker/features/__init__.py +138 -0
- rawmaker/features/annotation.py +254 -0
- rawmaker/features/border.py +172 -0
- rawmaker/features/boxes.py +153 -0
- rawmaker/features/figures.py +24 -0
- rawmaker/features/fonts.py +229 -0
- rawmaker/features/formula.py +16 -0
- rawmaker/features/horizontals.py +132 -0
- rawmaker/features/images.py +155 -0
- rawmaker/features/line.py +337 -0
- rawmaker/features/outlines.py +123 -0
- rawmaker/features/text.py +91 -0
- rawmaker/fonts/__init__.py +8 -0
- rawmaker/fonts/parser.py +354 -0
- rawmaker/images/__init__.py +8 -0
- rawmaker/images/info.py +35 -0
- rawmaker/miner/__init__.py +8 -0
- rawmaker/miner/char.py +42 -0
- rawmaker/miner/colorspace.py +75 -0
- rawmaker/miner/images.py +448 -0
- rawmaker/miner/position.py +121 -0
- rawmaker/miner/rawchar.py +207 -0
- rawmaker/miner/text.py +833 -0
- rawmaker/miner/underline.py +66 -0
- rawmaker/parameter.py +130 -0
- rawmaker/patch/__init__.py +8 -0
- rawmaker/patch/ltchar.py +79 -0
- rawmaker/reader.py +97 -0
- rawmaker/text/__init__.py +8 -0
- rawmaker/text/chars.py +24 -0
- rawmaker/text/data.py +47 -0
- rawmaker/text/superfast.py +91 -0
- rawmaker/text/wordbox.py +95 -0
- rawmaker/utils.py +44 -0
- rawmaker-2.40.3.dist-info/METADATA +51 -0
- rawmaker-2.40.3.dist-info/RECORD +63 -0
- rawmaker-2.40.3.dist-info/WHEEL +5 -0
- rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
- rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
- rawmaker-2.40.3.dist-info/top_level.txt +3 -0
- spacestation/__init__.py +18 -0
- spacestation/cli.py +51 -0
- spacestation/features/__init__.py +8 -0
- spacestation/features/chardist.py +85 -0
- spacestation/features/worddist.py +57 -0
- spacestation/features/wspace.py +130 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import contextlib
|
|
11
|
+
|
|
12
|
+
import iamraw
|
|
13
|
+
import pdfminer.converter
|
|
14
|
+
import pdfminer.layout
|
|
15
|
+
import pdfminer.pdfinterp
|
|
16
|
+
import pdfminer.pdfpage
|
|
17
|
+
import utilo
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class FlippedLayoutAnalyzer(pdfminer.converter.PDFLayoutAnalyzer):
    """Layout analyzer which mirrors the y-axis of all page objects.

    `receive_layout` runs the layout analysis, passes every object to
    `flip_object`, replaces each bounding with `figure_bounding` and
    drops the objects which are left without a bounding.
    """

    def __init__(self, laparams=None, pageno=0):
        # every analyzer owns a fresh resource manager, see `resources`
        manager = pdfminer.pdfinterp.PDFResourceManager()
        super().__init__(rsrcmgr=manager, pageno=pageno, laparams=laparams)

    def receive_layout(self, ltpage):
        """Analyze, flip and clean up the objects of `ltpage` in place."""
        if content_inside_single_figure(ltpage):
            # extract content out of a single figure container
            container = ltpage._objs[0]  # pylint:disable=W0212
            ltpage._objs = container._objs  # pylint:disable=W0212
        # use default layout for image extractor if none is configured
        ltpage.analyze(self.laparams or pdfminer.layout.LAParams())
        for child in ltpage:
            flip_object(child, ltpage)
        for child in ltpage:
            child.bbox = figure_bounding(child)
        # remove invisible objects
        ltpage._objs = [  # pylint:disable=W0212
            child for child in ltpage if child.bbox is not None
        ]

    def handle_undefined_char(self, font, cid) -> str:
        """Return a replacement for a character id which has no mapping.

        `MAPPING` is consulted first; unknown ids are logged and
        converted with `chr`.
        """
        # TODO: CHECK AFTER UPGRADING PDFMINER
        # TODO: FIX PAGE NUMBER
        try:
            backup = MAPPING[cid]
        except KeyError:
            utilo.error(f'could not convert: {font!r}, {cid!r} '
                        f'on page: {self.pageno} no backup char defined')
            # use warning to log only once
            utilo.warning(str(vars(font)))
            return chr(cid)
        utilo.debug(f'could not convert: {font!r}, {cid!r} use backup: '
                    f'{backup} on page: {self.pageno}')
        return backup

    @property
    def resources(self):
        """Resource manager which was created in `__init__`."""
        return self.rsrcmgr
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def content_inside_single_figure(page) -> bool:
    """Some pdf printer write all page content to a single figure.

    If all content is in a single figure, no text extraction is possible.

    True only if the page holds exactly one object, this object is an
    `LTFigure` and the figure does not hold exactly one child.
    """
    children = page._objs  # pylint:disable=W0212
    if len(children) != 1:
        return False
    wrapper = children[0]
    # A figure with a single child is an image container, see master116
    # page,2,3. This works fine and must not be unwrapped.
    return (isinstance(wrapper, pdfminer.layout.LTFigure) and
            len(wrapper._objs) != 1)  # pylint:disable=W0212
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# REMOVE HACK LATER
# Backup characters for character ids which could not be converted, see
# `FlippedLayoutAnalyzer.handle_undefined_char`. Collected for the font
# UAZWCW+CMR10.
MAPPING = {
    0: '−',
    1: '·',
    # ligatures
    12: 'fi',
    13: 'fl',
    14: 'ffi',
    # Acute accent: -Fern´andez, ´Ecole may be supported later, for now
    # removing is a good match. For more infos see master110p106.
    # 19: '´',
    19: '',
    20: '≤',
    25: 'ß',
    # Umlaute, oe, ae, ue: use the already implemented replace to support
    # umlaute.
    127: '¨',
}
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def flip_object(item, page):
    """Mirror the y-coordinates of `item` at the height of `page`.

    The bounding and the `x0`/`y0`/`x1`/`y1` attributes of `item` are
    replaced in place, afterwards all children in `item._objs` are
    flipped recursively. Items without a `bbox` are left untouched.
    """
    try:
        box = list(item.bbox)
    except AttributeError:
        # VirtualChar for example
        return
    height = page.height
    lower, upper = box[1], box[3]
    box[1] = height - upper
    box[3] = height - lower
    box = utilo.roundme(box)  # pylint:disable=R0204
    try:
        item.bbox = iamraw.BoundingBox(*box)
    except AssertionError:
        # keep the previous bounding, only report the invalid one
        utilo.debug(f'invalid bounding on page {page}: {box}')
        utilo.debug(item)
    item.x0, item.y0, item.x1, item.y1 = box
    # Leaf objects have no `_objs`. NOTE(review): the suppress also hides
    # an AttributeError raised while flipping a child and stops the loop.
    with contextlib.suppress(AttributeError):
        for child in item._objs:  # pylint:disable=W0212
            flip_object(child, page)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class PageAggregator(FlippedLayoutAnalyzer):
    """Analyzer which remembers the page it has received last."""

    def __init__(self, laparams=None):
        super().__init__(laparams=laparams)
        # stays None until the first page was received
        self.result = None

    def receive_layout(self, ltpage):
        """Process `ltpage` like the parent and store it as result."""
        super().receive_layout(ltpage)
        self.result = ltpage

    def get_result(self):
        """Return the stored page, None if no page was received yet."""
        return self.result
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def figure_bounding(figure) -> tuple:
    """Bounding of some bad printed figures where too large, we strip
    this bounding to real content.

    Non-figures return their own `bbox`. For figures the boundings of all
    visible children are merged with `utilo.rect_max`; nested figures are
    handled recursively.

    Empty figures must return None
    >>> assert figure_bounding(pdfminer.layout.LTFigure('empty', (10, 10, 50, 50),
    ... (1, 1, 1, 1, 1, 1))) is None
    """
    if not isinstance(figure, pdfminer.layout.LTFigure):
        return figure.bbox
    shown = [child for child in figure if visible(child)]
    # The recursive call covers a figure inside a figure and returns the
    # plain `bbox` for every other child; None marks a hidden item.
    candidates = (figure_bounding(child) for child in shown)
    boundings = [bounding for bounding in candidates if bounding is not None]
    if not boundings:
        return None
    return utilo.rect_max(boundings)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def visible(item) -> bool:
    """Decide if `item` is painted.

    Items which miss one of the inspected attributes count as visible.
    """
    # TODO: INVESTIGATE THIS
    try:
        if item.linewidth:
            return True
        if item.fill:
            return not item.evenodd
        # neither stroked nor filled with a color
        return bool(item.stroking_color or item.non_stroking_color)
    except AttributeError:
        return True
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import configos
|
|
11
|
+
import pdfminer.converter
|
|
12
|
+
import pdfminer.image
|
|
13
|
+
import pdfminer.layout
|
|
14
|
+
import pdfminer.pdfdocument
|
|
15
|
+
import pdfminer.pdfinterp
|
|
16
|
+
import pdfminer.pdftypes
|
|
17
|
+
import pdfminer.psparser
|
|
18
|
+
import utilo
|
|
19
|
+
|
|
20
|
+
import rawmaker.converter.basic
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ImageConverter(rawmaker.converter.basic.FlippedLayoutAnalyzer):
    """Analyzer which hands the images of every page to `imagewriter`.

    `imagewriter` is called as ``imagewriter(pageid, image)``.
    """

    def __init__(self, imagewriter, firstpage: int):
        super().__init__(pageno=firstpage)
        assert callable(imagewriter), imagewriter
        self.imagewriter = imagewriter
        # TODO avoid duplicated parsed, check if we require this?
        self.parsed = utilo.Single()

    def receive_layout(self, ltpage):
        """Flip the page like the parent, then render its top level items."""
        super().receive_layout(ltpage)
        for child in ltpage:
            self.render_pagecontent(ltpage.pageid, child)

    def render_pagecontent(self, pageid, item):
        """Collect all imageable items"""
        if isinstance(item, pdfminer.layout.LTImage):
            self.render_result_image(item, pageid=pageid)
            return
        if isinstance(item, pdfminer.layout.LTFigure):
            self.render_figure(item, pageid=pageid)

    def render_result_image(
        self,
        image: pdfminer.layout.LTImage,
        pageid: int,
    ):
        """Pass `image` to the writer unless `self.parsed` contains it."""
        # add pageid to ensure that equal image names from different pages
        # are not handled as the same image.
        imagename = f'{pageid}_{image.name}'
        if not self.parsed.contains(imagename):
            self.imagewriter(pageid, image)

    def render_figure(
        self,
        item: pdfminer.layout.LTFigure,
        pageid: int,
    ):
        """Render the image which is stored inside the figure `item`."""
        # TODO: RENDER CURVES ETC.
        children = item._objs  # pylint:disable=W0212
        if len(children) == 1 and isinstance(children[0],
                                             pdfminer.layout.LTFigure):
            # image inside figure
            children = children[0]._objs  # pylint:disable=W0212
        found = [
            child for child in children
            if isinstance(child, pdfminer.layout.LTImage)
        ]
        if not found:
            return
        assert len(found) == 1, str(found)
        # TODO: Investigate with list
        image = found[0]
        if not skipme(image):
            self.render_result_image(image, pageid)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Threshold for `skipme`: images whose rate of NUL bytes reaches this value
# are skipped. NOTE(review): `skipme` computes a fraction in [0, 1] -
# confirm that the configured value (default=50.0) uses the same scale.
SKIPME_RATE_MIN = configos.HV_PERCENT_PLUS(default=50.0)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def skipme(image) -> bool:
    """\
    Master31Page10 Black/White image is printed under figure caption.

    Decide if `image` shall be skipped: the rate of NUL bytes inside the
    raw stream data is compared with `SKIPME_RATE_MIN`. An image without
    raw stream data (None or empty) is never skipped.
    """
    # TODO: INVESTIGATE THIS HACK
    stream_raw = image.stream.rawdata
    if not stream_raw:
        # nothing to measure, this also avoids a division by zero
        return False
    rate = stream_raw.count(b'\x00') / len(stream_raw)
    # NOTE(review): `rate` is a fraction in [0, 1]; confirm that
    # SKIPME_RATE_MIN uses the same scale.
    return bool(rate >= SKIPME_RATE_MIN)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class FastImageInterpreter(pdfminer.pdfinterp.PDFPageInterpreter):
    """Experimental, think about the sense of this ?optimization?.

    `execute` only runs the operators which are registered in
    `self.fast`; the text rendering hooks below do nothing.
    """

    # TODO: SEE DOCSTRING

    def __init__(self, rsrcmgr, device):
        super().__init__(rsrcmgr, device)
        # operator keyword -> bound `do_*` handler
        supported = (
            'CS', 'Do', 'EI', 'MP', 'Q', 'SC', 'SCN', 'cm', 'cs', 'sc', 'scn',
        )
        self.fast = {name: getattr(self, f'do_{name}') for name in supported}

    # pylint:disable=W0613,R0201
    def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
                    graphicstate):
        """Ignore characters."""
        # assert 0
        return None

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Ignore strings."""
        return None

    def do_TJ(self, seq):
        """Ignore the show text operator."""
        return None

    def execute(self, streams):  # pylint:disable=R1260
        """Run the content `streams`, skipping unregistered operators."""
        try:
            parser = pdfminer.pdfinterp.PDFContentParser(streams)
        except pdfminer.psparser.PSEOF:
            # empty page
            return
        while True:
            try:
                _, token = parser.nextobject()
            except pdfminer.psparser.PSEOF:
                break
            if not isinstance(token, pdfminer.psparser.PSKeyword):
                # operand: collect until an operator consumes it
                self.push(token)
                continue
            handler = self.fast.get(pdfminer.psparser.keyword_name(token))
            if handler is None:
                continue
            # number of operands, `self` is not an operand
            nargs = handler.__code__.co_argcount - 1
            # nargs = six.get_function_code(func).co_argcount - 1
            if not nargs:
                handler()
                continue
            operands = self.pop(nargs)
            if len(operands) == nargs:
                handler(*operands)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def create_fastimageextractor(imagelistener, firstpage: int):
    """Create a page interpreter which reports images to `imagelistener`.

    The interpreter renders into an `ImageConverter` and shares the
    resource manager of this converter.
    """
    converter = ImageConverter(imagewriter=imagelistener, firstpage=firstpage)
    # NOTE(review): despite the name the regular pdfminer interpreter is
    # created here, not `FastImageInterpreter` - confirm this is intended.
    return pdfminer.pdfinterp.PDFPageInterpreter(converter.resources, converter)
|
rawmaker/date.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""Date
|
|
10
|
+
====
|
|
11
|
+
|
|
12
|
+
See Adobe PDF 2008 REF 7.9.4.
|
|
13
|
+
|
|
14
|
+
Format: (D:YYYYMMDDHHmmSSOHH'mm)
|
|
15
|
+
YYYY: Year
|
|
16
|
+
MM: Month(01-12)
|
|
17
|
+
DD: Day(01-31)
|
|
18
|
+
HH: Hour(00-23)
|
|
19
|
+
mm: minute(00-59)
|
|
20
|
+
SS: second(00-59)
|
|
21
|
+
O: + or -
|
|
22
|
+
HH: offset in hours
|
|
23
|
+
'
|
|
24
|
+
mm: offset in minutes
|
|
25
|
+
|
|
26
|
+
Parse and generate raw data again.
|
|
27
|
+
|
|
28
|
+
>>> time = "D:20160419072554+02'00"
|
|
29
|
+
>>> parsed = parse(time)
|
|
30
|
+
|
|
31
|
+
Convert parsed back to raw string
|
|
32
|
+
|
|
33
|
+
>>> raw(parsed)
|
|
34
|
+
"D:20160419072554+02'00"
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
import dataclasses
|
|
38
|
+
import re
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclasses.dataclass
class PDFDate:
    """Parsed fields of a pdf date string, see `parse` and `raw`.

    `utc_hour` and `utc_minute` hold the offset to UTC; both are negative
    (or zero) if the date string uses the `-` sign.
    """
    year: int = None
    month: int = None
    day: int = None
    hour: int = None
    minute: int = None
    second: int = None
    utc_hour: int = None
    utc_minute: int = None


PATTERN = (r'D:(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})'
           r'(?P<hour>\d{2})(?P<minute>\d{2})(?P<second>\d{2})(?P<sign>[+-])'
           r'(?P<utc_hour>\d{2})\'(?P<utc_minute>\d{2})')


def raw(date: PDFDate) -> str:
    """Convert `date` back to the raw pdf date string.

    The sign is written once in front of the offset, the offset itself
    is written with absolute values.

    >>> raw(PDFDate(2016, 4, 19, 7, 25, 54, -2, -30))
    "D:20160419072554-02'30"
    """
    # the minute carries the sign for offsets below one hour (-00'30)
    negative = date.utc_hour < 0 or date.utc_minute < 0
    sign = '-' if negative else '+'
    return (f'D:{date.year:04d}{date.month:02d}{date.day:02d}'
            f'{date.hour:02d}{date.minute:02d}{date.second:02d}'
            f"{sign}{abs(date.utc_hour):02d}'{abs(date.utc_minute):02d}")


def parse(item: str) -> PDFDate:
    """Parse ASN.1 date pattern.

    Returns None if `item` does not start with a complete date. A `-`
    sign negates both offset fields, the local time stays untouched.

    >>> parse("D:20160419072554+02'00")
    PDFDate(year=2016, month=4, day=19, hour=7, minute=25, second=54, utc_hour=2, utc_minute=0)
    >>> parse("D:20160419072554-02'30").utc_hour
    -2
    """
    matched = re.match(PATTERN, item)
    if not matched:
        return None
    values = [
        'day', 'hour', 'minute', 'month', 'second', 'year', 'utc_hour',
        'utc_minute'
    ]
    data = {key: int(matched[key]) for key in values}
    result = PDFDate(**data)
    if matched['sign'] == '-':
        # The sign belongs to the offset. Negate both parts so that
        # `utc_hour * 60 + utc_minute` is the offset in minutes.
        result.utc_hour = -result.utc_hour
        result.utc_minute = -result.utc_minute
    return result
|
rawmaker/destination.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""Destination
|
|
10
|
+
===========
|
|
11
|
+
|
|
12
|
+
Hint: There is no direct link between `Annotation` and the real pdf page.
|
|
13
|
+
Therefore we have to extract the reference and link them with the
|
|
14
|
+
page-header to determine the real pdf page. See: `pageids` and
|
|
15
|
+
`solve_pageid.`
|
|
16
|
+
|
|
17
|
+
Types:
|
|
18
|
+
|
|
19
|
+
- Simple: Page Number is directly encoded
|
|
20
|
+
- Named: Solve `Destination` reference to determine page number
|
|
21
|
+
- Explicit: PDFPageReference is given
|
|
22
|
+
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import contextlib
|
|
26
|
+
import dataclasses
|
|
27
|
+
|
|
28
|
+
import pdfminer.pdfpage
|
|
29
|
+
import pdfminer.pdftypes
|
|
30
|
+
|
|
31
|
+
import rawmaker.utils
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DestinationMixin:
    """Common base class of all destination types of this module."""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclasses.dataclass
class ExplicitDestination(DestinationMixin):
    """Destination which is given by a page and optional view values."""
    # every field defaults to None
    page: int = None
    left: float = None
    top: float = None
    zoom: float = None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclasses.dataclass
class ExternalLinkDestination(DestinationMixin):
    """Hyperlink to external web resource.

    The value is taken from the `URI` entry of the action dictionary.

    See: 12.6.4.7 URI Actions; PDF 2008
    """
    hyperlink: str = None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclasses.dataclass
class NamedDestination(DestinationMixin):
    """Destination which is addressed by a human readable name."""
    reference: str = None

    @property
    def pdf_reference(self) -> bytes:
        """Convert human readable reference to pdf reference.

        The conversion is done by `rawmaker.utils.guess_encoding`.

        >>> NamedDestination('Kapitel 1').pdf_reference
        b'Kapitel 1'
        """
        return rawmaker.utils.guess_encoding(self.reference)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def parse(item) -> DestinationMixin:  # pylint:disable=R1260
    """\
    A `null` value means that parameter shall be unchanged.

    The resolved `item` is passed to the parsers below in order; the
    first result which is not empty is returned, None if all fail.

    # TODO: Change null later
    >>> parse([b'/null', 0.0, 0.0, 1.0]).page
    0
    """
    item = rawmaker.utils.resolve(item)
    parsers = (
        parse_hyperlink,
        parse_fitr,
        parse_simple,
        parse_explict,
        parse_named,
    )
    for parser in parsers:
        destination = parser(item)
        if destination:
            return destination
    return None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def parse_hyperlink(item) -> ExternalLinkDestination:
    """\
    Wrap the `URI` entry of an action dictionary, None for other input.

    >>> parse_hyperlink("{'S': /'URI', 'URI': b'http://www.helm.org/jst.pdf'}")
    """
    if not isinstance(item, dict) or 'URI' not in item:
        return None
    return ExternalLinkDestination(hyperlink=item['URI'])
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def parse_simple(item) -> NamedDestination:
    """Page number is directly encoded, therefore we can convert and
    return.

    Returns None if `item` is not a dictionary, if its `S` entry is
    missing or not `GoTo`, or if it has no `D` entry.

    >>> from pdfminer.psparser import PSLiteral as PS
    >>> parse_simple({'S': PS('GoTo'), 'D': b'FF'}).reference
    'FF'
    >>> parse_simple({'D': b'FF'}) is None
    True
    """
    if not isinstance(item, dict):
        return None
    action = item.get('S')
    # `S` is compared by its `name`; a plain string is compared directly
    if getattr(action, 'name', action) != 'GoTo':
        # 12.6.4.2 Go-To Actions
        return None
    try:
        reference = item['D']
    except KeyError:
        # go-to action without destination
        return None
    reference = rawmaker.utils.guess_decoding(reference)
    return NamedDestination(reference=reference)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def parse_fitr(item) -> ExplicitDestination:
    """\
    Parse a destination list of the kind `Fit`, `FitH`, `FitR` or `XYZ`.

    `item` is either the list itself or a dictionary which holds the list
    as `D`. Returns None for all other input, including lists with less
    than two entries and unknown fit types.

    >>> from pdfminer.psparser import PSLiteral as PS
    >>> parse_fitr({'S': 'GoTo', 'D': [5, PS('FitR'), 0, 625, 440, 309]}).page
    5
    >>> parse_fitr({'D': [None, 'FitH', 3512], 'S': 'GoTo'})
    ExplicitDestination(page=0, left=None, top=3512, zoom=None)
    >>> parse_fitr({'S': 'GoTo'}) is None
    True
    """
    item = rawmaker.utils.resolve(item)
    # {'S': /'GoTo', 'D': [0, /'FitR', 0, 625, 440, 309]}
    # KeyError: dictionary without `D`
    # TypeError: item is already the requested list
    with contextlib.suppress(KeyError, TypeError):
        item = item['D']
    # [0, /'FitR', 0, 625, 440, 309]
    if not isinstance(item, list) or len(item) < 2:
        return None
    kind = item[1]
    if isinstance(kind, float):
        # [/b'null', 0.0, 0.0, 1.0]
        return ExplicitDestination(page=int(kind))  # TODO: HACK?
    fit = ('Fit', 'FitH', 'FitR', 'XYZ')
    # the fit type is either a plain string or an object with a `name`
    if getattr(kind, 'name', kind) not in fit:
        return None
    pagenumber = item[0]
    # NOTE(review): the third entry is used as `top` for every fit type -
    # confirm this for `FitR` and `XYZ`.
    top = item[2] if len(item) >= 3 else None
    if pagenumber is None:
        # TODO: CHANGE TO UNCHANGED/NONE
        pagenumber = 0
    return ExplicitDestination(page=pagenumber, top=top)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def parse_explict(item) -> ExplicitDestination:
    """Parse a destination list with exactly five entries.

    The entries are read as page, fit type (ignored), left, top and zoom.
    Returns None for `bytes` and for input with another number of
    entries.
    """
    item = rawmaker.utils.resolve(item)
    try:
        item = item['D']
    except (KeyError, TypeError):
        # KeyError: ? add docs here ?
        # TypeError: item is already the requested list:
        # [34, /'XYZ', 72.4799999, 532.319999, 0]
        pass
    if isinstance(item, bytes):
        # {'S': /'GoTo', 'D': b'subsection.A.5.4'}
        return None
    try:
        page, _, left, top, zoom = item  # TODO: FLIP Y-Coordinate
    except ValueError:
        return None
    try:
        # null means: do not change current zoom
        unchanged = zoom.name == b'null'
    except AttributeError:
        # skip when zoom is already a float
        unchanged = False
    if unchanged:
        zoom = 0.0
    return ExplicitDestination(page=page, left=left, top=top, zoom=zoom)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def parse_named(item) -> ExplicitDestination:
    """Look up a named destination and parse the result.

    `item` must provide a `doc` attribute. The name is looked up with
    `doc.lookup_name('Dests', ...)`, the resolved result is parsed by
    `parse_explict`.
    """
    document = item.doc
    name = rawmaker.utils.resolve(item)
    try:
        # action dictionary: the name is stored as `D`
        name = name['D']
    except KeyError:
        pass
    found = document.lookup_name('Dests', name)
    return parse_explict(rawmaker.utils.resolve(found))
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def pageids(path: str) -> dict:
    """Map the `pageid` of every page of the pdf `path` to its index.

    The index starts with 0 and follows the order of
    `PDFPage.get_pages`.
    """
    with open(path, mode='rb') as pdf:
        pages = pdfminer.pdfpage.PDFPage.get_pages(
            pdf,
            check_extractable=False,
        )
        # consume the pages while the file is still open
        return {page.pageid: index for index, page in enumerate(pages)}
|
rawmaker/error.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
|
|
10
|
+
from pdfminer.pdfdocument import PDFNoOutlines
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RawMakerError(Exception):
    """Base class of all user errors and input errors.

    The command line tool handles exceptions of this type and reports
    them as clear error messages instead of backtraces.
    """
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TextExtractionNotAllowed(RawMakerError):
    """User error; presumably the pdf forbids text extraction."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class InvalidPDF(RawMakerError):
    """User error; presumably the input is not a valid pdf."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class PDFParserImplementationError(RawMakerError):
    """Error type for failures inside the pdf parser implementation.

    NOTE(review): the raise sites are not part of this module - confirm
    the exact meaning there.
    """
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class MissingOutlines(RawMakerError, PDFNoOutlines):
    """Outline error which is also a pdfminer `PDFNoOutlines`.

    Handlers for `RawMakerError` and handlers for `PDFNoOutlines` both
    catch this exception.
    """
|