rawmaker 2.40.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- letty/__init__.py +46 -0
- letty/cli.py +63 -0
- letty/optimizer.py +138 -0
- letty/quality/__init__.py +8 -0
- letty/quality/whitespace.py +50 -0
- letty/strategy.py +8 -0
- rawmaker/__init__.py +29 -0
- rawmaker/__main__.py +13 -0
- rawmaker/__patch__.py +36 -0
- rawmaker/cli.py +206 -0
- rawmaker/cli_automate.py +69 -0
- rawmaker/converter/__init__.py +8 -0
- rawmaker/converter/basic.py +174 -0
- rawmaker/converter/images.py +168 -0
- rawmaker/date.py +83 -0
- rawmaker/destination.py +202 -0
- rawmaker/error.py +34 -0
- rawmaker/features/__init__.py +138 -0
- rawmaker/features/annotation.py +254 -0
- rawmaker/features/border.py +172 -0
- rawmaker/features/boxes.py +153 -0
- rawmaker/features/figures.py +24 -0
- rawmaker/features/fonts.py +229 -0
- rawmaker/features/formula.py +16 -0
- rawmaker/features/horizontals.py +132 -0
- rawmaker/features/images.py +155 -0
- rawmaker/features/line.py +337 -0
- rawmaker/features/outlines.py +123 -0
- rawmaker/features/text.py +91 -0
- rawmaker/fonts/__init__.py +8 -0
- rawmaker/fonts/parser.py +354 -0
- rawmaker/images/__init__.py +8 -0
- rawmaker/images/info.py +35 -0
- rawmaker/miner/__init__.py +8 -0
- rawmaker/miner/char.py +42 -0
- rawmaker/miner/colorspace.py +75 -0
- rawmaker/miner/images.py +448 -0
- rawmaker/miner/position.py +121 -0
- rawmaker/miner/rawchar.py +207 -0
- rawmaker/miner/text.py +833 -0
- rawmaker/miner/underline.py +66 -0
- rawmaker/parameter.py +130 -0
- rawmaker/patch/__init__.py +8 -0
- rawmaker/patch/ltchar.py +79 -0
- rawmaker/reader.py +97 -0
- rawmaker/text/__init__.py +8 -0
- rawmaker/text/chars.py +24 -0
- rawmaker/text/data.py +47 -0
- rawmaker/text/superfast.py +91 -0
- rawmaker/text/wordbox.py +95 -0
- rawmaker/utils.py +44 -0
- rawmaker-2.40.3.dist-info/METADATA +51 -0
- rawmaker-2.40.3.dist-info/RECORD +63 -0
- rawmaker-2.40.3.dist-info/WHEEL +5 -0
- rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
- rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
- rawmaker-2.40.3.dist-info/top_level.txt +3 -0
- spacestation/__init__.py +18 -0
- spacestation/cli.py +51 -0
- spacestation/features/__init__.py +8 -0
- spacestation/features/chardist.py +85 -0
- spacestation/features/worddist.py +57 -0
- spacestation/features/wspace.py +130 -0
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
|
|
10
|
+
import collections
|
|
11
|
+
|
|
12
|
+
import iamraw
|
|
13
|
+
import pdfminer.converter
|
|
14
|
+
import pdfminer.layout
|
|
15
|
+
import pdfminer.pdfdocument
|
|
16
|
+
import pdfminer.pdfinterp
|
|
17
|
+
import pdfminer.pdfpage
|
|
18
|
+
import utilo
|
|
19
|
+
|
|
20
|
+
import rawmaker.converter.basic
|
|
21
|
+
import rawmaker.miner.text
|
|
22
|
+
import rawmaker.parameter
|
|
23
|
+
|
|
24
|
+
# Result of processing one page: `content` holds what the device returned for
# the page, `page` holds the page index it was produced for.
PageContent = collections.namedtuple('PageContent', 'content, page')
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def create_interpreter(
    layout=None,
) -> tuple[
        pdfminer.pdfinterp.PDFPageInterpreter,
        rawmaker.converter.basic.PageAggregator,
]:
    """Create a page interpreter together with the device it writes to.

    Args:
        layout: layout parameters handed to the device as `laparams`. Any
                falsy value selects the result of
                `rawmaker.parameter.ParsingConfiguration().laparams()`.
    Returns:
        tuple(interpreter, device): the `PDFPageInterpreter` and the
        `PageAggregator` device which collects the interpreted pages.
    """
    if not layout:
        layout = rawmaker.parameter.ParsingConfiguration().laparams()
    device = rawmaker.converter.basic.PageAggregator(laparams=layout)
    # NOTE(review): `device.resources` is passed where `PDFPageInterpreter`
    # expects its resource manager - presumably the device owns one; confirm.
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(
        device.resources,
        device,
    )
    return interpreter, device
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def process_pdfpages(
    document: pdfminer.pdfdocument.PDFDocument,
    pages: tuple = None,
) -> pdfminer.pdfpage.PDFPage:
    """Generate every selected `PDFPage` of a `PDFDocument`.

    The function is a generator. Every yielded page gets its zero based
    index assigned to `page.pageid` before it is handed out.

    Args:
        document: open pdf file
        pages: page selection handed to `utilo.SkipCollector`; with None
               every page is expected to be processed
    Yields:
        tuple(page, number): the `PDFPage` and its zero based index in the
        document
    """
    utilo.call('process_pdfpages')
    utilo.asserts(document, pdfminer.pdfdocument.PDFDocument)
    with utilo.SkipCollector(pages) as collector:
        numbered = enumerate(pdfminer.pdfpage.PDFPage.create_pages(document))
        for number, page in numbered:
            if not collector.skip(number):
                # remember the document wide index on the page itself
                page.pageid = number
                yield (page, number)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def process_document(
    document: pdfminer.pdfdocument.PDFDocument,
    layout=None,
    pages=None,
) -> tuple[int, pdfminer.layout.LTPage]:
    """Interpret every selected page of `PDFDocument`.

    Args:
        document: open pdf file
        layout: layout parameters, forwarded to `create_interpreter`
        pages: page selection, forwarded to `process_pdfpages`
    Yields:
        tuple(pdfpage, pagecontent): the page as produced by
        `process_pdfpages` and a `PageContent` which carries the device
        result of this page and the page number.

    NOTE(review): the return annotation does not describe the yielded
    pairs; it is left untouched to keep the signature stable.
    """
    assert isinstance(
        document,
        pdfminer.pdfdocument.PDFDocument,
    ), type(document)
    interpreter, device = create_interpreter(layout=layout)
    for pdfpage, number in process_pdfpages(document, pages=pages):
        interpreter.process_page(pdfpage)
        # the device holds the result of the page interpreted last
        interpreted = device.get_result()
        yield (pdfpage, PageContent(content=interpreted, page=number))
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def process_pagecontent(
    document: pdfminer.pdfdocument.PDFDocument,
    layout=None,
    pages=None,
) -> pdfminer.layout.LTPage:
    """Yield the `PageContent` of every selected page of `PDFDocument`.

    Thin wrapper around `process_document` which drops the pdf page and
    hands out the second item of every yielded pair only.

    Args:
        document: open pdf file
        layout: layout parameters, forwarded to `process_document`
        pages: page selection, forwarded to `process_document`
    Yields:
        the `PageContent` produced by `process_document`
    """
    utilo.asserts(document, pdfminer.pdfdocument.PDFDocument)
    processed = process_document(document, layout=layout, pages=pages)
    for _pdfpage, pagecontent in processed:
        yield pagecontent
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def page_selection(document: iamraw.Document, pages: tuple):
    """Determine the page numbers which belong to the pages of `document`.

    Args:
        document: parsed document
        pages: selected pages as list or tuple, or a falsy value
    Returns:
        `pages` itself if a selection is given, otherwise the list
        `0 .. len(document.pages) - 1`.
    """
    assert isinstance(document, iamraw.Document), type(document)
    if not pages:
        # without a selection (None or empty) every page was processed
        return list(range(len(document.pages)))
    assert isinstance(pages, (list, tuple)), '%s %s' % (pages, type(pages))  # pylint:disable=C0209
    return pages
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def extract_content(
    document: pdfminer.pdfdocument.PDFDocument,
    config: rawmaker.parameter.ParsingConfiguration = None,
    converter=rawmaker.miner.text.PrecisePDFConverter,
    pages: tuple = None,
) -> iamraw.Document:
    """Extract the content of a pdf file.

    Args:
        document(PDFDocument): pdf file to process
        config(ParsingConfiguration): parametrization of the layout
                                      analysis; a default configuration
                                      is created if None is passed.
        converter: factory of the device which collects the interpreted
                   pages; it is called with `config` as keyword.
        pages: tuple of selected pages
    Returns:
        Document: the document returned by `device.finish_document()`
        with the page number of every page updated.
    """
    if config is None:
        config = rawmaker.parameter.ParsingConfiguration()
    utilo.asserts(config, rawmaker.parameter.ParsingConfiguration)
    # prepare the device and the interpreter feeding it
    device = converter(config=config)
    device.new_document()
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(device.rsrcmgr, device)
    # interpret the selected pages
    with utilo.SkipCollector(pages) as collector:
        pdfpages = pdfminer.pdfpage.PDFPage.create_pages(document)
        for index, pdfpage in enumerate(pdfpages):
            if not collector.skip(index):
                interpreter.process_page(pdfpage)
    parsed = device.finish_document()
    # upgrade page number
    numbering = page_selection(parsed, pages)
    # TODO: REPLACE PAGE WITH ENDLESS ITER AND CHANGE ZIP TO ZIP_LONGEST
    # NOTE(review): `zip` pairs the pages in processing order with the
    # selection in caller order and stops at the shorter one - presumably
    # the selection is expected to be sorted ascending; confirm.
    for parsed_page, pagenumber in zip(parsed.pages, numbering):
        parsed_page.page = pagenumber
    return parsed
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""Add parser to parse non annotated links to inform user about
|
|
10
|
+
broken/malformed links."""
|
|
11
|
+
|
|
12
|
+
import iamraw
|
|
13
|
+
import pdfminer.pdfdocument
|
|
14
|
+
import serializeraw
|
|
15
|
+
import utilo
|
|
16
|
+
|
|
17
|
+
import rawmaker.features
|
|
18
|
+
import rawmaker.reader
|
|
19
|
+
import rawmaker.utils
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def work(document: str, pages=None) -> str:
    """Extract the annotations of a pdf file and serialize them.

    Args:
        document: pdf file, handed to `rawmaker.reader.read`
        pages: page selection, forwarded to `extract_annotations`
    Returns:
        the result of `serializeraw.dump_annotations`
    """
    assert isinstance(document, str), str(document)
    with rawmaker.reader.read(document) as pdf:
        serialized = serializeraw.dump_annotations(
            extract_annotations(pdf, pages=pages))
    return serialized
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def extract_annotations(
    document: pdfminer.pdfdocument.PDFDocument,
    pages=None,
) -> iamraw.PageAnnotations:
    """Parse the annotations of every selected page of `document`.

    Args:
        document(PDFDocument): loaded document
        pages: page selection, forwarded to
               `rawmaker.features.process_pdfpages`
    Returns:
        list with one parsed `PageAnnotation` per page; pages which have
        neither hyperlinks nor pagelinks are left out.
    """
    selected = rawmaker.features.process_pdfpages(document, pages=pages)
    # lazy: every page is parsed right before it is filtered
    candidates = (
        parse_page(page, pagenumber=number) for page, number in selected)
    return [
        item for item in candidates if item.hyperlinks or item.pagelinks
    ]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Name of the `Type` entry which `parse_label` and `parse_appereance_strean`
# require before they treat a pdf object as annotation.
ANNOTATION_LABEL = 'Annot'
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def parse_page(
    page: pdfminer.pdfpage.PDFPage,
    pagenumber: int,
) -> iamraw.PageAnnotation:
    """Parse the annotations of a `PDFPage`.

    Args:
        page(PDFPage): pdf page to parse the annotations of
        pagenumber(int): number of the selected page
    Returns:
        `PageAnnotation` with the page links and hyperlinks of the page.
        Both are None if the page does not carry any annotation.

    There are 2 different types of annotation, the internal and external
    links:
    * The internal links, better called page links, refer to a chapter or
      a location in the document.
    * The external links, so called hyperlinks, refer to clickable
      weblinks.

    Examples of an internal reference:
        {'A': {'S': /'GoTo', 'D': b'subsection.1.30.7'}}
        {'S': /'GoTo', 'D': b'chapter*.1'}
    """
    annotations = page.annots
    if not annotations:
        return iamraw.PageAnnotation(None, None, pagenumber)
    getobj = page.doc.getobj
    if not isinstance(annotations, list):
        # WORKAROUND: THIS IS A FIX WHEN PAGE ANNOTATIONS ARE NESTED IN A
        # SINGLE REFERENCE, DON'T KNOW WHY THIS CAN HAPPEN. TODO:
        # INVESTIGATE LATER
        annotations = list(getobj(page.annots.objid))
    pagelinks, hyperlinks = [], []
    for entry in annotations:
        # an entry is either an already resolved dictionary or a reference
        pageobject = entry if isinstance(entry, dict) else getobj(entry.objid)
        # internal links are tried first, the appearance stream second
        internal = (parse_reference(pageobject, getobj) or
                    parse_appereance_strean(pageobject))
        if internal:
            pagelinks.append(internal)
        elif external := parse_external(pageobject, getobj):
            hyperlinks.append(external)
        else:
            utilo.error(f'Unhandeld annotation {pageobject}')
    # flip boundings: mirror the y values against the upper page border
    pageheight = float(page.mediabox[3])
    for link in (*pagelinks, *hyperlinks):
        link.bounds = flip_bounding(link.bounds, pageheight)
    return iamraw.PageAnnotation(pagelinks, hyperlinks, page=pagenumber)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def flip_bounding(box, pageheight):
    """Mirror the y values of `box` against `pageheight`.

    Args:
        box: indexable with four entries (x0, y0, x1, y1)
        pageheight: value the y entries are subtracted from
    Returns:
        BoundingBox(x0, pageheight - y1, x1, pageheight - y0)
    """
    left, right = box[0], box[2]
    # the two y values change their role while mirroring
    first_y = pageheight - box[3]
    second_y = pageheight - box[1]
    return iamraw.BoundingBox(left, first_y, right, second_y)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def parse_reference(pageobject, getobj=None) -> iamraw.PageLink:
    """Parse an internal link out of `pageobject`.

    `parse_link` is tried first, `parse_label` second.

    Args:
        pageobject: resolved pdf annotation object
        getobj: callable to resolve an object id, forwarded to
                `parse_label`
    Returns:
        the first truthy result of both parsers or None.
    """
    return parse_link(pageobject) or parse_label(pageobject, getobj) or None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def parse_label(pageobject, getobj=None) -> iamraw.PageLink:
    """Parse a page link out of the action (`A`) of an annotation.

    Args:
        pageobject: resolved pdf annotation object
        getobj: callable to resolve an object id; required if the `A`
                entry is a `PDFObjRef`
    Returns:
        `PageLink` built out of the `Rect` of the annotation and the `D`
        entry of the action. None if `Type` is missing or differs from
        `ANNOTATION_LABEL`, or if `A` or `D` is missing.
    """
    try:
        if pageobject['Type'].name != ANNOTATION_LABEL:
            return None
        action = pageobject['A']
    except KeyError:
        return None
    if isinstance(action, pdfminer.pdftypes.PDFObjRef):
        # TODO: add layer to automatically convert reference to object.
        action = getobj(action.objid)
    try:
        destination = action['D']
    except KeyError:
        return None
    return iamraw.PageLink(
        bounds=determine_bounding(pageobject['Rect']),
        goal=parse_pagelink(destination),
    )
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def parse_link(pageobject) -> iamraw.PageLink:
    """Parse a page link out of a `Link` annotation with a `Dest` entry.

    Args:
        pageobject: resolved pdf annotation object
    Returns:
        `PageLink` built out of `Rect` and `Dest`. None if `Subtype` is
        missing or not `Link`, or if there is no `Dest` entry.
    """
    # TODO: don't know what this element means
    # {'Type': /'Annot', 'Border': [0, 0, 0], 'H': /'I', 'C': [0,
    # 0.5, 0.5], 'Rect': [348.517, 428.927, 431.794, 439.831],
    # 'Subtype': /'Link', 'A': {'F': b'distributions.pdf', 'S':
    # /'GoToR', 'D': [0, /'Fit']}} [0, /'Fit']
    try:
        if pageobject['Subtype'].name != 'Link':
            return None
        destination = pageobject['Dest']
    except KeyError:
        return None
    return iamraw.PageLink(
        bounds=determine_bounding(pageobject['Rect']),
        goal=parse_pagelink(destination),
    )
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def parse_appereance_strean(pageobject) -> iamraw.PageLink:
    """Detect an annotation which carries an appearance stream.

    See 12.5.5 Appearance Streams of the pdf reference.

    Args:
        pageobject: resolved pdf annotation object
    Returns:
        `PageLink` with the `Rect` of the annotation and the constant
        goal 'STREAM'. None if `Type` is missing or differs from
        `ANNOTATION_LABEL`, or if `AP` or its `N` entry is missing.
    """
    # TODO: IMPROVE
    try:
        if pageobject['Type'].name != ANNOTATION_LABEL:
            return None
        # the stream is only probed for existence, its content is not used
        appearance = rawmaker.utils.resolve(pageobject['AP'])
        rawmaker.utils.resolve(appearance['N'])
    except KeyError:
        return None
    return iamraw.PageLink(
        bounds=determine_bounding(pageobject['Rect']),
        goal='STREAM',
    )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def parse_external(pageobject, getobj=None) -> iamraw.HyperLink:
    """Parse an external link out of the action (`A`) of an annotation.

    Examples of supported actions:
        {'S': /'URI', 'URI': b'http://www.europarl.europa.eu/factsheets/de/sheet/92/allgemeine-steuerpolitik'}
        {'F': b'/C/Users/user/Downloads/MEMO-16-2265_DE.pdf', 'S': /'Launch'}

    Args:
        pageobject: resolved pdf annotation object
        getobj: callable to resolve an object id; required if the `A`
                entry is a `PDFObjRef`
    Returns:
        `HyperLink` with the decoded `URI` entry, or the decoded `F` entry
        if there is no `URI`. None if `A` is missing or has neither key.
    """
    try:
        action = pageobject['A']
    except KeyError:
        return None
    if isinstance(action, pdfminer.pdftypes.PDFObjRef):
        # TODO: add layer to automatically convert reference to object.
        action = getobj(action.objid)
    bounds = determine_bounding(pageobject['Rect'])
    # `URI` wins over `F` if both are present
    for key in ('URI', 'F'):
        if key in action:
            goal = hyperlink_decode(action[key])
            return iamraw.HyperLink(bounds=bounds, goal=goal)
    return None
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def hyperlink_decode(text: bytes) -> str:
    r"""Decode the raw bytes of a hyperlink.

    Args:
        text(bytes): 7-bit ASCII, see 12.6.4.7
    Returns:
        the result of `rawmaker.utils.guess_decoding`; an error is logged
        if this result is None.

    Hint: the input is not always valid ASCII, maybe caused by a badly
    programmed printer.
    TODO: MAY A PDFMINER CONVERTION ERROR?
    TODO: VERIFY LINUX/WIN DUE CP1252

    >>> hyperlink_decode(b'http://road.cc/measure-\x96-smart-street')
    'http://road.cc/measure-–-smart-street'
    """
    decoded = rawmaker.utils.guess_decoding(text)
    if decoded is None:
        utilo.error(f'annotation: could not decode: {text}')
    return decoded
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def parse_pagelink(pagelink):
    r"""Convert the destination of a page link into printable data.

    Args:
        pagelink: destination of a link: bytes, a list or any other value
    Returns:
        * bytes: the result of `rawmaker.utils.guess_decoding`; if this
          result is falsy, `str(pagelink)` is returned.
        * list: a new list with every item converted by `str`. A leading
          `PDFObjRef` is replaced by 'objid: <objid>'. The passed list is
          not modified.
        * any other value: `str(pagelink)`

    >>> parse_pagelink(b'glo:glos:Glas\xfcbergangstemperatur')
    'glo:glos:Glasübergangstemperatur'
    >>> parse_pagelink([])
    []
    """
    if isinstance(pagelink, bytes):
        decoded = rawmaker.utils.guess_decoding(pagelink)
        if decoded:
            return decoded
        # could not decode, fall back to the representation of the bytes
        return str(pagelink)
    if isinstance(pagelink, list):
        # Work on a copy: the list is part of the parsed pdf object and
        # writing into it would change the document for later readers.
        items = list(pagelink)
        # an empty destination has no first item to inspect
        if items and isinstance(items[0], pdfminer.pdftypes.PDFObjRef):
            # internal link to pdf page, expose the objid
            items[0] = f'objid: {items[0].objid}'
        return [str(item) for item in items]
    return str(pagelink)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def determine_bounding(bounding):
    """Convert the `Rect` of an annotation into a `BoundingBox`.

    Args:
        bounding: raw rectangle of a pdf object
    Returns:
        `iamraw.BoundingBox` created out of the rectangle after it passed
        `utilo.rect_ensure_bounding`.
    """
    return iamraw.BoundingBox.from_list(utilo.rect_ensure_bounding(bounding))
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""The `border` feature lets you detect the pdf page size (in PDF units)
|
|
10
|
+
and locate the cropped box around the content.
|
|
11
|
+
|
|
12
|
+
Features:
|
|
13
|
+
* page size
|
|
14
|
+
* content size
|
|
15
|
+
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import collections
|
|
19
|
+
import contextlib
|
|
20
|
+
|
|
21
|
+
import iamraw
|
|
22
|
+
import pdfminer.pdfdocument
|
|
23
|
+
import serializeraw
|
|
24
|
+
import utilo
|
|
25
|
+
|
|
26
|
+
import rawmaker.features
|
|
27
|
+
import rawmaker.reader
|
|
28
|
+
|
|
29
|
+
# Entry of the list returned by `pagesizes`: the size of a page together with
# the number of this page.
PagePageSize = collections.namedtuple('PagePageSize', 'size page')
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def work(document: str, pages: tuple = None) -> tuple[str, str]:
    """Extract page sizes, borders and bounding boxes of `document`.

    Args:
        document: path to document to run parsing
        pages: tuple of processed pages
    Returns:
        tuple(pages, boxes): the serialized page sizes with their borders
        and the serialized bounding boxes of the page content.
    """
    assert isinstance(document, str), str(document)
    with rawmaker.reader.read(document) as pdf:
        sizeandborders, boxes = determine_boundingboxes(pdf, pages=pages)

    dumped_borders = serializeraw.dump_pageborders(sizeandborders)
    dumped_boxes = serializeraw.dump_boundingboxes(boxes)
    return dumped_borders, dumped_boxes
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def determine_boundingboxes(
    document: pdfminer.pdfdocument.PDFDocument,
    pages: tuple = None,
) -> iamraw.PageBoundingsList:
    """Extract page size, border and bounding boxes from `PDFDocument`.

    Args:
        document(PDFDocument): loaded document
        pages: tuple of processed pages
    Returns:
        tuple(sizeborders, boxes):
        * sizeborders: one `PageSizeBorder` per processed page with the
          page size and the border around the page content.
        * boxes: one `PageBoundings` per processed page with the bounding
          boxes of the items of this page. The ids of the items continue
          over the pages.
    """
    sizeborders, boxes = [], []
    # id of the first item of the current page
    firstid = 0
    for page, pagecontent in rawmaker.features.process_document(document, pages=pages):  # yapf:disable
        items, pagenumber = pagecontent.content, pagecontent.page
        size = pagesize_from_page(page)

        boundings = boundingboxes_from_page(items, firstid)
        boxes.append(iamraw.PageBoundings(boundings=boundings, page=pagenumber))

        # the next page continues counting after the items of this page
        firstid += len(items)
        sizeborder = iamraw.PageSizeBorder(
            size=size,
            border=cropborder_from_page(items),
            page=pagenumber,
        )
        sizeborders.append(sizeborder)
    return sizeborders, boxes
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def pagesizes(
    pdf: pdfminer.pdfdocument.PDFDocument,
    pages: tuple = None,
) -> list[iamraw.PageSize]:
    """Extract page sizes of `PDFDocument`.

    Only the pdf pages themselves are required to determine the page
    size. Therefore the pages are iterated with
    `rawmaker.features.process_pdfpages`, which yields the same pages and
    page numbers as `process_document` without interpreting the content
    of every page.

    Args:
        pdf(PDFDocument): loaded pdf document
        pages: tuple of processed pages
    Returns:
        List with one `PagePageSize(size, page)` per selected page.
    """
    selected = rawmaker.features.process_pdfpages(pdf, pages=pages)
    return [
        PagePageSize(size=pagesize_from_page(page), page=pagenumber)
        for page, pagenumber in selected
    ]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def boundingboxes_from_page(content: list, contentid: int) -> tuple:
    """Collect the bounding box of every item of a page.

    Args:
        content: items of a single page, every item provides `bbox`
        contentid: id assigned to the first item of this page
    Returns:
        list of `(id, bbox)` pairs; the ids count up from `contentid` in
        the order of `content`.
    """
    return [
        (index, item.bbox)
        for index, item in enumerate(content, start=contentid)
    ]
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def pagesize_from_page(page: pdfminer.pdfdocument.PDFDocument) -> iamraw.PageSize:  # yapf:disable
    """Determine the size of a pdf page out of its media box.

    Args:
        page: pdf page which provides `mediabox` and `rotate`.
              NOTE(review): the annotation names `PDFDocument`, but the
              callers in this module pass the pages yielded by
              `rawmaker.features.process_document` - confirm and adjust.
    Returns:
        `PageSize` with rounded width and height; width and height are
        swapped for pages rotated by 90 or 270 degrees.
    """
    # The media box is a rectangle (x0, y0, x1, y1) and not
    # (x, y, width, height): the size is the distance of the corners. For
    # the common origin (0, 0) this equals the upper right corner.
    box = page.mediabox
    pagewidth = utilo.roundme(abs(box[2] - box[0]))
    pageheight = utilo.roundme(abs(box[3] - box[1]))

    if page.rotate in {90, 270}:
        # rotated page, flip page size
        pagewidth, pageheight = pageheight, pagewidth
    return iamraw.PageSize(width=pagewidth, height=pageheight)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def cropborder_from_page(content) -> iamraw.Border:
    """Determine the border which encloses all page items, except text
    which consists of white space only.

    Args:
        content: items of a single page, every item provides `bbox`
    Returns:
        `Border` built out of the smallest and largest x and y values of
        the remaining items. Every field is None if no item remains.

    >>> cropborder_from_page([
    ...     pdfminer.layout.LTLine(linewidth=1.0, p0=(50.520,78.540) , p1=(106.200,78.540)),
    ...     pdfminer.layout.LTLine(linewidth=1.0, p0=(107.160,78.540), p1=(122.220,78.540)),
    ... ])
    Border(left=50.52, right=122.22, top=78.54, bottom=78.54)
    """

    def is_relevant(item) -> bool:
        # items which raise AttributeError here (e.g. no `get_text`) are
        # always kept
        try:
            return bool(item.get_text().strip())
        except AttributeError:
            return True

    relevant = [item for item in content if is_relevant(item)]
    if not relevant:
        return iamraw.Border(None, None, None, None)

    # bbox is ordered x0, y0, x1, y1
    lowest_x = min(item.bbox[0] for item in relevant)
    lowest_y = min(item.bbox[1] for item in relevant)
    highest_x = max(item.bbox[2] for item in relevant)
    highest_y = max(item.bbox[3] for item in relevant)
    x0, y0, x1, y1 = utilo.roundme((lowest_x, lowest_y, highest_x, highest_y))
    assert x0 <= x1, f'{x0} <= {x1}'
    assert y0 <= y1, f'{y0} <= {y1}'
    # the border is named left, right, top, bottom
    return iamraw.Border(left=x0, right=x1, top=y0, bottom=y1)
|