rawmaker 2.40.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- letty/__init__.py +46 -0
- letty/cli.py +63 -0
- letty/optimizer.py +138 -0
- letty/quality/__init__.py +8 -0
- letty/quality/whitespace.py +50 -0
- letty/strategy.py +8 -0
- rawmaker/__init__.py +29 -0
- rawmaker/__main__.py +13 -0
- rawmaker/__patch__.py +36 -0
- rawmaker/cli.py +206 -0
- rawmaker/cli_automate.py +69 -0
- rawmaker/converter/__init__.py +8 -0
- rawmaker/converter/basic.py +174 -0
- rawmaker/converter/images.py +168 -0
- rawmaker/date.py +83 -0
- rawmaker/destination.py +202 -0
- rawmaker/error.py +34 -0
- rawmaker/features/__init__.py +138 -0
- rawmaker/features/annotation.py +254 -0
- rawmaker/features/border.py +172 -0
- rawmaker/features/boxes.py +153 -0
- rawmaker/features/figures.py +24 -0
- rawmaker/features/fonts.py +229 -0
- rawmaker/features/formula.py +16 -0
- rawmaker/features/horizontals.py +132 -0
- rawmaker/features/images.py +155 -0
- rawmaker/features/line.py +337 -0
- rawmaker/features/outlines.py +123 -0
- rawmaker/features/text.py +91 -0
- rawmaker/fonts/__init__.py +8 -0
- rawmaker/fonts/parser.py +354 -0
- rawmaker/images/__init__.py +8 -0
- rawmaker/images/info.py +35 -0
- rawmaker/miner/__init__.py +8 -0
- rawmaker/miner/char.py +42 -0
- rawmaker/miner/colorspace.py +75 -0
- rawmaker/miner/images.py +448 -0
- rawmaker/miner/position.py +121 -0
- rawmaker/miner/rawchar.py +207 -0
- rawmaker/miner/text.py +833 -0
- rawmaker/miner/underline.py +66 -0
- rawmaker/parameter.py +130 -0
- rawmaker/patch/__init__.py +8 -0
- rawmaker/patch/ltchar.py +79 -0
- rawmaker/reader.py +97 -0
- rawmaker/text/__init__.py +8 -0
- rawmaker/text/chars.py +24 -0
- rawmaker/text/data.py +47 -0
- rawmaker/text/superfast.py +91 -0
- rawmaker/text/wordbox.py +95 -0
- rawmaker/utils.py +44 -0
- rawmaker-2.40.3.dist-info/METADATA +51 -0
- rawmaker-2.40.3.dist-info/RECORD +63 -0
- rawmaker-2.40.3.dist-info/WHEEL +5 -0
- rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
- rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
- rawmaker-2.40.3.dist-info/top_level.txt +3 -0
- spacestation/__init__.py +18 -0
- spacestation/cli.py +51 -0
- spacestation/features/__init__.py +8 -0
- spacestation/features/chardist.py +85 -0
- spacestation/features/worddist.py +57 -0
- spacestation/features/wspace.py +130 -0
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""ImageExtractor
|
|
10
|
+
|
|
11
|
+
The ImageExtractor provides the possibility to extract all images out of
|
|
12
|
+
a pdf file.
|
|
13
|
+
|
|
14
|
+
Supported formats:
|
|
15
|
+
- png?
|
|
16
|
+
- jpg?
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import collections
|
|
20
|
+
import io
|
|
21
|
+
import os
|
|
22
|
+
|
|
23
|
+
import PIL.Image
|
|
24
|
+
import serializeraw
|
|
25
|
+
import ughost
|
|
26
|
+
import utilo
|
|
27
|
+
|
|
28
|
+
import rawmaker
|
|
29
|
+
import rawmaker.images.info
|
|
30
|
+
import rawmaker.miner.images
|
|
31
|
+
import rawmaker.reader
|
|
32
|
+
|
|
33
|
+
# Images of a single page: `content` holds the per-image entries and `page`
# the page number they belong to.
PageContentImages = collections.namedtuple('PageContentImages', 'content, page')
PageContentImagesList = list[PageContentImages]

# Return type announced by `work`.
# NOTE(review): `work` appends `(info, (rawimage, ext))` entries, which does
# not match `tuple[str, bytes]` - confirm and adjust the alias.
DumpedImageInformations = list[tuple[str, bytes]]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def work(document: str, pages: tuple = None) -> DumpedImageInformations:
    """Extract, beautify and serialize the images of `document`.

    Args:
        document: path to the pdf document
        pages: optional page selection forwarded to the extraction
    Returns:
        Flat list with one `(dumped_info, (rawimage, ext))` entry per image.
    """
    beautified = beautify_images(
        extract_images(document, pages=pages),
        document,
    )
    return [
        (serializeraw.dump_image_info(info), (rawimage, ext))
        for page in beautified
        for info, (rawimage, ext) in page.content
    ]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def extract_images(
    document: str,
    outputfolder: str = None,
    pages=None,
) -> PageContentImagesList:
    """Extract the images of `document` into `outputfolder` and load them.

    Args:
        document: path to the pdf document
        outputfolder: folder where the miner writes the image files; a
                      temporary location is requested if not given
        pages: optional page selection
    Returns:
        List of `PageContentImages`, see `convert_images`.
    """
    # TODO: REPLACE AFTER UPGRADING utilo
    target = utilo.tmpfile(rawmaker.ROOT) if outputfolder is None else outputfolder
    with rawmaker.reader.read(document) as loaded:
        extracted = rawmaker.miner.images.extract_images(
            loaded,
            outputfolder=target,
            pages=pages,
        )
    return convert_images(extracted, target, pages=pages)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def convert_images(
    extracted: dict,
    outputfolder: str,
    pages: tuple = None,
) -> list:
    """Load the extracted image files and group them page wise.

    Args:
        extracted: mapping of page number to the parsed images of this page
        outputfolder: folder which contains the extracted image files
        pages: optional page selection, used to convert page numbers
    Returns:
        List of `PageContentImages`. Missing files, images without info and
        pages without any usable image are skipped.
    """
    collected = []
    for number, parsed_images in extracted.items():
        # convert selected pages to global pages
        number = convert_pages(number, pages)
        entries = []
        for parsed in parsed_images:
            bounding = parsed.bounding
            imagepath = os.path.join(outputfolder, parsed.filename)
            if not os.path.exists(imagepath):
                # TODO: FIX IMAGE EXTRACTION
                utilo.error(f'missing image: {imagepath}')
                continue
            binary = utilo.file_read_binary(imagepath)
            info = rawmaker.images.info.imageinfo(imagepath, number, bounding)
            if info is None:
                utilo.error(f'could not extract {imagepath}, {number}, {bounding}')
                continue
            entries.append((info, (binary, utilo.file_ext(imagepath))))
        if entries:
            collected.append(PageContentImages(page=number, content=entries))
    return collected
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def beautify_images(images, path: str):
    """Use ghost to render pdf and crop image area.

    The first element of every page entry is passed to `run_ghost` and
    paired with the rendered result; the extension is always 'png'.
    """
    beautified = []
    for page in images:
        infos = [entry[0] for entry in page.content]
        rendered = run_ghost(path, infos)
        cropped = [(info, (raw, 'png')) for raw, info in zip(rendered, infos)]
        beautified.append(PageContentImages(content=cropped, page=page.page))
    return beautified
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def run_ghost(path: str, boundings: list) -> list:
    """Extract images out of pdf.

    If ghost is not installed, we return an empty white image for every
    bounding instead.

    Args:
        path: path to the pdf document
        boundings: image areas; the fallback reads `width` and `height`
    Returns:
        One rendered image per bounding; the fallback produces png bytes.
    """
    if ughost.HAS_GHOST:
        return ughost.images(path, boundings)
    utilo.error('could not beautify images: install ghost')
    result = []
    for bounding in boundings:
        size = (int(bounding.width), int(bounding.height))
        # White placeholder as promised by the docstring. An integer such as
        # `color=1` is not white for a multi-band 'RGB' image.
        placeholder = PIL.Image.new('RGB', size, color='white')
        result.append(convert_topng(placeholder))
    return result
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def convert_topng(image) -> bytes:
    """Serialize `image` as png and return the encoded bytes.

    `image` only needs a `save(fileobject, format=...)` method.
    """
    with io.BytesIO() as buffer:
        image.save(buffer, format='png')
        # getvalue returns the whole buffer independent of the position
        return buffer.getvalue()
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def convert_pages(page: int, pages: tuple) -> int:
    """Convert a page number produced by pdfminer to the selected page.

    Pdfminer produces directly ascending pages.

    If we select pages=('0:5,28') pdfminer produces 0, 1, 2, 3, 4, 5.
    This method converts this to 0, 1, 2, 3, 4, 28.

    Args:
        page: ascending page number
        pages: selected pages; `None` or an empty selection means that no
               conversion is required
    Returns:
        The selected page number which belongs to `page`.
    """
    # TODO: INVESTIGATE HERE
    if not pages:
        # nothing selected: indexing an empty selection would raise
        return page
    # starting with starting offset
    offset = min(pages)
    return pages[page - offset]
|
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
"""Line Extractor
|
|
10
|
+
==============
|
|
11
|
+
|
|
12
|
+
This module aims to extract lines out of pdf document.
|
|
13
|
+
|
|
14
|
+
Furthermore the lines are:
|
|
15
|
+
* fixed in x0/x1 and y0/y1
|
|
16
|
+
* sorted from top to bottom and left to right
|
|
17
|
+
* if required merged together.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import operator
|
|
21
|
+
|
|
22
|
+
import configos
|
|
23
|
+
import iamraw
|
|
24
|
+
import pdfminer.layout
|
|
25
|
+
import pdfminer.pdfdocument
|
|
26
|
+
import serializeraw
|
|
27
|
+
import utilo
|
|
28
|
+
|
|
29
|
+
import rawmaker.reader
|
|
30
|
+
|
|
31
|
+
# maximal difference in y-component; `accept_ltline` compares the height of
# an item (bbox[3] - bbox[1]) against this value
HORIZONTAL_DIFF_MAX = configos.HV_FLOAT_PLUS(default=2.0)
# maximal difference in x-component; `accept_ltline` compares the width of
# an item (bbox[2] - bbox[0]) against this value
VERTICAL_DIFF_MAX = configos.HV_FLOAT_PLUS(default=2.0)
# minimal number of minus signs which build a horizontal line, see
# `accept_text_as_line`
REQUIRED_MINUS_SIGNS = configos.HV_INT_PLUS(default=40)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def work(document: str, annotations: str, pages: tuple = None) -> str:
    """Extract the lines of `document` and dump them.

    Args:
        document: path to the pdf document
        annotations: path to the dumped annotations; if the file is
                     missing, no line is skipped
        pages: optional page selection
    Returns:
        Dumped lines.
    """
    if not utilo.exists(annotations):
        utilo.debug(f'missing {annotations} could not skip underlines')
        loaded = []
    else:
        loaded = serializeraw.load_annotations(annotations, pages=pages)
    with rawmaker.reader.read(document) as pdf:
        page_lines = determine_lines(pdf, pages=pages)
    return serializeraw.dump_lines(skip_lines(page_lines, loaded))
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def determine_lines(
    document: pdfminer.pdfdocument.PDFDocument,
    pages: tuple = None,
) -> iamraw.PageContentLines:
    """Collect the normalized, sorted and merged lines of every page.

    Every page is additionally flagged as rotated if `rotated` reports it.
    """
    page_lines = lines(document, pages=pages)
    rotated_pages = rotated(document, pages=pages)
    # top down, left right
    top_left = operator.itemgetter(1, 0)
    collected = []
    for rects, pagenumber in page_lines:
        # left point is left above from right down point
        ordered = sorted(
            (utilo.rect_ensure_bounding(rect) for rect in rects),
            key=top_left,
        )
        collected.append(
            iamraw.PageContentLine(
                # merge lines which are divided by pdf printer
                content=utilo.merge_lines(ordered),
                page=pagenumber,
                rotated=pagenumber in rotated_pages,
            ))
    return collected
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def rotated(
    document: pdfminer.pdfdocument.PDFDocument,
    pages: tuple = None,
) -> set:
    """Determine rotated pages.

    A page counts as rotated if its width is not smaller than its height.
    """
    pagecontents = rawmaker.features.process_pagecontent(
        document,
        pages=pages,
    )
    return {
        page.page
        for page in pagecontents
        if not page.content.width < page.content.height
    }
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def skip_lines(linex, annotation) -> list:
    """Remove lines which intersect with hyperlink annotations.

    Args:
        linex: page wise lines; every page provides `page` and `content`
        annotation: annotations, selected per page via `utilo.select_page`
    Returns:
        Pages in the given order. Pages without annotation are passed
        through unchanged, the others are rebuilt without the lines
        intersecting a hyperlink area.
    """
    result = []
    for page in linex:
        anno = utilo.select_page(annotation, page.page)
        if not anno:
            result.append(page)
            continue
        invalid_area = [item.bounds for item in anno.hyperlinks]
        # remove annotated lines. This lines are the underlines of
        # hyperlinks which are produced by cray pdf printer.
        remaining = [
            item for item in page.content
            if not utilo.rectangles_intersecting(invalid_area, item)
        ]
        # keep the rotation flag of the page; without passing it on the
        # rebuilt page silently falls back to the default
        extra = {'rotated': page.rotated} if hasattr(page, 'rotated') else {}
        result.append(
            iamraw.PageContentLine(content=remaining, page=page.page, **extra))
    return result
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# do not merge near horizontal: '_______________' to text container below.
# This layout is passed to `type_in_document` by `lines`.
LAYOUT_LINES = pdfminer.layout.LAParams(line_margin=0.0000001)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def lines(
    pdf: pdfminer.pdfdocument.PDFDocument,
    pages: tuple = None,
) -> list:
    """Extract all line like elements out of `PDFDocument` page wise

    Every supported pdf layout element has its own accept function:
        LTLine: `accept_ltline`
        LTRect: `accept_ltrect`, small difference between opposite lines
        LTTextBoxHorizontal: `accept_text_as_line`
        LTCurve: `accept_curve_as_line`
        LTFigure: `accept_figure_as_line`

    Args:
        pdf: pdf document to collect lines
        pages: select pages to run analysis on
    Returns:
        list of `(boundings, pagenumber)`; the boundings are rounded,
        sorted top down and left right, merged and made unique.
    """
    utilo.asserts(pdf, pdfminer.pdfdocument.PDFDocument)
    candidates = type_in_document(
        pdf,
        datatype=(
            pdfminer.layout.LTTextBoxHorizontal,
            pdfminer.layout.LTLine,
            pdfminer.layout.LTRect,
            pdfminer.layout.LTFigure,
            pdfminer.layout.LTCurve,
        ),
        layout=LAYOUT_LINES,
        pages=pages,
    )
    acceptors = {
        pdfminer.layout.LTLine: accept_ltline,
        pdfminer.layout.LTRect: accept_ltrect,
        pdfminer.layout.LTTextBoxHorizontal: accept_text_as_line,
        pdfminer.layout.LTCurve: accept_curve_as_line,
        pdfminer.layout.LTFigure: accept_figure_as_line,
    }
    corners = operator.itemgetter(0, 1, 2, 3)
    top_left = operator.itemgetter(1, 0)
    collected = []
    for content, pagenumber in candidates:
        accepted = []
        for element in content:
            # check element against its acceptor. The lookup uses the exact
            # type; an element without acceptor is reported and skipped.
            try:
                keep = acceptors[type(element)](element)
            except KeyError:
                utilo.error(f'unsupported strategy {element}')
                continue
            if keep:
                accepted.append(element)
        # convert and round bounding
        rounded = (utilo.roundme(corners(element.bbox)) for element in accepted)
        # remove very short lines/dots
        visible = (
            box for box in rounded if not utilo.isdot(box, max_length=5.0))
        # ensure left, top, right, down bounding and sort top down; left right
        ordered = sorted(
            (utilo.rect_ensure_bounding(box) for box in visible),
            key=top_left,
        )
        # merges divided lines
        merged = utilo.merge_lines(ordered)
        # remove duplicated lines which mainly produces out of bad figure
        # extraction
        # TODO: ADD LINE DENSITY CHECK?
        unique = utilo.unique_lines(merged, max_diff=3.0)
        collected.append((unique, pagenumber))
    return collected
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def accept_text_as_line(item: pdfminer.layout.LTTextBoxHorizontal):
    """Accept a text box which draws a line out of '_', '-' or '='.

    The first symbol which occurs at least `REQUIRED_MINUS_SIGNS` times
    wins. On success `item.bbox` is collapsed to a flat line: the upper
    y-value for '_', the vertical center for the other symbols.
    """
    text = item.get_text()
    if len(text) < REQUIRED_MINUS_SIGNS:
        return False
    winner = next(
        (symbol for symbol in '_-='
         if text.count(symbol) >= REQUIRED_MINUS_SIGNS),
        None,
    )
    if winner is None:
        return False
    # update bounding to pass vertical error test.
    # TODO: CHECK THIS: Make it symbol dependend?
    if winner in '_':
        ypos = utilo.roundme(max((item.bbox[1], item.bbox[3])))
    else:
        # use vertical centric position
        ypos = utilo.roundme((item.bbox[1] + item.bbox[3]) / 2)
    item.bbox = (item.bbox[0], ypos, item.bbox[2], ypos)
    return True
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def accept_ltrect(item: pdfminer.layout.LTRect):
    """Accept a rectangle with the same rule as a line, see `accept_ltline`."""
    accepted = accept_ltline(item)
    return accepted
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def accept_ltline(
    item: pdfminer.layout.LTLine,
    vertical_max_diff=VERTICAL_DIFF_MAX,
    horizontal_max_diff=HORIZONTAL_DIFF_MAX,
) -> bool:
    """Accept horizontal or vertical lines

    The lines must vary only little. A crossing line has vertical
    and horizontal error. We want | or - not / or \\.

    Args:
        item: element with a `bbox` of (x0, y0, x1, y1)
        vertical_max_diff: width from which an item has a vertical error
        horizontal_max_diff: height from which an item has a horizontal error
    Returns:
        False for items with both errors and for wide blue items,
        otherwise True.
    """
    assert item.bbox[3] >= item.bbox[1], str(item.bbox)
    assert item.bbox[0] <= item.bbox[2], str(item.bbox)

    too_high = item.bbox[3] - item.bbox[1] >= horizontal_max_diff
    too_wide = item.bbox[2] - item.bbox[0] >= vertical_max_diff

    if too_high and too_wide:
        return False
    if not too_wide:
        return True

    # HACK: WORKAROUND TODO:
    # horizontal lines: There are lines in bachelor028 which are
    try:
        blueline = BLUE in (item.stroking_color, item.non_stroking_color)
    except AttributeError:
        # item without color information
        blueline = False
    if not blueline:
        return True
    utilo.debug('skip horizontal blue line which is may part of a '
                'hyperlink and destroys footnote detection')
    utilo.debug(item)
    return False
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
# color which `accept_ltline` compares against the stroking and
# non-stroking color of an item
BLUE = [0, 0, 1]
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def accept_figure_as_line(figure: pdfminer.layout.LTFigure) -> bool:
    """Some pdf renderer converts lines into images.

    Only figures with exactly one child are considered. The figure is
    accepted if the child passes `accept_ltline` or the figure passes
    `figure_special_line`.
    """
    children = figure._objs  # pylint:disable=W0212
    if len(children) != 1:
        return False
    # Do we need a min width? I don't think so because thats the job of
    # later running methods.
    if accept_ltline(children[0]):
        return True
    return bool(figure_special_line(figure))
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def accept_curve_as_line(curve: pdfminer.layout.LTCurve) -> bool:
    """Decide if `curve` can be used as a line.

    The checks run in this order: invisible curves and stroked curves
    without any color are rejected, thin filled curves and curves with
    two points are accepted. Longer point rows are accepted if all
    segments merge into a single line.
    """
    points = curve.pts
    if not (curve.linewidth or curve.fill):
        # invisible line
        return False
    if (curve.stroke and curve.stroking_color is None
            and curve.non_stroking_color is None):
        # TODO: DONT KNOW WHY
        return False
    # polygon?
    if curve.fill and (curve.height < 5.0 or curve.width < 5.0):
        return True
    if len(points) == 2:
        # start and end point
        return True
    # more than two points in a row, check if point are on a line
    # [(437.04645, 259.38056), (437.04645, 293.26655), (437.04645, 269.60483999999997)]
    segments = [(*start, *end) for start, end in zip(points, points[1:])]
    # all lines in a row?
    return len(utilo.merge_lines(segments, diff=1.5)) == 1
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
# horizontal line which is rendered as a figure: the image has to be wider
# than this value, see `figure_special_line`
HORIZONTAL_FIGURE_LINE_WIDTH_MIN = configos.HV_FLOAT_PLUS(default=350.0)
# the object has to be wider than high by more than this ratio
HORIZONTAL_FIGURE_LINE_RATIO_MIN = configos.HV_FLOAT_PLUS(default=45.0)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def figure_special_line(figure: pdfminer.layout.LTFigure) -> bool:
    """Detect special line and update figure box if figure is special line.

    The first child of `figure` has to be wider than
    `HORIZONTAL_FIGURE_LINE_WIDTH_MIN` and its width/height ratio has to
    exceed `HORIZONTAL_FIGURE_LINE_RATIO_MIN`. On success `figure.bbox` is
    collapsed to its vertical middle.

    Args:
        figure: figure with at least one child
    Returns:
        True if the figure was detected as line.
    """
    # TODO: THIS IS ONLY A HORIZONTAL?
    # EXAMPLE: MASTER155
    # 'width': 413.96, 'height': 8.54
    # TODO: ANALYZE IMAGE
    image = figure._objs[0]  # pylint:disable=W0212
    height = utilo.rect_height(image.bbox)
    width = utilo.rect_width(image.bbox)
    if width <= HORIZONTAL_FIGURE_LINE_WIDTH_MIN:
        return False
    # a flat image without any height must not raise a ZeroDivisionError
    ratio = width / height if height else float('inf')
    if ratio <= HORIZONTAL_FIGURE_LINE_RATIO_MIN:
        return False
    # adjust bounding of figure to middle line
    # TODO: USE IMAGE INFORMATION
    middle = utilo.roundme((figure.bbox[1] + figure.bbox[3]) / 2)
    figure.bbox = (figure.bbox[0], middle, figure.bbox[2], middle)
    return True
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def type_in_document(
    document: pdfminer.pdfdocument.PDFDocument,
    datatype: object,
    layout=None,
    pages: tuple = None,
) -> list[tuple[pdfminer.layout.LTPage, int]]:
    """Extract defined `datatype` out of `PDFDocument`

    Args:
        document(PDFDocument): pdf document to extract all types
        datatype: selected item type, used as second argument of `isinstance`
        layout(Param): process with different layout
        pages(tuple): select pages
    Returns:
        List of `(items, pagenumber)` with the selected `datatype` items
        of every processed page.
    """
    utilo.asserts(document, pdfminer.pdfdocument.PDFDocument)
    pagecontents = rawmaker.features.process_pagecontent(
        document,
        layout=layout,
        pages=pages,
    )
    return [
        (
            [element for element in page.content
             if isinstance(element, datatype)],
            page.page,
        )
        for page in pagecontents
    ]
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
"""Document Outlines
|
|
10
|
+
=================
|
|
11
|
+
|
|
12
|
+
See PDF2008: 12.3.3 Document Outline
|
|
13
|
+
|
|
14
|
+
Basic structure of get_outlines: (level, title, args, children)
|
|
15
|
+
|
|
16
|
+
Entries of outlines dict
|
|
17
|
+
------------------------
|
|
18
|
+
|
|
19
|
+
Dest(str, list): destination if item was clicked/activated, not present
|
|
20
|
+
if A is present.
|
|
21
|
+
A(dict): Action(launch application, play sound, changing state) Shall
|
|
22
|
+
not be present if a Dest item is present
|
|
23
|
+
SE(dict): Reference to structure element(see Structural Hierarchy)
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
import iamraw
|
|
27
|
+
import pdfminer.pdfdocument
|
|
28
|
+
import pdfminer.pdfpage
|
|
29
|
+
import serializeraw
|
|
30
|
+
import utilo
|
|
31
|
+
|
|
32
|
+
import rawmaker.destination
|
|
33
|
+
import rawmaker.reader
|
|
34
|
+
import rawmaker.utils
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def work(document: str) -> str:
    """Extract outlines of a pdf document.

    If there are no outlines provided dump empty list. If the toc could
    not be dumped, the problem is reported and None is returned.
    """
    assert isinstance(document, str), str(document)
    toc = iamraw.create_toc(parse_outlines(document))
    try:
        # toc to yaml
        return serializeraw.dump_toc(toc)
    except TypeError:
        for message in (
                'could not convert toc to YAML.',
                'The toc may contain indirect references, buffer, etc.',
                'Outline implementation seem not complete, yet.',
        ):
            utilo.error(message)
        return None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def parse_outlines(document: str) -> list:
    """Parse the outlines of `document` into raw sections.

    Outline entries whose page could not be determined are reported and
    skipped.

    Returns:
        List of `iamraw.SectionRaw`, empty if there are no outlines.
    """
    sections = []
    with rawmaker.reader.read(document) as pdf:
        try:
            # extract all outlines from pdf
            outlines = list(pdf.get_outlines())
            pagelookup = rawmaker.destination.pageids(document)
        except pdfminer.pdfdocument.PDFNoOutlines:
            utilo.error('could not locatate any outlines')
            outlines = []
        for level, title, dest, action, _ in outlines:
            try:
                target = pagenumber(action, dest, pdf)
            except (AttributeError, ValueError) as error:
                utilo.error('PDF NOT FULLY SUPPORTED')
                utilo.print_stacktrace()
                utilo.error(error)
                continue
            if not isinstance(target, int):
                # page object: translate the object id to the page number
                try:
                    target = pagelookup[target.objid]
                except KeyError:
                    utilo.error(f'invalid page lookup: {target.objid} pdf is '
                                'maybe an invalid extraction out of an other '
                                f'file: {pagelookup}')
                    continue
            assert isinstance(target, int), f'require convertion: {type(target)}'
            sections.append(
                iamraw.SectionRaw(
                    level,
                    title,
                    page=target,
                    raw='toc outline page',
                    raw_location=-1,
                ))
    return sections
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def pagenumber(action, dest, pdf) -> rawmaker.destination.ExplicitDestination:
    """Determine the page of an outline entry.

    Args:
        action: action entry (A) of the outline, may be empty
        dest: destination entry (Dest) of the outline, may be empty
        pdf: document, used to resolve named destinations via `get_dest`
    Returns:
        `page` of the parsed destination or -1 for external links and
        named destinations which are not part of the document.
    Raises:
        ValueError: if neither `action` nor `dest` leads to a destination.
    """
    parsed = None
    if action:
        parsed = rawmaker.destination.parse(action)
        if isinstance(parsed, rawmaker.destination.NamedDestination):
            try:
                resolved = pdf.get_dest(parsed.pdf_reference)
            except pdfminer.pdfdocument.PDFDestinationNotFound:
                utilo.error(f'invald pdf reference: {parsed.pdf_reference}')
                return -1
            resolved = rawmaker.utils.resolve(resolved)
            parsed = rawmaker.destination.parse(resolved)
    if dest:
        dest = rawmaker.utils.resolve(dest)
        if isinstance(dest, list):
            # pdf 1.5: [<PDFObjRef:13>, /'XYZ', 72.0, 769.89, None]
            resolved = dest
        else:
            destname = dest if isinstance(dest, bytes) else dest.name
            try:
                resolved = pdf.get_dest(destname)
            except pdfminer.pdfdocument.PDFDestinationNotFound:
                # same handling as for named destinations of an action
                utilo.error(f'invald pdf reference: {destname}')
                return -1
            # pdf 1.4: [<PDFObjRef:4>, /'XYZ', 134.031754, 373.949829, None]
            if not isinstance(resolved, list):
                resolved = rawmaker.utils.resolve(resolved)
        parsed = rawmaker.destination.parse(resolved)
    if not parsed:
        # Dest and A are both optional. Raise an error which the outline
        # parser handles instead of an assertion.
        raise ValueError('outline entry without usable destination or action')
    if isinstance(parsed, rawmaker.destination.ExternalLinkDestination):
        return -1
    return parsed.page
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
"""Extract text out of pdf document to gather information."""
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
import iamraw
|
|
14
|
+
import serializeraw
|
|
15
|
+
|
|
16
|
+
import rawmaker.cli
|
|
17
|
+
import rawmaker.features
|
|
18
|
+
import rawmaker.miner.position
|
|
19
|
+
import rawmaker.miner.text
|
|
20
|
+
import rawmaker.miner.underline
|
|
21
|
+
import rawmaker.parameter
|
|
22
|
+
import rawmaker.reader
|
|
23
|
+
import rawmaker.text.superfast
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def work( # pylint:disable=W9015,W0613
    document: str,
    xhorizontals: str = None,
    boxes_flow: float = 0.5,
    char_margin: float = 2.0,
    line_margin: float = 0.5,
    line_overlap: float = 0.5,
    word_margin: float = 0.1,
    nostrip: bool = not rawmaker.parameter.STRIP,
    detect_vertical: bool = False,
    pages: tuple = None,
) -> tuple[str, str]:
    """Extract structured text out of document

    All arguments are collected via `locals()` and passed to
    `ParsingConfiguration.from_dict`. Do not introduce local variables in
    front of this call.

    Args:
        document: pdf-document to run parsing
        xhorizontals: passed to `underline_chars` to mark underlined chars
        char_margin(float): XXX Why 5.0?
        detect_vertical: currently ignored, it is always set to True
        pages(list): List of processed pages.
    Returns:
        parsed document as yaml output
        parsed positions of text container
    """
    # TODO: CHANGE BEHAVIOR OF --detect_vertical. Convert to PARAMETER
    # with True as default.
    detect_vertical = True # TODO: REMOVE?
    # snapshot of all arguments, including the overwritten detect_vertical
    config = rawmaker.parameter.ParsingConfiguration.from_dict(**locals())
    if rawmaker.cli.superfast(): # pylint:disable=W0160
        document = rawmaker.text.superfast.superfast(
            document,
            config,
            workdir=os.getcwd(),
            pages=pages,
        )
    else:
        document = extract_document(source=document, config=config, pages=pages)
    # from here `document` is the extracted document, not the path anymore
    document = rawmaker.miner.underline.underline_chars(
        document,
        xhorizontals,
        pages=pages,
    )
    positions = rawmaker.miner.position.hash_positions(document, pages=pages)
    # dump result
    dumped_text = serializeraw.dump_document(document)
    dumped_positions = serializeraw.dump_textpositions(positions)
    return dumped_text, dumped_positions
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def extract_document(
    source: str,
    config: rawmaker.parameter.ParsingConfiguration = None,
    converter=None,
    pages: tuple = None,
) -> iamraw.Document:
    """Extract the content of the pdf document `source`.

    Args:
        source: path to the pdf document
        config: parsing configuration; printed if given
        converter: converter to use, `PrecisePDFConverter` if not given
        pages: optional page selection
    Returns:
        Result of `rawmaker.features.extract_content`.
    """
    if config:
        rawmaker.parameter.print_layout(config)
    converter = (rawmaker.miner.text.PrecisePDFConverter
                 if converter is None else converter)
    assert isinstance(source, str), str(source)
    with rawmaker.reader.read(source) as pdf:
        return rawmaker.features.extract_content(
            pdf,
            config=config,
            converter=converter,
            pages=pages,
        )
|