rawmaker 2.40.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- letty/__init__.py +46 -0
- letty/cli.py +63 -0
- letty/optimizer.py +138 -0
- letty/quality/__init__.py +8 -0
- letty/quality/whitespace.py +50 -0
- letty/strategy.py +8 -0
- rawmaker/__init__.py +29 -0
- rawmaker/__main__.py +13 -0
- rawmaker/__patch__.py +36 -0
- rawmaker/cli.py +206 -0
- rawmaker/cli_automate.py +69 -0
- rawmaker/converter/__init__.py +8 -0
- rawmaker/converter/basic.py +174 -0
- rawmaker/converter/images.py +168 -0
- rawmaker/date.py +83 -0
- rawmaker/destination.py +202 -0
- rawmaker/error.py +34 -0
- rawmaker/features/__init__.py +138 -0
- rawmaker/features/annotation.py +254 -0
- rawmaker/features/border.py +172 -0
- rawmaker/features/boxes.py +153 -0
- rawmaker/features/figures.py +24 -0
- rawmaker/features/fonts.py +229 -0
- rawmaker/features/formula.py +16 -0
- rawmaker/features/horizontals.py +132 -0
- rawmaker/features/images.py +155 -0
- rawmaker/features/line.py +337 -0
- rawmaker/features/outlines.py +123 -0
- rawmaker/features/text.py +91 -0
- rawmaker/fonts/__init__.py +8 -0
- rawmaker/fonts/parser.py +354 -0
- rawmaker/images/__init__.py +8 -0
- rawmaker/images/info.py +35 -0
- rawmaker/miner/__init__.py +8 -0
- rawmaker/miner/char.py +42 -0
- rawmaker/miner/colorspace.py +75 -0
- rawmaker/miner/images.py +448 -0
- rawmaker/miner/position.py +121 -0
- rawmaker/miner/rawchar.py +207 -0
- rawmaker/miner/text.py +833 -0
- rawmaker/miner/underline.py +66 -0
- rawmaker/parameter.py +130 -0
- rawmaker/patch/__init__.py +8 -0
- rawmaker/patch/ltchar.py +79 -0
- rawmaker/reader.py +97 -0
- rawmaker/text/__init__.py +8 -0
- rawmaker/text/chars.py +24 -0
- rawmaker/text/data.py +47 -0
- rawmaker/text/superfast.py +91 -0
- rawmaker/text/wordbox.py +95 -0
- rawmaker/utils.py +44 -0
- rawmaker-2.40.3.dist-info/METADATA +51 -0
- rawmaker-2.40.3.dist-info/RECORD +63 -0
- rawmaker-2.40.3.dist-info/WHEEL +5 -0
- rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
- rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
- rawmaker-2.40.3.dist-info/top_level.txt +3 -0
- spacestation/__init__.py +18 -0
- spacestation/cli.py +51 -0
- spacestation/features/__init__.py +8 -0
- spacestation/features/chardist.py +85 -0
- spacestation/features/worddist.py +57 -0
- spacestation/features/wspace.py +130 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2022-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import iamraw
|
|
11
|
+
import serializeraw
|
|
12
|
+
import utilo
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def underline_chars(
    document: iamraw.Document,
    underlinex: str = None,
    pages: tuple = None,
):
    """Flag the chars of text containers that match a loaded horizontal.

    Args:
        document: document whose pages are searched; it is modified in
            place and returned.
        underlinex: resource checked with `utilo.exists` and loaded with
            `serializeraw.load_horizontals`.
        pages: page selection forwarded to the loader.
    Returns:
        The passed `document`. It is returned untouched when
        `underlinex` does not exist.
    """
    # TODO: MARK HORIZONTAL AS TEXT UNDERLINE HORIZONTAL!
    # TODO: SUPPORT PARTIAL UNDERLINES
    # TODO: UPDATE STYLE RANGE AFTER SETTING ONLY SOME CHARS AS UNDERLINED
    # TODO: REPLACE UNDERLINE WITH STYLE(NONE, UNDERLINE, CROSSED, OVERLINED)
    if not utilo.exists(underlinex):
        utilo.log(f'missing underlines: {underlinex}, skipping char underline')
        return document
    horizontals = serializeraw.load_horizontals(underlinex, pages=pages)
    for pdfpage in horizontals:
        current_page = utilo.select_page(document.pages, page=pdfpage.page)
        if not current_page:
            continue
        for underline in pdfpage.content:
            # only the first matching container is marked per underline
            matching = next(
                (container for container in current_page
                 if underlined(container.box, underline.box)),
                None,
            )
            if matching is None:
                continue
            # TODO: REMOVE APPEND AFTER SHRINKING TEXTCONTAINER TO
            # SINGLE LINE
            for char in utilo.flat(matching, append=True):
                char.underline = True
    return document
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def underlined(text: utilo.Rectangle, horizontal: utilo.Rectangle) -> bool:
    """Decide whether `horizontal` underlines the `text` rectangle.

    Three conditions must hold: component 1 of the horizontal lies
    strictly between components 1 and 3 of the text, it is within 3.0
    of text component 3, and components 0 and 2 of both rectangles
    match within 5.0 each.
    """
    # TODO: SUPPORT CROSSED ETC.
    if not text[1] < horizontal[1] < text[3]:
        return False
    close_to_edge = utilo.near(
        expected=text[3],
        current=horizontal[1],
        diff=3.0,
    )
    if not close_to_edge:
        return False
    # start and end of horizontal and text matches
    start_matches = utilo.near(text[0], horizontal[0], diff=5.0)
    end_matches = utilo.near(text[2], horizontal[2], diff=5.0)
    return bool(start_matches & end_matches)
|
rawmaker/parameter.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import dataclasses
|
|
11
|
+
|
|
12
|
+
import configos
|
|
13
|
+
import pdfminer.layout
|
|
14
|
+
import utilo
|
|
15
|
+
|
|
16
|
+
# Default value of `ParsingConfiguration.strip`.
STRIP = True
|
|
17
|
+
|
|
18
|
+
# Fixed pdfminer layout parameters: very large char margin combined with a
# tiny line margin.
# NOTE(review): presumably keeps every text line separate - confirm.
ONELINE = pdfminer.layout.LAParams(
    line_margin=0.001,
    char_margin=100.0,
    boxes_flow=1.0,
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclasses.dataclass
class ParsingConfiguration:
    """Layout settings, printable as command line flags or as `LAParams`."""

    boxes_flow: float = 0.5
    char_margin: float = 2.0
    line_margin: float = 0.5
    line_overlap: float = 0.5
    word_margin: float = 0.1
    detect_vertical: bool = True
    strip: bool = STRIP

    def cmdline(self) -> str:
        """Convert configuration to `linix` command line parameter syntax."""
        flags = []
        for name, value in vars(self).items():
            if name == 'strip':
                # only the opt-out is written as a flag
                if not value:
                    flags.append('--nostrip')
            elif isinstance(value, bool):
                # boolean switches appear only when enabled
                if value:
                    flags.append(f'--{name}')
            else:
                flags.append(f'--{name}={value}')
        return utilo.from_tuple(flags)

    def laparams(self) -> pdfminer.layout.LAParams:
        """Create pdfminer layout parameters from the stored values.

        `strip` is not passed on.
        """
        return pdfminer.layout.LAParams(
            boxes_flow=self.boxes_flow,
            char_margin=self.char_margin,
            detect_vertical=self.detect_vertical,
            line_margin=self.line_margin,
            line_overlap=self.line_overlap,
            word_margin=self.word_margin,
        )

    @classmethod
    def from_dict(cls, **kwargs):
        """Build a configuration from keyword arguments.

        `nostrip` is stored inverted as `strip`. Any other key is applied
        only when the instance already has an attribute of that name;
        remaining keys are ignored.

        >>> ParsingConfiguration.from_dict()
        --boxes_flow=...--char_margin=...--line_margin=...--line_overlap=...
        """
        instance = cls()
        for key, value in kwargs.items():
            if key == 'nostrip':
                instance.strip = not value
            elif hasattr(instance, key):
                setattr(instance, key, value)
        return instance

    def __str__(self):
        """Return the command line representation."""
        return self.cmdline()

    def __repr__(self):
        """Return the same text as `str(self)`."""
        return str(self)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Char margin of the module-level `LAYOUT`, created through configos with a
# default of 1.1.
# NOTE(review): presumably a tunable positive float - confirm in configos.
LAYOUT_CHAR_MARGIN = configos.HV_FLOAT_PLUS(default=1.1)
|
|
84
|
+
|
|
85
|
+
# Module-level configuration: `char_margin` comes from `LAYOUT_CHAR_MARGIN`,
# all other fields keep their dataclass defaults.
LAYOUT = ParsingConfiguration(char_margin=LAYOUT_CHAR_MARGIN)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def from_config(config: ParsingConfiguration) -> pdfminer.layout.LAParams:
    """Translate `config` into pdfminer layout parameters.

    Args:
        config: source of the layout values; when it is falsy the
            defaults listed below are used instead.
    Returns:
        Newly created `LAParams`.
    """
    settings = {
        'boxes_flow': 0.5,
        'char_margin': 2.0,
        'line_margin': 0.5,
        'line_overlap': 0.5,
        'word_margin': 0.1,
        'detect_vertical': True,
    }
    if config:
        # take every value from the passed configuration
        settings = {name: getattr(config, name) for name in settings}
    return pdfminer.layout.LAParams(**settings)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def print_layout(layout: ParsingConfiguration = None):
    """Log the numeric layout parameters of `layout` on one line.

    Args:
        layout: configuration to print; must be truthy.
    """
    assert layout, 'missing layout'
    params = from_config(layout)
    utilo.log(' layout:', end=' ', level=utilo.Level.INFORMATION)
    names = (
        'boxes_flow',
        'char_margin',
        'line_margin',
        'line_overlap',
        'word_margin',
    )
    information = [(name, getattr(params, name)) for name in names]
    for text, value in information:
        message = '%s %.2f' % (text, value)  # pylint:disable=C0209
        utilo.log(message, end=' ', level=utilo.Level.INFORMATION)
    utilo.log(level=utilo.Level.INFORMATION)  # newline
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
rawmaker/patch/ltchar.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import contextlib
|
|
11
|
+
|
|
12
|
+
import pdfminer.layout
|
|
13
|
+
import pdfminer.pdffont
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PatchedLTChar(pdfminer.layout.LTChar):
    """`LTChar` that additionally keeps font size, rise and font flags."""

    # pylint:disable=R0913
    def __init__(self, matrix, font, fontsize, scaling, rise, text, textwidth,
                 textdisp, ncs, graphicstate):
        super().__init__(matrix, font, fontsize, scaling, rise, text, textwidth,
                         textdisp, ncs, graphicstate)
        if fontsize != 1:
            self.fontsize = fontsize
        else:
            # HACK: CLARIFY WHAT IS CORRECT, we need more generated pdf
            # examples
            # HQEOGA+Arial in bachelor37
            # self.fontsize = self.size
            # matrix[0] normal char
            # matrix[1] rotated char
            self.fontsize = matrix[0] or matrix[1]
            assert self.fontsize, matrix
        self.rise = rise
        self.flags = font.flags

    def __getitem__(self, index):
        """Access bounding component."""
        return self.bbox[index]

    @property
    def text(self):
        """Text of the char as returned by `get_text`."""
        return self.get_text()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def vertical(item: pdfminer.layout.LTChar) -> bool:
    """Check LTChar.upright flag.

    Returns:
        True when `item.upright` is falsy, False when it is truthy and
        None when reading the flag raises `AttributeError`.
    """
    try:
        return not item.upright
    except AttributeError:
        return None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def render_char(  # pylint:disable=W9015,W9016
        self, matrix, font, fontsize, scaling, rise, cid, ncs,
        graphicstate) -> float:
    """Patch LTChar to introduce font size hack.

    Creates a `PatchedLTChar` for the character and adds it to
    `self.cur_item`.

    Args:
        cid(int): character number id - ASCII
        ncs(PDFColorSpace): color space of document
    Returns:
        The `adv` attribute of the created char, not the char itself.
        NOTE(review): `adv` is a float in pdfminer.six - confirm for the
        pinned pdfminer version.
    """
    try:
        text = font.to_unichr(cid)
        assert isinstance(text, str), type(text)
    except pdfminer.pdffont.PDFUnicodeNotDefined:
        # no unicode mapping for this cid, ask the device for a replacement
        text = self.handle_undefined_char(font, cid)

    textwidth = font.char_width(cid)
    textdisp = font.char_disp(cid)

    # patch to document font size and rise
    item = PatchedLTChar(matrix, font, fontsize, scaling, rise, text, textwidth,
                         textdisp, ncs, graphicstate)
    self.cur_item.add(item)
    return item.adv
|
rawmaker/reader.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
#==============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
#------------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
#==============================================================================
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
from contextlib import contextmanager
|
|
12
|
+
from os.path import exists
|
|
13
|
+
from os.path import isfile
|
|
14
|
+
|
|
15
|
+
import utilo
|
|
16
|
+
from pdfminer.pdfdocument import PDFDocument
|
|
17
|
+
from pdfminer.pdfdocument import PDFEncryptionError
|
|
18
|
+
from pdfminer.pdfdocument import PDFSyntaxError
|
|
19
|
+
from pdfminer.pdfparser import PDFParser
|
|
20
|
+
|
|
21
|
+
# from rawmaker.error import InvalidPDF
|
|
22
|
+
# from rawmaker.error import PDFParserImplementationError
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@contextmanager
def read(path: str, password: str = None, verify: bool = True) -> PDFDocument:
    """Open pdf from `path`.

    Args:
        path(str): path to pdf-file
        password(str): optional password to extract encrypted data
        verify(bool): ensure that file starts with `%PDF-`
    Raises:
        TextExtractNotAllowed: if no extraction is allowed - currently disabled
        FileNotFoundError: `path` does not exist
        ValueError: `path` is not a file
        SystemExit: `verify` is set and the file does not start with `%PDF-`
    Yields:
        PDFDocument: open pdf file
    """
    if not exists(path):
        raise FileNotFoundError(f'Path does not exists: {path}')
    if not isfile(path):
        raise ValueError(f'Read requires an pdf document, not {path}')
    if verify:
        # close the handle used for the header check right away
        with open(path, 'rb') as headerfile:
            header = headerfile.read(5)
        if header != b'%PDF-':
            # TODO: MOVE TO def before() method after upgrading utilo
            utilo.error('invalid pdf header')
            sys.exit(1)
    with open(path, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = open_document(parser, password)
        # the file stays open while the caller works with the document
        yield document
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def open_document(parser: PDFParser, password: str) -> PDFDocument:
    """Open pdf document based on selected `parser`.

    Hint:
        Using fallback as default mode is very slow. Therefore the
        document is opened without fallback first, which is much faster
        on valid documents. Only if that attempt ends with a syntax
        error, loading is repeated with fallback.

    A missing password is replaced by an empty string. Encryption errors
    end the process with exit code 1, unexpected errors with exit code 2
    and a syntax error during the fallback attempt with exit code 3.
    """
    if password is None:
        password = ''
    try:
        return PDFDocument(parser, password, fallback=False)
    except PDFSyntaxError:
        pass  # try with fallback again
    except PDFEncryptionError as encryption:
        utilo.error('encryption not supported')
        utilo.debug(encryption)
        sys.exit(1)
    except Exception:  # pylint:disable=broad-except
        utilo.print_stacktrace()
        sys.exit(2)
        # raise PDFParserImplementationError(path) from exc

    try:
        utilo.info('try to use `fallback` pdf loader')
        return PDFDocument(parser, password, fallback=True)
    except PDFSyntaxError:
        utilo.print_stacktrace()
        sys.exit(3)
        # raise InvalidPDF(path) from exc
    except Exception:  # pylint:disable=broad-except
        # raise PDFParserImplementationError(path) from exc
        utilo.print_stacktrace()
        sys.exit(2)
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
rawmaker/text/chars.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import iamraw
|
|
11
|
+
|
|
12
|
+
import rawmaker.features.text
|
|
13
|
+
import rawmaker.miner.char
|
|
14
|
+
import rawmaker.reader
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def extract_chars(document: str, pages: tuple = None) -> iamraw.Document:
    """Extract `document` with the char based converter.

    Args:
        document: source passed to `extract_document`; must be a `str`.
        pages: page selection forwarded to `extract_document`.
    Returns:
        Result of `rawmaker.features.text.extract_document`.
    """
    assert isinstance(document, str), str(document)
    converter = rawmaker.miner.char.CharPDFConvert
    return rawmaker.features.text.extract_document(
        document,
        pages=pages,
        converter=converter,
    )
|
rawmaker/text/data.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import dataclasses
|
|
11
|
+
|
|
12
|
+
import iamraw
|
|
13
|
+
import utilo
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclasses.dataclass
class WordBoxPage:
    """Bounding boxes of one page; indexing and `len` act on `content`."""

    # collected bounding boxes, empty list by default
    content: iamraw.BoundingBoxes = dataclasses.field(default_factory=list)
    # page number, None until assigned
    page: int = dataclasses.field(default=None)

    def __getitem__(self, index):
        boxes = self.content
        return boxes[index]  # pylint:disable=E1136

    def __len__(self):
        boxes = self.content
        return len(boxes)  # pylint:disable=E1136
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Type alias for a list of `WordBoxPage` items.
WordBoxPages = list[WordBoxPage]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclasses.dataclass
class PageLines:
    """Lines of a page; every line holds words and every word holds chars."""

    lines: list = dataclasses.field(default_factory=list)

    def __getitem__(self, index):
        collected = self.lines
        return collected[index]  # pylint:disable=E1136

    def __len__(self):
        collected = self.lines
        return len(collected)  # pylint:disable=E1136

    def __str__(self):
        """Render a header with the line count followed by the text.

        Chars of a word are concatenated, words of a line are separated
        by a blank and lines are joined with `utilo.NEWLINE`.
        """
        rendered = utilo.NEWLINE.join(
            ' '.join(''.join(char.text for char in word) for word in line)
            for line in self.lines  # pylint:disable=E1133
        )
        return f'PageLines: {len(self.lines)}\n{rendered}'
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2021-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
import iamraw
|
|
13
|
+
import pdflog
|
|
14
|
+
import serializeraw
|
|
15
|
+
import utilo
|
|
16
|
+
|
|
17
|
+
import rawmaker.parameter
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def superfast(
    document: str,
    config: rawmaker.parameter.ParsingConfiguration,
    workdir: str,
    pages: list = None,
) -> iamraw.Document:
    """Extract text with parallel `rawmaker` command line runs.

    The pages are split into chunks of 10, one command is created per
    chunk, all commands are executed via `utilo.run_parallel` and the
    chunk results are merged afterwards.

    Args:
        document: path of the pdf passed to `rawmaker -i`.
        config: parsing configuration, added as command line flags.
        workdir: output directory of the runs and location of the chunk
            files which are merged.
        pages: pages to extract; when None the page count of `document`
            is determined with `pdflog.pagecount`.
    Returns:
        Merged document of all chunks.
    """
    if pages is None:
        pages = utilo.make_tuple(pdflog.pagecount(document))
    chunks = utilo.chunks(pages, size=10)
    parameter = config.cmdline()
    base = f'rawmaker -i {document} -o {workdir}'
    todo = []
    for index, chunk in enumerate(chunks):
        joined_pages = utilo.from_tuple(chunk, separator=',')
        cmd = f'{base} --prefix {index} --text --pages {joined_pages} {parameter}'
        utilo.log(cmd)
        todo.append(cmd)
    # run in parallel
    completed = utilo.run_parallel(todo, workdir, worker=12)
    assert completed == utilo.SUCCESS, completed
    # merge document
    return merge_document(workdir, len(chunks))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def merge_document(path: str, size: int) -> iamraw.Document:
    """Merge chunks of extract document.

    Loads the text and position files of every chunk below `path`,
    removes these files, copies the stored bounding of each text
    container into the loaded text and collects all pages in one
    document.

    A little bit dirty, but ok for now. XXX

    Args:
        path: directory which contains the chunk files.
        size: number of chunks to load.
    Returns:
        Document with the pages of all chunks; the dimension is taken
        from the first chunk.
    """
    numbers = range(size)
    text_files = [
        os.path.join(path, f'rawmaker__{number}_text_text.yaml')
        for number in numbers
    ]
    posi_files = [
        os.path.join(path, f'rawmaker__{number}_text_positions.yaml')
        for number in numbers
    ]

    text = [serializeraw.load_document(name) for name in text_files]
    positions = [serializeraw.load_textpositions(name) for name in posi_files]

    # everything is loaded, the chunk files are not required anymore
    for name in text_files + posi_files:
        utilo.info(f'remove {name}')
        utilo.file_remove(name)

    for docs, pos in zip(text, positions):
        for page in docs:
            containers = (
                item for item in page
                if isinstance(item, iamraw.TextContainer)
            )
            # positions are stored in the order of the text containers
            for index, container in enumerate(containers):
                # bounding, mean
                bounding, mean = utilo.select_page(pos, page.page).content[index]  # yapf:disable
                fake_text_mean_height(container, bounding, mean)
                container.box = bounding

    document = iamraw.Document(dimension=text[0].dimension)
    for chunk in text:
        for page in chunk:
            document.append(page)
    return document
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def fake_text_mean_height(item, bounding, mean):
    """Replace the box of every char in `item.lines` with a faked one.

    The faked box has 0 as first and third component and spans from
    `bounding.y1 - mean` to `bounding.y1` in the other two.
    """
    # TODO: REMOVE THIS HACK LATER
    chars = (char for line in item.lines for char in line)
    for char in chars:
        # Fake mean char height
        char.box = iamraw.BoundingBox(0, bounding.y1 - mean, 0, bounding.y1)
|
rawmaker/text/wordbox.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import configos
|
|
11
|
+
import utilo
|
|
12
|
+
|
|
13
|
+
import rawmaker.text.chars
|
|
14
|
+
import rawmaker.text.data
|
|
15
|
+
|
|
16
|
+
# Default `difftable` of this module. `merge_line` calls the table with a
# char's font size and starts a new word when the gap exceeds the result.
# NOTE(review): the pairs look like (font size, maximal gap) - confirm the
# lookup semantics of `configos.HolyTable`.
DIFF_MAX = configos.HolyTable(items=[
    (6, 1.0), (12, 1.0), (16, 1.0),
    (22, 10.0), (24, 10.0),
    (48, 15.0),
])
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def parses(
    source: str,
    pages: tuple,
    word_length_min: int = 1,
    difftable: configos.HolyTable = DIFF_MAX,
) -> rawmaker.text.data.WordBoxPages:
    """Determine the word bounding boxes of the selected pages.

    Args:
        source: document passed to `extract_chars`.
        pages: page selection passed to `extract_chars`.
        word_length_min: forwarded to `extract_page`.
        difftable: forwarded to `extract_page`.
    Returns:
        One `WordBoxPage` per extracted page, carrying the page number
        of the extracted page.
    """
    extracted = rawmaker.text.chars.extract_chars(source, pages)
    boundings = []
    for page in extracted:
        lines = extract_page(
            page,
            word_length_min=word_length_min,
            difftable=difftable,
        )
        wordbox = wordbox_boundings(lines)
        # adjust page numbers
        wordbox.page = page.page
        boundings.append(wordbox)
    return boundings
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def extract_page(
    page,
    word_length_min: int = 1,
    difftable: configos.HolyTable = DIFF_MAX,
) -> rawmaker.text.data.PageLines:
    """Group the chars of `page` into lines of words.

    Args:
        page: iterable of chars which provide `get_text` and `bbox`.
        word_length_min: passed as `min_elements` to the line clustering.
        difftable: gap table passed to `merge_line`.
    Returns:
        `PageLines` with one list of words per clustered line.
    """
    # remove white space
    visible = [char for char in page if char.get_text().strip()]

    clustered = utilo.same_line_cluster(
        visible,
        min_elements=word_length_min,  # support single chars
        matcher=lambda x: x.bbox[3],
    )
    # ensure top bottom
    ordered = list(clustered)
    ordered.sort(key=lambda bounding: bounding.center[3])  # y1

    # ensure left right (x0) before merging the chars of a line to words
    result = [
        merge_line(sorted(line, key=lambda x: x[0]), difftable=difftable)
        for line in ordered
    ]
    return rawmaker.text.data.PageLines(lines=result)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def merge_line(line, difftable: configos.HolyTable):
    """Split the chars of `line` into words.

    A new word starts when the gap between component 2 of the previous
    char and component 0 of the current char exceeds
    `difftable(char.fontsize)`.

    Args:
        line: sequence of chars in reading order.
        difftable: callable returning the maximal gap for a font size.
    Returns:
        List of words, each a list of chars; empty for an empty line.
    """
    if not line:
        return []
    words = [[line[0]]]
    for previous, char in zip(line, line[1:]):
        gap = char[0] - previous[2]
        if gap > difftable(char.fontsize):
            words.append([char])
        else:
            words[-1].append(char)
    return words
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def wordbox_boundings(page) -> rawmaker.text.data.WordBoxPage:
    """Compute one bounding per word of `page` with `utilo.rect_max`.

    Args:
        page: iterable of lines, each an iterable of words made of chars
            with a `bbox`.
    Returns:
        `WordBoxPage` holding the boundings in iteration order.
    """
    boxes = [
        utilo.rect_max([char.bbox for char in word])
        for line in page
        for word in line
    ]
    return rawmaker.text.data.WordBoxPage(content=boxes)
|
rawmaker/utils.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# C O P Y R I G H T
|
|
3
|
+
# -----------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) 2021-2023 by Helmut Konrad Schewe. All rights reserved.
|
|
5
|
+
# This file is property of Helmut Konrad Schewe. Any unauthorized copy,
|
|
6
|
+
# use or distribution is an offensive act against international law and may
|
|
7
|
+
# be prosecuted under federal law. Its content is company confidential.
|
|
8
|
+
# =============================================================================
|
|
9
|
+
|
|
10
|
+
import contextlib
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def resolve(reference):
    """Return `reference.resolve()`, or `reference` itself as fallback.

    The fallback is used whenever an `AttributeError` occurs, for example
    because the object has no `resolve` method.
    """
    try:
        return reference.resolve()
    except AttributeError:
        return reference
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Codec names tried in this order by `guess_decoding` and `guess_encoding`;
# `split()` drops the trailing blank of the literal.
ENCODINGS = 'ascii cp1252 utf8 '.split()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def guess_decoding(text: bytes) -> str:
    r"""Decode `text` with the first codec of `ENCODINGS` that succeeds.

    `text` is passed through `resolve` first. None is returned when no
    codec is able to decode the data.

    >>> guess_decoding(b'http://road.cc/measure-\x96-smart-street')
    'http://road.cc/measure-–-smart-street'
    """
    raw = resolve(text)
    for encoding in ENCODINGS:
        with contextlib.suppress(UnicodeDecodeError):
            return raw.decode(encoding)
    return None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def guess_encoding(text: str) -> bytes:
    """Encode `text` with the first codec of `ENCODINGS` that succeeds.

    Args:
        text: string to encode.
    Returns:
        Encoded bytes, or None when no codec can represent `text`.
    """
    for encoding in ENCODINGS:
        try:
            return text.encode(encoding)
        except UnicodeEncodeError:
            # codec can not represent the text, try the next one
            continue
    return None
|