rawmaker 2.40.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. letty/__init__.py +46 -0
  2. letty/cli.py +63 -0
  3. letty/optimizer.py +138 -0
  4. letty/quality/__init__.py +8 -0
  5. letty/quality/whitespace.py +50 -0
  6. letty/strategy.py +8 -0
  7. rawmaker/__init__.py +29 -0
  8. rawmaker/__main__.py +13 -0
  9. rawmaker/__patch__.py +36 -0
  10. rawmaker/cli.py +206 -0
  11. rawmaker/cli_automate.py +69 -0
  12. rawmaker/converter/__init__.py +8 -0
  13. rawmaker/converter/basic.py +174 -0
  14. rawmaker/converter/images.py +168 -0
  15. rawmaker/date.py +83 -0
  16. rawmaker/destination.py +202 -0
  17. rawmaker/error.py +34 -0
  18. rawmaker/features/__init__.py +138 -0
  19. rawmaker/features/annotation.py +254 -0
  20. rawmaker/features/border.py +172 -0
  21. rawmaker/features/boxes.py +153 -0
  22. rawmaker/features/figures.py +24 -0
  23. rawmaker/features/fonts.py +229 -0
  24. rawmaker/features/formula.py +16 -0
  25. rawmaker/features/horizontals.py +132 -0
  26. rawmaker/features/images.py +155 -0
  27. rawmaker/features/line.py +337 -0
  28. rawmaker/features/outlines.py +123 -0
  29. rawmaker/features/text.py +91 -0
  30. rawmaker/fonts/__init__.py +8 -0
  31. rawmaker/fonts/parser.py +354 -0
  32. rawmaker/images/__init__.py +8 -0
  33. rawmaker/images/info.py +35 -0
  34. rawmaker/miner/__init__.py +8 -0
  35. rawmaker/miner/char.py +42 -0
  36. rawmaker/miner/colorspace.py +75 -0
  37. rawmaker/miner/images.py +448 -0
  38. rawmaker/miner/position.py +121 -0
  39. rawmaker/miner/rawchar.py +207 -0
  40. rawmaker/miner/text.py +833 -0
  41. rawmaker/miner/underline.py +66 -0
  42. rawmaker/parameter.py +130 -0
  43. rawmaker/patch/__init__.py +8 -0
  44. rawmaker/patch/ltchar.py +79 -0
  45. rawmaker/reader.py +97 -0
  46. rawmaker/text/__init__.py +8 -0
  47. rawmaker/text/chars.py +24 -0
  48. rawmaker/text/data.py +47 -0
  49. rawmaker/text/superfast.py +91 -0
  50. rawmaker/text/wordbox.py +95 -0
  51. rawmaker/utils.py +44 -0
  52. rawmaker-2.40.3.dist-info/METADATA +51 -0
  53. rawmaker-2.40.3.dist-info/RECORD +63 -0
  54. rawmaker-2.40.3.dist-info/WHEEL +5 -0
  55. rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
  56. rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
  57. rawmaker-2.40.3.dist-info/top_level.txt +3 -0
  58. spacestation/__init__.py +18 -0
  59. spacestation/cli.py +51 -0
  60. spacestation/features/__init__.py +8 -0
  61. spacestation/features/chardist.py +85 -0
  62. spacestation/features/worddist.py +57 -0
  63. spacestation/features/wspace.py +130 -0
@@ -0,0 +1,66 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2022-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import iamraw
11
+ import serializeraw
12
+ import utilo
13
+
14
+
15
def underline_chars(
    document: iamraw.Document,
    underlinex: str = None,
    pages: tuple = None,
):
    """Flag the chars of text containers that sit on a loaded underline.

    Args:
        document: document whose `pages` are searched for underlined text
        underlinex: resource handed to `serializeraw.load_horizontals`
            (presumably a path - confirm); if `utilo.exists` rejects it,
            the document is returned untouched
        pages: optional page selection forwarded to the loader
    Returns:
        The passed `document`. For every underline, the chars of the
        first matching text container have `underline` set to `True`.
    """
    # TODO: MARK HORIZONTAL AS TEXT UNDERLINE HORIZONTAL!
    # TODO: SUPPORT PARTIAL UNDERLINES
    # TODO: UPDATE STYLE RANGE AFTER SETTING ONLY SOME CHARS AS UNDERLINED
    # TODO: REPLACE UNDERLINE WITH STYLE(NONE, UNDERLINE, CROSSED, OVERLINED)
    if not utilo.exists(underlinex):
        utilo.log(f'missing underlines: {underlinex}, skipping char underline')
        return document
    horizontals = serializeraw.load_horizontals(underlinex, pages=pages)
    for loaded in horizontals:
        page_underlines = loaded.content
        page = utilo.select_page(document.pages, page=loaded.page)
        if not page:
            # no such page in the document, nothing to mark
            continue
        for underline in page_underlines:
            for container in page:
                if underlined(container.box, underline.box):
                    # TODO: REMOVE APPEND AFTER SHRINKING TEXTCONTAINER TO
                    # SINGLE LINE
                    for char in utilo.flat(container, append=True):
                        char.underline = True
                    # one container per underline is enough
                    break
    return document
47
+
48
+
49
def underlined(text: utilo.Rectangle, horizontal: utilo.Rectangle) -> bool:
    """Decide whether `horizontal` underlines the rectangle `text`.

    Three conditions must hold:
        1. `horizontal[1]` lies strictly between `text[1]` and `text[3]`
        2. `horizontal[1]` is near `text[3]` (tolerance 3.0)
        3. index 0 and index 2 of both rectangles are near each other
           (tolerance 5.0 each)

    NOTE(review): indices look like (x0, y0, x1, y1) - confirm against
    `utilo.Rectangle`.
    """
    # TODO: SUPPORT CROSSED ETC.
    if not text[1] < horizontal[1] < text[3]:
        return False
    if not utilo.near(expected=text[3], current=horizontal[1], diff=3.0):
        return False
    # start and end of horizontal and text matches
    starts_match = utilo.near(text[0], horizontal[0], diff=5.0)
    ends_match = utilo.near(text[2], horizontal[2], diff=5.0)
    return bool(starts_match & ends_match)
rawmaker/parameter.py ADDED
@@ -0,0 +1,130 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import dataclasses
11
+
12
+ import configos
13
+ import pdfminer.layout
14
+ import utilo
15
+
16
# default value of `ParsingConfiguration.strip`
STRIP = True

# Ready-made layout parameters.
# NOTE(review): the name suggests that text is grouped line by line - confirm.
ONELINE = pdfminer.layout.LAParams(
    line_margin=0.001,
    char_margin=100.0,
    boxes_flow=1.0,
)
23
+
24
+
25
@dataclasses.dataclass
class ParsingConfiguration:
    """Layout settings and the `strip` switch of a parsing run.

    All fields except `strip` are forwarded to `pdfminer.layout.LAParams`
    by `laparams`. `str()` and `repr()` both give the `cmdline` form.
    """

    boxes_flow: float = 0.5
    char_margin: float = 2.0
    line_margin: float = 0.5
    line_overlap: float = 0.5
    word_margin: float = 0.1
    detect_vertical: bool = True
    strip: bool = STRIP

    def cmdline(self) -> str:
        """Render the configuration as command line flags.

        Booleans become `--name` when set, `strip` becomes `--nostrip`
        when unset, everything else becomes `--name=value`.
        """
        flags = []
        for name, value in vars(self).items():
            if name == 'strip':
                # stripping is not spelled out, only its negation
                if not value:
                    flags.append('--nostrip')
            elif isinstance(value, bool):
                if value:
                    flags.append(f'--{name}')
            else:
                flags.append(f'--{name}={value}')
        return utilo.from_tuple(flags)

    def laparams(self) -> pdfminer.layout.LAParams:
        """Create the pdfminer layout parameters from this configuration."""
        return pdfminer.layout.LAParams(
            boxes_flow=self.boxes_flow,
            char_margin=self.char_margin,
            detect_vertical=self.detect_vertical,
            line_margin=self.line_margin,
            line_overlap=self.line_overlap,
            word_margin=self.word_margin,
        )

    @classmethod
    def from_dict(cls, **kwargs):
        """Create a configuration from keyword arguments.

        `nostrip` sets the inverted `strip`; keys which are no attribute
        of the configuration are ignored.

        >>> ParsingConfiguration.from_dict()
        --boxes_flow=...--char_margin=...--line_margin=...--line_overlap=...
        """
        instance = cls()
        for key, value in kwargs.items():
            if key == 'nostrip':
                instance.strip = not value
            elif hasattr(instance, key):
                setattr(instance, key, value)
        return instance

    def __str__(self):
        return self.cmdline()

    def __repr__(self):
        return str(self)
81
+
82
+
83
# char margin of the default layout, declared via `configos` (default 1.1)
LAYOUT_CHAR_MARGIN = configos.HV_FLOAT_PLUS(default=1.1)

# default layout: `ParsingConfiguration` defaults with the char margin above
LAYOUT = ParsingConfiguration(char_margin=LAYOUT_CHAR_MARGIN)
86
+
87
+
88
def from_config(config: ParsingConfiguration) -> pdfminer.layout.LAParams:
    """Build pdfminer layout parameters from `config`.

    Args:
        config: source of the layout values; if falsy, the built-in
            defaults below are used instead
    Returns:
        `LAParams` holding the selected values.
    """
    settings = {
        'boxes_flow': 0.5,
        'char_margin': 2.0,
        'detect_vertical': True,
        'line_margin': 0.5,
        'line_overlap': 0.5,
        'word_margin': 0.1,
    }
    if config:
        # replace every default with the configured value
        settings = {name: getattr(config, name) for name in settings}
    return pdfminer.layout.LAParams(**settings)
111
+
112
+
113
def print_layout(layout: ParsingConfiguration = None):
    """Log the numeric layout values of `layout` on a single line.

    The configuration is converted with `from_config` first. A missing
    `layout` triggers an assertion.
    """
    assert layout, 'missing layout'
    params = from_config(layout)
    level = utilo.Level.INFORMATION
    shown = (
        'boxes_flow',
        'char_margin',
        'line_margin',
        'line_overlap',
        'word_margin',
    )
    values = [(name, getattr(params, name)) for name in shown]
    utilo.log(' layout:', end=' ', level=level)
    for pair in values:
        utilo.log('%s %.2f' % pair, end=' ', level=level)  # pylint:disable=C0209
    utilo.log(level=level)  # newline
@@ -0,0 +1,8 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
@@ -0,0 +1,79 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import contextlib
11
+
12
+ import pdfminer.layout
13
+ import pdfminer.pdffont
14
+
15
+
16
class PatchedLTChar(pdfminer.layout.LTChar):
    """`LTChar` which additionally stores font size, rise and font flags."""

    # pylint:disable=R0913
    def __init__(self, matrix, font, fontsize, scaling, rise, text, textwidth,
                 textdisp, ncs, graphicstate):
        super().__init__(matrix, font, fontsize, scaling, rise, text, textwidth,
                         textdisp, ncs, graphicstate)
        size = fontsize
        if fontsize == 1:
            # HACK: CLARIFY WHAT IS CORRECT, we need more generated pdf
            # examples
            # HQEOGA+Arial in bachelor37
            # self.fontsize = self.size
            # matrix[0] normal char
            # matrix[1] rotated char
            size = matrix[0] or matrix[1]
            assert size, matrix
        self.fontsize = size
        self.rise = rise
        self.flags = font.flags

    def __getitem__(self, index):
        """Return component `index` of the bounding box."""
        return self.bbox[index]

    @property
    def text(self):
        """Shortcut for `get_text()`."""
        return self.get_text()
44
+
45
+
46
def vertical(item: pdfminer.layout.LTChar) -> bool:
    """Tell whether `item` is not upright.

    Returns:
        `True` if `item.upright` is falsy, `False` if it is truthy and
        `None` if `item` has no `upright` attribute at all.
    """
    try:
        return not item.upright
    except AttributeError:
        return None
53
+
54
+
55
def render_char(  # pylint:disable=W9015,W9016
        self, matrix, font, fontsize, scaling, rise, cid, ncs,
        graphicstate) -> PatchedLTChar:
    """Replacement for the char rendering which creates `PatchedLTChar`.

    The created char is added to `self.cur_item`.

    Args:
        cid(int): character number id - ASCII
        ncs(PDFColorSpace): color space of document
    Returns:
        The `adv` value of the created char.
        NOTE(review): the return annotation names the char type, but the
        code returns `item.adv` - confirm which one is intended.
    """
    try:
        text = font.to_unichr(cid)
        assert isinstance(text, str), type(text)
    except pdfminer.pdffont.PDFUnicodeNotDefined:
        # no unicode mapping, let the device decide how to represent it
        text = self.handle_undefined_char(font, cid)

    # patch to document font size and rise
    char = PatchedLTChar(
        matrix,
        font,
        fontsize,
        scaling,
        rise,
        text,
        font.char_width(cid),
        font.char_disp(cid),
        ncs,
        graphicstate,
    )
    self.cur_item.add(char)
    return char.adv
rawmaker/reader.py ADDED
@@ -0,0 +1,97 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+
10
+ import sys
11
+ from contextlib import contextmanager
12
+ from os.path import exists
13
+ from os.path import isfile
14
+
15
+ import utilo
16
+ from pdfminer.pdfdocument import PDFDocument
17
+ from pdfminer.pdfdocument import PDFEncryptionError
18
+ from pdfminer.pdfdocument import PDFSyntaxError
19
+ from pdfminer.pdfparser import PDFParser
20
+
21
+ # from rawmaker.error import InvalidPDF
22
+ # from rawmaker.error import PDFParserImplementationError
23
+
24
+
25
@contextmanager
def read(path: str, password: str = None, verify: bool = True) -> PDFDocument:
    """Open pdf from `path`.

    Args:
        path(str): path to pdf-file
        password(str): optional password to extract encrypted data
        verify(bool): ensure that file starts with `%PDF-`
    Raises:
        TextExtractNotAllowed: if no extraction is allowed - currently disabled
        FileNotFoundError: `path` does not exists
        ValueError: `path` is not a file
        SystemExit: `verify` is set and the file does not start with `%PDF-`
    Yields:
        PDFDocument: open pdf file
    """
    if not exists(path):
        raise FileNotFoundError(f'Path does not exists: {path}')
    if not isfile(path):
        raise ValueError(f'Read requires an pdf document, not {path}')
    if verify:
        # read the magic bytes inside a `with`, so the handle is closed
        # right away instead of staying open until garbage collection
        with open(path, 'rb') as head:
            header = head.read(5)
        if header != b'%PDF-':
            # TODO: MOVE TO def before() method after upgrading utilo
            utilo.error('invalid pdf header')
            sys.exit(1)
    # the file must stay open while the caller works with the document
    with open(path, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = open_document(parser, password)
        yield document
57
+
58
+
59
def open_document(parser: PDFParser, password: str) -> PDFDocument:
    """Open the pdf document behind `parser`.

    Hint:
        The fallback mode is very slow, therefore the document is opened
        without fallback first. Only if this attempt ends with a
        `PDFSyntaxError`, a second attempt with fallback is started.

    The process exits with code 1 on encryption errors, with 3 if the
    fallback attempt ends with a syntax error and with 2 on any other
    exception.
    """
    password = '' if password is None else password
    try:
        return PDFDocument(parser, password, fallback=False)
    except PDFSyntaxError:
        pass  # try with fallback again
    except PDFEncryptionError as encryption:
        utilo.error('encryption not supported')
        utilo.debug(encryption)
        sys.exit(1)
    except Exception:  # pylint:disable=broad-except
        utilo.print_stacktrace()
        sys.exit(2)
        # raise PDFParserImplementationError(path) from exc

    try:
        utilo.info('try to use `fallback` pdf loader')
        return PDFDocument(parser, password, fallback=True)
    except PDFSyntaxError:
        utilo.print_stacktrace()
        sys.exit(3)
        # raise InvalidPDF(path) from exc
    except Exception:  # pylint:disable=broad-except
        # raise PDFParserImplementationError(path) from exc
        utilo.print_stacktrace()
        sys.exit(2)
@@ -0,0 +1,8 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
rawmaker/text/chars.py ADDED
@@ -0,0 +1,24 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import iamraw
11
+
12
+ import rawmaker.features.text
13
+ import rawmaker.miner.char
14
+ import rawmaker.reader
15
+
16
+
17
def extract_chars(document: str, pages: tuple = None) -> iamraw.Document:
    """Extract `document` using the `CharPDFConvert` converter.

    Args:
        document: must be a `str`, checked by an assertion
        pages: optional page selection forwarded to the extraction
    Returns:
        Result of `rawmaker.features.text.extract_document`.
    """
    assert isinstance(document, str), str(document)
    return rawmaker.features.text.extract_document(
        document,
        pages=pages,
        converter=rawmaker.miner.char.CharPDFConvert,
    )
rawmaker/text/data.py ADDED
@@ -0,0 +1,47 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import dataclasses
11
+
12
+ import iamraw
13
+ import utilo
14
+
15
+
16
@dataclasses.dataclass
class WordBoxPage:
    """Bounding boxes of one page; `len()` and indexing act on `content`."""

    # boxes collected for the page, empty by default
    content: iamraw.BoundingBoxes = dataclasses.field(default_factory=list)
    # page number, `None` until it is assigned
    page: int = None

    def __len__(self):
        return len(self.content)  # pylint:disable=E1136

    def __getitem__(self, index):
        return self.content[index]  # pylint:disable=E1136


WordBoxPages = list[WordBoxPage]
30
+
31
+
32
@dataclasses.dataclass
class PageLines:
    """Lines of a page; `len()` and indexing act on `lines`.

    `str()` prints the number of lines followed by the text of each
    line: the `text` of all chars of a word is concatenated, the words
    of a line are separated by a blank.
    """

    lines: list = dataclasses.field(default_factory=list)

    def __len__(self):
        return len(self.lines)  # pylint:disable=E1136

    def __getitem__(self, index):
        return self.lines[index]  # pylint:disable=E1136

    def __str__(self):
        rendered = []
        for line in self.lines:  # pylint:disable=E1133
            words = [''.join([char.text for char in word]) for word in line]
            rendered.append(' '.join(words))
        lines = utilo.NEWLINE.join(rendered)
        return f'PageLines: {len(self.lines)}\n{lines}'
@@ -0,0 +1,91 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2021-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import os
11
+
12
+ import iamraw
13
+ import pdflog
14
+ import serializeraw
15
+ import utilo
16
+
17
+ import rawmaker.parameter
18
+
19
+
20
def superfast(
    document: str,
    config: rawmaker.parameter.ParsingConfiguration,
    workdir: str,
    pages: list = None,
) -> iamraw.Document:
    """Extract text by running `rawmaker` command lines in parallel.

    The pages are split into chunks of 10. One command per chunk is
    passed to `utilo.run_parallel`; afterwards the chunk results are
    combined by `merge_document`.

    Args:
        document: pdf to extract, inserted after `-i`
        config: layout configuration, appended in its `cmdline` form
        workdir: output directory of the commands and source of the merge
        pages: pages to extract; by default all pages counted by `pdflog`
    Returns:
        The merged document.

    NOTE(review): `document` and `workdir` are inserted into the command
    without quoting - verify how `utilo.run_parallel` splits the command.
    """
    if pages is None:
        pages = utilo.make_tuple(pdflog.pagecount(document))
    chunks = utilo.chunks(pages, size=10)
    parameter = config.cmdline()
    commands = []
    for number, chunk in enumerate(chunks):
        selection = utilo.from_tuple(chunk, separator=',')
        command = (f'rawmaker -i {document} -o {workdir} --prefix {number}'
                   f' --text --pages {selection} {parameter}')
        utilo.log(command)
        commands.append(command)
    # run in parallel
    completed = utilo.run_parallel(commands, workdir, worker=12)
    assert completed == utilo.SUCCESS, completed
    # merge document
    return merge_document(workdir, len(chunks))
44
+
45
+
46
def merge_document(path: str, size: int) -> iamraw.Document:
    """Merge the chunk files of an extraction into one document.

    For each of the `size` chunks a text file and a positions file are
    loaded from `path` and removed afterwards. The loaded positions are
    written to the text containers before all pages are collected in a
    new document. A little bit dirty, but ok for now. XXX
    """
    numbers = range(size)
    text_files = [
        os.path.join(path, f'rawmaker__{item}_text_text.yaml')
        for item in numbers
    ]
    posi_files = [
        os.path.join(path, f'rawmaker__{item}_text_positions.yaml')
        for item in numbers
    ]

    # load everything before the first file is removed
    text = [serializeraw.load_document(item) for item in text_files]
    positions = [serializeraw.load_textpositions(item) for item in posi_files]

    for obsolete in text_files + posi_files:
        utilo.info(f'remove {obsolete}')
        utilo.file_remove(obsolete)

    for chunk, located in zip(text, positions):
        for page in chunk:
            containers = (
                item for item in page
                if isinstance(item, iamraw.TextContainer)
            )
            # positions are numbered by text container, other items
            # do not consume an index
            for index, container in enumerate(containers):
                # bounding, mean
                bounding, mean = utilo.select_page(located, page.page).content[index]  # yapf:disable
                fake_text_mean_height(container, bounding, mean)
                container.box = bounding

    merged = iamraw.Document(dimension=text[0].dimension)
    for chunk in text:
        for page in chunk:
            merged.append(page)
    return merged
84
+
85
+
86
def fake_text_mean_height(item, bounding, mean):
    """Replace the box of every char in `item.lines` with a faked one.

    The faked box spans from `bounding.y1 - mean` to `bounding.y1`; both
    horizontal values are 0.
    """
    # TODO: REMOVE THIS HACK LATER
    chars = (char for line in item.lines for char in line)
    for char in chars:
        # Fake mean char height
        char.box = iamraw.BoundingBox(0, bounding.y1 - mean, 0, bounding.y1)
@@ -0,0 +1,95 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import configos
11
+ import utilo
12
+
13
+ import rawmaker.text.chars
14
+ import rawmaker.text.data
15
+
16
# Pairs of (6..48, limit); `merge_line` looks the table up with the font
# size of a char and starts a new word if the gap exceeds the result.
# NOTE(review): how values between the listed keys are resolved depends
# on `configos.HolyTable` - confirm.
DIFF_MAX = configos.HolyTable(
    items=[
        (6, 1.0),
        (12, 1.0),
        (16, 1.0),
        (22, 10.0),
        (24, 10.0),
        (48, 15.0),
    ],
)
24
+
25
+
26
def parses(
    source: str,
    pages: tuple,
    word_length_min: int = 1,
    difftable: configos.HolyTable = DIFF_MAX,
) -> rawmaker.text.data.WordBoxPages:
    """Determine the word boxes of the selected `pages` of `source`.

    Args:
        source: document passed to `rawmaker.text.chars.extract_chars`
        pages: page selection passed to the char extraction
        word_length_min: forwarded to `extract_page`
        difftable: forwarded to `extract_page`
    Returns:
        One `WordBoxPage` per extracted page, carrying its page number.
    """
    extracted = rawmaker.text.chars.extract_chars(source, pages)
    boundings = [
        wordbox_boundings(
            extract_page(
                page,
                word_length_min=word_length_min,
                difftable=difftable,
            )) for page in extracted
    ]
    # adjust page numbers
    for page, bounding in zip(extracted, boundings):
        bounding.page = page.page
    return boundings
46
+
47
+
48
def extract_page(
    page,
    word_length_min: int = 1,
    difftable: configos.HolyTable = DIFF_MAX,
) -> rawmaker.text.data.PageLines:
    """Group the items of `page` into lines of words.

    Items whose text is only white space are dropped. The remaining
    items are clustered by `bbox[3]`, the clusters are sorted by
    `center[3]`, the items of a line by index 0, and each line is split
    into words by `merge_line`.
    """
    # remove white space
    visible = [item for item in page if item.get_text().strip()]

    clustered = utilo.same_line_cluster(
        visible,
        min_elements=word_length_min,  # support single chars
        matcher=lambda x: x.bbox[3],
    )
    # ensure top bottom
    ordered = sorted(clustered, key=lambda bounding: bounding.center[3])  # y1

    # ensure left right (x0) before splitting a line into words
    result = [
        merge_line(sorted(line, key=lambda x: x[0]), difftable=difftable)
        for line in ordered
    ]
    return rawmaker.text.data.PageLines(lines=result)
71
+
72
+
73
def merge_line(line, difftable: configos.HolyTable):
    """Split the chars of `line` into groups of neighbouring chars.

    Args:
        line: chars in reading order; index 0 and index 2 of a char are
            compared, `fontsize` is used for the table lookup
        difftable: called with the font size of a char, returns the
            largest gap which keeps the char in the current group
    Returns:
        List of groups, each group is a list of chars. An empty `line`
        gives an empty list.
    """
    if not line:
        return []
    gaps = [
        right[0] - left[2] for left, right in zip(line[0:-1], line[1:])
    ]
    groups = [[line[0]]]
    for char, gap in zip(line[1:], gaps):
        if gap <= difftable(char.fontsize):
            # close enough, continue the current group
            groups[-1].append(char)
            continue
        groups.append([char])
    return groups
86
+
87
+
88
def wordbox_boundings(page) -> rawmaker.text.data.WordBoxPage:
    """Compute one box per word of `page`.

    The box of a word is `utilo.rect_max` of the `bbox` of its chars.
    The boxes are collected line by line; the page number of the result
    is not set here.
    """
    boxes = [
        utilo.rect_max([char.bbox for char in word])
        for line in page
        for word in line
    ]
    return rawmaker.text.data.WordBoxPage(content=boxes)
rawmaker/utils.py ADDED
@@ -0,0 +1,44 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2021-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import contextlib
11
+
12
+
13
def resolve(reference):
    """Return `reference.resolve()` or `reference` itself.

    The input is returned unchanged if it has no `resolve` attribute;
    an `AttributeError` raised inside `resolve()` has the same effect.
    """
    try:
        return reference.resolve()
    except AttributeError:
        return reference


# Codecs in the order they are tried.
# NOTE(review): cp1252 accepts nearly every byte sequence, therefore utf8
# is hardly ever reached while decoding - confirm that this is intended.
ENCODINGS = 'ascii cp1252 utf8 '.split()


def guess_decoding(text: bytes) -> str:
    r"""Decode `text` with the first codec of `ENCODINGS` which succeeds.

    `text` is passed through `resolve` first.

    Returns:
        The decoded string or `None` if no codec can decode `text`.

    >>> guess_decoding(b'http://road.cc/measure-\x96-smart-street')
    'http://road.cc/measure-–-smart-street'
    """
    raw = resolve(text)
    for encoding in ENCODINGS:
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    return None


def guess_encoding(text: str) -> bytes:
    """Encode `text` with the first codec of `ENCODINGS` which succeeds.

    The function takes a string and returns bytes; it is the
    counterpart of `guess_decoding`.

    Returns:
        The encoded bytes or `None` if no codec can encode `text`.
    """
    for encoding in ENCODINGS:
        try:
            return text.encode(encoding)
        except UnicodeEncodeError:
            continue
    return None