rawmaker 2.40.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. letty/__init__.py +46 -0
  2. letty/cli.py +63 -0
  3. letty/optimizer.py +138 -0
  4. letty/quality/__init__.py +8 -0
  5. letty/quality/whitespace.py +50 -0
  6. letty/strategy.py +8 -0
  7. rawmaker/__init__.py +29 -0
  8. rawmaker/__main__.py +13 -0
  9. rawmaker/__patch__.py +36 -0
  10. rawmaker/cli.py +206 -0
  11. rawmaker/cli_automate.py +69 -0
  12. rawmaker/converter/__init__.py +8 -0
  13. rawmaker/converter/basic.py +174 -0
  14. rawmaker/converter/images.py +168 -0
  15. rawmaker/date.py +83 -0
  16. rawmaker/destination.py +202 -0
  17. rawmaker/error.py +34 -0
  18. rawmaker/features/__init__.py +138 -0
  19. rawmaker/features/annotation.py +254 -0
  20. rawmaker/features/border.py +172 -0
  21. rawmaker/features/boxes.py +153 -0
  22. rawmaker/features/figures.py +24 -0
  23. rawmaker/features/fonts.py +229 -0
  24. rawmaker/features/formula.py +16 -0
  25. rawmaker/features/horizontals.py +132 -0
  26. rawmaker/features/images.py +155 -0
  27. rawmaker/features/line.py +337 -0
  28. rawmaker/features/outlines.py +123 -0
  29. rawmaker/features/text.py +91 -0
  30. rawmaker/fonts/__init__.py +8 -0
  31. rawmaker/fonts/parser.py +354 -0
  32. rawmaker/images/__init__.py +8 -0
  33. rawmaker/images/info.py +35 -0
  34. rawmaker/miner/__init__.py +8 -0
  35. rawmaker/miner/char.py +42 -0
  36. rawmaker/miner/colorspace.py +75 -0
  37. rawmaker/miner/images.py +448 -0
  38. rawmaker/miner/position.py +121 -0
  39. rawmaker/miner/rawchar.py +207 -0
  40. rawmaker/miner/text.py +833 -0
  41. rawmaker/miner/underline.py +66 -0
  42. rawmaker/parameter.py +130 -0
  43. rawmaker/patch/__init__.py +8 -0
  44. rawmaker/patch/ltchar.py +79 -0
  45. rawmaker/reader.py +97 -0
  46. rawmaker/text/__init__.py +8 -0
  47. rawmaker/text/chars.py +24 -0
  48. rawmaker/text/data.py +47 -0
  49. rawmaker/text/superfast.py +91 -0
  50. rawmaker/text/wordbox.py +95 -0
  51. rawmaker/utils.py +44 -0
  52. rawmaker-2.40.3.dist-info/METADATA +51 -0
  53. rawmaker-2.40.3.dist-info/RECORD +63 -0
  54. rawmaker-2.40.3.dist-info/WHEEL +5 -0
  55. rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
  56. rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
  57. rawmaker-2.40.3.dist-info/top_level.txt +3 -0
  58. spacestation/__init__.py +18 -0
  59. spacestation/cli.py +51 -0
  60. spacestation/features/__init__.py +8 -0
  61. spacestation/features/chardist.py +85 -0
  62. spacestation/features/worddist.py +57 -0
  63. spacestation/features/wspace.py +130 -0
@@ -0,0 +1,174 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import contextlib
11
+
12
+ import iamraw
13
+ import pdfminer.converter
14
+ import pdfminer.layout
15
+ import pdfminer.pdfinterp
16
+ import pdfminer.pdfpage
17
+ import utilo
18
+
19
+
20
class FlippedLayoutAnalyzer(pdfminer.converter.PDFLayoutAnalyzer):
    """Layout analyzer that flips the y-coordinates of every analyzed item.

    The analyzer creates its own `PDFResourceManager`; it is reachable
    through the `resources` property.
    """

    def __init__(self, laparams=None, pageno=0):
        manager = pdfminer.pdfinterp.PDFResourceManager()
        super().__init__(rsrcmgr=manager, pageno=pageno, laparams=laparams)

    def receive_layout(self, ltpage):
        """Analyze `ltpage`, flip its items and drop items without bounding."""
        if content_inside_single_figure(ltpage):
            # promote the children of the single figure container to the page
            container = ltpage._objs[0]  # pylint:disable=W0212
            ltpage._objs = container._objs  # pylint:disable=W0212
        # without explicit parameters use the default layout, this is the
        # case for the image extractor
        layout = self.laparams or pdfminer.layout.LAParams()
        ltpage.analyze(layout)
        for child in ltpage:
            flip_object(child, ltpage)
        for child in ltpage:
            child.bbox = figure_bounding(child)
        # remove invisible objects, `figure_bounding` marked them with None
        visible_children = [
            child for child in ltpage if child.bbox is not None
        ]
        ltpage._objs = visible_children  # pylint:disable=W0212

    def handle_undefined_char(self, font, cid) -> str:
        """Return a replacement for a character id which has no unicode.

        `MAPPING` is consulted first; unknown ids fall back to `chr(cid)`.
        """
        # TODO: CHECK AFTER UPGRADING PDFMINER
        # TODO: FIX PAGE NUMBER
        try:
            replacement = MAPPING[cid]
        except KeyError:
            utilo.error(f'could not convert: {font!r}, {cid!r} '
                        f'on page: {self.pageno} no backup char defined')
            # use warning to log only once
            utilo.warning(str(vars(font)))
            replacement = chr(cid)
        else:
            utilo.debug(f'could not convert: {font!r}, {cid!r} use backup: '
                        f'{replacement} on page: {self.pageno}')
        return replacement

    @property
    def resources(self):
        """Resource manager created in `__init__`."""
        return self.rsrcmgr
63
+
64
+
65
def content_inside_single_figure(page) -> bool:
    """Check if the whole page content is wrapped in one figure.

    Some pdf printers write all page content to a single figure. If all
    content is in a single figure, no text extraction is possible.

    Returns True only if the page holds exactly one `LTFigure` and this
    figure does not hold exactly one child.
    """
    children = page._objs  # pylint:disable=W0212
    if len(children) != 1:
        return False
    only_child = children[0]
    if not isinstance(only_child, pdfminer.layout.LTFigure):
        return False
    # a figure with exactly one child is an image container, see master116
    # page,2,3. This works fine.
    return len(only_child._objs) != 1  # pylint:disable=W0212
80
+
81
+
82
# REMOVE HACK LATER
# Backup characters for character ids which could not be converted, used by
# `FlippedLayoutAnalyzer.handle_undefined_char`.
# Font: UAZWCW+CMR10
MAPPING = {
    0: '−',
    1: '·',
    12: 'fi',
    13: 'fl',
    14: 'ffi',
    # Acute accent as in "Fern´andez" or "´Ecole": may be supported later,
    # for now removing the accent is a good match.
    # For more infos see master110p106
    # 19: '´',
    19: '',
    20: '≤',
    25: 'ß',
    # Umlaute (oe, ae, ue): the already implemented replace is used to
    # support umlaute
    127: '¨',
}
99
+
100
+
101
def flip_object(item, page):
    """Mirror the bounding of `item` and all of its children vertically.

    The y-values are replaced by `page.height - y` and the result is
    rounded with `utilo.roundme`. Items without `bbox` are left untouched.
    """
    try:
        coords = list(item.bbox)
    except AttributeError:
        # VirtualChar for example
        return
    height = page.height
    old_y0, old_y1 = coords[1], coords[3]
    coords[1] = height - old_y1
    coords[3] = height - old_y0
    coords = utilo.roundme(coords)  # pylint:disable=R0204
    try:
        item.bbox = iamraw.BoundingBox(*coords)
    except AssertionError:
        utilo.debug(f'invalid bounding on page {page}: {coords}')
        utilo.debug(item)
    # NOTE(review): assumed to run unconditionally (not only after a failed
    # BoundingBox creation) - confirm against the original indentation.
    item.x0, item.y0, item.x1, item.y1 = coords
    with contextlib.suppress(AttributeError):
        # containers: flip the children, too
        for child in item._objs:  # pylint:disable=W0212
            flip_object(child, page)
119
+
120
+
121
class PageAggregator(FlippedLayoutAnalyzer):
    """Analyzer which remembers the most recently received page layout."""

    def __init__(self, laparams=None):
        super().__init__(laparams=laparams)
        # no page received yet
        self.result = None

    def receive_layout(self, ltpage):
        """Process `ltpage` as the parent does and store it as result."""
        super().receive_layout(ltpage)
        self.result = ltpage

    def get_result(self):
        """Return the last processed page, None if there was none."""
        return self.result
133
+
134
+
135
def figure_bounding(figure) -> tuple:
    """Strip the bounding of a figure down to its visible content.

    The bounding of some badly printed figures is too large. For a figure
    the maximal rectangle of all visible children is returned, nested
    figures are resolved recursively. Non-figures return their own `bbox`.

    Empty figures must return None
    >>> assert figure_bounding(pdfminer.layout.LTFigure('empty', (10, 10, 50, 50),
    ... (1, 1, 1, 1, 1, 1))) is None
    """
    figure_type = pdfminer.layout.LTFigure
    if not isinstance(figure, figure_type):
        return figure.bbox
    collected = []
    for child in figure:
        if not visible(child):
            continue
        if isinstance(child, figure_type):  # pylint:disable=W0160
            # figure inside a figure
            child_box = figure_bounding(child)
        else:
            child_box = child.bbox
        if child_box is not None:
            # None marks a hidden item
            collected.append(child_box)
    if not collected:
        return None
    return utilo.rect_max(collected)
161
+
162
+
163
def visible(item) -> bool:
    """Decide if `item` is painted.

    Items which lack one of the inspected attributes count as visible.
    """
    with contextlib.suppress(AttributeError):
        # TODO: INVESTIGATE THIS
        if item.linewidth:
            return True
        if item.fill:
            # filled items are visible unless the even-odd rule is used
            return not item.evenodd
        if not (item.stroking_color or item.non_stroking_color):
            # neither stroking nor non-stroking color
            return False
    return True
@@ -0,0 +1,168 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+
10
+ import configos
11
+ import pdfminer.converter
12
+ import pdfminer.image
13
+ import pdfminer.layout
14
+ import pdfminer.pdfdocument
15
+ import pdfminer.pdfinterp
16
+ import pdfminer.pdftypes
17
+ import pdfminer.psparser
18
+ import utilo
19
+
20
+ import rawmaker.converter.basic
21
+
22
+
23
class ImageConverter(rawmaker.converter.basic.FlippedLayoutAnalyzer):
    """Layout device which passes the images of a page to `imagewriter`.

    `imagewriter` is called as `imagewriter(pageid, image)`.
    """

    def __init__(self, imagewriter, firstpage: int):
        super().__init__(pageno=firstpage)
        assert callable(imagewriter), imagewriter
        self.imagewriter = imagewriter
        # TODO avoid duplicated parsed, check if we require this?
        self.parsed = utilo.Single()

    def receive_layout(self, ltpage):
        """Flip the page content and render every top level item."""
        super().receive_layout(ltpage)
        pageid = ltpage.pageid
        for child in ltpage:
            self.render_pagecontent(pageid, child)

    def render_pagecontent(self, pageid, item):
        """Collect all imageable items"""
        if isinstance(item, pdfminer.layout.LTImage):
            self.render_result_image(item, pageid=pageid)
            return
        if isinstance(item, pdfminer.layout.LTFigure):
            self.render_figure(item, pageid=pageid)

    def render_result_image(
        self,
        image: pdfminer.layout.LTImage,
        pageid: int,
    ):
        """Write `image` unless `self.parsed` reports it as contained."""
        # add pageid to ensure that equal image names from different pages
        # are not handled as the same image.
        key = f'{pageid}_{image.name}'
        if not self.parsed.contains(key):
            self.imagewriter(pageid, image)

    def render_figure(
        self,
        item: pdfminer.layout.LTFigure,
        pageid: int,
    ):
        """Render the single image of a figure, if there is one."""
        # TODO: RENDER CURVES ETC.
        children = item._objs  # pylint:disable=W0212
        wrapped = (len(children) == 1 and
                   isinstance(children[0], pdfminer.layout.LTFigure))
        if wrapped:
            # image inside figure
            children = children[0]._objs  # pylint:disable=W0212
        candidates = [
            child for child in children
            if isinstance(child, pdfminer.layout.LTImage)
        ]
        if not candidates:
            return
        assert len(candidates) == 1, str(candidates)
        # TODO: Investigate with list
        image = candidates[0]
        if not skipme(image):
            self.render_result_image(image, pageid)
78
+
79
+
80
# Threshold which `skipme` compares against the share of NUL bytes in the
# raw image stream.
# NOTE(review): `skipme` computes a fraction in [0, 1]; if this value is a
# percentage (default=50.0) the threshold is never reached - confirm what
# `configos.HV_PERCENT_PLUS` returns.
SKIPME_RATE_MIN = configos.HV_PERCENT_PLUS(default=50.0)
81
+
82
+
83
def skipme(image) -> bool:
    """Decide if `image` shall be skipped because it is mostly NUL bytes.

    Master31Page10 Black/White image is printed under figure caption.

    The share of `b'\\x00'` in `image.stream.rawdata` is compared against
    `SKIPME_RATE_MIN`. Images without raw data (empty or None) are never
    skipped by this heuristic, instead of failing with a division by zero.
    """
    # TODO: INVESTIGATE THIS HACK
    stream_raw = image.stream.rawdata
    if not stream_raw:
        # nothing to count: the heuristic is not applicable
        return False
    rate = stream_raw.count(b'\x00') / len(stream_raw)
    return bool(rate >= SKIPME_RATE_MIN)
94
+
95
+
96
class FastImageInterpreter(pdfminer.pdfinterp.PDFPageInterpreter):
    """Experimental, think about the sense of this ?optimization?.

    Only the operators listed in `self.fast` are executed, all other
    keywords of the content stream are ignored.
    """

    # TODO: SEE DOCSTRING
    # NOTE(review): 'Q' is handled but 'q' is not - confirm that this is
    # intended.
    # NOTE(review): operands of ignored keywords stay on the operand stack;
    # verify that this does not grow unbounded on large pages.

    def __init__(self, rsrcmgr, device):
        super().__init__(rsrcmgr, device)
        operators = (
            'CS',
            'Do',
            'EI',
            'MP',
            'Q',
            'SC',
            'SCN',
            'cm',
            'cs',
            'sc',
            'scn',
        )
        # operator name -> bound `do_<name>` handler
        self.fast = {name: getattr(self, f'do_{name}') for name in operators}

    # pylint:disable=W0613,R0201
    def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
                    graphicstate):
        """Do nothing."""
        # assert 0
        return

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Do nothing."""
        return

    def do_TJ(self, seq):
        """Do nothing."""
        return

    def execute(self, streams):  # pylint:disable=R1260
        """Run the operators of `streams` which are listed in `self.fast`."""
        try:
            parser = pdfminer.pdfinterp.PDFContentParser(streams)
        except pdfminer.psparser.PSEOF:
            # empty page
            return
        while True:
            try:
                _, token = parser.nextobject()
            except pdfminer.psparser.PSEOF:
                break
            if not isinstance(token, pdfminer.psparser.PSKeyword):
                # operand: keep it for the next operator
                self.push(token)
                continue
            handler = self.fast.get(pdfminer.psparser.keyword_name(token))
            if handler is None:
                # operator is not required for images
                continue
            # bound method: do not count `self`
            expected = handler.__code__.co_argcount - 1
            # nargs = six.get_function_code(func).co_argcount - 1
            if not expected:
                handler()
                continue
            operands = self.pop(expected)
            if len(operands) == expected:
                handler(*operands)
157
+
158
+
159
def create_fastimageextractor(imagelistener, firstpage: int):
    """Create a page interpreter which reports images to `imagelistener`.

    The interpreter writes to an `ImageConverter` device and shares the
    resource manager of this device.
    """
    # NOTE(review): the regular `PDFPageInterpreter` is created here, not
    # the `FastImageInterpreter` - presumably on purpose while the latter is
    # experimental; confirm.
    converter = ImageConverter(imagewriter=imagelistener, firstpage=firstpage)
    return pdfminer.pdfinterp.PDFPageInterpreter(
        converter.resources,
        converter,
    )
rawmaker/date.py ADDED
@@ -0,0 +1,83 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """Date
10
+ ====
11
+
12
+ See Adobe PDF 2008 REF 7.9.4.
13
+
14
+ Format: (D:YYYYMMDDHHmmSSOHH'mm)
15
+ YYYY: Year
16
+ MM: Month(01-12)
17
+ DD: Day(01-31)
18
+ HH: Hour(0-23)
19
+ mm: minute(00-59)
20
+ SS: second(00-59)
21
+ O: + or -
22
+ HH: offset in hours
23
+ '
24
+ mm: offset in minutes
25
+
26
+ Parse and generate raw data again.
27
+
28
+ >>> time = "D:20160419072554+02'00"
29
+ >>> parsed = parse(time)
30
+
31
+ Convert parsed back to raw string
32
+
33
+ >>> raw(parsed)
34
+ "D:20160419072554+02'00"
35
+ """
36
+
37
+ import dataclasses
38
+ import re
39
+
40
+
41
@dataclasses.dataclass
class PDFDate:
    """Parsed PDF date, see `parse` and `raw`.

    `utc_hour` carries the sign of the offset: it is negative for offsets
    west of UTC.
    """
    year: int = None
    month: int = None
    day: int = None
    hour: int = None
    minute: int = None
    second: int = None
    utc_hour: int = None
    utc_minute: int = None


# D:YYYYMMDDHHmmSSOHH'mm, every part is required
PATTERN = (r'D:(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})'
           r'(?P<hour>\d{2})(?P<minute>\d{2})(?P<second>\d{2})(?P<sign>[+-])'
           r'(?P<utc_hour>\d{2})\'(?P<utc_minute>\d{2})')


def raw(date: PDFDate) -> str:
    """Convert `date` back to the raw pdf date string.

    >>> raw(PDFDate(2016, 4, 19, 7, 25, 54, -5, 30))
    "D:20160419072554-05'30"
    """
    sign = '+' if date.utc_hour >= 0 else '-'
    # the sign is written separately, therefore format the absolute value;
    # otherwise a negative offset ends up as `--5`
    offset_hour = abs(date.utc_hour)
    result = (f'D:{date.year:04d}{date.month:02d}{date.day:02d}'
              f'{date.hour:02d}{date.minute:02d}{date.second:02d}'
              f'{sign}{offset_hour:02d}\'{date.utc_minute:02d}')
    return result


def parse(item: str) -> PDFDate:
    """Parse ASN.1 date pattern.

    Returns None if `item` does not start with a complete date. A `-`
    offset is stored as negative `utc_hour`.

    >>> parse("D:20160419072554+02'00")
    PDFDate(year=2016, month=4, day=19, hour=7, minute=25, second=54, utc_hour=2, utc_minute=0)
    >>> parse("D:20160419072554-05'30").utc_hour
    -5
    """
    matched = re.match(PATTERN, item)
    if not matched:
        return None
    values = [
        'day', 'hour', 'minute', 'month', 'second', 'year', 'utc_hour',
        'utc_minute'
    ]
    data = {key: int(matched[key]) for key in values}
    result = PDFDate(**data)
    if matched['sign'] == '-':
        # negate the offset, not the local hour
        # NOTE: the sign of an offset below one hour (-00'30) can not be
        # represented by `utc_hour` and is lost.
        result.utc_hour = -result.utc_hour
    return result
@@ -0,0 +1,202 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2020-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """Destination
10
+ ===========
11
+
12
+ Hint: There is no direct link between `Annotation` and the real pdf page.
13
+ Therefore we have to extract the reference and link them with the
14
+ page-header to determine the real pdf page. See: `pageids` and
15
+ `solve_pageid.`
16
+
17
+ Types:
18
+
19
+ - Simple: Page Number is directly encoded
20
+ - Named: Solve `Destination` reference to determine page number
21
+ - Explicit: PDFPageReference is given
22
+
23
+ """
24
+
25
+ import contextlib
26
+ import dataclasses
27
+
28
+ import pdfminer.pdfpage
29
+ import pdfminer.pdftypes
30
+
31
+ import rawmaker.utils
32
+
33
+
34
class DestinationMixin:
    """Common base class of all destination types."""
36
+
37
+
38
@dataclasses.dataclass
class ExplicitDestination(DestinationMixin):
    """Destination given by a page and optional view parameters."""
    # page as given in the destination array
    page: int = None
    # view parameters, None if not given
    left: float = None
    top: float = None
    zoom: float = None
44
+
45
+
46
@dataclasses.dataclass
class ExternalLinkDestination(DestinationMixin):
    """Link which points to an external web resource.

    See: 12.6.4.7 URI Actions; PDF 2008
    """
    # value of the `URI` entry, stored as found
    hyperlink: str = None
53
+
54
+
55
@dataclasses.dataclass
class NamedDestination(DestinationMixin):
    """Destination which is given by the name of a reference."""
    # human readable name of the destination
    reference: str = None

    @property
    def pdf_reference(self) -> bytes:
        """Encode the human readable reference to a pdf reference.

        >>> NamedDestination('Kapitel 1').pdf_reference
        b'Kapitel 1'
        """
        return rawmaker.utils.guess_encoding(self.reference)
68
+
69
+
70
def parse(item) -> DestinationMixin:  # pylint:disable=R1260
    """\
    Convert `item` to the first destination type which accepts it.

    The parsers are tried in this order: hyperlink, fitr, simple, explicit
    and named. None is returned if no parser produces a result.

    A `null` value means that parameter shall be unchanged.

    # TODO: Change null later
    >>> parse([b'/null', 0.0, 0.0, 1.0]).page
    0
    """
    item = rawmaker.utils.resolve(item)
    # NOTE(review): `parse_named` reads `item.doc` but receives the already
    # resolved item - verify that `resolve` keeps this attribute.
    parsers = (
        parse_hyperlink,
        parse_fitr,
        parse_simple,
        parse_explict,
        parse_named,
    )
    for parser in parsers:
        destination = parser(item)
        if destination:
            return destination
    return None
93
+
94
+
95
def parse_hyperlink(item) -> ExternalLinkDestination:
    """\
    Create an external link out of a dictionary with an `URI` entry.

    Everything else, for example a string, returns None.

    >>> parse_hyperlink("{'S': /'URI', 'URI': b'http://www.helm.org/jst.pdf'}")
    """
    if isinstance(item, dict) and 'URI' in item:
        return ExternalLinkDestination(hyperlink=item['URI'])
    return None
106
+
107
+
108
def parse_simple(item) -> NamedDestination:
    """Convert a `GoTo` action to a named destination.

    The `D` entry of the action is decoded and used as reference. Items
    which are no dictionary or no `GoTo` action return None.

    >>> from pdfminer.psparser import PSLiteral as PS
    >>> parse_simple({'S': PS('GoTo'), 'D': b'FF'}).reference
    'FF'
    """
    if not isinstance(item, dict):
        return None
    action = item['S'].name
    if action != 'GoTo':
        # 12.6.4.2 Go-To Actions
        return None
    decoded = rawmaker.utils.guess_decoding(item['D'])
    return NamedDestination(reference=decoded)
124
+
125
+
126
def _fit_top(name, item):
    """Return the top coordinate of destination array `item`, else None."""
    if name == 'FitH' and len(item) >= 3:
        # [page /FitH top]
        return item[2]
    if name == 'XYZ' and len(item) >= 4:
        # [page /XYZ left top zoom], same layout as in `parse_explict`
        return item[3]
    if name == 'FitR' and len(item) >= 6:
        # [page /FitR left bottom right top]; some writers swap bottom and
        # top, therefore prefer the larger of both numbers
        numbers = [
            value for value in (item[3], item[5])
            if isinstance(value, (int, float))
        ]
        return max(numbers) if numbers else item[5]
    # `Fit` has no coordinates
    return None


def parse_fitr(item) -> ExplicitDestination:
    """\
    Parse a destination array of type `Fit`, `FitH`, `FitR` or `XYZ`.

    `item` is either the array itself or an action which holds the array
    as `D`. A missing page (None) is replaced by 0. `top` is taken from the
    position which belongs to the fit type; before it was always the third
    entry, which is `left` for `XYZ` and `FitR`. Arrays with less than two
    entries and unknown fit types return None.

    >>> from pdfminer.psparser import PSLiteral as PS
    >>> parse_fitr({'S': 'GoTo', 'D': [5, PS('FitR'), 0, 625, 440, 309]}).page
    5
    >>> parse_fitr({'D': [None, 'FitH', 3512], 'S': 'GoTo'})
    ExplicitDestination(page=0, left=None, top=3512, zoom=None)
    """
    item = rawmaker.utils.resolve(item)
    # {'S': /'GoTo', 'D': [0, /'FitR', 0, 625, 440, 309]}
    with contextlib.suppress(TypeError):
        item = item['D']
    # [0, /'FitR', 0, 625, 440, 309]
    if not isinstance(item, list) or len(item) < 2:
        return None
    kind = item[1]
    if isinstance(kind, float):
        # [/b'null', 0.0, 0.0, 1.0]
        return ExplicitDestination(page=int(kind))  # TODO: HACK?
    fit = ('Fit', 'FitH', 'FitR', 'XYZ')
    # the type is either a plain string or a literal with a `name`
    name = kind if kind in fit else getattr(kind, 'name', None)
    if name not in fit:
        return None
    pagenumber = item[0]
    if pagenumber is None:
        # TODO: CHANGE TO UNCHANGED/NONE
        pagenumber = 0
    return ExplicitDestination(page=pagenumber, top=_fit_top(name, item))
153
+
154
+
155
def parse_explict(item) -> ExplicitDestination:
    """Parse a destination array with exactly five entries.

    `item` is either the array `[page, type, left, top, zoom]` or an action
    which holds the array as `D`. Names (bytes) and arrays of a different
    length return None.
    """
    item = rawmaker.utils.resolve(item)
    try:
        item = item['D']
    except (KeyError, TypeError):
        # KeyError: ? add docs here ?
        # TypeError: item is already the requested list:
        # [34, /'XYZ', 72.4799999, 532.319999, 0]
        pass
    if isinstance(item, bytes):
        # {'S': /'GoTo', 'D': b'subsection.A.5.4'}
        return None
    try:
        page, _, left, top, zoom = item  # TODO: FLIP Y-Coordinate
    except ValueError:
        return None
    # a zoom without `name` is already a float; null means: do not change
    # current zoom
    # NOTE(review): compares against bytes, confirm that `name` is bytes.
    if getattr(zoom, 'name', None) == b'null':
        zoom = 0.0
    return ExplicitDestination(page=page, left=left, top=top, zoom=zoom)
181
+
182
+
183
def parse_named(item) -> ExplicitDestination:
    """Look up a named destination in `Dests` of the document of `item`.

    The name is `item` itself or its `D` entry; the entry which is found
    is parsed with `parse_explict`.
    """
    document = item.doc
    name = rawmaker.utils.resolve(item)
    try:
        name = name['D']
    except KeyError:
        # no action, use the resolved item as name
        pass
    target = document.lookup_name('Dests', name)
    return parse_explict(rawmaker.utils.resolve(target))
191
+
192
+
193
def pageids(path: str) -> dict:
    """Map the `pageid` of every page of the pdf `path` to its index.

    The index is zero-based and follows the order in which
    `PDFPage.get_pages` yields the pages.
    """
    with open(path, mode='rb') as pdf:
        pages = pdfminer.pdfpage.PDFPage.get_pages(
            pdf,
            check_extractable=False,
        )
        # consume the pages while the file is open
        return {page.pageid: index for index, page in enumerate(pages)}
rawmaker/error.py ADDED
@@ -0,0 +1,34 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+
10
+ from pdfminer.pdfdocument import PDFNoOutlines
11
+
12
+
13
class RawMakerError(Exception):
    """Base class of user errors and input errors.

    The command line tool handles exceptions of this type and reports
    them as clear error messages instead of backtraces.
    """
19
+
20
+
21
class TextExtractionNotAllowed(RawMakerError):
    """Text extraction is not allowed."""
23
+
24
+
25
class InvalidPDF(RawMakerError):
    """The input is not a valid pdf."""
27
+
28
+
29
class PDFParserImplementationError(RawMakerError):
    """Error in the implementation of the pdf parser."""
31
+
32
+
33
class MissingOutlines(RawMakerError, PDFNoOutlines):
    """Outlines are missing.

    Can be caught as `RawMakerError` and as pdfminer `PDFNoOutlines`.
    """