rawmaker 2.40.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. letty/__init__.py +46 -0
  2. letty/cli.py +63 -0
  3. letty/optimizer.py +138 -0
  4. letty/quality/__init__.py +8 -0
  5. letty/quality/whitespace.py +50 -0
  6. letty/strategy.py +8 -0
  7. rawmaker/__init__.py +29 -0
  8. rawmaker/__main__.py +13 -0
  9. rawmaker/__patch__.py +36 -0
  10. rawmaker/cli.py +206 -0
  11. rawmaker/cli_automate.py +69 -0
  12. rawmaker/converter/__init__.py +8 -0
  13. rawmaker/converter/basic.py +174 -0
  14. rawmaker/converter/images.py +168 -0
  15. rawmaker/date.py +83 -0
  16. rawmaker/destination.py +202 -0
  17. rawmaker/error.py +34 -0
  18. rawmaker/features/__init__.py +138 -0
  19. rawmaker/features/annotation.py +254 -0
  20. rawmaker/features/border.py +172 -0
  21. rawmaker/features/boxes.py +153 -0
  22. rawmaker/features/figures.py +24 -0
  23. rawmaker/features/fonts.py +229 -0
  24. rawmaker/features/formula.py +16 -0
  25. rawmaker/features/horizontals.py +132 -0
  26. rawmaker/features/images.py +155 -0
  27. rawmaker/features/line.py +337 -0
  28. rawmaker/features/outlines.py +123 -0
  29. rawmaker/features/text.py +91 -0
  30. rawmaker/fonts/__init__.py +8 -0
  31. rawmaker/fonts/parser.py +354 -0
  32. rawmaker/images/__init__.py +8 -0
  33. rawmaker/images/info.py +35 -0
  34. rawmaker/miner/__init__.py +8 -0
  35. rawmaker/miner/char.py +42 -0
  36. rawmaker/miner/colorspace.py +75 -0
  37. rawmaker/miner/images.py +448 -0
  38. rawmaker/miner/position.py +121 -0
  39. rawmaker/miner/rawchar.py +207 -0
  40. rawmaker/miner/text.py +833 -0
  41. rawmaker/miner/underline.py +66 -0
  42. rawmaker/parameter.py +130 -0
  43. rawmaker/patch/__init__.py +8 -0
  44. rawmaker/patch/ltchar.py +79 -0
  45. rawmaker/reader.py +97 -0
  46. rawmaker/text/__init__.py +8 -0
  47. rawmaker/text/chars.py +24 -0
  48. rawmaker/text/data.py +47 -0
  49. rawmaker/text/superfast.py +91 -0
  50. rawmaker/text/wordbox.py +95 -0
  51. rawmaker/utils.py +44 -0
  52. rawmaker-2.40.3.dist-info/METADATA +51 -0
  53. rawmaker-2.40.3.dist-info/RECORD +63 -0
  54. rawmaker-2.40.3.dist-info/WHEEL +5 -0
  55. rawmaker-2.40.3.dist-info/entry_points.txt +6 -0
  56. rawmaker-2.40.3.dist-info/licenses/LICENSE +21 -0
  57. rawmaker-2.40.3.dist-info/top_level.txt +3 -0
  58. spacestation/__init__.py +18 -0
  59. spacestation/cli.py +51 -0
  60. spacestation/features/__init__.py +8 -0
  61. spacestation/features/chardist.py +85 -0
  62. spacestation/features/worddist.py +57 -0
  63. spacestation/features/wspace.py +130 -0
@@ -0,0 +1,138 @@
1
+ #==============================================================================
2
+ # C O P Y R I G H T
3
+ #------------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ #==============================================================================
9
+
10
import collections
import collections.abc

import iamraw
import pdfminer.converter
import pdfminer.layout
import pdfminer.pdfdocument
import pdfminer.pdfinterp
import pdfminer.pdfpage
import utilo

import rawmaker.converter.basic
import rawmaker.miner.text
import rawmaker.parameter
23
+
24
# Result of one processed page: `content` is what the device returned for the
# page, `page` is the zero-based page index it belongs to.
PageContent = collections.namedtuple('PageContent', ['content', 'page'])
25
+
26
+
27
def create_interpreter(
    layout=None,
) -> tuple[
    pdfminer.pdfinterp.PDFPageInterpreter,
    rawmaker.converter.basic.PageAggregator,
]:
    """Create a page interpreter together with the device it renders into.

    Args:
        layout: layout parameters handed to `PageAggregator`; when falsy the
                result of `ParsingConfiguration().laparams()` is used.
    Returns:
        tuple(interpreter, device): the interpreter and the aggregator device
        which collects the interpreted page.
    """
    if not layout:
        layout = rawmaker.parameter.ParsingConfiguration().laparams()
    device = rawmaker.converter.basic.PageAggregator(laparams=layout)
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(
        device.resources,
        device,
    )
    return interpreter, device
36
+
37
+
38
def process_pdfpages(
    document: pdfminer.pdfdocument.PDFDocument,
    pages: tuple = None,
) -> collections.abc.Iterator[tuple[pdfminer.pdfpage.PDFPage, int]]:
    """Generate `(PDFPage, pagenumber)` for every selected page of
    `PDFDocument`.

    This is a generator, not a context manager.

    Args:
        document: open pdf file
        pages: numbers of the pages to process, if None every page is
               processed
    Yields:
        tuple(PDFPage, int): the pdf page and its zero-based page number.
        The same number is stored on the page as `pageid`.
    """
    utilo.call('process_pdfpages')
    utilo.asserts(document, pdfminer.pdfdocument.PDFDocument)
    create_pages = pdfminer.pdfpage.PDFPage.create_pages
    with utilo.SkipCollector(pages) as collector:
        for number, page in enumerate(create_pages(document)):
            if collector.skip(number):
                continue
            # keep the page index on the page itself for later lookups
            page.pageid = number
            yield (page, number)
60
+
61
+
62
def process_document(
    document: pdfminer.pdfdocument.PDFDocument,
    layout=None,
    pages=None,
) -> collections.abc.Iterator[tuple[pdfminer.pdfpage.PDFPage, PageContent]]:
    """Generate `(PDFPage, PageContent)` for every selected page of
    `PDFDocument`.

    Args:
        document: open pdf file
        layout: layout parameters, forwarded to `create_interpreter`
        pages: page selection, forwarded to `process_pdfpages`
    Yields:
        tuple(PDFPage, PageContent): the raw pdf page and a `PageContent`
        holding the device result of that page and the page number.
    """
    assert isinstance(
        document,
        pdfminer.pdfdocument.PDFDocument,
    ), type(document)
    # one interpreter/device pair is reused for all pages
    interpreter, device = create_interpreter(layout=layout)
    for content, number in process_pdfpages(document, pages=pages):
        interpreter.process_page(content)
        pagecontent = PageContent(content=device.get_result(), page=number)
        yield (content, pagecontent)
77
+
78
+
79
def process_pagecontent(
    document: pdfminer.pdfdocument.PDFDocument,
    layout=None,
    pages=None,
) -> collections.abc.Iterator[PageContent]:
    """Generate the `PageContent` of every selected page of `PDFDocument`.

    Same as `process_document`, but the raw `PDFPage` is dropped.

    Args:
        document: open pdf file
        layout: layout parameters, forwarded to `process_document`
        pages: page selection, forwarded to `process_document`
    Yields:
        PageContent: device result and page number of a single page
    """
    utilo.asserts(document, pdfminer.pdfdocument.PDFDocument)
    for _, content in process_document(document, layout=layout, pages=pages):
        yield content
87
+
88
+
89
def page_selection(document: iamraw.Document, pages: tuple):
    """Return the page numbers which are selected for `document`.

    Args:
        document: parsed document, only its number of pages is used
        pages: explicit selection as list or tuple, or a falsy value
    Returns:
        `pages` unchanged if it is given, otherwise a list with the index
        of every page of `document`.
    """
    assert isinstance(document, iamraw.Document), type(document)
    if not pages:
        # no explicit selection, every page must be processed
        return list(range(len(document.pages)))
    assert isinstance(pages, (list, tuple)), '%s %s' % (pages, type(pages))  # pylint:disable=C0209
    return pages
96
+
97
+
98
def extract_content(
    document: pdfminer.pdfdocument.PDFDocument,
    config: rawmaker.parameter.ParsingConfiguration = None,
    converter=rawmaker.miner.text.PrecisePDFConverter,
    pages: tuple = None,
) -> iamraw.Document:
    """Extract content from PDF file

    Args:
        document(PDFDocument): PDF file to process
        config(ParsingConfiguration): parametrization for layout analysis.
                                      This parameter defines how chars are
                                      matched together in words and sentences.
                                      See pdf reference documentation.
        converter(pdfminer.converter.PDFLayoutAnalyzer): how to handle
                                                         the layout extraction
        pages: tuple of selected pages
    Returns:
        Document: parsed and layouted document. The `page` attribute of
        every result page is set to the index of the pdf page it was
        created from.
    """
    if config is None:
        config = rawmaker.parameter.ParsingConfiguration()
    utilo.asserts(config, rawmaker.parameter.ParsingConfiguration)
    # prepare parser
    device = converter(config=config)
    device.new_document()
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(device.rsrcmgr, device)
    # Processing layout
    create_pages = pdfminer.pdfpage.PDFPage.create_pages
    # Remember which page indices were really interpreted. Using the
    # requested `pages` for numbering mislabels the result when the selection
    # is unsorted, contains duplicates or points behind the last page.
    processed = []
    with utilo.SkipCollector(pages) as collector:
        for index, page in enumerate(create_pages(document)):
            if collector.skip(index):
                continue
            interpreter.process_page(page)
            processed.append(index)
    result = device.finish_document()
    # upgrade page number; assumes the device emits one result page per
    # interpreted page in processing order (same assumption as before)
    for page, pagenumber in zip(result.pages, processed):
        page.page = pagenumber
    return result
@@ -0,0 +1,254 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """Add parser to parse non annotated links to inform user about
10
+ broken/malformed links."""
11
+
12
+ import iamraw
13
+ import pdfminer.pdfdocument
14
+ import serializeraw
15
+ import utilo
16
+
17
+ import rawmaker.features
18
+ import rawmaker.reader
19
+ import rawmaker.utils
20
+
21
+
22
def work(document: str, pages=None) -> str:
    """Extract the link annotations of the pdf at `document` and dump them.

    Args:
        document: path to the pdf file
        pages: optional selection of pages to process
    Returns:
        annotations serialized by `serializeraw.dump_annotations`
    """
    assert isinstance(document, str), str(document)
    with rawmaker.reader.read(document) as pdf:
        return serializeraw.dump_annotations(
            extract_annotations(pdf, pages=pages))
28
+
29
+
30
def extract_annotations(
    document: pdfminer.pdfdocument.PDFDocument,
    pages=None,
) -> iamraw.PageAnnotations:
    """Parse the annotations of every selected page of `document`.

    Args:
        document(PDFDocument): loaded document
        pages: optional selection of pages to process
    Returns:
        list with one parsed annotation object per page; pages without any
        hyperlink or page link are left out.
    """
    selected = rawmaker.features.process_pdfpages(document, pages=pages)
    # lazily parse page by page, exactly in document order
    parsed_pages = (
        parse_page(page, pagenumber=number) for page, number in selected
    )
    return [
        parsed for parsed in parsed_pages
        if parsed.hyperlinks or parsed.pagelinks
    ]
45
+
46
+
47
# Name of the `Type` entry which `parse_label` and `parse_appereance_strean`
# require before they treat a page object as an annotation.
ANNOTATION_LABEL = 'Annot'
48
+
49
+
50
def parse_page(
    page: pdfminer.pdfpage.PDFPage,
    pagenumber: int,
) -> iamraw.PageAnnotation:
    """Collect the link annotations of a single `PDFPage`.

    Two kinds of annotation are distinguished:
    * page links (internal) refer to a chapter or a location inside the
      document, for example
        {'A': {'S': /'GoTo', 'D': b'subsection.1.30.7'}}
        {'S': /'GoTo', 'D': b'chapter*.1'}
    * hyperlinks (external) refer to clickable weblinks.

    Args:
        page(PDFPage): pdf page to parse annotation
        pagenumber(int): number of selected page
    Returns:
        `PageAnnotation` with page links and hyperlinks, their boundings
        flipped with `flip_bounding`. Both link fields are None if the page
        has no annotations at all.
    """
    annots = page.annots
    if not annots:
        return iamraw.PageAnnotation(None, None, pagenumber)
    resolve = page.doc.getobj
    if not isinstance(annots, list):
        # WORKAROUND: the annotations can be nested in a single reference
        # instead of a list; reason unknown. TODO: investigate later
        annots = list(resolve(page.annots.objid))

    pagelinks, hyperlinks = [], []
    for entry in annots:
        # entries are either already resolved dicts or object references
        pageobject = entry if isinstance(entry, dict) else resolve(entry.objid)  # yapf:disable
        internal = (parse_reference(pageobject, resolve) or
                    parse_appereance_strean(pageobject))
        if internal:
            pagelinks.append(internal)
        elif external := parse_external(pageobject, resolve):
            hyperlinks.append(external)
        else:
            utilo.error(f'Unhandeld annotation {pageobject}')

    # flip boundings of internal links first, then of external links
    pageheight = float(page.mediabox[3])
    for item in (*pagelinks, *hyperlinks):
        item.bounds = flip_bounding(item.bounds, pageheight)
    return iamraw.PageAnnotation(pagelinks, hyperlinks, page=pagenumber)
107
+
108
+
109
def flip_bounding(box, pageheight):
    """Mirror the vertical extent of `box` at `pageheight`.

    The horizontal values `box[0]` and `box[2]` are kept; the vertical
    values are replaced by `pageheight - box[3]` and `pageheight - box[1]`.
    """
    left, right = box[0], box[2]
    upper = pageheight - box[3]
    lower = pageheight - box[1]
    return iamraw.BoundingBox(left, upper, right, lower)
117
+
118
+
119
def parse_reference(pageobject, getobj=None) -> iamraw.PageLink:
    """Parse an internal link from `pageobject`.

    `parse_link` is tried first, `parse_label` second. None is returned if
    neither of them produces a link.
    """
    return parse_link(pageobject) or parse_label(pageobject, getobj) or None
125
+
126
+
127
def parse_label(pageobject, getobj=None) -> iamraw.PageLink:
    """Parse a page link out of the `A` entry of an annotation.

    Args:
        pageobject: resolved annotation object
        getobj: callable to resolve an object id, required if the `A` entry
                is stored as reference
    Returns:
        `PageLink` built from the `D` entry of `A` and the `Rect` of the
        annotation. None if `Type` is missing or not `ANNOTATION_LABEL`,
        or if `A` or its `D` entry is missing.
    """
    try:
        if pageobject['Type'].name != ANNOTATION_LABEL:
            return None
        action = pageobject['A']
    except KeyError:
        return None
    if isinstance(action, pdfminer.pdftypes.PDFObjRef):
        # TODO: add layer to automatically convert reference to object.
        action = getobj(action.objid)
    try:
        target = action['D']
    except KeyError:
        return None
    return iamraw.PageLink(
        bounds=determine_bounding(pageobject['Rect']),
        goal=parse_pagelink(target),
    )
148
+
149
+
150
def parse_link(pageobject) -> iamraw.PageLink:
    """Parse a page link out of the `Dest` entry of a `Link` annotation.

    Returns:
        `PageLink` built from `Dest` and `Rect`. None if `Subtype` is
        missing or not `Link`, or if there is no `Dest` entry.

    TODO: the meaning of elements like this one is unclear; they carry
    no `Dest` and are therefore not parsed here:
      {'Type': /'Annot', 'Border': [0, 0, 0], 'H': /'I', 'C': [0,
      0.5, 0.5], 'Rect': [348.517, 428.927, 431.794, 439.831],
      'Subtype': /'Link', 'A': {'F': b'distributions.pdf', 'S':
      /'GoToR', 'D': [0, /'Fit']}} [0, /'Fit']
    """
    try:
        if pageobject['Subtype'].name != 'Link':
            return None
        destination = pageobject['Dest']
    except KeyError:
        return None
    return iamraw.PageLink(
        bounds=determine_bounding(pageobject['Rect']),
        goal=parse_pagelink(destination),
    )
170
+
171
+
172
def parse_appereance_strean(pageobject) -> iamraw.PageLink:
    """Detect an annotation which carries an appearance stream.

    See 12.5.5 Appearance Streams. TODO: IMPROVE

    Returns:
        `PageLink` with the fixed goal 'STREAM' and the `Rect` of the
        annotation. None if `Type` is missing or not `ANNOTATION_LABEL`,
        or if the `AP` entry or its `N` entry is missing.
    """
    try:
        if pageobject['Type'].name != ANNOTATION_LABEL:
            return None
        appearance = rawmaker.utils.resolve(pageobject['AP'])
        # resolved only to verify that the `N` entry exists
        rawmaker.utils.resolve(appearance['N'])
    except KeyError:
        return None
    return iamraw.PageLink(
        bounds=determine_bounding(pageobject['Rect']),
        goal='STREAM',
    )
192
+
193
+
194
def parse_external(pageobject, getobj=None) -> iamraw.HyperLink:
    """Parse an external link out of the `A` entry of an annotation.

    Examples of handled `A` entries:
      {'S': /'URI', 'URI': b'http://www.europarl.europa.eu/factsheets/de/sheet/92/allgemeine-steuerpolitik'}
      {'F': b'/C/Users/user/Downloads/MEMO-16-2265_DE.pdf', 'S': /'Launch'}

    Args:
        pageobject: resolved annotation object
        getobj: callable to resolve an object id, required if the `A` entry
                is stored as reference
    Returns:
        `HyperLink` with the decoded `URI` entry, or with the decoded `F`
        entry if there is no `URI`. None if `A` is missing or contains
        neither of them.
    """
    try:
        action = pageobject['A']
    except KeyError:
        return None
    if isinstance(action, pdfminer.pdftypes.PDFObjRef):
        # TODO: add layer to automatically convert reference to object.
        action = getobj(action.objid)
    bounds = determine_bounding(pageobject['Rect'])
    # `URI` takes precedence over `F`
    for key in ('URI', 'F'):
        if key in action:
            return iamraw.HyperLink(
                bounds=bounds,
                goal=hyperlink_decode(action[key]),
            )
    return None
212
+
213
+
214
def hyperlink_decode(text: bytes) -> str:
    r"""Decode the raw bytes of a link target.

    Args:
        text(bytes): 7-bit ASCII, see 12.6.4.7
    Returns:
        result of `rawmaker.utils.guess_decoding`; if that is None the
        failure is reported with `utilo.error` and None is returned.

    Hint: the input is not always valid ASCII, maybe because of a badly
    programmed printer.
    TODO: MAY A PDFMINER CONVERTION ERROR?
    TODO: VERIFY LINUX/WIN DUE CP1252

    >>> hyperlink_decode(b'http://road.cc/measure-\x96-smart-street')
    'http://road.cc/measure-–-smart-street'
    """
    decoded = rawmaker.utils.guess_decoding(text)
    if decoded is not None:
        return decoded
    utilo.error(f'annotation: could not decode: {text}')
    return None
229
+
230
+
231
def parse_pagelink(pagelink):
    r"""Convert the destination of a page link into text.

    Args:
        pagelink: destination as stored in the pdf; bytes, a list or any
                  other object
    Returns:
        * bytes: the decoded text; if decoding yields nothing the `str()`
          of the bytes object
        * list: a new list with the `str()` of every item, a leading
          object reference is written as 'objid: <id>'; the passed list
          is left untouched and an empty list gives an empty list
        * anything else: its `str()`

    >>> parse_pagelink(b'glo:glos:Glas\xfcbergangstemperatur')
    'glo:glos:Glasübergangstemperatur'
    """
    if isinstance(pagelink, bytes):
        decoded = rawmaker.utils.guess_decoding(pagelink)
        if decoded:
            return decoded
        # not decodable: fall through to the generic str() conversion
    if not isinstance(pagelink, list):
        return str(pagelink)
    if not pagelink:
        # malformed empty destination, nothing to resolve
        return []
    head, *tail = pagelink
    if isinstance(head, pdfminer.pdftypes.PDFObjRef):
        # internal link to pdf page, resolve objid
        head = f'objid: {head.objid}'
    # build a new list, the destination array of the pdf object must not be
    # modified in place
    return [str(item) for item in (head, *tail)]
249
+
250
+
251
def determine_bounding(bounding):
    """Create a `BoundingBox` from a raw pdf rectangle.

    The rectangle is passed through `utilo.rect_ensure_bounding` before it
    is handed to `iamraw.BoundingBox.from_list`.
    """
    return iamraw.BoundingBox.from_list(utilo.rect_ensure_bounding(bounding))
@@ -0,0 +1,172 @@
1
+ # =============================================================================
2
+ # C O P Y R I G H T
3
+ # -----------------------------------------------------------------------------
4
+ # Copyright (c) 2019-2023 by Helmut Konrad Schewe. All rights reserved.
5
+ # This file is property of Helmut Konrad Schewe. Any unauthorized copy,
6
+ # use or distribution is an offensive act against international law and may
7
+ # be prosecuted under federal law. Its content is company confidential.
8
+ # =============================================================================
9
+ """The `border`-feature enables to detect the pdf page size in ?pixel?
10
+ and locate the cropped box around the content.
11
+
12
+ Features:
13
+ * page size
14
+ * content size
15
+
16
+ """
17
+
18
+ import collections
19
+ import contextlib
20
+
21
+ import iamraw
22
+ import pdfminer.pdfdocument
23
+ import serializeraw
24
+ import utilo
25
+
26
+ import rawmaker.features
27
+ import rawmaker.reader
28
+
29
# Page size paired with the page number it belongs to, see `pagesizes`.
PagePageSize = collections.namedtuple('PagePageSize', ['size', 'page'])
30
+
31
+
32
def work(document: str, pages: tuple = None) -> tuple[str, str]:
    """Extract page sizes and content bounding boxes of `document`.

    Args:
        document: path to document to run parsing
        pages: tuple of processed pages
    Returns:
        tuple(pages, boxes): dumped page size/border information and dumped
        list of bounding boxes of the page content.
    """
    assert isinstance(document, str), str(document)
    with rawmaker.reader.read(document) as pdf:
        sizeandborders, boxes = determine_boundingboxes(pdf, pages=pages)

    dumped_borders = serializeraw.dump_pageborders(sizeandborders)
    dumped_boxes = serializeraw.dump_boundingboxes(boxes)
    return dumped_borders, dumped_boxes
50
+
51
+
52
def determine_boundingboxes(
    document: pdfminer.pdfdocument.PDFDocument,
    pages: tuple = None,
) -> tuple[list[iamraw.PageSizeBorder], list[iamraw.PageBoundings]]:
    """Extract page size, border and boundingboxes from `PDFDocument`.

    Args:
        document(PDFDocument): loaded document
        pages: tuple of processed pages
    Returns:
        tuple(sizeborders, boxes):
        * sizeborders(list[PageSizeBorder]): page size and crop border of
          the content for every processed page
        * boxes(list[PageBoundings]): `(id, bbox)` pairs of the objects on
          every processed page; the id is counted across all pages.
    """
    sizeborders, boxes = [], []
    # running id of the first content item of the current page
    contentid = 0
    for page, content in rawmaker.features.process_document(document, pages=pages):  # yapf:disable
        content, pagenumber = content.content, content.page
        size = pagesize_from_page(page)

        pagebounding = iamraw.PageBoundings(
            boundings=boundingboxes_from_page(content, contentid),
            page=pagenumber,
        )
        boxes.append(pagebounding)

        contentid += len(content)
        border = cropborder_from_page(content)
        sizeborders.append(
            iamraw.PageSizeBorder(
                size=size,
                border=border,
                page=pagenumber,
            ))
    return sizeborders, boxes
88
+
89
+
90
def pagesizes(
    pdf: pdfminer.pdfdocument.PDFDocument,
    pages: tuple = None,
) -> list[PagePageSize]:
    """Extract page sizes of `PDFDocument`.

    Only the page objects are visited. The page size is read from the page
    itself, therefore no layout analysis of the page content is run.

    Args:
        pdf(PDFDocument): load pdf document
        pages: tuple of processed pages
    Returns:
        List of `PagePageSize`, one for every processed page.
    """
    selected = rawmaker.features.process_pdfpages(pdf, pages=pages)
    return [
        PagePageSize(size=pagesize_from_page(page), page=pagenumber)
        for page, pagenumber in selected
    ]
108
+
109
+
110
def boundingboxes_from_page(content: list, contentid: int) -> list[tuple]:
    """Extract bounding boxes from page `content`.

    Args:
        content: content of a single page, every item provides `bbox`
        contentid: id of the first item of this page, i.e. the number of
                   items on the previous pages
    Returns:
        list of `(id, bbox)` pairs, ids are counted up from `contentid`
    """
    return [
        (index, item.bbox)
        for index, item in enumerate(content, start=contentid)
    ]
123
+
124
+
125
def pagesize_from_page(page: 'pdfminer.pdfpage.PDFPage') -> iamraw.PageSize:  # yapf:disable
    """Determine width and height of a pdf page.

    Args:
        page: pdf page, `mediabox` and `rotate` are read
    Returns:
        `PageSize`; width and height are swapped if the page is rotated by
        90 or 270 degrees.
    """
    # The media box is a rectangle given by two corners (x0, y0, x1, y1),
    # not (x, y, width, height). Subtract the origin so that media boxes
    # which do not start at (0, 0) give the correct size.
    x0, y0, x1, y1 = page.mediabox[:4]
    pagewidth = utilo.roundme(x1 - x0)
    pageheight = utilo.roundme(y1 - y0)

    if page.rotate in {90, 270}:
        # rotated page, flip page size
        pagewidth, pageheight = pageheight, pagewidth
    return iamraw.PageSize(width=pagewidth, height=pageheight)
135
+
136
+
137
def cropborder_from_page(content) -> iamraw.Border:
    """Determine the bounding box which includes all page items except
    text items that consist of white space only.

    Returns:
        `Border` spanning the remaining items, or a `Border` of four None
        values if nothing remains.

    >>> cropborder_from_page([
    ... pdfminer.layout.LTLine(linewidth=1.0, p0=(50.520,78.540) , p1=(106.200,78.540)),
    ... pdfminer.layout.LTLine(linewidth=1.0, p0=(107.160,78.540), p1=(122.220,78.540)),
    ... ])
    Border(left=50.52, right=122.22, top=78.54, bottom=78.54)
    """

    def is_blank(item) -> bool:
        # items without usable `get_text` count as visible content
        try:
            return not item.get_text().strip()
        except AttributeError:
            return False

    boxes = [item.bbox for item in content if not is_blank(item)]
    if not boxes:
        return iamraw.Border(None, None, None, None)

    # smallest x0/y0 and largest x1/y1 over all remaining items
    lowest = (min(box[0] for box in boxes), min(box[1] for box in boxes))
    highest = (max(box[2] for box in boxes), max(box[3] for box in boxes))
    x0, y0, x1, y1 = utilo.roundme((*lowest, *highest))
    assert x0 <= x1, f'{x0} <= {x1}'
    assert y0 <= y1, f'{y0} <= {y1}'
    return iamraw.Border(left=x0, right=x1, top=y0, bottom=y1)