natural-pdf 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +24 -40
- natural_pdf/classification/manager.py +26 -22
- natural_pdf/classification/mixin.py +7 -7
- natural_pdf/classification/results.py +17 -9
- natural_pdf/collections/mixins.py +17 -0
- natural_pdf/collections/pdf_collection.py +78 -46
- natural_pdf/core/page.py +17 -17
- natural_pdf/core/pdf.py +192 -18
- natural_pdf/elements/collections.py +307 -3
- natural_pdf/elements/region.py +2 -3
- natural_pdf/exporters/hocr.py +540 -0
- natural_pdf/exporters/hocr_font.py +142 -0
- natural_pdf/exporters/original_pdf.py +130 -0
- natural_pdf/exporters/searchable_pdf.py +3 -3
- natural_pdf/ocr/engine_surya.py +1 -1
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/METADATA +1 -2
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/RECORD +20 -17
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,540 @@
|
|
1
|
+
# SPDX-FileCopyrightText: 2010 Jonathan Brinley
|
2
|
+
# SPDX-FileCopyrightText: 2013-2014 Julien Pfefferkorn
|
3
|
+
# SPDX-FileCopyrightText: 2023 James R. Barlow
|
4
|
+
# SPDX-FileCopyrightText: 2025 Odin Dahlström
|
5
|
+
# SPDX-License-Identifier: MIT
|
6
|
+
|
7
|
+
"""hOCR transform implementation."""
|
8
|
+
|
9
|
+
from __future__ import annotations
|
10
|
+
|
11
|
+
import logging
|
12
|
+
import os
|
13
|
+
import re
|
14
|
+
import unicodedata
|
15
|
+
from dataclasses import dataclass
|
16
|
+
from itertools import pairwise
|
17
|
+
from math import atan, pi
|
18
|
+
from pathlib import Path
|
19
|
+
from xml.etree import ElementTree
|
20
|
+
|
21
|
+
from pikepdf import Matrix, Name, Rectangle
|
22
|
+
from pikepdf.canvas import (
|
23
|
+
BLACK,
|
24
|
+
BLUE,
|
25
|
+
CYAN,
|
26
|
+
DARKGREEN,
|
27
|
+
GREEN,
|
28
|
+
MAGENTA,
|
29
|
+
RED,
|
30
|
+
Canvas,
|
31
|
+
Text,
|
32
|
+
TextDirection,
|
33
|
+
)
|
34
|
+
|
35
|
+
from natural_pdf.exporters.hocr_font import EncodableFont as Font
|
36
|
+
from natural_pdf.exporters.hocr_font import GlyphlessFont
|
37
|
+
|
38
|
+
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Divisor base for unit conversion: hOCR pixel values are divided by
# (dpi / INCH) to obtain page sizes, and scaled by (INCH / dpi) when drawing.
INCH = 72.0

# Short alias for the ElementTree element type used in annotations below.
Element = ElementTree.Element
|
43
|
+
|
44
|
+
|
45
|
+
@dataclass
class DebugRenderOptions:
    """A class for managing rendering options.

    Each flag switches on one debug overlay; the corresponding
    ``HocrTransform._debug_draw_*`` method returns immediately when its flag
    is False. All overlays are disabled by default.
    """

    # Outline the bounding box of every non-empty ``ocr_par`` paragraph.
    render_paragraph_bbox: bool = False
    # Draw the text baseline of every line.
    render_baseline: bool = False
    # Draw a triangle per word conveying its height and drawing direction.
    render_triangle: bool = False
    # Outline the axis-aligned bounding box of every line.
    render_line_bbox: bool = False
    # Outline the bounding box of every word.
    render_word_bbox: bool = False
    # Fill the box of every injected inter-word space.
    render_space_bbox: bool = False
|
55
|
+
|
56
|
+
|
57
|
+
class HocrTransformError(Exception):
    """Error while applying hOCR transform.

    Raised by ``HocrTransform.__init__`` when the ``ocr_page`` element of the
    hOCR file carries no ``bbox`` from which page dimensions can be derived.
    """
|
59
|
+
|
60
|
+
|
61
|
+
class HocrTransform:
    """A class for converting documents from the hOCR format.

    The hOCR file is parsed once at construction time; :meth:`to_pdf` then
    writes a one-page PDF whose text is positioned according to the hOCR
    bounding boxes, optionally with an image drawn over it.

    For details of the hOCR format, see:
    http://kba.github.io/hocr-spec/1.2/.
    """

    # Matches the "bbox left top right bottom" property of an hOCR title.
    box_pattern = re.compile(
        r'''
        bbox \s+
        (\d+) \s+ # left: uint
        (\d+) \s+ # top: uint
        (\d+) \s+ # right: uint
        (\d+) # bottom: uint
        ''',
        re.VERBOSE,
    )
    # Matches the "baseline slope intercept" property of an hOCR title.
    baseline_pattern = re.compile(
        r'''
        baseline \s+
        ([\-\+]?\d*\.?\d*) \s+ # +/- decimal float
        ([\-\+]?\d+) # +/- int
        ''',
        re.VERBOSE,
    )
    # Matches the "textangle degrees" property of an hOCR title.
    textangle_pattern = re.compile(
        r'''
        textangle \s+
        ([\-\+]?\d*\.?\d*) # +/- decimal float
        ''',
        re.VERBOSE,
    )

    def __init__(
        self,
        *,
        hocr_filename: str | Path,
        dpi: float,
        debug: bool = False,
        fontname: Name = Name("/f-0-0"),
        font: Font = GlyphlessFont(),
        debug_render_options: DebugRenderOptions | None = None,
    ):
        """Initialize the HocrTransform object.

        Arguments:
            hocr_filename: Path of the hOCR file to parse.
            dpi: Resolution used to convert hOCR pixel coordinates to PDF
                points.
            debug: Deprecated; when true, enables the baseline, triangle and
                word-box overlays and ignores ``debug_render_options``.
            fontname: Resource name under which the font is added to the page.
            font: Font used to measure and encode the text.
            debug_render_options: Debug overlays to draw; defaults to none.

        Raises:
            HocrTransformError: If the file has no ``ocr_page`` element, or
                that element has no ``bbox``.

        Errors raised by ``ElementTree.parse`` propagate unchanged.
        """
        if debug:
            # The message has no %-placeholders, so it must be logged without
            # extra arguments; passing one makes the logging module report a
            # formatting error instead of emitting the warning.
            log.warning("Use debug_render_options instead")
            self.render_options = DebugRenderOptions(
                render_baseline=debug,
                render_triangle=debug,
                render_line_bbox=False,
                render_word_bbox=debug,
                render_paragraph_bbox=False,
                render_space_bbox=False,
            )
        else:
            self.render_options = debug_render_options or DebugRenderOptions()
        self.dpi = dpi
        self.hocr = ElementTree.parse(os.fspath(hocr_filename))
        self._fontname = fontname
        self._font = font

        # if the hOCR file has a namespace, ElementTree requires its use to
        # find elements
        matches = re.match(r'({.*})html', self.hocr.getroot().tag)
        self.xmlns = ''
        if matches:
            self.xmlns = matches.group(1)

        for div in self.hocr.findall(self._child_xpath('div', 'ocr_page')):
            coords = self.element_coordinates(div)
            if not coords:
                raise HocrTransformError("hocr file is missing page dimensions")
            self.width = (coords.urx - coords.llx) / (self.dpi / INCH)
            self.height = (coords.ury - coords.lly) / (self.dpi / INCH)
            # Only the first page element is used
            break
        else:
            # No ocr_page element at all: without this, width/height would be
            # left unset and to_pdf() would fail later with AttributeError.
            raise HocrTransformError("hocr file is missing page dimensions")

    def _get_element_text(self, element: Element) -> str:
        """Return the textual content of the element and its children.

        The element's own tail text is included as well.
        """
        # itertext() yields the element's text and, for every descendant, its
        # text and tail in document order; only the element's own tail is
        # left to append.
        text = ''.join(element.itertext())
        if element.tail is not None:
            text += element.tail
        return text

    @classmethod
    def element_coordinates(cls, element: Element) -> Rectangle | None:
        """Get coordinates of the bounding box around an element.

        Returns None when the element's ``title`` attribute has no ``bbox``.
        Values are in hOCR pixel units, so "lly" holds the top edge and "ury"
        the bottom edge.
        """
        matches = cls.box_pattern.search(element.attrib.get('title', ''))
        if not matches:
            return None
        return Rectangle(
            float(matches.group(1)),  # llx = left
            float(matches.group(2)),  # lly = top
            float(matches.group(3)),  # urx = right
            float(matches.group(4)),  # ury = bottom
        )

    @classmethod
    def baseline(cls, element: Element) -> tuple[float, float]:
        """Get baseline's slope and intercept.

        Returns ``(0.0, 0.0)`` when the element has no usable ``baseline``
        property.
        """
        matches = cls.baseline_pattern.search(element.attrib.get('title', ''))
        if not matches:
            return (0.0, 0.0)
        try:
            return float(matches.group(1)), int(matches.group(2))
        except ValueError:
            # The slope group can match an empty or sign-only string, which
            # float() rejects; treat such a property as absent.
            return (0.0, 0.0)

    @classmethod
    def textangle(cls, element: Element) -> float:
        """Get text angle of an element.

        Returns ``0.0`` when the element has no usable ``textangle`` property.
        """
        matches = cls.textangle_pattern.search(element.attrib.get('title', ''))
        if not matches:
            return 0.0
        try:
            return float(matches.group(1))
        except ValueError:
            # The group can match an empty or sign-only string, which float()
            # rejects; treat such a property as absent.
            return 0.0

    def _child_xpath(self, html_tag: str, html_class: str | None = None) -> str:
        """Build a descendant search path for a tag, optionally with a class.

        The document namespace detected in ``__init__`` is prefixed to the tag.
        """
        xpath = f".//{self.xmlns}{html_tag}"
        if html_class:
            xpath += f"[@class='{html_class}']"
        return xpath

    @classmethod
    def normalize_text(cls, s: str) -> str:
        """Normalize the given text using the NFKC normalization form."""
        return unicodedata.normalize("NFKC", s)

    def to_pdf(
        self,
        *,
        out_filename: Path,
        image_filename: Path | None = None,
        invisible_text: bool = True,
    ) -> None:
        """Creates a PDF file with an image superimposed on top of the text.

        Text is positioned according to the bounding box of the lines in
        the hOCR file.
        The image need not be identical to the image used to create the hOCR
        file.
        It can have a lower resolution, different color mode, etc.

        Arguments:
            out_filename: Path of PDF to write.
            image_filename: Image to use for this file. If omitted, the OCR text
                is shown.
            invisible_text: If True, text is rendered invisible so that it is
                selectable but never drawn. If False, text is visible and may
                be seen if the image is skipped or deleted in Acrobat.
        """
        # create the PDF file
        # page size in points (1/72 in.)
        canvas = Canvas(page_size=(self.width, self.height))
        canvas.add_font(self._fontname, self._font)
        # Maps hOCR pixel coordinates (origin top-left, y down) to PDF points
        # (origin bottom-left, y up).
        page_matrix = (
            Matrix()
            .translated(0, self.height)
            .scaled(1, -1)
            .scaled(INCH / self.dpi, INCH / self.dpi)
        )
        log.debug(page_matrix)
        with canvas.do.save_state(cm=page_matrix):
            self._debug_draw_paragraph_boxes(canvas)
            found_lines = False
            for par in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
                for line in (
                    element
                    for element in par.iterfind(self._child_xpath('span'))
                    if 'class' in element.attrib
                    and element.attrib['class']
                    in {'ocr_header', 'ocr_line', 'ocr_textfloat', 'ocr_caption'}
                ):
                    found_lines = True
                    direction = self._get_text_direction(par)
                    inject_word_breaks = self._get_inject_word_breaks(par)
                    self._do_line(
                        canvas,
                        line,
                        "ocrx_word",
                        invisible_text,
                        direction,
                        inject_word_breaks,
                    )

            if not found_lines:
                # Tesseract did not report any lines (just words)
                root = self.hocr.find(self._child_xpath('div', 'ocr_page'))
                direction = self._get_text_direction(root)
                self._do_line(
                    canvas,
                    root,
                    "ocrx_word",
                    invisible_text,
                    direction,
                    True,
                )
        # put the image on the page, scaled to fill the page
        if image_filename is not None:
            canvas.do.draw_image(
                image_filename, 0, 0, width=self.width, height=self.height
            )

        # finish up the page and save it
        canvas.to_pdf().save(out_filename)

    def _get_text_direction(self, par: Element | None) -> TextDirection:
        """Get the text direction of the paragraph.

        Arabic, Hebrew, Persian, are right-to-left languages.
        When the paragraph element is None, defaults to left-to-right.
        """
        if par is None:
            return TextDirection.LTR

        return (
            TextDirection.RTL
            if par.attrib.get('dir', 'ltr') == 'rtl'
            else TextDirection.LTR
        )

    def _get_inject_word_breaks(self, par: Element) -> bool:
        """Determine whether word breaks should be injected.

        In Chinese, Japanese, and Korean, word breaks are not injected, because
        words are usually one or two characters and separators are usually explicit.
        In all other languages, we inject word breaks to help word segmentation.
        """
        lang = par.attrib.get('lang', '')
        log.debug(lang)
        if lang in {'chi_sim', 'chi_tra', 'jpn', 'kor'}:
            return False
        return True

    @classmethod
    def polyval(cls, poly, x):  # pragma: no cover
        """Calculate the value of a first-degree polynomial at a point.

        ``poly`` is indexed as ``(slope, intercept)``.
        """
        return x * poly[0] + poly[1]

    def _do_line(
        self,
        canvas: Canvas,
        line: Element | None,
        elemclass: str,
        invisible_text: bool,
        text_direction: TextDirection,
        inject_word_breaks: bool,
    ):
        """Render the text for a given line.

        The canvas's coordinate system must be configured so that hOCR pixel
        coordinates are mapped to PDF coordinates.

        Nothing is rendered when ``line`` is None, has no ``bbox``, or has a
        ``bbox`` whose bottom is not below its top.
        """
        if line is None:
            return
        # line_min_aabb (which is created from the "bbox" hOCR property) is so named
        # because a Rectangle instance is always an AABB (it has no orientation).
        # However, this means that for non-zero values of the "textangle" hOCR
        # property, line_min_aabb is not the true bounding box of the hOCR line,
        # but rather the minimum AABB that encloses the bounding box of the line.
        # The true bounding box of the line must be seen as an OBB, due to the
        # existence of the "textangle" hOCR property.
        line_min_aabb = self.element_coordinates(line)
        if not line_min_aabb:
            return
        if line_min_aabb.ury <= line_min_aabb.lly:
            log.error(
                "line box is invalid so we cannot render it: box=%s text=%s",
                line_min_aabb,
                self._get_element_text(line),
            )
            return
        self._debug_draw_line_bbox(canvas, line_min_aabb)

        # Even though line_min_aabb is not the true bounding box of the line,
        # it is still possible to derive an AABB (Rectangle) from it that is
        # the same size as the true bounding box of the line,
        # if we use a coordinate system that is axis-aligned with respect to
        # the rotation of the OBB (textangle).
        # line_size_aabb_matrix is a transform matrix for such a coordinate
        # system, and line_size_aabb is thus an AABB with the same
        # size as the true bounding box of the line.
        top_left_corner = (line_min_aabb.llx, line_min_aabb.lly)
        line_size_aabb_matrix = (
            Matrix()
            .translated(*top_left_corner)
            # Note: negative sign (textangle is counter-clockwise, see hOCR spec)
            .rotated(-self.textangle(line))
        )
        line_size_aabb = line_size_aabb_matrix.inverse().transform(line_min_aabb)

        slope, intercept = self.baseline(line)
        # Treat a nearly flat baseline as exactly horizontal.
        if abs(slope) < 0.005:
            slope = 0.0
        slope_angle = atan(slope)

        # Final PDF-perspective (bottom-left corner) transform matrix for the
        # text baseline, which has an intercept and slope relative to the OBB.
        # See "bbox", "textangle" and "baseline" in the hOCR spec for more details.
        baseline_matrix = (
            line_size_aabb_matrix
            # Translate from hOCR perspective (top-left corner) to PDF perspective
            # (bottom-left corner).
            # Note: it would be incorrect to use line_min_aabb.height here because
            # it is not the true height of the OBB of the line, if textangle != 0.
            .translated(0, line_size_aabb.height)
            .translated(0, intercept)
            .rotated(slope_angle / pi * 180)
        )

        with canvas.do.save_state(cm=baseline_matrix):
            text = Text(direction=text_direction)
            fontsize = line_size_aabb.height + intercept
            text.font(self._fontname, fontsize)
            # Render mode 3 is used for invisible text, 0 for visible text.
            text.render_mode(3 if invisible_text else 0)

            self._debug_draw_baseline(
                canvas, baseline_matrix.inverse().transform(line_min_aabb), 0
            )

            canvas.do.fill_color(BLACK)  # text in black
            elements = line.findall(self._child_xpath('span', elemclass))
            # Pair every word with its successor; the last word is paired
            # with None so that no trailing space is emitted after it.
            for elem, next_elem in pairwise(elements + [None]):
                self._do_line_word(
                    canvas,
                    baseline_matrix,
                    text,
                    fontsize,
                    elem,
                    next_elem,
                    text_direction,
                    inject_word_breaks,
                )
            canvas.do.draw_text(text)

    def _do_line_word(
        self,
        canvas: Canvas,
        line_matrix: Matrix,
        text: Text,
        fontsize: float,
        elem: Element | None,
        next_elem: Element | None,
        text_direction: TextDirection,
        inject_word_breaks: bool,
    ):
        """Render the text for a single word.

        The word is appended to ``text``, horizontally scaled to the width of
        its hOCR box. When ``inject_word_breaks`` is true and the next word
        has a box, a space scaled to fill the gap between the two is appended
        as well. Words that are None, empty, or lack a ``bbox`` are skipped.
        """
        if elem is None:
            return
        elemtxt = self.normalize_text(self._get_element_text(elem).strip())
        if elemtxt == '':
            return

        hocr_box = self.element_coordinates(elem)
        if hocr_box is None:
            return
        # Inverse of the line transform, computed once and reused for both
        # this word's box and the next word's box.
        to_line_space = line_matrix.inverse()
        box = to_line_space.transform(hocr_box)
        font_width = self._font.text_width(elemtxt, fontsize)

        # Debug sketches
        self._debug_draw_word_triangle(canvas, box)
        self._debug_draw_word_bbox(canvas, box)

        if text_direction == TextDirection.RTL:
            log.info("RTL: %s", elemtxt)
        # If this word is 0 units wide, our best bet seems to be to suppress this text
        if font_width > 0:
            if text_direction == TextDirection.LTR:
                text.text_transform(Matrix(1, 0, 0, -1, box.llx, 0))
            elif text_direction == TextDirection.RTL:
                text.text_transform(Matrix(-1, 0, 0, -1, box.llx + box.width, 0))
            text.horiz_scale(100 * box.width / font_width)
            text.show(self._font.text_encode(elemtxt))

        # Get coordinates of the next word (if there is one)
        hocr_next_box = (
            self.element_coordinates(next_elem) if next_elem is not None else None
        )
        if hocr_next_box is None:
            return
        # Render a space between this word and the next word. The explicit space helps
        # PDF viewers identify the word break, and horizontally scaling it to
        # occupy the space between the words helps the PDF viewer
        # avoid combiningthewordstogether.
        if not inject_word_breaks:
            return
        next_box = to_line_space.transform(hocr_next_box)
        if text_direction == TextDirection.LTR:
            space_box = Rectangle(box.urx, box.lly, next_box.llx, next_box.ury)
        elif text_direction == TextDirection.RTL:
            space_box = Rectangle(next_box.urx, box.lly, box.llx, next_box.ury)
        self._debug_draw_space_bbox(canvas, space_box)
        space_width = self._font.text_width(' ', fontsize)
        if space_width > 0 and space_box.width > 0:
            if text_direction == TextDirection.LTR:
                text.text_transform(Matrix(1, 0, 0, -1, space_box.llx, 0))
            elif text_direction == TextDirection.RTL:
                text.text_transform(
                    Matrix(-1, 0, 0, -1, space_box.llx + space_box.width, 0)
                )
            text.horiz_scale(100 * space_box.width / space_width)
            text.show(self._font.text_encode(' '))

    def _debug_draw_paragraph_boxes(self, canvas: Canvas, color=CYAN):
        """Draw boxes around paragraphs in the document."""
        if not self.render_options.render_paragraph_bbox:  # pragma: no cover
            return
        with canvas.do.save_state():
            # draw box around paragraph
            canvas.do.stroke_color(color).line_width(0.1)
            for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
                elemtxt = self._get_element_text(elem).strip()
                if not elemtxt:
                    continue
                ocr_par = self.element_coordinates(elem)
                if ocr_par is None:
                    continue
                canvas.do.rect(
                    ocr_par.llx, ocr_par.lly, ocr_par.width, ocr_par.height, fill=False
                )

    def _debug_draw_line_bbox(self, canvas: Canvas, line_box: Rectangle, color=BLUE):
        """Render the bounding box of a text line."""
        if not self.render_options.render_line_bbox:  # pragma: no cover
            return
        with canvas.do.save_state():
            canvas.do.stroke_color(color).line_width(0.15).rect(
                line_box.llx, line_box.lly, line_box.width, line_box.height, fill=False
            )

    def _debug_draw_word_triangle(
        self, canvas: Canvas, box: Rectangle, color=RED, line_width=0.1
    ):
        """Render a triangle that conveys word height and drawing direction."""
        if not self.render_options.render_triangle:  # pragma: no cover
            return
        with canvas.do.save_state():
            canvas.do.stroke_color(color).line_width(line_width).line(
                box.llx, box.lly, box.urx, box.lly
            ).line(box.urx, box.lly, box.llx, box.ury).line(
                box.llx, box.lly, box.llx, box.ury
            )

    def _debug_draw_word_bbox(
        self, canvas: Canvas, box: Rectangle, color=GREEN, line_width=0.1
    ):
        """Render a box depicting the word."""
        if not self.render_options.render_word_bbox:  # pragma: no cover
            return
        with canvas.do.save_state():
            canvas.do.stroke_color(color).line_width(line_width).rect(
                box.llx, box.lly, box.width, box.height, fill=False
            )

    def _debug_draw_space_bbox(
        self, canvas: Canvas, box: Rectangle, color=DARKGREEN, line_width=0.1
    ):
        """Render a box depicting the space between two words."""
        if not self.render_options.render_space_bbox:  # pragma: no cover
            return
        with canvas.do.save_state():
            canvas.do.fill_color(color).line_width(line_width).rect(
                box.llx, box.lly, box.width, box.height, fill=True
            )

    def _debug_draw_baseline(
        self,
        canvas: Canvas,
        line_box: Rectangle,
        baseline_lly,
        color=MAGENTA,
        line_width=0.25,
    ):
        """Render the text baseline.

        A horizontal line is drawn at ``baseline_lly`` across the horizontal
        extent of ``line_box``.
        """
        if not self.render_options.render_baseline:
            return
        with canvas.do.save_state():
            canvas.do.stroke_color(color).line_width(line_width).line(
                line_box.llx,
                baseline_lly,
                line_box.urx,
                baseline_lly,
            )
|
@@ -0,0 +1,142 @@
|
|
1
|
+
# SPDX-FileCopyrightText: 2023 James R. Barlow
|
2
|
+
# SPDX-License-Identifier: MPL-2.0
|
3
|
+
|
4
|
+
from __future__ import annotations
|
5
|
+
|
6
|
+
import logging
|
7
|
+
import unicodedata
|
8
|
+
import zlib
|
9
|
+
from importlib.resources import files as package_files
|
10
|
+
|
11
|
+
from pikepdf import (
|
12
|
+
Dictionary,
|
13
|
+
Name,
|
14
|
+
Pdf,
|
15
|
+
)
|
16
|
+
from pikepdf.canvas import Font
|
17
|
+
|
18
|
+
log = logging.getLogger(__name__)
|
19
|
+
|
20
|
+
|
21
|
+
class EncodableFont(Font):
    """A ``Font`` that can also turn text into the bytes shown in a PDF.

    Concrete fonts are expected to override :meth:`text_encode`.
    """

    def text_encode(self, text: str) -> bytes:
        """Encode ``text`` for this font; the base class always raises."""
        raise NotImplementedError
|
24
|
+
|
25
|
+
|
26
|
+
class GlyphlessFont(EncodableFont):
    """Font built from the bundled ``pdf.ttf`` file.

    ``register`` describes it to a Pdf as a Type0 font with a single
    CIDFontType2 descendant, Identity-H encoding and a ToUnicode CMap.
    """

    # 65536 two-byte entries, each 0x0001, zlib-compressed; stored as the
    # /CIDToGIDMap stream in register().
    # NOTE(review): presumably this maps every CID to glyph 1 - confirm.
    CID_TO_GID_DATA = zlib.compress(b"\x00\x01" * 65536)
    # File name of the font program and the package it is loaded from.
    GLYPHLESS_FONT_NAME = 'pdf.ttf'
    GLYPHLESS_FONT_PACKAGE_PATH = 'natural_pdf.exporters.data'
    # Raw font bytes, read once when the class body is executed.
    GLYPHLESS_FONT = (package_files(GLYPHLESS_FONT_PACKAGE_PATH) / GLYPHLESS_FONT_NAME).read_bytes()
    # Divisor relating font size to per-character width: text_width() uses
    # fontsize / CHAR_ASPECT and register() sets DW to 1000 // CHAR_ASPECT.
    CHAR_ASPECT = 2

    def __init__(self) -> None:
        """Create the font; no instance state is set."""
        pass

    def text_width(self, text: str, fontsize: float) -> float:
        """Estimate the width of a text string when rendered with the given font."""
        # NFKC: split ligatures, combine diacritics
        return len(unicodedata.normalize("NFKC", text)) * (fontsize / self.CHAR_ASPECT)

    def text_encode(self, text: str) -> bytes:
        """Encode ``text`` as big-endian UTF-16 bytes."""
        return text.encode('utf-16be')

    def register(self, pdf: Pdf):
        """Register the glyphless font.

        Create several data structures in the Pdf to describe the font. While
        this creates the data, a reference should be set in at least one
        page's /Resources dictionary to retain the font in the output PDF and
        ensure it is usable on that page.

        Returns the indirect Type0 font dictionary.
        """
        # Stand-in value for entries that are filled in further down, once
        # the objects they refer to have been created.
        PLACEHOLDER = Name.Placeholder

        # Top-level Type0 font dictionary.
        basefont = pdf.make_indirect(
            Dictionary(
                BaseFont=Name.GlyphLessFont,
                DescendantFonts=[PLACEHOLDER],
                Encoding=Name("/Identity-H"),
                Subtype=Name.Type0,
                ToUnicode=PLACEHOLDER,
                Type=Name.Font,
            )
        )
        # Descendant CIDFontType2 dictionary; DW is the default glyph width.
        cid_font_type2 = pdf.make_indirect(
            Dictionary(
                BaseFont=Name.GlyphLessFont,
                CIDToGIDMap=PLACEHOLDER,
                CIDSystemInfo=Dictionary(
                    Ordering="Identity",
                    Registry="Adobe",
                    Supplement=0,
                ),
                FontDescriptor=PLACEHOLDER,
                Subtype=Name.CIDFontType2,
                Type=Name.Font,
                DW=1000 // self.CHAR_ASPECT,
            )
        )
        basefont.DescendantFonts = [cid_font_type2]
        cid_font_type2.CIDToGIDMap = pdf.make_stream(
            self.CID_TO_GID_DATA, Filter=Name.FlateDecode
        )
        # ToUnicode CMap with a single bfrange covering <0000>..<FFFF>.
        basefont.ToUnicode = pdf.make_stream(
            b"/CIDInit /ProcSet findresource begin\n"
            b"12 dict begin\n"
            b"begincmap\n"
            b"/CIDSystemInfo\n"
            b"<<\n"
            b" /Registry (Adobe)\n"
            b" /Ordering (UCS)\n"
            b" /Supplement 0\n"
            b">> def\n"
            b"/CMapName /Adobe-Identify-UCS def\n"
            b"/CMapType 2 def\n"
            b"1 begincodespacerange\n"
            b"<0000> <FFFF>\n"
            b"endcodespacerange\n"
            b"1 beginbfrange\n"
            b"<0000> <FFFF> <0000>\n"
            b"endbfrange\n"
            b"endcmap\n"
            b"CMapName currentdict /CMap defineresource pop\n"
            b"end\n"
            b"end\n"
        )
        # Font descriptor; FontFile2 receives the embedded font program below.
        font_descriptor = pdf.make_indirect(
            Dictionary(
                Ascent=1000,
                CapHeight=1000,
                Descent=-1,
                Flags=5,  # Fixed pitch and symbolic
                FontBBox=[0, 0, 1000 // self.CHAR_ASPECT, 1000],
                FontFile2=PLACEHOLDER,
                FontName=Name.GlyphLessFont,
                ItalicAngle=0,
                StemV=80,
                Type=Name.FontDescriptor,
            )
        )
        font_descriptor.FontFile2 = pdf.make_stream(self.GLYPHLESS_FONT)
        cid_font_type2.FontDescriptor = font_descriptor
        return basefont
|
122
|
+
|
123
|
+
|
124
|
+
class Courier(EncodableFont):
    """Courier font."""

    def text_width(self, text: str, fontsize: float) -> float:
        """Estimate how wide ``text`` is at ``fontsize``.

        Every character is counted as one full font size wide.
        """
        # NOTE(review): Courier glyphs are commonly 0.6 em wide; this estimate
        # uses 1 em per character - confirm whether that is intended.
        char_count = len(text)
        return char_count * fontsize

    def text_encode(self, text: str) -> bytes:
        """Encode ``text`` with the 'pdfdoc' codec, dropping unencodable characters."""
        # presumably the 'pdfdoc' codec is registered by pikepdf on import
        encoded = text.encode('pdfdoc', errors='ignore')
        return encoded

    def register(self, pdf: Pdf) -> Dictionary:
        """Add a Type1 Courier font dictionary to ``pdf`` as an indirect object."""
        font_dict = Dictionary(
            BaseFont=Name.Courier,
            Type=Name.Font,
            Subtype=Name.Type1,
        )
        return pdf.make_indirect(font_dict)
|