natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. natural_pdf/__init__.py +29 -40
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +20 -18
  8. natural_pdf/core/pdf.py +146 -13
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +374 -30
  11. natural_pdf/elements/region.py +45 -14
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +519 -0
  16. natural_pdf/exporters/hocr_font.py +136 -0
  17. natural_pdf/exporters/original_pdf.py +127 -0
  18. natural_pdf/exporters/searchable_pdf.py +2 -12
  19. natural_pdf/ocr/engine_surya.py +1 -1
  20. natural_pdf/search/__init__.py +65 -52
  21. natural_pdf/search/lancedb_search_service.py +325 -0
  22. natural_pdf/search/numpy_search_service.py +255 -0
  23. natural_pdf/search/searchable_mixin.py +25 -71
  24. natural_pdf/widgets/viewer.py +22 -31
  25. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
  26. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
  27. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  28. natural_pdf/search/haystack_search_service.py +0 -687
  29. natural_pdf/search/haystack_utils.py +0 -474
  30. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  31. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,519 @@
1
+ # SPDX-FileCopyrightText: 2010 Jonathan Brinley
2
+ # SPDX-FileCopyrightText: 2013-2014 Julien Pfefferkorn
3
+ # SPDX-FileCopyrightText: 2023 James R. Barlow
4
+ # SPDX-FileCopyrightText: 2025 Odin Dahlström
5
+ # SPDX-License-Identifier: MIT
6
+
7
+ """hOCR transform implementation."""
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import os
13
+ import re
14
+ import unicodedata
15
+ from dataclasses import dataclass
16
+ from itertools import pairwise
17
+ from math import atan, pi
18
+ from pathlib import Path
19
+ from xml.etree import ElementTree
20
+
21
+ from pikepdf import Matrix, Name, Rectangle
22
+ from pikepdf.canvas import (
23
+ BLACK,
24
+ BLUE,
25
+ CYAN,
26
+ DARKGREEN,
27
+ GREEN,
28
+ MAGENTA,
29
+ RED,
30
+ Canvas,
31
+ Text,
32
+ TextDirection,
33
+ )
34
+
35
+ from natural_pdf.exporters.hocr_font import EncodableFont as Font
36
+ from natural_pdf.exporters.hocr_font import GlyphlessFont
37
+
38
+ log = logging.getLogger(__name__)
39
+
40
+ INCH = 72.0
41
+
42
+ Element = ElementTree.Element
43
+
44
+
45
@dataclass
class DebugRenderOptions:
    """Switches selecting which debugging overlays are drawn on the page.

    Every switch defaults to ``False``, so a default-constructed instance
    draws no overlays at all. The field order below is part of the positional
    constructor signature and must not be changed.
    """

    # Outline each paragraph's bounding box.
    render_paragraph_bbox: bool = False
    # Draw the baseline of each text line.
    render_baseline: bool = False
    # Draw a triangle per word showing its height and drawing direction.
    render_triangle: bool = False
    # Outline each text line's bounding box.
    render_line_bbox: bool = False
    # Outline each word's bounding box.
    render_word_bbox: bool = False
    # Fill the gap between consecutive words.
    render_space_bbox: bool = False
55
+
56
+
57
class HocrTransformError(Exception):
    """Raised when an hOCR document cannot be transformed.

    In this module it signals an hOCR page element that carries no usable
    ``bbox`` (page dimensions).
    """
59
+
60
+
61
class HocrTransform:
    """A class for converting documents from the hOCR format.

    An instance parses one hOCR file, reads the page size from its first
    ``ocr_page`` element and can then write a single-page PDF whose text is
    positioned according to the hOCR bounding boxes (see :meth:`to_pdf`).

    For details of the hOCR format, see:
    http://kba.github.io/hocr-spec/1.2/.
    """

    # "bbox <left> <top> <right> <bottom>" inside an element's title attribute.
    box_pattern = re.compile(
        r"""
        bbox \s+
        (\d+) \s+ # left: uint
        (\d+) \s+ # top: uint
        (\d+) \s+ # right: uint
        (\d+) # bottom: uint
        """,
        re.VERBOSE,
    )
    # "baseline <slope> <intercept>" inside an element's title attribute.
    baseline_pattern = re.compile(
        r"""
        baseline \s+
        ([\-\+]?\d*\.?\d*) \s+ # +/- decimal float
        ([\-\+]?\d+) # +/- int
        """,
        re.VERBOSE,
    )
    # "textangle <degrees>" inside an element's title attribute.
    textangle_pattern = re.compile(
        r"""
        textangle \s+
        ([\-\+]?\d*\.?\d*) # +/- decimal float
        """,
        re.VERBOSE,
    )

    def __init__(
        self,
        *,
        hocr_filename: str | Path,
        dpi: float,
        debug: bool = False,
        fontname: Name = Name("/f-0-0"),
        font: Font = GlyphlessFont(),
        debug_render_options: DebugRenderOptions | None = None,
    ):
        """Initialize the HocrTransform object.

        Args:
            hocr_filename: Path of the hOCR file to parse.
            dpi: Resolution used to convert hOCR pixel coordinates to PDF
                points (``INCH`` points per inch).
            debug: Deprecated switch. When true, the baseline, word triangle
                and word bounding box overlays are enabled and
                ``debug_render_options`` is ignored.
            fontname: PDF resource name under which ``font`` is registered.
            font: Font used to measure and encode the text. The default
                instance is created once and shared by all transforms.
            debug_render_options: Overlays to draw; defaults to none.

        Raises:
            HocrTransformError: If the file has no ``ocr_page`` element, or
                the first one has no ``bbox``.
        """
        if debug:
            # The message has no %-placeholder, so nothing may be passed as a
            # formatting argument (doing so makes logging report an error
            # instead of emitting the message).
            log.warning("Use debug_render_options instead")
            self.render_options = DebugRenderOptions(
                render_baseline=debug,
                render_triangle=debug,
                render_line_bbox=False,
                render_word_bbox=debug,
                render_paragraph_bbox=False,
                render_space_bbox=False,
            )
        else:
            self.render_options = debug_render_options or DebugRenderOptions()
        self.dpi = dpi
        self.hocr = ElementTree.parse(os.fspath(hocr_filename))
        self._fontname = fontname
        self._font = font

        # if the hOCR file has a namespace, ElementTree requires its use to
        # find elements
        matches = re.match(r"({.*})html", self.hocr.getroot().tag)
        self.xmlns = ""
        if matches:
            self.xmlns = matches.group(1)

        for div in self.hocr.findall(self._child_xpath("div", "ocr_page")):
            coords = self.element_coordinates(div)
            if not coords:
                raise HocrTransformError("hocr file is missing page dimensions")
            # Page size in PDF points.
            self.width = (coords.urx - coords.llx) / (self.dpi / INCH)
            self.height = (coords.ury - coords.lly) / (self.dpi / INCH)
            # Stop after first div that has page coordinates
            break
        else:
            # No ocr_page element at all: fail here with a clear error rather
            # than with an AttributeError on self.width inside to_pdf().
            raise HocrTransformError("hocr file is missing page dimensions")

    def _get_element_text(self, element: Element) -> str:
        """Return the textual content of the element and its children.

        The tail text that follows ``element`` itself is included as well.
        """
        text = element.text or ""
        for child in element:
            text += self._get_element_text(child)
        text += element.tail or ""
        return text

    @classmethod
    def element_coordinates(cls, element: Element) -> Rectangle | None:
        """Get coordinates of the bounding box around an element.

        Returns:
            The ``bbox`` from the element's ``title`` attribute in hOCR pixel
            coordinates, or ``None`` when the element has no ``bbox``.
        """
        matches = cls.box_pattern.search(element.attrib.get("title", ""))
        if not matches:
            return None
        return Rectangle(
            float(matches.group(1)),  # llx = left
            float(matches.group(2)),  # lly = top
            float(matches.group(3)),  # urx = right
            float(matches.group(4)),  # ury = bottom
        )

    @classmethod
    def baseline(cls, element: Element) -> tuple[float, float]:
        """Get baseline's slope and intercept.

        Returns:
            ``(slope, intercept)`` from the element's ``title`` attribute, or
            ``(0.0, 0.0)`` when the property is absent or not a valid number.
        """
        matches = cls.baseline_pattern.search(element.attrib.get("title", ""))
        if not matches:
            return (0.0, 0.0)
        try:
            return float(matches.group(1)), int(matches.group(2))
        except ValueError:
            # The slope pattern also accepts "", "." or a lone sign, none of
            # which float() can parse; treat those like a missing baseline.
            return (0.0, 0.0)

    @classmethod
    def textangle(cls, element: Element) -> float:
        """Get text angle of an element.

        Returns:
            The ``textangle`` value from the element's ``title`` attribute,
            or ``0.0`` when the property is absent or not a valid number.
        """
        matches = cls.textangle_pattern.search(element.attrib.get("title", ""))
        if not matches:
            return 0.0
        try:
            return float(matches.group(1))
        except ValueError:
            # The pattern also accepts "", "." or a lone sign; treat those
            # like a missing textangle.
            return 0.0

    def _child_xpath(self, html_tag: str, html_class: str | None = None) -> str:
        """Build a descendant XPath for ``html_tag`` in the document namespace.

        When ``html_class`` is given, only elements whose ``class`` attribute
        equals it exactly are matched.
        """
        xpath = f".//{self.xmlns}{html_tag}"
        if html_class:
            xpath += f"[@class='{html_class}']"
        return xpath

    @classmethod
    def normalize_text(cls, s: str) -> str:
        """Normalize the given text using the NFKC normalization form."""
        return unicodedata.normalize("NFKC", s)

    def to_pdf(
        self,
        *,
        out_filename: Path,
        image_filename: Path | None = None,
        invisible_text: bool = True,
    ) -> None:
        """Creates a PDF file with an image superimposed on top of the text.

        Text is positioned according to the bounding box of the lines in
        the hOCR file.
        The image need not be identical to the image used to create the hOCR
        file.
        It can have a lower resolution, different color mode, etc.

        Arguments:
            out_filename: Path of PDF to write.
            image_filename: Image to use for this file. If omitted, the OCR text
                is shown.
            invisible_text: If True, text is rendered invisible so that is
                selectable but never drawn. If False, text is visible and may
                be seen if the image is skipped or deleted in Acrobat.
        """
        line_classes = {"ocr_header", "ocr_line", "ocr_textfloat", "ocr_caption"}

        # create the PDF file
        # page size in points (1/72 in.)
        canvas = Canvas(page_size=(self.width, self.height))
        canvas.add_font(self._fontname, self._font)
        # Maps hOCR pixel coordinates (origin top-left, y pointing down) to
        # PDF points (origin bottom-left, y pointing up).
        page_matrix = (
            Matrix()
            .translated(0, self.height)
            .scaled(1, -1)
            .scaled(INCH / self.dpi, INCH / self.dpi)
        )
        log.debug(page_matrix)
        with canvas.do.save_state(cm=page_matrix):
            self._debug_draw_paragraph_boxes(canvas)
            found_lines = False
            for par in self.hocr.iterfind(self._child_xpath("p", "ocr_par")):
                lines = [
                    span
                    for span in par.iterfind(self._child_xpath("span"))
                    if span.attrib.get("class") in line_classes
                ]
                if not lines:
                    continue
                found_lines = True
                # Both settings depend only on the paragraph, so look them up
                # once per paragraph rather than once per line.
                direction = self._get_text_direction(par)
                inject_word_breaks = self._get_inject_word_breaks(par)
                for line in lines:
                    self._do_line(
                        canvas,
                        line,
                        "ocrx_word",
                        invisible_text,
                        direction,
                        inject_word_breaks,
                    )

            if not found_lines:
                # Tesseract did not report any lines (just words)
                root = self.hocr.find(self._child_xpath("div", "ocr_page"))
                direction = self._get_text_direction(root)
                self._do_line(
                    canvas,
                    root,
                    "ocrx_word",
                    invisible_text,
                    direction,
                    True,
                )
        # put the image on the page, scaled to fill the page
        if image_filename is not None:
            canvas.do.draw_image(image_filename, 0, 0, width=self.width, height=self.height)

        # finish up the page and save it
        canvas.to_pdf().save(out_filename)

    def _get_text_direction(self, par):
        """Get the text direction of the paragraph.

        Arabic, Hebrew, Persian, are right-to-left languages.
        When the paragraph element is None, defaults to left-to-right.
        """
        if par is None:
            return TextDirection.LTR

        return TextDirection.RTL if par.attrib.get("dir", "ltr") == "rtl" else TextDirection.LTR

    def _get_inject_word_breaks(self, par):
        """Determine whether word breaks should be injected.

        In Chinese, Japanese, and Korean, word breaks are not injected, because
        words are usually one or two characters and separators are usually explicit.
        In all other languages, we inject word breaks to help word segmentation.
        """
        lang = par.attrib.get("lang", "")
        log.debug(lang)
        return lang not in {"chi_sim", "chi_tra", "jpn", "kor"}

    @classmethod
    def polyval(cls, poly, x):  # pragma: no cover
        """Calculate the value of a polynomial at a point.

        ``poly`` is a first-degree polynomial given as ``(slope, intercept)``.
        """
        return x * poly[0] + poly[1]

    def _do_line(
        self,
        canvas: Canvas,
        line: Element | None,
        elemclass: str,
        invisible_text: bool,
        text_direction: TextDirection,
        inject_word_breaks: bool,
    ):
        """Render the text for a given line.

        The canvas's coordinate system must be configured so that hOCR pixel
        coordinates are mapped to PDF coordinates.

        Args:
            canvas: Canvas to draw on.
            line: hOCR element holding the words; ``None`` is ignored.
            elemclass: ``class`` of the word ``span`` elements to render.
            invisible_text: Use the invisible text render mode when true.
            text_direction: Writing direction of the line.
            inject_word_breaks: Whether to emit a space between words.
        """
        if line is None:
            return
        # line_min_aabb (which is created from the "bbox" hOCR property) is so named
        # because a Rectangle instance is always an AABB (it has no orientation).
        # However, this means that for non-zero values of the "textangle" hOCR
        # property, line_min_aabb is not the true bounding box of the hOCR line,
        # but rather the minimum AABB that encloses the bounding box of the line.
        # The true bounding box of the line must be seen as an OBB, due to the
        # existence of the "textangle" hOCR property.
        line_min_aabb = self.element_coordinates(line)
        if not line_min_aabb:
            return
        if line_min_aabb.ury <= line_min_aabb.lly:
            log.error(
                "line box is invalid so we cannot render it: box=%s text=%s",
                line_min_aabb,
                self._get_element_text(line),
            )
            return
        self._debug_draw_line_bbox(canvas, line_min_aabb)

        # Even though line_min_aabb is not the true bounding box of the line,
        # it is still possible to derive an AABB (Rectangle) from it that is
        # the same size as the true bounding box of the line,
        # if we use a coordinate system that is axis-aligned with respect to
        # the rotation of the OBB (textangle).
        # line_size_aabb_matrix is a transform matrix for such a coordinate
        # system, and line_size_aabb is thus an AABB with the same
        # size as the true bounding box of the line.
        top_left_corner = (line_min_aabb.llx, line_min_aabb.lly)
        line_size_aabb_matrix = (
            Matrix().translated(*top_left_corner)
            # Note: negative sign (textangle is counter-clockwise, see hOCR spec)
            .rotated(-self.textangle(line))
        )
        line_size_aabb = line_size_aabb_matrix.inverse().transform(line_min_aabb)

        slope, intercept = self.baseline(line)
        # Treat a nearly flat baseline as exactly horizontal.
        if abs(slope) < 0.005:
            slope = 0.0
        slope_angle = atan(slope)

        # Final PDF-perspective (bottom-left corner) transform matrix for the
        # text baseline, which has an intercept and slope relative to the OBB.
        # See "bbox", "textangle" and "baseline" in the hOCR spec for more details.
        baseline_matrix = (
            line_size_aabb_matrix
            # Translate from hOCR perspective (top-left corner) to PDF perspective
            # (bottom-left corner).
            # Note: it would be incorrect to use line_min_aabb.height here because
            # it is not the true height of the OBB of the line, if textangle != 0.
            .translated(0, line_size_aabb.height)
            .translated(0, intercept)
            .rotated(slope_angle / pi * 180)
        )

        with canvas.do.save_state(cm=baseline_matrix):
            text = Text(direction=text_direction)
            fontsize = line_size_aabb.height + intercept
            text.font(self._fontname, fontsize)
            text.render_mode(3 if invisible_text else 0)

            self._debug_draw_baseline(canvas, baseline_matrix.inverse().transform(line_min_aabb), 0)

            canvas.do.fill_color(BLACK)  # text in black
            elements = line.findall(self._child_xpath("span", elemclass))
            # The trailing None pairs the last word with "no next word".
            for elem, next_elem in pairwise(elements + [None]):
                self._do_line_word(
                    canvas,
                    baseline_matrix,
                    text,
                    fontsize,
                    elem,
                    next_elem,
                    text_direction,
                    inject_word_breaks,
                )
            canvas.do.draw_text(text)

    def _do_line_word(
        self,
        canvas: Canvas,
        line_matrix: Matrix,
        text: Text,
        fontsize: float,
        elem: Element | None,
        next_elem: Element | None,
        text_direction: TextDirection,
        inject_word_breaks: bool,
    ):
        """Render the text for a single word.

        The word is stretched horizontally to fill its hOCR box. When a next
        word exists and ``inject_word_breaks`` is true, a space stretched
        across the gap between the two words is emitted as well.

        Args:
            canvas: Canvas used for the debug overlays.
            line_matrix: Baseline transform of the enclosing line.
            text: Text object that collects the line's drawing operations.
            fontsize: Font size used for the line.
            elem: Word element to render; ``None`` is ignored.
            next_elem: The following word element, or ``None`` for the last.
            text_direction: Writing direction; anything other than RTL is
                handled as LTR.
            inject_word_breaks: Whether to emit a space after the word.
        """
        if elem is None:
            return
        elemtxt = self.normalize_text(self._get_element_text(elem).strip())
        if elemtxt == "":
            return

        hocr_box = self.element_coordinates(elem)
        if hocr_box is None:
            return
        # Maps hOCR page coordinates into the line's baseline coordinate system.
        to_line_space = line_matrix.inverse()
        box = to_line_space.transform(hocr_box)
        font_width = self._font.text_width(elemtxt, fontsize)
        is_rtl = text_direction == TextDirection.RTL

        # Debug sketches
        self._debug_draw_word_triangle(canvas, box)
        self._debug_draw_word_bbox(canvas, box)

        if is_rtl:
            log.info("RTL: %s", elemtxt)
        # If the font reports a width of 0 for this word, our best bet seems
        # to be to suppress this text
        if font_width > 0:
            if is_rtl:
                text.text_transform(Matrix(-1, 0, 0, -1, box.llx + box.width, 0))
            else:
                text.text_transform(Matrix(1, 0, 0, -1, box.llx, 0))
            text.horiz_scale(100 * box.width / font_width)
            text.show(self._font.text_encode(elemtxt))

        # Get coordinates of the next word (if there is one)
        hocr_next_box = self.element_coordinates(next_elem) if next_elem is not None else None
        if hocr_next_box is None:
            return
        # Render a space between this word and the next word. The explicit space helps
        # PDF viewers identify the word break, and horizontally scaling it to
        # occupy the space between the words helps the PDF viewer
        # avoid combiningthewordstogether.
        if not inject_word_breaks:
            return
        next_box = to_line_space.transform(hocr_next_box)
        if is_rtl:
            space_box = Rectangle(next_box.urx, box.lly, box.llx, next_box.ury)
        else:
            space_box = Rectangle(box.urx, box.lly, next_box.llx, next_box.ury)
        self._debug_draw_space_bbox(canvas, space_box)
        space_width = self._font.text_width(" ", fontsize)
        if space_width > 0 and space_box.width > 0:
            if is_rtl:
                text.text_transform(Matrix(-1, 0, 0, -1, space_box.llx + space_box.width, 0))
            else:
                text.text_transform(Matrix(1, 0, 0, -1, space_box.llx, 0))
            text.horiz_scale(100 * space_box.width / space_width)
            text.show(self._font.text_encode(" "))

    def _debug_draw_paragraph_boxes(self, canvas: Canvas, color=CYAN):
        """Draw boxes around paragraphs in the document.

        Does nothing unless ``render_paragraph_bbox`` is enabled. Paragraphs
        without text or without a ``bbox`` are skipped.
        """
        if not self.render_options.render_paragraph_bbox:  # pragma: no cover
            return
        with canvas.do.save_state():
            # draw box around paragraph
            canvas.do.stroke_color(color).line_width(0.1)
            for elem in self.hocr.iterfind(self._child_xpath("p", "ocr_par")):
                elemtxt = self._get_element_text(elem).strip()
                if not elemtxt:
                    continue
                ocr_par = self.element_coordinates(elem)
                if ocr_par is None:
                    continue
                canvas.do.rect(ocr_par.llx, ocr_par.lly, ocr_par.width, ocr_par.height, fill=False)

    def _debug_draw_line_bbox(self, canvas: Canvas, line_box: Rectangle, color=BLUE):
        """Render the bounding box of a text line.

        Does nothing unless ``render_line_bbox`` is enabled.
        """
        if not self.render_options.render_line_bbox:  # pragma: no cover
            return
        with canvas.do.save_state():
            canvas.do.stroke_color(color).line_width(0.15).rect(
                line_box.llx, line_box.lly, line_box.width, line_box.height, fill=False
            )

    def _debug_draw_word_triangle(self, canvas: Canvas, box: Rectangle, color=RED, line_width=0.1):
        """Render a triangle that conveys word height and drawing direction.

        Does nothing unless ``render_triangle`` is enabled.
        """
        if not self.render_options.render_triangle:  # pragma: no cover
            return
        with canvas.do.save_state():
            canvas.do.stroke_color(color).line_width(line_width).line(
                box.llx, box.lly, box.urx, box.lly
            ).line(box.urx, box.lly, box.llx, box.ury).line(box.llx, box.lly, box.llx, box.ury)

    def _debug_draw_word_bbox(self, canvas: Canvas, box: Rectangle, color=GREEN, line_width=0.1):
        """Render a box depicting the word.

        Does nothing unless ``render_word_bbox`` is enabled.
        """
        if not self.render_options.render_word_bbox:  # pragma: no cover
            return
        with canvas.do.save_state():
            canvas.do.stroke_color(color).line_width(line_width).rect(
                box.llx, box.lly, box.width, box.height, fill=False
            )

    def _debug_draw_space_bbox(
        self, canvas: Canvas, box: Rectangle, color=DARKGREEN, line_width=0.1
    ):
        """Render a box depicting the space between two words.

        Does nothing unless ``render_space_bbox`` is enabled. Unlike the other
        overlays, this box is filled rather than outlined.
        """
        if not self.render_options.render_space_bbox:  # pragma: no cover
            return
        with canvas.do.save_state():
            canvas.do.fill_color(color).line_width(line_width).rect(
                box.llx, box.lly, box.width, box.height, fill=True
            )

    def _debug_draw_baseline(
        self,
        canvas: Canvas,
        line_box: Rectangle,
        baseline_lly,
        color=MAGENTA,
        line_width=0.25,
    ):
        """Render the text baseline.

        Draws a horizontal segment at height ``baseline_lly`` spanning the
        horizontal extent of ``line_box``. Does nothing unless
        ``render_baseline`` is enabled.
        """
        if not self.render_options.render_baseline:
            return
        with canvas.do.save_state():
            canvas.do.stroke_color(color).line_width(line_width).line(
                line_box.llx,
                baseline_lly,
                line_box.urx,
                baseline_lly,
            )
@@ -0,0 +1,136 @@
1
+ # SPDX-FileCopyrightText: 2023 James R. Barlow
2
+ # SPDX-License-Identifier: MPL-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import logging
7
+ import unicodedata
8
+ import zlib
9
+ from importlib.resources import files as package_files
10
+
11
+ from pikepdf import Dictionary, Name, Pdf
12
+ from pikepdf.canvas import Font
13
+
14
+ log = logging.getLogger(__name__)
15
+
16
+
17
+ class EncodableFont(Font):
18
+ def text_encode(self, text: str) -> bytes:
19
+ raise NotImplementedError()
20
+
21
+
22
+ class GlyphlessFont(EncodableFont):
23
+ CID_TO_GID_DATA = zlib.compress(b"\x00\x01" * 65536)
24
+ GLYPHLESS_FONT_NAME = "pdf.ttf"
25
+ GLYPHLESS_FONT_PACKAGE_PATH = "natural_pdf.exporters.data"
26
+ GLYPHLESS_FONT = (package_files(GLYPHLESS_FONT_PACKAGE_PATH) / GLYPHLESS_FONT_NAME).read_bytes()
27
+ CHAR_ASPECT = 2
28
+
29
+ def __init__(self):
30
+ pass
31
+
32
+ def text_width(self, text: str, fontsize: float) -> float:
33
+ """Estimate the width of a text string when rendered with the given font."""
34
+ # NFKC: split ligatures, combine diacritics
35
+ return len(unicodedata.normalize("NFKC", text)) * (fontsize / self.CHAR_ASPECT)
36
+
37
+ def text_encode(self, text: str) -> bytes:
38
+ return text.encode("utf-16be")
39
+
40
+ def register(self, pdf: Pdf):
41
+ """Register the glyphless font.
42
+
43
+ Create several data structures in the Pdf to describe the font. While it create
44
+ the data, a reference should be set in at least one page's /Resources dictionary
45
+ to retain the font in the output PDF and ensure it is usable on that page.
46
+ """
47
+ PLACEHOLDER = Name.Placeholder
48
+
49
+ basefont = pdf.make_indirect(
50
+ Dictionary(
51
+ BaseFont=Name.GlyphLessFont,
52
+ DescendantFonts=[PLACEHOLDER],
53
+ Encoding=Name("/Identity-H"),
54
+ Subtype=Name.Type0,
55
+ ToUnicode=PLACEHOLDER,
56
+ Type=Name.Font,
57
+ )
58
+ )
59
+ cid_font_type2 = pdf.make_indirect(
60
+ Dictionary(
61
+ BaseFont=Name.GlyphLessFont,
62
+ CIDToGIDMap=PLACEHOLDER,
63
+ CIDSystemInfo=Dictionary(
64
+ Ordering="Identity",
65
+ Registry="Adobe",
66
+ Supplement=0,
67
+ ),
68
+ FontDescriptor=PLACEHOLDER,
69
+ Subtype=Name.CIDFontType2,
70
+ Type=Name.Font,
71
+ DW=1000 // self.CHAR_ASPECT,
72
+ )
73
+ )
74
+ basefont.DescendantFonts = [cid_font_type2]
75
+ cid_font_type2.CIDToGIDMap = pdf.make_stream(self.CID_TO_GID_DATA, Filter=Name.FlateDecode)
76
+ basefont.ToUnicode = pdf.make_stream(
77
+ b"/CIDInit /ProcSet findresource begin\n"
78
+ b"12 dict begin\n"
79
+ b"begincmap\n"
80
+ b"/CIDSystemInfo\n"
81
+ b"<<\n"
82
+ b" /Registry (Adobe)\n"
83
+ b" /Ordering (UCS)\n"
84
+ b" /Supplement 0\n"
85
+ b">> def\n"
86
+ b"/CMapName /Adobe-Identify-UCS def\n"
87
+ b"/CMapType 2 def\n"
88
+ b"1 begincodespacerange\n"
89
+ b"<0000> <FFFF>\n"
90
+ b"endcodespacerange\n"
91
+ b"1 beginbfrange\n"
92
+ b"<0000> <FFFF> <0000>\n"
93
+ b"endbfrange\n"
94
+ b"endcmap\n"
95
+ b"CMapName currentdict /CMap defineresource pop\n"
96
+ b"end\n"
97
+ b"end\n"
98
+ )
99
+ font_descriptor = pdf.make_indirect(
100
+ Dictionary(
101
+ Ascent=1000,
102
+ CapHeight=1000,
103
+ Descent=-1,
104
+ Flags=5, # Fixed pitch and symbolic
105
+ FontBBox=[0, 0, 1000 // self.CHAR_ASPECT, 1000],
106
+ FontFile2=PLACEHOLDER,
107
+ FontName=Name.GlyphLessFont,
108
+ ItalicAngle=0,
109
+ StemV=80,
110
+ Type=Name.FontDescriptor,
111
+ )
112
+ )
113
+ font_descriptor.FontFile2 = pdf.make_stream(self.GLYPHLESS_FONT)
114
+ cid_font_type2.FontDescriptor = font_descriptor
115
+ return basefont
116
+
117
+
118
+ class Courier(EncodableFont):
119
+ """Courier font."""
120
+
121
+ def text_width(self, text: str, fontsize: float) -> float:
122
+ """Estimate the width of a text string when rendered with the given font."""
123
+ return len(text) * fontsize
124
+
125
+ def text_encode(self, text: str) -> bytes:
126
+ return text.encode("pdfdoc", errors="ignore")
127
+
128
+ def register(self, pdf: Pdf) -> Dictionary:
129
+ """Register the font."""
130
+ return pdf.make_indirect(
131
+ Dictionary(
132
+ BaseFont=Name.Courier,
133
+ Type=Name.Font,
134
+ Subtype=Name.Type1,
135
+ )
136
+ )