natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +7 -2
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +3 -4
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +146 -75
- natural_pdf/core/page.py +287 -188
- natural_pdf/core/pdf.py +57 -42
- natural_pdf/elements/base.py +51 -0
- natural_pdf/elements/collections.py +362 -67
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +396 -23
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +40 -61
- natural_pdf/exporters/hocr_font.py +7 -13
- natural_pdf/exporters/original_pdf.py +10 -13
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/exporters/searchable_pdf.py +0 -10
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/selectors/parser.py +163 -8
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
natural_pdf/exporters/hocr.py
CHANGED
@@ -66,28 +66,28 @@ class HocrTransform:
|
|
66
66
|
"""
|
67
67
|
|
68
68
|
box_pattern = re.compile(
|
69
|
-
r
|
69
|
+
r"""
|
70
70
|
bbox \s+
|
71
71
|
(\d+) \s+ # left: uint
|
72
72
|
(\d+) \s+ # top: uint
|
73
73
|
(\d+) \s+ # right: uint
|
74
74
|
(\d+) # bottom: uint
|
75
|
-
|
75
|
+
""",
|
76
76
|
re.VERBOSE,
|
77
77
|
)
|
78
78
|
baseline_pattern = re.compile(
|
79
|
-
r
|
79
|
+
r"""
|
80
80
|
baseline \s+
|
81
81
|
([\-\+]?\d*\.?\d*) \s+ # +/- decimal float
|
82
82
|
([\-\+]?\d+) # +/- int
|
83
|
-
|
83
|
+
""",
|
84
84
|
re.VERBOSE,
|
85
85
|
)
|
86
86
|
textangle_pattern = re.compile(
|
87
|
-
r
|
87
|
+
r"""
|
88
88
|
textangle \s+
|
89
89
|
([\-\+]?\d*\.?\d*) # +/- decimal float
|
90
|
-
|
90
|
+
""",
|
91
91
|
re.VERBOSE,
|
92
92
|
)
|
93
93
|
|
@@ -121,12 +121,12 @@ class HocrTransform:
|
|
121
121
|
|
122
122
|
# if the hOCR file has a namespace, ElementTree requires its use to
|
123
123
|
# find elements
|
124
|
-
matches = re.match(r
|
125
|
-
self.xmlns =
|
124
|
+
matches = re.match(r"({.*})html", self.hocr.getroot().tag)
|
125
|
+
self.xmlns = ""
|
126
126
|
if matches:
|
127
127
|
self.xmlns = matches.group(1)
|
128
128
|
|
129
|
-
for div in self.hocr.findall(self._child_xpath(
|
129
|
+
for div in self.hocr.findall(self._child_xpath("div", "ocr_page")):
|
130
130
|
coords = self.element_coordinates(div)
|
131
131
|
if not coords:
|
132
132
|
raise HocrTransformError("hocr file is missing page dimensions")
|
@@ -137,16 +137,16 @@ class HocrTransform:
|
|
137
137
|
|
138
138
|
def _get_element_text(self, element: Element) -> str:
|
139
139
|
"""Return the textual content of the element and its children."""
|
140
|
-
text = element.text if element.text is not None else
|
140
|
+
text = element.text if element.text is not None else ""
|
141
141
|
for child in element:
|
142
142
|
text += self._get_element_text(child)
|
143
|
-
text += element.tail if element.tail is not None else
|
143
|
+
text += element.tail if element.tail is not None else ""
|
144
144
|
return text
|
145
145
|
|
146
146
|
@classmethod
|
147
147
|
def element_coordinates(cls, element: Element) -> Rectangle | None:
|
148
148
|
"""Get coordinates of the bounding box around an element."""
|
149
|
-
matches = cls.box_pattern.search(element.attrib.get(
|
149
|
+
matches = cls.box_pattern.search(element.attrib.get("title", ""))
|
150
150
|
if not matches:
|
151
151
|
return None
|
152
152
|
return Rectangle(
|
@@ -159,7 +159,7 @@ class HocrTransform:
|
|
159
159
|
@classmethod
|
160
160
|
def baseline(cls, element: Element) -> tuple[float, float]:
|
161
161
|
"""Get baseline's slope and intercept."""
|
162
|
-
matches = cls.baseline_pattern.search(element.attrib.get(
|
162
|
+
matches = cls.baseline_pattern.search(element.attrib.get("title", ""))
|
163
163
|
if not matches:
|
164
164
|
return (0.0, 0.0)
|
165
165
|
return float(matches.group(1)), int(matches.group(2))
|
@@ -167,7 +167,7 @@ class HocrTransform:
|
|
167
167
|
@classmethod
|
168
168
|
def textangle(cls, element: Element) -> float:
|
169
169
|
"""Get text angle of an element."""
|
170
|
-
matches = cls.textangle_pattern.search(element.attrib.get(
|
170
|
+
matches = cls.textangle_pattern.search(element.attrib.get("title", ""))
|
171
171
|
if not matches:
|
172
172
|
return 0.0
|
173
173
|
return float(matches.group(1))
|
@@ -220,13 +220,13 @@ class HocrTransform:
|
|
220
220
|
with canvas.do.save_state(cm=page_matrix):
|
221
221
|
self._debug_draw_paragraph_boxes(canvas)
|
222
222
|
found_lines = False
|
223
|
-
for par in self.hocr.iterfind(self._child_xpath(
|
223
|
+
for par in self.hocr.iterfind(self._child_xpath("p", "ocr_par")):
|
224
224
|
for line in (
|
225
225
|
element
|
226
|
-
for element in par.iterfind(self._child_xpath(
|
227
|
-
if
|
228
|
-
and element.attrib[
|
229
|
-
in {
|
226
|
+
for element in par.iterfind(self._child_xpath("span"))
|
227
|
+
if "class" in element.attrib
|
228
|
+
and element.attrib["class"]
|
229
|
+
in {"ocr_header", "ocr_line", "ocr_textfloat", "ocr_caption"}
|
230
230
|
):
|
231
231
|
found_lines = True
|
232
232
|
direction = self._get_text_direction(par)
|
@@ -242,7 +242,7 @@ class HocrTransform:
|
|
242
242
|
|
243
243
|
if not found_lines:
|
244
244
|
# Tesseract did not report any lines (just words)
|
245
|
-
root = self.hocr.find(self._child_xpath(
|
245
|
+
root = self.hocr.find(self._child_xpath("div", "ocr_page"))
|
246
246
|
direction = self._get_text_direction(root)
|
247
247
|
self._do_line(
|
248
248
|
canvas,
|
@@ -254,27 +254,21 @@ class HocrTransform:
|
|
254
254
|
)
|
255
255
|
# put the image on the page, scaled to fill the page
|
256
256
|
if image_filename is not None:
|
257
|
-
canvas.do.draw_image(
|
258
|
-
image_filename, 0, 0, width=self.width, height=self.height
|
259
|
-
)
|
257
|
+
canvas.do.draw_image(image_filename, 0, 0, width=self.width, height=self.height)
|
260
258
|
|
261
259
|
# finish up the page and save it
|
262
260
|
canvas.to_pdf().save(out_filename)
|
263
261
|
|
264
262
|
def _get_text_direction(self, par):
|
265
263
|
"""Get the text direction of the paragraph.
|
266
|
-
|
264
|
+
|
267
265
|
Arabic, Hebrew, Persian, are right-to-left languages.
|
268
266
|
When the paragraph element is None, defaults to left-to-right.
|
269
267
|
"""
|
270
268
|
if par is None:
|
271
269
|
return TextDirection.LTR
|
272
|
-
|
273
|
-
return (
|
274
|
-
TextDirection.RTL
|
275
|
-
if par.attrib.get('dir', 'ltr') == 'rtl'
|
276
|
-
else TextDirection.LTR
|
277
|
-
)
|
270
|
+
|
271
|
+
return TextDirection.RTL if par.attrib.get("dir", "ltr") == "rtl" else TextDirection.LTR
|
278
272
|
|
279
273
|
def _get_inject_word_breaks(self, par):
|
280
274
|
"""Determine whether word breaks should be injected.
|
@@ -283,9 +277,9 @@ class HocrTransform:
|
|
283
277
|
words are usually one or two characters and separators are usually explicit.
|
284
278
|
In all other languages, we inject word breaks to help word segmentation.
|
285
279
|
"""
|
286
|
-
lang = par.attrib.get(
|
280
|
+
lang = par.attrib.get("lang", "")
|
287
281
|
log.debug(lang)
|
288
|
-
if lang in {
|
282
|
+
if lang in {"chi_sim", "chi_tra", "jpn", "kor"}:
|
289
283
|
return False
|
290
284
|
return True
|
291
285
|
|
@@ -339,8 +333,7 @@ class HocrTransform:
|
|
339
333
|
# size as the true bounding box of the line.
|
340
334
|
top_left_corner = (line_min_aabb.llx, line_min_aabb.lly)
|
341
335
|
line_size_aabb_matrix = (
|
342
|
-
Matrix()
|
343
|
-
.translated(*top_left_corner)
|
336
|
+
Matrix().translated(*top_left_corner)
|
344
337
|
# Note: negative sign (textangle is counter-clockwise, see hOCR spec)
|
345
338
|
.rotated(-self.textangle(line))
|
346
339
|
)
|
@@ -371,12 +364,10 @@ class HocrTransform:
|
|
371
364
|
text.font(self._fontname, fontsize)
|
372
365
|
text.render_mode(3 if invisible_text else 0)
|
373
366
|
|
374
|
-
self._debug_draw_baseline(
|
375
|
-
canvas, baseline_matrix.inverse().transform(line_min_aabb), 0
|
376
|
-
)
|
367
|
+
self._debug_draw_baseline(canvas, baseline_matrix.inverse().transform(line_min_aabb), 0)
|
377
368
|
|
378
369
|
canvas.do.fill_color(BLACK) # text in black
|
379
|
-
elements = line.findall(self._child_xpath(
|
370
|
+
elements = line.findall(self._child_xpath("span", elemclass))
|
380
371
|
for elem, next_elem in pairwise(elements + [None]):
|
381
372
|
self._do_line_word(
|
382
373
|
canvas,
|
@@ -405,7 +396,7 @@ class HocrTransform:
|
|
405
396
|
if elem is None:
|
406
397
|
return
|
407
398
|
elemtxt = self.normalize_text(self._get_element_text(elem).strip())
|
408
|
-
if elemtxt ==
|
399
|
+
if elemtxt == "":
|
409
400
|
return
|
410
401
|
|
411
402
|
hocr_box = self.element_coordinates(elem)
|
@@ -430,9 +421,7 @@ class HocrTransform:
|
|
430
421
|
text.show(self._font.text_encode(elemtxt))
|
431
422
|
|
432
423
|
# Get coordinates of the next word (if there is one)
|
433
|
-
hocr_next_box = (
|
434
|
-
self.element_coordinates(next_elem) if next_elem is not None else None
|
435
|
-
)
|
424
|
+
hocr_next_box = self.element_coordinates(next_elem) if next_elem is not None else None
|
436
425
|
if hocr_next_box is None:
|
437
426
|
return
|
438
427
|
# Render a space between this word and the next word. The explicit space helps
|
@@ -447,16 +436,14 @@ class HocrTransform:
|
|
447
436
|
elif text_direction == TextDirection.RTL:
|
448
437
|
space_box = Rectangle(next_box.urx, box.lly, box.llx, next_box.ury)
|
449
438
|
self._debug_draw_space_bbox(canvas, space_box)
|
450
|
-
space_width = self._font.text_width(
|
439
|
+
space_width = self._font.text_width(" ", fontsize)
|
451
440
|
if space_width > 0 and space_box.width > 0:
|
452
441
|
if text_direction == TextDirection.LTR:
|
453
442
|
text.text_transform(Matrix(1, 0, 0, -1, space_box.llx, 0))
|
454
443
|
elif text_direction == TextDirection.RTL:
|
455
|
-
text.text_transform(
|
456
|
-
Matrix(-1, 0, 0, -1, space_box.llx + space_box.width, 0)
|
457
|
-
)
|
444
|
+
text.text_transform(Matrix(-1, 0, 0, -1, space_box.llx + space_box.width, 0))
|
458
445
|
text.horiz_scale(100 * space_box.width / space_width)
|
459
|
-
text.show(self._font.text_encode(
|
446
|
+
text.show(self._font.text_encode(" "))
|
460
447
|
|
461
448
|
def _debug_draw_paragraph_boxes(self, canvas: Canvas, color=CYAN):
|
462
449
|
"""Draw boxes around paragraphs in the document."""
|
@@ -465,16 +452,14 @@ class HocrTransform:
|
|
465
452
|
with canvas.do.save_state():
|
466
453
|
# draw box around paragraph
|
467
454
|
canvas.do.stroke_color(color).line_width(0.1)
|
468
|
-
for elem in self.hocr.iterfind(self._child_xpath(
|
455
|
+
for elem in self.hocr.iterfind(self._child_xpath("p", "ocr_par")):
|
469
456
|
elemtxt = self._get_element_text(elem).strip()
|
470
457
|
if len(elemtxt) == 0:
|
471
458
|
continue
|
472
459
|
ocr_par = self.element_coordinates(elem)
|
473
460
|
if ocr_par is None:
|
474
461
|
continue
|
475
|
-
canvas.do.rect(
|
476
|
-
ocr_par.llx, ocr_par.lly, ocr_par.width, ocr_par.height, fill=False
|
477
|
-
)
|
462
|
+
canvas.do.rect(ocr_par.llx, ocr_par.lly, ocr_par.width, ocr_par.height, fill=False)
|
478
463
|
|
479
464
|
def _debug_draw_line_bbox(self, canvas: Canvas, line_box: Rectangle, color=BLUE):
|
480
465
|
"""Render the bounding box of a text line."""
|
@@ -485,22 +470,16 @@ class HocrTransform:
|
|
485
470
|
line_box.llx, line_box.lly, line_box.width, line_box.height, fill=False
|
486
471
|
)
|
487
472
|
|
488
|
-
def _debug_draw_word_triangle(
|
489
|
-
self, canvas: Canvas, box: Rectangle, color=RED, line_width=0.1
|
490
|
-
):
|
473
|
+
def _debug_draw_word_triangle(self, canvas: Canvas, box: Rectangle, color=RED, line_width=0.1):
|
491
474
|
"""Render a triangle that conveys word height and drawing direction."""
|
492
475
|
if not self.render_options.render_triangle: # pragma: no cover
|
493
476
|
return
|
494
477
|
with canvas.do.save_state():
|
495
478
|
canvas.do.stroke_color(color).line_width(line_width).line(
|
496
479
|
box.llx, box.lly, box.urx, box.lly
|
497
|
-
).line(box.urx, box.lly, box.llx, box.ury).line(
|
498
|
-
box.llx, box.lly, box.llx, box.ury
|
499
|
-
)
|
480
|
+
).line(box.urx, box.lly, box.llx, box.ury).line(box.llx, box.lly, box.llx, box.ury)
|
500
481
|
|
501
|
-
def _debug_draw_word_bbox(
|
502
|
-
self, canvas: Canvas, box: Rectangle, color=GREEN, line_width=0.1
|
503
|
-
):
|
482
|
+
def _debug_draw_word_bbox(self, canvas: Canvas, box: Rectangle, color=GREEN, line_width=0.1):
|
504
483
|
"""Render a box depicting the word."""
|
505
484
|
if not self.render_options.render_word_bbox: # pragma: no cover
|
506
485
|
return
|
@@ -537,4 +516,4 @@ class HocrTransform:
|
|
537
516
|
baseline_lly,
|
538
517
|
line_box.urx,
|
539
518
|
baseline_lly,
|
540
|
-
)
|
519
|
+
)
|
@@ -8,11 +8,7 @@ import unicodedata
|
|
8
8
|
import zlib
|
9
9
|
from importlib.resources import files as package_files
|
10
10
|
|
11
|
-
from pikepdf import
|
12
|
-
Dictionary,
|
13
|
-
Name,
|
14
|
-
Pdf,
|
15
|
-
)
|
11
|
+
from pikepdf import Dictionary, Name, Pdf
|
16
12
|
from pikepdf.canvas import Font
|
17
13
|
|
18
14
|
log = logging.getLogger(__name__)
|
@@ -25,8 +21,8 @@ class EncodableFont(Font):
|
|
25
21
|
|
26
22
|
class GlyphlessFont(EncodableFont):
|
27
23
|
CID_TO_GID_DATA = zlib.compress(b"\x00\x01" * 65536)
|
28
|
-
GLYPHLESS_FONT_NAME =
|
29
|
-
GLYPHLESS_FONT_PACKAGE_PATH =
|
24
|
+
GLYPHLESS_FONT_NAME = "pdf.ttf"
|
25
|
+
GLYPHLESS_FONT_PACKAGE_PATH = "natural_pdf.exporters.data"
|
30
26
|
GLYPHLESS_FONT = (package_files(GLYPHLESS_FONT_PACKAGE_PATH) / GLYPHLESS_FONT_NAME).read_bytes()
|
31
27
|
CHAR_ASPECT = 2
|
32
28
|
|
@@ -39,7 +35,7 @@ class GlyphlessFont(EncodableFont):
|
|
39
35
|
return len(unicodedata.normalize("NFKC", text)) * (fontsize / self.CHAR_ASPECT)
|
40
36
|
|
41
37
|
def text_encode(self, text: str) -> bytes:
|
42
|
-
return text.encode(
|
38
|
+
return text.encode("utf-16be")
|
43
39
|
|
44
40
|
def register(self, pdf: Pdf):
|
45
41
|
"""Register the glyphless font.
|
@@ -76,9 +72,7 @@ class GlyphlessFont(EncodableFont):
|
|
76
72
|
)
|
77
73
|
)
|
78
74
|
basefont.DescendantFonts = [cid_font_type2]
|
79
|
-
cid_font_type2.CIDToGIDMap = pdf.make_stream(
|
80
|
-
self.CID_TO_GID_DATA, Filter=Name.FlateDecode
|
81
|
-
)
|
75
|
+
cid_font_type2.CIDToGIDMap = pdf.make_stream(self.CID_TO_GID_DATA, Filter=Name.FlateDecode)
|
82
76
|
basefont.ToUnicode = pdf.make_stream(
|
83
77
|
b"/CIDInit /ProcSet findresource begin\n"
|
84
78
|
b"12 dict begin\n"
|
@@ -129,7 +123,7 @@ class Courier(EncodableFont):
|
|
129
123
|
return len(text) * fontsize
|
130
124
|
|
131
125
|
def text_encode(self, text: str) -> bytes:
|
132
|
-
return text.encode(
|
126
|
+
return text.encode("pdfdoc", errors="ignore")
|
133
127
|
|
134
128
|
def register(self, pdf: Pdf) -> Dictionary:
|
135
129
|
"""Register the font."""
|
@@ -139,4 +133,4 @@ class Courier(EncodableFont):
|
|
139
133
|
Type=Name.Font,
|
140
134
|
Subtype=Name.Type1,
|
141
135
|
)
|
142
|
-
)
|
136
|
+
)
|
@@ -44,7 +44,7 @@ def create_original_pdf(
|
|
44
44
|
if pikepdf is None:
|
45
45
|
raise ImportError(
|
46
46
|
"Saving original PDF pages requires 'pikepdf'. "
|
47
|
-
|
47
|
+
'Install with: pip install "natural-pdf[ocr-export]"'
|
48
48
|
)
|
49
49
|
|
50
50
|
output_path_str = str(output_path)
|
@@ -55,18 +55,17 @@ def create_original_pdf(
|
|
55
55
|
if not source.pages:
|
56
56
|
raise ValueError("Cannot save an empty collection/PDF.")
|
57
57
|
pages_to_extract = source.pages
|
58
|
-
elif hasattr(source, "page") and hasattr(source, "number"):
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
58
|
+
elif hasattr(source, "page") and hasattr(source, "number"): # Single Page object
|
59
|
+
# Check if it's a natural_pdf.core.page.Page or similar duck-typed object
|
60
|
+
if hasattr(source, "pdf") and source.pdf and hasattr(source.pdf, "path"):
|
61
|
+
pages_to_extract = [source]
|
62
|
+
else:
|
63
63
|
raise ValueError("Input Page object does not have a valid PDF reference with a path.")
|
64
64
|
else:
|
65
65
|
raise TypeError(f"Unsupported source type for create_original_pdf: {type(source)}")
|
66
66
|
|
67
|
-
|
68
67
|
if not pages_to_extract:
|
69
|
-
|
68
|
+
raise ValueError("No valid pages found in the source object.")
|
70
69
|
|
71
70
|
# Verify all pages come from the same PDF and get path
|
72
71
|
first_page_pdf_path = None
|
@@ -115,16 +114,14 @@ def create_original_pdf(
|
|
115
114
|
)
|
116
115
|
|
117
116
|
except pikepdf.PasswordError:
|
118
|
-
logger.error(
|
119
|
-
f"Failed to open password-protected source PDF: {first_page_pdf_path}"
|
120
|
-
)
|
117
|
+
logger.error(f"Failed to open password-protected source PDF: {first_page_pdf_path}")
|
121
118
|
raise RuntimeError(
|
122
119
|
f"Source PDF '{first_page_pdf_path}' is password-protected."
|
123
|
-
) from None
|
120
|
+
) from None # Raise specific error without chaining the generic Exception
|
124
121
|
except Exception as e:
|
125
122
|
logger.error(
|
126
123
|
f"Failed to save original pages PDF to '{output_path_str}': {e}",
|
127
124
|
exc_info=True,
|
128
125
|
)
|
129
126
|
# Re-raise as RuntimeError for consistent API error handling
|
130
|
-
raise RuntimeError(f"Failed to save original pages PDF: {e}") from e
|
127
|
+
raise RuntimeError(f"Failed to save original pages PDF: {e}") from e
|
@@ -3,8 +3,9 @@ import os
|
|
3
3
|
import random
|
4
4
|
import shutil
|
5
5
|
from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
|
6
|
+
import collections
|
6
7
|
|
7
|
-
from tqdm import tqdm
|
8
|
+
from tqdm.auto import tqdm
|
8
9
|
|
9
10
|
from natural_pdf.exporters.base import FinetuneExporter
|
10
11
|
|
@@ -33,19 +34,20 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
33
34
|
def __init__(
|
34
35
|
self,
|
35
36
|
resolution: int = 150,
|
36
|
-
padding: int =
|
37
|
+
padding: int = 0,
|
37
38
|
selector: Optional[str] = None,
|
38
39
|
corrected_only: bool = False,
|
39
40
|
split_ratio: Optional[float] = 0.9,
|
40
41
|
include_guide: bool = True,
|
41
42
|
random_seed: Optional[int] = 42,
|
43
|
+
min_char_freq: int = 3,
|
42
44
|
):
|
43
45
|
"""
|
44
46
|
Initialize the PaddleOCR Recognition Exporter.
|
45
47
|
|
46
48
|
Args:
|
47
49
|
resolution: DPI resolution for rendering text region images (default: 150).
|
48
|
-
padding: Padding (in points) to add around text element bbox before cropping (default:
|
50
|
+
padding: Padding (in points) to add around text element bbox before cropping (default: 0).
|
49
51
|
selector: CSS-like selector to filter which TextElements to export.
|
50
52
|
If None and corrected_only is False, all 'text' elements are considered.
|
51
53
|
corrected_only: If True, overrides selector and exports only elements likely
|
@@ -57,6 +59,9 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
57
59
|
in the output directory (default: True).
|
58
60
|
random_seed: Seed for the random number generator used for train/val split shuffling,
|
59
61
|
ensuring reproducibility (default: 42).
|
62
|
+
min_char_freq: Minimum frequency for a character to be included in the dictionary.
|
63
|
+
Text elements containing characters below this frequency will be removed.
|
64
|
+
(default: 3; a value of 1 disables frequency-based filtering).
|
60
65
|
"""
|
61
66
|
if corrected_only and selector:
|
62
67
|
logger.warning(
|
@@ -76,10 +81,12 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
76
81
|
self.split_ratio = split_ratio
|
77
82
|
self.include_guide = include_guide
|
78
83
|
self.random_seed = random_seed
|
84
|
+
self.min_char_freq = min_char_freq
|
79
85
|
|
80
86
|
logger.info(
|
81
87
|
f"Initialized PaddleOCRRecognitionExporter: selector='{self.selector}', resolution={resolution}, "
|
82
|
-
f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}"
|
88
|
+
f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}, "
|
89
|
+
f"min_char_freq={min_char_freq}"
|
83
90
|
)
|
84
91
|
|
85
92
|
def export(
|
@@ -114,7 +121,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
114
121
|
|
115
122
|
# --- 2. Collect Elements and Render Images ---
|
116
123
|
labels: List[Tuple[str, str]] = [] # List of (relative_image_path, text_label)
|
117
|
-
|
124
|
+
char_counts: collections.Counter = collections.Counter()
|
118
125
|
elements_processed = 0
|
119
126
|
elements_skipped = 0
|
120
127
|
|
@@ -200,7 +207,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
200
207
|
labels.append(
|
201
208
|
(relative_image_path.replace(os.path.sep, "/"), element_text)
|
202
209
|
) # Use forward slashes for labels
|
203
|
-
|
210
|
+
char_counts.update(element_text)
|
204
211
|
elements_processed += 1
|
205
212
|
|
206
213
|
except Exception as e:
|
@@ -226,15 +233,48 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
226
233
|
|
227
234
|
logger.info(f"Processed {elements_processed} text elements, skipped {elements_skipped}.")
|
228
235
|
|
236
|
+
# --- 2.5 Filter based on character frequency ---
|
237
|
+
if self.min_char_freq > 1:
|
238
|
+
logger.info(f"Filtering elements based on min_char_freq: {self.min_char_freq}")
|
239
|
+
original_label_count = len(labels)
|
240
|
+
rare_chars = {char for char, count in char_counts.items() if count < self.min_char_freq}
|
241
|
+
if rare_chars:
|
242
|
+
logger.info(f"Identified {len(rare_chars)} rare characters: {rare_chars}")
|
243
|
+
filtered_labels = []
|
244
|
+
for img_path, text in labels:
|
245
|
+
if any(char in rare_chars for char in text):
|
246
|
+
elements_skipped += 1 # Count these as skipped due to rare chars
|
247
|
+
elements_processed -=1 # Decrement from processed as it's now being skipped
|
248
|
+
else:
|
249
|
+
filtered_labels.append((img_path, text))
|
250
|
+
|
251
|
+
labels_removed_count = original_label_count - len(filtered_labels)
|
252
|
+
if labels_removed_count > 0:
|
253
|
+
logger.info(f"Removed {labels_removed_count} elements containing rare characters.")
|
254
|
+
labels = filtered_labels
|
255
|
+
|
256
|
+
# Recalculate char_counts based on filtered_labels to update the dictionary
|
257
|
+
char_counts.clear()
|
258
|
+
for _, text in labels:
|
259
|
+
char_counts.update(text)
|
260
|
+
|
261
|
+
if not labels:
|
262
|
+
logger.error(
|
263
|
+
"All elements were removed after character frequency filtering. Aborting."
|
264
|
+
)
|
265
|
+
return
|
266
|
+
else:
|
267
|
+
logger.info("No rare characters found below the frequency threshold.")
|
268
|
+
|
269
|
+
|
229
270
|
# --- 3. Generate Dictionary File (`dict.txt`) ---
|
230
271
|
dict_path = os.path.join(output_dir, "dict.txt")
|
231
272
|
try:
|
232
273
|
# Log the character set before sorting/writing
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
#
|
237
|
-
sorted_chars = sorted(list(char_set), reverse=True)
|
274
|
+
final_chars_for_dict = set(char_counts.keys()) # Use keys from potentially filtered char_counts
|
275
|
+
logger.debug(f"Exporter final char_set for dict: {repr(final_chars_for_dict)}")
|
276
|
+
|
277
|
+
sorted_chars = sorted(list(final_chars_for_dict)) # No specific sorting order needed, just make it consistent
|
238
278
|
with open(dict_path, "w", encoding="utf-8") as f_dict:
|
239
279
|
for char in sorted_chars:
|
240
280
|
# Ensure we don't write empty strings or just newlines as dictionary entries
|
@@ -318,16 +318,6 @@ def create_searchable_pdf(
|
|
318
318
|
dpi: The resolution (dots per inch) for rendering page images and hOCR.
|
319
319
|
"""
|
320
320
|
|
321
|
-
# --- Ensure dependencies are loaded (they should be if installed) ---
|
322
|
-
if Image is None or pikepdf is None or HocrTransform is None:
|
323
|
-
# This should ideally not happen if dependencies are in main install,
|
324
|
-
# but serves as a safeguard during development or if install is broken.
|
325
|
-
raise ImportError(
|
326
|
-
"Required dependencies (Pillow, pikepdf) are missing. "
|
327
|
-
"Please ensure natural-pdf is installed correctly with all dependencies."
|
328
|
-
)
|
329
|
-
# --- End Safeguard Check ---
|
330
|
-
|
331
321
|
# duck type to see if source has .pages, to populate pages =
|
332
322
|
if hasattr(source, "pages"):
|
333
323
|
pages = source.pages
|
@@ -0,0 +1,12 @@
|
|
1
|
+
from .flow import Flow
|
2
|
+
from .element import FlowElement
|
3
|
+
from .region import FlowRegion
|
4
|
+
from .collections import FlowElementCollection, FlowRegionCollection
|
5
|
+
|
6
|
+
__all__ = [
|
7
|
+
"Flow",
|
8
|
+
"FlowElement",
|
9
|
+
"FlowRegion",
|
10
|
+
"FlowElementCollection",
|
11
|
+
"FlowRegionCollection",
|
12
|
+
]
|