natural-pdf 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +18 -4
- natural_pdf/analyzers/guides.py +2176 -0
- natural_pdf/analyzers/shape_detection_mixin.py +0 -650
- natural_pdf/core/element_manager.py +86 -27
- natural_pdf/core/page.py +49 -1
- natural_pdf/core/pdf.py +22 -0
- natural_pdf/elements/collections.py +61 -0
- natural_pdf/elements/region.py +257 -14
- natural_pdf/elements/text.py +29 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/RECORD +15 -19
- bad_pdf_analysis/analyze_10_more.py +0 -300
- bad_pdf_analysis/analyze_final_10.py +0 -552
- bad_pdf_analysis/analyze_specific_pages.py +0 -394
- bad_pdf_analysis/analyze_specific_pages_direct.py +0 -382
- tools/rtl_smoke_test.py +0 -80
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/top_level.txt +0 -0
@@ -7,6 +7,7 @@ characters, words, rectangles, and lines extracted from a page.
|
|
7
7
|
|
8
8
|
import logging
|
9
9
|
import re
|
10
|
+
from contextlib import contextmanager
|
10
11
|
from itertools import groupby
|
11
12
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
12
13
|
|
@@ -47,6 +48,33 @@ HIGHLIGHT_DEFAULTS = {
|
|
47
48
|
"color_value_min": 0.4, # HSV V >
|
48
49
|
}
|
49
50
|
|
51
|
+
|
52
|
+
@contextmanager
def disable_text_sync():
    """
    Temporarily swap ``TextElement.text``'s setter for one that skips sync.

    While the context is active, ``element.text = value`` only writes
    ``element._obj["text"]``; the regular setter is bypassed. This is used
    when bulk-updating text content where character-level synchronization
    is not needed, such as during bidi processing. Fixes exponential
    recursion issue with Arabic/RTL text processing.

    The swap is performed on the ``TextElement`` class itself, so every
    instance sees the fast setter until the context exits.
    NOTE(review): because the class attribute is replaced, concurrent use
    from several threads would also observe the fast setter - confirm this
    is only used from single-threaded loading code.

    Yields:
        None. The original property is restored on exit, even if the body
        raises.
    """
    # Keep the original descriptor object itself so it can be restored
    # verbatim (same getter, setter, deleter and doc) instead of rebuilding
    # an approximation of it afterwards.
    original_property = TextElement.text

    def fast_setter(self, value):
        # Write straight to the backing dict; skip character synchronization
        # for performance.
        self._obj["text"] = value

    # Install a temporary property that differs from the original only in
    # its setter.
    TextElement.text = property(
        original_property.fget,
        fast_setter,
        original_property.fdel,
        original_property.__doc__,
    )

    try:
        yield
    finally:
        # Restore the exact original property object.
        TextElement.text = original_property
|
77
|
+
|
50
78
|
class NaturalWordExtractor(WordExtractor):
|
51
79
|
"""
|
52
80
|
Custom WordExtractor that splits words based on specified character attributes
|
@@ -202,14 +230,52 @@ class ElementManager:
|
|
202
230
|
char_to_index[key] = idx
|
203
231
|
|
204
232
|
# 2. Instantiate the custom word extractor
|
205
|
-
#
|
233
|
+
# Prefer page-level config over PDF-level for tolerance lookup
|
234
|
+
page_config = getattr(self._page, "_config", {})
|
206
235
|
pdf_config = getattr(self._page._parent, "_config", {})
|
207
|
-
|
208
|
-
|
236
|
+
|
237
|
+
# Start with any explicitly supplied tolerances (may be None)
|
238
|
+
xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
|
239
|
+
yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
|
209
240
|
use_flow = pdf_config.get("use_text_flow", False)
|
210
241
|
|
211
|
-
#
|
212
|
-
#
|
242
|
+
# ------------------------------------------------------------------
|
243
|
+
# Auto-adaptive tolerance: scale based on median character size when
|
244
|
+
# requested and explicit values are absent.
|
245
|
+
# ------------------------------------------------------------------
|
246
|
+
if pdf_config.get("auto_text_tolerance", True):
|
247
|
+
import statistics
|
248
|
+
|
249
|
+
sizes = [c.get("size", 0) for c in prepared_char_dicts if c.get("size")]
|
250
|
+
median_size = None
|
251
|
+
if sizes:
|
252
|
+
median_size = statistics.median(sizes)
|
253
|
+
if xt is None:
|
254
|
+
xt = 0.25 * median_size # ~kerning width
|
255
|
+
# Record back to page config for downstream users
|
256
|
+
page_config["x_tolerance"] = xt
|
257
|
+
if yt is None:
|
258
|
+
yt = 0.6 * median_size # ~line spacing fraction
|
259
|
+
page_config["y_tolerance"] = yt
|
260
|
+
|
261
|
+
# Warn users when the page's font size is extremely small –
|
262
|
+
# this is often the root cause of merged-row/column issues.
|
263
|
+
if median_size and median_size < 6: # 6 pt is unusually small
|
264
|
+
logger.warning(
|
265
|
+
f"Page {self._page.number}: Median font size is only {median_size:.1f} pt; "
|
266
|
+
f"auto-set x_tolerance={xt:.2f}, y_tolerance={yt:.2f}. "
|
267
|
+
"If the output looks wrong you can override these values via "
|
268
|
+
"PDF(..., text_tolerance={'x_tolerance': X, 'y_tolerance': Y}, "
|
269
|
+
"auto_text_tolerance=False)."
|
270
|
+
)
|
271
|
+
|
272
|
+
# Fallback to pdfplumber defaults if still None
|
273
|
+
if xt is None:
|
274
|
+
xt = 3
|
275
|
+
if yt is None:
|
276
|
+
yt = 3
|
277
|
+
|
278
|
+
# List of attributes to preserve on word objects
|
213
279
|
attributes_to_preserve = list(
|
214
280
|
set(
|
215
281
|
self._word_split_attributes
|
@@ -223,7 +289,7 @@ class ElementManager:
|
|
223
289
|
)
|
224
290
|
)
|
225
291
|
|
226
|
-
#
|
292
|
+
# ------------------------------------------------------------------
|
227
293
|
# NEW: Detect direction (LTR vs RTL) per visual line and feed
|
228
294
|
# pdfplumber's WordExtractor with the correct settings.
|
229
295
|
# -------------------------------------------------------------
|
@@ -271,7 +337,9 @@ class ElementManager:
|
|
271
337
|
# Build a WordExtractor tailored for this line's direction
|
272
338
|
if is_rtl_line:
|
273
339
|
line_dir = "ttb" # horizontal lines stacked top→bottom
|
274
|
-
|
340
|
+
# Characters are fed in ascending x-order (see the sort below); the extractor
|
341
|
+
# treats them as left-to-right, and words with RTL characters are reordered via bidi afterwards.
|
342
|
+
char_dir = "ltr"
|
275
343
|
else:
|
276
344
|
line_dir = "ttb"
|
277
345
|
char_dir = "ltr"
|
@@ -288,9 +356,8 @@ class ElementManager:
|
|
288
356
|
)
|
289
357
|
|
290
358
|
# Prepare character sequence for the extractor:
|
291
|
-
#
|
292
|
-
#
|
293
|
-
# characters appear adjacent when the extractor walks right→left.
|
359
|
+
# Always feed characters in spatial order (x0 ascending)
|
360
|
+
# PDF stores glyphs in visual order, so this gives us the visual sequence
|
294
361
|
line_chars_for_extractor = sorted(line_chars, key=lambda c: c.get("x0", 0))
|
295
362
|
|
296
363
|
try:
|
@@ -324,15 +391,18 @@ class ElementManager:
|
|
324
391
|
# on the whole-line heuristic.
|
325
392
|
rtl_in_word = any(_is_rtl_char(ch.get("text", "")) for ch in char_list)
|
326
393
|
if rtl_in_word:
|
394
|
+
# Convert from visual order (from PDF) to logical order using bidi
|
327
395
|
try:
|
328
396
|
from bidi.algorithm import get_display # type: ignore
|
329
397
|
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
330
398
|
|
331
|
-
|
332
|
-
|
333
|
-
|
399
|
+
with disable_text_sync():
|
400
|
+
# word_element.text is currently in visual order (from PDF)
|
401
|
+
# Convert to logical order using bidi with an explicit LTR base direction (base_dir='L')
|
402
|
+
logical_text = get_display(word_element.text, base_dir='L')
|
403
|
+
# Apply bracket mirroring for logical order
|
404
|
+
word_element.text = mirror_brackets(logical_text)
|
334
405
|
except Exception:
|
335
|
-
# Fallback: keep original text if python-bidi fails
|
336
406
|
pass
|
337
407
|
|
338
408
|
# ------------------------------------------------------------------
|
@@ -415,19 +485,6 @@ class ElementManager:
|
|
415
485
|
f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
|
416
486
|
)
|
417
487
|
|
418
|
-
# --- Post-processing pass to ensure every word containing RTL characters is
|
419
|
-
# stored in logical order and with mirrored brackets. This is a
|
420
|
-
# safeguard in case the per-line loop above missed some tokens.
|
421
|
-
try:
|
422
|
-
from bidi.algorithm import get_display # type: ignore
|
423
|
-
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
424
|
-
|
425
|
-
for w in generated_words:
|
426
|
-
if any(_is_rtl_char(ch) for ch in w.text):
|
427
|
-
w.text = mirror_brackets(get_display(w.text, base_dir="R"))
|
428
|
-
except Exception:
|
429
|
-
pass # graceful degradation – keep original text
|
430
|
-
|
431
488
|
# 4. Load other elements (rects, lines)
|
432
489
|
rect_elements = [RectangleElement(r, self._page) for r in self._page._page.rects]
|
433
490
|
line_elements = [LineElement(l, self._page) for l in self._page._page.lines]
|
@@ -463,6 +520,8 @@ class ElementManager:
|
|
463
520
|
|
464
521
|
logger.debug(f"Page {self._page.number}: Element loading complete.")
|
465
522
|
|
523
|
+
# If per-word BiDi was skipped, generated_words already stay in logical order.
|
524
|
+
|
466
525
|
def _prepare_char_dicts(self) -> List[Dict[str, Any]]:
|
467
526
|
"""
|
468
527
|
Prepares a list of character dictionaries from native PDF characters,
|
natural_pdf/core/page.py
CHANGED
@@ -128,6 +128,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
128
128
|
"named": {}, # Named regions (name -> region)
|
129
129
|
}
|
130
130
|
|
131
|
+
# -------------------------------------------------------------
|
132
|
+
# Page-scoped configuration begins as a shallow copy of the parent
|
133
|
+
# PDF-level configuration so that auto-computed tolerances or other
|
134
|
+
# page-specific values do not overwrite siblings.
|
135
|
+
# -------------------------------------------------------------
|
136
|
+
self._config = dict(getattr(self._parent, "_config", {}))
|
137
|
+
|
131
138
|
# Initialize ElementManager, passing font_attrs
|
132
139
|
self._element_mgr = ElementManager(self, font_attrs=font_attrs)
|
133
140
|
# self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
|
@@ -1153,10 +1160,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1153
1160
|
# 5. Generate Text Layout using Utility
|
1154
1161
|
# Pass page bbox as layout context
|
1155
1162
|
page_bbox = (0, 0, self.width, self.height)
|
1163
|
+
# Merge PDF-level default tolerances if caller did not override
|
1164
|
+
merged_kwargs = dict(kwargs)
|
1165
|
+
tol_keys = ["x_tolerance", "x_tolerance_ratio", "y_tolerance"]
|
1166
|
+
for k in tol_keys:
|
1167
|
+
if k not in merged_kwargs:
|
1168
|
+
if k in self._config:
|
1169
|
+
merged_kwargs[k] = self._config[k]
|
1170
|
+
elif k in getattr(self._parent, "_config", {}):
|
1171
|
+
merged_kwargs[k] = self._parent._config[k]
|
1172
|
+
|
1156
1173
|
result = generate_text_layout(
|
1157
1174
|
char_dicts=filtered_chars,
|
1158
1175
|
layout_context_bbox=page_bbox,
|
1159
|
-
user_kwargs=
|
1176
|
+
user_kwargs=merged_kwargs,
|
1160
1177
|
)
|
1161
1178
|
|
1162
1179
|
# --- Optional: apply Unicode BiDi algorithm for mixed RTL/LTR correctness ---
|
@@ -1356,6 +1373,37 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1356
1373
|
|
1357
1374
|
# Use the selected method
|
1358
1375
|
if effective_method == "pdfplumber":
|
1376
|
+
# ---------------------------------------------------------
|
1377
|
+
# Inject auto-computed or user-specified text tolerances so
|
1378
|
+
# pdfplumber uses the same numbers we used for word grouping
|
1379
|
+
# whenever the table algorithm relies on word positions.
|
1380
|
+
# ---------------------------------------------------------
|
1381
|
+
if "text" in (
|
1382
|
+
table_settings.get("vertical_strategy"),
|
1383
|
+
table_settings.get("horizontal_strategy"),
|
1384
|
+
):
|
1385
|
+
print("SETTING IT UP")
|
1386
|
+
pdf_cfg = getattr(self, "_config", getattr(self._parent, "_config", {}))
|
1387
|
+
if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
|
1388
|
+
x_tol = pdf_cfg.get("x_tolerance")
|
1389
|
+
if x_tol is not None:
|
1390
|
+
table_settings.setdefault("text_x_tolerance", x_tol)
|
1391
|
+
if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
|
1392
|
+
y_tol = pdf_cfg.get("y_tolerance")
|
1393
|
+
if y_tol is not None:
|
1394
|
+
table_settings.setdefault("text_y_tolerance", y_tol)
|
1395
|
+
|
1396
|
+
# pdfplumber's text strategy benefits from a tight snap tolerance.
|
1397
|
+
if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
|
1398
|
+
# Derive from y_tol if available, else default 1
|
1399
|
+
snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
|
1400
|
+
table_settings.setdefault("snap_tolerance", snap)
|
1401
|
+
if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
|
1402
|
+
join = table_settings.get("snap_tolerance", 1)
|
1403
|
+
table_settings.setdefault("join_tolerance", join)
|
1404
|
+
table_settings.setdefault("join_x_tolerance", join)
|
1405
|
+
table_settings.setdefault("join_y_tolerance", join)
|
1406
|
+
|
1359
1407
|
return self._page.extract_tables(table_settings)
|
1360
1408
|
else:
|
1361
1409
|
raise ValueError(
|
natural_pdf/core/pdf.py
CHANGED
@@ -168,6 +168,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
168
168
|
reading_order: bool = True,
|
169
169
|
font_attrs: Optional[List[str]] = None,
|
170
170
|
keep_spaces: bool = True,
|
171
|
+
text_tolerance: Optional[dict] = None,
|
172
|
+
auto_text_tolerance: bool = True,
|
171
173
|
):
|
172
174
|
"""
|
173
175
|
Initialize the enhanced PDF object.
|
@@ -177,6 +179,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
177
179
|
reading_order: Whether to use natural reading order
|
178
180
|
font_attrs: Font attributes for grouping characters into words
|
179
181
|
keep_spaces: Whether to include spaces in word elements
|
182
|
+
text_tolerance: PDFplumber-style tolerance settings
|
183
|
+
auto_text_tolerance: Whether to automatically scale text tolerance
|
180
184
|
"""
|
181
185
|
self._original_path_or_stream = path_or_url_or_stream
|
182
186
|
self._temp_file = None
|
@@ -274,6 +278,24 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
274
278
|
getattr(self, "_is_stream", False),
|
275
279
|
)
|
276
280
|
|
281
|
+
# --- Text tolerance settings ------------------------------------
|
282
|
+
# Users can pass pdfplumber-style keys (x_tolerance, x_tolerance_ratio,
|
283
|
+
# y_tolerance, etc.) via *text_tolerance*. We also keep a flag that
|
284
|
+
# enables automatic tolerance scaling when explicit values are not
|
285
|
+
# supplied.
|
286
|
+
self._config["auto_text_tolerance"] = bool(auto_text_tolerance)
|
287
|
+
if text_tolerance:
|
288
|
+
# Only copy recognised keys; unknown keys are ignored and values are stored as given (no type validation).
|
289
|
+
allowed = {
|
290
|
+
"x_tolerance",
|
291
|
+
"x_tolerance_ratio",
|
292
|
+
"y_tolerance",
|
293
|
+
"keep_blank_chars", # passthrough convenience
|
294
|
+
}
|
295
|
+
for k, v in text_tolerance.items():
|
296
|
+
if k in allowed:
|
297
|
+
self._config[k] = v
|
298
|
+
|
277
299
|
def _initialize_managers(self):
|
278
300
|
"""Set up manager factories for lazy instantiation."""
|
279
301
|
# Store factories/classes for each manager key
|
@@ -1901,7 +1901,68 @@ class ElementCollection(
|
|
1901
1901
|
)
|
1902
1902
|
)
|
1903
1903
|
|
1904
|
+
# ------------------------------------------------------------------
|
1905
|
+
# NEW METHOD: apply_ocr for collections (supports custom function)
|
1906
|
+
# ------------------------------------------------------------------
|
1907
|
+
def apply_ocr(
    self,
    *,
    function: Optional[Callable[["Region"], Optional[str]]] = None,
    show_progress: bool = True,
    **kwargs,
) -> "ElementCollection":
    """Run OCR on each element of this collection.

    Thin convenience wrapper: every element that exposes an ``apply_ocr``
    attribute has it called with the supplied arguments; elements without
    one are skipped with a logged warning.

    Two calling styles are supported:

    1. **Built-in OCR engines** – pass engine parameters (for example
       ``engine='easyocr'`` or ``languages=['en']``) as keyword arguments;
       they are forwarded unchanged to each element's ``apply_ocr``.
    2. **Custom function** – pass a callable via ``function`` (the keyword
       ``ocr_function`` is accepted as an alias when ``function`` is not
       given). It is forwarded as ``function=...`` to each element's
       ``apply_ocr`` together with any remaining keyword arguments.

    Parameters
    ----------
    function : callable, optional
        Custom OCR callable; expected to take the element/region and return
        the recognised text or ``None``.
    show_progress : bool, default True
        Forwarded to ``self.apply``; intended to toggle a progress bar.
    **kwargs
        Extra keyword arguments forwarded to each element's ``apply_ocr``.

    Returns
    -------
    ElementCollection
        This same collection, to allow fluent chaining.
    """
    # Backward-compatible alias: only consulted (and removed from kwargs)
    # when ``function`` itself was not supplied.
    if function is None and "ocr_function" in kwargs:
        function = kwargs.pop("ocr_function")

    def _ocr_one(element):
        # Guard clause: elements that cannot be OCR'd are passed through.
        if not hasattr(element, "apply_ocr"):
            logger.warning(
                f"Element of type {type(element).__name__} does not support apply_ocr. Skipping."
            )
            return element
        if function is None:
            return element.apply_ocr(**kwargs)
        return element.apply_ocr(function=function, **kwargs)

    # Delegate iteration (and optional progress display) to the collection's
    # own apply helper; its return value is intentionally not used.
    self.apply(_ocr_one, show_progress=show_progress)
    return self
|
1904
1964
|
|
1965
|
+
# ------------------------------------------------------------------
|
1905
1966
|
|
1906
1967
|
|
1907
1968
|
class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|