natural-pdf 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@ characters, words, rectangles, and lines extracted from a page.
7
7
 
8
8
  import logging
9
9
  import re
10
+ from contextlib import contextmanager
10
11
  from itertools import groupby
11
12
  from typing import Any, Dict, List, Optional, Tuple, Union
12
13
 
@@ -47,6 +48,33 @@ HIGHLIGHT_DEFAULTS = {
47
48
  "color_value_min": 0.4, # HSV V >
48
49
  }
49
50
 
51
+
52
+ @contextmanager
53
+ def disable_text_sync():
54
+ """
55
+ Temporarily disable text synchronization for performance.
56
+
57
+ This is used when bulk-updating text content where character-level
58
+ synchronization is not needed, such as during bidi processing.
59
+ Fixes exponential recursion issue with Arabic/RTL text processing.
60
+ """
61
+ # Save original setter
62
+ original_setter = TextElement.text.fset
63
+
64
+ # Create a fast setter that skips sync
65
+ def fast_setter(self, value):
66
+ self._obj["text"] = value
67
+ # Skip character synchronization for performance
68
+
69
+ # Apply fast setter
70
+ TextElement.text = property(TextElement.text.fget, fast_setter)
71
+
72
+ try:
73
+ yield
74
+ finally:
75
+ # Restore original setter
76
+ TextElement.text = property(TextElement.text.fget, original_setter)
77
+
50
78
  class NaturalWordExtractor(WordExtractor):
51
79
  """
52
80
  Custom WordExtractor that splits words based on specified character attributes
@@ -202,14 +230,52 @@ class ElementManager:
202
230
  char_to_index[key] = idx
203
231
 
204
232
  # 2. Instantiate the custom word extractor
205
- # Get config settings from the parent PDF or use defaults
233
+ # Prefer page-level config over PDF-level for tolerance lookup
234
+ page_config = getattr(self._page, "_config", {})
206
235
  pdf_config = getattr(self._page._parent, "_config", {})
207
- xt = pdf_config.get("x_tolerance", 3)
208
- yt = pdf_config.get("y_tolerance", 3)
236
+
237
+ # Start with any explicitly supplied tolerances (may be None)
238
+ xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
239
+ yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
209
240
  use_flow = pdf_config.get("use_text_flow", False)
210
241
 
211
- # Define which attributes to preserve on the merged word object
212
- # Should include split attributes + any others needed for filtering (like color)
242
+ # ------------------------------------------------------------------
243
+ # Auto-adaptive tolerance: scale based on median character size when
244
+ # requested and explicit values are absent.
245
+ # ------------------------------------------------------------------
246
+ if pdf_config.get("auto_text_tolerance", True):
247
+ import statistics
248
+
249
+ sizes = [c.get("size", 0) for c in prepared_char_dicts if c.get("size")]
250
+ median_size = None
251
+ if sizes:
252
+ median_size = statistics.median(sizes)
253
+ if xt is None:
254
+ xt = 0.25 * median_size # ~kerning width
255
+ # Record back to page config for downstream users
256
+ page_config["x_tolerance"] = xt
257
+ if yt is None:
258
+ yt = 0.6 * median_size # ~line spacing fraction
259
+ page_config["y_tolerance"] = yt
260
+
261
+ # Warn users when the page's font size is extremely small –
262
+ # this is often the root cause of merged-row/column issues.
263
+ if median_size and median_size < 6: # 6 pt is unusually small
264
+ logger.warning(
265
+ f"Page {self._page.number}: Median font size is only {median_size:.1f} pt; "
266
+ f"auto-set x_tolerance={xt:.2f}, y_tolerance={yt:.2f}. "
267
+ "If the output looks wrong you can override these values via "
268
+ "PDF(..., text_tolerance={'x_tolerance': X, 'y_tolerance': Y}, "
269
+ "auto_text_tolerance=False)."
270
+ )
271
+
272
+ # Fallback to pdfplumber defaults if still None
273
+ if xt is None:
274
+ xt = 3
275
+ if yt is None:
276
+ yt = 3
277
+
278
+ # List of attributes to preserve on word objects
213
279
  attributes_to_preserve = list(
214
280
  set(
215
281
  self._word_split_attributes
@@ -223,7 +289,7 @@ class ElementManager:
223
289
  )
224
290
  )
225
291
 
226
- # -------------------------------------------------------------
292
+ # ------------------------------------------------------------------
227
293
  # NEW: Detect direction (LTR vs RTL) per visual line and feed
228
294
  # pdfplumber's WordExtractor with the correct settings.
229
295
  # -------------------------------------------------------------
@@ -271,7 +337,9 @@ class ElementManager:
271
337
  # Build a WordExtractor tailored for this line's direction
272
338
  if is_rtl_line:
273
339
  line_dir = "ttb" # horizontal lines stacked top→bottom
274
- char_dir = "rtl" # characters right→left within the line
340
+ # Feed characters in right→left x-order; extractor can then treat
341
+ # them as left-to-right so that resulting text stays logical.
342
+ char_dir = "ltr"
275
343
  else:
276
344
  line_dir = "ttb"
277
345
  char_dir = "ltr"
@@ -288,9 +356,8 @@ class ElementManager:
288
356
  )
289
357
 
290
358
  # Prepare character sequence for the extractor:
291
- # For LTR lines -> left→right order (x0 ascending)
292
- # For RTL lines -> feed **reversed** list so that neighbouring
293
- # characters appear adjacent when the extractor walks right→left.
359
+ # Always feed characters in spatial order (x0 ascending)
360
+ # PDF stores glyphs in visual order, so this gives us the visual sequence
294
361
  line_chars_for_extractor = sorted(line_chars, key=lambda c: c.get("x0", 0))
295
362
 
296
363
  try:
@@ -324,15 +391,18 @@ class ElementManager:
324
391
  # on the whole-line heuristic.
325
392
  rtl_in_word = any(_is_rtl_char(ch.get("text", "")) for ch in char_list)
326
393
  if rtl_in_word:
394
+ # Convert from visual order (from PDF) to logical order using bidi
327
395
  try:
328
396
  from bidi.algorithm import get_display # type: ignore
329
397
  from natural_pdf.utils.bidi_mirror import mirror_brackets
330
398
 
331
- word_element.text = mirror_brackets(
332
- get_display(word_element.text, base_dir="R")
333
- )
399
+ with disable_text_sync():
400
+ # word_element.text is currently in visual order (from PDF)
401
+ # Convert to logical order using bidi with auto direction detection
402
+ logical_text = get_display(word_element.text, base_dir='L')
403
+ # Apply bracket mirroring for logical order
404
+ word_element.text = mirror_brackets(logical_text)
334
405
  except Exception:
335
- # Fallback: keep original text if python-bidi fails
336
406
  pass
337
407
 
338
408
  # ------------------------------------------------------------------
@@ -415,19 +485,6 @@ class ElementManager:
415
485
  f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
416
486
  )
417
487
 
418
- # --- Post-processing pass to ensure every word containing RTL characters is
419
- # stored in logical order and with mirrored brackets. This is a
420
- # safeguard in case the per-line loop above missed some tokens.
421
- try:
422
- from bidi.algorithm import get_display # type: ignore
423
- from natural_pdf.utils.bidi_mirror import mirror_brackets
424
-
425
- for w in generated_words:
426
- if any(_is_rtl_char(ch) for ch in w.text):
427
- w.text = mirror_brackets(get_display(w.text, base_dir="R"))
428
- except Exception:
429
- pass # graceful degradation – keep original text
430
-
431
488
  # 4. Load other elements (rects, lines)
432
489
  rect_elements = [RectangleElement(r, self._page) for r in self._page._page.rects]
433
490
  line_elements = [LineElement(l, self._page) for l in self._page._page.lines]
@@ -463,6 +520,8 @@ class ElementManager:
463
520
 
464
521
  logger.debug(f"Page {self._page.number}: Element loading complete.")
465
522
 
523
+ # If per-word BiDi was skipped, generated_words already stay in logical order.
524
+
466
525
  def _prepare_char_dicts(self) -> List[Dict[str, Any]]:
467
526
  """
468
527
  Prepares a list of character dictionaries from native PDF characters,
natural_pdf/core/page.py CHANGED
@@ -128,6 +128,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
128
128
  "named": {}, # Named regions (name -> region)
129
129
  }
130
130
 
131
+ # -------------------------------------------------------------
132
+ # Page-scoped configuration begins as a shallow copy of the parent
133
+ # PDF-level configuration so that auto-computed tolerances or other
134
+ # page-specific values do not overwrite siblings.
135
+ # -------------------------------------------------------------
136
+ self._config = dict(getattr(self._parent, "_config", {}))
137
+
131
138
  # Initialize ElementManager, passing font_attrs
132
139
  self._element_mgr = ElementManager(self, font_attrs=font_attrs)
133
140
  # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
@@ -1153,10 +1160,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1153
1160
  # 5. Generate Text Layout using Utility
1154
1161
  # Pass page bbox as layout context
1155
1162
  page_bbox = (0, 0, self.width, self.height)
1163
+ # Merge PDF-level default tolerances if caller did not override
1164
+ merged_kwargs = dict(kwargs)
1165
+ tol_keys = ["x_tolerance", "x_tolerance_ratio", "y_tolerance"]
1166
+ for k in tol_keys:
1167
+ if k not in merged_kwargs:
1168
+ if k in self._config:
1169
+ merged_kwargs[k] = self._config[k]
1170
+ elif k in getattr(self._parent, "_config", {}):
1171
+ merged_kwargs[k] = self._parent._config[k]
1172
+
1156
1173
  result = generate_text_layout(
1157
1174
  char_dicts=filtered_chars,
1158
1175
  layout_context_bbox=page_bbox,
1159
- user_kwargs=kwargs, # Pass original user kwargs
1176
+ user_kwargs=merged_kwargs,
1160
1177
  )
1161
1178
 
1162
1179
  # --- Optional: apply Unicode BiDi algorithm for mixed RTL/LTR correctness ---
@@ -1356,6 +1373,37 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1356
1373
 
1357
1374
  # Use the selected method
1358
1375
  if effective_method == "pdfplumber":
1376
+ # ---------------------------------------------------------
1377
+ # Inject auto-computed or user-specified text tolerances so
1378
+ # pdfplumber uses the same numbers we used for word grouping
1379
+ # whenever the table algorithm relies on word positions.
1380
+ # ---------------------------------------------------------
1381
+ if "text" in (
1382
+ table_settings.get("vertical_strategy"),
1383
+ table_settings.get("horizontal_strategy"),
1384
+ ):
1385
+ print("SETTING IT UP")
1386
+ pdf_cfg = getattr(self, "_config", getattr(self._parent, "_config", {}))
1387
+ if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
1388
+ x_tol = pdf_cfg.get("x_tolerance")
1389
+ if x_tol is not None:
1390
+ table_settings.setdefault("text_x_tolerance", x_tol)
1391
+ if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
1392
+ y_tol = pdf_cfg.get("y_tolerance")
1393
+ if y_tol is not None:
1394
+ table_settings.setdefault("text_y_tolerance", y_tol)
1395
+
1396
+ # pdfplumber's text strategy benefits from a tight snap tolerance.
1397
+ if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
1398
+ # Derive from y_tol if available, else default 1
1399
+ snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
1400
+ table_settings.setdefault("snap_tolerance", snap)
1401
+ if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
1402
+ join = table_settings.get("snap_tolerance", 1)
1403
+ table_settings.setdefault("join_tolerance", join)
1404
+ table_settings.setdefault("join_x_tolerance", join)
1405
+ table_settings.setdefault("join_y_tolerance", join)
1406
+
1359
1407
  return self._page.extract_tables(table_settings)
1360
1408
  else:
1361
1409
  raise ValueError(
natural_pdf/core/pdf.py CHANGED
@@ -168,6 +168,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
168
168
  reading_order: bool = True,
169
169
  font_attrs: Optional[List[str]] = None,
170
170
  keep_spaces: bool = True,
171
+ text_tolerance: Optional[dict] = None,
172
+ auto_text_tolerance: bool = True,
171
173
  ):
172
174
  """
173
175
  Initialize the enhanced PDF object.
@@ -177,6 +179,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
177
179
  reading_order: Whether to use natural reading order
178
180
  font_attrs: Font attributes for grouping characters into words
179
181
  keep_spaces: Whether to include spaces in word elements
182
+ text_tolerance: PDFplumber-style tolerance settings
183
+ auto_text_tolerance: Whether to automatically scale text tolerance
180
184
  """
181
185
  self._original_path_or_stream = path_or_url_or_stream
182
186
  self._temp_file = None
@@ -274,6 +278,24 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
274
278
  getattr(self, "_is_stream", False),
275
279
  )
276
280
 
281
+ # --- Text tolerance settings ------------------------------------
282
+ # Users can pass pdfplumber-style keys (x_tolerance, x_tolerance_ratio,
283
+ # y_tolerance, etc.) via *text_tolerance*. We also keep a flag that
284
+ # enables automatic tolerance scaling when explicit values are not
285
+ # supplied.
286
+ self._config["auto_text_tolerance"] = bool(auto_text_tolerance)
287
+ if text_tolerance:
288
+ # Only copy recognised primitives (numbers / None); ignore junk.
289
+ allowed = {
290
+ "x_tolerance",
291
+ "x_tolerance_ratio",
292
+ "y_tolerance",
293
+ "keep_blank_chars", # passthrough convenience
294
+ }
295
+ for k, v in text_tolerance.items():
296
+ if k in allowed:
297
+ self._config[k] = v
298
+
277
299
  def _initialize_managers(self):
278
300
  """Set up manager factories for lazy instantiation."""
279
301
  # Store factories/classes for each manager key
@@ -1901,7 +1901,68 @@ class ElementCollection(
1901
1901
  )
1902
1902
  )
1903
1903
 
1904
+ # ------------------------------------------------------------------
1905
+ # NEW METHOD: apply_ocr for collections (supports custom function)
1906
+ # ------------------------------------------------------------------
1907
+ def apply_ocr(
1908
+ self,
1909
+ *,
1910
+ function: Optional[Callable[["Region"], Optional[str]]] = None,
1911
+ show_progress: bool = True,
1912
+ **kwargs,
1913
+ ) -> "ElementCollection":
1914
+ """Apply OCR to every element in the collection.
1915
+
1916
+ This is a convenience wrapper that simply iterates over the collection
1917
+ and calls ``el.apply_ocr(...)`` on each item.
1918
+
1919
+ Two modes are supported depending on the arguments provided:
1920
+
1921
+ 1. **Built-in OCR engines** – pass parameters like ``engine='easyocr'``
1922
+ or ``languages=['en']`` and each element delegates to the global
1923
+ OCRManager.
1924
+ 2. **Custom function** – pass a *callable* via the ``function`` keyword
1925
+ (alias ``ocr_function`` also recognised). The callable will receive
1926
+ the element/region and must return the recognised text (or ``None``).
1927
+ Internally this is forwarded through the element's own
1928
+ :py:meth:`apply_ocr` implementation, so the behaviour mirrors the
1929
+ single-element API.
1930
+
1931
+ Parameters
1932
+ ----------
1933
+ function : callable, optional
1934
+ Custom OCR function to use instead of the built-in engines.
1935
+ show_progress : bool, default True
1936
+ Display a tqdm progress bar while processing.
1937
+ **kwargs
1938
+ Additional parameters forwarded to each element's ``apply_ocr``.
1939
+
1940
+ Returns
1941
+ -------
1942
+ ElementCollection
1943
+ *Self* for fluent chaining.
1944
+ """
1945
+ # Alias for backward-compatibility
1946
+ if function is None and "ocr_function" in kwargs:
1947
+ function = kwargs.pop("ocr_function")
1948
+
1949
+ def _process(el):
1950
+ if hasattr(el, "apply_ocr"):
1951
+ if function is not None:
1952
+ return el.apply_ocr(function=function, **kwargs)
1953
+ else:
1954
+ return el.apply_ocr(**kwargs)
1955
+ else:
1956
+ logger.warning(
1957
+ f"Element of type {type(el).__name__} does not support apply_ocr. Skipping."
1958
+ )
1959
+ return el
1960
+
1961
+ # Use collection's apply helper for optional progress bar
1962
+ self.apply(_process, show_progress=show_progress)
1963
+ return self
1904
1964
 
1965
+ # ------------------------------------------------------------------
1905
1966
 
1906
1967
 
1907
1968
  class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):