natural-pdf 0.1.31__py3-none-any.whl → 0.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -230,12 +230,51 @@ class ElementManager:
230
230
  char_to_index[key] = idx
231
231
 
232
232
  # 2. Instantiate the custom word extractor
233
- # Get config settings from the parent PDF or use defaults
233
+ # Prefer page-level config over PDF-level for tolerance lookup
234
+ page_config = getattr(self._page, "_config", {})
234
235
  pdf_config = getattr(self._page._parent, "_config", {})
235
- xt = pdf_config.get("x_tolerance", 3)
236
- yt = pdf_config.get("y_tolerance", 3)
236
+
237
+ # Start with any explicitly supplied tolerances (may be None)
238
+ xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
239
+ yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
237
240
  use_flow = pdf_config.get("use_text_flow", False)
238
241
 
242
+ # ------------------------------------------------------------------
243
+ # Auto-adaptive tolerance: scale based on median character size when
244
+ # requested and explicit values are absent.
245
+ # ------------------------------------------------------------------
246
+ if pdf_config.get("auto_text_tolerance", True):
247
+ import statistics
248
+
249
+ sizes = [c.get("size", 0) for c in prepared_char_dicts if c.get("size")]
250
+ median_size = None
251
+ if sizes:
252
+ median_size = statistics.median(sizes)
253
+ if xt is None:
254
+ xt = 0.25 * median_size # ~kerning width
255
+ # Record back to page config for downstream users
256
+ page_config["x_tolerance"] = xt
257
+ if yt is None:
258
+ yt = 0.6 * median_size # ~line spacing fraction
259
+ page_config["y_tolerance"] = yt
260
+
261
+ # Warn users when the page's font size is extremely small –
262
+ # this is often the root cause of merged-row/column issues.
263
+ if median_size and median_size < 6: # 6 pt is unusually small
264
+ logger.warning(
265
+ f"Page {self._page.number}: Median font size is only {median_size:.1f} pt; "
266
+ f"auto-set x_tolerance={xt:.2f}, y_tolerance={yt:.2f}. "
267
+ "If the output looks wrong you can override these values via "
268
+ "PDF(..., text_tolerance={'x_tolerance': X, 'y_tolerance': Y}, "
269
+ "auto_text_tolerance=False)."
270
+ )
271
+
272
+ # Fallback to pdfplumber defaults if still None
273
+ if xt is None:
274
+ xt = 3
275
+ if yt is None:
276
+ yt = 3
277
+
239
278
  # List of attributes to preserve on word objects
240
279
  attributes_to_preserve = list(
241
280
  set(
natural_pdf/core/page.py CHANGED
@@ -128,6 +128,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
128
128
  "named": {}, # Named regions (name -> region)
129
129
  }
130
130
 
131
+ # -------------------------------------------------------------
132
+ # Page-scoped configuration begins as a shallow copy of the parent
133
+ # PDF-level configuration so that auto-computed tolerances or other
134
+ # page-specific values do not overwrite siblings.
135
+ # -------------------------------------------------------------
136
+ self._config = dict(getattr(self._parent, "_config", {}))
137
+
131
138
  # Initialize ElementManager, passing font_attrs
132
139
  self._element_mgr = ElementManager(self, font_attrs=font_attrs)
133
140
  # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
@@ -1153,10 +1160,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1153
1160
  # 5. Generate Text Layout using Utility
1154
1161
  # Pass page bbox as layout context
1155
1162
  page_bbox = (0, 0, self.width, self.height)
1163
+ # Merge PDF-level default tolerances if caller did not override
1164
+ merged_kwargs = dict(kwargs)
1165
+ tol_keys = ["x_tolerance", "x_tolerance_ratio", "y_tolerance"]
1166
+ for k in tol_keys:
1167
+ if k not in merged_kwargs:
1168
+ if k in self._config:
1169
+ merged_kwargs[k] = self._config[k]
1170
+ elif k in getattr(self._parent, "_config", {}):
1171
+ merged_kwargs[k] = self._parent._config[k]
1172
+
1156
1173
  result = generate_text_layout(
1157
1174
  char_dicts=filtered_chars,
1158
1175
  layout_context_bbox=page_bbox,
1159
- user_kwargs=kwargs, # Pass original user kwargs
1176
+ user_kwargs=merged_kwargs,
1160
1177
  )
1161
1178
 
1162
1179
  # --- Optional: apply Unicode BiDi algorithm for mixed RTL/LTR correctness ---
@@ -1356,6 +1373,37 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1356
1373
 
1357
1374
  # Use the selected method
1358
1375
  if effective_method == "pdfplumber":
1376
+ # ---------------------------------------------------------
1377
+ # Inject auto-computed or user-specified text tolerances so
1378
+ # pdfplumber uses the same numbers we used for word grouping
1379
+ # whenever the table algorithm relies on word positions.
1380
+ # ---------------------------------------------------------
1381
+ if "text" in (
1382
+ table_settings.get("vertical_strategy"),
1383
+ table_settings.get("horizontal_strategy"),
1384
+ ):
1385
+ print("SETTING IT UP")
1386
+ pdf_cfg = getattr(self, "_config", getattr(self._parent, "_config", {}))
1387
+ if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
1388
+ x_tol = pdf_cfg.get("x_tolerance")
1389
+ if x_tol is not None:
1390
+ table_settings.setdefault("text_x_tolerance", x_tol)
1391
+ if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
1392
+ y_tol = pdf_cfg.get("y_tolerance")
1393
+ if y_tol is not None:
1394
+ table_settings.setdefault("text_y_tolerance", y_tol)
1395
+
1396
+ # pdfplumber's text strategy benefits from a tight snap tolerance.
1397
+ if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
1398
+ # Derive from y_tol if available, else default 1
1399
+ snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
1400
+ table_settings.setdefault("snap_tolerance", snap)
1401
+ if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
1402
+ join = table_settings.get("snap_tolerance", 1)
1403
+ table_settings.setdefault("join_tolerance", join)
1404
+ table_settings.setdefault("join_x_tolerance", join)
1405
+ table_settings.setdefault("join_y_tolerance", join)
1406
+
1359
1407
  return self._page.extract_tables(table_settings)
1360
1408
  else:
1361
1409
  raise ValueError(
natural_pdf/core/pdf.py CHANGED
@@ -168,6 +168,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
168
168
  reading_order: bool = True,
169
169
  font_attrs: Optional[List[str]] = None,
170
170
  keep_spaces: bool = True,
171
+ text_tolerance: Optional[dict] = None,
172
+ auto_text_tolerance: bool = True,
171
173
  ):
172
174
  """
173
175
  Initialize the enhanced PDF object.
@@ -177,6 +179,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
177
179
  reading_order: Whether to use natural reading order
178
180
  font_attrs: Font attributes for grouping characters into words
179
181
  keep_spaces: Whether to include spaces in word elements
182
+ text_tolerance: PDFplumber-style tolerance settings
183
+ auto_text_tolerance: Whether to automatically scale text tolerance
180
184
  """
181
185
  self._original_path_or_stream = path_or_url_or_stream
182
186
  self._temp_file = None
@@ -274,6 +278,24 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
274
278
  getattr(self, "_is_stream", False),
275
279
  )
276
280
 
281
+ # --- Text tolerance settings ------------------------------------
282
+ # Users can pass pdfplumber-style keys (x_tolerance, x_tolerance_ratio,
283
+ # y_tolerance, etc.) via *text_tolerance*. We also keep a flag that
284
+ # enables automatic tolerance scaling when explicit values are not
285
+ # supplied.
286
+ self._config["auto_text_tolerance"] = bool(auto_text_tolerance)
287
+ if text_tolerance:
288
+ # Only copy recognised primitives (numbers / None); ignore junk.
289
+ allowed = {
290
+ "x_tolerance",
291
+ "x_tolerance_ratio",
292
+ "y_tolerance",
293
+ "keep_blank_chars", # passthrough convenience
294
+ }
295
+ for k, v in text_tolerance.items():
296
+ if k in allowed:
297
+ self._config[k] = v
298
+
277
299
  def _initialize_managers(self):
278
300
  """Set up manager factories for lazy instantiation."""
279
301
  # Store factories/classes for each manager key
@@ -1901,7 +1901,68 @@ class ElementCollection(
1901
1901
  )
1902
1902
  )
1903
1903
 
1904
+ # ------------------------------------------------------------------
1905
+ # NEW METHOD: apply_ocr for collections (supports custom function)
1906
+ # ------------------------------------------------------------------
1907
+ def apply_ocr(
1908
+ self,
1909
+ *,
1910
+ function: Optional[Callable[["Region"], Optional[str]]] = None,
1911
+ show_progress: bool = True,
1912
+ **kwargs,
1913
+ ) -> "ElementCollection":
1914
+ """Apply OCR to every element in the collection.
1915
+
1916
+ This is a convenience wrapper that simply iterates over the collection
1917
+ and calls ``el.apply_ocr(...)`` on each item.
1918
+
1919
+ Two modes are supported depending on the arguments provided:
1920
+
1921
+ 1. **Built-in OCR engines** – pass parameters like ``engine='easyocr'``
1922
+ or ``languages=['en']`` and each element delegates to the global
1923
+ OCRManager.
1924
+ 2. **Custom function** – pass a *callable* via the ``function`` keyword
1925
+ (alias ``ocr_function`` also recognised). The callable will receive
1926
+ the element/region and must return the recognised text (or ``None``).
1927
+ Internally this is forwarded through the element's own
1928
+ :py:meth:`apply_ocr` implementation, so the behaviour mirrors the
1929
+ single-element API.
1930
+
1931
+ Parameters
1932
+ ----------
1933
+ function : callable, optional
1934
+ Custom OCR function to use instead of the built-in engines.
1935
+ show_progress : bool, default True
1936
+ Display a tqdm progress bar while processing.
1937
+ **kwargs
1938
+ Additional parameters forwarded to each element's ``apply_ocr``.
1939
+
1940
+ Returns
1941
+ -------
1942
+ ElementCollection
1943
+ *Self* for fluent chaining.
1944
+ """
1945
+ # Alias for backward-compatibility
1946
+ if function is None and "ocr_function" in kwargs:
1947
+ function = kwargs.pop("ocr_function")
1948
+
1949
+ def _process(el):
1950
+ if hasattr(el, "apply_ocr"):
1951
+ if function is not None:
1952
+ return el.apply_ocr(function=function, **kwargs)
1953
+ else:
1954
+ return el.apply_ocr(**kwargs)
1955
+ else:
1956
+ logger.warning(
1957
+ f"Element of type {type(el).__name__} does not support apply_ocr. Skipping."
1958
+ )
1959
+ return el
1960
+
1961
+ # Use collection's apply helper for optional progress bar
1962
+ self.apply(_process, show_progress=show_progress)
1963
+ return self
1904
1964
 
1965
+ # ------------------------------------------------------------------
1905
1966
 
1906
1967
 
1907
1968
  class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
@@ -1319,6 +1319,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1319
1319
  table_settings.setdefault("vertical_strategy", "lines")
1320
1320
  table_settings.setdefault("horizontal_strategy", "lines")
1321
1321
 
1322
+ # -------------------------------------------------------------
1323
+ # Auto-inject tolerances when text-based strategies are requested.
1324
+ # This must happen AFTER alias handling (so strategies are final)
1325
+ # and BEFORE we delegate to _extract_table_* helpers.
1326
+ # -------------------------------------------------------------
1327
+ if "text" in (table_settings.get("vertical_strategy"), table_settings.get("horizontal_strategy")):
1328
+ page_cfg = getattr(self.page, "_config", {})
1329
+ # Ensure text_* tolerances passed to pdfplumber
1330
+ if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
1331
+ if page_cfg.get("x_tolerance") is not None:
1332
+ table_settings["text_x_tolerance"] = page_cfg["x_tolerance"]
1333
+ if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
1334
+ if page_cfg.get("y_tolerance") is not None:
1335
+ table_settings["text_y_tolerance"] = page_cfg["y_tolerance"]
1336
+
1337
+ # Snap / join tolerances (~ line spacing)
1338
+ if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
1339
+ snap = max(1, round((page_cfg.get("y_tolerance", 1)) * 0.9))
1340
+ table_settings["snap_tolerance"] = snap
1341
+ if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
1342
+ table_settings["join_tolerance"] = table_settings["snap_tolerance"]
1343
+
1322
1344
  logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
1323
1345
 
1324
1346
  # Use the selected method
@@ -1438,6 +1460,30 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1438
1460
  Returns:
1439
1461
  List of tables, where each table is a list of rows, and each row is a list of cell values
1440
1462
  """
1463
+ # Inject global PDF-level text tolerances if not explicitly present
1464
+ pdf_cfg = getattr(self.page, "_config", getattr(self.page._parent, "_config", {}))
1465
+ _uses_text = "text" in (
1466
+ table_settings.get("vertical_strategy"),
1467
+ table_settings.get("horizontal_strategy"),
1468
+ )
1469
+ if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
1470
+ x_tol = pdf_cfg.get("x_tolerance")
1471
+ if x_tol is not None:
1472
+ table_settings.setdefault("text_x_tolerance", x_tol)
1473
+ if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
1474
+ y_tol = pdf_cfg.get("y_tolerance")
1475
+ if y_tol is not None:
1476
+ table_settings.setdefault("text_y_tolerance", y_tol)
1477
+
1478
+ if _uses_text and "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
1479
+ snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
1480
+ table_settings.setdefault("snap_tolerance", snap)
1481
+ if _uses_text and "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
1482
+ join = table_settings.get("snap_tolerance", 1)
1483
+ table_settings.setdefault("join_tolerance", join)
1484
+ table_settings.setdefault("join_x_tolerance", join)
1485
+ table_settings.setdefault("join_y_tolerance", join)
1486
+
1441
1487
  # Create a crop of the page for this region
1442
1488
  cropped = self.page._page.crop(self.bbox)
1443
1489
 
@@ -1458,6 +1504,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1458
1504
  Returns:
1459
1505
  Table data as a list of rows, where each row is a list of cell values
1460
1506
  """
1507
+ # Inject global PDF-level text tolerances if not explicitly present
1508
+ pdf_cfg = getattr(self.page, "_config", getattr(self.page._parent, "_config", {}))
1509
+ _uses_text = "text" in (
1510
+ table_settings.get("vertical_strategy"),
1511
+ table_settings.get("horizontal_strategy"),
1512
+ )
1513
+ if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
1514
+ x_tol = pdf_cfg.get("x_tolerance")
1515
+ if x_tol is not None:
1516
+ table_settings.setdefault("text_x_tolerance", x_tol)
1517
+ if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
1518
+ y_tol = pdf_cfg.get("y_tolerance")
1519
+ if y_tol is not None:
1520
+ table_settings.setdefault("text_y_tolerance", y_tol)
1521
+
1461
1522
  # Create a crop of the page for this region
1462
1523
  cropped = self.page._page.crop(self.bbox)
1463
1524
 
@@ -1943,21 +2004,45 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1943
2004
  """
1944
2005
  Apply OCR to this region and return the created text elements.
1945
2006
 
2007
+ This method supports two modes:
2008
+ 1. **Built-in OCR Engines** (default) – identical to previous behaviour. Pass typical
2009
+ parameters like ``engine='easyocr'`` or ``languages=['en']`` and the method will
2010
+ route the request through :class:`OCRManager`.
2011
+ 2. **Custom OCR Function** – pass a *callable* under the keyword ``function`` (or
2012
+ ``ocr_function``). The callable will receive *this* Region instance and should
2013
+ return the extracted text (``str``) or ``None``. Internally the call is
2014
+ delegated to :pymeth:`apply_custom_ocr` so the same logic (replacement, element
2015
+ creation, etc.) is re-used.
2016
+
2017
+ Examples
2018
+ ---------
2019
+ >>> def llm_ocr(region):
2020
+ ... image = region.to_image(resolution=300, crop=True)
2021
+ ... return my_llm_client.ocr(image)
2022
+ >>> region.apply_ocr(function=llm_ocr)
2023
+
1946
2024
  Args:
1947
- replace: If True (default), removes existing OCR elements in the region
1948
- before adding new ones. If False, adds new OCR elements without
1949
- removing existing ones.
1950
- **ocr_params: Keyword arguments passed to the OCR Manager.
1951
- Common parameters like `engine`, `languages`, `min_confidence`,
1952
- `device`, and `resolution` (for image rendering) should be
1953
- provided here. **The `languages` list must contain codes
1954
- understood by the specific engine selected.** No mapping
1955
- is performed. Engine-specific settings can be passed in
1956
- an `options` object (e.g., `options=EasyOCROptions(...)`).
2025
+ replace: Whether to remove existing OCR elements first (default ``True``).
2026
+ **ocr_params: Parameters for the built-in OCR manager *or* the special
2027
+ ``function``/``ocr_function`` keyword to trigger custom mode.
2028
+
2029
+ Returns
2030
+ -------
2031
+ Self for chaining.
2032
+ """
2033
+ # --- Custom OCR function path --------------------------------------------------
2034
+ custom_func = ocr_params.pop("function", None) or ocr_params.pop("ocr_function", None)
2035
+ if callable(custom_func):
2036
+ # Delegate to the specialised helper while preserving key kwargs
2037
+ return self.apply_custom_ocr(
2038
+ ocr_function=custom_func,
2039
+ source_label=ocr_params.pop("source_label", "custom-ocr"),
2040
+ replace=replace,
2041
+ confidence=ocr_params.pop("confidence", None),
2042
+ add_to_page=ocr_params.pop("add_to_page", True),
2043
+ )
1957
2044
 
1958
- Returns:
1959
- Self for method chaining.
1960
- """
2045
+ # --- Original built-in OCR engine path (unchanged except docstring) ------------
1961
2046
  # Ensure OCRManager is available
1962
2047
  if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
1963
2048
  logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
@@ -2123,6 +2208,133 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2123
2208
  logger.info(f"Region {self.bbox}: Added {len(created_elements)} elements from OCR.")
2124
2209
  return self
2125
2210
 
2211
+ def apply_custom_ocr(
2212
+ self,
2213
+ ocr_function: Callable[["Region"], Optional[str]],
2214
+ source_label: str = "custom-ocr",
2215
+ replace: bool = True,
2216
+ confidence: Optional[float] = None,
2217
+ add_to_page: bool = True,
2218
+ ) -> "Region":
2219
+ """
2220
+ Apply a custom OCR function to this region and create text elements from the results.
2221
+
2222
+ This is useful when you want to use a custom OCR method (e.g., an LLM API,
2223
+ specialized OCR service, or any custom logic) instead of the built-in OCR engines.
2224
+
2225
+ Args:
2226
+ ocr_function: A callable that takes a Region and returns the OCR'd text (or None).
2227
+ The function receives this region as its argument and should return
2228
+ the extracted text as a string, or None if no text was found.
2229
+ source_label: Label to identify the source of these text elements (default: "custom-ocr").
2230
+ This will be set as the 'source' attribute on created elements.
2231
+ replace: If True (default), removes existing OCR elements in this region before
2232
+ adding new ones. If False, adds new OCR elements alongside existing ones.
2233
+ confidence: Optional confidence score for the OCR result (0.0-1.0).
2234
+ If None, defaults to 1.0 if text is returned, 0.0 if None is returned.
2235
+ add_to_page: If True (default), adds the created text element to the page.
2236
+ If False, creates the element but doesn't add it to the page.
2237
+
2238
+ Returns:
2239
+ Self for method chaining.
2240
+
2241
+ Example:
2242
+ # Using with an LLM
2243
+ def ocr_with_llm(region):
2244
+ image = region.to_image(resolution=300, crop=True)
2245
+ # Call your LLM API here
2246
+ return llm_client.ocr(image)
2247
+
2248
+ region.apply_custom_ocr(ocr_with_llm)
2249
+
2250
+ # Using with a custom OCR service
2251
+ def ocr_with_service(region):
2252
+ img_bytes = region.to_image(crop=True).tobytes()
2253
+ response = ocr_service.process(img_bytes)
2254
+ return response.text
2255
+
2256
+ region.apply_custom_ocr(ocr_with_service, source_label="my-ocr-service")
2257
+ """
2258
+ # If replace is True, remove existing OCR elements in this region
2259
+ if replace:
2260
+ logger.info(
2261
+ f"Region {self.bbox}: Removing existing OCR elements before applying custom OCR."
2262
+ )
2263
+
2264
+ removed_count = 0
2265
+
2266
+ # Helper to remove a single element safely
2267
+ def _safe_remove(elem):
2268
+ nonlocal removed_count
2269
+ success = False
2270
+ if hasattr(elem, "page") and hasattr(elem.page, "_element_mgr"):
2271
+ etype = getattr(elem, "object_type", "word")
2272
+ if etype == "word":
2273
+ etype_key = "words"
2274
+ elif etype == "char":
2275
+ etype_key = "chars"
2276
+ else:
2277
+ etype_key = etype + "s" if not etype.endswith("s") else etype
2278
+ try:
2279
+ success = elem.page._element_mgr.remove_element(elem, etype_key)
2280
+ except Exception:
2281
+ success = False
2282
+ if success:
2283
+ removed_count += 1
2284
+
2285
+ # Remove OCR elements overlapping this region
2286
+ for word in list(self.page._element_mgr.words):
2287
+ if getattr(word, "source", "").startswith("ocr") and self.intersects(word):
2288
+ _safe_remove(word)
2289
+
2290
+ # Also check custom-ocr sources
2291
+ for word in list(self.page._element_mgr.words):
2292
+ if getattr(word, "source", "") == source_label and self.intersects(word):
2293
+ _safe_remove(word)
2294
+
2295
+ if removed_count > 0:
2296
+ logger.info(
2297
+ f"Region {self.bbox}: Removed {removed_count} existing OCR elements."
2298
+ )
2299
+
2300
+ # Call the custom OCR function
2301
+ try:
2302
+ logger.debug(f"Region {self.bbox}: Calling custom OCR function...")
2303
+ ocr_text = ocr_function(self)
2304
+
2305
+ if ocr_text is not None and not isinstance(ocr_text, str):
2306
+ logger.warning(
2307
+ f"Custom OCR function returned non-string type ({type(ocr_text)}). "
2308
+ f"Converting to string."
2309
+ )
2310
+ ocr_text = str(ocr_text)
2311
+
2312
+ except Exception as e:
2313
+ logger.error(
2314
+ f"Error calling custom OCR function for region {self.bbox}: {e}",
2315
+ exc_info=True
2316
+ )
2317
+ return self
2318
+
2319
+ # Create text element if we got text
2320
+ if ocr_text is not None:
2321
+ # Use the to_text_element method to create the element
2322
+ text_element = self.to_text_element(
2323
+ text_content=ocr_text,
2324
+ source_label=source_label,
2325
+ confidence=confidence,
2326
+ add_to_page=add_to_page
2327
+ )
2328
+
2329
+ logger.info(
2330
+ f"Region {self.bbox}: Created text element with {len(ocr_text)} chars"
2331
+ f"{' and added to page' if add_to_page else ''}"
2332
+ )
2333
+ else:
2334
+ logger.debug(f"Region {self.bbox}: Custom OCR function returned None (no text found)")
2335
+
2336
+ return self
2337
+
2126
2338
  def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
2127
2339
  """
2128
2340
  Get a section between two elements within this region.
@@ -2917,6 +3129,33 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2917
3129
  if not hasattr(self, "page") or self.page is None:
2918
3130
  raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
2919
3131
 
3132
+ # Create character dictionaries for the text
3133
+ char_dicts = []
3134
+ if actual_text:
3135
+ # Create a single character dict that spans the entire region
3136
+ # This is a simplified approach - OCR engines typically create one per character
3137
+ char_dict = {
3138
+ "text": actual_text,
3139
+ "x0": self.x0,
3140
+ "top": self.top,
3141
+ "x1": self.x1,
3142
+ "bottom": self.bottom,
3143
+ "width": self.width,
3144
+ "height": self.height,
3145
+ "object_type": "char",
3146
+ "page_number": self.page.page_number,
3147
+ "fontname": default_font_name,
3148
+ "size": default_font_size,
3149
+ "upright": True,
3150
+ "direction": 1,
3151
+ "adv": self.width,
3152
+ "source": source_label,
3153
+ "confidence": final_confidence,
3154
+ "stroking_color": (0, 0, 0),
3155
+ "non_stroking_color": (0, 0, 0),
3156
+ }
3157
+ char_dicts.append(char_dict)
3158
+
2920
3159
  elem_data = {
2921
3160
  "text": actual_text,
2922
3161
  "x0": self.x0,
@@ -2936,7 +3175,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2936
3175
  "adv": self.width,
2937
3176
  "source": source_label,
2938
3177
  "confidence": final_confidence,
2939
- "_char_dicts": [],
3178
+ "_char_dicts": char_dicts,
2940
3179
  }
2941
3180
  text_element = TextElement(elem_data, self.page)
2942
3181
 
@@ -2952,6 +3191,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2952
3191
  logger.debug(
2953
3192
  f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}."
2954
3193
  )
3194
+ # Also add character dictionaries to the chars collection
3195
+ if char_dicts and object_type == "word":
3196
+ for char_dict in char_dicts:
3197
+ self.page._element_mgr.add_element(char_dict, element_type="chars")
2955
3198
  else:
2956
3199
  page_num_str = (
2957
3200
  str(self.page.page_number) if hasattr(self.page, "page_number") else "N/A"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.31
3
+ Version: 0.1.32
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -1,11 +1,8 @@
1
- bad_pdf_analysis/analyze_10_more.py,sha256=UjsTuHE1GUMoVjkX3afy3x6DfpXyfZXHgS2W1GQqUmw,11906
2
- bad_pdf_analysis/analyze_final_10.py,sha256=xYkIId0nF9LpWHRLDP1_nlJfJfC0b0Tu4mLu-3mim-0,25170
3
- bad_pdf_analysis/analyze_specific_pages.py,sha256=wzq3_ZWR28hFdT7GEkayHPYgsk20OpD476LYmy2rAEk,13725
4
- bad_pdf_analysis/analyze_specific_pages_direct.py,sha256=307gSNplwOtNTR9a0lEQWxlAKGeoZIcDe5z1pROKUXY,14846
5
1
  natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
6
2
  natural_pdf/cli.py,sha256=IXrP2lCHihr-ed-CFiDbMTnSsutQa1j1PYALOLGbpsc,4019
7
- natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
8
- natural_pdf/analyzers/shape_detection_mixin.py,sha256=0a4uuKQ4Z1Ta_UVuUtX7mVhlwmXdAkoHTyC5wZyp5do,94455
3
+ natural_pdf/analyzers/__init__.py,sha256=MQRctn4i5Q7u8pb8vQVHKEXUiVGpKyPZUECrlDH4AuU,673
4
+ natural_pdf/analyzers/guides.py,sha256=tzyViSBDdM66mT0niwFTDIJ16UzRCZ18Iqv8wA5DYAk,90302
5
+ natural_pdf/analyzers/shape_detection_mixin.py,sha256=q7gDM-z2t7bSTxjfV2aaW3533CySu1qsEpu4wb5Rp-I,62688
9
6
  natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
10
7
  natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
11
8
  natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
@@ -28,10 +25,10 @@ natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZH
28
25
  natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
29
26
  natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
30
27
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
31
- natural_pdf/core/element_manager.py,sha256=Mn4cYqPL_2LD_GK9lf2duExaJF1qhASCKsOdAZdQb00,49821
28
+ natural_pdf/core/element_manager.py,sha256=A6GJk9kwTzt-aSz4-SWaRHLZRbIMFFLce3CpxSyfkV4,51749
32
29
  natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
33
- natural_pdf/core/page.py,sha256=kQKKqsbOaNeLhW3ark6mueDS-4tsopJcGcoMmKPK6B8,125624
34
- natural_pdf/core/pdf.py,sha256=YfniZp54AyptzMyr7ZP8n617n4wlV28SPrajt32nNBk,74233
30
+ natural_pdf/core/page.py,sha256=843_Fyk1gxZ8nqERJjjjoRD3iM4pFJy9a0zQSyMthiQ,128476
31
+ natural_pdf/core/pdf.py,sha256=mC4GZjPXx_bK6RUlhLpnJnapkHDhbgJpgpcUJOvb7OE,75290
35
32
  natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
36
33
  natural_pdf/describe/base.py,sha256=HaWlHltb-dw6ug4mfR_iBLHWxr1OdPwLaUshXRxO7gg,18462
37
34
  natural_pdf/describe/elements.py,sha256=COvKF3B_RbAxXl5ORJDubV4C5PsiuSfuzD0ufPIJTFM,12983
@@ -39,11 +36,11 @@ natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo
39
36
  natural_pdf/describe/summary.py,sha256=7FIF3zF6bzNx-gx4pCJr2XQFKiVzOEDnWsAYQ_mr9L0,7982
40
37
  natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
41
38
  natural_pdf/elements/base.py,sha256=VshU4RstdzONJFq_8UVIjT_lVOai0MwMFsSFrCN-IO8,47299
42
- natural_pdf/elements/collections.py,sha256=52Oac96svzm_QMJcVaItnCG9P7d6JMNiGEx9lHgDEQg,125915
39
+ natural_pdf/elements/collections.py,sha256=1E2MSg2NNcEcoRM2rumrv_CqIdO7DgbRHYEtfw35FaQ,128457
43
40
  natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI,1423
44
41
  natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
45
42
  natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
46
- natural_pdf/elements/region.py,sha256=v1PzWvQoGHGdn7SQiPf4Oq3hIGueIfYGwcZ05ZU6XPE,127692
43
+ natural_pdf/elements/region.py,sha256=8SKhzCJ6sELZxJcM2i_58YhEKU6HBvaJ7Oj6E3bOsHw,139523
47
44
  natural_pdf/elements/text.py,sha256=kw7u2KfHtDB905YawP7Hs89kcR8XnbtpkYQGEk6LNyk,18860
48
45
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
49
46
  natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
@@ -100,13 +97,12 @@ natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6
100
97
  natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
101
98
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
102
99
  natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
103
- natural_pdf-0.1.31.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
100
+ natural_pdf-0.1.32.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
104
101
  optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
105
102
  optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
106
103
  optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
107
104
  optimization/test_cleanup_methods.py,sha256=B_zHiJr1hI8q-tdfBoFi0Jf5lj2PURjA_6teRBGoz8o,6277
108
105
  optimization/test_memory_fix.py,sha256=CWc0OSvFfKE0-nxqJOi_HAQc0GXUPKzkQbTeJp5UqxU,6364
109
- tools/rtl_smoke_test.py,sha256=-ogcbvNzumJasICP0NNQHk4Zb4M1VRx0TnGkJUQC7SM,3043
110
106
  tools/bad_pdf_eval/__init__.py,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
111
107
  tools/bad_pdf_eval/analyser.py,sha256=sR31aVVmTXRHS8uwLZXlPefTH2_lskxtAzuZwlhsyOo,13391
112
108
  tools/bad_pdf_eval/collate_summaries.py,sha256=Mcmf1OvVn0S0efj5ypk0syXKSrfUf6L5dowoGvOTgjU,5047
@@ -115,8 +111,8 @@ tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrO
115
111
  tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
116
112
  tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
117
113
  tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
118
- natural_pdf-0.1.31.dist-info/METADATA,sha256=tqimu2ZReyYu5pS0PsbCo-Z9fIzkpMj1ljGPNbaOFss,6711
119
- natural_pdf-0.1.31.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
120
- natural_pdf-0.1.31.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
121
- natural_pdf-0.1.31.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
122
- natural_pdf-0.1.31.dist-info/RECORD,,
114
+ natural_pdf-0.1.32.dist-info/METADATA,sha256=CMZIo2BjeLh-b9hezQHMLehZP8brUflCQ69dLtfFyxo,6711
115
+ natural_pdf-0.1.32.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
116
+ natural_pdf-0.1.32.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
117
+ natural_pdf-0.1.32.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
118
+ natural_pdf-0.1.32.dist-info/RECORD,,