natural-pdf 0.1.31__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +18 -4
- natural_pdf/analyzers/guides.py +2176 -0
- natural_pdf/analyzers/shape_detection_mixin.py +0 -650
- natural_pdf/core/element_manager.py +42 -3
- natural_pdf/core/page.py +49 -1
- natural_pdf/core/pdf.py +22 -0
- natural_pdf/elements/collections.py +61 -0
- natural_pdf/elements/region.py +257 -14
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/RECORD +14 -18
- bad_pdf_analysis/analyze_10_more.py +0 -300
- bad_pdf_analysis/analyze_final_10.py +0 -552
- bad_pdf_analysis/analyze_specific_pages.py +0 -394
- bad_pdf_analysis/analyze_specific_pages_direct.py +0 -382
- tools/rtl_smoke_test.py +0 -80
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/top_level.txt +0 -0
@@ -230,12 +230,51 @@ class ElementManager:
|
|
230
230
|
char_to_index[key] = idx
|
231
231
|
|
232
232
|
# 2. Instantiate the custom word extractor
|
233
|
-
#
|
233
|
+
# Prefer page-level config over PDF-level for tolerance lookup
|
234
|
+
page_config = getattr(self._page, "_config", {})
|
234
235
|
pdf_config = getattr(self._page._parent, "_config", {})
|
235
|
-
|
236
|
-
|
236
|
+
|
237
|
+
# Start with any explicitly supplied tolerances (may be None)
|
238
|
+
xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
|
239
|
+
yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
|
237
240
|
use_flow = pdf_config.get("use_text_flow", False)
|
238
241
|
|
242
|
+
# ------------------------------------------------------------------
|
243
|
+
# Auto-adaptive tolerance: scale based on median character size when
|
244
|
+
# requested and explicit values are absent.
|
245
|
+
# ------------------------------------------------------------------
|
246
|
+
if pdf_config.get("auto_text_tolerance", True):
|
247
|
+
import statistics
|
248
|
+
|
249
|
+
sizes = [c.get("size", 0) for c in prepared_char_dicts if c.get("size")]
|
250
|
+
median_size = None
|
251
|
+
if sizes:
|
252
|
+
median_size = statistics.median(sizes)
|
253
|
+
if xt is None:
|
254
|
+
xt = 0.25 * median_size # ~kerning width
|
255
|
+
# Record back to page config for downstream users
|
256
|
+
page_config["x_tolerance"] = xt
|
257
|
+
if yt is None:
|
258
|
+
yt = 0.6 * median_size # ~line spacing fraction
|
259
|
+
page_config["y_tolerance"] = yt
|
260
|
+
|
261
|
+
# Warn users when the page's font size is extremely small –
|
262
|
+
# this is often the root cause of merged-row/column issues.
|
263
|
+
if median_size and median_size < 6: # 6 pt is unusually small
|
264
|
+
logger.warning(
|
265
|
+
f"Page {self._page.number}: Median font size is only {median_size:.1f} pt; "
|
266
|
+
f"auto-set x_tolerance={xt:.2f}, y_tolerance={yt:.2f}. "
|
267
|
+
"If the output looks wrong you can override these values via "
|
268
|
+
"PDF(..., text_tolerance={'x_tolerance': X, 'y_tolerance': Y}, "
|
269
|
+
"auto_text_tolerance=False)."
|
270
|
+
)
|
271
|
+
|
272
|
+
# Fallback to pdfplumber defaults if still None
|
273
|
+
if xt is None:
|
274
|
+
xt = 3
|
275
|
+
if yt is None:
|
276
|
+
yt = 3
|
277
|
+
|
239
278
|
# List of attributes to preserve on word objects
|
240
279
|
attributes_to_preserve = list(
|
241
280
|
set(
|
natural_pdf/core/page.py
CHANGED
@@ -128,6 +128,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
128
128
|
"named": {}, # Named regions (name -> region)
|
129
129
|
}
|
130
130
|
|
131
|
+
# -------------------------------------------------------------
|
132
|
+
# Page-scoped configuration begins as a shallow copy of the parent
|
133
|
+
# PDF-level configuration so that auto-computed tolerances or other
|
134
|
+
# page-specific values do not overwrite siblings.
|
135
|
+
# -------------------------------------------------------------
|
136
|
+
self._config = dict(getattr(self._parent, "_config", {}))
|
137
|
+
|
131
138
|
# Initialize ElementManager, passing font_attrs
|
132
139
|
self._element_mgr = ElementManager(self, font_attrs=font_attrs)
|
133
140
|
# self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
|
@@ -1153,10 +1160,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1153
1160
|
# 5. Generate Text Layout using Utility
|
1154
1161
|
# Pass page bbox as layout context
|
1155
1162
|
page_bbox = (0, 0, self.width, self.height)
|
1163
|
+
# Merge PDF-level default tolerances if caller did not override
|
1164
|
+
merged_kwargs = dict(kwargs)
|
1165
|
+
tol_keys = ["x_tolerance", "x_tolerance_ratio", "y_tolerance"]
|
1166
|
+
for k in tol_keys:
|
1167
|
+
if k not in merged_kwargs:
|
1168
|
+
if k in self._config:
|
1169
|
+
merged_kwargs[k] = self._config[k]
|
1170
|
+
elif k in getattr(self._parent, "_config", {}):
|
1171
|
+
merged_kwargs[k] = self._parent._config[k]
|
1172
|
+
|
1156
1173
|
result = generate_text_layout(
|
1157
1174
|
char_dicts=filtered_chars,
|
1158
1175
|
layout_context_bbox=page_bbox,
|
1159
|
-
user_kwargs=
|
1176
|
+
user_kwargs=merged_kwargs,
|
1160
1177
|
)
|
1161
1178
|
|
1162
1179
|
# --- Optional: apply Unicode BiDi algorithm for mixed RTL/LTR correctness ---
|
@@ -1356,6 +1373,37 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1356
1373
|
|
1357
1374
|
# Use the selected method
|
1358
1375
|
if effective_method == "pdfplumber":
|
1376
|
+
# ---------------------------------------------------------
|
1377
|
+
# Inject auto-computed or user-specified text tolerances so
|
1378
|
+
# pdfplumber uses the same numbers we used for word grouping
|
1379
|
+
# whenever the table algorithm relies on word positions.
|
1380
|
+
# ---------------------------------------------------------
|
1381
|
+
if "text" in (
|
1382
|
+
table_settings.get("vertical_strategy"),
|
1383
|
+
table_settings.get("horizontal_strategy"),
|
1384
|
+
):
|
1385
|
+
print("SETTING IT UP")
|
1386
|
+
pdf_cfg = getattr(self, "_config", getattr(self._parent, "_config", {}))
|
1387
|
+
if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
|
1388
|
+
x_tol = pdf_cfg.get("x_tolerance")
|
1389
|
+
if x_tol is not None:
|
1390
|
+
table_settings.setdefault("text_x_tolerance", x_tol)
|
1391
|
+
if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
|
1392
|
+
y_tol = pdf_cfg.get("y_tolerance")
|
1393
|
+
if y_tol is not None:
|
1394
|
+
table_settings.setdefault("text_y_tolerance", y_tol)
|
1395
|
+
|
1396
|
+
# pdfplumber's text strategy benefits from a tight snap tolerance.
|
1397
|
+
if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
|
1398
|
+
# Derive from y_tol if available, else default 1
|
1399
|
+
snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
|
1400
|
+
table_settings.setdefault("snap_tolerance", snap)
|
1401
|
+
if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
|
1402
|
+
join = table_settings.get("snap_tolerance", 1)
|
1403
|
+
table_settings.setdefault("join_tolerance", join)
|
1404
|
+
table_settings.setdefault("join_x_tolerance", join)
|
1405
|
+
table_settings.setdefault("join_y_tolerance", join)
|
1406
|
+
|
1359
1407
|
return self._page.extract_tables(table_settings)
|
1360
1408
|
else:
|
1361
1409
|
raise ValueError(
|
natural_pdf/core/pdf.py
CHANGED
@@ -168,6 +168,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
168
168
|
reading_order: bool = True,
|
169
169
|
font_attrs: Optional[List[str]] = None,
|
170
170
|
keep_spaces: bool = True,
|
171
|
+
text_tolerance: Optional[dict] = None,
|
172
|
+
auto_text_tolerance: bool = True,
|
171
173
|
):
|
172
174
|
"""
|
173
175
|
Initialize the enhanced PDF object.
|
@@ -177,6 +179,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
177
179
|
reading_order: Whether to use natural reading order
|
178
180
|
font_attrs: Font attributes for grouping characters into words
|
179
181
|
keep_spaces: Whether to include spaces in word elements
|
182
|
+
text_tolerance: PDFplumber-style tolerance settings
|
183
|
+
auto_text_tolerance: Whether to automatically scale text tolerance
|
180
184
|
"""
|
181
185
|
self._original_path_or_stream = path_or_url_or_stream
|
182
186
|
self._temp_file = None
|
@@ -274,6 +278,24 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
274
278
|
getattr(self, "_is_stream", False),
|
275
279
|
)
|
276
280
|
|
281
|
+
# --- Text tolerance settings ------------------------------------
|
282
|
+
# Users can pass pdfplumber-style keys (x_tolerance, x_tolerance_ratio,
|
283
|
+
# y_tolerance, etc.) via *text_tolerance*. We also keep a flag that
|
284
|
+
# enables automatic tolerance scaling when explicit values are not
|
285
|
+
# supplied.
|
286
|
+
self._config["auto_text_tolerance"] = bool(auto_text_tolerance)
|
287
|
+
if text_tolerance:
|
288
|
+
# Only copy recognised primitives (numbers / None); ignore junk.
|
289
|
+
allowed = {
|
290
|
+
"x_tolerance",
|
291
|
+
"x_tolerance_ratio",
|
292
|
+
"y_tolerance",
|
293
|
+
"keep_blank_chars", # passthrough convenience
|
294
|
+
}
|
295
|
+
for k, v in text_tolerance.items():
|
296
|
+
if k in allowed:
|
297
|
+
self._config[k] = v
|
298
|
+
|
277
299
|
def _initialize_managers(self):
|
278
300
|
"""Set up manager factories for lazy instantiation."""
|
279
301
|
# Store factories/classes for each manager key
|
@@ -1901,7 +1901,68 @@ class ElementCollection(
|
|
1901
1901
|
)
|
1902
1902
|
)
|
1903
1903
|
|
1904
|
+
# ------------------------------------------------------------------
|
1905
|
+
# NEW METHOD: apply_ocr for collections (supports custom function)
|
1906
|
+
# ------------------------------------------------------------------
|
1907
|
+
def apply_ocr(
|
1908
|
+
self,
|
1909
|
+
*,
|
1910
|
+
function: Optional[Callable[["Region"], Optional[str]]] = None,
|
1911
|
+
show_progress: bool = True,
|
1912
|
+
**kwargs,
|
1913
|
+
) -> "ElementCollection":
|
1914
|
+
"""Apply OCR to every element in the collection.
|
1915
|
+
|
1916
|
+
This is a convenience wrapper that simply iterates over the collection
|
1917
|
+
and calls ``el.apply_ocr(...)`` on each item.
|
1918
|
+
|
1919
|
+
Two modes are supported depending on the arguments provided:
|
1920
|
+
|
1921
|
+
1. **Built-in OCR engines** – pass parameters like ``engine='easyocr'``
|
1922
|
+
or ``languages=['en']`` and each element delegates to the global
|
1923
|
+
OCRManager.
|
1924
|
+
2. **Custom function** – pass a *callable* via the ``function`` keyword
|
1925
|
+
(alias ``ocr_function`` also recognised). The callable will receive
|
1926
|
+
the element/region and must return the recognised text (or ``None``).
|
1927
|
+
Internally this is forwarded through the element's own
|
1928
|
+
:py:meth:`apply_ocr` implementation, so the behaviour mirrors the
|
1929
|
+
single-element API.
|
1930
|
+
|
1931
|
+
Parameters
|
1932
|
+
----------
|
1933
|
+
function : callable, optional
|
1934
|
+
Custom OCR function to use instead of the built-in engines.
|
1935
|
+
show_progress : bool, default True
|
1936
|
+
Display a tqdm progress bar while processing.
|
1937
|
+
**kwargs
|
1938
|
+
Additional parameters forwarded to each element's ``apply_ocr``.
|
1939
|
+
|
1940
|
+
Returns
|
1941
|
+
-------
|
1942
|
+
ElementCollection
|
1943
|
+
*Self* for fluent chaining.
|
1944
|
+
"""
|
1945
|
+
# Alias for backward-compatibility
|
1946
|
+
if function is None and "ocr_function" in kwargs:
|
1947
|
+
function = kwargs.pop("ocr_function")
|
1948
|
+
|
1949
|
+
def _process(el):
|
1950
|
+
if hasattr(el, "apply_ocr"):
|
1951
|
+
if function is not None:
|
1952
|
+
return el.apply_ocr(function=function, **kwargs)
|
1953
|
+
else:
|
1954
|
+
return el.apply_ocr(**kwargs)
|
1955
|
+
else:
|
1956
|
+
logger.warning(
|
1957
|
+
f"Element of type {type(el).__name__} does not support apply_ocr. Skipping."
|
1958
|
+
)
|
1959
|
+
return el
|
1960
|
+
|
1961
|
+
# Use collection's apply helper for optional progress bar
|
1962
|
+
self.apply(_process, show_progress=show_progress)
|
1963
|
+
return self
|
1904
1964
|
|
1965
|
+
# ------------------------------------------------------------------
|
1905
1966
|
|
1906
1967
|
|
1907
1968
|
class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
natural_pdf/elements/region.py
CHANGED
@@ -1319,6 +1319,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1319
1319
|
table_settings.setdefault("vertical_strategy", "lines")
|
1320
1320
|
table_settings.setdefault("horizontal_strategy", "lines")
|
1321
1321
|
|
1322
|
+
# -------------------------------------------------------------
|
1323
|
+
# Auto-inject tolerances when text-based strategies are requested.
|
1324
|
+
# This must happen AFTER alias handling (so strategies are final)
|
1325
|
+
# and BEFORE we delegate to _extract_table_* helpers.
|
1326
|
+
# -------------------------------------------------------------
|
1327
|
+
if "text" in (table_settings.get("vertical_strategy"), table_settings.get("horizontal_strategy")):
|
1328
|
+
page_cfg = getattr(self.page, "_config", {})
|
1329
|
+
# Ensure text_* tolerances passed to pdfplumber
|
1330
|
+
if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
|
1331
|
+
if page_cfg.get("x_tolerance") is not None:
|
1332
|
+
table_settings["text_x_tolerance"] = page_cfg["x_tolerance"]
|
1333
|
+
if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
|
1334
|
+
if page_cfg.get("y_tolerance") is not None:
|
1335
|
+
table_settings["text_y_tolerance"] = page_cfg["y_tolerance"]
|
1336
|
+
|
1337
|
+
# Snap / join tolerances (~ line spacing)
|
1338
|
+
if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
|
1339
|
+
snap = max(1, round((page_cfg.get("y_tolerance", 1)) * 0.9))
|
1340
|
+
table_settings["snap_tolerance"] = snap
|
1341
|
+
if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
|
1342
|
+
table_settings["join_tolerance"] = table_settings["snap_tolerance"]
|
1343
|
+
|
1322
1344
|
logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
|
1323
1345
|
|
1324
1346
|
# Use the selected method
|
@@ -1438,6 +1460,30 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1438
1460
|
Returns:
|
1439
1461
|
List of tables, where each table is a list of rows, and each row is a list of cell values
|
1440
1462
|
"""
|
1463
|
+
# Inject global PDF-level text tolerances if not explicitly present
|
1464
|
+
pdf_cfg = getattr(self.page, "_config", getattr(self.page._parent, "_config", {}))
|
1465
|
+
_uses_text = "text" in (
|
1466
|
+
table_settings.get("vertical_strategy"),
|
1467
|
+
table_settings.get("horizontal_strategy"),
|
1468
|
+
)
|
1469
|
+
if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
|
1470
|
+
x_tol = pdf_cfg.get("x_tolerance")
|
1471
|
+
if x_tol is not None:
|
1472
|
+
table_settings.setdefault("text_x_tolerance", x_tol)
|
1473
|
+
if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
|
1474
|
+
y_tol = pdf_cfg.get("y_tolerance")
|
1475
|
+
if y_tol is not None:
|
1476
|
+
table_settings.setdefault("text_y_tolerance", y_tol)
|
1477
|
+
|
1478
|
+
if _uses_text and "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
|
1479
|
+
snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
|
1480
|
+
table_settings.setdefault("snap_tolerance", snap)
|
1481
|
+
if _uses_text and "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
|
1482
|
+
join = table_settings.get("snap_tolerance", 1)
|
1483
|
+
table_settings.setdefault("join_tolerance", join)
|
1484
|
+
table_settings.setdefault("join_x_tolerance", join)
|
1485
|
+
table_settings.setdefault("join_y_tolerance", join)
|
1486
|
+
|
1441
1487
|
# Create a crop of the page for this region
|
1442
1488
|
cropped = self.page._page.crop(self.bbox)
|
1443
1489
|
|
@@ -1458,6 +1504,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1458
1504
|
Returns:
|
1459
1505
|
Table data as a list of rows, where each row is a list of cell values
|
1460
1506
|
"""
|
1507
|
+
# Inject global PDF-level text tolerances if not explicitly present
|
1508
|
+
pdf_cfg = getattr(self.page, "_config", getattr(self.page._parent, "_config", {}))
|
1509
|
+
_uses_text = "text" in (
|
1510
|
+
table_settings.get("vertical_strategy"),
|
1511
|
+
table_settings.get("horizontal_strategy"),
|
1512
|
+
)
|
1513
|
+
if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
|
1514
|
+
x_tol = pdf_cfg.get("x_tolerance")
|
1515
|
+
if x_tol is not None:
|
1516
|
+
table_settings.setdefault("text_x_tolerance", x_tol)
|
1517
|
+
if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
|
1518
|
+
y_tol = pdf_cfg.get("y_tolerance")
|
1519
|
+
if y_tol is not None:
|
1520
|
+
table_settings.setdefault("text_y_tolerance", y_tol)
|
1521
|
+
|
1461
1522
|
# Create a crop of the page for this region
|
1462
1523
|
cropped = self.page._page.crop(self.bbox)
|
1463
1524
|
|
@@ -1943,21 +2004,45 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1943
2004
|
"""
|
1944
2005
|
Apply OCR to this region and return the created text elements.
|
1945
2006
|
|
2007
|
+
This method supports two modes:
|
2008
|
+
1. **Built-in OCR Engines** (default) – identical to previous behaviour. Pass typical
|
2009
|
+
parameters like ``engine='easyocr'`` or ``languages=['en']`` and the method will
|
2010
|
+
route the request through :class:`OCRManager`.
|
2011
|
+
2. **Custom OCR Function** – pass a *callable* under the keyword ``function`` (or
|
2012
|
+
``ocr_function``). The callable will receive *this* Region instance and should
|
2013
|
+
return the extracted text (``str``) or ``None``. Internally the call is
|
2014
|
+
delegated to :pymeth:`apply_custom_ocr` so the same logic (replacement, element
|
2015
|
+
creation, etc.) is re-used.
|
2016
|
+
|
2017
|
+
Examples
|
2018
|
+
---------
|
2019
|
+
>>> def llm_ocr(region):
|
2020
|
+
... image = region.to_image(resolution=300, crop=True)
|
2021
|
+
... return my_llm_client.ocr(image)
|
2022
|
+
>>> region.apply_ocr(function=llm_ocr)
|
2023
|
+
|
1946
2024
|
Args:
|
1947
|
-
replace:
|
1948
|
-
|
1949
|
-
|
1950
|
-
|
1951
|
-
|
1952
|
-
|
1953
|
-
|
1954
|
-
|
1955
|
-
|
1956
|
-
|
2025
|
+
replace: Whether to remove existing OCR elements first (default ``True``).
|
2026
|
+
**ocr_params: Parameters for the built-in OCR manager *or* the special
|
2027
|
+
``function``/``ocr_function`` keyword to trigger custom mode.
|
2028
|
+
|
2029
|
+
Returns
|
2030
|
+
-------
|
2031
|
+
Self – for chaining.
|
2032
|
+
"""
|
2033
|
+
# --- Custom OCR function path --------------------------------------------------
|
2034
|
+
custom_func = ocr_params.pop("function", None) or ocr_params.pop("ocr_function", None)
|
2035
|
+
if callable(custom_func):
|
2036
|
+
# Delegate to the specialised helper while preserving key kwargs
|
2037
|
+
return self.apply_custom_ocr(
|
2038
|
+
ocr_function=custom_func,
|
2039
|
+
source_label=ocr_params.pop("source_label", "custom-ocr"),
|
2040
|
+
replace=replace,
|
2041
|
+
confidence=ocr_params.pop("confidence", None),
|
2042
|
+
add_to_page=ocr_params.pop("add_to_page", True),
|
2043
|
+
)
|
1957
2044
|
|
1958
|
-
|
1959
|
-
Self for method chaining.
|
1960
|
-
"""
|
2045
|
+
# --- Original built-in OCR engine path (unchanged except docstring) ------------
|
1961
2046
|
# Ensure OCRManager is available
|
1962
2047
|
if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
|
1963
2048
|
logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
|
@@ -2123,6 +2208,133 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2123
2208
|
logger.info(f"Region {self.bbox}: Added {len(created_elements)} elements from OCR.")
|
2124
2209
|
return self
|
2125
2210
|
|
2211
|
+
def apply_custom_ocr(
|
2212
|
+
self,
|
2213
|
+
ocr_function: Callable[["Region"], Optional[str]],
|
2214
|
+
source_label: str = "custom-ocr",
|
2215
|
+
replace: bool = True,
|
2216
|
+
confidence: Optional[float] = None,
|
2217
|
+
add_to_page: bool = True,
|
2218
|
+
) -> "Region":
|
2219
|
+
"""
|
2220
|
+
Apply a custom OCR function to this region and create text elements from the results.
|
2221
|
+
|
2222
|
+
This is useful when you want to use a custom OCR method (e.g., an LLM API,
|
2223
|
+
specialized OCR service, or any custom logic) instead of the built-in OCR engines.
|
2224
|
+
|
2225
|
+
Args:
|
2226
|
+
ocr_function: A callable that takes a Region and returns the OCR'd text (or None).
|
2227
|
+
The function receives this region as its argument and should return
|
2228
|
+
the extracted text as a string, or None if no text was found.
|
2229
|
+
source_label: Label to identify the source of these text elements (default: "custom-ocr").
|
2230
|
+
This will be set as the 'source' attribute on created elements.
|
2231
|
+
replace: If True (default), removes existing OCR elements in this region before
|
2232
|
+
adding new ones. If False, adds new OCR elements alongside existing ones.
|
2233
|
+
confidence: Optional confidence score for the OCR result (0.0-1.0).
|
2234
|
+
If None, defaults to 1.0 if text is returned, 0.0 if None is returned.
|
2235
|
+
add_to_page: If True (default), adds the created text element to the page.
|
2236
|
+
If False, creates the element but doesn't add it to the page.
|
2237
|
+
|
2238
|
+
Returns:
|
2239
|
+
Self for method chaining.
|
2240
|
+
|
2241
|
+
Example:
|
2242
|
+
# Using with an LLM
|
2243
|
+
def ocr_with_llm(region):
|
2244
|
+
image = region.to_image(resolution=300, crop=True)
|
2245
|
+
# Call your LLM API here
|
2246
|
+
return llm_client.ocr(image)
|
2247
|
+
|
2248
|
+
region.apply_custom_ocr(ocr_with_llm)
|
2249
|
+
|
2250
|
+
# Using with a custom OCR service
|
2251
|
+
def ocr_with_service(region):
|
2252
|
+
img_bytes = region.to_image(crop=True).tobytes()
|
2253
|
+
response = ocr_service.process(img_bytes)
|
2254
|
+
return response.text
|
2255
|
+
|
2256
|
+
region.apply_custom_ocr(ocr_with_service, source_label="my-ocr-service")
|
2257
|
+
"""
|
2258
|
+
# If replace is True, remove existing OCR elements in this region
|
2259
|
+
if replace:
|
2260
|
+
logger.info(
|
2261
|
+
f"Region {self.bbox}: Removing existing OCR elements before applying custom OCR."
|
2262
|
+
)
|
2263
|
+
|
2264
|
+
removed_count = 0
|
2265
|
+
|
2266
|
+
# Helper to remove a single element safely
|
2267
|
+
def _safe_remove(elem):
|
2268
|
+
nonlocal removed_count
|
2269
|
+
success = False
|
2270
|
+
if hasattr(elem, "page") and hasattr(elem.page, "_element_mgr"):
|
2271
|
+
etype = getattr(elem, "object_type", "word")
|
2272
|
+
if etype == "word":
|
2273
|
+
etype_key = "words"
|
2274
|
+
elif etype == "char":
|
2275
|
+
etype_key = "chars"
|
2276
|
+
else:
|
2277
|
+
etype_key = etype + "s" if not etype.endswith("s") else etype
|
2278
|
+
try:
|
2279
|
+
success = elem.page._element_mgr.remove_element(elem, etype_key)
|
2280
|
+
except Exception:
|
2281
|
+
success = False
|
2282
|
+
if success:
|
2283
|
+
removed_count += 1
|
2284
|
+
|
2285
|
+
# Remove OCR elements overlapping this region
|
2286
|
+
for word in list(self.page._element_mgr.words):
|
2287
|
+
if getattr(word, "source", "").startswith("ocr") and self.intersects(word):
|
2288
|
+
_safe_remove(word)
|
2289
|
+
|
2290
|
+
# Also check custom-ocr sources
|
2291
|
+
for word in list(self.page._element_mgr.words):
|
2292
|
+
if getattr(word, "source", "") == source_label and self.intersects(word):
|
2293
|
+
_safe_remove(word)
|
2294
|
+
|
2295
|
+
if removed_count > 0:
|
2296
|
+
logger.info(
|
2297
|
+
f"Region {self.bbox}: Removed {removed_count} existing OCR elements."
|
2298
|
+
)
|
2299
|
+
|
2300
|
+
# Call the custom OCR function
|
2301
|
+
try:
|
2302
|
+
logger.debug(f"Region {self.bbox}: Calling custom OCR function...")
|
2303
|
+
ocr_text = ocr_function(self)
|
2304
|
+
|
2305
|
+
if ocr_text is not None and not isinstance(ocr_text, str):
|
2306
|
+
logger.warning(
|
2307
|
+
f"Custom OCR function returned non-string type ({type(ocr_text)}). "
|
2308
|
+
f"Converting to string."
|
2309
|
+
)
|
2310
|
+
ocr_text = str(ocr_text)
|
2311
|
+
|
2312
|
+
except Exception as e:
|
2313
|
+
logger.error(
|
2314
|
+
f"Error calling custom OCR function for region {self.bbox}: {e}",
|
2315
|
+
exc_info=True
|
2316
|
+
)
|
2317
|
+
return self
|
2318
|
+
|
2319
|
+
# Create text element if we got text
|
2320
|
+
if ocr_text is not None:
|
2321
|
+
# Use the to_text_element method to create the element
|
2322
|
+
text_element = self.to_text_element(
|
2323
|
+
text_content=ocr_text,
|
2324
|
+
source_label=source_label,
|
2325
|
+
confidence=confidence,
|
2326
|
+
add_to_page=add_to_page
|
2327
|
+
)
|
2328
|
+
|
2329
|
+
logger.info(
|
2330
|
+
f"Region {self.bbox}: Created text element with {len(ocr_text)} chars"
|
2331
|
+
f"{' and added to page' if add_to_page else ''}"
|
2332
|
+
)
|
2333
|
+
else:
|
2334
|
+
logger.debug(f"Region {self.bbox}: Custom OCR function returned None (no text found)")
|
2335
|
+
|
2336
|
+
return self
|
2337
|
+
|
2126
2338
|
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
|
2127
2339
|
"""
|
2128
2340
|
Get a section between two elements within this region.
|
@@ -2917,6 +3129,33 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2917
3129
|
if not hasattr(self, "page") or self.page is None:
|
2918
3130
|
raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
|
2919
3131
|
|
3132
|
+
# Create character dictionaries for the text
|
3133
|
+
char_dicts = []
|
3134
|
+
if actual_text:
|
3135
|
+
# Create a single character dict that spans the entire region
|
3136
|
+
# This is a simplified approach - OCR engines typically create one per character
|
3137
|
+
char_dict = {
|
3138
|
+
"text": actual_text,
|
3139
|
+
"x0": self.x0,
|
3140
|
+
"top": self.top,
|
3141
|
+
"x1": self.x1,
|
3142
|
+
"bottom": self.bottom,
|
3143
|
+
"width": self.width,
|
3144
|
+
"height": self.height,
|
3145
|
+
"object_type": "char",
|
3146
|
+
"page_number": self.page.page_number,
|
3147
|
+
"fontname": default_font_name,
|
3148
|
+
"size": default_font_size,
|
3149
|
+
"upright": True,
|
3150
|
+
"direction": 1,
|
3151
|
+
"adv": self.width,
|
3152
|
+
"source": source_label,
|
3153
|
+
"confidence": final_confidence,
|
3154
|
+
"stroking_color": (0, 0, 0),
|
3155
|
+
"non_stroking_color": (0, 0, 0),
|
3156
|
+
}
|
3157
|
+
char_dicts.append(char_dict)
|
3158
|
+
|
2920
3159
|
elem_data = {
|
2921
3160
|
"text": actual_text,
|
2922
3161
|
"x0": self.x0,
|
@@ -2936,7 +3175,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2936
3175
|
"adv": self.width,
|
2937
3176
|
"source": source_label,
|
2938
3177
|
"confidence": final_confidence,
|
2939
|
-
"_char_dicts":
|
3178
|
+
"_char_dicts": char_dicts,
|
2940
3179
|
}
|
2941
3180
|
text_element = TextElement(elem_data, self.page)
|
2942
3181
|
|
@@ -2952,6 +3191,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2952
3191
|
logger.debug(
|
2953
3192
|
f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}."
|
2954
3193
|
)
|
3194
|
+
# Also add character dictionaries to the chars collection
|
3195
|
+
if char_dicts and object_type == "word":
|
3196
|
+
for char_dict in char_dicts:
|
3197
|
+
self.page._element_mgr.add_element(char_dict, element_type="chars")
|
2955
3198
|
else:
|
2956
3199
|
page_num_str = (
|
2957
3200
|
str(self.page.page_number) if hasattr(self.page, "page_number") else "N/A"
|
@@ -1,11 +1,8 @@
|
|
1
|
-
bad_pdf_analysis/analyze_10_more.py,sha256=UjsTuHE1GUMoVjkX3afy3x6DfpXyfZXHgS2W1GQqUmw,11906
|
2
|
-
bad_pdf_analysis/analyze_final_10.py,sha256=xYkIId0nF9LpWHRLDP1_nlJfJfC0b0Tu4mLu-3mim-0,25170
|
3
|
-
bad_pdf_analysis/analyze_specific_pages.py,sha256=wzq3_ZWR28hFdT7GEkayHPYgsk20OpD476LYmy2rAEk,13725
|
4
|
-
bad_pdf_analysis/analyze_specific_pages_direct.py,sha256=307gSNplwOtNTR9a0lEQWxlAKGeoZIcDe5z1pROKUXY,14846
|
5
1
|
natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
|
6
2
|
natural_pdf/cli.py,sha256=IXrP2lCHihr-ed-CFiDbMTnSsutQa1j1PYALOLGbpsc,4019
|
7
|
-
natural_pdf/analyzers/__init__.py,sha256=
|
8
|
-
natural_pdf/analyzers/
|
3
|
+
natural_pdf/analyzers/__init__.py,sha256=MQRctn4i5Q7u8pb8vQVHKEXUiVGpKyPZUECrlDH4AuU,673
|
4
|
+
natural_pdf/analyzers/guides.py,sha256=tzyViSBDdM66mT0niwFTDIJ16UzRCZ18Iqv8wA5DYAk,90302
|
5
|
+
natural_pdf/analyzers/shape_detection_mixin.py,sha256=q7gDM-z2t7bSTxjfV2aaW3533CySu1qsEpu4wb5Rp-I,62688
|
9
6
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
10
7
|
natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
|
11
8
|
natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
|
@@ -28,10 +25,10 @@ natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZH
|
|
28
25
|
natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
|
29
26
|
natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
|
30
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
31
|
-
natural_pdf/core/element_manager.py,sha256=
|
28
|
+
natural_pdf/core/element_manager.py,sha256=A6GJk9kwTzt-aSz4-SWaRHLZRbIMFFLce3CpxSyfkV4,51749
|
32
29
|
natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
|
33
|
-
natural_pdf/core/page.py,sha256=
|
34
|
-
natural_pdf/core/pdf.py,sha256=
|
30
|
+
natural_pdf/core/page.py,sha256=843_Fyk1gxZ8nqERJjjjoRD3iM4pFJy9a0zQSyMthiQ,128476
|
31
|
+
natural_pdf/core/pdf.py,sha256=mC4GZjPXx_bK6RUlhLpnJnapkHDhbgJpgpcUJOvb7OE,75290
|
35
32
|
natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
|
36
33
|
natural_pdf/describe/base.py,sha256=HaWlHltb-dw6ug4mfR_iBLHWxr1OdPwLaUshXRxO7gg,18462
|
37
34
|
natural_pdf/describe/elements.py,sha256=COvKF3B_RbAxXl5ORJDubV4C5PsiuSfuzD0ufPIJTFM,12983
|
@@ -39,11 +36,11 @@ natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo
|
|
39
36
|
natural_pdf/describe/summary.py,sha256=7FIF3zF6bzNx-gx4pCJr2XQFKiVzOEDnWsAYQ_mr9L0,7982
|
40
37
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
41
38
|
natural_pdf/elements/base.py,sha256=VshU4RstdzONJFq_8UVIjT_lVOai0MwMFsSFrCN-IO8,47299
|
42
|
-
natural_pdf/elements/collections.py,sha256=
|
39
|
+
natural_pdf/elements/collections.py,sha256=1E2MSg2NNcEcoRM2rumrv_CqIdO7DgbRHYEtfw35FaQ,128457
|
43
40
|
natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI,1423
|
44
41
|
natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
|
45
42
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
46
|
-
natural_pdf/elements/region.py,sha256=
|
43
|
+
natural_pdf/elements/region.py,sha256=8SKhzCJ6sELZxJcM2i_58YhEKU6HBvaJ7Oj6E3bOsHw,139523
|
47
44
|
natural_pdf/elements/text.py,sha256=kw7u2KfHtDB905YawP7Hs89kcR8XnbtpkYQGEk6LNyk,18860
|
48
45
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
49
46
|
natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
|
@@ -100,13 +97,12 @@ natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6
|
|
100
97
|
natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
|
101
98
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
102
99
|
natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
|
103
|
-
natural_pdf-0.1.
|
100
|
+
natural_pdf-0.1.32.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
104
101
|
optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
|
105
102
|
optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
|
106
103
|
optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
|
107
104
|
optimization/test_cleanup_methods.py,sha256=B_zHiJr1hI8q-tdfBoFi0Jf5lj2PURjA_6teRBGoz8o,6277
|
108
105
|
optimization/test_memory_fix.py,sha256=CWc0OSvFfKE0-nxqJOi_HAQc0GXUPKzkQbTeJp5UqxU,6364
|
109
|
-
tools/rtl_smoke_test.py,sha256=-ogcbvNzumJasICP0NNQHk4Zb4M1VRx0TnGkJUQC7SM,3043
|
110
106
|
tools/bad_pdf_eval/__init__.py,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
|
111
107
|
tools/bad_pdf_eval/analyser.py,sha256=sR31aVVmTXRHS8uwLZXlPefTH2_lskxtAzuZwlhsyOo,13391
|
112
108
|
tools/bad_pdf_eval/collate_summaries.py,sha256=Mcmf1OvVn0S0efj5ypk0syXKSrfUf6L5dowoGvOTgjU,5047
|
@@ -115,8 +111,8 @@ tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrO
|
|
115
111
|
tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
|
116
112
|
tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
|
117
113
|
tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
|
118
|
-
natural_pdf-0.1.
|
119
|
-
natural_pdf-0.1.
|
120
|
-
natural_pdf-0.1.
|
121
|
-
natural_pdf-0.1.
|
122
|
-
natural_pdf-0.1.
|
114
|
+
natural_pdf-0.1.32.dist-info/METADATA,sha256=CMZIo2BjeLh-b9hezQHMLehZP8brUflCQ69dLtfFyxo,6711
|
115
|
+
natural_pdf-0.1.32.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
116
|
+
natural_pdf-0.1.32.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
117
|
+
natural_pdf-0.1.32.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
|
118
|
+
natural_pdf-0.1.32.dist-info/RECORD,,
|