natural-pdf 0.1.31__py3-none-any.whl → 0.1.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +18 -4
- natural_pdf/analyzers/guides.py +2176 -0
- natural_pdf/analyzers/shape_detection_mixin.py +0 -650
- natural_pdf/core/element_manager.py +99 -40
- natural_pdf/core/page.py +76 -3
- natural_pdf/core/pdf.py +38 -3
- natural_pdf/elements/collections.py +61 -0
- natural_pdf/elements/region.py +270 -14
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/RECORD +14 -18
- bad_pdf_analysis/analyze_10_more.py +0 -300
- bad_pdf_analysis/analyze_final_10.py +0 -552
- bad_pdf_analysis/analyze_specific_pages.py +0 -394
- bad_pdf_analysis/analyze_specific_pages_direct.py +0 -382
- tools/rtl_smoke_test.py +0 -80
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -1319,6 +1319,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1319
1319
|
table_settings.setdefault("vertical_strategy", "lines")
|
1320
1320
|
table_settings.setdefault("horizontal_strategy", "lines")
|
1321
1321
|
|
1322
|
+
# -------------------------------------------------------------
|
1323
|
+
# Auto-inject tolerances when text-based strategies are requested.
|
1324
|
+
# This must happen AFTER alias handling (so strategies are final)
|
1325
|
+
# and BEFORE we delegate to _extract_table_* helpers.
|
1326
|
+
# -------------------------------------------------------------
|
1327
|
+
if "text" in (table_settings.get("vertical_strategy"), table_settings.get("horizontal_strategy")):
|
1328
|
+
page_cfg = getattr(self.page, "_config", {})
|
1329
|
+
# Ensure text_* tolerances passed to pdfplumber
|
1330
|
+
if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
|
1331
|
+
if page_cfg.get("x_tolerance") is not None:
|
1332
|
+
table_settings["text_x_tolerance"] = page_cfg["x_tolerance"]
|
1333
|
+
if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
|
1334
|
+
if page_cfg.get("y_tolerance") is not None:
|
1335
|
+
table_settings["text_y_tolerance"] = page_cfg["y_tolerance"]
|
1336
|
+
|
1337
|
+
# Snap / join tolerances (~ line spacing)
|
1338
|
+
if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
|
1339
|
+
snap = max(1, round((page_cfg.get("y_tolerance", 1)) * 0.9))
|
1340
|
+
table_settings["snap_tolerance"] = snap
|
1341
|
+
if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
|
1342
|
+
table_settings["join_tolerance"] = table_settings["snap_tolerance"]
|
1343
|
+
|
1322
1344
|
logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
|
1323
1345
|
|
1324
1346
|
# Use the selected method
|
@@ -1438,6 +1460,30 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1438
1460
|
Returns:
|
1439
1461
|
List of tables, where each table is a list of rows, and each row is a list of cell values
|
1440
1462
|
"""
|
1463
|
+
# Inject global PDF-level text tolerances if not explicitly present
|
1464
|
+
pdf_cfg = getattr(self.page, "_config", getattr(self.page._parent, "_config", {}))
|
1465
|
+
_uses_text = "text" in (
|
1466
|
+
table_settings.get("vertical_strategy"),
|
1467
|
+
table_settings.get("horizontal_strategy"),
|
1468
|
+
)
|
1469
|
+
if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
|
1470
|
+
x_tol = pdf_cfg.get("x_tolerance")
|
1471
|
+
if x_tol is not None:
|
1472
|
+
table_settings.setdefault("text_x_tolerance", x_tol)
|
1473
|
+
if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
|
1474
|
+
y_tol = pdf_cfg.get("y_tolerance")
|
1475
|
+
if y_tol is not None:
|
1476
|
+
table_settings.setdefault("text_y_tolerance", y_tol)
|
1477
|
+
|
1478
|
+
if _uses_text and "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
|
1479
|
+
snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
|
1480
|
+
table_settings.setdefault("snap_tolerance", snap)
|
1481
|
+
if _uses_text and "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
|
1482
|
+
join = table_settings.get("snap_tolerance", 1)
|
1483
|
+
table_settings.setdefault("join_tolerance", join)
|
1484
|
+
table_settings.setdefault("join_x_tolerance", join)
|
1485
|
+
table_settings.setdefault("join_y_tolerance", join)
|
1486
|
+
|
1441
1487
|
# Create a crop of the page for this region
|
1442
1488
|
cropped = self.page._page.crop(self.bbox)
|
1443
1489
|
|
@@ -1458,6 +1504,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1458
1504
|
Returns:
|
1459
1505
|
Table data as a list of rows, where each row is a list of cell values
|
1460
1506
|
"""
|
1507
|
+
# Inject global PDF-level text tolerances if not explicitly present
|
1508
|
+
pdf_cfg = getattr(self.page, "_config", getattr(self.page._parent, "_config", {}))
|
1509
|
+
_uses_text = "text" in (
|
1510
|
+
table_settings.get("vertical_strategy"),
|
1511
|
+
table_settings.get("horizontal_strategy"),
|
1512
|
+
)
|
1513
|
+
if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
|
1514
|
+
x_tol = pdf_cfg.get("x_tolerance")
|
1515
|
+
if x_tol is not None:
|
1516
|
+
table_settings.setdefault("text_x_tolerance", x_tol)
|
1517
|
+
if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
|
1518
|
+
y_tol = pdf_cfg.get("y_tolerance")
|
1519
|
+
if y_tol is not None:
|
1520
|
+
table_settings.setdefault("text_y_tolerance", y_tol)
|
1521
|
+
|
1461
1522
|
# Create a crop of the page for this region
|
1462
1523
|
cropped = self.page._page.crop(self.bbox)
|
1463
1524
|
|
@@ -1943,21 +2004,45 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1943
2004
|
"""
|
1944
2005
|
Apply OCR to this region and return the created text elements.
|
1945
2006
|
|
2007
|
+
This method supports two modes:
|
2008
|
+
1. **Built-in OCR Engines** (default) – identical to previous behaviour. Pass typical
|
2009
|
+
parameters like ``engine='easyocr'`` or ``languages=['en']`` and the method will
|
2010
|
+
route the request through :class:`OCRManager`.
|
2011
|
+
2. **Custom OCR Function** – pass a *callable* under the keyword ``function`` (or
|
2012
|
+
``ocr_function``). The callable will receive *this* Region instance and should
|
2013
|
+
return the extracted text (``str``) or ``None``. Internally the call is
|
2014
|
+
delegated to :pymeth:`apply_custom_ocr` so the same logic (replacement, element
|
2015
|
+
creation, etc.) is re-used.
|
2016
|
+
|
2017
|
+
Examples
|
2018
|
+
---------
|
2019
|
+
>>> def llm_ocr(region):
|
2020
|
+
... image = region.to_image(resolution=300, crop=True)
|
2021
|
+
... return my_llm_client.ocr(image)
|
2022
|
+
>>> region.apply_ocr(function=llm_ocr)
|
2023
|
+
|
1946
2024
|
Args:
|
1947
|
-
replace:
|
1948
|
-
|
1949
|
-
|
1950
|
-
|
1951
|
-
|
1952
|
-
|
1953
|
-
|
1954
|
-
|
1955
|
-
|
1956
|
-
|
2025
|
+
replace: Whether to remove existing OCR elements first (default ``True``).
|
2026
|
+
**ocr_params: Parameters for the built-in OCR manager *or* the special
|
2027
|
+
``function``/``ocr_function`` keyword to trigger custom mode.
|
2028
|
+
|
2029
|
+
Returns
|
2030
|
+
-------
|
2031
|
+
Self – for chaining.
|
2032
|
+
"""
|
2033
|
+
# --- Custom OCR function path --------------------------------------------------
|
2034
|
+
custom_func = ocr_params.pop("function", None) or ocr_params.pop("ocr_function", None)
|
2035
|
+
if callable(custom_func):
|
2036
|
+
# Delegate to the specialised helper while preserving key kwargs
|
2037
|
+
return self.apply_custom_ocr(
|
2038
|
+
ocr_function=custom_func,
|
2039
|
+
source_label=ocr_params.pop("source_label", "custom-ocr"),
|
2040
|
+
replace=replace,
|
2041
|
+
confidence=ocr_params.pop("confidence", None),
|
2042
|
+
add_to_page=ocr_params.pop("add_to_page", True),
|
2043
|
+
)
|
1957
2044
|
|
1958
|
-
|
1959
|
-
Self for method chaining.
|
1960
|
-
"""
|
2045
|
+
# --- Original built-in OCR engine path (unchanged except docstring) ------------
|
1961
2046
|
# Ensure OCRManager is available
|
1962
2047
|
if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
|
1963
2048
|
logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
|
@@ -2123,6 +2208,146 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2123
2208
|
logger.info(f"Region {self.bbox}: Added {len(created_elements)} elements from OCR.")
|
2124
2209
|
return self
|
2125
2210
|
|
2211
|
+
def apply_custom_ocr(
|
2212
|
+
self,
|
2213
|
+
ocr_function: Callable[["Region"], Optional[str]],
|
2214
|
+
source_label: str = "custom-ocr",
|
2215
|
+
replace: bool = True,
|
2216
|
+
confidence: Optional[float] = None,
|
2217
|
+
add_to_page: bool = True,
|
2218
|
+
) -> "Region":
|
2219
|
+
"""
|
2220
|
+
Apply a custom OCR function to this region and create text elements from the results.
|
2221
|
+
|
2222
|
+
This is useful when you want to use a custom OCR method (e.g., an LLM API,
|
2223
|
+
specialized OCR service, or any custom logic) instead of the built-in OCR engines.
|
2224
|
+
|
2225
|
+
Args:
|
2226
|
+
ocr_function: A callable that takes a Region and returns the OCR'd text (or None).
|
2227
|
+
The function receives this region as its argument and should return
|
2228
|
+
the extracted text as a string, or None if no text was found.
|
2229
|
+
source_label: Label to identify the source of these text elements (default: "custom-ocr").
|
2230
|
+
This will be set as the 'source' attribute on created elements.
|
2231
|
+
replace: If True (default), removes existing OCR elements in this region before
|
2232
|
+
adding new ones. If False, adds new OCR elements alongside existing ones.
|
2233
|
+
confidence: Optional confidence score for the OCR result (0.0-1.0).
|
2234
|
+
If None, defaults to 1.0 if text is returned, 0.0 if None is returned.
|
2235
|
+
add_to_page: If True (default), adds the created text element to the page.
|
2236
|
+
If False, creates the element but doesn't add it to the page.
|
2237
|
+
|
2238
|
+
Returns:
|
2239
|
+
Self for method chaining.
|
2240
|
+
|
2241
|
+
Example:
|
2242
|
+
# Using with an LLM
|
2243
|
+
def ocr_with_llm(region):
|
2244
|
+
image = region.to_image(resolution=300, crop=True)
|
2245
|
+
# Call your LLM API here
|
2246
|
+
return llm_client.ocr(image)
|
2247
|
+
|
2248
|
+
region.apply_custom_ocr(ocr_with_llm)
|
2249
|
+
|
2250
|
+
# Using with a custom OCR service
|
2251
|
+
def ocr_with_service(region):
|
2252
|
+
img_bytes = region.to_image(crop=True).tobytes()
|
2253
|
+
response = ocr_service.process(img_bytes)
|
2254
|
+
return response.text
|
2255
|
+
|
2256
|
+
region.apply_custom_ocr(ocr_with_service, source_label="my-ocr-service")
|
2257
|
+
"""
|
2258
|
+
# If replace is True, remove existing OCR elements in this region
|
2259
|
+
if replace:
|
2260
|
+
logger.info(
|
2261
|
+
f"Region {self.bbox}: Removing existing OCR elements before applying custom OCR."
|
2262
|
+
)
|
2263
|
+
|
2264
|
+
removed_count = 0
|
2265
|
+
|
2266
|
+
# Helper to remove a single element safely
|
2267
|
+
def _safe_remove(elem):
|
2268
|
+
nonlocal removed_count
|
2269
|
+
success = False
|
2270
|
+
if hasattr(elem, "page") and hasattr(elem.page, "_element_mgr"):
|
2271
|
+
etype = getattr(elem, "object_type", "word")
|
2272
|
+
if etype == "word":
|
2273
|
+
etype_key = "words"
|
2274
|
+
elif etype == "char":
|
2275
|
+
etype_key = "chars"
|
2276
|
+
else:
|
2277
|
+
etype_key = etype + "s" if not etype.endswith("s") else etype
|
2278
|
+
try:
|
2279
|
+
success = elem.page._element_mgr.remove_element(elem, etype_key)
|
2280
|
+
except Exception:
|
2281
|
+
success = False
|
2282
|
+
if success:
|
2283
|
+
removed_count += 1
|
2284
|
+
|
2285
|
+
# Remove ALL OCR elements overlapping this region
|
2286
|
+
# Remove elements with source=="ocr" (built-in OCR) or matching the source_label (previous custom OCR)
|
2287
|
+
for word in list(self.page._element_mgr.words):
|
2288
|
+
word_source = getattr(word, "source", "")
|
2289
|
+
# Match built-in OCR behavior: remove elements with source "ocr" exactly
|
2290
|
+
# Also remove elements with the same source_label to avoid duplicates
|
2291
|
+
if (word_source == "ocr" or word_source == source_label) and self.intersects(word):
|
2292
|
+
_safe_remove(word)
|
2293
|
+
|
2294
|
+
# Also remove char dicts if needed (matching built-in OCR)
|
2295
|
+
for char in list(self.page._element_mgr.chars):
|
2296
|
+
# char can be dict or TextElement; normalize
|
2297
|
+
char_src = char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
|
2298
|
+
if char_src == "ocr" or char_src == source_label:
|
2299
|
+
# Rough bbox for dicts
|
2300
|
+
if isinstance(char, dict):
|
2301
|
+
cx0, ctop, cx1, cbottom = char.get("x0", 0), char.get("top", 0), char.get("x1", 0), char.get("bottom", 0)
|
2302
|
+
else:
|
2303
|
+
cx0, ctop, cx1, cbottom = char.x0, char.top, char.x1, char.bottom
|
2304
|
+
# Quick overlap check
|
2305
|
+
if not (cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom):
|
2306
|
+
_safe_remove(char)
|
2307
|
+
|
2308
|
+
if removed_count > 0:
|
2309
|
+
logger.info(
|
2310
|
+
f"Region {self.bbox}: Removed {removed_count} existing OCR elements."
|
2311
|
+
)
|
2312
|
+
|
2313
|
+
# Call the custom OCR function
|
2314
|
+
try:
|
2315
|
+
logger.debug(f"Region {self.bbox}: Calling custom OCR function...")
|
2316
|
+
ocr_text = ocr_function(self)
|
2317
|
+
|
2318
|
+
if ocr_text is not None and not isinstance(ocr_text, str):
|
2319
|
+
logger.warning(
|
2320
|
+
f"Custom OCR function returned non-string type ({type(ocr_text)}). "
|
2321
|
+
f"Converting to string."
|
2322
|
+
)
|
2323
|
+
ocr_text = str(ocr_text)
|
2324
|
+
|
2325
|
+
except Exception as e:
|
2326
|
+
logger.error(
|
2327
|
+
f"Error calling custom OCR function for region {self.bbox}: {e}",
|
2328
|
+
exc_info=True
|
2329
|
+
)
|
2330
|
+
return self
|
2331
|
+
|
2332
|
+
# Create text element if we got text
|
2333
|
+
if ocr_text is not None:
|
2334
|
+
# Use the to_text_element method to create the element
|
2335
|
+
text_element = self.to_text_element(
|
2336
|
+
text_content=ocr_text,
|
2337
|
+
source_label=source_label,
|
2338
|
+
confidence=confidence,
|
2339
|
+
add_to_page=add_to_page
|
2340
|
+
)
|
2341
|
+
|
2342
|
+
logger.info(
|
2343
|
+
f"Region {self.bbox}: Created text element with {len(ocr_text)} chars"
|
2344
|
+
f"{' and added to page' if add_to_page else ''}"
|
2345
|
+
)
|
2346
|
+
else:
|
2347
|
+
logger.debug(f"Region {self.bbox}: Custom OCR function returned None (no text found)")
|
2348
|
+
|
2349
|
+
return self
|
2350
|
+
|
2126
2351
|
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
|
2127
2352
|
"""
|
2128
2353
|
Get a section between two elements within this region.
|
@@ -2917,6 +3142,33 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2917
3142
|
if not hasattr(self, "page") or self.page is None:
|
2918
3143
|
raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
|
2919
3144
|
|
3145
|
+
# Create character dictionaries for the text
|
3146
|
+
char_dicts = []
|
3147
|
+
if actual_text:
|
3148
|
+
# Create a single character dict that spans the entire region
|
3149
|
+
# This is a simplified approach - OCR engines typically create one per character
|
3150
|
+
char_dict = {
|
3151
|
+
"text": actual_text,
|
3152
|
+
"x0": self.x0,
|
3153
|
+
"top": self.top,
|
3154
|
+
"x1": self.x1,
|
3155
|
+
"bottom": self.bottom,
|
3156
|
+
"width": self.width,
|
3157
|
+
"height": self.height,
|
3158
|
+
"object_type": "char",
|
3159
|
+
"page_number": self.page.page_number,
|
3160
|
+
"fontname": default_font_name,
|
3161
|
+
"size": default_font_size,
|
3162
|
+
"upright": True,
|
3163
|
+
"direction": 1,
|
3164
|
+
"adv": self.width,
|
3165
|
+
"source": source_label,
|
3166
|
+
"confidence": final_confidence,
|
3167
|
+
"stroking_color": (0, 0, 0),
|
3168
|
+
"non_stroking_color": (0, 0, 0),
|
3169
|
+
}
|
3170
|
+
char_dicts.append(char_dict)
|
3171
|
+
|
2920
3172
|
elem_data = {
|
2921
3173
|
"text": actual_text,
|
2922
3174
|
"x0": self.x0,
|
@@ -2936,7 +3188,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2936
3188
|
"adv": self.width,
|
2937
3189
|
"source": source_label,
|
2938
3190
|
"confidence": final_confidence,
|
2939
|
-
"_char_dicts":
|
3191
|
+
"_char_dicts": char_dicts,
|
2940
3192
|
}
|
2941
3193
|
text_element = TextElement(elem_data, self.page)
|
2942
3194
|
|
@@ -2952,6 +3204,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2952
3204
|
logger.debug(
|
2953
3205
|
f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}."
|
2954
3206
|
)
|
3207
|
+
# Also add character dictionaries to the chars collection
|
3208
|
+
if char_dicts and object_type == "word":
|
3209
|
+
for char_dict in char_dicts:
|
3210
|
+
self.page._element_mgr.add_element(char_dict, element_type="chars")
|
2955
3211
|
else:
|
2956
3212
|
page_num_str = (
|
2957
3213
|
str(self.page.page_number) if hasattr(self.page, "page_number") else "N/A"
|
@@ -1,11 +1,8 @@
|
|
1
|
-
bad_pdf_analysis/analyze_10_more.py,sha256=UjsTuHE1GUMoVjkX3afy3x6DfpXyfZXHgS2W1GQqUmw,11906
|
2
|
-
bad_pdf_analysis/analyze_final_10.py,sha256=xYkIId0nF9LpWHRLDP1_nlJfJfC0b0Tu4mLu-3mim-0,25170
|
3
|
-
bad_pdf_analysis/analyze_specific_pages.py,sha256=wzq3_ZWR28hFdT7GEkayHPYgsk20OpD476LYmy2rAEk,13725
|
4
|
-
bad_pdf_analysis/analyze_specific_pages_direct.py,sha256=307gSNplwOtNTR9a0lEQWxlAKGeoZIcDe5z1pROKUXY,14846
|
5
1
|
natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
|
6
2
|
natural_pdf/cli.py,sha256=IXrP2lCHihr-ed-CFiDbMTnSsutQa1j1PYALOLGbpsc,4019
|
7
|
-
natural_pdf/analyzers/__init__.py,sha256=
|
8
|
-
natural_pdf/analyzers/
|
3
|
+
natural_pdf/analyzers/__init__.py,sha256=MQRctn4i5Q7u8pb8vQVHKEXUiVGpKyPZUECrlDH4AuU,673
|
4
|
+
natural_pdf/analyzers/guides.py,sha256=tzyViSBDdM66mT0niwFTDIJ16UzRCZ18Iqv8wA5DYAk,90302
|
5
|
+
natural_pdf/analyzers/shape_detection_mixin.py,sha256=q7gDM-z2t7bSTxjfV2aaW3533CySu1qsEpu4wb5Rp-I,62688
|
9
6
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
10
7
|
natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
|
11
8
|
natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
|
@@ -28,10 +25,10 @@ natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZH
|
|
28
25
|
natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
|
29
26
|
natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
|
30
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
31
|
-
natural_pdf/core/element_manager.py,sha256=
|
28
|
+
natural_pdf/core/element_manager.py,sha256=DbRzAKD3to5NpKc73Q-TXZIZkhx8zZtbi_UNu5K7AAU,52766
|
32
29
|
natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
|
33
|
-
natural_pdf/core/page.py,sha256=
|
34
|
-
natural_pdf/core/pdf.py,sha256=
|
30
|
+
natural_pdf/core/page.py,sha256=k4jezvsLqL07Raglc-rZmMnsVwBMo_A_OerklpBIejY,129477
|
31
|
+
natural_pdf/core/pdf.py,sha256=u0ZCPuIijNecU-AJHLvqfAYVCr9h7MgUKnlEtH6RoZI,75969
|
35
32
|
natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
|
36
33
|
natural_pdf/describe/base.py,sha256=HaWlHltb-dw6ug4mfR_iBLHWxr1OdPwLaUshXRxO7gg,18462
|
37
34
|
natural_pdf/describe/elements.py,sha256=COvKF3B_RbAxXl5ORJDubV4C5PsiuSfuzD0ufPIJTFM,12983
|
@@ -39,11 +36,11 @@ natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo
|
|
39
36
|
natural_pdf/describe/summary.py,sha256=7FIF3zF6bzNx-gx4pCJr2XQFKiVzOEDnWsAYQ_mr9L0,7982
|
40
37
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
41
38
|
natural_pdf/elements/base.py,sha256=VshU4RstdzONJFq_8UVIjT_lVOai0MwMFsSFrCN-IO8,47299
|
42
|
-
natural_pdf/elements/collections.py,sha256=
|
39
|
+
natural_pdf/elements/collections.py,sha256=1E2MSg2NNcEcoRM2rumrv_CqIdO7DgbRHYEtfw35FaQ,128457
|
43
40
|
natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI,1423
|
44
41
|
natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
|
45
42
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
46
|
-
natural_pdf/elements/region.py,sha256=
|
43
|
+
natural_pdf/elements/region.py,sha256=23J5Tv7ffAgz3IBgDXPq9Ab_lLg2Sog7elFRb6nvvZE,140541
|
47
44
|
natural_pdf/elements/text.py,sha256=kw7u2KfHtDB905YawP7Hs89kcR8XnbtpkYQGEk6LNyk,18860
|
48
45
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
49
46
|
natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
|
@@ -100,13 +97,12 @@ natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6
|
|
100
97
|
natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
|
101
98
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
102
99
|
natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
|
103
|
-
natural_pdf-0.1.
|
100
|
+
natural_pdf-0.1.33.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
104
101
|
optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
|
105
102
|
optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
|
106
103
|
optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
|
107
104
|
optimization/test_cleanup_methods.py,sha256=B_zHiJr1hI8q-tdfBoFi0Jf5lj2PURjA_6teRBGoz8o,6277
|
108
105
|
optimization/test_memory_fix.py,sha256=CWc0OSvFfKE0-nxqJOi_HAQc0GXUPKzkQbTeJp5UqxU,6364
|
109
|
-
tools/rtl_smoke_test.py,sha256=-ogcbvNzumJasICP0NNQHk4Zb4M1VRx0TnGkJUQC7SM,3043
|
110
106
|
tools/bad_pdf_eval/__init__.py,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
|
111
107
|
tools/bad_pdf_eval/analyser.py,sha256=sR31aVVmTXRHS8uwLZXlPefTH2_lskxtAzuZwlhsyOo,13391
|
112
108
|
tools/bad_pdf_eval/collate_summaries.py,sha256=Mcmf1OvVn0S0efj5ypk0syXKSrfUf6L5dowoGvOTgjU,5047
|
@@ -115,8 +111,8 @@ tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrO
|
|
115
111
|
tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
|
116
112
|
tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
|
117
113
|
tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
|
118
|
-
natural_pdf-0.1.
|
119
|
-
natural_pdf-0.1.
|
120
|
-
natural_pdf-0.1.
|
121
|
-
natural_pdf-0.1.
|
122
|
-
natural_pdf-0.1.
|
114
|
+
natural_pdf-0.1.33.dist-info/METADATA,sha256=mSAwh3vuD9aRvO_AC_XBZG5sw9SeiuidC86a7kuV--I,6711
|
115
|
+
natural_pdf-0.1.33.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
116
|
+
natural_pdf-0.1.33.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
117
|
+
natural_pdf-0.1.33.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
|
118
|
+
natural_pdf-0.1.33.dist-info/RECORD,,
|