natural-pdf 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +18 -4
- natural_pdf/analyzers/guides.py +2176 -0
- natural_pdf/analyzers/shape_detection_mixin.py +0 -650
- natural_pdf/core/element_manager.py +86 -27
- natural_pdf/core/page.py +49 -1
- natural_pdf/core/pdf.py +22 -0
- natural_pdf/elements/collections.py +61 -0
- natural_pdf/elements/region.py +257 -14
- natural_pdf/elements/text.py +29 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/RECORD +15 -19
- bad_pdf_analysis/analyze_10_more.py +0 -300
- bad_pdf_analysis/analyze_final_10.py +0 -552
- bad_pdf_analysis/analyze_specific_pages.py +0 -394
- bad_pdf_analysis/analyze_specific_pages_direct.py +0 -382
- tools/rtl_smoke_test.py +0 -80
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -1319,6 +1319,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1319
1319
|
table_settings.setdefault("vertical_strategy", "lines")
|
1320
1320
|
table_settings.setdefault("horizontal_strategy", "lines")
|
1321
1321
|
|
1322
|
+
# -------------------------------------------------------------
|
1323
|
+
# Auto-inject tolerances when text-based strategies are requested.
|
1324
|
+
# This must happen AFTER alias handling (so strategies are final)
|
1325
|
+
# and BEFORE we delegate to _extract_table_* helpers.
|
1326
|
+
# -------------------------------------------------------------
|
1327
|
+
if "text" in (table_settings.get("vertical_strategy"), table_settings.get("horizontal_strategy")):
|
1328
|
+
page_cfg = getattr(self.page, "_config", {})
|
1329
|
+
# Ensure text_* tolerances passed to pdfplumber
|
1330
|
+
if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
|
1331
|
+
if page_cfg.get("x_tolerance") is not None:
|
1332
|
+
table_settings["text_x_tolerance"] = page_cfg["x_tolerance"]
|
1333
|
+
if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
|
1334
|
+
if page_cfg.get("y_tolerance") is not None:
|
1335
|
+
table_settings["text_y_tolerance"] = page_cfg["y_tolerance"]
|
1336
|
+
|
1337
|
+
# Snap / join tolerances (~ line spacing)
|
1338
|
+
if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
|
1339
|
+
snap = max(1, round((page_cfg.get("y_tolerance", 1)) * 0.9))
|
1340
|
+
table_settings["snap_tolerance"] = snap
|
1341
|
+
if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
|
1342
|
+
table_settings["join_tolerance"] = table_settings["snap_tolerance"]
|
1343
|
+
|
1322
1344
|
logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
|
1323
1345
|
|
1324
1346
|
# Use the selected method
|
@@ -1438,6 +1460,30 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1438
1460
|
Returns:
|
1439
1461
|
List of tables, where each table is a list of rows, and each row is a list of cell values
|
1440
1462
|
"""
|
1463
|
+
# Inject global PDF-level text tolerances if not explicitly present
|
1464
|
+
pdf_cfg = getattr(self.page, "_config", getattr(self.page._parent, "_config", {}))
|
1465
|
+
_uses_text = "text" in (
|
1466
|
+
table_settings.get("vertical_strategy"),
|
1467
|
+
table_settings.get("horizontal_strategy"),
|
1468
|
+
)
|
1469
|
+
if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
|
1470
|
+
x_tol = pdf_cfg.get("x_tolerance")
|
1471
|
+
if x_tol is not None:
|
1472
|
+
table_settings.setdefault("text_x_tolerance", x_tol)
|
1473
|
+
if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
|
1474
|
+
y_tol = pdf_cfg.get("y_tolerance")
|
1475
|
+
if y_tol is not None:
|
1476
|
+
table_settings.setdefault("text_y_tolerance", y_tol)
|
1477
|
+
|
1478
|
+
if _uses_text and "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
|
1479
|
+
snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
|
1480
|
+
table_settings.setdefault("snap_tolerance", snap)
|
1481
|
+
if _uses_text and "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
|
1482
|
+
join = table_settings.get("snap_tolerance", 1)
|
1483
|
+
table_settings.setdefault("join_tolerance", join)
|
1484
|
+
table_settings.setdefault("join_x_tolerance", join)
|
1485
|
+
table_settings.setdefault("join_y_tolerance", join)
|
1486
|
+
|
1441
1487
|
# Create a crop of the page for this region
|
1442
1488
|
cropped = self.page._page.crop(self.bbox)
|
1443
1489
|
|
@@ -1458,6 +1504,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1458
1504
|
Returns:
|
1459
1505
|
Table data as a list of rows, where each row is a list of cell values
|
1460
1506
|
"""
|
1507
|
+
# Inject global PDF-level text tolerances if not explicitly present
|
1508
|
+
pdf_cfg = getattr(self.page, "_config", getattr(self.page._parent, "_config", {}))
|
1509
|
+
_uses_text = "text" in (
|
1510
|
+
table_settings.get("vertical_strategy"),
|
1511
|
+
table_settings.get("horizontal_strategy"),
|
1512
|
+
)
|
1513
|
+
if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
|
1514
|
+
x_tol = pdf_cfg.get("x_tolerance")
|
1515
|
+
if x_tol is not None:
|
1516
|
+
table_settings.setdefault("text_x_tolerance", x_tol)
|
1517
|
+
if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
|
1518
|
+
y_tol = pdf_cfg.get("y_tolerance")
|
1519
|
+
if y_tol is not None:
|
1520
|
+
table_settings.setdefault("text_y_tolerance", y_tol)
|
1521
|
+
|
1461
1522
|
# Create a crop of the page for this region
|
1462
1523
|
cropped = self.page._page.crop(self.bbox)
|
1463
1524
|
|
@@ -1943,21 +2004,45 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1943
2004
|
"""
|
1944
2005
|
Apply OCR to this region and return the created text elements.
|
1945
2006
|
|
2007
|
+
This method supports two modes:
|
2008
|
+
1. **Built-in OCR Engines** (default) – identical to previous behaviour. Pass typical
|
2009
|
+
parameters like ``engine='easyocr'`` or ``languages=['en']`` and the method will
|
2010
|
+
route the request through :class:`OCRManager`.
|
2011
|
+
2. **Custom OCR Function** – pass a *callable* under the keyword ``function`` (or
|
2012
|
+
``ocr_function``). The callable will receive *this* Region instance and should
|
2013
|
+
return the extracted text (``str``) or ``None``. Internally the call is
|
2014
|
+
delegated to :meth:`apply_custom_ocr` so the same logic (replacement, element
|
2015
|
+
creation, etc.) is re-used.
|
2016
|
+
|
2017
|
+
Examples
|
2018
|
+
---------
|
2019
|
+
>>> def llm_ocr(region):
|
2020
|
+
... image = region.to_image(resolution=300, crop=True)
|
2021
|
+
... return my_llm_client.ocr(image)
|
2022
|
+
>>> region.apply_ocr(function=llm_ocr)
|
2023
|
+
|
1946
2024
|
Args:
|
1947
|
-
replace:
|
1948
|
-
|
1949
|
-
|
1950
|
-
|
1951
|
-
|
1952
|
-
|
1953
|
-
|
1954
|
-
|
1955
|
-
|
1956
|
-
|
2025
|
+
replace: Whether to remove existing OCR elements first (default ``True``).
|
2026
|
+
**ocr_params: Parameters for the built-in OCR manager *or* the special
|
2027
|
+
``function``/``ocr_function`` keyword to trigger custom mode.
|
2028
|
+
|
2029
|
+
Returns
|
2030
|
+
-------
|
2031
|
+
Self – for chaining.
|
2032
|
+
"""
|
2033
|
+
# --- Custom OCR function path --------------------------------------------------
|
2034
|
+
custom_func = ocr_params.pop("function", None) or ocr_params.pop("ocr_function", None)
|
2035
|
+
if callable(custom_func):
|
2036
|
+
# Delegate to the specialised helper while preserving key kwargs
|
2037
|
+
return self.apply_custom_ocr(
|
2038
|
+
ocr_function=custom_func,
|
2039
|
+
source_label=ocr_params.pop("source_label", "custom-ocr"),
|
2040
|
+
replace=replace,
|
2041
|
+
confidence=ocr_params.pop("confidence", None),
|
2042
|
+
add_to_page=ocr_params.pop("add_to_page", True),
|
2043
|
+
)
|
1957
2044
|
|
1958
|
-
|
1959
|
-
Self for method chaining.
|
1960
|
-
"""
|
2045
|
+
# --- Original built-in OCR engine path (unchanged except docstring) ------------
|
1961
2046
|
# Ensure OCRManager is available
|
1962
2047
|
if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
|
1963
2048
|
logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
|
@@ -2123,6 +2208,133 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2123
2208
|
logger.info(f"Region {self.bbox}: Added {len(created_elements)} elements from OCR.")
|
2124
2209
|
return self
|
2125
2210
|
|
2211
|
+
def apply_custom_ocr(
    self,
    ocr_function: Callable[["Region"], Optional[str]],
    source_label: str = "custom-ocr",
    replace: bool = True,
    confidence: Optional[float] = None,
    add_to_page: bool = True,
) -> "Region":
    """
    Apply a custom OCR function to this region and create a text element from the result.

    This is useful when you want to use a custom OCR method (e.g., an LLM API,
    specialized OCR service, or any custom logic) instead of the built-in OCR engines.

    Args:
        ocr_function: A callable that takes a Region and returns the OCR'd text (or None).
            The function receives this region as its argument and should return
            the extracted text as a string, or None if no text was found.
            Non-string, non-None results are converted with ``str()`` (a warning is logged).
        source_label: Label to identify the source of these text elements (default: "custom-ocr").
            It is forwarded to ``to_text_element`` and is also used to recognise
            elements from a previous run when ``replace`` is True.
        replace: If True (default), removes existing word elements intersecting this region
            whose ``source`` starts with "ocr" or equals ``source_label`` before
            adding new ones. If False, existing elements are left untouched.
        confidence: Optional confidence score for the OCR result, forwarded unchanged
            to ``to_text_element`` (which picks the default when None).
        add_to_page: Forwarded to ``to_text_element``; if True (default) the created
            element is added to the page.

    Returns:
        Self for method chaining. Self is also returned (after logging the error)
        when ``ocr_function`` raises; no element is created in that case.

    Example:
        # Using with an LLM
        def ocr_with_llm(region):
            image = region.to_image(resolution=300, crop=True)
            # Call your LLM API here
            return llm_client.ocr(image)

        region.apply_custom_ocr(ocr_with_llm)

        # Using with a custom OCR service
        def ocr_with_service(region):
            img_bytes = region.to_image(crop=True).tobytes()
            response = ocr_service.process(img_bytes)
            return response.text

        region.apply_custom_ocr(ocr_with_service, source_label="my-ocr-service")
    """
    if replace:
        logger.info(
            f"Region {self.bbox}: Removing existing OCR elements before applying custom OCR."
        )

        def _safe_remove(elem) -> bool:
            """Remove ``elem`` through its page's element manager; never raises."""
            if not (hasattr(elem, "page") and hasattr(elem.page, "_element_mgr")):
                return False
            etype = getattr(elem, "object_type", "word")
            if etype == "word":
                etype_key = "words"
            elif etype == "char":
                etype_key = "chars"
            else:
                etype_key = etype if etype.endswith("s") else etype + "s"
            try:
                return bool(elem.page._element_mgr.remove_element(elem, etype_key))
            except Exception as exc:
                # Removal is best-effort, but leave a trace instead of failing silently.
                logger.debug(
                    f"Region {self.bbox}: Could not remove element {elem!r}: {exc}"
                )
                return False

        removed_count = 0
        # Iterate over a snapshot: removal mutates the manager's word list.
        # NOTE(review): only words are purged here; character entries that
        # to_text_element may register alongside a word are not - confirm intent.
        for word in list(self.page._element_mgr.words):
            word_source = getattr(word, "source", "")
            # ``source`` can be None or a non-string; guard before the prefix test
            # so a single odd element cannot abort the whole OCR call.
            from_builtin_ocr = isinstance(word_source, str) and word_source.startswith("ocr")
            if not (from_builtin_ocr or word_source == source_label):
                continue
            if self.intersects(word) and _safe_remove(word):
                removed_count += 1

        if removed_count > 0:
            logger.info(
                f"Region {self.bbox}: Removed {removed_count} existing OCR elements."
            )

    # Call the custom OCR function
    try:
        logger.debug(f"Region {self.bbox}: Calling custom OCR function...")
        ocr_text = ocr_function(self)

        if ocr_text is not None and not isinstance(ocr_text, str):
            logger.warning(
                f"Custom OCR function returned non-string type ({type(ocr_text)}). "
                f"Converting to string."
            )
            ocr_text = str(ocr_text)

    except Exception as e:
        # A failing user callback must not propagate; report it and keep chaining.
        logger.error(
            f"Error calling custom OCR function for region {self.bbox}: {e}",
            exc_info=True,
        )
        return self

    if ocr_text is None:
        logger.debug(f"Region {self.bbox}: Custom OCR function returned None (no text found)")
        return self

    # Element creation (and optional registration on the page) is delegated.
    self.to_text_element(
        text_content=ocr_text,
        source_label=source_label,
        confidence=confidence,
        add_to_page=add_to_page,
    )

    logger.info(
        f"Region {self.bbox}: Created text element with {len(ocr_text)} chars"
        f"{' and added to page' if add_to_page else ''}"
    )

    return self
|
2337
|
+
|
2126
2338
|
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
|
2127
2339
|
"""
|
2128
2340
|
Get a section between two elements within this region.
|
@@ -2917,6 +3129,33 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2917
3129
|
if not hasattr(self, "page") or self.page is None:
|
2918
3130
|
raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
|
2919
3131
|
|
3132
|
+
# Create character dictionaries for the text
|
3133
|
+
char_dicts = []
|
3134
|
+
if actual_text:
|
3135
|
+
# Create a single character dict that spans the entire region
|
3136
|
+
# This is a simplified approach - OCR engines typically create one per character
|
3137
|
+
char_dict = {
|
3138
|
+
"text": actual_text,
|
3139
|
+
"x0": self.x0,
|
3140
|
+
"top": self.top,
|
3141
|
+
"x1": self.x1,
|
3142
|
+
"bottom": self.bottom,
|
3143
|
+
"width": self.width,
|
3144
|
+
"height": self.height,
|
3145
|
+
"object_type": "char",
|
3146
|
+
"page_number": self.page.page_number,
|
3147
|
+
"fontname": default_font_name,
|
3148
|
+
"size": default_font_size,
|
3149
|
+
"upright": True,
|
3150
|
+
"direction": 1,
|
3151
|
+
"adv": self.width,
|
3152
|
+
"source": source_label,
|
3153
|
+
"confidence": final_confidence,
|
3154
|
+
"stroking_color": (0, 0, 0),
|
3155
|
+
"non_stroking_color": (0, 0, 0),
|
3156
|
+
}
|
3157
|
+
char_dicts.append(char_dict)
|
3158
|
+
|
2920
3159
|
elem_data = {
|
2921
3160
|
"text": actual_text,
|
2922
3161
|
"x0": self.x0,
|
@@ -2936,7 +3175,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2936
3175
|
"adv": self.width,
|
2937
3176
|
"source": source_label,
|
2938
3177
|
"confidence": final_confidence,
|
2939
|
-
"_char_dicts":
|
3178
|
+
"_char_dicts": char_dicts,
|
2940
3179
|
}
|
2941
3180
|
text_element = TextElement(elem_data, self.page)
|
2942
3181
|
|
@@ -2952,6 +3191,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2952
3191
|
logger.debug(
|
2953
3192
|
f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}."
|
2954
3193
|
)
|
3194
|
+
# Also add character dictionaries to the chars collection
|
3195
|
+
if char_dicts and object_type == "word":
|
3196
|
+
for char_dict in char_dicts:
|
3197
|
+
self.page._element_mgr.add_element(char_dict, element_type="chars")
|
2955
3198
|
else:
|
2956
3199
|
page_num_str = (
|
2957
3200
|
str(self.page.page_number) if hasattr(self.page, "page_number") else "N/A"
|
natural_pdf/elements/text.py
CHANGED
@@ -468,3 +468,32 @@ class TextElement(Element):
|
|
468
468
|
info[f"raw_{prop}"] = self._obj[prop]
|
469
469
|
|
470
470
|
return info
|
471
|
+
|
472
|
+
@property
def visual_text(self) -> str:
    """Return this element's text in *visual* order (Unicode BiDi algorithm).

    Read-only helper: neither ``self.text`` nor the underlying character
    dictionaries are modified. Intended for UI / rendering code that needs
    human-readable mixed RTL/LTR output. The logical text is returned
    unchanged when it is empty, contains no right-to-left characters, or
    when the BiDi conversion cannot be performed.
    """
    text = self.text
    if not text:
        return text

    # Imported lazily so the common (empty) case pays nothing.
    import unicodedata

    # Cheap pre-scan: skip the heavier BiDi machinery unless at least one
    # character belongs to a right-to-left bidirectional class.
    rtl_classes = ("R", "AL", "AN")
    for char in text:
        if unicodedata.bidirectional(char) in rtl_classes:
            break
    else:
        return text

    try:
        from bidi.algorithm import get_display  # type: ignore
        from natural_pdf.utils.bidi_mirror import mirror_brackets

        # Logical order -> visual order, then mirror paired brackets.
        reordered = mirror_brackets(get_display(text, base_dir="R"))
    except Exception:
        # python-bidi missing or failing: fall back to logical order.
        return text
    return reordered
|
@@ -1,11 +1,8 @@
|
|
1
|
-
bad_pdf_analysis/analyze_10_more.py,sha256=UjsTuHE1GUMoVjkX3afy3x6DfpXyfZXHgS2W1GQqUmw,11906
|
2
|
-
bad_pdf_analysis/analyze_final_10.py,sha256=xYkIId0nF9LpWHRLDP1_nlJfJfC0b0Tu4mLu-3mim-0,25170
|
3
|
-
bad_pdf_analysis/analyze_specific_pages.py,sha256=wzq3_ZWR28hFdT7GEkayHPYgsk20OpD476LYmy2rAEk,13725
|
4
|
-
bad_pdf_analysis/analyze_specific_pages_direct.py,sha256=307gSNplwOtNTR9a0lEQWxlAKGeoZIcDe5z1pROKUXY,14846
|
5
1
|
natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
|
6
2
|
natural_pdf/cli.py,sha256=IXrP2lCHihr-ed-CFiDbMTnSsutQa1j1PYALOLGbpsc,4019
|
7
|
-
natural_pdf/analyzers/__init__.py,sha256=
|
8
|
-
natural_pdf/analyzers/
|
3
|
+
natural_pdf/analyzers/__init__.py,sha256=MQRctn4i5Q7u8pb8vQVHKEXUiVGpKyPZUECrlDH4AuU,673
|
4
|
+
natural_pdf/analyzers/guides.py,sha256=tzyViSBDdM66mT0niwFTDIJ16UzRCZ18Iqv8wA5DYAk,90302
|
5
|
+
natural_pdf/analyzers/shape_detection_mixin.py,sha256=q7gDM-z2t7bSTxjfV2aaW3533CySu1qsEpu4wb5Rp-I,62688
|
9
6
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
10
7
|
natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
|
11
8
|
natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
|
@@ -28,10 +25,10 @@ natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZH
|
|
28
25
|
natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
|
29
26
|
natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
|
30
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
31
|
-
natural_pdf/core/element_manager.py,sha256=
|
28
|
+
natural_pdf/core/element_manager.py,sha256=A6GJk9kwTzt-aSz4-SWaRHLZRbIMFFLce3CpxSyfkV4,51749
|
32
29
|
natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
|
33
|
-
natural_pdf/core/page.py,sha256=
|
34
|
-
natural_pdf/core/pdf.py,sha256=
|
30
|
+
natural_pdf/core/page.py,sha256=843_Fyk1gxZ8nqERJjjjoRD3iM4pFJy9a0zQSyMthiQ,128476
|
31
|
+
natural_pdf/core/pdf.py,sha256=mC4GZjPXx_bK6RUlhLpnJnapkHDhbgJpgpcUJOvb7OE,75290
|
35
32
|
natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
|
36
33
|
natural_pdf/describe/base.py,sha256=HaWlHltb-dw6ug4mfR_iBLHWxr1OdPwLaUshXRxO7gg,18462
|
37
34
|
natural_pdf/describe/elements.py,sha256=COvKF3B_RbAxXl5ORJDubV4C5PsiuSfuzD0ufPIJTFM,12983
|
@@ -39,12 +36,12 @@ natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo
|
|
39
36
|
natural_pdf/describe/summary.py,sha256=7FIF3zF6bzNx-gx4pCJr2XQFKiVzOEDnWsAYQ_mr9L0,7982
|
40
37
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
41
38
|
natural_pdf/elements/base.py,sha256=VshU4RstdzONJFq_8UVIjT_lVOai0MwMFsSFrCN-IO8,47299
|
42
|
-
natural_pdf/elements/collections.py,sha256=
|
39
|
+
natural_pdf/elements/collections.py,sha256=1E2MSg2NNcEcoRM2rumrv_CqIdO7DgbRHYEtfw35FaQ,128457
|
43
40
|
natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI,1423
|
44
41
|
natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
|
45
42
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
46
|
-
natural_pdf/elements/region.py,sha256=
|
47
|
-
natural_pdf/elements/text.py,sha256=
|
43
|
+
natural_pdf/elements/region.py,sha256=8SKhzCJ6sELZxJcM2i_58YhEKU6HBvaJ7Oj6E3bOsHw,139523
|
44
|
+
natural_pdf/elements/text.py,sha256=kw7u2KfHtDB905YawP7Hs89kcR8XnbtpkYQGEk6LNyk,18860
|
48
45
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
49
46
|
natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
|
50
47
|
natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
|
@@ -100,13 +97,12 @@ natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6
|
|
100
97
|
natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
|
101
98
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
102
99
|
natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
|
103
|
-
natural_pdf-0.1.
|
100
|
+
natural_pdf-0.1.32.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
104
101
|
optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
|
105
102
|
optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
|
106
103
|
optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
|
107
104
|
optimization/test_cleanup_methods.py,sha256=B_zHiJr1hI8q-tdfBoFi0Jf5lj2PURjA_6teRBGoz8o,6277
|
108
105
|
optimization/test_memory_fix.py,sha256=CWc0OSvFfKE0-nxqJOi_HAQc0GXUPKzkQbTeJp5UqxU,6364
|
109
|
-
tools/rtl_smoke_test.py,sha256=-ogcbvNzumJasICP0NNQHk4Zb4M1VRx0TnGkJUQC7SM,3043
|
110
106
|
tools/bad_pdf_eval/__init__.py,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
|
111
107
|
tools/bad_pdf_eval/analyser.py,sha256=sR31aVVmTXRHS8uwLZXlPefTH2_lskxtAzuZwlhsyOo,13391
|
112
108
|
tools/bad_pdf_eval/collate_summaries.py,sha256=Mcmf1OvVn0S0efj5ypk0syXKSrfUf6L5dowoGvOTgjU,5047
|
@@ -115,8 +111,8 @@ tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrO
|
|
115
111
|
tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
|
116
112
|
tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
|
117
113
|
tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
|
118
|
-
natural_pdf-0.1.
|
119
|
-
natural_pdf-0.1.
|
120
|
-
natural_pdf-0.1.
|
121
|
-
natural_pdf-0.1.
|
122
|
-
natural_pdf-0.1.
|
114
|
+
natural_pdf-0.1.32.dist-info/METADATA,sha256=CMZIo2BjeLh-b9hezQHMLehZP8brUflCQ69dLtfFyxo,6711
|
115
|
+
natural_pdf-0.1.32.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
116
|
+
natural_pdf-0.1.32.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
117
|
+
natural_pdf-0.1.32.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
|
118
|
+
natural_pdf-0.1.32.dist-info/RECORD,,
|