natural-pdf 0.1.21__py3-none-any.whl → 0.1.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/cli.py +8 -27
- natural_pdf/core/pdf.py +31 -45
- natural_pdf/describe/base.py +2 -2
- natural_pdf/describe/elements.py +1 -1
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.22.dist-info}/METADATA +1 -2
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.22.dist-info}/RECORD +10 -10
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.22.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.22.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.22.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.22.dist-info}/top_level.txt +0 -0
natural_pdf/cli.py
CHANGED
@@ -11,7 +11,9 @@ from packaging.requirements import Requirement
|
|
11
11
|
# ---------------------------------------------------------------------------
|
12
12
|
INSTALL_RECIPES: Dict[str, list[str]] = {
|
13
13
|
# heavyweight stacks
|
14
|
-
"paddle": ["paddlepaddle>=3.0.0", "paddleocr>=3.0.1", "paddlex>=3.0.2"],
|
14
|
+
"paddle": ["paddlepaddle>=3.0.0", "paddleocr>=3.0.1", "paddlex>=3.0.2", "pandas>=2.2.0"],
|
15
|
+
"numpy-high": ["numpy>=2.0"],
|
16
|
+
"numpy-low": ["numpy<1.27"],
|
15
17
|
"surya": ["surya-ocr>=0.13.0"],
|
16
18
|
"yolo": ["doclayout_yolo", "huggingface_hub>=0.29.3"],
|
17
19
|
"docling": ["docling"],
|
@@ -24,7 +26,7 @@ INSTALL_RECIPES: Dict[str, list[str]] = {
|
|
24
26
|
|
25
27
|
def _build_pip_install_args(requirements: list[str], upgrade: bool = True):
|
26
28
|
"""Return the pip command list to install/upgrade the given requirement strings."""
|
27
|
-
cmd = [sys.executable, "-m", "pip", "install"]
|
29
|
+
cmd = [sys.executable, "-m", "pip", "--quiet", "install"]
|
28
30
|
if upgrade:
|
29
31
|
cmd.append("--upgrade")
|
30
32
|
cmd.extend(requirements)
|
@@ -48,34 +50,13 @@ def cmd_install(args):
|
|
48
50
|
|
49
51
|
requirements = INSTALL_RECIPES[group_key]
|
50
52
|
|
51
|
-
# Skip paddlex upgrade if already satisfied
|
52
|
-
if group_key == "paddle":
|
53
|
-
try:
|
54
|
-
dist = distribution("paddlex")
|
55
|
-
from packaging.version import parse as V
|
56
|
-
if V(dist.version) >= V("3.0.2"):
|
57
|
-
print("✓ paddlex already ≥ 3.0.2 – nothing to do.")
|
58
|
-
continue
|
59
|
-
except PackageNotFoundError:
|
60
|
-
pass
|
61
|
-
|
62
53
|
# Special handling for paddle stack: install paddlepaddle & paddleocr first
|
63
54
|
# each in its own resolver run, then paddlex.
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
pip_cmd = _build_pip_install_args([req])
|
68
|
-
_run(pip_cmd)
|
69
|
-
|
70
|
-
# paddlex last to override the strict pin
|
71
|
-
pip_cmd = _build_pip_install_args(["paddlex==3.0.2"])
|
55
|
+
base_reqs = [r for r in requirements]
|
56
|
+
for req in base_reqs:
|
57
|
+
pip_cmd = _build_pip_install_args([req])
|
72
58
|
_run(pip_cmd)
|
73
|
-
|
74
|
-
else:
|
75
|
-
for req in requirements:
|
76
|
-
pip_cmd = _build_pip_install_args([req])
|
77
|
-
_run(pip_cmd)
|
78
|
-
print("✔ Finished installing extra dependencies for", group_key)
|
59
|
+
print("✔ Finished installing extra dependencies for", group_key)
|
79
60
|
|
80
61
|
|
81
62
|
def main():
|
natural_pdf/core/pdf.py
CHANGED
@@ -275,56 +275,42 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
275
275
|
)
|
276
276
|
|
277
277
|
def _initialize_managers(self):
|
278
|
-
"""
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
# Resolve the entry in DEFAULT_MANAGERS which can be:
|
283
|
-
# 1. A class -> instantiate directly
|
284
|
-
# 2. A factory (callable) returning a class -> call then instantiate
|
285
|
-
# 3. A factory returning a **ready instance** -> use as-is
|
286
|
-
|
287
|
-
resolved = manager_class_or_factory
|
288
|
-
|
289
|
-
# If we have a callable that is *not* a class, call it to obtain the real target
|
290
|
-
# (This is the lazy-import factory case.)
|
291
|
-
if not isinstance(resolved, type) and callable(resolved):
|
292
|
-
resolved = resolved()
|
293
|
-
|
294
|
-
# At this point `resolved` is either a class or an already-created instance
|
295
|
-
if isinstance(resolved, type):
|
296
|
-
instance = resolved() # Instantiate class
|
297
|
-
self._managers[key] = instance
|
298
|
-
logger.debug(f"Initialized manager for key '{key}': {resolved.__name__}")
|
299
|
-
else:
|
300
|
-
# Assume factory already returned an instance
|
301
|
-
self._managers[key] = resolved
|
302
|
-
logger.debug(
|
303
|
-
f"Initialized manager instance for key '{key}': {type(resolved).__name__} (factory-provided instance)"
|
304
|
-
)
|
305
|
-
except Exception as e:
|
306
|
-
logger.error(f"Failed to initialize manager for key '{key}': {e}")
|
307
|
-
self._managers[key] = None
|
278
|
+
"""Set up manager factories for lazy instantiation."""
|
279
|
+
# Store factories/classes for each manager key
|
280
|
+
self._manager_factories = dict(DEFAULT_MANAGERS)
|
281
|
+
self._managers = {} # Will hold instantiated managers
|
308
282
|
|
309
283
|
def get_manager(self, key: str) -> Any:
|
310
|
-
"""Retrieve a manager instance by its key."""
|
311
|
-
|
284
|
+
"""Retrieve a manager instance by its key, instantiating it lazily if needed."""
|
285
|
+
# Check if already instantiated
|
286
|
+
if key in self._managers:
|
287
|
+
manager_instance = self._managers[key]
|
288
|
+
if manager_instance is None:
|
289
|
+
raise RuntimeError(f"Manager '{key}' failed to initialize previously.")
|
290
|
+
return manager_instance
|
291
|
+
|
292
|
+
# Not instantiated yet: get factory/class
|
293
|
+
if not hasattr(self, "_manager_factories") or key not in self._manager_factories:
|
312
294
|
raise KeyError(
|
313
|
-
f"No manager registered for key '{key}'. Available: {list(self.
|
295
|
+
f"No manager registered for key '{key}'. Available: {list(getattr(self, '_manager_factories', {}).keys())}"
|
314
296
|
)
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
)
|
297
|
+
factory_or_class = self._manager_factories[key]
|
298
|
+
try:
|
299
|
+
resolved = factory_or_class
|
300
|
+
# If it's a callable that's not a class, call it to get the class/instance
|
301
|
+
if not isinstance(resolved, type) and callable(resolved):
|
302
|
+
resolved = resolved()
|
303
|
+
# If it's a class, instantiate it
|
304
|
+
if isinstance(resolved, type):
|
305
|
+
instance = resolved()
|
324
306
|
else:
|
325
|
-
|
326
|
-
|
327
|
-
|
307
|
+
instance = resolved # Already an instance
|
308
|
+
self._managers[key] = instance
|
309
|
+
return instance
|
310
|
+
except Exception as e:
|
311
|
+
logger.error(f"Failed to initialize manager for key '{key}': {e}")
|
312
|
+
self._managers[key] = None
|
313
|
+
raise RuntimeError(f"Manager '{key}' failed to initialize: {e}") from e
|
328
314
|
|
329
315
|
def _initialize_highlighter(self):
|
330
316
|
pass
|
natural_pdf/describe/base.py
CHANGED
@@ -292,8 +292,8 @@ def _extract_element_value(element: "Element", column: str) -> Any:
|
|
292
292
|
try:
|
293
293
|
if column == 'text':
|
294
294
|
text = getattr(element, 'text', '')
|
295
|
-
if text and len(text) >
|
296
|
-
return text[:
|
295
|
+
if text and len(text) > 60:
|
296
|
+
return text[:60] + "..."
|
297
297
|
return text or ""
|
298
298
|
|
299
299
|
elif column == 'page':
|
natural_pdf/describe/elements.py
CHANGED
@@ -396,7 +396,7 @@ def _analyze_ocr_quality(elements: List["Element"]) -> Dict[str, Any]:
|
|
396
396
|
text = getattr(element, 'text', '').strip()
|
397
397
|
if text:
|
398
398
|
# Truncate long text
|
399
|
-
display_text = text[:
|
399
|
+
display_text = text[:60] + "..." if len(text) > 60 else text
|
400
400
|
element_confidences.append((confidence, display_text))
|
401
401
|
|
402
402
|
if element_confidences:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.21
|
3
|
+
Version: 0.1.22
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -21,7 +21,6 @@ Requires-Dist: urllib3
|
|
21
21
|
Requires-Dist: tqdm
|
22
22
|
Requires-Dist: pydantic
|
23
23
|
Requires-Dist: jenkspy
|
24
|
-
Requires-Dist: pikepdf
|
25
24
|
Requires-Dist: scipy
|
26
25
|
Requires-Dist: torch
|
27
26
|
Requires-Dist: torchvision
|
@@ -1,5 +1,5 @@
|
|
1
1
|
natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
|
2
|
-
natural_pdf/cli.py,sha256
|
2
|
+
natural_pdf/cli.py,sha256=-U3vWyI7Qcxr3b5EWY7d2AcsOZJ7D9RnQIMzQsfoDQM,3971
|
3
3
|
natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
|
4
4
|
natural_pdf/analyzers/shape_detection_mixin.py,sha256=HHefg-v7CJMxYiJHxdGOdqdtbWe9yk4OBoW3a_aRrjM,81798
|
5
5
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
@@ -27,10 +27,10 @@ natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,
|
|
27
27
|
natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
|
28
28
|
natural_pdf/core/highlighting_service.py,sha256=_kQUS6_BBvsLBuSZloFrVag6jN90KzHa0ULyGBjufSs,36955
|
29
29
|
natural_pdf/core/page.py,sha256=i3DriIQwoO4RuSrkrCXv44Dz8OL9KXPa2y4GhsD1y18,118324
|
30
|
-
natural_pdf/core/pdf.py,sha256=
|
30
|
+
natural_pdf/core/pdf.py,sha256=0hsIdvKmr6C3WGdD9N9wHGHaZN4th4QwduLwmJ8rWpM,73269
|
31
31
|
natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
|
32
|
-
natural_pdf/describe/base.py,sha256=
|
33
|
-
natural_pdf/describe/elements.py,sha256=
|
32
|
+
natural_pdf/describe/base.py,sha256=LAZLc_thK2u2surgGd0Pk7CN2uVaZK9AbMOE3-1RmQ4,16842
|
33
|
+
natural_pdf/describe/elements.py,sha256=xD8wwR1z5IKat7RIwoAwQRUEL6zJTEwcOKorF4F-xPg,12717
|
34
34
|
natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo,3116
|
35
35
|
natural_pdf/describe/summary.py,sha256=dPtjrn6fQ8nL0F74RITX2vXlDX7ZgaX9JQPnJB-S_XQ,6735
|
36
36
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
@@ -91,9 +91,9 @@ natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9Y
|
|
91
91
|
natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
|
92
92
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
93
93
|
natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
|
94
|
-
natural_pdf-0.1.
|
95
|
-
natural_pdf-0.1.
|
96
|
-
natural_pdf-0.1.
|
97
|
-
natural_pdf-0.1.
|
98
|
-
natural_pdf-0.1.
|
99
|
-
natural_pdf-0.1.
|
94
|
+
natural_pdf-0.1.22.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
95
|
+
natural_pdf-0.1.22.dist-info/METADATA,sha256=_e2-VmJ4W8_3IZEQc6QqP2LG99f2b-8f1wZRhDSBZsw,6238
|
96
|
+
natural_pdf-0.1.22.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
97
|
+
natural_pdf-0.1.22.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
98
|
+
natural_pdf-0.1.22.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
|
99
|
+
natural_pdf-0.1.22.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|