natural-pdf 0.1.21__py3-none-any.whl → 0.1.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/cli.py CHANGED
@@ -11,7 +11,9 @@ from packaging.requirements import Requirement
11
11
  # ---------------------------------------------------------------------------
12
12
  INSTALL_RECIPES: Dict[str, list[str]] = {
13
13
  # heavyweight stacks
14
- "paddle": ["paddlepaddle>=3.0.0", "paddleocr>=3.0.1", "paddlex>=3.0.2"],
14
+ "paddle": ["paddlepaddle>=3.0.0", "paddleocr>=3.0.1", "paddlex>=3.0.2", "pandas>=2.2.0"],
15
+ "numpy-high": ["numpy>=2.0"],
16
+ "numpy-low": ["numpy<1.27"],
15
17
  "surya": ["surya-ocr>=0.13.0"],
16
18
  "yolo": ["doclayout_yolo", "huggingface_hub>=0.29.3"],
17
19
  "docling": ["docling"],
@@ -24,7 +26,7 @@ INSTALL_RECIPES: Dict[str, list[str]] = {
24
26
 
25
27
  def _build_pip_install_args(requirements: list[str], upgrade: bool = True):
26
28
  """Return the pip command list to install/upgrade the given requirement strings."""
27
- cmd = [sys.executable, "-m", "pip", "install"]
29
+ cmd = [sys.executable, "-m", "pip", "--quiet", "install"]
28
30
  if upgrade:
29
31
  cmd.append("--upgrade")
30
32
  cmd.extend(requirements)
@@ -48,34 +50,13 @@ def cmd_install(args):
48
50
 
49
51
  requirements = INSTALL_RECIPES[group_key]
50
52
 
51
- # Skip paddlex upgrade if already satisfied
52
- if group_key == "paddle":
53
- try:
54
- dist = distribution("paddlex")
55
- from packaging.version import parse as V
56
- if V(dist.version) >= V("3.0.2"):
57
- print("✓ paddlex already ≥ 3.0.2 – nothing to do.")
58
- continue
59
- except PackageNotFoundError:
60
- pass
61
-
62
53
  # Special handling for paddle stack: install paddlepaddle & paddleocr first
63
54
  # each in its own resolver run, then paddlex.
64
- if group_key == "paddle":
65
- base_reqs = [r for r in requirements if not r.startswith("paddlex")]
66
- for req in base_reqs:
67
- pip_cmd = _build_pip_install_args([req])
68
- _run(pip_cmd)
69
-
70
- # paddlex last to override the strict pin
71
- pip_cmd = _build_pip_install_args(["paddlex==3.0.2"])
55
+ base_reqs = [r for r in requirements]
56
+ for req in base_reqs:
57
+ pip_cmd = _build_pip_install_args([req])
72
58
  _run(pip_cmd)
73
- print("✔ Paddle stack installed (paddlex upgraded to 3.0.2)")
74
- else:
75
- for req in requirements:
76
- pip_cmd = _build_pip_install_args([req])
77
- _run(pip_cmd)
78
- print("✔ Finished installing extra dependencies for", group_key)
59
+ print("✔ Finished installing extra dependencies for", group_key)
79
60
 
80
61
 
81
62
  def main():
natural_pdf/core/pdf.py CHANGED
@@ -275,56 +275,42 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
275
275
  )
276
276
 
277
277
  def _initialize_managers(self):
278
- """Initialize manager instances based on DEFAULT_MANAGERS."""
279
- self._managers = {}
280
- for key, manager_class_or_factory in DEFAULT_MANAGERS.items():
281
- try:
282
- # Resolve the entry in DEFAULT_MANAGERS which can be:
283
- # 1. A class -> instantiate directly
284
- # 2. A factory (callable) returning a class -> call then instantiate
285
- # 3. A factory returning a **ready instance** -> use as-is
286
-
287
- resolved = manager_class_or_factory
288
-
289
- # If we have a callable that is *not* a class, call it to obtain the real target
290
- # (This is the lazy-import factory case.)
291
- if not isinstance(resolved, type) and callable(resolved):
292
- resolved = resolved()
293
-
294
- # At this point `resolved` is either a class or an already-created instance
295
- if isinstance(resolved, type):
296
- instance = resolved() # Instantiate class
297
- self._managers[key] = instance
298
- logger.debug(f"Initialized manager for key '{key}': {resolved.__name__}")
299
- else:
300
- # Assume factory already returned an instance
301
- self._managers[key] = resolved
302
- logger.debug(
303
- f"Initialized manager instance for key '{key}': {type(resolved).__name__} (factory-provided instance)"
304
- )
305
- except Exception as e:
306
- logger.error(f"Failed to initialize manager for key '{key}': {e}")
307
- self._managers[key] = None
278
+ """Set up manager factories for lazy instantiation."""
279
+ # Store factories/classes for each manager key
280
+ self._manager_factories = dict(DEFAULT_MANAGERS)
281
+ self._managers = {} # Will hold instantiated managers
308
282
 
309
283
  def get_manager(self, key: str) -> Any:
310
- """Retrieve a manager instance by its key."""
311
- if key not in self._managers:
284
+ """Retrieve a manager instance by its key, instantiating it lazily if needed."""
285
+ # Check if already instantiated
286
+ if key in self._managers:
287
+ manager_instance = self._managers[key]
288
+ if manager_instance is None:
289
+ raise RuntimeError(f"Manager '{key}' failed to initialize previously.")
290
+ return manager_instance
291
+
292
+ # Not instantiated yet: get factory/class
293
+ if not hasattr(self, "_manager_factories") or key not in self._manager_factories:
312
294
  raise KeyError(
313
- f"No manager registered for key '{key}'. Available: {list(self._managers.keys())}"
295
+ f"No manager registered for key '{key}'. Available: {list(getattr(self, '_manager_factories', {}).keys())}"
314
296
  )
315
-
316
- manager_instance = self._managers.get(key)
317
-
318
- if manager_instance is None:
319
- manager_class = DEFAULT_MANAGERS.get(key)
320
- if manager_class:
321
- raise RuntimeError(
322
- f"Manager '{key}' ({manager_class.__name__}) failed to initialize previously."
323
- )
297
+ factory_or_class = self._manager_factories[key]
298
+ try:
299
+ resolved = factory_or_class
300
+ # If it's a callable that's not a class, call it to get the class/instance
301
+ if not isinstance(resolved, type) and callable(resolved):
302
+ resolved = resolved()
303
+ # If it's a class, instantiate it
304
+ if isinstance(resolved, type):
305
+ instance = resolved()
324
306
  else:
325
- raise RuntimeError(f"Manager '{key}' failed to initialize (class not found).")
326
-
327
- return manager_instance
307
+ instance = resolved # Already an instance
308
+ self._managers[key] = instance
309
+ return instance
310
+ except Exception as e:
311
+ logger.error(f"Failed to initialize manager for key '{key}': {e}")
312
+ self._managers[key] = None
313
+ raise RuntimeError(f"Manager '{key}' failed to initialize: {e}") from e
328
314
 
329
315
  def _initialize_highlighter(self):
330
316
  pass
@@ -292,8 +292,8 @@ def _extract_element_value(element: "Element", column: str) -> Any:
292
292
  try:
293
293
  if column == 'text':
294
294
  text = getattr(element, 'text', '')
295
- if text and len(text) > 50:
296
- return text[:50] + "..."
295
+ if text and len(text) > 60:
296
+ return text[:60] + "..."
297
297
  return text or ""
298
298
 
299
299
  elif column == 'page':
@@ -396,7 +396,7 @@ def _analyze_ocr_quality(elements: List["Element"]) -> Dict[str, Any]:
396
396
  text = getattr(element, 'text', '').strip()
397
397
  if text:
398
398
  # Truncate long text
399
- display_text = text[:50] + "..." if len(text) > 50 else text
399
+ display_text = text[:60] + "..." if len(text) > 60 else text
400
400
  element_confidences.append((confidence, display_text))
401
401
 
402
402
  if element_confidences:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.21
3
+ Version: 0.1.22
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -21,7 +21,6 @@ Requires-Dist: urllib3
21
21
  Requires-Dist: tqdm
22
22
  Requires-Dist: pydantic
23
23
  Requires-Dist: jenkspy
24
- Requires-Dist: pikepdf
25
24
  Requires-Dist: scipy
26
25
  Requires-Dist: torch
27
26
  Requires-Dist: torchvision
@@ -1,5 +1,5 @@
1
1
  natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
2
- natural_pdf/cli.py,sha256=0nAGVO2f_40E3G9c3Q0bfK5mhROyUJH5W25-YJVLMIo,4749
2
+ natural_pdf/cli.py,sha256=-U3vWyI7Qcxr3b5EWY7d2AcsOZJ7D9RnQIMzQsfoDQM,3971
3
3
  natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
4
4
  natural_pdf/analyzers/shape_detection_mixin.py,sha256=HHefg-v7CJMxYiJHxdGOdqdtbWe9yk4OBoW3a_aRrjM,81798
5
5
  natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
@@ -27,10 +27,10 @@ natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,
27
27
  natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
28
28
  natural_pdf/core/highlighting_service.py,sha256=_kQUS6_BBvsLBuSZloFrVag6jN90KzHa0ULyGBjufSs,36955
29
29
  natural_pdf/core/page.py,sha256=i3DriIQwoO4RuSrkrCXv44Dz8OL9KXPa2y4GhsD1y18,118324
30
- natural_pdf/core/pdf.py,sha256=yBvb1iGw9gwVPJ3Rm1EBaZ8_g60TuW_Elhg2EOcJMzc,73871
30
+ natural_pdf/core/pdf.py,sha256=0hsIdvKmr6C3WGdD9N9wHGHaZN4th4QwduLwmJ8rWpM,73269
31
31
  natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
32
- natural_pdf/describe/base.py,sha256=7USCFIl4mI5b15LTVkwvhAn_mngMwhwxCnVYaZz5Vdc,16842
33
- natural_pdf/describe/elements.py,sha256=BOkz2wDhGh6P8NOm6pSNxitgmVokLTISztaFhrxMcdw,12717
32
+ natural_pdf/describe/base.py,sha256=LAZLc_thK2u2surgGd0Pk7CN2uVaZK9AbMOE3-1RmQ4,16842
33
+ natural_pdf/describe/elements.py,sha256=xD8wwR1z5IKat7RIwoAwQRUEL6zJTEwcOKorF4F-xPg,12717
34
34
  natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo,3116
35
35
  natural_pdf/describe/summary.py,sha256=dPtjrn6fQ8nL0F74RITX2vXlDX7ZgaX9JQPnJB-S_XQ,6735
36
36
  natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
@@ -91,9 +91,9 @@ natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9Y
91
91
  natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
92
92
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
93
93
  natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
94
- natural_pdf-0.1.21.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
95
- natural_pdf-0.1.21.dist-info/METADATA,sha256=zo7RfNG44xTbSSdOgGjciqEVMOMCxRX3BY7qtH3roW8,6261
96
- natural_pdf-0.1.21.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
97
- natural_pdf-0.1.21.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
98
- natural_pdf-0.1.21.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
99
- natural_pdf-0.1.21.dist-info/RECORD,,
94
+ natural_pdf-0.1.22.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
95
+ natural_pdf-0.1.22.dist-info/METADATA,sha256=_e2-VmJ4W8_3IZEQc6QqP2LG99f2b-8f1wZRhDSBZsw,6238
96
+ natural_pdf-0.1.22.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
97
+ natural_pdf-0.1.22.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
98
+ natural_pdf-0.1.22.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
99
+ natural_pdf-0.1.22.dist-info/RECORD,,