natural-pdf 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,25 +5,41 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from PIL import Image
 
+# Lazy imports for heavy dependencies to avoid loading at module level
 # Use try-except for robustness if dependencies are missing
-try:
+_CLASSIFICATION_AVAILABLE = None
+
+def _check_classification_dependencies():
+    """Lazy check for classification dependencies."""
+    global _CLASSIFICATION_AVAILABLE
+    if _CLASSIFICATION_AVAILABLE is None:
+        try:
+            import torch
+            import transformers
+            _CLASSIFICATION_AVAILABLE = True
+        except ImportError:
+            _CLASSIFICATION_AVAILABLE = False
+    return _CLASSIFICATION_AVAILABLE
+
+def _get_torch():
+    """Lazy import for torch."""
     import torch
+    return torch
+
+def _get_transformers_components():
+    """Lazy import for transformers components."""
     from transformers import (
         AutoModelForSequenceClassification,
         AutoModelForZeroShotImageClassification,
         AutoTokenizer,
         pipeline,
     )
-
-    _CLASSIFICATION_AVAILABLE = True
-except ImportError:
-    _CLASSIFICATION_AVAILABLE = False
-    # Define dummy types for type hinting if imports fail
-    pipeline = object
-    AutoTokenizer = object
-    AutoModelForZeroShotImageClassification = object
-    AutoModelForSequenceClassification = object
-    torch = None
+    return {
+        'AutoModelForSequenceClassification': AutoModelForSequenceClassification,
+        'AutoModelForZeroShotImageClassification': AutoModelForZeroShotImageClassification,
+        'AutoTokenizer': AutoTokenizer,
+        'pipeline': pipeline,
+    }
 
 from tqdm.auto import tqdm
 
@@ -41,6 +57,11 @@ _PIPELINE_CACHE: Dict[str, "Pipeline"] = {}
 _TOKENIZER_CACHE: Dict[str, Any] = {}
 _MODEL_CACHE: Dict[str, Any] = {}
 
+# Export the availability check function for external use
+def is_classification_available() -> bool:
+    """Check if classification dependencies are available."""
+    return _check_classification_dependencies()
+
 
 class ClassificationError(Exception):
     """Custom exception for classification errors."""
@@ -66,7 +87,7 @@ class ClassificationManager:
             model_mapping: Optional dictionary mapping aliases ('text', 'vision') to model IDs.
             default_device: Default device ('cpu', 'cuda') if not specified in classify calls.
         """
-        if not _CLASSIFICATION_AVAILABLE:
+        if not _check_classification_dependencies():
             raise ImportError(
                 "Classification dependencies missing. "
                 'Install with: pip install "natural-pdf[core-ml]"'
@@ -81,7 +102,7 @@ class ClassificationManager:
 
     def is_available(self) -> bool:
         """Check if required dependencies are installed."""
-        return _CLASSIFICATION_AVAILABLE
+        return _check_classification_dependencies()
 
     def _get_pipeline(self, model_id: str, using: str) -> "Pipeline":
         """Get or create a classification pipeline."""
@@ -92,6 +113,10 @@ class ClassificationManager:
         )
         start_time = time.time()
         try:
+            # Lazy import transformers components
+            transformers_components = _get_transformers_components()
+            pipeline = transformers_components['pipeline']
+
             task = (
                 "zero-shot-classification"
                 if using == "text"
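
Taken together, these hunks swap the module-level try/except import for memoized, call-time checks, so importing natural_pdf no longer pulls in torch or transformers. A minimal usage sketch of the new is_classification_available() helper; the guard and the no-argument ClassificationManager() construction mirror the diff, everything else is illustrative:

    from natural_pdf.classification.manager import (
        ClassificationManager,
        is_classification_available,
    )

    if is_classification_available():
        # torch/transformers are only imported once the manager actually needs them
        manager = ClassificationManager()
    else:
        # Mirrors the error raised by ClassificationManager.__init__
        print('Classification extras missing; install with: pip install "natural-pdf[core-ml]"')
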
natural_pdf/core/page.py CHANGED
@@ -61,6 +61,7 @@ from natural_pdf.classification.manager import ClassificationManager # For type
 # # --- Classification Imports --- #
 from natural_pdf.classification.mixin import ClassificationMixin  # Import classification mixin
 from natural_pdf.core.element_manager import ElementManager
+from natural_pdf.describe.mixin import DescribeMixin  # Import describe mixin
 from natural_pdf.elements.base import Element  # Import base element
 from natural_pdf.elements.text import TextElement
 from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
@@ -92,7 +93,7 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
-class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
+class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
     """
     Enhanced Page wrapper built on top of pdfplumber.Page.
 
natural_pdf/core/pdf.py CHANGED
@@ -25,9 +25,10 @@ from typing import (
 import pdfplumber
 from PIL import Image
 from tqdm.auto import tqdm
+import weakref
 
 from natural_pdf.analyzers.layout.layout_manager import LayoutManager
-from natural_pdf.classification.manager import ClassificationError, ClassificationManager
+from natural_pdf.classification.manager import ClassificationError
 from natural_pdf.classification.mixin import ClassificationMixin
 from natural_pdf.classification.results import ClassificationResult
 from natural_pdf.core.highlighting_service import HighlightingService
@@ -72,8 +73,13 @@ except ImportError:
 
 logger = logging.getLogger("natural_pdf.core.pdf")
 
+def _get_classification_manager_class():
+    """Lazy import for ClassificationManager."""
+    from natural_pdf.classification.manager import ClassificationManager
+    return ClassificationManager
+
 DEFAULT_MANAGERS = {
-    "classification": ClassificationManager,
+    "classification": _get_classification_manager_class,
     "structured_data": StructuredDataManager,
 }
 
@@ -91,6 +97,62 @@ except ImportError:
     img2pdf = None
 # End Deskew Imports
 
+# --- Lazy Page List Helper --- #
+from collections.abc import Sequence
+
+class _LazyPageList(Sequence):
+    """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
+
+    The sequence holds `None` placeholders until an index is accessed, at which point
+    a real `Page` object is created, cached, and returned. Slices and iteration are
+    also supported and will materialise pages on demand.
+    """
+
+    def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None):
+        self._parent_pdf = parent_pdf
+        self._plumber_pdf = plumber_pdf
+        self._font_attrs = font_attrs
+        # One slot per pdfplumber page – initially all None
+        self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
+
+    # Internal helper -----------------------------------------------------
+    def _create_page(self, index: int) -> "Page":
+        cached = self._cache[index]
+        if cached is None:
+            # Import here to avoid circular import problems
+            from natural_pdf.core.page import Page
+
+            plumber_page = self._plumber_pdf.pages[index]
+            cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs)
+            self._cache[index] = cached
+        return cached
+
+    # Sequence protocol ---------------------------------------------------
+    def __len__(self) -> int:
+        return len(self._cache)
+
+    def __getitem__(self, key):
+        if isinstance(key, slice):
+            # Materialise pages for slice lazily as well
+            indices = range(*key.indices(len(self)))
+            return [self._create_page(i) for i in indices]
+        elif isinstance(key, int):
+            if key < 0:
+                key += len(self)
+            if key < 0 or key >= len(self):
+                raise IndexError("Page index out of range")
+            return self._create_page(key)
+        else:
+            raise TypeError("Page indices must be integers or slices")
+
+    def __iter__(self):
+        for i in range(len(self)):
+            yield self._create_page(i)
+
+    def __repr__(self) -> str:  # pragma: no cover
+        return f"<_LazyPageList(len={len(self)})>"
+
+# --- End Lazy Page List Helper --- #
 
 class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     """
@@ -129,6 +191,15 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             self.source_path = "<stream>"  # Identifier for source
             self.path = self.source_path  # Use source identifier as path for streams
             stream_to_open = path_or_url_or_stream
+            try:
+                if hasattr(path_or_url_or_stream, "read"):
+                    # If caller provided an in-memory binary stream, capture bytes for potential re-export
+                    current_pos = path_or_url_or_stream.tell()
+                    path_or_url_or_stream.seek(0)
+                    self._original_bytes = path_or_url_or_stream.read()
+                    path_or_url_or_stream.seek(current_pos)
+            except Exception:
+                pass
         elif isinstance(path_or_url_or_stream, (str, Path)):
             path_or_url = str(path_or_url_or_stream)
             self.source_path = path_or_url  # Store original path/URL as source
@@ -137,21 +208,15 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             if is_url:
                 logger.info(f"Downloading PDF from URL: {path_or_url}")
                 try:
-                    # Use a context manager for the temporary file
-                    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_f:
-                        self._temp_file = temp_f  # Store reference if needed for cleanup
-                        with urllib.request.urlopen(path_or_url) as response:
-                            temp_f.write(response.read())
-                            temp_f.flush()
-                        self._resolved_path = temp_f.name
-                    logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
-                    stream_to_open = self._resolved_path
+                    with urllib.request.urlopen(path_or_url) as response:
+                        data = response.read()
+                    # Load directly into an in-memory buffer – no temp file needed
+                    buffer = io.BytesIO(data)
+                    buffer.seek(0)
+                    self._temp_file = None  # No on-disk temp file
+                    self._resolved_path = path_or_url  # For repr / get_id purposes
+                    stream_to_open = buffer  # pdfplumber accepts file-like objects
                 except Exception as e:
-                    if self._temp_file and hasattr(self._temp_file, "name"):
-                        try:
-                            os.unlink(self._temp_file.name)
-                        except:  # noqa E722
-                            pass
                     logger.error(f"Failed to download PDF from URL: {e}")
                     raise ValueError(f"Failed to download PDF from URL: {e}")
             else:
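
The URL branch now keeps the downloaded PDF entirely in memory instead of writing a NamedTemporaryFile. The standalone pattern it relies on looks roughly like this; the URL is a placeholder, and pdfplumber accepts any file-like object:

    import io
    import urllib.request

    import pdfplumber

    url = "https://example.com/sample.pdf"  # placeholder, not from the diff
    with urllib.request.urlopen(url) as response:
        buffer = io.BytesIO(response.read())  # whole document held in memory

    buffer.seek(0)
    with pdfplumber.open(buffer) as plumber_pdf:  # file-like objects work like paths
        print(f"{len(plumber_pdf.pages)} pages, no temp file on disk")
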
@@ -187,12 +252,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         # self._classification_manager_instance = ClassificationManager() # Removed this line
         self._manager_registry = {}
 
-        from natural_pdf.core.page import Page
-
-        self._pages = [
-            Page(p, parent=self, index=i, font_attrs=font_attrs)
-            for i, p in enumerate(self._pdf.pages)
-        ]
+        # Lazily instantiate pages only when accessed
+        self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs)
 
         self._element_cache = {}
         self._exclusions = []
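
With _LazyPageList wired in here, opening a document no longer builds every Page up front. A rough sketch of the resulting access pattern, assuming the public pages attribute exposes _pages as in earlier releases (the file name is illustrative):

    from natural_pdf import PDF

    pdf = PDF("report.pdf")      # hypothetical file; no Page objects created yet
    first = pdf.pages[0]         # index access materialises and caches page 0
    again = pdf.pages[0]         # cached: the same Page instance comes back
    tail = pdf.pages[-2:]        # slicing materialises only the pages it covers
    for page in pdf.pages:       # iteration builds the remaining pages on demand
        pass
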
@@ -204,15 +265,45 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self._initialize_highlighter()
         self.analyses: Dict[str, Any] = {}
 
+        # --- Automatic cleanup when object is garbage-collected ---
+        self._finalizer = weakref.finalize(
+            self,
+            PDF._finalize_cleanup,
+            self._pdf,
+            getattr(self, "_temp_file", None),
+            getattr(self, "_is_stream", False),
+        )
+
     def _initialize_managers(self):
         """Initialize manager instances based on DEFAULT_MANAGERS."""
         self._managers = {}
-        for key, manager_class in DEFAULT_MANAGERS.items():
+        for key, manager_class_or_factory in DEFAULT_MANAGERS.items():
             try:
-                self._managers[key] = manager_class()
-                logger.debug(f"Initialized manager for key '{key}': {manager_class.__name__}")
+                # Resolve the entry in DEFAULT_MANAGERS which can be:
+                #   1. A class -> instantiate directly
+                #   2. A factory (callable) returning a class -> call then instantiate
+                #   3. A factory returning a **ready instance** -> use as-is
+
+                resolved = manager_class_or_factory
+
+                # If we have a callable that is *not* a class, call it to obtain the real target
+                # (This is the lazy-import factory case.)
+                if not isinstance(resolved, type) and callable(resolved):
+                    resolved = resolved()
+
+                # At this point `resolved` is either a class or an already-created instance
+                if isinstance(resolved, type):
+                    instance = resolved()  # Instantiate class
+                    self._managers[key] = instance
+                    logger.debug(f"Initialized manager for key '{key}': {resolved.__name__}")
+                else:
+                    # Assume factory already returned an instance
+                    self._managers[key] = resolved
+                    logger.debug(
+                        f"Initialized manager instance for key '{key}': {type(resolved).__name__} (factory-provided instance)"
+                    )
             except Exception as e:
-                logger.error(f"Failed to initialize manager {manager_class.__name__}: {e}")
+                logger.error(f"Failed to initialize manager for key '{key}': {e}")
                 self._managers[key] = None
 
     def get_manager(self, key: str) -> Any:
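
The rewritten _initialize_managers has to accept three kinds of registry entries. A condensed, self-contained illustration of that resolution rule, using stand-in classes rather than the real managers:

    class StubManager:                 # case 1: a class registered directly
        pass

    def class_factory():               # case 2: a factory returning a class
        return StubManager

    def instance_factory():            # case 3: a factory returning a ready instance
        return StubManager()

    def resolve(entry):
        resolved = entry
        if not isinstance(resolved, type) and callable(resolved):
            resolved = resolved()      # unwrap the factory
        return resolved() if isinstance(resolved, type) else resolved

    for entry in (StubManager, class_factory, instance_factory):
        print(type(resolve(entry)).__name__)  # prints "StubManager" three times
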
@@ -1220,6 +1311,10 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         except Exception as e:
             logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
 
+        # Cancels the weakref finalizer so we don't double-clean
+        if hasattr(self, "_finalizer") and self._finalizer.alive:
+            self._finalizer()
+
     def __enter__(self):
         """Context manager entry."""
         return self
@@ -1404,12 +1499,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
 
         if not manager or not manager.is_available():
-            try:
-                from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
-
-                if not _CLASSIFICATION_AVAILABLE:
-                    raise ImportError("Classification dependencies missing.")
-            except ImportError:
+            from natural_pdf.classification.manager import is_classification_available
+
+            if not is_classification_available():
                 raise ImportError(
                     "Classification dependencies missing. "
                     'Install with: pip install "natural-pdf[core-ml]"'
@@ -1723,3 +1815,20 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             raise ValueError(f"Unsupported model_type for PDF classification: {model_type}")
 
     # --- End Classification Mixin Implementation ---
+
+    # Static helper for weakref.finalize to avoid capturing 'self'
+    @staticmethod
+    def _finalize_cleanup(plumber_pdf, temp_file_obj, is_stream):
+        try:
+            if plumber_pdf is not None:
+                plumber_pdf.close()
+        except Exception:
+            pass
+
+        if temp_file_obj and not is_stream:
+            try:
+                path = temp_file_obj.name if hasattr(temp_file_obj, "name") else None
+                if path and os.path.exists(path):
+                    os.unlink(path)
+            except Exception:
+                pass
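
The cleanup callback is a @staticmethod and receives only the objects it must close, because a finalizer that captured self (for example via a bound method) would keep the PDF instance alive and never fire. The general weakref.finalize shape, reduced to a toy resource:

    import weakref

    class Holder:
        def __init__(self, resource):
            self._resource = resource
            # Register cleanup against the instance but pass only what the
            # callback needs – never `self`, or the finalizer pins the object.
            self._finalizer = weakref.finalize(self, Holder._cleanup, resource)

        @staticmethod
        def _cleanup(resource):
            resource.close()

        def close(self):
            # Running the finalizer early is safe; it will not run again at GC time.
            if self._finalizer.alive:
                self._finalizer()
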
@@ -0,0 +1,21 @@
+"""
+Describe functionality for natural-pdf.
+
+Provides summary and inspection methods for pages, collections, and regions.
+"""
+
+from .base import describe_page, describe_collection, inspect_collection, describe_region, describe_element
+from .summary import ElementSummary, InspectionSummary
+from .mixin import DescribeMixin, InspectMixin
+
+__all__ = [
+    'describe_page',
+    'describe_collection',
+    'inspect_collection',
+    'describe_region',
+    'describe_element',
+    'ElementSummary',
+    'InspectionSummary',
+    'DescribeMixin',
+    'InspectMixin'
+]