natural-pdf 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,25 +5,41 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
5
5
 
6
6
  from PIL import Image
7
7
 
8
+ # Lazy imports for heavy dependencies to avoid loading at module level
8
9
  # Use try-except for robustness if dependencies are missing
9
- try:
10
+ _CLASSIFICATION_AVAILABLE = None
11
+
12
+ def _check_classification_dependencies():
13
+ """Lazy check for classification dependencies."""
14
+ global _CLASSIFICATION_AVAILABLE
15
+ if _CLASSIFICATION_AVAILABLE is None:
16
+ try:
17
+ import torch
18
+ import transformers
19
+ _CLASSIFICATION_AVAILABLE = True
20
+ except ImportError:
21
+ _CLASSIFICATION_AVAILABLE = False
22
+ return _CLASSIFICATION_AVAILABLE
23
+
24
+ def _get_torch():
25
+ """Lazy import for torch."""
10
26
  import torch
27
+ return torch
28
+
29
+ def _get_transformers_components():
30
+ """Lazy import for transformers components."""
11
31
  from transformers import (
12
32
  AutoModelForSequenceClassification,
13
33
  AutoModelForZeroShotImageClassification,
14
34
  AutoTokenizer,
15
35
  pipeline,
16
36
  )
17
-
18
- _CLASSIFICATION_AVAILABLE = True
19
- except ImportError:
20
- _CLASSIFICATION_AVAILABLE = False
21
- # Define dummy types for type hinting if imports fail
22
- pipeline = object
23
- AutoTokenizer = object
24
- AutoModelForZeroShotImageClassification = object
25
- AutoModelForSequenceClassification = object
26
- torch = None
37
+ return {
38
+ 'AutoModelForSequenceClassification': AutoModelForSequenceClassification,
39
+ 'AutoModelForZeroShotImageClassification': AutoModelForZeroShotImageClassification,
40
+ 'AutoTokenizer': AutoTokenizer,
41
+ 'pipeline': pipeline,
42
+ }
27
43
 
28
44
  from tqdm.auto import tqdm
29
45
 
@@ -41,6 +57,11 @@ _PIPELINE_CACHE: Dict[str, "Pipeline"] = {}
41
57
  _TOKENIZER_CACHE: Dict[str, Any] = {}
42
58
  _MODEL_CACHE: Dict[str, Any] = {}
43
59
 
60
+ # Export the availability check function for external use
61
+ def is_classification_available() -> bool:
62
+ """Check if classification dependencies are available."""
63
+ return _check_classification_dependencies()
64
+
44
65
 
45
66
  class ClassificationError(Exception):
46
67
  """Custom exception for classification errors."""
@@ -66,7 +87,7 @@ class ClassificationManager:
66
87
  model_mapping: Optional dictionary mapping aliases ('text', 'vision') to model IDs.
67
88
  default_device: Default device ('cpu', 'cuda') if not specified in classify calls.
68
89
  """
69
- if not _CLASSIFICATION_AVAILABLE:
90
+ if not _check_classification_dependencies():
70
91
  raise ImportError(
71
92
  "Classification dependencies missing. "
72
93
  'Install with: pip install "natural-pdf[core-ml]"'
@@ -81,7 +102,7 @@ class ClassificationManager:
81
102
 
82
103
  def is_available(self) -> bool:
83
104
  """Check if required dependencies are installed."""
84
- return _CLASSIFICATION_AVAILABLE
105
+ return _check_classification_dependencies()
85
106
 
86
107
  def _get_pipeline(self, model_id: str, using: str) -> "Pipeline":
87
108
  """Get or create a classification pipeline."""
@@ -92,6 +113,10 @@ class ClassificationManager:
92
113
  )
93
114
  start_time = time.time()
94
115
  try:
116
+ # Lazy import transformers components
117
+ transformers_components = _get_transformers_components()
118
+ pipeline = transformers_components['pipeline']
119
+
95
120
  task = (
96
121
  "zero-shot-classification"
97
122
  if using == "text"
natural_pdf/core/pdf.py CHANGED
@@ -25,9 +25,10 @@ from typing import (
25
25
  import pdfplumber
26
26
  from PIL import Image
27
27
  from tqdm.auto import tqdm
28
+ import weakref
28
29
 
29
30
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
30
- from natural_pdf.classification.manager import ClassificationError, ClassificationManager
31
+ from natural_pdf.classification.manager import ClassificationError
31
32
  from natural_pdf.classification.mixin import ClassificationMixin
32
33
  from natural_pdf.classification.results import ClassificationResult
33
34
  from natural_pdf.core.highlighting_service import HighlightingService
@@ -72,8 +73,13 @@ except ImportError:
72
73
 
73
74
  logger = logging.getLogger("natural_pdf.core.pdf")
74
75
 
76
+ def _get_classification_manager_class():
77
+ """Lazy import for ClassificationManager."""
78
+ from natural_pdf.classification.manager import ClassificationManager
79
+ return ClassificationManager
80
+
75
81
  DEFAULT_MANAGERS = {
76
- "classification": ClassificationManager,
82
+ "classification": _get_classification_manager_class,
77
83
  "structured_data": StructuredDataManager,
78
84
  }
79
85
 
@@ -91,6 +97,62 @@ except ImportError:
91
97
  img2pdf = None
92
98
  # End Deskew Imports
93
99
 
100
+ # --- Lazy Page List Helper --- #
101
+ from collections.abc import Sequence
102
+
103
+ class _LazyPageList(Sequence):
104
+ """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
105
+
106
+ The sequence holds `None` placeholders until an index is accessed, at which point
107
+ a real `Page` object is created, cached, and returned. Slices and iteration are
108
+ also supported and will materialise pages on demand.
109
+ """
110
+
111
+ def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None):
112
+ self._parent_pdf = parent_pdf
113
+ self._plumber_pdf = plumber_pdf
114
+ self._font_attrs = font_attrs
115
+ # One slot per pdfplumber page – initially all None
116
+ self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
117
+
118
+ # Internal helper -----------------------------------------------------
119
+ def _create_page(self, index: int) -> "Page":
120
+ cached = self._cache[index]
121
+ if cached is None:
122
+ # Import here to avoid circular import problems
123
+ from natural_pdf.core.page import Page
124
+
125
+ plumber_page = self._plumber_pdf.pages[index]
126
+ cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs)
127
+ self._cache[index] = cached
128
+ return cached
129
+
130
+ # Sequence protocol ---------------------------------------------------
131
+ def __len__(self) -> int:
132
+ return len(self._cache)
133
+
134
+ def __getitem__(self, key):
135
+ if isinstance(key, slice):
136
+ # Materialise pages for slice lazily as well
137
+ indices = range(*key.indices(len(self)))
138
+ return [self._create_page(i) for i in indices]
139
+ elif isinstance(key, int):
140
+ if key < 0:
141
+ key += len(self)
142
+ if key < 0 or key >= len(self):
143
+ raise IndexError("Page index out of range")
144
+ return self._create_page(key)
145
+ else:
146
+ raise TypeError("Page indices must be integers or slices")
147
+
148
+ def __iter__(self):
149
+ for i in range(len(self)):
150
+ yield self._create_page(i)
151
+
152
+ def __repr__(self) -> str: # pragma: no cover
153
+ return f"<_LazyPageList(len={len(self)})>"
154
+
155
+ # --- End Lazy Page List Helper --- #
94
156
 
95
157
  class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
96
158
  """
@@ -129,6 +191,15 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
129
191
  self.source_path = "<stream>" # Identifier for source
130
192
  self.path = self.source_path # Use source identifier as path for streams
131
193
  stream_to_open = path_or_url_or_stream
194
+ try:
195
+ if hasattr(path_or_url_or_stream, "read"):
196
+ # If caller provided an in-memory binary stream, capture bytes for potential re-export
197
+ current_pos = path_or_url_or_stream.tell()
198
+ path_or_url_or_stream.seek(0)
199
+ self._original_bytes = path_or_url_or_stream.read()
200
+ path_or_url_or_stream.seek(current_pos)
201
+ except Exception:
202
+ pass
132
203
  elif isinstance(path_or_url_or_stream, (str, Path)):
133
204
  path_or_url = str(path_or_url_or_stream)
134
205
  self.source_path = path_or_url # Store original path/URL as source
@@ -137,21 +208,15 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
137
208
  if is_url:
138
209
  logger.info(f"Downloading PDF from URL: {path_or_url}")
139
210
  try:
140
- # Use a context manager for the temporary file
141
- with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_f:
142
- self._temp_file = temp_f # Store reference if needed for cleanup
143
- with urllib.request.urlopen(path_or_url) as response:
144
- temp_f.write(response.read())
145
- temp_f.flush()
146
- self._resolved_path = temp_f.name
147
- logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
148
- stream_to_open = self._resolved_path
211
+ with urllib.request.urlopen(path_or_url) as response:
212
+ data = response.read()
213
+ # Load directly into an in-memory buffer no temp file needed
214
+ buffer = io.BytesIO(data)
215
+ buffer.seek(0)
216
+ self._temp_file = None # No on-disk temp file
217
+ self._resolved_path = path_or_url # For repr / get_id purposes
218
+ stream_to_open = buffer # pdfplumber accepts file-like objects
149
219
  except Exception as e:
150
- if self._temp_file and hasattr(self._temp_file, "name"):
151
- try:
152
- os.unlink(self._temp_file.name)
153
- except: # noqa E722
154
- pass
155
220
  logger.error(f"Failed to download PDF from URL: {e}")
156
221
  raise ValueError(f"Failed to download PDF from URL: {e}")
157
222
  else:
@@ -187,12 +252,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
187
252
  # self._classification_manager_instance = ClassificationManager() # Removed this line
188
253
  self._manager_registry = {}
189
254
 
190
- from natural_pdf.core.page import Page
191
-
192
- self._pages = [
193
- Page(p, parent=self, index=i, font_attrs=font_attrs)
194
- for i, p in enumerate(self._pdf.pages)
195
- ]
255
+ # Lazily instantiate pages only when accessed
256
+ self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs)
196
257
 
197
258
  self._element_cache = {}
198
259
  self._exclusions = []
@@ -204,15 +265,45 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
204
265
  self._initialize_highlighter()
205
266
  self.analyses: Dict[str, Any] = {}
206
267
 
268
+ # --- Automatic cleanup when object is garbage-collected ---
269
+ self._finalizer = weakref.finalize(
270
+ self,
271
+ PDF._finalize_cleanup,
272
+ self._pdf,
273
+ getattr(self, "_temp_file", None),
274
+ getattr(self, "_is_stream", False),
275
+ )
276
+
207
277
  def _initialize_managers(self):
208
278
  """Initialize manager instances based on DEFAULT_MANAGERS."""
209
279
  self._managers = {}
210
- for key, manager_class in DEFAULT_MANAGERS.items():
280
+ for key, manager_class_or_factory in DEFAULT_MANAGERS.items():
211
281
  try:
212
- self._managers[key] = manager_class()
213
- logger.debug(f"Initialized manager for key '{key}': {manager_class.__name__}")
282
+ # Resolve the entry in DEFAULT_MANAGERS which can be:
283
+ # 1. A class -> instantiate directly
284
+ # 2. A factory (callable) returning a class -> call then instantiate
285
+ # 3. A factory returning a **ready instance** -> use as-is
286
+
287
+ resolved = manager_class_or_factory
288
+
289
+ # If we have a callable that is *not* a class, call it to obtain the real target
290
+ # (This is the lazy-import factory case.)
291
+ if not isinstance(resolved, type) and callable(resolved):
292
+ resolved = resolved()
293
+
294
+ # At this point `resolved` is either a class or an already-created instance
295
+ if isinstance(resolved, type):
296
+ instance = resolved() # Instantiate class
297
+ self._managers[key] = instance
298
+ logger.debug(f"Initialized manager for key '{key}': {resolved.__name__}")
299
+ else:
300
+ # Assume factory already returned an instance
301
+ self._managers[key] = resolved
302
+ logger.debug(
303
+ f"Initialized manager instance for key '{key}': {type(resolved).__name__} (factory-provided instance)"
304
+ )
214
305
  except Exception as e:
215
- logger.error(f"Failed to initialize manager {manager_class.__name__}: {e}")
306
+ logger.error(f"Failed to initialize manager for key '{key}': {e}")
216
307
  self._managers[key] = None
217
308
 
218
309
  def get_manager(self, key: str) -> Any:
@@ -1220,6 +1311,10 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1220
1311
  except Exception as e:
1221
1312
  logger.warning(f"Failed to clean up temporary file '{temp_file_path}': {e}")
1222
1313
 
1314
+ # Cancels the weakref finalizer so we don't double-clean
1315
+ if hasattr(self, "_finalizer") and self._finalizer.alive:
1316
+ self._finalizer()
1317
+
1223
1318
  def __enter__(self):
1224
1319
  """Context manager entry."""
1225
1320
  return self
@@ -1404,12 +1499,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1404
1499
  raise ClassificationError(f"Cannot get ClassificationManager: {e}") from e
1405
1500
 
1406
1501
  if not manager or not manager.is_available():
1407
- try:
1408
- from natural_pdf.classification.manager import _CLASSIFICATION_AVAILABLE
1409
-
1410
- if not _CLASSIFICATION_AVAILABLE:
1411
- raise ImportError("Classification dependencies missing.")
1412
- except ImportError:
1502
+ from natural_pdf.classification.manager import is_classification_available
1503
+
1504
+ if not is_classification_available():
1413
1505
  raise ImportError(
1414
1506
  "Classification dependencies missing. "
1415
1507
  'Install with: pip install "natural-pdf[core-ml]"'
@@ -1723,3 +1815,20 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1723
1815
  raise ValueError(f"Unsupported model_type for PDF classification: {model_type}")
1724
1816
 
1725
1817
  # --- End Classification Mixin Implementation ---
1818
+
1819
+ # Static helper for weakref.finalize to avoid capturing 'self'
1820
+ @staticmethod
1821
+ def _finalize_cleanup(plumber_pdf, temp_file_obj, is_stream):
1822
+ try:
1823
+ if plumber_pdf is not None:
1824
+ plumber_pdf.close()
1825
+ except Exception:
1826
+ pass
1827
+
1828
+ if temp_file_obj and not is_stream:
1829
+ try:
1830
+ path = temp_file_obj.name if hasattr(temp_file_obj, "name") else None
1831
+ if path and os.path.exists(path):
1832
+ os.unlink(path)
1833
+ except Exception:
1834
+ pass
@@ -1,4 +1,15 @@
1
1
  from .base import FinetuneExporter
2
- from .paddleocr import PaddleOCRRecognitionExporter
2
+
3
+ # Lazy import for PaddleOCRRecognitionExporter to avoid heavy paddle dependencies at module level
4
+ def _get_paddleocr_exporter():
5
+ """Lazy import for PaddleOCRRecognitionExporter."""
6
+ from .paddleocr import PaddleOCRRecognitionExporter
7
+ return PaddleOCRRecognitionExporter
8
+
9
+ # Make PaddleOCRRecognitionExporter available through attribute access
10
+ def __getattr__(name):
11
+ if name == "PaddleOCRRecognitionExporter":
12
+ return _get_paddleocr_exporter()
13
+ raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
3
14
 
4
15
  __all__ = ["FinetuneExporter", "PaddleOCRRecognitionExporter"]
@@ -16,6 +16,7 @@ from dataclasses import dataclass
16
16
  from itertools import pairwise
17
17
  from math import atan, pi
18
18
  from pathlib import Path
19
+ from typing import Optional, Union
19
20
  from xml.etree import ElementTree
20
21
 
21
22
  from pikepdf import Matrix, Name, Rectangle
@@ -94,12 +95,12 @@ class HocrTransform:
94
95
  def __init__(
95
96
  self,
96
97
  *,
97
- hocr_filename: str | Path,
98
+ hocr_filename: Union[str, Path],
98
99
  dpi: float,
99
100
  debug: bool = False,
100
101
  fontname: Name = Name("/f-0-0"),
101
102
  font: Font = GlyphlessFont(),
102
- debug_render_options: DebugRenderOptions | None = None,
103
+ debug_render_options: Optional[DebugRenderOptions] = None,
103
104
  ):
104
105
  """Initialize the HocrTransform object."""
105
106
  if debug:
@@ -144,7 +145,7 @@ class HocrTransform:
144
145
  return text
145
146
 
146
147
  @classmethod
147
- def element_coordinates(cls, element: Element) -> Rectangle | None:
148
+ def element_coordinates(cls, element: Element) -> Optional[Rectangle]:
148
149
  """Get coordinates of the bounding box around an element."""
149
150
  matches = cls.box_pattern.search(element.attrib.get("title", ""))
150
151
  if not matches:
@@ -172,7 +173,7 @@ class HocrTransform:
172
173
  return 0.0
173
174
  return float(matches.group(1))
174
175
 
175
- def _child_xpath(self, html_tag: str, html_class: str | None = None) -> str:
176
+ def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str:
176
177
  xpath = f".//{self.xmlns}{html_tag}"
177
178
  if html_class:
178
179
  xpath += f"[@class='{html_class}']"
@@ -187,7 +188,7 @@ class HocrTransform:
187
188
  self,
188
189
  *,
189
190
  out_filename: Path,
190
- image_filename: Path | None = None,
191
+ image_filename: Optional[Path] = None,
191
192
  invisible_text: bool = True,
192
193
  ) -> None:
193
194
  """Creates a PDF file with an image superimposed on top of the text.
@@ -291,7 +292,7 @@ class HocrTransform:
291
292
  def _do_line(
292
293
  self,
293
294
  canvas: Canvas,
294
- line: Element | None,
295
+ line: Optional[Element],
295
296
  elemclass: str,
296
297
  invisible_text: bool,
297
298
  text_direction: TextDirection,
@@ -387,8 +388,8 @@ class HocrTransform:
387
388
  line_matrix: Matrix,
388
389
  text: Text,
389
390
  fontsize: float,
390
- elem: Element | None,
391
- next_elem: Element | None,
391
+ elem: Optional[Element],
392
+ next_elem: Optional[Element],
392
393
  text_direction: TextDirection,
393
394
  inject_word_breaks: bool,
394
395
  ):
@@ -4,6 +4,8 @@ Module for exporting original PDF pages without modification.
4
4
 
5
5
  import logging
6
6
  import os
7
+ import io
8
+ import urllib.request
7
9
  from pathlib import Path
8
10
  from typing import TYPE_CHECKING, List, Set, Union
9
11
 
@@ -69,8 +71,11 @@ def create_original_pdf(
69
71
 
70
72
  # Verify all pages come from the same PDF and get path
71
73
  first_page_pdf_path = None
74
+ first_page_pdf_obj = None
72
75
  if hasattr(pages_to_extract[0], "pdf") and pages_to_extract[0].pdf:
73
- first_page_pdf_path = getattr(pages_to_extract[0].pdf, "path", None)
76
+ src_pdf = pages_to_extract[0].pdf
77
+ first_page_pdf_path = getattr(src_pdf, "path", None)
78
+ first_page_pdf_obj = src_pdf
74
79
 
75
80
  if not first_page_pdf_path:
76
81
  raise ValueError(
@@ -93,7 +98,28 @@ def create_original_pdf(
93
98
  )
94
99
 
95
100
  try:
96
- with pikepdf.Pdf.open(first_page_pdf_path) as source_pikepdf_doc:
101
+ # Prefer opening via filesystem path when it exists locally
102
+ if first_page_pdf_path and os.path.exists(first_page_pdf_path):
103
+ source_handle = pikepdf.Pdf.open(first_page_pdf_path)
104
+ else:
105
+ # Fallback: attempt to open from in-memory bytes stored on PDF object
106
+ if first_page_pdf_obj is not None and hasattr(first_page_pdf_obj, "_original_bytes") and first_page_pdf_obj._original_bytes:
107
+ source_handle = pikepdf.Pdf.open(io.BytesIO(first_page_pdf_obj._original_bytes))
108
+ else:
109
+ # Attempt to download bytes directly if path looks like URL
110
+ if isinstance(first_page_pdf_path, str) and first_page_pdf_path.startswith(("http://", "https://")):
111
+ try:
112
+ with urllib.request.urlopen(first_page_pdf_path) as resp:
113
+ data = resp.read()
114
+ source_handle = pikepdf.Pdf.open(io.BytesIO(data))
115
+ except Exception as dl_err:
116
+ raise FileNotFoundError(
117
+ f"Source PDF bytes not available and download failed for {first_page_pdf_path}: {dl_err}"
118
+ )
119
+ else:
120
+ raise FileNotFoundError(f"Source PDF bytes not available for {first_page_pdf_path}")
121
+
122
+ with source_handle as source_pikepdf_doc:
97
123
  target_pikepdf_doc = pikepdf.Pdf.new()
98
124
 
99
125
  for page_index in sorted_indices:
@@ -113,6 +139,9 @@ def create_original_pdf(
113
139
  f"Successfully saved original pages PDF ({len(target_pikepdf_doc.pages)} pages) to: {output_path_str}"
114
140
  )
115
141
 
142
+ except FileNotFoundError as e:
143
+ logger.error(str(e))
144
+ raise RuntimeError(f"Failed to save original pages PDF: {e}")
116
145
  except pikepdf.PasswordError:
117
146
  logger.error(f"Failed to open password-protected source PDF: {first_page_pdf_path}")
118
147
  raise RuntimeError(
@@ -27,7 +27,6 @@ class SuryaOCREngine(OCREngine):
27
27
  if not self.is_available():
28
28
  raise ImportError("Surya OCR library is not installed or available.")
29
29
 
30
- # Store languages for use in _process_single_image
31
30
  self._langs = languages
32
31
 
33
32
  from surya.detection import DetectionPredictor
@@ -63,7 +62,6 @@ class SuryaOCREngine(OCREngine):
63
62
  if not self._recognition_predictor or not self._detection_predictor:
64
63
  raise RuntimeError("Surya predictors are not initialized.")
65
64
 
66
- # Store languages instance variable during initialization to use here
67
65
  langs = (
68
66
  [self._langs] # Send all languages together in one list per image
69
67
  if hasattr(self, "_langs")
@@ -75,6 +73,7 @@ class SuryaOCREngine(OCREngine):
75
73
  results = self._detection_predictor(images=[image])
76
74
  else:
77
75
  results = self._recognition_predictor(
76
+ langs=langs,
78
77
  images=[image],
79
78
  det_predictor=self._detection_predictor,
80
79
  )
@@ -11,7 +11,8 @@ from PIL import Image
11
11
  from .engine import OCREngine
12
12
  from .engine_doctr import DoctrOCREngine
13
13
  from .engine_easyocr import EasyOCREngine
14
- from .engine_paddle import PaddleOCREngine
14
+ # Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level
15
+ # from .engine_paddle import PaddleOCREngine
15
16
  from .engine_surya import SuryaOCREngine
16
17
  from .ocr_options import (
17
18
  BaseOCROptions,
@@ -28,10 +29,16 @@ logger = logging.getLogger(__name__)
28
29
  class OCRManager:
29
30
  """Manages OCR engine selection, configuration, and execution."""
30
31
 
32
+ @staticmethod
33
+ def _get_paddle_engine_class():
34
+ """Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level."""
35
+ from .engine_paddle import PaddleOCREngine
36
+ return PaddleOCREngine
37
+
31
38
  # Registry mapping engine names to classes and default options
32
39
  ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
33
40
  "easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
34
- "paddle": {"class": PaddleOCREngine, "options_class": PaddleOCROptions},
41
+ "paddle": {"class": lambda: OCRManager._get_paddle_engine_class(), "options_class": PaddleOCROptions},
35
42
  "surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions},
36
43
  "doctr": {"class": DoctrOCREngine, "options_class": DoctrOCROptions},
37
44
  # Add other engines here
@@ -76,7 +83,12 @@ class OCRManager:
76
83
  logger.info(
77
84
  f"[{threading.current_thread().name}] Creating shared instance of engine: {engine_name}"
78
85
  )
79
- engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
86
+ engine_class_or_factory = self.ENGINE_REGISTRY[engine_name]["class"]
87
+ # Handle lazy loading - if it's a lambda function, call it to get the actual class
88
+ if callable(engine_class_or_factory) and getattr(engine_class_or_factory, '__name__', '') == '<lambda>':
89
+ engine_class = engine_class_or_factory()
90
+ else:
91
+ engine_class = engine_class_or_factory
80
92
  start_time = time.monotonic() # Optional: time initialization
81
93
  try:
82
94
  engine_instance = engine_class() # Instantiate first
@@ -277,7 +289,12 @@ class OCRManager:
277
289
  for name, registry_entry in self.ENGINE_REGISTRY.items():
278
290
  try:
279
291
  # Temporarily instantiate to check availability without caching
280
- engine_class = registry_entry["class"]
292
+ engine_class_or_factory = registry_entry["class"]
293
+ # Handle lazy loading - if it's a lambda function, call it to get the actual class
294
+ if callable(engine_class_or_factory) and getattr(engine_class_or_factory, '__name__', '') == '<lambda>':
295
+ engine_class = engine_class_or_factory()
296
+ else:
297
+ engine_class = engine_class_or_factory
281
298
  if engine_class().is_available():
282
299
  available.append(name)
283
300
  except Exception as e:
@@ -18,7 +18,8 @@ SEARCH_DEPENDENCIES_AVAILABLE = False
18
18
 
19
19
  try:
20
20
  import numpy as np
21
- import sentence_transformers
21
+ # Lazy import for sentence_transformers to avoid heavy loading at module level
22
+ # import sentence_transformers
22
23
 
23
24
  # Basic search dependencies are available
24
25
  SEARCH_DEPENDENCIES_AVAILABLE = True
@@ -46,12 +47,28 @@ except ImportError:
46
47
  logger = logging.getLogger(__name__)
47
48
 
48
49
 
50
+ def _check_sentence_transformers():
51
+ """Lazy check for sentence_transformers availability."""
52
+ try:
53
+ import sentence_transformers
54
+ return True
55
+ except ImportError:
56
+ return False
57
+
58
+
49
59
  def check_search_availability():
50
60
  """Check if required search dependencies are available."""
51
61
  if not SEARCH_DEPENDENCIES_AVAILABLE:
52
62
  raise ImportError(
53
- "Search functionality requires 'sentence-transformers' and NumPy. "
54
- "Install with: pip install natural-pdf[search] (or pip install sentence-transformers numpy)"
63
+ "Search functionality requires 'lancedb' and pyarrow. "
64
+ "Install with: pip install natural-pdf[search] (or pip install lancedb pyarrow)"
65
+ )
66
+
67
+ # Lazy check for sentence_transformers when actually needed
68
+ if not _check_sentence_transformers():
69
+ raise ImportError(
70
+ "Search functionality requires 'sentence-transformers'. "
71
+ "Install with: pip install sentence-transformers"
55
72
  )
56
73
 
57
74
 
@@ -1,12 +1,14 @@
1
1
  import logging
2
+ import os
2
3
  import shutil
3
4
  import tempfile
4
5
  from pathlib import Path
5
- from typing import Any, Dict, Iterable, List, Optional
6
+ from typing import Any, Dict, Iterable, List, Optional, Union
6
7
 
7
8
  import lancedb
8
9
  import pyarrow as pa
9
- from sentence_transformers import SentenceTransformer
10
+ # Lazy import for SentenceTransformer to avoid heavy loading at module level
11
+ # from sentence_transformers import SentenceTransformer
10
12
 
11
13
  from .search_options import BaseSearchOptions
12
14
  from .search_service_protocol import (
@@ -17,8 +19,14 @@ from .search_service_protocol import (
17
19
 
18
20
  logger = logging.getLogger(__name__)
19
21
 
20
- DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
21
- DEFAULT_LANCEDB_PERSIST_PATH = "./natural_pdf_lancedb_index"
22
+ DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
23
+ DEFAULT_LANCEDB_PERSIST_PATH = "./lancedb_data"
24
+
25
+
26
+ def _get_sentence_transformer(model_name: str):
27
+ """Lazy import and instantiation of SentenceTransformer."""
28
+ from sentence_transformers import SentenceTransformer
29
+ return SentenceTransformer(model_name)
22
30
 
23
31
 
24
32
  class LanceDBSearchService(SearchServiceProtocol):
@@ -41,7 +49,7 @@ class LanceDBSearchService(SearchServiceProtocol):
41
49
  self._db = None
42
50
  self._table = None
43
51
 
44
- self.embedding_model = SentenceTransformer(self._embedding_model_name)
52
+ self.embedding_model = _get_sentence_transformer(self._embedding_model_name)
45
53
  test_embedding = self.embedding_model.encode("test")
46
54
  self._embedding_dims = len(test_embedding)
47
55
 
@@ -1,21 +1,31 @@
1
1
  import json
2
2
  import logging
3
+ import os
4
+ import tempfile
3
5
  from pathlib import Path
4
6
  from typing import Any, Dict, Iterable, List, Optional, Union
5
7
 
6
8
  import numpy as np
7
- from sentence_transformers import SentenceTransformer
9
+ # Lazy import for SentenceTransformer to avoid heavy loading at module level
10
+ # from sentence_transformers import SentenceTransformer
8
11
 
9
12
  from .search_options import BaseSearchOptions
10
13
  from .search_service_protocol import (
11
14
  Indexable,
12
15
  IndexConfigurationError,
16
+ SearchResult,
13
17
  SearchServiceProtocol,
14
18
  )
15
19
 
16
20
  logger = logging.getLogger(__name__)
17
21
 
18
- DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
22
+ DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
23
+
24
+
25
+ def _get_sentence_transformer(model_name: str):
26
+ """Lazy import and instantiation of SentenceTransformer."""
27
+ from sentence_transformers import SentenceTransformer
28
+ return SentenceTransformer(model_name)
19
29
 
20
30
 
21
31
  class NumpySearchService(SearchServiceProtocol):
@@ -38,7 +48,7 @@ class NumpySearchService(SearchServiceProtocol):
38
48
 
39
49
  self.collection_name = collection_name
40
50
  self._embedding_model_name = embedding_model_name
41
- self.embedding_model = SentenceTransformer(self._embedding_model_name)
51
+ self.embedding_model = _get_sentence_transformer(self._embedding_model_name)
42
52
  self._embedding_dims = len(self.embedding_model.encode("test"))
43
53
 
44
54
  # Simple in-memory storage
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.17
3
+ Version: 0.1.19
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
11
11
  Requires-Python: >=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
+ Requires-Dist: pandas
14
15
  Requires-Dist: pdfplumber
15
16
  Requires-Dist: colormath2
16
17
  Requires-Dist: pillow
@@ -20,14 +21,15 @@ Requires-Dist: urllib3
20
21
  Requires-Dist: tqdm
21
22
  Requires-Dist: pydantic
22
23
  Requires-Dist: jenkspy
23
- Requires-Dist: pikepdf>=9.7.0
24
+ Requires-Dist: pikepdf
24
25
  Requires-Dist: scipy
25
26
  Requires-Dist: torch
26
27
  Requires-Dist: torchvision
27
- Requires-Dist: transformers[sentencepiece]<=4.34.1
28
+ Requires-Dist: transformers[sentencepiece]
28
29
  Requires-Dist: huggingface_hub>=0.29.3
29
30
  Requires-Dist: sentence-transformers
30
31
  Requires-Dist: timm
32
+ Requires-Dist: ipywidgets>=7.0.0
31
33
  Provides-Extra: test
32
34
  Requires-Dist: pytest; extra == "test"
33
35
  Requires-Dist: pytest-xdist; extra == "test"
@@ -38,9 +40,7 @@ Requires-Dist: pyarrow; extra == "search"
38
40
  Provides-Extra: favorites
39
41
  Requires-Dist: natural-pdf[deskew]; extra == "favorites"
40
42
  Requires-Dist: natural-pdf[ocr-export]; extra == "favorites"
41
- Requires-Dist: natural-pdf[search]; extra == "favorites"
42
- Requires-Dist: ipywidgets; extra == "favorites"
43
- Requires-Dist: surya-ocr; extra == "favorites"
43
+ Requires-Dist: natural-pdf[paddle]; extra == "favorites"
44
44
  Provides-Extra: dev
45
45
  Requires-Dist: black; extra == "dev"
46
46
  Requires-Dist: isort; extra == "dev"
@@ -61,23 +61,22 @@ Requires-Dist: setuptools; extra == "dev"
61
61
  Provides-Extra: deskew
62
62
  Requires-Dist: deskew>=1.5; extra == "deskew"
63
63
  Requires-Dist: img2pdf; extra == "deskew"
64
- Provides-Extra: addons
65
- Requires-Dist: surya-ocr; extra == "addons"
66
- Requires-Dist: doclayout_yolo; extra == "addons"
67
- Requires-Dist: paddlepaddle>=3.0.0; extra == "addons"
68
- Requires-Dist: paddleocr>=3.0.0; extra == "addons"
69
- Requires-Dist: ipywidgets>=7.0.0; extra == "addons"
70
- Requires-Dist: easyocr; extra == "addons"
71
- Requires-Dist: surya-ocr; extra == "addons"
72
- Requires-Dist: doclayout_yolo; extra == "addons"
73
- Requires-Dist: python-doctr[torch]; extra == "addons"
74
- Requires-Dist: docling; extra == "addons"
75
64
  Provides-Extra: all
76
65
  Requires-Dist: natural-pdf[ocr-export]; extra == "all"
77
66
  Requires-Dist: natural-pdf[deskew]; extra == "all"
78
67
  Requires-Dist: natural-pdf[test]; extra == "all"
79
68
  Requires-Dist: natural-pdf[search]; extra == "all"
80
- Requires-Dist: natural-pdf[addons]; extra == "all"
69
+ Requires-Dist: natural-pdf[extras]; extra == "all"
70
+ Requires-Dist: natural-pdf[favorites]; extra == "all"
71
+ Provides-Extra: paddle
72
+ Requires-Dist: paddlepaddle>=3.0.0; extra == "paddle"
73
+ Requires-Dist: paddleocr>=3.0.1; extra == "paddle"
74
+ Requires-Dist: paddlex>=3.0.1; extra == "paddle"
75
+ Provides-Extra: extras
76
+ Requires-Dist: surya-ocr>=0.13.0; extra == "extras"
77
+ Requires-Dist: doclayout_yolo; extra == "extras"
78
+ Requires-Dist: easyocr; extra == "extras"
79
+ Requires-Dist: natural-pdf[paddle]; extra == "extras"
81
80
  Provides-Extra: ocr-export
82
81
  Requires-Dist: pikepdf; extra == "ocr-export"
83
82
  Provides-Extra: export-extras
@@ -17,7 +17,7 @@ natural_pdf/analyzers/layout/surya.py,sha256=4RdnhRxSS3i3Ns5mFhOA9-P0xd7Ms19uZuK
17
17
  natural_pdf/analyzers/layout/table_structure_utils.py,sha256=nISZDBd46RPYkFHxbQyIHwg9WweG4DslpoYJ31OMJYA,2768
18
18
  natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
19
19
  natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
20
- natural_pdf/classification/manager.py,sha256=-rdZzGP_JK4RDDxIEgdY8_gHRNS0cNHhpOSodjxbd84,17853
20
+ natural_pdf/classification/manager.py,sha256=pzuTP-34W9N3im1ZFhCfQpOu37VSHEx4JHoHNxyy6o0,18894
21
21
  natural_pdf/classification/mixin.py,sha256=_XtoqCMqj1nxZYskIV2RbVYiVVcEWzFwae4s5vpzC74,6566
22
22
  natural_pdf/classification/results.py,sha256=El1dY7cBQVOB5lP-uj52dWgH6Y7TeQgJOVcZD-OLjes,2778
23
23
  natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
@@ -26,7 +26,7 @@ natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,
26
26
  natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
27
27
  natural_pdf/core/highlighting_service.py,sha256=_kQUS6_BBvsLBuSZloFrVag6jN90KzHa0ULyGBjufSs,36955
28
28
  natural_pdf/core/page.py,sha256=i3DriIQwoO4RuSrkrCXv44Dz8OL9KXPa2y4GhsD1y18,118324
29
- natural_pdf/core/pdf.py,sha256=bAoGPiKIrFaebLwULMT-9VkHQ_wkE_zNl4hlbMLk-2w,69325
29
+ natural_pdf/core/pdf.py,sha256=yBvb1iGw9gwVPJ3Rm1EBaZ8_g60TuW_Elhg2EOcJMzc,73871
30
30
  natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
31
31
  natural_pdf/describe/base.py,sha256=7USCFIl4mI5b15LTVkwvhAn_mngMwhwxCnVYaZz5Vdc,16842
32
32
  natural_pdf/describe/elements.py,sha256=BOkz2wDhGh6P8NOm6pSNxitgmVokLTISztaFhrxMcdw,12717
@@ -40,11 +40,11 @@ natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,
40
40
  natural_pdf/elements/region.py,sha256=hBklYKcXJWyxayu9todYQOZ-d9KVDtqeV-CIt9IcSn8,123400
41
41
  natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
42
42
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
43
- natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
43
+ natural_pdf/exporters/__init__.py,sha256=XG0ckcKHgG7IVma75syORUme6wEItUvDA46aCZzGqrU,639
44
44
  natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
45
- natural_pdf/exporters/hocr.py,sha256=MOb5sTxe-GlMSOtmqp3p4SY_ZigwOtmd4sj_zMRCIQY,19907
45
+ natural_pdf/exporters/hocr.py,sha256=wksvJvWLSxuAfhYzg_0T2_W8eqDoMgAVC-gwZ9FoO_k,19969
46
46
  natural_pdf/exporters/hocr_font.py,sha256=1wsGOMj6zoaRN2rxCwrv4MMLGawpNz984WgXpmWekgw,4574
47
- natural_pdf/exporters/original_pdf.py,sha256=zsZPg_lUoEerKIzzoEw-qGdM5XBg_LZhFJeVKnCUp4o,5054
47
+ natural_pdf/exporters/original_pdf.py,sha256=dtvC4er6TWOfqq-n24Pejw3mlAuPd8IVyihggJtcf0s,6634
48
48
  natural_pdf/exporters/paddleocr.py,sha256=IAG2p9YeImYcsIvb6a_L5mMrKarvaMaDvRrvdlY6bX4,19489
49
49
  natural_pdf/exporters/searchable_pdf.py,sha256=G2Tc4tpDXSYIufXJlkA8ppW_3DuzHAaweYKae33pI_c,16290
50
50
  natural_pdf/exporters/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -63,16 +63,16 @@ natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,874
63
63
  natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
64
64
  natural_pdf/ocr/engine_easyocr.py,sha256=bWz6kHUgAJfe3rqdnZBAF-IPvw3B35DlvX5KDdFUtzo,9888
65
65
  natural_pdf/ocr/engine_paddle.py,sha256=ZUtyjso_UjjAPnJt5ac-AtOpR6PfOhO76iOyjngGzr0,16198
66
- natural_pdf/ocr/engine_surya.py,sha256=Qc3geQQzJ1-9WS1aho38jfvd7yxbYOUVeIpzpapHLRg,5159
66
+ natural_pdf/ocr/engine_surya.py,sha256=PNjvpsHnBghAoa-df52HEyvXzfNI-gTFgKvs2LxHgKo,5051
67
67
  natural_pdf/ocr/ocr_factory.py,sha256=gBFXdFs7E4aCynHz06sQsAhaO3s8yhgoFgN5nyxtg9c,5221
68
- natural_pdf/ocr/ocr_manager.py,sha256=O-wSx50k9pcf0M8N_5nKVefS55r6tMJWRF8KjktA8ts,13664
68
+ natural_pdf/ocr/ocr_manager.py,sha256=M1GRAThzWl5iMkQJ41j84G6cJ7XruQD_HoPPzWf7nUk,14742
69
69
  natural_pdf/ocr/ocr_options.py,sha256=l33QKu_93r-uwi3t_v8UH8pEgHo6HTVzP4tfmQFRF1w,5488
70
70
  natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
71
71
  natural_pdf/qa/__init__.py,sha256=Pjo62JTnUNEjGNsC437mvsS5KQ5m7X_BibGvavR9AW0,108
72
72
  natural_pdf/qa/document_qa.py,sha256=Jw4yyq3Vifn57D0ANmOfUlZeG8CJjBkItZBV-8ZAmos,15111
73
- natural_pdf/search/__init__.py,sha256=RHP1E-5m3hhLXz__g7EvZihBJjPTDtUYh_bZr_NwDo0,3724
74
- natural_pdf/search/lancedb_search_service.py,sha256=kgm-nYXjPQBkEkWE0gkdpL4V53xm_CEX4rZ5KBpxgfM,14190
75
- natural_pdf/search/numpy_search_service.py,sha256=5zkkZds-Dcp8PsrvTJdyW15fS1ffHDLVjeiXTGWoRsY,10006
73
+ natural_pdf/search/__init__.py,sha256=0Xa7tT_2q57wHObFMQLQLd4gd9AV0oyS-svV6BmmdMI,4276
74
+ natural_pdf/search/lancedb_search_service.py,sha256=6dz2IEZUWk3hFW28C-LF_85pWohd7Sr5k44bM0pBdm4,14472
75
+ natural_pdf/search/numpy_search_service.py,sha256=MoPBlyHTDqah1IrwBzyglEyiXlF4wqaU_5mml_ngvGc,10328
76
76
  natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzPkK0a8QA,3566
77
77
  natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
78
78
  natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
@@ -90,8 +90,8 @@ natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9Y
90
90
  natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
91
91
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
92
92
  natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
93
- natural_pdf-0.1.17.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
94
- natural_pdf-0.1.17.dist-info/METADATA,sha256=yGeusUaYx_R_aRl0lUnAHVfBav9Zw43MXDYcB3b6BcA,6753
95
- natural_pdf-0.1.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
96
- natural_pdf-0.1.17.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
97
- natural_pdf-0.1.17.dist-info/RECORD,,
93
+ natural_pdf-0.1.19.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
94
+ natural_pdf-0.1.19.dist-info/METADATA,sha256=brYXFREotSwixV1gbp19_SN7otJ8QBzAcQ1dssI_73g,6645
95
+ natural_pdf-0.1.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
96
+ natural_pdf-0.1.19.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
97
+ natural_pdf-0.1.19.dist-info/RECORD,,