natural_pdf-0.1.3-py3-none-any.whl → natural_pdf-0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/collections/pdf_collection.py CHANGED
@@ -215,8 +215,8 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
215
215
  """Returns the list of PDF objects held by the collection."""
216
216
  return self._pdfs
217
217
 
218
- # --- Other Methods (e.g., apply_ocr_to_pages - could leverage service in future?) ---
219
- def apply_ocr_to_pages(self, *args, **kwargs):
218
+ # --- Other Methods (e.g., apply_ocr - could leverage service in future?) ---
219
+ def apply_ocr(self, *args, **kwargs):
220
220
  PDF = self._get_pdf_class()
221
221
  # Delegate to individual PDF objects
222
222
  logger.info("Applying OCR to relevant PDFs in collection...")
@@ -225,8 +225,8 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
225
225
  # We need to figure out which pages belong to which PDF if batching here
226
226
  # For now, simpler to call on each PDF
227
227
  try:
228
- # Assume apply_ocr_to_pages exists on PDF and accepts similar args
229
- pdf.apply_ocr_to_pages(*args, **kwargs)
228
+ # Assume apply_ocr exists on PDF and accepts similar args
229
+ pdf.apply_ocr(*args, **kwargs)
230
230
  except Exception as e:
231
231
  logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
232
232
  return self
natural_pdf/core/page.py CHANGED
@@ -1074,19 +1074,19 @@ class Page:
1074
1074
  device: Optional[str] = None,
1075
1075
  ) -> List[TextElement]:
1076
1076
  """
1077
- Apply OCR to THIS page and add results to page elements via PDF.apply_ocr_to_pages.
1077
+ Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
1078
1078
 
1079
1079
  Returns:
1080
1080
  List of created TextElements derived from OCR results for this page.
1081
1081
  """
1082
- if not hasattr(self._parent, 'apply_ocr_to_pages'):
1083
- logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr_to_pages'. Cannot apply OCR.")
1082
+ if not hasattr(self._parent, 'apply_ocr'):
1083
+ logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
1084
1084
  return []
1085
1085
 
1086
- logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr_to_pages.")
1086
+ logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
1087
1087
  try:
1088
1088
  # Delegate to parent PDF, targeting only this page's index
1089
- self._parent.apply_ocr_to_pages(
1089
+ self._parent.apply_ocr(
1090
1090
  pages=[self.index],
1091
1091
  engine=engine, options=options, languages=languages,
1092
1092
  min_confidence=min_confidence, device=device
natural_pdf/core/pdf.py CHANGED
@@ -198,7 +198,7 @@ class PDF:
198
198
 
199
199
  return self
200
200
 
201
- def apply_ocr_to_pages(
201
+ def apply_ocr(
202
202
  self,
203
203
  pages: Optional[Union[Iterable[int], range, slice]] = None,
204
204
  engine: Optional[str] = None,
@@ -598,7 +598,7 @@ class PDF:
598
598
  Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
599
599
 
600
600
  Note: OCR must have been applied to the pages beforehand
601
- (e.g., using pdf.apply_ocr_to_pages()).
601
+ (e.g., using pdf.apply_ocr()).
602
602
 
603
603
  Args:
604
604
  output_path: Path to save the searchable PDF.
@@ -614,7 +614,7 @@ class PDF:
614
614
  # For now, we pass through and let the exporter handle pages without OCR elements.
615
615
  # if not any(page.get_elements(source='ocr') for page in self.pages):
616
616
  # logger.warning("No OCR elements found on pages. "
617
- # "Ensure apply_ocr_to_pages() was called. "
617
+ # "Ensure apply_ocr() was called. "
618
618
  # "Output PDF might not be searchable.")
619
619
 
620
620
  # Convert pathlib.Path to string if necessary
natural_pdf/elements/collections.py CHANGED
@@ -1009,8 +1009,7 @@ class PageCollection(Generic[P]):
1009
1009
  """
1010
1010
  Applies OCR to all pages within this collection using batch processing.
1011
1011
 
1012
- This delegates the work to the parent PDF object's `apply_ocr_to_pages`
1013
- method for efficiency. The OCR results (TextElements) are added directly
1012
+ This delegates the work to the parent PDF object's `apply_ocr` method for efficiency. The OCR results (TextElements) are added directly
1014
1013
  to the respective Page objects within this collection.
1015
1014
 
1016
1015
  Args:
@@ -1028,8 +1027,8 @@ class PageCollection(Generic[P]):
1028
1027
  Raises:
1029
1028
  RuntimeError: If pages in the collection lack a parent PDF object
1030
1029
  or if the parent PDF object lacks the required
1031
- `apply_ocr_to_pages` method.
1032
- (Propagates exceptions from PDF.apply_ocr_to_pages)
1030
+ `apply_ocr` method.
1031
+ (Propagates exceptions from PDF.apply_ocr)
1033
1032
  """
1034
1033
  if not self.pages:
1035
1034
  logger.warning("Cannot apply OCR to an empty PageCollection.")
@@ -1042,16 +1041,17 @@ class PageCollection(Generic[P]):
1042
1041
 
1043
1042
  parent_pdf = first_page._parent
1044
1043
 
1045
- if not hasattr(parent_pdf, 'apply_ocr_to_pages') or not callable(parent_pdf.apply_ocr_to_pages):
1046
- raise RuntimeError("Parent PDF object does not have the required 'apply_ocr_to_pages' method.")
1044
+ # Updated check for renamed method
1045
+ if not hasattr(parent_pdf, 'apply_ocr') or not callable(parent_pdf.apply_ocr):
1046
+ raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
1047
1047
 
1048
1048
  # Get the 0-based indices of the pages in this collection
1049
1049
  page_indices = [p.index for p in self.pages]
1050
1050
 
1051
1051
  logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
1052
1052
 
1053
- # Delegate the batch call to the parent PDF object
1054
- parent_pdf.apply_ocr_to_pages(
1053
+ # Delegate the batch call to the parent PDF object (using renamed method)
1054
+ parent_pdf.apply_ocr(
1055
1055
  pages=page_indices,
1056
1056
  engine=engine,
1057
1057
  options=options,
natural_pdf-0.1.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -8,24 +8,24 @@ Project-URL: Homepage, https://github.com/jsoma/natural-pdf
8
8
  Project-URL: Repository, https://github.com/jsoma/natural-pdf
9
9
  Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.7
11
+ Requires-Python: >=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
- Requires-Dist: pdfplumber>=0.7.0
15
- Requires-Dist: Pillow>=8.0.0
16
- Requires-Dist: colour>=0.1.5
17
- Requires-Dist: numpy>=1.20.0
18
- Requires-Dist: urllib3>=1.26.0
19
- Requires-Dist: torch>=2.0.0
20
- Requires-Dist: torchvision>=0.15.0
21
- Requires-Dist: transformers>=4.30.0
22
- Requires-Dist: huggingface_hub>=0.19.0
23
- Requires-Dist: ocrmypdf>=16.0.0
24
- Requires-Dist: pikepdf>=10.0.0
14
+ Requires-Dist: pdfplumber
15
+ Requires-Dist: Pillow
16
+ Requires-Dist: colour
17
+ Requires-Dist: numpy
18
+ Requires-Dist: urllib3
19
+ Requires-Dist: torch
20
+ Requires-Dist: torchvision
21
+ Requires-Dist: transformers
22
+ Requires-Dist: huggingface_hub
23
+ Requires-Dist: ocrmypdf
24
+ Requires-Dist: pikepdf
25
25
  Provides-Extra: interactive
26
26
  Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
27
27
  Provides-Extra: haystack
28
- Requires-Dist: haystack-ai>=2.0.0b5; extra == "haystack"
28
+ Requires-Dist: haystack-ai; extra == "haystack"
29
29
  Requires-Dist: chroma-haystack; extra == "haystack"
30
30
  Requires-Dist: sentence-transformers; extra == "haystack"
31
31
  Provides-Extra: easyocr
@@ -45,7 +45,7 @@ Requires-Dist: paddlepaddle; extra == "all"
45
45
  Requires-Dist: paddleocr; extra == "all"
46
46
  Requires-Dist: doclayout_yolo; extra == "all"
47
47
  Requires-Dist: surya-ocr; extra == "all"
48
- Requires-Dist: haystack-ai>=2.0.0b5; extra == "all"
48
+ Requires-Dist: haystack-ai; extra == "all"
49
49
  Requires-Dist: chroma-haystack; extra == "all"
50
50
  Requires-Dist: sentence-transformers; extra == "all"
51
51
  Dynamic: license-file
natural_pdf-0.1.4.dist-info/RECORD CHANGED
@@ -13,15 +13,15 @@ natural_pdf/analyzers/layout/paddle.py,sha256=QCasH_Z9UITX6wRGlE_HjmwkBuANz9Yyw5
13
13
  natural_pdf/analyzers/layout/surya.py,sha256=Ibwo42TioJ-BZP3-2T13KCtH3kLSWQh7C9ZYuk1kUQo,12657
14
14
  natural_pdf/analyzers/layout/tatr.py,sha256=H0Xygk9jA46-vlPleoal94cuDyz-LHTSxVb3e6gpmV8,11956
15
15
  natural_pdf/analyzers/layout/yolo.py,sha256=NSQK3TcS1qN8D2MDxCvcwTpS_kvzGy3I2LepJDUceoQ,7699
16
- natural_pdf/collections/pdf_collection.py,sha256=Da8saWBTguxk16pNzMxCrFwatrWk_qrcG0RVPQybro8,12159
16
+ natural_pdf/collections/pdf_collection.py,sha256=LLtixKaKRzPRfZNdDQQ7HY3wyWbBcefPYvf_4Ke-FLw,12123
17
17
  natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
18
18
  natural_pdf/core/element_manager.py,sha256=H1896JSt48ASLSmG22xEXMY-xSKcpYsUlYmYMD48i6Q,17117
19
19
  natural_pdf/core/highlighting_service.py,sha256=a-40UMohOglYrw4klW1GuQ_p3jZOxnAfPOXPORThr4U,31476
20
- natural_pdf/core/page.py,sha256=qhumZqmwHoBlGodiCvYE0z34Iu1WSs32V4_Iz_Sfaow,69350
21
- natural_pdf/core/pdf.py,sha256=MLN-asJ_d5spmCjLz7SDp74t__vioszfKEFooBul7nU,41167
20
+ natural_pdf/core/page.py,sha256=7LSqJbGHhpKQliAdcy7aRQzkr8sO9jUP68bzy7uH54U,69305
21
+ natural_pdf/core/pdf.py,sha256=ALCO7YB_oaMtGZpS6JHJglrIIDbUd63sSso0oNAAP9k,41140
22
22
  natural_pdf/elements/__init__.py,sha256=6FGHZm2oONd8zErahMEawuB4AvJR5jOZPt4KtEwbj80,40
23
23
  natural_pdf/elements/base.py,sha256=9SQ-O2qbQe9Avbf9JI-p6vWlyThZVch-p1yqXWSrBHw,35750
24
- natural_pdf/elements/collections.py,sha256=RJf4cBZeLfCtfS0-SjzYFRCtbzYjWsgk3LrcTwJAYMs,62392
24
+ natural_pdf/elements/collections.py,sha256=G6H-6VtCWq_KW-A0y9XhyHLOIWxz-1vHByfC6dq8lmU,62387
25
25
  natural_pdf/elements/line.py,sha256=QvVdhf_K6rwJkq3q67JmgdZpDhrBgWuSMF-Q25malP4,4783
26
26
  natural_pdf/elements/rect.py,sha256=dls9g-R213O78HvfAJMak3_eV14Zh654Zw7hqTTXxDQ,3949
27
27
  natural_pdf/elements/region.py,sha256=5dXHYbbdO1QNgkD6b6I34ezHt-SHKx_aH1ubzbfMHQs,74370
@@ -54,8 +54,8 @@ natural_pdf/utils/visualization.py,sha256=14BM-K4ovDqHniNbxbP_y9KaEYNlkbpELGAv9_
54
54
  natural_pdf/widgets/__init__.py,sha256=qckw3DjdVTsASPLJ8uUrGKg3MFhvzHndUpeNGlqwg6A,215
55
55
  natural_pdf/widgets/viewer.py,sha256=h_amj_uvf-vRqEsFg4P00fgKxawLAd9jjC1ohUza4BY,37479
56
56
  natural_pdf/widgets/frontend/viewer.js,sha256=w8ywfz_IOAAv2nP_qaf2VBUkF1KhjT3zorhJxM1-CfU,4371
57
- natural_pdf-0.1.3.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
58
- natural_pdf-0.1.3.dist-info/METADATA,sha256=kBSb1SueOGQFw97pvHBxlJYcuNwxAB-lInLKows0BEs,5069
59
- natural_pdf-0.1.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
60
- natural_pdf-0.1.3.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
61
- natural_pdf-0.1.3.dist-info/RECORD,,
57
+ natural_pdf-0.1.4.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
58
+ natural_pdf-0.1.4.dist-info/METADATA,sha256=Qbj7uNu_w5OfHexqGGFEi1VQCELaidq670nHDArAtqE,4967
59
+ natural_pdf-0.1.4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
60
+ natural_pdf-0.1.4.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
61
+ natural_pdf-0.1.4.dist-info/RECORD,,