natural-pdf 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/collections/pdf_collection.py +4 -4
- natural_pdf/core/page.py +5 -5
- natural_pdf/core/pdf.py +3 -3
- natural_pdf/elements/collections.py +8 -8
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.4.dist-info}/METADATA +15 -15
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.4.dist-info}/RECORD +9 -9
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.4.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.4.dist-info}/top_level.txt +0 -0
@@ -215,8 +215,8 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
|
|
215
215
|
"""Returns the list of PDF objects held by the collection."""
|
216
216
|
return self._pdfs
|
217
217
|
|
218
|
-
# --- Other Methods (e.g.,
|
219
|
-
def
|
218
|
+
# --- Other Methods (e.g., apply_ocr - could leverage service in future?) ---
|
219
|
+
def apply_ocr(self, *args, **kwargs):
|
220
220
|
PDF = self._get_pdf_class()
|
221
221
|
# Delegate to individual PDF objects
|
222
222
|
logger.info("Applying OCR to relevant PDFs in collection...")
|
@@ -225,8 +225,8 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
|
|
225
225
|
# We need to figure out which pages belong to which PDF if batching here
|
226
226
|
# For now, simpler to call on each PDF
|
227
227
|
try:
|
228
|
-
# Assume
|
229
|
-
pdf.
|
228
|
+
# Assume apply_ocr exists on PDF and accepts similar args
|
229
|
+
pdf.apply_ocr(*args, **kwargs)
|
230
230
|
except Exception as e:
|
231
231
|
logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
|
232
232
|
return self
|
natural_pdf/core/page.py
CHANGED
@@ -1074,19 +1074,19 @@ class Page:
|
|
1074
1074
|
device: Optional[str] = None,
|
1075
1075
|
) -> List[TextElement]:
|
1076
1076
|
"""
|
1077
|
-
Apply OCR to THIS page and add results to page elements via PDF.
|
1077
|
+
Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
|
1078
1078
|
|
1079
1079
|
Returns:
|
1080
1080
|
List of created TextElements derived from OCR results for this page.
|
1081
1081
|
"""
|
1082
|
-
if not hasattr(self._parent, '
|
1083
|
-
logger.error(f"Page {self.number}: Parent PDF missing '
|
1082
|
+
if not hasattr(self._parent, 'apply_ocr'):
|
1083
|
+
logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
|
1084
1084
|
return []
|
1085
1085
|
|
1086
|
-
logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.
|
1086
|
+
logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
|
1087
1087
|
try:
|
1088
1088
|
# Delegate to parent PDF, targeting only this page's index
|
1089
|
-
self._parent.
|
1089
|
+
self._parent.apply_ocr(
|
1090
1090
|
pages=[self.index],
|
1091
1091
|
engine=engine, options=options, languages=languages,
|
1092
1092
|
min_confidence=min_confidence, device=device
|
natural_pdf/core/pdf.py
CHANGED
@@ -198,7 +198,7 @@ class PDF:
|
|
198
198
|
|
199
199
|
return self
|
200
200
|
|
201
|
-
def
|
201
|
+
def apply_ocr(
|
202
202
|
self,
|
203
203
|
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
204
204
|
engine: Optional[str] = None,
|
@@ -598,7 +598,7 @@ class PDF:
|
|
598
598
|
Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
|
599
599
|
|
600
600
|
Note: OCR must have been applied to the pages beforehand
|
601
|
-
(e.g., using pdf.
|
601
|
+
(e.g., using pdf.apply_ocr()).
|
602
602
|
|
603
603
|
Args:
|
604
604
|
output_path: Path to save the searchable PDF.
|
@@ -614,7 +614,7 @@ class PDF:
|
|
614
614
|
# For now, we pass through and let the exporter handle pages without OCR elements.
|
615
615
|
# if not any(page.get_elements(source='ocr') for page in self.pages):
|
616
616
|
# logger.warning("No OCR elements found on pages. "
|
617
|
-
# "Ensure
|
617
|
+
# "Ensure apply_ocr() was called. "
|
618
618
|
# "Output PDF might not be searchable.")
|
619
619
|
|
620
620
|
# Convert pathlib.Path to string if necessary
|
@@ -1009,8 +1009,7 @@ class PageCollection(Generic[P]):
|
|
1009
1009
|
"""
|
1010
1010
|
Applies OCR to all pages within this collection using batch processing.
|
1011
1011
|
|
1012
|
-
This delegates the work to the parent PDF object's `
|
1013
|
-
method for efficiency. The OCR results (TextElements) are added directly
|
1012
|
+
This delegates the work to the parent PDF object's `apply_ocr` method for efficiency. The OCR results (TextElements) are added directly
|
1014
1013
|
to the respective Page objects within this collection.
|
1015
1014
|
|
1016
1015
|
Args:
|
@@ -1028,8 +1027,8 @@ class PageCollection(Generic[P]):
|
|
1028
1027
|
Raises:
|
1029
1028
|
RuntimeError: If pages in the collection lack a parent PDF object
|
1030
1029
|
or if the parent PDF object lacks the required
|
1031
|
-
`
|
1032
|
-
(Propagates exceptions from PDF.
|
1030
|
+
`apply_ocr` method.
|
1031
|
+
(Propagates exceptions from PDF.apply_ocr)
|
1033
1032
|
"""
|
1034
1033
|
if not self.pages:
|
1035
1034
|
logger.warning("Cannot apply OCR to an empty PageCollection.")
|
@@ -1042,16 +1041,17 @@ class PageCollection(Generic[P]):
|
|
1042
1041
|
|
1043
1042
|
parent_pdf = first_page._parent
|
1044
1043
|
|
1045
|
-
|
1046
|
-
|
1044
|
+
# Updated check for renamed method
|
1045
|
+
if not hasattr(parent_pdf, 'apply_ocr') or not callable(parent_pdf.apply_ocr):
|
1046
|
+
raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
|
1047
1047
|
|
1048
1048
|
# Get the 0-based indices of the pages in this collection
|
1049
1049
|
page_indices = [p.index for p in self.pages]
|
1050
1050
|
|
1051
1051
|
logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
|
1052
1052
|
|
1053
|
-
# Delegate the batch call to the parent PDF object
|
1054
|
-
parent_pdf.
|
1053
|
+
# Delegate the batch call to the parent PDF object (using renamed method)
|
1054
|
+
parent_pdf.apply_ocr(
|
1055
1055
|
pages=page_indices,
|
1056
1056
|
engine=engine,
|
1057
1057
|
options=options,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.4
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -8,24 +8,24 @@ Project-URL: Homepage, https://github.com/jsoma/natural-pdf
|
|
8
8
|
Project-URL: Repository, https://github.com/jsoma/natural-pdf
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
10
10
|
Classifier: Operating System :: OS Independent
|
11
|
-
Requires-Python: >=3.
|
11
|
+
Requires-Python: >=3.9
|
12
12
|
Description-Content-Type: text/markdown
|
13
13
|
License-File: LICENSE
|
14
|
-
Requires-Dist: pdfplumber
|
15
|
-
Requires-Dist: Pillow
|
16
|
-
Requires-Dist: colour
|
17
|
-
Requires-Dist: numpy
|
18
|
-
Requires-Dist: urllib3
|
19
|
-
Requires-Dist: torch
|
20
|
-
Requires-Dist: torchvision
|
21
|
-
Requires-Dist: transformers
|
22
|
-
Requires-Dist: huggingface_hub
|
23
|
-
Requires-Dist: ocrmypdf
|
24
|
-
Requires-Dist: pikepdf
|
14
|
+
Requires-Dist: pdfplumber
|
15
|
+
Requires-Dist: Pillow
|
16
|
+
Requires-Dist: colour
|
17
|
+
Requires-Dist: numpy
|
18
|
+
Requires-Dist: urllib3
|
19
|
+
Requires-Dist: torch
|
20
|
+
Requires-Dist: torchvision
|
21
|
+
Requires-Dist: transformers
|
22
|
+
Requires-Dist: huggingface_hub
|
23
|
+
Requires-Dist: ocrmypdf
|
24
|
+
Requires-Dist: pikepdf
|
25
25
|
Provides-Extra: interactive
|
26
26
|
Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
|
27
27
|
Provides-Extra: haystack
|
28
|
-
Requires-Dist: haystack-ai
|
28
|
+
Requires-Dist: haystack-ai; extra == "haystack"
|
29
29
|
Requires-Dist: chroma-haystack; extra == "haystack"
|
30
30
|
Requires-Dist: sentence-transformers; extra == "haystack"
|
31
31
|
Provides-Extra: easyocr
|
@@ -45,7 +45,7 @@ Requires-Dist: paddlepaddle; extra == "all"
|
|
45
45
|
Requires-Dist: paddleocr; extra == "all"
|
46
46
|
Requires-Dist: doclayout_yolo; extra == "all"
|
47
47
|
Requires-Dist: surya-ocr; extra == "all"
|
48
|
-
Requires-Dist: haystack-ai
|
48
|
+
Requires-Dist: haystack-ai; extra == "all"
|
49
49
|
Requires-Dist: chroma-haystack; extra == "all"
|
50
50
|
Requires-Dist: sentence-transformers; extra == "all"
|
51
51
|
Dynamic: license-file
|
@@ -13,15 +13,15 @@ natural_pdf/analyzers/layout/paddle.py,sha256=QCasH_Z9UITX6wRGlE_HjmwkBuANz9Yyw5
|
|
13
13
|
natural_pdf/analyzers/layout/surya.py,sha256=Ibwo42TioJ-BZP3-2T13KCtH3kLSWQh7C9ZYuk1kUQo,12657
|
14
14
|
natural_pdf/analyzers/layout/tatr.py,sha256=H0Xygk9jA46-vlPleoal94cuDyz-LHTSxVb3e6gpmV8,11956
|
15
15
|
natural_pdf/analyzers/layout/yolo.py,sha256=NSQK3TcS1qN8D2MDxCvcwTpS_kvzGy3I2LepJDUceoQ,7699
|
16
|
-
natural_pdf/collections/pdf_collection.py,sha256=
|
16
|
+
natural_pdf/collections/pdf_collection.py,sha256=LLtixKaKRzPRfZNdDQQ7HY3wyWbBcefPYvf_4Ke-FLw,12123
|
17
17
|
natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
|
18
18
|
natural_pdf/core/element_manager.py,sha256=H1896JSt48ASLSmG22xEXMY-xSKcpYsUlYmYMD48i6Q,17117
|
19
19
|
natural_pdf/core/highlighting_service.py,sha256=a-40UMohOglYrw4klW1GuQ_p3jZOxnAfPOXPORThr4U,31476
|
20
|
-
natural_pdf/core/page.py,sha256=
|
21
|
-
natural_pdf/core/pdf.py,sha256=
|
20
|
+
natural_pdf/core/page.py,sha256=7LSqJbGHhpKQliAdcy7aRQzkr8sO9jUP68bzy7uH54U,69305
|
21
|
+
natural_pdf/core/pdf.py,sha256=ALCO7YB_oaMtGZpS6JHJglrIIDbUd63sSso0oNAAP9k,41140
|
22
22
|
natural_pdf/elements/__init__.py,sha256=6FGHZm2oONd8zErahMEawuB4AvJR5jOZPt4KtEwbj80,40
|
23
23
|
natural_pdf/elements/base.py,sha256=9SQ-O2qbQe9Avbf9JI-p6vWlyThZVch-p1yqXWSrBHw,35750
|
24
|
-
natural_pdf/elements/collections.py,sha256=
|
24
|
+
natural_pdf/elements/collections.py,sha256=G6H-6VtCWq_KW-A0y9XhyHLOIWxz-1vHByfC6dq8lmU,62387
|
25
25
|
natural_pdf/elements/line.py,sha256=QvVdhf_K6rwJkq3q67JmgdZpDhrBgWuSMF-Q25malP4,4783
|
26
26
|
natural_pdf/elements/rect.py,sha256=dls9g-R213O78HvfAJMak3_eV14Zh654Zw7hqTTXxDQ,3949
|
27
27
|
natural_pdf/elements/region.py,sha256=5dXHYbbdO1QNgkD6b6I34ezHt-SHKx_aH1ubzbfMHQs,74370
|
@@ -54,8 +54,8 @@ natural_pdf/utils/visualization.py,sha256=14BM-K4ovDqHniNbxbP_y9KaEYNlkbpELGAv9_
|
|
54
54
|
natural_pdf/widgets/__init__.py,sha256=qckw3DjdVTsASPLJ8uUrGKg3MFhvzHndUpeNGlqwg6A,215
|
55
55
|
natural_pdf/widgets/viewer.py,sha256=h_amj_uvf-vRqEsFg4P00fgKxawLAd9jjC1ohUza4BY,37479
|
56
56
|
natural_pdf/widgets/frontend/viewer.js,sha256=w8ywfz_IOAAv2nP_qaf2VBUkF1KhjT3zorhJxM1-CfU,4371
|
57
|
-
natural_pdf-0.1.
|
58
|
-
natural_pdf-0.1.
|
59
|
-
natural_pdf-0.1.
|
60
|
-
natural_pdf-0.1.
|
61
|
-
natural_pdf-0.1.
|
57
|
+
natural_pdf-0.1.4.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
58
|
+
natural_pdf-0.1.4.dist-info/METADATA,sha256=Qbj7uNu_w5OfHexqGGFEi1VQCELaidq670nHDArAtqE,4967
|
59
|
+
natural_pdf-0.1.4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
60
|
+
natural_pdf-0.1.4.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
|
61
|
+
natural_pdf-0.1.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|