natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/ocr/index.md +34 -47
- docs/tutorials/01-loading-and-extraction.ipynb +60 -46
- docs/tutorials/02-finding-elements.ipynb +42 -42
- docs/tutorials/03-extracting-blocks.ipynb +17 -17
- docs/tutorials/04-table-extraction.ipynb +12 -12
- docs/tutorials/05-excluding-content.ipynb +30 -30
- docs/tutorials/06-document-qa.ipynb +28 -28
- docs/tutorials/07-layout-analysis.ipynb +63 -35
- docs/tutorials/07-working-with-regions.ipynb +55 -51
- docs/tutorials/07-working-with-regions.md +2 -2
- docs/tutorials/08-spatial-navigation.ipynb +60 -60
- docs/tutorials/09-section-extraction.ipynb +113 -113
- docs/tutorials/10-form-field-extraction.ipynb +78 -50
- docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
- docs/tutorials/12-ocr-integration.ipynb +149 -131
- docs/tutorials/12-ocr-integration.md +0 -13
- docs/tutorials/13-semantic-search.ipynb +313 -873
- natural_pdf/__init__.py +21 -23
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_manager.py +28 -1
- natural_pdf/analyzers/layout/layout_options.py +11 -0
- natural_pdf/analyzers/layout/yolo.py +6 -2
- natural_pdf/collections/pdf_collection.py +21 -0
- natural_pdf/core/element_manager.py +16 -13
- natural_pdf/core/page.py +165 -36
- natural_pdf/core/pdf.py +146 -41
- natural_pdf/elements/base.py +11 -17
- natural_pdf/elements/collections.py +100 -38
- natural_pdf/elements/region.py +77 -38
- natural_pdf/elements/text.py +5 -0
- natural_pdf/ocr/__init__.py +49 -36
- natural_pdf/ocr/engine.py +146 -51
- natural_pdf/ocr/engine_easyocr.py +141 -161
- natural_pdf/ocr/engine_paddle.py +107 -193
- natural_pdf/ocr/engine_surya.py +75 -148
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +65 -93
- natural_pdf/ocr/ocr_options.py +7 -17
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
- natural_pdf/templates/ocr_debug.html +0 -517
- tests/test_loading.py +0 -50
- tests/test_optional_deps.py +0 -298
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
tests/test_optional_deps.py
DELETED
@@ -1,298 +0,0 @@
|
|
1
|
-
import importlib # Use importlib for checking
|
2
|
-
import sys
|
3
|
-
|
4
|
-
import pytest
|
5
|
-
|
6
|
-
from natural_pdf import PDF, PDFCollection # Import PDFCollection
|
7
|
-
from natural_pdf.core.page import Page
|
8
|
-
|
9
|
-
# --- Fixtures --- #
|
10
|
-
|
11
|
-
# Define PDF paths relative to the project root (where pytest is usually run)
|
12
|
-
TEST_PDF_URL = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf"
|
13
|
-
NEEDS_OCR_PDF_PATH = "pdfs/needs-ocr.pdf"
|
14
|
-
STANDARD_PDF_PATH = "pdfs/01-practice.pdf"
|
15
|
-
|
16
|
-
|
17
|
-
@pytest.fixture(scope="module")
|
18
|
-
def standard_pdf_page():
|
19
|
-
"""Fixture to load the first page of the standard test PDF."""
|
20
|
-
try:
|
21
|
-
# Use the local path if available, otherwise fallback to URL?
|
22
|
-
# For consistency in tests, let's stick to the local path for now.
|
23
|
-
# Assume the pdfs directory is in the root alongside tests/
|
24
|
-
pdf = PDF(STANDARD_PDF_PATH)
|
25
|
-
if not pdf.pages:
|
26
|
-
pytest.fail(f"Standard PDF has no pages: {STANDARD_PDF_PATH}")
|
27
|
-
return pdf.pages[0]
|
28
|
-
except Exception as e:
|
29
|
-
pytest.fail(f"Failed to load standard PDF ({STANDARD_PDF_PATH}) for module tests: {e}")
|
30
|
-
|
31
|
-
|
32
|
-
@pytest.fixture(scope="module")
|
33
|
-
def needs_ocr_pdf_page():
|
34
|
-
"""Fixture to load the first page of the OCR test PDF."""
|
35
|
-
try:
|
36
|
-
pdf = PDF(NEEDS_OCR_PDF_PATH)
|
37
|
-
if not pdf.pages:
|
38
|
-
pytest.fail(f"OCR PDF has no pages: {NEEDS_OCR_PDF_PATH}")
|
39
|
-
return pdf.pages[0]
|
40
|
-
except Exception as e:
|
41
|
-
pytest.fail(f"Failed to load OCR PDF ({NEEDS_OCR_PDF_PATH}) for module tests: {e}")
|
42
|
-
|
43
|
-
|
44
|
-
@pytest.fixture(scope="module")
|
45
|
-
def standard_pdf_collection():
|
46
|
-
"""Fixture to create a PDFCollection with the standard test PDF."""
|
47
|
-
try:
|
48
|
-
# Use a list containing the path
|
49
|
-
collection = PDFCollection([STANDARD_PDF_PATH])
|
50
|
-
assert len(collection.pdfs) == 1
|
51
|
-
return collection
|
52
|
-
except Exception as e:
|
53
|
-
pytest.fail(f"Failed to create PDFCollection ({STANDARD_PDF_PATH}) for module tests: {e}")
|
54
|
-
|
55
|
-
|
56
|
-
# --- Helper --- #
|
57
|
-
|
58
|
-
|
59
|
-
def is_extra_installed(extra_name):
|
60
|
-
"""Checks if packages associated with an extra appear importable."""
|
61
|
-
extra_packages = {
|
62
|
-
"interactive": ["ipywidgets"],
|
63
|
-
"easyocr": ["easyocr"],
|
64
|
-
"paddle": ["paddleocr"],
|
65
|
-
"surya": ["surya"],
|
66
|
-
"layout_yolo": ["doclayout_yolo"], # Correct package name for this extra
|
67
|
-
"haystack": ["haystack"],
|
68
|
-
"qa": ["transformers"],
|
69
|
-
}
|
70
|
-
if extra_name not in extra_packages:
|
71
|
-
return False
|
72
|
-
|
73
|
-
packages_to_check = extra_packages[extra_name]
|
74
|
-
try:
|
75
|
-
for pkg_name in packages_to_check:
|
76
|
-
importlib.import_module(pkg_name)
|
77
|
-
return True
|
78
|
-
except ImportError:
|
79
|
-
return False
|
80
|
-
|
81
|
-
|
82
|
-
# --- Interactive Viewer (ipywidgets) Tests --- (Existing)
|
83
|
-
|
84
|
-
|
85
|
-
def test_ipywidgets_availability_flag():
|
86
|
-
"""Tests the internal _IPYWIDGETS_AVAILABLE flag based on environment."""
|
87
|
-
try:
|
88
|
-
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE
|
89
|
-
|
90
|
-
flag_value = _IPYWIDGETS_AVAILABLE
|
91
|
-
except ImportError:
|
92
|
-
pytest.fail("Could not import or find _IPYWIDGETS_AVAILABLE in natural_pdf.widgets.viewer")
|
93
|
-
should_be_installed = is_extra_installed("interactive")
|
94
|
-
assert (
|
95
|
-
flag_value == should_be_installed
|
96
|
-
), f"_IPYWIDGETS_AVAILABLE flag mismatch. Expected {should_be_installed}, got {flag_value}."
|
97
|
-
|
98
|
-
|
99
|
-
def test_page_viewer_widget_creation_when_installed(standard_pdf_page):
|
100
|
-
"""Tests that Page.viewer() returns a widget when ipywidgets is installed."""
|
101
|
-
pytest.importorskip("ipywidgets")
|
102
|
-
from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
|
103
|
-
|
104
|
-
viewer_instance = standard_pdf_page.viewer()
|
105
|
-
assert (
|
106
|
-
viewer_instance is not None
|
107
|
-
), "Page.viewer() should return an object when ipywidgets is installed."
|
108
|
-
assert isinstance(
|
109
|
-
viewer_instance, SimpleInteractiveViewerWidget
|
110
|
-
), f"Page.viewer() returned type {type(viewer_instance)}, expected SimpleInteractiveViewerWidget."
|
111
|
-
|
112
|
-
|
113
|
-
def test_page_viewer_widget_creation_when_not_installed(standard_pdf_page):
|
114
|
-
"""Tests that Page.viewer() returns None when ipywidgets is missing."""
|
115
|
-
if is_extra_installed("interactive"):
|
116
|
-
pytest.skip("Skipping test: ipywidgets IS installed in this environment.")
|
117
|
-
viewer_instance = standard_pdf_page.viewer()
|
118
|
-
assert (
|
119
|
-
viewer_instance is None
|
120
|
-
), "Page.viewer() should return None when ipywidgets is not installed."
|
121
|
-
|
122
|
-
|
123
|
-
# --- EasyOCR Tests --- #
|
124
|
-
|
125
|
-
|
126
|
-
def test_ocr_easyocr_works_when_installed(needs_ocr_pdf_page):
|
127
|
-
"""Test running EasyOCR when installed."""
|
128
|
-
pytest.importorskip("easyocr")
|
129
|
-
try:
|
130
|
-
# Use extract_ocr_elements which doesn't modify the page state
|
131
|
-
ocr_elements = needs_ocr_pdf_page.extract_ocr_elements(engine="easyocr")
|
132
|
-
assert isinstance(ocr_elements, list)
|
133
|
-
assert len(ocr_elements) > 0, "EasyOCR should find text elements on the OCR PDF."
|
134
|
-
# Check if the first element looks like a TextElement (basic check)
|
135
|
-
assert hasattr(ocr_elements[0], "text"), "OCR result should have text attribute."
|
136
|
-
assert hasattr(ocr_elements[0], "bbox"), "OCR result should have bbox attribute."
|
137
|
-
except Exception as e:
|
138
|
-
pytest.fail(f"EasyOCR extraction failed when installed: {e}")
|
139
|
-
|
140
|
-
|
141
|
-
def test_ocr_easyocr_fails_gracefully_when_not_installed(needs_ocr_pdf_page):
|
142
|
-
"""Test calling EasyOCR when not installed."""
|
143
|
-
if is_extra_installed("easyocr"):
|
144
|
-
pytest.skip("Skipping test: EasyOCR IS installed.")
|
145
|
-
# Check how OCRManager handles unavailable engines - assuming it returns empty list
|
146
|
-
ocr_elements = needs_ocr_pdf_page.extract_ocr_elements(engine="easyocr")
|
147
|
-
assert (
|
148
|
-
ocr_elements == []
|
149
|
-
), "extract_ocr_elements should return empty list for unavailable engine."
|
150
|
-
|
151
|
-
|
152
|
-
# --- PaddleOCR Tests --- #
|
153
|
-
|
154
|
-
|
155
|
-
def test_ocr_paddle_works_when_installed(needs_ocr_pdf_page):
|
156
|
-
"""Test running PaddleOCR when installed."""
|
157
|
-
pytest.importorskip("paddleocr")
|
158
|
-
if sys.platform == "darwin":
|
159
|
-
pytest.skip("PaddleOCR tests skipped on macOS")
|
160
|
-
try:
|
161
|
-
ocr_elements = needs_ocr_pdf_page.extract_ocr_elements(engine="paddle")
|
162
|
-
assert isinstance(ocr_elements, list)
|
163
|
-
assert len(ocr_elements) > 0, "PaddleOCR should find text elements on the OCR PDF."
|
164
|
-
assert hasattr(ocr_elements[0], "text")
|
165
|
-
assert hasattr(ocr_elements[0], "bbox")
|
166
|
-
except Exception as e:
|
167
|
-
pytest.fail(f"PaddleOCR extraction failed when installed: {e}")
|
168
|
-
|
169
|
-
|
170
|
-
def test_ocr_paddle_fails_gracefully_when_not_installed(needs_ocr_pdf_page):
|
171
|
-
"""Test calling PaddleOCR when not installed."""
|
172
|
-
if is_extra_installed("paddle"):
|
173
|
-
pytest.skip("Skipping test: PaddleOCR IS installed.")
|
174
|
-
if sys.platform == "darwin": # Also skip if check fails but platform is darwin
|
175
|
-
pytest.skip("PaddleOCR tests skipped on macOS")
|
176
|
-
# Check how OCRManager handles unavailable engines - assume it returns empty list (KEEPING THIS)
|
177
|
-
# It might be reasonable for OCR manager to return [] if engine isn't there,
|
178
|
-
# vs layout which is explicitly requested.
|
179
|
-
# Alternatively, OCRManager could also be changed to raise errors.
|
180
|
-
ocr_elements = needs_ocr_pdf_page.extract_ocr_elements(engine="paddle")
|
181
|
-
assert (
|
182
|
-
ocr_elements == []
|
183
|
-
), "extract_ocr_elements should return empty list for unavailable engine."
|
184
|
-
|
185
|
-
|
186
|
-
# --- Surya Tests --- #
|
187
|
-
|
188
|
-
|
189
|
-
def test_layout_surya_works_when_installed(standard_pdf_page): # Use standard PDF for layout
|
190
|
-
"""Test running Surya layout analysis when installed."""
|
191
|
-
pytest.importorskip("surya")
|
192
|
-
if sys.version_info < (3, 10):
|
193
|
-
pytest.skip("Surya tests skipped on Python < 3.10")
|
194
|
-
try:
|
195
|
-
layout_regions = standard_pdf_page.analyze_layout(engine="surya")
|
196
|
-
from natural_pdf.elements.collections import ElementCollection # Import needed for check
|
197
|
-
|
198
|
-
assert isinstance(
|
199
|
-
layout_regions, ElementCollection
|
200
|
-
), "analyze_layout should return an ElementCollection"
|
201
|
-
# Layout might return empty list, check type
|
202
|
-
# assert len(layout_regions) > 0, "Surya should find layout regions." # Keep commented unless specific PDF guarantees regions
|
203
|
-
except Exception as e:
|
204
|
-
pytest.fail(f"Surya layout analysis failed when installed: {e}")
|
205
|
-
|
206
|
-
|
207
|
-
def test_layout_surya_fails_gracefully_when_not_installed(standard_pdf_page):
|
208
|
-
"""Test calling Surya layout analysis when not installed raises error."""
|
209
|
-
if is_extra_installed("surya"):
|
210
|
-
pytest.skip("Skipping test: Surya IS installed.")
|
211
|
-
if sys.version_info < (3, 10): # Also skip if check fails but Python is < 3.10
|
212
|
-
pytest.skip("Surya tests skipped on Python < 3.10")
|
213
|
-
# Expect RuntimeError because engine is known but unavailable
|
214
|
-
with pytest.raises(RuntimeError, match="not available"):
|
215
|
-
_ = standard_pdf_page.analyze_layout(engine="surya")
|
216
|
-
|
217
|
-
|
218
|
-
# --- Layout YOLO Tests --- #
|
219
|
-
|
220
|
-
|
221
|
-
def test_layout_yolo_works_when_installed(standard_pdf_page):
|
222
|
-
"""Test running YOLO layout analysis when installed."""
|
223
|
-
# Check for the *actual* package associated with the extra
|
224
|
-
pytest.importorskip("doclayout_yolo")
|
225
|
-
try:
|
226
|
-
layout_regions = standard_pdf_page.analyze_layout(engine="yolo")
|
227
|
-
from natural_pdf.elements.collections import ElementCollection
|
228
|
-
|
229
|
-
assert isinstance(layout_regions, ElementCollection)
|
230
|
-
except Exception as e:
|
231
|
-
pytest.fail(f"YOLO layout analysis failed when installed: {e}")
|
232
|
-
|
233
|
-
|
234
|
-
def test_layout_yolo_fails_gracefully_when_not_installed(standard_pdf_page):
|
235
|
-
"""Test calling YOLO layout analysis when not installed raises error."""
|
236
|
-
if is_extra_installed("layout_yolo"):
|
237
|
-
pytest.skip("Skipping test: Layout YOLO IS installed.")
|
238
|
-
# Expect RuntimeError because engine is known but unavailable
|
239
|
-
with pytest.raises(RuntimeError, match="not available"):
|
240
|
-
_ = standard_pdf_page.analyze_layout(engine="yolo")
|
241
|
-
|
242
|
-
|
243
|
-
# --- Haystack Tests --- #
|
244
|
-
|
245
|
-
|
246
|
-
def test_search_haystack_works_when_installed(standard_pdf_collection):
|
247
|
-
"""Test basic Haystack initialization via PDFCollection when installed."""
|
248
|
-
pytest.importorskip("haystack")
|
249
|
-
pytest.importorskip("chromadb")
|
250
|
-
try:
|
251
|
-
# Initialize search on the collection
|
252
|
-
# This should setup the SearchService internally
|
253
|
-
standard_pdf_collection.init_search(
|
254
|
-
index=False
|
255
|
-
) # index=False to avoid actual indexing time
|
256
|
-
# Basic check: did init_search run without error?
|
257
|
-
# Optionally check if _search_service is now initialized on the collection
|
258
|
-
assert hasattr(standard_pdf_collection, "_search_service")
|
259
|
-
assert standard_pdf_collection._search_service is not None
|
260
|
-
# TODO: Add actual indexing and searching tests later
|
261
|
-
except ImportError as ie:
|
262
|
-
pytest.fail(f"Import failed even though haystack/chromadb seem installed: {ie}")
|
263
|
-
except Exception as e:
|
264
|
-
pytest.fail(f"Haystack integration failed when installed: {e}")
|
265
|
-
|
266
|
-
|
267
|
-
def test_search_haystack_fails_gracefully_when_not_installed(standard_pdf_collection):
|
268
|
-
"""Test initializing Haystack features via PDFCollection when not installed."""
|
269
|
-
if is_extra_installed("haystack"):
|
270
|
-
pytest.skip("Skipping test: Haystack IS installed.")
|
271
|
-
|
272
|
-
# Expect an ImportError when trying to initialize search
|
273
|
-
with pytest.raises(
|
274
|
-
ImportError,
|
275
|
-
match="Search Service could not be created. Ensure Haystack extras are installed",
|
276
|
-
):
|
277
|
-
standard_pdf_collection.init_search(index=False)
|
278
|
-
|
279
|
-
|
280
|
-
# --- QA Tests --- #
|
281
|
-
|
282
|
-
|
283
|
-
def test_qa_works_when_installed(standard_pdf_page):
|
284
|
-
"""Test basic QA functionality (requires transformers core dep)."""
|
285
|
-
# No importorskip needed as transformers is core
|
286
|
-
try:
|
287
|
-
# Simple question
|
288
|
-
result = standard_pdf_page.ask("What is this document about?")
|
289
|
-
assert isinstance(result, dict)
|
290
|
-
assert "answer" in result
|
291
|
-
assert "confidence" in result
|
292
|
-
# We don't know the answer, but it should run
|
293
|
-
except Exception as e:
|
294
|
-
pytest.fail(f"QA execution failed: {e}")
|
295
|
-
|
296
|
-
|
297
|
-
# No 'fails gracefully' test needed for QA as its core dep (transformers) is always installed.
|
298
|
-
# We might need tests for *specific models* if they require separate downloads/setup.
|
File without changes
|