natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. docs/finetuning/index.md +176 -0
  2. docs/ocr/index.md +34 -47
  3. docs/tutorials/01-loading-and-extraction.ipynb +34 -1536
  4. docs/tutorials/02-finding-elements.ipynb +42 -42
  5. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  6. docs/tutorials/04-table-extraction.ipynb +12 -12
  7. docs/tutorials/05-excluding-content.ipynb +30 -30
  8. docs/tutorials/06-document-qa.ipynb +28 -28
  9. docs/tutorials/07-layout-analysis.ipynb +63 -35
  10. docs/tutorials/07-working-with-regions.ipynb +55 -51
  11. docs/tutorials/07-working-with-regions.md +2 -2
  12. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  13. docs/tutorials/09-section-extraction.ipynb +113 -113
  14. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  15. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  16. docs/tutorials/12-ocr-integration.ipynb +149 -131
  17. docs/tutorials/12-ocr-integration.md +0 -13
  18. docs/tutorials/13-semantic-search.ipynb +313 -873
  19. natural_pdf/__init__.py +21 -22
  20. natural_pdf/analyzers/layout/gemini.py +280 -0
  21. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  22. natural_pdf/analyzers/layout/layout_options.py +11 -0
  23. natural_pdf/analyzers/layout/yolo.py +6 -2
  24. natural_pdf/collections/pdf_collection.py +24 -0
  25. natural_pdf/core/element_manager.py +18 -13
  26. natural_pdf/core/page.py +174 -36
  27. natural_pdf/core/pdf.py +156 -42
  28. natural_pdf/elements/base.py +9 -17
  29. natural_pdf/elements/collections.py +99 -38
  30. natural_pdf/elements/region.py +77 -37
  31. natural_pdf/elements/text.py +5 -0
  32. natural_pdf/exporters/__init__.py +4 -0
  33. natural_pdf/exporters/base.py +61 -0
  34. natural_pdf/exporters/paddleocr.py +345 -0
  35. natural_pdf/ocr/__init__.py +57 -36
  36. natural_pdf/ocr/engine.py +160 -49
  37. natural_pdf/ocr/engine_easyocr.py +178 -157
  38. natural_pdf/ocr/engine_paddle.py +114 -189
  39. natural_pdf/ocr/engine_surya.py +87 -144
  40. natural_pdf/ocr/ocr_factory.py +125 -0
  41. natural_pdf/ocr/ocr_manager.py +65 -89
  42. natural_pdf/ocr/ocr_options.py +8 -13
  43. natural_pdf/ocr/utils.py +113 -0
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  45. natural_pdf/templates/spa/css/style.css +334 -0
  46. natural_pdf/templates/spa/index.html +31 -0
  47. natural_pdf/templates/spa/js/app.js +472 -0
  48. natural_pdf/templates/spa/words.txt +235976 -0
  49. natural_pdf/utils/debug.py +34 -0
  50. natural_pdf/utils/identifiers.py +33 -0
  51. natural_pdf/utils/packaging.py +485 -0
  52. natural_pdf/utils/text_extraction.py +44 -64
  53. natural_pdf/utils/visualization.py +1 -1
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +44 -20
  55. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +58 -47
  56. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +1 -1
  57. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -1
  58. natural_pdf/templates/ocr_debug.html +0 -517
  59. tests/test_loading.py +0 -50
  60. tests/test_optional_deps.py +0 -298
  61. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
@@ -1,298 +0,0 @@
1
- import importlib # Use importlib for checking
2
- import sys
3
-
4
- import pytest
5
-
6
- from natural_pdf import PDF, PDFCollection # Import PDFCollection
7
- from natural_pdf.core.page import Page
8
-
9
- # --- Fixtures --- #
10
-
11
- # Define PDF paths relative to the project root (where pytest is usually run)
12
- TEST_PDF_URL = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf"
13
- NEEDS_OCR_PDF_PATH = "pdfs/needs-ocr.pdf"
14
- STANDARD_PDF_PATH = "pdfs/01-practice.pdf"
15
-
16
-
17
@pytest.fixture(scope="module")
def standard_pdf_page():
    """Module-scoped fixture yielding the first page of the standard test PDF.

    The document is opened from the local ``STANDARD_PDF_PATH`` (the original
    author chose the local file over the URL for consistency between runs).
    A document without pages, or any exception raised while loading it, is
    reported through ``pytest.fail``.
    """
    try:
        # The pdfs/ directory is assumed to sit in the project root next to tests/.
        document = PDF(STANDARD_PDF_PATH)
        if document.pages:
            return document.pages[0]
        pytest.fail(f"Standard PDF has no pages: {STANDARD_PDF_PATH}")
    except Exception as load_error:
        pytest.fail(
            f"Failed to load standard PDF ({STANDARD_PDF_PATH}) for module tests: {load_error}"
        )
30
-
31
-
32
@pytest.fixture(scope="module")
def needs_ocr_pdf_page():
    """Module-scoped fixture yielding the first page of the OCR test PDF.

    The document is opened from ``NEEDS_OCR_PDF_PATH``. A document without
    pages, or any exception raised while loading it, is reported through
    ``pytest.fail``.
    """
    try:
        document = PDF(NEEDS_OCR_PDF_PATH)
        if document.pages:
            return document.pages[0]
        pytest.fail(f"OCR PDF has no pages: {NEEDS_OCR_PDF_PATH}")
    except Exception as load_error:
        pytest.fail(
            f"Failed to load OCR PDF ({NEEDS_OCR_PDF_PATH}) for module tests: {load_error}"
        )
42
-
43
-
44
@pytest.fixture(scope="module")
def standard_pdf_collection():
    """Module-scoped fixture yielding a ``PDFCollection`` over the standard test PDF.

    The collection is built from a one-element list holding
    ``STANDARD_PDF_PATH`` and is checked to contain exactly one PDF. Any
    exception raised along the way (including that check failing) is reported
    through ``pytest.fail``.
    """
    try:
        source_paths = [STANDARD_PDF_PATH]
        pdf_collection = PDFCollection(source_paths)
        assert len(pdf_collection.pdfs) == 1
        return pdf_collection
    except Exception as build_error:
        pytest.fail(
            f"Failed to create PDFCollection ({STANDARD_PDF_PATH}) for module tests: {build_error}"
        )
54
-
55
-
56
- # --- Helper --- #
57
-
58
-
59
def is_extra_installed(extra_name):
    """Report whether every package backing an optional extra can be imported.

    Args:
        extra_name: Name of the extra to probe, e.g. ``"easyocr"`` or
            ``"layout_yolo"``.

    Returns:
        ``True`` when all packages mapped to ``extra_name`` import
        successfully; ``False`` when the extra is unknown or any of its
        packages fails to import for any reason.
    """
    extra_packages = {
        "interactive": ["ipywidgets"],
        "easyocr": ["easyocr"],
        "paddle": ["paddleocr"],
        "surya": ["surya"],
        "layout_yolo": ["doclayout_yolo"],  # Correct package name for this extra
        "haystack": ["haystack"],
        "qa": ["transformers"],
    }
    packages_to_check = extra_packages.get(extra_name)
    if packages_to_check is None:
        return False

    try:
        for pkg_name in packages_to_check:
            importlib.import_module(pkg_name)
    except Exception:
        # A package that is present but broken (for instance a missing native
        # library surfacing as OSError rather than ImportError) is just as
        # unusable as an absent one, so the probe answers False instead of
        # crashing the calling test.
        return False
    return True
80
-
81
-
82
- # --- Interactive Viewer (ipywidgets) Tests --- (Existing)
83
-
84
-
85
def test_ipywidgets_availability_flag():
    """Verify ``_IPYWIDGETS_AVAILABLE`` agrees with what the environment provides.

    The flag is imported from ``natural_pdf.widgets.viewer`` and compared with
    ``is_extra_installed("interactive")``. Failing to import the flag fails
    the test outright.
    """
    try:
        from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE as reported_flag
    except ImportError:
        pytest.fail("Could not import or find _IPYWIDGETS_AVAILABLE in natural_pdf.widgets.viewer")

    expected_flag = is_extra_installed("interactive")
    mismatch_message = (
        f"_IPYWIDGETS_AVAILABLE flag mismatch. Expected {expected_flag}, got {reported_flag}."
    )
    assert reported_flag == expected_flag, mismatch_message
97
-
98
-
99
def test_page_viewer_widget_creation_when_installed(standard_pdf_page):
    """Check that ``Page.viewer()`` yields a ``SimpleInteractiveViewerWidget``.

    Skipped by ``pytest.importorskip`` when ``ipywidgets`` cannot be imported.
    """
    pytest.importorskip("ipywidgets")
    from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget

    widget = standard_pdf_page.viewer()
    assert (
        widget is not None
    ), "Page.viewer() should return an object when ipywidgets is installed."

    is_expected_widget = isinstance(widget, SimpleInteractiveViewerWidget)
    assert (
        is_expected_widget
    ), f"Page.viewer() returned type {type(widget)}, expected SimpleInteractiveViewerWidget."
111
-
112
-
113
def test_page_viewer_widget_creation_when_not_installed(standard_pdf_page):
    """Check that ``Page.viewer()`` yields ``None`` without ipywidgets.

    Skipped when ``is_extra_installed("interactive")`` reports the package as
    importable.
    """
    ipywidgets_present = is_extra_installed("interactive")
    if ipywidgets_present:
        pytest.skip("Skipping test: ipywidgets IS installed in this environment.")

    widget = standard_pdf_page.viewer()
    assert (
        widget is None
    ), "Page.viewer() should return None when ipywidgets is not installed."
121
-
122
-
123
- # --- EasyOCR Tests --- #
124
-
125
-
126
def test_ocr_easyocr_works_when_installed(needs_ocr_pdf_page):
    """Run EasyOCR on the OCR test page and sanity-check the result.

    Skipped when ``easyocr`` cannot be imported. The result must be a
    non-empty list whose first item exposes ``text`` and ``bbox``; any
    exception is reported through ``pytest.fail``.
    """
    pytest.importorskip("easyocr")
    try:
        # extract_ocr_elements is used because (per the original author's
        # note) it does not modify the page state.
        detected = needs_ocr_pdf_page.extract_ocr_elements(engine="easyocr")
        assert isinstance(detected, list)
        assert len(detected) > 0, "EasyOCR should find text elements on the OCR PDF."
        # Basic duck-type check that the first hit looks like a text element.
        first_hit = detected[0]
        assert hasattr(first_hit, "text"), "OCR result should have text attribute."
        assert hasattr(first_hit, "bbox"), "OCR result should have bbox attribute."
    except Exception as ocr_error:
        pytest.fail(f"EasyOCR extraction failed when installed: {ocr_error}")
139
-
140
-
141
def test_ocr_easyocr_fails_gracefully_when_not_installed(needs_ocr_pdf_page):
    """Request EasyOCR without it installed and expect an empty result.

    Skipped when ``is_extra_installed("easyocr")`` reports the package as
    importable.
    """
    easyocr_present = is_extra_installed("easyocr")
    if easyocr_present:
        pytest.skip("Skipping test: EasyOCR IS installed.")

    # The test assumes an unavailable engine produces an empty list rather
    # than an error.
    detected = needs_ocr_pdf_page.extract_ocr_elements(engine="easyocr")
    assert (
        detected == []
    ), "extract_ocr_elements should return empty list for unavailable engine."
150
-
151
-
152
- # --- PaddleOCR Tests --- #
153
-
154
-
155
def test_ocr_paddle_works_when_installed(needs_ocr_pdf_page):
    """Run PaddleOCR on the OCR test page and sanity-check the result.

    Skipped when ``paddleocr`` cannot be imported, and on macOS. The result
    must be a non-empty list whose first item exposes ``text`` and ``bbox``;
    any exception is reported through ``pytest.fail``.
    """
    pytest.importorskip("paddleocr")
    running_on_macos = sys.platform == "darwin"
    if running_on_macos:
        pytest.skip("PaddleOCR tests skipped on macOS")
    try:
        detected = needs_ocr_pdf_page.extract_ocr_elements(engine="paddle")
        assert isinstance(detected, list)
        assert len(detected) > 0, "PaddleOCR should find text elements on the OCR PDF."
        first_hit = detected[0]
        assert hasattr(first_hit, "text")
        assert hasattr(first_hit, "bbox")
    except Exception as ocr_error:
        pytest.fail(f"PaddleOCR extraction failed when installed: {ocr_error}")
168
-
169
-
170
def test_ocr_paddle_fails_gracefully_when_not_installed(needs_ocr_pdf_page):
    """Request PaddleOCR without it installed and expect an empty result.

    Skipped when ``is_extra_installed("paddle")`` reports the package as
    importable, and on macOS.
    """
    paddle_present = is_extra_installed("paddle")
    if paddle_present:
        pytest.skip("Skipping test: PaddleOCR IS installed.")
    # Skip on macOS as well, even when the install check above came back negative.
    running_on_macos = sys.platform == "darwin"
    if running_on_macos:
        pytest.skip("PaddleOCR tests skipped on macOS")

    # The test assumes an unavailable OCR engine produces an empty list. The
    # original author notes this may reasonably differ from layout analysis,
    # where the engine is explicitly requested, and that OCRManager could
    # alternatively be changed to raise errors.
    detected = needs_ocr_pdf_page.extract_ocr_elements(engine="paddle")
    assert (
        detected == []
    ), "extract_ocr_elements should return empty list for unavailable engine."
184
-
185
-
186
- # --- Surya Tests --- #
187
-
188
-
189
def test_layout_surya_works_when_installed(standard_pdf_page):  # Standard PDF is used for layout
    """Run Surya layout analysis on the standard page and check the return type.

    Skipped when ``surya`` cannot be imported, and on Python older than 3.10.
    Any exception is reported through ``pytest.fail``.
    """
    pytest.importorskip("surya")
    minimum_python = (3, 10)
    if sys.version_info < minimum_python:
        pytest.skip("Surya tests skipped on Python < 3.10")
    try:
        detected_regions = standard_pdf_page.analyze_layout(engine="surya")
        from natural_pdf.elements.collections import ElementCollection  # needed for the type check

        # Only the type is verified: the layout may legitimately be empty, so
        # no minimum region count is asserted unless a specific PDF
        # guarantees regions.
        returned_collection = isinstance(detected_regions, ElementCollection)
        assert returned_collection, "analyze_layout should return an ElementCollection"
    except Exception as layout_error:
        pytest.fail(f"Surya layout analysis failed when installed: {layout_error}")
205
-
206
-
207
def test_layout_surya_fails_gracefully_when_not_installed(standard_pdf_page):
    """Request Surya layout analysis without it installed and expect ``RuntimeError``.

    Skipped when ``is_extra_installed("surya")`` reports the package as
    importable, and on Python older than 3.10.
    """
    surya_present = is_extra_installed("surya")
    if surya_present:
        pytest.skip("Skipping test: Surya IS installed.")
    # Skip on old Python as well, even when the install check above came back negative.
    minimum_python = (3, 10)
    if sys.version_info < minimum_python:
        pytest.skip("Surya tests skipped on Python < 3.10")

    # The engine is known but unavailable, so a RuntimeError is expected.
    with pytest.raises(RuntimeError, match="not available"):
        standard_pdf_page.analyze_layout(engine="surya")
216
-
217
-
218
- # --- Layout YOLO Tests --- #
219
-
220
-
221
def test_layout_yolo_works_when_installed(standard_pdf_page):
    """Run YOLO layout analysis on the standard page and check the return type.

    Skipped when ``doclayout_yolo`` (the package behind the extra) cannot be
    imported. Any exception is reported through ``pytest.fail``.
    """
    # The probe targets the actual package associated with the extra.
    pytest.importorskip("doclayout_yolo")
    try:
        detected_regions = standard_pdf_page.analyze_layout(engine="yolo")
        from natural_pdf.elements.collections import ElementCollection

        returned_collection = isinstance(detected_regions, ElementCollection)
        assert returned_collection
    except Exception as layout_error:
        pytest.fail(f"YOLO layout analysis failed when installed: {layout_error}")
232
-
233
-
234
def test_layout_yolo_fails_gracefully_when_not_installed(standard_pdf_page):
    """Request YOLO layout analysis without it installed and expect ``RuntimeError``.

    Skipped when ``is_extra_installed("layout_yolo")`` reports the package as
    importable.
    """
    yolo_present = is_extra_installed("layout_yolo")
    if yolo_present:
        pytest.skip("Skipping test: Layout YOLO IS installed.")

    # The engine is known but unavailable, so a RuntimeError is expected.
    with pytest.raises(RuntimeError, match="not available"):
        standard_pdf_page.analyze_layout(engine="yolo")
241
-
242
-
243
- # --- Haystack Tests --- #
244
-
245
-
246
def test_search_haystack_works_when_installed(standard_pdf_collection):
    """Initialise search on the collection and check a search service appears.

    Skipped when either ``haystack`` or ``chromadb`` cannot be imported.
    An ``ImportError`` and any other exception are reported through
    ``pytest.fail`` with distinct messages.
    """
    for required_module in ("haystack", "chromadb"):
        pytest.importorskip(required_module)
    try:
        # index=False avoids spending time on actual indexing; the call is
        # expected to set up the search service on the collection.
        standard_pdf_collection.init_search(index=False)
        # Basic check only: init_search ran, and the collection now carries
        # an initialised _search_service.
        assert hasattr(standard_pdf_collection, "_search_service")
        search_service = standard_pdf_collection._search_service
        assert search_service is not None
        # TODO: Add actual indexing and searching tests later
    except ImportError as import_error:
        pytest.fail(f"Import failed even though haystack/chromadb seem installed: {import_error}")
    except Exception as search_error:
        pytest.fail(f"Haystack integration failed when installed: {search_error}")
265
-
266
-
267
def test_search_haystack_fails_gracefully_when_not_installed(standard_pdf_collection):
    """Initialise search without Haystack installed and expect ``ImportError``.

    Skipped when ``is_extra_installed("haystack")`` reports the package as
    importable. The raised ``ImportError`` must carry the expected guidance
    message.
    """
    import re

    if is_extra_installed("haystack"):
        pytest.skip("Skipping test: Haystack IS installed.")

    expected_message = (
        "Search Service could not be created. Ensure Haystack extras are installed"
    )
    # pytest.raises interprets ``match`` as a regular expression; escaping
    # keeps the "." in the message a literal period instead of a wildcard.
    with pytest.raises(ImportError, match=re.escape(expected_message)):
        standard_pdf_collection.init_search(index=False)
278
-
279
-
280
- # --- QA Tests --- #
281
-
282
-
283
def test_qa_works_when_installed(standard_pdf_page):
    """Ask the standard page a simple question and check the result's shape.

    No import guard is used because, per the original author's note,
    ``transformers`` is a core dependency. The result must be a dict holding
    ``"answer"`` and ``"confidence"``; any exception is reported through
    ``pytest.fail``.
    """
    try:
        # The answer itself is unknown; the test only requires that the call
        # runs and returns the expected keys.
        qa_result = standard_pdf_page.ask("What is this document about?")
        assert isinstance(qa_result, dict)
        for required_key in ("answer", "confidence"):
            assert required_key in qa_result
    except Exception as qa_error:
        pytest.fail(f"QA execution failed: {qa_error}")
295
-
296
-
297
- # No 'fails gracefully' test needed for QA as its core dep (transformers) is always installed.
298
- # We might need tests for *specific models* if they require separate downloads/setup.