natural-pdf 0.1.7__tar.gz → 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf-0.1.9/.cursor/rules/analysis_framework.mdc +58 -0
- natural_pdf-0.1.9/.cursor/rules/coding-style.mdc +5 -0
- natural_pdf-0.1.9/.cursor/rules/edit-md-instead-of-ipynb.mdc +5 -0
- natural_pdf-0.1.9/.cursor/rules/minimal-comments.mdc +5 -0
- natural_pdf-0.1.9/.cursor/rules/natural-pdf-overview.mdc +5 -0
- natural_pdf-0.1.9/.cursor/rules/user-friendly-library-code.mdc +5 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/.gitignore +1 -0
- natural_pdf-0.1.7/execute_notebooks.py → natural_pdf-0.1.9/01-execute_notebooks.py +2 -0
- natural_pdf-0.1.9/MANIFEST.in +48 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/PKG-INFO +17 -3
- natural_pdf-0.1.9/audit_packaging.py +56 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/check_run_md.sh +15 -1
- natural_pdf-0.1.9/docs/categorizing-documents/index.md +168 -0
- natural_pdf-0.1.9/docs/data-extraction/index.md +109 -0
- natural_pdf-0.1.9/docs/element-selection/index.ipynb +969 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/element-selection/index.md +20 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/index.md +19 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/ocr/index.md +63 -16
- natural_pdf-0.1.9/docs/tutorials/01-loading-and-extraction.ipynb +1628 -0
- natural_pdf-0.1.9/docs/tutorials/02-finding-elements.ipynb +374 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/02-finding-elements.md +3 -3
- natural_pdf-0.1.9/docs/tutorials/03-extracting-blocks.ipynb +152 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/04-table-extraction.ipynb +17 -12
- natural_pdf-0.1.9/docs/tutorials/05-excluding-content.ipynb +275 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/06-document-qa.ipynb +36 -31
- natural_pdf-0.1.9/docs/tutorials/07-layout-analysis.ipynb +269 -0
- natural_pdf-0.1.9/docs/tutorials/07-working-with-regions.ipynb +414 -0
- natural_pdf-0.1.9/docs/tutorials/08-spatial-navigation.ipynb +513 -0
- natural_pdf-0.1.9/docs/tutorials/09-section-extraction.ipynb +2439 -0
- natural_pdf-0.1.9/docs/tutorials/10-form-field-extraction.ipynb +503 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- natural_pdf-0.1.9/docs/tutorials/12-ocr-integration.ipynb +3712 -0
- natural_pdf-0.1.9/docs/tutorials/12-ocr-integration.md +137 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/13-semantic-search.ipynb +629 -546
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/13-semantic-search.md +8 -7
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/mkdocs.yml +3 -1
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/__init__.py +3 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/base.py +1 -5
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/gemini.py +61 -51
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/layout_manager.py +26 -84
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf-0.1.9/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/surya.py +46 -123
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/tatr.py +51 -4
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/text_structure.py +3 -5
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/utils.py +3 -3
- natural_pdf-0.1.9/natural_pdf/classification/manager.py +422 -0
- natural_pdf-0.1.9/natural_pdf/classification/mixin.py +163 -0
- natural_pdf-0.1.9/natural_pdf/classification/results.py +80 -0
- natural_pdf-0.1.9/natural_pdf/collections/mixins.py +111 -0
- natural_pdf-0.1.9/natural_pdf/collections/pdf_collection.py +730 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/core/element_manager.py +83 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/core/highlighting_service.py +13 -22
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/core/page.py +578 -93
- natural_pdf-0.1.9/natural_pdf/core/pdf.py +1539 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/base.py +134 -40
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/collections.py +712 -109
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/region.py +722 -69
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/text.py +4 -1
- natural_pdf-0.1.9/natural_pdf/export/mixin.py +137 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/exporters/base.py +3 -3
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf-0.1.9/natural_pdf/extraction/manager.py +135 -0
- natural_pdf-0.1.9/natural_pdf/extraction/mixin.py +279 -0
- natural_pdf-0.1.9/natural_pdf/extraction/result.py +23 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/__init__.py +5 -5
- natural_pdf-0.1.9/natural_pdf/ocr/engine_doctr.py +346 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/engine_easyocr.py +6 -3
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/ocr_factory.py +24 -4
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf-0.1.9/natural_pdf/ocr/ocr_options.py +198 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/utils.py +19 -6
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/qa/document_qa.py +0 -4
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/search/__init__.py +20 -34
- natural_pdf-0.1.9/natural_pdf/search/haystack_search_service.py +687 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/search/haystack_utils.py +99 -75
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf-0.1.9/natural_pdf/selectors/parser.py +612 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/debug.py +3 -3
- natural_pdf-0.1.9/natural_pdf/utils/locks.py +8 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/packaging.py +8 -6
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf-0.1.9/natural_pdf/utils/tqdm_utils.py +51 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/visualization.py +18 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf.egg-info/PKG-INFO +17 -3
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf.egg-info/SOURCES.txt +23 -3
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf.egg-info/requires.txt +19 -2
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf.egg-info/top_level.txt +1 -3
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pyproject.toml +35 -12
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/tests/exporters/test_paddleocr_exporter.py +4 -3
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/tests/test_optional_deps.py +43 -17
- natural_pdf-0.1.7/MANIFEST.in +0 -8
- natural_pdf-0.1.7/docs/element-selection/index.ipynb +0 -915
- natural_pdf-0.1.7/docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- natural_pdf-0.1.7/docs/tutorials/02-finding-elements.ipynb +0 -340
- natural_pdf-0.1.7/docs/tutorials/03-extracting-blocks.ipynb +0 -147
- natural_pdf-0.1.7/docs/tutorials/05-excluding-content.ipynb +0 -270
- natural_pdf-0.1.7/docs/tutorials/07-layout-analysis.ipynb +0 -288
- natural_pdf-0.1.7/docs/tutorials/07-working-with-regions.ipynb +0 -413
- natural_pdf-0.1.7/docs/tutorials/08-spatial-navigation.ipynb +0 -508
- natural_pdf-0.1.7/docs/tutorials/09-section-extraction.ipynb +0 -2434
- natural_pdf-0.1.7/docs/tutorials/10-form-field-extraction.ipynb +0 -512
- natural_pdf-0.1.7/docs/tutorials/12-ocr-integration.ipynb +0 -604
- natural_pdf-0.1.7/docs/tutorials/12-ocr-integration.md +0 -175
- natural_pdf-0.1.7/natural_pdf/collections/pdf_collection.py +0 -311
- natural_pdf-0.1.7/natural_pdf/core/pdf.py +0 -1087
- natural_pdf-0.1.7/natural_pdf/ocr/ocr_options.py +0 -115
- natural_pdf-0.1.7/natural_pdf/search/haystack_search_service.py +0 -643
- natural_pdf-0.1.7/natural_pdf/selectors/parser.py +0 -411
- natural_pdf-0.1.7/notebooks/Examples.ipynb +0 -1293
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/.github/workflows/docs.yml +0 -0
- natural_pdf-0.1.7/run_all_tutorials.sh → natural_pdf-0.1.9/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/CLAUDE.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/LICENSE +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/README.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/api/index.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/document-qa/index.ipynb +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/installation/index.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/interactive-widget/index.ipynb +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/layout-analysis/index.ipynb +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/pdf-navigation/index.ipynb +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/regions/index.ipynb +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/regions/index.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tables/index.ipynb +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tables/index.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/text-analysis/index.ipynb +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/text-extraction/index.ipynb +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/text-extraction/index.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/visual-debugging/index.ipynb +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/identifiers.py +1 -1
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/widgets/frontend/viewer.js +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/noxfile.py +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/.gitkeep +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/01-practice.pdf +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/0500000US42001.pdf +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/0500000US42007.pdf +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/2014 Statistics.pdf +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/2019 Statistics.pdf +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/publish.sh +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/sample-screen.png +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/setup.cfg +0 -0
- {natural_pdf-0.1.7 → natural_pdf-0.1.9}/tests/test_loading.py +0 -0
@@ -0,0 +1,58 @@
|
|
1
|
+
---
|
2
|
+
description:
|
3
|
+
globs:
|
4
|
+
alwaysApply: false
|
5
|
+
---
|
6
|
+
\
|
7
|
+
# Analysis Framework Guide
|
8
|
+
|
9
|
+
This document outlines the agreed-upon structure for adding and managing machine learning analysis results (like classification, NER, summarization) on `Page` and `Region` objects within the `natural-pdf` library.
|
10
|
+
|
11
|
+
## Summary of Framework (Implemented for Classification)
|
12
|
+
|
13
|
+
1. **Central Registry (`element.analyses`)**:
|
14
|
+
* A dictionary attribute named `analyses` exists on [natural_pdf/core/page.py](mdc:natural_pdf/core/page.py) (`Page`) and [natural_pdf/elements/region.py](mdc:natural_pdf/elements/region.py) (`Region`) objects.
|
15
|
+
* It stores results from different analysis types, keyed by an `analysis_key` string.
|
16
|
+
* Example: `page.analyses = {'classification': <ClassificationResult...>, 'ner_run_1': <NERResult...>}`
|
17
|
+
|
18
|
+
2. **Structured Result Objects**:
|
19
|
+
* Each analysis type should have a dedicated `Result` class (e.g., [natural_pdf/classification/results.py](mdc:natural_pdf/classification/results.py) contains `ClassificationResult`, `CategoryScore`).
|
20
|
+
* These objects store structured findings and metadata (model used, parameters, timestamp, `using` mode).
|
21
|
+
* Stored as values in the `analyses` dictionary.
|
22
|
+
|
23
|
+
3. **Manager Registry (`PDF.get_manager`)**:
|
24
|
+
* The [natural_pdf/core/pdf.py](mdc:natural_pdf/core/pdf.py) `PDF` class has a `get_manager(manager_type)` method.
|
25
|
+
* This handles lazy initialization and retrieval of specific analysis managers (e.g., `ClassificationManager` from [natural_pdf/classification/manager.py](mdc:natural_pdf/classification/manager.py)).
|
26
|
+
* Managers encapsulate ML model interaction and result processing.
|
27
|
+
|
28
|
+
4. **Invocation Methods (`element.classify`, etc.)**:
|
29
|
+
* Methods are added to `Page`/`Region` (often via Mixins like [natural_pdf/classification/mixin.py](mdc:natural_pdf/classification/mixin.py)).
|
30
|
+
* They accept an optional `analysis_key: str` parameter.
|
31
|
+
* **Default:** If omitted, uses a standard key (e.g., `'classification'`) and *overwrites* previous results under that key.
|
32
|
+
* **Custom:** If provided, stores the result under the custom key, allowing multiple results of the same type to coexist.
|
33
|
+
* They use the `PDF.get_manager` to get the appropriate manager, call it, and store the returned `Result` object in `element.analyses[analysis_key]`.
|
34
|
+
|
35
|
+
5. **Parameter Renaming**:
|
36
|
+
* The parameter specifying text vs. vision analysis has been standardized to `using=` (e.g., `using='text'`, `using='vision'`).
|
37
|
+
|
38
|
+
6. **Convenience Accessors**:
|
39
|
+
* Simple properties/methods (e.g., `element.category`, `element.category_confidence`) provide easy access to results.
|
40
|
+
* These *always* read from the **default** key in the `analyses` registry (e.g., `analyses['classification']`).
|
41
|
+
|
42
|
+
## TODO List for New Analysis Features
|
43
|
+
|
44
|
+
* [ ] **NER**: Create `NERManager`, `NERResult`, `Entity`, `element.apply_ner()`, `element.entities` property. Implement optional `source_elements` mapping.
|
45
|
+
* [ ] **Summarization**: Create `SummarizationManager`, `SummarizationResult`, `element.summarize()`, `element.summary` property.
|
46
|
+
* [ ] **Translation**: Create `TranslationManager`, `TranslationResult`, `element.translate()`, `element.translated_text()` method.
|
47
|
+
* [ ] **Structured Data Extraction**: Create `StructuredDataManager`, `StructuredDataResult`, `element.extract_structured_data()`.
|
48
|
+
* [ ] **Ad-hoc Analysis**: Implement `element.run_custom_analysis()` or similar.
|
49
|
+
* [ ] **Documentation**: Update user docs for the framework.
|
50
|
+
|
51
|
+
## Coding Conventions for New Analyses
|
52
|
+
|
53
|
+
1. **Manager**: New `Manager` class in `natural_pdf/<task>/manager.py`. Handles ML logic.
|
54
|
+
2. **Registration**: Update `PDF.get_manager` to initialize the new manager. Check `is_available()`.
|
55
|
+
3. **Result Object**: New `Result` class(es) in `natural_pdf/<task>/results.py`. Stores params and findings.
|
56
|
+
4. **Element Method**: Add method to `Page`/`Region` (via Mixin?). Must take `analysis_key` (defaulting to standard task name). Calls manager, stores result in `analyses[analysis_key]`.
|
57
|
+
5. **Accessor**: Add convenience property/method accessing `analyses[DEFAULT_KEY]`.
|
58
|
+
6. **Dependencies**: Use `try...except ImportError` and extras in `pyproject.toml`.
|
@@ -0,0 +1,48 @@
|
|
1
|
+
include README.md
|
2
|
+
include LICENSE
|
3
|
+
|
4
|
+
# HTML templates
|
5
|
+
recursive-include natural_pdf/templates *.html
|
6
|
+
|
7
|
+
# Documentation assets
|
8
|
+
recursive-include docs *.md *.png *.jpg *.gif
|
9
|
+
|
10
|
+
# Remove common build garbage
|
11
|
+
global-exclude __pycache__ *.py[cod] *.so .DS_Store
|
12
|
+
global-exclude *hidden*
|
13
|
+
|
14
|
+
# 💣 Critical: prevent recursion bugs
|
15
|
+
prune build
|
16
|
+
prune dist
|
17
|
+
prune .nox
|
18
|
+
prune .venv
|
19
|
+
prune env
|
20
|
+
prune venv
|
21
|
+
|
22
|
+
# General junk
|
23
|
+
exclude .notebook_cache.json
|
24
|
+
exclude Untitled.ipynb
|
25
|
+
exclude conversation.md
|
26
|
+
exclude transcript.md
|
27
|
+
exclude sample.py
|
28
|
+
exclude sample2.py
|
29
|
+
exclude requirements.lock
|
30
|
+
exclude install.sh
|
31
|
+
|
32
|
+
# Directories to exclude
|
33
|
+
prune .venv
|
34
|
+
prune output
|
35
|
+
prune results
|
36
|
+
prune natural_pdf_index
|
37
|
+
prune hidden
|
38
|
+
prune pdfs/hidden
|
39
|
+
prune my_paddleocr_finetune_data
|
40
|
+
prune notebooks
|
41
|
+
prune docs/tutorials/pdfs
|
42
|
+
|
43
|
+
# Individual files in nested directories
|
44
|
+
exclude docs/tutorials/needs-ocr-searchable.pdf
|
45
|
+
exclude notebooks/Examples.md
|
46
|
+
|
47
|
+
# File patterns
|
48
|
+
global-exclude *.hocr
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.7
|
3
|
+
Version: 0.1.9
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -17,11 +17,13 @@ Requires-Dist: colour
|
|
17
17
|
Requires-Dist: numpy
|
18
18
|
Requires-Dist: urllib3
|
19
19
|
Requires-Dist: tqdm
|
20
|
+
Requires-Dist: pydantic
|
20
21
|
Provides-Extra: interactive
|
21
22
|
Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
|
22
23
|
Provides-Extra: haystack
|
23
24
|
Requires-Dist: haystack-ai; extra == "haystack"
|
24
|
-
Requires-Dist:
|
25
|
+
Requires-Dist: lancedb-haystack; extra == "haystack"
|
26
|
+
Requires-Dist: lancedb; extra == "haystack"
|
25
27
|
Requires-Dist: sentence-transformers; extra == "haystack"
|
26
28
|
Requires-Dist: natural-pdf[core-ml]; extra == "haystack"
|
27
29
|
Provides-Extra: easyocr
|
@@ -36,6 +38,9 @@ Requires-Dist: natural-pdf[core-ml]; extra == "layout-yolo"
|
|
36
38
|
Provides-Extra: surya
|
37
39
|
Requires-Dist: surya-ocr; extra == "surya"
|
38
40
|
Requires-Dist: natural-pdf[core-ml]; extra == "surya"
|
41
|
+
Provides-Extra: doctr
|
42
|
+
Requires-Dist: python-doctr[torch]; extra == "doctr"
|
43
|
+
Requires-Dist: natural-pdf[core-ml]; extra == "doctr"
|
39
44
|
Provides-Extra: qa
|
40
45
|
Requires-Dist: natural-pdf[core-ml]; extra == "qa"
|
41
46
|
Provides-Extra: docling
|
@@ -43,7 +48,10 @@ Requires-Dist: docling; extra == "docling"
|
|
43
48
|
Requires-Dist: natural-pdf[core-ml]; extra == "docling"
|
44
49
|
Provides-Extra: llm
|
45
50
|
Requires-Dist: openai>=1.0; extra == "llm"
|
46
|
-
|
51
|
+
Provides-Extra: classification
|
52
|
+
Requires-Dist: sentence-transformers; extra == "classification"
|
53
|
+
Requires-Dist: timm; extra == "classification"
|
54
|
+
Requires-Dist: natural-pdf[core-ml]; extra == "classification"
|
47
55
|
Provides-Extra: test
|
48
56
|
Requires-Dist: pytest; extra == "test"
|
49
57
|
Provides-Extra: dev
|
@@ -59,6 +67,9 @@ Requires-Dist: pipdeptree; extra == "dev"
|
|
59
67
|
Requires-Dist: nbformat; extra == "dev"
|
60
68
|
Requires-Dist: jupytext; extra == "dev"
|
61
69
|
Requires-Dist: nbclient; extra == "dev"
|
70
|
+
Provides-Extra: deskew
|
71
|
+
Requires-Dist: deskew>=1.5; extra == "deskew"
|
72
|
+
Requires-Dist: img2pdf; extra == "deskew"
|
62
73
|
Provides-Extra: all
|
63
74
|
Requires-Dist: natural-pdf[interactive]; extra == "all"
|
64
75
|
Requires-Dist: natural-pdf[haystack]; extra == "all"
|
@@ -66,10 +77,13 @@ Requires-Dist: natural-pdf[easyocr]; extra == "all"
|
|
66
77
|
Requires-Dist: natural-pdf[paddle]; extra == "all"
|
67
78
|
Requires-Dist: natural-pdf[layout_yolo]; extra == "all"
|
68
79
|
Requires-Dist: natural-pdf[surya]; extra == "all"
|
80
|
+
Requires-Dist: natural-pdf[doctr]; extra == "all"
|
69
81
|
Requires-Dist: natural-pdf[qa]; extra == "all"
|
70
82
|
Requires-Dist: natural-pdf[ocr-export]; extra == "all"
|
71
83
|
Requires-Dist: natural-pdf[docling]; extra == "all"
|
72
84
|
Requires-Dist: natural-pdf[llm]; extra == "all"
|
85
|
+
Requires-Dist: natural-pdf[classification]; extra == "all"
|
86
|
+
Requires-Dist: natural-pdf[deskew]; extra == "all"
|
73
87
|
Requires-Dist: natural-pdf[test]; extra == "all"
|
74
88
|
Provides-Extra: core-ml
|
75
89
|
Requires-Dist: torch; extra == "core-ml"
|
@@ -0,0 +1,56 @@
|
|
1
|
+
import subprocess
|
2
|
+
import tarfile
|
3
|
+
import zipfile
|
4
|
+
from pathlib import Path
|
5
|
+
|
6
|
+
DIST_DIR = Path("dist")
|
7
|
+
|
8
|
+
|
9
|
+
def build_package():
|
10
|
+
subprocess.run(["python", "-m", "build", "--sdist", "--wheel"], check=True)
|
11
|
+
|
12
|
+
|
13
|
+
def get_sdist_files():
|
14
|
+
sdist_path = next(DIST_DIR.glob("*.tar.gz"))
|
15
|
+
with tarfile.open(sdist_path, "r:gz") as tar:
|
16
|
+
return sorted(str(Path(m.name)) for m in tar.getmembers() if m.isfile())
|
17
|
+
|
18
|
+
|
19
|
+
def get_wheel_files():
|
20
|
+
wheel_path = next(DIST_DIR.glob("*.whl"))
|
21
|
+
with zipfile.ZipFile(wheel_path, "r") as zipf:
|
22
|
+
return sorted(str(f) for f in zipf.namelist() if not f.endswith("/"))
|
23
|
+
|
24
|
+
|
25
|
+
def get_gitignored_files():
|
26
|
+
proc = subprocess.run(
|
27
|
+
["git", "ls-files", "--others", "-i", "--exclude-standard"],
|
28
|
+
check=True,
|
29
|
+
capture_output=True,
|
30
|
+
text=True,
|
31
|
+
)
|
32
|
+
return sorted(proc.stdout.strip().splitlines())
|
33
|
+
|
34
|
+
|
35
|
+
def diff_lists(packaged, ignored):
|
36
|
+
return sorted(set(packaged) & set(ignored))
|
37
|
+
|
38
|
+
|
39
|
+
def main():
|
40
|
+
build_package()
|
41
|
+
|
42
|
+
sdist_files = get_sdist_files()
|
43
|
+
wheel_files = get_wheel_files()
|
44
|
+
ignored_files = get_gitignored_files()
|
45
|
+
|
46
|
+
print("\n🚫 Files in *sdist* that are also .gitignored:")
|
47
|
+
for f in diff_lists(sdist_files, ignored_files):
|
48
|
+
print(" •", f)
|
49
|
+
|
50
|
+
print("\n🚫 Files in *wheel* that are also .gitignored:")
|
51
|
+
for f in diff_lists(wheel_files, ignored_files):
|
52
|
+
print(" •", f)
|
53
|
+
|
54
|
+
|
55
|
+
if __name__ == "__main__":
|
56
|
+
main()
|
@@ -9,12 +9,26 @@ fi
|
|
9
9
|
|
10
10
|
MARKDOWN_FILE=$1
|
11
11
|
NOTEBOOK_FILE="${MARKDOWN_FILE%.md}.ipynb"
|
12
|
+
KERNEL_NAME="natural-pdf"
|
12
13
|
|
13
14
|
echo "Converting $MARKDOWN_FILE to notebook..."
|
14
15
|
# Jupytext will now automatically add tags based on markdown metadata
|
15
16
|
jupytext --to ipynb "$MARKDOWN_FILE" || { echo "Conversion failed"; exit 1; }
|
16
17
|
|
18
|
+
echo "Patching notebook $NOTEBOOK_FILE with kernel $KERNEL_NAME..."
|
19
|
+
python3 - <<EOF
|
20
|
+
import nbformat
|
21
|
+
nb = nbformat.read("$NOTEBOOK_FILE", as_version=4)
|
22
|
+
nb.metadata["kernelspec"] = {
|
23
|
+
"name": "$KERNEL_NAME",
|
24
|
+
"display_name": "Python ($KERNEL_NAME)",
|
25
|
+
"language": "python"
|
26
|
+
}
|
27
|
+
nbformat.write(nb, "$NOTEBOOK_FILE")
|
28
|
+
EOF
|
29
|
+
|
30
|
+
|
17
31
|
echo "Executing notebook $NOTEBOOK_FILE..."
|
18
|
-
jupyter execute "$NOTEBOOK_FILE" --inplace || { echo "Execution failed"; exit 1; }
|
32
|
+
jupyter execute "$NOTEBOOK_FILE" --inplace --ExecutePreprocessor.kernel_name=natural-pdf || { echo "Execution failed"; exit 1; }
|
19
33
|
|
20
34
|
echo "Success! Notebook executed and results saved to $NOTEBOOK_FILE"
|
@@ -0,0 +1,168 @@
|
|
1
|
+
# Categorizing Pages and Regions
|
2
|
+
|
3
|
+
Natural PDF allows you to automatically categorize pages or specific regions within a page using machine learning models. This is incredibly useful for filtering large collections of documents or understanding the structure and content of individual PDFs.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
To use the classification features, you need to install the optional dependencies:
|
8
|
+
|
9
|
+
```bash
|
10
|
+
pip install "natural-pdf[classification]"
|
11
|
+
```
|
12
|
+
|
13
|
+
This installs necessary libraries like `torch`, `transformers`, and others.
|
14
|
+
|
15
|
+
## Core Concept: The `.classify()` Method
|
16
|
+
|
17
|
+
The primary way to perform categorization is using the `.classify()` method available on `Page` and `Region` objects.
|
18
|
+
|
19
|
+
```python
|
20
|
+
from natural_pdf import PDF
|
21
|
+
|
22
|
+
# Example: Classify a Page
|
23
|
+
pdf = PDF("pdfs/01-practice.pdf")
|
24
|
+
page = pdf.pages[0]
|
25
|
+
categories = ["invoice", "letter", "report cover", "data table"]
|
26
|
+
results = page.classify(categories=categories, model="text")
|
27
|
+
|
28
|
+
# Access the top result
|
29
|
+
print(f"Top Category: {page.category}")
|
30
|
+
print(f"Confidence: {page.category_confidence:.3f}")
|
31
|
+
|
32
|
+
# Access all results
|
33
|
+
# print(page.classification_results)
|
34
|
+
```
|
35
|
+
|
36
|
+
**Key Arguments:**
|
37
|
+
|
38
|
+
* `categories` (required): A list of strings representing the potential categories you want to classify the item into.
|
39
|
+
* `model` (optional): Specifies which classification model or strategy to use. Defaults to `"text"`.
|
40
|
+
* `"text"`: Uses a text-based model (default: `facebook/bart-large-mnli`) suitable for classifying based on language content.
|
41
|
+
* `"vision"`: Uses a vision-based model (default: `openai/clip-vit-base-patch32`) suitable for classifying based on visual layout and appearance.
|
42
|
+
* Specific Model ID: You can provide a Hugging Face model ID (e.g., `"google/siglip-base-patch16-224"`, `"MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"`) compatible with zero-shot text or image classification. The library attempts to infer whether the model is text- or vision-based, but you may need to set `using` explicitly.
|
43
|
+
* `using` (optional): Explicitly set to `"text"` or `"vision"` if the automatic inference based on the `model` ID fails or is ambiguous.
|
44
|
+
* `min_confidence` (optional): A float between 0.0 and 1.0. Only categories with a confidence score greater than or equal to this threshold will be included in the results (default: 0.0).
|
45
|
+
|
46
|
+
## Text vs. Vision Classification
|
47
|
+
|
48
|
+
Choosing the right model type depends on your goal:
|
49
|
+
|
50
|
+
### Text Classification (`model="text"`)
|
51
|
+
|
52
|
+
* **How it works:** Extracts the text from the page or region and analyzes the language content.
|
53
|
+
* **Best for:**
|
54
|
+
* **Topic Identification:** Determining what a page or section is *about* (e.g., "budget discussion," "environmental impact," "legal terms").
|
55
|
+
* **Content-Driven Document Types:** Identifying document types primarily defined by their text (e.g., emails, meeting minutes, news articles, reports).
|
56
|
+
* **Data Journalism Example:** You have thousands of pages of government reports. You can use text classification to find all pages discussing "public health funding" or classify paragraphs within environmental impact statements to find mentions of specific endangered species.
|
57
|
+
|
58
|
+
```python
|
59
|
+
# Find pages related to finance
|
60
|
+
financial_categories = ["budget", "revenue", "expenditure", "forecast"]
|
61
|
+
pdf.classify_pages(categories=financial_categories, model="text")
|
62
|
+
budget_pages = [p for p in pdf.pages if p.category == "budget"]
|
63
|
+
```
|
64
|
+
|
65
|
+
### Vision Classification (`model="vision"`)
|
66
|
+
|
67
|
+
* **How it works:** Renders the page or region as an image and analyzes its visual layout, structure, and appearance.
|
68
|
+
* **Best for:**
|
69
|
+
* **Layout-Driven Document Types:** Identifying documents recognizable by their structure (e.g., invoices, receipts, forms, presentation slides, title pages).
|
70
|
+
* **Identifying Visual Elements:** Distinguishing between pages dominated by text, tables, charts, or images.
|
71
|
+
* **Data Journalism Example:** You have a scanned archive of campaign finance filings containing various document types. You can use vision classification to quickly isolate all the pages that look like donation receipts or expenditure forms, even if the OCR quality is poor.
|
72
|
+
|
73
|
+
```python
|
74
|
+
# Find pages that look like invoices or receipts
|
75
|
+
visual_categories = ["invoice", "receipt", "letter", "form"]
|
76
|
+
page.classify(categories=visual_categories, model="vision")
|
77
|
+
if page.category in ["invoice", "receipt"]:
|
78
|
+
print(f"Page {page.number} looks like an invoice or receipt.")
|
79
|
+
```
|
80
|
+
|
81
|
+
## Classifying Specific Objects
|
82
|
+
|
83
|
+
### Pages (`page.classify(...)`)
|
84
|
+
|
85
|
+
Classifying a whole page is useful for sorting documents or identifying the overall purpose of a page within a larger document.
|
86
|
+
|
87
|
+
```python
|
88
|
+
# Classify the first page
|
89
|
+
page = pdf.pages[0]
|
90
|
+
page_types = ["cover page", "table of contents", "chapter start", "appendix"]
|
91
|
+
page.classify(categories=page_types, model="vision") # Vision often good for page structure
|
92
|
+
print(f"Page 1 Type: {page.category}")
|
93
|
+
```
|
94
|
+
|
95
|
+
### Regions (`region.classify(...)`)
|
96
|
+
|
97
|
+
Classifying a specific region allows for more granular analysis within a page. You might first detect regions using Layout Analysis and then classify those regions.
|
98
|
+
|
99
|
+
```python
|
100
|
+
# Assume layout analysis has run, find paragraphs
|
101
|
+
paragraphs = page.find_all("region[type=paragraph]")
|
102
|
+
if paragraphs:
|
103
|
+
# Classify the topic of the first paragraph
|
104
|
+
topic_categories = ["introduction", "methodology", "results", "conclusion"]
|
105
|
+
# Use text model for topic
|
106
|
+
paragraphs[0].classify(categories=topic_categories, model="text")
|
107
|
+
print(f"First paragraph category: {paragraphs[0].category}")
|
108
|
+
```
|
109
|
+
|
110
|
+
## Accessing Classification Results
|
111
|
+
|
112
|
+
After running `.classify()`, you can access the results:
|
113
|
+
|
114
|
+
* `page.category` or `region.category`: Returns the string label of the category with the highest confidence score from the *last* classification run. Returns `None` if no classification has been run or no category met the threshold.
|
115
|
+
* `page.category_confidence` or `region.category_confidence`: Returns the float confidence score (0.0-1.0) for the top category. Returns `None` otherwise.
|
116
|
+
* `page.classification_results` or `region.classification_results`: Returns the full result dictionary stored in the object's `.metadata['classification']`, containing the model used, engine type, categories provided, timestamp, and a list of all scores above the threshold sorted by confidence. Returns `None` if no classification has been run.
|
117
|
+
|
118
|
+
```python
|
119
|
+
results = page.classify(categories=["invoice", "letter"], model="text", min_confidence=0.5)
|
120
|
+
|
121
|
+
if page.category == "invoice":
|
122
|
+
print(f"Found an invoice with confidence {page.category_confidence:.2f}")
|
123
|
+
|
124
|
+
# See all results above the threshold
|
125
|
+
# print(page.classification_results['scores'])
|
126
|
+
```
|
127
|
+
|
128
|
+
## Classifying Collections
|
129
|
+
|
130
|
+
For batch processing, use the `.classify_all()` method on `PDFCollection` or `ElementCollection` objects. This displays a progress bar tracking individual items (pages or elements).
|
131
|
+
|
132
|
+
### PDFCollection (`collection.classify_all(...)`)
|
133
|
+
|
134
|
+
Classifies pages across all PDFs in the collection. Use `max_workers` for parallel processing across different PDF files.
|
135
|
+
|
136
|
+
```python
|
137
|
+
from natural_pdf import PDFCollection
collection = PDFCollection.from_directory("./documents/")
|
138
|
+
categories = ["form", "datasheet", "image", "text document"]
|
139
|
+
|
140
|
+
# Classify all pages using vision model, processing 4 PDFs concurrently
|
141
|
+
collection.classify_all(categories=categories, model="vision", max_workers=4)
|
142
|
+
|
143
|
+
# Filter PDFs containing forms
|
144
|
+
form_pdfs = []
|
145
|
+
for pdf in collection:
|
146
|
+
if any(p.category == "form" for p in pdf.pages if p.category):
|
147
|
+
form_pdfs.append(pdf.path)
|
148
|
+
pdf.close() # Remember to close PDFs
|
149
|
+
|
150
|
+
print(f"Found forms in: {form_pdfs}")
|
151
|
+
```
|
152
|
+
|
153
|
+
### ElementCollection (`element_collection.classify_all(...)`)
|
154
|
+
|
155
|
+
Classifies all classifiable elements (currently `Page` and `Region`) within the collection.
|
156
|
+
|
157
|
+
```python
|
158
|
+
# Assume 'pdf' is loaded and 'layout_regions' is an ElementCollection of Regions
|
159
|
+
layout_regions = pdf.find_all("region")
|
160
|
+
region_types = ["paragraph", "list", "table", "figure", "caption"]
|
161
|
+
|
162
|
+
# Classify all detected regions based on vision
|
163
|
+
layout_regions.classify_all(categories=region_types, model="vision")
|
164
|
+
|
165
|
+
# Count table regions
|
166
|
+
table_count = sum(1 for r in layout_regions if r.category == "table")
|
167
|
+
print(f"Found {table_count} regions classified as tables.")
|
168
|
+
```
|
@@ -0,0 +1,109 @@
|
|
1
|
+
# Structured Data Extraction
|
2
|
+
|
3
|
+
Extracting specific, structured information (like invoice numbers, dates, or addresses) from documents often requires more than simple text extraction. Natural PDF integrates with LLMs to pull out [structured data](https://platform.openai.com/docs/guides/structured-outputs).
|
4
|
+
|
5
|
+
You need to install more than just the tiny baby default `natural_pdf` for this:
|
6
|
+
```bash
|
7
|
+
# Install just the LLM portions
|
8
|
+
pip install "natural_pdf[llm]"
|
9
|
+
|
10
|
+
# Install eeeeeverything
|
11
|
+
pip install "natural_pdf[all]"
|
12
|
+
```
|
13
|
+
|
14
|
+
## Introduction
|
15
|
+
|
16
|
+
This feature allows you to define the exact data structure you want using a Pydantic model and then instruct an LLM to populate that structure based on the content of a PDF element (like a `Page` or `Region`).
|
17
|
+
|
18
|
+
> Not sure how to write a Pydantic schema? Just ask an LLM! "Write me a Pydantic schema to pull out an invoice number (an integer), a company name (string) and a date (string)." It'll go fine.
|
19
|
+
|
20
|
+
## Basic Extraction
|
21
|
+
|
22
|
+
1. **Define a Schema:** Create a Pydantic model for your desired data.
|
23
|
+
2. **Extract:** Use `.extract()` on a `PDF`, `Page`, or `Region` object.
|
24
|
+
3. **Access:** Use `.extracted()` to retrieve the results.
|
25
|
+
|
26
|
+
```python
|
27
|
+
from typing import Optional
from natural_pdf import PDF
|
28
|
+
from pydantic import BaseModel, Field
|
29
|
+
from openai import OpenAI
|
30
|
+
|
31
|
+
# Initialize your LLM client
|
32
|
+
# Anything OpenAI-compatible works!
|
33
|
+
client = OpenAI(
|
34
|
+
api_key="ANTHROPIC_API_KEY", # Your Anthropic API key
|
35
|
+
base_url="https://api.anthropic.com/v1/" # Anthropic's API endpoint
|
36
|
+
)
|
37
|
+
|
38
|
+
# Load the PDF
|
39
|
+
pdf = PDF("path/to/your/document.pdf")
|
40
|
+
page = pdf.pages[0]
|
41
|
+
|
42
|
+
# Define your schema
|
43
|
+
class InvoiceInfo(BaseModel):
|
44
|
+
invoice_number: str = Field(description="The main invoice identifier")
|
45
|
+
total_amount: float = Field(description="The final amount due")
|
46
|
+
company_name: Optional[str] = Field(None, description="The name of the issuing company")
|
47
|
+
|
48
|
+
# Extract data
|
49
|
+
page.extract(schema=InvoiceInfo, client=client)
|
50
|
+
|
51
|
+
# Access the full result object
|
52
|
+
full_data = page.extracted()
|
53
|
+
print(full_data)
|
54
|
+
|
55
|
+
# Access a single field
|
56
|
+
inv_num = page.extracted('invoice_number')
|
57
|
+
print(f"Invoice Number: {inv_num}")
|
58
|
+
```
|
59
|
+
|
60
|
+
## Keys and Overwriting
|
61
|
+
|
62
|
+
- By default, results are stored under the key `"default-structured"` in the element's `.analyses` dictionary.
|
63
|
+
- Use the `analysis_key` parameter in `.extract()` to store results under a different name (e.g., `analysis_key="customer_details"`).
|
64
|
+
- Attempting to extract using an existing `analysis_key` will raise an error unless `overwrite=True` is specified.
|
65
|
+
|
66
|
+
```python
|
67
|
+
# Extract using a specific key
|
68
|
+
page.extract(InvoiceInfo, client=client, analysis_key="invoice_header")
|
69
|
+
|
70
|
+
# Access using the specific key
|
71
|
+
header_data = page.extracted(analysis_key="invoice_header")
|
72
|
+
company = page.extracted('company_name', analysis_key="invoice_header")
|
73
|
+
```
|
74
|
+
|
75
|
+
## Text vs vision
|
76
|
+
|
77
|
+
When sending a page, region, or other element to an LLM, you can choose either `using='text'` (default) or `using='vision'`.
|
78
|
+
|
79
|
+
- `text` sends the text, somewhat respecting layout using `.extract_text(layout=True)`
|
80
|
+
- `vision` sends an image of the page with `.to_image(resolution=72)` (no highlights or labels)
|
81
|
+
|
82
|
+
## Batch and bulk extraction
|
83
|
+
|
84
|
+
If you have a lot of pages or a lot of PDFs or a lot of anything, the `.extract()` and `.extracted()` methods work identically on most parts of a PDF - regions, pages, collections of pdfs, etc, allowing a lot of flexibility in what you analyze.
|
85
|
+
|
86
|
+
```python
|
87
|
+
# Assuming 'header_region' is a Region object you defined
|
88
|
+
header_region.extract(InvoiceInfo, client=client)
|
89
|
+
company = header_region.extracted('company_name')
|
90
|
+
```
|
91
|
+
|
92
|
+
Furthermore, you can apply extraction to collections of elements (like `pdf.pages`, or the result of `pdf.find_all(...)`) using the `.apply()` method. This iterates through the collection and calls `.extract()` on each item.
|
93
|
+
|
94
|
+
```python
|
95
|
+
# Example: Extract InvoiceInfo from the first 5 pages
|
96
|
+
results = pdf.pages[:5].apply(
|
97
|
+
lambda page: page.extract(
|
98
|
+
client=client,
|
99
|
+
schema=InvoiceInfo,
|
100
|
+
using="text",
|
101
|
+
analysis_key="page_invoice_info",
|
102
|
+
)
|
103
|
+
)
|
104
|
+
|
105
|
+
# Access results for the first page in the collection
|
106
|
+
pdf.pages[0].extracted('company_name', analysis_key="page_invoice_info")
|
107
|
+
```
|
108
|
+
|
109
|
+
This provides a powerful way to turn unstructured PDF content into structured, usable data.
|