natural-pdf 0.1.9__tar.gz → 0.1.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/PKG-INFO +1 -1
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/categorizing-documents/index.md +20 -23
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/index.md +4 -4
- natural_pdf-0.1.10/docs/tutorials/01-loading-and-extraction.ipynb +1628 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/02-finding-elements.ipynb +46 -46
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/03-extracting-blocks.ipynb +17 -17
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/04-table-extraction.ipynb +12 -12
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/05-excluding-content.ipynb +30 -30
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/06-document-qa.ipynb +28 -28
- natural_pdf-0.1.10/docs/tutorials/07-layout-analysis.ipynb +269 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/07-working-with-regions.ipynb +48 -48
- natural_pdf-0.1.10/docs/tutorials/08-spatial-navigation.ipynb +513 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/09-section-extraction.ipynb +111 -111
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/10-form-field-extraction.ipynb +52 -52
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/12-ocr-integration.ipynb +998 -998
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/13-semantic-search.ipynb +331 -331
- natural_pdf-0.1.10/docs/tutorials/14-categorizing-documents.ipynb +2365 -0
- natural_pdf-0.1.10/docs/tutorials/14-categorizing-documents.md +99 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/mkdocs.yml +1 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/classification/manager.py +26 -22
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/classification/mixin.py +7 -7
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/classification/results.py +17 -9
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/collections/mixins.py +17 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/collections/pdf_collection.py +78 -46
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/core/pdf.py +62 -6
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/collections.py +107 -3
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf.egg-info/PKG-INFO +1 -1
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf.egg-info/SOURCES.txt +3 -0
- natural_pdf-0.1.10/pdfs/cia-doc.pdf +0 -0
- natural_pdf-0.1.9/docs/tutorials/01-loading-and-extraction.ipynb +0 -1628
- natural_pdf-0.1.9/docs/tutorials/07-layout-analysis.ipynb +0 -269
- natural_pdf-0.1.9/docs/tutorials/08-spatial-navigation.ipynb +0 -513
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.gitignore +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/CLAUDE.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/LICENSE +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/MANIFEST.in +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/README.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/audit_packaging.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/check_run_md.sh +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/api/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/document-qa/index.ipynb +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/element-selection/index.ipynb +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/installation/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/interactive-widget/index.ipynb +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/layout-analysis/index.ipynb +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/ocr/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/pdf-navigation/index.ipynb +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/regions/index.ipynb +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/regions/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tables/index.ipynb +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tables/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/text-analysis/index.ipynb +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/text-extraction/index.ipynb +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/text-extraction/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/visual-debugging/index.ipynb +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/__init__.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/core/element_manager.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/core/page.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/base.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/region.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/search/haystack_search_service.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/search/haystack_utils.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/selectors/parser.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/tqdm_utils.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/widgets/frontend/viewer.js +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf.egg-info/requires.txt +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf.egg-info/top_level.txt +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/noxfile.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/.gitkeep +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/01-practice.pdf +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/0500000US42001.pdf +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/0500000US42007.pdf +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/2014 Statistics.pdf +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/2019 Statistics.pdf +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/publish.sh +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pyproject.toml +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/sample-screen.png +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/setup.cfg +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/tests/test_loading.py +0 -0
- {natural_pdf-0.1.9 → natural_pdf-0.1.10}/tests/test_optional_deps.py +0 -0
@@ -22,32 +22,29 @@ from natural_pdf import PDF
|
|
22
22
|
# Example: Classify a Page
|
23
23
|
pdf = PDF("pdfs/01-practice.pdf")
|
24
24
|
page = pdf.pages[0]
|
25
|
-
|
26
|
-
|
25
|
+
labels = ["invoice", "letter", "report cover", "data table"]
|
26
|
+
page.classify(labels, using="text")
|
27
27
|
|
28
28
|
# Access the top result
|
29
29
|
print(f"Top Category: {page.category}")
|
30
30
|
print(f"Confidence: {page.category_confidence:.3f}")
|
31
|
-
|
32
|
-
# Access all results
|
33
|
-
# print(page.classification_results)
|
34
31
|
```
|
35
32
|
|
36
33
|
**Key Arguments:**
|
37
34
|
|
38
|
-
* `
|
39
|
-
* `
|
35
|
+
* `labels` (required): A list of strings representing the potential labels you want to classify the item into.
|
36
|
+
* `using` (optional): Specifies which classification model or strategy to use. Defaults to `"text"`.
|
40
37
|
* `"text"`: Uses a text-based model (default: `facebook/bart-large-mnli`) suitable for classifying based on language content.
|
41
38
|
* `"vision"`: Uses a vision-based model (default: `openai/clip-vit-base-patch32`) suitable for classifying based on visual layout and appearance.
|
42
39
|
* Specific Model ID: You can provide a Hugging Face model ID (e.g., `"google/siglip-base-patch16-224"`, `"MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"`) compatible with zero-shot text or image classification. The library attempts to infer whether it's a text or vision model, but you might need to specify `using` explicitly.
|
43
|
-
* `
|
44
|
-
* `min_confidence` (optional): A float between 0.0 and 1.0. Only
|
40
|
+
* `model` (optional): An explicit model ID (Hugging Face repo name) to use.
|
41
|
+
* `min_confidence` (optional): A float between 0.0 and 1.0. Only labels with a confidence score greater than or equal to this threshold will be included in the results (default: 0.0).
|
45
42
|
|
46
43
|
## Text vs. Vision Classification
|
47
44
|
|
48
45
|
Choosing the right model type depends on your goal:
|
49
46
|
|
50
|
-
### Text Classification (`
|
47
|
+
### Text Classification (`using="text"`)
|
51
48
|
|
52
49
|
* **How it works:** Extracts the text from the page or region and analyzes the language content.
|
53
50
|
* **Best for:**
|
@@ -57,12 +54,12 @@ Choosing the right model type depends on your goal:
|
|
57
54
|
|
58
55
|
```python
|
59
56
|
# Find pages related to finance
|
60
|
-
|
61
|
-
pdf.classify_pages(
|
57
|
+
financial_labels = ["budget", "revenue", "expenditure", "forecast"]
|
58
|
+
pdf.classify_pages(financial_labels, using="text")
|
62
59
|
budget_pages = [p for p in pdf.pages if p.category == "budget"]
|
63
60
|
```
|
64
61
|
|
65
|
-
### Vision Classification (`
|
62
|
+
### Vision Classification (`using="vision"`)
|
66
63
|
|
67
64
|
* **How it works:** Renders the page or region as an image and analyzes its visual layout, structure, and appearance.
|
68
65
|
* **Best for:**
|
@@ -72,8 +69,8 @@ budget_pages = [p for p in pdf.pages if p.category == "budget"]
|
|
72
69
|
|
73
70
|
```python
|
74
71
|
# Find pages that look like invoices or receipts
|
75
|
-
|
76
|
-
page.classify(
|
72
|
+
visual_labels = ["invoice", "receipt", "letter", "form"]
|
73
|
+
page.classify(visual_labels, using="vision")
|
77
74
|
if page.category in ["invoice", "receipt"]:
|
78
75
|
print(f"Page {page.number} looks like an invoice or receipt.")
|
79
76
|
```
|
@@ -88,7 +85,7 @@ Classifying a whole page is useful for sorting documents or identifying the over
|
|
88
85
|
# Classify the first page
|
89
86
|
page = pdf.pages[0]
|
90
87
|
page_types = ["cover page", "table of contents", "chapter start", "appendix"]
|
91
|
-
page.classify(
|
88
|
+
page.classify(page_types, using="vision") # Vision often good for page structure
|
92
89
|
print(f"Page 1 Type: {page.category}")
|
93
90
|
```
|
94
91
|
|
@@ -101,9 +98,9 @@ Classifying a specific region allows for more granular analysis within a page. Y
|
|
101
98
|
paragraphs = page.find_all("region[type=paragraph]")
|
102
99
|
if paragraphs:
|
103
100
|
# Classify the topic of the first paragraph
|
104
|
-
|
101
|
+
topic_labels = ["introduction", "methodology", "results", "conclusion"]
|
105
102
|
# Use text model for topic
|
106
|
-
paragraphs[0].classify(
|
103
|
+
paragraphs[0].classify(topic_labels, using="text")
|
107
104
|
print(f"First paragraph category: {paragraphs[0].category}")
|
108
105
|
```
|
109
106
|
|
@@ -113,10 +110,10 @@ After running `.classify()`, you can access the results:
|
|
113
110
|
|
114
111
|
* `page.category` or `region.category`: Returns the string label of the category with the highest confidence score from the *last* classification run. Returns `None` if no classification has been run or no category met the threshold.
|
115
112
|
* `page.category_confidence` or `region.category_confidence`: Returns the float confidence score (0.0-1.0) for the top category. Returns `None` otherwise.
|
116
|
-
* `page.classification_results` or `region.classification_results`: Returns the full result dictionary stored in the object's `.metadata['classification']`, containing the model used, engine type,
|
113
|
+
* `page.classification_results` or `region.classification_results`: Returns the full result dictionary stored in the object's `.metadata['classification']`, containing the model used, engine type, labels provided, timestamp, and a list of all scores above the threshold sorted by confidence. Returns `None` if no classification has been run.
|
117
114
|
|
118
115
|
```python
|
119
|
-
results = page.classify(
|
116
|
+
results = page.classify(["invoice", "letter"], using="text", min_confidence=0.5)
|
120
117
|
|
121
118
|
if page.category == "invoice":
|
122
119
|
print(f"Found an invoice with confidence {page.category_confidence:.2f}")
|
@@ -135,10 +132,10 @@ Classifies pages across all PDFs in the collection. Use `max_workers` for parall
|
|
135
132
|
|
136
133
|
```python
|
137
134
|
collection = natural_pdf.PDFCollection.from_directory("./documents/")
|
138
|
-
|
135
|
+
labels = ["form", "datasheet", "image", "text document"]
|
139
136
|
|
140
137
|
# Classify all pages using vision model, processing 4 PDFs concurrently
|
141
|
-
collection.classify_all(
|
138
|
+
collection.classify_all(labels, using="vision", max_workers=4)
|
142
139
|
|
143
140
|
# Filter PDFs containing forms
|
144
141
|
form_pdfs = []
|
@@ -160,7 +157,7 @@ layout_regions = pdf.find_all("region")
|
|
160
157
|
region_types = ["paragraph", "list", "table", "figure", "caption"]
|
161
158
|
|
162
159
|
# Classify all detected regions based on vision
|
163
|
-
layout_regions.classify_all(
|
160
|
+
layout_regions.classify_all(region_types, model="vision")
|
164
161
|
|
165
162
|
# Count table regions
|
166
163
|
table_count = sum(1 for r in layout_regions if r.category == "table")
|
@@ -140,14 +140,14 @@ Categorize pages or specific regions based on their content using text or vision
|
|
140
140
|
|
141
141
|
```python
|
142
142
|
# Classify a page based on text
|
143
|
-
|
144
|
-
page.classify(
|
143
|
+
labels = ["invoice", "scientific article", "presentation"]
|
144
|
+
page.classify(labels, using="text")
|
145
145
|
print(f"Page Category: {page.category} (Confidence: {page.category_confidence:.2f})")
|
146
146
|
|
147
147
|
|
148
148
|
# Classify a page based on what it looks like
|
149
|
-
|
150
|
-
page.classify(
|
149
|
+
labels = ["invoice", "scientific article", "presentation"]
|
150
|
+
page.classify(labels, using="vision")
|
151
151
|
print(f"Page Category: {page.category} (Confidence: {page.category_confidence:.2f})")
|
152
152
|
```
|
153
153
|
|