natural-pdf 0.1.8__tar.gz → 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf-0.1.9/MANIFEST.in +48 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/PKG-INFO +12 -3
- natural_pdf-0.1.9/audit_packaging.py +56 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/data-extraction/index.md +41 -19
- natural_pdf-0.1.9/docs/tutorials/01-loading-and-extraction.ipynb +1628 -0
- natural_pdf-0.1.9/docs/tutorials/02-finding-elements.ipynb +374 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/02-finding-elements.md +3 -3
- natural_pdf-0.1.9/docs/tutorials/03-extracting-blocks.ipynb +152 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/04-table-extraction.ipynb +12 -12
- natural_pdf-0.1.9/docs/tutorials/05-excluding-content.ipynb +275 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/06-document-qa.ipynb +28 -28
- natural_pdf-0.1.9/docs/tutorials/07-layout-analysis.ipynb +269 -0
- natural_pdf-0.1.9/docs/tutorials/07-working-with-regions.ipynb +414 -0
- natural_pdf-0.1.9/docs/tutorials/08-spatial-navigation.ipynb +513 -0
- natural_pdf-0.1.9/docs/tutorials/09-section-extraction.ipynb +2439 -0
- natural_pdf-0.1.9/docs/tutorials/10-form-field-extraction.ipynb +503 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/12-ocr-integration.ipynb +1007 -1007
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/13-semantic-search.ipynb +335 -642
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/13-semantic-search.md +8 -7
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/__init__.py +1 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/base.py +1 -5
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/gemini.py +61 -51
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/layout_manager.py +26 -84
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf-0.1.9/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/surya.py +46 -123
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/tatr.py +51 -4
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/text_structure.py +3 -5
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/utils.py +3 -3
- natural_pdf-0.1.9/natural_pdf/classification/manager.py +422 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/classification/mixin.py +49 -35
- natural_pdf-0.1.9/natural_pdf/classification/results.py +80 -0
- natural_pdf-0.1.9/natural_pdf/collections/mixins.py +111 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/collections/pdf_collection.py +177 -64
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/core/element_manager.py +30 -14
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/core/highlighting_service.py +13 -22
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/core/page.py +423 -101
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/core/pdf.py +633 -190
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/base.py +134 -40
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/collections.py +503 -131
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/region.py +659 -90
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/text.py +1 -1
- natural_pdf-0.1.9/natural_pdf/export/mixin.py +137 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/exporters/base.py +3 -3
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/exporters/paddleocr.py +4 -3
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/extraction/manager.py +50 -49
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/extraction/mixin.py +90 -57
- natural_pdf-0.1.9/natural_pdf/extraction/result.py +23 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/__init__.py +5 -5
- natural_pdf-0.1.9/natural_pdf/ocr/engine_doctr.py +346 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/ocr_factory.py +24 -4
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/ocr_manager.py +61 -25
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/ocr_options.py +70 -10
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/utils.py +6 -4
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/search/__init__.py +20 -34
- natural_pdf-0.1.9/natural_pdf/search/haystack_search_service.py +687 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/search/haystack_utils.py +99 -75
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/search/search_service_protocol.py +11 -12
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/selectors/parser.py +219 -143
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/debug.py +3 -3
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/locks.py +1 -1
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/packaging.py +8 -6
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/text_extraction.py +24 -16
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/tqdm_utils.py +18 -10
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/visualization.py +18 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf.egg-info/PKG-INFO +12 -3
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf.egg-info/SOURCES.txt +4 -1
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf.egg-info/requires.txt +13 -2
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf.egg-info/top_level.txt +0 -2
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pyproject.toml +28 -12
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/tests/exporters/test_paddleocr_exporter.py +4 -3
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/tests/test_optional_deps.py +43 -17
- natural_pdf-0.1.8/MANIFEST.in +0 -8
- natural_pdf-0.1.8/docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- natural_pdf-0.1.8/docs/tutorials/02-finding-elements.ipynb +0 -417
- natural_pdf-0.1.8/docs/tutorials/03-extracting-blocks.ipynb +0 -152
- natural_pdf-0.1.8/docs/tutorials/05-excluding-content.ipynb +0 -275
- natural_pdf-0.1.8/docs/tutorials/07-layout-analysis.ipynb +0 -293
- natural_pdf-0.1.8/docs/tutorials/07-working-with-regions.ipynb +0 -414
- natural_pdf-0.1.8/docs/tutorials/08-spatial-navigation.ipynb +0 -513
- natural_pdf-0.1.8/docs/tutorials/09-section-extraction.ipynb +0 -2439
- natural_pdf-0.1.8/docs/tutorials/10-form-field-extraction.ipynb +0 -517
- natural_pdf-0.1.8/natural_pdf/classification/manager.py +0 -343
- natural_pdf-0.1.8/natural_pdf/classification/results.py +0 -62
- natural_pdf-0.1.8/natural_pdf/collections/mixins.py +0 -63
- natural_pdf-0.1.8/natural_pdf/extraction/result.py +0 -37
- natural_pdf-0.1.8/natural_pdf/search/haystack_search_service.py +0 -643
- natural_pdf-0.1.8/notebooks/Examples.ipynb +0 -1293
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.gitignore +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/CLAUDE.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/LICENSE +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/README.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/check_run_md.sh +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/api/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/document-qa/index.ipynb +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/element-selection/index.ipynb +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/installation/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/interactive-widget/index.ipynb +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/layout-analysis/index.ipynb +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/ocr/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/pdf-navigation/index.ipynb +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/regions/index.ipynb +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/regions/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tables/index.ipynb +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tables/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/text-analysis/index.ipynb +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/text-extraction/index.ipynb +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/text-extraction/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/visual-debugging/index.ipynb +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/mkdocs.yml +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/identifiers.py +1 -1
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/widgets/frontend/viewer.js +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/noxfile.py +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/.gitkeep +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/01-practice.pdf +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/0500000US42001.pdf +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/0500000US42007.pdf +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/2014 Statistics.pdf +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/2019 Statistics.pdf +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/publish.sh +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/sample-screen.png +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/setup.cfg +0 -0
- {natural_pdf-0.1.8 → natural_pdf-0.1.9}/tests/test_loading.py +0 -0
@@ -0,0 +1,48 @@
|
|
1
|
+
include README.md
|
2
|
+
include LICENSE
|
3
|
+
|
4
|
+
# HTML templates
|
5
|
+
recursive-include natural_pdf/templates *.html
|
6
|
+
|
7
|
+
# Documentation assets
|
8
|
+
recursive-include docs *.md *.png *.jpg *.gif
|
9
|
+
|
10
|
+
# Remove common build garbage
|
11
|
+
global-exclude __pycache__ *.py[cod] *.so .DS_Store
|
12
|
+
global-exclude *hidden*
|
13
|
+
|
14
|
+
# 💣 Critical: prevent recursion bugs
|
15
|
+
prune build
|
16
|
+
prune dist
|
17
|
+
prune .nox
|
18
|
+
prune .venv
|
19
|
+
prune env
|
20
|
+
prune venv
|
21
|
+
|
22
|
+
# General junk
|
23
|
+
exclude .notebook_cache.json
|
24
|
+
exclude Untitled.ipynb
|
25
|
+
exclude conversation.md
|
26
|
+
exclude transcript.md
|
27
|
+
exclude sample.py
|
28
|
+
exclude sample2.py
|
29
|
+
exclude requirements.lock
|
30
|
+
exclude install.sh
|
31
|
+
|
32
|
+
# Directories to exclude
|
33
|
+
prune .venv
|
34
|
+
prune output
|
35
|
+
prune results
|
36
|
+
prune natural_pdf_index
|
37
|
+
prune hidden
|
38
|
+
prune pdfs/hidden
|
39
|
+
prune my_paddleocr_finetune_data
|
40
|
+
prune notebooks
|
41
|
+
prune docs/tutorials/pdfs
|
42
|
+
|
43
|
+
# Individual files in nested directories
|
44
|
+
exclude docs/tutorials/needs-ocr-searchable.pdf
|
45
|
+
exclude notebooks/Examples.md
|
46
|
+
|
47
|
+
# File patterns
|
48
|
+
global-exclude *.hocr
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.8
|
3
|
+
Version: 0.1.9
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -17,11 +17,13 @@ Requires-Dist: colour
|
|
17
17
|
Requires-Dist: numpy
|
18
18
|
Requires-Dist: urllib3
|
19
19
|
Requires-Dist: tqdm
|
20
|
+
Requires-Dist: pydantic
|
20
21
|
Provides-Extra: interactive
|
21
22
|
Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
|
22
23
|
Provides-Extra: haystack
|
23
24
|
Requires-Dist: haystack-ai; extra == "haystack"
|
24
|
-
Requires-Dist:
|
25
|
+
Requires-Dist: lancedb-haystack; extra == "haystack"
|
26
|
+
Requires-Dist: lancedb; extra == "haystack"
|
25
27
|
Requires-Dist: sentence-transformers; extra == "haystack"
|
26
28
|
Requires-Dist: natural-pdf[core-ml]; extra == "haystack"
|
27
29
|
Provides-Extra: easyocr
|
@@ -36,6 +38,9 @@ Requires-Dist: natural-pdf[core-ml]; extra == "layout-yolo"
|
|
36
38
|
Provides-Extra: surya
|
37
39
|
Requires-Dist: surya-ocr; extra == "surya"
|
38
40
|
Requires-Dist: natural-pdf[core-ml]; extra == "surya"
|
41
|
+
Provides-Extra: doctr
|
42
|
+
Requires-Dist: python-doctr[torch]; extra == "doctr"
|
43
|
+
Requires-Dist: natural-pdf[core-ml]; extra == "doctr"
|
39
44
|
Provides-Extra: qa
|
40
45
|
Requires-Dist: natural-pdf[core-ml]; extra == "qa"
|
41
46
|
Provides-Extra: docling
|
@@ -43,7 +48,6 @@ Requires-Dist: docling; extra == "docling"
|
|
43
48
|
Requires-Dist: natural-pdf[core-ml]; extra == "docling"
|
44
49
|
Provides-Extra: llm
|
45
50
|
Requires-Dist: openai>=1.0; extra == "llm"
|
46
|
-
Requires-Dist: pydantic; extra == "llm"
|
47
51
|
Provides-Extra: classification
|
48
52
|
Requires-Dist: sentence-transformers; extra == "classification"
|
49
53
|
Requires-Dist: timm; extra == "classification"
|
@@ -63,6 +67,9 @@ Requires-Dist: pipdeptree; extra == "dev"
|
|
63
67
|
Requires-Dist: nbformat; extra == "dev"
|
64
68
|
Requires-Dist: jupytext; extra == "dev"
|
65
69
|
Requires-Dist: nbclient; extra == "dev"
|
70
|
+
Provides-Extra: deskew
|
71
|
+
Requires-Dist: deskew>=1.5; extra == "deskew"
|
72
|
+
Requires-Dist: img2pdf; extra == "deskew"
|
66
73
|
Provides-Extra: all
|
67
74
|
Requires-Dist: natural-pdf[interactive]; extra == "all"
|
68
75
|
Requires-Dist: natural-pdf[haystack]; extra == "all"
|
@@ -70,11 +77,13 @@ Requires-Dist: natural-pdf[easyocr]; extra == "all"
|
|
70
77
|
Requires-Dist: natural-pdf[paddle]; extra == "all"
|
71
78
|
Requires-Dist: natural-pdf[layout_yolo]; extra == "all"
|
72
79
|
Requires-Dist: natural-pdf[surya]; extra == "all"
|
80
|
+
Requires-Dist: natural-pdf[doctr]; extra == "all"
|
73
81
|
Requires-Dist: natural-pdf[qa]; extra == "all"
|
74
82
|
Requires-Dist: natural-pdf[ocr-export]; extra == "all"
|
75
83
|
Requires-Dist: natural-pdf[docling]; extra == "all"
|
76
84
|
Requires-Dist: natural-pdf[llm]; extra == "all"
|
77
85
|
Requires-Dist: natural-pdf[classification]; extra == "all"
|
86
|
+
Requires-Dist: natural-pdf[deskew]; extra == "all"
|
78
87
|
Requires-Dist: natural-pdf[test]; extra == "all"
|
79
88
|
Provides-Extra: core-ml
|
80
89
|
Requires-Dist: torch; extra == "core-ml"
|
@@ -0,0 +1,56 @@
|
|
1
|
+
import subprocess
|
2
|
+
import tarfile
|
3
|
+
import zipfile
|
4
|
+
from pathlib import Path
|
5
|
+
|
6
|
+
DIST_DIR = Path("dist")
|
7
|
+
|
8
|
+
|
9
|
+
def build_package():
|
10
|
+
subprocess.run(["python", "-m", "build", "--sdist", "--wheel"], check=True)
|
11
|
+
|
12
|
+
|
13
|
+
def get_sdist_files():
|
14
|
+
sdist_path = next(DIST_DIR.glob("*.tar.gz"))
|
15
|
+
with tarfile.open(sdist_path, "r:gz") as tar:
|
16
|
+
return sorted(str(Path(m.name)) for m in tar.getmembers() if m.isfile())
|
17
|
+
|
18
|
+
|
19
|
+
def get_wheel_files():
|
20
|
+
wheel_path = next(DIST_DIR.glob("*.whl"))
|
21
|
+
with zipfile.ZipFile(wheel_path, "r") as zipf:
|
22
|
+
return sorted(str(f) for f in zipf.namelist() if not f.endswith("/"))
|
23
|
+
|
24
|
+
|
25
|
+
def get_gitignored_files():
|
26
|
+
proc = subprocess.run(
|
27
|
+
["git", "ls-files", "--others", "-i", "--exclude-standard"],
|
28
|
+
check=True,
|
29
|
+
capture_output=True,
|
30
|
+
text=True,
|
31
|
+
)
|
32
|
+
return sorted(proc.stdout.strip().splitlines())
|
33
|
+
|
34
|
+
|
35
|
+
def diff_lists(packaged, ignored):
|
36
|
+
return sorted(set(packaged) & set(ignored))
|
37
|
+
|
38
|
+
|
39
|
+
def main():
|
40
|
+
build_package()
|
41
|
+
|
42
|
+
sdist_files = get_sdist_files()
|
43
|
+
wheel_files = get_wheel_files()
|
44
|
+
ignored_files = get_gitignored_files()
|
45
|
+
|
46
|
+
print("\n🚫 Files in *sdist* that are also .gitignored:")
|
47
|
+
for f in diff_lists(sdist_files, ignored_files):
|
48
|
+
print(" •", f)
|
49
|
+
|
50
|
+
print("\n🚫 Files in *wheel* that are also .gitignored:")
|
51
|
+
for f in diff_lists(wheel_files, ignored_files):
|
52
|
+
print(" •", f)
|
53
|
+
|
54
|
+
|
55
|
+
if __name__ == "__main__":
|
56
|
+
main()
|
@@ -1,42 +1,56 @@
|
|
1
1
|
# Structured Data Extraction
|
2
2
|
|
3
|
-
Extracting specific, structured information (like invoice numbers, dates, or addresses) from documents often requires more than simple text extraction. Natural PDF integrates with
|
3
|
+
Extracting specific, structured information (like invoice numbers, dates, or addresses) from documents often requires more than simple text extraction. Natural PDF integrates with LLMs to pull out [structured data](https://platform.openai.com/docs/guides/structured-outputs).
|
4
|
+
|
5
|
+
You need to install more than just the tiny baby default `natural_pdf` for this:
|
6
|
+
```
|
7
|
+
# Install just the LLM portions
|
8
|
+
pip install "natural_pdf[llm]"
|
9
|
+
|
10
|
+
# Install eeeeeverything
|
11
|
+
pip install "natural_pdf[all]"
|
12
|
+
```
|
4
13
|
|
5
14
|
## Introduction
|
6
15
|
|
7
16
|
This feature allows you to define the exact data structure you want using a Pydantic model and then instruct an LLM to populate that structure based on the content of a PDF element (like a `Page` or `Region`).
|
8
17
|
|
18
|
+
> Not sure how to write a Pydantic schema? Just ask an LLM! "Write me a Pydantic schema to pull out an invoice number (an integer), a company name (string) and a date (string)." It'll go fine.
|
19
|
+
|
9
20
|
## Basic Extraction
|
10
21
|
|
11
22
|
1. **Define a Schema:** Create a Pydantic model for your desired data.
|
12
|
-
2. **Extract:** Use
|
13
|
-
3. **Access:** Use
|
23
|
+
2. **Extract:** Use `.extract()` on a `PDF`, `Page`, or `Region` object.
|
24
|
+
3. **Access:** Use `.extracted()` to retrieve the results.
|
14
25
|
|
15
26
|
```python
|
16
27
|
from natural_pdf import PDF
|
17
28
|
from pydantic import BaseModel, Field
|
18
|
-
from openai import OpenAI
|
29
|
+
from openai import OpenAI
|
19
30
|
|
20
|
-
#
|
21
|
-
|
31
|
+
# Initialize your LLM client
|
32
|
+
# Anything OpenAI-compatible works!
|
33
|
+
client = OpenAI(
|
34
|
+
api_key="ANTHROPIC_API_KEY", # Your Anthropic API key
|
35
|
+
base_url="https://api.anthropic.com/v1/" # Anthropic's API endpoint
|
36
|
+
)
|
22
37
|
|
23
38
|
# Load the PDF
|
24
39
|
pdf = PDF("path/to/your/document.pdf")
|
25
40
|
page = pdf.pages[0]
|
26
41
|
|
27
|
-
#
|
42
|
+
# Define your schema
|
28
43
|
class InvoiceInfo(BaseModel):
|
29
44
|
invoice_number: str = Field(description="The main invoice identifier")
|
30
45
|
total_amount: float = Field(description="The final amount due")
|
31
46
|
company_name: Optional[str] = Field(None, description="The name of the issuing company")
|
32
47
|
|
33
|
-
#
|
48
|
+
# Extract data
|
34
49
|
page.extract(schema=InvoiceInfo, client=client)
|
35
50
|
|
36
|
-
# 3. Access the results
|
37
51
|
# Access the full result object
|
38
52
|
full_data = page.extracted()
|
39
|
-
print(full_data)
|
53
|
+
print(full_data)
|
40
54
|
|
41
55
|
# Access a single field
|
42
56
|
inv_num = page.extracted('invoice_number')
|
@@ -51,16 +65,23 @@ print(f"Invoice Number: {inv_num}")
|
|
51
65
|
|
52
66
|
```python
|
53
67
|
# Extract using a specific key
|
54
|
-
page.extract(InvoiceInfo, client, analysis_key="invoice_header")
|
68
|
+
page.extract(InvoiceInfo, client=client, analysis_key="invoice_header")
|
55
69
|
|
56
70
|
# Access using the specific key
|
57
71
|
header_data = page.extracted(analysis_key="invoice_header")
|
58
72
|
company = page.extracted('company_name', analysis_key="invoice_header")
|
59
73
|
```
|
60
74
|
|
61
|
-
##
|
75
|
+
## Text vs vision
|
76
|
+
|
77
|
+
When sending a page (or a region, or any other element) to an LLM, you can choose either `using='text'` (the default) or `using='vision'`.
|
78
|
+
|
79
|
+
- `text` sends the text, somewhat respecting layout using `.extract_text(layout=True)`
|
80
|
+
- `vision` sends an image of the page with `.to_image(resolution=72)` (no highlights or labels)
|
81
|
+
|
82
|
+
## Batch and bulk extraction
|
62
83
|
|
63
|
-
|
84
|
+
If you have a lot of pages or a lot of PDFs or a lot of anything, the `.extract()` and `.extracted()` methods work identically on most parts of a PDF - regions, pages, collections of pdfs, etc, allowing a lot of flexibility in what you analyze.
|
64
85
|
|
65
86
|
```python
|
66
87
|
# Assuming 'header_region' is a Region object you defined
|
@@ -73,15 +94,16 @@ Furthermore, you can apply extraction to collections of elements (like `pdf.page
|
|
73
94
|
```python
|
74
95
|
# Example: Extract InvoiceInfo from the first 5 pages
|
75
96
|
results = pdf.pages[:5].apply(
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
97
|
+
lambda page: page.extract(
|
98
|
+
# Same schema and client for every page
|
99
|
+
schema=InvoiceInfo,
|
100
|
+
client=client,
|
101
|
+
analysis_key="page_invoice_info",
|
102
|
+
)
|
81
103
|
)
|
82
104
|
|
83
105
|
# Access results for the first page in the collection
|
84
|
-
|
106
|
+
pdf.pages[0].extracted('company_name', analysis_key="page_invoice_info")
|
85
107
|
```
|
86
108
|
|
87
109
|
This provides a powerful way to turn unstructured PDF content into structured, usable data.
|