natural-pdf 0.1.16__tar.gz → 0.1.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/.gitignore +1 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/01-execute_notebooks.py +1 -0
- {natural_pdf-0.1.16/natural_pdf.egg-info → natural_pdf-0.1.18}/PKG-INFO +16 -16
- natural_pdf-0.1.18/docs/describe/index.ipynb +438 -0
- natural_pdf-0.1.18/docs/describe/index.md +42 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/layout-analysis/index.ipynb +422 -213
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/layout-analysis/index.md +10 -6
- natural_pdf-0.1.18/docs/tutorials/01-loading-and-extraction.ipynb +320 -0
- natural_pdf-0.1.18/docs/tutorials/02-finding-elements.ipynb +344 -0
- natural_pdf-0.1.18/docs/tutorials/03-extracting-blocks.ipynb +151 -0
- natural_pdf-0.1.18/docs/tutorials/04-table-extraction.ipynb +557 -0
- natural_pdf-0.1.18/docs/tutorials/05-excluding-content.ipynb +274 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/06-document-qa.ipynb +40 -53
- natural_pdf-0.1.18/docs/tutorials/07-layout-analysis.ipynb +615 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/07-working-with-regions.ipynb +67 -75
- natural_pdf-0.1.18/docs/tutorials/08-spatial-navigation.ipynb +512 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/09-section-extraction.ipynb +105 -141
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/10-form-field-extraction.ipynb +57 -65
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/11-enhanced-table-processing.ipynb +13 -14
- natural_pdf-0.1.18/docs/tutorials/12-ocr-integration.ipynb +4197 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/12-ocr-integration.md +12 -1
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/13-semantic-search.ipynb +148 -183
- natural_pdf-0.1.18/docs/tutorials/14-categorizing-documents.ipynb +2138 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/14-categorizing-documents.md +1 -1
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/mkdocs.yml +1 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/classification/manager.py +38 -13
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/core/page.py +2 -1
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/core/pdf.py +141 -32
- natural_pdf-0.1.18/natural_pdf/describe/__init__.py +21 -0
- natural_pdf-0.1.18/natural_pdf/describe/base.py +457 -0
- natural_pdf-0.1.18/natural_pdf/describe/elements.py +411 -0
- natural_pdf-0.1.18/natural_pdf/describe/mixin.py +84 -0
- natural_pdf-0.1.18/natural_pdf/describe/summary.py +186 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/elements/base.py +2 -1
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/elements/collections.py +11 -1
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/elements/region.py +4 -1
- natural_pdf-0.1.18/natural_pdf/exporters/__init__.py +15 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/exporters/hocr.py +9 -8
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/exporters/original_pdf.py +31 -2
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/ocr/engine_surya.py +1 -2
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/ocr/ocr_manager.py +21 -4
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/search/__init__.py +20 -3
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/search/lancedb_search_service.py +13 -5
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/search/numpy_search_service.py +13 -3
- {natural_pdf-0.1.16 → natural_pdf-0.1.18/natural_pdf.egg-info}/PKG-INFO +16 -16
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf.egg-info/SOURCES.txt +10 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf.egg-info/requires.txt +16 -13
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/noxfile.py +13 -3
- natural_pdf-0.1.18/pdfs/appendix_fy2026.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pyproject.toml +24 -19
- natural_pdf-0.1.18/test_install.sh +46 -0
- natural_pdf-0.1.18/tests/test_tutorials.py +50 -0
- natural_pdf-0.1.16/docs/tutorials/01-loading-and-extraction.ipynb +0 -328
- natural_pdf-0.1.16/docs/tutorials/02-finding-elements.ipynb +0 -352
- natural_pdf-0.1.16/docs/tutorials/03-extracting-blocks.ipynb +0 -159
- natural_pdf-0.1.16/docs/tutorials/04-table-extraction.ipynb +0 -579
- natural_pdf-0.1.16/docs/tutorials/05-excluding-content.ipynb +0 -8402
- natural_pdf-0.1.16/docs/tutorials/07-layout-analysis.ipynb +0 -630
- natural_pdf-0.1.16/docs/tutorials/08-spatial-navigation.ipynb +0 -520
- natural_pdf-0.1.16/docs/tutorials/12-ocr-integration.ipynb +0 -4129
- natural_pdf-0.1.16/docs/tutorials/14-categorizing-documents.ipynb +0 -2142
- natural_pdf-0.1.16/natural_pdf/exporters/__init__.py +0 -4
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/CLAUDE.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/LICENSE +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/MANIFEST.in +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/README.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/audit_packaging.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/check_run_md.sh +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/api/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/document-qa/index.ipynb +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/element-selection/index.ipynb +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/installation/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/interactive-widget/index.ipynb +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/loops-and-groups/index.ipynb +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/ocr/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/pdf-navigation/index.ipynb +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/reflowing-pages/index.ipynb +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/reflowing-pages/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/regions/index.ipynb +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/regions/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tables/index.ipynb +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tables/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/text-analysis/index.ipynb +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/text-extraction/index.ipynb +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/text-extraction/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/visual-debugging/index.ipynb +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/__init__.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/collections/mixins.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/collections/pdf_collection.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/core/element_manager.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/flows/element.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/flows/flow.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/flows/region.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/selectors/parser.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/natural_pdf.egg-info/top_level.txt +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/.gitkeep +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/01-practice.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/0500000US42001.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/0500000US42007.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/2014 Statistics.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/2019 Statistics.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/30.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/anexo_edital_6604_1743480-table.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/cia-doc.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/geometry.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/image.png +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/image.png.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/multicolumn.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/red.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/tiny-ocr-2.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/tiny-ocr-3.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/tiny-ocr-small.jpg +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/tiny-ocr-wide.jpg +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/tiny-ocr.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/tiny.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/pdfs/word-counter.pdf +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/publish.sh +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/sample-screen.png +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/setup.cfg +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/tests/conftest.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/tests/test_core/test_containment_geometry.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.1.16 → natural_pdf-0.1.18}/uv.lock +0 -0
@@ -23,6 +23,7 @@ DOCS_DIR = Path("docs")
|
|
23
23
|
CACHE_FILE = Path(".notebook_cache.json")
|
24
24
|
# Add relative paths or glob patterns from DOCS_DIR, e.g., 'api/', '**/_*.md'
|
25
25
|
EXCLUDE_PATTERNS = [
|
26
|
+
"describe/index.md",
|
26
27
|
"installation/index.md",
|
27
28
|
"ocr/index.md",
|
28
29
|
"explanations",
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.16
|
3
|
+
Version: 0.1.18
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
|
|
11
11
|
Requires-Python: >=3.9
|
12
12
|
Description-Content-Type: text/markdown
|
13
13
|
License-File: LICENSE
|
14
|
+
Requires-Dist: pandas
|
14
15
|
Requires-Dist: pdfplumber
|
15
16
|
Requires-Dist: colormath2
|
16
17
|
Requires-Dist: pillow
|
@@ -20,14 +21,15 @@ Requires-Dist: urllib3
|
|
20
21
|
Requires-Dist: tqdm
|
21
22
|
Requires-Dist: pydantic
|
22
23
|
Requires-Dist: jenkspy
|
23
|
-
Requires-Dist: pikepdf
|
24
|
+
Requires-Dist: pikepdf
|
24
25
|
Requires-Dist: scipy
|
25
26
|
Requires-Dist: torch
|
26
27
|
Requires-Dist: torchvision
|
27
|
-
Requires-Dist: transformers[sentencepiece]
|
28
|
+
Requires-Dist: transformers[sentencepiece]
|
28
29
|
Requires-Dist: huggingface_hub>=0.29.3
|
29
30
|
Requires-Dist: sentence-transformers
|
30
31
|
Requires-Dist: timm
|
32
|
+
Requires-Dist: ipywidgets>=7.0.0
|
31
33
|
Provides-Extra: test
|
32
34
|
Requires-Dist: pytest; extra == "test"
|
33
35
|
Requires-Dist: pytest-xdist; extra == "test"
|
@@ -39,7 +41,6 @@ Provides-Extra: favorites
|
|
39
41
|
Requires-Dist: natural-pdf[deskew]; extra == "favorites"
|
40
42
|
Requires-Dist: natural-pdf[ocr-export]; extra == "favorites"
|
41
43
|
Requires-Dist: natural-pdf[search]; extra == "favorites"
|
42
|
-
Requires-Dist: ipywidgets; extra == "favorites"
|
43
44
|
Requires-Dist: surya-ocr; extra == "favorites"
|
44
45
|
Provides-Extra: dev
|
45
46
|
Requires-Dist: black; extra == "dev"
|
@@ -61,23 +62,22 @@ Requires-Dist: setuptools; extra == "dev"
|
|
61
62
|
Provides-Extra: deskew
|
62
63
|
Requires-Dist: deskew>=1.5; extra == "deskew"
|
63
64
|
Requires-Dist: img2pdf; extra == "deskew"
|
64
|
-
Provides-Extra: addons
|
65
|
-
Requires-Dist: surya-ocr; extra == "addons"
|
66
|
-
Requires-Dist: doclayout_yolo; extra == "addons"
|
67
|
-
Requires-Dist: paddlepaddle>=3.0.0; extra == "addons"
|
68
|
-
Requires-Dist: paddleocr>=3.0.0; extra == "addons"
|
69
|
-
Requires-Dist: ipywidgets>=7.0.0; extra == "addons"
|
70
|
-
Requires-Dist: easyocr; extra == "addons"
|
71
|
-
Requires-Dist: surya-ocr; extra == "addons"
|
72
|
-
Requires-Dist: doclayout_yolo; extra == "addons"
|
73
|
-
Requires-Dist: python-doctr[torch]; extra == "addons"
|
74
|
-
Requires-Dist: docling; extra == "addons"
|
75
65
|
Provides-Extra: all
|
76
66
|
Requires-Dist: natural-pdf[ocr-export]; extra == "all"
|
77
67
|
Requires-Dist: natural-pdf[deskew]; extra == "all"
|
78
68
|
Requires-Dist: natural-pdf[test]; extra == "all"
|
79
69
|
Requires-Dist: natural-pdf[search]; extra == "all"
|
80
|
-
Requires-Dist: natural-pdf[addons]; extra == "all"
|
70
|
+
Requires-Dist: natural-pdf[extras]; extra == "all"
|
71
|
+
Requires-Dist: natural-pdf[favorites]; extra == "all"
|
72
|
+
Provides-Extra: paddle
|
73
|
+
Requires-Dist: paddlepaddle>=3.0.0; extra == "paddle"
|
74
|
+
Requires-Dist: paddleocr>=3.0.1; extra == "paddle"
|
75
|
+
Requires-Dist: paddlex>=3.0.1; extra == "paddle"
|
76
|
+
Provides-Extra: extras
|
77
|
+
Requires-Dist: surya-ocr; extra == "extras"
|
78
|
+
Requires-Dist: doclayout_yolo; extra == "extras"
|
79
|
+
Requires-Dist: easyocr; extra == "extras"
|
80
|
+
Requires-Dist: natural-pdf[paddle]; extra == "extras"
|
81
81
|
Provides-Extra: ocr-export
|
82
82
|
Requires-Dist: pikepdf; extra == "ocr-export"
|
83
83
|
Provides-Extra: export-extras
|
@@ -0,0 +1,438 @@
|
|
1
|
+
{
|
2
|
+
"cells": [
|
3
|
+
{
|
4
|
+
"cell_type": "markdown",
|
5
|
+
"id": "725fe29c",
|
6
|
+
"metadata": {},
|
7
|
+
"source": [
|
8
|
+
"# Describe Functionality\n",
|
9
|
+
"\n",
|
10
|
+
"The `describe()` and `inspect()` methods provide an easy way to understand the contents of your PDF elements without having to visualize them as images.\n",
|
11
|
+
"\n",
|
12
|
+
"## Basic Usage\n",
|
13
|
+
"\n",
|
14
|
+
"Get a summary of an entire page:"
|
15
|
+
]
|
16
|
+
},
|
17
|
+
{
|
18
|
+
"cell_type": "code",
|
19
|
+
"execution_count": 1,
|
20
|
+
"id": "5bc39925",
|
21
|
+
"metadata": {
|
22
|
+
"execution": {
|
23
|
+
"iopub.execute_input": "2025-06-15T16:10:40.070939Z",
|
24
|
+
"iopub.status.busy": "2025-06-15T16:10:40.070809Z",
|
25
|
+
"iopub.status.idle": "2025-06-15T16:10:45.409526Z",
|
26
|
+
"shell.execute_reply": "2025-06-15T16:10:45.409235Z"
|
27
|
+
}
|
28
|
+
},
|
29
|
+
"outputs": [
|
30
|
+
{
|
31
|
+
"name": "stderr",
|
32
|
+
"output_type": "stream",
|
33
|
+
"text": [
|
34
|
+
"CropBox missing from /Page, defaulting to MediaBox\n"
|
35
|
+
]
|
36
|
+
},
|
37
|
+
{
|
38
|
+
"data": {
|
39
|
+
"text/markdown": [
|
40
|
+
"## Page 1 Summary\n",
|
41
|
+
"\n",
|
42
|
+
"**Elements**:\n",
|
43
|
+
" - **text**: 44 elements\n",
|
44
|
+
" - **line**: 21 elements\n",
|
45
|
+
" - **rect**: 8 elements\n",
|
46
|
+
"\n",
|
47
|
+
"**Text Analysis**:\n",
|
48
|
+
" - **typography**:\n",
|
49
|
+
" - **fonts**:\n",
|
50
|
+
" - Helvetica: 44\n",
|
51
|
+
" - **sizes**:\n",
|
52
|
+
" - 10.0pt: 40\n",
|
53
|
+
" - 8.0pt: 3\n",
|
54
|
+
" - 12.0pt: 1\n",
|
55
|
+
" - styles: 9 bold\n",
|
56
|
+
" - **colors**:\n",
|
57
|
+
" - black: 43\n",
|
58
|
+
" - other: 1"
|
59
|
+
],
|
60
|
+
"text/plain": [
|
61
|
+
"## Page 1 Summary\n",
|
62
|
+
"\n",
|
63
|
+
"**Elements**:\n",
|
64
|
+
" - **text**: 44 elements\n",
|
65
|
+
" - **line**: 21 elements\n",
|
66
|
+
" - **rect**: 8 elements\n",
|
67
|
+
"\n",
|
68
|
+
"**Text Analysis**:\n",
|
69
|
+
" - **typography**:\n",
|
70
|
+
" - **fonts**:\n",
|
71
|
+
" - Helvetica: 44\n",
|
72
|
+
" - **sizes**:\n",
|
73
|
+
" - 10.0pt: 40\n",
|
74
|
+
" - 8.0pt: 3\n",
|
75
|
+
" - 12.0pt: 1\n",
|
76
|
+
" - styles: 9 bold\n",
|
77
|
+
" - **colors**:\n",
|
78
|
+
" - black: 43\n",
|
79
|
+
" - other: 1"
|
80
|
+
]
|
81
|
+
},
|
82
|
+
"execution_count": 1,
|
83
|
+
"metadata": {},
|
84
|
+
"output_type": "execute_result"
|
85
|
+
}
|
86
|
+
],
|
87
|
+
"source": [
|
88
|
+
"from natural_pdf import PDF\n",
|
89
|
+
"\n",
|
90
|
+
"pdf = PDF(\"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf\")\n",
|
91
|
+
"page = pdf.pages[0]\n",
|
92
|
+
"\n",
|
93
|
+
"page.describe()"
|
94
|
+
]
|
95
|
+
},
|
96
|
+
{
|
97
|
+
"cell_type": "markdown",
|
98
|
+
"id": "b0e3354c",
|
99
|
+
"metadata": {},
|
100
|
+
"source": [
|
101
|
+
"## Element collection summaries\n",
|
102
|
+
"\n",
|
103
|
+
"You can describe element collections on a page with `.describe()`."
|
104
|
+
]
|
105
|
+
},
|
106
|
+
{
|
107
|
+
"cell_type": "code",
|
108
|
+
"execution_count": 2,
|
109
|
+
"id": "eac4e22b",
|
110
|
+
"metadata": {
|
111
|
+
"execution": {
|
112
|
+
"iopub.execute_input": "2025-06-15T16:10:45.411081Z",
|
113
|
+
"iopub.status.busy": "2025-06-15T16:10:45.410822Z",
|
114
|
+
"iopub.status.idle": "2025-06-15T16:10:45.413903Z",
|
115
|
+
"shell.execute_reply": "2025-06-15T16:10:45.413620Z"
|
116
|
+
}
|
117
|
+
},
|
118
|
+
"outputs": [
|
119
|
+
{
|
120
|
+
"data": {
|
121
|
+
"text/markdown": [
|
122
|
+
"## Collection Summary (44 elements)\n",
|
123
|
+
"\n",
|
124
|
+
"**Typography**:\n",
|
125
|
+
" - **fonts**:\n",
|
126
|
+
" - Helvetica: 44\n",
|
127
|
+
" - **sizes**:\n",
|
128
|
+
" - 10.0pt: 40\n",
|
129
|
+
" - 8.0pt: 3\n",
|
130
|
+
" - 12.0pt: 1\n",
|
131
|
+
" - **styles**: 9 bold\n",
|
132
|
+
" - **colors**:\n",
|
133
|
+
" - black: 43\n",
|
134
|
+
" - other: 1"
|
135
|
+
],
|
136
|
+
"text/plain": [
|
137
|
+
"## Collection Summary (44 elements)\n",
|
138
|
+
"\n",
|
139
|
+
"**Typography**:\n",
|
140
|
+
" - **fonts**:\n",
|
141
|
+
" - Helvetica: 44\n",
|
142
|
+
" - **sizes**:\n",
|
143
|
+
" - 10.0pt: 40\n",
|
144
|
+
" - 8.0pt: 3\n",
|
145
|
+
" - 12.0pt: 1\n",
|
146
|
+
" - **styles**: 9 bold\n",
|
147
|
+
" - **colors**:\n",
|
148
|
+
" - black: 43\n",
|
149
|
+
" - other: 1"
|
150
|
+
]
|
151
|
+
},
|
152
|
+
"execution_count": 2,
|
153
|
+
"metadata": {},
|
154
|
+
"output_type": "execute_result"
|
155
|
+
}
|
156
|
+
],
|
157
|
+
"source": [
|
158
|
+
"# Describe all elements on the page\n",
|
159
|
+
"page.find_all('text').describe()"
|
160
|
+
]
|
161
|
+
},
|
162
|
+
{
|
163
|
+
"cell_type": "code",
|
164
|
+
"execution_count": 3,
|
165
|
+
"id": "503c2a31",
|
166
|
+
"metadata": {
|
167
|
+
"execution": {
|
168
|
+
"iopub.execute_input": "2025-06-15T16:10:45.415172Z",
|
169
|
+
"iopub.status.busy": "2025-06-15T16:10:45.415052Z",
|
170
|
+
"iopub.status.idle": "2025-06-15T16:10:45.417592Z",
|
171
|
+
"shell.execute_reply": "2025-06-15T16:10:45.417343Z"
|
172
|
+
}
|
173
|
+
},
|
174
|
+
"outputs": [
|
175
|
+
{
|
176
|
+
"data": {
|
177
|
+
"text/markdown": [
|
178
|
+
"## Collection Summary (8 elements)\n",
|
179
|
+
"\n",
|
180
|
+
"**Size Stats**:\n",
|
181
|
+
" - **width range**: 8-180\n",
|
182
|
+
" - **height range**: 8-35\n",
|
183
|
+
" - **avg area**: 844 sq pts\n",
|
184
|
+
"\n",
|
185
|
+
"**Styles**:\n",
|
186
|
+
" - **stroke widths**:\n",
|
187
|
+
" - 0.5: 7\n",
|
188
|
+
" - **colors**:\n",
|
189
|
+
" - black: 8"
|
190
|
+
],
|
191
|
+
"text/plain": [
|
192
|
+
"## Collection Summary (8 elements)\n",
|
193
|
+
"\n",
|
194
|
+
"**Size Stats**:\n",
|
195
|
+
" - **width range**: 8-180\n",
|
196
|
+
" - **height range**: 8-35\n",
|
197
|
+
" - **avg area**: 844 sq pts\n",
|
198
|
+
"\n",
|
199
|
+
"**Styles**:\n",
|
200
|
+
" - **stroke widths**:\n",
|
201
|
+
" - 0.5: 7\n",
|
202
|
+
" - **colors**:\n",
|
203
|
+
" - black: 8"
|
204
|
+
]
|
205
|
+
},
|
206
|
+
"execution_count": 3,
|
207
|
+
"metadata": {},
|
208
|
+
"output_type": "execute_result"
|
209
|
+
}
|
210
|
+
],
|
211
|
+
"source": [
|
212
|
+
"# Describe all elements on the page\n",
|
213
|
+
"page.find_all('rect').describe()"
|
214
|
+
]
|
215
|
+
},
|
216
|
+
{
|
217
|
+
"cell_type": "markdown",
|
218
|
+
"id": "5b468a5e",
|
219
|
+
"metadata": {},
|
220
|
+
"source": [
|
221
|
+
"## Inspecting lists of elements\n",
|
222
|
+
"\n",
|
223
|
+
"For more detail, you can view specific details of element collections with `inspect()`."
|
224
|
+
]
|
225
|
+
},
|
226
|
+
{
|
227
|
+
"cell_type": "code",
|
228
|
+
"execution_count": 4,
|
229
|
+
"id": "ea04905b",
|
230
|
+
"metadata": {
|
231
|
+
"execution": {
|
232
|
+
"iopub.execute_input": "2025-06-15T16:10:45.418792Z",
|
233
|
+
"iopub.status.busy": "2025-06-15T16:10:45.418688Z",
|
234
|
+
"iopub.status.idle": "2025-06-15T16:10:45.421581Z",
|
235
|
+
"shell.execute_reply": "2025-06-15T16:10:45.421321Z"
|
236
|
+
}
|
237
|
+
},
|
238
|
+
"outputs": [
|
239
|
+
{
|
240
|
+
"data": {
|
241
|
+
"text/markdown": [
|
242
|
+
"## Collection Inspection (44 elements)\n",
|
243
|
+
"\n",
|
244
|
+
"### Word Elements\n",
|
245
|
+
"\n",
|
246
|
+
"| text | x0 | top | x1 | bottom | font_family | size | bold | italic | source | confidence | color |\n",
|
247
|
+
"|------|------|------|------|------|------|------|------|------|------|------|------|\n",
|
248
|
+
"| Jungle Health and Safety Inspection Service | 385 | 36 | 542 | 44 | Helvetica | 8 | False | False | native | 1.00 | #000000 |\n",
|
249
|
+
"| INS-UP70N51NCL41R | 385 | 46 | 466 | 54 | Helvetica | 8 | False | False | native | 1.00 | #ff0000 |\n",
|
250
|
+
"| Site: | 50 | 84 | 74 | 94 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
251
|
+
"| Durham’s Meatpacking | 74 | 84 | 182 | 94 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
252
|
+
"| Chicago, Ill. | 182 | 84 | 235 | 94 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
253
|
+
"| Date: | 50 | 104 | 81 | 114 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
254
|
+
"| February 3, 1905 | 81 | 104 | 157 | 114 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
255
|
+
"| Violation Count: | 50 | 124 | 130 | 134 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
256
|
+
"| 7 | 130 | 124 | 136 | 134 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
257
|
+
"| Summary: | 50 | 144 | 102 | 154 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
258
|
+
"| Worst of any, however, were the fertilizer men, an... | 102 | 144 | 506 | 154 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
259
|
+
"| These people could not be shown to the visitor - f... | 50 | 160 | 512 | 170 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
260
|
+
"| visitor at a hundred yards, and as for the other m... | 50 | 176 | 491 | 186 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
261
|
+
"| some of which there were open vats near the level ... | 50 | 192 | 496 | 202 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
262
|
+
"| into the vats; and when they were fished out, ther... | 50 | 208 | 465 | 218 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
263
|
+
"| exhibiting - sometimes they would be overlooked fo... | 50 | 224 | 492 | 234 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
264
|
+
"| to the world as Durham’s Pure Leaf Lard! | 50 | 240 | 232 | 250 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
265
|
+
"| Violations | 50 | 372 | 107 | 384 | Helvetica | 12 | True | False | native | 1.00 | #000000 |\n",
|
266
|
+
"| Statute | 55 | 398 | 89 | 408 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
267
|
+
"| Description | 105 | 398 | 160 | 408 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
268
|
+
"| Level | 455 | 398 | 481 | 408 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
269
|
+
"| Repeat? | 505 | 398 | 544 | 408 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
270
|
+
"| 4.12.7 | 55 | 418 | 83 | 428 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
271
|
+
"| Unsanitary Working Conditions. | 105 | 418 | 245 | 428 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
272
|
+
"| Critical | 455 | 418 | 486 | 428 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
273
|
+
"| 5.8.3 | 55 | 438 | 77 | 448 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
274
|
+
"| Inadequate Protective Equipment. | 105 | 438 | 256 | 448 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
275
|
+
"| Serious | 455 | 438 | 489 | 448 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
276
|
+
"| 6.3.9 | 55 | 458 | 77 | 468 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
277
|
+
"| Ineffective Injury Prevention. | 105 | 458 | 231 | 468 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
278
|
+
"_Showing 30 of 44 elements (pass limit= to see more)_"
|
279
|
+
],
|
280
|
+
"text/plain": [
|
281
|
+
"## Collection Inspection (44 elements)\n",
|
282
|
+
"\n",
|
283
|
+
"### Word Elements\n",
|
284
|
+
"\n",
|
285
|
+
"| text | x0 | top | x1 | bottom | font_family | size | bold | italic | source | confidence | color |\n",
|
286
|
+
"|------|------|------|------|------|------|------|------|------|------|------|------|\n",
|
287
|
+
"| Jungle Health and Safety Inspection Service | 385 | 36 | 542 | 44 | Helvetica | 8 | False | False | native | 1.00 | #000000 |\n",
|
288
|
+
"| INS-UP70N51NCL41R | 385 | 46 | 466 | 54 | Helvetica | 8 | False | False | native | 1.00 | #ff0000 |\n",
|
289
|
+
"| Site: | 50 | 84 | 74 | 94 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
290
|
+
"| Durham’s Meatpacking | 74 | 84 | 182 | 94 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
291
|
+
"| Chicago, Ill. | 182 | 84 | 235 | 94 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
292
|
+
"| Date: | 50 | 104 | 81 | 114 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
293
|
+
"| February 3, 1905 | 81 | 104 | 157 | 114 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
294
|
+
"| Violation Count: | 50 | 124 | 130 | 134 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
295
|
+
"| 7 | 130 | 124 | 136 | 134 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
296
|
+
"| Summary: | 50 | 144 | 102 | 154 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
297
|
+
"| Worst of any, however, were the fertilizer men, an... | 102 | 144 | 506 | 154 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
298
|
+
"| These people could not be shown to the visitor - f... | 50 | 160 | 512 | 170 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
299
|
+
"| visitor at a hundred yards, and as for the other m... | 50 | 176 | 491 | 186 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
300
|
+
"| some of which there were open vats near the level ... | 50 | 192 | 496 | 202 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
301
|
+
"| into the vats; and when they were fished out, ther... | 50 | 208 | 465 | 218 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
302
|
+
"| exhibiting - sometimes they would be overlooked fo... | 50 | 224 | 492 | 234 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
303
|
+
"| to the world as Durham’s Pure Leaf Lard! | 50 | 240 | 232 | 250 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
304
|
+
"| Violations | 50 | 372 | 107 | 384 | Helvetica | 12 | True | False | native | 1.00 | #000000 |\n",
|
305
|
+
"| Statute | 55 | 398 | 89 | 408 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
306
|
+
"| Description | 105 | 398 | 160 | 408 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
307
|
+
"| Level | 455 | 398 | 481 | 408 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
308
|
+
"| Repeat? | 505 | 398 | 544 | 408 | Helvetica | 10 | True | False | native | 1.00 | #000000 |\n",
|
309
|
+
"| 4.12.7 | 55 | 418 | 83 | 428 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
310
|
+
"| Unsanitary Working Conditions. | 105 | 418 | 245 | 428 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
311
|
+
"| Critical | 455 | 418 | 486 | 428 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
312
|
+
"| 5.8.3 | 55 | 438 | 77 | 448 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
313
|
+
"| Inadequate Protective Equipment. | 105 | 438 | 256 | 448 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
314
|
+
"| Serious | 455 | 438 | 489 | 448 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
315
|
+
"| 6.3.9 | 55 | 458 | 77 | 468 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
316
|
+
"| Ineffective Injury Prevention. | 105 | 458 | 231 | 468 | Helvetica | 10 | False | False | native | 1.00 | #000000 |\n",
|
317
|
+
"_Showing 30 of 44 elements (pass limit= to see more)_"
|
318
|
+
]
|
319
|
+
},
|
320
|
+
"execution_count": 4,
|
321
|
+
"metadata": {},
|
322
|
+
"output_type": "execute_result"
|
323
|
+
}
|
324
|
+
],
|
325
|
+
"source": [
|
326
|
+
"page.find_all('text').inspect()"
|
327
|
+
]
|
328
|
+
},
|
329
|
+
{
|
330
|
+
"cell_type": "code",
|
331
|
+
"execution_count": 5,
|
332
|
+
"id": "06c8d813",
|
333
|
+
"metadata": {
|
334
|
+
"execution": {
|
335
|
+
"iopub.execute_input": "2025-06-15T16:10:45.422781Z",
|
336
|
+
"iopub.status.busy": "2025-06-15T16:10:45.422665Z",
|
337
|
+
"iopub.status.idle": "2025-06-15T16:10:45.425191Z",
|
338
|
+
"shell.execute_reply": "2025-06-15T16:10:45.424938Z"
|
339
|
+
}
|
340
|
+
},
|
341
|
+
"outputs": [
|
342
|
+
{
|
343
|
+
"data": {
|
344
|
+
"text/markdown": [
|
345
|
+
"## Collection Inspection (21 elements)\n",
|
346
|
+
"\n",
|
347
|
+
"### Line Elements\n",
|
348
|
+
"\n",
|
349
|
+
"| x0 | top | x1 | bottom | width | is_horizontal | is_vertical |\n",
|
350
|
+
"|------|------|------|------|------|------|------|\n",
|
351
|
+
"| 50 | 352 | 550 | 352 | 2 | True | False |\n",
|
352
|
+
"| 50 | 392 | 550 | 392 | 0 | True | False |\n",
|
353
|
+
"| 50 | 392 | 50 | 552 | 0 | False | True |\n",
|
354
|
+
"| 100 | 392 | 100 | 552 | 0 | False | True |\n",
|
355
|
+
"| 450 | 392 | 450 | 552 | 0 | False | True |\n",
|
356
|
+
"| 500 | 392 | 500 | 552 | 0 | False | True |\n",
|
357
|
+
"| 550 | 392 | 550 | 552 | 0 | False | True |\n",
|
358
|
+
"| 50 | 412 | 550 | 412 | 0 | True | False |\n",
|
359
|
+
"| 520 | 418 | 528 | 426 | 0 | False | False |\n",
|
360
|
+
"| 520 | 418 | 528 | 426 | 0 | False | False |\n",
|
361
|
+
"| 50 | 432 | 550 | 432 | 0 | True | False |\n",
|
362
|
+
"| 520 | 438 | 528 | 446 | 0 | False | False |\n",
|
363
|
+
"| 520 | 438 | 528 | 446 | 0 | False | False |\n",
|
364
|
+
"| 50 | 452 | 550 | 452 | 0 | True | False |\n",
|
365
|
+
"| 50 | 472 | 550 | 472 | 0 | True | False |\n",
|
366
|
+
"| 50 | 492 | 550 | 492 | 0 | True | False |\n",
|
367
|
+
"| 50 | 512 | 550 | 512 | 0 | True | False |\n",
|
368
|
+
"| 520 | 518 | 528 | 526 | 0 | False | False |\n",
|
369
|
+
"| 520 | 518 | 528 | 526 | 0 | False | False |\n",
|
370
|
+
"| 50 | 532 | 550 | 532 | 0 | True | False |\n",
|
371
|
+
"| 50 | 552 | 550 | 552 | 0 | True | False |"
|
372
|
+
],
|
373
|
+
"text/plain": [
|
374
|
+
"## Collection Inspection (21 elements)\n",
|
375
|
+
"\n",
|
376
|
+
"### Line Elements\n",
|
377
|
+
"\n",
|
378
|
+
"| x0 | top | x1 | bottom | width | is_horizontal | is_vertical |\n",
|
379
|
+
"|------|------|------|------|------|------|------|\n",
|
380
|
+
"| 50 | 352 | 550 | 352 | 2 | True | False |\n",
|
381
|
+
"| 50 | 392 | 550 | 392 | 0 | True | False |\n",
|
382
|
+
"| 50 | 392 | 50 | 552 | 0 | False | True |\n",
|
383
|
+
"| 100 | 392 | 100 | 552 | 0 | False | True |\n",
|
384
|
+
"| 450 | 392 | 450 | 552 | 0 | False | True |\n",
|
385
|
+
"| 500 | 392 | 500 | 552 | 0 | False | True |\n",
|
386
|
+
"| 550 | 392 | 550 | 552 | 0 | False | True |\n",
|
387
|
+
"| 50 | 412 | 550 | 412 | 0 | True | False |\n",
|
388
|
+
"| 520 | 418 | 528 | 426 | 0 | False | False |\n",
|
389
|
+
"| 520 | 418 | 528 | 426 | 0 | False | False |\n",
|
390
|
+
"| 50 | 432 | 550 | 432 | 0 | True | False |\n",
|
391
|
+
"| 520 | 438 | 528 | 446 | 0 | False | False |\n",
|
392
|
+
"| 520 | 438 | 528 | 446 | 0 | False | False |\n",
|
393
|
+
"| 50 | 452 | 550 | 452 | 0 | True | False |\n",
|
394
|
+
"| 50 | 472 | 550 | 472 | 0 | True | False |\n",
|
395
|
+
"| 50 | 492 | 550 | 492 | 0 | True | False |\n",
|
396
|
+
"| 50 | 512 | 550 | 512 | 0 | True | False |\n",
|
397
|
+
"| 520 | 518 | 528 | 526 | 0 | False | False |\n",
|
398
|
+
"| 520 | 518 | 528 | 526 | 0 | False | False |\n",
|
399
|
+
"| 50 | 532 | 550 | 532 | 0 | True | False |\n",
|
400
|
+
"| 50 | 552 | 550 | 552 | 0 | True | False |"
|
401
|
+
]
|
402
|
+
},
|
403
|
+
"execution_count": 5,
|
404
|
+
"metadata": {},
|
405
|
+
"output_type": "execute_result"
|
406
|
+
}
|
407
|
+
],
|
408
|
+
"source": [
|
409
|
+
"page.find_all('line').inspect()"
|
410
|
+
]
|
411
|
+
}
|
412
|
+
],
|
413
|
+
"metadata": {
|
414
|
+
"jupytext": {
|
415
|
+
"cell_metadata_filter": "-all",
|
416
|
+
"main_language": "python",
|
417
|
+
"notebook_metadata_filter": "-all",
|
418
|
+
"text_representation": {
|
419
|
+
"extension": ".md",
|
420
|
+
"format_name": "markdown"
|
421
|
+
}
|
422
|
+
},
|
423
|
+
"language_info": {
|
424
|
+
"codemirror_mode": {
|
425
|
+
"name": "ipython",
|
426
|
+
"version": 3
|
427
|
+
},
|
428
|
+
"file_extension": ".py",
|
429
|
+
"mimetype": "text/x-python",
|
430
|
+
"name": "python",
|
431
|
+
"nbconvert_exporter": "python",
|
432
|
+
"pygments_lexer": "ipython3",
|
433
|
+
"version": "3.11.11"
|
434
|
+
}
|
435
|
+
},
|
436
|
+
"nbformat": 4,
|
437
|
+
"nbformat_minor": 5
|
438
|
+
}
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# Describe Functionality
|
2
|
+
|
3
|
+
The `describe()` and `inspect()` methods provide an easy way to understand the contents of your PDF elements without having to visualize them as images.
|
4
|
+
|
5
|
+
## Basic Usage
|
6
|
+
|
7
|
+
Get a summary of an entire page:
|
8
|
+
|
9
|
+
```python
|
10
|
+
from natural_pdf import PDF
|
11
|
+
|
12
|
+
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
|
13
|
+
page = pdf.pages[0]
|
14
|
+
|
15
|
+
page.describe()
|
16
|
+
```
|
17
|
+
|
18
|
+
## Element collection summaries
|
19
|
+
|
20
|
+
You can describe element collections on a page with `.describe()`.
|
21
|
+
|
22
|
+
```python
|
23
|
+
# Describe all text elements on the page
|
24
|
+
page.find_all('text').describe()
|
25
|
+
```
|
26
|
+
|
27
|
+
```python
|
28
|
+
# Describe all rect elements on the page
|
29
|
+
page.find_all('rect').describe()
|
30
|
+
```
|
31
|
+
|
32
|
+
## Inspecting lists of elements
|
33
|
+
|
34
|
+
For more detail, use `inspect()` to list the individual elements in a collection along with their properties.
|
35
|
+
|
36
|
+
```python
|
37
|
+
page.find_all('text').inspect()
|
38
|
+
```
|
39
|
+
|
40
|
+
```python
|
41
|
+
page.find_all('line').inspect()
|
42
|
+
```
|