natural-pdf 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.2.2/natural_pdf.egg-info → natural_pdf-0.2.4}/PKG-INFO +1 -1
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/quick-reference/index.md +15 -1
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/visual-debugging/index.md +63 -1
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/collections/mixins.py +16 -3
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/core/highlighting_service.py +25 -1
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/core/page.py +5 -3
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/core/page_collection.py +14 -14
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/core/pdf.py +4 -1
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/core/pdf_collection.py +131 -4
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/core/render_spec.py +46 -2
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/elements/base.py +66 -28
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/elements/element_collection.py +10 -10
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/elements/region.py +29 -27
- natural_pdf-0.2.4/natural_pdf/vision/__init__.py +7 -0
- natural_pdf-0.2.4/natural_pdf/vision/mixin.py +209 -0
- natural_pdf-0.2.4/natural_pdf/vision/results.py +146 -0
- natural_pdf-0.2.4/natural_pdf/vision/similarity.py +321 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4/natural_pdf.egg-info}/PKG-INFO +1 -1
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf.egg-info/SOURCES.txt +9 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf.egg-info/top_level.txt +0 -1
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_core/test_containment_geometry.py +6 -6
- natural_pdf-0.2.4/tests/test_element_show_crop_highlights.py +168 -0
- natural_pdf-0.2.4/tests/test_expand.py +150 -0
- natural_pdf-0.2.4/tests/test_find_similar.py +147 -0
- natural_pdf-0.2.4/tests/test_highlight_regions.py +161 -0
- natural_pdf-0.2.4/tests/test_region_show_crop_highlights.py +219 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/.github/workflows/ci.yml +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/.github/workflows/nightly-tutorials.yml +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/.gitignore +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/CLAUDE.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/LICENSE +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/MANIFEST.in +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/README.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/audit_packaging.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/check_run_md.sh +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/api/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/describe/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/extracting-clean-text/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/fix-messy-tables/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/fix-messy-tables/table_1.csv +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/fix-messy-tables/table_2.csv +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/fix-messy-tables/table_3.csv +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/installation/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/ocr/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/process-forms-and-invoices/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/reflowing-pages/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/regions/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tables/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/tutorials/14-categorizing-documents.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/mkdocs.yml +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/guides.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/classification/manager.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/cli.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/core/element_manager.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/core/page_groupby.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/describe/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/describe/base.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/describe/elements.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/describe/mixin.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/describe/summary.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/elements/image.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/exporters/hocr.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/exporters/original_pdf.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/flows/element.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/flows/flow.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/flows/region.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/qa/qa_result.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/search/lancedb_search_service.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/search/numpy_search_service.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/selectors/parser.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/tables/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/tables/result.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/text_mixin.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/utils/bidi_mirror.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/utils/layout.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf.egg-info/entry_points.txt +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/natural_pdf.egg-info/requires.txt +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/noxfile.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/optimization/memory_comparison.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/optimization/pdf_analyzer.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/optimization/performance_analysis.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/optimization/performance_results/image_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/optimization/performance_results/text_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/optimization/test_cleanup_methods.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/optimization/test_memory_fix.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/publish.sh +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/pyproject.toml +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/sample-screen.png +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/setup.cfg +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/test_install.sh +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/conftest.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_annotate.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_arabic_performance.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_arabic_real_world.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_color_conversion.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_core/test_text_layer.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_directional_defaults.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_document_qa.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_element_collection_slicing.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_exclusions.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_extraction_error.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_extraction_mixin_fix.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_extraction_text_and_vision.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_extraction_working.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_first_last_selectors.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_flow_region_directional.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_groupby.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_guides.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_guides_apply_exclusions.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_guides_apply_exclusions_simple.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_guides_extract_table.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_guides_extract_table_real.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_guides_integration.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_highlight_detection.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_highlight_protocol.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_highlight_protocol_simple.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_multi_page_table_discovery.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_page_exclusion_lists.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_region_viewer.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_sections_end_only.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_show_column_layout.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_show_edge_cases.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_show_exclusions.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_show_exclusions_feature.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_show_limit.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_skip_repeating_headers_multipage.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_strikethrough_detection.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_table_result_header_mismatch.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_table_result_keep_blank.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_tiny_text_tables.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_tiny_text_tables_table.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_tutorials.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_underline_detection.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tests/test_update_text.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/todo/bad_pdf_analysis.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/todo/evaluation.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/README.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/__init__.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/analyser.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/collate_summaries.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/eval_suite.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/llm_enrich.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/reporter.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/tools/bad_pdf_eval/utils.py +0 -0
- {natural_pdf-0.2.2 → natural_pdf-0.2.4}/uv.lock +0 -0
@@ -156,11 +156,25 @@ elements.show(color="red") # Single collection
|
|
156
156
|
elements.show(color="blue", label="Headers") # With label
|
157
157
|
elements.show(group_by='type') # Color by type
|
158
158
|
|
159
|
-
#
|
159
|
+
# Quick highlighting (one-liner)
|
160
|
+
page.highlight(elements1, elements2, elements3) # Multiple elements
|
161
|
+
page.highlight( # With custom colors
|
162
|
+
(elements1, 'red'),
|
163
|
+
(elements2, 'blue'),
|
164
|
+
(elements3, 'green')
|
165
|
+
)
|
166
|
+
|
167
|
+
# Multiple collections with context manager
|
160
168
|
with page.highlights() as h:
|
161
169
|
h.add(elements1, color="red", label="Type 1")
|
162
170
|
h.add(elements2, color="blue", label="Type 2")
|
163
171
|
h.show()
|
172
|
+
|
173
|
+
# Auto-display in Jupyter/Colab
|
174
|
+
with page.highlights(show=True) as h:
|
175
|
+
h.add(elements1, label="Headers")
|
176
|
+
h.add(elements2, label="Content")
|
177
|
+
# Displays automatically when exiting context
|
164
178
|
```
|
165
179
|
|
166
180
|
### Viewing
|
@@ -83,6 +83,47 @@ with page.highlights() as h:
|
|
83
83
|
h.show()
|
84
84
|
```
|
85
85
|
|
86
|
+
### Jupyter/Colab Support
|
87
|
+
|
88
|
+
In Jupyter notebooks and Google Colab, you can use `show=True` to automatically display the highlights when exiting the context:
|
89
|
+
|
90
|
+
```python
|
91
|
+
# Automatically displays the image in Jupyter/Colab
|
92
|
+
with page.highlights(show=True) as h:
|
93
|
+
h.add(summary_elements, label='Summary')
|
94
|
+
h.add(date_elements, label='Date')
|
95
|
+
h.add(line_elements, label='Lines')
|
96
|
+
# No need to call h.show() - displays automatically!
|
97
|
+
```
|
98
|
+
|
99
|
+
### Quick Highlighting with `.highlight()`
|
100
|
+
|
101
|
+
For simple highlighting tasks, use the `.highlight()` convenience method:
|
102
|
+
|
103
|
+
```python
|
104
|
+
# Highlight multiple elements in one line
|
105
|
+
page.highlight(summary_elements, date_elements, line_elements)
|
106
|
+
|
107
|
+
# With custom colors
|
108
|
+
page.highlight(
|
109
|
+
(summary_elements, 'red'),
|
110
|
+
(date_elements, 'blue'),
|
111
|
+
(line_elements, 'green')
|
112
|
+
)
|
113
|
+
|
114
|
+
# With colors and labels
|
115
|
+
page.highlight(
|
116
|
+
(summary_elements, 'red', 'Summary Text'),
|
117
|
+
(date_elements, 'blue', 'Date Fields'),
|
118
|
+
(line_elements, 'green', 'Separator Lines')
|
119
|
+
)
|
120
|
+
|
121
|
+
# Pass additional parameters like width or resolution
|
122
|
+
page.highlight(summary_elements, date_elements, width=800, labels=True)
|
123
|
+
```
|
124
|
+
|
125
|
+
This method is particularly useful in Jupyter/Colab environments where the image displays automatically as the cell output.
|
126
|
+
|
86
127
|
## Customizing Multiple Highlights
|
87
128
|
|
88
129
|
Customize the appearance of multiple highlights using the context manager:
|
@@ -133,7 +174,7 @@ content = title.below(height=200)
|
|
133
174
|
content.show()
|
134
175
|
```
|
135
176
|
|
136
|
-
Or look at just the region by itself
|
177
|
+
Or look at just the region by itself:
|
137
178
|
|
138
179
|
```python
|
139
180
|
# Find a title and create a region below it
|
@@ -144,6 +185,27 @@ content = title.below(height=200)
|
|
144
185
|
content.show(crop=True)
|
145
186
|
```
|
146
187
|
|
188
|
+
### Highlighting Multiple Regions
|
189
|
+
|
190
|
+
The `.highlight()` method works with regions too:
|
191
|
+
|
192
|
+
```python
|
193
|
+
# Create multiple regions
|
194
|
+
left = page.region(left=0, right=page.width/3, top=0, bottom=page.height)
|
195
|
+
mid = page.region(left=page.width/3, right=page.width/3*2, top=0, bottom=page.height)
|
196
|
+
right = page.region(left=page.width/3*2, right=page.width, top=0, bottom=page.height)
|
197
|
+
|
198
|
+
# Highlight all three regions
|
199
|
+
page.highlight(left, mid, right)
|
200
|
+
|
201
|
+
# Or with custom colors
|
202
|
+
page.highlight(
|
203
|
+
(left, 'red', 'Left Column'),
|
204
|
+
(mid, 'green', 'Middle Column'),
|
205
|
+
(right, 'blue', 'Right Column')
|
206
|
+
)
|
207
|
+
```
|
208
|
+
|
147
209
|
## Working with Text Styles
|
148
210
|
|
149
211
|
Visualize text styles to understand the document structure:
|
@@ -29,9 +29,22 @@ class DirectionalCollectionMixin:
|
|
29
29
|
"""Find regions to the right of all elements in this collection."""
|
30
30
|
return self.apply(lambda element: element.right(**kwargs))
|
31
31
|
|
32
|
-
def expand(self, **kwargs) -> "ElementCollection":
|
33
|
-
"""Expand all elements in this collection.
|
34
|
-
|
32
|
+
def expand(self, *args, **kwargs) -> "ElementCollection":
|
33
|
+
"""Expand all elements in this collection.
|
34
|
+
|
35
|
+
Args:
|
36
|
+
*args: If a single positional argument is provided, expands all elements
|
37
|
+
by that amount in all directions.
|
38
|
+
**kwargs: Keyword arguments for directional expansion (left, right, top, bottom, etc.)
|
39
|
+
|
40
|
+
Examples:
|
41
|
+
# Expand all elements by 5 pixels in all directions
|
42
|
+
collection.expand(5)
|
43
|
+
|
44
|
+
# Expand with different amounts in each direction
|
45
|
+
collection.expand(left=10, right=5, top=3, bottom=7)
|
46
|
+
"""
|
47
|
+
return self.apply(lambda element: element.expand(*args, **kwargs))
|
35
48
|
|
36
49
|
|
37
50
|
class ApplyMixin:
|
@@ -335,6 +335,7 @@ class HighlightContext:
|
|
335
335
|
self.show_on_exit = show_on_exit
|
336
336
|
self.highlight_groups = []
|
337
337
|
self._color_manager = ColorManager()
|
338
|
+
self._exit_image = None # Store image for Jupyter display
|
338
339
|
|
339
340
|
def add(
|
340
341
|
self,
|
@@ -421,6 +422,11 @@ class HighlightContext:
|
|
421
422
|
)
|
422
423
|
return None
|
423
424
|
|
425
|
+
@property
|
426
|
+
def image(self) -> Optional[Image.Image]:
|
427
|
+
"""Get the last generated image (useful after context exit)."""
|
428
|
+
return self._exit_image
|
429
|
+
|
424
430
|
def __enter__(self) -> "HighlightContext":
|
425
431
|
"""Enter the context."""
|
426
432
|
return self
|
@@ -428,7 +434,25 @@ class HighlightContext:
|
|
428
434
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
429
435
|
"""Exit the context, optionally showing highlights."""
|
430
436
|
if self.show_on_exit and not exc_type:
|
431
|
-
self.show()
|
437
|
+
self._exit_image = self.show()
|
438
|
+
|
439
|
+
# Check if we're in a Jupyter/IPython environment
|
440
|
+
try:
|
441
|
+
# Try to get IPython instance
|
442
|
+
from IPython import get_ipython
|
443
|
+
|
444
|
+
ipython = get_ipython()
|
445
|
+
if ipython is not None:
|
446
|
+
# We're in IPython/Jupyter
|
447
|
+
from IPython.display import display
|
448
|
+
|
449
|
+
if self._exit_image is not None:
|
450
|
+
display(self._exit_image)
|
451
|
+
except (ImportError, NameError):
|
452
|
+
# Not in Jupyter or IPython not available - that's OK
|
453
|
+
pass
|
454
|
+
|
455
|
+
# __exit__ must return False to not suppress exceptions
|
432
456
|
return False
|
433
457
|
|
434
458
|
|
@@ -78,6 +78,7 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
78
78
|
|
79
79
|
# # Import new utils
|
80
80
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
81
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
81
82
|
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
|
82
83
|
|
83
84
|
# --- End Classification Imports --- #
|
@@ -101,6 +102,7 @@ class Page(
|
|
101
102
|
ExtractionMixin,
|
102
103
|
ShapeDetectionMixin,
|
103
104
|
DescribeMixin,
|
105
|
+
VisualSearchMixin,
|
104
106
|
Visualizable,
|
105
107
|
):
|
106
108
|
"""Enhanced Page wrapper built on top of pdfplumber.Page.
|
@@ -1976,7 +1978,7 @@ class Page(
|
|
1976
1978
|
"""Get all line elements on this page."""
|
1977
1979
|
return self._element_mgr.lines
|
1978
1980
|
|
1979
|
-
def
|
1981
|
+
def add_highlight(
|
1980
1982
|
self,
|
1981
1983
|
bbox: Optional[Tuple[float, float, float, float]] = None,
|
1982
1984
|
color: Optional[Union[Tuple, str]] = None,
|
@@ -1987,7 +1989,7 @@ class Page(
|
|
1987
1989
|
existing: str = "append",
|
1988
1990
|
) -> "Page":
|
1989
1991
|
"""
|
1990
|
-
|
1992
|
+
Add a highlight to a bounding box or the entire page.
|
1991
1993
|
Delegates to the central HighlightingService.
|
1992
1994
|
|
1993
1995
|
Args:
|
@@ -2015,7 +2017,7 @@ class Page(
|
|
2015
2017
|
)
|
2016
2018
|
return self
|
2017
2019
|
|
2018
|
-
def
|
2020
|
+
def add_highlight_polygon(
|
2019
2021
|
self,
|
2020
2022
|
polygon: List[Tuple[float, float]],
|
2021
2023
|
color: Optional[Union[Tuple, str]] = None,
|
@@ -259,7 +259,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
259
259
|
self,
|
260
260
|
*,
|
261
261
|
text: str,
|
262
|
-
|
262
|
+
overlap: str = "full",
|
263
263
|
apply_exclusions: bool = True,
|
264
264
|
regex: bool = False,
|
265
265
|
case: bool = True,
|
@@ -271,7 +271,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
271
271
|
self,
|
272
272
|
selector: str,
|
273
273
|
*,
|
274
|
-
|
274
|
+
overlap: str = "full",
|
275
275
|
apply_exclusions: bool = True,
|
276
276
|
regex: bool = False,
|
277
277
|
case: bool = True,
|
@@ -283,7 +283,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
283
283
|
selector: Optional[str] = None,
|
284
284
|
*,
|
285
285
|
text: Optional[str] = None,
|
286
|
-
|
286
|
+
overlap: str = "full",
|
287
287
|
apply_exclusions: bool = True,
|
288
288
|
regex: bool = False,
|
289
289
|
case: bool = True,
|
@@ -297,9 +297,9 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
297
297
|
Args:
|
298
298
|
selector: CSS-like selector string.
|
299
299
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
300
|
-
|
301
|
-
'
|
302
|
-
(default: "
|
300
|
+
overlap: How to determine if elements overlap: 'full' (fully inside),
|
301
|
+
'partial' (any overlap), or 'center' (center point inside).
|
302
|
+
(default: "full")
|
303
303
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
304
304
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
305
305
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -313,7 +313,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
313
313
|
element = page.find(
|
314
314
|
selector=selector,
|
315
315
|
text=text,
|
316
|
-
|
316
|
+
overlap=overlap,
|
317
317
|
apply_exclusions=apply_exclusions,
|
318
318
|
regex=regex,
|
319
319
|
case=case,
|
@@ -328,7 +328,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
328
328
|
self,
|
329
329
|
*,
|
330
330
|
text: str,
|
331
|
-
|
331
|
+
overlap: str = "full",
|
332
332
|
apply_exclusions: bool = True,
|
333
333
|
regex: bool = False,
|
334
334
|
case: bool = True,
|
@@ -340,7 +340,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
340
340
|
self,
|
341
341
|
selector: str,
|
342
342
|
*,
|
343
|
-
|
343
|
+
overlap: str = "full",
|
344
344
|
apply_exclusions: bool = True,
|
345
345
|
regex: bool = False,
|
346
346
|
case: bool = True,
|
@@ -352,7 +352,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
352
352
|
selector: Optional[str] = None,
|
353
353
|
*,
|
354
354
|
text: Optional[str] = None,
|
355
|
-
|
355
|
+
overlap: str = "full",
|
356
356
|
apply_exclusions: bool = True,
|
357
357
|
regex: bool = False,
|
358
358
|
case: bool = True,
|
@@ -366,9 +366,9 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
366
366
|
Args:
|
367
367
|
selector: CSS-like selector string.
|
368
368
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
369
|
-
|
370
|
-
'
|
371
|
-
(default: "
|
369
|
+
overlap: How to determine if elements overlap: 'full' (fully inside),
|
370
|
+
'partial' (any overlap), or 'center' (center point inside).
|
371
|
+
(default: "full")
|
372
372
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
373
373
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
374
374
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -383,7 +383,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
383
383
|
elements = page.find_all(
|
384
384
|
selector=selector,
|
385
385
|
text=text,
|
386
|
-
|
386
|
+
overlap=overlap,
|
387
387
|
apply_exclusions=apply_exclusions,
|
388
388
|
regex=regex,
|
389
389
|
case=case,
|
@@ -42,6 +42,7 @@ from natural_pdf.ocr import OCRManager, OCROptions
|
|
42
42
|
from natural_pdf.selectors.parser import parse_selector
|
43
43
|
from natural_pdf.text_mixin import TextMixin
|
44
44
|
from natural_pdf.utils.locks import pdf_render_lock
|
45
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
45
46
|
|
46
47
|
if TYPE_CHECKING:
|
47
48
|
from natural_pdf.elements.element_collection import ElementCollection
|
@@ -252,7 +253,9 @@ class _LazyPageList(Sequence):
|
|
252
253
|
# --- End Lazy Page List Helper --- #
|
253
254
|
|
254
255
|
|
255
|
-
class PDF(
|
256
|
+
class PDF(
|
257
|
+
TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, VisualSearchMixin, Visualizable
|
258
|
+
):
|
256
259
|
"""Enhanced PDF wrapper built on top of pdfplumber.
|
257
260
|
|
258
261
|
This class provides a fluent interface for working with PDF documents,
|
@@ -40,6 +40,7 @@ logger = logging.getLogger(__name__)
|
|
40
40
|
from natural_pdf.core.pdf import PDF
|
41
41
|
from natural_pdf.elements.region import Region
|
42
42
|
from natural_pdf.export.mixin import ExportMixin
|
43
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
43
44
|
|
44
45
|
# --- Search Imports ---
|
45
46
|
try:
|
@@ -69,8 +70,8 @@ from natural_pdf.search.searchable_mixin import SearchableMixin # Import the ne
|
|
69
70
|
|
70
71
|
|
71
72
|
class PDFCollection(
|
72
|
-
SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin
|
73
|
-
):
|
73
|
+
SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin, VisualSearchMixin
|
74
|
+
):
|
74
75
|
def __init__(
|
75
76
|
self,
|
76
77
|
source: Union[str, Iterable[Union[str, "PDF"]]],
|
@@ -258,8 +259,6 @@ class PDFCollection(
|
|
258
259
|
return iter(self._pdfs)
|
259
260
|
|
260
261
|
def __repr__(self) -> str:
|
261
|
-
# Removed search status
|
262
|
-
return f"<PDFCollection(count={len(self._pdfs)})>"
|
263
262
|
return f"<PDFCollection(count={len(self._pdfs)})>"
|
264
263
|
|
265
264
|
@property
|
@@ -267,6 +266,134 @@ class PDFCollection(
|
|
267
266
|
"""Returns the list of PDF objects held by the collection."""
|
268
267
|
return self._pdfs
|
269
268
|
|
269
|
+
def show(self, limit: Optional[int] = 30, per_pdf_limit: Optional[int] = 10, **kwargs):
    """
    Display all PDFs in the collection with labels.

    Each PDF is rendered with its pages in a grid layout (6 columns unless
    ``columns`` is supplied), gets a label strip with its name and page
    count, and all labelled images are stacked vertically.

    Args:
        limit: Maximum total pages to show across all PDFs (default: 30)
        per_pdf_limit: Maximum pages to show per PDF (default: 10)
        **kwargs: Additional arguments forwarded to each PDF's
            ``_get_render_specs()`` and to the highlighter's
            ``unified_render()`` (e.g., columns, layout, exclusions,
            resolution, etc.). Caller-supplied ``columns`` and ``layout``
            override the defaults used here.

    Returns:
        The combined PIL image (displayed automatically by Jupyter), or
        None when the collection is empty or nothing could be rendered.
    """
    if not self._pdfs:
        print("Empty collection")
        return None

    # Imported lazily: only needed once there is something to render.
    from PIL import Image, ImageDraw, ImageFont

    # Derive a per-PDF budget from the total when only the total is given.
    if limit and not per_pdf_limit:
        per_pdf_limit = max(1, limit // len(self._pdfs))

    label_height = 40
    label_bg_color = (240, 240, 240)
    label_text_color = (0, 0, 0)
    font = None  # loaded on first use and then shared by every label

    all_images = []
    total_pages_shown = 0

    for pdf in self._pdfs:
        if limit and total_pages_shown >= limit:
            break

        # Page budget for this PDF, capped by what is left of the total.
        pdf_limit = per_pdf_limit
        if limit:
            remaining = limit - total_pages_shown
            pdf_limit = min(per_pdf_limit or remaining, remaining)

        # Reduce the identifier to a bare file name; accept both "/" and
        # "\\" separators so Windows-style paths are shortened as well.
        pdf_name = getattr(pdf, "filename", None) or getattr(pdf, "path", "Unknown")
        if isinstance(pdf_name, Path):
            pdf_name = pdf_name.name
        else:
            pdf_name = str(pdf_name).replace("\\", "/").rsplit("/", 1)[-1]

        try:
            render_specs = pdf._get_render_specs(mode="show", max_pages=pdf_limit, **kwargs)
            if not render_specs:
                continue

            # Defaults first, caller overrides second: passing ``columns=6``
            # next to ``**kwargs`` would raise "multiple values for keyword
            # argument" whenever the caller supplies ``columns``.
            render_kwargs = {
                "columns": 6,
                "layout": "grid" if len(render_specs) > 1 else "single",
                **kwargs,
            }
            highlighter = pdf._get_highlighter()
            pdf_image = highlighter.unified_render(specs=render_specs, **render_kwargs)
            if not pdf_image:
                continue

            if font is None:
                try:
                    # Try to use a nice font if available
                    font = ImageFont.truetype("Arial", 20)
                except (OSError, ImportError):
                    # Font file missing or FreeType support unavailable
                    font = ImageFont.load_default()

            # New canvas: label strip on top, rendered PDF below it.
            width, height = pdf_image.size
            labeled_image = Image.new("RGB", (width, height + label_height), "white")
            draw = ImageDraw.Draw(labeled_image)
            draw.rectangle([0, 0, width, label_height], fill=label_bg_color)

            page_count = len(pdf.pages)
            label_text = f"{pdf_name} ({page_count} pages)"
            draw.text((10, 10), label_text, fill=label_text_color, font=font)
            labeled_image.paste(pdf_image, (0, label_height))

            all_images.append(labeled_image)
            # ``pdf_limit`` is None when neither limit is set; every page counts then.
            total_pages_shown += (
                page_count if pdf_limit is None else min(pdf_limit, page_count)
            )

        except Exception as e:
            # Best effort: one unrenderable PDF must not hide the others.
            logger.warning(f"Failed to render PDF {pdf_name}: {e}")
            continue

    if not all_images:
        print("No PDFs could be rendered")
        return None

    if len(all_images) == 1:
        return all_images[0]

    # Stack the labelled images vertically with a gap between PDFs.
    spacing = 20
    total_height = sum(img.height for img in all_images) + spacing * (len(all_images) - 1)
    max_width = max(img.width for img in all_images)
    combined = Image.new("RGB", (max_width, total_height), "white")

    y_offset = 0
    for img in all_images:
        # Center images that are narrower than the widest one.
        x_offset = (max_width - img.width) // 2
        combined.paste(img, (x_offset, y_offset))
        y_offset += img.height + spacing

    # Return the combined image (Jupyter will display it automatically)
    return combined
|
396
|
+
|
270
397
|
@overload
|
271
398
|
def find_all(
|
272
399
|
self,
|
@@ -92,6 +92,50 @@ class Visualizable:
|
|
92
92
|
_get_render_specs() to gain full image generation capabilities.
|
93
93
|
"""
|
94
94
|
|
95
|
+
def highlight(self, *elements, **kwargs):
    """
    Highlight one or more elements and return the rendered image.

    Builds a HighlightContext around this object (with show-on-exit
    disabled), registers every positional argument with it, and returns
    whatever the context's show() produces. Intended for one-liner use in
    Jupyter/Colab notebooks.

    Args:
        *elements: Elements or element collections to highlight. Each item
            may also be a 2-tuple ``(element, color)`` or a 3-tuple
            ``(element, color, label)``; anything else is added as-is.
        **kwargs: Additional parameters passed to show()

    Returns:
        PIL Image with highlights

    Example:
        # Simple one-liner highlighting
        page.highlight(left, mid, right)

        # With custom colors
        page.highlight(
            (tables, 'blue'),
            (headers, 'red'),
            (footers, 'green')
        )
    """
    from natural_pdf.core.highlighting_service import HighlightContext

    context = HighlightContext(self, show_on_exit=False)

    for item in elements:
        is_tuple = isinstance(item, tuple)
        if is_tuple and len(item) == 2:
            # (element, color)
            target, colour = item
            context.add(target, color=colour)
        elif is_tuple and len(item) == 3:
            # (element, color, label)
            target, colour, caption = item
            context.add(target, color=colour, label=caption)
        else:
            # Bare element (or a tuple of any other length), passed through untouched
            context.add(item)

    # Hand the rendered image straight back to the caller
    return context.show(**kwargs)
|
138
|
+
|
95
139
|
def _get_render_specs(
|
96
140
|
self, mode: Literal["show", "render"] = "show", **kwargs
|
97
141
|
) -> List[RenderSpec]:
|
@@ -142,7 +186,7 @@ class Visualizable:
|
|
142
186
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
143
187
|
labels: bool = True,
|
144
188
|
label_format: Optional[str] = None,
|
145
|
-
highlights: Optional[List[Dict[str, Any]]] = None,
|
189
|
+
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
146
190
|
legend_position: str = "right",
|
147
191
|
annotate: Optional[Union[str, List[str]]] = None,
|
148
192
|
# Layout options for multi-page/region
|
@@ -167,7 +211,7 @@ class Visualizable:
|
|
167
211
|
color: Default highlight color
|
168
212
|
labels: Whether to show labels for highlights
|
169
213
|
label_format: Format string for labels (e.g., "Element {index}")
|
170
|
-
highlights: Additional highlight groups to show
|
214
|
+
highlights: Additional highlight groups to show, or False to disable all highlights
|
171
215
|
legend_position: Position of legend/colorbar ('right', 'left', 'top', 'bottom')
|
172
216
|
annotate: Attribute name(s) to display on highlights (string or list)
|
173
217
|
layout: How to arrange multiple pages/regions (defaults to 'grid' for multi-page, 'single' for single page)
|