natural-pdf 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.2.3/natural_pdf.egg-info → natural_pdf-0.2.4}/PKG-INFO +1 -1
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/page.py +2 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/pdf.py +4 -1
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/pdf_collection.py +131 -4
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/render_spec.py +2 -2
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/base.py +18 -14
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/region.py +10 -8
- natural_pdf-0.2.4/natural_pdf/vision/__init__.py +7 -0
- natural_pdf-0.2.4/natural_pdf/vision/mixin.py +209 -0
- natural_pdf-0.2.4/natural_pdf/vision/results.py +146 -0
- natural_pdf-0.2.4/natural_pdf/vision/similarity.py +321 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4/natural_pdf.egg-info}/PKG-INFO +1 -1
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf.egg-info/SOURCES.txt +7 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf.egg-info/top_level.txt +0 -1
- natural_pdf-0.2.4/tests/test_element_show_crop_highlights.py +168 -0
- natural_pdf-0.2.4/tests/test_find_similar.py +147 -0
- natural_pdf-0.2.4/tests/test_region_show_crop_highlights.py +219 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.github/workflows/ci.yml +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.github/workflows/nightly-tutorials.yml +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.gitignore +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/CLAUDE.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/LICENSE +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/MANIFEST.in +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/README.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/audit_packaging.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/check_run_md.sh +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/api/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/describe/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/extracting-clean-text/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/fix-messy-tables/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/fix-messy-tables/table_1.csv +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/fix-messy-tables/table_2.csv +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/fix-messy-tables/table_3.csv +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/installation/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/ocr/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/process-forms-and-invoices/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/quick-reference/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/reflowing-pages/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/regions/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tables/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/14-categorizing-documents.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/mkdocs.yml +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/guides.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/classification/manager.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/cli.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/collections/mixins.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/element_manager.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/page_collection.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/page_groupby.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/describe/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/describe/base.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/describe/elements.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/describe/mixin.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/describe/summary.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/element_collection.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/image.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/hocr.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/original_pdf.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/flows/element.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/flows/flow.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/flows/region.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/qa/qa_result.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/search/lancedb_search_service.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/search/numpy_search_service.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/selectors/parser.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/tables/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/tables/result.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/text_mixin.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/bidi_mirror.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/layout.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf.egg-info/entry_points.txt +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf.egg-info/requires.txt +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/noxfile.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/memory_comparison.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/pdf_analyzer.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/performance_analysis.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/performance_results/image_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/performance_results/text_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/test_cleanup_methods.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/test_memory_fix.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/publish.sh +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/pyproject.toml +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/sample-screen.png +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/setup.cfg +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/test_install.sh +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/conftest.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_annotate.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_arabic_performance.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_arabic_real_world.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_color_conversion.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_core/test_containment_geometry.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_core/test_text_layer.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_directional_defaults.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_document_qa.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_element_collection_slicing.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_exclusions.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_expand.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_extraction_error.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_extraction_mixin_fix.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_extraction_text_and_vision.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_extraction_working.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_first_last_selectors.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_flow_region_directional.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_groupby.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_guides.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_guides_apply_exclusions.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_guides_apply_exclusions_simple.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_guides_extract_table.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_guides_extract_table_real.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_guides_integration.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_highlight_detection.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_highlight_protocol.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_highlight_protocol_simple.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_highlight_regions.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_multi_page_table_discovery.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_page_exclusion_lists.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_region_viewer.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_sections_end_only.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_show_column_layout.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_show_edge_cases.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_show_exclusions.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_show_exclusions_feature.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_show_limit.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_skip_repeating_headers_multipage.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_strikethrough_detection.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_table_result_header_mismatch.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_table_result_keep_blank.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_tiny_text_tables.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_tiny_text_tables_table.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_tutorials.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_underline_detection.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_update_text.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/todo/bad_pdf_analysis.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/todo/evaluation.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/README.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/__init__.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/analyser.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/collate_summaries.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/eval_suite.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/llm_enrich.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/reporter.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/utils.py +0 -0
- {natural_pdf-0.2.3 → natural_pdf-0.2.4}/uv.lock +0 -0
@@ -78,6 +78,7 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
78
78
|
|
79
79
|
# # Import new utils
|
80
80
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
81
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
81
82
|
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
|
82
83
|
|
83
84
|
# --- End Classification Imports --- #
|
@@ -101,6 +102,7 @@ class Page(
|
|
101
102
|
ExtractionMixin,
|
102
103
|
ShapeDetectionMixin,
|
103
104
|
DescribeMixin,
|
105
|
+
VisualSearchMixin,
|
104
106
|
Visualizable,
|
105
107
|
):
|
106
108
|
"""Enhanced Page wrapper built on top of pdfplumber.Page.
|
@@ -42,6 +42,7 @@ from natural_pdf.ocr import OCRManager, OCROptions
|
|
42
42
|
from natural_pdf.selectors.parser import parse_selector
|
43
43
|
from natural_pdf.text_mixin import TextMixin
|
44
44
|
from natural_pdf.utils.locks import pdf_render_lock
|
45
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
45
46
|
|
46
47
|
if TYPE_CHECKING:
|
47
48
|
from natural_pdf.elements.element_collection import ElementCollection
|
@@ -252,7 +253,9 @@ class _LazyPageList(Sequence):
|
|
252
253
|
# --- End Lazy Page List Helper --- #
|
253
254
|
|
254
255
|
|
255
|
-
class PDF(
|
256
|
+
class PDF(
|
257
|
+
TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, VisualSearchMixin, Visualizable
|
258
|
+
):
|
256
259
|
"""Enhanced PDF wrapper built on top of pdfplumber.
|
257
260
|
|
258
261
|
This class provides a fluent interface for working with PDF documents,
|
@@ -40,6 +40,7 @@ logger = logging.getLogger(__name__)
|
|
40
40
|
from natural_pdf.core.pdf import PDF
|
41
41
|
from natural_pdf.elements.region import Region
|
42
42
|
from natural_pdf.export.mixin import ExportMixin
|
43
|
+
from natural_pdf.vision.mixin import VisualSearchMixin
|
43
44
|
|
44
45
|
# --- Search Imports ---
|
45
46
|
try:
|
@@ -69,8 +70,8 @@ from natural_pdf.search.searchable_mixin import SearchableMixin # Import the ne
|
|
69
70
|
|
70
71
|
|
71
72
|
class PDFCollection(
|
72
|
-
SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin
|
73
|
-
):
|
73
|
+
SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin, VisualSearchMixin
|
74
|
+
):
|
74
75
|
def __init__(
|
75
76
|
self,
|
76
77
|
source: Union[str, Iterable[Union[str, "PDF"]]],
|
@@ -258,8 +259,6 @@ class PDFCollection(
|
|
258
259
|
return iter(self._pdfs)
|
259
260
|
|
260
261
|
def __repr__(self) -> str:
|
261
|
-
# Removed search status
|
262
|
-
return f"<PDFCollection(count={len(self._pdfs)})>"
|
263
262
|
return f"<PDFCollection(count={len(self._pdfs)})>"
|
264
263
|
|
265
264
|
@property
|
@@ -267,6 +266,134 @@ class PDFCollection(
|
|
267
266
|
"""Returns the list of PDF objects held by the collection."""
|
268
267
|
return self._pdfs
|
269
268
|
|
269
|
+
def show(self, limit: Optional[int] = 30, per_pdf_limit: Optional[int] = 10, **kwargs):
    """
    Display all PDFs in the collection with labels.

    Each PDF is rendered as a grid of its pages (6 columns unless ``columns``
    is supplied in ``kwargs``), a caption bar with the file name and page
    count is drawn above it, and the captioned images are stacked vertically.

    Args:
        limit: Maximum total pages to show across all PDFs (default: 30).
            A falsy value disables the overall cap.
        per_pdf_limit: Maximum pages to show per PDF (default: 10). When it
            is falsy and ``limit`` is set, ``limit`` is split evenly across
            the PDFs (at least one page each).
        **kwargs: Forwarded to each PDF's ``_get_render_specs()`` and to the
            highlighter's ``unified_render()`` (e.g., columns, exclusions,
            resolution, etc.). ``columns`` and ``layout`` given here
            override the defaults chosen by this method.

    Returns:
        The combined PIL image (Jupyter displays it automatically), or None
        when the collection is empty or no PDF could be rendered.
    """
    if not self._pdfs:
        print("Empty collection")
        return None

    # Derive a per-PDF budget when only a total limit was given
    if limit and not per_pdf_limit:
        per_pdf_limit = max(1, limit // len(self._pdfs))

    # Caption bar appearance
    label_height = 40
    label_bg_color = (240, 240, 240)
    label_text_color = (0, 0, 0)

    all_images = []
    total_pages_shown = 0

    for pdf in self._pdfs:
        if limit and total_pages_shown >= limit:
            break

        # Page budget for this PDF, capped by what is left of the total
        pdf_limit = per_pdf_limit
        if limit:
            remaining = limit - total_pages_shown
            pdf_limit = min(per_pdf_limit or remaining, remaining)

        # Short, human-readable identifier for the caption
        pdf_name = getattr(pdf, "filename", None) or getattr(pdf, "path", "Unknown")
        if isinstance(pdf_name, Path):
            pdf_name = pdf_name.name
        elif "/" in str(pdf_name):
            pdf_name = str(pdf_name).split("/")[-1]

        try:
            render_specs = pdf._get_render_specs(mode="show", max_pages=pdf_limit, **kwargs)
            if not render_specs:
                continue

            # Caller-supplied layout/columns win over the defaults. Passing
            # them next to **kwargs would raise "multiple values for keyword
            # argument" whenever the caller sets one of them.
            render_kwargs = dict(kwargs)
            render_kwargs.setdefault("layout", "grid" if len(render_specs) > 1 else "single")
            render_kwargs.setdefault("columns", 6)

            # Render without displaying
            highlighter = pdf._get_highlighter()
            pdf_image = highlighter.unified_render(specs=render_specs, **render_kwargs)
            if not pdf_image:
                continue

            # Imported lazily (also avoids circular imports): Pillow is only
            # needed once there is an image to caption.
            from PIL import Image, ImageDraw, ImageFont

            # New canvas with room for the caption bar above the pages
            width, height = pdf_image.size
            labeled_image = Image.new("RGB", (width, height + label_height), "white")
            draw = ImageDraw.Draw(labeled_image)
            draw.rectangle([0, 0, width, label_height], fill=label_bg_color)

            try:
                # Try to use a nice font if available
                font = ImageFont.truetype("Arial", 20)
            except Exception:
                # Best-effort fallback; unlike a bare ``except`` this lets
                # KeyboardInterrupt/SystemExit propagate.
                font = ImageFont.load_default()

            label_text = f"{pdf_name} ({len(pdf.pages)} pages)"
            draw.text((10, 10), label_text, fill=label_text_color, font=font)

            # Paste PDF image below label
            labeled_image.paste(pdf_image, (0, label_height))
            all_images.append(labeled_image)

            # pdf_limit is None when neither limit is set; min(None, int)
            # would raise, so count every page of the PDF in that case.
            page_count = len(pdf.pages)
            total_pages_shown += page_count if pdf_limit is None else min(pdf_limit, page_count)

        except Exception as e:
            logger.warning(f"Failed to render PDF {pdf_name}: {e}")
            continue

    if not all_images:
        print("No PDFs could be rendered")
        return None

    if len(all_images) == 1:
        return all_images[0]

    from PIL import Image

    # Stack the captioned images vertically with a gap between PDFs
    spacing = 20
    total_height = sum(img.height for img in all_images) + spacing * (len(all_images) - 1)
    max_width = max(img.width for img in all_images)
    combined = Image.new("RGB", (max_width, total_height), "white")

    y_offset = 0
    for img in all_images:
        # Center images that are narrower than the widest one
        x_offset = (max_width - img.width) // 2
        combined.paste(img, (x_offset, y_offset))
        y_offset += img.height + spacing

    # Return the combined image (Jupyter will display it automatically)
    return combined
|
396
|
+
|
270
397
|
@overload
|
271
398
|
def find_all(
|
272
399
|
self,
|
@@ -186,7 +186,7 @@ class Visualizable:
|
|
186
186
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
187
187
|
labels: bool = True,
|
188
188
|
label_format: Optional[str] = None,
|
189
|
-
highlights: Optional[List[Dict[str, Any]]] = None,
|
189
|
+
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
190
190
|
legend_position: str = "right",
|
191
191
|
annotate: Optional[Union[str, List[str]]] = None,
|
192
192
|
# Layout options for multi-page/region
|
@@ -211,7 +211,7 @@ class Visualizable:
|
|
211
211
|
color: Default highlight color
|
212
212
|
labels: Whether to show labels for highlights
|
213
213
|
label_format: Format string for labels (e.g., "Element {index}")
|
214
|
-
highlights: Additional highlight groups to show
|
214
|
+
highlights: Additional highlight groups to show, or False to disable all highlights
|
215
215
|
legend_position: Position of legend/colorbar ('right', 'left', 'top', 'bottom')
|
216
216
|
annotate: Attribute name(s) to display on highlights (string or list)
|
217
217
|
layout: How to arrange multiple pages/regions (defaults to 'grid' for multi-page, 'single' for single page)
|
@@ -1192,7 +1192,7 @@ class Element(
|
|
1192
1192
|
self,
|
1193
1193
|
mode: Literal["show", "render"] = "show",
|
1194
1194
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
1195
|
-
highlights: Optional[List[Dict[str, Any]]] = None,
|
1195
|
+
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
1196
1196
|
crop: Union[bool, Literal["content"]] = False,
|
1197
1197
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
1198
1198
|
label: Optional[str] = None,
|
@@ -1203,7 +1203,7 @@ class Element(
|
|
1203
1203
|
Args:
|
1204
1204
|
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
1205
1205
|
color: Color for highlighting this element in show mode
|
1206
|
-
highlights: Additional highlight groups to show
|
1206
|
+
highlights: Additional highlight groups to show, or False to disable all highlights
|
1207
1207
|
crop: Whether to crop to element bounds
|
1208
1208
|
crop_bbox: Explicit crop bounds
|
1209
1209
|
label: Optional label for this element
|
@@ -1225,19 +1225,23 @@ class Element(
|
|
1225
1225
|
if hasattr(self, "bbox") and self.bbox:
|
1226
1226
|
spec.crop_bbox = self.bbox
|
1227
1227
|
|
1228
|
-
# Add highlight in show mode
|
1229
|
-
if mode == "show":
|
1230
|
-
#
|
1231
|
-
|
1232
|
-
|
1233
|
-
|
1234
|
-
|
1235
|
-
|
1236
|
-
|
1237
|
-
|
1228
|
+
# Add highlight in show mode (unless explicitly disabled with highlights=False)
|
1229
|
+
if mode == "show" and highlights is not False:
|
1230
|
+
# Only highlight this element if:
|
1231
|
+
# 1. We're not cropping, OR
|
1232
|
+
# 2. We're cropping but color was explicitly specified
|
1233
|
+
if not crop or color is not None:
|
1234
|
+
# Use provided label or generate one
|
1235
|
+
element_label = label if label is not None else self.__class__.__name__
|
1236
|
+
|
1237
|
+
spec.add_highlight(
|
1238
|
+
element=self,
|
1239
|
+
color=color or "red", # Default red for single element
|
1240
|
+
label=element_label,
|
1241
|
+
)
|
1238
1242
|
|
1239
|
-
# Add additional highlight groups if provided
|
1240
|
-
if highlights:
|
1243
|
+
# Add additional highlight groups if provided (and highlights is a list)
|
1244
|
+
if highlights and isinstance(highlights, list):
|
1241
1245
|
for group in highlights:
|
1242
1246
|
group_elements = group.get("elements", [])
|
1243
1247
|
group_color = group.get("color", color)
|
@@ -221,7 +221,7 @@ class Region(
|
|
221
221
|
self,
|
222
222
|
mode: Literal["show", "render"] = "show",
|
223
223
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
224
|
-
highlights: Optional[List[Dict[str, Any]]] = None,
|
224
|
+
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
225
225
|
crop: Union[bool, Literal["content"]] = True, # Default to True for regions
|
226
226
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
227
227
|
**kwargs,
|
@@ -231,7 +231,7 @@ class Region(
|
|
231
231
|
Args:
|
232
232
|
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
233
233
|
color: Color for highlighting this region in show mode
|
234
|
-
highlights: Additional highlight groups to show
|
234
|
+
highlights: Additional highlight groups to show, or False to disable all highlights
|
235
235
|
crop: Whether to crop to this region
|
236
236
|
crop_bbox: Explicit crop bounds (overrides region bounds)
|
237
237
|
**kwargs: Additional parameters
|
@@ -250,10 +250,12 @@ class Region(
|
|
250
250
|
# Crop to this region's bounds
|
251
251
|
spec.crop_bbox = self.bbox
|
252
252
|
|
253
|
-
# Add highlights in show mode
|
254
|
-
if mode == "show":
|
255
|
-
#
|
256
|
-
|
253
|
+
# Add highlights in show mode (unless explicitly disabled with highlights=False)
|
254
|
+
if mode == "show" and highlights is not False:
|
255
|
+
# Only highlight this region if:
|
256
|
+
# 1. We're not cropping, OR
|
257
|
+
# 2. We're cropping but color was explicitly specified
|
258
|
+
if not crop or color is not None:
|
257
259
|
spec.add_highlight(
|
258
260
|
bbox=self.bbox,
|
259
261
|
polygon=self.polygon if self.has_polygon else None,
|
@@ -261,8 +263,8 @@ class Region(
|
|
261
263
|
label=self.label or self.name or "Region",
|
262
264
|
)
|
263
265
|
|
264
|
-
# Add additional highlight groups if provided
|
265
|
-
if highlights:
|
266
|
+
# Add additional highlight groups if provided (and highlights is a list)
|
267
|
+
if highlights and isinstance(highlights, list):
|
266
268
|
for group in highlights:
|
267
269
|
elements = group.get("elements", [])
|
268
270
|
group_color = group.get("color", color)
|
@@ -0,0 +1,7 @@
|
|
1
|
+
"""Vision module for visual similarity and pattern matching"""
|
2
|
+
|
3
|
+
from .mixin import VisualSearchMixin
|
4
|
+
from .results import Match, MatchResults
|
5
|
+
from .similarity import VisualMatcher, compute_phash
|
6
|
+
|
7
|
+
__all__ = ["VisualMatcher", "compute_phash", "Match", "MatchResults", "VisualSearchMixin"]
|
@@ -0,0 +1,209 @@
|
|
1
|
+
"""Mixin to add visual similarity search to Page/PDF/PDFCollection"""
|
2
|
+
|
3
|
+
from typing import List, Optional, Tuple, Union
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
from PIL import Image
|
7
|
+
from tqdm.auto import tqdm
|
8
|
+
|
9
|
+
from .results import Match, MatchResults
|
10
|
+
from .similarity import VisualMatcher, compute_phash
|
11
|
+
|
12
|
+
|
13
|
+
class VisualSearchMixin:
    """Mixin that adds a ``find_similar`` visual search to Page/PDF/PDFCollection."""

    def find_similar(
        self,
        examples: Union["Element", "Region", List[Union["Element", "Region"]]],
        using: str = "vision",
        confidence: float = 0.6,
        sizes: Optional[Union[float, Tuple, List]] = (0.8, 1.2),
        resolution: int = 72,
        hash_size: int = 20,
        step_factor: float = 0.1,
        max_per_page: Optional[int] = None,
        show_progress: bool = True,
        **kwargs,
    ) -> "MatchResults":
        """
        Find regions visually similar to the given example(s).

        Each example is rendered and hashed once, then every page in scope is
        rendered once and scanned for each template. Match boxes are converted
        from image pixels back to PDF points before being returned.

        Args:
            examples: Single element/region or list of examples to search for
            using: Search method - currently only 'vision' is supported
            confidence: Minimum similarity score (0-1) (default: 0.6)
            sizes: Size variations to search. Can be:
                - float: ±percentage (e.g., 0.2 = 80%-120%)
                - tuple(min, max): search range with smart logarithmic steps (default: (0.8, 1.2))
                - tuple(min, max, step): explicit step size
                - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
            resolution: Resolution for image comparison (DPI) (default: 72)
            hash_size: Size of perceptual hash grid (default: 20)
            step_factor: Step size as fraction of template size (default: 0.1)
            max_per_page: Maximum matches to return per page (highest confidence kept)
            show_progress: Show progress bar for multi-page searches (default: True)
            **kwargs: Additional options forwarded to the matcher

        Returns:
            MatchResults collection

        Raises:
            NotImplementedError: If ``using`` is anything other than 'vision'.
            TypeError: If the host object is not a PDFCollection, does not
                expose ``pages`` and does not expose ``number``.
        """
        if using != "vision":
            raise NotImplementedError(f"using='{using}' not yet supported")

        # Resolve the search scope first so an unsupported host type fails
        # before any example is rendered.
        pages_to_search = self._pages_for_visual_search()

        # Ensure examples is a list
        if not isinstance(examples, list):
            examples = [examples]
        if not examples:
            # Nothing to look for: skip rendering every page for no result.
            return MatchResults([])

        # Initialize matcher with specified hash size
        matcher = VisualMatcher(hash_size=hash_size)

        # Render and hash each example once; reused for every page
        templates = []
        for example in examples:
            example_image = example.render(resolution=resolution, crop=True)
            templates.append(
                {
                    "image": example_image,
                    "hash": compute_phash(example_image, hash_size=hash_size),
                    "source": example,
                }
            )

        # Estimate total sliding-window operations for the progress bar
        total_operations = 0
        if show_progress:
            total_operations = self._estimate_search_windows(
                pages_to_search,
                templates,
                matcher._get_search_scales(sizes),
                resolution,
                step_factor,
            )

        all_matches = []
        total_pages = len(pages_to_search)

        # Single progress bar shared by all pages and templates
        progress_bar = None
        operations_done = 0
        last_update = 0
        update_frequency = max(1, total_operations // 1000)  # Update at most 1000 times

        if show_progress and total_operations > 0:
            progress_bar = tqdm(
                total=total_operations,
                desc="Searching",
                unit="window",
                miniters=update_frequency,  # Minimum iterations between updates
                mininterval=0.1,  # Minimum time between updates (seconds)
            )

        # Page coordinates -> image coordinates (PDF is 72 DPI)
        scale = resolution / 72.0

        try:
            for page_idx, page in enumerate(pages_to_search):
                # Render the full page once
                page_image = page.render(resolution=resolution)

                page_matches = []

                # Search for each template
                for template_idx, template_data in enumerate(templates):

                    # Loop variables are bound as defaults so the callback
                    # never observes a later iteration's values.
                    def update_progress(page_idx=page_idx, template_idx=template_idx):
                        nonlocal operations_done, last_update
                        operations_done += 1

                        # Only update every N operations to avoid overwhelming output
                        if progress_bar is not None and (
                            operations_done - last_update >= update_frequency
                            or operations_done == total_operations
                        ):
                            progress_bar.update(operations_done - last_update)
                            last_update = operations_done

                            # Position within the pages actually searched;
                            # page.number is not that position for
                            # collections or page subsets.
                            if total_pages > 1:
                                progress_bar.set_description(
                                    f"Page {page_idx + 1}/{total_pages}"
                                )
                            elif len(templates) > 1:
                                progress_bar.set_description(
                                    f"Template {template_idx + 1}/{len(templates)}"
                                )

                    # Find matches in this page - never show internal progress
                    candidates = matcher.find_matches_in_image(
                        template_data["image"],
                        page_image,
                        template_hash=template_data["hash"],
                        confidence_threshold=confidence,
                        sizes=sizes,
                        step_factor=step_factor,
                        show_progress=False,  # We handle progress ourselves
                        progress_callback=update_progress if progress_bar is not None else None,
                        **kwargs,
                    )

                    # Convert image pixels back to PDF points (no axis flip)
                    for candidate in candidates:
                        img_x0, img_y0, img_x1, img_y1 = candidate.bbox
                        page_matches.append(
                            Match(
                                page=page,
                                bbox=(
                                    img_x0 / scale,
                                    img_y0 / scale,
                                    img_x1 / scale,
                                    img_y1 / scale,
                                ),
                                confidence=candidate.confidence,
                                source_example=template_data["source"],
                            )
                        )

                # Apply max_per_page limit if specified: keep the top N by confidence
                if max_per_page and len(page_matches) > max_per_page:
                    page_matches.sort(key=lambda m: m.confidence, reverse=True)
                    page_matches = page_matches[:max_per_page]

                all_matches.extend(page_matches)
        finally:
            # Always release the bar, even if rendering or matching raised.
            if progress_bar is not None:
                # The total is only an estimate, so push whatever the
                # throttled callback has not reported yet.
                if operations_done > last_update:
                    progress_bar.update(operations_done - last_update)
                progress_bar.close()

        return MatchResults(all_matches)

    def _pages_for_visual_search(self):
        """Return the pages ``find_similar`` should scan for this host object."""
        # Matched by class name, as the original did, but across the MRO so
        # subclasses of PDFCollection are recognized too.
        if any(cls.__name__ == "PDFCollection" for cls in type(self).__mro__):
            pages = []
            for pdf in self:
                pages.extend(pdf.pages)
            return pages
        if hasattr(self, "pages"):  # PDF
            return self.pages
        if hasattr(self, "number"):  # Single page
            return [self]
        raise TypeError(f"Cannot search in {type(self)}")

    @staticmethod
    def _estimate_search_windows(pages, templates, scales, resolution, step_factor) -> int:
        """Estimate how many sliding-window positions the search will visit."""
        total = 0
        for page in pages:
            # Estimated page image size in pixels at the requested resolution
            page_w = int(page.width * resolution / 72.0)
            page_h = int(page.height * resolution / 72.0)

            for template_data in templates:
                template_w, template_h = template_data["image"].size

                for scale in scales:
                    scaled_w = int(template_w * scale)
                    scaled_h = int(template_h * scale)

                    # Templates larger than the page contribute no windows
                    if scaled_w <= page_w and scaled_h <= page_h:
                        step_x = max(1, int(scaled_w * step_factor))
                        step_y = max(1, int(scaled_h * step_factor))

                        x_windows = len(range(0, page_w - scaled_w + 1, step_x))
                        y_windows = len(range(0, page_h - scaled_h + 1, step_y))
                        total += x_windows * y_windows
        return total
|