natural-pdf 0.2.17__tar.gz → 0.2.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.2.17/natural_pdf.egg-info → natural_pdf-0.2.18}/PKG-INFO +1 -1
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/core/page.py +42 -9
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/spatial.py +42 -39
- {natural_pdf-0.2.17 → natural_pdf-0.2.18/natural_pdf.egg-info}/PKG-INFO +1 -1
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf.egg-info/SOURCES.txt +2 -0
- natural_pdf-0.2.18/tests/demo_multipage.py +56 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_aggregate_selectors.py +2 -2
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_element_exclusions.py +18 -0
- natural_pdf-0.2.18/tests/test_exclusion_recursion_fix.py +46 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_expand_enhanced.py +3 -3
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_pdf_exclusions_in_find_methods.py +13 -5
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/.github/workflows/ci.yml +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/.github/workflows/nightly-tutorials.yml +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/.gitignore +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/CLAUDE.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/LICENSE +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/MANIFEST.in +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/README.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/audit_packaging.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/check_run_md.sh +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/api/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/describe/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/extracting-clean-text/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/fix-messy-tables/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/fix-messy-tables/table_1.csv +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/fix-messy-tables/table_2.csv +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/fix-messy-tables/table_3.csv +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/guide_adjustment_stream.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/guides_boundary_columns.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/installation/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/ocr/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/process-forms-and-invoices/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/quick-reference/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/reflowing-pages/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/regions/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tables/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/tutorials/14-categorizing-documents.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/mkdocs.yml +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/guides.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/classification/manager.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/cli.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/collections/mixins.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/core/element_manager.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/core/page_collection.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/core/page_groupby.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/core/pdf.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/core/pdf_collection.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/core/render_spec.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/describe/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/describe/base.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/describe/elements.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/describe/mixin.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/describe/summary.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/elements/base.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/elements/element_collection.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/elements/image.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/elements/region.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/exporters/hocr.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/exporters/original_pdf.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/flows/element.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/flows/flow.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/flows/region.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/qa/qa_result.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/search/lancedb_search_service.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/search/numpy_search_service.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/selectors/parser.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/tables/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/tables/result.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/text_mixin.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/bidi_mirror.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/color_utils.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/layout.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/pdfminer_patches.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/sections.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/vision/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/vision/mixin.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/vision/results.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/vision/similarity.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/vision/template_matching.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf.egg-info/entry_points.txt +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf.egg-info/requires.txt +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/natural_pdf.egg-info/top_level.txt +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/noxfile.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/optimization/memory_comparison.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/optimization/pdf_analyzer.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/optimization/performance_analysis.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/optimization/performance_results/image_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/optimization/performance_results/text_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/optimization/test_cleanup_methods.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/optimization/test_memory_fix.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/publish.sh +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/pyproject.toml +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/sample-screen.png +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/setup.cfg +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/fix_page_exclusions.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_draw_guides.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_draw_guides_interactive.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_exclusion_with_debug.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_find_exclusions_fix.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_find_exclusions_fix_no_recursion.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_fix_real_pdf.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_fix_working.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_fixed_pdf_exclusions.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_guide_draw_notebook.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_horizontal_top_bottom.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_inline_js.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_marker_order.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_original_exclusions_now_work.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_pdf_exclusions_with_guides.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_region_exclusions_detailed.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_stripes_real_pdf.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_vertical_stripes.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_widget_functionality.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/temp/test_widget_simple.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/conftest.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_annotate.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_arabic_performance.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_arabic_real_world.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_auto_multipage_option.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_color_conversion.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_color_hex_display.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_core/test_containment_geometry.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_core/test_text_layer.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_crop_enhancements.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_crop_region_highlights.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_directional_defaults.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_dissolve.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_dissolve_cross_page_bug.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_dissolve_debug_issue.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_dissolve_real_world_issue.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_dissolve_single_elements.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_dissolve_vertical_offset_issue.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_document_qa.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_element_addition.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_element_collection_guides.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_element_collection_show_cols.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_element_collection_slicing.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_element_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_empty_pseudo_class.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_exclude_multi_page.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_exclude_real_pdf.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_exclusions.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_expand.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_extraction_error.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_extraction_mixin_fix.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_extraction_text_and_vision.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_extraction_working.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_find_similar.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_first_last_selectors.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_fix_get_sections_zero_height.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_flow_region_directional.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_get_sections_fix_comprehensive.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_get_sections_zero_height.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_groupby.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guide_adjustment_stream.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guides.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guides_apply_exclusions.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guides_apply_exclusions_simple.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guides_boundaries.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guides_extract_table.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guides_extract_table_collections.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guides_extract_table_exclusions.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guides_extract_table_real.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guides_from_headers.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guides_from_stripes.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guides_integration.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guides_marker_sorting.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_guides_partial.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_highlight_color_falsy.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_highlight_detection.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_highlight_detection_comprehensive.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_highlight_offset.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_highlight_protocol.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_highlight_protocol_simple.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_highlight_regions.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_horizontal_guides_alignment.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_include_boundaries_comprehensive.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_include_boundaries_final.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_include_boundaries_final_verification.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_include_boundaries_fix.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_include_boundaries_mock.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_include_boundaries_simple.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_include_boundaries_types_pdf.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_include_boundaries_verification.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_include_boundaries_with_real_text.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_match_results_sorting.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_merge_connected.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_merge_connected_real_world.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_merge_method.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_merged_flowregion_specs.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_mixed_collection_rendering.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_multi_page_table_discovery.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_multipage_directional.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_negative_bounds_pdf.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_page_exclusion_lists.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_pdfminer_bug_status.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_pdfminer_color_bug.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_pdfminer_color_stack_bug.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_phash_masking.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_region_find_similar.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_region_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_region_viewer.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_sections_end_only.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_sections_with_start_and_end.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_show_column_layout.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_show_edge_cases.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_show_exclusions.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_show_exclusions_feature.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_show_limit.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_skip_repeating_headers_multipage.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_slice_cache_reuse.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_slice_exclusion_fix.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_slice_exclusion_issue.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_slice_exclusion_mock.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_sliced_collection_exclusions.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_smart_exclusion.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_spatial_offset.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_strikethrough_detection.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_table_result_header_mismatch.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_table_result_keep_blank.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_template_matching.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_template_white_masking.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_tiny_text_tables.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_tiny_text_tables_table.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_tutorials.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_underline_detection.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tests/test_update_text.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/todo/bad_pdf_analysis.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/todo/evaluation.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/README.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/__init__.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/analyser.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/collate_summaries.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/eval_suite.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/llm_enrich.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/reporter.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/tools/bad_pdf_eval/utils.py +0 -0
- {natural_pdf-0.2.17 → natural_pdf-0.2.18}/uv.lock +0 -0
@@ -1,5 +1,6 @@
|
|
1
1
|
import base64
|
2
2
|
import concurrent.futures # Added import
|
3
|
+
import contextlib
|
3
4
|
import hashlib
|
4
5
|
import io
|
5
6
|
import json
|
@@ -275,6 +276,9 @@ class Page(
|
|
275
276
|
self._load_elements()
|
276
277
|
self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}
|
277
278
|
|
279
|
+
# Flag to prevent infinite recursion when computing exclusions
|
280
|
+
self._computing_exclusions = False
|
281
|
+
|
278
282
|
def _get_render_specs(
|
279
283
|
self,
|
280
284
|
mode: Literal["show", "render"] = "show",
|
@@ -412,6 +416,35 @@ class Page(
|
|
412
416
|
self._exclusions = []
|
413
417
|
return self
|
414
418
|
|
419
|
+
@contextlib.contextmanager
|
420
|
+
def without_exclusions(self):
|
421
|
+
"""
|
422
|
+
Context manager that temporarily disables exclusion processing.
|
423
|
+
|
424
|
+
This prevents infinite recursion when exclusion callables themselves
|
425
|
+
use find() operations. While in this context, all find operations
|
426
|
+
will skip exclusion filtering.
|
427
|
+
|
428
|
+
Example:
|
429
|
+
```python
|
430
|
+
# This exclusion would normally cause infinite recursion:
|
431
|
+
page.add_exclusion(lambda p: p.find("text:contains('Header')").expand())
|
432
|
+
|
433
|
+
# But internally, it's safe because we use:
|
434
|
+
with page.without_exclusions():
|
435
|
+
region = exclusion_callable(page)
|
436
|
+
```
|
437
|
+
|
438
|
+
Yields:
|
439
|
+
The page object with exclusions temporarily disabled.
|
440
|
+
"""
|
441
|
+
old_value = self._computing_exclusions
|
442
|
+
self._computing_exclusions = True
|
443
|
+
try:
|
444
|
+
yield self
|
445
|
+
finally:
|
446
|
+
self._computing_exclusions = old_value
|
447
|
+
|
415
448
|
def add_exclusion(
|
416
449
|
self,
|
417
450
|
exclusion_func_or_region: Union[
|
@@ -759,15 +792,10 @@ class Page(
|
|
759
792
|
if debug:
|
760
793
|
print(f" - Evaluating callable '{exclusion_label}'...")
|
761
794
|
|
762
|
-
#
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
# Call the function - Expects it to return a Region or None
|
767
|
-
region_result = exclusion_item(self)
|
768
|
-
|
769
|
-
# Restore exclusions
|
770
|
-
self._exclusions = temp_original_exclusions
|
795
|
+
# Use context manager to prevent infinite recursion
|
796
|
+
with self.without_exclusions():
|
797
|
+
# Call the function - Expects it to return a Region or None
|
798
|
+
region_result = exclusion_item(self)
|
771
799
|
|
772
800
|
if isinstance(region_result, Region):
|
773
801
|
# Assign the label to the returned region
|
@@ -947,6 +975,11 @@ class Page(
|
|
947
975
|
Returns:
|
948
976
|
A new list containing only the elements not excluded.
|
949
977
|
"""
|
978
|
+
# Skip exclusion filtering if we're currently computing exclusions
|
979
|
+
# This prevents infinite recursion when exclusion callables use find operations
|
980
|
+
if self._computing_exclusions:
|
981
|
+
return elements
|
982
|
+
|
950
983
|
# Check both page-level and PDF-level exclusions
|
951
984
|
has_page_exclusions = bool(self._exclusions)
|
952
985
|
has_pdf_exclusions = (
|
@@ -10,11 +10,11 @@ with include_boundaries='none'.
|
|
10
10
|
|
11
11
|
Example:
|
12
12
|
from natural_pdf.utils.spatial import is_element_in_region
|
13
|
-
|
13
|
+
|
14
14
|
# Check if element is in region using center-based logic (default)
|
15
15
|
if is_element_in_region(element, region):
|
16
16
|
print("Element is in region")
|
17
|
-
|
17
|
+
|
18
18
|
# Use different strategies
|
19
19
|
if is_element_in_region(element, region, strategy="intersects"):
|
20
20
|
print("Element overlaps with region")
|
@@ -35,16 +35,16 @@ InclusionStrategy = Literal["center", "intersects", "contains"]
|
|
35
35
|
|
36
36
|
def is_element_in_region(
|
37
37
|
element: "Element",
|
38
|
-
region: "Region",
|
38
|
+
region: "Region",
|
39
39
|
strategy: InclusionStrategy = "center",
|
40
|
-
check_page: bool = True
|
40
|
+
check_page: bool = True,
|
41
41
|
) -> bool:
|
42
42
|
"""
|
43
43
|
Unified function to check if an element is inside a region.
|
44
|
-
|
44
|
+
|
45
45
|
This centralizes the logic used across Region, Page, and Flow to ensure
|
46
46
|
consistent behavior throughout the library.
|
47
|
-
|
47
|
+
|
48
48
|
Args:
|
49
49
|
element: The element to check
|
50
50
|
region: The region to check against
|
@@ -53,7 +53,7 @@ def is_element_in_region(
|
|
53
53
|
- "intersects": Element belongs if any part overlaps
|
54
54
|
- "contains": Element belongs only if fully contained
|
55
55
|
check_page: Whether to verify element and region are on the same page
|
56
|
-
|
56
|
+
|
57
57
|
Returns:
|
58
58
|
bool: True if element is in region according to the strategy
|
59
59
|
"""
|
@@ -61,18 +61,18 @@ def is_element_in_region(
|
|
61
61
|
if not hasattr(element, "bbox") or not element.bbox:
|
62
62
|
logger.debug(f"Element lacks bbox attributes: {element}")
|
63
63
|
return False
|
64
|
-
|
64
|
+
|
65
65
|
if not hasattr(region, "bbox") or not region.bbox:
|
66
66
|
logger.debug(f"Region lacks bbox attributes: {region}")
|
67
67
|
return False
|
68
|
-
|
68
|
+
|
69
69
|
# Check page membership if requested
|
70
70
|
if check_page:
|
71
71
|
if not hasattr(element, "page") or not hasattr(region, "page"):
|
72
72
|
return False
|
73
73
|
if element.page != region.page:
|
74
74
|
return False
|
75
|
-
|
75
|
+
|
76
76
|
# Apply the appropriate strategy
|
77
77
|
if strategy == "center":
|
78
78
|
# Use existing region method if available
|
@@ -82,37 +82,43 @@ def is_element_in_region(
|
|
82
82
|
# Fallback calculation
|
83
83
|
elem_center_x = (element.x0 + element.x1) / 2
|
84
84
|
elem_center_y = (element.top + element.bottom) / 2
|
85
|
-
|
85
|
+
|
86
86
|
# Use region's is_point_inside if available
|
87
87
|
if hasattr(region, "is_point_inside"):
|
88
88
|
return region.is_point_inside(elem_center_x, elem_center_y)
|
89
89
|
else:
|
90
90
|
# Simple bounds check
|
91
|
-
return (
|
92
|
-
|
93
|
-
|
91
|
+
return (
|
92
|
+
region.x0 <= elem_center_x <= region.x1
|
93
|
+
and region.top <= elem_center_y <= region.bottom
|
94
|
+
)
|
95
|
+
|
94
96
|
elif strategy == "intersects":
|
95
97
|
# Use existing region method if available
|
96
98
|
if hasattr(region, "intersects"):
|
97
99
|
return region.intersects(element)
|
98
100
|
else:
|
99
101
|
# Simple bbox overlap check
|
100
|
-
return not (
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
102
|
+
return not (
|
103
|
+
element.x1 < region.x0
|
104
|
+
or element.x0 > region.x1
|
105
|
+
or element.bottom < region.top
|
106
|
+
or element.top > region.bottom
|
107
|
+
)
|
108
|
+
|
105
109
|
elif strategy == "contains":
|
106
110
|
# Use existing region method if available
|
107
111
|
if hasattr(region, "contains"):
|
108
112
|
return region.contains(element)
|
109
113
|
else:
|
110
114
|
# Simple full containment check
|
111
|
-
return (
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
115
|
+
return (
|
116
|
+
region.x0 <= element.x0
|
117
|
+
and element.x1 <= region.x1
|
118
|
+
and region.top <= element.top
|
119
|
+
and element.bottom <= region.bottom
|
120
|
+
)
|
121
|
+
|
116
122
|
else:
|
117
123
|
raise ValueError(f"Unknown inclusion strategy: {strategy}")
|
118
124
|
|
@@ -120,10 +126,10 @@ def is_element_in_region(
|
|
120
126
|
def get_inclusion_strategy() -> InclusionStrategy:
|
121
127
|
"""
|
122
128
|
Get the current global inclusion strategy.
|
123
|
-
|
129
|
+
|
124
130
|
This could be made configurable via environment variable or settings.
|
125
131
|
For now, returns the default strategy.
|
126
|
-
|
132
|
+
|
127
133
|
Returns:
|
128
134
|
The current inclusion strategy (default: "center")
|
129
135
|
"""
|
@@ -132,38 +138,35 @@ def get_inclusion_strategy() -> InclusionStrategy:
|
|
132
138
|
return "center"
|
133
139
|
|
134
140
|
|
135
|
-
def calculate_element_overlap_percentage(
|
136
|
-
element: "Element",
|
137
|
-
region: "Region"
|
138
|
-
) -> float:
|
141
|
+
def calculate_element_overlap_percentage(element: "Element", region: "Region") -> float:
|
139
142
|
"""
|
140
143
|
Calculate what percentage of an element overlaps with a region.
|
141
|
-
|
144
|
+
|
142
145
|
Args:
|
143
146
|
element: The element to check
|
144
147
|
region: The region to check against
|
145
|
-
|
148
|
+
|
146
149
|
Returns:
|
147
150
|
float: Percentage of element area that overlaps with region (0.0 to 1.0)
|
148
151
|
"""
|
149
152
|
if not hasattr(element, "bbox") or not hasattr(region, "bbox"):
|
150
153
|
return 0.0
|
151
|
-
|
154
|
+
|
152
155
|
# Calculate intersection bounds
|
153
156
|
intersect_x0 = max(element.x0, region.x0)
|
154
|
-
intersect_y0 = max(element.top, region.top)
|
157
|
+
intersect_y0 = max(element.top, region.top)
|
155
158
|
intersect_x1 = min(element.x1, region.x1)
|
156
159
|
intersect_y1 = min(element.bottom, region.bottom)
|
157
|
-
|
160
|
+
|
158
161
|
# Check if there's an intersection
|
159
162
|
if intersect_x1 <= intersect_x0 or intersect_y1 <= intersect_y0:
|
160
163
|
return 0.0
|
161
|
-
|
164
|
+
|
162
165
|
# Calculate areas
|
163
166
|
element_area = (element.x1 - element.x0) * (element.bottom - element.top)
|
164
167
|
if element_area == 0:
|
165
168
|
return 0.0
|
166
|
-
|
169
|
+
|
167
170
|
intersect_area = (intersect_x1 - intersect_x0) * (intersect_y1 - intersect_y0)
|
168
|
-
|
169
|
-
return intersect_area / element_area
|
171
|
+
|
172
|
+
return intersect_area / element_area
|
@@ -230,6 +230,7 @@ temp/test_vertical_stripes.py
|
|
230
230
|
temp/test_widget_functionality.py
|
231
231
|
temp/test_widget_simple.py
|
232
232
|
tests/conftest.py
|
233
|
+
tests/demo_multipage.py
|
233
234
|
tests/test_aggregate_selectors.py
|
234
235
|
tests/test_annotate.py
|
235
236
|
tests/test_arabic_performance.py
|
@@ -256,6 +257,7 @@ tests/test_element_show_crop_highlights.py
|
|
256
257
|
tests/test_empty_pseudo_class.py
|
257
258
|
tests/test_exclude_multi_page.py
|
258
259
|
tests/test_exclude_real_pdf.py
|
260
|
+
tests/test_exclusion_recursion_fix.py
|
259
261
|
tests/test_exclusions.py
|
260
262
|
tests/test_expand.py
|
261
263
|
tests/test_expand_enhanced.py
|
@@ -0,0 +1,56 @@
|
|
1
|
+
"""Demo script showing multipage directional navigation."""
|
2
|
+
|
3
|
+
import natural_pdf as npdf
|
4
|
+
from natural_pdf import PDF
|
5
|
+
|
6
|
+
|
7
|
+
def main():
|
8
|
+
"""Demonstrate multipage directional navigation."""
|
9
|
+
pdf = PDF("pdfs/sections.pdf")
|
10
|
+
|
11
|
+
print("=== Multipage Directional Navigation Demo ===\n")
|
12
|
+
|
13
|
+
# Find Section 1 on page 1
|
14
|
+
section1 = pdf.pages[0].find("text:contains(Section 1)")
|
15
|
+
print(f"Found Section 1 on page {section1.page.number}")
|
16
|
+
|
17
|
+
# Without multipage - stops at page boundary
|
18
|
+
print("\n1. Without multipage=True:")
|
19
|
+
result = section1.below(until="text:contains(Section 6)")
|
20
|
+
print(f" Result type: {type(result).__name__}")
|
21
|
+
print(f" Result on page: {result.page.number}")
|
22
|
+
print(f" Text excerpt: {result.extract_text()[:50]}...")
|
23
|
+
|
24
|
+
# With multipage=True - crosses page boundary
|
25
|
+
print("\n2. With multipage=True:")
|
26
|
+
result = section1.below(until="text:contains(Section 6)", multipage=True)
|
27
|
+
print(f" Result type: {type(result).__name__}")
|
28
|
+
if hasattr(result, "constituent_regions"):
|
29
|
+
print(f" Spans {len(result.constituent_regions)} pages")
|
30
|
+
text = result.extract_text()
|
31
|
+
print(f" Contains 'Section 6': {'Section 6' in text}")
|
32
|
+
|
33
|
+
# Using global option
|
34
|
+
print("\n3. Using global auto_multipage option:")
|
35
|
+
original = npdf.options.layout.auto_multipage
|
36
|
+
npdf.set_option("layout.auto_multipage", True)
|
37
|
+
|
38
|
+
result = section1.below(until="text:contains(Section 6)") # No multipage param needed!
|
39
|
+
print(f" Result type: {type(result).__name__}")
|
40
|
+
text = result.extract_text()
|
41
|
+
print(f" Contains 'Section 6': {'Section 6' in text}")
|
42
|
+
|
43
|
+
# Restore original setting
|
44
|
+
npdf.options.layout.auto_multipage = original
|
45
|
+
|
46
|
+
# Example of above() with multipage
|
47
|
+
print("\n4. Using above() with multipage:")
|
48
|
+
section6 = pdf.pages[1].find("text:contains(Section 6)")
|
49
|
+
result = section6.above(multipage=True)
|
50
|
+
print(f" Result type: {type(result).__name__}")
|
51
|
+
if hasattr(result, "constituent_regions"):
|
52
|
+
print(f" Spans {len(result.constituent_regions)} pages")
|
53
|
+
|
54
|
+
|
55
|
+
if __name__ == "__main__":
|
56
|
+
main()
|
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
import pytest
|
4
4
|
|
5
|
-
from natural_pdf import
|
5
|
+
from natural_pdf import PDF
|
6
6
|
from natural_pdf.elements.element_collection import ElementCollection
|
7
7
|
|
8
8
|
|
@@ -13,7 +13,7 @@ class TestAggregateSelectors:
|
|
13
13
|
def sample_pdf(self):
|
14
14
|
"""Create a sample PDF for testing."""
|
15
15
|
# This assumes we have a test PDF with various text sizes and positions
|
16
|
-
pdf =
|
16
|
+
pdf = PDF("tests/fixtures/sample.pdf")
|
17
17
|
return pdf
|
18
18
|
|
19
19
|
def test_min_max_coordinates(self, sample_pdf):
|
@@ -17,6 +17,15 @@ def test_callable_exclusion_returning_element():
|
|
17
17
|
mock_page._exclusions = []
|
18
18
|
mock_page._parent = None
|
19
19
|
|
20
|
+
# Add context manager support to mock
|
21
|
+
from contextlib import contextmanager
|
22
|
+
|
23
|
+
@contextmanager
|
24
|
+
def mock_without_exclusions():
|
25
|
+
yield mock_page
|
26
|
+
|
27
|
+
mock_page.without_exclusions = mock_without_exclusions
|
28
|
+
|
20
29
|
# Create mock element with expand() method
|
21
30
|
mock_element = Mock()
|
22
31
|
mock_element.bbox = (100, 200, 300, 400)
|
@@ -114,6 +123,15 @@ def test_pdf_level_element_exclusions():
|
|
114
123
|
mock_page._exclusions = []
|
115
124
|
mock_page._parent = mock_pdf
|
116
125
|
|
126
|
+
# Add context manager support to mock
|
127
|
+
from contextlib import contextmanager
|
128
|
+
|
129
|
+
@contextmanager
|
130
|
+
def mock_without_exclusions():
|
131
|
+
yield mock_page
|
132
|
+
|
133
|
+
mock_page.without_exclusions = mock_without_exclusions
|
134
|
+
|
117
135
|
# Call _get_exclusion_regions
|
118
136
|
from natural_pdf.core.page import Page
|
119
137
|
|
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""Test the exclusion recursion fix."""
|
3
|
+
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
|
7
|
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
8
|
+
|
9
|
+
import natural_pdf as npdf
|
10
|
+
|
11
|
+
|
12
|
+
def test_exclusion_with_find():
|
13
|
+
"""Test that exclusions using find() don't cause infinite recursion."""
|
14
|
+
# Create a simple test PDF
|
15
|
+
pdf = npdf.PDF("pdfs/sections.pdf")
|
16
|
+
page = pdf.pages[0]
|
17
|
+
|
18
|
+
# This should NOT cause infinite recursion anymore
|
19
|
+
# Use safer lambdas that handle None returns
|
20
|
+
page.add_exclusion(
|
21
|
+
lambda p: (
|
22
|
+
p.find("text:contains('Section')").above()
|
23
|
+
if p.find("text:contains('Section')")
|
24
|
+
else None
|
25
|
+
)
|
26
|
+
)
|
27
|
+
page.add_exclusion(lambda p: p.find("text").expand() if p.find("text") else None)
|
28
|
+
|
29
|
+
# Try to extract text - this should work without recursion
|
30
|
+
text = page.extract_text()
|
31
|
+
print(f"Successfully extracted {len(text)} characters")
|
32
|
+
|
33
|
+
# Try finding elements - this should also work
|
34
|
+
elements = page.find_all("text")
|
35
|
+
print(f"Found {len(elements)} text elements after applying exclusions")
|
36
|
+
|
37
|
+
# Test with ElementCollection return
|
38
|
+
page.add_exclusion(lambda p: p.find_all("text:contains('Header')"))
|
39
|
+
text2 = page.extract_text()
|
40
|
+
print(f"Successfully extracted {len(text2)} characters with ElementCollection exclusion")
|
41
|
+
|
42
|
+
print("✅ All tests passed - no infinite recursion!")
|
43
|
+
|
44
|
+
|
45
|
+
if __name__ == "__main__":
|
46
|
+
test_exclusion_with_find()
|
@@ -76,7 +76,7 @@ def test_expand_with_selectors():
|
|
76
76
|
# Test expanding right until "Repeat?" (excluding)
|
77
77
|
expanded = statute.expand(right='text:contains("Repeat?")')
|
78
78
|
assert expanded.x0 == statute.x0
|
79
|
-
assert expanded.x1 == repeat.x0 # Should stop
|
79
|
+
assert expanded.x1 == repeat.x0 - 0.01 # Should stop just before "Repeat?" with default offset
|
80
80
|
assert expanded.top == statute.top
|
81
81
|
assert expanded.bottom == statute.bottom
|
82
82
|
|
@@ -99,7 +99,7 @@ def test_expand_with_selectors_not_found():
|
|
99
99
|
# Test with selector that won't match anything
|
100
100
|
expanded = element.expand(right='text:contains("NonExistentText")')
|
101
101
|
assert expanded.x0 == element.x0
|
102
|
-
assert expanded.x1 ==
|
102
|
+
assert expanded.x1 == page.width # Should expand to page edge when selector not found
|
103
103
|
assert expanded.top == element.top
|
104
104
|
assert expanded.bottom == element.bottom
|
105
105
|
|
@@ -127,7 +127,7 @@ def test_expand_mixed_parameters():
|
|
127
127
|
# The right edge should be at "Repeat?" if found
|
128
128
|
repeat = page.find('text:contains("Repeat?")')
|
129
129
|
if repeat and repeat.x0 > element.x1:
|
130
|
-
assert expanded.x1 == repeat.x0
|
130
|
+
assert expanded.x1 == repeat.x0 - 0.01 # With default offset
|
131
131
|
|
132
132
|
|
133
133
|
def test_expand_with_factors():
|
@@ -59,10 +59,12 @@ def test_find_with_pdf_exclusions():
|
|
59
59
|
|
60
60
|
# Create a real page instance but with our mocked attributes
|
61
61
|
page = Page.__new__(Page)
|
62
|
-
page.
|
62
|
+
page._index = 0 # Set the internal _index attribute
|
63
63
|
page._exclusions = []
|
64
64
|
page._parent = mock_pdf
|
65
65
|
page._page_obj = Mock()
|
66
|
+
page._computing_exclusions = False # Add the new flag
|
67
|
+
page._computing_exclusions = False # Add the new flag
|
66
68
|
|
67
69
|
# Mock the internal methods we need
|
68
70
|
page._get_exclusion_regions = Mock(return_value=[mock_region])
|
@@ -113,10 +115,11 @@ def test_find_all_with_pdf_exclusions():
|
|
113
115
|
|
114
116
|
# Create a real page instance
|
115
117
|
page = Page.__new__(Page)
|
116
|
-
page.
|
118
|
+
page._index = 0 # Set the internal _index attribute
|
117
119
|
page._exclusions = []
|
118
120
|
page._parent = mock_pdf
|
119
121
|
page._page_obj = Mock()
|
122
|
+
page._computing_exclusions = False # Add the new flag
|
120
123
|
|
121
124
|
# Mock the methods
|
122
125
|
mock_collection = Mock(spec=ElementCollection)
|
@@ -142,17 +145,22 @@ def test_get_elements_with_pdf_exclusions():
|
|
142
145
|
|
143
146
|
# Create a real page instance
|
144
147
|
page = Page.__new__(Page)
|
145
|
-
page.
|
148
|
+
page._index = 0 # Set the internal _index attribute
|
146
149
|
page._exclusions = [] # Empty page exclusions
|
147
150
|
page._parent = mock_pdf
|
148
151
|
page._page_obj = Mock()
|
152
|
+
page._computing_exclusions = False # Add the new flag
|
149
153
|
|
150
154
|
# Mock elements
|
151
155
|
all_elements = [Mock(), Mock(), Mock()]
|
152
156
|
filtered_elements = [all_elements[0], all_elements[2]] # Exclude middle one
|
153
157
|
|
154
|
-
# Mock the
|
155
|
-
|
158
|
+
# Mock the element manager
|
159
|
+
mock_element_mgr = Mock()
|
160
|
+
mock_element_mgr.get_all_elements = Mock(return_value=all_elements)
|
161
|
+
page._element_mgr = mock_element_mgr
|
162
|
+
|
163
|
+
# Mock the filter method
|
156
164
|
page._filter_elements_by_exclusions = Mock(return_value=filtered_elements)
|
157
165
|
|
158
166
|
# Test get_elements() with apply_exclusions=True
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|