natural-pdf 0.2.6__tar.gz → 0.2.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.2.6/natural_pdf.egg-info → natural_pdf-0.2.9}/PKG-INFO +1 -1
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/page.py +114 -18
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/page_collection.py +41 -19
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/pdf.py +14 -14
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/element_collection.py +62 -15
- {natural_pdf-0.2.6 → natural_pdf-0.2.9/natural_pdf.egg-info}/PKG-INFO +1 -1
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_color_hex_display.py +4 -3
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_fix_get_sections_zero_height.py +4 -2
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_get_sections_fix_comprehensive.py +7 -4
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_extract_table.py +1 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_extract_table_collections.py +2 -2
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_mock.py +45 -34
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_merge_method.py +8 -6
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_sections_with_start_and_end.py +13 -4
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_slice_cache_reuse.py +27 -12
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_slice_exclusion_mock.py +12 -12
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.github/workflows/ci.yml +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.github/workflows/nightly-tutorials.yml +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.gitignore +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/CLAUDE.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/LICENSE +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/MANIFEST.in +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/README.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/audit_packaging.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/check_run_md.sh +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/api/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/describe/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/extracting-clean-text/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/fix-messy-tables/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/fix-messy-tables/table_1.csv +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/fix-messy-tables/table_2.csv +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/fix-messy-tables/table_3.csv +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/installation/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/ocr/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/process-forms-and-invoices/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/quick-reference/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/reflowing-pages/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/regions/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tables/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/14-categorizing-documents.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/mkdocs.yml +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/guides.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/classification/manager.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/cli.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/collections/mixins.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/element_manager.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/page_groupby.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/pdf_collection.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/render_spec.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/describe/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/describe/base.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/describe/elements.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/describe/mixin.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/describe/summary.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/base.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/image.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/region.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/hocr.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/original_pdf.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/flows/element.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/flows/flow.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/flows/region.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/qa/qa_result.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/search/lancedb_search_service.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/search/numpy_search_service.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/selectors/parser.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/tables/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/tables/result.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/text_mixin.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/bidi_mirror.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/color_utils.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/layout.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/vision/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/vision/mixin.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/vision/results.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/vision/similarity.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf.egg-info/SOURCES.txt +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf.egg-info/entry_points.txt +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf.egg-info/requires.txt +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf.egg-info/top_level.txt +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/noxfile.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/memory_comparison.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/pdf_analyzer.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/performance_analysis.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/performance_results/image_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/performance_results/text_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/test_cleanup_methods.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/test_memory_fix.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/publish.sh +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/pyproject.toml +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/sample-screen.png +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/setup.cfg +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/conftest.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_annotate.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_arabic_performance.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_arabic_real_world.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_color_conversion.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_core/test_containment_geometry.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_core/test_text_layer.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_crop_enhancements.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_crop_region_highlights.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_directional_defaults.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_dissolve.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_dissolve_cross_page_bug.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_dissolve_debug_issue.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_dissolve_real_world_issue.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_dissolve_single_elements.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_dissolve_vertical_offset_issue.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_document_qa.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_element_addition.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_element_collection_show_cols.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_element_collection_slicing.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_element_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_empty_pseudo_class.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_exclusions.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_expand.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_extraction_error.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_extraction_mixin_fix.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_extraction_text_and_vision.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_extraction_working.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_find_similar.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_first_last_selectors.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_flow_region_directional.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_get_sections_zero_height.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_groupby.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_apply_exclusions.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_apply_exclusions_simple.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_extract_table_exclusions.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_extract_table_real.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_integration.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_highlight_detection.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_highlight_detection_comprehensive.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_highlight_protocol.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_highlight_protocol_simple.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_highlight_regions.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_comprehensive.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_debug.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_final.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_final_verification.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_fix.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_simple.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_types_pdf.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_verification.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_with_real_text.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_merge_connected.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_merge_connected_real_world.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_multi_page_table_discovery.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_page_exclusion_lists.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_region_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_region_viewer.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_sections_end_only.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_show_column_layout.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_show_edge_cases.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_show_exclusions.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_show_exclusions_feature.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_show_limit.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_skip_repeating_headers_multipage.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_slice_exclusion_fix.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_slice_exclusion_issue.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_sliced_collection_exclusions.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_strikethrough_detection.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_table_result_header_mismatch.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_table_result_keep_blank.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_tiny_text_tables.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_tiny_text_tables_table.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_tutorials.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_underline_detection.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_update_text.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/todo/bad_pdf_analysis.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/todo/evaluation.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/README.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/__init__.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/analyser.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/collate_summaries.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/eval_suite.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/llm_enrich.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/reporter.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/utils.py +0 -0
- {natural_pdf-0.2.6 → natural_pdf-0.2.9}/uv.lock +0 -0
@@ -717,14 +717,23 @@ class Page(
|
|
717
717
|
|
718
718
|
# Add PDF-level exclusions if we have a parent PDF
|
719
719
|
if hasattr(self, "_parent") and self._parent and hasattr(self._parent, "_exclusions"):
|
720
|
+
# Get existing labels to check for duplicates
|
721
|
+
existing_labels = set()
|
722
|
+
for exc in all_exclusions:
|
723
|
+
if len(exc) >= 2 and exc[1]: # Has a label
|
724
|
+
existing_labels.add(exc[1])
|
725
|
+
|
720
726
|
for pdf_exclusion in self._parent._exclusions:
|
721
|
-
# Check if this exclusion is already in our list (avoid duplicates)
|
722
|
-
if pdf_exclusion
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
727
|
+
# Check if this exclusion label is already in our list (avoid duplicates)
|
728
|
+
label = pdf_exclusion[1] if len(pdf_exclusion) >= 2 else None
|
729
|
+
if label and label in existing_labels:
|
730
|
+
continue # Skip this exclusion as it's already been applied
|
731
|
+
|
732
|
+
# Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
|
733
|
+
if len(pdf_exclusion) == 2:
|
734
|
+
# Convert to 3-tuple format with default method
|
735
|
+
pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
|
736
|
+
all_exclusions.append(pdf_exclusion)
|
728
737
|
|
729
738
|
if debug:
|
730
739
|
print(
|
@@ -829,6 +838,36 @@ class Page(
|
|
829
838
|
regions.append(exclusion_item) # Label is already on the Region object
|
830
839
|
if debug:
|
831
840
|
print(f" - Added direct region '{label}': {exclusion_item}")
|
841
|
+
|
842
|
+
# Process string selectors (from PDF-level exclusions)
|
843
|
+
elif isinstance(exclusion_item, str):
|
844
|
+
selector_str = exclusion_item
|
845
|
+
matching_elements = self.find_all(selector_str, apply_exclusions=False)
|
846
|
+
|
847
|
+
if debug:
|
848
|
+
print(
|
849
|
+
f" - Evaluating selector '{exclusion_label}': found {len(matching_elements)} elements"
|
850
|
+
)
|
851
|
+
|
852
|
+
if method == "region":
|
853
|
+
# Convert each matching element to a region
|
854
|
+
for el in matching_elements:
|
855
|
+
try:
|
856
|
+
bbox_coords = (
|
857
|
+
float(el.x0),
|
858
|
+
float(el.top),
|
859
|
+
float(el.x1),
|
860
|
+
float(el.bottom),
|
861
|
+
)
|
862
|
+
region = Region(self, bbox_coords, label=label)
|
863
|
+
regions.append(region)
|
864
|
+
if debug:
|
865
|
+
print(f" ✓ Added region from selector match: {bbox_coords}")
|
866
|
+
except Exception as e:
|
867
|
+
if debug:
|
868
|
+
print(f" ✗ Failed to create region from element: {e}")
|
869
|
+
# If method is "element", it will be handled in _filter_elements_by_exclusions
|
870
|
+
|
832
871
|
# Element-based exclusions are not converted to regions here
|
833
872
|
# They will be handled separately in _filter_elements_by_exclusions
|
834
873
|
|
@@ -852,7 +891,16 @@ class Page(
|
|
852
891
|
Returns:
|
853
892
|
A new list containing only the elements not excluded.
|
854
893
|
"""
|
855
|
-
|
894
|
+
# Check both page-level and PDF-level exclusions
|
895
|
+
has_page_exclusions = bool(self._exclusions)
|
896
|
+
has_pdf_exclusions = (
|
897
|
+
hasattr(self, "_parent")
|
898
|
+
and self._parent
|
899
|
+
and hasattr(self._parent, "_exclusions")
|
900
|
+
and bool(self._parent._exclusions)
|
901
|
+
)
|
902
|
+
|
903
|
+
if not has_page_exclusions and not has_pdf_exclusions:
|
856
904
|
if debug_exclusions:
|
857
905
|
print(
|
858
906
|
f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements."
|
@@ -865,9 +913,15 @@ class Page(
|
|
865
913
|
)
|
866
914
|
|
867
915
|
# Collect element-based exclusions
|
868
|
-
|
916
|
+
# Store element bboxes for comparison instead of object ids
|
917
|
+
excluded_element_bboxes = set() # Use set for O(1) lookup
|
918
|
+
|
919
|
+
# Process both page-level and PDF-level exclusions
|
920
|
+
all_exclusions = list(self._exclusions) if has_page_exclusions else []
|
921
|
+
if has_pdf_exclusions:
|
922
|
+
all_exclusions.extend(self._parent._exclusions)
|
869
923
|
|
870
|
-
for exclusion_data in
|
924
|
+
for exclusion_data in all_exclusions:
|
871
925
|
# Handle both old format (2-tuple) and new format (3-tuple)
|
872
926
|
if len(exclusion_data) == 2:
|
873
927
|
exclusion_item, label = exclusion_data
|
@@ -883,16 +937,31 @@ class Page(
|
|
883
937
|
if isinstance(exclusion_item, Region):
|
884
938
|
continue
|
885
939
|
|
940
|
+
# Handle string selectors for element-based exclusions
|
941
|
+
if isinstance(exclusion_item, str) and method == "element":
|
942
|
+
selector_str = exclusion_item
|
943
|
+
matching_elements = self.find_all(selector_str, apply_exclusions=False)
|
944
|
+
for el in matching_elements:
|
945
|
+
if hasattr(el, "bbox"):
|
946
|
+
bbox = tuple(el.bbox)
|
947
|
+
excluded_element_bboxes.add(bbox)
|
948
|
+
if debug_exclusions:
|
949
|
+
print(
|
950
|
+
f" - Added element exclusion from selector '{selector_str}': {bbox}"
|
951
|
+
)
|
952
|
+
|
886
953
|
# Handle element-based exclusions
|
887
|
-
|
888
|
-
|
954
|
+
elif method == "element" and hasattr(exclusion_item, "bbox"):
|
955
|
+
# Store bbox tuple for comparison
|
956
|
+
bbox = tuple(exclusion_item.bbox)
|
957
|
+
excluded_element_bboxes.add(bbox)
|
889
958
|
if debug_exclusions:
|
890
|
-
print(f" - Added element exclusion: {exclusion_item}")
|
959
|
+
print(f" - Added element exclusion with bbox {bbox}: {exclusion_item}")
|
891
960
|
|
892
961
|
if debug_exclusions:
|
893
962
|
print(
|
894
963
|
f"Page {self.index}: Applying {len(exclusion_regions)} region exclusions "
|
895
|
-
f"and {len(
|
964
|
+
f"and {len(excluded_element_bboxes)} element exclusions to {len(elements)} elements."
|
896
965
|
)
|
897
966
|
|
898
967
|
filtered_elements = []
|
@@ -903,7 +972,7 @@ class Page(
|
|
903
972
|
exclude = False
|
904
973
|
|
905
974
|
# Check element-based exclusions first (faster)
|
906
|
-
if
|
975
|
+
if hasattr(element, "bbox") and tuple(element.bbox) in excluded_element_bboxes:
|
907
976
|
exclude = True
|
908
977
|
element_excluded_count += 1
|
909
978
|
if debug_exclusions:
|
@@ -2487,10 +2556,23 @@ class Page(
|
|
2487
2556
|
return self
|
2488
2557
|
|
2489
2558
|
def get_section_between(
|
2490
|
-
self,
|
2559
|
+
self,
|
2560
|
+
start_element=None,
|
2561
|
+
end_element=None,
|
2562
|
+
include_boundaries="both",
|
2563
|
+
orientation="vertical",
|
2491
2564
|
) -> Optional["Region"]: # Return Optional
|
2492
2565
|
"""
|
2493
2566
|
Get a section between two elements on this page.
|
2567
|
+
|
2568
|
+
Args:
|
2569
|
+
start_element: Element marking the start of the section
|
2570
|
+
end_element: Element marking the end of the section
|
2571
|
+
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
2572
|
+
orientation: 'vertical' (default) or 'horizontal' - determines section direction
|
2573
|
+
|
2574
|
+
Returns:
|
2575
|
+
Region representing the section
|
2494
2576
|
"""
|
2495
2577
|
# Create a full-page region to operate within
|
2496
2578
|
page_region = self.create_region(0, 0, self.width, self.height)
|
@@ -2501,6 +2583,7 @@ class Page(
|
|
2501
2583
|
start_element=start_element,
|
2502
2584
|
end_element=end_element,
|
2503
2585
|
include_boundaries=include_boundaries,
|
2586
|
+
orientation=orientation,
|
2504
2587
|
)
|
2505
2588
|
except Exception as e:
|
2506
2589
|
logger.error(
|
@@ -2575,10 +2658,23 @@ class Page(
|
|
2575
2658
|
if include_boundaries not in valid_inclusions:
|
2576
2659
|
raise ValueError(f"include_boundaries must be one of {valid_inclusions}")
|
2577
2660
|
|
2578
|
-
if not start_elements:
|
2579
|
-
# Return an empty ElementCollection if no
|
2661
|
+
if not start_elements and not end_elements:
|
2662
|
+
# Return an empty ElementCollection if no boundary elements at all
|
2580
2663
|
return ElementCollection([])
|
2581
2664
|
|
2665
|
+
# If we only have end elements, create implicit start elements
|
2666
|
+
if not start_elements and end_elements:
|
2667
|
+
# Delegate to PageCollection implementation for consistency
|
2668
|
+
from natural_pdf.core.page_collection import PageCollection
|
2669
|
+
|
2670
|
+
pages = PageCollection([self])
|
2671
|
+
return pages.get_sections(
|
2672
|
+
start_elements=start_elements,
|
2673
|
+
end_elements=end_elements,
|
2674
|
+
include_boundaries=include_boundaries,
|
2675
|
+
orientation=orientation,
|
2676
|
+
)
|
2677
|
+
|
2582
2678
|
# Combine start and end elements with their type
|
2583
2679
|
all_boundaries = []
|
2584
2680
|
for el in start_elements:
|
@@ -537,10 +537,14 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
537
537
|
first_page = self.pages[0]
|
538
538
|
first_start = Region(first_page, (0, 0, first_page.width, 1))
|
539
539
|
first_start.is_implicit_start = True
|
540
|
+
# Don't mark this as created from any end element, so it can pair with any end
|
540
541
|
start_elements.append(first_start)
|
541
542
|
|
542
543
|
# For each end element (except the last), add an implicit start after it
|
543
|
-
|
544
|
+
# Sort by page, then top, then bottom (for elements with same top), then x0
|
545
|
+
sorted_end_elements = sorted(
|
546
|
+
end_elements, key=lambda e: (e.page.index, e.top, e.bottom, e.x0)
|
547
|
+
)
|
544
548
|
for i, end_elem in enumerate(sorted_end_elements[:-1]): # Exclude last end element
|
545
549
|
# Create implicit start element right after this end element
|
546
550
|
implicit_start = Region(
|
@@ -838,29 +842,47 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
838
842
|
# Create a section from current_start to just before this boundary
|
839
843
|
start_element = current_start["element"]
|
840
844
|
|
841
|
-
#
|
845
|
+
# Create section from current start to just before this new start
|
842
846
|
if start_element.page == boundary["element"].page:
|
843
|
-
|
844
|
-
|
845
|
-
|
847
|
+
from natural_pdf.elements.region import Region
|
848
|
+
|
849
|
+
next_start = boundary["element"]
|
850
|
+
|
851
|
+
# Create section based on orientation
|
846
852
|
if orientation == "vertical":
|
847
|
-
|
853
|
+
# Determine vertical bounds
|
854
|
+
if include_boundaries in ["start", "both"]:
|
855
|
+
top = start_element.top
|
856
|
+
else:
|
857
|
+
top = start_element.bottom
|
858
|
+
|
859
|
+
# The section ends just before the next start
|
860
|
+
bottom = next_start.top
|
861
|
+
|
862
|
+
# Create the section with full page width
|
863
|
+
if top < bottom:
|
864
|
+
section = Region(
|
865
|
+
start_element.page, (0, top, start_element.page.width, bottom)
|
866
|
+
)
|
867
|
+
section.start_element = start_element
|
868
|
+
sections.append(section)
|
848
869
|
else: # horizontal
|
849
|
-
|
870
|
+
# Determine horizontal bounds
|
871
|
+
if include_boundaries in ["start", "both"]:
|
872
|
+
left = start_element.x0
|
873
|
+
else:
|
874
|
+
left = start_element.x1
|
850
875
|
|
851
|
-
|
852
|
-
|
853
|
-
page_elements.index(boundary["element"]) - 1
|
854
|
-
if boundary["element"] in page_elements
|
855
|
-
else -1
|
856
|
-
)
|
857
|
-
end_element = page_elements[end_idx] if end_idx >= 0 else None
|
876
|
+
# The section ends just before the next start
|
877
|
+
right = next_start.x0
|
858
878
|
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
879
|
+
# Create the section with full page height
|
880
|
+
if left < right:
|
881
|
+
section = Region(
|
882
|
+
start_element.page, (left, 0, right, start_element.page.height)
|
883
|
+
)
|
884
|
+
section.start_element = start_element
|
885
|
+
sections.append(section)
|
864
886
|
else:
|
865
887
|
# Cross-page section - create from current_start to the end of its page
|
866
888
|
from natural_pdf.elements.region import Region
|
@@ -252,6 +252,16 @@ class _LazyPageList(Sequence):
|
|
252
252
|
logger.warning(f"Failed to apply region to page {cached.number}: {e}")
|
253
253
|
|
254
254
|
self._cache[index] = cached
|
255
|
+
|
256
|
+
# Also cache in the parent PDF's main page list if this is a slice
|
257
|
+
if (
|
258
|
+
hasattr(self._parent_pdf, "_pages")
|
259
|
+
and hasattr(self._parent_pdf._pages, "_cache")
|
260
|
+
and actual_page_index < len(self._parent_pdf._pages._cache)
|
261
|
+
and self._parent_pdf._pages._cache[actual_page_index] is None
|
262
|
+
):
|
263
|
+
self._parent_pdf._pages._cache[actual_page_index] = cached
|
264
|
+
|
255
265
|
return cached
|
256
266
|
|
257
267
|
# Sequence protocol ---------------------------------------------------
|
@@ -720,26 +730,16 @@ class PDF(
|
|
720
730
|
# Store for bookkeeping and lazy application
|
721
731
|
self._exclusions.append((exclusion_func, label))
|
722
732
|
|
723
|
-
#
|
724
|
-
|
725
|
-
if self._pages._cache[i] is not None: # Only apply to existing pages
|
726
|
-
try:
|
727
|
-
self._pages._cache[i].add_exclusion(exclusion_func, label=label)
|
728
|
-
except Exception as e:
|
729
|
-
logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
|
733
|
+
# Don't modify already-cached pages - they will get PDF-level exclusions
|
734
|
+
# dynamically through _get_exclusion_regions()
|
730
735
|
return self
|
731
736
|
|
732
737
|
# Fallback to original callable / Region behaviour ------------------
|
733
738
|
exclusion_data = (exclusion_func, label)
|
734
739
|
self._exclusions.append(exclusion_data)
|
735
740
|
|
736
|
-
#
|
737
|
-
|
738
|
-
if self._pages._cache[i] is not None: # Only apply to existing pages
|
739
|
-
try:
|
740
|
-
self._pages._cache[i].add_exclusion(exclusion_func, label=label)
|
741
|
-
except Exception as e:
|
742
|
-
logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
|
741
|
+
# Don't modify already-cached pages - they will get PDF-level exclusions
|
742
|
+
# dynamically through _get_exclusion_regions()
|
743
743
|
|
744
744
|
return self
|
745
745
|
|
@@ -621,6 +621,7 @@ class ElementCollection(
|
|
621
621
|
|
622
622
|
def extract_text(
|
623
623
|
self,
|
624
|
+
separator: str = " ",
|
624
625
|
preserve_whitespace: bool = True,
|
625
626
|
use_exclusions: bool = True,
|
626
627
|
strip: Optional[bool] = None,
|
@@ -632,6 +633,7 @@ class ElementCollection(
|
|
632
633
|
pdfplumber's layout engine if layout=True is specified.
|
633
634
|
|
634
635
|
Args:
|
636
|
+
separator: String to join text from elements. Default is a single space.
|
635
637
|
preserve_whitespace: Deprecated. Use layout=False for simple joining.
|
636
638
|
use_exclusions: Deprecated. Exclusions should be applied *before* creating
|
637
639
|
the collection or by filtering the collection itself.
|
@@ -648,15 +650,49 @@ class ElementCollection(
|
|
648
650
|
Returns:
|
649
651
|
Combined text from elements, potentially with layout-based spacing.
|
650
652
|
"""
|
651
|
-
#
|
652
|
-
|
653
|
+
# Check if we have any elements at all
|
654
|
+
if not self._elements:
|
655
|
+
return ""
|
656
|
+
|
657
|
+
# Check if all elements are TextElements with character data
|
658
|
+
text_elements_with_chars = [
|
653
659
|
el
|
654
660
|
for el in self._elements
|
655
|
-
if isinstance(el, TextElement) and hasattr(el, "_char_dicts")
|
661
|
+
if isinstance(el, TextElement) and hasattr(el, "_char_dicts") and el._char_dicts
|
656
662
|
]
|
657
663
|
|
658
|
-
|
659
|
-
|
664
|
+
# If we have a mixed collection (Regions, TextElements without chars, etc),
|
665
|
+
# use a simpler approach: call extract_text on each element
|
666
|
+
if len(text_elements_with_chars) < len(self._elements):
|
667
|
+
# Mixed collection - extract text from each element
|
668
|
+
element_texts = []
|
669
|
+
|
670
|
+
# Sort elements by position first
|
671
|
+
sorted_elements = sorted(
|
672
|
+
self._elements,
|
673
|
+
key=lambda el: (
|
674
|
+
el.page.index if hasattr(el, "page") else 0,
|
675
|
+
el.top if hasattr(el, "top") else 0,
|
676
|
+
el.x0 if hasattr(el, "x0") else 0,
|
677
|
+
),
|
678
|
+
)
|
679
|
+
|
680
|
+
for el in sorted_elements:
|
681
|
+
if hasattr(el, "extract_text"):
|
682
|
+
# Call extract_text on the element (works for TextElement, Region, etc)
|
683
|
+
text = el.extract_text(**kwargs)
|
684
|
+
if text:
|
685
|
+
element_texts.append(text)
|
686
|
+
elif hasattr(el, "text"):
|
687
|
+
# Fallback to text property if available
|
688
|
+
text = getattr(el, "text", "")
|
689
|
+
if text:
|
690
|
+
element_texts.append(text)
|
691
|
+
|
692
|
+
return separator.join(element_texts)
|
693
|
+
|
694
|
+
# All elements are TextElements with char data - use the original approach
|
695
|
+
text_elements = text_elements_with_chars
|
660
696
|
|
661
697
|
# Collect all character dictionaries
|
662
698
|
all_char_dicts = []
|
@@ -665,11 +701,20 @@ class ElementCollection(
|
|
665
701
|
|
666
702
|
if not all_char_dicts:
|
667
703
|
# Handle case where elements exist but have no char dicts
|
668
|
-
logger.
|
704
|
+
logger.debug(
|
669
705
|
"ElementCollection.extract_text: No character dictionaries found in TextElements."
|
670
706
|
)
|
671
|
-
|
672
|
-
|
707
|
+
# Sort elements by position before joining
|
708
|
+
sorted_text_elements = sorted(
|
709
|
+
text_elements,
|
710
|
+
key=lambda el: (
|
711
|
+
el.page.index if hasattr(el, "page") else 0,
|
712
|
+
el.top if hasattr(el, "top") else 0,
|
713
|
+
el.x0 if hasattr(el, "x0") else 0,
|
714
|
+
),
|
715
|
+
)
|
716
|
+
return separator.join(
|
717
|
+
getattr(el, "text", "") for el in sorted_text_elements
|
673
718
|
) # Fallback to simple join of word text
|
674
719
|
|
675
720
|
# Apply content filtering if provided
|
@@ -736,15 +781,17 @@ class ElementCollection(
|
|
736
781
|
result = " ".join(c.get("text", "") for c in all_char_dicts)
|
737
782
|
|
738
783
|
else:
|
784
|
+
print("JOIN WITHOUT LAYOUT")
|
739
785
|
# Default: Simple join without layout
|
740
786
|
logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
#
|
746
|
-
|
747
|
-
#
|
787
|
+
result = separator.join(el.extract_text() for el in text_elements)
|
788
|
+
|
789
|
+
# # Sort chars by document order (page, top, x0)
|
790
|
+
# all_char_dicts.sort(
|
791
|
+
# key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
|
792
|
+
# )
|
793
|
+
# # Simple join of character text
|
794
|
+
# result = "".join(c.get("text", "") for c in all_char_dicts)
|
748
795
|
|
749
796
|
# Determine final strip flag – same rule as global helper unless caller overrides
|
750
797
|
strip_text = strip if strip is not None else (not use_layout)
|
@@ -114,8 +114,9 @@ class TestGroupByColorDisplay:
|
|
114
114
|
colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)]
|
115
115
|
for i, color in enumerate(colors):
|
116
116
|
page = MagicMock()
|
117
|
-
#
|
118
|
-
|
117
|
+
# PageGroupBy groups by the text content of the element found
|
118
|
+
# So we need to return the color tuple as the extracted text
|
119
|
+
page.find.return_value = MagicMock(extract_text=lambda c=color: c)
|
119
120
|
mock_pages.append(page)
|
120
121
|
|
121
122
|
collection = PageCollection(mock_pages)
|
@@ -141,7 +142,7 @@ class TestGroupByColorDisplay:
|
|
141
142
|
colors = [(255, 0, 0), (0, 255, 0)]
|
142
143
|
for color in colors:
|
143
144
|
page = MagicMock()
|
144
|
-
page.find.return_value = MagicMock(extract_text=lambda c=color:
|
145
|
+
page.find.return_value = MagicMock(extract_text=lambda c=color: c)
|
145
146
|
mock_pages.append(page)
|
146
147
|
|
147
148
|
collection = PageCollection(mock_pages)
|
@@ -68,7 +68,8 @@ def test_edge_case_single_end_element():
|
|
68
68
|
print(f"\nSingle end element: bottom={end_elem.bottom}")
|
69
69
|
|
70
70
|
# Create sections with single end element
|
71
|
-
|
71
|
+
# When using only end elements, we typically want to include the end boundary
|
72
|
+
sections = page.get_sections(end_elements=[end_elem], include_boundaries="end")
|
72
73
|
|
73
74
|
print(f"Sections created: {len(sections)}")
|
74
75
|
|
@@ -80,7 +81,8 @@ def test_edge_case_single_end_element():
|
|
80
81
|
print(f"Expected height: {end_elem.bottom}")
|
81
82
|
|
82
83
|
# Height should be approximately end_elem.bottom (from top of page)
|
83
|
-
|
84
|
+
# Allow for small rounding differences
|
85
|
+
assert abs(section.height - end_elem.bottom) <= 1.0
|
84
86
|
|
85
87
|
|
86
88
|
def test_mixed_start_end_elements():
|
@@ -115,13 +115,16 @@ def test_implicit_start_not_paired_with_source_end():
|
|
115
115
|
|
116
116
|
print(f"\nSections created: {len(sections)}")
|
117
117
|
|
118
|
-
#
|
119
|
-
#
|
118
|
+
# With default include_boundaries="start", sections exclude the end boundary
|
119
|
+
# So the first section should go from top of page to TOP of first end element
|
120
120
|
# There should NOT be a zero-height section at first end
|
121
121
|
|
122
|
+
# Sort end elements like the implementation does
|
123
|
+
sorted_ends = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.bottom, e.x0))
|
124
|
+
|
122
125
|
expected_sections = [
|
123
|
-
(0,
|
124
|
-
|
126
|
+
(0, sorted_ends[0].top), # Top to TOP of first sorted end (exclude end boundary)
|
127
|
+
# Second section continues from there - we don't check its end
|
125
128
|
]
|
126
129
|
|
127
130
|
for i, section in enumerate(sections):
|
@@ -77,13 +77,13 @@ def test_extract_table_collection_header_options():
|
|
77
77
|
|
78
78
|
# Test header=None
|
79
79
|
result2 = guide.extract_table(pages, header=None)
|
80
|
-
df2 = result2.to_df()
|
80
|
+
df2 = result2.to_df(header=None) # Need to pass header=None to to_df as well
|
81
81
|
assert isinstance(df2.columns[0], int) # Should use numeric indices
|
82
82
|
|
83
83
|
# Test custom headers
|
84
84
|
custom_headers = ["A", "B", "C", "D", "E", "F", "G", "H"]
|
85
85
|
result3 = guide.extract_table(pages, header=custom_headers)
|
86
|
-
df3 = result3.to_df()
|
86
|
+
df3 = result3.to_df(header=custom_headers) # Pass custom headers to to_df
|
87
87
|
assert list(df3.columns) == custom_headers
|
88
88
|
|
89
89
|
|