natural-pdf 0.2.5__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.gitignore +1 -0
- {natural_pdf-0.2.5/natural_pdf.egg-info → natural_pdf-0.2.6}/PKG-INFO +1 -1
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/guides.py +94 -42
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/page.py +110 -44
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/page_collection.py +223 -34
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/page_groupby.py +20 -2
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/pdf.py +3 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/render_spec.py +20 -5
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/describe/base.py +1 -1
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/describe/elements.py +1 -1
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/base.py +84 -8
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/element_collection.py +730 -12
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/region.py +181 -48
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/flows/flow.py +3 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/selectors/parser.py +2 -2
- natural_pdf-0.2.6/natural_pdf/utils/color_utils.py +100 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6/natural_pdf.egg-info}/PKG-INFO +1 -1
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf.egg-info/SOURCES.txt +32 -1
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf.egg-info/top_level.txt +1 -0
- natural_pdf-0.2.6/tests/test_color_hex_display.py +194 -0
- natural_pdf-0.2.6/tests/test_crop_enhancements.py +149 -0
- natural_pdf-0.2.6/tests/test_crop_region_highlights.py +119 -0
- natural_pdf-0.2.6/tests/test_dissolve.py +471 -0
- natural_pdf-0.2.6/tests/test_dissolve_cross_page_bug.py +155 -0
- natural_pdf-0.2.6/tests/test_dissolve_debug_issue.py +195 -0
- natural_pdf-0.2.6/tests/test_dissolve_real_world_issue.py +201 -0
- natural_pdf-0.2.6/tests/test_dissolve_single_elements.py +159 -0
- natural_pdf-0.2.6/tests/test_dissolve_vertical_offset_issue.py +139 -0
- natural_pdf-0.2.6/tests/test_element_addition.py +176 -0
- natural_pdf-0.2.6/tests/test_element_collection_show_cols.py +132 -0
- natural_pdf-0.2.6/tests/test_empty_pseudo_class.py +215 -0
- natural_pdf-0.2.6/tests/test_fix_get_sections_zero_height.py +120 -0
- natural_pdf-0.2.6/tests/test_get_sections_fix_comprehensive.py +183 -0
- natural_pdf-0.2.6/tests/test_get_sections_zero_height.py +179 -0
- natural_pdf-0.2.5/tests/test_guides_extract_table_from_pages.py → natural_pdf-0.2.6/tests/test_guides_extract_table_collections.py +78 -55
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides_extract_table_exclusions.py +41 -40
- natural_pdf-0.2.6/tests/test_highlight_detection.py +40 -0
- natural_pdf-0.2.6/tests/test_highlight_detection_comprehensive.py +94 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_comprehensive.py +124 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_debug.py +67 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_final.py +159 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_final_verification.py +126 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_fix.py +126 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_mock.py +188 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_simple.py +119 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_types_pdf.py +113 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_verification.py +134 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_with_real_text.py +104 -0
- natural_pdf-0.2.6/tests/test_merge_connected.py +302 -0
- natural_pdf-0.2.6/tests/test_merge_connected_real_world.py +240 -0
- natural_pdf-0.2.6/tests/test_merge_method.py +185 -0
- natural_pdf-0.2.6/tests/test_sections_with_start_and_end.py +98 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_slice_cache_reuse.py +43 -40
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_slice_exclusion_fix.py +37 -34
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_slice_exclusion_issue.py +22 -16
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_slice_exclusion_mock.py +49 -49
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_sliced_collection_exclusions.py +50 -42
- natural_pdf-0.2.5/tests/test_highlight_detection.py +0 -11
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.github/workflows/ci.yml +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.github/workflows/nightly-tutorials.yml +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/CLAUDE.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/LICENSE +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/MANIFEST.in +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/README.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/audit_packaging.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/check_run_md.sh +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/api/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/describe/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/extracting-clean-text/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/fix-messy-tables/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/fix-messy-tables/table_1.csv +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/fix-messy-tables/table_2.csv +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/fix-messy-tables/table_3.csv +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/installation/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/ocr/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/process-forms-and-invoices/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/quick-reference/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/reflowing-pages/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/regions/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tables/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/14-categorizing-documents.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/mkdocs.yml +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/classification/manager.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/cli.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/collections/mixins.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/element_manager.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/pdf_collection.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/describe/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/describe/mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/describe/summary.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/image.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/hocr.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/original_pdf.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/flows/element.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/flows/region.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/qa/qa_result.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/search/lancedb_search_service.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/search/numpy_search_service.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/tables/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/tables/result.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/text_mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/bidi_mirror.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/layout.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/vision/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/vision/mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/vision/results.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/vision/similarity.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf.egg-info/entry_points.txt +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf.egg-info/requires.txt +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/noxfile.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/memory_comparison.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/pdf_analyzer.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/performance_analysis.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/performance_results/image_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/performance_results/text_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/test_cleanup_methods.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/test_memory_fix.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/publish.sh +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/pyproject.toml +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/sample-screen.png +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/setup.cfg +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/conftest.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_annotate.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_arabic_performance.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_arabic_real_world.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_color_conversion.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_core/test_containment_geometry.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_core/test_text_layer.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_directional_defaults.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_document_qa.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_element_collection_slicing.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_element_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_exclusions.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_expand.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_extraction_error.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_extraction_mixin_fix.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_extraction_text_and_vision.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_extraction_working.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_find_similar.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_first_last_selectors.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_flow_region_directional.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_groupby.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides_apply_exclusions.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides_apply_exclusions_simple.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides_extract_table.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides_extract_table_real.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides_integration.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_highlight_protocol.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_highlight_protocol_simple.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_highlight_regions.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_multi_page_table_discovery.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_page_exclusion_lists.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_region_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_region_viewer.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_sections_end_only.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_show_column_layout.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_show_edge_cases.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_show_exclusions.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_show_exclusions_feature.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_show_limit.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_skip_repeating_headers_multipage.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_strikethrough_detection.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_table_result_header_mismatch.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_table_result_keep_blank.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_tiny_text_tables.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_tiny_text_tables_table.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_tutorials.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_underline_detection.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_update_text.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/todo/bad_pdf_analysis.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/todo/evaluation.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/README.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/analyser.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/collate_summaries.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/eval_suite.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/llm_enrich.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/reporter.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/utils.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.6}/uv.lock +0 -0
@@ -3462,7 +3462,15 @@ class Guides:
|
|
3462
3462
|
|
3463
3463
|
def extract_table(
|
3464
3464
|
self,
|
3465
|
-
target: Optional[
|
3465
|
+
target: Optional[
|
3466
|
+
Union[
|
3467
|
+
"Page",
|
3468
|
+
"Region",
|
3469
|
+
"PageCollection",
|
3470
|
+
"ElementCollection",
|
3471
|
+
List[Union["Page", "Region"]],
|
3472
|
+
]
|
3473
|
+
] = None,
|
3466
3474
|
source: str = "guides_temp",
|
3467
3475
|
cell_padding: float = 0.5,
|
3468
3476
|
include_outer_boundaries: bool = False,
|
@@ -3477,6 +3485,8 @@ class Guides:
|
|
3477
3485
|
apply_exclusions: bool = True,
|
3478
3486
|
*,
|
3479
3487
|
multi_page: Literal["auto", True, False] = "auto",
|
3488
|
+
header: Union[str, List[str], None] = "first",
|
3489
|
+
skip_repeating_headers: Optional[bool] = None,
|
3480
3490
|
) -> "TableResult":
|
3481
3491
|
"""
|
3482
3492
|
Extract table data directly from guides without leaving temporary regions.
|
@@ -3487,8 +3497,11 @@ class Guides:
|
|
3487
3497
|
3. Cleans up all temporary regions
|
3488
3498
|
4. Returns the TableResult
|
3489
3499
|
|
3500
|
+
When passed a collection (PageCollection, ElementCollection, or list), this method
|
3501
|
+
will extract tables from each element and combine them into a single result.
|
3502
|
+
|
3490
3503
|
Args:
|
3491
|
-
target: Page or
|
3504
|
+
target: Page, Region, or collection of Pages/Regions to extract from (uses self.context if None)
|
3492
3505
|
source: Source label for temporary regions (will be cleaned up)
|
3493
3506
|
cell_padding: Internal padding for cell regions in points
|
3494
3507
|
include_outer_boundaries: Whether to add boundaries at edges if missing
|
@@ -3502,6 +3515,13 @@ class Guides:
|
|
3502
3515
|
content_filter: Content filtering function or patterns
|
3503
3516
|
apply_exclusions: Whether to apply exclusion regions during text extraction (default: True)
|
3504
3517
|
multi_page: Controls multi-region table creation for FlowRegions
|
3518
|
+
header: How to handle headers when extracting from collections:
|
3519
|
+
- "first": Use first row of first element as headers (default)
|
3520
|
+
- "all": Expect headers on each element, use from first element
|
3521
|
+
- None: No headers, use numeric indices
|
3522
|
+
- List[str]: Custom column names
|
3523
|
+
skip_repeating_headers: Whether to remove duplicate header rows when extracting from collections.
|
3524
|
+
Defaults to True when header is "first" or "all", False otherwise.
|
3505
3525
|
|
3506
3526
|
Returns:
|
3507
3527
|
TableResult: Extracted table data
|
@@ -3513,20 +3533,49 @@ class Guides:
|
|
3513
3533
|
```python
|
3514
3534
|
from natural_pdf.analyzers import Guides
|
3515
3535
|
|
3516
|
-
#
|
3536
|
+
# Single page extraction
|
3517
3537
|
guides = Guides.from_lines(page, source_label="detected")
|
3518
|
-
|
3519
|
-
# Extract table directly - no temporary regions left behind
|
3520
3538
|
table_data = guides.extract_table()
|
3521
|
-
|
3522
|
-
# Convert to pandas DataFrame
|
3523
3539
|
df = table_data.to_df()
|
3540
|
+
|
3541
|
+
# Multiple page extraction
|
3542
|
+
guides = Guides(pages[0])
|
3543
|
+
guides.vertical.from_content(['Column 1', 'Column 2'])
|
3544
|
+
table_result = guides.extract_table(pages, header=['Col1', 'Col2'])
|
3545
|
+
df = table_result.to_df()
|
3546
|
+
|
3547
|
+
# Region collection extraction
|
3548
|
+
regions = pdf.find_all('region[type=table]')
|
3549
|
+
guides = Guides(regions[0])
|
3550
|
+
guides.vertical.from_lines(n=3)
|
3551
|
+
table_result = guides.extract_table(regions)
|
3524
3552
|
```
|
3525
3553
|
"""
|
3526
|
-
|
3527
|
-
|
3554
|
+
from natural_pdf.core.page_collection import PageCollection
|
3555
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
3556
|
+
|
3557
|
+
target_obj = target if target is not None else self.context
|
3558
|
+
if target_obj is None:
|
3528
3559
|
raise ValueError("No target object available. Provide target parameter or context.")
|
3529
3560
|
|
3561
|
+
# Check if target is a collection - if so, delegate to _extract_table_from_collection
|
3562
|
+
if isinstance(target_obj, (PageCollection, ElementCollection, list)):
|
3563
|
+
# For collections, pass through most parameters as-is
|
3564
|
+
return self._extract_table_from_collection(
|
3565
|
+
elements=target_obj,
|
3566
|
+
header=header,
|
3567
|
+
skip_repeating_headers=skip_repeating_headers,
|
3568
|
+
method=method,
|
3569
|
+
table_settings=table_settings,
|
3570
|
+
use_ocr=use_ocr,
|
3571
|
+
ocr_config=ocr_config,
|
3572
|
+
text_options=text_options,
|
3573
|
+
cell_extraction_func=cell_extraction_func,
|
3574
|
+
show_progress=show_progress,
|
3575
|
+
content_filter=content_filter,
|
3576
|
+
apply_exclusions=apply_exclusions,
|
3577
|
+
)
|
3578
|
+
|
3530
3579
|
# Get the page for cleanup later
|
3531
3580
|
if hasattr(target_obj, "x0") and hasattr(target_obj, "top"): # Region
|
3532
3581
|
page = target_obj._page
|
@@ -3597,9 +3646,9 @@ class Guides:
|
|
3597
3646
|
except Exception as cleanup_err:
|
3598
3647
|
logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
|
3599
3648
|
|
3600
|
-
def
|
3649
|
+
def _extract_table_from_collection(
|
3601
3650
|
self,
|
3602
|
-
|
3651
|
+
elements: Union["PageCollection", "ElementCollection", List[Union["Page", "Region"]]],
|
3603
3652
|
header: Union[str, List[str], None] = "first",
|
3604
3653
|
skip_repeating_headers: Optional[bool] = None,
|
3605
3654
|
method: Optional[str] = None,
|
@@ -3613,17 +3662,17 @@ class Guides:
|
|
3613
3662
|
apply_exclusions: bool = True,
|
3614
3663
|
) -> "TableResult":
|
3615
3664
|
"""
|
3616
|
-
Extract tables from multiple pages using this guide pattern.
|
3665
|
+
Extract tables from multiple pages or regions using this guide pattern.
|
3617
3666
|
|
3618
|
-
This method applies the guide to each
|
3667
|
+
This method applies the guide to each element, extracts tables, and combines
|
3619
3668
|
them into a single TableResult. Dynamic guides (using lambdas) are evaluated
|
3620
|
-
for each
|
3669
|
+
for each element.
|
3621
3670
|
|
3622
3671
|
Args:
|
3623
|
-
|
3672
|
+
elements: PageCollection, ElementCollection, or list of Pages/Regions to extract from
|
3624
3673
|
header: How to handle headers:
|
3625
|
-
- "first": Use first row of first
|
3626
|
-
- "all": Expect headers on each
|
3674
|
+
- "first": Use first row of first element as headers (default)
|
3675
|
+
- "all": Expect headers on each element, use from first element
|
3627
3676
|
- None: No headers, use numeric indices
|
3628
3677
|
- List[str]: Custom column names
|
3629
3678
|
skip_repeating_headers: Whether to remove duplicate header rows.
|
@@ -3634,35 +3683,36 @@ class Guides:
|
|
3634
3683
|
ocr_config: OCR configuration parameters
|
3635
3684
|
text_options: Dictionary of options for the 'text' method
|
3636
3685
|
cell_extraction_func: Optional callable for custom cell text extraction
|
3637
|
-
show_progress: Show progress bar for multi-
|
3686
|
+
show_progress: Show progress bar for multi-element extraction (default: True)
|
3638
3687
|
content_filter: Content filtering function or patterns
|
3639
3688
|
apply_exclusions: Whether to apply exclusion regions during extraction
|
3640
3689
|
|
3641
3690
|
Returns:
|
3642
|
-
TableResult: Combined table data from all
|
3691
|
+
TableResult: Combined table data from all elements
|
3643
3692
|
|
3644
3693
|
Example:
|
3645
3694
|
```python
|
3646
3695
|
# Create guide with static vertical, dynamic horizontal
|
3647
|
-
guide = Guides(
|
3696
|
+
guide = Guides(regions[0])
|
3648
3697
|
guide.vertical.from_content(columns, outer="last")
|
3649
|
-
guide.horizontal.from_content(lambda
|
3698
|
+
guide.horizontal.from_content(lambda r: r.find_all('text:starts-with(NF-)'))
|
3650
3699
|
|
3651
|
-
# Extract from all
|
3652
|
-
table_result = guide.
|
3700
|
+
# Extract from all regions
|
3701
|
+
table_result = guide._extract_table_from_collection(regions, header=columns)
|
3653
3702
|
df = table_result.to_df()
|
3654
3703
|
```
|
3655
3704
|
"""
|
3656
3705
|
from natural_pdf.core.page_collection import PageCollection
|
3706
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
3657
3707
|
from natural_pdf.tables.result import TableResult
|
3658
3708
|
|
3659
|
-
# Convert to list if it's a
|
3660
|
-
if isinstance(
|
3661
|
-
|
3709
|
+
# Convert to list if it's a collection
|
3710
|
+
if isinstance(elements, (PageCollection, ElementCollection)):
|
3711
|
+
element_list = list(elements)
|
3662
3712
|
else:
|
3663
|
-
|
3713
|
+
element_list = elements
|
3664
3714
|
|
3665
|
-
if not
|
3715
|
+
if not element_list:
|
3666
3716
|
return TableResult([])
|
3667
3717
|
|
3668
3718
|
# Determine header handling
|
@@ -3673,37 +3723,39 @@ class Guides:
|
|
3673
3723
|
header_row = None
|
3674
3724
|
|
3675
3725
|
# Configure progress bar
|
3676
|
-
iterator =
|
3677
|
-
if show_progress and len(
|
3726
|
+
iterator = element_list
|
3727
|
+
if show_progress and len(element_list) > 1:
|
3678
3728
|
try:
|
3679
3729
|
from tqdm.auto import tqdm
|
3680
3730
|
|
3681
|
-
iterator = tqdm(
|
3731
|
+
iterator = tqdm(
|
3732
|
+
element_list, desc="Extracting tables from elements", unit="element"
|
3733
|
+
)
|
3682
3734
|
except ImportError:
|
3683
3735
|
pass
|
3684
3736
|
|
3685
|
-
for i,
|
3686
|
-
# Create a new Guides object for this
|
3687
|
-
|
3737
|
+
for i, element in enumerate(iterator):
|
3738
|
+
# Create a new Guides object for this element
|
3739
|
+
element_guide = Guides(element)
|
3688
3740
|
|
3689
3741
|
# Copy vertical guides (usually static)
|
3690
3742
|
if hasattr(self.vertical, "_callable") and self.vertical._callable is not None:
|
3691
3743
|
# If vertical is dynamic (lambda), evaluate it
|
3692
|
-
|
3744
|
+
element_guide.vertical.from_content(self.vertical._callable(element))
|
3693
3745
|
else:
|
3694
3746
|
# Copy static vertical positions
|
3695
|
-
|
3747
|
+
element_guide.vertical.data = self.vertical.data.copy()
|
3696
3748
|
|
3697
3749
|
# Handle horizontal guides
|
3698
3750
|
if hasattr(self.horizontal, "_callable") and self.horizontal._callable is not None:
|
3699
3751
|
# If horizontal is dynamic (lambda), evaluate it
|
3700
|
-
|
3752
|
+
element_guide.horizontal.from_content(self.horizontal._callable(element))
|
3701
3753
|
else:
|
3702
3754
|
# Copy static horizontal positions
|
3703
|
-
|
3755
|
+
element_guide.horizontal.data = self.horizontal.data.copy()
|
3704
3756
|
|
3705
|
-
# Extract table from this
|
3706
|
-
table_result =
|
3757
|
+
# Extract table from this element
|
3758
|
+
table_result = element_guide.extract_table(
|
3707
3759
|
method=method,
|
3708
3760
|
table_settings=table_settings,
|
3709
3761
|
use_ocr=use_ocr,
|
@@ -3719,7 +3771,7 @@ class Guides:
|
|
3719
3771
|
rows = list(table_result)
|
3720
3772
|
|
3721
3773
|
# Handle headers based on strategy
|
3722
|
-
if i == 0: # First
|
3774
|
+
if i == 0: # First element
|
3723
3775
|
if header == "first" or header == "all":
|
3724
3776
|
# Use first row as header
|
3725
3777
|
if rows:
|
@@ -3728,7 +3780,7 @@ class Guides:
|
|
3728
3780
|
elif isinstance(header, list):
|
3729
3781
|
# Custom headers provided
|
3730
3782
|
header_row = header
|
3731
|
-
else: # Subsequent
|
3783
|
+
else: # Subsequent elements
|
3732
3784
|
if header == "all" and skip_repeating_headers and rows:
|
3733
3785
|
# Expect and remove header row
|
3734
3786
|
if rows and header_row and rows[0] == header_row:
|
@@ -2525,11 +2525,20 @@ class Page(
|
|
2525
2525
|
include_boundaries="start",
|
2526
2526
|
y_threshold=5.0,
|
2527
2527
|
bounding_box=None,
|
2528
|
+
orientation="vertical",
|
2528
2529
|
) -> "ElementCollection[Region]":
|
2529
2530
|
"""
|
2530
2531
|
Get sections of a page defined by start/end elements.
|
2531
2532
|
Uses the page-level implementation.
|
2532
2533
|
|
2534
|
+
Args:
|
2535
|
+
start_elements: Elements or selector string that mark the start of sections
|
2536
|
+
end_elements: Elements or selector string that mark the end of sections
|
2537
|
+
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
2538
|
+
y_threshold: Threshold for vertical alignment (only used for vertical orientation)
|
2539
|
+
bounding_box: Optional bounding box to constrain sections
|
2540
|
+
orientation: 'vertical' (default) or 'horizontal' - determines section direction
|
2541
|
+
|
2533
2542
|
Returns:
|
2534
2543
|
An ElementCollection containing the found Region objects.
|
2535
2544
|
"""
|
@@ -2577,11 +2586,14 @@ class Page(
|
|
2577
2586
|
for el in end_elements:
|
2578
2587
|
all_boundaries.append((el, "end"))
|
2579
2588
|
|
2580
|
-
# Sort all boundary elements
|
2589
|
+
# Sort all boundary elements based on orientation
|
2581
2590
|
try:
|
2582
|
-
|
2591
|
+
if orientation == "vertical":
|
2592
|
+
all_boundaries.sort(key=lambda x: (x[0].top, x[0].x0))
|
2593
|
+
else: # horizontal
|
2594
|
+
all_boundaries.sort(key=lambda x: (x[0].x0, x[0].top))
|
2583
2595
|
except AttributeError as e:
|
2584
|
-
logger.error(f"Error sorting boundaries: Element missing
|
2596
|
+
logger.error(f"Error sorting boundaries: Element missing position attribute? {e}")
|
2585
2597
|
return ElementCollection([]) # Cannot proceed if elements lack position
|
2586
2598
|
|
2587
2599
|
# Process sorted boundaries to find sections
|
@@ -2593,72 +2605,126 @@ class Page(
|
|
2593
2605
|
# If we have an active section, this start implicitly ends it
|
2594
2606
|
if active_section_started:
|
2595
2607
|
end_boundary_el = element # Use this start as the end boundary
|
2596
|
-
# Determine region boundaries
|
2608
|
+
# Determine region boundaries based on orientation
|
2609
|
+
if orientation == "vertical":
|
2610
|
+
sec_top = (
|
2611
|
+
current_start_element.top
|
2612
|
+
if include_boundaries in ["start", "both"]
|
2613
|
+
else current_start_element.bottom
|
2614
|
+
)
|
2615
|
+
sec_bottom = (
|
2616
|
+
end_boundary_el.top
|
2617
|
+
if include_boundaries not in ["end", "both"]
|
2618
|
+
else end_boundary_el.bottom
|
2619
|
+
)
|
2620
|
+
|
2621
|
+
if sec_top < sec_bottom: # Ensure valid region
|
2622
|
+
x0, _, x1, _ = get_bounds()
|
2623
|
+
region = self.create_region(x0, sec_top, x1, sec_bottom)
|
2624
|
+
region.start_element = current_start_element
|
2625
|
+
region.end_element = end_boundary_el # Mark the element that ended it
|
2626
|
+
region.is_end_next_start = True # Mark how it ended
|
2627
|
+
regions.append(region)
|
2628
|
+
else: # horizontal
|
2629
|
+
sec_left = (
|
2630
|
+
current_start_element.x0
|
2631
|
+
if include_boundaries in ["start", "both"]
|
2632
|
+
else current_start_element.x1
|
2633
|
+
)
|
2634
|
+
sec_right = (
|
2635
|
+
end_boundary_el.x0
|
2636
|
+
if include_boundaries not in ["end", "both"]
|
2637
|
+
else end_boundary_el.x1
|
2638
|
+
)
|
2639
|
+
|
2640
|
+
if sec_left < sec_right: # Ensure valid region
|
2641
|
+
_, y0, _, y1 = get_bounds()
|
2642
|
+
region = self.create_region(sec_left, y0, sec_right, y1)
|
2643
|
+
region.start_element = current_start_element
|
2644
|
+
region.end_element = end_boundary_el # Mark the element that ended it
|
2645
|
+
region.is_end_next_start = True # Mark how it ended
|
2646
|
+
regions.append(region)
|
2647
|
+
active_section_started = False # Reset for the new start
|
2648
|
+
|
2649
|
+
# Set this as the potential start of the next section
|
2650
|
+
current_start_element = element
|
2651
|
+
active_section_started = True
|
2652
|
+
|
2653
|
+
elif element_type == "end" and active_section_started:
|
2654
|
+
# We found an explicit end for the current section
|
2655
|
+
end_boundary_el = element
|
2656
|
+
if orientation == "vertical":
|
2597
2657
|
sec_top = (
|
2598
2658
|
current_start_element.top
|
2599
2659
|
if include_boundaries in ["start", "both"]
|
2600
2660
|
else current_start_element.bottom
|
2601
2661
|
)
|
2602
2662
|
sec_bottom = (
|
2603
|
-
end_boundary_el.
|
2604
|
-
if include_boundaries
|
2605
|
-
else end_boundary_el.
|
2663
|
+
end_boundary_el.bottom
|
2664
|
+
if include_boundaries in ["end", "both"]
|
2665
|
+
else end_boundary_el.top
|
2606
2666
|
)
|
2607
2667
|
|
2608
2668
|
if sec_top < sec_bottom: # Ensure valid region
|
2609
2669
|
x0, _, x1, _ = get_bounds()
|
2610
2670
|
region = self.create_region(x0, sec_top, x1, sec_bottom)
|
2611
2671
|
region.start_element = current_start_element
|
2612
|
-
region.end_element = end_boundary_el
|
2613
|
-
region.is_end_next_start =
|
2672
|
+
region.end_element = end_boundary_el
|
2673
|
+
region.is_end_next_start = False
|
2614
2674
|
regions.append(region)
|
2615
|
-
|
2675
|
+
else: # horizontal
|
2676
|
+
sec_left = (
|
2677
|
+
current_start_element.x0
|
2678
|
+
if include_boundaries in ["start", "both"]
|
2679
|
+
else current_start_element.x1
|
2680
|
+
)
|
2681
|
+
sec_right = (
|
2682
|
+
end_boundary_el.x1
|
2683
|
+
if include_boundaries in ["end", "both"]
|
2684
|
+
else end_boundary_el.x0
|
2685
|
+
)
|
2616
2686
|
|
2617
|
-
|
2618
|
-
|
2619
|
-
|
2687
|
+
if sec_left < sec_right: # Ensure valid region
|
2688
|
+
_, y0, _, y1 = get_bounds()
|
2689
|
+
region = self.create_region(sec_left, y0, sec_right, y1)
|
2690
|
+
region.start_element = current_start_element
|
2691
|
+
region.end_element = end_boundary_el
|
2692
|
+
region.is_end_next_start = False
|
2693
|
+
regions.append(region)
|
2620
2694
|
|
2621
|
-
|
2622
|
-
|
2623
|
-
|
2695
|
+
# Reset: section ended explicitly
|
2696
|
+
current_start_element = None
|
2697
|
+
active_section_started = False
|
2698
|
+
|
2699
|
+
# Handle the last section if it was started but never explicitly ended
|
2700
|
+
if active_section_started:
|
2701
|
+
if orientation == "vertical":
|
2624
2702
|
sec_top = (
|
2625
2703
|
current_start_element.top
|
2626
2704
|
if include_boundaries in ["start", "both"]
|
2627
2705
|
else current_start_element.bottom
|
2628
2706
|
)
|
2629
|
-
|
2630
|
-
|
2631
|
-
|
2632
|
-
|
2707
|
+
x0, _, x1, page_bottom = get_bounds()
|
2708
|
+
if sec_top < page_bottom:
|
2709
|
+
region = self.create_region(x0, sec_top, x1, page_bottom)
|
2710
|
+
region.start_element = current_start_element
|
2711
|
+
region.end_element = None # Ended by page end
|
2712
|
+
region.is_end_next_start = False
|
2713
|
+
regions.append(region)
|
2714
|
+
else: # horizontal
|
2715
|
+
sec_left = (
|
2716
|
+
current_start_element.x0
|
2717
|
+
if include_boundaries in ["start", "both"]
|
2718
|
+
else current_start_element.x1
|
2633
2719
|
)
|
2634
|
-
|
2635
|
-
if
|
2636
|
-
|
2637
|
-
region = self.create_region(x0, sec_top, x1, sec_bottom)
|
2720
|
+
page_left, y0, page_right, y1 = get_bounds()
|
2721
|
+
if sec_left < page_right:
|
2722
|
+
region = self.create_region(sec_left, y0, page_right, y1)
|
2638
2723
|
region.start_element = current_start_element
|
2639
|
-
region.end_element =
|
2724
|
+
region.end_element = None # Ended by page end
|
2640
2725
|
region.is_end_next_start = False
|
2641
2726
|
regions.append(region)
|
2642
2727
|
|
2643
|
-
# Reset: section ended explicitly
|
2644
|
-
current_start_element = None
|
2645
|
-
active_section_started = False
|
2646
|
-
|
2647
|
-
# Handle the last section if it was started but never explicitly ended
|
2648
|
-
if active_section_started:
|
2649
|
-
sec_top = (
|
2650
|
-
current_start_element.top
|
2651
|
-
if include_boundaries in ["start", "both"]
|
2652
|
-
else current_start_element.bottom
|
2653
|
-
)
|
2654
|
-
x0, _, x1, page_bottom = get_bounds()
|
2655
|
-
if sec_top < page_bottom:
|
2656
|
-
region = self.create_region(x0, sec_top, x1, page_bottom)
|
2657
|
-
region.start_element = current_start_element
|
2658
|
-
region.end_element = None # Ended by page end
|
2659
|
-
region.is_end_next_start = False
|
2660
|
-
regions.append(region)
|
2661
|
-
|
2662
2728
|
return ElementCollection(regions)
|
2663
2729
|
|
2664
2730
|
def __repr__(self) -> str:
|