natural-pdf 0.2.4__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.gitignore +1 -0
- {natural_pdf-0.2.4/natural_pdf.egg-info → natural_pdf-0.2.6}/PKG-INFO +1 -1
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/guides.py +246 -18
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/element_manager.py +5 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/page.py +150 -48
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/page_collection.py +223 -34
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/page_groupby.py +20 -2
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/pdf.py +44 -2
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/render_spec.py +20 -5
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/describe/base.py +1 -1
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/describe/elements.py +1 -1
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/base.py +84 -8
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/element_collection.py +730 -12
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/region.py +213 -61
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/flows/flow.py +3 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/selectors/parser.py +2 -2
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/tables/result.py +39 -6
- natural_pdf-0.2.6/natural_pdf/utils/color_utils.py +100 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6/natural_pdf.egg-info}/PKG-INFO +1 -1
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf.egg-info/SOURCES.txt +38 -1
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf.egg-info/top_level.txt +1 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_arabic_real_world.py +0 -3
- natural_pdf-0.2.6/tests/test_color_hex_display.py +194 -0
- natural_pdf-0.2.6/tests/test_crop_enhancements.py +149 -0
- natural_pdf-0.2.6/tests/test_crop_region_highlights.py +119 -0
- natural_pdf-0.2.6/tests/test_dissolve.py +471 -0
- natural_pdf-0.2.6/tests/test_dissolve_cross_page_bug.py +155 -0
- natural_pdf-0.2.6/tests/test_dissolve_debug_issue.py +195 -0
- natural_pdf-0.2.6/tests/test_dissolve_real_world_issue.py +201 -0
- natural_pdf-0.2.6/tests/test_dissolve_single_elements.py +159 -0
- natural_pdf-0.2.6/tests/test_dissolve_vertical_offset_issue.py +139 -0
- natural_pdf-0.2.6/tests/test_element_addition.py +176 -0
- natural_pdf-0.2.6/tests/test_element_collection_show_cols.py +132 -0
- natural_pdf-0.2.6/tests/test_empty_pseudo_class.py +215 -0
- natural_pdf-0.2.6/tests/test_fix_get_sections_zero_height.py +120 -0
- natural_pdf-0.2.6/tests/test_get_sections_fix_comprehensive.py +183 -0
- natural_pdf-0.2.6/tests/test_get_sections_zero_height.py +179 -0
- natural_pdf-0.2.6/tests/test_guides_extract_table_collections.py +165 -0
- natural_pdf-0.2.6/tests/test_guides_extract_table_exclusions.py +181 -0
- natural_pdf-0.2.6/tests/test_highlight_detection.py +40 -0
- natural_pdf-0.2.6/tests/test_highlight_detection_comprehensive.py +94 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_comprehensive.py +124 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_debug.py +67 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_final.py +159 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_final_verification.py +126 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_fix.py +126 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_mock.py +188 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_simple.py +119 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_types_pdf.py +113 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_verification.py +134 -0
- natural_pdf-0.2.6/tests/test_include_boundaries_with_real_text.py +104 -0
- natural_pdf-0.2.6/tests/test_merge_connected.py +302 -0
- natural_pdf-0.2.6/tests/test_merge_connected_real_world.py +240 -0
- natural_pdf-0.2.6/tests/test_merge_method.py +185 -0
- natural_pdf-0.2.6/tests/test_sections_with_start_and_end.py +98 -0
- natural_pdf-0.2.6/tests/test_slice_cache_reuse.py +202 -0
- natural_pdf-0.2.6/tests/test_slice_exclusion_fix.py +148 -0
- natural_pdf-0.2.6/tests/test_slice_exclusion_issue.py +78 -0
- natural_pdf-0.2.6/tests/test_slice_exclusion_mock.py +158 -0
- natural_pdf-0.2.6/tests/test_sliced_collection_exclusions.py +166 -0
- natural_pdf-0.2.4/test_install.sh +0 -46
- natural_pdf-0.2.4/tests/test_highlight_detection.py +0 -11
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.github/workflows/ci.yml +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.github/workflows/nightly-tutorials.yml +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/CLAUDE.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/LICENSE +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/MANIFEST.in +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/README.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/audit_packaging.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/check_run_md.sh +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/api/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/describe/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/extracting-clean-text/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/fix-messy-tables/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/fix-messy-tables/table_1.csv +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/fix-messy-tables/table_2.csv +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/fix-messy-tables/table_3.csv +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/installation/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/ocr/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/process-forms-and-invoices/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/quick-reference/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/reflowing-pages/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/regions/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tables/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/14-categorizing-documents.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/mkdocs.yml +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/classification/manager.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/cli.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/collections/mixins.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/pdf_collection.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/describe/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/describe/mixin.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/describe/summary.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/image.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/hocr.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/original_pdf.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/flows/element.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/flows/region.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/qa/qa_result.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/search/lancedb_search_service.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/search/numpy_search_service.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/tables/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/text_mixin.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/bidi_mirror.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/layout.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/vision/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/vision/mixin.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/vision/results.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/vision/similarity.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf.egg-info/entry_points.txt +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf.egg-info/requires.txt +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/noxfile.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/memory_comparison.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/pdf_analyzer.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/performance_analysis.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/performance_results/image_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/performance_results/text_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/test_cleanup_methods.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/test_memory_fix.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/publish.sh +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/pyproject.toml +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/sample-screen.png +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/setup.cfg +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/conftest.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_annotate.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_arabic_performance.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_color_conversion.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_core/test_containment_geometry.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_core/test_text_layer.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_directional_defaults.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_document_qa.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_element_collection_slicing.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_element_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_exclusions.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_expand.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_extraction_error.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_extraction_mixin_fix.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_extraction_text_and_vision.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_extraction_working.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_find_similar.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_first_last_selectors.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_flow_region_directional.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_groupby.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_guides.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_guides_apply_exclusions.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_guides_apply_exclusions_simple.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_guides_extract_table.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_guides_extract_table_real.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_guides_integration.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_highlight_protocol.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_highlight_protocol_simple.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_highlight_regions.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_multi_page_table_discovery.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_page_exclusion_lists.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_region_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_region_viewer.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_sections_end_only.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_show_column_layout.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_show_edge_cases.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_show_exclusions.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_show_exclusions_feature.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_show_limit.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_skip_repeating_headers_multipage.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_strikethrough_detection.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_table_result_header_mismatch.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_table_result_keep_blank.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_tiny_text_tables.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_tiny_text_tables_table.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_tutorials.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_underline_detection.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_update_text.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/todo/bad_pdf_analysis.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/todo/evaluation.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/README.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/__init__.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/analyser.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/collate_summaries.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/eval_suite.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/llm_enrich.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/reporter.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/utils.py +0 -0
- {natural_pdf-0.2.4 → natural_pdf-0.2.6}/uv.lock +0 -0
@@ -143,7 +143,7 @@ class GuidesList(UserList):
|
|
143
143
|
|
144
144
|
def from_content(
|
145
145
|
self,
|
146
|
-
markers: Union[str, List[str], "ElementCollection", None],
|
146
|
+
markers: Union[str, List[str], "ElementCollection", Callable, None],
|
147
147
|
obj: Optional[Union["Page", "Region", "FlowRegion"]] = None,
|
148
148
|
align: Literal["left", "right", "center", "between"] = "left",
|
149
149
|
outer: bool = True,
|
@@ -160,6 +160,7 @@ class GuidesList(UserList):
|
|
160
160
|
- str: single selector (e.g., 'text:contains("Name")') or literal text
|
161
161
|
- List[str]: list of selectors or literal text strings
|
162
162
|
- ElementCollection: collection of elements to extract text from
|
163
|
+
- Callable: function that takes a page and returns markers
|
163
164
|
- None: no markers
|
164
165
|
obj: Page/Region/FlowRegion to search (uses parent's context if None)
|
165
166
|
align: How to align guides relative to found elements
|
@@ -174,13 +175,22 @@ class GuidesList(UserList):
|
|
174
175
|
if target_obj is None:
|
175
176
|
raise ValueError("No object provided and no context available")
|
176
177
|
|
178
|
+
# Store callable markers for later evaluation
|
179
|
+
if callable(markers):
|
180
|
+
self._callable = markers
|
181
|
+
# For now, evaluate with the current target object to get initial guides
|
182
|
+
actual_markers = markers(target_obj)
|
183
|
+
else:
|
184
|
+
self._callable = None
|
185
|
+
actual_markers = markers
|
186
|
+
|
177
187
|
# Check if parent is in flow mode
|
178
188
|
if self._parent.is_flow_region:
|
179
189
|
# Create guides across all constituent regions
|
180
190
|
all_guides = []
|
181
191
|
for region in self._parent.context.constituent_regions:
|
182
192
|
# Normalize markers for this region
|
183
|
-
marker_texts = _normalize_markers(markers, region)
|
193
|
+
marker_texts = _normalize_markers(actual_markers, region)
|
184
194
|
|
185
195
|
# Create guides for this region
|
186
196
|
region_guides = Guides.from_content(
|
@@ -263,7 +273,7 @@ class GuidesList(UserList):
|
|
263
273
|
|
264
274
|
# Original single-region logic
|
265
275
|
# Normalize markers to list of text strings
|
266
|
-
marker_texts = _normalize_markers(markers, target_obj)
|
276
|
+
marker_texts = _normalize_markers(actual_markers, target_obj)
|
267
277
|
|
268
278
|
# Create guides for this axis
|
269
279
|
new_guides = Guides.from_content(
|
@@ -1541,11 +1551,15 @@ class Guides:
|
|
1541
1551
|
# Add outer guides if requested
|
1542
1552
|
if outer and bounds:
|
1543
1553
|
if axis == "vertical":
|
1544
|
-
|
1545
|
-
|
1554
|
+
if outer == True or outer == "first":
|
1555
|
+
guides_coords.insert(0, bounds[0]) # x0
|
1556
|
+
if outer == True or outer == "last":
|
1557
|
+
guides_coords.append(bounds[2]) # x1
|
1546
1558
|
else:
|
1547
|
-
|
1548
|
-
|
1559
|
+
if outer == True or outer == "first":
|
1560
|
+
guides_coords.insert(0, bounds[1]) # y0
|
1561
|
+
if outer == True or outer == "last":
|
1562
|
+
guides_coords.append(bounds[3]) # y1
|
1549
1563
|
|
1550
1564
|
# Remove duplicates and sort
|
1551
1565
|
guides_coords = sorted(list(set(guides_coords)))
|
@@ -3302,7 +3316,7 @@ class Guides:
|
|
3302
3316
|
markers: Union[str, List[str], "ElementCollection", None] = None,
|
3303
3317
|
obj: Optional[Union["Page", "Region"]] = None,
|
3304
3318
|
align: Literal["left", "right", "center", "between"] = "left",
|
3305
|
-
outer: bool = True,
|
3319
|
+
outer: Union[str, bool] = True,
|
3306
3320
|
tolerance: float = 5,
|
3307
3321
|
apply_exclusions: bool = True,
|
3308
3322
|
) -> "Guides":
|
@@ -3319,7 +3333,10 @@ class Guides:
|
|
3319
3333
|
- None: no markers
|
3320
3334
|
obj: Page or Region to search (uses self.context if None)
|
3321
3335
|
align: How to align guides relative to found elements
|
3322
|
-
outer: Whether to add outer boundary guides
|
3336
|
+
outer: Whether to add outer boundary guides. Can be:
|
3337
|
+
- bool: True/False to add/not add both
|
3338
|
+
- "first": To add boundary before the first element
|
3339
|
+
- "last": To add boundary after the last element
|
3323
3340
|
tolerance: Tolerance for snapping to element edges
|
3324
3341
|
apply_exclusions: Whether to apply exclusion zones when searching for text
|
3325
3342
|
|
@@ -3445,7 +3462,15 @@ class Guides:
|
|
3445
3462
|
|
3446
3463
|
def extract_table(
|
3447
3464
|
self,
|
3448
|
-
target: Optional[
|
3465
|
+
target: Optional[
|
3466
|
+
Union[
|
3467
|
+
"Page",
|
3468
|
+
"Region",
|
3469
|
+
"PageCollection",
|
3470
|
+
"ElementCollection",
|
3471
|
+
List[Union["Page", "Region"]],
|
3472
|
+
]
|
3473
|
+
] = None,
|
3449
3474
|
source: str = "guides_temp",
|
3450
3475
|
cell_padding: float = 0.5,
|
3451
3476
|
include_outer_boundaries: bool = False,
|
@@ -3457,8 +3482,11 @@ class Guides:
|
|
3457
3482
|
cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
3458
3483
|
show_progress: bool = False,
|
3459
3484
|
content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
|
3485
|
+
apply_exclusions: bool = True,
|
3460
3486
|
*,
|
3461
3487
|
multi_page: Literal["auto", True, False] = "auto",
|
3488
|
+
header: Union[str, List[str], None] = "first",
|
3489
|
+
skip_repeating_headers: Optional[bool] = None,
|
3462
3490
|
) -> "TableResult":
|
3463
3491
|
"""
|
3464
3492
|
Extract table data directly from guides without leaving temporary regions.
|
@@ -3469,8 +3497,11 @@ class Guides:
|
|
3469
3497
|
3. Cleans up all temporary regions
|
3470
3498
|
4. Returns the TableResult
|
3471
3499
|
|
3500
|
+
When passed a collection (PageCollection, ElementCollection, or list), this method
|
3501
|
+
will extract tables from each element and combine them into a single result.
|
3502
|
+
|
3472
3503
|
Args:
|
3473
|
-
target: Page or
|
3504
|
+
target: Page, Region, or collection of Pages/Regions to extract from (uses self.context if None)
|
3474
3505
|
source: Source label for temporary regions (will be cleaned up)
|
3475
3506
|
cell_padding: Internal padding for cell regions in points
|
3476
3507
|
include_outer_boundaries: Whether to add boundaries at edges if missing
|
@@ -3482,7 +3513,15 @@ class Guides:
|
|
3482
3513
|
cell_extraction_func: Optional callable for custom cell text extraction
|
3483
3514
|
show_progress: Controls progress bar for text method
|
3484
3515
|
content_filter: Content filtering function or patterns
|
3516
|
+
apply_exclusions: Whether to apply exclusion regions during text extraction (default: True)
|
3485
3517
|
multi_page: Controls multi-region table creation for FlowRegions
|
3518
|
+
header: How to handle headers when extracting from collections:
|
3519
|
+
- "first": Use first row of first element as headers (default)
|
3520
|
+
- "all": Expect headers on each element, use from first element
|
3521
|
+
- None: No headers, use numeric indices
|
3522
|
+
- List[str]: Custom column names
|
3523
|
+
skip_repeating_headers: Whether to remove duplicate header rows when extracting from collections.
|
3524
|
+
Defaults to True when header is "first" or "all", False otherwise.
|
3486
3525
|
|
3487
3526
|
Returns:
|
3488
3527
|
TableResult: Extracted table data
|
@@ -3494,20 +3533,49 @@ class Guides:
|
|
3494
3533
|
```python
|
3495
3534
|
from natural_pdf.analyzers import Guides
|
3496
3535
|
|
3497
|
-
#
|
3536
|
+
# Single page extraction
|
3498
3537
|
guides = Guides.from_lines(page, source_label="detected")
|
3499
|
-
|
3500
|
-
# Extract table directly - no temporary regions left behind
|
3501
3538
|
table_data = guides.extract_table()
|
3502
|
-
|
3503
|
-
# Convert to pandas DataFrame
|
3504
3539
|
df = table_data.to_df()
|
3540
|
+
|
3541
|
+
# Multiple page extraction
|
3542
|
+
guides = Guides(pages[0])
|
3543
|
+
guides.vertical.from_content(['Column 1', 'Column 2'])
|
3544
|
+
table_result = guides.extract_table(pages, header=['Col1', 'Col2'])
|
3545
|
+
df = table_result.to_df()
|
3546
|
+
|
3547
|
+
# Region collection extraction
|
3548
|
+
regions = pdf.find_all('region[type=table]')
|
3549
|
+
guides = Guides(regions[0])
|
3550
|
+
guides.vertical.from_lines(n=3)
|
3551
|
+
table_result = guides.extract_table(regions)
|
3505
3552
|
```
|
3506
3553
|
"""
|
3507
|
-
|
3508
|
-
|
3554
|
+
from natural_pdf.core.page_collection import PageCollection
|
3555
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
3556
|
+
|
3557
|
+
target_obj = target if target is not None else self.context
|
3558
|
+
if target_obj is None:
|
3509
3559
|
raise ValueError("No target object available. Provide target parameter or context.")
|
3510
3560
|
|
3561
|
+
# Check if target is a collection - if so, delegate to _extract_table_from_collection
|
3562
|
+
if isinstance(target_obj, (PageCollection, ElementCollection, list)):
|
3563
|
+
# For collections, pass through most parameters as-is
|
3564
|
+
return self._extract_table_from_collection(
|
3565
|
+
elements=target_obj,
|
3566
|
+
header=header,
|
3567
|
+
skip_repeating_headers=skip_repeating_headers,
|
3568
|
+
method=method,
|
3569
|
+
table_settings=table_settings,
|
3570
|
+
use_ocr=use_ocr,
|
3571
|
+
ocr_config=ocr_config,
|
3572
|
+
text_options=text_options,
|
3573
|
+
cell_extraction_func=cell_extraction_func,
|
3574
|
+
show_progress=show_progress,
|
3575
|
+
content_filter=content_filter,
|
3576
|
+
apply_exclusions=apply_exclusions,
|
3577
|
+
)
|
3578
|
+
|
3511
3579
|
# Get the page for cleanup later
|
3512
3580
|
if hasattr(target_obj, "x0") and hasattr(target_obj, "top"): # Region
|
3513
3581
|
page = target_obj._page
|
@@ -3552,6 +3620,7 @@ class Guides:
|
|
3552
3620
|
cell_extraction_func=cell_extraction_func,
|
3553
3621
|
show_progress=show_progress,
|
3554
3622
|
content_filter=content_filter,
|
3623
|
+
apply_exclusions=apply_exclusions,
|
3555
3624
|
)
|
3556
3625
|
|
3557
3626
|
return table_result
|
@@ -3577,6 +3646,165 @@ class Guides:
|
|
3577
3646
|
except Exception as cleanup_err:
|
3578
3647
|
logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
|
3579
3648
|
|
3649
|
+
def _extract_table_from_collection(
    self,
    elements: Union["PageCollection", "ElementCollection", List[Union["Page", "Region"]]],
    header: Union[str, List[str], None] = "first",
    skip_repeating_headers: Optional[bool] = None,
    method: Optional[str] = None,
    table_settings: Optional[dict] = None,
    use_ocr: bool = False,
    ocr_config: Optional[dict] = None,
    text_options: Optional[Dict] = None,
    cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
    show_progress: bool = True,
    content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
    apply_exclusions: bool = True,
) -> "TableResult":
    """
    Extract tables from multiple pages or regions using this guide pattern.

    A fresh ``Guides`` object is built for every element. Static guide positions
    are copied from this instance; dynamic guides (those exposing a non-None
    ``_callable``) are re-evaluated against each element. The rows of all
    elements are concatenated into a single TableResult.

    Args:
        elements: PageCollection, ElementCollection, or list of Pages/Regions to extract from
        header: How to handle headers:
            - "first": Use the first extracted row as headers (default)
            - "all": Expect headers on each element, use the first one found
            - None: No headers, rows are returned as-is
            - List[str]: Custom column names
            For "first" and "all" the header is taken from the first element that
            actually yields rows, so an empty leading element does not lose it.
        skip_repeating_headers: Whether to remove repeated header rows from the data.
            Defaults to True when header is "first", "all" or a list of names,
            False when header is None. When enabled:
            - "all": the first row of every element after the header source is dropped
            - "first" / List[str]: an element's first row is dropped only if it is
              equal to the header row
        method: Table extraction method (passed to extract_table)
        table_settings: Settings for pdfplumber table extraction
        use_ocr: Whether to use OCR for text extraction
        ocr_config: OCR configuration parameters
        text_options: Dictionary of options for the 'text' method
        cell_extraction_func: Optional callable for custom cell text extraction
        show_progress: Show progress bar for multi-element extraction (default: True)
        content_filter: Content filtering function or patterns
        apply_exclusions: Whether to apply exclusion regions during extraction

    Returns:
        TableResult: Combined table data from all elements. For "first"/"all" the
        detected header row is the first row of the result. For a custom header
        list only the data rows are returned.

    Example:
        ```python
        # Create guide with static vertical, dynamic horizontal
        guide = Guides(regions[0])
        guide.vertical.from_content(columns, outer="last")
        guide.horizontal.from_content(lambda r: r.find_all('text:starts-with(NF-)'))

        # Extract from all regions
        table_result = guide._extract_table_from_collection(regions, header=columns)
        df = table_result.to_df()
        ```
    """
    from natural_pdf.tables.result import TableResult

    # Materialise once so we can test for emptiness and take len() for the progress bar.
    element_list = list(elements)
    if not element_list:
        return TableResult([])

    detect_header = isinstance(header, str) and header in ("first", "all")
    custom_header = list(header) if isinstance(header, list) else None

    if skip_repeating_headers is None:
        skip_repeating_headers = detect_header or custom_header is not None

    all_rows = []
    # For custom names the header is known up front; otherwise it is discovered below.
    header_row = custom_header

    # Configure progress bar (best effort: silently fall back when tqdm is missing)
    iterator = element_list
    if show_progress and len(element_list) > 1:
        try:
            from tqdm.auto import tqdm

            iterator = tqdm(
                element_list, desc="Extracting tables from elements", unit="element"
            )
        except ImportError:
            pass

    for element in iterator:
        # Create a new Guides object for this element
        element_guide = Guides(element)

        # Vertical guides: re-evaluate if dynamic, otherwise copy static positions
        vertical_callable = getattr(self.vertical, "_callable", None)
        if vertical_callable is not None:
            element_guide.vertical.from_content(vertical_callable(element))
        else:
            element_guide.vertical.data = self.vertical.data.copy()

        # Horizontal guides: same treatment
        horizontal_callable = getattr(self.horizontal, "_callable", None)
        if horizontal_callable is not None:
            element_guide.horizontal.from_content(horizontal_callable(element))
        else:
            element_guide.horizontal.data = self.horizontal.data.copy()

        # Extract table from this element
        table_result = element_guide.extract_table(
            method=method,
            table_settings=table_settings,
            use_ocr=use_ocr,
            ocr_config=ocr_config,
            text_options=text_options,
            cell_extraction_func=cell_extraction_func,
            show_progress=False,  # Don't show nested progress
            content_filter=content_filter,
            apply_exclusions=apply_exclusions,
        )

        rows = list(table_result)
        if not rows:
            continue

        if detect_header and header_row is None:
            # First element that produced rows: its first row becomes the header.
            # Keying on "header not yet found" instead of the element index keeps
            # the header when leading elements are empty.
            header_row = rows[0]
            rows = rows[1:]
        elif skip_repeating_headers:
            if header == "all":
                # Every element is expected to start with a header row.
                rows = rows[1:]
            elif header_row is not None and rows[0] == header_row:
                # "first" / custom names: only drop an exact repeat of the header.
                rows = rows[1:]

        all_rows.extend(rows)

    if detect_header and header_row is not None:
        # Prepend discovered header
        return TableResult([header_row] + all_rows)

    # No header, or custom names.
    # NOTE(review): custom names are not embedded in the result (unchanged from the
    # previous behaviour); confirm whether callers are expected to supply them again
    # when converting the result, or whether they should be prepended here.
    return TableResult(all_rows)
|
3807
|
+
|
3580
3808
|
def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
|
3581
3809
|
"""Determines if a FlowRegion's constituent parts are arranged vertically or horizontally."""
|
3582
3810
|
if not self.is_flow_region or len(self.context.constituent_regions) < 2:
|
@@ -939,6 +939,11 @@ class ElementManager:
|
|
939
939
|
self.load_elements()
|
940
940
|
return self._elements.get("chars", [])
|
941
941
|
|
942
|
+
def invalidate_cache(self):
    """Discard the cached elements so they are loaded again on next access."""
    # Resetting the store to None marks the cache as not loaded.
    self._elements = None
    page_number = self._page.number
    logger.debug(f"Page {page_number}: ElementManager cache invalidated")
|
946
|
+
|
942
947
|
@property
|
943
948
|
def words(self):
|
944
949
|
"""Get all word elements."""
|