natural-pdf 0.2.5__tar.gz → 0.2.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/.gitignore +1 -0
- {natural_pdf-0.2.5/natural_pdf.egg-info → natural_pdf-0.2.8}/PKG-INFO +1 -1
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/guides.py +94 -42
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/core/page.py +224 -62
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/core/page_collection.py +261 -50
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/core/page_groupby.py +20 -2
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/core/pdf.py +17 -14
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/core/render_spec.py +20 -5
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/describe/base.py +1 -1
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/describe/elements.py +1 -1
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/elements/base.py +84 -8
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/elements/element_collection.py +757 -20
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/elements/region.py +181 -48
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/flows/flow.py +3 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/selectors/parser.py +2 -2
- natural_pdf-0.2.8/natural_pdf/utils/color_utils.py +100 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8/natural_pdf.egg-info}/PKG-INFO +1 -1
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf.egg-info/SOURCES.txt +32 -1
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf.egg-info/top_level.txt +1 -0
- natural_pdf-0.2.8/tests/test_color_hex_display.py +195 -0
- natural_pdf-0.2.8/tests/test_crop_enhancements.py +149 -0
- natural_pdf-0.2.8/tests/test_crop_region_highlights.py +119 -0
- natural_pdf-0.2.8/tests/test_dissolve.py +471 -0
- natural_pdf-0.2.8/tests/test_dissolve_cross_page_bug.py +155 -0
- natural_pdf-0.2.8/tests/test_dissolve_debug_issue.py +195 -0
- natural_pdf-0.2.8/tests/test_dissolve_real_world_issue.py +201 -0
- natural_pdf-0.2.8/tests/test_dissolve_single_elements.py +159 -0
- natural_pdf-0.2.8/tests/test_dissolve_vertical_offset_issue.py +139 -0
- natural_pdf-0.2.8/tests/test_element_addition.py +176 -0
- natural_pdf-0.2.8/tests/test_element_collection_show_cols.py +132 -0
- natural_pdf-0.2.8/tests/test_empty_pseudo_class.py +215 -0
- natural_pdf-0.2.8/tests/test_fix_get_sections_zero_height.py +122 -0
- natural_pdf-0.2.8/tests/test_get_sections_fix_comprehensive.py +186 -0
- natural_pdf-0.2.8/tests/test_get_sections_zero_height.py +179 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_guides_extract_table.py +1 -0
- natural_pdf-0.2.5/tests/test_guides_extract_table_from_pages.py → natural_pdf-0.2.8/tests/test_guides_extract_table_collections.py +80 -57
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_guides_extract_table_exclusions.py +41 -40
- natural_pdf-0.2.8/tests/test_highlight_detection.py +40 -0
- natural_pdf-0.2.8/tests/test_highlight_detection_comprehensive.py +94 -0
- natural_pdf-0.2.8/tests/test_include_boundaries_comprehensive.py +124 -0
- natural_pdf-0.2.8/tests/test_include_boundaries_debug.py +67 -0
- natural_pdf-0.2.8/tests/test_include_boundaries_final.py +159 -0
- natural_pdf-0.2.8/tests/test_include_boundaries_final_verification.py +126 -0
- natural_pdf-0.2.8/tests/test_include_boundaries_fix.py +126 -0
- natural_pdf-0.2.8/tests/test_include_boundaries_mock.py +199 -0
- natural_pdf-0.2.8/tests/test_include_boundaries_simple.py +119 -0
- natural_pdf-0.2.8/tests/test_include_boundaries_types_pdf.py +113 -0
- natural_pdf-0.2.8/tests/test_include_boundaries_verification.py +134 -0
- natural_pdf-0.2.8/tests/test_include_boundaries_with_real_text.py +104 -0
- natural_pdf-0.2.8/tests/test_merge_connected.py +302 -0
- natural_pdf-0.2.8/tests/test_merge_connected_real_world.py +240 -0
- natural_pdf-0.2.8/tests/test_merge_method.py +187 -0
- natural_pdf-0.2.8/tests/test_sections_with_start_and_end.py +107 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_slice_cache_reuse.py +70 -52
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_slice_exclusion_fix.py +37 -34
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_slice_exclusion_issue.py +22 -16
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_slice_exclusion_mock.py +56 -56
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_sliced_collection_exclusions.py +50 -42
- natural_pdf-0.2.5/tests/test_highlight_detection.py +0 -11
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/.github/workflows/ci.yml +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/.github/workflows/nightly-tutorials.yml +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/CLAUDE.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/LICENSE +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/MANIFEST.in +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/README.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/audit_packaging.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/check_run_md.sh +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/api/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/describe/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/extracting-clean-text/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/fix-messy-tables/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/fix-messy-tables/table_1.csv +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/fix-messy-tables/table_2.csv +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/fix-messy-tables/table_3.csv +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/installation/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/ocr/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/process-forms-and-invoices/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/quick-reference/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/reflowing-pages/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/regions/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tables/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/tutorials/14-categorizing-documents.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/mkdocs.yml +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/classification/manager.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/cli.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/collections/mixins.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/core/element_manager.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/core/pdf_collection.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/describe/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/describe/mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/describe/summary.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/elements/image.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/exporters/hocr.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/exporters/original_pdf.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/flows/element.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/flows/region.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/qa/qa_result.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/search/lancedb_search_service.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/search/numpy_search_service.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/tables/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/tables/result.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/text_mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/utils/bidi_mirror.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/utils/layout.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/vision/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/vision/mixin.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/vision/results.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/vision/similarity.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf.egg-info/entry_points.txt +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/natural_pdf.egg-info/requires.txt +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/noxfile.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/optimization/memory_comparison.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/optimization/pdf_analyzer.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/optimization/performance_analysis.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/optimization/performance_results/image_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/optimization/performance_results/text_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/optimization/test_cleanup_methods.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/optimization/test_memory_fix.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/publish.sh +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/pyproject.toml +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/sample-screen.png +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/setup.cfg +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/conftest.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_annotate.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_arabic_performance.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_arabic_real_world.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_color_conversion.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_core/test_containment_geometry.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_core/test_text_layer.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_directional_defaults.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_document_qa.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_element_collection_slicing.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_element_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_exclusions.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_expand.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_extraction_error.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_extraction_mixin_fix.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_extraction_text_and_vision.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_extraction_working.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_find_similar.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_first_last_selectors.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_flow_region_directional.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_groupby.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_guides.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_guides_apply_exclusions.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_guides_apply_exclusions_simple.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_guides_extract_table_real.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_guides_integration.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_highlight_protocol.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_highlight_protocol_simple.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_highlight_regions.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_multi_page_table_discovery.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_page_exclusion_lists.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_region_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_region_viewer.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_sections_end_only.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_show_column_layout.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_show_edge_cases.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_show_exclusions.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_show_exclusions_feature.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_show_limit.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_skip_repeating_headers_multipage.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_strikethrough_detection.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_table_result_header_mismatch.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_table_result_keep_blank.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_tiny_text_tables.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_tiny_text_tables_table.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_tutorials.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_underline_detection.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tests/test_update_text.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/todo/bad_pdf_analysis.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/todo/evaluation.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/README.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/__init__.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/analyser.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/collate_summaries.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/eval_suite.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/llm_enrich.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/reporter.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/tools/bad_pdf_eval/utils.py +0 -0
- {natural_pdf-0.2.5 → natural_pdf-0.2.8}/uv.lock +0 -0
@@ -3462,7 +3462,15 @@ class Guides:
|
|
3462
3462
|
|
3463
3463
|
def extract_table(
|
3464
3464
|
self,
|
3465
|
-
target: Optional[
|
3465
|
+
target: Optional[
|
3466
|
+
Union[
|
3467
|
+
"Page",
|
3468
|
+
"Region",
|
3469
|
+
"PageCollection",
|
3470
|
+
"ElementCollection",
|
3471
|
+
List[Union["Page", "Region"]],
|
3472
|
+
]
|
3473
|
+
] = None,
|
3466
3474
|
source: str = "guides_temp",
|
3467
3475
|
cell_padding: float = 0.5,
|
3468
3476
|
include_outer_boundaries: bool = False,
|
@@ -3477,6 +3485,8 @@ class Guides:
|
|
3477
3485
|
apply_exclusions: bool = True,
|
3478
3486
|
*,
|
3479
3487
|
multi_page: Literal["auto", True, False] = "auto",
|
3488
|
+
header: Union[str, List[str], None] = "first",
|
3489
|
+
skip_repeating_headers: Optional[bool] = None,
|
3480
3490
|
) -> "TableResult":
|
3481
3491
|
"""
|
3482
3492
|
Extract table data directly from guides without leaving temporary regions.
|
@@ -3487,8 +3497,11 @@ class Guides:
|
|
3487
3497
|
3. Cleans up all temporary regions
|
3488
3498
|
4. Returns the TableResult
|
3489
3499
|
|
3500
|
+
When passed a collection (PageCollection, ElementCollection, or list), this method
|
3501
|
+
will extract tables from each element and combine them into a single result.
|
3502
|
+
|
3490
3503
|
Args:
|
3491
|
-
target: Page or
|
3504
|
+
target: Page, Region, or collection of Pages/Regions to extract from (uses self.context if None)
|
3492
3505
|
source: Source label for temporary regions (will be cleaned up)
|
3493
3506
|
cell_padding: Internal padding for cell regions in points
|
3494
3507
|
include_outer_boundaries: Whether to add boundaries at edges if missing
|
@@ -3502,6 +3515,13 @@ class Guides:
|
|
3502
3515
|
content_filter: Content filtering function or patterns
|
3503
3516
|
apply_exclusions: Whether to apply exclusion regions during text extraction (default: True)
|
3504
3517
|
multi_page: Controls multi-region table creation for FlowRegions
|
3518
|
+
header: How to handle headers when extracting from collections:
|
3519
|
+
- "first": Use first row of first element as headers (default)
|
3520
|
+
- "all": Expect headers on each element, use from first element
|
3521
|
+
- None: No headers, use numeric indices
|
3522
|
+
- List[str]: Custom column names
|
3523
|
+
skip_repeating_headers: Whether to remove duplicate header rows when extracting from collections.
|
3524
|
+
Defaults to True when header is "first" or "all", False otherwise.
|
3505
3525
|
|
3506
3526
|
Returns:
|
3507
3527
|
TableResult: Extracted table data
|
@@ -3513,20 +3533,49 @@ class Guides:
|
|
3513
3533
|
```python
|
3514
3534
|
from natural_pdf.analyzers import Guides
|
3515
3535
|
|
3516
|
-
#
|
3536
|
+
# Single page extraction
|
3517
3537
|
guides = Guides.from_lines(page, source_label="detected")
|
3518
|
-
|
3519
|
-
# Extract table directly - no temporary regions left behind
|
3520
3538
|
table_data = guides.extract_table()
|
3521
|
-
|
3522
|
-
# Convert to pandas DataFrame
|
3523
3539
|
df = table_data.to_df()
|
3540
|
+
|
3541
|
+
# Multiple page extraction
|
3542
|
+
guides = Guides(pages[0])
|
3543
|
+
guides.vertical.from_content(['Column 1', 'Column 2'])
|
3544
|
+
table_result = guides.extract_table(pages, header=['Col1', 'Col2'])
|
3545
|
+
df = table_result.to_df()
|
3546
|
+
|
3547
|
+
# Region collection extraction
|
3548
|
+
regions = pdf.find_all('region[type=table]')
|
3549
|
+
guides = Guides(regions[0])
|
3550
|
+
guides.vertical.from_lines(n=3)
|
3551
|
+
table_result = guides.extract_table(regions)
|
3524
3552
|
```
|
3525
3553
|
"""
|
3526
|
-
|
3527
|
-
|
3554
|
+
from natural_pdf.core.page_collection import PageCollection
|
3555
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
3556
|
+
|
3557
|
+
target_obj = target if target is not None else self.context
|
3558
|
+
if target_obj is None:
|
3528
3559
|
raise ValueError("No target object available. Provide target parameter or context.")
|
3529
3560
|
|
3561
|
+
# Check if target is a collection - if so, delegate to _extract_table_from_collection
|
3562
|
+
if isinstance(target_obj, (PageCollection, ElementCollection, list)):
|
3563
|
+
# For collections, pass through most parameters as-is
|
3564
|
+
return self._extract_table_from_collection(
|
3565
|
+
elements=target_obj,
|
3566
|
+
header=header,
|
3567
|
+
skip_repeating_headers=skip_repeating_headers,
|
3568
|
+
method=method,
|
3569
|
+
table_settings=table_settings,
|
3570
|
+
use_ocr=use_ocr,
|
3571
|
+
ocr_config=ocr_config,
|
3572
|
+
text_options=text_options,
|
3573
|
+
cell_extraction_func=cell_extraction_func,
|
3574
|
+
show_progress=show_progress,
|
3575
|
+
content_filter=content_filter,
|
3576
|
+
apply_exclusions=apply_exclusions,
|
3577
|
+
)
|
3578
|
+
|
3530
3579
|
# Get the page for cleanup later
|
3531
3580
|
if hasattr(target_obj, "x0") and hasattr(target_obj, "top"): # Region
|
3532
3581
|
page = target_obj._page
|
@@ -3597,9 +3646,9 @@ class Guides:
|
|
3597
3646
|
except Exception as cleanup_err:
|
3598
3647
|
logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
|
3599
3648
|
|
3600
|
-
def
|
3649
|
+
def _extract_table_from_collection(
|
3601
3650
|
self,
|
3602
|
-
|
3651
|
+
elements: Union["PageCollection", "ElementCollection", List[Union["Page", "Region"]]],
|
3603
3652
|
header: Union[str, List[str], None] = "first",
|
3604
3653
|
skip_repeating_headers: Optional[bool] = None,
|
3605
3654
|
method: Optional[str] = None,
|
@@ -3613,17 +3662,17 @@ class Guides:
|
|
3613
3662
|
apply_exclusions: bool = True,
|
3614
3663
|
) -> "TableResult":
|
3615
3664
|
"""
|
3616
|
-
Extract tables from multiple pages using this guide pattern.
|
3665
|
+
Extract tables from multiple pages or regions using this guide pattern.
|
3617
3666
|
|
3618
|
-
This method applies the guide to each
|
3667
|
+
This method applies the guide to each element, extracts tables, and combines
|
3619
3668
|
them into a single TableResult. Dynamic guides (using lambdas) are evaluated
|
3620
|
-
for each
|
3669
|
+
for each element.
|
3621
3670
|
|
3622
3671
|
Args:
|
3623
|
-
|
3672
|
+
elements: PageCollection, ElementCollection, or list of Pages/Regions to extract from
|
3624
3673
|
header: How to handle headers:
|
3625
|
-
- "first": Use first row of first
|
3626
|
-
- "all": Expect headers on each
|
3674
|
+
- "first": Use first row of first element as headers (default)
|
3675
|
+
- "all": Expect headers on each element, use from first element
|
3627
3676
|
- None: No headers, use numeric indices
|
3628
3677
|
- List[str]: Custom column names
|
3629
3678
|
skip_repeating_headers: Whether to remove duplicate header rows.
|
@@ -3634,35 +3683,36 @@ class Guides:
|
|
3634
3683
|
ocr_config: OCR configuration parameters
|
3635
3684
|
text_options: Dictionary of options for the 'text' method
|
3636
3685
|
cell_extraction_func: Optional callable for custom cell text extraction
|
3637
|
-
show_progress: Show progress bar for multi-
|
3686
|
+
show_progress: Show progress bar for multi-element extraction (default: True)
|
3638
3687
|
content_filter: Content filtering function or patterns
|
3639
3688
|
apply_exclusions: Whether to apply exclusion regions during extraction
|
3640
3689
|
|
3641
3690
|
Returns:
|
3642
|
-
TableResult: Combined table data from all
|
3691
|
+
TableResult: Combined table data from all elements
|
3643
3692
|
|
3644
3693
|
Example:
|
3645
3694
|
```python
|
3646
3695
|
# Create guide with static vertical, dynamic horizontal
|
3647
|
-
guide = Guides(
|
3696
|
+
guide = Guides(regions[0])
|
3648
3697
|
guide.vertical.from_content(columns, outer="last")
|
3649
|
-
guide.horizontal.from_content(lambda
|
3698
|
+
guide.horizontal.from_content(lambda r: r.find_all('text:starts-with(NF-)'))
|
3650
3699
|
|
3651
|
-
# Extract from all
|
3652
|
-
table_result = guide.
|
3700
|
+
# Extract from all regions
|
3701
|
+
table_result = guide._extract_table_from_collection(regions, header=columns)
|
3653
3702
|
df = table_result.to_df()
|
3654
3703
|
```
|
3655
3704
|
"""
|
3656
3705
|
from natural_pdf.core.page_collection import PageCollection
|
3706
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
3657
3707
|
from natural_pdf.tables.result import TableResult
|
3658
3708
|
|
3659
|
-
# Convert to list if it's a
|
3660
|
-
if isinstance(
|
3661
|
-
|
3709
|
+
# Convert to list if it's a collection
|
3710
|
+
if isinstance(elements, (PageCollection, ElementCollection)):
|
3711
|
+
element_list = list(elements)
|
3662
3712
|
else:
|
3663
|
-
|
3713
|
+
element_list = elements
|
3664
3714
|
|
3665
|
-
if not
|
3715
|
+
if not element_list:
|
3666
3716
|
return TableResult([])
|
3667
3717
|
|
3668
3718
|
# Determine header handling
|
@@ -3673,37 +3723,39 @@ class Guides:
|
|
3673
3723
|
header_row = None
|
3674
3724
|
|
3675
3725
|
# Configure progress bar
|
3676
|
-
iterator =
|
3677
|
-
if show_progress and len(
|
3726
|
+
iterator = element_list
|
3727
|
+
if show_progress and len(element_list) > 1:
|
3678
3728
|
try:
|
3679
3729
|
from tqdm.auto import tqdm
|
3680
3730
|
|
3681
|
-
iterator = tqdm(
|
3731
|
+
iterator = tqdm(
|
3732
|
+
element_list, desc="Extracting tables from elements", unit="element"
|
3733
|
+
)
|
3682
3734
|
except ImportError:
|
3683
3735
|
pass
|
3684
3736
|
|
3685
|
-
for i,
|
3686
|
-
# Create a new Guides object for this
|
3687
|
-
|
3737
|
+
for i, element in enumerate(iterator):
|
3738
|
+
# Create a new Guides object for this element
|
3739
|
+
element_guide = Guides(element)
|
3688
3740
|
|
3689
3741
|
# Copy vertical guides (usually static)
|
3690
3742
|
if hasattr(self.vertical, "_callable") and self.vertical._callable is not None:
|
3691
3743
|
# If vertical is dynamic (lambda), evaluate it
|
3692
|
-
|
3744
|
+
element_guide.vertical.from_content(self.vertical._callable(element))
|
3693
3745
|
else:
|
3694
3746
|
# Copy static vertical positions
|
3695
|
-
|
3747
|
+
element_guide.vertical.data = self.vertical.data.copy()
|
3696
3748
|
|
3697
3749
|
# Handle horizontal guides
|
3698
3750
|
if hasattr(self.horizontal, "_callable") and self.horizontal._callable is not None:
|
3699
3751
|
# If horizontal is dynamic (lambda), evaluate it
|
3700
|
-
|
3752
|
+
element_guide.horizontal.from_content(self.horizontal._callable(element))
|
3701
3753
|
else:
|
3702
3754
|
# Copy static horizontal positions
|
3703
|
-
|
3755
|
+
element_guide.horizontal.data = self.horizontal.data.copy()
|
3704
3756
|
|
3705
|
-
# Extract table from this
|
3706
|
-
table_result =
|
3757
|
+
# Extract table from this element
|
3758
|
+
table_result = element_guide.extract_table(
|
3707
3759
|
method=method,
|
3708
3760
|
table_settings=table_settings,
|
3709
3761
|
use_ocr=use_ocr,
|
@@ -3719,7 +3771,7 @@ class Guides:
|
|
3719
3771
|
rows = list(table_result)
|
3720
3772
|
|
3721
3773
|
# Handle headers based on strategy
|
3722
|
-
if i == 0: # First
|
3774
|
+
if i == 0: # First element
|
3723
3775
|
if header == "first" or header == "all":
|
3724
3776
|
# Use first row as header
|
3725
3777
|
if rows:
|
@@ -3728,7 +3780,7 @@ class Guides:
|
|
3728
3780
|
elif isinstance(header, list):
|
3729
3781
|
# Custom headers provided
|
3730
3782
|
header_row = header
|
3731
|
-
else: # Subsequent
|
3783
|
+
else: # Subsequent elements
|
3732
3784
|
if header == "all" and skip_repeating_headers and rows:
|
3733
3785
|
# Expect and remove header row
|
3734
3786
|
if rows and header_row and rows[0] == header_row:
|