natural-pdf 0.2.11__tar.gz → 0.2.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.2.11/natural_pdf.egg-info → natural_pdf-0.2.12}/PKG-INFO +1 -1
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/guides.py +196 -43
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/core/page.py +56 -8
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/elements/region.py +5 -3
- {natural_pdf-0.2.11 → natural_pdf-0.2.12/natural_pdf.egg-info}/PKG-INFO +1 -1
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf.egg-info/SOURCES.txt +26 -0
- natural_pdf-0.2.12/temp/debug_cell_extraction.py +42 -0
- natural_pdf-0.2.12/temp/debug_exclusion_overlap.py +43 -0
- natural_pdf-0.2.12/temp/debug_exclusions_guides.py +67 -0
- natural_pdf-0.2.12/temp/debug_extra_guide.py +41 -0
- natural_pdf-0.2.12/temp/debug_outer_boundaries.py +46 -0
- natural_pdf-0.2.12/temp/debug_st_search.py +33 -0
- natural_pdf-0.2.12/temp/fix_page_exclusions.py +42 -0
- natural_pdf-0.2.12/temp/test_exclusion_with_debug.py +30 -0
- natural_pdf-0.2.12/temp/test_find_exclusions_fix.py +53 -0
- natural_pdf-0.2.12/temp/test_find_exclusions_fix_no_recursion.py +97 -0
- natural_pdf-0.2.12/temp/test_fix_real_pdf.py +48 -0
- natural_pdf-0.2.12/temp/test_fix_working.py +55 -0
- natural_pdf-0.2.12/temp/test_fixed_pdf_exclusions.py +67 -0
- natural_pdf-0.2.12/temp/test_horizontal_top_bottom.py +53 -0
- natural_pdf-0.2.12/temp/test_marker_order.py +45 -0
- natural_pdf-0.2.12/temp/test_original_exclusions_now_work.py +56 -0
- natural_pdf-0.2.12/temp/test_pdf_exclusions_with_guides.py +84 -0
- natural_pdf-0.2.12/temp/test_region_exclusions_detailed.py +25 -0
- natural_pdf-0.2.12/temp/test_stripes_real_pdf.py +62 -0
- natural_pdf-0.2.12/temp/test_vertical_stripes.py +55 -0
- natural_pdf-0.2.12/tests/test_element_collection_guides.py +140 -0
- natural_pdf-0.2.12/tests/test_element_exclusions.py +130 -0
- natural_pdf-0.2.12/tests/test_guides_from_stripes.py +224 -0
- natural_pdf-0.2.12/tests/test_guides_marker_sorting.py +177 -0
- natural_pdf-0.2.12/tests/test_horizontal_guides_alignment.py +203 -0
- natural_pdf-0.2.12/tests/test_pdf_exclusions_in_find_methods.py +169 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/.github/workflows/ci.yml +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/.github/workflows/nightly-tutorials.yml +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/.gitignore +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/CLAUDE.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/LICENSE +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/MANIFEST.in +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/README.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/audit_packaging.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/check_run_md.sh +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/api/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/describe/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/extracting-clean-text/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/fix-messy-tables/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/fix-messy-tables/table_1.csv +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/fix-messy-tables/table_2.csv +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/fix-messy-tables/table_3.csv +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/installation/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/ocr/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/process-forms-and-invoices/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/quick-reference/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/reflowing-pages/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/regions/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tables/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/tutorials/14-categorizing-documents.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/mkdocs.yml +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/classification/manager.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/cli.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/collections/mixins.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/core/element_manager.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/core/page_collection.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/core/page_groupby.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/core/pdf.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/core/pdf_collection.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/core/render_spec.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/describe/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/describe/base.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/describe/elements.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/describe/mixin.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/describe/summary.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/elements/base.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/elements/element_collection.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/elements/image.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/exporters/hocr.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/exporters/original_pdf.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/flows/element.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/flows/flow.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/flows/region.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/qa/qa_result.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/search/lancedb_search_service.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/search/numpy_search_service.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/selectors/parser.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/tables/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/tables/result.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/text_mixin.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/utils/bidi_mirror.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/utils/color_utils.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/utils/layout.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/vision/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/vision/mixin.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/vision/results.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/vision/similarity.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf.egg-info/entry_points.txt +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf.egg-info/requires.txt +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/natural_pdf.egg-info/top_level.txt +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/noxfile.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/optimization/memory_comparison.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/optimization/pdf_analyzer.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/optimization/performance_analysis.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/optimization/performance_results/image_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/optimization/performance_results/text_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/optimization/test_cleanup_methods.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/optimization/test_memory_fix.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/publish.sh +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/pyproject.toml +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/sample-screen.png +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/setup.cfg +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/conftest.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_annotate.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_arabic_performance.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_arabic_real_world.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_color_conversion.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_color_hex_display.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_core/test_containment_geometry.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_core/test_text_layer.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_crop_enhancements.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_crop_region_highlights.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_directional_defaults.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_dissolve.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_dissolve_cross_page_bug.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_dissolve_debug_issue.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_dissolve_real_world_issue.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_dissolve_single_elements.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_dissolve_vertical_offset_issue.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_document_qa.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_element_addition.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_element_collection_show_cols.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_element_collection_slicing.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_element_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_empty_pseudo_class.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_exclusions.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_expand.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_extraction_error.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_extraction_mixin_fix.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_extraction_text_and_vision.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_extraction_working.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_find_similar.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_first_last_selectors.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_fix_get_sections_zero_height.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_flow_region_directional.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_get_sections_fix_comprehensive.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_get_sections_zero_height.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_groupby.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_guides.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_guides_apply_exclusions.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_guides_apply_exclusions_simple.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_guides_extract_table.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_guides_extract_table_collections.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_guides_extract_table_exclusions.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_guides_extract_table_real.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_guides_integration.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_highlight_detection.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_highlight_detection_comprehensive.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_highlight_protocol.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_highlight_protocol_simple.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_highlight_regions.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_include_boundaries_comprehensive.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_include_boundaries_debug.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_include_boundaries_final.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_include_boundaries_final_verification.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_include_boundaries_fix.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_include_boundaries_mock.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_include_boundaries_simple.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_include_boundaries_types_pdf.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_include_boundaries_verification.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_include_boundaries_with_real_text.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_merge_connected.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_merge_connected_real_world.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_merge_method.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_multi_page_table_discovery.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_page_exclusion_lists.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_region_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_region_viewer.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_sections_end_only.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_sections_with_start_and_end.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_show_column_layout.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_show_edge_cases.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_show_exclusions.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_show_exclusions_feature.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_show_limit.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_skip_repeating_headers_multipage.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_slice_cache_reuse.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_slice_exclusion_fix.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_slice_exclusion_issue.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_slice_exclusion_mock.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_sliced_collection_exclusions.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_strikethrough_detection.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_table_result_header_mismatch.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_table_result_keep_blank.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_tiny_text_tables.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_tiny_text_tables_table.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_tutorials.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_underline_detection.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tests/test_update_text.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/todo/bad_pdf_analysis.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/todo/evaluation.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/README.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/__init__.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/analyser.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/collate_summaries.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/eval_suite.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/llm_enrich.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/reporter.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/tools/bad_pdf_eval/utils.py +0 -0
- {natural_pdf-0.2.11 → natural_pdf-0.2.12}/uv.lock +0 -0
@@ -185,7 +185,9 @@ class GuidesList(UserList):
|
|
185
185
|
self,
|
186
186
|
markers: Union[str, List[str], "ElementCollection", Callable, None],
|
187
187
|
obj: Optional[Union["Page", "Region", "FlowRegion"]] = None,
|
188
|
-
align:
|
188
|
+
align: Union[
|
189
|
+
Literal["left", "right", "center", "between"], Literal["top", "bottom"]
|
190
|
+
] = "left",
|
189
191
|
outer: bool = True,
|
190
192
|
tolerance: float = 5,
|
191
193
|
*,
|
@@ -203,7 +205,10 @@ class GuidesList(UserList):
|
|
203
205
|
- Callable: function that takes a page and returns markers
|
204
206
|
- None: no markers
|
205
207
|
obj: Page/Region/FlowRegion to search (uses parent's context if None)
|
206
|
-
align: How to align guides relative to found elements
|
208
|
+
align: How to align guides relative to found elements:
|
209
|
+
- For vertical guides: 'left', 'right', 'center', 'between'
|
210
|
+
- For horizontal guides: 'top', 'bottom', 'center', 'between'
|
211
|
+
- Note: 'left'/'right' also work for horizontal (mapped to top/bottom)
|
207
212
|
outer: Whether to add outer boundary guides
|
208
213
|
tolerance: Tolerance for snapping to element edges
|
209
214
|
apply_exclusions: Whether to apply exclusion zones when searching for text
|
@@ -224,19 +229,25 @@ class GuidesList(UserList):
|
|
224
229
|
self._callable = None
|
225
230
|
actual_markers = markers
|
226
231
|
|
232
|
+
# Normalize alignment for horizontal guides
|
233
|
+
if self._axis == "horizontal":
|
234
|
+
if align == "top":
|
235
|
+
align = "left"
|
236
|
+
elif align == "bottom":
|
237
|
+
align = "right"
|
238
|
+
|
227
239
|
# Check if parent is in flow mode
|
228
240
|
if self._parent.is_flow_region:
|
229
241
|
# Create guides across all constituent regions
|
230
242
|
all_guides = []
|
231
243
|
for region in self._parent.context.constituent_regions:
|
232
|
-
#
|
233
|
-
marker_texts = _normalize_markers(actual_markers, region)
|
244
|
+
# Pass markers directly - from_content will handle them properly
|
234
245
|
|
235
246
|
# Create guides for this region
|
236
247
|
region_guides = Guides.from_content(
|
237
248
|
obj=region,
|
238
249
|
axis=self._axis,
|
239
|
-
markers=
|
250
|
+
markers=actual_markers, # Pass original markers, not normalized text
|
240
251
|
align=align,
|
241
252
|
outer=outer,
|
242
253
|
tolerance=tolerance,
|
@@ -312,14 +323,14 @@ class GuidesList(UserList):
|
|
312
323
|
return self._parent
|
313
324
|
|
314
325
|
# Original single-region logic
|
315
|
-
#
|
316
|
-
|
326
|
+
# Pass markers directly to from_content which will handle them properly
|
327
|
+
# (no need to normalize here since from_content now handles ElementCollection)
|
317
328
|
|
318
329
|
# Create guides for this axis
|
319
330
|
new_guides = Guides.from_content(
|
320
331
|
obj=target_obj,
|
321
332
|
axis=self._axis,
|
322
|
-
markers=
|
333
|
+
markers=actual_markers, # Pass original markers, not normalized text
|
323
334
|
align=align,
|
324
335
|
outer=outer,
|
325
336
|
tolerance=tolerance,
|
@@ -930,6 +941,82 @@ class GuidesList(UserList):
|
|
930
941
|
self.data.clear()
|
931
942
|
return self._parent
|
932
943
|
|
944
|
+
def from_stripes(
|
945
|
+
self,
|
946
|
+
stripes=None,
|
947
|
+
color=None, # Explicitly specify stripe color
|
948
|
+
) -> "Guides":
|
949
|
+
"""Create guides from striped table rows or columns.
|
950
|
+
|
951
|
+
Creates guides at both edges of stripe elements (e.g., colored table rows).
|
952
|
+
Perfect for zebra-striped tables where you need guides at every row boundary.
|
953
|
+
|
954
|
+
Args:
|
955
|
+
stripes: Elements representing stripes. If None, auto-detects.
|
956
|
+
color: Specific color to look for (e.g., '#00ffff'). If None, finds most common.
|
957
|
+
|
958
|
+
Examples:
|
959
|
+
# Auto-detect zebra stripes
|
960
|
+
guides.horizontal.from_stripes()
|
961
|
+
|
962
|
+
# Specific color
|
963
|
+
guides.horizontal.from_stripes(color='#00ffff')
|
964
|
+
|
965
|
+
# Manual selection
|
966
|
+
stripes = page.find_all('rect[fill=#00ffff]')
|
967
|
+
guides.horizontal.from_stripes(stripes)
|
968
|
+
|
969
|
+
# Vertical stripes
|
970
|
+
guides.vertical.from_stripes(color='#e0e0e0')
|
971
|
+
|
972
|
+
Returns:
|
973
|
+
Parent Guides object for chaining
|
974
|
+
"""
|
975
|
+
from collections import defaultdict
|
976
|
+
|
977
|
+
target_obj = self._parent.context
|
978
|
+
if target_obj is None:
|
979
|
+
raise ValueError("No context available for stripe detection")
|
980
|
+
|
981
|
+
if stripes is None:
|
982
|
+
if color:
|
983
|
+
# User specified color
|
984
|
+
stripes = target_obj.find_all(f"rect[fill={color}]")
|
985
|
+
else:
|
986
|
+
# Auto-detect most common non-white fill
|
987
|
+
all_rects = target_obj.find_all("rect[fill]")
|
988
|
+
|
989
|
+
# Group by fill color
|
990
|
+
fill_counts = defaultdict(list)
|
991
|
+
for rect in all_rects:
|
992
|
+
if rect.fill and rect.fill not in ["#ffffff", "white", "none", "transparent"]:
|
993
|
+
fill_counts[rect.fill].append(rect)
|
994
|
+
|
995
|
+
if not fill_counts:
|
996
|
+
return self._parent # No stripes found
|
997
|
+
|
998
|
+
# Find most common fill color
|
999
|
+
stripes = max(fill_counts.values(), key=len)
|
1000
|
+
|
1001
|
+
if not stripes:
|
1002
|
+
return self._parent
|
1003
|
+
|
1004
|
+
# Get both edges of each stripe
|
1005
|
+
edges = []
|
1006
|
+
if self._axis == "horizontal":
|
1007
|
+
for stripe in stripes:
|
1008
|
+
edges.extend([stripe.top, stripe.bottom])
|
1009
|
+
else:
|
1010
|
+
for stripe in stripes:
|
1011
|
+
edges.extend([stripe.x0, stripe.x1])
|
1012
|
+
|
1013
|
+
# Remove duplicates and sort
|
1014
|
+
edges = sorted(set(edges))
|
1015
|
+
|
1016
|
+
# Add guides
|
1017
|
+
self.extend(edges)
|
1018
|
+
return self._parent
|
1019
|
+
|
933
1020
|
def __add__(self, other):
|
934
1021
|
"""Handle addition of GuidesList objects by returning combined data."""
|
935
1022
|
if isinstance(other, GuidesList):
|
@@ -1459,7 +1546,9 @@ class Guides:
|
|
1459
1546
|
obj: Union["Page", "Region", "FlowRegion"],
|
1460
1547
|
axis: Literal["vertical", "horizontal"] = "vertical",
|
1461
1548
|
markers: Union[str, List[str], "ElementCollection", None] = None,
|
1462
|
-
align:
|
1549
|
+
align: Union[
|
1550
|
+
Literal["left", "right", "center", "between"], Literal["top", "bottom"]
|
1551
|
+
] = "left",
|
1463
1552
|
outer: bool = True,
|
1464
1553
|
tolerance: float = 5,
|
1465
1554
|
apply_exclusions: bool = True,
|
@@ -1475,7 +1564,9 @@ class Guides:
|
|
1475
1564
|
- List[str]: list of selectors or literal text strings
|
1476
1565
|
- ElementCollection: collection of elements to extract text from
|
1477
1566
|
- None: no markers
|
1478
|
-
align: Where to place guides relative to found text
|
1567
|
+
align: Where to place guides relative to found text:
|
1568
|
+
- For vertical guides: 'left', 'right', 'center', 'between'
|
1569
|
+
- For horizontal guides: 'top', 'bottom', 'center', 'between'
|
1479
1570
|
outer: Whether to add guides at the boundaries
|
1480
1571
|
tolerance: Maximum distance to search for text
|
1481
1572
|
apply_exclusions: Whether to apply exclusion zones when searching for text
|
@@ -1483,6 +1574,13 @@ class Guides:
|
|
1483
1574
|
Returns:
|
1484
1575
|
New Guides object aligned to text content
|
1485
1576
|
"""
|
1577
|
+
# Normalize alignment for horizontal guides
|
1578
|
+
if axis == "horizontal":
|
1579
|
+
if align == "top":
|
1580
|
+
align = "left"
|
1581
|
+
elif align == "bottom":
|
1582
|
+
align = "right"
|
1583
|
+
|
1486
1584
|
# Handle FlowRegion
|
1487
1585
|
if hasattr(obj, "constituent_regions"):
|
1488
1586
|
guides = cls(context=obj)
|
@@ -1530,39 +1628,51 @@ class Guides:
|
|
1530
1628
|
elif hasattr(obj, "width"):
|
1531
1629
|
bounds = (0, 0, obj.width, obj.height)
|
1532
1630
|
|
1533
|
-
#
|
1534
|
-
|
1631
|
+
# Handle different marker types
|
1632
|
+
elements_to_process = []
|
1535
1633
|
|
1536
|
-
#
|
1537
|
-
|
1538
|
-
|
1539
|
-
|
1540
|
-
|
1541
|
-
|
1542
|
-
|
1543
|
-
|
1544
|
-
|
1545
|
-
|
1546
|
-
|
1547
|
-
|
1548
|
-
|
1549
|
-
# For between, collect left edges for processing later
|
1550
|
-
guides_coords.append(element.x0)
|
1551
|
-
else: # horizontal
|
1552
|
-
if align == "left": # top for horizontal
|
1553
|
-
guides_coords.append(element.top)
|
1554
|
-
elif align == "right": # bottom for horizontal
|
1555
|
-
guides_coords.append(element.bottom)
|
1556
|
-
elif align == "center":
|
1557
|
-
guides_coords.append((element.top + element.bottom) / 2)
|
1558
|
-
elif align == "between":
|
1559
|
-
# For between, collect top edges for processing later
|
1560
|
-
guides_coords.append(element.top)
|
1634
|
+
# Check if markers is an ElementCollection or has elements attribute
|
1635
|
+
if hasattr(markers, "elements") or hasattr(markers, "_elements"):
|
1636
|
+
# It's an ElementCollection - use elements directly
|
1637
|
+
elements_to_process = getattr(markers, "elements", getattr(markers, "_elements", []))
|
1638
|
+
elif hasattr(markers, "__iter__") and not isinstance(markers, str):
|
1639
|
+
# Check if it's an iterable of elements (not strings)
|
1640
|
+
try:
|
1641
|
+
markers_list = list(markers)
|
1642
|
+
if markers_list and hasattr(markers_list[0], "x0"):
|
1643
|
+
# It's a list of elements
|
1644
|
+
elements_to_process = markers_list
|
1645
|
+
except:
|
1646
|
+
pass
|
1561
1647
|
|
1562
|
-
|
1563
|
-
|
1564
|
-
|
1565
|
-
|
1648
|
+
if elements_to_process:
|
1649
|
+
# Process elements directly without text search
|
1650
|
+
for element in elements_to_process:
|
1651
|
+
if axis == "vertical":
|
1652
|
+
if align == "left":
|
1653
|
+
guides_coords.append(element.x0)
|
1654
|
+
elif align == "right":
|
1655
|
+
guides_coords.append(element.x1)
|
1656
|
+
elif align == "center":
|
1657
|
+
guides_coords.append((element.x0 + element.x1) / 2)
|
1658
|
+
elif align == "between":
|
1659
|
+
# For between, collect left edges for processing later
|
1660
|
+
guides_coords.append(element.x0)
|
1661
|
+
else: # horizontal
|
1662
|
+
if align == "left": # top for horizontal
|
1663
|
+
guides_coords.append(element.top)
|
1664
|
+
elif align == "right": # bottom for horizontal
|
1665
|
+
guides_coords.append(element.bottom)
|
1666
|
+
elif align == "center":
|
1667
|
+
guides_coords.append((element.top + element.bottom) / 2)
|
1668
|
+
elif align == "between":
|
1669
|
+
# For between, collect top edges for processing later
|
1670
|
+
guides_coords.append(element.top)
|
1671
|
+
else:
|
1672
|
+
# Fall back to text-based search
|
1673
|
+
marker_texts = _normalize_markers(markers, obj)
|
1674
|
+
|
1675
|
+
# Find each marker and determine guide position
|
1566
1676
|
for marker in marker_texts:
|
1567
1677
|
if hasattr(obj, "find"):
|
1568
1678
|
element = obj.find(
|
@@ -1570,9 +1680,52 @@ class Guides:
|
|
1570
1680
|
)
|
1571
1681
|
if element:
|
1572
1682
|
if axis == "vertical":
|
1573
|
-
|
1683
|
+
if align == "left":
|
1684
|
+
guides_coords.append(element.x0)
|
1685
|
+
elif align == "right":
|
1686
|
+
guides_coords.append(element.x1)
|
1687
|
+
elif align == "center":
|
1688
|
+
guides_coords.append((element.x0 + element.x1) / 2)
|
1689
|
+
elif align == "between":
|
1690
|
+
# For between, collect left edges for processing later
|
1691
|
+
guides_coords.append(element.x0)
|
1574
1692
|
else: # horizontal
|
1575
|
-
|
1693
|
+
if align == "left": # top for horizontal
|
1694
|
+
guides_coords.append(element.top)
|
1695
|
+
elif align == "right": # bottom for horizontal
|
1696
|
+
guides_coords.append(element.bottom)
|
1697
|
+
elif align == "center":
|
1698
|
+
guides_coords.append((element.top + element.bottom) / 2)
|
1699
|
+
elif align == "between":
|
1700
|
+
# For between, collect top edges for processing later
|
1701
|
+
guides_coords.append(element.top)
|
1702
|
+
|
1703
|
+
# Handle 'between' alignment - find midpoints between adjacent markers
|
1704
|
+
if align == "between" and len(guides_coords) >= 2:
|
1705
|
+
# We need to get the right and left edges of each marker
|
1706
|
+
marker_bounds = []
|
1707
|
+
|
1708
|
+
if elements_to_process:
|
1709
|
+
# Use elements directly
|
1710
|
+
for element in elements_to_process:
|
1711
|
+
if axis == "vertical":
|
1712
|
+
marker_bounds.append((element.x0, element.x1))
|
1713
|
+
else: # horizontal
|
1714
|
+
marker_bounds.append((element.top, element.bottom))
|
1715
|
+
else:
|
1716
|
+
# Fall back to text search
|
1717
|
+
if "marker_texts" not in locals():
|
1718
|
+
marker_texts = _normalize_markers(markers, obj)
|
1719
|
+
for marker in marker_texts:
|
1720
|
+
if hasattr(obj, "find"):
|
1721
|
+
element = obj.find(
|
1722
|
+
f'text:contains("{marker}")', apply_exclusions=apply_exclusions
|
1723
|
+
)
|
1724
|
+
if element:
|
1725
|
+
if axis == "vertical":
|
1726
|
+
marker_bounds.append((element.x0, element.x1))
|
1727
|
+
else: # horizontal
|
1728
|
+
marker_bounds.append((element.top, element.bottom))
|
1576
1729
|
|
1577
1730
|
# Sort markers by their left edge (or top edge for horizontal)
|
1578
1731
|
marker_bounds.sort(key=lambda x: x[0])
|
@@ -815,11 +815,38 @@ class Page(
|
|
815
815
|
if debug:
|
816
816
|
print(f" ✗ Empty iterable returned from callable '{label}'")
|
817
817
|
elif region_result:
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
if
|
822
|
-
|
818
|
+
# Check if it's a single Element that can be converted to a Region
|
819
|
+
from natural_pdf.elements.base import Element
|
820
|
+
|
821
|
+
if isinstance(region_result, Element) or (
|
822
|
+
hasattr(region_result, "bbox") and hasattr(region_result, "expand")
|
823
|
+
):
|
824
|
+
try:
|
825
|
+
# Convert Element to Region using expand()
|
826
|
+
expanded_region = region_result.expand()
|
827
|
+
if isinstance(expanded_region, Region):
|
828
|
+
expanded_region.label = label
|
829
|
+
regions.append(expanded_region)
|
830
|
+
if debug:
|
831
|
+
print(
|
832
|
+
f" ✓ Converted Element to Region from callable '{label}': {expanded_region}"
|
833
|
+
)
|
834
|
+
else:
|
835
|
+
if debug:
|
836
|
+
print(
|
837
|
+
f" ✗ Element.expand() did not return a Region: {type(expanded_region)}"
|
838
|
+
)
|
839
|
+
except Exception as e:
|
840
|
+
if debug:
|
841
|
+
print(f" ✗ Failed to convert Element to Region: {e}")
|
842
|
+
else:
|
843
|
+
logger.warning(
|
844
|
+
f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping."
|
845
|
+
)
|
846
|
+
if debug:
|
847
|
+
print(
|
848
|
+
f" ✗ Callable returned non-Region/None: {type(region_result)}"
|
849
|
+
)
|
823
850
|
else:
|
824
851
|
if debug:
|
825
852
|
print(
|
@@ -839,6 +866,27 @@ class Page(
|
|
839
866
|
if debug:
|
840
867
|
print(f" - Added direct region '{label}': {exclusion_item}")
|
841
868
|
|
869
|
+
# Process direct Element objects - convert to Region
|
870
|
+
elif hasattr(exclusion_item, "bbox") and hasattr(exclusion_item, "expand"):
|
871
|
+
try:
|
872
|
+
# Convert Element to Region using expand()
|
873
|
+
expanded_region = exclusion_item.expand()
|
874
|
+
if isinstance(expanded_region, Region):
|
875
|
+
expanded_region.label = label
|
876
|
+
regions.append(expanded_region)
|
877
|
+
if debug:
|
878
|
+
print(
|
879
|
+
f" - Converted direct Element to Region '{label}': {expanded_region}"
|
880
|
+
)
|
881
|
+
else:
|
882
|
+
if debug:
|
883
|
+
print(
|
884
|
+
f" - Element.expand() did not return a Region: {type(expanded_region)}"
|
885
|
+
)
|
886
|
+
except Exception as e:
|
887
|
+
if debug:
|
888
|
+
print(f" - Failed to convert Element to Region: {e}")
|
889
|
+
|
842
890
|
# Process string selectors (from PDF-level exclusions)
|
843
891
|
elif isinstance(exclusion_item, str):
|
844
892
|
selector_str = exclusion_item
|
@@ -1081,7 +1129,7 @@ class Page(
|
|
1081
1129
|
) # _apply_selector doesn't filter
|
1082
1130
|
|
1083
1131
|
# Filter the results based on exclusions if requested
|
1084
|
-
if apply_exclusions and
|
1132
|
+
if apply_exclusions and results_collection:
|
1085
1133
|
filtered_elements = self._filter_elements_by_exclusions(results_collection.elements)
|
1086
1134
|
# Return the first element from the filtered list
|
1087
1135
|
return filtered_elements[0] if filtered_elements else None
|
@@ -1176,7 +1224,7 @@ class Page(
|
|
1176
1224
|
) # _apply_selector doesn't filter
|
1177
1225
|
|
1178
1226
|
# Filter the results based on exclusions if requested
|
1179
|
-
if apply_exclusions and
|
1227
|
+
if apply_exclusions and results_collection:
|
1180
1228
|
filtered_elements = self._filter_elements_by_exclusions(results_collection.elements)
|
1181
1229
|
return ElementCollection(filtered_elements)
|
1182
1230
|
else:
|
@@ -1548,7 +1596,7 @@ class Page(
|
|
1548
1596
|
all_elements = self._element_mgr.get_all_elements()
|
1549
1597
|
|
1550
1598
|
# Apply exclusions if requested
|
1551
|
-
if apply_exclusions
|
1599
|
+
if apply_exclusions:
|
1552
1600
|
return self._filter_elements_by_exclusions(
|
1553
1601
|
all_elements, debug_exclusions=debug_exclusions
|
1554
1602
|
)
|
@@ -1270,7 +1270,8 @@ class Region(
|
|
1270
1270
|
# 3. Get Relevant Exclusions (overlapping this region)
|
1271
1271
|
apply_exclusions_flag = kwargs.get("apply_exclusions", apply_exclusions)
|
1272
1272
|
exclusion_regions = []
|
1273
|
-
if apply_exclusions_flag
|
1273
|
+
if apply_exclusions_flag:
|
1274
|
+
# Always call _get_exclusion_regions to get both page and PDF level exclusions
|
1274
1275
|
all_page_exclusions = self._page._get_exclusion_regions(
|
1275
1276
|
include_callable=True, debug=debug
|
1276
1277
|
)
|
@@ -1281,10 +1282,11 @@ class Region(
|
|
1281
1282
|
exclusion_regions = overlapping_exclusions
|
1282
1283
|
if debug:
|
1283
1284
|
logger.debug(
|
1284
|
-
f"Region {self.bbox}:
|
1285
|
+
f"Region {self.bbox}: Found {len(all_page_exclusions)} total exclusions, "
|
1286
|
+
f"{len(exclusion_regions)} overlapping this region."
|
1285
1287
|
)
|
1286
1288
|
elif debug:
|
1287
|
-
logger.debug(f"Region {self.bbox}: Not applying exclusions.")
|
1289
|
+
logger.debug(f"Region {self.bbox}: Not applying exclusions (apply_exclusions=False).")
|
1288
1290
|
|
1289
1291
|
# 4. Spatially Filter Characters using Utility
|
1290
1292
|
# Pass self as the target_region for precise polygon checks etc.
|
@@ -203,6 +203,26 @@ optimization/performance_results/image_heavy_snapshots.csv
|
|
203
203
|
optimization/performance_results/image_heavy_snapshots.json
|
204
204
|
optimization/performance_results/text_heavy_snapshots.csv
|
205
205
|
optimization/performance_results/text_heavy_snapshots.json
|
206
|
+
temp/debug_cell_extraction.py
|
207
|
+
temp/debug_exclusion_overlap.py
|
208
|
+
temp/debug_exclusions_guides.py
|
209
|
+
temp/debug_extra_guide.py
|
210
|
+
temp/debug_outer_boundaries.py
|
211
|
+
temp/debug_st_search.py
|
212
|
+
temp/fix_page_exclusions.py
|
213
|
+
temp/test_exclusion_with_debug.py
|
214
|
+
temp/test_find_exclusions_fix.py
|
215
|
+
temp/test_find_exclusions_fix_no_recursion.py
|
216
|
+
temp/test_fix_real_pdf.py
|
217
|
+
temp/test_fix_working.py
|
218
|
+
temp/test_fixed_pdf_exclusions.py
|
219
|
+
temp/test_horizontal_top_bottom.py
|
220
|
+
temp/test_marker_order.py
|
221
|
+
temp/test_original_exclusions_now_work.py
|
222
|
+
temp/test_pdf_exclusions_with_guides.py
|
223
|
+
temp/test_region_exclusions_detailed.py
|
224
|
+
temp/test_stripes_real_pdf.py
|
225
|
+
temp/test_vertical_stripes.py
|
206
226
|
tests/conftest.py
|
207
227
|
tests/test_annotate.py
|
208
228
|
tests/test_arabic_performance.py
|
@@ -220,8 +240,10 @@ tests/test_dissolve_single_elements.py
|
|
220
240
|
tests/test_dissolve_vertical_offset_issue.py
|
221
241
|
tests/test_document_qa.py
|
222
242
|
tests/test_element_addition.py
|
243
|
+
tests/test_element_collection_guides.py
|
223
244
|
tests/test_element_collection_show_cols.py
|
224
245
|
tests/test_element_collection_slicing.py
|
246
|
+
tests/test_element_exclusions.py
|
225
247
|
tests/test_element_show_crop_highlights.py
|
226
248
|
tests/test_empty_pseudo_class.py
|
227
249
|
tests/test_exclusions.py
|
@@ -244,12 +266,15 @@ tests/test_guides_extract_table.py
|
|
244
266
|
tests/test_guides_extract_table_collections.py
|
245
267
|
tests/test_guides_extract_table_exclusions.py
|
246
268
|
tests/test_guides_extract_table_real.py
|
269
|
+
tests/test_guides_from_stripes.py
|
247
270
|
tests/test_guides_integration.py
|
271
|
+
tests/test_guides_marker_sorting.py
|
248
272
|
tests/test_highlight_detection.py
|
249
273
|
tests/test_highlight_detection_comprehensive.py
|
250
274
|
tests/test_highlight_protocol.py
|
251
275
|
tests/test_highlight_protocol_simple.py
|
252
276
|
tests/test_highlight_regions.py
|
277
|
+
tests/test_horizontal_guides_alignment.py
|
253
278
|
tests/test_include_boundaries_comprehensive.py
|
254
279
|
tests/test_include_boundaries_debug.py
|
255
280
|
tests/test_include_boundaries_final.py
|
@@ -268,6 +293,7 @@ tests/test_multi_page_table_discovery.py
|
|
268
293
|
tests/test_optional_deps.py
|
269
294
|
tests/test_page_exclusion_lists.py
|
270
295
|
tests/test_pdf_add_exclusion_elementcollection.py
|
296
|
+
tests/test_pdf_exclusions_in_find_methods.py
|
271
297
|
tests/test_region_show_crop_highlights.py
|
272
298
|
tests/test_region_viewer.py
|
273
299
|
tests/test_sections_end_only.py
|
@@ -0,0 +1,42 @@
|
|
1
|
+
"""Debug cell text extraction with exclusions"""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
pdf = PDF("pdfs/m27.pdf")
|
6
|
+
page = pdf.pages[0]
|
7
|
+
|
8
|
+
# Add exclusions
|
9
|
+
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
|
10
|
+
|
11
|
+
# Check exclusions are registered
|
12
|
+
print("Exclusions on page:")
|
13
|
+
exclusions = page._get_exclusion_regions(debug=True)
|
14
|
+
|
15
|
+
# Create guides and build grid
|
16
|
+
headers = page.find(text="NUMBER").right(include_source=True).expand(top=3, bottom=3).find_all('text')
|
17
|
+
guides = Guides(page)
|
18
|
+
guides.vertical.from_content(headers, align='left')
|
19
|
+
guides.horizontal.from_stripes()
|
20
|
+
|
21
|
+
# Build grid and get cells
|
22
|
+
grid_result = guides.build_grid(include_outer_boundaries=True)
|
23
|
+
cells = grid_result["regions"]["cells"]
|
24
|
+
|
25
|
+
print(f"\nTotal cells: {len(cells)}")
|
26
|
+
|
27
|
+
# Check first row cells (these should be in excluded area)
|
28
|
+
first_row_cells = [c for c in cells if c.bbox[1] < 90] # y < 90
|
29
|
+
print(f"\nFirst row cells: {len(first_row_cells)}")
|
30
|
+
|
31
|
+
for i, cell in enumerate(first_row_cells[:3]):
|
32
|
+
print(f"\nCell {i}:")
|
33
|
+
print(f" Bbox: {cell.bbox}")
|
34
|
+
print(f" Raw text: {repr(cell.extract_text(apply_exclusions=False))}")
|
35
|
+
print(f" With exclusions: {repr(cell.extract_text(apply_exclusions=True))}")
|
36
|
+
|
37
|
+
# Now test the full table extraction
|
38
|
+
print("\n\nFull table extraction:")
|
39
|
+
result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
|
40
|
+
df = result.to_df()
|
41
|
+
print("\nFirst row of dataframe:")
|
42
|
+
print(df.iloc[0].to_dict() if not df.empty else "Empty")
|
@@ -0,0 +1,43 @@
|
|
1
|
+
"""Debug how exclusions work with overlapping regions"""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
pdf = PDF("pdfs/m27.pdf")
|
6
|
+
page = pdf.pages[0]
|
7
|
+
|
8
|
+
# Add exclusion
|
9
|
+
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
|
10
|
+
|
11
|
+
# Get the exclusion region
|
12
|
+
exclusions = page._get_exclusion_regions()
|
13
|
+
excl_region = exclusions[0]
|
14
|
+
print(f"Exclusion region: {excl_region.bbox}")
|
15
|
+
print(f"Exclusion bottom: {excl_region.bbox[3]}")
|
16
|
+
|
17
|
+
# Create a test cell that overlaps the exclusion
|
18
|
+
# Cell 1 from before: (32.06, 0.5, 73.18288, 79.53999999999996)
|
19
|
+
test_cell = page.region(32.06, 0.5, 73.18288, 79.53999999999996)
|
20
|
+
|
21
|
+
print(f"\nTest cell: {test_cell.bbox}")
|
22
|
+
print(f"Cell overlaps exclusion: top={test_cell.bbox[1]} < excl_bottom={excl_region.bbox[3]}")
|
23
|
+
|
24
|
+
# Extract text from different y-ranges
|
25
|
+
print("\nText in different parts of the cell:")
|
26
|
+
|
27
|
+
# Part above exclusion line (should be empty)
|
28
|
+
upper_part = page.region(32.06, 0.5, 73.18288, 59.12)
|
29
|
+
print(f"Upper part (0.5 to 59.12): '{upper_part.extract_text(apply_exclusions=True)}'")
|
30
|
+
|
31
|
+
# Part below exclusion line (should have text)
|
32
|
+
lower_part = page.region(32.06, 59.12, 73.18288, 79.54)
|
33
|
+
print(f"Lower part (59.12 to 79.54): '{lower_part.extract_text()}'")
|
34
|
+
|
35
|
+
# The whole cell
|
36
|
+
print(f"Whole cell with exclusions: '{test_cell.extract_text(apply_exclusions=True)}'")
|
37
|
+
print(f"Whole cell without exclusions: '{test_cell.extract_text(apply_exclusions=False)}'")
|
38
|
+
|
39
|
+
# Check what text elements are in this region
|
40
|
+
print("\nText elements in cell:")
|
41
|
+
cell_texts = test_cell.find_all('text')
|
42
|
+
for t in cell_texts[:5]:
|
43
|
+
print(f" '{t.text}' at y={t.top:.2f}-{t.bottom:.2f}")
|