natural-pdf 0.2.8__tar.gz → 0.2.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.2.8/natural_pdf.egg-info → natural_pdf-0.2.10}/PKG-INFO +1 -1
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/guides.py +499 -3
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/cli.py +1 -1
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/elements/element_collection.py +61 -33
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/elements/region.py +61 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10/natural_pdf.egg-info}/PKG-INFO +1 -1
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_guides.py +71 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/.github/workflows/ci.yml +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/.github/workflows/nightly-tutorials.yml +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/.gitignore +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/CLAUDE.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/LICENSE +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/MANIFEST.in +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/README.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/audit_packaging.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/check_run_md.sh +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/api/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/describe/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/extracting-clean-text/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/fix-messy-tables/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/fix-messy-tables/table_1.csv +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/fix-messy-tables/table_2.csv +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/fix-messy-tables/table_3.csv +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/installation/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/ocr/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/process-forms-and-invoices/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/quick-reference/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/reflowing-pages/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/regions/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tables/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/tutorials/14-categorizing-documents.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/mkdocs.yml +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/classification/manager.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/collections/mixins.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/core/element_manager.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/core/page.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/core/page_collection.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/core/page_groupby.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/core/pdf.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/core/pdf_collection.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/core/render_spec.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/describe/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/describe/base.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/describe/elements.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/describe/mixin.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/describe/summary.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/elements/base.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/elements/image.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/exporters/hocr.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/exporters/original_pdf.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/flows/element.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/flows/flow.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/flows/region.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/qa/qa_result.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/search/lancedb_search_service.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/search/numpy_search_service.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/selectors/parser.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/tables/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/tables/result.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/text_mixin.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/utils/bidi_mirror.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/utils/color_utils.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/utils/layout.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/vision/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/vision/mixin.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/vision/results.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/vision/similarity.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf.egg-info/SOURCES.txt +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf.egg-info/entry_points.txt +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf.egg-info/requires.txt +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/natural_pdf.egg-info/top_level.txt +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/noxfile.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/optimization/memory_comparison.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/optimization/pdf_analyzer.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/optimization/performance_analysis.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/optimization/performance_results/image_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/optimization/performance_results/text_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/optimization/test_cleanup_methods.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/optimization/test_memory_fix.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/publish.sh +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/pyproject.toml +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/sample-screen.png +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/setup.cfg +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/conftest.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_annotate.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_arabic_performance.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_arabic_real_world.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_color_conversion.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_color_hex_display.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_core/test_containment_geometry.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_core/test_text_layer.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_crop_enhancements.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_crop_region_highlights.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_directional_defaults.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_dissolve.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_dissolve_cross_page_bug.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_dissolve_debug_issue.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_dissolve_real_world_issue.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_dissolve_single_elements.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_dissolve_vertical_offset_issue.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_document_qa.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_element_addition.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_element_collection_show_cols.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_element_collection_slicing.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_element_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_empty_pseudo_class.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_exclusions.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_expand.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_extraction_error.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_extraction_mixin_fix.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_extraction_text_and_vision.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_extraction_working.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_find_similar.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_first_last_selectors.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_fix_get_sections_zero_height.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_flow_region_directional.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_get_sections_fix_comprehensive.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_get_sections_zero_height.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_groupby.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_guides_apply_exclusions.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_guides_apply_exclusions_simple.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_guides_extract_table.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_guides_extract_table_collections.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_guides_extract_table_exclusions.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_guides_extract_table_real.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_guides_integration.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_highlight_detection.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_highlight_detection_comprehensive.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_highlight_protocol.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_highlight_protocol_simple.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_highlight_regions.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_include_boundaries_comprehensive.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_include_boundaries_debug.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_include_boundaries_final.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_include_boundaries_final_verification.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_include_boundaries_fix.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_include_boundaries_mock.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_include_boundaries_simple.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_include_boundaries_types_pdf.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_include_boundaries_verification.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_include_boundaries_with_real_text.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_merge_connected.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_merge_connected_real_world.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_merge_method.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_multi_page_table_discovery.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_page_exclusion_lists.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_region_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_region_viewer.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_sections_end_only.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_sections_with_start_and_end.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_show_column_layout.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_show_edge_cases.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_show_exclusions.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_show_exclusions_feature.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_show_limit.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_skip_repeating_headers_multipage.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_slice_cache_reuse.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_slice_exclusion_fix.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_slice_exclusion_issue.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_slice_exclusion_mock.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_sliced_collection_exclusions.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_strikethrough_detection.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_table_result_header_mismatch.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_table_result_keep_blank.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_tiny_text_tables.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_tiny_text_tables_table.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_tutorials.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_underline_detection.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tests/test_update_text.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/todo/bad_pdf_analysis.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/todo/evaluation.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/README.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/__init__.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/analyser.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/collate_summaries.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/eval_suite.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/llm_enrich.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/reporter.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/tools/bad_pdf_eval/utils.py +0 -0
- {natural_pdf-0.2.8 → natural_pdf-0.2.10}/uv.lock +0 -0
@@ -128,19 +128,59 @@ class GuidesList(UserList):
|
|
128
128
|
"""A list of guide coordinates that also provides methods for creating guides."""
|
129
129
|
|
130
130
|
def __init__(self, parent_guides: "Guides", axis: Literal["vertical", "horizontal"], data=None):
|
131
|
-
|
131
|
+
# Always sort the initial data
|
132
|
+
super().__init__(sorted(data) if data else [])
|
132
133
|
self._parent = parent_guides
|
133
134
|
self._axis = axis
|
134
135
|
|
135
136
|
def __getitem__(self, i):
|
136
|
-
"""Override to handle slicing properly."""
|
137
|
+
"""Override to handle slicing and negative indexing properly."""
|
137
138
|
if isinstance(i, slice):
|
138
139
|
# Return a new GuidesList with the sliced data
|
139
140
|
return self.__class__(self._parent, self._axis, self.data[i])
|
140
141
|
else:
|
141
|
-
# For single index,
|
142
|
+
# For single index, handle negative indices properly
|
143
|
+
if i < 0:
|
144
|
+
# Convert negative index to positive
|
145
|
+
i = len(self.data) + i
|
142
146
|
return self.data[i]
|
143
147
|
|
148
|
+
def __setitem__(self, i, item):
|
149
|
+
"""Override to maintain sorted order."""
|
150
|
+
self.data[i] = item
|
151
|
+
self.data.sort()
|
152
|
+
|
153
|
+
def append(self, item):
|
154
|
+
"""Override to maintain sorted order."""
|
155
|
+
self.data.append(item)
|
156
|
+
self.data.sort()
|
157
|
+
|
158
|
+
def extend(self, other):
|
159
|
+
"""Override to maintain sorted order."""
|
160
|
+
self.data.extend(other)
|
161
|
+
self.data.sort()
|
162
|
+
|
163
|
+
def insert(self, i, item):
|
164
|
+
"""Override to maintain sorted order."""
|
165
|
+
self.data.append(item) # Just append and sort
|
166
|
+
self.data.sort()
|
167
|
+
|
168
|
+
def __iadd__(self, other):
|
169
|
+
"""Override to maintain sorted order."""
|
170
|
+
self.data.extend(other)
|
171
|
+
self.data.sort()
|
172
|
+
return self
|
173
|
+
|
174
|
+
@property
|
175
|
+
def data(self):
|
176
|
+
"""Get the data list."""
|
177
|
+
return self._data
|
178
|
+
|
179
|
+
@data.setter
|
180
|
+
def data(self, value):
|
181
|
+
"""Set the data list, always keeping it sorted."""
|
182
|
+
self._data = sorted(value) if value else []
|
183
|
+
|
144
184
|
def from_content(
|
145
185
|
self,
|
146
186
|
markers: Union[str, List[str], "ElementCollection", Callable, None],
|
@@ -1842,6 +1882,370 @@ class Guides:
|
|
1842
1882
|
self.horizontal.pop(index)
|
1843
1883
|
return self
|
1844
1884
|
|
1885
|
+
# -------------------------------------------------------------------------
|
1886
|
+
# Region extraction properties
|
1887
|
+
# -------------------------------------------------------------------------
|
1888
|
+
|
1889
|
+
@property
|
1890
|
+
def columns(self):
|
1891
|
+
"""Access columns by index like guides.columns[0]."""
|
1892
|
+
return _ColumnAccessor(self)
|
1893
|
+
|
1894
|
+
@property
|
1895
|
+
def rows(self):
|
1896
|
+
"""Access rows by index like guides.rows[0]."""
|
1897
|
+
return _RowAccessor(self)
|
1898
|
+
|
1899
|
+
@property
|
1900
|
+
def cells(self):
|
1901
|
+
"""Access cells by index like guides.cells[row][col] or guides.cells[row, col]."""
|
1902
|
+
return _CellAccessor(self)
|
1903
|
+
|
1904
|
+
# -------------------------------------------------------------------------
|
1905
|
+
# Region extraction methods (alternative API)
|
1906
|
+
# -------------------------------------------------------------------------
|
1907
|
+
|
1908
|
+
def column(self, index: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
|
1909
|
+
"""
|
1910
|
+
Get a column region from the guides.
|
1911
|
+
|
1912
|
+
Args:
|
1913
|
+
index: Column index (0-based)
|
1914
|
+
obj: Page or Region to create the column on (uses self.context if None)
|
1915
|
+
|
1916
|
+
Returns:
|
1917
|
+
Region representing the specified column
|
1918
|
+
|
1919
|
+
Raises:
|
1920
|
+
IndexError: If column index is out of range
|
1921
|
+
"""
|
1922
|
+
target = obj or self.context
|
1923
|
+
if target is None:
|
1924
|
+
raise ValueError("No context available for region creation")
|
1925
|
+
|
1926
|
+
if not self.vertical or index < 0 or index >= len(self.vertical) - 1:
|
1927
|
+
raise IndexError(
|
1928
|
+
f"Column index {index} out of range (have {len(self.vertical)-1} columns)"
|
1929
|
+
)
|
1930
|
+
|
1931
|
+
# Get bounds from context
|
1932
|
+
bounds = self._get_context_bounds()
|
1933
|
+
if not bounds:
|
1934
|
+
raise ValueError("Could not determine bounds")
|
1935
|
+
_, y0, _, y1 = bounds
|
1936
|
+
|
1937
|
+
# Get column boundaries
|
1938
|
+
x0 = self.vertical[index]
|
1939
|
+
x1 = self.vertical[index + 1]
|
1940
|
+
|
1941
|
+
# Create region using absolute coordinates
|
1942
|
+
if hasattr(target, "region"):
|
1943
|
+
# Target has a region method (Page)
|
1944
|
+
return target.region(x0, y0, x1, y1)
|
1945
|
+
elif hasattr(target, "page"):
|
1946
|
+
# Target is a Region, use its parent page
|
1947
|
+
# The coordinates from guides are already absolute
|
1948
|
+
return target.page.region(x0, y0, x1, y1)
|
1949
|
+
else:
|
1950
|
+
raise TypeError(f"Cannot create region on {type(target)}")
|
1951
|
+
|
1952
|
+
def row(self, index: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
    """
    Get a row region from the guides.

    Row ``index`` spans vertically from horizontal guide ``index`` to
    horizontal guide ``index + 1``. Its left and right edges are taken from
    the bounds returned by ``self._get_context_bounds()``.

    Args:
        index: Row index (0-based)
        obj: Page or Region to create the row on (uses self.context if None)

    Returns:
        Region representing the specified row

    Raises:
        ValueError: If no target is available or bounds cannot be determined
        IndexError: If row index is out of range
        TypeError: If the target has neither a ``region`` method nor a ``page``
    """
    # Test against None explicitly: an explicitly supplied object that merely
    # evaluates as falsy must not be silently replaced by self.context.
    target = obj if obj is not None else self.context
    if target is None:
        raise ValueError("No context available for region creation")

    # Clamp at zero so the message never reports "-1 rows" when there are no
    # horizontal guides at all.
    row_count = max(0, len(self.horizontal) - 1)
    if not self.horizontal or index < 0 or index >= row_count:
        raise IndexError(f"Row index {index} out of range (have {row_count} rows)")

    # NOTE(review): bounds are requested without passing `obj`, so they
    # presumably describe self.context even when a different `obj` is given
    # -- confirm this is intended.
    bounds = self._get_context_bounds()
    if not bounds:
        raise ValueError("Could not determine bounds")
    x0, _, x1, _ = bounds

    # Vertical extent comes from the two guides that delimit the row.
    y0 = self.horizontal[index]
    y1 = self.horizontal[index + 1]

    if hasattr(target, "region"):
        # Target can build regions itself.
        return target.region(x0, y0, x1, y1)
    if hasattr(target, "page"):
        # Guide coordinates are already absolute, so the region can be
        # created directly on the target's parent page.
        return target.page.region(x0, y0, x1, y1)
    raise TypeError(f"Cannot create region on {type(target)}")
|
1993
|
+
|
1994
|
+
def cell(self, row: int, col: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
    """
    Get a cell region from the guides.

    The cell is bounded by vertical guides ``col`` and ``col + 1`` and by
    horizontal guides ``row`` and ``row + 1``.

    Args:
        row: Row index (0-based)
        col: Column index (0-based)
        obj: Page or Region to create the cell on (uses self.context if None)

    Returns:
        Region representing the specified cell

    Raises:
        ValueError: If no target is available
        IndexError: If row or column index is out of range
        TypeError: If the target has neither a ``region`` method nor a ``page``
    """
    # Test against None explicitly: an explicitly supplied object that merely
    # evaluates as falsy must not be silently replaced by self.context.
    target = obj if obj is not None else self.context
    if target is None:
        raise ValueError("No context available for region creation")

    # Counts are clamped at zero so the messages never report "-1" when a
    # guide list is empty. The column check runs first, then the row check.
    column_count = max(0, len(self.vertical) - 1)
    if not self.vertical or col < 0 or col >= column_count:
        raise IndexError(
            f"Column index {col} out of range (have {column_count} columns)"
        )
    row_count = max(0, len(self.horizontal) - 1)
    if not self.horizontal or row < 0 or row >= row_count:
        raise IndexError(f"Row index {row} out of range (have {row_count} rows)")

    # All four edges come straight from the delimiting guides.
    x0, x1 = self.vertical[col], self.vertical[col + 1]
    y0, y1 = self.horizontal[row], self.horizontal[row + 1]

    if hasattr(target, "region"):
        # Target can build regions itself.
        return target.region(x0, y0, x1, y1)
    if hasattr(target, "page"):
        # Guide coordinates are already absolute, so the region can be
        # created directly on the target's parent page.
        return target.page.region(x0, y0, x1, y1)
    raise TypeError(f"Cannot create region on {type(target)}")
|
2036
|
+
|
2037
|
+
def left_of(self, guide_index: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
    """
    Get a region to the left of a vertical guide.

    The region runs from the left edge of the bounds returned by
    ``self._get_context_bounds()`` up to the selected vertical guide, and
    spans the full height of those bounds.

    Args:
        guide_index: Vertical guide index
        obj: Page or Region to create the region on (uses self.context if None)

    Returns:
        Region to the left of the specified guide

    Raises:
        ValueError: If no target is available or bounds cannot be determined
        IndexError: If the guide index is out of range
        TypeError: If the target has neither a ``region`` method nor a ``page``
    """
    # Test against None explicitly: an explicitly supplied object that merely
    # evaluates as falsy must not be silently replaced by self.context.
    target = obj if obj is not None else self.context
    if target is None:
        raise ValueError("No context available for region creation")

    if not self.vertical or guide_index < 0 or guide_index >= len(self.vertical):
        raise IndexError(f"Guide index {guide_index} out of range")

    bounds = self._get_context_bounds()
    if not bounds:
        raise ValueError("Could not determine bounds")
    x0, y0, _, y1 = bounds

    # The guide itself becomes the right edge of the region.
    x1 = self.vertical[guide_index]

    if hasattr(target, "region"):
        return target.region(x0, y0, x1, y1)
    if hasattr(target, "page"):
        # Targets without their own region() but with a parent page are
        # supported too: guide coordinates are absolute, so the region can be
        # created directly on that page.
        return target.page.region(x0, y0, x1, y1)
    raise TypeError(f"Cannot create region on {type(target)}")
|
2068
|
+
|
2069
|
+
def right_of(self, guide_index: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
    """
    Get a region to the right of a vertical guide.

    The region runs from the selected vertical guide to the right edge of
    the bounds returned by ``self._get_context_bounds()``, and spans the full
    height of those bounds.

    Args:
        guide_index: Vertical guide index
        obj: Page or Region to create the region on (uses self.context if None)

    Returns:
        Region to the right of the specified guide

    Raises:
        ValueError: If no target is available or bounds cannot be determined
        IndexError: If the guide index is out of range
        TypeError: If the target has neither a ``region`` method nor a ``page``
    """
    # Test against None explicitly: an explicitly supplied object that merely
    # evaluates as falsy must not be silently replaced by self.context.
    target = obj if obj is not None else self.context
    if target is None:
        raise ValueError("No context available for region creation")

    if not self.vertical or guide_index < 0 or guide_index >= len(self.vertical):
        raise IndexError(f"Guide index {guide_index} out of range")

    bounds = self._get_context_bounds()
    if not bounds:
        raise ValueError("Could not determine bounds")
    _, y0, x1, y1 = bounds

    # The guide itself becomes the left edge of the region.
    x0 = self.vertical[guide_index]

    if hasattr(target, "region"):
        return target.region(x0, y0, x1, y1)
    if hasattr(target, "page"):
        # Targets without their own region() but with a parent page are
        # supported too: guide coordinates are absolute, so the region can be
        # created directly on that page.
        return target.page.region(x0, y0, x1, y1)
    raise TypeError(f"Cannot create region on {type(target)}")
|
2100
|
+
|
2101
|
+
def above(self, guide_index: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
    """
    Get a region above a horizontal guide.

    The region runs from the top edge of the bounds returned by
    ``self._get_context_bounds()`` down to the selected horizontal guide, and
    spans the full width of those bounds.

    Args:
        guide_index: Horizontal guide index
        obj: Page or Region to create the region on (uses self.context if None)

    Returns:
        Region above the specified guide

    Raises:
        ValueError: If no target is available or bounds cannot be determined
        IndexError: If the guide index is out of range
        TypeError: If the target has neither a ``region`` method nor a ``page``
    """
    # Test against None explicitly: an explicitly supplied object that merely
    # evaluates as falsy must not be silently replaced by self.context.
    target = obj if obj is not None else self.context
    if target is None:
        raise ValueError("No context available for region creation")

    if not self.horizontal or guide_index < 0 or guide_index >= len(self.horizontal):
        raise IndexError(f"Guide index {guide_index} out of range")

    bounds = self._get_context_bounds()
    if not bounds:
        raise ValueError("Could not determine bounds")
    x0, y0, x1, _ = bounds

    # The guide itself becomes the bottom edge of the region.
    y1 = self.horizontal[guide_index]

    if hasattr(target, "region"):
        return target.region(x0, y0, x1, y1)
    if hasattr(target, "page"):
        # Targets without their own region() but with a parent page are
        # supported too: guide coordinates are absolute, so the region can be
        # created directly on that page.
        return target.page.region(x0, y0, x1, y1)
    raise TypeError(f"Cannot create region on {type(target)}")
|
2132
|
+
|
2133
|
+
def below(self, guide_index: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
    """
    Get a region below a horizontal guide.

    The region runs from the selected horizontal guide down to the bottom
    edge of the bounds returned by ``self._get_context_bounds()``, and spans
    the full width of those bounds.

    Args:
        guide_index: Horizontal guide index
        obj: Page or Region to create the region on (uses self.context if None)

    Returns:
        Region below the specified guide

    Raises:
        ValueError: If no target is available or bounds cannot be determined
        IndexError: If the guide index is out of range
        TypeError: If the target has neither a ``region`` method nor a ``page``
    """
    # Test against None explicitly: an explicitly supplied object that merely
    # evaluates as falsy must not be silently replaced by self.context.
    target = obj if obj is not None else self.context
    if target is None:
        raise ValueError("No context available for region creation")

    if not self.horizontal or guide_index < 0 or guide_index >= len(self.horizontal):
        raise IndexError(f"Guide index {guide_index} out of range")

    bounds = self._get_context_bounds()
    if not bounds:
        raise ValueError("Could not determine bounds")
    x0, _, x1, y1 = bounds

    # The guide itself becomes the top edge of the region.
    y0 = self.horizontal[guide_index]

    if hasattr(target, "region"):
        return target.region(x0, y0, x1, y1)
    if hasattr(target, "page"):
        # Targets without their own region() but with a parent page are
        # supported too: guide coordinates are absolute, so the region can be
        # created directly on that page.
        return target.page.region(x0, y0, x1, y1)
    raise TypeError(f"Cannot create region on {type(target)}")
|
2164
|
+
|
2165
|
+
def between_vertical(
    self, start_index: int, end_index: int, obj: Optional[Union["Page", "Region"]] = None
) -> "Region":
    """
    Get a region between two vertical guides.

    The region spans horizontally from guide ``start_index`` to guide
    ``end_index``; its top and bottom edges are taken from the bounds
    returned by ``self._get_context_bounds()``.

    Args:
        start_index: Starting vertical guide index
        end_index: Ending vertical guide index
        obj: Page or Region to create the region on (uses self.context if None)

    Returns:
        Region between the specified guides

    Raises:
        ValueError: If no target or no vertical guides are available, if
            ``start_index`` is not less than ``end_index``, or if bounds
            cannot be determined
        IndexError: If either index is out of range
        TypeError: If the target has neither a ``region`` method nor a ``page``
    """
    # Test against None explicitly: an explicitly supplied object that merely
    # evaluates as falsy must not be silently replaced by self.context.
    target = obj if obj is not None else self.context
    if target is None:
        raise ValueError("No context available for region creation")

    if not self.vertical:
        raise ValueError("No vertical guides available")
    guide_count = len(self.vertical)
    if start_index < 0 or start_index >= guide_count:
        raise IndexError(f"Start index {start_index} out of range")
    if end_index < 0 or end_index >= guide_count:
        raise IndexError(f"End index {end_index} out of range")
    if start_index >= end_index:
        raise ValueError("Start index must be less than end index")

    bounds = self._get_context_bounds()
    if not bounds:
        raise ValueError("Could not determine bounds")
    _, y0, _, y1 = bounds

    # Left and right edges are the two selected guides.
    x0 = self.vertical[start_index]
    x1 = self.vertical[end_index]

    if hasattr(target, "region"):
        return target.region(x0, y0, x1, y1)
    if hasattr(target, "page"):
        # Targets without their own region() but with a parent page are
        # supported too: guide coordinates are absolute, so the region can be
        # created directly on that page.
        return target.page.region(x0, y0, x1, y1)
    raise TypeError(f"Cannot create region on {type(target)}")
|
2206
|
+
|
2207
|
+
def between_horizontal(
    self, start_index: int, end_index: int, obj: Optional[Union["Page", "Region"]] = None
) -> "Region":
    """
    Get a region between two horizontal guides.

    The region spans vertically from guide ``start_index`` to guide
    ``end_index``; its left and right edges are taken from the bounds
    returned by ``self._get_context_bounds()``.

    Args:
        start_index: Starting horizontal guide index
        end_index: Ending horizontal guide index
        obj: Page or Region to create the region on (uses self.context if None)

    Returns:
        Region between the specified guides

    Raises:
        ValueError: If no target or no horizontal guides are available, if
            ``start_index`` is not less than ``end_index``, or if bounds
            cannot be determined
        IndexError: If either index is out of range
        TypeError: If the target has neither a ``region`` method nor a ``page``
    """
    # Test against None explicitly: an explicitly supplied object that merely
    # evaluates as falsy must not be silently replaced by self.context.
    target = obj if obj is not None else self.context
    if target is None:
        raise ValueError("No context available for region creation")

    if not self.horizontal:
        raise ValueError("No horizontal guides available")
    guide_count = len(self.horizontal)
    if start_index < 0 or start_index >= guide_count:
        raise IndexError(f"Start index {start_index} out of range")
    if end_index < 0 or end_index >= guide_count:
        raise IndexError(f"End index {end_index} out of range")
    if start_index >= end_index:
        raise ValueError("Start index must be less than end index")

    bounds = self._get_context_bounds()
    if not bounds:
        raise ValueError("Could not determine bounds")
    x0, _, x1, _ = bounds

    # Top and bottom edges are the two selected guides.
    y0 = self.horizontal[start_index]
    y1 = self.horizontal[end_index]

    if hasattr(target, "region"):
        return target.region(x0, y0, x1, y1)
    if hasattr(target, "page"):
        # Targets without their own region() but with a parent page are
        # supported too: guide coordinates are absolute, so the region can be
        # created directly on that page.
        return target.page.region(x0, y0, x1, y1)
    raise TypeError(f"Cannot create region on {type(target)}")
|
2248
|
+
|
1845
2249
|
# -------------------------------------------------------------------------
|
1846
2250
|
# Operations
|
1847
2251
|
# -------------------------------------------------------------------------
|
@@ -3825,3 +4229,95 @@ class Guides:
|
|
3825
4229
|
return "vertical"
|
3826
4230
|
else:
|
3827
4231
|
return "horizontal"
|
4232
|
+
|
4233
|
+
|
4234
|
+
# -------------------------------------------------------------------------
|
4235
|
+
# Accessor classes for property-based access
|
4236
|
+
# -------------------------------------------------------------------------
|
4237
|
+
|
4238
|
+
|
4239
|
+
class _ColumnAccessor:
|
4240
|
+
"""Provides indexed access to columns via guides.columns[index]."""
|
4241
|
+
|
4242
|
+
def __init__(self, guides: "Guides"):
|
4243
|
+
self._guides = guides
|
4244
|
+
|
4245
|
+
def __len__(self):
|
4246
|
+
"""Return number of columns (vertical guides - 1)."""
|
4247
|
+
return max(0, len(self._guides.vertical) - 1)
|
4248
|
+
|
4249
|
+
def __getitem__(self, index: int) -> "Region":
|
4250
|
+
"""Get column at the specified index."""
|
4251
|
+
# Handle negative indexing
|
4252
|
+
if index < 0:
|
4253
|
+
index = len(self) + index
|
4254
|
+
return self._guides.column(index)
|
4255
|
+
|
4256
|
+
|
4257
|
+
class _RowAccessor:
|
4258
|
+
"""Provides indexed access to rows via guides.rows[index]."""
|
4259
|
+
|
4260
|
+
def __init__(self, guides: "Guides"):
|
4261
|
+
self._guides = guides
|
4262
|
+
|
4263
|
+
def __len__(self):
|
4264
|
+
"""Return number of rows (horizontal guides - 1)."""
|
4265
|
+
return max(0, len(self._guides.horizontal) - 1)
|
4266
|
+
|
4267
|
+
def __getitem__(self, index: int) -> "Region":
|
4268
|
+
"""Get row at the specified index."""
|
4269
|
+
# Handle negative indexing
|
4270
|
+
if index < 0:
|
4271
|
+
index = len(self) + index
|
4272
|
+
return self._guides.row(index)
|
4273
|
+
|
4274
|
+
|
4275
|
+
class _CellAccessor:
|
4276
|
+
"""Provides indexed access to cells via guides.cells[row][col] or guides.cells[row, col]."""
|
4277
|
+
|
4278
|
+
def __init__(self, guides: "Guides"):
|
4279
|
+
self._guides = guides
|
4280
|
+
|
4281
|
+
def __getitem__(self, key) -> Union["Region", "_CellRowAccessor"]:
|
4282
|
+
"""
|
4283
|
+
Get cell(s) at the specified position.
|
4284
|
+
|
4285
|
+
Supports:
|
4286
|
+
- guides.cells[row, col] - tuple indexing
|
4287
|
+
- guides.cells[row][col] - nested indexing
|
4288
|
+
"""
|
4289
|
+
if isinstance(key, tuple) and len(key) == 2:
|
4290
|
+
# Direct tuple access: guides.cells[row, col]
|
4291
|
+
row, col = key
|
4292
|
+
# Handle negative indexing for both row and col
|
4293
|
+
if row < 0:
|
4294
|
+
row = len(self._guides.rows) + row
|
4295
|
+
if col < 0:
|
4296
|
+
col = len(self._guides.columns) + col
|
4297
|
+
return self._guides.cell(row, col)
|
4298
|
+
elif isinstance(key, int):
|
4299
|
+
# First level of nested access: guides.cells[row]
|
4300
|
+
# Handle negative indexing for row
|
4301
|
+
if key < 0:
|
4302
|
+
key = len(self._guides.rows) + key
|
4303
|
+
# Return a row accessor that allows [col] indexing
|
4304
|
+
return _CellRowAccessor(self._guides, key)
|
4305
|
+
else:
|
4306
|
+
raise TypeError(
|
4307
|
+
f"Cell indices must be integers or tuple of two integers, got {type(key)}"
|
4308
|
+
)
|
4309
|
+
|
4310
|
+
|
4311
|
+
class _CellRowAccessor:
|
4312
|
+
"""Provides column access for a specific row in nested cell indexing."""
|
4313
|
+
|
4314
|
+
def __init__(self, guides: "Guides", row: int):
|
4315
|
+
self._guides = guides
|
4316
|
+
self._row = row
|
4317
|
+
|
4318
|
+
def __getitem__(self, col: int) -> "Region":
|
4319
|
+
"""Get cell at [row][col]."""
|
4320
|
+
# Handle negative indexing for column
|
4321
|
+
if col < 0:
|
4322
|
+
col = len(self._guides.columns) + col
|
4323
|
+
return self._guides.cell(self._row, col)
|
@@ -16,7 +16,7 @@ INSTALL_RECIPES: Dict[str, list[str]] = {
|
|
16
16
|
"paddle": ["paddlepaddle>=3.0.0", "paddleocr>=3.0.1", "paddlex>=3.0.2", "pandas>=2.2.0"],
|
17
17
|
"numpy-high": ["numpy>=2.0"],
|
18
18
|
"numpy-low": ["numpy<1.27"],
|
19
|
-
"surya": ["surya-ocr
|
19
|
+
"surya": ["surya-ocr<0.15"],
|
20
20
|
"yolo": ["doclayout_yolo", "huggingface_hub>=0.29.3"],
|
21
21
|
"docling": ["docling"],
|
22
22
|
# light helpers
|
@@ -633,9 +633,7 @@ class ElementCollection(
|
|
633
633
|
pdfplumber's layout engine if layout=True is specified.
|
634
634
|
|
635
635
|
Args:
|
636
|
-
separator: String to
|
637
|
-
using simple joining (layout=False). Default is a single space.
|
638
|
-
Ignored when layout=True as the layout engine handles spacing.
|
636
|
+
separator: String to join text from elements. Default is a single space.
|
639
637
|
preserve_whitespace: Deprecated. Use layout=False for simple joining.
|
640
638
|
use_exclusions: Deprecated. Exclusions should be applied *before* creating
|
641
639
|
the collection or by filtering the collection itself.
|
@@ -652,15 +650,49 @@ class ElementCollection(
|
|
652
650
|
Returns:
|
653
651
|
Combined text from elements, potentially with layout-based spacing.
|
654
652
|
"""
|
655
|
-
#
|
656
|
-
|
653
|
+
# Check if we have any elements at all
|
654
|
+
if not self._elements:
|
655
|
+
return ""
|
656
|
+
|
657
|
+
# Check if all elements are TextElements with character data
|
658
|
+
text_elements_with_chars = [
|
657
659
|
el
|
658
660
|
for el in self._elements
|
659
|
-
if isinstance(el, TextElement) and hasattr(el, "_char_dicts")
|
661
|
+
if isinstance(el, TextElement) and hasattr(el, "_char_dicts") and el._char_dicts
|
660
662
|
]
|
661
663
|
|
662
|
-
|
663
|
-
|
664
|
+
# If we have a mixed collection (Regions, TextElements without chars, etc),
|
665
|
+
# use a simpler approach: call extract_text on each element
|
666
|
+
if len(text_elements_with_chars) < len(self._elements):
|
667
|
+
# Mixed collection - extract text from each element
|
668
|
+
element_texts = []
|
669
|
+
|
670
|
+
# Sort elements by position first
|
671
|
+
sorted_elements = sorted(
|
672
|
+
self._elements,
|
673
|
+
key=lambda el: (
|
674
|
+
el.page.index if hasattr(el, "page") else 0,
|
675
|
+
el.top if hasattr(el, "top") else 0,
|
676
|
+
el.x0 if hasattr(el, "x0") else 0,
|
677
|
+
),
|
678
|
+
)
|
679
|
+
|
680
|
+
for el in sorted_elements:
|
681
|
+
if hasattr(el, "extract_text"):
|
682
|
+
# Call extract_text on the element (works for TextElement, Region, etc)
|
683
|
+
text = el.extract_text(**kwargs)
|
684
|
+
if text:
|
685
|
+
element_texts.append(text)
|
686
|
+
elif hasattr(el, "text"):
|
687
|
+
# Fallback to text property if available
|
688
|
+
text = getattr(el, "text", "")
|
689
|
+
if text:
|
690
|
+
element_texts.append(text)
|
691
|
+
|
692
|
+
return separator.join(element_texts)
|
693
|
+
|
694
|
+
# All elements are TextElements with char data - use the original approach
|
695
|
+
text_elements = text_elements_with_chars
|
664
696
|
|
665
697
|
# Collect all character dictionaries
|
666
698
|
all_char_dicts = []
|
@@ -669,11 +701,20 @@ class ElementCollection(
|
|
669
701
|
|
670
702
|
if not all_char_dicts:
|
671
703
|
# Handle case where elements exist but have no char dicts
|
672
|
-
logger.
|
704
|
+
logger.debug(
|
673
705
|
"ElementCollection.extract_text: No character dictionaries found in TextElements."
|
674
706
|
)
|
707
|
+
# Sort elements by position before joining
|
708
|
+
sorted_text_elements = sorted(
|
709
|
+
text_elements,
|
710
|
+
key=lambda el: (
|
711
|
+
el.page.index if hasattr(el, "page") else 0,
|
712
|
+
el.top if hasattr(el, "top") else 0,
|
713
|
+
el.x0 if hasattr(el, "x0") else 0,
|
714
|
+
),
|
715
|
+
)
|
675
716
|
return separator.join(
|
676
|
-
getattr(el, "text", "") for el in
|
717
|
+
getattr(el, "text", "") for el in sorted_text_elements
|
677
718
|
) # Fallback to simple join of word text
|
678
719
|
|
679
720
|
# Apply content filtering if provided
|
@@ -737,33 +778,20 @@ class ElementCollection(
|
|
737
778
|
all_char_dicts.sort(
|
738
779
|
key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
|
739
780
|
)
|
740
|
-
result =
|
781
|
+
result = " ".join(c.get("text", "") for c in all_char_dicts)
|
741
782
|
|
742
783
|
else:
|
784
|
+
print("JOIN WITHOUT LAYOUT")
|
743
785
|
# Default: Simple join without layout
|
744
786
|
logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
|
745
|
-
|
746
|
-
|
747
|
-
#
|
748
|
-
#
|
749
|
-
|
750
|
-
#
|
751
|
-
|
752
|
-
|
753
|
-
key=lambda el: (
|
754
|
-
el.page.index if hasattr(el, "page") else 0,
|
755
|
-
el.top if hasattr(el, "top") else 0,
|
756
|
-
el.x0 if hasattr(el, "x0") else 0,
|
757
|
-
),
|
758
|
-
)
|
759
|
-
|
760
|
-
# Extract text from each element
|
761
|
-
element_texts = []
|
762
|
-
for el in sorted_elements:
|
763
|
-
if hasattr(el, "text") and el.text:
|
764
|
-
element_texts.append(el.text)
|
765
|
-
|
766
|
-
result = separator.join(element_texts)
|
787
|
+
result = separator.join(el.extract_text() for el in text_elements)
|
788
|
+
|
789
|
+
# # Sort chars by document order (page, top, x0)
|
790
|
+
# all_char_dicts.sort(
|
791
|
+
# key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
|
792
|
+
# )
|
793
|
+
# # Simple join of character text
|
794
|
+
# result = "".join(c.get("text", "") for c in all_char_dicts)
|
767
795
|
|
768
796
|
# Determine final strip flag – same rule as global helper unless caller overrides
|
769
797
|
strip_text = strip if strip is not None else (not use_layout)
|