natural-pdf 0.2.1.dev0__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.gitignore +1 -1
- natural_pdf-0.2.3/CLAUDE.md +85 -0
- {natural_pdf-0.2.1.dev0/natural_pdf.egg-info → natural_pdf-0.2.3}/PKG-INFO +2 -2
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/layout-analysis/index.md +1 -1
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/quick-reference/index.md +15 -1
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/visual-debugging/index.md +63 -1
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/guides.py +159 -3
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/collections/mixins.py +16 -3
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/highlighting_service.py +33 -9
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/page.py +138 -7
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/page_collection.py +51 -14
- natural_pdf-0.2.3/natural_pdf/core/page_groupby.py +229 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/render_spec.py +62 -4
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/base.py +102 -20
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/element_collection.py +11 -10
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/region.py +21 -21
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/text.py +5 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/extraction/manager.py +8 -14
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/extraction/mixin.py +35 -21
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/selectors/parser.py +2 -2
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/tables/result.py +37 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3/natural_pdf.egg-info}/PKG-INFO +2 -2
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf.egg-info/SOURCES.txt +24 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf.egg-info/requires.txt +1 -1
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf.egg-info/top_level.txt +1 -1
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/performance_analysis.py +1 -1
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/pyproject.toml +1 -1
- natural_pdf-0.2.3/tests/test_color_conversion.py +193 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_core/test_containment_geometry.py +6 -6
- natural_pdf-0.2.3/tests/test_directional_defaults.py +248 -0
- natural_pdf-0.2.3/tests/test_expand.py +150 -0
- natural_pdf-0.2.3/tests/test_extraction_error.py +85 -0
- natural_pdf-0.2.3/tests/test_extraction_mixin_fix.py +131 -0
- natural_pdf-0.2.3/tests/test_extraction_text_and_vision.py +250 -0
- natural_pdf-0.2.3/tests/test_extraction_working.py +147 -0
- natural_pdf-0.2.3/tests/test_first_last_selectors.py +99 -0
- natural_pdf-0.2.3/tests/test_groupby.py +307 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_guides.py +1 -1
- natural_pdf-0.2.3/tests/test_guides_apply_exclusions.py +216 -0
- natural_pdf-0.2.3/tests/test_guides_apply_exclusions_simple.py +72 -0
- natural_pdf-0.2.3/tests/test_guides_extract_table.py +252 -0
- natural_pdf-0.2.3/tests/test_guides_extract_table_real.py +247 -0
- natural_pdf-0.2.3/tests/test_highlight_regions.py +161 -0
- natural_pdf-0.2.3/tests/test_page_exclusion_lists.py +220 -0
- natural_pdf-0.2.3/tests/test_pdf_add_exclusion_elementcollection.py +170 -0
- natural_pdf-0.2.3/tests/test_show_column_layout.py +180 -0
- natural_pdf-0.2.3/tests/test_show_edge_cases.py +191 -0
- natural_pdf-0.2.3/tests/test_show_exclusions.py +77 -0
- natural_pdf-0.2.3/tests/test_show_exclusions_feature.py +125 -0
- natural_pdf-0.2.3/tests/test_show_limit.py +173 -0
- natural_pdf-0.2.3/tests/test_table_result_header_mismatch.py +138 -0
- natural_pdf-0.2.3/tests/test_table_result_keep_blank.py +198 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/todo/evaluation.md +1 -1
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/analyser.py +1 -1
- natural_pdf-0.2.1.dev0/CLAUDE.md +0 -524
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.github/workflows/ci.yml +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.github/workflows/nightly-tutorials.yml +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/LICENSE +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/MANIFEST.in +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/README.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/audit_packaging.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/check_run_md.sh +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/api/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/describe/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/extracting-clean-text/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/fix-messy-tables/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/fix-messy-tables/table_1.csv +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/fix-messy-tables/table_2.csv +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/fix-messy-tables/table_3.csv +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/installation/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/ocr/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/process-forms-and-invoices/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/reflowing-pages/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/regions/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tables/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/14-categorizing-documents.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/mkdocs.yml +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/classification/manager.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/cli.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/element_manager.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/pdf.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/pdf_collection.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/describe/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/describe/base.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/describe/elements.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/describe/mixin.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/describe/summary.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/image.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/hocr.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/original_pdf.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/flows/element.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/flows/flow.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/flows/region.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/qa/qa_result.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/search/lancedb_search_service.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/search/numpy_search_service.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/tables/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/text_mixin.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/bidi_mirror.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/layout.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf.egg-info/entry_points.txt +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/noxfile.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/memory_comparison.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/pdf_analyzer.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/performance_results/image_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/performance_results/text_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/test_cleanup_methods.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/test_memory_fix.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/publish.sh +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/sample-screen.png +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/setup.cfg +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/test_install.sh +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/conftest.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_annotate.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_arabic_performance.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_arabic_real_world.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_core/test_text_layer.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_document_qa.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_element_collection_slicing.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_exclusions.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_flow_region_directional.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_guides_integration.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_highlight_detection.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_highlight_protocol.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_highlight_protocol_simple.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_multi_page_table_discovery.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_region_viewer.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_sections_end_only.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_skip_repeating_headers_multipage.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_strikethrough_detection.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_tiny_text_tables.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_tiny_text_tables_table.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_tutorials.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_underline_detection.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_update_text.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/todo/bad_pdf_analysis.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/README.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/__init__.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/collate_summaries.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/eval_suite.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/llm_enrich.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/reporter.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/utils.py +0 -0
- {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/uv.lock +0 -0
@@ -0,0 +1,85 @@
|
|
1
|
+
# Natural PDF Library Analysis
|
2
|
+
|
3
|
+
## Library Overview
|
4
|
+
Natural PDF is a Python library for intelligent PDF document processing that combines traditional PDF parsing with modern AI capabilities. It provides a jQuery-like API for selecting and manipulating PDF elements with spatial awareness.
|
5
|
+
|
6
|
+
## Core Goals & Purpose
|
7
|
+
- **Intelligent PDF Processing**: Goes beyond simple text extraction to understand document structure and spatial relationships
|
8
|
+
- **AI-Enhanced Workflows**: Integrates OCR, document Q&A, classification, and LLM-based data extraction
|
9
|
+
- **Spatial Navigation**: Provides methods like `.below()`, `.above()`, `.left()` for intuitive element selection
|
10
|
+
- **Multi-format Support**: Handles both text-based PDFs and image-based (OCR-required) documents
|
11
|
+
|
12
|
+
## Key Use Cases & Workflows
|
13
|
+
|
14
|
+
### 1. Basic Text and Table Extraction
|
15
|
+
- Load PDFs from local files or URLs
|
16
|
+
- Extract text with layout preservation
|
17
|
+
- Find and extract tables automatically
|
18
|
+
- Use spatial selectors: `page.find('text:contains(Violations)').below()`
|
19
|
+
|
20
|
+
### 2. OCR Integration
|
21
|
+
- Multiple OCR engines supported: EasyOCR (default), Surya, PaddleOCR, DocTR
|
22
|
+
- Configurable resolution and detection modes
|
23
|
+
- OCR correction using LLMs
|
24
|
+
- Human-in-the-loop correction workflows with exportable packages
|
25
|
+
|
26
|
+
### 3. AI-Powered Data Extraction
|
27
|
+
- **Document Q&A**: Extractive question answering with confidence scores
|
28
|
+
- **Structured Data**: Extract specific fields with schema validation using Pydantic
|
29
|
+
- **LLM Integration**: OpenAI/Gemini compatible for advanced extraction
|
30
|
+
- **Classification**: Document/page categorization using text or vision models
|
31
|
+
|
32
|
+
### 4. Advanced Document Processing
|
33
|
+
- **Multi-column/Page Flows**: Reflow content across columns or pages for proper reading order
|
34
|
+
- **Layout Analysis**: YOLO, TATR for automatic document structure detection
|
35
|
+
- **Visual Element Detection**: Checkbox classification, form field extraction
|
36
|
+
- **Table Structure Detection**: Manual line detection for complex tables
|
37
|
+
|
38
|
+
### 5. Visualization and Display
|
39
|
+
- **Page Limit for show()**: By default, `pdf.show()` displays only the first 30 pages to prevent overwhelming displays
|
40
|
+
- Use `pdf.show(limit=10)` to show fewer pages
|
41
|
+
- Use `pdf.show(limit=None)` to display all pages
|
42
|
+
- Works with all layout options: `pdf.show(limit=20, layout='grid', columns=4)`
|
43
|
+
- **Exclusion Zone Visualization**: Use `exclusions='red'` parameter to visualize exclusion zones
|
44
|
+
- `page.show(exclusions='red')` highlights exclusions in red
|
45
|
+
- `page.show(exclusions='blue')` highlights exclusions in blue
|
46
|
+
- `page.show(exclusions=True)` uses default red color
|
47
|
+
- Works at PDF level too: `pdf.show(exclusions='green')`
|
48
|
+
|
49
|
+
### 6. Directional Navigation Improvements
|
50
|
+
- **Smart defaults for spatial methods**:
|
51
|
+
- `.left()` and `.right()` now default to `height='element'` (matches element height)
|
52
|
+
- `.above()` and `.below()` continue to default to `width='full'` (full page width)
|
53
|
+
- This matches common use cases: looking sideways usually wants same height, looking up/down wants full width
|
54
|
+
- **Enhanced discoverability**:
|
55
|
+
- Docstrings include examples showing different height/width options
|
56
|
+
- Clear parameter names ('height' for left/right, 'width' for above/below)
|
57
|
+
|
58
|
+
### 6a. Enhanced Exclusion Support
|
59
|
+
- **ElementCollection support in callable exclusions**: `pdf.add_exclusion(lambda page: page.find_all('text:contains("Header")'))` now works
|
60
|
+
- **List/iterable support**: Callable exclusions can return lists or other iterables of elements
|
61
|
+
- **Automatic conversion**: Elements from iterables are automatically converted to exclusion regions
|
62
|
+
- **Backward compatibility**: Existing Region and callable exclusions continue to work unchanged
|
63
|
+
|
64
|
+
### 7. Page Grouping with groupby()
|
65
|
+
- **Simple grouping by selector text**: `pages.groupby('text[size=16]')` groups by header text
|
66
|
+
- **Callable functions for complex logic**: `pages.groupby(lambda p: p.find('text:contains("CITY")').extract_text())`
|
67
|
+
- **Pandas-style iteration**: `for title, pages in grouped:` (no `.items()` needed)
|
68
|
+
- **Dict-like access**: `grouped.get('CITY OF MADISON')` or `grouped.get_group('key')`
|
69
|
+
- **Index-based access**: `grouped[0]` (first group), `grouped[-1]` (last group), `grouped['key']` (by name)
|
70
|
+
- **Group exploration**: `grouped.info()` shows all groups with indexes and page counts
|
71
|
+
- **Batch operations**: `grouped.apply(lambda pages: len(pages.find_all('table')))`
|
72
|
+
- **Visual inspection**: `grouped.show(limit=2)` shows first 2 pages of each group
|
73
|
+
- **Progress bar support**: Automatic progress bars for large collections, disable with `show_progress=False`
|
74
|
+
- **None handling**: Pages with no matching elements group under `None` key
|
75
|
+
|
76
|
+
## Development Best Practices
|
77
|
+
|
78
|
+
### File and Resource Management
|
79
|
+
- When making temp files, put them in temp/
|
80
|
+
- When creating test files, put them in tests/
|
81
|
+
- Most fixes and changes need a test, and should be done with test-driven development
|
82
|
+
|
83
|
+
### Environment and Tooling
|
84
|
+
- Always use the virtual environment in .venv
|
85
|
+
- Use uv when possible for efficient package management
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.2.1.dev0
|
3
|
+
Version: 0.2.3
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -14,7 +14,7 @@ License-File: LICENSE
|
|
14
14
|
Requires-Dist: scikit-learn
|
15
15
|
Requires-Dist: markdown
|
16
16
|
Requires-Dist: pandas
|
17
|
-
Requires-Dist: pdfplumber
|
17
|
+
Requires-Dist: pdfplumber>=0.11.7
|
18
18
|
Requires-Dist: colormath2
|
19
19
|
Requires-Dist: pillow
|
20
20
|
Requires-Dist: colour
|
@@ -105,7 +105,7 @@ page.find_all('region[model=tatr]').show(group_by='region_type', width=700)
|
|
105
105
|
|
106
106
|
# page.analyze_layout(engine="docling")
|
107
107
|
# page.find_all('region[model=docling]').show(group_by='region_type')
|
108
|
-
# page.
|
108
|
+
# page.render(width=700)
|
109
109
|
```
|
110
110
|
|
111
111
|
```python
|
@@ -156,11 +156,25 @@ elements.show(color="red") # Single collection
|
|
156
156
|
elements.show(color="blue", label="Headers") # With label
|
157
157
|
elements.show(group_by='type') # Color by type
|
158
158
|
|
159
|
-
#
|
159
|
+
# Quick highlighting (one-liner)
|
160
|
+
page.highlight(elements1, elements2, elements3) # Multiple elements
|
161
|
+
page.highlight( # With custom colors
|
162
|
+
(elements1, 'red'),
|
163
|
+
(elements2, 'blue'),
|
164
|
+
(elements3, 'green')
|
165
|
+
)
|
166
|
+
|
167
|
+
# Multiple collections with context manager
|
160
168
|
with page.highlights() as h:
|
161
169
|
h.add(elements1, color="red", label="Type 1")
|
162
170
|
h.add(elements2, color="blue", label="Type 2")
|
163
171
|
h.show()
|
172
|
+
|
173
|
+
# Auto-display in Jupyter/Colab
|
174
|
+
with page.highlights(show=True) as h:
|
175
|
+
h.add(elements1, label="Headers")
|
176
|
+
h.add(elements2, label="Content")
|
177
|
+
# Displays automatically when exiting context
|
164
178
|
```
|
165
179
|
|
166
180
|
### Viewing
|
@@ -83,6 +83,47 @@ with page.highlights() as h:
|
|
83
83
|
h.show()
|
84
84
|
```
|
85
85
|
|
86
|
+
### Jupyter/Colab Support
|
87
|
+
|
88
|
+
In Jupyter notebooks and Google Colab, you can use `show=True` to automatically display the highlights when exiting the context:
|
89
|
+
|
90
|
+
```python
|
91
|
+
# Automatically displays the image in Jupyter/Colab
|
92
|
+
with page.highlights(show=True) as h:
|
93
|
+
h.add(summary_elements, label='Summary')
|
94
|
+
h.add(date_elements, label='Date')
|
95
|
+
h.add(line_elements, label='Lines')
|
96
|
+
# No need to call h.show() - displays automatically!
|
97
|
+
```
|
98
|
+
|
99
|
+
### Quick Highlighting with `.highlight()`
|
100
|
+
|
101
|
+
For simple highlighting tasks, use the `.highlight()` convenience method:
|
102
|
+
|
103
|
+
```python
|
104
|
+
# Highlight multiple elements in one line
|
105
|
+
page.highlight(summary_elements, date_elements, line_elements)
|
106
|
+
|
107
|
+
# With custom colors
|
108
|
+
page.highlight(
|
109
|
+
(summary_elements, 'red'),
|
110
|
+
(date_elements, 'blue'),
|
111
|
+
(line_elements, 'green')
|
112
|
+
)
|
113
|
+
|
114
|
+
# With colors and labels
|
115
|
+
page.highlight(
|
116
|
+
(summary_elements, 'red', 'Summary Text'),
|
117
|
+
(date_elements, 'blue', 'Date Fields'),
|
118
|
+
(line_elements, 'green', 'Separator Lines')
|
119
|
+
)
|
120
|
+
|
121
|
+
# Pass additional parameters like width or resolution
|
122
|
+
page.highlight(summary_elements, date_elements, width=800, labels=True)
|
123
|
+
```
|
124
|
+
|
125
|
+
This method is particularly useful in Jupyter/Colab environments where the image displays automatically as the cell output.
|
126
|
+
|
86
127
|
## Customizing Multiple Highlights
|
87
128
|
|
88
129
|
Customize the appearance of multiple highlights using the context manager:
|
@@ -133,7 +174,7 @@ content = title.below(height=200)
|
|
133
174
|
content.show()
|
134
175
|
```
|
135
176
|
|
136
|
-
Or look at just the region by itself
|
177
|
+
Or look at just the region by itself:
|
137
178
|
|
138
179
|
```python
|
139
180
|
# Find a title and create a region below it
|
@@ -144,6 +185,27 @@ content = title.below(height=200)
|
|
144
185
|
content.show(crop=True)
|
145
186
|
```
|
146
187
|
|
188
|
+
### Highlighting Multiple Regions
|
189
|
+
|
190
|
+
The `.highlight()` method works with regions too:
|
191
|
+
|
192
|
+
```python
|
193
|
+
# Create multiple regions
|
194
|
+
left = page.region(left=0, right=page.width/3, top=0, bottom=page.height)
|
195
|
+
mid = page.region(left=page.width/3, right=page.width/3*2, top=0, bottom=page.height)
|
196
|
+
right = page.region(left=page.width/3*2, right=page.width, top=0, bottom=page.height)
|
197
|
+
|
198
|
+
# Highlight all three regions
|
199
|
+
page.highlight(left, mid, right)
|
200
|
+
|
201
|
+
# Or with custom colors
|
202
|
+
page.highlight(
|
203
|
+
(left, 'red', 'Left Column'),
|
204
|
+
(mid, 'green', 'Middle Column'),
|
205
|
+
(right, 'blue', 'Right Column')
|
206
|
+
)
|
207
|
+
```
|
208
|
+
|
147
209
|
## Working with Text Styles
|
148
210
|
|
149
211
|
Visualize text styles to understand the document structure:
|
@@ -3,7 +3,7 @@
|
|
3
3
|
import json
|
4
4
|
import logging
|
5
5
|
from collections import UserList
|
6
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
|
6
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
from PIL import Image, ImageDraw
|
@@ -16,6 +16,7 @@ if TYPE_CHECKING:
|
|
16
16
|
from natural_pdf.elements.element_collection import ElementCollection
|
17
17
|
from natural_pdf.elements.region import Region
|
18
18
|
from natural_pdf.flows.region import FlowRegion
|
19
|
+
from natural_pdf.tables.result import TableResult
|
19
20
|
|
20
21
|
logger = logging.getLogger(__name__)
|
21
22
|
|
@@ -131,6 +132,15 @@ class GuidesList(UserList):
|
|
131
132
|
self._parent = parent_guides
|
132
133
|
self._axis = axis
|
133
134
|
|
135
|
+
def __getitem__(self, i):
|
136
|
+
"""Override to handle slicing properly."""
|
137
|
+
if isinstance(i, slice):
|
138
|
+
# Return a new GuidesList with the sliced data
|
139
|
+
return self.__class__(self._parent, self._axis, self.data[i])
|
140
|
+
else:
|
141
|
+
# For single index, return the value directly
|
142
|
+
return self.data[i]
|
143
|
+
|
134
144
|
def from_content(
|
135
145
|
self,
|
136
146
|
markers: Union[str, List[str], "ElementCollection", None],
|
@@ -140,6 +150,7 @@ class GuidesList(UserList):
|
|
140
150
|
tolerance: float = 5,
|
141
151
|
*,
|
142
152
|
append: bool = False,
|
153
|
+
apply_exclusions: bool = True,
|
143
154
|
) -> "Guides":
|
144
155
|
"""
|
145
156
|
Create guides from content markers and add to this axis.
|
@@ -154,6 +165,7 @@ class GuidesList(UserList):
|
|
154
165
|
align: How to align guides relative to found elements
|
155
166
|
outer: Whether to add outer boundary guides
|
156
167
|
tolerance: Tolerance for snapping to element edges
|
168
|
+
apply_exclusions: Whether to apply exclusion zones when searching for text
|
157
169
|
|
158
170
|
Returns:
|
159
171
|
Parent Guides object for chaining
|
@@ -178,6 +190,7 @@ class GuidesList(UserList):
|
|
178
190
|
align=align,
|
179
191
|
outer=outer,
|
180
192
|
tolerance=tolerance,
|
193
|
+
apply_exclusions=apply_exclusions,
|
181
194
|
)
|
182
195
|
|
183
196
|
# Collect guides from this region
|
@@ -260,6 +273,7 @@ class GuidesList(UserList):
|
|
260
273
|
align=align,
|
261
274
|
outer=outer,
|
262
275
|
tolerance=tolerance,
|
276
|
+
apply_exclusions=apply_exclusions,
|
263
277
|
)
|
264
278
|
|
265
279
|
# Replace or append based on parameter
|
@@ -1398,6 +1412,7 @@ class Guides:
|
|
1398
1412
|
align: Literal["left", "right", "center", "between"] = "left",
|
1399
1413
|
outer: bool = True,
|
1400
1414
|
tolerance: float = 5,
|
1415
|
+
apply_exclusions: bool = True,
|
1401
1416
|
) -> "Guides":
|
1402
1417
|
"""
|
1403
1418
|
Create guides based on text content positions.
|
@@ -1413,6 +1428,7 @@ class Guides:
|
|
1413
1428
|
align: Where to place guides relative to found text
|
1414
1429
|
outer: Whether to add guides at the boundaries
|
1415
1430
|
tolerance: Maximum distance to search for text
|
1431
|
+
apply_exclusions: Whether to apply exclusion zones when searching for text
|
1416
1432
|
|
1417
1433
|
Returns:
|
1418
1434
|
New Guides object aligned to text content
|
@@ -1431,6 +1447,7 @@ class Guides:
|
|
1431
1447
|
align=align,
|
1432
1448
|
outer=outer,
|
1433
1449
|
tolerance=tolerance,
|
1450
|
+
apply_exclusions=apply_exclusions,
|
1434
1451
|
)
|
1435
1452
|
|
1436
1453
|
# Store in flow guides
|
@@ -1469,7 +1486,7 @@ class Guides:
|
|
1469
1486
|
# Find each marker and determine guide position
|
1470
1487
|
for marker in marker_texts:
|
1471
1488
|
if hasattr(obj, "find"):
|
1472
|
-
element = obj.find(f'text:contains("{marker}")')
|
1489
|
+
element = obj.find(f'text:contains("{marker}")', apply_exclusions=apply_exclusions)
|
1473
1490
|
if element:
|
1474
1491
|
if axis == "vertical":
|
1475
1492
|
if align == "left":
|
@@ -1498,7 +1515,9 @@ class Guides:
|
|
1498
1515
|
marker_bounds = []
|
1499
1516
|
for marker in marker_texts:
|
1500
1517
|
if hasattr(obj, "find"):
|
1501
|
-
element = obj.find(f'text:contains("{marker}")')
|
1518
|
+
element = obj.find(
|
1519
|
+
f'text:contains("{marker}")', apply_exclusions=apply_exclusions
|
1520
|
+
)
|
1502
1521
|
if element:
|
1503
1522
|
if axis == "vertical":
|
1504
1523
|
marker_bounds.append((element.x0, element.x1))
|
@@ -3285,6 +3304,7 @@ class Guides:
|
|
3285
3304
|
align: Literal["left", "right", "center", "between"] = "left",
|
3286
3305
|
outer: bool = True,
|
3287
3306
|
tolerance: float = 5,
|
3307
|
+
apply_exclusions: bool = True,
|
3288
3308
|
) -> "Guides":
|
3289
3309
|
"""
|
3290
3310
|
Instance method: Add guides from content, allowing chaining.
|
@@ -3301,6 +3321,7 @@ class Guides:
|
|
3301
3321
|
align: How to align guides relative to found elements
|
3302
3322
|
outer: Whether to add outer boundary guides
|
3303
3323
|
tolerance: Tolerance for snapping to element edges
|
3324
|
+
apply_exclusions: Whether to apply exclusion zones when searching for text
|
3304
3325
|
|
3305
3326
|
Returns:
|
3306
3327
|
Self for method chaining
|
@@ -3318,6 +3339,7 @@ class Guides:
|
|
3318
3339
|
align=align,
|
3319
3340
|
outer=outer,
|
3320
3341
|
tolerance=tolerance,
|
3342
|
+
apply_exclusions=apply_exclusions,
|
3321
3343
|
)
|
3322
3344
|
|
3323
3345
|
# Add the appropriate coordinates to this object
|
@@ -3421,6 +3443,140 @@ class Guides:
|
|
3421
3443
|
|
3422
3444
|
return self
|
3423
3445
|
|
3446
|
+
def extract_table(
|
3447
|
+
self,
|
3448
|
+
target: Optional[Union["Page", "Region"]] = None,
|
3449
|
+
source: str = "guides_temp",
|
3450
|
+
cell_padding: float = 0.5,
|
3451
|
+
include_outer_boundaries: bool = False,
|
3452
|
+
method: Optional[str] = None,
|
3453
|
+
table_settings: Optional[dict] = None,
|
3454
|
+
use_ocr: bool = False,
|
3455
|
+
ocr_config: Optional[dict] = None,
|
3456
|
+
text_options: Optional[Dict] = None,
|
3457
|
+
cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
3458
|
+
show_progress: bool = False,
|
3459
|
+
content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
|
3460
|
+
*,
|
3461
|
+
multi_page: Literal["auto", True, False] = "auto",
|
3462
|
+
) -> "TableResult":
|
3463
|
+
"""
|
3464
|
+
Extract table data directly from guides without leaving temporary regions.
|
3465
|
+
|
3466
|
+
This method:
|
3467
|
+
1. Creates table structure using build_grid()
|
3468
|
+
2. Extracts table data from the created table region
|
3469
|
+
3. Cleans up all temporary regions
|
3470
|
+
4. Returns the TableResult
|
3471
|
+
|
3472
|
+
Args:
|
3473
|
+
target: Page or Region to create regions on (uses self.context if None)
|
3474
|
+
source: Source label for temporary regions (will be cleaned up)
|
3475
|
+
cell_padding: Internal padding for cell regions in points
|
3476
|
+
include_outer_boundaries: Whether to add boundaries at edges if missing
|
3477
|
+
method: Table extraction method ('tatr', 'pdfplumber', 'text', etc.)
|
3478
|
+
table_settings: Settings for pdfplumber table extraction
|
3479
|
+
use_ocr: Whether to use OCR for text extraction
|
3480
|
+
ocr_config: OCR configuration parameters
|
3481
|
+
text_options: Dictionary of options for the 'text' method
|
3482
|
+
cell_extraction_func: Optional callable for custom cell text extraction
|
3483
|
+
show_progress: Controls progress bar for text method
|
3484
|
+
content_filter: Content filtering function or patterns
|
3485
|
+
multi_page: Controls multi-region table creation for FlowRegions
|
3486
|
+
|
3487
|
+
Returns:
|
3488
|
+
TableResult: Extracted table data
|
3489
|
+
|
3490
|
+
Raises:
|
3491
|
+
ValueError: If no table region is created from the guides
|
3492
|
+
|
3493
|
+
Example:
|
3494
|
+
```python
|
3495
|
+
from natural_pdf.analyzers import Guides
|
3496
|
+
|
3497
|
+
# Create guides from detected lines
|
3498
|
+
guides = Guides.from_lines(page, source_label="detected")
|
3499
|
+
|
3500
|
+
# Extract table directly - no temporary regions left behind
|
3501
|
+
table_data = guides.extract_table()
|
3502
|
+
|
3503
|
+
# Convert to pandas DataFrame
|
3504
|
+
df = table_data.to_df()
|
3505
|
+
```
|
3506
|
+
"""
|
3507
|
+
target_obj = target or self.context
|
3508
|
+
if not target_obj:
|
3509
|
+
raise ValueError("No target object available. Provide target parameter or context.")
|
3510
|
+
|
3511
|
+
# Get the page for cleanup later
|
3512
|
+
if hasattr(target_obj, "x0") and hasattr(target_obj, "top"): # Region
|
3513
|
+
page = target_obj._page
|
3514
|
+
element_manager = page._element_mgr
|
3515
|
+
elif hasattr(target_obj, "_element_mgr"): # Page
|
3516
|
+
page = target_obj
|
3517
|
+
element_manager = page._element_mgr
|
3518
|
+
else:
|
3519
|
+
raise ValueError(f"Target object {target_obj} is not a Page or Region")
|
3520
|
+
|
3521
|
+
try:
|
3522
|
+
# Step 1: Build grid structure (creates temporary regions)
|
3523
|
+
grid_result = self.build_grid(
|
3524
|
+
target=target_obj,
|
3525
|
+
source=source,
|
3526
|
+
cell_padding=cell_padding,
|
3527
|
+
include_outer_boundaries=include_outer_boundaries,
|
3528
|
+
multi_page=multi_page,
|
3529
|
+
)
|
3530
|
+
|
3531
|
+
# Step 2: Get the table region and extract table data
|
3532
|
+
table_region = grid_result["regions"]["table"]
|
3533
|
+
if table_region is None:
|
3534
|
+
raise ValueError(
|
3535
|
+
"No table region was created from the guides. Check that you have both vertical and horizontal guides."
|
3536
|
+
)
|
3537
|
+
|
3538
|
+
# Handle multi-page case where table_region might be a list
|
3539
|
+
if isinstance(table_region, list):
|
3540
|
+
if not table_region:
|
3541
|
+
raise ValueError("No table regions were created from the guides.")
|
3542
|
+
# Use the first table region for extraction
|
3543
|
+
table_region = table_region[0]
|
3544
|
+
|
3545
|
+
# Step 3: Extract table data using the region's extract_table method
|
3546
|
+
table_result = table_region.extract_table(
|
3547
|
+
method=method,
|
3548
|
+
table_settings=table_settings,
|
3549
|
+
use_ocr=use_ocr,
|
3550
|
+
ocr_config=ocr_config,
|
3551
|
+
text_options=text_options,
|
3552
|
+
cell_extraction_func=cell_extraction_func,
|
3553
|
+
show_progress=show_progress,
|
3554
|
+
content_filter=content_filter,
|
3555
|
+
)
|
3556
|
+
|
3557
|
+
return table_result
|
3558
|
+
|
3559
|
+
finally:
|
3560
|
+
# Step 4: Clean up all temporary regions created by build_grid
|
3561
|
+
# This ensures no regions are left behind regardless of success/failure
|
3562
|
+
try:
|
3563
|
+
regions_to_remove = [
|
3564
|
+
r
|
3565
|
+
for r in element_manager.regions
|
3566
|
+
if getattr(r, "source", None) == source
|
3567
|
+
and getattr(r, "region_type", None)
|
3568
|
+
in {"table", "table_row", "table_column", "table_cell"}
|
3569
|
+
]
|
3570
|
+
|
3571
|
+
for region in regions_to_remove:
|
3572
|
+
element_manager.remove_element(region, element_type="regions")
|
3573
|
+
|
3574
|
+
if regions_to_remove:
|
3575
|
+
logger.debug(f"Cleaned up {len(regions_to_remove)} temporary regions")
|
3576
|
+
|
3577
|
+
except Exception as cleanup_err:
|
3578
|
+
logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
|
3579
|
+
|
3424
3580
|
def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
|
3425
3581
|
"""Determines if a FlowRegion's constituent parts are arranged vertically or horizontally."""
|
3426
3582
|
if not self.is_flow_region or len(self.context.constituent_regions) < 2:
|
@@ -29,9 +29,22 @@ class DirectionalCollectionMixin:
|
|
29
29
|
"""Find regions to the right of all elements in this collection."""
|
30
30
|
return self.apply(lambda element: element.right(**kwargs))
|
31
31
|
|
32
|
-
def expand(self, **kwargs) -> "ElementCollection":
|
33
|
-
"""Expand all elements in this collection.
|
34
|
-
|
32
|
+
def expand(self, *args, **kwargs) -> "ElementCollection":
|
33
|
+
"""Expand all elements in this collection.
|
34
|
+
|
35
|
+
Args:
|
36
|
+
*args: If a single positional argument is provided, expands all elements
|
37
|
+
by that amount in all directions.
|
38
|
+
**kwargs: Keyword arguments for directional expansion (left, right, top, bottom, etc.)
|
39
|
+
|
40
|
+
Examples:
|
41
|
+
# Expand all elements by 5 pixels in all directions
|
42
|
+
collection.expand(5)
|
43
|
+
|
44
|
+
# Expand with different amounts in each direction
|
45
|
+
collection.expand(left=10, right=5, top=3, bottom=7)
|
46
|
+
"""
|
47
|
+
return self.apply(lambda element: element.expand(*args, **kwargs))
|
35
48
|
|
36
49
|
|
37
50
|
class ApplyMixin:
|
@@ -335,6 +335,7 @@ class HighlightContext:
|
|
335
335
|
self.show_on_exit = show_on_exit
|
336
336
|
self.highlight_groups = []
|
337
337
|
self._color_manager = ColorManager()
|
338
|
+
self._exit_image = None # Store image for Jupyter display
|
338
339
|
|
339
340
|
def add(
|
340
341
|
self,
|
@@ -421,6 +422,11 @@ class HighlightContext:
|
|
421
422
|
)
|
422
423
|
return None
|
423
424
|
|
425
|
+
@property
|
426
|
+
def image(self) -> Optional[Image.Image]:
|
427
|
+
"""Get the last generated image (useful after context exit)."""
|
428
|
+
return self._exit_image
|
429
|
+
|
424
430
|
def __enter__(self) -> "HighlightContext":
|
425
431
|
"""Enter the context."""
|
426
432
|
return self
|
@@ -428,7 +434,25 @@ class HighlightContext:
|
|
428
434
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
429
435
|
"""Exit the context, optionally showing highlights."""
|
430
436
|
if self.show_on_exit and not exc_type:
|
431
|
-
self.show()
|
437
|
+
self._exit_image = self.show()
|
438
|
+
|
439
|
+
# Check if we're in a Jupyter/IPython environment
|
440
|
+
try:
|
441
|
+
# Try to get IPython instance
|
442
|
+
from IPython import get_ipython
|
443
|
+
|
444
|
+
ipython = get_ipython()
|
445
|
+
if ipython is not None:
|
446
|
+
# We're in IPython/Jupyter
|
447
|
+
from IPython.display import display
|
448
|
+
|
449
|
+
if self._exit_image is not None:
|
450
|
+
display(self._exit_image)
|
451
|
+
except (ImportError, NameError):
|
452
|
+
# Not in Jupyter or IPython not available - that's OK
|
453
|
+
pass
|
454
|
+
|
455
|
+
# __exit__ must return False to not suppress exceptions
|
432
456
|
return False
|
433
457
|
|
434
458
|
|
@@ -689,7 +713,7 @@ class HighlightingService:
|
|
689
713
|
logger.debug(f"Added highlight to page {page_index}: {highlight}")
|
690
714
|
|
691
715
|
# --- Invalidate page-level image cache --------------------------------
|
692
|
-
# The Page.
|
716
|
+
# The Page.render method maintains an internal cache keyed by rendering
|
693
717
|
# parameters. Because the cache key currently does **not** incorporate
|
694
718
|
# any information about the highlights themselves, it can return stale
|
695
719
|
# images after highlights are added or removed. To ensure the next
|
@@ -700,11 +724,11 @@ class HighlightingService:
|
|
700
724
|
if hasattr(page_obj, "_to_image_cache"):
|
701
725
|
page_obj._to_image_cache.clear()
|
702
726
|
logger.debug(
|
703
|
-
f"Cleared cached
|
727
|
+
f"Cleared cached render images for page {page_index} after adding a highlight."
|
704
728
|
)
|
705
729
|
except Exception as cache_err: # pragma: no cover – never fail highlight creation
|
706
730
|
logger.warning(
|
707
|
-
f"Failed to invalidate
|
731
|
+
f"Failed to invalidate render cache for page {page_index}: {cache_err}",
|
708
732
|
exc_info=True,
|
709
733
|
)
|
710
734
|
|
@@ -737,11 +761,11 @@ class HighlightingService:
|
|
737
761
|
if hasattr(page_obj, "_to_image_cache"):
|
738
762
|
page_obj._to_image_cache.clear()
|
739
763
|
logger.debug(
|
740
|
-
f"Cleared cached
|
764
|
+
f"Cleared cached render images for page {page_index} after removing highlights."
|
741
765
|
)
|
742
766
|
except Exception as cache_err: # pragma: no cover
|
743
767
|
logger.warning(
|
744
|
-
f"Failed to invalidate
|
768
|
+
f"Failed to invalidate render cache for page {page_index}: {cache_err}",
|
745
769
|
exc_info=True,
|
746
770
|
)
|
747
771
|
|
@@ -760,7 +784,7 @@ class HighlightingService:
|
|
760
784
|
labels: bool = True,
|
761
785
|
legend_position: str = "right",
|
762
786
|
render_ocr: bool = False,
|
763
|
-
**kwargs, # Pass other args to pdfplumber.page.to_image if needed
|
787
|
+
**kwargs, # Pass other args to pdfplumber.page.to_image if needed (internal API)
|
764
788
|
) -> Optional[Image.Image]:
|
765
789
|
"""
|
766
790
|
Renders a specific page with its highlights.
|
@@ -773,7 +797,7 @@ class HighlightingService:
|
|
773
797
|
labels: Whether to include a legend for highlights.
|
774
798
|
legend_position: Position of the legend.
|
775
799
|
render_ocr: Whether to render OCR text on the image.
|
776
|
-
kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
|
800
|
+
kwargs: Additional keyword arguments for pdfplumber's internal page.to_image (e.g., width, height).
|
777
801
|
|
778
802
|
Returns:
|
779
803
|
A PIL Image object of the rendered page, or None if rendering fails.
|
@@ -957,7 +981,7 @@ class HighlightingService:
|
|
957
981
|
crop_bbox: Optional bounding box (x0, top, x1, bottom) in PDF coordinate
|
958
982
|
space to crop the output image to, before legends or other overlays are
|
959
983
|
applied. If None, no cropping is performed.
|
960
|
-
**kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
|
984
|
+
**kwargs: Additional args for pdfplumber's internal to_image (e.g., width, height).
|
961
985
|
|
962
986
|
Returns:
|
963
987
|
PIL Image of the preview, or None if rendering fails.
|