natural-pdf 0.2.15__tar.gz → 0.2.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/.gitignore +1 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/CLAUDE.md +13 -0
- {natural_pdf-0.2.15/natural_pdf.egg-info → natural_pdf-0.2.17}/PKG-INFO +1 -1
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/element-selection/index.md +110 -0
- natural_pdf-0.2.17/docs/guide_adjustment_stream.md +90 -0
- natural_pdf-0.2.17/docs/guides_boundary_columns.md +156 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/quick-reference/index.md +60 -13
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/regions/index.md +65 -1
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/05-excluding-content.md +104 -8
- natural_pdf-0.2.17/docs/tutorials/08-spatial-navigation.md +449 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/__init__.py +45 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/guides.py +359 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/core/element_manager.py +4 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/core/page.py +88 -22
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/core/page_collection.py +75 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/core/pdf.py +33 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/describe/base.py +48 -7
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/elements/base.py +408 -43
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/elements/element_collection.py +83 -10
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/elements/region.py +217 -178
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/elements/text.py +5 -3
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/flows/element.py +48 -46
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/flows/flow.py +175 -480
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/flows/region.py +76 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/selectors/parser.py +180 -9
- natural_pdf-0.2.17/natural_pdf/utils/pdfminer_patches.py +136 -0
- natural_pdf-0.2.17/natural_pdf/utils/sections.py +346 -0
- natural_pdf-0.2.17/natural_pdf/utils/spatial.py +169 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17/natural_pdf.egg-info}/PKG-INFO +1 -1
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf.egg-info/SOURCES.txt +22 -1
- natural_pdf-0.2.17/tests/test_aggregate_selectors.py +249 -0
- natural_pdf-0.2.17/tests/test_auto_multipage_option.py +63 -0
- natural_pdf-0.2.17/tests/test_exclude_multi_page.py +101 -0
- natural_pdf-0.2.17/tests/test_exclude_real_pdf.py +98 -0
- natural_pdf-0.2.17/tests/test_expand_enhanced.py +206 -0
- natural_pdf-0.2.17/tests/test_guide_adjustment_stream.py +121 -0
- natural_pdf-0.2.17/tests/test_guides_boundaries.py +266 -0
- natural_pdf-0.2.17/tests/test_guides_from_headers.py +143 -0
- natural_pdf-0.2.17/tests/test_guides_partial.py +110 -0
- natural_pdf-0.2.17/tests/test_highlight_color_falsy.py +54 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_highlight_detection_comprehensive.py +3 -2
- natural_pdf-0.2.17/tests/test_include_boundaries_fix.py +127 -0
- natural_pdf-0.2.17/tests/test_merged_flowregion_specs.py +152 -0
- natural_pdf-0.2.17/tests/test_mixed_collection_rendering.py +96 -0
- natural_pdf-0.2.17/tests/test_multipage_directional.py +159 -0
- natural_pdf-0.2.17/tests/test_pdfminer_bug_status.py +32 -0
- natural_pdf-0.2.17/tests/test_pdfminer_color_bug.py +65 -0
- natural_pdf-0.2.17/tests/test_pdfminer_color_stack_bug.py +70 -0
- natural_pdf-0.2.17/tests/test_smart_exclusion.py +122 -0
- natural_pdf-0.2.15/docs/tutorials/08-spatial-navigation.md +0 -237
- natural_pdf-0.2.15/tests/test_include_boundaries_debug.py +0 -67
- natural_pdf-0.2.15/tests/test_include_boundaries_fix.py +0 -126
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/.github/workflows/ci.yml +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/.github/workflows/nightly-tutorials.yml +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/LICENSE +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/MANIFEST.in +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/README.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/audit_packaging.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/check_run_md.sh +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/api/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/describe/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/extracting-clean-text/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/fix-messy-tables/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/fix-messy-tables/table_1.csv +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/fix-messy-tables/table_2.csv +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/fix-messy-tables/table_3.csv +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/installation/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/ocr/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/process-forms-and-invoices/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/reflowing-pages/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tables/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/tutorials/14-categorizing-documents.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/mkdocs.yml +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/classification/manager.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/cli.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/collections/mixins.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/core/page_groupby.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/core/pdf_collection.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/core/render_spec.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/describe/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/describe/elements.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/describe/mixin.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/describe/summary.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/elements/image.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/exporters/hocr.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/exporters/original_pdf.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/qa/qa_result.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/search/lancedb_search_service.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/search/numpy_search_service.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/tables/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/tables/result.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/text_mixin.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/utils/bidi_mirror.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/utils/color_utils.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/utils/layout.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/vision/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/vision/mixin.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/vision/results.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/vision/similarity.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/vision/template_matching.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf.egg-info/entry_points.txt +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf.egg-info/requires.txt +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/natural_pdf.egg-info/top_level.txt +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/noxfile.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/optimization/memory_comparison.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/optimization/pdf_analyzer.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/optimization/performance_analysis.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/optimization/performance_results/image_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/optimization/performance_results/text_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/optimization/test_cleanup_methods.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/optimization/test_memory_fix.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/publish.sh +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/pyproject.toml +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/sample-screen.png +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/setup.cfg +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/fix_page_exclusions.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_draw_guides.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_draw_guides_interactive.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_exclusion_with_debug.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_find_exclusions_fix.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_find_exclusions_fix_no_recursion.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_fix_real_pdf.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_fix_working.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_fixed_pdf_exclusions.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_guide_draw_notebook.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_horizontal_top_bottom.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_inline_js.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_marker_order.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_original_exclusions_now_work.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_pdf_exclusions_with_guides.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_region_exclusions_detailed.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_stripes_real_pdf.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_vertical_stripes.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_widget_functionality.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/temp/test_widget_simple.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/conftest.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_annotate.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_arabic_performance.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_arabic_real_world.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_color_conversion.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_color_hex_display.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_core/test_containment_geometry.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_core/test_text_layer.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_crop_enhancements.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_crop_region_highlights.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_directional_defaults.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_dissolve.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_dissolve_cross_page_bug.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_dissolve_debug_issue.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_dissolve_real_world_issue.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_dissolve_single_elements.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_dissolve_vertical_offset_issue.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_document_qa.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_element_addition.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_element_collection_guides.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_element_collection_show_cols.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_element_collection_slicing.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_element_exclusions.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_element_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_empty_pseudo_class.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_exclusions.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_expand.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_extraction_error.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_extraction_mixin_fix.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_extraction_text_and_vision.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_extraction_working.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_find_similar.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_first_last_selectors.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_fix_get_sections_zero_height.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_flow_region_directional.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_get_sections_fix_comprehensive.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_get_sections_zero_height.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_groupby.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_guides.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_guides_apply_exclusions.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_guides_apply_exclusions_simple.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_guides_extract_table.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_guides_extract_table_collections.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_guides_extract_table_exclusions.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_guides_extract_table_real.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_guides_from_stripes.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_guides_integration.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_guides_marker_sorting.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_highlight_detection.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_highlight_offset.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_highlight_protocol.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_highlight_protocol_simple.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_highlight_regions.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_horizontal_guides_alignment.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_include_boundaries_comprehensive.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_include_boundaries_final.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_include_boundaries_final_verification.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_include_boundaries_mock.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_include_boundaries_simple.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_include_boundaries_types_pdf.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_include_boundaries_verification.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_include_boundaries_with_real_text.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_match_results_sorting.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_merge_connected.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_merge_connected_real_world.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_merge_method.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_multi_page_table_discovery.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_negative_bounds_pdf.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_page_exclusion_lists.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_pdf_exclusions_in_find_methods.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_phash_masking.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_region_find_similar.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_region_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_region_viewer.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_sections_end_only.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_sections_with_start_and_end.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_show_column_layout.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_show_edge_cases.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_show_exclusions.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_show_exclusions_feature.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_show_limit.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_skip_repeating_headers_multipage.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_slice_cache_reuse.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_slice_exclusion_fix.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_slice_exclusion_issue.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_slice_exclusion_mock.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_sliced_collection_exclusions.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_spatial_offset.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_strikethrough_detection.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_table_result_header_mismatch.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_table_result_keep_blank.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_template_matching.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_template_white_masking.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_tiny_text_tables.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_tiny_text_tables_table.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_tutorials.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_underline_detection.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tests/test_update_text.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/todo/bad_pdf_analysis.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/todo/evaluation.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/README.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/__init__.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/analyser.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/collate_summaries.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/eval_suite.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/llm_enrich.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/reporter.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/tools/bad_pdf_eval/utils.py +0 -0
- {natural_pdf-0.2.15 → natural_pdf-0.2.17}/uv.lock +0 -0
@@ -61,6 +61,19 @@ Natural PDF is a Python library for intelligent PDF document processing that com
|
|
61
61
|
- **Automatic conversion**: Elements from iterables are automatically converted to exclusion regions
|
62
62
|
- **Backward compatibility**: Existing Region and callable exclusions continue to work unchanged
|
63
63
|
|
64
|
+
### 6b. Multi-page Directional Navigation
|
65
|
+
- **multipage parameter**: Directional methods now accept `multipage=True` to span pages
|
66
|
+
- `element.below(until="text:contains('End')", multipage=True)` searches across pages
|
67
|
+
- Returns `FlowRegion` when spanning multiple pages, `Region` when on a single page
|
68
|
+
- Works with all directional methods: `.below()`, `.above()`, `.left()`, `.right()`
|
69
|
+
- **Global auto_multipage option**: Set default behavior for all directional navigation
|
70
|
+
- `npdf.set_option('layout.auto_multipage', True)` enables multipage by default
|
71
|
+
- Individual calls can override with `multipage=False`
|
72
|
+
- **Use cases**:
|
73
|
+
- Extract content between headers on different pages
|
74
|
+
- Find tables that span page boundaries
|
75
|
+
- Navigate document structure without manual page handling
|
76
|
+
|
64
77
|
### 7. Page Grouping with groupby()
|
65
78
|
- **Simple grouping by selector text**: `pages.groupby('text[size=16]')` groups by header text
|
66
79
|
- **Callable functions for complex logic**: `pages.groupby(lambda p: p.find('text:contains("CITY")').extract_text())`
|
@@ -223,6 +223,116 @@ headings.extract_text()
|
|
223
223
|
|
224
224
|
*Note: `.highest()`, `.lowest()`, etc. will complain if your collection spans multiple pages.*
|
225
225
|
|
226
|
+
## Finding Elements with Statistical Properties
|
227
|
+
|
228
|
+
Sometimes you need to find elements based on extreme or typical values - the leftmost text, the largest font, or the most common color. Natural PDF's aggregate selectors make this easy using statistical functions like `min()`, `max()`, `avg()`, and `mode()`.
|
229
|
+
|
230
|
+
### Position-Based Selection
|
231
|
+
|
232
|
+
```python
|
233
|
+
# Find the leftmost text element on the page
|
234
|
+
leftmost = page.find('text[x0=min()]')
|
235
|
+
leftmost.show()
|
236
|
+
```
|
237
|
+
|
238
|
+
```python
|
239
|
+
# Find the rightmost text (useful for page numbers)
|
240
|
+
rightmost = page.find('text[x1=max()]')
|
241
|
+
rightmost.show()
|
242
|
+
```
|
243
|
+
|
244
|
+
```python
|
245
|
+
# Find text at the top and bottom of the page
|
246
|
+
topmost = page.find('text[top=min()]')
|
247
|
+
bottommost = page.find('text[bottom=max()]')
|
248
|
+
```
|
249
|
+
|
250
|
+
### Size and Dimension Selection
|
251
|
+
|
252
|
+
```python
|
253
|
+
# Find the largest text (often titles or headings)
|
254
|
+
largest_text = page.find('text[size=max()]')
|
255
|
+
print(f"Largest text: {largest_text.extract_text()} (size: {largest_text.size})")
|
256
|
+
```
|
257
|
+
|
258
|
+
```python
|
259
|
+
# Find elements with average dimensions
|
260
|
+
avg_width_text = page.find_all('text[width=avg()]')
|
261
|
+
median_height_text = page.find_all('text[height=median()]')
|
262
|
+
```
|
263
|
+
|
264
|
+
### Finding Most Common Values
|
265
|
+
|
266
|
+
The `mode()` function (or its alias `most_common()`) finds elements with the most frequently occurring value for any attribute:
|
267
|
+
|
268
|
+
```python
|
269
|
+
# Find text with the most common font size (body text)
|
270
|
+
body_text = page.find_all('text[size=mode()]')
|
271
|
+
print(f"Most common font size: {body_text.first.size if body_text else 'N/A'}")
|
272
|
+
```
|
273
|
+
|
274
|
+
```python
|
275
|
+
# Find elements with the most common font name
|
276
|
+
common_font = page.find_all('text[fontname=most_common()]')
|
277
|
+
```
|
278
|
+
|
279
|
+
### Color Proximity Matching
|
280
|
+
|
281
|
+
For color attributes, you can find elements with colors closest to a target:
|
282
|
+
|
283
|
+
```python
|
284
|
+
# Find text closest to red
|
285
|
+
red_text = page.find_all('text[color=closest("red")]')
|
286
|
+
|
287
|
+
# Find rectangles with fill color closest to blue
|
288
|
+
blue_rects = page.find_all('rect[fill=closest("#0000FF")]')
|
289
|
+
|
290
|
+
# Works with any color format
|
291
|
+
nearly_black = page.find_all('text[color=closest("rgb(10,10,10)")]')
|
292
|
+
```
|
293
|
+
|
294
|
+
### Combining Aggregate Conditions
|
295
|
+
|
296
|
+
Multiple aggregate conditions create an intersection - elements must satisfy ALL conditions:
|
297
|
+
|
298
|
+
```python
|
299
|
+
# Find text that is both leftmost AND largest
|
300
|
+
special_text = page.find('text[x0=min()][size=max()]')
|
301
|
+
|
302
|
+
# Find text larger than size 12 that is also the topmost text on the page
|
303
|
+
topmost_large = page.find('text[size>12][top=min()]')
|
304
|
+
```
|
305
|
+
|
306
|
+
### Using Aggregates in Complex Selectors
|
307
|
+
|
308
|
+
Aggregate functions work seamlessly with all Natural PDF features:
|
309
|
+
|
310
|
+
```python
|
311
|
+
# In OR selectors - find either the leftmost text OR the largest rectangle
|
312
|
+
elements = page.find_all('text[x0=min()]|rect[width=max()]')
|
313
|
+
|
314
|
+
# With spatial navigation
|
315
|
+
element = page.find('text')
|
316
|
+
# Navigate right until reaching the leftmost element
|
317
|
+
right_region = element.right(until='text[x0=min()]')
|
318
|
+
|
319
|
+
# With filters - bold text that is also the leftmost text on the page
|
320
|
+
leftmost_bold = page.find('text:bold[x0=min()]')
|
321
|
+
```
|
322
|
+
|
323
|
+
### Available Aggregate Functions
|
324
|
+
|
325
|
+
| Function | Alias | Description | Works On |
|
326
|
+
|----------|-------|-------------|----------|
|
327
|
+
| `min()` | - | Minimum value | Numeric attributes |
|
328
|
+
| `max()` | - | Maximum value | Numeric attributes |
|
329
|
+
| `avg()` | `mean()` | Average/mean value | Numeric attributes |
|
330
|
+
| `median()` | - | Median value | Numeric attributes |
|
331
|
+
| `mode()` | `most_common()` | Most frequent value | Any attribute |
|
332
|
+
| `closest(value)` | - | Closest match (colors only) | Color attributes |
|
333
|
+
|
334
|
+
**Note**: Aggregates are calculated across all elements of the same type. For example, `text[x0=min()]` finds the minimum x0 among ALL text elements, not just those matching other filters.
|
335
|
+
|
226
336
|
## Dealing with Weird Font Names
|
227
337
|
|
228
338
|
PDFs sometimes have bizarre font names that don't look like normal fonts. Don't worry - they're usually normal fonts with weird internal names.
|
@@ -0,0 +1,90 @@
|
|
1
|
+
# Guide Adjustment for Stream Extraction
|
2
|
+
|
3
|
+
## Overview
|
4
|
+
|
5
|
+
When using the `stream` extraction method (text-based edge detection) with explicit vertical guides, Natural PDF automatically adjusts guides that fall outside text bounds to ensure proper table extraction.
|
6
|
+
|
7
|
+
## The Problem
|
8
|
+
|
9
|
+
In pdfplumber's stream method, horizontal edges are only created where text exists. If vertical guides are placed outside the horizontal extent of text (e.g., at x=0 when text starts at x=51.6), these guides won't intersect with horizontal edges, causing missing columns in the extracted table.
|
10
|
+
|
11
|
+
## The Solution
|
12
|
+
|
13
|
+
Natural PDF automatically clips vertical guides to text bounds when:
|
14
|
+
1. Using `method="stream"` or `horizontal_strategy="text"`
|
15
|
+
2. Explicit vertical lines are provided
|
16
|
+
3. Text elements exist in the region
|
17
|
+
|
18
|
+
## Example
|
19
|
+
|
20
|
+
```python
|
21
|
+
from natural_pdf import PDF
|
22
|
+
from natural_pdf.analyzers.guides import Guides
|
23
|
+
|
24
|
+
# Load PDF and find headers
|
25
|
+
pdf = PDF("document.pdf")
|
26
|
+
page = pdf[0]
|
27
|
+
headers = page.find_all("text[y<100]") # Find header row
|
28
|
+
|
29
|
+
# Create guides from headers
|
30
|
+
guides = Guides(page)
|
31
|
+
guides.vertical.from_headers(headers, margin=0)
|
32
|
+
|
33
|
+
# Guides might include page boundaries (0, page.width)
|
34
|
+
# which could be outside text bounds
|
35
|
+
|
36
|
+
# Extract table - guides are automatically adjusted
|
37
|
+
table = page.extract_table(method="stream", verticals=guides.vertical.data)
|
38
|
+
|
39
|
+
# All columns including first and last are properly extracted
|
40
|
+
```
|
41
|
+
|
42
|
+
## How It Works
|
43
|
+
|
44
|
+
1. **Detection**: Natural PDF checks whether the stream method is being used with explicit vertical guides
|
45
|
+
2. **Text Bounds**: The system finds all text elements and determines their bounding box
|
46
|
+
3. **Adjustment**:
|
47
|
+
- Guides left of text bounds are moved to the left edge of text
|
48
|
+
- Guides right of text bounds are moved to the right edge of text
|
49
|
+
- Guides within text bounds remain unchanged
|
50
|
+
4. **Extraction**: The adjusted guides are used for table extraction
|
51
|
+
|
52
|
+
## When This Applies
|
53
|
+
|
54
|
+
Guide adjustment happens when ALL of these conditions are met:
|
55
|
+
- Extraction method is `pdfplumber` (or its alias `stream`)
|
56
|
+
- `horizontal_strategy` is `"text"` (text-based edge detection)
|
57
|
+
- `vertical_strategy` is `"explicit"` (using provided guides)
|
58
|
+
- `explicit_vertical_lines` are provided in table settings
|
59
|
+
|
60
|
+
## Debugging
|
61
|
+
|
62
|
+
Enable debug logging to see guide adjustments:
|
63
|
+
|
64
|
+
```python
|
65
|
+
import logging
|
66
|
+
logging.basicConfig(level=logging.DEBUG)
|
67
|
+
|
68
|
+
# Extract table - will show adjustment messages
|
69
|
+
table = page.extract_table(method="stream", verticals=guides.vertical.data)
|
70
|
+
```
|
71
|
+
|
72
|
+
Example debug output:
|
73
|
+
```
|
74
|
+
Region (0, 0, 1224, 1584): Adjusted left guide from 0.0 to 51.6
|
75
|
+
Region (0, 0, 1224, 1584): Adjusted right guide from 1224.0 to 1155.7
|
76
|
+
Region (0, 0, 1224, 1584): Adjusted 26 guides for stream extraction. Text bounds: 51.6-1155.7
|
77
|
+
```
|
78
|
+
|
79
|
+
## Other Methods
|
80
|
+
|
81
|
+
This adjustment only applies to stream/text-based extraction. When using:
|
82
|
+
- `method="lattice"` (line-based): No adjustment, guides used as-is
|
83
|
+
- `method="tatr"` or `method="text"`: Different extraction methods, guides not used
|
84
|
+
|
85
|
+
## Best Practices
|
86
|
+
|
87
|
+
1. **Use from_headers()**: This method creates appropriate guides for your content
|
88
|
+
2. **Set margin=0**: For tables that span the full width of text
|
89
|
+
3. **Verify with lattice first**: If your PDF has visible lines, lattice method may work better
|
90
|
+
4. **Check text bounds**: Use `page.find_all("text").merge().bbox` to see text extent
|
@@ -0,0 +1,156 @@
|
|
1
|
+
# Missing First/Last Columns in guides.extract_table()
|
2
|
+
|
3
|
+
## Problem Description
|
4
|
+
|
5
|
+
When using `guides.extract_table()`, the first and last columns may be missing from the extracted table. This happens because the `Guides.from_lines()` method by default does not include the page boundaries (x=0 and x=page.width) as vertical guides.
|
6
|
+
|
7
|
+
### Example of the Issue
|
8
|
+
|
9
|
+
```python
|
10
|
+
# Default behavior - may miss boundary columns
|
11
|
+
guides = Guides.from_lines(page)
|
12
|
+
result = guides.extract_table()
|
13
|
+
# First column "OFFICER" and last column may be missing
|
14
|
+
```
|
15
|
+
|
16
|
+
## Root Cause
|
17
|
+
|
18
|
+
The `from_lines()` method detects lines in the PDF but doesn't automatically add guides at the page boundaries. If your table's first column starts at x=0 or the last column ends at x=page.width, and there are no explicit vertical lines at these positions, those columns won't have guides and will be excluded from extraction.
|
19
|
+
|
20
|
+
## Solutions
|
21
|
+
|
22
|
+
### Solution 1: Use the `outer` parameter (Recommended)
|
23
|
+
|
24
|
+
The simplest fix is to use the `outer=True` parameter when creating guides:
|
25
|
+
|
26
|
+
```python
|
27
|
+
# Include outer boundaries when detecting lines
|
28
|
+
guides = Guides.from_lines(page, outer=True)
|
29
|
+
result = guides.extract_table()
|
30
|
+
```
|
31
|
+
|
32
|
+
### Solution 2: Use `include_outer_boundaries` in extract_table
|
33
|
+
|
34
|
+
If you've already created guides, you can include boundaries during extraction:
|
35
|
+
|
36
|
+
```python
|
37
|
+
# Create guides normally
|
38
|
+
guides = Guides.from_lines(page)
|
39
|
+
|
40
|
+
# Include boundaries during extraction
|
41
|
+
result = guides.extract_table(include_outer_boundaries=True)
|
42
|
+
```
|
43
|
+
|
44
|
+
### Solution 3: Manually add boundary guides
|
45
|
+
|
46
|
+
For more control, you can manually add guides at the page boundaries:
|
47
|
+
|
48
|
+
```python
|
49
|
+
# Create guides
|
50
|
+
guides = Guides.from_lines(page)
|
51
|
+
|
52
|
+
# Add page boundaries
|
53
|
+
guides.vertical.add([0, page.width])
|
54
|
+
|
55
|
+
# Extract table
|
56
|
+
result = guides.extract_table()
|
57
|
+
```
|
58
|
+
|
59
|
+
### Solution 4: Create guides from specific positions
|
60
|
+
|
61
|
+
If you know the exact column positions:
|
62
|
+
|
63
|
+
```python
|
64
|
+
# Create guides with specific positions including boundaries
|
65
|
+
guides = Guides(page)
|
66
|
+
guides.vertical.add([0, 100, 200, 300, 400, page.width])
|
67
|
+
guides.horizontal.from_lines(page) # Get horizontal guides from lines
|
68
|
+
|
69
|
+
result = guides.extract_table()
|
70
|
+
```
|
71
|
+
|
72
|
+
## Best Practices
|
73
|
+
|
74
|
+
1. **Always use `outer=True`** when you expect table content at page boundaries:
|
75
|
+
```python
|
76
|
+
guides = Guides.from_lines(page, outer=True)
|
77
|
+
```
|
78
|
+
|
79
|
+
2. **Check your guides** before extraction:
|
80
|
+
```python
|
81
|
+
guides = Guides.from_lines(page)
|
82
|
+
print(f"Vertical guides: {guides.vertical.data}")
|
83
|
+
print(f"Page width: {page.width}")
|
84
|
+
|
85
|
+
# Check if boundaries are included
|
86
|
+
has_left = 0 in guides.vertical.data
|
87
|
+
has_right = page.width in guides.vertical.data
|
88
|
+
```
|
89
|
+
|
90
|
+
3. **Visualize guides** to debug issues:
|
91
|
+
```python
|
92
|
+
# Show the page with guides overlaid
|
93
|
+
guides.show()
|
94
|
+
```
|
95
|
+
|
96
|
+
## Complete Example
|
97
|
+
|
98
|
+
```python
|
99
|
+
from natural_pdf import PDF
|
100
|
+
from natural_pdf.analyzers import Guides
|
101
|
+
|
102
|
+
# Load PDF
|
103
|
+
pdf = PDF("document.pdf")
|
104
|
+
page = pdf[0]
|
105
|
+
|
106
|
+
# Method 1: Best practice - use outer=True
|
107
|
+
guides = Guides.from_lines(page, outer=True)
|
108
|
+
table = guides.extract_table()
|
109
|
+
df = table.to_df()
|
110
|
+
print(df)
|
111
|
+
|
112
|
+
# Method 2: Alternative - use include_outer_boundaries
|
113
|
+
guides = Guides.from_lines(page)
|
114
|
+
table = guides.extract_table(include_outer_boundaries=True)
|
115
|
+
df = table.to_df()
|
116
|
+
print(df)
|
117
|
+
|
118
|
+
# Method 3: Manual control
|
119
|
+
guides = Guides.from_lines(page)
|
120
|
+
if 0 not in guides.vertical.data:
|
121
|
+
guides.vertical.add([0])
|
122
|
+
if page.width not in guides.vertical.data:
|
123
|
+
guides.vertical.add([page.width])
|
124
|
+
table = guides.extract_table()
|
125
|
+
df = table.to_df()
|
126
|
+
print(df)
|
127
|
+
```
|
128
|
+
|
129
|
+
## When This Issue Occurs
|
130
|
+
|
131
|
+
This issue typically occurs when:
|
132
|
+
- Tables are designed with no margins (content starts at x=0)
|
133
|
+
- Tables span the full page width
|
134
|
+
- PDF generators don't include explicit border lines at page edges
|
135
|
+
- Content is positioned exactly at page boundaries
|
136
|
+
|
137
|
+
## Verification
|
138
|
+
|
139
|
+
To verify if this is your issue:
|
140
|
+
|
141
|
+
```python
|
142
|
+
# Check text positions
|
143
|
+
texts = page.find_all('text')
|
144
|
+
min_x = min(t.x0 for t in texts)
|
145
|
+
max_x = max(t.x1 for t in texts)
|
146
|
+
|
147
|
+
print(f"Text spans from x={min_x} to x={max_x}")
|
148
|
+
print(f"Page width: {page.width}")
|
149
|
+
|
150
|
+
# Check guides
|
151
|
+
guides = Guides.from_lines(page)
|
152
|
+
print(f"First guide: {guides.vertical.data[0] if guides.vertical.data else 'None'}")
|
153
|
+
print(f"Last guide: {guides.vertical.data[-1] if guides.vertical.data else 'None'}")
|
154
|
+
|
155
|
+
# If min_x < first guide or max_x > last guide, you need boundaries
|
156
|
+
```
|
@@ -74,6 +74,18 @@ page.find_all('text[source=pdf]') # Original PDF text
|
|
74
74
|
page.find_all('text[confidence>=0.8]') # High-confidence OCR
|
75
75
|
```
|
76
76
|
|
77
|
+
### Statistical Selectors (Aggregates)
|
78
|
+
```py
|
79
|
+
page.find('text[x0=min()]') # Leftmost text
|
80
|
+
page.find('text[x1=max()]') # Rightmost text
|
81
|
+
page.find('text[size=max()]') # Largest text
|
82
|
+
page.find('text[width=avg()]') # Average width text
|
83
|
+
page.find('text[height=median()]') # Median height text
|
84
|
+
page.find('text[fontname=mode()]') # Most common font
|
85
|
+
page.find('text[color=closest("red")]') # Closest to red
|
86
|
+
page.find('text[x0=min()][size=max()]') # Leftmost AND largest
|
87
|
+
```
|
88
|
+
|
77
89
|
## Essential Methods
|
78
90
|
|
79
91
|
### Finding Elements
|
@@ -86,10 +98,21 @@ element.previous() # Previous element
|
|
86
98
|
|
87
99
|
### Spatial Navigation
|
88
100
|
```py
|
89
|
-
|
101
|
+
# Smart defaults (new in 0.9.0)
|
102
|
+
element.left() # Default height='element' (matches element height)
|
103
|
+
element.right() # Default height='element' (matches element height)
|
104
|
+
element.above() # Default width='full' (full page width)
|
105
|
+
element.below() # Default width='full' (full page width)
|
106
|
+
|
107
|
+
# Custom dimensions
|
108
|
+
element.above(height=100) # Fixed height above
|
90
109
|
element.below(until='line:horizontal') # Below until boundary
|
91
|
-
element.left(width=200) #
|
92
|
-
element.right()
|
110
|
+
element.left(width=200) # Fixed width to left
|
111
|
+
element.right(height='full') # Full page height to right
|
112
|
+
|
113
|
+
# Exclusion handling
|
114
|
+
element.below(apply_exclusions=True) # Skip exclusion zones
|
115
|
+
element.expand('down', 50, apply_exclusions=True) # Expand with exclusions
|
93
116
|
```
|
94
117
|
|
95
118
|
### Text Extraction
|
@@ -194,9 +217,16 @@ page.viewer() # Launch interactive viewer (Jup
|
|
194
217
|
|
195
218
|
### Page-Level Exclusions
|
196
219
|
```py
|
197
|
-
|
198
|
-
page.
|
199
|
-
page.
|
220
|
+
# Smart exclusion behavior (new in 0.9.0)
|
221
|
+
text_element = page.find('text:contains("CONFIDENTIAL")')
|
222
|
+
page.add_exclusion(text_element) # Excludes just the text bounding box
|
223
|
+
|
224
|
+
# Traditional region exclusion
|
225
|
+
header_region = page.find('text:contains("CONFIDENTIAL")').above()
|
226
|
+
page.add_exclusion(header_region) # Excludes entire region
|
227
|
+
|
228
|
+
# Manage exclusions
|
229
|
+
page.clear_exclusions() # Remove all exclusions
|
200
230
|
text = page.extract_text(use_exclusions=False) # Ignore exclusions
|
201
231
|
```
|
202
232
|
|
@@ -207,10 +237,27 @@ pdf.add_exclusion(
|
|
207
237
|
lambda p: p.create_region(0, 0, p.width, p.height * 0.1),
|
208
238
|
label="Header"
|
209
239
|
)
|
240
|
+
|
241
|
+
# Exclude specific text elements (new in 0.9.0)
|
242
|
+
pdf.add_exclusion(
|
243
|
+
lambda p: p.find_all('text:contains("Header")'), # Returns ElementCollection
|
244
|
+
label="Headers"
|
245
|
+
)
|
210
246
|
```
|
211
247
|
|
212
248
|
## Configuration Options
|
213
249
|
|
250
|
+
### Global Layout Settings
|
251
|
+
```py
|
252
|
+
import natural_pdf
|
253
|
+
|
254
|
+
# Configure global directional offset (default: 5)
|
255
|
+
natural_pdf.options.layout.directional_offset = 10 # Larger gap for directional methods
|
256
|
+
|
257
|
+
# Reset to default
|
258
|
+
natural_pdf.options.layout.directional_offset = 5
|
259
|
+
```
|
260
|
+
|
214
261
|
### OCR Engines
|
215
262
|
```py
|
216
263
|
from natural_pdf.ocr import EasyOCROptions, PaddleOCROptions
|
@@ -231,17 +278,17 @@ page.analyze_layout(engine='yolo', options=yolo_opts)
|
|
231
278
|
|
232
279
|
### Extract Inspection Report Data
|
233
280
|
```py
|
234
|
-
# Find violation count
|
235
|
-
violations = page.find('text:contains("Violation Count"):right(
|
281
|
+
# Find violation count (uses smart default height='element')
|
282
|
+
violations = page.find('text:contains("Violation Count"):right()')
|
236
283
|
|
237
284
|
# Get inspection number from the header box (regex search)
|
238
285
|
inspection_num = page.find('text:contains("INS-[A-Z0-9]+")', regex=True)
|
239
286
|
|
240
|
-
# Extract inspection date
|
287
|
+
# Extract inspection date (custom width for wider field)
|
241
288
|
inspection_date = page.find('text:contains("Date:"):right(width=150)')
|
242
289
|
|
243
|
-
# Get site name (
|
244
|
-
site_name = page.find('text:contains("Site:"):right(
|
290
|
+
# Get site name (uses smart default height='element')
|
291
|
+
site_name = page.find('text:contains("Site:"):right()').extract_text()
|
245
292
|
```
|
246
293
|
|
247
294
|
### Process Forms
|
@@ -250,9 +297,9 @@ site_name = page.find('text:contains("Site:"):right(width=300)').extract_text()
|
|
250
297
|
page.add_exclusion(page.create_region(0, 0, page.width, 50))
|
251
298
|
page.add_exclusion(page.create_region(0, page.height-50, page.width, page.height))
|
252
299
|
|
253
|
-
# Extract form fields
|
300
|
+
# Extract form fields (smart defaults + exclusion handling)
|
254
301
|
fields = page.find_all('text:bold')
|
255
|
-
values = [field.right(
|
302
|
+
values = [field.right(apply_exclusions=True).extract_text() for field in fields]
|
256
303
|
```
|
257
304
|
|
258
305
|
### Handle Scanned Documents
|
@@ -43,7 +43,12 @@ mid_region.show(color="blue")
|
|
43
43
|
|
44
44
|
### Using Element Methods: `above()`, `below()`, `left()`, `right()`
|
45
45
|
|
46
|
-
You can create regions relative to existing elements.
|
46
|
+
You can create regions relative to existing elements. Natural PDF uses smart defaults for these directional methods:
|
47
|
+
|
48
|
+
- **`.left()` and `.right()`**: Default to `height='element'` (matches the element's height)
|
49
|
+
- **`.above()` and `.below()`**: Default to `width='full'` (full page width)
|
50
|
+
|
51
|
+
These defaults match common use cases - when looking sideways you usually want the same height as your reference element, while looking up/down typically needs the full page width.
|
47
52
|
|
48
53
|
```python
|
49
54
|
# Find a heading-like element
|
@@ -51,6 +56,7 @@ heading = page.find('text[size>=12]:bold')
|
|
51
56
|
|
52
57
|
# Create a region below this heading element
|
53
58
|
if heading:
|
59
|
+
# Uses default width='full' - extends across full page width
|
54
60
|
region_below = heading.below()
|
55
61
|
|
56
62
|
# Highlight the heading and the region below it
|
@@ -60,6 +66,24 @@ if heading:
|
|
60
66
|
h.show()
|
61
67
|
```
|
62
68
|
|
69
|
+
```python
|
70
|
+
# Create regions to the left and right with smart defaults
|
71
|
+
if heading:
|
72
|
+
# Default height='element' - matches heading height
|
73
|
+
region_left = heading.left()
|
74
|
+
region_right = heading.right()
|
75
|
+
|
76
|
+
# Or specify custom dimensions
|
77
|
+
region_left_tall = heading.left(height=200) # 200px tall
|
78
|
+
region_right_full = heading.right(height='full') # Full page height
|
79
|
+
|
80
|
+
with page.highlights() as h:
|
81
|
+
h.add(heading, color="red")
|
82
|
+
h.add(region_left, color="green", label="Left (element height)")
|
83
|
+
h.add(region_right, color="blue", label="Right (element height)")
|
84
|
+
h.show()
|
85
|
+
```
|
86
|
+
|
63
87
|
```python
|
64
88
|
# Create a region with height limit
|
65
89
|
if heading:
|
@@ -212,6 +236,46 @@ with page.highlights() as h:
|
|
212
236
|
h.show()
|
213
237
|
```
|
214
238
|
|
239
|
+
### Global Offset Configuration
|
240
|
+
|
241
|
+
You can configure global offsets that will be applied to all regions created with directional methods. This is useful for consistently adding padding or margins:
|
242
|
+
|
243
|
+
```python
|
244
|
+
from natural_pdf import PDF
|
245
|
+
|
246
|
+
# Configure global offsets for all PDFs
|
247
|
+
PDF.configure_offsets(
|
248
|
+
below_offset=5, # Add 5px gap below elements
|
249
|
+
above_offset=5, # Add 5px gap above elements
|
250
|
+
left_offset=2, # Add 2px gap to the left
|
251
|
+
right_offset=2 # Add 2px gap to the right
|
252
|
+
)
|
253
|
+
|
254
|
+
# Now all directional methods will include these offsets
|
255
|
+
heading = page.find('text:bold')
|
256
|
+
if heading:
|
257
|
+
# This region will start 5px below the heading (not touching)
|
258
|
+
content_below = heading.below()
|
259
|
+
|
260
|
+
# This region will end 5px above the heading
|
261
|
+
content_above = heading.above(height=100)
|
262
|
+
```
|
263
|
+
|
264
|
+
```python
|
265
|
+
# Reset to default offsets (all 0)
|
266
|
+
PDF.configure_offsets(
|
267
|
+
below_offset=0,
|
268
|
+
above_offset=0,
|
269
|
+
left_offset=0,
|
270
|
+
right_offset=0
|
271
|
+
)
|
272
|
+
```
|
273
|
+
|
274
|
+
These offsets are particularly useful when:
|
275
|
+
- Extracting text that might be too close to headers/footers
|
276
|
+
- Creating regions that need consistent spacing
|
277
|
+
- Working with documents that have tight layouts
|
278
|
+
|
215
279
|
## Using Exclusion Zones with Regions
|
216
280
|
|
217
281
|
Exclusion zones are regions that you want to ignore during operations like text extraction.
|