natural-pdf 0.2.18__tar.gz → 0.2.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf-0.2.20/CHECKBOX_DETECTION.md +172 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/CLAUDE.md +1 -0
- {natural_pdf-0.2.18/natural_pdf.egg-info → natural_pdf-0.2.20}/PKG-INFO +1 -1
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/element-selection/index.md +45 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/layout-analysis/index.md +59 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/quick-reference/index.md +2 -0
- natural_pdf-0.2.20/example_checkbox_usage.py +55 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/__init__.py +8 -0
- natural_pdf-0.2.20/natural_pdf/analyzers/checkbox/__init__.py +6 -0
- natural_pdf-0.2.20/natural_pdf/analyzers/checkbox/base.py +265 -0
- natural_pdf-0.2.20/natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
- natural_pdf-0.2.20/natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
- natural_pdf-0.2.20/natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
- natural_pdf-0.2.20/natural_pdf/analyzers/checkbox/mixin.py +95 -0
- natural_pdf-0.2.20/natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/guides.py +26 -2
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/collections/mixins.py +14 -5
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/core/element_manager.py +5 -1
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/core/page.py +61 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/core/page_collection.py +41 -1
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/core/pdf.py +24 -1
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/describe/base.py +20 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/elements/base.py +152 -10
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/elements/element_collection.py +41 -2
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/elements/region.py +115 -2
- natural_pdf-0.2.20/natural_pdf/judge.py +1509 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/selectors/parser.py +42 -1
- {natural_pdf-0.2.18 → natural_pdf-0.2.20/natural_pdf.egg-info}/PKG-INFO +1 -1
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf.egg-info/SOURCES.txt +42 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf.egg-info/top_level.txt +2 -0
- natural_pdf-0.2.20/temp/check_model.py +49 -0
- natural_pdf-0.2.20/temp/check_pdf_content.py +9 -0
- natural_pdf-0.2.20/temp/checkbox_checks.py +590 -0
- natural_pdf-0.2.20/temp/checkbox_simple.py +117 -0
- natural_pdf-0.2.20/temp/checkbox_ux_ideas.py +400 -0
- natural_pdf-0.2.20/temp/context_manager_prototype.py +177 -0
- natural_pdf-0.2.20/temp/convert_to_hf.py +60 -0
- natural_pdf-0.2.20/temp/demo_text_closest.py +66 -0
- natural_pdf-0.2.20/temp/inspect_model.py +43 -0
- natural_pdf-0.2.20/temp/rtdetr_dinov2_test.py +49 -0
- natural_pdf-0.2.20/temp/test_closest_debug.py +26 -0
- natural_pdf-0.2.20/temp/test_closest_debug2.py +22 -0
- natural_pdf-0.2.20/temp/test_context_exploration.py +85 -0
- natural_pdf-0.2.20/temp/test_durham.py +30 -0
- natural_pdf-0.2.20/temp/test_empty_string.py +16 -0
- natural_pdf-0.2.20/temp/test_similarity.py +15 -0
- natural_pdf-0.2.20/tests/test_closest_substring_sorting.py +136 -0
- natural_pdf-0.2.20/tests/test_closest_until.py +119 -0
- natural_pdf-0.2.20/tests/test_closest_until_comparison.py +106 -0
- natural_pdf-0.2.20/tests/test_closest_until_debug.py +81 -0
- natural_pdf-0.2.20/tests/test_closest_until_fix.py +112 -0
- natural_pdf-0.2.20/tests/test_closest_until_ordering.py +117 -0
- natural_pdf-0.2.20/tests/test_extract_text_words.py +116 -0
- natural_pdf-0.2.20/tests/test_from_parameter.py +154 -0
- natural_pdf-0.2.20/tests/test_from_parameter_example.py +69 -0
- natural_pdf-0.2.20/tests/test_from_self_exclusion.py +60 -0
- natural_pdf-0.2.20/tests/test_from_simple.py +56 -0
- natural_pdf-0.2.20/tests/test_guides_from_headers_strings.py +76 -0
- natural_pdf-0.2.20/tests/test_text_closest_selector.py +179 -0
- natural_pdf-0.2.20/tests/test_within_constraint.py +214 -0
- natural_pdf-0.2.20/tests/test_words_vs_find_all_text.py +97 -0
- natural_pdf-0.2.20/tests/test_words_vs_find_all_text_summary.md +54 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/.github/workflows/ci.yml +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/.github/workflows/nightly-tutorials.yml +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/.gitignore +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/LICENSE +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/MANIFEST.in +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/README.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/audit_packaging.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/check_run_md.sh +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/api/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/describe/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/extracting-clean-text/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/fix-messy-tables/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/fix-messy-tables/table_1.csv +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/fix-messy-tables/table_2.csv +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/fix-messy-tables/table_3.csv +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/guide_adjustment_stream.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/guides_boundary_columns.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/installation/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/ocr/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/process-forms-and-invoices/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/reflowing-pages/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/regions/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tables/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/tutorials/14-categorizing-documents.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/mkdocs.yml +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/classification/manager.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/cli.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/core/page_groupby.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/core/pdf_collection.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/core/render_spec.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/describe/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/describe/elements.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/describe/mixin.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/describe/summary.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/elements/image.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/exporters/hocr.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/exporters/original_pdf.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/flows/element.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/flows/flow.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/flows/region.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/qa/qa_result.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/search/lancedb_search_service.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/search/numpy_search_service.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/tables/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/tables/result.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/templates/spa/css/style.css +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/templates/spa/js/app.js +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/templates/spa/words.txt +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/text_mixin.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/bidi_mirror.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/color_utils.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/layout.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/packaging.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/pdfminer_patches.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/sections.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/spatial.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/vision/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/vision/mixin.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/vision/results.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/vision/similarity.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/vision/template_matching.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf.egg-info/entry_points.txt +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/natural_pdf.egg-info/requires.txt +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/noxfile.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/optimization/memory_comparison.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/optimization/pdf_analyzer.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/optimization/performance_analysis.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/optimization/performance_results/image_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/optimization/performance_results/text_heavy_snapshots.json +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/optimization/test_cleanup_methods.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/optimization/test_memory_fix.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/publish.sh +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/pyproject.toml +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/sample-screen.png +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/setup.cfg +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/fix_page_exclusions.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_draw_guides.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_draw_guides_interactive.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_exclusion_with_debug.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_find_exclusions_fix.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_find_exclusions_fix_no_recursion.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_fix_real_pdf.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_fix_working.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_fixed_pdf_exclusions.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_guide_draw_notebook.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_horizontal_top_bottom.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_inline_js.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_marker_order.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_original_exclusions_now_work.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_pdf_exclusions_with_guides.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_region_exclusions_detailed.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_stripes_real_pdf.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_vertical_stripes.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_widget_functionality.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/temp/test_widget_simple.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/conftest.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/demo_multipage.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_aggregate_selectors.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_annotate.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_arabic_performance.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_arabic_real_world.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_auto_multipage_option.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_color_conversion.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_color_hex_display.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_core/test_containment_geometry.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_core/test_text_layer.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_crop_enhancements.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_crop_region_highlights.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_directional_defaults.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_dissolve.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_dissolve_cross_page_bug.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_dissolve_debug_issue.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_dissolve_real_world_issue.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_dissolve_single_elements.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_dissolve_vertical_offset_issue.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_document_qa.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_element_addition.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_element_collection_guides.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_element_collection_show_cols.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_element_collection_slicing.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_element_exclusions.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_element_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_empty_pseudo_class.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_exclude_multi_page.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_exclude_real_pdf.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_exclusion_recursion_fix.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_exclusions.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_expand.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_expand_enhanced.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_extraction_error.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_extraction_mixin_fix.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_extraction_text_and_vision.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_extraction_working.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_find_similar.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_first_last_selectors.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_fix_get_sections_zero_height.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_flow_region_directional.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_get_sections_fix_comprehensive.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_get_sections_zero_height.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_groupby.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guide_adjustment_stream.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guides.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guides_apply_exclusions.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guides_apply_exclusions_simple.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guides_boundaries.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guides_extract_table.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guides_extract_table_collections.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guides_extract_table_exclusions.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guides_extract_table_real.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guides_from_headers.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guides_from_stripes.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guides_integration.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guides_marker_sorting.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_guides_partial.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_highlight_color_falsy.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_highlight_detection.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_highlight_detection_comprehensive.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_highlight_offset.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_highlight_protocol.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_highlight_protocol_simple.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_highlight_regions.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_horizontal_guides_alignment.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_include_boundaries_comprehensive.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_include_boundaries_final.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_include_boundaries_final_verification.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_include_boundaries_fix.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_include_boundaries_mock.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_include_boundaries_simple.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_include_boundaries_types_pdf.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_include_boundaries_verification.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_include_boundaries_with_real_text.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_match_results_sorting.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_merge_connected.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_merge_connected_real_world.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_merge_method.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_merged_flowregion_specs.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_mixed_collection_rendering.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_multi_page_table_discovery.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_multipage_directional.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_negative_bounds_pdf.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_page_exclusion_lists.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_pdf_exclusions_in_find_methods.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_pdfminer_bug_status.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_pdfminer_color_bug.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_pdfminer_color_stack_bug.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_phash_masking.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_region_find_similar.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_region_show_crop_highlights.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_region_viewer.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_sections_end_only.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_sections_with_start_and_end.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_show_column_layout.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_show_edge_cases.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_show_exclusions.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_show_exclusions_feature.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_show_limit.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_skip_repeating_headers_multipage.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_slice_cache_reuse.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_slice_exclusion_fix.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_slice_exclusion_issue.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_slice_exclusion_mock.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_sliced_collection_exclusions.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_smart_exclusion.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_spatial_offset.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_strikethrough_detection.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_table_result_header_mismatch.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_table_result_keep_blank.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_template_matching.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_template_white_masking.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_tiny_text_tables.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_tiny_text_tables_table.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_tutorials.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_underline_detection.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tests/test_update_text.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/todo/bad_pdf_analysis.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/todo/evaluation.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/README.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/__init__.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/analyser.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/collate_summaries.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/eval_suite.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/llm_enrich.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/reporter.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/tools/bad_pdf_eval/utils.py +0 -0
- {natural_pdf-0.2.18 → natural_pdf-0.2.20}/uv.lock +0 -0
@@ -0,0 +1,172 @@
|
|
1
|
+
# Checkbox Detection in Natural PDF
|
2
|
+
|
3
|
+
Natural PDF now includes built-in checkbox detection using computer vision models. This feature can automatically detect checkboxes in PDF documents and determine whether they are checked or unchecked.
|
4
|
+
|
5
|
+
## Quick Start
|
6
|
+
|
7
|
+
```python
|
8
|
+
import natural_pdf as npdf
|
9
|
+
|
10
|
+
# Load PDF and detect checkboxes
|
11
|
+
pdf = npdf.PDF("form.pdf")
|
12
|
+
checkboxes = pdf[0].detect_checkboxes()
|
13
|
+
|
14
|
+
# Check results
|
15
|
+
for cb in checkboxes:
|
16
|
+
print(f"Checkbox at {cb.bbox}: {'✓' if cb.is_checked else '✗'}")
|
17
|
+
```
|
18
|
+
|
19
|
+
## Features
|
20
|
+
|
21
|
+
### 1. Basic Detection
|
22
|
+
```python
|
23
|
+
# Detect all checkboxes on a page
|
24
|
+
checkboxes = page.detect_checkboxes()
|
25
|
+
|
26
|
+
# Access checkbox properties
|
27
|
+
checkbox = checkboxes[0]
|
28
|
+
print(checkbox.is_checked) # True/False
|
29
|
+
print(checkbox.checkbox_state) # "checked"/"unchecked"
|
30
|
+
print(checkbox.confidence) # Detection confidence (0-1)
|
31
|
+
```
|
32
|
+
|
33
|
+
### 2. Using Selectors
|
34
|
+
```python
|
35
|
+
# Find checked/unchecked boxes
|
36
|
+
checked = page.find_all('checkbox:checked')
|
37
|
+
unchecked = page.find_all('checkbox:unchecked')
|
38
|
+
|
39
|
+
# All checkboxes
|
40
|
+
all_checkboxes = page.find_all('checkbox')
|
41
|
+
|
42
|
+
# By attributes (note: use is_checked, not checked)
|
43
|
+
checked = page.find_all('checkbox[is_checked=true]')
|
44
|
+
```
|
45
|
+
|
46
|
+
### 3. Limited Detection
|
47
|
+
When you know the expected number of checkboxes:
|
48
|
+
```python
|
49
|
+
# Get top 10 checkboxes by confidence
|
50
|
+
checkboxes = page.detect_checkboxes(limit=10)
|
51
|
+
```
|
52
|
+
|
53
|
+
### 4. Multi-level Detection
|
54
|
+
```python
|
55
|
+
# Entire PDF
|
56
|
+
all_checkboxes = pdf.detect_checkboxes()
|
57
|
+
|
58
|
+
# Page collection
|
59
|
+
pages = pdf[0:5]
|
60
|
+
checkboxes = pages.detect_checkboxes()
|
61
|
+
|
62
|
+
# Within a region
|
63
|
+
region = page.find('text:contains("Options")').below()
|
64
|
+
checkboxes = region.detect_checkboxes()
|
65
|
+
```
|
66
|
+
|
67
|
+
### 5. Visualization
|
68
|
+
```python
|
69
|
+
# Show detected checkboxes
|
70
|
+
checkboxes.show()
|
71
|
+
|
72
|
+
# Checkboxes display their state in repr
|
73
|
+
print(checkboxes[0])
|
74
|
+
# <Region type='checkbox' [checked] bbox=(100, 200, 120, 220)>
|
75
|
+
```
|
76
|
+
|
77
|
+
## Advanced Configuration
|
78
|
+
|
79
|
+
### Custom Detection Options
|
80
|
+
```python
|
81
|
+
from natural_pdf.analyzers.checkbox import CheckboxOptions
|
82
|
+
|
83
|
+
# Higher confidence threshold (the default is deliberately very low; see Implementation Details)
|
84
|
+
options = CheckboxOptions(confidence=0.5)
|
85
|
+
checkboxes = page.detect_checkboxes(options=options)
|
86
|
+
|
87
|
+
# Different resolution (default is 150 DPI)
|
88
|
+
checkboxes = page.detect_checkboxes(resolution=300)
|
89
|
+
|
90
|
+
# GPU acceleration
|
91
|
+
checkboxes = page.detect_checkboxes(device='cuda')
|
92
|
+
```
|
93
|
+
|
94
|
+
### Custom Models
|
95
|
+
```python
|
96
|
+
# Use a different checkbox detection model
|
97
|
+
options = CheckboxOptions(
|
98
|
+
model_repo="your-org/your-checkbox-model",
|
99
|
+
label_mapping={
|
100
|
+
"empty_box": "unchecked",
|
101
|
+
"ticked_box": "checked",
|
102
|
+
}
|
103
|
+
)
|
104
|
+
checkboxes = page.detect_checkboxes(options=options)
|
105
|
+
```
|
106
|
+
|
107
|
+
### Disable Text Filtering
|
108
|
+
```python
|
109
|
+
# If your checkboxes contain text for some reason
|
110
|
+
checkboxes = page.detect_checkboxes(reject_with_text=False)
|
111
|
+
|
112
|
+
# Or with options
|
113
|
+
options = CheckboxOptions(reject_with_text=False)
|
114
|
+
checkboxes = page.detect_checkboxes(options=options)
|
115
|
+
```
|
116
|
+
|
117
|
+
## Implementation Details
|
118
|
+
|
119
|
+
- **Default Model**: Uses `wendys-llc/rtdetr-v2-r50-chkbx` RT-DETR model
|
120
|
+
- **Low Confidence**: Default confidence is 0.02 (very low to catch all checkboxes)
|
121
|
+
- **Resolution**: Renders at 150 DPI by default for efficiency
|
122
|
+
- **No Overlaps**: Aggressive NMS rejects ANY overlapping detections
|
123
|
+
- **Text Filtering**: Automatically rejects detections containing text (real checkboxes should be empty)
|
124
|
+
- **Architecture**: Follows the same pattern as layout detection for consistency
|
125
|
+
|
126
|
+
## Common Use Cases
|
127
|
+
|
128
|
+
### Form Processing
|
129
|
+
```python
|
130
|
+
# Extract form checkbox states
|
131
|
+
form_data = {}
|
132
|
+
for cb in page.detect_checkboxes():
|
133
|
+
# Find nearby text label
|
134
|
+
label = cb.left('text').extract_text() or cb.above('text').extract_text()
|
135
|
+
form_data[label] = cb.is_checked
|
136
|
+
```
|
137
|
+
|
138
|
+
### Validation
|
139
|
+
```python
|
140
|
+
# Ensure all required checkboxes are checked
|
141
|
+
required = ["Terms", "Privacy", "Age"]
|
142
|
+
checkboxes = page.detect_checkboxes()
|
143
|
+
|
144
|
+
for req in required:
|
145
|
+
cb = page.find(f'text:contains("{req}")').right('checkbox:first')
|
146
|
+
if not cb or not cb.is_checked:
|
147
|
+
print(f"Warning: {req} not checked!")
|
148
|
+
```
|
149
|
+
|
150
|
+
### Batch Processing
|
151
|
+
```python
|
152
|
+
# Process multiple forms
|
153
|
+
for pdf_path in pdf_files:
|
154
|
+
pdf = npdf.PDF(pdf_path)
|
155
|
+
results = []
|
156
|
+
|
157
|
+
for page in pdf.pages:
|
158
|
+
checkboxes = page.detect_checkboxes(limit=20)
|
159
|
+
checked_count = len([cb for cb in checkboxes if cb.is_checked])
|
160
|
+
results.append({
|
161
|
+
'page': page.number,
|
162
|
+
'total': len(checkboxes),
|
163
|
+
'checked': checked_count
|
164
|
+
})
|
165
|
+
```
|
166
|
+
|
167
|
+
## Troubleshooting
|
168
|
+
|
169
|
+
1. **No checkboxes detected**: Try lowering confidence threshold
|
170
|
+
2. **Too many false positives**: Increase confidence threshold
|
171
|
+
3. **Missing transformers**: Install with `pip install transformers torch`
|
172
|
+
4. **Selector syntax**: Use `:checked`/`:unchecked` or `[is_checked=true]`
|
@@ -96,3 +96,4 @@ Natural PDF is a Python library for intelligent PDF document processing that com
|
|
96
96
|
### Environment and Tooling
|
97
97
|
- Always use the virtual environment in .venv
|
98
98
|
- Use uv when possible for efficient package management
|
99
|
+
- Don't create new PDFs for testing, just use pdfs/01-practice.pdf.
|
@@ -117,6 +117,7 @@ These are powerful filters that let you find elements based on their content or
|
|
117
117
|
| Pseudo-Class | Example | What It Finds |
|
118
118
|
|-----------------------|-----------------------------------|---------------|
|
119
119
|
| `:contains('text')` | `text:contains('Report')` | Elements containing specific text |
|
120
|
+
| `:closest('text')` | `text:closest('Invoice Date')` | Fuzzy text matching (great for OCR errors) |
|
120
121
|
| `:bold` | `text:bold` | Bold text (detected automatically) |
|
121
122
|
| `:italic` | `text:italic` | Italic text |
|
122
123
|
| `:strike` | `text:strike` | Struck-through text |
|
@@ -187,6 +188,34 @@ page.find_all('text:contains("INS-\\w+")', regex=True)
|
|
187
188
|
page.find_all('text:contains("jungle health")', regex=True, case=False)
|
188
189
|
```
|
189
190
|
|
191
|
+
### Fuzzy Text Matching for OCR Errors
|
192
|
+
|
193
|
+
When working with OCR'd PDFs, text recognition isn't always perfect. The `:closest()` pseudo-class helps you find text even when it contains errors:
|
194
|
+
|
195
|
+
```python
|
196
|
+
# Find "Invoice Date" even if OCR read it as "Invice Date" or "Invoice Dat"
|
197
|
+
page.find('text:closest("Invoice Date")')
|
198
|
+
|
199
|
+
# Specify a similarity threshold (0.0 to 1.0)
|
200
|
+
# 0.8 = 80% similar
|
201
|
+
page.find_all('text:closest("Date of Review@0.8")')
|
202
|
+
|
203
|
+
# Default threshold is 0.0 - returns all text sorted by similarity
|
204
|
+
# Exact substring matches always come first
|
205
|
+
all_sorted = page.find_all('text:closest("Durham")')
|
206
|
+
```
|
207
|
+
|
208
|
+
The `:closest()` selector is particularly useful for:
|
209
|
+
- OCR errors like "m" read as "rn" (Durharn → Durham)
|
210
|
+
- Missing punctuation (Date: → Date)
|
211
|
+
- Character confusion (l/I, 0/O)
|
212
|
+
- Partial matches when you're not sure of the exact text
|
213
|
+
|
214
|
+
```python
|
215
|
+
# Combine with other selectors for more precision
|
216
|
+
page.find('text:closest("Total Amount@0.7")[size>12]')
|
217
|
+
```
|
218
|
+
|
190
219
|
## Working with Groups of Elements
|
191
220
|
|
192
221
|
`find_all()` returns an `ElementCollection` - like a list, but with PDF-specific superpowers.
|
@@ -221,6 +250,22 @@ service_headings = headings.filter(lambda heading: 'Service' in heading.extract_
|
|
221
250
|
headings.extract_text()
|
222
251
|
```
|
223
252
|
|
253
|
+
### Applying Functions to Collections
|
254
|
+
|
255
|
+
The `.apply()` method lets you transform each element in a collection. It preserves the collection type even when results are empty:
|
256
|
+
|
257
|
+
```python
|
258
|
+
# Apply a function to each element
|
259
|
+
uppercase_texts = texts.apply(lambda t: t.extract_text().upper())
|
260
|
+
|
261
|
+
# Navigate from each element - returns an ElementCollection
|
262
|
+
regions_below = headings.apply(lambda h: h.below())
|
263
|
+
|
264
|
+
# Even empty results maintain the collection type
|
265
|
+
empty_collection = page.find_all('nonexistent').apply(lambda x: x.expand(10))
|
266
|
+
# Returns ElementCollection([]) not []
|
267
|
+
```
|
268
|
+
|
224
269
|
*Note: `.highest()`, `.lowest()`, etc. will complain if your collection spans multiple pages.*
|
225
270
|
|
226
271
|
## Finding Elements with Statistical Properties
|
@@ -207,6 +207,64 @@ regions = page.analyze_layout(engine="gemini", options=options)
|
|
207
207
|
- The client must be compatible with the OpenAI API (see the `openai` Python package).
|
208
208
|
- This feature is intended for advanced users who need LLM-based layout analysis.
|
209
209
|
|
210
|
+
## Using Judge for Visual Classification
|
211
|
+
|
212
|
+
Natural PDF includes a `Judge` class that can learn to classify visual elements like checkboxes. This is particularly useful after layout detection when you need to determine the state of detected elements.
|
213
|
+
|
214
|
+
### Example: Checkbox Classification
|
215
|
+
|
216
|
+
```python
|
217
|
+
from natural_pdf import Judge
|
218
|
+
|
219
|
+
# Create a judge for checkbox classification
|
220
|
+
judge = Judge("form_checkboxes", labels=["checked", "unchecked"])
|
221
|
+
|
222
|
+
# Train with examples
|
223
|
+
checked_region = page.find("text=Acceptable").left(width=20)
|
224
|
+
unchecked_region = page.find("text=Deficient").left(width=20)
|
225
|
+
|
226
|
+
judge.add(checked_region, "checked")
|
227
|
+
judge.add(unchecked_region, "unchecked")
|
228
|
+
|
229
|
+
# Classify new checkboxes
|
230
|
+
new_checkbox = page.find("text=At-Risk").left(width=20)
|
231
|
+
result = judge.decide(new_checkbox)
|
232
|
+
print(f"Checkbox is: {result.label} (confidence: {result.score:.2f})")
|
233
|
+
|
234
|
+
# Find which checkbox is selected among multiple options
|
235
|
+
checkboxes = [
|
236
|
+
page.find("text=Option A").left(width=20),
|
237
|
+
page.find("text=Option B").left(width=20),
|
238
|
+
page.find("text=Option C").left(width=20)
|
239
|
+
]
|
240
|
+
selected = judge.pick("checked", checkboxes, labels=["Option A", "Option B", "Option C"])
|
241
|
+
print(f"Selected: {selected.label}")
|
242
|
+
```
|
243
|
+
|
244
|
+
### Key Features of Judge
|
245
|
+
|
246
|
+
1. **Simple Training**: Requires minimal examples (even just one per class)
|
247
|
+
2. **Robust to Imbalance**: Uses Youden's J weights and prior correction
|
248
|
+
3. **Interactive Teaching**: Use `judge.teach()` in Jupyter for labeling
|
249
|
+
4. **Visual Inspection**: Use `judge.inspect()` to see predictions on training data
|
250
|
+
5. **Persistence**: Save/load trained judges with `judge.save()` and `Judge.load()`
|
251
|
+
|
252
|
+
### Advanced Usage
|
253
|
+
|
254
|
+
```python
|
255
|
+
# Adjust prior if you expect more checked boxes
|
256
|
+
judge = Judge("checkboxes", labels=["checked", "unchecked"], target_prior=0.7)
|
257
|
+
|
258
|
+
# Interactive teaching in Jupyter
|
259
|
+
judge.teach() # Use arrow keys to label examples
|
260
|
+
|
261
|
+
# Visual inspection with previews
|
262
|
+
judge.inspect(preview=True) # Shows HTML table with images
|
263
|
+
|
264
|
+
# Count checkboxes by type
|
265
|
+
checked_count = judge.count("checked", checkbox_regions)
|
266
|
+
```
|
267
|
+
|
210
268
|
## Next Steps
|
211
269
|
|
212
270
|
Layout analysis provides regions that you can use for:
|
@@ -214,3 +272,4 @@ Layout analysis provides regions that you can use for:
|
|
214
272
|
- [Table Extraction](../tables/index.ipynb): Especially powerful with TATR regions.
|
215
273
|
- [Text Extraction](../text-extraction/index.ipynb): Extract text only from specific region types (e.g., paragraphs).
|
216
274
|
- [Document QA](../document-qa/index.ipynb): Focus question answering on specific detected regions.
|
275
|
+
- Visual Classification: Use Judge to classify detected elements (checkboxes, signatures, etc.)
|
@@ -40,6 +40,8 @@ data = table_region.extract_table()
|
|
40
40
|
page.find('text:contains("Invoice")') # Contains text
|
41
41
|
page.find('text:contains("total")', case=False) # Case insensitive
|
42
42
|
page.find('text:contains("\\d+")', regex=True) # Regex pattern
|
43
|
+
page.find('text:closest("Invoice Date")') # Fuzzy match (OCR errors)
|
44
|
+
page.find('text:closest("Total@0.8")') # 80% similarity threshold
|
43
45
|
```
|
44
46
|
|
45
47
|
### Text Formatting
|
@@ -0,0 +1,55 @@
|
|
1
|
+
"""Example usage of checkbox detection in Natural PDF."""
|
2
|
+
|
3
|
+
import natural_pdf as npdf
from natural_pdf.analyzers.checkbox import CheckboxOptions


def _cuda_available() -> bool:
    """Return True only when torch is importable and reports a usable CUDA device."""
    try:
        import torch
    except ImportError:
        return False
    return torch.cuda.is_available()


# Load a PDF
pdf = npdf.PDF("pdfs/01-practice.pdf")
page = pdf[0]

# Basic checkbox detection
print("=== Basic Checkbox Detection ===")
checkboxes = page.detect_checkboxes()
print(f"Found {len(checkboxes)} checkboxes")

# Show what was found (first three detections only, to keep the output short)
for i, cb in enumerate(checkboxes[:3]):
    print(f"\nCheckbox {i}:")
    print(f" State: {'Checked' if cb.is_checked else 'Unchecked'}")
    print(f" Confidence: {cb.confidence:.2f}")
    print(f" Position: {cb.bbox}")

# Using selectors to filter checkboxes
print("\n=== Using Selectors ===")
checked = page.find_all("checkbox:checked")
unchecked = page.find_all("checkbox:unchecked")
print(f"Checked boxes: {len(checked)}")
print(f"Unchecked boxes: {len(unchecked)}")

# Limit detection when you know expected count
print("\n=== Limited Detection ===")
# If you know there should be 10 checkboxes on a form
limited_checkboxes = page.detect_checkboxes(limit=10)
print(f"Found top {len(limited_checkboxes)} checkboxes by confidence")

# Multi-page detection
print("\n=== Multi-page Detection ===")
all_checkboxes = pdf.detect_checkboxes(show_progress=False)
print(f"Total checkboxes in PDF: {len(all_checkboxes)}")

# Visualize checkboxes
print("\n=== Visualization ===")
print("Showing detected checkboxes...")
checkboxes.show()

# Advanced: Using custom options
print("\n=== Advanced Options ===")

# Higher confidence threshold
options = CheckboxOptions(confidence=0.5)
high_conf_checkboxes = page.detect_checkboxes(options=options)
print(f"High confidence checkboxes: {len(high_conf_checkboxes)}")

# GPU acceleration if available: only request the CUDA device when torch
# actually reports one, so the example also runs on CPU-only machines.
if _cuda_available():
    gpu_checkboxes = page.detect_checkboxes(device="cuda")
    print(f"GPU-detected checkboxes: {len(gpu_checkboxes)}")
else:
    print("CUDA not available; skipping GPU detection")
|
@@ -66,6 +66,7 @@ class Options:
|
|
66
66
|
self.layout = ConfigSection(
|
67
67
|
directional_offset=0.01, # Offset in points when using directional methods
|
68
68
|
auto_multipage=False, # Whether directional methods span pages by default
|
69
|
+
directional_within=None, # Region to constrain directional operations to
|
69
70
|
)
|
70
71
|
|
71
72
|
|
@@ -126,6 +127,9 @@ from natural_pdf.elements.region import Region
|
|
126
127
|
from natural_pdf.flows.flow import Flow
|
127
128
|
from natural_pdf.flows.region import FlowRegion
|
128
129
|
|
130
|
+
# Judge for visual classification
|
131
|
+
from natural_pdf.judge import Decision, Judge, JudgeError, PickResult
|
132
|
+
|
129
133
|
# Search options (if extras installed)
|
130
134
|
try:
|
131
135
|
from natural_pdf.search.search_options import (
|
@@ -165,6 +169,10 @@ __all__ = [
|
|
165
169
|
"Flow",
|
166
170
|
"FlowRegion",
|
167
171
|
"Guides",
|
172
|
+
"Judge",
|
173
|
+
"Decision",
|
174
|
+
"PickResult",
|
175
|
+
"JudgeError",
|
168
176
|
"TextSearchOptions",
|
169
177
|
"MultiModalSearchOptions",
|
170
178
|
"BaseSearchOptions",
|
@@ -0,0 +1,265 @@
|
|
1
|
+
"""Base class for checkbox detection engines."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
from abc import ABC, abstractmethod
|
5
|
+
from typing import Any, Dict, List, Set
|
6
|
+
|
7
|
+
from PIL import Image
|
8
|
+
|
9
|
+
from .checkbox_options import CheckboxOptions
|
10
|
+
|
11
|
+
logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
class CheckboxDetector(ABC):
|
15
|
+
"""Abstract base class for checkbox detection engines.
|
16
|
+
|
17
|
+
This class defines the standard interface that all checkbox detection engines
|
18
|
+
must implement in natural-pdf. Checkbox detectors analyze document images to
|
19
|
+
identify checkboxes and their states (checked/unchecked).
|
20
|
+
|
21
|
+
Subclasses must implement:
|
22
|
+
- detect(): Core checkbox detection for a single image
|
23
|
+
- is_available(): Check if engine dependencies are installed
|
24
|
+
- _load_model_from_options(): Load and configure the detection model
|
25
|
+
- _get_cache_key(): Generate cache keys for model instances (optional override; a default keyed on class name and device is provided)
|
26
|
+
|
27
|
+
Attributes:
|
28
|
+
logger: Logger instance for the specific detector.
|
29
|
+
_model_cache: Dictionary cache for loaded model instances.
|
30
|
+
"""
|
31
|
+
|
32
|
+
def __init__(self):
|
33
|
+
"""Initialize the base checkbox detector."""
|
34
|
+
self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
|
35
|
+
self.logger.info(f"Initializing {self.__class__.__name__}")
|
36
|
+
self._model_cache: Dict[str, Any] = {} # Cache for initialized models
|
37
|
+
|
38
|
+
@abstractmethod
def detect(self, image: Image.Image, options: CheckboxOptions) -> List[Dict[str, Any]]:
    """
    Detect checkboxes in a given PIL Image.

    Abstract hook: the base implementation performs no detection and only
    raises, so every concrete detector has to override it.

    Args:
        image: PIL Image of the page/region to analyze.
        options: Instance of CheckboxOptions with configuration.

    Returns:
        List of detection dictionaries with:
        - 'bbox': Tuple[float, float, float, float] - (x0, y0, x1, y1) relative to image
        - 'class': str - Original class name from model (e.g., 'checkbox', 'checked_checkbox')
        - 'normalized_class': str - Always 'checkbox'
        - 'is_checked': bool - Whether checkbox is checked
        - 'checkbox_state': str - 'checked' or 'unchecked'
        - 'confidence': float - Confidence score (0.0-1.0)
        - 'model': str - Name of the model used
        - 'source': str - Always 'checkbox'

    Raises:
        NotImplementedError: Always, when the base implementation is invoked.
    """
    raise NotImplementedError("Subclasses must implement this method")
|
59
|
+
|
60
|
+
@classmethod
@abstractmethod
def is_available(cls) -> bool:
    """
    Check if the detector's dependencies are installed and usable.

    Abstract class-level hook; _get_model() calls it before loading a model
    and refuses to load when it returns a falsy value.

    Returns:
        True if the detector is available, False otherwise.

    Raises:
        NotImplementedError: Always, when the base implementation is invoked.
    """
    raise NotImplementedError("Subclasses must implement this method")
|
70
|
+
|
71
|
+
def _get_cache_key(self, options: CheckboxOptions) -> str:
|
72
|
+
"""
|
73
|
+
Generate a cache key for model loading based on relevant options.
|
74
|
+
|
75
|
+
Args:
|
76
|
+
options: The options dataclass instance.
|
77
|
+
|
78
|
+
Returns:
|
79
|
+
A string cache key.
|
80
|
+
"""
|
81
|
+
# Base key includes device, subclasses should add model specifics
|
82
|
+
device_key = str(options.device).lower()
|
83
|
+
return f"{self.__class__.__name__}_{device_key}"
|
84
|
+
|
85
|
+
def _get_model(self, options: CheckboxOptions) -> Any:
|
86
|
+
"""
|
87
|
+
Get or initialize the underlying model based on options, using caching.
|
88
|
+
"""
|
89
|
+
cache_key = self._get_cache_key(options)
|
90
|
+
if cache_key not in self._model_cache:
|
91
|
+
self.logger.info(f"Loading model for cache key: {cache_key}")
|
92
|
+
try:
|
93
|
+
# Ensure dependencies are met before loading
|
94
|
+
if not self.is_available():
|
95
|
+
raise RuntimeError(f"{self.__class__.__name__} dependencies are not met.")
|
96
|
+
self._model_cache[cache_key] = self._load_model_from_options(options)
|
97
|
+
self.logger.info(f"Model loaded successfully for key: {cache_key}")
|
98
|
+
except Exception as e:
|
99
|
+
self.logger.error(f"Failed to load model for key {cache_key}: {e}", exc_info=True)
|
100
|
+
# Remove potentially corrupted cache entry
|
101
|
+
self._model_cache.pop(cache_key, None)
|
102
|
+
raise
|
103
|
+
else:
|
104
|
+
self.logger.debug(f"Using cached model for key: {cache_key}")
|
105
|
+
return self._model_cache[cache_key]
|
106
|
+
|
107
|
+
@abstractmethod
def _load_model_from_options(self, options: CheckboxOptions) -> Any:
    """
    Load and configure the detection model based on provided options.

    Abstract hook invoked by _get_model() on a cache miss; whatever it
    returns is stored in the model cache and handed back to the caller.

    Args:
        options: The options dataclass instance.

    Returns:
        The loaded model object(s).

    Raises:
        NotImplementedError: Always, when the base implementation is invoked.
    """
    raise NotImplementedError("Subclasses must implement _load_model_from_options")
|
119
|
+
|
120
|
+
def _map_label_to_state(self, label: str, options: CheckboxOptions) -> tuple[bool, str]:
|
121
|
+
"""
|
122
|
+
Map model output label to checkbox state.
|
123
|
+
|
124
|
+
Args:
|
125
|
+
label: Raw label from model (e.g., 'checked_checkbox', '1')
|
126
|
+
options: Options containing label mapping
|
127
|
+
|
128
|
+
Returns:
|
129
|
+
Tuple of (is_checked: bool, state: str)
|
130
|
+
"""
|
131
|
+
# Normalize label
|
132
|
+
normalized_label = str(label).lower().strip()
|
133
|
+
|
134
|
+
# Check mapping
|
135
|
+
if normalized_label in options.label_mapping:
|
136
|
+
state = options.label_mapping[normalized_label]
|
137
|
+
is_checked = state == "checked"
|
138
|
+
return is_checked, state
|
139
|
+
|
140
|
+
# Default heuristic if not in mapping
|
141
|
+
if any(term in normalized_label for term in ["checked", "tick", "filled", "1"]):
|
142
|
+
return True, "checked"
|
143
|
+
else:
|
144
|
+
return False, "unchecked"
|
145
|
+
|
146
|
+
def _apply_nms(
|
147
|
+
self, detections: List[Dict[str, Any]], iou_threshold: float
|
148
|
+
) -> List[Dict[str, Any]]:
|
149
|
+
"""
|
150
|
+
Apply non-maximum suppression to remove overlapping detections.
|
151
|
+
For checkboxes, we reject ANY meaningful overlap.
|
152
|
+
|
153
|
+
Args:
|
154
|
+
detections: List of detection dictionaries
|
155
|
+
iou_threshold: IoU threshold for suppression (ignored for checkboxes - we use stricter rules)
|
156
|
+
|
157
|
+
Returns:
|
158
|
+
Filtered list of detections
|
159
|
+
"""
|
160
|
+
if not detections:
|
161
|
+
return detections
|
162
|
+
|
163
|
+
# Sort by confidence (descending), then by area (ascending) to prefer smaller boxes
|
164
|
+
def sort_key(det):
|
165
|
+
bbox = det["bbox"]
|
166
|
+
area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
|
167
|
+
return (-det["confidence"], area)
|
168
|
+
|
169
|
+
sorted_detections = sorted(detections, key=sort_key)
|
170
|
+
|
171
|
+
keep = []
|
172
|
+
for i, det in enumerate(sorted_detections):
|
173
|
+
should_keep = True
|
174
|
+
det_bbox = det["bbox"]
|
175
|
+
|
176
|
+
for kept_det in keep:
|
177
|
+
kept_bbox = kept_det["bbox"]
|
178
|
+
|
179
|
+
# Check for ANY overlap at all
|
180
|
+
if self._boxes_overlap(det_bbox, kept_bbox):
|
181
|
+
should_keep = False
|
182
|
+
logger.debug(f"Rejecting box {det_bbox} due to overlap with {kept_bbox}")
|
183
|
+
break
|
184
|
+
|
185
|
+
if should_keep:
|
186
|
+
keep.append(det)
|
187
|
+
logger.debug(f"Keeping box {det_bbox} with confidence {det['confidence']}")
|
188
|
+
|
189
|
+
logger.info(f"NMS: Reduced {len(detections)} detections to {len(keep)}")
|
190
|
+
return keep
|
191
|
+
|
192
|
+
def _boxes_overlap(self, box1: tuple, box2: tuple) -> bool:
|
193
|
+
"""Check if two boxes have any overlap at all."""
|
194
|
+
x1_min, y1_min, x1_max, y1_max = box1
|
195
|
+
x2_min, y2_min, x2_max, y2_max = box2
|
196
|
+
|
197
|
+
# Check if boxes are separated
|
198
|
+
if x1_max <= x2_min or x2_max <= x1_min:
|
199
|
+
return False
|
200
|
+
if y1_max <= y2_min or y2_max <= y1_min:
|
201
|
+
return False
|
202
|
+
|
203
|
+
# If we get here, boxes overlap
|
204
|
+
return True
|
205
|
+
|
206
|
+
def _compute_intersection_ratio(self, box1: tuple, box2: tuple) -> float:
|
207
|
+
"""
|
208
|
+
Compute intersection ratio relative to the smaller box.
|
209
|
+
This is more aggressive than IoU for checkbox detection.
|
210
|
+
"""
|
211
|
+
x1_min, y1_min, x1_max, y1_max = box1
|
212
|
+
x2_min, y2_min, x2_max, y2_max = box2
|
213
|
+
|
214
|
+
# Intersection
|
215
|
+
inter_xmin = max(x1_min, x2_min)
|
216
|
+
inter_ymin = max(y1_min, y2_min)
|
217
|
+
inter_xmax = min(x1_max, x2_max)
|
218
|
+
inter_ymax = min(y1_max, y2_max)
|
219
|
+
|
220
|
+
if inter_xmax < inter_xmin or inter_ymax < inter_ymin:
|
221
|
+
return 0.0
|
222
|
+
|
223
|
+
inter_area = (inter_xmax - inter_xmin) * (inter_ymax - inter_ymin)
|
224
|
+
|
225
|
+
# Areas of both boxes
|
226
|
+
area1 = (x1_max - x1_min) * (y1_max - y1_min)
|
227
|
+
area2 = (x2_max - x2_min) * (y2_max - y2_min)
|
228
|
+
|
229
|
+
# Ratio relative to smaller box
|
230
|
+
smaller_area = min(area1, area2)
|
231
|
+
if smaller_area == 0:
|
232
|
+
return 0.0
|
233
|
+
|
234
|
+
return inter_area / smaller_area
|
235
|
+
|
236
|
+
def _compute_iou(self, box1: tuple, box2: tuple) -> float:
|
237
|
+
"""Compute IoU between two boxes."""
|
238
|
+
x1_min, y1_min, x1_max, y1_max = box1
|
239
|
+
x2_min, y2_min, x2_max, y2_max = box2
|
240
|
+
|
241
|
+
# Intersection
|
242
|
+
inter_xmin = max(x1_min, x2_min)
|
243
|
+
inter_ymin = max(y1_min, y2_min)
|
244
|
+
inter_xmax = min(x1_max, x2_max)
|
245
|
+
inter_ymax = min(y1_max, y2_max)
|
246
|
+
|
247
|
+
if inter_xmax < inter_xmin or inter_ymax < inter_ymin:
|
248
|
+
return 0.0
|
249
|
+
|
250
|
+
inter_area = (inter_xmax - inter_xmin) * (inter_ymax - inter_ymin)
|
251
|
+
|
252
|
+
# Union
|
253
|
+
area1 = (x1_max - x1_min) * (y1_max - y1_min)
|
254
|
+
area2 = (x2_max - x2_min) * (y2_max - y2_min)
|
255
|
+
union_area = area1 + area2 - inter_area
|
256
|
+
|
257
|
+
if union_area == 0:
|
258
|
+
return 0.0
|
259
|
+
|
260
|
+
return inter_area / union_area
|
261
|
+
|
262
|
+
def __del__(self):
|
263
|
+
"""Cleanup resources."""
|
264
|
+
self.logger.info(f"Cleaning up {self.__class__.__name__} resources.")
|
265
|
+
self._model_cache.clear()
|