natural-pdf 0.1.24__tar.gz → 0.1.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.1.24/natural_pdf.egg-info → natural_pdf-0.1.27}/PKG-INFO +1 -1
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/ocr/index.md +18 -29
- {natural_pdf-0.1.24/natural_pdf/templates/spa → natural_pdf-0.1.27/docs/ocr-tool}/css/style.css +4 -0
- natural_pdf-0.1.27/docs/ocr-tool/index.html +31 -0
- natural_pdf-0.1.27/docs/ocr-tool/js/vendor/FileSaver.min.js +3 -0
- natural_pdf-0.1.27/docs/ocr-tool/js/vendor/babel.min.js +2 -0
- natural_pdf-0.1.27/docs/ocr-tool/js/vendor/hooks.umd.js +2 -0
- natural_pdf-0.1.27/docs/ocr-tool/js/vendor/htm-preact.umd.min.js +1 -0
- natural_pdf-0.1.27/docs/ocr-tool/js/vendor/htm.umd.min.js +1 -0
- natural_pdf-0.1.27/docs/ocr-tool/js/vendor/jszip.min.js +13 -0
- natural_pdf-0.1.27/docs/ocr-tool/js/vendor/preact.umd.min.js +1 -0
- natural_pdf-0.1.27/docs/ocr-tool/js/vendor/react-dom.development.js +29924 -0
- natural_pdf-0.1.27/docs/ocr-tool/js/vendor/react.development.js +3343 -0
- natural_pdf-0.1.27/docs/reflowing-pages/index.ipynb +358 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/reflowing-pages/index.md +4 -3
- natural_pdf-0.1.27/docs/tutorials/01-loading-and-extraction.ipynb +312 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/02-finding-elements.ipynb +42 -42
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/03-extracting-blocks.ipynb +17 -17
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/04-table-extraction.ipynb +30 -30
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/05-excluding-content.ipynb +29 -29
- natural_pdf-0.1.27/docs/tutorials/06-document-qa.ipynb +445 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/07-layout-analysis.ipynb +41 -41
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/07-working-with-regions.ipynb +58 -58
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/08-spatial-navigation.ipynb +71 -71
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/09-section-extraction.ipynb +109 -109
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/10-form-field-extraction.ipynb +57 -57
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/11-enhanced-table-processing.ipynb +141 -141
- natural_pdf-0.1.27/docs/tutorials/12-ocr-integration.ipynb +4771 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/13-semantic-search.ipynb +112 -112
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/core/page.py +66 -7
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/describe/summary.py +2 -2
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/line.py +9 -4
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/region.py +48 -12
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/text.py +50 -1
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/qa/document_qa.py +62 -8
- natural_pdf-0.1.27/natural_pdf/templates/spa/css/style.css +338 -0
- natural_pdf-0.1.27/natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf-0.1.27/natural_pdf/templates/spa/words.txt +235976 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/packaging.py +23 -9
- {natural_pdf-0.1.24 → natural_pdf-0.1.27/natural_pdf.egg-info}/PKG-INFO +1 -1
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf.egg-info/SOURCES.txt +13 -0
- natural_pdf-0.1.27/pdfs/needs-ocr.pdf +0 -0
- natural_pdf-0.1.24/docs/reflowing-pages/index.ipynb +0 -360
- natural_pdf-0.1.24/docs/tutorials/01-loading-and-extraction.ipynb +0 -312
- natural_pdf-0.1.24/docs/tutorials/06-document-qa.ipynb +0 -445
- natural_pdf-0.1.24/docs/tutorials/12-ocr-integration.ipynb +0 -4733
- natural_pdf-0.1.24/pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.cursor/rules/analysis_framework.mdc +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.cursor/rules/coding-style.mdc +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.cursor/rules/minimal-comments.mdc +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.cursor/rules/natural-pdf-overview.mdc +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.cursor/rules/user-friendly-library-code.mdc +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.gitignore +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.pre-commit-config.yaml +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/01-execute_notebooks.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/02-run_all_tutorials.sh +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/CLAUDE.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/LICENSE +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/MANIFEST.in +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/README.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/audit_packaging.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/check_run_md.sh +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/api/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/sample-screen.png +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/categorizing-documents/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/data-extraction/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/describe/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/describe/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/document-qa/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/element-selection/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/extracting-clean-text/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/extracting-clean-text/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/finetuning/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/fix-messy-tables/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/fix-messy-tables/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/fix-messy-tables/table_1.csv +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/fix-messy-tables/table_2.csv +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/fix-messy-tables/table_3.csv +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/installation/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/interactive-widget/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/layout-analysis/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/loops-and-groups/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/loops-and-groups/index.md +0 -0
- {natural_pdf-0.1.24/natural_pdf/templates/spa → natural_pdf-0.1.27/docs/ocr-tool}/js/app.js +0 -0
- {natural_pdf-0.1.24/natural_pdf/templates/spa → natural_pdf-0.1.27/docs/ocr-tool}/words.txt +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/pdf-navigation/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/process-forms-and-invoices/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/process-forms-and-invoices/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/quick-reference/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/quick-reference/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/regions/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/regions/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tables/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tables/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/text-analysis/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/text-extraction/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/05-excluding-content.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/12-ocr-integration.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/13-semantic-search.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/14-categorizing-documents.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/14-categorizing-documents.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/visual-debugging/index.ipynb +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/mkdocs.yml +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/gemini.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/layout_options.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/surya.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/classification/manager.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/classification/mixin.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/classification/results.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/cli.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/collections/mixins.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/collections/pdf_collection.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/core/element_manager.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/core/pdf.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/describe/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/describe/base.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/describe/elements.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/describe/mixin.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/base.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/collections.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/export/mixin.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/base.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/data/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/data/pdf.ttf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/data/sRGB.icc +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/hocr.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/hocr_font.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/original_pdf.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/paddleocr.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/searchable_pdf.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/extraction/manager.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/extraction/mixin.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/extraction/result.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/flows/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/flows/collections.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/flows/element.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/flows/flow.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/flows/region.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/engine_doctr.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/ocr_factory.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/utils.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/qa/qa_result.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/search/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/search/lancedb_search_service.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/search/numpy_search_service.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/search/search_options.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/search/search_service_protocol.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/search/searchable_mixin.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/selectors/parser.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/templates/spa/index.html +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/debug.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/identifiers.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/locks.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/text_extraction.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf.egg-info/entry_points.txt +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf.egg-info/requires.txt +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf.egg-info/top_level.txt +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/noxfile.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/.gitkeep +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/01-practice.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/0500000US42001.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/0500000US42007.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/1107231007033739008.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/2014 Statistics.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/2019 Statistics.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/30.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/anexo_edital_6604_1743480-table.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/appendix_fy2026.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/cia-doc.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/geometry.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/image.png +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/image.png.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/multicolumn.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/red.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/tiny-ocr-2.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/tiny-ocr-3.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/tiny-ocr-small.jpg +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/tiny-ocr-wide.jpg +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/tiny-ocr.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/tiny.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/word-counter.pdf +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/publish.sh +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pyproject.toml +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/sample-screen.png +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/setup.cfg +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/test_install.sh +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/conftest.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/exporters/test_paddleocr_exporter.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_core/test_containment_geometry.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_core/test_elements.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_core/test_loading.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_core/test_spatial.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_core/test_text_extraction.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_loading_original.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_optional_deps.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_tutorials.py +0 -0
- {natural_pdf-0.1.24 → natural_pdf-0.1.27}/uv.lock +0 -0
@@ -6,12 +6,12 @@ Got a PDF that's actually just a bunch of scanned images? Or maybe a PDF where t
|
|
6
6
|
|
7
7
|
Natural PDF supports multiple OCR engines, each with different strengths:
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
9
|
+
- [EasyOCR](https://github.com/JaidedAI/EasyOCR)
|
10
|
+
- [PaddleOCR](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html)
|
11
|
+
- [Surya](https://github.com/datalab-to/surya)
|
12
|
+
- [DocTR](https://github.com/mindee/doctr)
|
13
|
+
|
14
|
+
What are those strengths??? It honestly doesn't even matter, *it's so easy to try each of them you can just see what works best for you*.
|
15
15
|
|
16
16
|
If you try to use an engine that isn't installed, Natural PDF will tell you exactly what to install.
|
17
17
|
|
@@ -27,7 +27,7 @@ pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-o
|
|
27
27
|
page = pdf.pages[0]
|
28
28
|
|
29
29
|
# Apply OCR using the default engine
|
30
|
-
ocr_elements = page.apply_ocr(
|
30
|
+
ocr_elements = page.apply_ocr()
|
31
31
|
|
32
32
|
# Extract the text (uses OCR results automatically)
|
33
33
|
text = page.extract_text()
|
@@ -68,29 +68,16 @@ easy_opts = EasyOCROptions(
|
|
68
68
|
batch_size=8 # Process multiple regions at once
|
69
69
|
)
|
70
70
|
ocr_elements = page.apply_ocr(engine='easyocr', options=easy_opts)
|
71
|
-
|
72
|
-
# Configure Surya for high-accuracy line detection
|
73
|
-
surya_opts = SuryaOCROptions(
|
74
|
-
languages=['en', 'de'],
|
75
|
-
min_confidence=0.4 # Minimum confidence for results
|
76
|
-
)
|
77
|
-
ocr_elements = page.apply_ocr(engine='surya', options=surya_opts)
|
78
71
|
```
|
79
72
|
|
80
|
-
##
|
73
|
+
## OCRing regions
|
81
74
|
|
82
|
-
|
75
|
+
Don't want to apply OCR to an entire page? You don't need to!
|
83
76
|
|
84
77
|
```python
|
85
|
-
#
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
# You can also OCR just a specific region
|
90
|
-
title = page.find('text:contains("Title")')
|
91
|
-
if title:
|
92
|
-
content_region = title.below(height=300)
|
93
|
-
region_ocr_elements = content_region.apply_ocr(engine='paddle', languages=['en'])
|
78
|
+
# Grab the top half of the page
|
79
|
+
region = page.region(0, 0, height=page.height/2, width=page.width)
|
80
|
+
region.apply_ocr(engine='paddle')
|
94
81
|
```
|
95
82
|
|
96
83
|
*Note: Running OCR again on the same area will replace the previous OCR results.*
|
@@ -195,7 +182,10 @@ Natural PDF includes a web app for reviewing and correcting OCR results:
|
|
195
182
|
create_correction_task_package(pdf, "correction_package.zip", overwrite=True)
|
196
183
|
```
|
197
184
|
|
198
|
-
2. **
|
185
|
+
2. **Visit [the live OCR tool](https://jsoma.github.io/natural-pdf/ocr-tool)** and upload your zip file.
|
186
|
+
|
187
|
+
If you're a crazy person, alternatively you can do it locally like this:
|
188
|
+
|
199
189
|
```bash
|
200
190
|
# Find where Natural PDF is installed
|
201
191
|
NATURAL_PDF_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/natural_pdf
|
@@ -203,10 +193,9 @@ Natural PDF includes a web app for reviewing and correcting OCR results:
|
|
203
193
|
# Start the web server
|
204
194
|
cd $NATURAL_PDF_PATH/templates/spa
|
205
195
|
python -m http.server 8000
|
206
|
-
```
|
207
196
|
|
208
|
-
|
209
|
-
|
197
|
+
# Open http://localhost:8000 in your browser
|
198
|
+
```
|
210
199
|
|
211
200
|
## Next Steps
|
212
201
|
|
@@ -0,0 +1,31 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html lang="en">
|
3
|
+
<head>
|
4
|
+
<meta charset="UTF-8">
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6
|
+
<title>OCR Correction Tool</title>
|
7
|
+
<link rel="stylesheet" href="css/style.css">
|
8
|
+
</head>
|
9
|
+
<body>
|
10
|
+
<header>
|
11
|
+
<h1>OCR Correction Tool</h1>
|
12
|
+
</header>
|
13
|
+
|
14
|
+
<main id="app">
|
15
|
+
<p>Loading application...</p>
|
16
|
+
</main>
|
17
|
+
|
18
|
+
<footer>
|
19
|
+
<p>Generated by natural-pdf</p>
|
20
|
+
</footer>
|
21
|
+
|
22
|
+
<script src="js/vendor/react.development.js"></script>
|
23
|
+
<script src="js/vendor/react-dom.development.js"></script>
|
24
|
+
<script src="js/vendor/babel.min.js"></script>
|
25
|
+
<script src="js/vendor/jszip.min.js"></script>
|
26
|
+
<script src="js/vendor/FileSaver.min.js"></script>
|
27
|
+
|
28
|
+
<script type="text/babel" src="js/app.js"></script>
|
29
|
+
|
30
|
+
</body>
|
31
|
+
</html>
|
@@ -0,0 +1,3 @@
|
|
1
|
+
/*
 * FileSaver (minified, vendored): UMD wrapper that installs a `saveAs` function.
 *
 * Wrapper: if an AMD `define` is present the factory is registered with it;
 * otherwise, if `exports` is defined, the factory is simply run; otherwise the
 * factory is run and `FileSaver` is assigned on the outer `this`.
 *
 * Inner helpers (minified names):
 *   b(blob, opts)      - normalises opts (undefined -> {autoBom:false}; a non-object
 *                        logs a deprecation warning and becomes {autoBom:!opts});
 *                        when autoBom is set and blob.type matches a text/XML type
 *                        with a utf-8 charset, returns a new Blob prefixed with U+FEFF.
 *   c(url, name, opts) - fetches url via XHR GET as a blob and passes it to saveAs;
 *                        logs "could not download file" on error.
 *   d(url)             - synchronous HEAD request; true when status is 200..299.
 *   e(node)            - dispatches a click MouseEvent on node, falling back to
 *                        document.createEvent/initMouseEvent if construction throws.
 *
 * f is the detected global object (window, self, or global); a is a user-agent test
 * for Macintosh + AppleWebKit without "Safari" (presumably an embedded web view;
 * confirm against upstream docs).
 *
 * g (saveAs) is f.saveAs if already defined, else chosen in this order:
 *   1. a no-op when not running in a window;
 *   2. an <a download> implementation when the anchor `download` property exists
 *      and `a` is false (object URLs are revoked after 4E4 ms);
 *   3. navigator.msSaveOrOpenBlob when available;
 *   4. a fallback that opens a popup and navigates it to a FileReader data URL
 *      or an object URL.
 *
 * Finally saveAs is stored on the global object (and on itself as g.saveAs) and,
 * when `module` is defined, assigned to module.exports.
 *
 * NOTE(review): if none of window/self/global is detected, f is undefined and the
 * `f.navigator` access would throw; verify that is acceptable for the targets.
 */
(function(a,b){if("function"==typeof define&&define.amd)define([],b);else if("undefined"!=typeof exports)b();else{b(),a.FileSaver={exports:{}}.exports}})(this,function(){"use strict";function b(a,b){return"undefined"==typeof b?b={autoBom:!1}:"object"!=typeof b&&(console.warn("Deprecated: Expected third argument to be a object"),b={autoBom:!b}),b.autoBom&&/^\s*(?:text\/\S*|application\/xml|\S*\/\S*\+xml)\s*;.*charset\s*=\s*utf-8/i.test(a.type)?new Blob(["\uFEFF",a],{type:a.type}):a}function c(a,b,c){var d=new XMLHttpRequest;d.open("GET",a),d.responseType="blob",d.onload=function(){g(d.response,b,c)},d.onerror=function(){console.error("could not download file")},d.send()}function d(a){var b=new XMLHttpRequest;b.open("HEAD",a,!1);try{b.send()}catch(a){}return 200<=b.status&&299>=b.status}function e(a){try{a.dispatchEvent(new MouseEvent("click"))}catch(c){var b=document.createEvent("MouseEvents");b.initMouseEvent("click",!0,!0,window,0,0,0,80,20,!1,!1,!1,!1,0,null),a.dispatchEvent(b)}}var f="object"==typeof window&&window.window===window?window:"object"==typeof self&&self.self===self?self:"object"==typeof global&&global.global===global?global:void 0,a=f.navigator&&/Macintosh/.test(navigator.userAgent)&&/AppleWebKit/.test(navigator.userAgent)&&!/Safari/.test(navigator.userAgent),g=f.saveAs||("object"!=typeof window||window!==f?function(){}:"download"in HTMLAnchorElement.prototype&&!a?function(b,g,h){var i=f.URL||f.webkitURL,j=document.createElement("a");g=g||b.name||"download",j.download=g,j.rel="noopener","string"==typeof b?(j.href=b,j.origin===location.origin?e(j):d(j.href)?c(b,g,h):e(j,j.target="_blank")):(j.href=i.createObjectURL(b),setTimeout(function(){i.revokeObjectURL(j.href)},4E4),setTimeout(function(){e(j)},0))}:"msSaveOrOpenBlob"in navigator?function(f,g,h){if(g=g||f.name||"download","string"!=typeof f)navigator.msSaveOrOpenBlob(b(f,h),g);else if(d(f))c(f,g,h);else{var 
i=document.createElement("a");i.href=f,i.target="_blank",setTimeout(function(){e(i)})}}:function(b,d,e,g){if(g=g||open("","_blank"),g&&(g.document.title=g.document.body.innerText="downloading..."),"string"==typeof b)return c(b,d,e);var h="application/octet-stream"===b.type,i=/constructor/i.test(f.HTMLElement)||f.safari,j=/CriOS\/[\d]+/.test(navigator.userAgent);if((j||h&&i||a)&&"undefined"!=typeof FileReader){var k=new FileReader;k.onloadend=function(){var a=k.result;a=j?a:a.replace(/^data:[^;]*;/,"data:attachment/file;"),g?g.location.href=a:location=a,g=null},k.readAsDataURL(b)}else{var l=f.URL||f.webkitURL,m=l.createObjectURL(b);g?g.location=m:location.href=m,g=null,setTimeout(function(){l.revokeObjectURL(m)},4E4)}});f.saveAs=g.saveAs=g,"undefined"!=typeof module&&(module.exports=g)});
|
2
|
+
|
3
|
+
//# sourceMappingURL=FileSaver.min.js.map
|