natural-pdf 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/.gitignore +5 -1
- natural_pdf-0.1.3/PKG-INFO +137 -0
- natural_pdf-0.1.3/README.md +85 -0
- natural_pdf-0.1.3/docs/assets/sample-screen.png +0 -0
- natural_pdf-0.1.3/docs/index.md +170 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/installation/index.md +1 -2
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/regions/index.ipynb +124 -158
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/regions/index.md +4 -10
- natural_pdf-0.1.3/docs/tutorials/01-loading-and-extraction.ipynb +1658 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/02-finding-elements.ipynb +43 -47
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/03-extracting-blocks.ipynb +18 -22
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/04-table-extraction.ipynb +13 -17
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/05-excluding-content.ipynb +66 -39
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/05-excluding-content.md +13 -10
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/06-document-qa.ipynb +29 -33
- natural_pdf-0.1.3/docs/tutorials/07-layout-analysis.ipynb +260 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/07-working-with-regions.ipynb +49 -53
- natural_pdf-0.1.3/docs/tutorials/08-spatial-navigation.ipynb +508 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/09-section-extraction.ipynb +98 -102
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/10-form-field-extraction.ipynb +51 -55
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/11-enhanced-table-processing.ipynb +7 -11
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/12-ocr-integration.ipynb +173 -65
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/12-ocr-integration.md +32 -0
- natural_pdf-0.1.3/docs/tutorials/13-semantic-search.ipynb +1908 -0
- natural_pdf-0.1.3/docs/tutorials/13-semantic-search.md +77 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/mkdocs.yml +2 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/__init__.py +33 -1
- natural_pdf-0.1.3/natural_pdf/analyzers/layout/layout_analyzer.py +255 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/layout_manager.py +9 -6
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/layout_options.py +2 -4
- natural_pdf-0.1.3/natural_pdf/analyzers/layout/surya.py +259 -0
- natural_pdf-0.1.3/natural_pdf/collections/pdf_collection.py +259 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/core/page.py +97 -69
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/core/pdf.py +382 -171
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/region.py +55 -26
- natural_pdf-0.1.3/natural_pdf/exporters/__init__.py +1 -0
- natural_pdf-0.1.3/natural_pdf/exporters/searchable_pdf.py +252 -0
- natural_pdf-0.1.3/natural_pdf/search/__init__.py +94 -0
- natural_pdf-0.1.3/natural_pdf/search/haystack_search_service.py +520 -0
- natural_pdf-0.1.3/natural_pdf/search/haystack_utils.py +386 -0
- natural_pdf-0.1.3/natural_pdf/search/search_options.py +72 -0
- natural_pdf-0.1.3/natural_pdf/search/search_service_protocol.py +189 -0
- natural_pdf-0.1.3/natural_pdf/search/searchable_mixin.py +464 -0
- natural_pdf-0.1.3/natural_pdf.egg-info/PKG-INFO +137 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf.egg-info/SOURCES.txt +13 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf.egg-info/requires.txt +10 -0
- natural_pdf-0.1.3/notebooks/Examples.ipynb +1293 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pyproject.toml +12 -2
- natural_pdf-0.1.3/sample-screen.png +0 -0
- natural_pdf-0.1.1/PKG-INFO +0 -295
- natural_pdf-0.1.1/README.md +0 -252
- natural_pdf-0.1.1/docs/index.md +0 -299
- natural_pdf-0.1.1/docs/tutorials/01-loading-and-extraction.ipynb +0 -1137
- natural_pdf-0.1.1/docs/tutorials/07-layout-analysis.ipynb +0 -264
- natural_pdf-0.1.1/docs/tutorials/08-spatial-navigation.ipynb +0 -512
- natural_pdf-0.1.1/natural_pdf/analyzers/layout/layout_analyzer.py +0 -166
- natural_pdf-0.1.1/natural_pdf/analyzers/layout/surya.py +0 -151
- natural_pdf-0.1.1/natural_pdf.egg-info/PKG-INFO +0 -295
- natural_pdf-0.1.1/notebooks/Examples.ipynb +0 -1166
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/.github/workflows/docs.yml +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/CLAUDE.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/LICENSE +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/MANIFEST.in +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/check_run_md.sh +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/api/index.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/favicon.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/favicon.svg +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/javascripts/custom.js +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/logo.svg +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/social-preview.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/social-preview.svg +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/stylesheets/custom.css +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/document-qa/index.ipynb +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/document-qa/index.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/element-selection/index.ipynb +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/element-selection/index.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/interactive-widget/index.ipynb +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/interactive-widget/index.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/layout-analysis/index.ipynb +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/ocr/index.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/pdf-navigation/index.ipynb +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tables/index.ipynb +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tables/index.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/text-analysis/index.ipynb +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/text-analysis/index.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/text-extraction/index.ipynb +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/text-extraction/index.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/01-loading-and-extraction.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/02-finding-elements.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/03-extracting-blocks.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/04-table-extraction.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/06-document-qa.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/07-layout-analysis.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/07-working-with-regions.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/08-spatial-navigation.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/09-section-extraction.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/10-form-field-extraction.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/11-enhanced-table-processing.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/visual-debugging/index.ipynb +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/visual-debugging/region.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/execute_notebooks.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/__init__.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/base.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/docling.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/paddle.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/tatr.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/yolo.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/text_options.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/utils.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/core/element_manager.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/core/highlighting_service.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/base.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/collections.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/engine_easyocr.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/engine_paddle.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/engine_surya.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/ocr_manager.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/ocr_options.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/qa/document_qa.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/selectors/parser.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/templates/ocr_debug.html +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/widgets/__init__.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/widgets/frontend/viewer.js +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/widgets/viewer.py +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf.egg-info/top_level.txt +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/all_detected_regions.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/all_elements.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/basic_highlighting.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/chainable_layout.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/chained_analysis.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/color_names.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/color_names_with_boxes.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/conf_display_highlight_all.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/conf_display_highlight_layout.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/conf_display_layout_only.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/confidence_color_coded.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/debug_page_image.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/detected_table.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/dimension_analysis.txt +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/direct_ocr_debug.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/easyocr_debug_input.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/easyocr_results.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/easyocr_test_input.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/exclusion_optimization_regions.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/explicit_confidence_display.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/footer_overlap_test.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_all.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_all_styles.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_all_with_all_layouts.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_all_with_attrs.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_all_with_yolo.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_by_confidence.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_color_test_1.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_color_test_2.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_color_test_3.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_color_test_4.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_layout_method.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_multiple.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_no_attrs.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_region.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_single.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_specific_types.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_specific_types_with_boxes.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_specific_types_with_tables.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test_colors.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test_individual.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test_individual_annotated.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test_individual_with_structure.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test_individual_with_structure_yolo.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test_individual_with_tables.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_with_attrs.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_conf_default.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_conf_high.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_detection.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_fix_test.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_fix_test2.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_fix_test3.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_fix_test4.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/model_comparison.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/multiple_attributes_display.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_confidence_visualization.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_debug.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_debug_page.html +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_highlight_all_test.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_highlight_test.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_highlighted.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_simplified.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_threshold_comparison.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_visualization_clean.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_visualization_highlights.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_visualization_text.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddle_layout_detection.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddle_layout_polygons.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddle_layout_sources.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddle_layout_with_text.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddle_layout_without_text.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddleocr_highlights.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddleocr_results.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddleocr_test_input.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/page_1_for_ocr.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/page_4_for_ocr.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/region_exclusion_test.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/region_management_test.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/region_ocr_cropped.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/region_ocr_debug.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/region_ocr_full_page.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/region_ocr_highlighted.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/spatial_navigation.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/standard_highlight_all.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/table_no_ocr.csv +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/table_structure.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/table_structure_detail.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/table_with_ocr.csv +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/tatr_cells_test.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/tatr_ocr_table_test.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/tatr_regions.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/tatr_regions.txt +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/text_styles.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/titles_only.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/width_1200px.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/width_800px.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/width_default.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/width_with_scale.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/yolo_regions.png +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/yolo_regions.txt +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/.gitkeep +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/01-practice.pdf +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/0500000US42001.pdf +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/0500000US42007.pdf +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/2014 Statistics.pdf +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/2019 Statistics.pdf +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/publish.sh +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/run_all_tutorials.sh +0 -0
- {natural_pdf-0.1.1 → natural_pdf-0.1.3}/setup.cfg +0 -0
@@ -5,7 +5,11 @@ docs/tutorials/pdfs
|
|
5
5
|
install.sh
|
6
6
|
notebooks/Examples.md
|
7
7
|
transcript.md
|
8
|
-
|
8
|
+
natural_pdf_index
|
9
|
+
results
|
10
|
+
docs/tutorials/needs-ocr-searchable.pdf
|
11
|
+
sample.py
|
12
|
+
sample2.py
|
9
13
|
|
10
14
|
# Created by https://www.toptal.com/developers/gitignore/api/python,macos,visualstudiocode,jupyternotebooks
|
11
15
|
# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,visualstudiocode,jupyternotebooks
|
@@ -0,0 +1,137 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: natural-pdf
|
3
|
+
Version: 0.1.3
|
4
|
+
Summary: A more intuitive interface for working with PDFs
|
5
|
+
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
|
+
License-Expression: MIT
|
7
|
+
Project-URL: Homepage, https://github.com/jsoma/natural-pdf
|
8
|
+
Project-URL: Repository, https://github.com/jsoma/natural-pdf
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
10
|
+
Classifier: Operating System :: OS Independent
|
11
|
+
Requires-Python: >=3.7
|
12
|
+
Description-Content-Type: text/markdown
|
13
|
+
License-File: LICENSE
|
14
|
+
Requires-Dist: pdfplumber>=0.7.0
|
15
|
+
Requires-Dist: Pillow>=8.0.0
|
16
|
+
Requires-Dist: colour>=0.1.5
|
17
|
+
Requires-Dist: numpy>=1.20.0
|
18
|
+
Requires-Dist: urllib3>=1.26.0
|
19
|
+
Requires-Dist: torch>=2.0.0
|
20
|
+
Requires-Dist: torchvision>=0.15.0
|
21
|
+
Requires-Dist: transformers>=4.30.0
|
22
|
+
Requires-Dist: huggingface_hub>=0.19.0
|
23
|
+
Requires-Dist: ocrmypdf>=16.0.0
|
24
|
+
Requires-Dist: pikepdf>=10.0.0
|
25
|
+
Provides-Extra: interactive
|
26
|
+
Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
|
27
|
+
Provides-Extra: haystack
|
28
|
+
Requires-Dist: haystack-ai>=2.0.0b5; extra == "haystack"
|
29
|
+
Requires-Dist: chroma-haystack; extra == "haystack"
|
30
|
+
Requires-Dist: sentence-transformers; extra == "haystack"
|
31
|
+
Provides-Extra: easyocr
|
32
|
+
Requires-Dist: easyocr; extra == "easyocr"
|
33
|
+
Provides-Extra: paddle
|
34
|
+
Requires-Dist: paddlepaddle; extra == "paddle"
|
35
|
+
Requires-Dist: paddleocr; extra == "paddle"
|
36
|
+
Provides-Extra: layout-yolo
|
37
|
+
Requires-Dist: doclayout_yolo; extra == "layout-yolo"
|
38
|
+
Provides-Extra: surya
|
39
|
+
Requires-Dist: surya-ocr; extra == "surya"
|
40
|
+
Provides-Extra: qa
|
41
|
+
Provides-Extra: all
|
42
|
+
Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "all"
|
43
|
+
Requires-Dist: easyocr; extra == "all"
|
44
|
+
Requires-Dist: paddlepaddle; extra == "all"
|
45
|
+
Requires-Dist: paddleocr; extra == "all"
|
46
|
+
Requires-Dist: doclayout_yolo; extra == "all"
|
47
|
+
Requires-Dist: surya-ocr; extra == "all"
|
48
|
+
Requires-Dist: haystack-ai>=2.0.0b5; extra == "all"
|
49
|
+
Requires-Dist: chroma-haystack; extra == "all"
|
50
|
+
Requires-Dist: sentence-transformers; extra == "all"
|
51
|
+
Dynamic: license-file
|
52
|
+
|
53
|
+
# Natural PDF
|
54
|
+
|
55
|
+
A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
|
56
|
+
|
57
|
+
Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
|
58
|
+
|
59
|
+
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
|
60
|
+
- [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
|
61
|
+
|
62
|
+
<div style="max-width: 400px; margin: auto"><a href="sample-screen.png"><img src="sample-screen.png"></a></div>
|
63
|
+
|
64
|
+
## Installation
|
65
|
+
|
66
|
+
```bash
|
67
|
+
pip install natural-pdf
|
68
|
+
```
|
69
|
+
|
70
|
+
For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install extras:
|
71
|
+
|
72
|
+
```bash
|
73
|
+
# Example: Install with support for a specific OCR engine (EasyOCR, Surya, or PaddleOCR)
|
74
|
+
pip install natural-pdf[easyocr]
|
75
|
+
pip install natural-pdf[surya]
|
76
|
+
pip install natural-pdf[paddle]
|
77
|
+
|
78
|
+
# Example: Install with interactive viewer support
|
79
|
+
pip install natural-pdf[interactive]
|
80
|
+
|
81
|
+
# Example: Install with semantic search support (Haystack)
|
82
|
+
pip install natural-pdf[haystack]
|
83
|
+
|
84
|
+
# Install everything
|
85
|
+
pip install natural-pdf[all]
|
86
|
+
```
|
87
|
+
|
88
|
+
See the [installation guide](https://jsoma.github.io/natural-pdf/installation/) for more details on extras.
|
89
|
+
|
90
|
+
## Quick Start
|
91
|
+
|
92
|
+
```python
|
93
|
+
from natural_pdf import PDF
|
94
|
+
|
95
|
+
# Open a PDF
|
96
|
+
pdf = PDF('document.pdf')
|
97
|
+
page = pdf.pages[0]
|
98
|
+
|
99
|
+
# Find elements using CSS-like selectors
|
100
|
+
heading = page.find('text:contains("Summary"):bold')
|
101
|
+
|
102
|
+
# Extract content below the heading
|
103
|
+
content = heading.below().extract_text()
|
104
|
+
print("Content below Summary:", content[:100] + "...")
|
105
|
+
|
106
|
+
# Exclude headers/footers automatically (example)
|
107
|
+
# You might define these based on common text or position
|
108
|
+
page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
|
109
|
+
page.add_exclusion(page.find_all('line')[-1].below())
|
110
|
+
|
111
|
+
# Extract clean text from the page
|
112
|
+
clean_text = page.extract_text()
|
113
|
+
print("\nClean page text:", clean_text[:200] + "...")
|
114
|
+
|
115
|
+
# Highlight the heading and view the page
|
116
|
+
heading.highlight(color='red')
|
117
|
+
page.to_image()
|
118
|
+
```
|
119
|
+
|
120
|
+
And as a fun bonus, `page.viewer()` gives you an interactive way to explore the PDF.
|
121
|
+
|
122
|
+
## Key Features
|
123
|
+
|
124
|
+
Natural PDF offers a range of features for working with PDFs:
|
125
|
+
|
126
|
+
* **CSS-like Selectors:** Find elements using intuitive query strings (`page.find('text:bold')`).
|
127
|
+
* **Spatial Navigation:** Select content relative to other elements (`heading.below()`, `element.select_until(...)`).
|
128
|
+
* **Text & Table Extraction:** Get clean text or structured table data, automatically handling exclusions.
|
129
|
+
* **OCR Integration:** Extract text from scanned documents using engines like EasyOCR, PaddleOCR, or Surya.
|
130
|
+
* **Layout Analysis:** Detect document structures (titles, paragraphs, tables) using AI models.
|
131
|
+
* **Document QA:** Ask natural language questions about your document's content.
|
132
|
+
* **Semantic Search:** Index PDFs and find relevant pages or documents based on semantic meaning using Haystack.
|
133
|
+
* **Visual Debugging:** Highlight elements and use an interactive viewer or save images to understand your selections.
|
134
|
+
|
135
|
+
## Learn More
|
136
|
+
|
137
|
+
Dive deeper into the features and explore advanced usage in the [**Complete Documentation**](https://jsoma.github.io/natural-pdf).
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# Natural PDF
|
2
|
+
|
3
|
+
A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
|
4
|
+
|
5
|
+
Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
|
6
|
+
|
7
|
+
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
|
8
|
+
- [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
|
9
|
+
|
10
|
+
<div style="max-width: 400px; margin: auto"><a href="sample-screen.png"><img src="sample-screen.png"></a></div>
|
11
|
+
|
12
|
+
## Installation
|
13
|
+
|
14
|
+
```bash
|
15
|
+
pip install natural-pdf
|
16
|
+
```
|
17
|
+
|
18
|
+
For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install extras:
|
19
|
+
|
20
|
+
```bash
|
21
|
+
# Example: Install with support for a specific OCR engine (EasyOCR, Surya, or PaddleOCR)
|
22
|
+
pip install natural-pdf[easyocr]
|
23
|
+
pip install natural-pdf[surya]
|
24
|
+
pip install natural-pdf[paddle]
|
25
|
+
|
26
|
+
# Example: Install with interactive viewer support
|
27
|
+
pip install natural-pdf[interactive]
|
28
|
+
|
29
|
+
# Example: Install with semantic search support (Haystack)
|
30
|
+
pip install natural-pdf[haystack]
|
31
|
+
|
32
|
+
# Install everything
|
33
|
+
pip install natural-pdf[all]
|
34
|
+
```
|
35
|
+
|
36
|
+
See the [installation guide](https://jsoma.github.io/natural-pdf/installation/) for more details on extras.
|
37
|
+
|
38
|
+
## Quick Start
|
39
|
+
|
40
|
+
```python
|
41
|
+
from natural_pdf import PDF
|
42
|
+
|
43
|
+
# Open a PDF
|
44
|
+
pdf = PDF('document.pdf')
|
45
|
+
page = pdf.pages[0]
|
46
|
+
|
47
|
+
# Find elements using CSS-like selectors
|
48
|
+
heading = page.find('text:contains("Summary"):bold')
|
49
|
+
|
50
|
+
# Extract content below the heading
|
51
|
+
content = heading.below().extract_text()
|
52
|
+
print("Content below Summary:", content[:100] + "...")
|
53
|
+
|
54
|
+
# Exclude headers/footers automatically (example)
|
55
|
+
# You might define these based on common text or position
|
56
|
+
page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
|
57
|
+
page.add_exclusion(page.find_all('line')[-1].below())
|
58
|
+
|
59
|
+
# Extract clean text from the page
|
60
|
+
clean_text = page.extract_text()
|
61
|
+
print("\nClean page text:", clean_text[:200] + "...")
|
62
|
+
|
63
|
+
# Highlight the heading and view the page
|
64
|
+
heading.highlight(color='red')
|
65
|
+
page.to_image()
|
66
|
+
```
|
67
|
+
|
68
|
+
And as a fun bonus, `page.viewer()` gives you an interactive way to explore the PDF.
|
69
|
+
|
70
|
+
## Key Features
|
71
|
+
|
72
|
+
Natural PDF offers a range of features for working with PDFs:
|
73
|
+
|
74
|
+
* **CSS-like Selectors:** Find elements using intuitive query strings (`page.find('text:bold')`).
|
75
|
+
* **Spatial Navigation:** Select content relative to other elements (`heading.below()`, `element.select_until(...)`).
|
76
|
+
* **Text & Table Extraction:** Get clean text or structured table data, automatically handling exclusions.
|
77
|
+
* **OCR Integration:** Extract text from scanned documents using engines like EasyOCR, PaddleOCR, or Surya.
|
78
|
+
* **Layout Analysis:** Detect document structures (titles, paragraphs, tables) using AI models.
|
79
|
+
* **Document QA:** Ask natural language questions about your document's content.
|
80
|
+
* **Semantic Search:** Index PDFs and find relevant pages or documents based on semantic meaning using Haystack.
|
81
|
+
* **Visual Debugging:** Highlight elements and use an interactive viewer or save images to understand your selections.
|
82
|
+
|
83
|
+
## Learn More
|
84
|
+
|
85
|
+
Dive deeper into the features and explore advanced usage in the [**Complete Documentation**](https://jsoma.github.io/natural-pdf).
|
Binary file
|
@@ -0,0 +1,170 @@
|
|
1
|
+
# Natural PDF
|
2
|
+
|
3
|
+
A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
|
4
|
+
|
5
|
+
Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
|
6
|
+
|
7
|
+
- [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
|
8
|
+
|
9
|
+
<div style="max-width: 400px; margin: auto"><a href="assets/sample-screen.png"><img src="assets/sample-screen.png"></a></div>
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
```
|
14
|
+
pip install natural_pdf
|
15
|
+
# All the extras
|
16
|
+
pip install "natural_pdf[all]"
|
17
|
+
```
|
18
|
+
|
19
|
+
## Quick Example
|
20
|
+
|
21
|
+
```python
|
22
|
+
from natural_pdf import PDF
|
23
|
+
|
24
|
+
pdf = PDF('document.pdf')
|
25
|
+
page = pdf.pages[0]
|
26
|
+
|
27
|
+
# Find the title and get content below it
|
28
|
+
title = page.find('text:contains("Summary"):bold')
|
29
|
+
content = title.below().extract_text()
|
30
|
+
|
31
|
+
# Exclude everything above 'CONFIDENTIAL' and everything below the last line on the page
|
32
|
+
page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
|
33
|
+
page.add_exclusion(page.find_all('line')[-1].below())
|
34
|
+
|
35
|
+
# Get the clean text without header/footer
|
36
|
+
clean_text = page.extract_text()
|
37
|
+
```
|
38
|
+
|
39
|
+
## Key Features
|
40
|
+
|
41
|
+
Here are a few highlights of what you can do:
|
42
|
+
|
43
|
+
### Find Elements with Selectors
|
44
|
+
|
45
|
+
Use CSS-like selectors to find text, shapes, and more.
|
46
|
+
|
47
|
+
```python
|
48
|
+
# Find bold text containing "Revenue"
|
49
|
+
page.find('text:contains("Revenue"):bold').extract_text()
|
50
|
+
|
51
|
+
# Find all large text
|
52
|
+
page.find_all('text[size>=12]').extract_text()
|
53
|
+
```
|
54
|
+
|
55
|
+
[Learn more about selectors →](element-selection/index.ipynb)
|
56
|
+
|
57
|
+
### Navigate Spatially
|
58
|
+
|
59
|
+
Move around the page relative to elements, not just coordinates.
|
60
|
+
|
61
|
+
```python
|
62
|
+
# Extract text below a specific heading
|
63
|
+
intro_text = page.find('text:contains("Introduction")').below().extract_text()
|
64
|
+
|
65
|
+
# Extract text from one heading to the next
|
66
|
+
methods_text = page.find('text:contains("Methods")').below(
|
67
|
+
until='text:contains("Results")'
|
68
|
+
).extract_text()
|
69
|
+
```
|
70
|
+
|
71
|
+
[Explore more navigation methods →](pdf-navigation/index.ipynb)
|
72
|
+
|
73
|
+
### Extract Clean Text
|
74
|
+
|
75
|
+
Easily extract text content, automatically handling common page elements like headers and footers (if exclusions are set).
|
76
|
+
|
77
|
+
```python
|
78
|
+
# Extract all text from the page (respecting exclusions)
|
79
|
+
page_text = page.extract_text()
|
80
|
+
|
81
|
+
# Extract text from a specific region
|
82
|
+
some_region = page.find(...)
|
83
|
+
region_text = some_region.extract_text()
|
84
|
+
```
|
85
|
+
|
86
|
+
[Learn about text extraction →](text-extraction/index.ipynb)
|
87
|
+
[Learn about exclusion zones →](regions/index.ipynb#exclusion-zones)
|
88
|
+
|
89
|
+
### Apply OCR
|
90
|
+
|
91
|
+
Extract text from scanned documents using various OCR engines.
|
92
|
+
|
93
|
+
```python
|
94
|
+
# Apply OCR using the default engine
|
95
|
+
ocr_elements = page.apply_ocr()
|
96
|
+
|
97
|
+
# Extract text (will use OCR results if available)
|
98
|
+
text = page.extract_text()
|
99
|
+
```
|
100
|
+
|
101
|
+
[Explore OCR options →](ocr/index.md)
|
102
|
+
|
103
|
+
### Analyze Document Layout
|
104
|
+
|
105
|
+
Use AI models to detect document structures like titles, paragraphs, and tables.
|
106
|
+
|
107
|
+
```python
|
108
|
+
# Detect document structure
|
109
|
+
page.analyze_layout()
|
110
|
+
|
111
|
+
# Highlight titles and tables
|
112
|
+
page.find_all('region[type=title]').highlight(color="purple")
|
113
|
+
page.find_all('region[type=table]').highlight(color="blue")
|
114
|
+
|
115
|
+
# Extract data from the first table
|
116
|
+
table_data = page.find('region[type=table]').extract_table()
|
117
|
+
```
|
118
|
+
|
119
|
+
[Learn about layout models →](layout-analysis/index.ipynb)
|
120
|
+
[Working with tables? →](tables/index.ipynb)
|
121
|
+
|
122
|
+
### Document Question Answering
|
123
|
+
|
124
|
+
Ask natural-language questions about your documents and get answers directly from their content.
|
125
|
+
|
126
|
+
```python
|
127
|
+
# Ask a question
|
128
|
+
result = pdf.ask("What was the company's revenue in 2022?")
|
129
|
+
if result.get("found", False):
|
130
|
+
print(f"Answer: {result['answer']}")
|
131
|
+
```
|
132
|
+
|
133
|
+
[Learn about Document QA →](document-qa/index.ipynb)
|
134
|
+
|
135
|
+
### Visualize Your Work
|
136
|
+
|
137
|
+
Debug and understand your extractions visually.
|
138
|
+
|
139
|
+
```python
|
140
|
+
# Highlight headings
|
141
|
+
page.find_all('text[size>=14]').highlight(color="red", label="Headings")
|
142
|
+
|
143
|
+
# Launch the interactive viewer (Jupyter)
|
144
|
+
# Requires: pip install "natural_pdf[interactive]"
|
145
|
+
page.viewer()
|
146
|
+
|
147
|
+
# Or save an image
|
148
|
+
# page.save_image("highlighted.png")
|
149
|
+
```
|
150
|
+
|
151
|
+
[See more visualization options →](visual-debugging/index.ipynb)
|
152
|
+
|
153
|
+
## Documentation Topics
|
154
|
+
|
155
|
+
Choose what you want to learn about:
|
156
|
+
|
157
|
+
### Task-based Guides
|
158
|
+
- [Getting Started](installation/index.md): Install the library and run your first extraction
|
159
|
+
- [PDF Navigation](pdf-navigation/index.ipynb): Open PDFs and work with pages
|
160
|
+
- [Element Selection](element-selection/index.ipynb): Find text and other elements using selectors
|
161
|
+
- [Text Extraction](text-extraction/index.ipynb): Extract clean text from documents
|
162
|
+
- [Regions](regions/index.ipynb): Work with specific areas of a page
|
163
|
+
- [Visual Debugging](visual-debugging/index.ipynb): See what you're extracting
|
164
|
+
- [OCR](ocr/index.md): Extract text from scanned documents
|
165
|
+
- [Layout Analysis](layout-analysis/index.ipynb): Detect document structure
|
166
|
+
- [Tables](tables/index.ipynb): Extract tabular data
|
167
|
+
- [Document QA](document-qa/index.ipynb): Ask questions about your documents
|
168
|
+
|
169
|
+
### Reference
|
170
|
+
- [API Reference](api/index.md): Complete library reference
|