natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +241 -158
  13. natural_pdf/classification/mixin.py +52 -38
  14. natural_pdf/classification/results.py +71 -45
  15. natural_pdf/collections/mixins.py +85 -20
  16. natural_pdf/collections/pdf_collection.py +245 -100
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +694 -195
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +610 -134
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.10.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
@@ -1,137 +0,0 @@
1
- # OCR Integration for Scanned Documents
2
-
3
- Optical Character Recognition (OCR) allows you to extract text from scanned documents where the text isn't embedded in the PDF. This tutorial demonstrates how to work with scanned documents.
4
-
5
- ```python
6
- #%pip install "natural-pdf[all]"
7
- ```
8
-
9
- ```python
10
- from natural_pdf import PDF
11
-
12
- # Load a PDF
13
- pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-ocr.pdf")
14
- page = pdf.pages[0]
15
-
16
- # Try extracting text without OCR
17
- text_without_ocr = page.extract_text()
18
- f"Without OCR: {len(text_without_ocr)} characters extracted"
19
- ```
20
-
21
- ## Applying OCR and Finding Elements
22
-
23
- The core method is `page.apply_ocr()`. This runs the OCR process and adds `TextElement` objects to the page. You can specify the engine and languages.
24
-
25
- **Note:** Re-applying OCR to the same page or region will automatically remove any previously generated OCR elements for that area before adding the new ones.
26
-
27
- ```python
28
- # Apply OCR using the default engine (EasyOCR) for English
29
- page.apply_ocr(languages=['en'])
30
-
31
- # Select all text pieces found by OCR
32
- text_elements = page.find_all('text[source=ocr]')
33
- print(f"Found {len(text_elements)} text elements using default OCR")
34
-
35
- # Visualize the elements
36
- text_elements.highlight()
37
-
38
- # Apply OCR using PaddleOCR for English and Chinese
39
- page.apply_ocr(engine='paddle', languages=['en', 'ch_sim'])
40
-
41
- # Apply OCR using SuryaOCR for English and German
42
- page.apply_ocr(engine='surya', languages=['en', 'de'])
43
-
44
- text_with_ocr = page.extract_text()
45
- print(f"\nExtracted text after OCR:\n{text_with_ocr[:150]}...")
46
- ```
47
-
48
- ## Advanced OCR Configuration
49
-
50
- For more control, import the `Options` class for your chosen engine, configure it, and pass the instance to `apply_ocr` through the `options` argument.
51
-
52
- ```python
53
- from natural_pdf.ocr import PaddleOCROptions, EasyOCROptions, SuryaOCROptions
54
-
55
- # Re-apply OCR using EasyOCR with specific options
56
- easy_opts = EasyOCROptions(
57
- paragraph=False,
58
- )
59
- page.apply_ocr(engine='easyocr', languages=['en'], min_confidence=0.1, options=easy_opts)
60
-
61
- paddle_opts = PaddleOCROptions(
62
- use_angle_cls=False,
63
- det_db_thresh=0.3,
64
- )
65
- page.apply_ocr(engine='paddle', languages=['en'], options=paddle_opts)
66
-
67
- surya_opts = SuryaOCROptions()
68
- page.apply_ocr(engine='surya', languages=['en'], min_confidence=0.5, detect_only=True, options=surya_opts)
69
- ```
70
-
71
- ## Interactive OCR Correction / Debugging
72
-
73
- If OCR results aren't perfect, you can use the bundled interactive web application (SPA) to review and correct them.
74
-
75
- 1. **Package the data:**
76
- After running `apply_ocr` (or `apply_layout`), use `create_correction_task_package` to create a zip file containing the PDF images and detected elements.
77
-
78
- ```python
79
- from natural_pdf.utils.packaging import create_correction_task_package
80
-
81
- page.apply_ocr()
82
-
83
- create_correction_task_package(pdf, "correction_package.zip", overwrite=True)
84
- ```
85
-
86
- 2. **Run the SPA:**
87
- In your terminal, navigate to the SPA directory within the installed `natural_pdf` library (`natural_pdf/templates/spa`) and start a simple web server on port 8000, for example with `python -m http.server 8000`.
88
-
89
- 3. **Use the SPA:**
90
- Open `http://localhost:8000` in your browser. Drag the `correction_package.zip` file onto the page to load the document. You can then click on text elements to correct the OCR results.
91
-
92
-
93
- ## Working with Multiple Pages
94
-
95
- Apply OCR or layout analysis to all pages using the `PDF` object.
96
-
97
- ```python
98
- # Process all pages in the document
99
-
100
- # Apply OCR to all pages (example using EasyOCR)
101
- pdf.apply_ocr(engine='easyocr', languages=['en'])
102
- print(f"Applied OCR to {len(pdf.pages)} pages.")
103
-
104
- # Or apply layout analysis to all pages (example using Paddle)
105
- # pdf.apply_layout(engine='paddle')
106
- # print(f"Applied Layout Analysis to {len(pdf.pages)} pages.")
107
-
108
- # Extract text from all pages (uses OCR results if available)
109
- all_text_content = pdf.extract_text(page_separator="\\n\\n---\\n\\n")
110
-
111
- print(f"\nCombined text from all pages:\n{all_text_content[:500]}...")
112
- ```
113
-
114
- ## Saving PDFs with Searchable Text
115
-
116
- After applying OCR to a PDF, you can save a new version of the PDF where the recognized text is embedded as an invisible layer. This makes the text searchable and copyable in standard PDF viewers.
117
-
118
- Use the `save_searchable()` method on the `PDF` object:
119
-
120
- ```python
121
- from natural_pdf import PDF
122
-
123
- input_pdf_path = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-ocr.pdf"
124
-
125
- pdf = PDF(input_pdf_path)
126
- # Apply OCR to all pages before saving
127
- # Use desired engine and options
128
- pdf.apply_ocr(engine='easyocr', languages=['en'])
129
-
130
- pdf.save_searchable("needs-ocr-searchable.pdf")
131
-
132
- print("Saved searchable PDF to needs-ocr-searchable.pdf")
133
- ```
134
-
135
- This creates `needs-ocr-searchable.pdf`, which looks identical to the original but now has a text layer corresponding to the OCR results. You can adjust the rendering resolution used during saving with the `dpi` parameter (default is 300).
136
-
137
- OCR integration enables you to work with scanned documents, historical archives, and image-based PDFs that don't have embedded text. By combining OCR with natural-pdf's layout analysis capabilities, you can turn any document into structured, searchable data.