natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,188 @@
1
+ # OCR Integration for Scanned Documents
2
+
3
+ Optical Character Recognition (OCR) allows you to extract text from scanned documents where the text isn't embedded in the PDF. This tutorial demonstrates how to work with scanned documents.
4
+
5
+ ```python
6
+ #%pip install "natural-pdf[all]"
7
+ ```
8
+
9
+ ```python
10
+ from natural_pdf import PDF
11
+
12
+ # Load a PDF
13
+ pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-ocr.pdf")
14
+ page = pdf.pages[0]
15
+
16
+ # Try extracting text without OCR
17
+ text_without_ocr = page.extract_text()
18
+ f"Without OCR: {len(text_without_ocr)} characters extracted"
19
+ ```
20
+
21
+ ## Enabling OCR
22
+
23
+ ```python
24
+ # Enable OCR for text extraction
25
+ page.use_ocr = True
26
+
27
+ # Extract text with OCR enabled
28
+ text_with_ocr = page.extract_text()
29
+
30
+ # Preview the extracted text
31
+ text_with_ocr[:200] + "..." if len(text_with_ocr) > 200 else text_with_ocr
32
+ ```
33
+
34
+ ## Finding Text Elements with OCR
35
+
36
+ ```python
37
+ # Convert text-as-image to text elements
38
+ page.apply_ocr()
39
+
40
+ # Select all text pieces on the page
41
+ text_elements = page.find_all('text')
42
+ f"Found {len(text_elements)} text elements"
43
+
44
+ # Visualize the elements
45
+ text_elements.highlight()
46
+ ```
47
+
48
+ ## OCR Configuration Options
49
+
50
+ ```python
51
+ # Set OCR configuration for better results
52
+ page.ocr_config = {
53
+ 'language': 'eng', # English
54
+ 'dpi': 300, # Higher resolution
55
+ }
56
+
57
+ # Extract text with the improved configuration
58
+ improved_text = page.extract_text()
59
+
60
+ # Preview the text
61
+ improved_text[:200] + "..." if len(improved_text) > 200 else improved_text
62
+ ```
63
+
64
+ ## Working with Multi-language Documents
65
+
66
+ ```python
67
+ # Configure for multiple languages
68
+ page.ocr_config = {
69
+ 'language': 'eng+fra+deu', # English, French, German
70
+ 'dpi': 300
71
+ }
72
+
73
+ # Extract text with multi-language support
74
+ multilang_text = page.extract_text()
75
+ multilang_text[:200]
76
+ ```
77
+
78
+ ## Extracting Tables from Scanned Documents
79
+
80
+ ```python
81
+ # Enable OCR and analyze the document layout
82
+ page.use_ocr = True
83
+ page.analyze_layout()
84
+
85
+ # Find table regions
86
+ table_regions = page.find_all('region[type=table]')
87
+
88
+ # Visualize any detected tables
89
+ table_regions.highlight()
90
+
91
+ # Extract the first table if found
92
+ if table_regions:
93
+ table_data = table_regions[0].extract_table()
94
+ table_data
95
+ else:
96
+ "No tables found in the document"
97
+ ```
98
+
99
+ ## Finding Form Fields in Scanned Documents
100
+
101
+ ```python
102
+ # Look for potential form labels (containing a colon)
103
+ labels = page.find_all('text:contains(":")')
104
+
105
+ # Visualize the labels
106
+ labels.highlight()
107
+
108
+ # Extract form data by looking to the right of each label
109
+ form_data = {}
110
+ for label in labels:
111
+ # Clean the label text
112
+ field_name = label.text.strip().rstrip(':')
113
+
114
+ # Find the value to the right
115
+ value_element = label.right(width=200)
116
+ value = value_element.extract_text().strip()
117
+
118
+ # Add to our dictionary
119
+ form_data[field_name] = value
120
+
121
+ # Display the extracted data
122
+ form_data
123
+ ```
124
+
125
+ ## Combining OCR with Layout Analysis
126
+
127
+ ```python
128
+ # Enable OCR and analyze layout
129
+ page.use_ocr = True
130
+ page.analyze_layout()
131
+
132
+ # Find document structure elements
133
+ headings = page.find_all('region[type=heading]')
134
+ paragraphs = page.find_all('region[type=paragraph]')
135
+
136
+ # Visualize the structure
137
+ headings.highlight(color="red", label="Headings")
138
+ paragraphs.highlight(color="blue", label="Paragraphs")
139
+
140
+ # Create a simple document outline
141
+ document_outline = []
142
+ for heading in headings:
143
+ heading_text = heading.extract_text()
144
+ document_outline.append(heading_text)
145
+
146
+ document_outline
147
+ ```
148
+
149
+ ## Working with Multiple Pages
150
+
151
+ ```python
152
+ # Process all pages in the document
153
+ all_text = []
154
+
155
+ for i, page in enumerate(pdf.pages):
156
+ # Enable OCR for each page
157
+ page.use_ocr = True
158
+
159
+ # Extract text
160
+ page_text = page.extract_text()
161
+
162
+ # Add to our collection with page number
163
+ all_text.append(f"Page {i+1}: {page_text[:100]}...")
164
+
165
+ # Show the 100-character preview collected for each page
166
+ all_text
167
+ ```
168
+
169
+ ## Saving PDFs with Searchable Text
170
+
171
+ After applying OCR to a PDF, you can save a new version of the PDF where the recognized text is embedded as an invisible layer. This makes the text searchable and copyable in standard PDF viewers.
172
+
173
+ Use the `save_searchable()` method on the `PDF` object:
174
+
175
+ ```python
176
+ from natural_pdf import PDF
177
+
178
+ input_pdf_path = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-ocr.pdf"
179
+
180
+ pdf = PDF(input_pdf_path)
181
+ pdf.apply_ocr()
182
+
183
+ pdf.save_searchable("needs-ocr-searchable.pdf")
184
+ ```
185
+
186
+ This creates `needs-ocr-searchable.pdf`, which looks identical to the original but now has a text layer corresponding to the OCR results. You can adjust the rendering resolution used during saving with the `dpi` parameter (default is 300).
187
+
188
+ OCR integration enables you to work with scanned documents, historical archives, and image-based PDFs that don't have embedded text. By combining OCR with natural-pdf's layout analysis capabilities, you can turn any document into structured, searchable data.