natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -5,33 +5,37 @@ Module for exporting PDF content to various formats.
5
5
  import logging
6
6
  import os
7
7
  import tempfile
8
- from typing import TYPE_CHECKING, List, Dict, Any, Tuple
8
+ import xml.etree.ElementTree as ET
9
+ from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
10
+ from xml.etree.ElementTree import Element as ETElement
11
+ from xml.etree.ElementTree import SubElement
9
12
 
10
13
  # Lazy imports for optional dependencies
11
14
  try:
12
15
  from PIL import Image
13
16
  except ImportError:
14
- Image = None # type: ignore
17
+ Image = None # type: ignore
15
18
 
16
19
  try:
17
20
  import pikepdf
18
21
  except ImportError:
19
- pikepdf = None # type: ignore
22
+ pikepdf = None # type: ignore
20
23
 
21
24
  try:
22
25
  from ocrmypdf.hocrtransform import HocrTransform
23
26
  except ImportError:
24
- HocrTransform = None # type: ignore
27
+ HocrTransform = None # type: ignore
25
28
 
26
29
  if TYPE_CHECKING:
27
- from natural_pdf.core.pdf import PDF
28
30
  from natural_pdf.core.page import Page
31
+ from natural_pdf.core.pdf import PDF
32
+ from natural_pdf.elements.collections import PageCollection
29
33
 
30
34
 
31
35
  logger = logging.getLogger(__name__)
32
36
 
33
37
  # --- Constants ---
34
- HOCR_TEMPLATE_HEADER = '''<?xml version="1.0" encoding="UTF-8"?>
38
+ HOCR_TEMPLATE_HEADER = """<?xml version="1.0" encoding="UTF-8"?>
35
39
  <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
36
40
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
37
41
  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
@@ -42,27 +46,27 @@ HOCR_TEMPLATE_HEADER = '''<?xml version="1.0" encoding="UTF-8"?>
42
46
  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
43
47
  </head>
44
48
  <body>
45
- '''
49
+ """
46
50
 
47
- HOCR_TEMPLATE_PAGE = ''' <div class='ocr_page' id='page_{page_num}' title='image "{image_path}"; bbox 0 0 {width} {height}; ppageno {page_num}'>
48
- '''
51
+ HOCR_TEMPLATE_PAGE = """ <div class='ocr_page' id='page_{page_num}' title='image "{image_path}"; bbox 0 0 {width} {height}; ppageno {page_num}'>
52
+ """
49
53
 
50
- HOCR_TEMPLATE_WORD = ''' <span class='ocrx_word' id='word_{page_num}_{word_id}' title='bbox {x0} {y0} {x1} {y1}; x_wconf {confidence}'>{text}</span>
51
- '''
54
+ HOCR_TEMPLATE_WORD = """ <span class='ocrx_word' id='word_{page_num}_{word_id}' title='bbox {x0} {y0} {x1} {y1}; x_wconf {confidence}'>{text}</span>
55
+ """
52
56
 
53
- HOCR_TEMPLATE_LINE_START = ''' <span class='ocr_line' id='line_{page_num}_{line_id}' title='bbox {x0} {y0} {x1} {y1}'>
54
- '''
55
- HOCR_TEMPLATE_LINE_END = ''' </span>
56
- '''
57
+ HOCR_TEMPLATE_LINE_START = """ <span class='ocr_line' id='line_{page_num}_{line_id}' title='bbox {x0} {y0} {x1} {y1}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0'>
58
+ """
59
+ HOCR_TEMPLATE_LINE_END = """ </span>
60
+ """
57
61
 
58
- HOCR_TEMPLATE_FOOTER = ''' </div>
62
+ HOCR_TEMPLATE_FOOTER = """ </div>
59
63
  </body>
60
64
  </html>
61
- '''
65
+ """
62
66
  # --- End Constants ---
63
67
 
64
68
 
65
- def _generate_hocr_for_page(page: 'Page', image_width: int, image_height: int) -> str:
69
+ def _generate_hocr_for_page(page: "Page", image_width: int, image_height: int) -> str:
66
70
  """
67
71
  Generates an hOCR string for a given Page object based on its OCR elements.
68
72
 
@@ -79,96 +83,240 @@ def _generate_hocr_for_page(page: 'Page', image_width: int, image_height: int) -
79
83
  """
80
84
  # Attempt to get OCR elements (words) using find_all with selector
81
85
  # Use find_all which returns an ElementCollection
82
- ocr_elements_collection = page.find_all('text[source=ocr]')
83
- ocr_elements = ocr_elements_collection.elements # Get the list of elements
86
+ ocr_elements_collection = page.find_all("text[source=ocr]")
87
+ ocr_elements = ocr_elements_collection.elements # Get the list of elements
84
88
 
85
89
  if not ocr_elements:
86
- logger.warning(f"Page {page.number} has no OCR elements (text[source=ocr]) to generate hOCR from.")
90
+ logger.warning(
91
+ f"Page {page.number} has no OCR elements (text[source=ocr]) to generate hOCR from."
92
+ )
87
93
  # Return minimal valid hOCR for an empty page
88
94
  hocr_content = HOCR_TEMPLATE_HEADER
89
- hocr_content += HOCR_TEMPLATE_PAGE.format(page_num=page.index, image_path="", width=image_width, height=image_height)
95
+ hocr_content += HOCR_TEMPLATE_PAGE.format(
96
+ page_num=page.index, image_path="", width=image_width, height=image_height
97
+ )
90
98
  hocr_content += HOCR_TEMPLATE_FOOTER
91
99
  return hocr_content
92
100
 
93
-
94
- # --- TODO: Implement logic to group words into lines if necessary ---
95
- # For now, just output words directly. A more advanced implementation
96
- # might group words geometrically into lines first.
97
- # Example (simple, assuming elements are somewhat sorted):
98
- # lines = []
99
- # current_line = []
100
- # last_y = -1
101
- # for word in ocr_elements:
102
- # if not current_line or abs(word.y0 - last_y) < threshold: # Simple Y-based grouping
103
- # current_line.append(word)
104
- # last_y = word.y0
105
- # else:
106
- # lines.append(current_line)
107
- # current_line = [word]
108
- # last_y = word.y0
109
- # if current_line:
110
- # lines.append(current_line)
111
- # --- End Line Grouping Placeholder ---
112
-
113
-
114
- hocr_content = HOCR_TEMPLATE_HEADER
115
- hocr_content += HOCR_TEMPLATE_PAGE.format(page_num=page.index, image_path="", width=image_width, height=image_height) # image_path is often unused
116
-
117
- # Scale factors from PDF points (page dims) to image pixels (rendered image dims)
118
- # Note: Assumes OCR element coordinates are in PDF points (page.width/height)
101
+ # --- Start Line Grouping Logic ---
102
+ logger.debug(f"Page {page.index}: Grouping {len(ocr_elements)} words into lines.")
103
+ ocr_elements.sort(key=lambda el: (el.bbox[1], el.bbox[0]))
104
+ lines = []
105
+ current_line = []
106
+ if ocr_elements:
107
+ current_line.append(ocr_elements[0])
108
+ for i in range(1, len(ocr_elements)):
109
+ current_word = ocr_elements[i]
110
+ last_word = current_line[-1]
111
+ last_word_y0, last_word_y1 = last_word.bbox[1], last_word.bbox[3]
112
+ current_word_y0, current_word_y1 = current_word.bbox[1], current_word.bbox[3]
113
+ last_word_center_y = (last_word_y0 + last_word_y1) / 2
114
+ current_word_center_y = (current_word_y0 + current_word_y1) / 2
115
+ last_word_height = last_word_y1 - last_word_y0
116
+ current_word_height = current_word_y1 - current_word_y0
117
+ avg_height = (last_word_height + current_word_height) / 2
118
+ if avg_height <= 0:
119
+ avg_height = 1
120
+ tolerance_factor = 0.7
121
+ threshold = avg_height * tolerance_factor
122
+ delta_y = abs(current_word_center_y - last_word_center_y)
123
+ # if delta_y < threshold:
124
+ # current_line.append(current_word)
125
+ # else:
126
+ lines.append(current_line)
127
+ current_line = [current_word]
128
+ if current_line:
129
+ lines.append(current_line)
130
+ logger.debug(f"Page {page.index}: Grouped into {len(lines)} lines.")
131
+ # --- End Line Grouping Logic ---
132
+
133
+ # --- Start ElementTree hOCR Generation ---
119
134
  scale_x = image_width / page.width if page.width > 0 else 1
120
135
  scale_y = image_height / page.height if page.height > 0 else 1
121
136
 
137
+ # Create root element
138
+ page_hocr = ETElement(
139
+ "html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": "en"}
140
+ )
141
+
142
+ # Head
143
+ head = SubElement(page_hocr, "head")
144
+ SubElement(head, "title").text = ""
145
+ SubElement(
146
+ head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html;charset=utf-8"}
147
+ )
148
+ SubElement(head, "meta", attrib={"name": "ocr-system", "content": "natural-pdf"})
149
+ SubElement(
150
+ head,
151
+ "meta",
152
+ attrib={
153
+ "name": "ocr-capabilities",
154
+ "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word",
155
+ },
156
+ )
157
+
158
+ # Body and Page
159
+ body = SubElement(page_hocr, "body")
160
+ page_div = SubElement(
161
+ body,
162
+ "div",
163
+ attrib={
164
+ "class": "ocr_page",
165
+ "id": f"page_{page.index}",
166
+ "title": f"image; bbox 0 0 {image_width} {image_height}; ppageno {page.index}",
167
+ },
168
+ )
169
+
170
+ # Calculate overall bbox for carea/par (image coords)
171
+ min_area_x0, min_area_y0 = image_width, image_height
172
+ max_area_x1, max_area_y1 = 0, 0
173
+ if lines:
174
+ for line_words in lines:
175
+ for word in line_words:
176
+ (x0, y0, x1, y1) = word.bbox
177
+ img_x0 = int(x0 * scale_x)
178
+ img_y0 = int(y0 * scale_y)
179
+ img_x1 = int(x1 * scale_x)
180
+ img_y1 = int(y1 * scale_y)
181
+ min_area_x0 = min(min_area_x0, img_x0)
182
+ min_area_y0 = min(min_area_y0, img_y0)
183
+ max_area_x1 = max(max_area_x1, img_x1)
184
+ max_area_y1 = max(max_area_y1, img_y1)
185
+ area_img_x0, area_img_y0 = max(0, min_area_x0), max(0, min_area_y0)
186
+ area_img_x1, area_img_y1 = min(image_width, max_area_x1), min(image_height, max_area_y1)
187
+ if area_img_x0 >= area_img_x1 or area_img_y0 >= area_img_y1:
188
+ area_img_x0, area_img_y0, area_img_x1, area_img_y1 = 0, 0, image_width, image_height
189
+ else:
190
+ area_img_x0, area_img_y0, area_img_x1, area_img_y1 = 0, 0, image_width, image_height
191
+
192
+ # Add Carea and Par wrappers (assuming one block/paragraph per page for simplicity)
193
+ block_div = SubElement(
194
+ page_div, # Attach to page_div now
195
+ "div",
196
+ attrib={
197
+ "class": "ocr_carea",
198
+ "id": "block_0_1", # Simple ID
199
+ "title": f"bbox {area_img_x0} {area_img_y0} {area_img_x1} {area_img_y1}",
200
+ },
201
+ )
202
+ par_div = SubElement(
203
+ block_div,
204
+ "p",
205
+ attrib={
206
+ "class": "ocr_par",
207
+ "id": "par_0_1", # Simple ID
208
+ "title": f"bbox {area_img_x0} {area_img_y0} {area_img_x1} {area_img_y1}",
209
+ },
210
+ )
211
+
212
+ # Loop through lines and words
122
213
  word_id_counter = 0
123
- for word in ocr_elements:
124
- # Scale coordinates to image dimensions
125
- img_x0 = int(word.x0 * scale_x)
126
- img_y0 = int(word.y0 * scale_y)
127
- img_x1 = int(word.x1 * scale_x)
128
- img_y1 = int(word.y1 * scale_y)
129
-
130
- # Ensure coordinates are within image bounds
131
- img_x0 = max(0, img_x0)
132
- img_y0 = max(0, img_y0)
133
- img_x1 = min(image_width, img_x1)
134
- img_y1 = min(image_height, img_y1)
135
-
136
- # Basic escaping for XML - might need more robust escaping
137
- text = word.text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
138
-
139
- # Confidence (assuming it exists, default to 99 if not)
140
- confidence = getattr(word, 'confidence', 0.99) * 100 # hOCR often uses 0-100
141
-
142
- hocr_content += HOCR_TEMPLATE_WORD.format(
143
- page_num=page.index,
144
- word_id=word_id_counter,
145
- x0=img_x0,
146
- y0=img_y0,
147
- x1=img_x1,
148
- y1=img_y1,
149
- confidence=int(confidence),
150
- text=text
214
+ line_id_counter = 0
215
+ for current_line_words in lines:
216
+ if not current_line_words:
217
+ continue
218
+
219
+ # Sort words in line by x0
220
+ current_line_words.sort(key=lambda el: el.bbox[0])
221
+
222
+ # Calculate line bbox (image coords)
223
+ min_line_x0, min_line_y0 = image_width, image_height
224
+ max_line_x1, max_line_y1 = 0, 0
225
+ for word in current_line_words:
226
+ (x0, y0, x1, y1) = word.bbox
227
+ img_x0, img_y0 = int(x0 * scale_x), int(y0 * scale_y)
228
+ img_x1, img_y1 = int(x1 * scale_x), int(y1 * scale_y)
229
+ min_line_x0, min_line_y0 = min(min_line_x0, img_x0), min(min_line_y0, img_y0)
230
+ max_line_x1, max_line_y1 = max(max_line_x1, img_x1), max(max_line_y1, img_y1)
231
+
232
+ line_img_x0, line_img_y0 = max(0, min_line_x0), max(0, min_line_y0)
233
+ line_img_x1, line_img_y1 = min(image_width, max_line_x1), min(image_height, max_line_y1)
234
+ if line_img_x0 >= line_img_x1 or line_img_y0 >= line_img_y1:
235
+ line_img_x0, line_img_y0, line_img_x1, line_img_y1 = 0, 0, 1, 1
236
+
237
+ # Create ocr_line span
238
+ line_span = SubElement(
239
+ par_div, # Attach line to paragraph
240
+ "span",
241
+ attrib={
242
+ "class": "ocr_line",
243
+ "id": f"line_{page.index}_{line_id_counter}",
244
+ "title": f"bbox {line_img_x0} {line_img_y0} {line_img_x1} {line_img_y1}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0",
245
+ },
151
246
  )
152
- word_id_counter += 1
153
- hocr_content += "\n" # Add newline for readability
154
247
 
248
+ # Add words to line
249
+ for word in current_line_words:
250
+ (x0, y0, x1, y1) = word.bbox
251
+ img_x0, img_y0 = int(x0 * scale_x), int(y0 * scale_y)
252
+ img_x1, img_y1 = int(x1 * scale_x), int(y1 * scale_y)
253
+
254
+ img_x0, img_y0 = max(0, img_x0), max(0, img_y0)
255
+ img_x1, img_y1 = min(image_width, img_x1), min(image_height, img_y1)
256
+ if img_x1 <= img_x0:
257
+ img_x1 = img_x0 + 1
258
+ if img_y1 <= img_y0:
259
+ img_y1 = img_y0 + 1
260
+
261
+ # --- Strip whitespace and check if word is empty --- #
262
+ text = word.text.strip().replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
263
+ if not text:
264
+ continue # Skip adding this word if it becomes empty after stripping
265
+ # --- End strip ---
266
+ confidence = getattr(word, "confidence", 1.00)
267
+
268
+ word_span = SubElement(
269
+ line_span, # Attach word to line
270
+ "span",
271
+ attrib={
272
+ "class": "ocrx_word",
273
+ "id": f"word_{page.index}_{word_id_counter}",
274
+ "title": f"bbox {img_x0} {img_y0} {img_x1} {img_y1}; x_wconf {confidence}",
275
+ },
276
+ )
277
+ word_span.text = text
278
+ word_id_counter += 1
279
+ line_id_counter += 1
280
+
281
+ # Convert ElementTree to string
282
+ # xml_declaration = '<?xml version="1.0" encoding="UTF-8"?>\n' # No longer needed
283
+ # doctype_declaration = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
284
+ # "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n''' # No longer needed
285
+ # ET.indent(page_hocr) # Optional: for pretty printing, requires Python 3.9+
286
+ # Need bytes for writing, then decode for HocrTransform if it needs str
287
+ # Let's stick to unicode string output for now, as the file write expects it.
288
+ hocr_content = ET.tostring(
289
+ page_hocr, encoding="unicode", method="xml"
290
+ ) # Revert back to method='xml'
291
+ # hocr_content = xml_declaration + doctype_declaration + hocr_string_content # Removed string addition
292
+ # --- End ElementTree hOCR Generation ---
293
+
294
+ # --- Add code to save hOCR output for inspection ---
295
+ try:
296
+ hocr_output_path = "natural_pdf_hocr_output.hocr"
297
+ with open(hocr_output_path, "w", encoding="utf-8") as f_out:
298
+ f_out.write(hocr_content)
299
+ logger.info(f"Saved hOCR content for page {page.index} to: {hocr_output_path}")
300
+ except Exception as e:
301
+ logger.error(f"Failed to save hOCR output to file: {e}")
302
+ # --- End save hOCR ---
155
303
 
156
- hocr_content += HOCR_TEMPLATE_FOOTER
157
304
  return hocr_content
158
305
 
159
306
 
160
- def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
307
+ def create_searchable_pdf(
308
+ source: Union["Page", "PageCollection", "PDF"], output_path: str, dpi: int = 300
309
+ ):
161
310
  """
162
311
  Creates a searchable PDF from a natural_pdf.PDF object using OCR results.
163
312
 
164
313
  Relies on ocrmypdf for hOCR transformation. Requires optional dependencies.
165
314
 
166
315
  Args:
167
- pdf_object: The natural_pdf.PDF instance (OCR should have been run).
316
+ source: The natural_pdf.PDF, PageCollection, or Page object
168
317
  output_path: The path to save the resulting searchable PDF.
169
318
  dpi: The resolution (dots per inch) for rendering page images and hOCR.
170
319
  """
171
- # _check_dependencies() # Removed check
172
320
 
173
321
  # --- Ensure dependencies are loaded (they should be if installed) ---
174
322
  if Image is None or pikepdf is None or HocrTransform is None:
@@ -180,7 +328,13 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
180
328
  )
181
329
  # --- End Safeguard Check ---
182
330
 
183
- logger.info(f"Starting searchable PDF creation for '{pdf_object.source_path}' -> '{output_path}' at {dpi} DPI.")
331
+ # duck type to see if source has .pages, to populate pages =
332
+ if hasattr(source, "pages"):
333
+ pages = source.pages
334
+ else:
335
+ pages = [source]
336
+
337
+ logger.info(f"Starting searchable PDF creation '{output_path}' at {dpi} DPI.")
184
338
 
185
339
  temp_pdf_pages: List[str] = []
186
340
  output_abs_path = os.path.abspath(output_path)
@@ -188,10 +342,12 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
188
342
  with tempfile.TemporaryDirectory() as tmpdir:
189
343
  logger.debug(f"Using temporary directory: {tmpdir}")
190
344
 
191
- for i, page in enumerate(pdf_object.pages):
192
- logger.debug(f"Processing page {page.number} (index {i})...")
345
+ for i, page in enumerate(pages):
346
+ logger.debug(f"Processing page {i+1} of {len(pages)}...")
193
347
  page_base_name = f"page_{i}"
194
- img_path = os.path.join(tmpdir, f"{page_base_name}.png") # Use PNG for potentially better quality
348
+ img_path = os.path.join(
349
+ tmpdir, f"{page_base_name}.png"
350
+ ) # Use PNG for potentially better quality
195
351
  hocr_path = os.path.join(tmpdir, f"{page_base_name}.hocr")
196
352
  pdf_page_path = os.path.join(tmpdir, f"{page_base_name}.pdf")
197
353
 
@@ -200,18 +356,17 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
200
356
  logger.debug(f" Rendering page {i} to image ({dpi} DPI)...")
201
357
  # Use the Page's to_image method
202
358
  pil_image = page.to_image(resolution=dpi, include_highlights=False)
203
- pil_image.save(img_path, format='PNG')
359
+ pil_image.save(img_path, format="PNG")
204
360
  img_width, img_height = pil_image.size
205
361
  logger.debug(f" Image saved to {img_path} ({img_width}x{img_height})")
206
362
 
207
363
  # 2. Generate hOCR
208
364
  logger.debug(f" Generating hOCR...")
209
365
  hocr_content = _generate_hocr_for_page(page, img_width, img_height)
210
- with open(hocr_path, 'w', encoding='utf-8') as f:
366
+ with open(hocr_path, "w", encoding="utf-8") as f:
211
367
  f.write(hocr_content)
212
368
  logger.debug(f" hOCR saved to {hocr_path}")
213
369
 
214
-
215
370
  # 3. Use HocrTransform to create searchable PDF page
216
371
  logger.debug(f" Running HocrTransform...")
217
372
  hocr_transform = HocrTransform(hocr_filename=hocr_path, dpi=dpi)
@@ -221,11 +376,11 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
221
376
  logger.debug(f" Temporary PDF page saved to {pdf_page_path}")
222
377
 
223
378
  except Exception as e:
224
- logger.error(f" Failed to process page {page.number}: {e}", exc_info=True)
225
- # Decide whether to skip or raise error
226
- # For now, let's skip and continue
227
- logger.warning(f" Skipping page {page.number} due to error.")
228
- continue # Skip to the next page
379
+ logger.error(f" Failed to process page {page.number}: {e}", exc_info=True)
380
+ # Decide whether to skip or raise error
381
+ # For now, let's skip and continue
382
+ logger.warning(f" Skipping page {page.number} due to error.")
383
+ continue # Skip to the next page
229
384
 
230
385
  # 4. Merge temporary PDF pages
231
386
  if not temp_pdf_pages:
@@ -237,16 +392,20 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
237
392
  # Use pikepdf for merging
238
393
  output_pdf = pikepdf.Pdf.new()
239
394
  for temp_pdf_path in temp_pdf_pages:
240
- with pikepdf.Pdf.open(temp_pdf_path) as src_page_pdf:
241
- # Assuming each temp PDF has exactly one page
242
- if len(src_page_pdf.pages) == 1:
243
- output_pdf.pages.append(src_page_pdf.pages[0])
244
- else:
245
- logger.warning(f"Temporary PDF '{temp_pdf_path}' had unexpected number of pages ({len(src_page_pdf.pages)}). Skipping.")
395
+ with pikepdf.Pdf.open(temp_pdf_path) as src_page_pdf:
396
+ # Assuming each temp PDF has exactly one page
397
+ if len(src_page_pdf.pages) == 1:
398
+ output_pdf.pages.append(src_page_pdf.pages[0])
399
+ else:
400
+ logger.warning(
401
+ f"Temporary PDF '{temp_pdf_path}' had unexpected number of pages ({len(src_page_pdf.pages)}). Skipping."
402
+ )
246
403
  output_pdf.save(output_abs_path)
247
404
  logger.info(f"Successfully saved merged searchable PDF to: {output_abs_path}")
248
405
  except Exception as e:
249
- logger.error(f"Failed to merge temporary PDFs into '{output_abs_path}': {e}", exc_info=True)
406
+ logger.error(
407
+ f"Failed to merge temporary PDFs into '{output_abs_path}': {e}", exc_info=True
408
+ )
250
409
  raise RuntimeError(f"Failed to save final PDF: {e}") from e
251
410
 
252
- logger.debug("Temporary directory cleaned up.")
411
+ logger.debug("Temporary directory cleaned up.")
@@ -3,54 +3,76 @@ OCR engines for natural-pdf.
3
3
 
4
4
  This module provides different OCR engines that can be used with natural-pdf.
5
5
  """
6
+
6
7
  import logging
7
8
 
8
9
  # Set up module logger
9
10
  logger = logging.getLogger("natural_pdf.ocr")
10
- from .ocr_manager import OCRManager
11
- from .engine import OCREngine
12
- from .ocr_options import OCROptions
13
- from .engine import OCREngine
14
- from .engine_paddle import PaddleOCREngine
15
- from .engine_surya import SuryaOCREngine
16
11
 
17
- __all__ = ['OCRManager', 'OCREngine', 'OCROptions', 'EasyOCREngine', 'PaddleOCREngine', 'SuryaOCREngine']
12
+ # Import the base classes that are always available
13
+ from .engine import OCREngine
14
+ from .ocr_options import OCROptions, BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions
15
+ from .ocr_manager import OCRManager
16
+ from .ocr_factory import OCRFactory
18
17
 
19
- DEFAULT_ENGINE = SuryaOCREngine
18
+ # Add all public symbols that should be available when importing this module
19
+ __all__ = [
20
+ "OCRManager",
21
+ "OCREngine",
22
+ "OCROptions",
23
+ "BaseOCROptions",
24
+ "EasyOCROptions",
25
+ "PaddleOCROptions",
26
+ "SuryaOCROptions",
27
+ "OCRFactory",
28
+ "get_engine",
29
+ "list_available_engines"
30
+ ]
20
31
 
21
32
  def get_engine(engine_name=None, **kwargs):
22
33
  """
23
- Get OCR engine by name.
24
-
34
+ Get OCR engine by name with graceful handling of missing dependencies.
35
+
25
36
  Args:
26
- engine_name: Name of the engine to use ('easyocr', 'paddleocr', etc.)
27
- If None, the default engine is used (PaddleOCR if available, otherwise EasyOCR)
37
+ engine_name: Name of the engine to use ('easyocr', 'paddle', 'surya')
38
+ If None, the best available engine is used
28
39
  **kwargs: Additional arguments to pass to the engine constructor
29
-
40
+
30
41
  Returns:
31
42
  OCREngine instance
43
+
44
+ Raises:
45
+ ImportError: If the requested engine's dependencies aren't installed
46
+ ValueError: If the engine_name is unknown
32
47
  """
33
- logger.debug(f"Initializing OCR engine: {engine_name or 'default'}")
34
-
35
- if engine_name is None or engine_name == 'default':
36
- engine = DEFAULT_ENGINE(**kwargs)
37
- logger.info(f"Using default OCR engine: {engine.__class__.__name__}")
38
- return engine
39
-
40
- if engine_name.lower() == 'easyocr':
41
- logger.info("Initializing EasyOCR engine")
42
- return EasyOCREngine(**kwargs)
48
+ logger.debug(f"Initializing OCR engine: {engine_name or 'best available'}")
43
49
 
44
- if engine_name.lower() == 'paddleocr':
45
- try:
46
- from .engine_paddle import PaddleOCREngine
47
- logger.info("Initializing PaddleOCR engine")
48
- return PaddleOCREngine(**kwargs)
49
- except ImportError:
50
- logger.error("PaddleOCR is not installed")
51
- raise ImportError(
52
- "PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr"
53
- )
50
+ try:
51
+ if engine_name is None or engine_name == "default":
52
+ # Use the factory to get the best available engine
53
+ engine = OCRFactory.get_recommended_engine(**kwargs)
54
+ logger.info(f"Using recommended OCR engine: {engine.__class__.__name__}")
55
+ return engine
56
+
57
+ # Use the factory to create a specific engine
58
+ normalized_name = engine_name.lower()
59
+ if normalized_name in ["easyocr", "paddle", "surya"]:
60
+ return OCRFactory.create_engine(normalized_name, **kwargs)
61
+ else:
62
+ raise ValueError(f"Unknown OCR engine: {engine_name}")
63
+
64
+ except ImportError as e:
65
+ logger.error(f"OCR engine dependency error: {e}")
66
+ raise
67
+ except Exception as e:
68
+ logger.error(f"Error initializing OCR engine: {e}")
69
+ raise
70
+
71
+ def list_available_engines():
72
+ """
73
+ List all available OCR engines.
54
74
 
55
- logger.error(f"Unknown OCR engine: {engine_name}")
56
- raise ValueError(f"Unknown OCR engine: {engine_name}")
75
+ Returns:
76
+ Dict[str, bool]: Dictionary mapping engine names to availability status
77
+ """
78
+ return OCRFactory.list_available_engines()