natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -5,33 +5,37 @@ Module for exporting PDF content to various formats.
5
5
  import logging
6
6
  import os
7
7
  import tempfile
8
- from typing import TYPE_CHECKING, List, Dict, Any, Tuple
8
+ import xml.etree.ElementTree as ET
9
+ from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
10
+ from xml.etree.ElementTree import Element as ETElement
11
+ from xml.etree.ElementTree import SubElement
9
12
 
10
13
  # Lazy imports for optional dependencies
11
14
  try:
12
15
  from PIL import Image
13
16
  except ImportError:
14
- Image = None # type: ignore
17
+ Image = None # type: ignore
15
18
 
16
19
  try:
17
20
  import pikepdf
18
21
  except ImportError:
19
- pikepdf = None # type: ignore
22
+ pikepdf = None # type: ignore
20
23
 
21
24
  try:
22
25
  from ocrmypdf.hocrtransform import HocrTransform
23
26
  except ImportError:
24
- HocrTransform = None # type: ignore
27
+ HocrTransform = None # type: ignore
25
28
 
26
29
  if TYPE_CHECKING:
27
- from natural_pdf.core.pdf import PDF
28
30
  from natural_pdf.core.page import Page
31
+ from natural_pdf.core.pdf import PDF
32
+ from natural_pdf.elements.collections import PageCollection
29
33
 
30
34
 
31
35
  logger = logging.getLogger(__name__)
32
36
 
33
37
  # --- Constants ---
34
- HOCR_TEMPLATE_HEADER = '''<?xml version="1.0" encoding="UTF-8"?>
38
+ HOCR_TEMPLATE_HEADER = """<?xml version="1.0" encoding="UTF-8"?>
35
39
  <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
36
40
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
37
41
  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
@@ -42,27 +46,27 @@ HOCR_TEMPLATE_HEADER = '''<?xml version="1.0" encoding="UTF-8"?>
42
46
  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
43
47
  </head>
44
48
  <body>
45
- '''
49
+ """
46
50
 
47
- HOCR_TEMPLATE_PAGE = ''' <div class='ocr_page' id='page_{page_num}' title='image "{image_path}"; bbox 0 0 {width} {height}; ppageno {page_num}'>
48
- '''
51
+ HOCR_TEMPLATE_PAGE = """ <div class='ocr_page' id='page_{page_num}' title='image "{image_path}"; bbox 0 0 {width} {height}; ppageno {page_num}'>
52
+ """
49
53
 
50
- HOCR_TEMPLATE_WORD = ''' <span class='ocrx_word' id='word_{page_num}_{word_id}' title='bbox {x0} {y0} {x1} {y1}; x_wconf {confidence}'>{text}</span>
51
- '''
54
+ HOCR_TEMPLATE_WORD = """ <span class='ocrx_word' id='word_{page_num}_{word_id}' title='bbox {x0} {y0} {x1} {y1}; x_wconf {confidence}'>{text}</span>
55
+ """
52
56
 
53
- HOCR_TEMPLATE_LINE_START = ''' <span class='ocr_line' id='line_{page_num}_{line_id}' title='bbox {x0} {y0} {x1} {y1}'>
54
- '''
55
- HOCR_TEMPLATE_LINE_END = ''' </span>
56
- '''
57
+ HOCR_TEMPLATE_LINE_START = """ <span class='ocr_line' id='line_{page_num}_{line_id}' title='bbox {x0} {y0} {x1} {y1}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0'>
58
+ """
59
+ HOCR_TEMPLATE_LINE_END = """ </span>
60
+ """
57
61
 
58
- HOCR_TEMPLATE_FOOTER = ''' </div>
62
+ HOCR_TEMPLATE_FOOTER = """ </div>
59
63
  </body>
60
64
  </html>
61
- '''
65
+ """
62
66
  # --- End Constants ---
63
67
 
64
68
 
65
- def _generate_hocr_for_page(page: 'Page', image_width: int, image_height: int) -> str:
69
+ def _generate_hocr_for_page(page: "Page", image_width: int, image_height: int) -> str:
66
70
  """
67
71
  Generates an hOCR string for a given Page object based on its OCR elements.
68
72
 
@@ -79,96 +83,240 @@ def _generate_hocr_for_page(page: 'Page', image_width: int, image_height: int) -
79
83
  """
80
84
  # Attempt to get OCR elements (words) using find_all with selector
81
85
  # Use find_all which returns an ElementCollection
82
- ocr_elements_collection = page.find_all('text[source=ocr]')
83
- ocr_elements = ocr_elements_collection.elements # Get the list of elements
86
+ ocr_elements_collection = page.find_all("text[source=ocr]")
87
+ ocr_elements = ocr_elements_collection.elements # Get the list of elements
84
88
 
85
89
  if not ocr_elements:
86
- logger.warning(f"Page {page.number} has no OCR elements (text[source=ocr]) to generate hOCR from.")
90
+ logger.warning(
91
+ f"Page {page.number} has no OCR elements (text[source=ocr]) to generate hOCR from."
92
+ )
87
93
  # Return minimal valid hOCR for an empty page
88
94
  hocr_content = HOCR_TEMPLATE_HEADER
89
- hocr_content += HOCR_TEMPLATE_PAGE.format(page_num=page.index, image_path="", width=image_width, height=image_height)
95
+ hocr_content += HOCR_TEMPLATE_PAGE.format(
96
+ page_num=page.index, image_path="", width=image_width, height=image_height
97
+ )
90
98
  hocr_content += HOCR_TEMPLATE_FOOTER
91
99
  return hocr_content
92
100
 
93
-
94
- # --- TODO: Implement logic to group words into lines if necessary ---
95
- # For now, just output words directly. A more advanced implementation
96
- # might group words geometrically into lines first.
97
- # Example (simple, assuming elements are somewhat sorted):
98
- # lines = []
99
- # current_line = []
100
- # last_y = -1
101
- # for word in ocr_elements:
102
- # if not current_line or abs(word.y0 - last_y) < threshold: # Simple Y-based grouping
103
- # current_line.append(word)
104
- # last_y = word.y0
105
- # else:
106
- # lines.append(current_line)
107
- # current_line = [word]
108
- # last_y = word.y0
109
- # if current_line:
110
- # lines.append(current_line)
111
- # --- End Line Grouping Placeholder ---
112
-
113
-
114
- hocr_content = HOCR_TEMPLATE_HEADER
115
- hocr_content += HOCR_TEMPLATE_PAGE.format(page_num=page.index, image_path="", width=image_width, height=image_height) # image_path is often unused
116
-
117
- # Scale factors from PDF points (page dims) to image pixels (rendered image dims)
118
- # Note: Assumes OCR element coordinates are in PDF points (page.width/height)
101
+ # --- Start Line Grouping Logic ---
102
+ logger.debug(f"Page {page.index}: Grouping {len(ocr_elements)} words into lines.")
103
+ ocr_elements.sort(key=lambda el: (el.bbox[1], el.bbox[0]))
104
+ lines = []
105
+ current_line = []
106
+ if ocr_elements:
107
+ current_line.append(ocr_elements[0])
108
+ for i in range(1, len(ocr_elements)):
109
+ current_word = ocr_elements[i]
110
+ last_word = current_line[-1]
111
+ last_word_y0, last_word_y1 = last_word.bbox[1], last_word.bbox[3]
112
+ current_word_y0, current_word_y1 = current_word.bbox[1], current_word.bbox[3]
113
+ last_word_center_y = (last_word_y0 + last_word_y1) / 2
114
+ current_word_center_y = (current_word_y0 + current_word_y1) / 2
115
+ last_word_height = last_word_y1 - last_word_y0
116
+ current_word_height = current_word_y1 - current_word_y0
117
+ avg_height = (last_word_height + current_word_height) / 2
118
+ if avg_height <= 0:
119
+ avg_height = 1
120
+ tolerance_factor = 0.7
121
+ threshold = avg_height * tolerance_factor
122
+ delta_y = abs(current_word_center_y - last_word_center_y)
123
+ # if delta_y < threshold:
124
+ # current_line.append(current_word)
125
+ # else:
126
+ lines.append(current_line)
127
+ current_line = [current_word]
128
+ if current_line:
129
+ lines.append(current_line)
130
+ logger.debug(f"Page {page.index}: Grouped into {len(lines)} lines.")
131
+ # --- End Line Grouping Logic ---
132
+
133
+ # --- Start ElementTree hOCR Generation ---
119
134
  scale_x = image_width / page.width if page.width > 0 else 1
120
135
  scale_y = image_height / page.height if page.height > 0 else 1
121
136
 
137
+ # Create root element
138
+ page_hocr = ETElement(
139
+ "html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": "en"}
140
+ )
141
+
142
+ # Head
143
+ head = SubElement(page_hocr, "head")
144
+ SubElement(head, "title").text = ""
145
+ SubElement(
146
+ head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html;charset=utf-8"}
147
+ )
148
+ SubElement(head, "meta", attrib={"name": "ocr-system", "content": "natural-pdf"})
149
+ SubElement(
150
+ head,
151
+ "meta",
152
+ attrib={
153
+ "name": "ocr-capabilities",
154
+ "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word",
155
+ },
156
+ )
157
+
158
+ # Body and Page
159
+ body = SubElement(page_hocr, "body")
160
+ page_div = SubElement(
161
+ body,
162
+ "div",
163
+ attrib={
164
+ "class": "ocr_page",
165
+ "id": f"page_{page.index}",
166
+ "title": f"image; bbox 0 0 {image_width} {image_height}; ppageno {page.index}",
167
+ },
168
+ )
169
+
170
+ # Calculate overall bbox for carea/par (image coords)
171
+ min_area_x0, min_area_y0 = image_width, image_height
172
+ max_area_x1, max_area_y1 = 0, 0
173
+ if lines:
174
+ for line_words in lines:
175
+ for word in line_words:
176
+ (x0, y0, x1, y1) = word.bbox
177
+ img_x0 = int(x0 * scale_x)
178
+ img_y0 = int(y0 * scale_y)
179
+ img_x1 = int(x1 * scale_x)
180
+ img_y1 = int(y1 * scale_y)
181
+ min_area_x0 = min(min_area_x0, img_x0)
182
+ min_area_y0 = min(min_area_y0, img_y0)
183
+ max_area_x1 = max(max_area_x1, img_x1)
184
+ max_area_y1 = max(max_area_y1, img_y1)
185
+ area_img_x0, area_img_y0 = max(0, min_area_x0), max(0, min_area_y0)
186
+ area_img_x1, area_img_y1 = min(image_width, max_area_x1), min(image_height, max_area_y1)
187
+ if area_img_x0 >= area_img_x1 or area_img_y0 >= area_img_y1:
188
+ area_img_x0, area_img_y0, area_img_x1, area_img_y1 = 0, 0, image_width, image_height
189
+ else:
190
+ area_img_x0, area_img_y0, area_img_x1, area_img_y1 = 0, 0, image_width, image_height
191
+
192
+ # Add Carea and Par wrappers (assuming one block/paragraph per page for simplicity)
193
+ block_div = SubElement(
194
+ page_div, # Attach to page_div now
195
+ "div",
196
+ attrib={
197
+ "class": "ocr_carea",
198
+ "id": "block_0_1", # Simple ID
199
+ "title": f"bbox {area_img_x0} {area_img_y0} {area_img_x1} {area_img_y1}",
200
+ },
201
+ )
202
+ par_div = SubElement(
203
+ block_div,
204
+ "p",
205
+ attrib={
206
+ "class": "ocr_par",
207
+ "id": "par_0_1", # Simple ID
208
+ "title": f"bbox {area_img_x0} {area_img_y0} {area_img_x1} {area_img_y1}",
209
+ },
210
+ )
211
+
212
+ # Loop through lines and words
122
213
  word_id_counter = 0
123
- for word in ocr_elements:
124
- # Scale coordinates to image dimensions
125
- img_x0 = int(word.x0 * scale_x)
126
- img_y0 = int(word.y0 * scale_y)
127
- img_x1 = int(word.x1 * scale_x)
128
- img_y1 = int(word.y1 * scale_y)
129
-
130
- # Ensure coordinates are within image bounds
131
- img_x0 = max(0, img_x0)
132
- img_y0 = max(0, img_y0)
133
- img_x1 = min(image_width, img_x1)
134
- img_y1 = min(image_height, img_y1)
135
-
136
- # Basic escaping for XML - might need more robust escaping
137
- text = word.text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
138
-
139
- # Confidence (assuming it exists, default to 99 if not)
140
- confidence = getattr(word, 'confidence', 0.99) * 100 # hOCR often uses 0-100
141
-
142
- hocr_content += HOCR_TEMPLATE_WORD.format(
143
- page_num=page.index,
144
- word_id=word_id_counter,
145
- x0=img_x0,
146
- y0=img_y0,
147
- x1=img_x1,
148
- y1=img_y1,
149
- confidence=int(confidence),
150
- text=text
214
+ line_id_counter = 0
215
+ for current_line_words in lines:
216
+ if not current_line_words:
217
+ continue
218
+
219
+ # Sort words in line by x0
220
+ current_line_words.sort(key=lambda el: el.bbox[0])
221
+
222
+ # Calculate line bbox (image coords)
223
+ min_line_x0, min_line_y0 = image_width, image_height
224
+ max_line_x1, max_line_y1 = 0, 0
225
+ for word in current_line_words:
226
+ (x0, y0, x1, y1) = word.bbox
227
+ img_x0, img_y0 = int(x0 * scale_x), int(y0 * scale_y)
228
+ img_x1, img_y1 = int(x1 * scale_x), int(y1 * scale_y)
229
+ min_line_x0, min_line_y0 = min(min_line_x0, img_x0), min(min_line_y0, img_y0)
230
+ max_line_x1, max_line_y1 = max(max_line_x1, img_x1), max(max_line_y1, img_y1)
231
+
232
+ line_img_x0, line_img_y0 = max(0, min_line_x0), max(0, min_line_y0)
233
+ line_img_x1, line_img_y1 = min(image_width, max_line_x1), min(image_height, max_line_y1)
234
+ if line_img_x0 >= line_img_x1 or line_img_y0 >= line_img_y1:
235
+ line_img_x0, line_img_y0, line_img_x1, line_img_y1 = 0, 0, 1, 1
236
+
237
+ # Create ocr_line span
238
+ line_span = SubElement(
239
+ par_div, # Attach line to paragraph
240
+ "span",
241
+ attrib={
242
+ "class": "ocr_line",
243
+ "id": f"line_{page.index}_{line_id_counter}",
244
+ "title": f"bbox {line_img_x0} {line_img_y0} {line_img_x1} {line_img_y1}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0",
245
+ },
151
246
  )
152
- word_id_counter += 1
153
- hocr_content += "\n" # Add newline for readability
154
247
 
248
+ # Add words to line
249
+ for word in current_line_words:
250
+ (x0, y0, x1, y1) = word.bbox
251
+ img_x0, img_y0 = int(x0 * scale_x), int(y0 * scale_y)
252
+ img_x1, img_y1 = int(x1 * scale_x), int(y1 * scale_y)
253
+
254
+ img_x0, img_y0 = max(0, img_x0), max(0, img_y0)
255
+ img_x1, img_y1 = min(image_width, img_x1), min(image_height, img_y1)
256
+ if img_x1 <= img_x0:
257
+ img_x1 = img_x0 + 1
258
+ if img_y1 <= img_y0:
259
+ img_y1 = img_y0 + 1
260
+
261
+ # --- Strip whitespace and check if word is empty --- #
262
+ text = word.text.strip().replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
263
+ if not text:
264
+ continue # Skip adding this word if it becomes empty after stripping
265
+ # --- End strip ---
266
+ confidence = getattr(word, "confidence", 1.00)
267
+
268
+ word_span = SubElement(
269
+ line_span, # Attach word to line
270
+ "span",
271
+ attrib={
272
+ "class": "ocrx_word",
273
+ "id": f"word_{page.index}_{word_id_counter}",
274
+ "title": f"bbox {img_x0} {img_y0} {img_x1} {img_y1}; x_wconf {confidence}",
275
+ },
276
+ )
277
+ word_span.text = text
278
+ word_id_counter += 1
279
+ line_id_counter += 1
280
+
281
+ # Convert ElementTree to string
282
+ # xml_declaration = '<?xml version="1.0" encoding="UTF-8"?>\n' # No longer needed
283
+ # doctype_declaration = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
284
+ # "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n''' # No longer needed
285
+ # ET.indent(page_hocr) # Optional: for pretty printing, requires Python 3.9+
286
+ # Need bytes for writing, then decode for HocrTransform if it needs str
287
+ # Let's stick to unicode string output for now, as the file write expects it.
288
+ hocr_content = ET.tostring(
289
+ page_hocr, encoding="unicode", method="xml"
290
+ ) # Revert back to method='xml'
291
+ # hocr_content = xml_declaration + doctype_declaration + hocr_string_content # Removed string addition
292
+ # --- End ElementTree hOCR Generation ---
293
+
294
+ # --- Add code to save hOCR output for inspection ---
295
+ try:
296
+ hocr_output_path = "natural_pdf_hocr_output.hocr"
297
+ with open(hocr_output_path, "w", encoding="utf-8") as f_out:
298
+ f_out.write(hocr_content)
299
+ logger.info(f"Saved hOCR content for page {page.index} to: {hocr_output_path}")
300
+ except Exception as e:
301
+ logger.error(f"Failed to save hOCR output to file: {e}")
302
+ # --- End save hOCR ---
155
303
 
156
- hocr_content += HOCR_TEMPLATE_FOOTER
157
304
  return hocr_content
158
305
 
159
306
 
160
- def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
307
+ def create_searchable_pdf(
308
+ source: Union["Page", "PageCollection", "PDF"], output_path: str, dpi: int = 300
309
+ ):
161
310
  """
162
311
  Creates a searchable PDF from a natural_pdf PDF, PageCollection, or Page object using OCR results.
163
312
 
164
313
  Relies on ocrmypdf for hOCR transformation. Requires optional dependencies.
165
314
 
166
315
  Args:
167
- pdf_object: The natural_pdf.PDF instance (OCR should have been run).
316
+ source: The natural_pdf.PDF, PageCollection, or Page object
168
317
  output_path: The path to save the resulting searchable PDF.
169
318
  dpi: The resolution (dots per inch) for rendering page images and hOCR.
170
319
  """
171
- # _check_dependencies() # Removed check
172
320
 
173
321
  # --- Ensure dependencies are loaded (they should be if installed) ---
174
322
  if Image is None or pikepdf is None or HocrTransform is None:
@@ -180,7 +328,13 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
180
328
  )
181
329
  # --- End Safeguard Check ---
182
330
 
183
- logger.info(f"Starting searchable PDF creation for '{pdf_object.source_path}' -> '{output_path}' at {dpi} DPI.")
331
+ # Duck-type the source: use its .pages if it has them, otherwise treat it as a single page
332
+ if hasattr(source, "pages"):
333
+ pages = source.pages
334
+ else:
335
+ pages = [source]
336
+
337
+ logger.info(f"Starting searchable PDF creation '{output_path}' at {dpi} DPI.")
184
338
 
185
339
  temp_pdf_pages: List[str] = []
186
340
  output_abs_path = os.path.abspath(output_path)
@@ -188,10 +342,12 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
188
342
  with tempfile.TemporaryDirectory() as tmpdir:
189
343
  logger.debug(f"Using temporary directory: {tmpdir}")
190
344
 
191
- for i, page in enumerate(pdf_object.pages):
192
- logger.debug(f"Processing page {page.number} (index {i})...")
345
+ for i, page in enumerate(pages):
346
+ logger.debug(f"Processing page {i+1} of {len(pages)}...")
193
347
  page_base_name = f"page_{i}"
194
- img_path = os.path.join(tmpdir, f"{page_base_name}.png") # Use PNG for potentially better quality
348
+ img_path = os.path.join(
349
+ tmpdir, f"{page_base_name}.png"
350
+ ) # Use PNG for potentially better quality
195
351
  hocr_path = os.path.join(tmpdir, f"{page_base_name}.hocr")
196
352
  pdf_page_path = os.path.join(tmpdir, f"{page_base_name}.pdf")
197
353
 
@@ -200,18 +356,17 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
200
356
  logger.debug(f" Rendering page {i} to image ({dpi} DPI)...")
201
357
  # Use the Page's to_image method
202
358
  pil_image = page.to_image(resolution=dpi, include_highlights=False)
203
- pil_image.save(img_path, format='PNG')
359
+ pil_image.save(img_path, format="PNG")
204
360
  img_width, img_height = pil_image.size
205
361
  logger.debug(f" Image saved to {img_path} ({img_width}x{img_height})")
206
362
 
207
363
  # 2. Generate hOCR
208
364
  logger.debug(f" Generating hOCR...")
209
365
  hocr_content = _generate_hocr_for_page(page, img_width, img_height)
210
- with open(hocr_path, 'w', encoding='utf-8') as f:
366
+ with open(hocr_path, "w", encoding="utf-8") as f:
211
367
  f.write(hocr_content)
212
368
  logger.debug(f" hOCR saved to {hocr_path}")
213
369
 
214
-
215
370
  # 3. Use HocrTransform to create searchable PDF page
216
371
  logger.debug(f" Running HocrTransform...")
217
372
  hocr_transform = HocrTransform(hocr_filename=hocr_path, dpi=dpi)
@@ -221,11 +376,11 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
221
376
  logger.debug(f" Temporary PDF page saved to {pdf_page_path}")
222
377
 
223
378
  except Exception as e:
224
- logger.error(f" Failed to process page {page.number}: {e}", exc_info=True)
225
- # Decide whether to skip or raise error
226
- # For now, let's skip and continue
227
- logger.warning(f" Skipping page {page.number} due to error.")
228
- continue # Skip to the next page
379
+ logger.error(f" Failed to process page {page.number}: {e}", exc_info=True)
380
+ # Decide whether to skip or raise error
381
+ # For now, let's skip and continue
382
+ logger.warning(f" Skipping page {page.number} due to error.")
383
+ continue # Skip to the next page
229
384
 
230
385
  # 4. Merge temporary PDF pages
231
386
  if not temp_pdf_pages:
@@ -237,16 +392,20 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
237
392
  # Use pikepdf for merging
238
393
  output_pdf = pikepdf.Pdf.new()
239
394
  for temp_pdf_path in temp_pdf_pages:
240
- with pikepdf.Pdf.open(temp_pdf_path) as src_page_pdf:
241
- # Assuming each temp PDF has exactly one page
242
- if len(src_page_pdf.pages) == 1:
243
- output_pdf.pages.append(src_page_pdf.pages[0])
244
- else:
245
- logger.warning(f"Temporary PDF '{temp_pdf_path}' had unexpected number of pages ({len(src_page_pdf.pages)}). Skipping.")
395
+ with pikepdf.Pdf.open(temp_pdf_path) as src_page_pdf:
396
+ # Assuming each temp PDF has exactly one page
397
+ if len(src_page_pdf.pages) == 1:
398
+ output_pdf.pages.append(src_page_pdf.pages[0])
399
+ else:
400
+ logger.warning(
401
+ f"Temporary PDF '{temp_pdf_path}' had unexpected number of pages ({len(src_page_pdf.pages)}). Skipping."
402
+ )
246
403
  output_pdf.save(output_abs_path)
247
404
  logger.info(f"Successfully saved merged searchable PDF to: {output_abs_path}")
248
405
  except Exception as e:
249
- logger.error(f"Failed to merge temporary PDFs into '{output_abs_path}': {e}", exc_info=True)
406
+ logger.error(
407
+ f"Failed to merge temporary PDFs into '{output_abs_path}': {e}", exc_info=True
408
+ )
250
409
  raise RuntimeError(f"Failed to save final PDF: {e}") from e
251
410
 
252
- logger.debug("Temporary directory cleaned up.")
411
+ logger.debug("Temporary directory cleaned up.")
@@ -3,47 +3,56 @@ OCR engines for natural-pdf.
3
3
 
4
4
  This module provides different OCR engines that can be used with natural-pdf.
5
5
  """
6
+
6
7
  import logging
7
8
 
8
9
  # Set up module logger
9
10
  logger = logging.getLogger("natural_pdf.ocr")
10
- from .ocr_manager import OCRManager
11
- from .engine import OCREngine
12
- from .ocr_options import OCROptions
13
11
  from .engine import OCREngine
14
12
  from .engine_paddle import PaddleOCREngine
15
13
  from .engine_surya import SuryaOCREngine
14
+ from .ocr_manager import OCRManager
15
+ from .ocr_options import OCROptions
16
16
 
17
- __all__ = ['OCRManager', 'OCREngine', 'OCROptions', 'EasyOCREngine', 'PaddleOCREngine', 'SuryaOCREngine']
17
+ __all__ = [
18
+ "OCRManager",
19
+ "OCREngine",
20
+ "OCROptions",
21
+ "EasyOCREngine",
22
+ "PaddleOCREngine",
23
+ "SuryaOCREngine",
24
+ ]
18
25
 
19
26
  DEFAULT_ENGINE = SuryaOCREngine
20
27
 
28
+
21
29
  def get_engine(engine_name=None, **kwargs):
22
30
  """
23
31
  Get OCR engine by name.
24
-
32
+
25
33
  Args:
26
34
  engine_name: Name of the engine to use ('easyocr', 'paddleocr', etc.)
27
35
  If None, the default engine is used (PaddleOCR if available, otherwise EasyOCR)
28
36
  **kwargs: Additional arguments to pass to the engine constructor
29
-
37
+
30
38
  Returns:
31
39
  OCREngine instance
32
40
  """
33
41
  logger.debug(f"Initializing OCR engine: {engine_name or 'default'}")
34
-
35
- if engine_name is None or engine_name == 'default':
42
+
43
+ if engine_name is None or engine_name == "default":
36
44
  engine = DEFAULT_ENGINE(**kwargs)
37
45
  logger.info(f"Using default OCR engine: {engine.__class__.__name__}")
38
46
  return engine
39
-
40
- if engine_name.lower() == 'easyocr':
47
+
48
+ if engine_name.lower() == "easyocr":
41
49
  logger.info("Initializing EasyOCR engine")
42
50
  return EasyOCREngine(**kwargs)
43
-
44
- if engine_name.lower() == 'paddleocr':
51
+
52
+ if engine_name.lower() == "paddleocr":
45
53
  try:
46
54
  from .engine_paddle import PaddleOCREngine
55
+
47
56
  logger.info("Initializing PaddleOCR engine")
48
57
  return PaddleOCREngine(**kwargs)
49
58
  except ImportError:
@@ -51,6 +60,6 @@ def get_engine(engine_name=None, **kwargs):
51
60
  raise ImportError(
52
61
  "PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr"
53
62
  )
54
-
63
+
55
64
  logger.error(f"Unknown OCR engine: {engine_name}")
56
- raise ValueError(f"Unknown OCR engine: {engine_name}")
65
+ raise ValueError(f"Unknown OCR engine: {engine_name}")
natural_pdf/ocr/engine.py CHANGED
@@ -1,7 +1,8 @@
1
1
  # ocr_engine_base.py
2
2
  import logging
3
3
  from abc import ABC, abstractmethod
4
- from typing import Dict, List, Any, Optional, Tuple, Union
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+
5
6
  from PIL import Image
6
7
 
7
8
  # Assuming ocr_options defines BaseOCROptions
@@ -9,6 +10,7 @@ from .ocr_options import BaseOCROptions
9
10
 
10
11
  logger = logging.getLogger(__name__)
11
12
 
13
+
12
14
  class OCREngine(ABC):
13
15
  """Abstract Base Class for OCR engines."""
14
16
 
@@ -16,14 +18,14 @@ class OCREngine(ABC):
16
18
  """Initializes the base OCR engine."""
17
19
  self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
18
20
  self.logger.info(f"Initializing {self.__class__.__name__}")
19
- self._reader_cache = {} # Cache for initialized models/readers
21
+ self._reader_cache = {} # Cache for initialized models/readers
20
22
 
21
23
  @abstractmethod
22
24
  def process_image(
23
25
  self,
24
- images: Union[Image.Image, List[Image.Image]], # Accept single or list
25
- options: BaseOCROptions
26
- ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: # Return single or list of lists
26
+ images: Union[Image.Image, List[Image.Image]], # Accept single or list
27
+ options: BaseOCROptions,
28
+ ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: # Return single or list of lists
27
29
  """
28
30
  Processes a single image or a batch of images using the specific engine and options.
29
31
 
@@ -80,10 +82,18 @@ class OCREngine(ABC):
80
82
  Tuple[float, float, float, float] or None if conversion fails.
81
83
  """
82
84
  try:
83
- if isinstance(bbox, (list, tuple)) and len(bbox) == 4 and all(isinstance(n, (int, float)) for n in bbox):
85
+ if (
86
+ isinstance(bbox, (list, tuple))
87
+ and len(bbox) == 4
88
+ and all(isinstance(n, (int, float)) for n in bbox)
89
+ ):
84
90
  # Already in (x0, y0, x1, y1) format (or similar)
85
91
  return tuple(float(c) for c in bbox[:4])
86
- elif isinstance(bbox, (list, tuple)) and len(bbox) > 0 and isinstance(bbox[0], (list, tuple)):
92
+ elif (
93
+ isinstance(bbox, (list, tuple))
94
+ and len(bbox) > 0
95
+ and isinstance(bbox[0], (list, tuple))
96
+ ):
87
97
  # Polygon format [[x1,y1],[x2,y2],...]
88
98
  x_coords = [float(point[0]) for point in bbox]
89
99
  y_coords = [float(point[1]) for point in bbox]
@@ -101,4 +111,3 @@ class OCREngine(ABC):
101
111
  self.logger.info(f"Cleaning up {self.__class__.__name__} resources.")
102
112
  # Clear reader cache to free up memory/GPU resources
103
113
  self._reader_cache.clear()
104
-