natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +126 -98
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +910 -516
- natural_pdf/core/pdf.py +387 -289
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +714 -514
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.3.dist-info/RECORD +0 -61
- natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -5,33 +5,37 @@ Module for exporting PDF content to various formats.
|
|
5
5
|
import logging
|
6
6
|
import os
|
7
7
|
import tempfile
|
8
|
-
|
8
|
+
import xml.etree.ElementTree as ET
|
9
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
|
10
|
+
from xml.etree.ElementTree import Element as ETElement
|
11
|
+
from xml.etree.ElementTree import SubElement
|
9
12
|
|
10
13
|
# Lazy imports for optional dependencies
|
11
14
|
try:
|
12
15
|
from PIL import Image
|
13
16
|
except ImportError:
|
14
|
-
Image = None
|
17
|
+
Image = None # type: ignore
|
15
18
|
|
16
19
|
try:
|
17
20
|
import pikepdf
|
18
21
|
except ImportError:
|
19
|
-
pikepdf = None
|
22
|
+
pikepdf = None # type: ignore
|
20
23
|
|
21
24
|
try:
|
22
25
|
from ocrmypdf.hocrtransform import HocrTransform
|
23
26
|
except ImportError:
|
24
|
-
HocrTransform = None
|
27
|
+
HocrTransform = None # type: ignore
|
25
28
|
|
26
29
|
if TYPE_CHECKING:
|
27
|
-
from natural_pdf.core.pdf import PDF
|
28
30
|
from natural_pdf.core.page import Page
|
31
|
+
from natural_pdf.core.pdf import PDF
|
32
|
+
from natural_pdf.elements.collections import PageCollection
|
29
33
|
|
30
34
|
|
31
35
|
logger = logging.getLogger(__name__)
|
32
36
|
|
33
37
|
# --- Constants ---
|
34
|
-
HOCR_TEMPLATE_HEADER =
|
38
|
+
HOCR_TEMPLATE_HEADER = """<?xml version="1.0" encoding="UTF-8"?>
|
35
39
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
36
40
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
37
41
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
@@ -42,27 +46,27 @@ HOCR_TEMPLATE_HEADER = '''<?xml version="1.0" encoding="UTF-8"?>
|
|
42
46
|
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
43
47
|
</head>
|
44
48
|
<body>
|
45
|
-
|
49
|
+
"""
|
46
50
|
|
47
|
-
HOCR_TEMPLATE_PAGE =
|
48
|
-
|
51
|
+
HOCR_TEMPLATE_PAGE = """ <div class='ocr_page' id='page_{page_num}' title='image "{image_path}"; bbox 0 0 {width} {height}; ppageno {page_num}'>
|
52
|
+
"""
|
49
53
|
|
50
|
-
HOCR_TEMPLATE_WORD =
|
51
|
-
|
54
|
+
HOCR_TEMPLATE_WORD = """ <span class='ocrx_word' id='word_{page_num}_{word_id}' title='bbox {x0} {y0} {x1} {y1}; x_wconf {confidence}'>{text}</span>
|
55
|
+
"""
|
52
56
|
|
53
|
-
HOCR_TEMPLATE_LINE_START =
|
54
|
-
|
55
|
-
HOCR_TEMPLATE_LINE_END =
|
56
|
-
|
57
|
+
HOCR_TEMPLATE_LINE_START = """ <span class='ocr_line' id='line_{page_num}_{line_id}' title='bbox {x0} {y0} {x1} {y1}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0'>
|
58
|
+
"""
|
59
|
+
HOCR_TEMPLATE_LINE_END = """ </span>
|
60
|
+
"""
|
57
61
|
|
58
|
-
HOCR_TEMPLATE_FOOTER =
|
62
|
+
HOCR_TEMPLATE_FOOTER = """ </div>
|
59
63
|
</body>
|
60
64
|
</html>
|
61
|
-
|
65
|
+
"""
|
62
66
|
# --- End Constants ---
|
63
67
|
|
64
68
|
|
65
|
-
def _generate_hocr_for_page(page:
|
69
|
+
def _generate_hocr_for_page(page: "Page", image_width: int, image_height: int) -> str:
|
66
70
|
"""
|
67
71
|
Generates an hOCR string for a given Page object based on its OCR elements.
|
68
72
|
|
@@ -79,96 +83,240 @@ def _generate_hocr_for_page(page: 'Page', image_width: int, image_height: int) -
|
|
79
83
|
"""
|
80
84
|
# Attempt to get OCR elements (words) using find_all with selector
|
81
85
|
# Use find_all which returns an ElementCollection
|
82
|
-
ocr_elements_collection = page.find_all(
|
83
|
-
ocr_elements = ocr_elements_collection.elements
|
86
|
+
ocr_elements_collection = page.find_all("text[source=ocr]")
|
87
|
+
ocr_elements = ocr_elements_collection.elements # Get the list of elements
|
84
88
|
|
85
89
|
if not ocr_elements:
|
86
|
-
logger.warning(
|
90
|
+
logger.warning(
|
91
|
+
f"Page {page.number} has no OCR elements (text[source=ocr]) to generate hOCR from."
|
92
|
+
)
|
87
93
|
# Return minimal valid hOCR for an empty page
|
88
94
|
hocr_content = HOCR_TEMPLATE_HEADER
|
89
|
-
hocr_content += HOCR_TEMPLATE_PAGE.format(
|
95
|
+
hocr_content += HOCR_TEMPLATE_PAGE.format(
|
96
|
+
page_num=page.index, image_path="", width=image_width, height=image_height
|
97
|
+
)
|
90
98
|
hocr_content += HOCR_TEMPLATE_FOOTER
|
91
99
|
return hocr_content
|
92
100
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
101
|
+
# --- Start Line Grouping Logic ---
|
102
|
+
logger.debug(f"Page {page.index}: Grouping {len(ocr_elements)} words into lines.")
|
103
|
+
ocr_elements.sort(key=lambda el: (el.bbox[1], el.bbox[0]))
|
104
|
+
lines = []
|
105
|
+
current_line = []
|
106
|
+
if ocr_elements:
|
107
|
+
current_line.append(ocr_elements[0])
|
108
|
+
for i in range(1, len(ocr_elements)):
|
109
|
+
current_word = ocr_elements[i]
|
110
|
+
last_word = current_line[-1]
|
111
|
+
last_word_y0, last_word_y1 = last_word.bbox[1], last_word.bbox[3]
|
112
|
+
current_word_y0, current_word_y1 = current_word.bbox[1], current_word.bbox[3]
|
113
|
+
last_word_center_y = (last_word_y0 + last_word_y1) / 2
|
114
|
+
current_word_center_y = (current_word_y0 + current_word_y1) / 2
|
115
|
+
last_word_height = last_word_y1 - last_word_y0
|
116
|
+
current_word_height = current_word_y1 - current_word_y0
|
117
|
+
avg_height = (last_word_height + current_word_height) / 2
|
118
|
+
if avg_height <= 0:
|
119
|
+
avg_height = 1
|
120
|
+
tolerance_factor = 0.7
|
121
|
+
threshold = avg_height * tolerance_factor
|
122
|
+
delta_y = abs(current_word_center_y - last_word_center_y)
|
123
|
+
# if delta_y < threshold:
|
124
|
+
# current_line.append(current_word)
|
125
|
+
# else:
|
126
|
+
lines.append(current_line)
|
127
|
+
current_line = [current_word]
|
128
|
+
if current_line:
|
129
|
+
lines.append(current_line)
|
130
|
+
logger.debug(f"Page {page.index}: Grouped into {len(lines)} lines.")
|
131
|
+
# --- End Line Grouping Logic ---
|
132
|
+
|
133
|
+
# --- Start ElementTree hOCR Generation ---
|
119
134
|
scale_x = image_width / page.width if page.width > 0 else 1
|
120
135
|
scale_y = image_height / page.height if page.height > 0 else 1
|
121
136
|
|
137
|
+
# Create root element
|
138
|
+
page_hocr = ETElement(
|
139
|
+
"html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": "en"}
|
140
|
+
)
|
141
|
+
|
142
|
+
# Head
|
143
|
+
head = SubElement(page_hocr, "head")
|
144
|
+
SubElement(head, "title").text = ""
|
145
|
+
SubElement(
|
146
|
+
head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html;charset=utf-8"}
|
147
|
+
)
|
148
|
+
SubElement(head, "meta", attrib={"name": "ocr-system", "content": "natural-pdf"})
|
149
|
+
SubElement(
|
150
|
+
head,
|
151
|
+
"meta",
|
152
|
+
attrib={
|
153
|
+
"name": "ocr-capabilities",
|
154
|
+
"content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word",
|
155
|
+
},
|
156
|
+
)
|
157
|
+
|
158
|
+
# Body and Page
|
159
|
+
body = SubElement(page_hocr, "body")
|
160
|
+
page_div = SubElement(
|
161
|
+
body,
|
162
|
+
"div",
|
163
|
+
attrib={
|
164
|
+
"class": "ocr_page",
|
165
|
+
"id": f"page_{page.index}",
|
166
|
+
"title": f"image; bbox 0 0 {image_width} {image_height}; ppageno {page.index}",
|
167
|
+
},
|
168
|
+
)
|
169
|
+
|
170
|
+
# Calculate overall bbox for carea/par (image coords)
|
171
|
+
min_area_x0, min_area_y0 = image_width, image_height
|
172
|
+
max_area_x1, max_area_y1 = 0, 0
|
173
|
+
if lines:
|
174
|
+
for line_words in lines:
|
175
|
+
for word in line_words:
|
176
|
+
(x0, y0, x1, y1) = word.bbox
|
177
|
+
img_x0 = int(x0 * scale_x)
|
178
|
+
img_y0 = int(y0 * scale_y)
|
179
|
+
img_x1 = int(x1 * scale_x)
|
180
|
+
img_y1 = int(y1 * scale_y)
|
181
|
+
min_area_x0 = min(min_area_x0, img_x0)
|
182
|
+
min_area_y0 = min(min_area_y0, img_y0)
|
183
|
+
max_area_x1 = max(max_area_x1, img_x1)
|
184
|
+
max_area_y1 = max(max_area_y1, img_y1)
|
185
|
+
area_img_x0, area_img_y0 = max(0, min_area_x0), max(0, min_area_y0)
|
186
|
+
area_img_x1, area_img_y1 = min(image_width, max_area_x1), min(image_height, max_area_y1)
|
187
|
+
if area_img_x0 >= area_img_x1 or area_img_y0 >= area_img_y1:
|
188
|
+
area_img_x0, area_img_y0, area_img_x1, area_img_y1 = 0, 0, image_width, image_height
|
189
|
+
else:
|
190
|
+
area_img_x0, area_img_y0, area_img_x1, area_img_y1 = 0, 0, image_width, image_height
|
191
|
+
|
192
|
+
# Add Carea and Par wrappers (assuming one block/paragraph per page for simplicity)
|
193
|
+
block_div = SubElement(
|
194
|
+
page_div, # Attach to page_div now
|
195
|
+
"div",
|
196
|
+
attrib={
|
197
|
+
"class": "ocr_carea",
|
198
|
+
"id": "block_0_1", # Simple ID
|
199
|
+
"title": f"bbox {area_img_x0} {area_img_y0} {area_img_x1} {area_img_y1}",
|
200
|
+
},
|
201
|
+
)
|
202
|
+
par_div = SubElement(
|
203
|
+
block_div,
|
204
|
+
"p",
|
205
|
+
attrib={
|
206
|
+
"class": "ocr_par",
|
207
|
+
"id": "par_0_1", # Simple ID
|
208
|
+
"title": f"bbox {area_img_x0} {area_img_y0} {area_img_x1} {area_img_y1}",
|
209
|
+
},
|
210
|
+
)
|
211
|
+
|
212
|
+
# Loop through lines and words
|
122
213
|
word_id_counter = 0
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
214
|
+
line_id_counter = 0
|
215
|
+
for current_line_words in lines:
|
216
|
+
if not current_line_words:
|
217
|
+
continue
|
218
|
+
|
219
|
+
# Sort words in line by x0
|
220
|
+
current_line_words.sort(key=lambda el: el.bbox[0])
|
221
|
+
|
222
|
+
# Calculate line bbox (image coords)
|
223
|
+
min_line_x0, min_line_y0 = image_width, image_height
|
224
|
+
max_line_x1, max_line_y1 = 0, 0
|
225
|
+
for word in current_line_words:
|
226
|
+
(x0, y0, x1, y1) = word.bbox
|
227
|
+
img_x0, img_y0 = int(x0 * scale_x), int(y0 * scale_y)
|
228
|
+
img_x1, img_y1 = int(x1 * scale_x), int(y1 * scale_y)
|
229
|
+
min_line_x0, min_line_y0 = min(min_line_x0, img_x0), min(min_line_y0, img_y0)
|
230
|
+
max_line_x1, max_line_y1 = max(max_line_x1, img_x1), max(max_line_y1, img_y1)
|
231
|
+
|
232
|
+
line_img_x0, line_img_y0 = max(0, min_line_x0), max(0, min_line_y0)
|
233
|
+
line_img_x1, line_img_y1 = min(image_width, max_line_x1), min(image_height, max_line_y1)
|
234
|
+
if line_img_x0 >= line_img_x1 or line_img_y0 >= line_img_y1:
|
235
|
+
line_img_x0, line_img_y0, line_img_x1, line_img_y1 = 0, 0, 1, 1
|
236
|
+
|
237
|
+
# Create ocr_line span
|
238
|
+
line_span = SubElement(
|
239
|
+
par_div, # Attach line to paragraph
|
240
|
+
"span",
|
241
|
+
attrib={
|
242
|
+
"class": "ocr_line",
|
243
|
+
"id": f"line_{page.index}_{line_id_counter}",
|
244
|
+
"title": f"bbox {line_img_x0} {line_img_y0} {line_img_x1} {line_img_y1}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0",
|
245
|
+
},
|
151
246
|
)
|
152
|
-
word_id_counter += 1
|
153
|
-
hocr_content += "\n" # Add newline for readability
|
154
247
|
|
248
|
+
# Add words to line
|
249
|
+
for word in current_line_words:
|
250
|
+
(x0, y0, x1, y1) = word.bbox
|
251
|
+
img_x0, img_y0 = int(x0 * scale_x), int(y0 * scale_y)
|
252
|
+
img_x1, img_y1 = int(x1 * scale_x), int(y1 * scale_y)
|
253
|
+
|
254
|
+
img_x0, img_y0 = max(0, img_x0), max(0, img_y0)
|
255
|
+
img_x1, img_y1 = min(image_width, img_x1), min(image_height, img_y1)
|
256
|
+
if img_x1 <= img_x0:
|
257
|
+
img_x1 = img_x0 + 1
|
258
|
+
if img_y1 <= img_y0:
|
259
|
+
img_y1 = img_y0 + 1
|
260
|
+
|
261
|
+
# --- Strip whitespace and check if word is empty --- #
|
262
|
+
text = word.text.strip().replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
|
263
|
+
if not text:
|
264
|
+
continue # Skip adding this word if it becomes empty after stripping
|
265
|
+
# --- End strip ---
|
266
|
+
confidence = getattr(word, "confidence", 1.00)
|
267
|
+
|
268
|
+
word_span = SubElement(
|
269
|
+
line_span, # Attach word to line
|
270
|
+
"span",
|
271
|
+
attrib={
|
272
|
+
"class": "ocrx_word",
|
273
|
+
"id": f"word_{page.index}_{word_id_counter}",
|
274
|
+
"title": f"bbox {img_x0} {img_y0} {img_x1} {img_y1}; x_wconf {confidence}",
|
275
|
+
},
|
276
|
+
)
|
277
|
+
word_span.text = text
|
278
|
+
word_id_counter += 1
|
279
|
+
line_id_counter += 1
|
280
|
+
|
281
|
+
# Convert ElementTree to string
|
282
|
+
# xml_declaration = '<?xml version="1.0" encoding="UTF-8"?>\n' # No longer needed
|
283
|
+
# doctype_declaration = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
284
|
+
# "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n''' # No longer needed
|
285
|
+
# ET.indent(page_hocr) # Optional: for pretty printing, requires Python 3.9+
|
286
|
+
# Need bytes for writing, then decode for HocrTransform if it needs str
|
287
|
+
# Let's stick to unicode string output for now, as the file write expects it.
|
288
|
+
hocr_content = ET.tostring(
|
289
|
+
page_hocr, encoding="unicode", method="xml"
|
290
|
+
) # Revert back to method='xml'
|
291
|
+
# hocr_content = xml_declaration + doctype_declaration + hocr_string_content # Removed string addition
|
292
|
+
# --- End ElementTree hOCR Generation ---
|
293
|
+
|
294
|
+
# --- Add code to save hOCR output for inspection ---
|
295
|
+
try:
|
296
|
+
hocr_output_path = "natural_pdf_hocr_output.hocr"
|
297
|
+
with open(hocr_output_path, "w", encoding="utf-8") as f_out:
|
298
|
+
f_out.write(hocr_content)
|
299
|
+
logger.info(f"Saved hOCR content for page {page.index} to: {hocr_output_path}")
|
300
|
+
except Exception as e:
|
301
|
+
logger.error(f"Failed to save hOCR output to file: {e}")
|
302
|
+
# --- End save hOCR ---
|
155
303
|
|
156
|
-
hocr_content += HOCR_TEMPLATE_FOOTER
|
157
304
|
return hocr_content
|
158
305
|
|
159
306
|
|
160
|
-
def create_searchable_pdf(
|
307
|
+
def create_searchable_pdf(
|
308
|
+
source: Union["Page", "PageCollection", "PDF"], output_path: str, dpi: int = 300
|
309
|
+
):
|
161
310
|
"""
|
162
311
|
Creates a searchable PDF from a natural_pdf.PDF object using OCR results.
|
163
312
|
|
164
313
|
Relies on ocrmypdf for hOCR transformation. Requires optional dependencies.
|
165
314
|
|
166
315
|
Args:
|
167
|
-
|
316
|
+
source: The natural_pdf.PDF, PageCollection, or Page object
|
168
317
|
output_path: The path to save the resulting searchable PDF.
|
169
318
|
dpi: The resolution (dots per inch) for rendering page images and hOCR.
|
170
319
|
"""
|
171
|
-
# _check_dependencies() # Removed check
|
172
320
|
|
173
321
|
# --- Ensure dependencies are loaded (they should be if installed) ---
|
174
322
|
if Image is None or pikepdf is None or HocrTransform is None:
|
@@ -180,7 +328,13 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
|
|
180
328
|
)
|
181
329
|
# --- End Safeguard Check ---
|
182
330
|
|
183
|
-
|
331
|
+
# duck type to see if source has .pages, to populate pages =
|
332
|
+
if hasattr(source, "pages"):
|
333
|
+
pages = source.pages
|
334
|
+
else:
|
335
|
+
pages = [source]
|
336
|
+
|
337
|
+
logger.info(f"Starting searchable PDF creation '{output_path}' at {dpi} DPI.")
|
184
338
|
|
185
339
|
temp_pdf_pages: List[str] = []
|
186
340
|
output_abs_path = os.path.abspath(output_path)
|
@@ -188,10 +342,12 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
|
|
188
342
|
with tempfile.TemporaryDirectory() as tmpdir:
|
189
343
|
logger.debug(f"Using temporary directory: {tmpdir}")
|
190
344
|
|
191
|
-
for i, page in enumerate(
|
192
|
-
logger.debug(f"Processing page {
|
345
|
+
for i, page in enumerate(pages):
|
346
|
+
logger.debug(f"Processing page {i+1} of {len(pages)}...")
|
193
347
|
page_base_name = f"page_{i}"
|
194
|
-
img_path = os.path.join(
|
348
|
+
img_path = os.path.join(
|
349
|
+
tmpdir, f"{page_base_name}.png"
|
350
|
+
) # Use PNG for potentially better quality
|
195
351
|
hocr_path = os.path.join(tmpdir, f"{page_base_name}.hocr")
|
196
352
|
pdf_page_path = os.path.join(tmpdir, f"{page_base_name}.pdf")
|
197
353
|
|
@@ -200,18 +356,17 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
|
|
200
356
|
logger.debug(f" Rendering page {i} to image ({dpi} DPI)...")
|
201
357
|
# Use the Page's to_image method
|
202
358
|
pil_image = page.to_image(resolution=dpi, include_highlights=False)
|
203
|
-
pil_image.save(img_path, format=
|
359
|
+
pil_image.save(img_path, format="PNG")
|
204
360
|
img_width, img_height = pil_image.size
|
205
361
|
logger.debug(f" Image saved to {img_path} ({img_width}x{img_height})")
|
206
362
|
|
207
363
|
# 2. Generate hOCR
|
208
364
|
logger.debug(f" Generating hOCR...")
|
209
365
|
hocr_content = _generate_hocr_for_page(page, img_width, img_height)
|
210
|
-
with open(hocr_path,
|
366
|
+
with open(hocr_path, "w", encoding="utf-8") as f:
|
211
367
|
f.write(hocr_content)
|
212
368
|
logger.debug(f" hOCR saved to {hocr_path}")
|
213
369
|
|
214
|
-
|
215
370
|
# 3. Use HocrTransform to create searchable PDF page
|
216
371
|
logger.debug(f" Running HocrTransform...")
|
217
372
|
hocr_transform = HocrTransform(hocr_filename=hocr_path, dpi=dpi)
|
@@ -221,11 +376,11 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
|
|
221
376
|
logger.debug(f" Temporary PDF page saved to {pdf_page_path}")
|
222
377
|
|
223
378
|
except Exception as e:
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
379
|
+
logger.error(f" Failed to process page {page.number}: {e}", exc_info=True)
|
380
|
+
# Decide whether to skip or raise error
|
381
|
+
# For now, let's skip and continue
|
382
|
+
logger.warning(f" Skipping page {page.number} due to error.")
|
383
|
+
continue # Skip to the next page
|
229
384
|
|
230
385
|
# 4. Merge temporary PDF pages
|
231
386
|
if not temp_pdf_pages:
|
@@ -237,16 +392,20 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
|
|
237
392
|
# Use pikepdf for merging
|
238
393
|
output_pdf = pikepdf.Pdf.new()
|
239
394
|
for temp_pdf_path in temp_pdf_pages:
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
395
|
+
with pikepdf.Pdf.open(temp_pdf_path) as src_page_pdf:
|
396
|
+
# Assuming each temp PDF has exactly one page
|
397
|
+
if len(src_page_pdf.pages) == 1:
|
398
|
+
output_pdf.pages.append(src_page_pdf.pages[0])
|
399
|
+
else:
|
400
|
+
logger.warning(
|
401
|
+
f"Temporary PDF '{temp_pdf_path}' had unexpected number of pages ({len(src_page_pdf.pages)}). Skipping."
|
402
|
+
)
|
246
403
|
output_pdf.save(output_abs_path)
|
247
404
|
logger.info(f"Successfully saved merged searchable PDF to: {output_abs_path}")
|
248
405
|
except Exception as e:
|
249
|
-
logger.error(
|
406
|
+
logger.error(
|
407
|
+
f"Failed to merge temporary PDFs into '{output_abs_path}': {e}", exc_info=True
|
408
|
+
)
|
250
409
|
raise RuntimeError(f"Failed to save final PDF: {e}") from e
|
251
410
|
|
252
|
-
logger.debug("Temporary directory cleaned up.")
|
411
|
+
logger.debug("Temporary directory cleaned up.")
|
natural_pdf/ocr/__init__.py
CHANGED
@@ -3,47 +3,56 @@ OCR engines for natural-pdf.
|
|
3
3
|
|
4
4
|
This module provides different OCR engines that can be used with natural-pdf.
|
5
5
|
"""
|
6
|
+
|
6
7
|
import logging
|
7
8
|
|
8
9
|
# Set up module logger
|
9
10
|
logger = logging.getLogger("natural_pdf.ocr")
|
10
|
-
from .ocr_manager import OCRManager
|
11
|
-
from .engine import OCREngine
|
12
|
-
from .ocr_options import OCROptions
|
13
11
|
from .engine import OCREngine
|
14
12
|
from .engine_paddle import PaddleOCREngine
|
15
13
|
from .engine_surya import SuryaOCREngine
|
14
|
+
from .ocr_manager import OCRManager
|
15
|
+
from .ocr_options import OCROptions
|
16
16
|
|
17
|
-
__all__ = [
|
17
|
+
__all__ = [
|
18
|
+
"OCRManager",
|
19
|
+
"OCREngine",
|
20
|
+
"OCROptions",
|
21
|
+
"EasyOCREngine",
|
22
|
+
"PaddleOCREngine",
|
23
|
+
"SuryaOCREngine",
|
24
|
+
]
|
18
25
|
|
19
26
|
DEFAULT_ENGINE = SuryaOCREngine
|
20
27
|
|
28
|
+
|
21
29
|
def get_engine(engine_name=None, **kwargs):
|
22
30
|
"""
|
23
31
|
Get OCR engine by name.
|
24
|
-
|
32
|
+
|
25
33
|
Args:
|
26
34
|
engine_name: Name of the engine to use ('easyocr', 'paddleocr', etc.)
|
27
35
|
If None, the default engine is used (PaddleOCR if available, otherwise EasyOCR)
|
28
36
|
**kwargs: Additional arguments to pass to the engine constructor
|
29
|
-
|
37
|
+
|
30
38
|
Returns:
|
31
39
|
OCREngine instance
|
32
40
|
"""
|
33
41
|
logger.debug(f"Initializing OCR engine: {engine_name or 'default'}")
|
34
|
-
|
35
|
-
if engine_name is None or engine_name ==
|
42
|
+
|
43
|
+
if engine_name is None or engine_name == "default":
|
36
44
|
engine = DEFAULT_ENGINE(**kwargs)
|
37
45
|
logger.info(f"Using default OCR engine: {engine.__class__.__name__}")
|
38
46
|
return engine
|
39
|
-
|
40
|
-
if engine_name.lower() ==
|
47
|
+
|
48
|
+
if engine_name.lower() == "easyocr":
|
41
49
|
logger.info("Initializing EasyOCR engine")
|
42
50
|
return EasyOCREngine(**kwargs)
|
43
|
-
|
44
|
-
if engine_name.lower() ==
|
51
|
+
|
52
|
+
if engine_name.lower() == "paddleocr":
|
45
53
|
try:
|
46
54
|
from .engine_paddle import PaddleOCREngine
|
55
|
+
|
47
56
|
logger.info("Initializing PaddleOCR engine")
|
48
57
|
return PaddleOCREngine(**kwargs)
|
49
58
|
except ImportError:
|
@@ -51,6 +60,6 @@ def get_engine(engine_name=None, **kwargs):
|
|
51
60
|
raise ImportError(
|
52
61
|
"PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr"
|
53
62
|
)
|
54
|
-
|
63
|
+
|
55
64
|
logger.error(f"Unknown OCR engine: {engine_name}")
|
56
|
-
raise ValueError(f"Unknown OCR engine: {engine_name}")
|
65
|
+
raise ValueError(f"Unknown OCR engine: {engine_name}")
|
natural_pdf/ocr/engine.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
# ocr_engine_base.py
|
2
2
|
import logging
|
3
3
|
from abc import ABC, abstractmethod
|
4
|
-
from typing import Dict, List,
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
5
|
+
|
5
6
|
from PIL import Image
|
6
7
|
|
7
8
|
# Assuming ocr_options defines BaseOCROptions
|
@@ -9,6 +10,7 @@ from .ocr_options import BaseOCROptions
|
|
9
10
|
|
10
11
|
logger = logging.getLogger(__name__)
|
11
12
|
|
13
|
+
|
12
14
|
class OCREngine(ABC):
|
13
15
|
"""Abstract Base Class for OCR engines."""
|
14
16
|
|
@@ -16,14 +18,14 @@ class OCREngine(ABC):
|
|
16
18
|
"""Initializes the base OCR engine."""
|
17
19
|
self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
|
18
20
|
self.logger.info(f"Initializing {self.__class__.__name__}")
|
19
|
-
self._reader_cache = {}
|
21
|
+
self._reader_cache = {} # Cache for initialized models/readers
|
20
22
|
|
21
23
|
@abstractmethod
|
22
24
|
def process_image(
|
23
25
|
self,
|
24
|
-
images: Union[Image.Image, List[Image.Image]],
|
25
|
-
options: BaseOCROptions
|
26
|
-
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
|
26
|
+
images: Union[Image.Image, List[Image.Image]], # Accept single or list
|
27
|
+
options: BaseOCROptions,
|
28
|
+
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: # Return single or list of lists
|
27
29
|
"""
|
28
30
|
Processes a single image or a batch of images using the specific engine and options.
|
29
31
|
|
@@ -80,10 +82,18 @@ class OCREngine(ABC):
|
|
80
82
|
Tuple[float, float, float, float] or None if conversion fails.
|
81
83
|
"""
|
82
84
|
try:
|
83
|
-
if
|
85
|
+
if (
|
86
|
+
isinstance(bbox, (list, tuple))
|
87
|
+
and len(bbox) == 4
|
88
|
+
and all(isinstance(n, (int, float)) for n in bbox)
|
89
|
+
):
|
84
90
|
# Already in (x0, y0, x1, y1) format (or similar)
|
85
91
|
return tuple(float(c) for c in bbox[:4])
|
86
|
-
elif
|
92
|
+
elif (
|
93
|
+
isinstance(bbox, (list, tuple))
|
94
|
+
and len(bbox) > 0
|
95
|
+
and isinstance(bbox[0], (list, tuple))
|
96
|
+
):
|
87
97
|
# Polygon format [[x1,y1],[x2,y2],...]
|
88
98
|
x_coords = [float(point[0]) for point in bbox]
|
89
99
|
y_coords = [float(point[1]) for point in bbox]
|
@@ -101,4 +111,3 @@ class OCREngine(ABC):
|
|
101
111
|
self.logger.info(f"Cleaning up {self.__class__.__name__} resources.")
|
102
112
|
# Clear reader cache to free up memory/GPU resources
|
103
113
|
self._reader_cache.clear()
|
104
|
-
|