natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/ocr/index.md +34 -47
- docs/tutorials/01-loading-and-extraction.ipynb +60 -46
- docs/tutorials/02-finding-elements.ipynb +42 -42
- docs/tutorials/03-extracting-blocks.ipynb +17 -17
- docs/tutorials/04-table-extraction.ipynb +12 -12
- docs/tutorials/05-excluding-content.ipynb +30 -30
- docs/tutorials/06-document-qa.ipynb +28 -28
- docs/tutorials/07-layout-analysis.ipynb +63 -35
- docs/tutorials/07-working-with-regions.ipynb +55 -51
- docs/tutorials/07-working-with-regions.md +2 -2
- docs/tutorials/08-spatial-navigation.ipynb +60 -60
- docs/tutorials/09-section-extraction.ipynb +113 -113
- docs/tutorials/10-form-field-extraction.ipynb +78 -50
- docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
- docs/tutorials/12-ocr-integration.ipynb +149 -131
- docs/tutorials/12-ocr-integration.md +0 -13
- docs/tutorials/13-semantic-search.ipynb +313 -873
- natural_pdf/__init__.py +21 -23
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_manager.py +28 -1
- natural_pdf/analyzers/layout/layout_options.py +11 -0
- natural_pdf/analyzers/layout/yolo.py +6 -2
- natural_pdf/collections/pdf_collection.py +21 -0
- natural_pdf/core/element_manager.py +16 -13
- natural_pdf/core/page.py +165 -36
- natural_pdf/core/pdf.py +146 -41
- natural_pdf/elements/base.py +11 -17
- natural_pdf/elements/collections.py +100 -38
- natural_pdf/elements/region.py +77 -38
- natural_pdf/elements/text.py +5 -0
- natural_pdf/ocr/__init__.py +49 -36
- natural_pdf/ocr/engine.py +146 -51
- natural_pdf/ocr/engine_easyocr.py +141 -161
- natural_pdf/ocr/engine_paddle.py +107 -193
- natural_pdf/ocr/engine_surya.py +75 -148
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +65 -93
- natural_pdf/ocr/ocr_options.py +7 -17
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
- natural_pdf/templates/ocr_debug.html +0 -517
- tests/test_loading.py +0 -50
- tests/test_optional_deps.py +0 -298
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,32 @@
|
|
1
|
+
"""
|
2
|
+
OCR debug utilities for natural-pdf.
|
3
|
+
"""
|
4
|
+
import base64
|
5
|
+
import io
|
6
|
+
import json
|
7
|
+
import os
|
8
|
+
import importlib.util
|
9
|
+
import importlib.resources
|
10
|
+
import webbrowser
|
11
|
+
from typing import Dict, List, Any, Optional, Union, Tuple
|
12
|
+
|
13
|
+
from PIL import Image
|
14
|
+
|
15
|
+
# Assuming Page type hint is available or define a placeholder
|
16
|
+
try:
|
17
|
+
from natural_pdf.core.page import Page
|
18
|
+
except ImportError:
|
19
|
+
Page = Any # Placeholder
|
20
|
+
|
21
|
+
def _get_page_image_base64(page: "Page", scale: float = 2.0) -> str:
    """Render *page* to a PNG and return it as a base64 data URI.

    Args:
        page: The Page to render (must expose ``to_image`` and ``number``).
        scale: Render scale factor. Defaults to 2.0 to stay consistent with
            the scale assumed by the HTML/JS debug-viewer rendering logic.

    Returns:
        A ``data:image/png;base64,...`` URI string suitable for embedding
        directly in an ``<img>`` tag.

    Raises:
        ValueError: If the page could not be rendered to an image.
    """
    # Render a clean page image (no highlights) as the base background.
    img = page.to_image(scale=scale, include_highlights=False)
    if img is None:
        raise ValueError(f"Failed to render image for page {page.number}")

    # Encode the PNG bytes as a base64 data URI.
    buffered = io.BytesIO()
    img.save(buffered, format="PNG")
    return f"data:image/png;base64,{base64.b64encode(buffered.getvalue()).decode('utf-8')}"
|
@@ -0,0 +1,29 @@
|
|
1
|
+
"""
|
2
|
+
Utilities for generating consistent identifiers.
|
3
|
+
"""
|
4
|
+
import hashlib
|
5
|
+
import base64
|
6
|
+
import os
|
7
|
+
|
8
|
+
def generate_short_path_hash(path_str: str, length: int = 8) -> str:
    """
    Derive a short, filesystem-safe identifier from a path string.

    The path is normalized to its absolute form first so the same file
    always maps to the same ID, hashed with SHA-256 for collision
    resistance, then rendered as URL-safe Base64 with padding stripped.

    Args:
        path_str: The path string to identify.
        length: Number of characters to keep from the encoded hash
            (default: 8).

    Returns:
        The first ``length`` characters of the URL-safe Base64 digest.

    Raises:
        ValueError: If ``length`` is outside the valid range.
    """
    absolute = os.path.abspath(path_str)
    digest = hashlib.sha256(absolute.encode('utf-8')).digest()
    encoded = base64.urlsafe_b64encode(digest).decode('ascii').rstrip('=')
    if length <= 0 or length > len(encoded):
        raise ValueError(f"Invalid length specified: {length}. Must be between 1 and {len(encoded)}.")
    return encoded[:length]
|
@@ -0,0 +1,418 @@
|
|
1
|
+
"""
|
2
|
+
Utilities for packaging data for external processes, like correction tasks.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import base64
|
7
|
+
import io
|
8
|
+
import json
|
9
|
+
import zipfile
|
10
|
+
import tempfile
|
11
|
+
import logging
|
12
|
+
import shutil
|
13
|
+
from typing import Any, List, Union, Iterable, TYPE_CHECKING, Dict
|
14
|
+
from tqdm import tqdm
|
15
|
+
from natural_pdf.elements.text import TextElement
|
16
|
+
|
17
|
+
# Import the specific PDF/Page types if possible, otherwise use Any
|
18
|
+
if TYPE_CHECKING:
|
19
|
+
from natural_pdf.core.pdf import PDF
|
20
|
+
from natural_pdf.core.page import Page
|
21
|
+
from natural_pdf.collections.pdf_collection import PDFCollection
|
22
|
+
else:
|
23
|
+
PDF = Any
|
24
|
+
Page = Any
|
25
|
+
PDFCollection = Any
|
26
|
+
|
27
|
+
from natural_pdf.utils.identifiers import generate_short_path_hash
|
28
|
+
|
29
|
+
logger = logging.getLogger(__name__)
|
30
|
+
|
31
|
+
def create_correction_task_package(
    source: Union['PDF', 'PDFCollection', List['PDF']],
    output_zip_path: str,
    overwrite: bool = False,
    suggest = None,
    resolution: int = 150,
) -> None:
    """
    Creates a zip package containing data for an OCR correction task.

    The package includes:
    - manifest.json: Metadata about pages and OCR regions (bboxes scaled
      to the rendered image resolution).
    - images/ directory: Rendered full-page images.
    - The SPA shell (index.html, css/, js/, ...) used to review the OCR.

    Args:
        source: The PDF object, PDFCollection, or list of PDF objects to process.
        output_zip_path: The full path where the output zip file should be saved.
        overwrite: If True, overwrite the output zip file if it already exists.
        suggest: Optional callable taking (region, confidence) and returning a
            suggested corrected text for that OCR region.
        resolution: DPI used to render page images (PDF space is 72 DPI).

    Raises:
        FileExistsError: If the output zip file exists and overwrite is False.
        TypeError: If the source type is invalid.
        ValueError: If no valid pages with OCR data are found in the source.
        RuntimeError: If the SPA template files cannot be packaged.
    """
    if os.path.exists(output_zip_path) and not overwrite:
        raise FileExistsError(f"Output file already exists: {output_zip_path}. Set overwrite=True to replace it.")

    # --- Resolve source to a list of PDF objects ---
    # Class-name checks avoid importing PDF/PDFCollection at runtime (circular imports).
    pdfs_to_process: List['PDF'] = []
    if hasattr(source, '__class__') and source.__class__.__name__ == 'PDF':
        pdfs_to_process = [source]
    elif hasattr(source, '__class__') and source.__class__.__name__ == 'PDFCollection':
        pdfs_to_process = source.pdfs  # Assuming PDFCollection has a .pdfs property
    elif isinstance(source, list) and all(hasattr(p, '__class__') and p.__class__.__name__ == 'PDF' for p in source):
        pdfs_to_process = source
    else:
        raise TypeError(f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF].")

    if not pdfs_to_process:
        logger.warning("No PDF documents provided in the source.")
        return

    manifest_data = {
        "pdfs": [],  # Reserved for future PDF-level metadata
        "pages": []
    }
    total_regions_found = 0

    # Stage everything in a temporary directory, then zip it at the end.
    with tempfile.TemporaryDirectory() as temp_dir:
        images_dir = os.path.join(temp_dir, "images")
        os.makedirs(images_dir)
        logger.info(f"Using temporary directory for staging: {temp_dir}")

        # --- Process each PDF ---
        for pdf in pdfs_to_process:
            if not hasattr(pdf, 'path') or not hasattr(pdf, 'pages'):
                logger.warning(f"Skipping invalid PDF object: {pdf}")
                continue

            pdf_path = pdf.path  # Should be the resolved, absolute path
            pdf_short_id = generate_short_path_hash(pdf_path)
            logger.debug(f"Processing PDF: {pdf_path} (ID: {pdf_short_id})")

            pdf_has_ocr_regions = False
            for page in pdf.pages:
                # Duck-type check so a malformed page is skipped instead of crashing.
                if not hasattr(page, 'index') or not hasattr(page, 'number') or \
                   not hasattr(page, 'width') or not hasattr(page, 'height') or \
                   not hasattr(page, 'find_all') or not hasattr(page, 'to_image'):
                    logger.warning(f"Skipping invalid Page object in {pdf_path}: page index {getattr(page, 'index', 'unknown')}")
                    continue

                # 1. Extract OCR elements for this page.
                try:
                    # apply_exclusions=False ensures we capture *all* OCR data,
                    # regardless of exclusions configured on the PDF/page.
                    ocr_elements = page.find_all('text[source=ocr]', apply_exclusions=False).elements
                except Exception as e:
                    logger.error(f"Failed to extract OCR elements for {pdf_path} page {page.number}: {e}", exc_info=True)
                    continue  # Skip this page if element extraction fails

                if not ocr_elements:
                    logger.debug(f"No OCR elements found for {pdf_path} page {page.number}. Skipping page in manifest.")
                    continue  # Skip page if no OCR elements

                pdf_has_ocr_regions = True  # Mark that this PDF is relevant
                logger.debug(f"  Found {len(ocr_elements)} OCR elements on page {page.number}")
                total_regions_found += len(ocr_elements)

                # 2. Render and save the full-page image.
                image_filename = f"{pdf_short_id}_page_{page.index}.png"
                image_save_path = os.path.join(images_dir, image_filename)
                try:
                    img = page.to_image(resolution=resolution, include_highlights=False)
                    if img is None:
                        raise ValueError("page.to_image returned None")
                    img.save(image_save_path, "PNG")
                except Exception as e:
                    logger.error(f"Failed to render/save image for {pdf_path} page {page.number}: {e}", exc_info=True)
                    # Without an image the page cannot be reviewed in the SPA.
                    pdf_has_ocr_regions = False
                    continue

                # 3. Prepare region data, scaling PDF points (72 DPI) to image pixels.
                page_regions_data = []
                coord_scale_factor = resolution / 72.0

                for i, elem in enumerate(tqdm(ocr_elements)):
                    # Basic check for necessary attributes.
                    if not all(hasattr(elem, attr) for attr in ['x0', 'top', 'x1', 'bottom', 'text']):
                        logger.warning(f"Skipping invalid OCR element {i} on {pdf_path} page {page.number}")
                        continue
                    region_id = f"r_{page.index}_{i}"  # ID unique within page

                    # Scale coordinates to match the rendered image resolution.
                    scaled_bbox = [
                        elem.x0 * coord_scale_factor,
                        elem.top * coord_scale_factor,
                        elem.x1 * coord_scale_factor,
                        elem.bottom * coord_scale_factor
                    ]

                    corrected = elem.text
                    if suggest:
                        corrected = suggest(elem.to_region(), getattr(elem, 'confidence', None))

                    page_regions_data.append({
                        "resolution": resolution,
                        "id": region_id,
                        "bbox": scaled_bbox,
                        "ocr_text": elem.text,
                        "confidence": getattr(elem, 'confidence', None),  # Include confidence if available
                        "corrected_text": corrected,
                        "modified": False
                    })

                # 4. Add page data to manifest if it has regions.
                if page_regions_data:
                    manifest_data["pages"].append({
                        "pdf_source": pdf_path,
                        "pdf_short_id": pdf_short_id,
                        "page_number": page.number,
                        "page_index": page.index,
                        "image_path": f"images/{image_filename}",  # Relative path within zip
                        "width": page.width,
                        "height": page.height,
                        "regions": page_regions_data
                    })
                else:
                    # If, after checks, no valid regions remain, ensure flag is correct.
                    pdf_has_ocr_regions = False

        # --- Final Checks and Zipping ---
        if not manifest_data["pages"] or total_regions_found == 0:
            logger.error("No pages with valid OCR regions and successfully rendered images found in the source PDFs. Cannot create task package.")
            raise ValueError("No valid pages with OCR data found to create a task package.")

        manifest_path = os.path.join(temp_dir, "manifest.json")
        try:
            with open(manifest_path, 'w', encoding='utf-8') as f_manifest:
                json.dump(manifest_data, f_manifest, indent=2)
        except Exception as e:
            logger.error(f"Failed to write manifest.json: {e}", exc_info=True)
            raise  # Re-raise error, cannot proceed

        # --- Copy SPA shell into the staging dir ---
        try:
            # Locate the spa template directory relative to this file;
            # assumes the templates are installed alongside this module.
            utils_dir = os.path.dirname(os.path.abspath(__file__))
            templates_dir = os.path.join(os.path.dirname(utils_dir), 'templates')  # Go up one level from utils
            spa_template_dir = os.path.join(templates_dir, 'spa')

            if not os.path.isdir(spa_template_dir):
                raise FileNotFoundError(f"SPA template directory not found at {spa_template_dir}")

            logger.info(f"Copying SPA shell from: {spa_template_dir}")
            # dirs_exist_ok=True merges into temp_dir even though it already
            # contains images/ and manifest.json (Python 3.8+).
            shutil.copytree(spa_template_dir, temp_dir, dirs_exist_ok=True)

        except Exception as e:
            logger.error(f"Failed to copy SPA template files: {e}", exc_info=True)
            raise RuntimeError("Could not package SPA files.") from e

        # --- Create the final zip file ---
        try:
            logger.info(f"Creating zip package at: {output_zip_path}")
            with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
                # Add manifest.json first.
                zipf.write(manifest_path, arcname="manifest.json")
                # BUGFIX: walk the whole staging dir (not just images/) so the
                # copied SPA shell files are actually included in the package.
                for root, _, files in os.walk(temp_dir):
                    for file in files:
                        full_path = os.path.join(root, file)
                        arcname = os.path.relpath(full_path, temp_dir)
                        if arcname == "manifest.json":
                            continue  # Already added above
                        zipf.write(full_path, arcname=arcname)
            logger.info(f"Successfully created correction task package: {output_zip_path} ({total_regions_found} regions total)")

        except Exception as e:
            logger.error(f"Failed to create zip file {output_zip_path}: {e}", exc_info=True)
            # Best-effort cleanup of a partially written zip.
            if os.path.exists(output_zip_path):
                try:
                    os.remove(output_zip_path)
                except OSError:
                    pass
            raise  # Re-raise error

    # Temporary directory is automatically cleaned up by context manager
|
248
|
+
|
249
|
+
def import_ocr_from_manifest(
    pdf: 'PDF',
    manifest_path: str
) -> Dict[str, int]:
    """
    Imports OCR data into a PDF object from a manifest file.

    Reads a manifest.json file (typically generated by create_correction_task_package
    and potentially modified externally) and populates the corresponding pages
    of the PDF object with new TextElement objects based on the manifest data.
    It uses the 'corrected_text' field and bounding box from the manifest.

    This function assumes you want to replace or provide the primary OCR data
    from the manifest, rather than correcting existing elements.
    Existing OCR elements on the pages are NOT automatically cleared.

    Manifest bboxes are stored in image pixels at the recorded "resolution";
    they are converted back to PDF points (72 DPI) before element creation.

    Args:
        pdf: The natural_pdf.core.pdf.PDF object to populate with OCR data.
        manifest_path: Path to the manifest.json file.

    Returns:
        A dictionary containing counts of imported and skipped regions:
        {'imported': count, 'skipped': count}

    Raises:
        FileNotFoundError: If the manifest_path does not exist.
        ValueError: If the manifest is invalid or contains data for a different PDF.
        TypeError: If the input pdf object is not a valid PDF instance.
    """
    # Class-name check avoids a direct PDF import (circular-import safe).
    if not (hasattr(pdf, '__class__') and pdf.__class__.__name__ == 'PDF'):
        raise TypeError(f"Input must be a natural_pdf PDF object, got {type(pdf)}")

    if not os.path.exists(manifest_path):
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    logger.info(f"Importing OCR data into PDF '{pdf.path}' from manifest '{manifest_path}'")

    try:
        with open(manifest_path, 'r', encoding='utf-8') as f:
            manifest_data = json.load(f)
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse manifest file: {e}")
        raise ValueError(f"Invalid JSON in manifest file: {manifest_path}") from e
    except Exception as e:
        logger.error(f"Failed to read manifest file: {e}")
        raise

    imported_count = 0   # Regions successfully turned into TextElements and added
    skipped_count = 0    # Regions dropped for any reason (mismatch, bad data, errors)
    processed_pages = 0  # Manifest pages matched to a page in this PDF

    manifest_pages = manifest_data.get("pages", [])
    if not manifest_pages:
        logger.warning("Manifest contains no page data.")
        return {'imported': 0, 'skipped': 0}

    # --- Pre-check PDF source consistency ---
    first_manifest_pdf_path = manifest_pages[0].get("pdf_source")
    if first_manifest_pdf_path != pdf.path:
        # Allow matching based on just the filename if paths differ (e.g., absolute vs relative)
        if os.path.basename(first_manifest_pdf_path) != os.path.basename(pdf.path):
            logger.error(f"Manifest PDF source ('{first_manifest_pdf_path}') does not match target PDF path ('{pdf.path}'). Aborting.")
            raise ValueError("Manifest source PDF does not match the provided PDF object.")
        else:
            logger.warning(f"Manifest PDF source path ('{first_manifest_pdf_path}') differs from target PDF path ('{pdf.path}'), but filenames match. Proceeding cautiously.")


    # Index pages once for O(1) lookup by page_index.
    pdf_pages_by_index = {page.index: page for page in pdf.pages}

    for page_data in tqdm(manifest_pages, desc="Importing OCR Data"):
        page_index = page_data.get("page_index")
        manifest_pdf_path = page_data.get("pdf_source")

        # Per-page consistency check (cheap insurance on top of the pre-check).
        if manifest_pdf_path != pdf.path and os.path.basename(manifest_pdf_path) != os.path.basename(pdf.path):
            logger.warning(f"Skipping page index {page_index} due to PDF source mismatch ('{manifest_pdf_path}' vs '{pdf.path}')")
            skipped_count += len(page_data.get("regions", []))  # Count all regions as skipped
            continue

        if page_index is None:
            logger.warning(f"Skipping page entry with missing 'page_index': {page_data.get('page_number')}")
            skipped_count += len(page_data.get("regions", []))
            continue

        page = pdf_pages_by_index.get(page_index)
        if page is None:
            logger.warning(f"Could not find page with index {page_index} in the target PDF. Skipping.")
            skipped_count += len(page_data.get("regions", []))
            continue

        processed_pages += 1
        # We are adding elements, no need to fetch existing ones unless we want to prevent duplicates (not implemented here)

        regions_to_add = []
        for region_data in page_data.get("regions", []):
            # We import all regions, not just modified ones
            # if not region_data.get("modified", False):
            #     continue # Only process modified regions

            region_id = region_data.get("id", "unknown")
            manifest_bbox = region_data.get("bbox")
            # Use corrected_text as the primary text source for the new element
            text_to_import = region_data.get("corrected_text")
            # Fallback to ocr_text if corrected_text is missing (though unlikely from the SPA)
            if text_to_import is None:
                text_to_import = region_data.get("ocr_text")

            resolution = region_data.get("resolution")  # Mandatory from export
            confidence = region_data.get("confidence")  # Optional

            # Note: text_to_import may be "" (empty but present) and still pass.
            if not all([manifest_bbox, text_to_import is not None, resolution]):
                logger.warning(f"Skipping incomplete/invalid region data on page {page_index}, region id '{region_id}': Missing bbox, text, or resolution.")
                skipped_count += 1
                continue

            # Convert manifest bbox (image pixels) back to PDF coordinates (points @ 72 DPI)
            try:
                scale_factor = 72.0 / float(resolution)
                pdf_x0 = manifest_bbox[0] * scale_factor
                pdf_top = manifest_bbox[1] * scale_factor
                pdf_x1 = manifest_bbox[2] * scale_factor
                pdf_bottom = manifest_bbox[3] * scale_factor
            except (ValueError, TypeError, IndexError, ZeroDivisionError):
                logger.warning(f"Invalid bbox or resolution for region '{region_id}' on page {page_index}. Skipping.")
                skipped_count += 1
                continue

            # --- Create New Element ---
            try:
                new_element = TextElement(
                    text=text_to_import,
                    x0=pdf_x0,
                    top=pdf_top,
                    x1=pdf_x1,
                    bottom=pdf_bottom,
                    page=page,  # Reference to the parent Page object
                    source='manifest-import',  # Indicate origin
                    confidence=confidence,  # Pass confidence if available
                    # Keep the pre-correction text only when it actually differs.
                    metadata={'original_ocr': region_data.get("ocr_text")} if region_data.get("ocr_text") != text_to_import else {}
                )
                regions_to_add.append(new_element)
                imported_count += 1
            except Exception as e:
                logger.error(f"Error creating TextElement for region '{region_id}' on page {page_index}: {e}", exc_info=True)
                skipped_count += 1

        # --- Add Elements to Page ---
        # Add all created elements for this page in one go
        if regions_to_add:
            try:
                # Accessing _elements directly; use manager if a public add method exists
                if hasattr(page, '_elements') and hasattr(page._elements, 'elements') and isinstance(page._elements.elements, list):
                    page._elements.elements.extend(regions_to_add)
                    # TODO: Should potentially invalidate page element cache if exists
                else:
                    logger.error(f"Could not add elements to page {page.index}, page._elements structure unexpected.")
                    # Decrement count as they weren't actually added
                    imported_count -= len(regions_to_add)
                    skipped_count += len(regions_to_add)

            except Exception as e:
                logger.error(f"Error adding elements to page {page.index}: {e}", exc_info=True)
                # Decrement count as they weren't actually added
                imported_count -= len(regions_to_add)
                skipped_count += len(regions_to_add)


    logger.info(f"Import process finished. Imported: {imported_count}, Skipped: {skipped_count}. Processed {processed_pages} pages from manifest.")
    return {'imported': imported_count, 'skipped': skipped_count}
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.5
|
3
|
+
Version: 0.1.6
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -16,12 +16,7 @@ Requires-Dist: Pillow
|
|
16
16
|
Requires-Dist: colour
|
17
17
|
Requires-Dist: numpy
|
18
18
|
Requires-Dist: urllib3
|
19
|
-
Requires-Dist:
|
20
|
-
Requires-Dist: torchvision
|
21
|
-
Requires-Dist: transformers
|
22
|
-
Requires-Dist: huggingface_hub
|
23
|
-
Requires-Dist: ocrmypdf
|
24
|
-
Requires-Dist: pikepdf
|
19
|
+
Requires-Dist: tqdm
|
25
20
|
Provides-Extra: interactive
|
26
21
|
Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
|
27
22
|
Provides-Extra: haystack
|
@@ -29,16 +24,27 @@ Requires-Dist: haystack-ai; extra == "haystack"
|
|
29
24
|
Requires-Dist: chroma-haystack; extra == "haystack"
|
30
25
|
Requires-Dist: sentence-transformers; extra == "haystack"
|
31
26
|
Requires-Dist: protobuf<4; extra == "haystack"
|
27
|
+
Requires-Dist: natural-pdf[core-ml]; extra == "haystack"
|
32
28
|
Provides-Extra: easyocr
|
33
29
|
Requires-Dist: easyocr; extra == "easyocr"
|
30
|
+
Requires-Dist: natural-pdf[core-ml]; extra == "easyocr"
|
34
31
|
Provides-Extra: paddle
|
35
32
|
Requires-Dist: paddlepaddle; extra == "paddle"
|
36
33
|
Requires-Dist: paddleocr; extra == "paddle"
|
37
34
|
Provides-Extra: layout-yolo
|
38
35
|
Requires-Dist: doclayout_yolo; extra == "layout-yolo"
|
36
|
+
Requires-Dist: natural-pdf[core-ml]; extra == "layout-yolo"
|
39
37
|
Provides-Extra: surya
|
40
38
|
Requires-Dist: surya-ocr; extra == "surya"
|
39
|
+
Requires-Dist: natural-pdf[core-ml]; extra == "surya"
|
41
40
|
Provides-Extra: qa
|
41
|
+
Requires-Dist: natural-pdf[core-ml]; extra == "qa"
|
42
|
+
Provides-Extra: docling
|
43
|
+
Requires-Dist: docling; extra == "docling"
|
44
|
+
Requires-Dist: natural-pdf[core-ml]; extra == "docling"
|
45
|
+
Provides-Extra: llm
|
46
|
+
Requires-Dist: openai>=1.0; extra == "llm"
|
47
|
+
Requires-Dist: pydantic; extra == "llm"
|
42
48
|
Provides-Extra: test
|
43
49
|
Requires-Dist: pytest; extra == "test"
|
44
50
|
Provides-Extra: dev
|
@@ -50,18 +56,30 @@ Requires-Dist: nox; extra == "dev"
|
|
50
56
|
Requires-Dist: nox-uv; extra == "dev"
|
51
57
|
Requires-Dist: build; extra == "dev"
|
52
58
|
Requires-Dist: uv; extra == "dev"
|
59
|
+
Requires-Dist: pipdeptree; extra == "dev"
|
60
|
+
Requires-Dist: nbformat; extra == "dev"
|
61
|
+
Requires-Dist: jupytext; extra == "dev"
|
62
|
+
Requires-Dist: nbclient; extra == "dev"
|
53
63
|
Provides-Extra: all
|
54
|
-
Requires-Dist:
|
55
|
-
Requires-Dist:
|
56
|
-
Requires-Dist:
|
57
|
-
Requires-Dist:
|
58
|
-
Requires-Dist:
|
59
|
-
Requires-Dist: surya
|
60
|
-
Requires-Dist:
|
61
|
-
Requires-Dist:
|
62
|
-
Requires-Dist:
|
63
|
-
Requires-Dist:
|
64
|
-
Requires-Dist:
|
64
|
+
Requires-Dist: natural-pdf[interactive]; extra == "all"
|
65
|
+
Requires-Dist: natural-pdf[haystack]; extra == "all"
|
66
|
+
Requires-Dist: natural-pdf[easyocr]; extra == "all"
|
67
|
+
Requires-Dist: natural-pdf[paddle]; extra == "all"
|
68
|
+
Requires-Dist: natural-pdf[layout_yolo]; extra == "all"
|
69
|
+
Requires-Dist: natural-pdf[surya]; extra == "all"
|
70
|
+
Requires-Dist: natural-pdf[qa]; extra == "all"
|
71
|
+
Requires-Dist: natural-pdf[ocr-export]; extra == "all"
|
72
|
+
Requires-Dist: natural-pdf[docling]; extra == "all"
|
73
|
+
Requires-Dist: natural-pdf[llm]; extra == "all"
|
74
|
+
Requires-Dist: natural-pdf[test]; extra == "all"
|
75
|
+
Provides-Extra: core-ml
|
76
|
+
Requires-Dist: torch; extra == "core-ml"
|
77
|
+
Requires-Dist: torchvision; extra == "core-ml"
|
78
|
+
Requires-Dist: transformers; extra == "core-ml"
|
79
|
+
Requires-Dist: huggingface_hub; extra == "core-ml"
|
80
|
+
Provides-Extra: ocr-export
|
81
|
+
Requires-Dist: ocrmypdf; extra == "ocr-export"
|
82
|
+
Requires-Dist: pikepdf; extra == "ocr-export"
|
65
83
|
Dynamic: license-file
|
66
84
|
|
67
85
|
# Natural PDF
|
@@ -89,6 +107,10 @@ pip install natural-pdf[easyocr]
|
|
89
107
|
pip install natural-pdf[surya]
|
90
108
|
pip install natural-pdf[paddle]
|
91
109
|
|
110
|
+
# Example: Install support for features using Large Language Models (e.g., via OpenAI-compatible APIs)
|
111
|
+
pip install natural-pdf[llm]
|
112
|
+
# (May require setting API key environment variables, e.g., GOOGLE_API_KEY for Gemini)
|
113
|
+
|
92
114
|
# Example: Install with interactive viewer support
|
93
115
|
pip install natural-pdf[interactive]
|
94
116
|
|
@@ -141,7 +163,7 @@ Natural PDF offers a range of features for working with PDFs:
|
|
141
163
|
* **Spatial Navigation:** Select content relative to other elements (`heading.below()`, `element.select_until(...)`).
|
142
164
|
* **Text & Table Extraction:** Get clean text or structured table data, automatically handling exclusions.
|
143
165
|
* **OCR Integration:** Extract text from scanned documents using engines like EasyOCR, PaddleOCR, or Surya.
|
144
|
-
* **Layout Analysis:** Detect document structures (titles, paragraphs, tables) using
|
166
|
+
* **Layout Analysis:** Detect document structures (titles, paragraphs, tables) using various engines (e.g., YOLO, Paddle, LLM via API).
|
145
167
|
* **Document QA:** Ask natural language questions about your document's content.
|
146
168
|
* **Semantic Search:** Index PDFs and find relevant pages or documents based on semantic meaning using Haystack.
|
147
169
|
* **Visual Debugging:** Highlight elements and use an interactive viewer or save images to understand your selections.
|