natural-pdf: natural_pdf-0.1.10-py3-none-any.whl → natural_pdf-0.1.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +24 -40
- natural_pdf/core/page.py +17 -17
- natural_pdf/core/pdf.py +130 -12
- natural_pdf/elements/collections.py +229 -29
- natural_pdf/elements/region.py +2 -3
- natural_pdf/exporters/hocr.py +540 -0
- natural_pdf/exporters/hocr_font.py +142 -0
- natural_pdf/exporters/original_pdf.py +130 -0
- natural_pdf/exporters/searchable_pdf.py +3 -3
- natural_pdf/ocr/engine_surya.py +1 -1
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/METADATA +1 -2
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/RECORD +15 -12
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.11.dist-info}/top_level.txt +0 -0
natural_pdf/elements/collections.py
CHANGED
@@ -38,10 +38,30 @@ from natural_pdf.ocr import OCROptions
|
|
38
38
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
39
39
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
40
40
|
|
41
|
+
# Potentially lazy imports for optional dependencies needed in save_pdf
|
42
|
+
try:
|
43
|
+
import pikepdf
|
44
|
+
except ImportError:
|
45
|
+
pikepdf = None
|
46
|
+
|
47
|
+
try:
|
48
|
+
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
|
49
|
+
pass
|
50
|
+
except ImportError:
|
51
|
+
create_searchable_pdf = None
|
52
|
+
|
53
|
+
# ---> ADDED Import for the new exporter
|
54
|
+
try:
|
55
|
+
from natural_pdf.exporters.original_pdf import create_original_pdf
|
56
|
+
except ImportError:
|
57
|
+
create_original_pdf = None
|
58
|
+
# <--- END ADDED
|
59
|
+
|
41
60
|
logger = logging.getLogger(__name__)
|
42
61
|
|
43
62
|
if TYPE_CHECKING:
|
44
63
|
from natural_pdf.core.page import Page
|
64
|
+
from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
|
45
65
|
from natural_pdf.elements.region import Region
|
46
66
|
|
47
67
|
T = TypeVar("T")
|
@@ -2290,6 +2310,13 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2290
2310
|
Returns:
|
2291
2311
|
PIL Image of the page grid or None if no pages
|
2292
2312
|
"""
|
2313
|
+
# Ensure PIL is imported, handle potential ImportError if not done globally/lazily
|
2314
|
+
try:
|
2315
|
+
from PIL import Image, ImageDraw, ImageFont
|
2316
|
+
except ImportError:
|
2317
|
+
logger.error("Pillow library not found, required for to_image(). Install with 'pip install Pillow'")
|
2318
|
+
return None
|
2319
|
+
|
2293
2320
|
if not self.pages:
|
2294
2321
|
logger.warning("Cannot generate image for empty PageCollection")
|
2295
2322
|
return None
|
@@ -2298,64 +2325,117 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2298
2325
|
pages_to_render = self.pages[:max_pages] if max_pages else self.pages
|
2299
2326
|
|
2300
2327
|
# Load font once outside the loop
|
2301
|
-
font =
|
2328
|
+
font = None
|
2329
|
+
if add_labels:
|
2330
|
+
try:
|
2331
|
+
# Try loading a commonly available font first
|
2332
|
+
font = ImageFont.truetype("DejaVuSans.ttf", 16)
|
2333
|
+
except IOError:
|
2334
|
+
try:
|
2335
|
+
font = ImageFont.load_default(16)
|
2336
|
+
except IOError:
|
2337
|
+
logger.warning("Default font not found. Labels cannot be added.")
|
2338
|
+
add_labels = False # Disable if no font
|
2302
2339
|
|
2303
2340
|
# Render individual page images
|
2304
2341
|
page_images = []
|
2305
2342
|
for page in pages_to_render:
|
2306
|
-
|
2343
|
+
try:
|
2344
|
+
# Assume page.to_image returns a PIL Image or None
|
2345
|
+
img = page.to_image(width=page_width, include_highlights=True) # Render with highlights for visual context
|
2346
|
+
if img is None:
|
2347
|
+
logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
|
2348
|
+
continue
|
2349
|
+
except Exception as img_err:
|
2350
|
+
logger.error(f"Error generating image for page {page.number}: {img_err}", exc_info=True)
|
2351
|
+
continue
|
2352
|
+
|
2307
2353
|
|
2308
2354
|
# Add page number label
|
2309
|
-
if add_labels and font:
|
2355
|
+
if add_labels and font:
|
2310
2356
|
draw = ImageDraw.Draw(img)
|
2311
|
-
pdf_name = Path(page.pdf.path).stem if hasattr(page, "pdf") and page.pdf else ""
|
2312
|
-
label_text = f"p{page.number}
|
2357
|
+
pdf_name = Path(page.pdf.path).stem if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path") else ""
|
2358
|
+
label_text = f"p{page.number}"
|
2359
|
+
if pdf_name:
|
2360
|
+
label_text += f" - {pdf_name}"
|
2313
2361
|
|
2314
2362
|
# Add category if requested and available
|
2315
2363
|
if show_category:
|
2316
|
-
|
2317
|
-
|
2318
|
-
|
2319
|
-
|
2320
|
-
|
2364
|
+
# Placeholder logic - adjust based on how classification results are stored
|
2365
|
+
category = None
|
2366
|
+
confidence = None
|
2367
|
+
if hasattr(page, 'analyses') and page.analyses and 'classification' in page.analyses:
|
2368
|
+
result = page.analyses['classification']
|
2369
|
+
# Adapt based on actual structure of classification result
|
2370
|
+
category = getattr(result, 'label', None) or result.get('label', None) if isinstance(result, dict) else None
|
2371
|
+
confidence = getattr(result, 'score', None) or result.get('score', None) if isinstance(result, dict) else None
|
2321
2372
|
|
2322
|
-
|
2323
|
-
|
2324
|
-
|
2325
|
-
|
2326
|
-
|
2327
|
-
bg_rect = (bbox[0] - 2, bbox[1] - 2, bbox[2] + 2, bbox[3] + 2)
|
2373
|
+
if category is not None and confidence is not None:
|
2374
|
+
try:
|
2375
|
+
category_str = f"{category} ({confidence:.2f})" # Format confidence
|
2376
|
+
label_text += f"\\n{category_str}"
|
2377
|
+
except (TypeError, ValueError): pass # Ignore formatting errors
|
2328
2378
|
|
2329
|
-
# Draw white background rectangle
|
2330
|
-
draw.rectangle(bg_rect, fill=(255, 255, 255))
|
2331
2379
|
|
2332
|
-
#
|
2333
|
-
|
2380
|
+
# Calculate bounding box for multi-line text and draw background/text
|
2381
|
+
try:
|
2382
|
+
# Using textbbox for potentially better accuracy with specific fonts
|
2383
|
+
# Note: textbbox needs Pillow 8+
|
2384
|
+
bbox = draw.textbbox((5, 5), label_text, font=font, spacing=2) # Use textbbox if available
|
2385
|
+
bg_rect = (max(0, bbox[0] - 2), max(0, bbox[1] - 2),
|
2386
|
+
min(img.width, bbox[2] + 2), min(img.height, bbox[3] + 2))
|
2387
|
+
|
2388
|
+
# Draw semi-transparent background
|
2389
|
+
overlay = Image.new('RGBA', img.size, (255, 255, 255, 0))
|
2390
|
+
draw_overlay = ImageDraw.Draw(overlay)
|
2391
|
+
draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180)) # White with alpha
|
2392
|
+
img = Image.alpha_composite(img.convert('RGBA'), overlay).convert('RGB')
|
2393
|
+
draw = ImageDraw.Draw(img) # Recreate draw object
|
2394
|
+
|
2395
|
+
# Draw the potentially multi-line text
|
2396
|
+
draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
|
2397
|
+
except AttributeError: # Fallback for older Pillow without textbbox
|
2398
|
+
# Approximate size and draw
|
2399
|
+
# This might not be perfectly aligned
|
2400
|
+
draw.rectangle((2, 2, 150, 40), fill=(255, 255, 255, 180)) # Simple fixed background
|
2401
|
+
draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
|
2402
|
+
except Exception as draw_err:
|
2403
|
+
logger.error(f"Error drawing label on page {page.number}: {draw_err}", exc_info=True)
|
2334
2404
|
|
2335
2405
|
page_images.append(img)
|
2336
2406
|
|
2407
|
+
if not page_images:
|
2408
|
+
logger.warning("No page images were successfully rendered for the grid.")
|
2409
|
+
return None
|
2410
|
+
|
2411
|
+
|
2337
2412
|
# Calculate grid dimensions if not provided
|
2413
|
+
num_images = len(page_images)
|
2338
2414
|
if not rows and not cols:
|
2339
|
-
|
2340
|
-
|
2341
|
-
rows = (len(page_images) + cols - 1) // cols
|
2415
|
+
cols = min(4, int(num_images**0.5) + 1)
|
2416
|
+
rows = (num_images + cols - 1) // cols
|
2342
2417
|
elif rows and not cols:
|
2343
|
-
cols = (
|
2418
|
+
cols = (num_images + rows - 1) // rows
|
2344
2419
|
elif cols and not rows:
|
2345
|
-
rows = (
|
2420
|
+
rows = (num_images + cols - 1) // cols
|
2421
|
+
cols = max(1, cols if cols else 1) # Ensure at least 1
|
2422
|
+
rows = max(1, rows if rows else 1)
|
2423
|
+
|
2346
2424
|
|
2347
2425
|
# Get maximum dimensions for consistent grid cells
|
2348
|
-
max_width = max(img.width for img in page_images)
|
2349
|
-
max_height = max(img.height for img in page_images)
|
2426
|
+
max_width = max(img.width for img in page_images) if page_images else 1
|
2427
|
+
max_height = max(img.height for img in page_images) if page_images else 1
|
2428
|
+
|
2350
2429
|
|
2351
2430
|
# Create grid image
|
2352
2431
|
grid_width = cols * max_width + (cols + 1) * spacing
|
2353
2432
|
grid_height = rows * max_height + (rows + 1) * spacing
|
2354
|
-
grid_img = Image.new("RGB", (grid_width, grid_height), (
|
2433
|
+
grid_img = Image.new("RGB", (grid_width, grid_height), (220, 220, 220)) # Lighter gray background
|
2434
|
+
|
2355
2435
|
|
2356
2436
|
# Place images in grid
|
2357
2437
|
for i, img in enumerate(page_images):
|
2358
|
-
if i >= rows * cols:
|
2438
|
+
if i >= rows * cols: # Ensure we don't exceed grid capacity
|
2359
2439
|
break
|
2360
2440
|
|
2361
2441
|
row = i // cols
|
@@ -2367,3 +2447,123 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2367
2447
|
grid_img.paste(img, (x, y))
|
2368
2448
|
|
2369
2449
|
return grid_img
|
2450
|
+
|
2451
|
+
def save_pdf(
|
2452
|
+
self,
|
2453
|
+
output_path: Union[str, Path],
|
2454
|
+
ocr: bool = False,
|
2455
|
+
original: bool = False,
|
2456
|
+
dpi: int = 300,
|
2457
|
+
):
|
2458
|
+
"""
|
2459
|
+
Saves the pages in this collection to a new PDF file.
|
2460
|
+
|
2461
|
+
Choose one saving mode:
|
2462
|
+
- `ocr=True`: Creates a new, image-based PDF using OCR results. This
|
2463
|
+
makes the text generated during the natural-pdf session searchable,
|
2464
|
+
but loses original vector content. Requires 'ocr-export' extras.
|
2465
|
+
- `original=True`: Extracts the original pages from the source PDF,
|
2466
|
+
preserving all vector content, fonts, and annotations. OCR results
|
2467
|
+
from the natural-pdf session are NOT included. Requires 'ocr-export' extras.
|
2468
|
+
|
2469
|
+
Args:
|
2470
|
+
output_path: Path to save the new PDF file.
|
2471
|
+
ocr: If True, save as a searchable, image-based PDF using OCR data.
|
2472
|
+
original: If True, save the original, vector-based pages.
|
2473
|
+
dpi: Resolution (dots per inch) used only when ocr=True for
|
2474
|
+
rendering page images and aligning the text layer.
|
2475
|
+
|
2476
|
+
Raises:
|
2477
|
+
ValueError: If the collection is empty, if neither or both 'ocr'
|
2478
|
+
and 'original' are True, or if 'original=True' and
|
2479
|
+
pages originate from different PDFs.
|
2480
|
+
ImportError: If required libraries ('pikepdf', 'Pillow')
|
2481
|
+
are not installed for the chosen mode.
|
2482
|
+
RuntimeError: If an unexpected error occurs during saving.
|
2483
|
+
"""
|
2484
|
+
if not self.pages:
|
2485
|
+
raise ValueError("Cannot save an empty PageCollection.")
|
2486
|
+
|
2487
|
+
if not (ocr ^ original): # XOR: exactly one must be true
|
2488
|
+
raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
|
2489
|
+
|
2490
|
+
output_path_obj = Path(output_path)
|
2491
|
+
output_path_str = str(output_path_obj)
|
2492
|
+
|
2493
|
+
if ocr:
|
2494
|
+
if create_searchable_pdf is None:
|
2495
|
+
raise ImportError(
|
2496
|
+
"Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
|
2497
|
+
"Install with: pip install \\\"natural-pdf[ocr-export]\\\"" # Escaped quotes
|
2498
|
+
)
|
2499
|
+
|
2500
|
+
# Check for non-OCR vector elements (provide a warning)
|
2501
|
+
has_vector_elements = False
|
2502
|
+
for page in self.pages:
|
2503
|
+
# Simplified check for common vector types or non-OCR chars/words
|
2504
|
+
if (hasattr(page, 'rects') and page.rects or
|
2505
|
+
hasattr(page, 'lines') and page.lines or
|
2506
|
+
hasattr(page, 'curves') and page.curves or
|
2507
|
+
(hasattr(page, 'chars') and any(getattr(el, 'source', None) != 'ocr' for el in page.chars)) or
|
2508
|
+
(hasattr(page, 'words') and any(getattr(el, 'source', None) != 'ocr' for el in page.words))):
|
2509
|
+
has_vector_elements = True
|
2510
|
+
break
|
2511
|
+
if has_vector_elements:
|
2512
|
+
logger.warning(
|
2513
|
+
"Warning: Saving with ocr=True creates an image-based PDF. "
|
2514
|
+
"Original vector elements (rects, lines, non-OCR text/chars) "
|
2515
|
+
"on selected pages will not be preserved in the output file."
|
2516
|
+
)
|
2517
|
+
|
2518
|
+
logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
|
2519
|
+
try:
|
2520
|
+
# Delegate to the searchable PDF exporter function
|
2521
|
+
# Pass `self` (the PageCollection instance) as the source
|
2522
|
+
create_searchable_pdf(self, output_path_str, dpi=dpi)
|
2523
|
+
# Success log is now inside create_searchable_pdf if needed, or keep here
|
2524
|
+
# logger.info(f"Successfully saved searchable PDF to: {output_path_str}")
|
2525
|
+
except Exception as e:
|
2526
|
+
logger.error(f"Failed to create searchable PDF: {e}", exc_info=True)
|
2527
|
+
# Re-raise as RuntimeError for consistency, potentially handled in exporter too
|
2528
|
+
raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
|
2529
|
+
|
2530
|
+
elif original:
|
2531
|
+
# ---> MODIFIED: Call the new exporter
|
2532
|
+
if create_original_pdf is None:
|
2533
|
+
raise ImportError(
|
2534
|
+
"Saving with original=True requires 'pikepdf'. "
|
2535
|
+
"Install with: pip install \\\"natural-pdf[ocr-export]\\\"" # Escaped quotes
|
2536
|
+
)
|
2537
|
+
|
2538
|
+
# Check for OCR elements (provide a warning) - keep this check here
|
2539
|
+
has_ocr_elements = False
|
2540
|
+
for page in self.pages:
|
2541
|
+
# Use find_all which returns a collection; check if it's non-empty
|
2542
|
+
if hasattr(page, 'find_all'):
|
2543
|
+
ocr_text_elements = page.find_all("text[source=ocr]")
|
2544
|
+
if ocr_text_elements: # Check truthiness of collection
|
2545
|
+
has_ocr_elements = True
|
2546
|
+
break
|
2547
|
+
elif hasattr(page, 'words'): # Fallback check if find_all isn't present?
|
2548
|
+
if any(getattr(el, 'source', None) == 'ocr' for el in page.words):
|
2549
|
+
has_ocr_elements = True
|
2550
|
+
break
|
2551
|
+
|
2552
|
+
if has_ocr_elements:
|
2553
|
+
logger.warning(
|
2554
|
+
"Warning: Saving with original=True preserves original page content. "
|
2555
|
+
"OCR text generated in this session will not be included in the saved file."
|
2556
|
+
)
|
2557
|
+
|
2558
|
+
logger.info(f"Saving original pages PDF to: {output_path_str}")
|
2559
|
+
try:
|
2560
|
+
# Delegate to the original PDF exporter function
|
2561
|
+
# Pass `self` (the PageCollection instance) as the source
|
2562
|
+
create_original_pdf(self, output_path_str)
|
2563
|
+
# Success log is now inside create_original_pdf
|
2564
|
+
# logger.info(f"Successfully saved original pages PDF to: {output_path_str}")
|
2565
|
+
except Exception as e:
|
2566
|
+
# Error logging is handled within create_original_pdf
|
2567
|
+
# Re-raise the exception caught from the exporter
|
2568
|
+
raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
|
2569
|
+
# <--- END MODIFIED
|
natural_pdf/elements/region.py
CHANGED
@@ -55,6 +55,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
55
55
|
bbox: Tuple[float, float, float, float],
|
56
56
|
polygon: List[Tuple[float, float]] = None,
|
57
57
|
parent=None,
|
58
|
+
label: Optional[str] = None,
|
58
59
|
):
|
59
60
|
"""
|
60
61
|
Initialize a region.
|
@@ -74,11 +75,8 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
74
75
|
self.start_element = None
|
75
76
|
self.end_element = None
|
76
77
|
|
77
|
-
# --- ADDED --- Metadata store for mixins
|
78
78
|
self.metadata: Dict[str, Any] = {}
|
79
|
-
# --- NEW --- Central registry for analysis results
|
80
79
|
self.analyses: Dict[str, Any] = {}
|
81
|
-
# --- END ADDED ---
|
82
80
|
|
83
81
|
# Standard attributes for all elements
|
84
82
|
self.object_type = "region" # For selector compatibility
|
@@ -91,6 +89,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
91
89
|
|
92
90
|
# Region management attributes
|
93
91
|
self.name = None
|
92
|
+
self.label = label
|
94
93
|
self.source = None # Will be set by creation methods
|
95
94
|
|
96
95
|
# Hierarchy support for nested document structure
|