natural-pdf 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,7 @@ from typing import (
20
20
  )
21
21
 
22
22
  from pdfplumber.utils.geometry import objects_to_bbox
23
+ from PIL import Image, ImageDraw, ImageFont
23
24
 
24
25
  # New Imports
25
26
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
@@ -37,10 +38,30 @@ from natural_pdf.ocr import OCROptions
37
38
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
38
39
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
39
40
 
41
+ # Potentially lazy imports for optional dependencies needed in save_pdf
42
+ try:
43
+ import pikepdf
44
+ except ImportError:
45
+ pikepdf = None
46
+
47
+ try:
48
+ from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
49
+ pass
50
+ except ImportError:
51
+ create_searchable_pdf = None
52
+
53
+ # ---> ADDED Import for the new exporter
54
+ try:
55
+ from natural_pdf.exporters.original_pdf import create_original_pdf
56
+ except ImportError:
57
+ create_original_pdf = None
58
+ # <--- END ADDED
59
+
40
60
  logger = logging.getLogger(__name__)
41
61
 
42
62
  if TYPE_CHECKING:
43
63
  from natural_pdf.core.page import Page
64
+ from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
44
65
  from natural_pdf.elements.region import Region
45
66
 
46
67
  T = TypeVar("T")
@@ -1239,7 +1260,7 @@ class ElementCollection(
1239
1260
  # --- Classification Method --- #
1240
1261
  def classify_all(
1241
1262
  self,
1242
- categories: List[str],
1263
+ labels: List[str],
1243
1264
  model: Optional[str] = None,
1244
1265
  using: Optional[str] = None,
1245
1266
  min_confidence: float = 0.0,
@@ -1253,7 +1274,7 @@ class ElementCollection(
1253
1274
  """Classifies all elements in the collection in batch.
1254
1275
 
1255
1276
  Args:
1256
- categories: List of category labels.
1277
+ labels: List of category labels.
1257
1278
  model: Model ID (or alias 'text', 'vision').
1258
1279
  using: Optional processing mode ('text' or 'vision'). Inferred if None.
1259
1280
  min_confidence: Minimum confidence threshold.
@@ -1326,7 +1347,7 @@ class ElementCollection(
1326
1347
  # Call manager's batch classify
1327
1348
  batch_results: List[ClassificationResult] = manager.classify_batch(
1328
1349
  item_contents=items_to_classify,
1329
- categories=categories,
1350
+ labels=labels,
1330
1351
  model_id=model,
1331
1352
  using=inferred_using,
1332
1353
  min_confidence=min_confidence,
@@ -2263,3 +2284,286 @@ class PageCollection(Generic[P], ApplyMixin):
2263
2284
  )
2264
2285
 
2265
2286
  # --- End Deskew Method --- #
2287
+
2288
def to_image(
    self,
    page_width: int = 300,
    cols: Optional[int] = 4,
    rows: Optional[int] = None,
    max_pages: Optional[int] = None,
    spacing: int = 10,
    add_labels: bool = True,
    show_category: bool = False,
) -> Optional["Image.Image"]:
    """
    Generate a grid of page images for this collection.

    Args:
        page_width: Width in pixels for rendering individual pages
        cols: Number of columns in grid (default: 4)
        rows: Number of rows in grid (calculated automatically if None)
        max_pages: Maximum number of pages to include (default: all)
        spacing: Spacing between page thumbnails in pixels
        add_labels: Whether to add page number labels
        show_category: Whether to add category and confidence labels (if available)

    Returns:
        PIL Image of the page grid, or None if Pillow is missing, the
        collection is empty, or no page could be rendered.
    """
    # Pillow is imported locally so a missing install degrades to a logged
    # error instead of an exception at call time.
    try:
        from PIL import Image, ImageDraw, ImageFont
    except ImportError:
        logger.error("Pillow library not found, required for to_image(). Install with 'pip install Pillow'")
        return None

    if not self.pages:
        logger.warning("Cannot generate image for empty PageCollection")
        return None

    # A falsy max_pages (None or 0) means "no limit".
    pages_to_render = self.pages[:max_pages] if max_pages else self.pages

    # Load the label font once, outside the page loop.
    font = None
    if add_labels:
        font_loaders = (
            lambda: ImageFont.truetype("DejaVuSans.ttf", 16),
            # The size argument is only accepted by newer Pillow releases;
            # older ones raise TypeError, so fall through to the bare call.
            lambda: ImageFont.load_default(16),
            ImageFont.load_default,
        )
        for load_font in font_loaders:
            try:
                font = load_font()
                break
            except (OSError, TypeError):
                continue
        if font is None:
            logger.warning("Default font not found. Labels cannot be added.")
            add_labels = False  # Disable if no font

    # Render individual page images
    page_images = []
    for page in pages_to_render:
        try:
            # Render with highlights for visual context
            img = page.to_image(width=page_width, include_highlights=True)
        except Exception as img_err:
            logger.error(f"Error generating image for page {page.number}: {img_err}", exc_info=True)
            continue
        if img is None:
            logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
            continue

        if add_labels and font is not None:
            draw = ImageDraw.Draw(img)

            # Tolerate pages without a parent PDF or with an unset path.
            pdf_path = getattr(getattr(page, "pdf", None), "path", None)
            pdf_name = Path(pdf_path).stem if pdf_path else ""
            label_text = f"p{page.number}"
            if pdf_name:
                label_text += f" - {pdf_name}"

            # Add category if requested and available
            if show_category:
                category = None
                confidence = None
                analyses = getattr(page, "analyses", None)
                if analyses and "classification" in analyses:
                    result = analyses["classification"]
                    # NOTE(review): assumes the stored result exposes
                    # 'label' / 'score' (as dict keys or attributes) -
                    # confirm against the classification manager's output.
                    if isinstance(result, dict):
                        category = result.get("label")
                        confidence = result.get("score")
                    else:
                        category = getattr(result, "label", None)
                        confidence = getattr(result, "score", None)

                if category is not None and confidence is not None:
                    try:
                        # A real newline so the category is drawn on its own line.
                        label_text += f"\n{category} ({confidence:.2f})"
                    except (TypeError, ValueError):
                        pass  # Non-numeric confidence: keep the page label only

            # Calculate bounding box for multi-line text and draw background/text
            try:
                bbox = draw.textbbox((5, 5), label_text, font=font, spacing=2)
                bg_rect = (
                    max(0, bbox[0] - 2),
                    max(0, bbox[1] - 2),
                    min(img.width, bbox[2] + 2),
                    min(img.height, bbox[3] + 2),
                )

                # Draw semi-transparent background via an RGBA overlay
                overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
                ImageDraw.Draw(overlay).rectangle(bg_rect, fill=(255, 255, 255, 180))
                img = Image.alpha_composite(img.convert("RGBA"), overlay).convert("RGB")
                draw = ImageDraw.Draw(img)  # img was replaced; rebind the drawer

                draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
            except AttributeError:
                # Fallback for older Pillow without textbbox: fixed-size
                # background, so alignment is only approximate.
                draw.rectangle((2, 2, 150, 40), fill=(255, 255, 255, 180))
                draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
            except Exception as draw_err:
                logger.error(f"Error drawing label on page {page.number}: {draw_err}", exc_info=True)

        page_images.append(img)

    if not page_images:
        logger.warning("No page images were successfully rendered for the grid.")
        return None

    # Calculate grid dimensions if not provided
    num_images = len(page_images)
    if not rows and not cols:
        cols = min(4, int(num_images**0.5) + 1)
        rows = (num_images + cols - 1) // cols
    elif rows and not cols:
        cols = (num_images + rows - 1) // rows
    elif cols and not rows:
        rows = (num_images + cols - 1) // cols
    cols = max(1, cols or 1)  # Ensure at least 1
    rows = max(1, rows or 1)

    # Uniform cell size so thumbnails of differing sizes still align
    max_width = max(img.width for img in page_images)
    max_height = max(img.height for img in page_images)

    # Create grid image on a light gray background
    grid_width = cols * max_width + (cols + 1) * spacing
    grid_height = rows * max_height + (rows + 1) * spacing
    grid_img = Image.new("RGB", (grid_width, grid_height), (220, 220, 220))

    # When the caller fixes both rows and cols, the grid may be too small.
    capacity = rows * cols
    if num_images > capacity:
        logger.warning(
            f"Grid of {rows}x{cols} holds only {capacity} of {num_images} pages; "
            "remaining pages are omitted."
        )

    # Place images in grid, row-major
    for i, img in enumerate(page_images[:capacity]):
        row, col = divmod(i, cols)
        x = col * max_width + (col + 1) * spacing
        y = row * max_height + (row + 1) * spacing
        grid_img.paste(img, (x, y))

    return grid_img
2450
+
2451
def save_pdf(
    self,
    output_path: Union[str, Path],
    ocr: bool = False,
    original: bool = False,
    dpi: int = 300,
):
    """
    Saves the pages in this collection to a new PDF file.

    Choose one saving mode:
    - `ocr=True`: Creates a new, image-based PDF using OCR results. This
      makes the text generated during the natural-pdf session searchable,
      but loses original vector content. Requires 'ocr-export' extras.
    - `original=True`: Extracts the original pages from the source PDF,
      preserving all vector content, fonts, and annotations. OCR results
      from the natural-pdf session are NOT included. Requires 'ocr-export' extras.

    Args:
        output_path: Path to save the new PDF file.
        ocr: If True, save as a searchable, image-based PDF using OCR data.
        original: If True, save the original, vector-based pages.
        dpi: Resolution (dots per inch) used only when ocr=True for
             rendering page images and aligning the text layer.

    Raises:
        ValueError: If the collection is empty, or if neither or both 'ocr'
                    and 'original' are True. The original-pages exporter
                    may raise further errors, which propagate unchanged.
        ImportError: If the exporter for the chosen mode could not be
                     imported (missing 'pikepdf' / 'Pillow').
        RuntimeError: If the searchable-PDF exporter fails (ocr=True).
    """
    if not self.pages:
        raise ValueError("Cannot save an empty PageCollection.")

    # Compare truthiness rather than XOR-ing raw values, so truthy
    # non-bool flags (e.g. 1 and 2) cannot slip past the check.
    if bool(ocr) == bool(original):
        raise ValueError("Exactly one of 'ocr' or 'original' must be True.")

    output_path_str = str(Path(output_path))

    if ocr:
        if create_searchable_pdf is None:
            raise ImportError(
                "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
                'Install with: pip install "natural-pdf[ocr-export]"'
            )

        def _has_vector_content(page) -> bool:
            # Any drawn shapes, or any char/word that did not come from OCR.
            if any(getattr(page, attr, None) for attr in ("rects", "lines", "curves")):
                return True
            return any(
                getattr(el, "source", None) != "ocr"
                for attr in ("chars", "words")
                for el in (getattr(page, attr, None) or ())
            )

        # Warn (do not fail) when vector content will be lost.
        if any(_has_vector_content(page) for page in self.pages):
            logger.warning(
                "Warning: Saving with ocr=True creates an image-based PDF. "
                "Original vector elements (rects, lines, non-OCR text/chars) "
                "on selected pages will not be preserved in the output file."
            )

        logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
        try:
            # The exporter takes this PageCollection itself as the source.
            create_searchable_pdf(self, output_path_str, dpi=dpi)
        except Exception as e:
            logger.error(f"Failed to create searchable PDF: {e}", exc_info=True)
            raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
        return

    # original=True
    if create_original_pdf is None:
        raise ImportError(
            "Saving with original=True requires 'pikepdf'. "
            'Install with: pip install "natural-pdf[ocr-export]"'
        )

    def _has_ocr_text(page) -> bool:
        # Prefer the selector API; fall back to scanning words when absent.
        if hasattr(page, "find_all"):
            return bool(page.find_all("text[source=ocr]"))
        if hasattr(page, "words"):
            return any(getattr(el, "source", None) == "ocr" for el in page.words)
        return False

    # Warn (do not fail) when session OCR text will be left out.
    if any(_has_ocr_text(page) for page in self.pages):
        logger.warning(
            "Warning: Saving with original=True preserves original page content. "
            "OCR text generated in this session will not be included in the saved file."
        )

    logger.info(f"Saving original pages PDF to: {output_path_str}")
    # Exporter errors propagate unchanged, keeping their original type and
    # traceback; no wrapping is applied in this mode.
    create_original_pdf(self, output_path_str)
@@ -55,6 +55,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
55
55
  bbox: Tuple[float, float, float, float],
56
56
  polygon: List[Tuple[float, float]] = None,
57
57
  parent=None,
58
+ label: Optional[str] = None,
58
59
  ):
59
60
  """
60
61
  Initialize a region.
@@ -74,11 +75,8 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
74
75
  self.start_element = None
75
76
  self.end_element = None
76
77
 
77
- # --- ADDED --- Metadata store for mixins
78
78
  self.metadata: Dict[str, Any] = {}
79
- # --- NEW --- Central registry for analysis results
80
79
  self.analyses: Dict[str, Any] = {}
81
- # --- END ADDED ---
82
80
 
83
81
  # Standard attributes for all elements
84
82
  self.object_type = "region" # For selector compatibility
@@ -91,6 +89,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
91
89
 
92
90
  # Region management attributes
93
91
  self.name = None
92
+ self.label = label
94
93
  self.source = None # Will be set by creation methods
95
94
 
96
95
  # Hierarchy support for nested document structure