natural-pdf 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. natural_pdf/__init__.py +7 -2
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +4 -2
  8. natural_pdf/core/pdf.py +53 -38
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +203 -59
  11. natural_pdf/elements/region.py +43 -11
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +40 -61
  16. natural_pdf/exporters/hocr_font.py +7 -13
  17. natural_pdf/exporters/original_pdf.py +10 -13
  18. natural_pdf/exporters/searchable_pdf.py +0 -10
  19. natural_pdf/search/__init__.py +65 -52
  20. natural_pdf/search/lancedb_search_service.py +325 -0
  21. natural_pdf/search/numpy_search_service.py +255 -0
  22. natural_pdf/search/searchable_mixin.py +25 -71
  23. natural_pdf/widgets/viewer.py +22 -31
  24. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -49
  25. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +28 -25
  26. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  27. natural_pdf/search/haystack_search_service.py +0 -687
  28. natural_pdf/search/haystack_utils.py +0 -474
  29. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  30. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
@@ -20,10 +20,10 @@ from typing import (
20
20
  )
21
21
 
22
22
  from pdfplumber.utils.geometry import objects_to_bbox
23
- from PIL import Image, ImageDraw, ImageFont
24
23
 
25
24
  # New Imports
26
25
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
26
+ from PIL import Image, ImageDraw, ImageFont
27
27
  from tqdm.auto import tqdm
28
28
 
29
29
  from natural_pdf.classification.manager import ClassificationManager
@@ -46,6 +46,7 @@ except ImportError:
46
46
 
47
47
  try:
48
48
  from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
49
+
49
50
  pass
50
51
  except ImportError:
51
52
  create_searchable_pdf = None
@@ -61,7 +62,7 @@ logger = logging.getLogger(__name__)
61
62
 
62
63
  if TYPE_CHECKING:
63
64
  from natural_pdf.core.page import Page
64
- from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
65
+ from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
65
66
  from natural_pdf.elements.region import Region
66
67
 
67
68
  T = TypeVar("T")
@@ -840,6 +841,7 @@ class ElementCollection(
840
841
  labels: bool = True, # Use 'labels' consistent with service
841
842
  legend_position: str = "right",
842
843
  render_ocr: bool = False,
844
+ width: Optional[int] = None, # Add width parameter
843
845
  ) -> Optional["Image.Image"]:
844
846
  """
845
847
  Generates a temporary preview image highlighting elements in this collection
@@ -862,6 +864,7 @@ class ElementCollection(
862
864
  labels: Whether to include a legend for the temporary highlights.
863
865
  legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
864
866
  render_ocr: Whether to render OCR text.
867
+ width: Optional width for the output image in pixels.
865
868
 
866
869
  Returns:
867
870
  PIL Image object of the temporary preview, or None if rendering fails or
@@ -922,6 +925,7 @@ class ElementCollection(
922
925
  page_index=page.index,
923
926
  temporary_highlights=highlight_data_list,
924
927
  scale=scale,
928
+ width=width, # Pass the width parameter
925
929
  labels=labels, # Use 'labels'
926
930
  legend_position=legend_position,
927
931
  render_ocr=render_ocr,
@@ -1159,10 +1163,96 @@ class ElementCollection(
1159
1163
 
1160
1164
  Args:
1161
1165
  selector: CSS-like selector string
1166
+ contains: How to determine if elements are inside: 'all' (fully inside),
1167
+ 'any' (any overlap), or 'center' (center point inside).
1168
+ (default: "all")
1162
1169
  apply_exclusions: Whether to exclude elements in exclusion regions
1163
1170
  """
1164
1171
  return self.apply(lambda element: element.find(selector, **kwargs))
1165
1172
 
1173
+ @overload
1174
+ def find_all(
1175
+ self,
1176
+ *,
1177
+ text: str,
1178
+ contains: str = "all",
1179
+ apply_exclusions: bool = True,
1180
+ regex: bool = False,
1181
+ case: bool = True,
1182
+ **kwargs,
1183
+ ) -> "ElementCollection": ...
1184
+
1185
+ @overload
1186
+ def find_all(
1187
+ self,
1188
+ selector: str,
1189
+ *,
1190
+ contains: str = "all",
1191
+ apply_exclusions: bool = True,
1192
+ regex: bool = False,
1193
+ case: bool = True,
1194
+ **kwargs,
1195
+ ) -> "ElementCollection": ...
1196
+
1197
+ def find_all(
1198
+ self,
1199
+ selector: Optional[str] = None,
1200
+ *,
1201
+ text: Optional[str] = None,
1202
+ contains: str = "all",
1203
+ apply_exclusions: bool = True,
1204
+ regex: bool = False,
1205
+ case: bool = True,
1206
+ **kwargs,
1207
+ ) -> "ElementCollection":
1208
+ """
1209
+ Find all elements within each element of this collection matching the selector OR text,
1210
+ and return a flattened collection of all found sub-elements.
1211
+
1212
+ Provide EITHER `selector` OR `text`, but not both.
1213
+
1214
+ Args:
1215
+ selector: CSS-like selector string.
1216
+ text: Text content to search for (equivalent to 'text:contains(...)').
1217
+ contains: How to determine if elements are inside: 'all' (fully inside),
1218
+ 'any' (any overlap), or 'center' (center point inside).
1219
+ (default: "all")
1220
+ apply_exclusions: Whether to apply exclusion regions (default: True).
1221
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1222
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
1223
+ **kwargs: Additional parameters for element filtering.
1224
+
1225
+ Returns:
1226
+ A new ElementCollection containing all matching sub-elements from all elements
1227
+ in this collection.
1228
+ """
1229
+ if selector is None and text is None:
1230
+ raise ValueError("Either 'selector' or 'text' must be provided to find_all.")
1231
+ if selector is not None and text is not None:
1232
+ raise ValueError("Provide either 'selector' or 'text' to find_all, not both.")
1233
+
1234
+ all_found_elements: List[Element] = []
1235
+ for element in self._elements:
1236
+ if hasattr(element, "find_all") and callable(element.find_all):
1237
+ # Element.find_all returns an ElementCollection
1238
+ found_in_element: "ElementCollection" = element.find_all(
1239
+ selector=selector,
1240
+ text=text,
1241
+ contains=contains,
1242
+ apply_exclusions=apply_exclusions,
1243
+ regex=regex,
1244
+ case=case,
1245
+ **kwargs,
1246
+ )
1247
+ if found_in_element and found_in_element.elements:
1248
+ all_found_elements.extend(found_in_element.elements)
1249
+ # else:
1250
+ # Elements in the collection are expected to support find_all.
1251
+ # If an element type doesn't, an AttributeError will naturally occur,
1252
+ # or a more specific check/handling could be added here if needed.
1253
+
1254
+ return ElementCollection(all_found_elements)
1255
+
1166
1256
  def extract_each_text(self, **kwargs) -> List[str]:
1167
1257
  """
1168
1258
  Extract text from each element in this region.
@@ -1633,6 +1723,7 @@ class PageCollection(Generic[P], ApplyMixin):
1633
1723
  self,
1634
1724
  *,
1635
1725
  text: str,
1726
+ contains: str = "all",
1636
1727
  apply_exclusions: bool = True,
1637
1728
  regex: bool = False,
1638
1729
  case: bool = True,
@@ -1644,6 +1735,7 @@ class PageCollection(Generic[P], ApplyMixin):
1644
1735
  self,
1645
1736
  selector: str,
1646
1737
  *,
1738
+ contains: str = "all",
1647
1739
  apply_exclusions: bool = True,
1648
1740
  regex: bool = False,
1649
1741
  case: bool = True,
@@ -1655,6 +1747,7 @@ class PageCollection(Generic[P], ApplyMixin):
1655
1747
  selector: Optional[str] = None,
1656
1748
  *,
1657
1749
  text: Optional[str] = None,
1750
+ contains: str = "all",
1658
1751
  apply_exclusions: bool = True,
1659
1752
  regex: bool = False,
1660
1753
  case: bool = True,
@@ -1668,6 +1761,9 @@ class PageCollection(Generic[P], ApplyMixin):
1668
1761
  Args:
1669
1762
  selector: CSS-like selector string.
1670
1763
  text: Text content to search for (equivalent to 'text:contains(...)').
1764
+ contains: How to determine if elements are inside: 'all' (fully inside),
1765
+ 'any' (any overlap), or 'center' (center point inside).
1766
+ (default: "all")
1671
1767
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1672
1768
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1673
1769
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1681,6 +1777,7 @@ class PageCollection(Generic[P], ApplyMixin):
1681
1777
  element = page.find(
1682
1778
  selector=selector,
1683
1779
  text=text,
1780
+ contains=contains,
1684
1781
  apply_exclusions=apply_exclusions,
1685
1782
  regex=regex,
1686
1783
  case=case,
@@ -1695,6 +1792,7 @@ class PageCollection(Generic[P], ApplyMixin):
1695
1792
  self,
1696
1793
  *,
1697
1794
  text: str,
1795
+ contains: str = "all",
1698
1796
  apply_exclusions: bool = True,
1699
1797
  regex: bool = False,
1700
1798
  case: bool = True,
@@ -1706,6 +1804,7 @@ class PageCollection(Generic[P], ApplyMixin):
1706
1804
  self,
1707
1805
  selector: str,
1708
1806
  *,
1807
+ contains: str = "all",
1709
1808
  apply_exclusions: bool = True,
1710
1809
  regex: bool = False,
1711
1810
  case: bool = True,
@@ -1717,6 +1816,7 @@ class PageCollection(Generic[P], ApplyMixin):
1717
1816
  selector: Optional[str] = None,
1718
1817
  *,
1719
1818
  text: Optional[str] = None,
1819
+ contains: str = "all",
1720
1820
  apply_exclusions: bool = True,
1721
1821
  regex: bool = False,
1722
1822
  case: bool = True,
@@ -1730,6 +1830,9 @@ class PageCollection(Generic[P], ApplyMixin):
1730
1830
  Args:
1731
1831
  selector: CSS-like selector string.
1732
1832
  text: Text content to search for (equivalent to 'text:contains(...)').
1833
+ contains: How to determine if elements are inside: 'all' (fully inside),
1834
+ 'any' (any overlap), or 'center' (center point inside).
1835
+ (default: "all")
1733
1836
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1734
1837
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1735
1838
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1744,6 +1847,7 @@ class PageCollection(Generic[P], ApplyMixin):
1744
1847
  elements = page.find_all(
1745
1848
  selector=selector,
1746
1849
  text=text,
1850
+ contains=contains,
1747
1851
  apply_exclusions=apply_exclusions,
1748
1852
  regex=regex,
1749
1853
  case=case,
@@ -2314,8 +2418,10 @@ class PageCollection(Generic[P], ApplyMixin):
2314
2418
  try:
2315
2419
  from PIL import Image, ImageDraw, ImageFont
2316
2420
  except ImportError:
2317
- logger.error("Pillow library not found, required for to_image(). Install with 'pip install Pillow'")
2318
- return None
2421
+ logger.error(
2422
+ "Pillow library not found, required for to_image(). Install with 'pip install Pillow'"
2423
+ )
2424
+ return None
2319
2425
 
2320
2426
  if not self.pages:
2321
2427
  logger.warning("Cannot generate image for empty PageCollection")
@@ -2334,27 +2440,34 @@ class PageCollection(Generic[P], ApplyMixin):
2334
2440
  try:
2335
2441
  font = ImageFont.load_default(16)
2336
2442
  except IOError:
2337
- logger.warning("Default font not found. Labels cannot be added.")
2338
- add_labels = False # Disable if no font
2443
+ logger.warning("Default font not found. Labels cannot be added.")
2444
+ add_labels = False # Disable if no font
2339
2445
 
2340
2446
  # Render individual page images
2341
2447
  page_images = []
2342
2448
  for page in pages_to_render:
2343
2449
  try:
2344
2450
  # Assume page.to_image returns a PIL Image or None
2345
- img = page.to_image(width=page_width, include_highlights=True) # Render with highlights for visual context
2451
+ img = page.to_image(
2452
+ width=page_width, include_highlights=True
2453
+ ) # Render with highlights for visual context
2346
2454
  if img is None:
2347
- logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
2348
- continue
2455
+ logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
2456
+ continue
2349
2457
  except Exception as img_err:
2350
- logger.error(f"Error generating image for page {page.number}: {img_err}", exc_info=True)
2351
- continue
2352
-
2458
+ logger.error(
2459
+ f"Error generating image for page {page.number}: {img_err}", exc_info=True
2460
+ )
2461
+ continue
2353
2462
 
2354
2463
  # Add page number label
2355
2464
  if add_labels and font:
2356
2465
  draw = ImageDraw.Draw(img)
2357
- pdf_name = Path(page.pdf.path).stem if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path") else ""
2466
+ pdf_name = (
2467
+ Path(page.pdf.path).stem
2468
+ if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path")
2469
+ else ""
2470
+ )
2358
2471
  label_text = f"p{page.number}"
2359
2472
  if pdf_name:
2360
2473
  label_text += f" - {pdf_name}"
@@ -2364,43 +2477,65 @@ class PageCollection(Generic[P], ApplyMixin):
2364
2477
  # Placeholder logic - adjust based on how classification results are stored
2365
2478
  category = None
2366
2479
  confidence = None
2367
- if hasattr(page, 'analyses') and page.analyses and 'classification' in page.analyses:
2368
- result = page.analyses['classification']
2480
+ if (
2481
+ hasattr(page, "analyses")
2482
+ and page.analyses
2483
+ and "classification" in page.analyses
2484
+ ):
2485
+ result = page.analyses["classification"]
2369
2486
  # Adapt based on actual structure of classification result
2370
- category = getattr(result, 'label', None) or result.get('label', None) if isinstance(result, dict) else None
2371
- confidence = getattr(result, 'score', None) or result.get('score', None) if isinstance(result, dict) else None
2487
+ category = (
2488
+ getattr(result, "label", None) or result.get("label", None)
2489
+ if isinstance(result, dict)
2490
+ else None
2491
+ )
2492
+ confidence = (
2493
+ getattr(result, "score", None) or result.get("score", None)
2494
+ if isinstance(result, dict)
2495
+ else None
2496
+ )
2372
2497
 
2373
2498
  if category is not None and confidence is not None:
2374
- try:
2375
- category_str = f"{category} ({confidence:.2f})" # Format confidence
2499
+ try:
2500
+ category_str = f"{category} ({confidence:.2f})" # Format confidence
2376
2501
  label_text += f"\\n{category_str}"
2377
- except (TypeError, ValueError): pass # Ignore formatting errors
2378
-
2502
+ except (TypeError, ValueError):
2503
+ pass # Ignore formatting errors
2379
2504
 
2380
2505
  # Calculate bounding box for multi-line text and draw background/text
2381
2506
  try:
2382
2507
  # Using textbbox for potentially better accuracy with specific fonts
2383
2508
  # Note: textbbox needs Pillow 8+
2384
- bbox = draw.textbbox((5, 5), label_text, font=font, spacing=2) # Use textbbox if available
2385
- bg_rect = (max(0, bbox[0] - 2), max(0, bbox[1] - 2),
2386
- min(img.width, bbox[2] + 2), min(img.height, bbox[3] + 2))
2509
+ bbox = draw.textbbox(
2510
+ (5, 5), label_text, font=font, spacing=2
2511
+ ) # Use textbbox if available
2512
+ bg_rect = (
2513
+ max(0, bbox[0] - 2),
2514
+ max(0, bbox[1] - 2),
2515
+ min(img.width, bbox[2] + 2),
2516
+ min(img.height, bbox[3] + 2),
2517
+ )
2387
2518
 
2388
2519
  # Draw semi-transparent background
2389
- overlay = Image.new('RGBA', img.size, (255, 255, 255, 0))
2520
+ overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
2390
2521
  draw_overlay = ImageDraw.Draw(overlay)
2391
- draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180)) # White with alpha
2392
- img = Image.alpha_composite(img.convert('RGBA'), overlay).convert('RGB')
2393
- draw = ImageDraw.Draw(img) # Recreate draw object
2522
+ draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180)) # White with alpha
2523
+ img = Image.alpha_composite(img.convert("RGBA"), overlay).convert("RGB")
2524
+ draw = ImageDraw.Draw(img) # Recreate draw object
2394
2525
 
2395
2526
  # Draw the potentially multi-line text
2396
2527
  draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
2397
- except AttributeError: # Fallback for older Pillow without textbbox
2528
+ except AttributeError: # Fallback for older Pillow without textbbox
2398
2529
  # Approximate size and draw
2399
2530
  # This might not be perfectly aligned
2400
- draw.rectangle((2, 2, 150, 40), fill=(255, 255, 255, 180)) # Simple fixed background
2401
- draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
2531
+ draw.rectangle(
2532
+ (2, 2, 150, 40), fill=(255, 255, 255, 180)
2533
+ ) # Simple fixed background
2534
+ draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
2402
2535
  except Exception as draw_err:
2403
- logger.error(f"Error drawing label on page {page.number}: {draw_err}", exc_info=True)
2536
+ logger.error(
2537
+ f"Error drawing label on page {page.number}: {draw_err}", exc_info=True
2538
+ )
2404
2539
 
2405
2540
  page_images.append(img)
2406
2541
 
@@ -2408,7 +2543,6 @@ class PageCollection(Generic[P], ApplyMixin):
2408
2543
  logger.warning("No page images were successfully rendered for the grid.")
2409
2544
  return None
2410
2545
 
2411
-
2412
2546
  # Calculate grid dimensions if not provided
2413
2547
  num_images = len(page_images)
2414
2548
  if not rows and not cols:
@@ -2418,24 +2552,23 @@ class PageCollection(Generic[P], ApplyMixin):
2418
2552
  cols = (num_images + rows - 1) // rows
2419
2553
  elif cols and not rows:
2420
2554
  rows = (num_images + cols - 1) // cols
2421
- cols = max(1, cols if cols else 1) # Ensure at least 1
2555
+ cols = max(1, cols if cols else 1) # Ensure at least 1
2422
2556
  rows = max(1, rows if rows else 1)
2423
2557
 
2424
-
2425
2558
  # Get maximum dimensions for consistent grid cells
2426
2559
  max_width = max(img.width for img in page_images) if page_images else 1
2427
2560
  max_height = max(img.height for img in page_images) if page_images else 1
2428
2561
 
2429
-
2430
2562
  # Create grid image
2431
2563
  grid_width = cols * max_width + (cols + 1) * spacing
2432
2564
  grid_height = rows * max_height + (rows + 1) * spacing
2433
- grid_img = Image.new("RGB", (grid_width, grid_height), (220, 220, 220)) # Lighter gray background
2434
-
2565
+ grid_img = Image.new(
2566
+ "RGB", (grid_width, grid_height), (220, 220, 220)
2567
+ ) # Lighter gray background
2435
2568
 
2436
2569
  # Place images in grid
2437
2570
  for i, img in enumerate(page_images):
2438
- if i >= rows * cols: # Ensure we don't exceed grid capacity
2571
+ if i >= rows * cols: # Ensure we don't exceed grid capacity
2439
2572
  break
2440
2573
 
2441
2574
  row = i // cols
@@ -2484,8 +2617,8 @@ class PageCollection(Generic[P], ApplyMixin):
2484
2617
  if not self.pages:
2485
2618
  raise ValueError("Cannot save an empty PageCollection.")
2486
2619
 
2487
- if not (ocr ^ original): # XOR: exactly one must be true
2488
- raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
2620
+ if not (ocr ^ original): # XOR: exactly one must be true
2621
+ raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
2489
2622
 
2490
2623
  output_path_obj = Path(output_path)
2491
2624
  output_path_str = str(output_path_obj)
@@ -2494,18 +2627,29 @@ class PageCollection(Generic[P], ApplyMixin):
2494
2627
  if create_searchable_pdf is None:
2495
2628
  raise ImportError(
2496
2629
  "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
2497
- "Install with: pip install \\\"natural-pdf[ocr-export]\\\"" # Escaped quotes
2630
+ 'Install with: pip install \\"natural-pdf[ocr-export]\\"' # Escaped quotes
2498
2631
  )
2499
2632
 
2500
2633
  # Check for non-OCR vector elements (provide a warning)
2501
2634
  has_vector_elements = False
2502
2635
  for page in self.pages:
2503
2636
  # Simplified check for common vector types or non-OCR chars/words
2504
- if (hasattr(page, 'rects') and page.rects or
2505
- hasattr(page, 'lines') and page.lines or
2506
- hasattr(page, 'curves') and page.curves or
2507
- (hasattr(page, 'chars') and any(getattr(el, 'source', None) != 'ocr' for el in page.chars)) or
2508
- (hasattr(page, 'words') and any(getattr(el, 'source', None) != 'ocr' for el in page.words))):
2637
+ if (
2638
+ hasattr(page, "rects")
2639
+ and page.rects
2640
+ or hasattr(page, "lines")
2641
+ and page.lines
2642
+ or hasattr(page, "curves")
2643
+ and page.curves
2644
+ or (
2645
+ hasattr(page, "chars")
2646
+ and any(getattr(el, "source", None) != "ocr" for el in page.chars)
2647
+ )
2648
+ or (
2649
+ hasattr(page, "words")
2650
+ and any(getattr(el, "source", None) != "ocr" for el in page.words)
2651
+ )
2652
+ ):
2509
2653
  has_vector_elements = True
2510
2654
  break
2511
2655
  if has_vector_elements:
@@ -2532,22 +2676,22 @@ class PageCollection(Generic[P], ApplyMixin):
2532
2676
  if create_original_pdf is None:
2533
2677
  raise ImportError(
2534
2678
  "Saving with original=True requires 'pikepdf'. "
2535
- "Install with: pip install \\\"natural-pdf[ocr-export]\\\"" # Escaped quotes
2679
+ 'Install with: pip install \\"natural-pdf[ocr-export]\\"' # Escaped quotes
2536
2680
  )
2537
2681
 
2538
2682
  # Check for OCR elements (provide a warning) - keep this check here
2539
2683
  has_ocr_elements = False
2540
2684
  for page in self.pages:
2541
- # Use find_all which returns a collection; check if it's non-empty
2542
- if hasattr(page, 'find_all'):
2543
- ocr_text_elements = page.find_all("text[source=ocr]")
2544
- if ocr_text_elements: # Check truthiness of collection
2545
- has_ocr_elements = True
2546
- break
2547
- elif hasattr(page, 'words'): # Fallback check if find_all isn't present?
2548
- if any(getattr(el, 'source', None) == 'ocr' for el in page.words):
2549
- has_ocr_elements = True
2550
- break
2685
+ # Use find_all which returns a collection; check if it's non-empty
2686
+ if hasattr(page, "find_all"):
2687
+ ocr_text_elements = page.find_all("text[source=ocr]")
2688
+ if ocr_text_elements: # Check truthiness of collection
2689
+ has_ocr_elements = True
2690
+ break
2691
+ elif hasattr(page, "words"): # Fallback check if find_all isn't present?
2692
+ if any(getattr(el, "source", None) == "ocr" for el in page.words):
2693
+ has_ocr_elements = True
2694
+ break
2551
2695
 
2552
2696
  if has_ocr_elements:
2553
2697
  logger.warning(
@@ -2565,5 +2709,5 @@ class PageCollection(Generic[P], ApplyMixin):
2565
2709
  except Exception as e:
2566
2710
  # Error logging is handled within create_original_pdf
2567
2711
  # Re-raise the exception caught from the exporter
2568
- raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
2712
+ raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
2569
2713
  # <--- END MODIFIED
@@ -772,6 +772,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
772
772
  # Add a default color for standalone show
773
773
  color: Optional[Union[Tuple, str]] = "blue",
774
774
  label: Optional[str] = None,
775
+ width: Optional[int] = None, # Add width parameter
775
776
  ) -> "Image.Image":
776
777
  """
777
778
  Show the page with just this region highlighted temporarily.
@@ -782,6 +783,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
782
783
  legend_position: Position of the legend
783
784
  color: Color to highlight this region (default: blue)
784
785
  label: Optional label for this region in the legend
786
+ width: Optional width for the output image in pixels
785
787
 
786
788
  Returns:
787
789
  PIL Image of the page with only this region highlighted
@@ -812,6 +814,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
812
814
  page_index=self._page.index,
813
815
  temporary_highlights=[temp_highlight_data],
814
816
  scale=scale,
817
+ width=width, # Pass the width parameter
815
818
  labels=labels,
816
819
  legend_position=legend_position,
817
820
  )
@@ -1333,6 +1336,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1333
1336
  self,
1334
1337
  *,
1335
1338
  text: str,
1339
+ contains: str = "all",
1336
1340
  apply_exclusions: bool = True,
1337
1341
  regex: bool = False,
1338
1342
  case: bool = True,
@@ -1344,6 +1348,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1344
1348
  self,
1345
1349
  selector: str,
1346
1350
  *,
1351
+ contains: str = "all",
1347
1352
  apply_exclusions: bool = True,
1348
1353
  regex: bool = False,
1349
1354
  case: bool = True,
@@ -1355,6 +1360,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1355
1360
  selector: Optional[str] = None, # Now optional
1356
1361
  *,
1357
1362
  text: Optional[str] = None, # New text parameter
1363
+ contains: str = "all", # New parameter for containment behavior
1358
1364
  apply_exclusions: bool = True,
1359
1365
  regex: bool = False,
1360
1366
  case: bool = True,
@@ -1368,6 +1374,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1368
1374
  Args:
1369
1375
  selector: CSS-like selector string.
1370
1376
  text: Text content to search for (equivalent to 'text:contains(...)').
1377
+ contains: How to determine if elements are inside: 'all' (fully inside),
1378
+ 'any' (any overlap), or 'center' (center point inside).
1379
+ (default: "all")
1371
1380
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1372
1381
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1373
1382
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1380,6 +1389,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1380
1389
  elements = self.find_all(
1381
1390
  selector=selector,
1382
1391
  text=text,
1392
+ contains=contains,
1383
1393
  apply_exclusions=apply_exclusions,
1384
1394
  regex=regex,
1385
1395
  case=case,
@@ -1392,6 +1402,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1392
1402
  self,
1393
1403
  *,
1394
1404
  text: str,
1405
+ contains: str = "all",
1395
1406
  apply_exclusions: bool = True,
1396
1407
  regex: bool = False,
1397
1408
  case: bool = True,
@@ -1403,6 +1414,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1403
1414
  self,
1404
1415
  selector: str,
1405
1416
  *,
1417
+ contains: str = "all",
1406
1418
  apply_exclusions: bool = True,
1407
1419
  regex: bool = False,
1408
1420
  case: bool = True,
@@ -1414,6 +1426,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1414
1426
  selector: Optional[str] = None, # Now optional
1415
1427
  *,
1416
1428
  text: Optional[str] = None, # New text parameter
1429
+ contains: str = "all", # New parameter to control inside/overlap behavior
1417
1430
  apply_exclusions: bool = True,
1418
1431
  regex: bool = False,
1419
1432
  case: bool = True,
@@ -1427,6 +1440,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1427
1440
  Args:
1428
1441
  selector: CSS-like selector string.
1429
1442
  text: Text content to search for (equivalent to 'text:contains(...)').
1443
+ contains: How to determine if elements are inside: 'all' (fully inside),
1444
+ 'any' (any overlap), or 'center' (center point inside).
1445
+ (default: "all")
1430
1446
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1431
1447
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1432
1448
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1442,6 +1458,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1442
1458
  if selector is None and text is None:
1443
1459
  raise ValueError("Provide either 'selector' or 'text'.")
1444
1460
 
1461
+ # Validate contains parameter
1462
+ if contains not in ["all", "any", "center"]:
1463
+ raise ValueError(f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'")
1464
+
1445
1465
  # Construct selector if 'text' is provided
1446
1466
  effective_selector = ""
1447
1467
  if text is not None:
@@ -1481,22 +1501,34 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1481
1501
  # Let the page handle its exclusion logic if needed
1482
1502
  potential_elements = self.page.find_all(
1483
1503
  selector=effective_selector,
1484
- apply_exclusions=False, # Apply exclusions LATER based on region bbox
1504
+ apply_exclusions=apply_exclusions,
1485
1505
  regex=regex,
1486
1506
  case=case,
1487
1507
  **kwargs,
1488
1508
  )
1489
1509
 
1490
- # Filter these elements to those strictly within the region's bounds
1510
+ # Filter these elements based on the specified containment method
1491
1511
  region_bbox = self.bbox
1492
- matching_elements = [
1493
- el
1494
- for el in potential_elements
1495
- if el.x0 >= region_bbox[0]
1496
- and el.top >= region_bbox[1]
1497
- and el.x1 <= region_bbox[2]
1498
- and el.bottom <= region_bbox[3]
1499
- ]
1512
+ matching_elements = []
1513
+
1514
+ if contains == "all": # Fully inside (strict)
1515
+ matching_elements = [
1516
+ el for el in potential_elements
1517
+ if el.x0 >= region_bbox[0]
1518
+ and el.top >= region_bbox[1]
1519
+ and el.x1 <= region_bbox[2]
1520
+ and el.bottom <= region_bbox[3]
1521
+ ]
1522
+ elif contains == "any": # Any overlap
1523
+ matching_elements = [
1524
+ el for el in potential_elements
1525
+ if self.intersects(el)
1526
+ ]
1527
+ elif contains == "center": # Center point inside
1528
+ matching_elements = [
1529
+ el for el in potential_elements
1530
+ if self.is_element_center_inside(el)
1531
+ ]
1500
1532
 
1501
1533
  return ElementCollection(matching_elements)
1502
1534
 
@@ -1988,7 +2020,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1988
2020
  from natural_pdf.qa.document_qa import get_qa_engine
1989
2021
  except ImportError:
1990
2022
  logger.error(
1991
- "Question answering requires optional dependencies. Install with `pip install natural-pdf[qa]`"
2023
+ "Question answering requires optional dependencies. Install with `pip install natural-pdf[core-ml]`"
1992
2024
  )
1993
2025
  return {
1994
2026
  "answer": None,
File without changes
Binary file
Binary file