natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. natural_pdf/__init__.py +7 -2
  2. natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
  3. natural_pdf/analyzers/text_options.py +9 -1
  4. natural_pdf/analyzers/text_structure.py +371 -58
  5. natural_pdf/classification/manager.py +3 -4
  6. natural_pdf/collections/pdf_collection.py +19 -39
  7. natural_pdf/core/element_manager.py +11 -1
  8. natural_pdf/core/highlighting_service.py +146 -75
  9. natural_pdf/core/page.py +287 -188
  10. natural_pdf/core/pdf.py +57 -42
  11. natural_pdf/elements/base.py +51 -0
  12. natural_pdf/elements/collections.py +362 -67
  13. natural_pdf/elements/line.py +5 -0
  14. natural_pdf/elements/region.py +396 -23
  15. natural_pdf/exporters/data/__init__.py +0 -0
  16. natural_pdf/exporters/data/pdf.ttf +0 -0
  17. natural_pdf/exporters/data/sRGB.icc +0 -0
  18. natural_pdf/exporters/hocr.py +40 -61
  19. natural_pdf/exporters/hocr_font.py +7 -13
  20. natural_pdf/exporters/original_pdf.py +10 -13
  21. natural_pdf/exporters/paddleocr.py +51 -11
  22. natural_pdf/exporters/searchable_pdf.py +0 -10
  23. natural_pdf/flows/__init__.py +12 -0
  24. natural_pdf/flows/collections.py +533 -0
  25. natural_pdf/flows/element.py +382 -0
  26. natural_pdf/flows/flow.py +216 -0
  27. natural_pdf/flows/region.py +458 -0
  28. natural_pdf/search/__init__.py +65 -52
  29. natural_pdf/search/lancedb_search_service.py +325 -0
  30. natural_pdf/search/numpy_search_service.py +255 -0
  31. natural_pdf/search/searchable_mixin.py +25 -71
  32. natural_pdf/selectors/parser.py +163 -8
  33. natural_pdf/widgets/viewer.py +22 -31
  34. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
  35. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
  36. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
  37. natural_pdf/search/haystack_search_service.py +0 -687
  38. natural_pdf/search/haystack_utils.py +0 -474
  39. natural_pdf/utils/tqdm_utils.py +0 -51
  40. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
  41. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
@@ -18,12 +18,13 @@ from typing import (
     Union,
     overload,
 )
+import hashlib
 
 from pdfplumber.utils.geometry import objects_to_bbox
-from PIL import Image, ImageDraw, ImageFont
 
 # New Imports
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+from PIL import Image, ImageDraw, ImageFont
 from tqdm.auto import tqdm
 
 from natural_pdf.classification.manager import ClassificationManager
@@ -37,6 +38,8 @@ from natural_pdf.export.mixin import ExportMixin
 from natural_pdf.ocr import OCROptions
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
+from tqdm.auto import tqdm
 
 # Potentially lazy imports for optional dependencies needed in save_pdf
 try:
@@ -46,7 +49,6 @@ except ImportError:
 
 try:
     from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
-    pass
 except ImportError:
     create_searchable_pdf = None
 
@@ -61,8 +63,9 @@ logger = logging.getLogger(__name__)
 
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
-    from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
+    from natural_pdf.core.pdf import PDF  # ---> ADDED PDF type hint
     from natural_pdf.elements.region import Region
+    from natural_pdf.elements.text import TextElement  # Ensure TextElement is imported
 
 T = TypeVar("T")
 P = TypeVar("P", bound="Page")
@@ -840,6 +843,7 @@ class ElementCollection(
         labels: bool = True, # Use 'labels' consistent with service
         legend_position: str = "right",
         render_ocr: bool = False,
+        width: Optional[int] = None, # Add width parameter
     ) -> Optional["Image.Image"]:
         """
         Generates a temporary preview image highlighting elements in this collection
@@ -862,6 +866,7 @@ class ElementCollection(
             labels: Whether to include a legend for the temporary highlights.
             legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
             render_ocr: Whether to render OCR text.
+            width: Optional width for the output image in pixels.
 
         Returns:
             PIL Image object of the temporary preview, or None if rendering fails or
@@ -922,6 +927,7 @@ class ElementCollection(
             page_index=page.index,
             temporary_highlights=highlight_data_list,
             scale=scale,
+            width=width, # Pass the width parameter
             labels=labels, # Use 'labels'
             legend_position=legend_position,
             render_ocr=render_ocr,
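
Editor's note: the three hunks above thread a new optional `width` argument from the collection-level preview method down into the highlighting service call, so a preview can be requested at a fixed pixel width instead of a scale factor. A minimal usage sketch, assuming the method shown is ElementCollection.show() and using an illustrative file and selector:

    from natural_pdf import PDF

    pdf = PDF("example.pdf")  # illustrative path
    totals = pdf.pages[0].find_all('text:contains("Total")')
    preview = totals.show(width=800)  # render the temporary preview 800 px wide
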
@@ -1159,10 +1165,96 @@ class ElementCollection(
 
         Args:
             selector: CSS-like selector string
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                'any' (any overlap), or 'center' (center point inside).
+                (default: "all")
             apply_exclusions: Whether to exclude elements in exclusion regions
         """
         return self.apply(lambda element: element.find(selector, **kwargs))
 
+    @overload
+    def find_all(
+        self,
+        *,
+        text: str,
+        contains: str = "all",
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+
+    @overload
+    def find_all(
+        self,
+        selector: str,
+        *,
+        contains: str = "all",
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+
+    def find_all(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        contains: str = "all",
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
+        """
+        Find all elements within each element of this collection matching the selector OR text,
+        and return a flattened collection of all found sub-elements.
+
+        Provide EITHER `selector` OR `text`, but not both.
+
+        Args:
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                'any' (any overlap), or 'center' (center point inside).
+                (default: "all")
+            apply_exclusions: Whether to apply exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional parameters for element filtering.
+
+        Returns:
+            A new ElementCollection containing all matching sub-elements from all elements
+            in this collection.
+        """
+        if selector is None and text is None:
+            raise ValueError("Either 'selector' or 'text' must be provided to find_all.")
+        if selector is not None and text is not None:
+            raise ValueError("Provide either 'selector' or 'text' to find_all, not both.")
+
+        all_found_elements: List[Element] = []
+        for element in self._elements:
+            if hasattr(element, "find_all") and callable(element.find_all):
+                # Element.find_all returns an ElementCollection
+                found_in_element: "ElementCollection" = element.find_all(
+                    selector=selector,
+                    text=text,
+                    contains=contains,
+                    apply_exclusions=apply_exclusions,
+                    regex=regex,
+                    case=case,
+                    **kwargs,
+                )
+                if found_in_element and found_in_element.elements:
+                    all_found_elements.extend(found_in_element.elements)
+            # else:
+            #     Elements in the collection are expected to support find_all.
+            #     If an element type doesn't, an AttributeError will naturally occur,
+            #     or a more specific check/handling could be added here if needed.
+
+        return ElementCollection(all_found_elements)
+
     def extract_each_text(self, **kwargs) -> List[str]:
         """
         Extract text from each element in this region.
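
Editor's note: ElementCollection.find_all, added above, loops over the collection's members, delegates to each member's own find_all, and flattens everything it finds into a single new collection; providing both `selector` and `text` raises a ValueError. A hedged sketch built only from the signature and docstring above (the selector strings are illustrative):

    # `pdf` is assumed to be an open natural_pdf.PDF; "region" elements here
    # would typically come from a prior layout analysis of the page.
    regions = pdf.pages[0].find_all("region")

    # Search inside every region at once; an element counts as "inside" a
    # region when its center point falls within it.
    words = regions.find_all("text", contains="center", case=False)
    print(len(words), "text elements found across", len(regions), "regions")
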
@@ -1496,13 +1588,162 @@ class ElementCollection(
 
         return all_data
 
+    def to_text_elements(
+        self,
+        text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
+        source_label: str = "derived_from_region",
+        object_type: str = "word",
+        default_font_size: float = 10.0,
+        default_font_name: str = "RegionContent",
+        confidence: Optional[float] = None,
+        add_to_page: bool = False  # Default is False
+    ) -> "ElementCollection[TextElement]":
+        """
+        Converts each Region in this collection to a TextElement.
+
+        Args:
+            text_content_func: A callable that takes a Region and returns its text
+                               (or None). If None, all created TextElements will
+                               have text=None.
+            source_label: The 'source' attribute for the new TextElements.
+            object_type: The 'object_type' for the TextElement's data dict.
+            default_font_size: Placeholder font size.
+            default_font_name: Placeholder font name.
+            confidence: Confidence score.
+            add_to_page: If True (default is False), also adds the created
+                         TextElements to their respective page's element manager.
+
+        Returns:
+            A new ElementCollection containing the created TextElement objects.
+        """
+        from natural_pdf.elements.region import Region  # Local import for type checking if needed or to resolve circularity
+        from natural_pdf.elements.text import TextElement  # Ensure TextElement is imported for type hint if not in TYPE_CHECKING
+
+        new_text_elements: List["TextElement"] = []
+        if not self.elements:  # Accesses self._elements via property
+            return ElementCollection([])
+
+        page_context_for_adding: Optional["Page"] = None
+        if add_to_page:
+            # Try to determine a consistent page context if adding elements
+            first_valid_region_with_page = next(
+                (el for el in self.elements if isinstance(el, Region) and hasattr(el, 'page') and el.page is not None),
+                None
+            )
+            if first_valid_region_with_page:
+                page_context_for_adding = first_valid_region_with_page.page
+            else:
+                logger.warning("Cannot add TextElements to page: No valid Region with a page attribute found in collection, or first region's page is None.")
+                add_to_page = False  # Disable adding if no valid page context can be determined
+
+        for element in self.elements:  # Accesses self._elements via property/iterator
+            if isinstance(element, Region):
+                text_el = element.to_text_element(
+                    text_content=text_content_func,
+                    source_label=source_label,
+                    object_type=object_type,
+                    default_font_size=default_font_size,
+                    default_font_name=default_font_name,
+                    confidence=confidence
+                )
+                new_text_elements.append(text_el)
+
+                if add_to_page:
+                    if not hasattr(text_el, 'page') or text_el.page is None:
+                        logger.warning(f"TextElement created from region {element.bbox} has no page attribute. Cannot add to page.")
+                        continue
+
+                    if page_context_for_adding and text_el.page == page_context_for_adding:
+                        if hasattr(page_context_for_adding, '_element_mgr') and page_context_for_adding._element_mgr is not None:
+                            add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
+                            page_context_for_adding._element_mgr.add_element(text_el, element_type=add_as_type)
+                        else:
+                            page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
+                            logger.error(f"Page context for region {element.bbox} (Page {page_num_str}) is missing '_element_mgr'. Cannot add TextElement.")
+                    elif page_context_for_adding and text_el.page != page_context_for_adding:
+                        current_page_num_str = str(text_el.page.page_number) if hasattr(text_el.page, 'page_number') else "Unknown"
+                        context_page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else "N/A"
+                        logger.warning(f"TextElement for region {element.bbox} from page {current_page_num_str} "
+                                       f"not added as it's different from collection's inferred page context {context_page_num_str}.")
+                    elif not page_context_for_adding:
+                        logger.warning(f"TextElement for region {element.bbox} created, but no page context was determined for adding.")
+            else:
+                logger.warning(f"Skipping element {type(element)}, not a Region.")
+
+        if add_to_page and page_context_for_adding:
+            page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
+            logger.info(f"Created and added {len(new_text_elements)} TextElements to page {page_num_str}.")
+        elif add_to_page and not page_context_for_adding:
+            logger.info(f"Created {len(new_text_elements)} TextElements, but could not add to page as page context was not determined or was inconsistent.")
+        else:  # add_to_page is False
+            logger.info(f"Created {len(new_text_elements)} TextElements (not added to page).")
+
+        return ElementCollection(new_text_elements)
+
+    def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, show_progress: bool = True) -> "ElementCollection":
+        """
+        Trim visual whitespace from each region in the collection.
+
+        Applies the trim() method to each element in the collection,
+        returning a new collection with the trimmed regions.
+
+        Args:
+            padding: Number of pixels to keep as padding after trimming (default: 1)
+            threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
+            resolution: Resolution for image rendering in DPI (default: 150)
+            show_progress: Whether to show a progress bar for the trimming operation
+
+        Returns:
+            New ElementCollection with trimmed regions
+        """
+        return self.apply(
+            lambda element: element.trim(padding=padding, threshold=threshold, resolution=resolution),
+            show_progress=show_progress
+        )
+
+    def clip(
+        self,
+        obj: Optional[Any] = None,
+        left: Optional[float] = None,
+        top: Optional[float] = None,
+        right: Optional[float] = None,
+        bottom: Optional[float] = None,
+    ) -> "ElementCollection":
+        """
+        Clip each element in the collection to the specified bounds.
+
+        This method applies the clip operation to each individual element,
+        returning a new collection with the clipped elements.
+
+        Args:
+            obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
+            left: Optional left boundary (x0) to clip to
+            top: Optional top boundary to clip to
+            right: Optional right boundary (x1) to clip to
+            bottom: Optional bottom boundary to clip to
+
+        Returns:
+            New ElementCollection containing the clipped elements
+
+        Examples:
+            # Clip each element to another region's bounds
+            clipped_elements = collection.clip(container_region)
+
+            # Clip each element to specific coordinates
+            clipped_elements = collection.clip(left=100, right=400)
+
+            # Mix object bounds with specific overrides
+            clipped_elements = collection.clip(obj=container, bottom=page.height/2)
+        """
+        return self.apply(
+            lambda element: element.clip(obj=obj, left=left, top=top, right=right, bottom=bottom)
+        )
 
 
-class PageCollection(Generic[P], ApplyMixin):
-    """
-    A collection of PDF pages with cross-page operations.
-
-    This class provides methods for working with multiple pages, such as finding
-    elements across pages, extracting text from page ranges, and more.
+class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
+    """
+    Represents a collection of Page objects, often from a single PDF document.
+    Provides methods for batch operations on these pages.
     """
 
     def __init__(self, pages: List[P]):
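
Editor's note: this hunk adds three region-oriented helpers to ElementCollection (to_text_elements, trim, and clip) and rewrites the PageCollection declaration to mix in the new ShapeDetectionMixin. A rough sketch of how the helpers might be chained; `regions` and `page` are assumed to already exist, and the coordinates are purely illustrative:

    # Shave surrounding whitespace off every detected region
    tidy = regions.trim(padding=2, threshold=0.95, resolution=150)

    # Keep only the part of each region that falls in the top half of the page
    top_half = tidy.clip(bottom=page.height / 2)

    # Turn each remaining region into a placeholder TextElement,
    # pulling its text with the region's own extract_text()
    text_elements = top_half.to_text_elements(
        text_content_func=lambda region: region.extract_text(),
        source_label="derived_from_region",
        add_to_page=False,
    )
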
@@ -1633,6 +1874,7 @@ class PageCollection(Generic[P], ApplyMixin):
         self,
         *,
         text: str,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1644,6 +1886,7 @@ class PageCollection(Generic[P], ApplyMixin):
         self,
         selector: str,
         *,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1655,6 +1898,7 @@ class PageCollection(Generic[P], ApplyMixin):
         selector: Optional[str] = None,
         *,
         text: Optional[str] = None,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1668,6 +1912,9 @@ class PageCollection(Generic[P], ApplyMixin):
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                'any' (any overlap), or 'center' (center point inside).
+                (default: "all")
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1681,6 +1928,7 @@ class PageCollection(Generic[P], ApplyMixin):
             element = page.find(
                 selector=selector,
                 text=text,
+                contains=contains,
                 apply_exclusions=apply_exclusions,
                 regex=regex,
                 case=case,
@@ -1695,6 +1943,7 @@ class PageCollection(Generic[P], ApplyMixin):
         self,
         *,
         text: str,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1706,6 +1955,7 @@ class PageCollection(Generic[P], ApplyMixin):
         self,
         selector: str,
         *,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1717,6 +1967,7 @@ class PageCollection(Generic[P], ApplyMixin):
         selector: Optional[str] = None,
         *,
         text: Optional[str] = None,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1730,6 +1981,9 @@ class PageCollection(Generic[P], ApplyMixin):
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                'any' (any overlap), or 'center' (center point inside).
+                (default: "all")
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1744,6 +1998,7 @@ class PageCollection(Generic[P], ApplyMixin):
             elements = page.find_all(
                 selector=selector,
                 text=text,
+                contains=contains,
                 apply_exclusions=apply_exclusions,
                 regex=regex,
                 case=case,
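
Editor's note: the ten small hunks above add the same `contains` keyword to PageCollection.find and PageCollection.find_all (the overloads, the docstrings, and the implementations) and forward it unchanged to each page's find/find_all. A short hedged example of the keyword in use; the selector is illustrative:

    pages = pdf.pages  # a PageCollection from an open natural_pdf.PDF

    # 'any': an element matches as soon as it overlaps the constraining area at all
    hits = pages.find_all('text:contains("Invoice")', contains="any", case=False)

    # 'center': only elements whose center point falls inside are returned
    first = pages.find('text:contains("Invoice")', contains="center")
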
@@ -1817,7 +2072,7 @@ class PageCollection(Generic[P], ApplyMixin):
         end_elements=None,
         new_section_on_page_break=False,
         boundary_inclusion="both",
-    ) -> List["Region"]:
+    ) -> "ElementCollection[Region]":
         """
         Extract sections from a page collection based on start/end elements.
 
@@ -2110,7 +2365,7 @@ class PageCollection(Generic[P], ApplyMixin):
                     region.start_element = start_element
                     sections.append(region)
 
-        return sections
+        return ElementCollection(sections)
 
     def _gather_analysis_data(
         self,
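
Editor's note: these two hunks change the section-extraction helper so it returns an ElementCollection[Region] rather than a plain list, which makes the batch helpers defined on ElementCollection available on the result directly. A sketch; the method name get_sections and the start_elements parameter are assumptions inferred from the end_elements parameter visible above, and the start selector is illustrative:

    sections = pdf.pages.get_sections(
        start_elements='text:contains("Section")',
        boundary_inclusion="both",
    )
    # The result now behaves like any other ElementCollection
    section_texts = sections.extract_each_text()
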
@@ -2314,8 +2569,10 @@ class PageCollection(Generic[P], ApplyMixin):
         try:
             from PIL import Image, ImageDraw, ImageFont
         except ImportError:
-            logger.error("Pillow library not found, required for to_image(). Install with 'pip install Pillow'")
-            return None
+            logger.error(
+                "Pillow library not found, required for to_image(). Install with 'pip install Pillow'"
+            )
+            return None
 
         if not self.pages:
             logger.warning("Cannot generate image for empty PageCollection")
@@ -2334,27 +2591,34 @@ class PageCollection(Generic[P], ApplyMixin):
             try:
                 font = ImageFont.load_default(16)
             except IOError:
-                logger.warning("Default font not found. Labels cannot be added.")
-                add_labels = False # Disable if no font
+                logger.warning("Default font not found. Labels cannot be added.")
+                add_labels = False  # Disable if no font
 
         # Render individual page images
         page_images = []
         for page in pages_to_render:
             try:
                 # Assume page.to_image returns a PIL Image or None
-                img = page.to_image(width=page_width, include_highlights=True) # Render with highlights for visual context
+                img = page.to_image(
+                    width=page_width, include_highlights=True
+                )  # Render with highlights for visual context
                 if img is None:
-                    logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
-                    continue
+                    logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
+                    continue
             except Exception as img_err:
-                logger.error(f"Error generating image for page {page.number}: {img_err}", exc_info=True)
-                continue
-
+                logger.error(
+                    f"Error generating image for page {page.number}: {img_err}", exc_info=True
+                )
+                continue
 
             # Add page number label
             if add_labels and font:
                 draw = ImageDraw.Draw(img)
-                pdf_name = Path(page.pdf.path).stem if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path") else ""
+                pdf_name = (
+                    Path(page.pdf.path).stem
+                    if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path")
+                    else ""
+                )
                 label_text = f"p{page.number}"
                 if pdf_name:
                     label_text += f" - {pdf_name}"
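
Editor's note: the two hunks above reformat the PageCollection.to_image grid renderer; the Pillow import error, the per-page rendering loop, and the page labels are re-wrapped, but the behaviour (render each page, optionally stamp a "p<number> - <pdf name>" label, then tile the results) is unchanged. A hedged sketch of the call; not all public keyword names are visible in this diff, so cols and add_labels here are assumptions:

    # Tile the first six pages into a labelled contact sheet
    sheet = pdf.pages[:6].to_image(cols=3, add_labels=True)
    sheet.save("contact_sheet.png")
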
@@ -2364,43 +2628,65 @@ class PageCollection(Generic[P], ApplyMixin):
                 # Placeholder logic - adjust based on how classification results are stored
                 category = None
                 confidence = None
-                if hasattr(page, 'analyses') and page.analyses and 'classification' in page.analyses:
-                    result = page.analyses['classification']
+                if (
+                    hasattr(page, "analyses")
+                    and page.analyses
+                    and "classification" in page.analyses
+                ):
+                    result = page.analyses["classification"]
                     # Adapt based on actual structure of classification result
-                    category = getattr(result, 'label', None) or result.get('label', None) if isinstance(result, dict) else None
-                    confidence = getattr(result, 'score', None) or result.get('score', None) if isinstance(result, dict) else None
+                    category = (
+                        getattr(result, "label", None) or result.get("label", None)
+                        if isinstance(result, dict)
+                        else None
+                    )
+                    confidence = (
+                        getattr(result, "score", None) or result.get("score", None)
+                        if isinstance(result, dict)
+                        else None
+                    )
 
                 if category is not None and confidence is not None:
-                    try:
-                        category_str = f"{category} ({confidence:.2f})" # Format confidence
+                    try:
+                        category_str = f"{category} ({confidence:.2f})"  # Format confidence
                         label_text += f"\\n{category_str}"
-                    except (TypeError, ValueError): pass # Ignore formatting errors
-
+                    except (TypeError, ValueError):
+                        pass  # Ignore formatting errors
 
                 # Calculate bounding box for multi-line text and draw background/text
                 try:
                     # Using textbbox for potentially better accuracy with specific fonts
                     # Note: textbbox needs Pillow 8+
-                    bbox = draw.textbbox((5, 5), label_text, font=font, spacing=2) # Use textbbox if available
-                    bg_rect = (max(0, bbox[0] - 2), max(0, bbox[1] - 2),
-                               min(img.width, bbox[2] + 2), min(img.height, bbox[3] + 2))
+                    bbox = draw.textbbox(
+                        (5, 5), label_text, font=font, spacing=2
+                    )  # Use textbbox if available
+                    bg_rect = (
+                        max(0, bbox[0] - 2),
+                        max(0, bbox[1] - 2),
+                        min(img.width, bbox[2] + 2),
+                        min(img.height, bbox[3] + 2),
+                    )
 
                     # Draw semi-transparent background
-                    overlay = Image.new('RGBA', img.size, (255, 255, 255, 0))
+                    overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
                     draw_overlay = ImageDraw.Draw(overlay)
-                    draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180)) # White with alpha
-                    img = Image.alpha_composite(img.convert('RGBA'), overlay).convert('RGB')
-                    draw = ImageDraw.Draw(img) # Recreate draw object
+                    draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180))  # White with alpha
+                    img = Image.alpha_composite(img.convert("RGBA"), overlay).convert("RGB")
+                    draw = ImageDraw.Draw(img)  # Recreate draw object
 
                     # Draw the potentially multi-line text
                     draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
-                except AttributeError: # Fallback for older Pillow without textbbox
+                except AttributeError:  # Fallback for older Pillow without textbbox
                     # Approximate size and draw
                     # This might not be perfectly aligned
-                    draw.rectangle((2, 2, 150, 40), fill=(255, 255, 255, 180)) # Simple fixed background
-                    draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
+                    draw.rectangle(
+                        (2, 2, 150, 40), fill=(255, 255, 255, 180)
+                    )  # Simple fixed background
+                    draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
                 except Exception as draw_err:
-                    logger.error(f"Error drawing label on page {page.number}: {draw_err}", exc_info=True)
+                    logger.error(
+                        f"Error drawing label on page {page.number}: {draw_err}", exc_info=True
+                    )
 
             page_images.append(img)
 
@@ -2408,7 +2694,6 @@ class PageCollection(Generic[P], ApplyMixin):
             logger.warning("No page images were successfully rendered for the grid.")
             return None
 
-
         # Calculate grid dimensions if not provided
         num_images = len(page_images)
         if not rows and not cols:
@@ -2418,24 +2703,23 @@ class PageCollection(Generic[P], ApplyMixin):
             cols = (num_images + rows - 1) // rows
         elif cols and not rows:
             rows = (num_images + cols - 1) // cols
-        cols = max(1, cols if cols else 1) # Ensure at least 1
+        cols = max(1, cols if cols else 1)  # Ensure at least 1
         rows = max(1, rows if rows else 1)
 
-
         # Get maximum dimensions for consistent grid cells
         max_width = max(img.width for img in page_images) if page_images else 1
         max_height = max(img.height for img in page_images) if page_images else 1
 
-
         # Create grid image
         grid_width = cols * max_width + (cols + 1) * spacing
         grid_height = rows * max_height + (rows + 1) * spacing
-        grid_img = Image.new("RGB", (grid_width, grid_height), (220, 220, 220)) # Lighter gray background
-
+        grid_img = Image.new(
+            "RGB", (grid_width, grid_height), (220, 220, 220)
+        )  # Lighter gray background
 
         # Place images in grid
         for i, img in enumerate(page_images):
-            if i >= rows * cols: # Ensure we don't exceed grid capacity
+            if i >= rows * cols:  # Ensure we don't exceed grid capacity
                 break
 
             row = i // cols
@@ -2484,8 +2768,8 @@ class PageCollection(Generic[P], ApplyMixin):
         if not self.pages:
            raise ValueError("Cannot save an empty PageCollection.")
 
-        if not (ocr ^ original): # XOR: exactly one must be true
-            raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
+        if not (ocr ^ original):  # XOR: exactly one must be true
+            raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
 
        output_path_obj = Path(output_path)
        output_path_str = str(output_path_obj)
@@ -2494,18 +2778,29 @@ class PageCollection(Generic[P], ApplyMixin):
             if create_searchable_pdf is None:
                 raise ImportError(
                     "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
-                    "Install with: pip install \\\"natural-pdf[ocr-export]\\\"" # Escaped quotes
+                    'Install with: pip install \\"natural-pdf[ocr-export]\\"'  # Escaped quotes
                 )
 
             # Check for non-OCR vector elements (provide a warning)
             has_vector_elements = False
             for page in self.pages:
                 # Simplified check for common vector types or non-OCR chars/words
-                if (hasattr(page, 'rects') and page.rects or
-                    hasattr(page, 'lines') and page.lines or
-                    hasattr(page, 'curves') and page.curves or
-                    (hasattr(page, 'chars') and any(getattr(el, 'source', None) != 'ocr' for el in page.chars)) or
-                    (hasattr(page, 'words') and any(getattr(el, 'source', None) != 'ocr' for el in page.words))):
+                if (
+                    hasattr(page, "rects")
+                    and page.rects
+                    or hasattr(page, "lines")
+                    and page.lines
+                    or hasattr(page, "curves")
+                    and page.curves
+                    or (
+                        hasattr(page, "chars")
+                        and any(getattr(el, "source", None) != "ocr" for el in page.chars)
+                    )
+                    or (
+                        hasattr(page, "words")
+                        and any(getattr(el, "source", None) != "ocr" for el in page.words)
+                    )
+                ):
                     has_vector_elements = True
                     break
             if has_vector_elements:
@@ -2532,22 +2827,22 @@ class PageCollection(Generic[P], ApplyMixin):
             if create_original_pdf is None:
                 raise ImportError(
                     "Saving with original=True requires 'pikepdf'. "
-                    "Install with: pip install \\\"natural-pdf[ocr-export]\\\"" # Escaped quotes
+                    'Install with: pip install \\"natural-pdf[ocr-export]\\"'  # Escaped quotes
                 )
 
             # Check for OCR elements (provide a warning) - keep this check here
             has_ocr_elements = False
             for page in self.pages:
-                # Use find_all which returns a collection; check if it's non-empty
-                if hasattr(page, 'find_all'):
-                    ocr_text_elements = page.find_all("text[source=ocr]")
-                    if ocr_text_elements: # Check truthiness of collection
-                        has_ocr_elements = True
-                        break
-                elif hasattr(page, 'words'): # Fallback check if find_all isn't present?
-                    if any(getattr(el, 'source', None) == 'ocr' for el in page.words):
-                        has_ocr_elements = True
-                        break
+                # Use find_all which returns a collection; check if it's non-empty
+                if hasattr(page, "find_all"):
+                    ocr_text_elements = page.find_all("text[source=ocr]")
+                    if ocr_text_elements:  # Check truthiness of collection
+                        has_ocr_elements = True
+                        break
+                elif hasattr(page, "words"):  # Fallback check if find_all isn't present?
+                    if any(getattr(el, "source", None) == "ocr" for el in page.words):
+                        has_ocr_elements = True
+                        break
 
             if has_ocr_elements:
                 logger.warning(
@@ -2565,5 +2860,5 @@ class PageCollection(Generic[P], ApplyMixin):
             except Exception as e:
                 # Error logging is handled within create_original_pdf
                 # Re-raise the exception caught from the exporter
-                raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
+                raise e  # Keep the original exception type (ValueError, RuntimeError, etc.)
         # <--- END MODIFIED
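
Editor's note: the closing hunks reformat the PageCollection save path but keep its contract intact: exactly one of ocr or original must be True (the XOR check), ocr=True requires the optional ocr-export extras (pikepdf and Pillow) and warns when non-OCR vector content is present, and original=True warns when OCR-derived text would not be embedded. A hedged sketch, assuming the method shown is PageCollection.save_pdf and that slicing pdf.pages yields a PageCollection:

    # Rebuild the selected pages as a searchable PDF from images plus OCR text
    pdf.pages[:5].save_pdf("searchable.pdf", ocr=True)

    # Or re-emit the original page content unchanged (OCR text is not embedded)
    pdf.pages[:5].save_pdf("original_pages.pdf", original=True)

    # Passing both flags, or neither, raises:
    # ValueError("Exactly one of 'ocr' or 'original' must be True.")
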