natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +7 -2
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +3 -4
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +146 -75
- natural_pdf/core/page.py +287 -188
- natural_pdf/core/pdf.py +57 -42
- natural_pdf/elements/base.py +51 -0
- natural_pdf/elements/collections.py +362 -67
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +396 -23
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +40 -61
- natural_pdf/exporters/hocr_font.py +7 -13
- natural_pdf/exporters/original_pdf.py +10 -13
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/exporters/searchable_pdf.py +0 -10
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/selectors/parser.py +163 -8
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
@@ -18,12 +18,13 @@ from typing import (
|
|
18
18
|
Union,
|
19
19
|
overload,
|
20
20
|
)
|
21
|
+
import hashlib
|
21
22
|
|
22
23
|
from pdfplumber.utils.geometry import objects_to_bbox
|
23
|
-
from PIL import Image, ImageDraw, ImageFont
|
24
24
|
|
25
25
|
# New Imports
|
26
26
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
27
|
+
from PIL import Image, ImageDraw, ImageFont
|
27
28
|
from tqdm.auto import tqdm
|
28
29
|
|
29
30
|
from natural_pdf.classification.manager import ClassificationManager
|
@@ -37,6 +38,8 @@ from natural_pdf.export.mixin import ExportMixin
|
|
37
38
|
from natural_pdf.ocr import OCROptions
|
38
39
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
39
40
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
41
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
42
|
+
from tqdm.auto import tqdm
|
40
43
|
|
41
44
|
# Potentially lazy imports for optional dependencies needed in save_pdf
|
42
45
|
try:
|
@@ -46,7 +49,6 @@ except ImportError:
|
|
46
49
|
|
47
50
|
try:
|
48
51
|
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
|
49
|
-
pass
|
50
52
|
except ImportError:
|
51
53
|
create_searchable_pdf = None
|
52
54
|
|
@@ -61,8 +63,9 @@ logger = logging.getLogger(__name__)
|
|
61
63
|
|
62
64
|
if TYPE_CHECKING:
|
63
65
|
from natural_pdf.core.page import Page
|
64
|
-
from natural_pdf.core.pdf import PDF
|
66
|
+
from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
|
65
67
|
from natural_pdf.elements.region import Region
|
68
|
+
from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
|
66
69
|
|
67
70
|
T = TypeVar("T")
|
68
71
|
P = TypeVar("P", bound="Page")
|
@@ -840,6 +843,7 @@ class ElementCollection(
|
|
840
843
|
labels: bool = True, # Use 'labels' consistent with service
|
841
844
|
legend_position: str = "right",
|
842
845
|
render_ocr: bool = False,
|
846
|
+
width: Optional[int] = None, # Add width parameter
|
843
847
|
) -> Optional["Image.Image"]:
|
844
848
|
"""
|
845
849
|
Generates a temporary preview image highlighting elements in this collection
|
@@ -862,6 +866,7 @@ class ElementCollection(
|
|
862
866
|
labels: Whether to include a legend for the temporary highlights.
|
863
867
|
legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
|
864
868
|
render_ocr: Whether to render OCR text.
|
869
|
+
width: Optional width for the output image in pixels.
|
865
870
|
|
866
871
|
Returns:
|
867
872
|
PIL Image object of the temporary preview, or None if rendering fails or
|
@@ -922,6 +927,7 @@ class ElementCollection(
|
|
922
927
|
page_index=page.index,
|
923
928
|
temporary_highlights=highlight_data_list,
|
924
929
|
scale=scale,
|
930
|
+
width=width, # Pass the width parameter
|
925
931
|
labels=labels, # Use 'labels'
|
926
932
|
legend_position=legend_position,
|
927
933
|
render_ocr=render_ocr,
|
@@ -1159,10 +1165,96 @@ class ElementCollection(
|
|
1159
1165
|
|
1160
1166
|
Args:
|
1161
1167
|
selector: CSS-like selector string
|
1168
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
1169
|
+
'any' (any overlap), or 'center' (center point inside).
|
1170
|
+
(default: "all")
|
1162
1171
|
apply_exclusions: Whether to exclude elements in exclusion regions
|
1163
1172
|
"""
|
1164
1173
|
return self.apply(lambda element: element.find(selector, **kwargs))
|
1165
1174
|
|
1175
|
+
@overload
|
1176
|
+
def find_all(
|
1177
|
+
self,
|
1178
|
+
*,
|
1179
|
+
text: str,
|
1180
|
+
contains: str = "all",
|
1181
|
+
apply_exclusions: bool = True,
|
1182
|
+
regex: bool = False,
|
1183
|
+
case: bool = True,
|
1184
|
+
**kwargs,
|
1185
|
+
) -> "ElementCollection": ...
|
1186
|
+
|
1187
|
+
@overload
|
1188
|
+
def find_all(
|
1189
|
+
self,
|
1190
|
+
selector: str,
|
1191
|
+
*,
|
1192
|
+
contains: str = "all",
|
1193
|
+
apply_exclusions: bool = True,
|
1194
|
+
regex: bool = False,
|
1195
|
+
case: bool = True,
|
1196
|
+
**kwargs,
|
1197
|
+
) -> "ElementCollection": ...
|
1198
|
+
|
1199
|
+
def find_all(
|
1200
|
+
self,
|
1201
|
+
selector: Optional[str] = None,
|
1202
|
+
*,
|
1203
|
+
text: Optional[str] = None,
|
1204
|
+
contains: str = "all",
|
1205
|
+
apply_exclusions: bool = True,
|
1206
|
+
regex: bool = False,
|
1207
|
+
case: bool = True,
|
1208
|
+
**kwargs,
|
1209
|
+
) -> "ElementCollection":
|
1210
|
+
"""
|
1211
|
+
Find all elements within each element of this collection matching the selector OR text,
|
1212
|
+
and return a flattened collection of all found sub-elements.
|
1213
|
+
|
1214
|
+
Provide EITHER `selector` OR `text`, but not both.
|
1215
|
+
|
1216
|
+
Args:
|
1217
|
+
selector: CSS-like selector string.
|
1218
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
1219
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
1220
|
+
'any' (any overlap), or 'center' (center point inside).
|
1221
|
+
(default: "all")
|
1222
|
+
apply_exclusions: Whether to apply exclusion regions (default: True).
|
1223
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1224
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
1225
|
+
**kwargs: Additional parameters for element filtering.
|
1226
|
+
|
1227
|
+
Returns:
|
1228
|
+
A new ElementCollection containing all matching sub-elements from all elements
|
1229
|
+
in this collection.
|
1230
|
+
"""
|
1231
|
+
if selector is None and text is None:
|
1232
|
+
raise ValueError("Either 'selector' or 'text' must be provided to find_all.")
|
1233
|
+
if selector is not None and text is not None:
|
1234
|
+
raise ValueError("Provide either 'selector' or 'text' to find_all, not both.")
|
1235
|
+
|
1236
|
+
all_found_elements: List[Element] = []
|
1237
|
+
for element in self._elements:
|
1238
|
+
if hasattr(element, "find_all") and callable(element.find_all):
|
1239
|
+
# Element.find_all returns an ElementCollection
|
1240
|
+
found_in_element: "ElementCollection" = element.find_all(
|
1241
|
+
selector=selector,
|
1242
|
+
text=text,
|
1243
|
+
contains=contains,
|
1244
|
+
apply_exclusions=apply_exclusions,
|
1245
|
+
regex=regex,
|
1246
|
+
case=case,
|
1247
|
+
**kwargs,
|
1248
|
+
)
|
1249
|
+
if found_in_element and found_in_element.elements:
|
1250
|
+
all_found_elements.extend(found_in_element.elements)
|
1251
|
+
# else:
|
1252
|
+
# Elements in the collection are expected to support find_all.
|
1253
|
+
# If an element type doesn't, an AttributeError will naturally occur,
|
1254
|
+
# or a more specific check/handling could be added here if needed.
|
1255
|
+
|
1256
|
+
return ElementCollection(all_found_elements)
|
1257
|
+
|
1166
1258
|
def extract_each_text(self, **kwargs) -> List[str]:
|
1167
1259
|
"""
|
1168
1260
|
Extract text from each element in this region.
|
@@ -1496,13 +1588,162 @@ class ElementCollection(
|
|
1496
1588
|
|
1497
1589
|
return all_data
|
1498
1590
|
|
1591
|
+
def to_text_elements(
|
1592
|
+
self,
|
1593
|
+
text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
1594
|
+
source_label: str = "derived_from_region",
|
1595
|
+
object_type: str = "word",
|
1596
|
+
default_font_size: float = 10.0,
|
1597
|
+
default_font_name: str = "RegionContent",
|
1598
|
+
confidence: Optional[float] = None,
|
1599
|
+
add_to_page: bool = False # Default is False
|
1600
|
+
) -> "ElementCollection[TextElement]":
|
1601
|
+
"""
|
1602
|
+
Converts each Region in this collection to a TextElement.
|
1603
|
+
|
1604
|
+
Args:
|
1605
|
+
text_content_func: A callable that takes a Region and returns its text
|
1606
|
+
(or None). If None, all created TextElements will
|
1607
|
+
have text=None.
|
1608
|
+
source_label: The 'source' attribute for the new TextElements.
|
1609
|
+
object_type: The 'object_type' for the TextElement's data dict.
|
1610
|
+
default_font_size: Placeholder font size.
|
1611
|
+
default_font_name: Placeholder font name.
|
1612
|
+
confidence: Confidence score.
|
1613
|
+
add_to_page: If True (default is False), also adds the created
|
1614
|
+
TextElements to their respective page's element manager.
|
1615
|
+
|
1616
|
+
Returns:
|
1617
|
+
A new ElementCollection containing the created TextElement objects.
|
1618
|
+
"""
|
1619
|
+
from natural_pdf.elements.region import Region # Local import for type checking if needed or to resolve circularity
|
1620
|
+
from natural_pdf.elements.text import TextElement # Ensure TextElement is imported for type hint if not in TYPE_CHECKING
|
1621
|
+
|
1622
|
+
new_text_elements: List["TextElement"] = []
|
1623
|
+
if not self.elements: # Accesses self._elements via property
|
1624
|
+
return ElementCollection([])
|
1625
|
+
|
1626
|
+
page_context_for_adding: Optional["Page"] = None
|
1627
|
+
if add_to_page:
|
1628
|
+
# Try to determine a consistent page context if adding elements
|
1629
|
+
first_valid_region_with_page = next(
|
1630
|
+
(el for el in self.elements if isinstance(el, Region) and hasattr(el, 'page') and el.page is not None),
|
1631
|
+
None
|
1632
|
+
)
|
1633
|
+
if first_valid_region_with_page:
|
1634
|
+
page_context_for_adding = first_valid_region_with_page.page
|
1635
|
+
else:
|
1636
|
+
logger.warning("Cannot add TextElements to page: No valid Region with a page attribute found in collection, or first region's page is None.")
|
1637
|
+
add_to_page = False # Disable adding if no valid page context can be determined
|
1638
|
+
|
1639
|
+
for element in self.elements: # Accesses self._elements via property/iterator
|
1640
|
+
if isinstance(element, Region):
|
1641
|
+
text_el = element.to_text_element(
|
1642
|
+
text_content=text_content_func,
|
1643
|
+
source_label=source_label,
|
1644
|
+
object_type=object_type,
|
1645
|
+
default_font_size=default_font_size,
|
1646
|
+
default_font_name=default_font_name,
|
1647
|
+
confidence=confidence
|
1648
|
+
)
|
1649
|
+
new_text_elements.append(text_el)
|
1650
|
+
|
1651
|
+
if add_to_page:
|
1652
|
+
if not hasattr(text_el, 'page') or text_el.page is None:
|
1653
|
+
logger.warning(f"TextElement created from region {element.bbox} has no page attribute. Cannot add to page.")
|
1654
|
+
continue
|
1655
|
+
|
1656
|
+
if page_context_for_adding and text_el.page == page_context_for_adding:
|
1657
|
+
if hasattr(page_context_for_adding, '_element_mgr') and page_context_for_adding._element_mgr is not None:
|
1658
|
+
add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
|
1659
|
+
page_context_for_adding._element_mgr.add_element(text_el, element_type=add_as_type)
|
1660
|
+
else:
|
1661
|
+
page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
|
1662
|
+
logger.error(f"Page context for region {element.bbox} (Page {page_num_str}) is missing '_element_mgr'. Cannot add TextElement.")
|
1663
|
+
elif page_context_for_adding and text_el.page != page_context_for_adding:
|
1664
|
+
current_page_num_str = str(text_el.page.page_number) if hasattr(text_el.page, 'page_number') else "Unknown"
|
1665
|
+
context_page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else "N/A"
|
1666
|
+
logger.warning(f"TextElement for region {element.bbox} from page {current_page_num_str} "
|
1667
|
+
f"not added as it's different from collection's inferred page context {context_page_num_str}.")
|
1668
|
+
elif not page_context_for_adding:
|
1669
|
+
logger.warning(f"TextElement for region {element.bbox} created, but no page context was determined for adding.")
|
1670
|
+
else:
|
1671
|
+
logger.warning(f"Skipping element {type(element)}, not a Region.")
|
1672
|
+
|
1673
|
+
if add_to_page and page_context_for_adding:
|
1674
|
+
page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
|
1675
|
+
logger.info(f"Created and added {len(new_text_elements)} TextElements to page {page_num_str}.")
|
1676
|
+
elif add_to_page and not page_context_for_adding:
|
1677
|
+
logger.info(f"Created {len(new_text_elements)} TextElements, but could not add to page as page context was not determined or was inconsistent.")
|
1678
|
+
else: # add_to_page is False
|
1679
|
+
logger.info(f"Created {len(new_text_elements)} TextElements (not added to page).")
|
1680
|
+
|
1681
|
+
return ElementCollection(new_text_elements)
|
1682
|
+
|
1683
|
+
def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, show_progress: bool = True) -> "ElementCollection":
|
1684
|
+
"""
|
1685
|
+
Trim visual whitespace from each region in the collection.
|
1686
|
+
|
1687
|
+
Applies the trim() method to each element in the collection,
|
1688
|
+
returning a new collection with the trimmed regions.
|
1689
|
+
|
1690
|
+
Args:
|
1691
|
+
padding: Number of pixels to keep as padding after trimming (default: 1)
|
1692
|
+
threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
|
1693
|
+
resolution: Resolution for image rendering in DPI (default: 150)
|
1694
|
+
show_progress: Whether to show a progress bar for the trimming operation
|
1695
|
+
|
1696
|
+
Returns:
|
1697
|
+
New ElementCollection with trimmed regions
|
1698
|
+
"""
|
1699
|
+
return self.apply(
|
1700
|
+
lambda element: element.trim(padding=padding, threshold=threshold, resolution=resolution),
|
1701
|
+
show_progress=show_progress
|
1702
|
+
)
|
1703
|
+
|
1704
|
+
def clip(
|
1705
|
+
self,
|
1706
|
+
obj: Optional[Any] = None,
|
1707
|
+
left: Optional[float] = None,
|
1708
|
+
top: Optional[float] = None,
|
1709
|
+
right: Optional[float] = None,
|
1710
|
+
bottom: Optional[float] = None,
|
1711
|
+
) -> "ElementCollection":
|
1712
|
+
"""
|
1713
|
+
Clip each element in the collection to the specified bounds.
|
1714
|
+
|
1715
|
+
This method applies the clip operation to each individual element,
|
1716
|
+
returning a new collection with the clipped elements.
|
1717
|
+
|
1718
|
+
Args:
|
1719
|
+
obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
|
1720
|
+
left: Optional left boundary (x0) to clip to
|
1721
|
+
top: Optional top boundary to clip to
|
1722
|
+
right: Optional right boundary (x1) to clip to
|
1723
|
+
bottom: Optional bottom boundary to clip to
|
1724
|
+
|
1725
|
+
Returns:
|
1726
|
+
New ElementCollection containing the clipped elements
|
1727
|
+
|
1728
|
+
Examples:
|
1729
|
+
# Clip each element to another region's bounds
|
1730
|
+
clipped_elements = collection.clip(container_region)
|
1731
|
+
|
1732
|
+
# Clip each element to specific coordinates
|
1733
|
+
clipped_elements = collection.clip(left=100, right=400)
|
1734
|
+
|
1735
|
+
# Mix object bounds with specific overrides
|
1736
|
+
clipped_elements = collection.clip(obj=container, bottom=page.height/2)
|
1737
|
+
"""
|
1738
|
+
return self.apply(
|
1739
|
+
lambda element: element.clip(obj=obj, left=left, top=top, right=right, bottom=bottom)
|
1740
|
+
)
|
1499
1741
|
|
1500
|
-
class PageCollection(Generic[P], ApplyMixin):
|
1501
|
-
"""
|
1502
|
-
A collection of PDF pages with cross-page operations.
|
1503
1742
|
|
1504
|
-
|
1505
|
-
|
1743
|
+
class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
1744
|
+
"""
|
1745
|
+
Represents a collection of Page objects, often from a single PDF document.
|
1746
|
+
Provides methods for batch operations on these pages.
|
1506
1747
|
"""
|
1507
1748
|
|
1508
1749
|
def __init__(self, pages: List[P]):
|
@@ -1633,6 +1874,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1633
1874
|
self,
|
1634
1875
|
*,
|
1635
1876
|
text: str,
|
1877
|
+
contains: str = "all",
|
1636
1878
|
apply_exclusions: bool = True,
|
1637
1879
|
regex: bool = False,
|
1638
1880
|
case: bool = True,
|
@@ -1644,6 +1886,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1644
1886
|
self,
|
1645
1887
|
selector: str,
|
1646
1888
|
*,
|
1889
|
+
contains: str = "all",
|
1647
1890
|
apply_exclusions: bool = True,
|
1648
1891
|
regex: bool = False,
|
1649
1892
|
case: bool = True,
|
@@ -1655,6 +1898,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1655
1898
|
selector: Optional[str] = None,
|
1656
1899
|
*,
|
1657
1900
|
text: Optional[str] = None,
|
1901
|
+
contains: str = "all",
|
1658
1902
|
apply_exclusions: bool = True,
|
1659
1903
|
regex: bool = False,
|
1660
1904
|
case: bool = True,
|
@@ -1668,6 +1912,9 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1668
1912
|
Args:
|
1669
1913
|
selector: CSS-like selector string.
|
1670
1914
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1915
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
1916
|
+
'any' (any overlap), or 'center' (center point inside).
|
1917
|
+
(default: "all")
|
1671
1918
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
1672
1919
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1673
1920
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1681,6 +1928,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1681
1928
|
element = page.find(
|
1682
1929
|
selector=selector,
|
1683
1930
|
text=text,
|
1931
|
+
contains=contains,
|
1684
1932
|
apply_exclusions=apply_exclusions,
|
1685
1933
|
regex=regex,
|
1686
1934
|
case=case,
|
@@ -1695,6 +1943,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1695
1943
|
self,
|
1696
1944
|
*,
|
1697
1945
|
text: str,
|
1946
|
+
contains: str = "all",
|
1698
1947
|
apply_exclusions: bool = True,
|
1699
1948
|
regex: bool = False,
|
1700
1949
|
case: bool = True,
|
@@ -1706,6 +1955,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1706
1955
|
self,
|
1707
1956
|
selector: str,
|
1708
1957
|
*,
|
1958
|
+
contains: str = "all",
|
1709
1959
|
apply_exclusions: bool = True,
|
1710
1960
|
regex: bool = False,
|
1711
1961
|
case: bool = True,
|
@@ -1717,6 +1967,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1717
1967
|
selector: Optional[str] = None,
|
1718
1968
|
*,
|
1719
1969
|
text: Optional[str] = None,
|
1970
|
+
contains: str = "all",
|
1720
1971
|
apply_exclusions: bool = True,
|
1721
1972
|
regex: bool = False,
|
1722
1973
|
case: bool = True,
|
@@ -1730,6 +1981,9 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1730
1981
|
Args:
|
1731
1982
|
selector: CSS-like selector string.
|
1732
1983
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1984
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
1985
|
+
'any' (any overlap), or 'center' (center point inside).
|
1986
|
+
(default: "all")
|
1733
1987
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
1734
1988
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1735
1989
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1744,6 +1998,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1744
1998
|
elements = page.find_all(
|
1745
1999
|
selector=selector,
|
1746
2000
|
text=text,
|
2001
|
+
contains=contains,
|
1747
2002
|
apply_exclusions=apply_exclusions,
|
1748
2003
|
regex=regex,
|
1749
2004
|
case=case,
|
@@ -1817,7 +2072,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1817
2072
|
end_elements=None,
|
1818
2073
|
new_section_on_page_break=False,
|
1819
2074
|
boundary_inclusion="both",
|
1820
|
-
) ->
|
2075
|
+
) -> "ElementCollection[Region]":
|
1821
2076
|
"""
|
1822
2077
|
Extract sections from a page collection based on start/end elements.
|
1823
2078
|
|
@@ -2110,7 +2365,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2110
2365
|
region.start_element = start_element
|
2111
2366
|
sections.append(region)
|
2112
2367
|
|
2113
|
-
return sections
|
2368
|
+
return ElementCollection(sections)
|
2114
2369
|
|
2115
2370
|
def _gather_analysis_data(
|
2116
2371
|
self,
|
@@ -2314,8 +2569,10 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2314
2569
|
try:
|
2315
2570
|
from PIL import Image, ImageDraw, ImageFont
|
2316
2571
|
except ImportError:
|
2317
|
-
|
2318
|
-
|
2572
|
+
logger.error(
|
2573
|
+
"Pillow library not found, required for to_image(). Install with 'pip install Pillow'"
|
2574
|
+
)
|
2575
|
+
return None
|
2319
2576
|
|
2320
2577
|
if not self.pages:
|
2321
2578
|
logger.warning("Cannot generate image for empty PageCollection")
|
@@ -2334,27 +2591,34 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2334
2591
|
try:
|
2335
2592
|
font = ImageFont.load_default(16)
|
2336
2593
|
except IOError:
|
2337
|
-
|
2338
|
-
|
2594
|
+
logger.warning("Default font not found. Labels cannot be added.")
|
2595
|
+
add_labels = False # Disable if no font
|
2339
2596
|
|
2340
2597
|
# Render individual page images
|
2341
2598
|
page_images = []
|
2342
2599
|
for page in pages_to_render:
|
2343
2600
|
try:
|
2344
2601
|
# Assume page.to_image returns a PIL Image or None
|
2345
|
-
img = page.to_image(
|
2602
|
+
img = page.to_image(
|
2603
|
+
width=page_width, include_highlights=True
|
2604
|
+
) # Render with highlights for visual context
|
2346
2605
|
if img is None:
|
2347
|
-
|
2348
|
-
|
2606
|
+
logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
|
2607
|
+
continue
|
2349
2608
|
except Exception as img_err:
|
2350
|
-
|
2351
|
-
|
2352
|
-
|
2609
|
+
logger.error(
|
2610
|
+
f"Error generating image for page {page.number}: {img_err}", exc_info=True
|
2611
|
+
)
|
2612
|
+
continue
|
2353
2613
|
|
2354
2614
|
# Add page number label
|
2355
2615
|
if add_labels and font:
|
2356
2616
|
draw = ImageDraw.Draw(img)
|
2357
|
-
pdf_name =
|
2617
|
+
pdf_name = (
|
2618
|
+
Path(page.pdf.path).stem
|
2619
|
+
if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path")
|
2620
|
+
else ""
|
2621
|
+
)
|
2358
2622
|
label_text = f"p{page.number}"
|
2359
2623
|
if pdf_name:
|
2360
2624
|
label_text += f" - {pdf_name}"
|
@@ -2364,43 +2628,65 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2364
2628
|
# Placeholder logic - adjust based on how classification results are stored
|
2365
2629
|
category = None
|
2366
2630
|
confidence = None
|
2367
|
-
if
|
2368
|
-
|
2631
|
+
if (
|
2632
|
+
hasattr(page, "analyses")
|
2633
|
+
and page.analyses
|
2634
|
+
and "classification" in page.analyses
|
2635
|
+
):
|
2636
|
+
result = page.analyses["classification"]
|
2369
2637
|
# Adapt based on actual structure of classification result
|
2370
|
-
category =
|
2371
|
-
|
2638
|
+
category = (
|
2639
|
+
getattr(result, "label", None) or result.get("label", None)
|
2640
|
+
if isinstance(result, dict)
|
2641
|
+
else None
|
2642
|
+
)
|
2643
|
+
confidence = (
|
2644
|
+
getattr(result, "score", None) or result.get("score", None)
|
2645
|
+
if isinstance(result, dict)
|
2646
|
+
else None
|
2647
|
+
)
|
2372
2648
|
|
2373
2649
|
if category is not None and confidence is not None:
|
2374
|
-
|
2375
|
-
category_str = f"{category} ({confidence:.2f})"
|
2650
|
+
try:
|
2651
|
+
category_str = f"{category} ({confidence:.2f})" # Format confidence
|
2376
2652
|
label_text += f"\\n{category_str}"
|
2377
|
-
|
2378
|
-
|
2653
|
+
except (TypeError, ValueError):
|
2654
|
+
pass # Ignore formatting errors
|
2379
2655
|
|
2380
2656
|
# Calculate bounding box for multi-line text and draw background/text
|
2381
2657
|
try:
|
2382
2658
|
# Using textbbox for potentially better accuracy with specific fonts
|
2383
2659
|
# Note: textbbox needs Pillow 8+
|
2384
|
-
bbox = draw.textbbox(
|
2385
|
-
|
2386
|
-
|
2660
|
+
bbox = draw.textbbox(
|
2661
|
+
(5, 5), label_text, font=font, spacing=2
|
2662
|
+
) # Use textbbox if available
|
2663
|
+
bg_rect = (
|
2664
|
+
max(0, bbox[0] - 2),
|
2665
|
+
max(0, bbox[1] - 2),
|
2666
|
+
min(img.width, bbox[2] + 2),
|
2667
|
+
min(img.height, bbox[3] + 2),
|
2668
|
+
)
|
2387
2669
|
|
2388
2670
|
# Draw semi-transparent background
|
2389
|
-
overlay = Image.new(
|
2671
|
+
overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
|
2390
2672
|
draw_overlay = ImageDraw.Draw(overlay)
|
2391
|
-
draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180))
|
2392
|
-
img = Image.alpha_composite(img.convert(
|
2393
|
-
draw = ImageDraw.Draw(img)
|
2673
|
+
draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180)) # White with alpha
|
2674
|
+
img = Image.alpha_composite(img.convert("RGBA"), overlay).convert("RGB")
|
2675
|
+
draw = ImageDraw.Draw(img) # Recreate draw object
|
2394
2676
|
|
2395
2677
|
# Draw the potentially multi-line text
|
2396
2678
|
draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
|
2397
|
-
except AttributeError:
|
2679
|
+
except AttributeError: # Fallback for older Pillow without textbbox
|
2398
2680
|
# Approximate size and draw
|
2399
2681
|
# This might not be perfectly aligned
|
2400
|
-
|
2401
|
-
|
2682
|
+
draw.rectangle(
|
2683
|
+
(2, 2, 150, 40), fill=(255, 255, 255, 180)
|
2684
|
+
) # Simple fixed background
|
2685
|
+
draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
|
2402
2686
|
except Exception as draw_err:
|
2403
|
-
|
2687
|
+
logger.error(
|
2688
|
+
f"Error drawing label on page {page.number}: {draw_err}", exc_info=True
|
2689
|
+
)
|
2404
2690
|
|
2405
2691
|
page_images.append(img)
|
2406
2692
|
|
@@ -2408,7 +2694,6 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2408
2694
|
logger.warning("No page images were successfully rendered for the grid.")
|
2409
2695
|
return None
|
2410
2696
|
|
2411
|
-
|
2412
2697
|
# Calculate grid dimensions if not provided
|
2413
2698
|
num_images = len(page_images)
|
2414
2699
|
if not rows and not cols:
|
@@ -2418,24 +2703,23 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2418
2703
|
cols = (num_images + rows - 1) // rows
|
2419
2704
|
elif cols and not rows:
|
2420
2705
|
rows = (num_images + cols - 1) // cols
|
2421
|
-
cols = max(1, cols if cols else 1)
|
2706
|
+
cols = max(1, cols if cols else 1) # Ensure at least 1
|
2422
2707
|
rows = max(1, rows if rows else 1)
|
2423
2708
|
|
2424
|
-
|
2425
2709
|
# Get maximum dimensions for consistent grid cells
|
2426
2710
|
max_width = max(img.width for img in page_images) if page_images else 1
|
2427
2711
|
max_height = max(img.height for img in page_images) if page_images else 1
|
2428
2712
|
|
2429
|
-
|
2430
2713
|
# Create grid image
|
2431
2714
|
grid_width = cols * max_width + (cols + 1) * spacing
|
2432
2715
|
grid_height = rows * max_height + (rows + 1) * spacing
|
2433
|
-
grid_img = Image.new(
|
2434
|
-
|
2716
|
+
grid_img = Image.new(
|
2717
|
+
"RGB", (grid_width, grid_height), (220, 220, 220)
|
2718
|
+
) # Lighter gray background
|
2435
2719
|
|
2436
2720
|
# Place images in grid
|
2437
2721
|
for i, img in enumerate(page_images):
|
2438
|
-
if i >= rows * cols:
|
2722
|
+
if i >= rows * cols: # Ensure we don't exceed grid capacity
|
2439
2723
|
break
|
2440
2724
|
|
2441
2725
|
row = i // cols
|
@@ -2484,8 +2768,8 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2484
2768
|
if not self.pages:
|
2485
2769
|
raise ValueError("Cannot save an empty PageCollection.")
|
2486
2770
|
|
2487
|
-
if not (ocr ^ original):
|
2488
|
-
|
2771
|
+
if not (ocr ^ original): # XOR: exactly one must be true
|
2772
|
+
raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
|
2489
2773
|
|
2490
2774
|
output_path_obj = Path(output_path)
|
2491
2775
|
output_path_str = str(output_path_obj)
|
@@ -2494,18 +2778,29 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2494
2778
|
if create_searchable_pdf is None:
|
2495
2779
|
raise ImportError(
|
2496
2780
|
"Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
|
2497
|
-
|
2781
|
+
'Install with: pip install \\"natural-pdf[ocr-export]\\"' # Escaped quotes
|
2498
2782
|
)
|
2499
2783
|
|
2500
2784
|
# Check for non-OCR vector elements (provide a warning)
|
2501
2785
|
has_vector_elements = False
|
2502
2786
|
for page in self.pages:
|
2503
2787
|
# Simplified check for common vector types or non-OCR chars/words
|
2504
|
-
if (
|
2505
|
-
hasattr(page,
|
2506
|
-
|
2507
|
-
|
2508
|
-
|
2788
|
+
if (
|
2789
|
+
hasattr(page, "rects")
|
2790
|
+
and page.rects
|
2791
|
+
or hasattr(page, "lines")
|
2792
|
+
and page.lines
|
2793
|
+
or hasattr(page, "curves")
|
2794
|
+
and page.curves
|
2795
|
+
or (
|
2796
|
+
hasattr(page, "chars")
|
2797
|
+
and any(getattr(el, "source", None) != "ocr" for el in page.chars)
|
2798
|
+
)
|
2799
|
+
or (
|
2800
|
+
hasattr(page, "words")
|
2801
|
+
and any(getattr(el, "source", None) != "ocr" for el in page.words)
|
2802
|
+
)
|
2803
|
+
):
|
2509
2804
|
has_vector_elements = True
|
2510
2805
|
break
|
2511
2806
|
if has_vector_elements:
|
@@ -2532,22 +2827,22 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2532
2827
|
if create_original_pdf is None:
|
2533
2828
|
raise ImportError(
|
2534
2829
|
"Saving with original=True requires 'pikepdf'. "
|
2535
|
-
|
2830
|
+
'Install with: pip install \\"natural-pdf[ocr-export]\\"' # Escaped quotes
|
2536
2831
|
)
|
2537
2832
|
|
2538
2833
|
# Check for OCR elements (provide a warning) - keep this check here
|
2539
2834
|
has_ocr_elements = False
|
2540
2835
|
for page in self.pages:
|
2541
|
-
|
2542
|
-
|
2543
|
-
|
2544
|
-
|
2545
|
-
|
2546
|
-
|
2547
|
-
|
2548
|
-
|
2549
|
-
|
2550
|
-
|
2836
|
+
# Use find_all which returns a collection; check if it's non-empty
|
2837
|
+
if hasattr(page, "find_all"):
|
2838
|
+
ocr_text_elements = page.find_all("text[source=ocr]")
|
2839
|
+
if ocr_text_elements: # Check truthiness of collection
|
2840
|
+
has_ocr_elements = True
|
2841
|
+
break
|
2842
|
+
elif hasattr(page, "words"): # Fallback check if find_all isn't present?
|
2843
|
+
if any(getattr(el, "source", None) == "ocr" for el in page.words):
|
2844
|
+
has_ocr_elements = True
|
2845
|
+
break
|
2551
2846
|
|
2552
2847
|
if has_ocr_elements:
|
2553
2848
|
logger.warning(
|
@@ -2565,5 +2860,5 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2565
2860
|
except Exception as e:
|
2566
2861
|
# Error logging is handled within create_original_pdf
|
2567
2862
|
# Re-raise the exception caught from the exporter
|
2568
|
-
raise e
|
2863
|
+
raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
|
2569
2864
|
# <--- END MODIFIED
|