natural-pdf 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +7 -2
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +120 -40
- natural_pdf/core/page.py +4 -2
- natural_pdf/core/pdf.py +53 -38
- natural_pdf/elements/base.py +17 -0
- natural_pdf/elements/collections.py +203 -59
- natural_pdf/elements/region.py +43 -11
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +40 -61
- natural_pdf/exporters/hocr_font.py +7 -13
- natural_pdf/exporters/original_pdf.py +10 -13
- natural_pdf/exporters/searchable_pdf.py +0 -10
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -49
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +28 -25
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
@@ -20,10 +20,10 @@ from typing import (
 )
 
 from pdfplumber.utils.geometry import objects_to_bbox
-from PIL import Image, ImageDraw, ImageFont
 
 # New Imports
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+from PIL import Image, ImageDraw, ImageFont
 from tqdm.auto import tqdm
 
 from natural_pdf.classification.manager import ClassificationManager
@@ -46,6 +46,7 @@ except ImportError:
 
 try:
     from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
+
     pass
 except ImportError:
     create_searchable_pdf = None
@@ -61,7 +62,7 @@ logger = logging.getLogger(__name__)
 
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
-    from natural_pdf.core.pdf import PDF
+    from natural_pdf.core.pdf import PDF  # ---> ADDED PDF type hint
     from natural_pdf.elements.region import Region
 
 T = TypeVar("T")
@@ -840,6 +841,7 @@ class ElementCollection(
         labels: bool = True,  # Use 'labels' consistent with service
         legend_position: str = "right",
         render_ocr: bool = False,
+        width: Optional[int] = None,  # Add width parameter
     ) -> Optional["Image.Image"]:
         """
         Generates a temporary preview image highlighting elements in this collection
@@ -862,6 +864,7 @@ class ElementCollection(
             labels: Whether to include a legend for the temporary highlights.
             legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
             render_ocr: Whether to render OCR text.
+            width: Optional width for the output image in pixels.
 
         Returns:
             PIL Image object of the temporary preview, or None if rendering fails or
@@ -922,6 +925,7 @@ class ElementCollection(
             page_index=page.index,
             temporary_highlights=highlight_data_list,
             scale=scale,
+            width=width,  # Pass the width parameter
             labels=labels,  # Use 'labels'
             legend_position=legend_position,
             render_ocr=render_ocr,
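The three hunks above thread a new optional `width` argument through `ElementCollection.show()` and into the highlighting service. A minimal usage sketch, assuming an illustrative `example.pdf` (the file name and selector are placeholders, not taken from this diff):

    from natural_pdf import PDF

    pdf = PDF("example.pdf")               # illustrative file
    words = pdf.pages[0].find_all("text")  # an ElementCollection

    # Render the temporary highlight preview at a fixed pixel width
    # instead of relying on scale alone.
    preview = words.show(width=1200, labels=True)
    if preview is not None:
        preview.save("preview.png")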
@@ -1159,10 +1163,96 @@ class ElementCollection(
 
         Args:
             selector: CSS-like selector string
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                'any' (any overlap), or 'center' (center point inside).
+                (default: "all")
             apply_exclusions: Whether to exclude elements in exclusion regions
         """
         return self.apply(lambda element: element.find(selector, **kwargs))
 
+    @overload
+    def find_all(
+        self,
+        *,
+        text: str,
+        contains: str = "all",
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+
+    @overload
+    def find_all(
+        self,
+        selector: str,
+        *,
+        contains: str = "all",
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+
+    def find_all(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        contains: str = "all",
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
+        """
+        Find all elements within each element of this collection matching the selector OR text,
+        and return a flattened collection of all found sub-elements.
+
+        Provide EITHER `selector` OR `text`, but not both.
+
+        Args:
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                'any' (any overlap), or 'center' (center point inside).
+                (default: "all")
+            apply_exclusions: Whether to apply exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional parameters for element filtering.
+
+        Returns:
+            A new ElementCollection containing all matching sub-elements from all elements
+            in this collection.
+        """
+        if selector is None and text is None:
+            raise ValueError("Either 'selector' or 'text' must be provided to find_all.")
+        if selector is not None and text is not None:
+            raise ValueError("Provide either 'selector' or 'text' to find_all, not both.")
+
+        all_found_elements: List[Element] = []
+        for element in self._elements:
+            if hasattr(element, "find_all") and callable(element.find_all):
+                # Element.find_all returns an ElementCollection
+                found_in_element: "ElementCollection" = element.find_all(
+                    selector=selector,
+                    text=text,
+                    contains=contains,
+                    apply_exclusions=apply_exclusions,
+                    regex=regex,
+                    case=case,
+                    **kwargs,
+                )
+                if found_in_element and found_in_element.elements:
+                    all_found_elements.extend(found_in_element.elements)
+            # else:
+            #     Elements in the collection are expected to support find_all.
+            #     If an element type doesn't, an AttributeError will naturally occur,
+            #     or a more specific check/handling could be added here if needed.
+
+        return ElementCollection(all_found_elements)
+
     def extract_each_text(self, **kwargs) -> List[str]:
         """
         Extract text from each element in this region.
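The new `ElementCollection.find_all()` added above searches inside every element of the collection and returns a single flattened `ElementCollection`. A hedged sketch of how it might be used; the file name and the `region[type=table]` selector are illustrative, and the `analyze_layout()` call is assumed as the usual way such regions come to exist:

    from natural_pdf import PDF

    pdf = PDF("report.pdf")                       # illustrative file
    page = pdf.pages[0]
    page.analyze_layout()                         # assumed prerequisite for region[type=...]
    tables = page.find_all("region[type=table]")  # ElementCollection of layout regions

    # One flattened collection of every matching sub-element across all regions.
    totals = tables.find_all(text="Total", case=False)
    print(f"{len(totals)} matches across {len(tables)} table regions")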
@@ -1633,6 +1723,7 @@ class PageCollection(Generic[P], ApplyMixin):
         self,
         *,
         text: str,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1644,6 +1735,7 @@ class PageCollection(Generic[P], ApplyMixin):
         self,
         selector: str,
         *,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1655,6 +1747,7 @@ class PageCollection(Generic[P], ApplyMixin):
         selector: Optional[str] = None,
         *,
         text: Optional[str] = None,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1668,6 +1761,9 @@ class PageCollection(Generic[P], ApplyMixin):
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                'any' (any overlap), or 'center' (center point inside).
+                (default: "all")
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1681,6 +1777,7 @@ class PageCollection(Generic[P], ApplyMixin):
             element = page.find(
                 selector=selector,
                 text=text,
+                contains=contains,
                 apply_exclusions=apply_exclusions,
                 regex=regex,
                 case=case,
@@ -1695,6 +1792,7 @@ class PageCollection(Generic[P], ApplyMixin):
         self,
         *,
         text: str,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1706,6 +1804,7 @@ class PageCollection(Generic[P], ApplyMixin):
         self,
         selector: str,
         *,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1717,6 +1816,7 @@ class PageCollection(Generic[P], ApplyMixin):
         selector: Optional[str] = None,
         *,
         text: Optional[str] = None,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1730,6 +1830,9 @@ class PageCollection(Generic[P], ApplyMixin):
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                'any' (any overlap), or 'center' (center point inside).
+                (default: "all")
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1744,6 +1847,7 @@ class PageCollection(Generic[P], ApplyMixin):
             elements = page.find_all(
                 selector=selector,
                 text=text,
+                contains=contains,
                 apply_exclusions=apply_exclusions,
                 regex=regex,
                 case=case,
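`PageCollection.find()` and `PageCollection.find_all()` now accept the same `contains` argument and simply forward it to each page, as the hunks above show. A sketch with an illustrative document:

    from natural_pdf import PDF

    pdf = PDF("statement.pdf")  # illustrative file

    # contains="any" keeps elements that merely overlap a matching area,
    # rather than requiring them to sit fully inside (the default, "all").
    hits = pdf.pages.find_all(text="Balance", contains="any", case=False)
    for element in hits:
        print(element.page.number, element.extract_text())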
@@ -2314,8 +2418,10 @@ class PageCollection(Generic[P], ApplyMixin):
         try:
             from PIL import Image, ImageDraw, ImageFont
         except ImportError:
-
-
+            logger.error(
+                "Pillow library not found, required for to_image(). Install with 'pip install Pillow'"
+            )
+            return None
 
         if not self.pages:
             logger.warning("Cannot generate image for empty PageCollection")
@@ -2334,27 +2440,34 @@ class PageCollection(Generic[P], ApplyMixin):
         try:
             font = ImageFont.load_default(16)
         except IOError:
-
-
+            logger.warning("Default font not found. Labels cannot be added.")
+            add_labels = False  # Disable if no font
 
         # Render individual page images
         page_images = []
         for page in pages_to_render:
             try:
                 # Assume page.to_image returns a PIL Image or None
-                img = page.to_image(
+                img = page.to_image(
+                    width=page_width, include_highlights=True
+                )  # Render with highlights for visual context
                 if img is None:
-
-
+                    logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
+                    continue
             except Exception as img_err:
-
-
-
+                logger.error(
+                    f"Error generating image for page {page.number}: {img_err}", exc_info=True
+                )
+                continue
 
             # Add page number label
             if add_labels and font:
                 draw = ImageDraw.Draw(img)
-                pdf_name =
+                pdf_name = (
+                    Path(page.pdf.path).stem
+                    if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path")
+                    else ""
+                )
                 label_text = f"p{page.number}"
                 if pdf_name:
                     label_text += f" - {pdf_name}"
@@ -2364,43 +2477,65 @@ class PageCollection(Generic[P], ApplyMixin):
                 # Placeholder logic - adjust based on how classification results are stored
                 category = None
                 confidence = None
-                if
-
+                if (
+                    hasattr(page, "analyses")
+                    and page.analyses
+                    and "classification" in page.analyses
+                ):
+                    result = page.analyses["classification"]
                     # Adapt based on actual structure of classification result
-                    category =
-
+                    category = (
+                        getattr(result, "label", None) or result.get("label", None)
+                        if isinstance(result, dict)
+                        else None
+                    )
+                    confidence = (
+                        getattr(result, "score", None) or result.get("score", None)
+                        if isinstance(result, dict)
+                        else None
+                    )
 
                 if category is not None and confidence is not None:
-
-                    category_str = f"{category} ({confidence:.2f})"
+                    try:
+                        category_str = f"{category} ({confidence:.2f})"  # Format confidence
                         label_text += f"\\n{category_str}"
-
-
+                    except (TypeError, ValueError):
+                        pass  # Ignore formatting errors
 
                 # Calculate bounding box for multi-line text and draw background/text
                 try:
                     # Using textbbox for potentially better accuracy with specific fonts
                     # Note: textbbox needs Pillow 8+
-                    bbox = draw.textbbox(
-
-
+                    bbox = draw.textbbox(
+                        (5, 5), label_text, font=font, spacing=2
+                    )  # Use textbbox if available
+                    bg_rect = (
+                        max(0, bbox[0] - 2),
+                        max(0, bbox[1] - 2),
+                        min(img.width, bbox[2] + 2),
+                        min(img.height, bbox[3] + 2),
+                    )
 
                     # Draw semi-transparent background
-                    overlay = Image.new(
+                    overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
                     draw_overlay = ImageDraw.Draw(overlay)
-                    draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180))
-                    img = Image.alpha_composite(img.convert(
-                    draw = ImageDraw.Draw(img)
+                    draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180))  # White with alpha
+                    img = Image.alpha_composite(img.convert("RGBA"), overlay).convert("RGB")
+                    draw = ImageDraw.Draw(img)  # Recreate draw object
 
                     # Draw the potentially multi-line text
                     draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
-                except AttributeError:
+                except AttributeError:  # Fallback for older Pillow without textbbox
                     # Approximate size and draw
                     # This might not be perfectly aligned
-
-
+                    draw.rectangle(
+                        (2, 2, 150, 40), fill=(255, 255, 255, 180)
+                    )  # Simple fixed background
+                    draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
                 except Exception as draw_err:
-
+                    logger.error(
+                        f"Error drawing label on page {page.number}: {draw_err}", exc_info=True
+                    )
 
             page_images.append(img)
@@ -2408,7 +2543,6 @@ class PageCollection(Generic[P], ApplyMixin):
             logger.warning("No page images were successfully rendered for the grid.")
             return None
 
-
         # Calculate grid dimensions if not provided
         num_images = len(page_images)
         if not rows and not cols:
@@ -2418,24 +2552,23 @@ class PageCollection(Generic[P], ApplyMixin):
             cols = (num_images + rows - 1) // rows
         elif cols and not rows:
             rows = (num_images + cols - 1) // cols
-        cols = max(1, cols if cols else 1)
+        cols = max(1, cols if cols else 1)  # Ensure at least 1
         rows = max(1, rows if rows else 1)
 
-
        # Get maximum dimensions for consistent grid cells
         max_width = max(img.width for img in page_images) if page_images else 1
         max_height = max(img.height for img in page_images) if page_images else 1
 
-
         # Create grid image
         grid_width = cols * max_width + (cols + 1) * spacing
         grid_height = rows * max_height + (rows + 1) * spacing
-        grid_img = Image.new(
-
+        grid_img = Image.new(
+            "RGB", (grid_width, grid_height), (220, 220, 220)
+        )  # Lighter gray background
 
         # Place images in grid
         for i, img in enumerate(page_images):
-            if i >= rows * cols:
+            if i >= rows * cols:  # Ensure we don't exceed grid capacity
                 break
 
             row = i // cols
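The grid-rendering hunks above harden `PageCollection.to_image()`: a clear error and `None` return when Pillow is missing, per-page failure handling, and multi-line labels combining page number, source PDF stem, and any stored classification result. A usage sketch; the keyword names (`page_width`, `cols`, `add_labels`) are inferred from the local variables in these hunks and should be treated as assumptions rather than a confirmed signature:

    from natural_pdf import PDF

    pdf = PDF("example.pdf")  # illustrative file

    # Render the whole collection as a labelled contact-sheet grid.
    grid = pdf.pages.to_image(page_width=300, cols=4, add_labels=True)  # assumed kwargs
    if grid is not None:
        grid.save("pages_grid.png")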
@@ -2484,8 +2617,8 @@ class PageCollection(Generic[P], ApplyMixin):
         if not self.pages:
             raise ValueError("Cannot save an empty PageCollection.")
 
-        if not (ocr ^ original):
-
+        if not (ocr ^ original):  # XOR: exactly one must be true
+            raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
 
         output_path_obj = Path(output_path)
         output_path_str = str(output_path_obj)
|
|
2494
2627
|
if create_searchable_pdf is None:
|
2495
2628
|
raise ImportError(
|
2496
2629
|
"Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
|
2497
|
-
|
2630
|
+
'Install with: pip install \\"natural-pdf[ocr-export]\\"' # Escaped quotes
|
2498
2631
|
)
|
2499
2632
|
|
2500
2633
|
# Check for non-OCR vector elements (provide a warning)
|
2501
2634
|
has_vector_elements = False
|
2502
2635
|
for page in self.pages:
|
2503
2636
|
# Simplified check for common vector types or non-OCR chars/words
|
2504
|
-
if (
|
2505
|
-
hasattr(page,
|
2506
|
-
|
2507
|
-
|
2508
|
-
|
2637
|
+
if (
|
2638
|
+
hasattr(page, "rects")
|
2639
|
+
and page.rects
|
2640
|
+
or hasattr(page, "lines")
|
2641
|
+
and page.lines
|
2642
|
+
or hasattr(page, "curves")
|
2643
|
+
and page.curves
|
2644
|
+
or (
|
2645
|
+
hasattr(page, "chars")
|
2646
|
+
and any(getattr(el, "source", None) != "ocr" for el in page.chars)
|
2647
|
+
)
|
2648
|
+
or (
|
2649
|
+
hasattr(page, "words")
|
2650
|
+
and any(getattr(el, "source", None) != "ocr" for el in page.words)
|
2651
|
+
)
|
2652
|
+
):
|
2509
2653
|
has_vector_elements = True
|
2510
2654
|
break
|
2511
2655
|
if has_vector_elements:
|
@@ -2532,22 +2676,22 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2532
2676
|
if create_original_pdf is None:
|
2533
2677
|
raise ImportError(
|
2534
2678
|
"Saving with original=True requires 'pikepdf'. "
|
2535
|
-
|
2679
|
+
'Install with: pip install \\"natural-pdf[ocr-export]\\"' # Escaped quotes
|
2536
2680
|
)
|
2537
2681
|
|
2538
2682
|
# Check for OCR elements (provide a warning) - keep this check here
|
2539
2683
|
has_ocr_elements = False
|
2540
2684
|
for page in self.pages:
|
2541
|
-
|
2542
|
-
|
2543
|
-
|
2544
|
-
|
2545
|
-
|
2546
|
-
|
2547
|
-
|
2548
|
-
|
2549
|
-
|
2550
|
-
|
2685
|
+
# Use find_all which returns a collection; check if it's non-empty
|
2686
|
+
if hasattr(page, "find_all"):
|
2687
|
+
ocr_text_elements = page.find_all("text[source=ocr]")
|
2688
|
+
if ocr_text_elements: # Check truthiness of collection
|
2689
|
+
has_ocr_elements = True
|
2690
|
+
break
|
2691
|
+
elif hasattr(page, "words"): # Fallback check if find_all isn't present?
|
2692
|
+
if any(getattr(el, "source", None) == "ocr" for el in page.words):
|
2693
|
+
has_ocr_elements = True
|
2694
|
+
break
|
2551
2695
|
|
2552
2696
|
if has_ocr_elements:
|
2553
2697
|
logger.warning(
|
@@ -2565,5 +2709,5 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2565
2709
|
except Exception as e:
|
2566
2710
|
# Error logging is handled within create_original_pdf
|
2567
2711
|
# Re-raise the exception caught from the exporter
|
2568
|
-
raise e
|
2712
|
+
raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
|
2569
2713
|
# <--- END MODIFIED
|
natural_pdf/elements/region.py
CHANGED
@@ -772,6 +772,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         # Add a default color for standalone show
         color: Optional[Union[Tuple, str]] = "blue",
         label: Optional[str] = None,
+        width: Optional[int] = None,  # Add width parameter
     ) -> "Image.Image":
         """
         Show the page with just this region highlighted temporarily.
@@ -782,6 +783,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
             legend_position: Position of the legend
             color: Color to highlight this region (default: blue)
             label: Optional label for this region in the legend
+            width: Optional width for the output image in pixels
 
         Returns:
             PIL Image of the page with only this region highlighted
@@ -812,6 +814,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
             page_index=self._page.index,
             temporary_highlights=[temp_highlight_data],
             scale=scale,
+            width=width,  # Pass the width parameter
             labels=labels,
             legend_position=legend_position,
         )
@@ -1333,6 +1336,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         self,
         *,
         text: str,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1344,6 +1348,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         self,
         selector: str,
         *,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1355,6 +1360,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         selector: Optional[str] = None,  # Now optional
         *,
         text: Optional[str] = None,  # New text parameter
+        contains: str = "all",  # New parameter for containment behavior
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1368,6 +1374,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                'any' (any overlap), or 'center' (center point inside).
+                (default: "all")
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1380,6 +1389,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         elements = self.find_all(
             selector=selector,
             text=text,
+            contains=contains,
             apply_exclusions=apply_exclusions,
             regex=regex,
             case=case,
@@ -1392,6 +1402,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         self,
         *,
         text: str,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1403,6 +1414,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         self,
         selector: str,
         *,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1414,6 +1426,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         selector: Optional[str] = None,  # Now optional
         *,
         text: Optional[str] = None,  # New text parameter
+        contains: str = "all",  # New parameter to control inside/overlap behavior
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -1427,6 +1440,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                'any' (any overlap), or 'center' (center point inside).
+                (default: "all")
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1442,6 +1458,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         if selector is None and text is None:
             raise ValueError("Provide either 'selector' or 'text'.")
 
+        # Validate contains parameter
+        if contains not in ["all", "any", "center"]:
+            raise ValueError(f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'")
+
         # Construct selector if 'text' is provided
         effective_selector = ""
         if text is not None:
@@ -1481,22 +1501,34 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         # Let the page handle its exclusion logic if needed
         potential_elements = self.page.find_all(
             selector=effective_selector,
-            apply_exclusions=
+            apply_exclusions=apply_exclusions,
             regex=regex,
             case=case,
             **kwargs,
         )
 
-        # Filter these elements
+        # Filter these elements based on the specified containment method
         region_bbox = self.bbox
-        matching_elements = [
-
-
-
-
-
-
-
+        matching_elements = []
+
+        if contains == "all":  # Fully inside (strict)
+            matching_elements = [
+                el for el in potential_elements
+                if el.x0 >= region_bbox[0]
+                and el.top >= region_bbox[1]
+                and el.x1 <= region_bbox[2]
+                and el.bottom <= region_bbox[3]
+            ]
+        elif contains == "any":  # Any overlap
+            matching_elements = [
+                el for el in potential_elements
+                if self.intersects(el)
+            ]
+        elif contains == "center":  # Center point inside
+            matching_elements = [
+                el for el in potential_elements
+                if self.is_element_center_inside(el)
+            ]
 
         return ElementCollection(matching_elements)
 
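With the filtering above, `Region.find_all()` interprets `contains` as: 'all' keeps only elements whose bounding box lies fully inside the region, 'any' keeps anything that intersects it, and 'center' keeps elements whose center point falls inside. A sketch with an illustrative file; `page.region(...)` is assumed here as the usual way to build a region from coordinates:

    from natural_pdf import PDF

    pdf = PDF("form.pdf")                        # illustrative file
    page = pdf.pages[0]
    header = page.region(0, 0, page.width, 120)  # assumed helper: left, top, right, bottom

    strictly_inside = header.find_all("text")               # default contains="all"
    touching = header.find_all("text", contains="any")      # any overlap counts
    by_center = header.find_all("text", contains="center")  # center point inside
    print(len(strictly_inside), len(touching), len(by_center))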
@@ -1988,7 +2020,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
             from natural_pdf.qa.document_qa import get_qa_engine
         except ImportError:
             logger.error(
-                "Question answering requires optional dependencies. Install with `pip install natural-pdf[
+                "Question answering requires optional dependencies. Install with `pip install natural-pdf[core-ml]`"
             )
             return {
                 "answer": None,