natural-pdf 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +43 -3
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/classification/mixin.py +35 -14
- natural_pdf/classification/results.py +16 -1
- natural_pdf/cli.py +1 -0
- natural_pdf/core/highlighting_service.py +23 -0
- natural_pdf/core/page.py +32 -2
- natural_pdf/core/pdf.py +24 -4
- natural_pdf/describe/base.py +11 -1
- natural_pdf/describe/summary.py +26 -0
- natural_pdf/elements/base.py +81 -3
- natural_pdf/elements/collections.py +162 -101
- natural_pdf/elements/region.py +187 -160
- natural_pdf/elements/text.py +15 -7
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +2 -2
- natural_pdf/extraction/mixin.py +295 -11
- natural_pdf/extraction/result.py +28 -1
- natural_pdf/flows/region.py +117 -2
- natural_pdf/ocr/engine_surya.py +25 -5
- natural_pdf/qa/__init__.py +2 -1
- natural_pdf/qa/document_qa.py +166 -113
- natural_pdf/qa/qa_result.py +55 -0
- natural_pdf/selectors/parser.py +22 -0
- natural_pdf/utils/text_extraction.py +34 -14
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/METADATA +22 -13
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/RECORD +31 -30
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/top_level.txt +0 -0
@@ -290,7 +290,13 @@ class ElementCollection(
|
|
290
290
|
|
291
291
|
return ElementCollection(filtered)
|
292
292
|
|
293
|
-
def extract_text(
|
293
|
+
def extract_text(
|
294
|
+
self,
|
295
|
+
preserve_whitespace: bool = True,
|
296
|
+
use_exclusions: bool = True,
|
297
|
+
strip: Optional[bool] = None,
|
298
|
+
**kwargs,
|
299
|
+
) -> str:
|
294
300
|
"""
|
295
301
|
Extract text from all TextElements in the collection, optionally using
|
296
302
|
pdfplumber's layout engine if layout=True is specified.
|
@@ -303,6 +309,7 @@ class ElementCollection(
|
|
303
309
|
`chars_to_textmap` function ONLY if `layout=True` is passed.
|
304
310
|
See Page.extract_text docstring for common parameters.
|
305
311
|
If `layout=False` or omitted, performs a simple join.
|
312
|
+
strip: Whether to strip whitespace from the extracted text.
|
306
313
|
|
307
314
|
Returns:
|
308
315
|
Combined text from elements, potentially with layout-based spacing.
|
@@ -399,6 +406,12 @@ class ElementCollection(
|
|
399
406
|
result = "".join(c.get("text", "") for c in all_char_dicts)
|
400
407
|
# Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
|
401
408
|
|
409
|
+
# Determine final strip flag – same rule as global helper unless caller overrides
|
410
|
+
strip_text = strip if strip is not None else (not use_layout)
|
411
|
+
|
412
|
+
if strip_text and isinstance(result, str):
|
413
|
+
result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
|
414
|
+
|
402
415
|
return result
|
403
416
|
|
404
417
|
def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
|
@@ -852,6 +865,7 @@ class ElementCollection(
|
|
852
865
|
render_ocr: bool = False,
|
853
866
|
width: Optional[int] = None, # Add width parameter
|
854
867
|
page: Optional[Any] = None, # NEW: Optional page parameter for empty collections
|
868
|
+
crop: bool = False, # NEW: If True, crop output to element bounds
|
855
869
|
) -> Optional["Image.Image"]:
|
856
870
|
"""
|
857
871
|
Generates a temporary preview image highlighting elements in this collection
|
@@ -875,6 +889,9 @@ class ElementCollection(
|
|
875
889
|
legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
|
876
890
|
render_ocr: Whether to render OCR text.
|
877
891
|
width: Optional width for the output image in pixels.
|
892
|
+
crop: If True, crop the resulting image to the tight bounding box
|
893
|
+
containing all elements in the collection. The elements are
|
894
|
+
still highlighted first, then the image is cropped.
|
878
895
|
|
879
896
|
Returns:
|
880
897
|
PIL Image object of the temporary preview, or None if rendering fails or
|
@@ -931,7 +948,23 @@ class ElementCollection(
|
|
931
948
|
|
932
949
|
# 2. Call render_preview on the HighlightingService
|
933
950
|
try:
|
934
|
-
|
951
|
+
# Calculate crop bounding box in PDF coordinates if crop is requested
|
952
|
+
crop_bbox = None
|
953
|
+
if crop:
|
954
|
+
try:
|
955
|
+
crop_bbox = (
|
956
|
+
min(el.x0 for el in self._elements),
|
957
|
+
min(el.top for el in self._elements),
|
958
|
+
max(el.x1 for el in self._elements),
|
959
|
+
max(el.bottom for el in self._elements),
|
960
|
+
)
|
961
|
+
except Exception as bbox_err:
|
962
|
+
logger.error(
|
963
|
+
f"Error determining crop bbox for collection show: {bbox_err}",
|
964
|
+
exc_info=True,
|
965
|
+
)
|
966
|
+
|
967
|
+
img = service.render_preview(
|
935
968
|
page_index=page.index,
|
936
969
|
temporary_highlights=highlight_data_list,
|
937
970
|
scale=scale,
|
@@ -939,7 +972,9 @@ class ElementCollection(
|
|
939
972
|
labels=labels, # Use 'labels'
|
940
973
|
legend_position=legend_position,
|
941
974
|
render_ocr=render_ocr,
|
975
|
+
crop_bbox=crop_bbox,
|
942
976
|
)
|
977
|
+
return img
|
943
978
|
except Exception as e:
|
944
979
|
logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
|
945
980
|
return None
|
@@ -1798,8 +1833,40 @@ class ElementCollection(
|
|
1798
1833
|
# Mix object bounds with specific overrides
|
1799
1834
|
clipped_elements = collection.clip(obj=container, bottom=page.height/2)
|
1800
1835
|
"""
|
1836
|
+
# --- NEW BEHAVIOUR: support per-element clipping with sequences --- #
|
1837
|
+
from collections.abc import Sequence # Local import to avoid top-level issues
|
1838
|
+
|
1839
|
+
# Detect if *obj* is a sequence meant to map one-to-one with the elements
|
1840
|
+
clip_objs = None # type: Optional[List[Any]]
|
1841
|
+
if isinstance(obj, ElementCollection):
|
1842
|
+
clip_objs = obj.elements
|
1843
|
+
elif isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)):
|
1844
|
+
clip_objs = list(obj)
|
1845
|
+
|
1846
|
+
if clip_objs is not None:
|
1847
|
+
if len(clip_objs) != len(self._elements):
|
1848
|
+
raise ValueError(
|
1849
|
+
f"Number of clipping objects ({len(clip_objs)}) does not match number of "
|
1850
|
+
f"elements in collection ({len(self._elements)})."
|
1851
|
+
)
|
1852
|
+
|
1853
|
+
clipped_elements = [
|
1854
|
+
el.clip(
|
1855
|
+
obj=clip_obj,
|
1856
|
+
left=left,
|
1857
|
+
top=top,
|
1858
|
+
right=right,
|
1859
|
+
bottom=bottom,
|
1860
|
+
)
|
1861
|
+
for el, clip_obj in zip(self._elements, clip_objs)
|
1862
|
+
]
|
1863
|
+
return ElementCollection(clipped_elements)
|
1864
|
+
|
1865
|
+
# Fallback to original behaviour: apply same clipping parameters to all elements
|
1801
1866
|
return self.apply(
|
1802
|
-
lambda element: element.clip(
|
1867
|
+
lambda element: element.clip(
|
1868
|
+
obj=obj, left=left, top=top, right=right, bottom=bottom
|
1869
|
+
)
|
1803
1870
|
)
|
1804
1871
|
|
1805
1872
|
|
@@ -1838,13 +1905,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
1838
1905
|
"""Return a string representation showing the page count."""
|
1839
1906
|
return f"<PageCollection(count={len(self)})>"
|
1840
1907
|
|
1841
|
-
def extract_text(
|
1908
|
+
def extract_text(
|
1909
|
+
self,
|
1910
|
+
keep_blank_chars: bool = True,
|
1911
|
+
apply_exclusions: bool = True,
|
1912
|
+
strip: Optional[bool] = None,
|
1913
|
+
**kwargs,
|
1914
|
+
) -> str:
|
1842
1915
|
"""
|
1843
1916
|
Extract text from all pages in the collection.
|
1844
1917
|
|
1845
1918
|
Args:
|
1846
1919
|
keep_blank_chars: Whether to keep blank characters (default: True)
|
1847
1920
|
apply_exclusions: Whether to apply exclusion regions (default: True)
|
1921
|
+
strip: Whether to strip whitespace from the extracted text.
|
1848
1922
|
**kwargs: Additional extraction parameters
|
1849
1923
|
|
1850
1924
|
Returns:
|
@@ -1853,11 +1927,22 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
1853
1927
|
texts = []
|
1854
1928
|
for page in self.pages:
|
1855
1929
|
text = page.extract_text(
|
1856
|
-
keep_blank_chars=keep_blank_chars,
|
1930
|
+
keep_blank_chars=keep_blank_chars,
|
1931
|
+
apply_exclusions=apply_exclusions,
|
1932
|
+
**kwargs,
|
1857
1933
|
)
|
1858
1934
|
texts.append(text)
|
1859
1935
|
|
1860
|
-
|
1936
|
+
combined = "\n".join(texts)
|
1937
|
+
|
1938
|
+
# Default strip behaviour: if caller picks, honour; else respect layout flag passed via kwargs.
|
1939
|
+
use_layout = kwargs.get("layout", False)
|
1940
|
+
strip_final = strip if strip is not None else (not use_layout)
|
1941
|
+
|
1942
|
+
if strip_final:
|
1943
|
+
combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()
|
1944
|
+
|
1945
|
+
return combined
|
1861
1946
|
|
1862
1947
|
def apply_ocr(
|
1863
1948
|
self,
|
@@ -2253,6 +2338,44 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2253
2338
|
|
2254
2339
|
# Generate sections
|
2255
2340
|
sections = []
|
2341
|
+
|
2342
|
+
# --- Helper: build a FlowRegion spanning multiple pages ---
|
2343
|
+
def _build_flow_region(start_el, end_el):
|
2344
|
+
"""Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
|
2345
|
+
If *end_el* is None, the region continues to the bottom of the last
|
2346
|
+
page in this PageCollection."""
|
2347
|
+
# Local imports to avoid top-level cycles
|
2348
|
+
from natural_pdf.elements.region import Region
|
2349
|
+
from natural_pdf.flows.flow import Flow
|
2350
|
+
from natural_pdf.flows.element import FlowElement
|
2351
|
+
from natural_pdf.flows.region import FlowRegion
|
2352
|
+
|
2353
|
+
start_pg = start_el.page
|
2354
|
+
end_pg = end_el.page if end_el is not None else self.pages[-1]
|
2355
|
+
|
2356
|
+
parts: list[Region] = []
|
2357
|
+
# Slice of first page
|
2358
|
+
parts.append(Region(start_pg, (0, start_el.top, start_pg.width, start_pg.height)))
|
2359
|
+
|
2360
|
+
# Full middle pages
|
2361
|
+
for pg_idx in range(start_pg.index + 1, end_pg.index):
|
2362
|
+
mid_pg = self.pages[pg_idx]
|
2363
|
+
parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))
|
2364
|
+
|
2365
|
+
# Slice of last page (if distinct)
|
2366
|
+
if end_pg is not start_pg:
|
2367
|
+
bottom = end_el.bottom if end_el is not None else end_pg.height
|
2368
|
+
parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
|
2369
|
+
|
2370
|
+
flow = Flow(segments=parts, arrangement="vertical")
|
2371
|
+
src_fe = FlowElement(physical_object=start_el, flow=flow)
|
2372
|
+
return FlowRegion(flow=flow,
|
2373
|
+
constituent_regions=parts,
|
2374
|
+
source_flow_element=src_fe,
|
2375
|
+
boundary_element_found=end_el)
|
2376
|
+
|
2377
|
+
# ------------------------------------------------------------------
|
2378
|
+
|
2256
2379
|
current_start = None
|
2257
2380
|
|
2258
2381
|
for i, boundary in enumerate(section_boundaries):
|
@@ -2273,50 +2396,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2273
2396
|
)
|
2274
2397
|
sections.append(section)
|
2275
2398
|
else:
|
2276
|
-
# Create
|
2277
|
-
|
2278
|
-
|
2279
|
-
# Get the start and end pages
|
2280
|
-
start_page = start_element.page
|
2281
|
-
end_page = end_element.page
|
2282
|
-
|
2283
|
-
# Create a combined region
|
2284
|
-
combined_region = Region(
|
2285
|
-
start_page, (0, start_element.top, start_page.width, start_page.height)
|
2286
|
-
)
|
2287
|
-
combined_region._spans_pages = True
|
2288
|
-
combined_region._page_range = (start_page.index, end_page.index)
|
2289
|
-
combined_region.start_element = start_element
|
2290
|
-
combined_region.end_element = end_element
|
2291
|
-
|
2292
|
-
# Get all elements that fall within this multi-page region
|
2293
|
-
combined_elements = []
|
2294
|
-
|
2295
|
-
# Get elements from the first page
|
2296
|
-
first_page_elements = [
|
2297
|
-
e
|
2298
|
-
for e in all_elements
|
2299
|
-
if e.page == start_page and e.top >= start_element.top
|
2300
|
-
]
|
2301
|
-
combined_elements.extend(first_page_elements)
|
2302
|
-
|
2303
|
-
# Get elements from middle pages (if any)
|
2304
|
-
for page_idx in range(start_page.index + 1, end_page.index):
|
2305
|
-
middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
|
2306
|
-
combined_elements.extend(middle_page_elements)
|
2307
|
-
|
2308
|
-
# Get elements from the last page
|
2309
|
-
last_page_elements = [
|
2310
|
-
e
|
2311
|
-
for e in all_elements
|
2312
|
-
if e.page == end_page and e.bottom <= end_element.bottom
|
2313
|
-
]
|
2314
|
-
combined_elements.extend(last_page_elements)
|
2315
|
-
|
2316
|
-
# Store the elements in the combined region
|
2317
|
-
combined_region._multi_page_elements = combined_elements
|
2318
|
-
|
2319
|
-
sections.append(combined_region)
|
2399
|
+
# Create FlowRegion spanning pages
|
2400
|
+
flow_region = _build_flow_region(start_element, end_element)
|
2401
|
+
sections.append(flow_region)
|
2320
2402
|
|
2321
2403
|
current_start = None
|
2322
2404
|
|
@@ -2372,54 +2454,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2372
2454
|
last_page_elements.sort(key=lambda e: (e.top, e.x0))
|
2373
2455
|
end_element = last_page_elements[-1] if last_page_elements else None
|
2374
2456
|
|
2375
|
-
# Create
|
2376
|
-
|
2377
|
-
|
2378
|
-
if start_page == last_page:
|
2379
|
-
# Simple case - both on same page
|
2380
|
-
section = start_page.get_section_between(
|
2381
|
-
start_element, end_element, boundary_inclusion
|
2382
|
-
)
|
2383
|
-
sections.append(section)
|
2384
|
-
else:
|
2385
|
-
# Create a multi-page section
|
2386
|
-
combined_region = Region(
|
2387
|
-
start_page, (0, start_element.top, start_page.width, start_page.height)
|
2388
|
-
)
|
2389
|
-
combined_region._spans_pages = True
|
2390
|
-
combined_region._page_range = (start_page.index, last_page.index)
|
2391
|
-
combined_region.start_element = start_element
|
2392
|
-
combined_region.end_element = end_element
|
2393
|
-
|
2394
|
-
# Get all elements that fall within this multi-page region
|
2395
|
-
combined_elements = []
|
2396
|
-
|
2397
|
-
# Get elements from the first page
|
2398
|
-
first_page_elements = [
|
2399
|
-
e
|
2400
|
-
for e in all_elements
|
2401
|
-
if e.page == start_page and e.top >= start_element.top
|
2402
|
-
]
|
2403
|
-
combined_elements.extend(first_page_elements)
|
2404
|
-
|
2405
|
-
# Get elements from middle pages (if any)
|
2406
|
-
for page_idx in range(start_page.index + 1, last_page.index):
|
2407
|
-
middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
|
2408
|
-
combined_elements.extend(middle_page_elements)
|
2409
|
-
|
2410
|
-
# Get elements from the last page
|
2411
|
-
last_page_elements = [
|
2412
|
-
e
|
2413
|
-
for e in all_elements
|
2414
|
-
if e.page == last_page
|
2415
|
-
and (end_element is None or e.bottom <= end_element.bottom)
|
2416
|
-
]
|
2417
|
-
combined_elements.extend(last_page_elements)
|
2418
|
-
|
2419
|
-
# Store the elements in the combined region
|
2420
|
-
combined_region._multi_page_elements = combined_elements
|
2421
|
-
|
2422
|
-
sections.append(combined_region)
|
2457
|
+
# Create FlowRegion spanning multiple pages using helper
|
2458
|
+
flow_region = _build_flow_region(start_element, end_element)
|
2459
|
+
sections.append(flow_region)
|
2423
2460
|
else:
|
2424
2461
|
# With start_elements only, create a section to the end of the current page
|
2425
2462
|
from natural_pdf.elements.region import Region
|
@@ -2607,13 +2644,13 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2607
2644
|
|
2608
2645
|
def to_image(
|
2609
2646
|
self,
|
2610
|
-
page_width: int =
|
2647
|
+
page_width: Optional[int] = None,
|
2611
2648
|
cols: Optional[int] = 4,
|
2612
2649
|
rows: Optional[int] = None,
|
2613
2650
|
max_pages: Optional[int] = None,
|
2614
2651
|
spacing: int = 10,
|
2615
|
-
add_labels: bool = True,
|
2616
|
-
show_category: bool = False,
|
2652
|
+
add_labels: bool = True, # Add new flag
|
2653
|
+
show_category: bool = False,
|
2617
2654
|
) -> Optional["Image.Image"]:
|
2618
2655
|
"""
|
2619
2656
|
Generate a grid of page images for this collection.
|
@@ -2630,6 +2667,16 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2630
2667
|
Returns:
|
2631
2668
|
PIL Image of the page grid or None if no pages
|
2632
2669
|
"""
|
2670
|
+
# Determine default page width from global options if not explicitly provided
|
2671
|
+
if page_width is None:
|
2672
|
+
try:
|
2673
|
+
import natural_pdf
|
2674
|
+
|
2675
|
+
page_width = natural_pdf.options.image.width or 300
|
2676
|
+
except Exception:
|
2677
|
+
# Fallback if natural_pdf import fails in some edge context
|
2678
|
+
page_width = 300
|
2679
|
+
|
2633
2680
|
# Ensure PIL is imported, handle potential ImportError if not done globally/lazily
|
2634
2681
|
try:
|
2635
2682
|
from PIL import Image, ImageDraw, ImageFont
|
@@ -2927,3 +2974,17 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2927
2974
|
# Re-raise the exception caught from the exporter
|
2928
2975
|
raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
|
2929
2976
|
# <--- END MODIFIED
|
2977
|
+
|
2978
|
+
# Alias .to_image() to .show() for convenience
|
2979
|
+
def show(
|
2980
|
+
self,
|
2981
|
+
*args,
|
2982
|
+
**kwargs,
|
2983
|
+
) -> Optional["Image.Image"]:
|
2984
|
+
"""Display pages similarly to ``to_image``.
|
2985
|
+
|
2986
|
+
This is a thin wrapper around :py:meth:`to_image` so that the API mirrors
|
2987
|
+
ElementCollection, where ``show()`` already exists. It forwards all
|
2988
|
+
arguments and returns the resulting ``PIL.Image`` instance.
|
2989
|
+
"""
|
2990
|
+
return self.to_image(*args, **kwargs)
|