natural-pdf 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +40 -0
- natural_pdf/core/highlighting_service.py +4 -4
- natural_pdf/core/page.py +16 -2
- natural_pdf/describe/base.py +11 -1
- natural_pdf/describe/summary.py +26 -0
- natural_pdf/elements/base.py +2 -2
- natural_pdf/elements/collections.py +139 -100
- natural_pdf/elements/region.py +133 -12
- natural_pdf/elements/text.py +15 -7
- natural_pdf/flows/region.py +116 -1
- natural_pdf/qa/document_qa.py +162 -105
- natural_pdf/utils/text_extraction.py +34 -14
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/RECORD +18 -18
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/top_level.txt +0 -0
@@ -1490,6 +1490,45 @@ class ShapeDetectionMixin:
|
|
1490
1490
|
|
1491
1491
|
element_manager = page_object_for_elements._element_mgr
|
1492
1492
|
|
1493
|
+
# ------------------------------------------------------------------
|
1494
|
+
# CLEAN-UP existing table-related regions from earlier runs to avoid duplicates
|
1495
|
+
# ------------------------------------------------------------------
|
1496
|
+
try:
|
1497
|
+
_purge_types = {"table", "table_row", "table_column", "table_cell"}
|
1498
|
+
|
1499
|
+
if (
|
1500
|
+
hasattr(element_manager, "_elements")
|
1501
|
+
and "regions" in element_manager._elements
|
1502
|
+
):
|
1503
|
+
_orig_len = len(element_manager._elements["regions"])
|
1504
|
+
element_manager._elements["regions"] = [
|
1505
|
+
r
|
1506
|
+
for r in element_manager._elements["regions"]
|
1507
|
+
if not (
|
1508
|
+
getattr(r, "source", None) == source_label
|
1509
|
+
and getattr(r, "region_type", None) in _purge_types
|
1510
|
+
)
|
1511
|
+
]
|
1512
|
+
_removed = _orig_len - len(element_manager._elements["regions"])
|
1513
|
+
if _removed:
|
1514
|
+
logger.info(
|
1515
|
+
f"Removed {_removed} previous table-related regions (source='{source_label}') before regeneration."
|
1516
|
+
)
|
1517
|
+
|
1518
|
+
if hasattr(page_object_for_elements, "_regions") and "detected" in page_object_for_elements._regions:
|
1519
|
+
page_object_for_elements._regions["detected"] = [
|
1520
|
+
r
|
1521
|
+
for r in page_object_for_elements._regions["detected"]
|
1522
|
+
if not (
|
1523
|
+
getattr(r, "source", None) == source_label
|
1524
|
+
and getattr(r, "region_type", None) in _purge_types
|
1525
|
+
)
|
1526
|
+
]
|
1527
|
+
except Exception as _cleanup_err:
|
1528
|
+
logger.warning(
|
1529
|
+
f"Table-region cleanup failed: {_cleanup_err}", exc_info=True
|
1530
|
+
)
|
1531
|
+
|
1493
1532
|
# Get lines with the specified source
|
1494
1533
|
all_lines = element_manager.lines # Access lines from the correct element manager
|
1495
1534
|
filtered_lines = [
|
@@ -1724,6 +1763,7 @@ class ShapeDetectionMixin:
|
|
1724
1763
|
logger.info(
|
1725
1764
|
f"Created {tables_created} table, {rows_created} rows, {cols_created} columns, and {cells_created} table cells from detected lines (source: '{source_label}') for {self}."
|
1726
1765
|
)
|
1766
|
+
|
1727
1767
|
return self
|
1728
1768
|
|
1729
1769
|
|
@@ -863,10 +863,10 @@ class HighlightingService:
|
|
863
863
|
if crop_bbox is not None:
|
864
864
|
cb_x0, cb_top, cb_x1, cb_bottom = crop_bbox
|
865
865
|
# Convert to pixel coordinates using actual scales
|
866
|
-
left_px = int(cb_x0 * actual_scale_x) -
|
867
|
-
top_px = int(cb_top * actual_scale_y) -
|
868
|
-
right_px = int(cb_x1 * actual_scale_x) +
|
869
|
-
bottom_px = int(cb_bottom * actual_scale_y) +
|
866
|
+
left_px = int(cb_x0 * actual_scale_x) - 1
|
867
|
+
top_px = int(cb_top * actual_scale_y) - 1
|
868
|
+
right_px = int(cb_x1 * actual_scale_x) + 1
|
869
|
+
bottom_px = int(cb_bottom * actual_scale_y) + 1
|
870
870
|
|
871
871
|
# Safeguard coordinates within bounds
|
872
872
|
left_px = max(0, min(left_px, rendered_image.width - 1))
|
natural_pdf/core/page.py
CHANGED
@@ -2235,12 +2235,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2235
2235
|
|
2236
2236
|
def ask(
|
2237
2237
|
self,
|
2238
|
-
question: str,
|
2238
|
+
question: Union[str, List[str], Tuple[str, ...]],
|
2239
2239
|
min_confidence: float = 0.1,
|
2240
2240
|
model: str = None,
|
2241
2241
|
debug: bool = False,
|
2242
2242
|
**kwargs,
|
2243
|
-
) -> Dict[str, Any]:
|
2243
|
+
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
|
2244
2244
|
"""
|
2245
2245
|
Ask a question about the page content using document QA.
|
2246
2246
|
"""
|
@@ -2824,3 +2824,17 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2824
2824
|
if not hasattr(self, "metadata") or self.metadata is None:
|
2825
2825
|
self.metadata = {}
|
2826
2826
|
self.metadata["analysis"] = value
|
2827
|
+
|
2828
|
+
def inspect(self, limit: int = 30) -> "InspectionSummary":
|
2829
|
+
"""
|
2830
|
+
Inspect all elements on this page with detailed tabular view.
|
2831
|
+
Equivalent to page.find_all('*').inspect().
|
2832
|
+
|
2833
|
+
Args:
|
2834
|
+
limit: Maximum elements per type to show (default: 30)
|
2835
|
+
|
2836
|
+
Returns:
|
2837
|
+
InspectionSummary with element tables showing coordinates,
|
2838
|
+
properties, and other details for each element
|
2839
|
+
"""
|
2840
|
+
return self.find_all('*').inspect(limit=limit)
|
natural_pdf/describe/base.py
CHANGED
@@ -269,7 +269,7 @@ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str
|
|
269
269
|
base_columns = ['x0', 'top', 'x1', 'bottom']
|
270
270
|
|
271
271
|
if element_type == 'word':
|
272
|
-
columns = ['text'] + base_columns + ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']
|
272
|
+
columns = ['text'] + base_columns + ['font_family', 'font_variant', 'size', 'bold', 'italic', 'source', 'confidence']
|
273
273
|
# Add color for text elements
|
274
274
|
columns.append('color')
|
275
275
|
elif element_type == 'rect':
|
@@ -315,6 +315,16 @@ def _extract_element_value(element: "Element", column: str) -> Any:
|
|
315
315
|
# Fallback to fontname
|
316
316
|
return getattr(element, 'fontname', '')
|
317
317
|
|
318
|
+
elif column == 'font_variant':
|
319
|
+
variant = getattr(element, 'font_variant', None)
|
320
|
+
if variant:
|
321
|
+
return variant
|
322
|
+
# Fallback – try to derive from fontname if property missing
|
323
|
+
fontname = getattr(element, 'fontname', '')
|
324
|
+
if "+" in fontname:
|
325
|
+
return fontname.split("+", 1)[0]
|
326
|
+
return ''
|
327
|
+
|
318
328
|
elif column in ['bold', 'italic']:
|
319
329
|
value = getattr(element, column, False)
|
320
330
|
return value if isinstance(value, bool) else False
|
natural_pdf/describe/summary.py
CHANGED
@@ -128,6 +128,32 @@ class ElementSummary:
|
|
128
128
|
""
|
129
129
|
]
|
130
130
|
|
131
|
+
# Added for better VS Code and other frontends support
|
132
|
+
def _repr_html_(self) -> str: # type: ignore
|
133
|
+
"""Return HTML representation so rich rendering works in more frontends.
|
134
|
+
|
135
|
+
Many notebook frontends (including VS Code) give priority to the
|
136
|
+
``_repr_html_`` method over Markdown. When available, we convert the
|
137
|
+
generated Markdown to HTML using the *markdown* library. If the
|
138
|
+
library is not installed we simply wrap the Markdown in a ``<pre>``
|
139
|
+
block so that at least the plain-text representation is visible.
|
140
|
+
"""
|
141
|
+
md_source = self._to_markdown()
|
142
|
+
try:
|
143
|
+
import markdown as _markdown # pylint: disable=import-error
|
144
|
+
|
145
|
+
# Convert markdown to HTML. We explicitly enable tables so the
|
146
|
+
# element and inspection summaries render nicely.
|
147
|
+
return _markdown.markdown(md_source, extensions=["tables"])
|
148
|
+
except Exception: # noqa: BLE001, broad-except
|
149
|
+
# Fallback: present the Markdown as-is inside a <pre> block.
|
150
|
+
escaped = (
|
151
|
+
md_source.replace("&", "&amp;")
|
152
|
+
.replace("<", "&lt;")
|
153
|
+
.replace(">", "&gt;")
|
154
|
+
)
|
155
|
+
return f"<pre>{escaped}</pre>"
|
156
|
+
|
131
157
|
|
132
158
|
class InspectionSummary(ElementSummary):
|
133
159
|
"""
|
natural_pdf/elements/base.py
CHANGED
@@ -174,8 +174,8 @@ class DirectionalMixin:
|
|
174
174
|
# Adjust cross boundaries if cross_size is 'element'
|
175
175
|
if cross_size == "element":
|
176
176
|
if is_horizontal: # Adjust y0, y1
|
177
|
-
y0 = min(y0, self.
|
178
|
-
y1 = max(y1, self.
|
177
|
+
y0 = min(y0, self.top)
|
178
|
+
y1 = max(y1, self.bottom)
|
179
179
|
else: # Adjust x0, x1
|
180
180
|
x0 = min(x0, self.x0)
|
181
181
|
x1 = max(x1, self.x1)
|
@@ -290,7 +290,13 @@ class ElementCollection(
|
|
290
290
|
|
291
291
|
return ElementCollection(filtered)
|
292
292
|
|
293
|
-
def extract_text(
|
293
|
+
def extract_text(
|
294
|
+
self,
|
295
|
+
preserve_whitespace: bool = True,
|
296
|
+
use_exclusions: bool = True,
|
297
|
+
strip: Optional[bool] = None,
|
298
|
+
**kwargs,
|
299
|
+
) -> str:
|
294
300
|
"""
|
295
301
|
Extract text from all TextElements in the collection, optionally using
|
296
302
|
pdfplumber's layout engine if layout=True is specified.
|
@@ -303,6 +309,7 @@ class ElementCollection(
|
|
303
309
|
`chars_to_textmap` function ONLY if `layout=True` is passed.
|
304
310
|
See Page.extract_text docstring for common parameters.
|
305
311
|
If `layout=False` or omitted, performs a simple join.
|
312
|
+
strip: Whether to strip whitespace from the extracted text.
|
306
313
|
|
307
314
|
Returns:
|
308
315
|
Combined text from elements, potentially with layout-based spacing.
|
@@ -399,6 +406,12 @@ class ElementCollection(
|
|
399
406
|
result = "".join(c.get("text", "") for c in all_char_dicts)
|
400
407
|
# Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
|
401
408
|
|
409
|
+
# Determine final strip flag – same rule as global helper unless caller overrides
|
410
|
+
strip_text = strip if strip is not None else (not use_layout)
|
411
|
+
|
412
|
+
if strip_text and isinstance(result, str):
|
413
|
+
result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
|
414
|
+
|
402
415
|
return result
|
403
416
|
|
404
417
|
def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
|
@@ -1820,8 +1833,40 @@ class ElementCollection(
|
|
1820
1833
|
# Mix object bounds with specific overrides
|
1821
1834
|
clipped_elements = collection.clip(obj=container, bottom=page.height/2)
|
1822
1835
|
"""
|
1836
|
+
# --- NEW BEHAVIOUR: support per-element clipping with sequences --- #
|
1837
|
+
from collections.abc import Sequence # Local import to avoid top-level issues
|
1838
|
+
|
1839
|
+
# Detect if *obj* is a sequence meant to map one-to-one with the elements
|
1840
|
+
clip_objs = None # type: Optional[List[Any]]
|
1841
|
+
if isinstance(obj, ElementCollection):
|
1842
|
+
clip_objs = obj.elements
|
1843
|
+
elif isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)):
|
1844
|
+
clip_objs = list(obj)
|
1845
|
+
|
1846
|
+
if clip_objs is not None:
|
1847
|
+
if len(clip_objs) != len(self._elements):
|
1848
|
+
raise ValueError(
|
1849
|
+
f"Number of clipping objects ({len(clip_objs)}) does not match number of "
|
1850
|
+
f"elements in collection ({len(self._elements)})."
|
1851
|
+
)
|
1852
|
+
|
1853
|
+
clipped_elements = [
|
1854
|
+
el.clip(
|
1855
|
+
obj=clip_obj,
|
1856
|
+
left=left,
|
1857
|
+
top=top,
|
1858
|
+
right=right,
|
1859
|
+
bottom=bottom,
|
1860
|
+
)
|
1861
|
+
for el, clip_obj in zip(self._elements, clip_objs)
|
1862
|
+
]
|
1863
|
+
return ElementCollection(clipped_elements)
|
1864
|
+
|
1865
|
+
# Fallback to original behaviour: apply same clipping parameters to all elements
|
1823
1866
|
return self.apply(
|
1824
|
-
lambda element: element.clip(
|
1867
|
+
lambda element: element.clip(
|
1868
|
+
obj=obj, left=left, top=top, right=right, bottom=bottom
|
1869
|
+
)
|
1825
1870
|
)
|
1826
1871
|
|
1827
1872
|
|
@@ -1860,13 +1905,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
1860
1905
|
"""Return a string representation showing the page count."""
|
1861
1906
|
return f"<PageCollection(count={len(self)})>"
|
1862
1907
|
|
1863
|
-
def extract_text(
|
1908
|
+
def extract_text(
|
1909
|
+
self,
|
1910
|
+
keep_blank_chars: bool = True,
|
1911
|
+
apply_exclusions: bool = True,
|
1912
|
+
strip: Optional[bool] = None,
|
1913
|
+
**kwargs,
|
1914
|
+
) -> str:
|
1864
1915
|
"""
|
1865
1916
|
Extract text from all pages in the collection.
|
1866
1917
|
|
1867
1918
|
Args:
|
1868
1919
|
keep_blank_chars: Whether to keep blank characters (default: True)
|
1869
1920
|
apply_exclusions: Whether to apply exclusion regions (default: True)
|
1921
|
+
strip: Whether to strip whitespace from the extracted text.
|
1870
1922
|
**kwargs: Additional extraction parameters
|
1871
1923
|
|
1872
1924
|
Returns:
|
@@ -1875,11 +1927,22 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
1875
1927
|
texts = []
|
1876
1928
|
for page in self.pages:
|
1877
1929
|
text = page.extract_text(
|
1878
|
-
keep_blank_chars=keep_blank_chars,
|
1930
|
+
keep_blank_chars=keep_blank_chars,
|
1931
|
+
apply_exclusions=apply_exclusions,
|
1932
|
+
**kwargs,
|
1879
1933
|
)
|
1880
1934
|
texts.append(text)
|
1881
1935
|
|
1882
|
-
|
1936
|
+
combined = "\n".join(texts)
|
1937
|
+
|
1938
|
+
# Default strip behaviour: if caller picks, honour; else respect layout flag passed via kwargs.
|
1939
|
+
use_layout = kwargs.get("layout", False)
|
1940
|
+
strip_final = strip if strip is not None else (not use_layout)
|
1941
|
+
|
1942
|
+
if strip_final:
|
1943
|
+
combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()
|
1944
|
+
|
1945
|
+
return combined
|
1883
1946
|
|
1884
1947
|
def apply_ocr(
|
1885
1948
|
self,
|
@@ -2275,6 +2338,44 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2275
2338
|
|
2276
2339
|
# Generate sections
|
2277
2340
|
sections = []
|
2341
|
+
|
2342
|
+
# --- Helper: build a FlowRegion spanning multiple pages ---
|
2343
|
+
def _build_flow_region(start_el, end_el):
|
2344
|
+
"""Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
|
2345
|
+
If *end_el* is None, the region continues to the bottom of the last
|
2346
|
+
page in this PageCollection."""
|
2347
|
+
# Local imports to avoid top-level cycles
|
2348
|
+
from natural_pdf.elements.region import Region
|
2349
|
+
from natural_pdf.flows.flow import Flow
|
2350
|
+
from natural_pdf.flows.element import FlowElement
|
2351
|
+
from natural_pdf.flows.region import FlowRegion
|
2352
|
+
|
2353
|
+
start_pg = start_el.page
|
2354
|
+
end_pg = end_el.page if end_el is not None else self.pages[-1]
|
2355
|
+
|
2356
|
+
parts: list[Region] = []
|
2357
|
+
# Slice of first page
|
2358
|
+
parts.append(Region(start_pg, (0, start_el.top, start_pg.width, start_pg.height)))
|
2359
|
+
|
2360
|
+
# Full middle pages
|
2361
|
+
for pg_idx in range(start_pg.index + 1, end_pg.index):
|
2362
|
+
mid_pg = self.pages[pg_idx]
|
2363
|
+
parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))
|
2364
|
+
|
2365
|
+
# Slice of last page (if distinct)
|
2366
|
+
if end_pg is not start_pg:
|
2367
|
+
bottom = end_el.bottom if end_el is not None else end_pg.height
|
2368
|
+
parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
|
2369
|
+
|
2370
|
+
flow = Flow(segments=parts, arrangement="vertical")
|
2371
|
+
src_fe = FlowElement(physical_object=start_el, flow=flow)
|
2372
|
+
return FlowRegion(flow=flow,
|
2373
|
+
constituent_regions=parts,
|
2374
|
+
source_flow_element=src_fe,
|
2375
|
+
boundary_element_found=end_el)
|
2376
|
+
|
2377
|
+
# ------------------------------------------------------------------
|
2378
|
+
|
2278
2379
|
current_start = None
|
2279
2380
|
|
2280
2381
|
for i, boundary in enumerate(section_boundaries):
|
@@ -2295,50 +2396,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2295
2396
|
)
|
2296
2397
|
sections.append(section)
|
2297
2398
|
else:
|
2298
|
-
# Create
|
2299
|
-
|
2300
|
-
|
2301
|
-
# Get the start and end pages
|
2302
|
-
start_page = start_element.page
|
2303
|
-
end_page = end_element.page
|
2304
|
-
|
2305
|
-
# Create a combined region
|
2306
|
-
combined_region = Region(
|
2307
|
-
start_page, (0, start_element.top, start_page.width, start_page.height)
|
2308
|
-
)
|
2309
|
-
combined_region._spans_pages = True
|
2310
|
-
combined_region._page_range = (start_page.index, end_page.index)
|
2311
|
-
combined_region.start_element = start_element
|
2312
|
-
combined_region.end_element = end_element
|
2313
|
-
|
2314
|
-
# Get all elements that fall within this multi-page region
|
2315
|
-
combined_elements = []
|
2316
|
-
|
2317
|
-
# Get elements from the first page
|
2318
|
-
first_page_elements = [
|
2319
|
-
e
|
2320
|
-
for e in all_elements
|
2321
|
-
if e.page == start_page and e.top >= start_element.top
|
2322
|
-
]
|
2323
|
-
combined_elements.extend(first_page_elements)
|
2324
|
-
|
2325
|
-
# Get elements from middle pages (if any)
|
2326
|
-
for page_idx in range(start_page.index + 1, end_page.index):
|
2327
|
-
middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
|
2328
|
-
combined_elements.extend(middle_page_elements)
|
2329
|
-
|
2330
|
-
# Get elements from the last page
|
2331
|
-
last_page_elements = [
|
2332
|
-
e
|
2333
|
-
for e in all_elements
|
2334
|
-
if e.page == end_page and e.bottom <= end_element.bottom
|
2335
|
-
]
|
2336
|
-
combined_elements.extend(last_page_elements)
|
2337
|
-
|
2338
|
-
# Store the elements in the combined region
|
2339
|
-
combined_region._multi_page_elements = combined_elements
|
2340
|
-
|
2341
|
-
sections.append(combined_region)
|
2399
|
+
# Create FlowRegion spanning pages
|
2400
|
+
flow_region = _build_flow_region(start_element, end_element)
|
2401
|
+
sections.append(flow_region)
|
2342
2402
|
|
2343
2403
|
current_start = None
|
2344
2404
|
|
@@ -2394,54 +2454,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2394
2454
|
last_page_elements.sort(key=lambda e: (e.top, e.x0))
|
2395
2455
|
end_element = last_page_elements[-1] if last_page_elements else None
|
2396
2456
|
|
2397
|
-
# Create
|
2398
|
-
|
2399
|
-
|
2400
|
-
if start_page == last_page:
|
2401
|
-
# Simple case - both on same page
|
2402
|
-
section = start_page.get_section_between(
|
2403
|
-
start_element, end_element, boundary_inclusion
|
2404
|
-
)
|
2405
|
-
sections.append(section)
|
2406
|
-
else:
|
2407
|
-
# Create a multi-page section
|
2408
|
-
combined_region = Region(
|
2409
|
-
start_page, (0, start_element.top, start_page.width, start_page.height)
|
2410
|
-
)
|
2411
|
-
combined_region._spans_pages = True
|
2412
|
-
combined_region._page_range = (start_page.index, last_page.index)
|
2413
|
-
combined_region.start_element = start_element
|
2414
|
-
combined_region.end_element = end_element
|
2415
|
-
|
2416
|
-
# Get all elements that fall within this multi-page region
|
2417
|
-
combined_elements = []
|
2418
|
-
|
2419
|
-
# Get elements from the first page
|
2420
|
-
first_page_elements = [
|
2421
|
-
e
|
2422
|
-
for e in all_elements
|
2423
|
-
if e.page == start_page and e.top >= start_element.top
|
2424
|
-
]
|
2425
|
-
combined_elements.extend(first_page_elements)
|
2426
|
-
|
2427
|
-
# Get elements from middle pages (if any)
|
2428
|
-
for page_idx in range(start_page.index + 1, last_page.index):
|
2429
|
-
middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
|
2430
|
-
combined_elements.extend(middle_page_elements)
|
2431
|
-
|
2432
|
-
# Get elements from the last page
|
2433
|
-
last_page_elements = [
|
2434
|
-
e
|
2435
|
-
for e in all_elements
|
2436
|
-
if e.page == last_page
|
2437
|
-
and (end_element is None or e.bottom <= end_element.bottom)
|
2438
|
-
]
|
2439
|
-
combined_elements.extend(last_page_elements)
|
2440
|
-
|
2441
|
-
# Store the elements in the combined region
|
2442
|
-
combined_region._multi_page_elements = combined_elements
|
2443
|
-
|
2444
|
-
sections.append(combined_region)
|
2457
|
+
# Create FlowRegion spanning multiple pages using helper
|
2458
|
+
flow_region = _build_flow_region(start_element, end_element)
|
2459
|
+
sections.append(flow_region)
|
2445
2460
|
else:
|
2446
2461
|
# With start_elements only, create a section to the end of the current page
|
2447
2462
|
from natural_pdf.elements.region import Region
|
@@ -2629,13 +2644,13 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2629
2644
|
|
2630
2645
|
def to_image(
|
2631
2646
|
self,
|
2632
|
-
page_width: int =
|
2647
|
+
page_width: Optional[int] = None,
|
2633
2648
|
cols: Optional[int] = 4,
|
2634
2649
|
rows: Optional[int] = None,
|
2635
2650
|
max_pages: Optional[int] = None,
|
2636
2651
|
spacing: int = 10,
|
2637
|
-
add_labels: bool = True,
|
2638
|
-
show_category: bool = False,
|
2652
|
+
add_labels: bool = True, # Add new flag
|
2653
|
+
show_category: bool = False,
|
2639
2654
|
) -> Optional["Image.Image"]:
|
2640
2655
|
"""
|
2641
2656
|
Generate a grid of page images for this collection.
|
@@ -2652,6 +2667,16 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2652
2667
|
Returns:
|
2653
2668
|
PIL Image of the page grid or None if no pages
|
2654
2669
|
"""
|
2670
|
+
# Determine default page width from global options if not explicitly provided
|
2671
|
+
if page_width is None:
|
2672
|
+
try:
|
2673
|
+
import natural_pdf
|
2674
|
+
|
2675
|
+
page_width = natural_pdf.options.image.width or 300
|
2676
|
+
except Exception:
|
2677
|
+
# Fallback if natural_pdf import fails in some edge context
|
2678
|
+
page_width = 300
|
2679
|
+
|
2655
2680
|
# Ensure PIL is imported, handle potential ImportError if not done globally/lazily
|
2656
2681
|
try:
|
2657
2682
|
from PIL import Image, ImageDraw, ImageFont
|
@@ -2949,3 +2974,17 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2949
2974
|
# Re-raise the exception caught from the exporter
|
2950
2975
|
raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
|
2951
2976
|
# <--- END MODIFIED
|
2977
|
+
|
2978
|
+
# Alias .to_image() to .show() for convenience
|
2979
|
+
def show(
|
2980
|
+
self,
|
2981
|
+
*args,
|
2982
|
+
**kwargs,
|
2983
|
+
) -> Optional["Image.Image"]:
|
2984
|
+
"""Display pages similarly to ``to_image``.
|
2985
|
+
|
2986
|
+
This is a thin wrapper around :py:meth:`to_image` so that the API mirrors
|
2987
|
+
ElementCollection, where ``show()`` already exists. It forwards all
|
2988
|
+
arguments and returns the resulting ``PIL.Image`` instance.
|
2989
|
+
"""
|
2990
|
+
return self.to_image(*args, **kwargs)
|