natural-pdf 0.1.23__py3-none-any.whl → 0.1.26.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1490,6 +1490,45 @@ class ShapeDetectionMixin:
 
         element_manager = page_object_for_elements._element_mgr
 
+        # ------------------------------------------------------------------
+        # CLEAN-UP existing table-related regions from earlier runs to avoid duplicates
+        # ------------------------------------------------------------------
+        try:
+            _purge_types = {"table", "table_row", "table_column", "table_cell"}
+
+            if (
+                hasattr(element_manager, "_elements")
+                and "regions" in element_manager._elements
+            ):
+                _orig_len = len(element_manager._elements["regions"])
+                element_manager._elements["regions"] = [
+                    r
+                    for r in element_manager._elements["regions"]
+                    if not (
+                        getattr(r, "source", None) == source_label
+                        and getattr(r, "region_type", None) in _purge_types
+                    )
+                ]
+                _removed = _orig_len - len(element_manager._elements["regions"])
+                if _removed:
+                    logger.info(
+                        f"Removed {_removed} previous table-related regions (source='{source_label}') before regeneration."
+                    )
+
+            if hasattr(page_object_for_elements, "_regions") and "detected" in page_object_for_elements._regions:
+                page_object_for_elements._regions["detected"] = [
+                    r
+                    for r in page_object_for_elements._regions["detected"]
+                    if not (
+                        getattr(r, "source", None) == source_label
+                        and getattr(r, "region_type", None) in _purge_types
+                    )
+                ]
+        except Exception as _cleanup_err:
+            logger.warning(
+                f"Table-region cleanup failed: {_cleanup_err}", exc_info=True
+            )
+
         # Get lines with the specified source
         all_lines = element_manager.lines  # Access lines from the correct element manager
         filtered_lines = [
@@ -1724,6 +1763,7 @@ class ShapeDetectionMixin:
         logger.info(
             f"Created {tables_created} table, {rows_created} rows, {cols_created} columns, and {cells_created} table cells from detected lines (source: '{source_label}') for {self}."
         )
+
         return self
 
 
@@ -863,10 +863,10 @@ class HighlightingService:
         if crop_bbox is not None:
             cb_x0, cb_top, cb_x1, cb_bottom = crop_bbox
             # Convert to pixel coordinates using actual scales
-            left_px = int(cb_x0 * actual_scale_x) - 2
-            top_px = int(cb_top * actual_scale_y) - 2
-            right_px = int(cb_x1 * actual_scale_x) + 2
-            bottom_px = int(cb_bottom * actual_scale_y) + 2
+            left_px = int(cb_x0 * actual_scale_x) - 1
+            top_px = int(cb_top * actual_scale_y) - 1
+            right_px = int(cb_x1 * actual_scale_x) + 1
+            bottom_px = int(cb_bottom * actual_scale_y) + 1
 
             # Safeguard coordinates within bounds
             left_px = max(0, min(left_px, rendered_image.width - 1))
natural_pdf/core/page.py CHANGED
@@ -1576,8 +1576,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             render_ocr: Whether to render OCR text on highlights.
             resolution: Resolution in DPI for base page image (default: scale * 72).
             include_highlights: Whether to render highlights.
-            exclusions: If 'mask', excluded regions will be whited out on the image.
-                (default: None).
+            exclusions: Accepts one of the following:
+                • None – no masking (default)
+                • "mask" – mask using solid white (back-compat)
+                • CSS/HTML colour string (e.g. "red", "#ff0000", "#ff000080")
+                • Tuple of RGB or RGBA values (ints 0-255 or floats 0-1)
+                All excluded regions are filled with this colour.
             **kwargs: Additional parameters for pdfplumber.to_image.
 
         Returns:
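
A minimal usage sketch of the widened parameter (input file name and the exclusion setup are illustrative; only the `exclusions` values come from this change):

    from natural_pdf import PDF

    page = PDF("sample.pdf").pages[0]
    # ...exclusion regions registered on the page as usual...

    img_white = page.to_image(exclusions="mask")            # solid white, back-compat
    img_red = page.to_image(exclusions="red")               # CSS colour name
    img_rgba = page.to_image(exclusions=(255, 0, 0, 128))   # semi-transparent red tuple
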
@@ -1690,7 +1694,52 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         # --- Apply exclusion masking if requested ---
         # This modifies 'rendered_image_component'
         image_after_masking = rendered_image_component  # Start with the rendered image
-        if exclusions == "mask" and self._exclusions:
+
+        # Determine if masking is requested and establish the fill colour
+        mask_requested = exclusions is not None and self._exclusions
+        mask_color: Union[str, Tuple[int, int, int, int]] = "white"  # default
+
+        if mask_requested:
+            if exclusions != "mask":
+                # Attempt to parse custom colour input
+                try:
+                    if isinstance(exclusions, tuple):
+                        # Handle RGB/RGBA tuples with ints 0-255 or floats 0-1
+                        processed = []
+                        all_float = all(isinstance(c, float) for c in exclusions)
+                        for i, c in enumerate(exclusions):
+                            if isinstance(c, float):
+                                val = int(c * 255) if all_float or i == 3 else int(c)
+                            else:
+                                val = int(c)
+                            processed.append(max(0, min(255, val)))
+                        if len(processed) == 3:
+                            processed.append(255)  # add full alpha
+                        mask_color = tuple(processed)  # type: ignore[assignment]
+                    elif isinstance(exclusions, str):
+                        # Try using the optional 'colour' library for rich parsing
+                        try:
+                            from colour import Color  # type: ignore
+
+                            color_obj = Color(exclusions)
+                            mask_color = (
+                                int(color_obj.red * 255),
+                                int(color_obj.green * 255),
+                                int(color_obj.blue * 255),
+                                255,
+                            )
+                        except Exception:
+                            # Fallback: if parsing fails, treat as plain string accepted by PIL
+                            mask_color = exclusions  # e.g. "red"
+                    else:
+                        logger.warning(
+                            f"Unsupported exclusions colour spec: {exclusions!r}. Using white."
+                        )
+                except Exception as colour_parse_err:  # pragma: no cover
+                    logger.warning(
+                        f"Failed to parse exclusions colour {exclusions!r}: {colour_parse_err}. Using white."
+                    )
+
             try:
                 # Ensure image is mutable (RGB or RGBA)
                 if image_after_masking.mode not in ("RGB", "RGBA"):
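
The tuple branch above normalises RGB/RGBA input to an RGBA tuple of ints 0-255: floats are scaled by 255 only when the whole tuple is float, except the alpha slot, which is always read as 0-1 when given as a float. A standalone sketch of that rule with worked cases:

    def normalize_rgba(values):
        # Mirrors the tuple-normalisation rule from the diff above
        processed = []
        all_float = all(isinstance(c, float) for c in values)
        for i, c in enumerate(values):
            if isinstance(c, float):
                val = int(c * 255) if all_float or i == 3 else int(c)
            else:
                val = int(c)
            processed.append(max(0, min(255, val)))
        if len(processed) == 3:
            processed.append(255)  # opaque by default
        return tuple(processed)

    assert normalize_rgba((1.0, 0.0, 0.0)) == (255, 0, 0, 255)
    assert normalize_rgba((255, 0, 0)) == (255, 0, 0, 255)
    assert normalize_rgba((255, 0, 0, 0.5)) == (255, 0, 0, 127)
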
@@ -1701,17 +1750,23 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
                 )
                 if exclusion_regions:
                     draw = ImageDraw.Draw(image_after_masking)
-                    # Calculate the scaling factor used for the image
+                    # Scaling factor for converting PDF pts → image px
                     img_scale = render_resolution / 72.0
 
+                    # Determine fill colour compatible with current mode
+                    def _mode_compatible(colour):
+                        if isinstance(colour, tuple) and image_after_masking.mode != "RGBA":
+                            return colour[:3]  # drop alpha for RGB images
+                        return colour
+
+                    fill_colour = _mode_compatible(mask_color)
+
                     for region in exclusion_regions:
-                        # Convert PDF points (x0, top, x1, bottom) to image pixels
                         img_x0 = region.x0 * img_scale
                         img_top = region.top * img_scale
                         img_x1 = region.x1 * img_scale
                         img_bottom = region.bottom * img_scale
 
-                        # Draw a white rectangle over the excluded area
                         img_coords = (
                             max(0, img_x0),
                             max(0, img_top),
@@ -1719,7 +1774,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
                             min(image_after_masking.height, img_bottom),
                         )
                         if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
-                            draw.rectangle(img_coords, fill="white")
+                            draw.rectangle(img_coords, fill=fill_colour)
                         else:  # pragma: no cover
                             logger.warning(
                                 f"Skipping invalid exclusion rect for masking: {img_coords}"
@@ -1994,6 +2049,10 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             )
             return ElementCollection([])  # Return empty collection
 
+        # Clear existing detected regions if 'replace' is specified
+        if existing == "replace":
+            self.clear_detected_layout_regions()
+
         # The analyzer's analyze_layout method already adds regions to the page
         # and its element manager. We just need to retrieve them.
         analyzer.analyze_layout(
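
A hedged sketch of the call pattern this enables; `analyze_layout` and `existing="replace"` are taken from this hunk, everything else (file name, first-call arguments) is illustrative:

    from natural_pdf import PDF

    page = PDF("sample.pdf").pages[0]
    page.analyze_layout()                    # first pass adds detected regions
    page.analyze_layout(existing="replace")  # re-run after clearing earlier detected regions
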
@@ -2235,12 +2294,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
 
     def ask(
         self,
-        question: str,
+        question: Union[str, List[str], Tuple[str, ...]],
         min_confidence: float = 0.1,
         model: str = None,
         debug: bool = False,
         **kwargs,
-    ) -> Dict[str, Any]:
+    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
         """
         Ask a question about the page content using document QA.
         """
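
With the widened signature, a single string should still return one result dict, while a list or tuple of questions presumably returns one dict per question. A hedged sketch (file name and questions illustrative):

    from natural_pdf import PDF

    page = PDF("invoice.pdf").pages[0]

    single = page.ask("What is the invoice number?")
    print(single.get("answer"), single.get("confidence"))

    batch = page.ask(["What is the invoice number?", "What is the total due?"])
    for result in batch:  # expected: one dict per question, in order
        print(result.get("answer"))
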
@@ -2824,3 +2883,17 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         if not hasattr(self, "metadata") or self.metadata is None:
             self.metadata = {}
         self.metadata["analysis"] = value
+
+    def inspect(self, limit: int = 30) -> "InspectionSummary":
+        """
+        Inspect all elements on this page with detailed tabular view.
+        Equivalent to page.find_all('*').inspect().
+
+        Args:
+            limit: Maximum elements per type to show (default: 30)
+
+        Returns:
+            InspectionSummary with element tables showing coordinates,
+            properties, and other details for each element
+        """
+        return self.find_all('*').inspect(limit=limit)
@@ -269,7 +269,7 @@ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str
     base_columns = ['x0', 'top', 'x1', 'bottom']
 
     if element_type == 'word':
-        columns = ['text'] + base_columns + ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']
+        columns = ['text'] + base_columns + ['font_family', 'font_variant', 'size', 'bold', 'italic', 'source', 'confidence']
         # Add color for text elements
         columns.append('color')
     elif element_type == 'rect':
@@ -315,6 +315,16 @@ def _extract_element_value(element: "Element", column: str) -> Any:
         # Fallback to fontname
         return getattr(element, 'fontname', '')
 
+    elif column == 'font_variant':
+        variant = getattr(element, 'font_variant', None)
+        if variant:
+            return variant
+        # Fallback – try to derive from fontname if property missing
+        fontname = getattr(element, 'fontname', '')
+        if "+" in fontname:
+            return fontname.split("+", 1)[0]
+        return ''
+
     elif column in ['bold', 'italic']:
         value = getattr(element, column, False)
         return value if isinstance(value, bool) else False
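
The fallback leans on the PDF convention that subset-embedded fonts are prefixed with a six-letter tag and a plus sign (e.g. "ABCDEF+Helvetica-Bold"); that tag is what the new font_variant column surfaces when the element lacks an explicit property. A tiny illustration of the rule:

    def font_variant_from_name(fontname: str) -> str:
        # Subset tag before "+", empty string otherwise
        return fontname.split("+", 1)[0] if "+" in fontname else ""

    assert font_variant_from_name("ABCDEF+Helvetica-Bold") == "ABCDEF"
    assert font_variant_from_name("Helvetica") == ""
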
@@ -56,8 +56,8 @@ class ElementSummary:
         section_title = name.replace('_', ' ').title()
 
         if isinstance(data, dict):
-            lines = [f"**{section_title}**:"]
-            lines.extend(self._format_dict(data, indent=" "))
+            lines = [f"**{section_title}**:", ""]
+            lines.extend(self._format_dict(data, indent=""))
         elif isinstance(data, list):
             lines = [f"**{section_title}**: {', '.join(str(item) for item in data)}"]
         else:
@@ -128,6 +128,32 @@ class ElementSummary:
             ""
         ]
 
+    # Added for better VS Code and other frontends support
+    def _repr_html_(self) -> str:  # type: ignore
+        """Return HTML representation so rich rendering works in more frontends.
+
+        Many notebook frontends (including VS Code) give priority to the
+        ``_repr_html_`` method over Markdown. When available, we convert the
+        generated Markdown to HTML using the *markdown* library. If the
+        library is not installed we simply wrap the Markdown in a ``<pre>``
+        block so that at least the plain-text representation is visible.
+        """
+        md_source = self._to_markdown()
+        try:
+            import markdown as _markdown  # pylint: disable=import-error
+
+            # Convert markdown to HTML. We explicitly enable tables so the
+            # element and inspection summaries render nicely.
+            return _markdown.markdown(md_source, extensions=["tables"])
+        except Exception:  # noqa: BLE001, broad-except
+            # Fallback: present the Markdown as-is inside a <pre> block.
+            escaped = (
+                md_source.replace("&", "&amp;")
+                .replace("<", "&lt;")
+                .replace(">", "&gt;")
+            )
+            return f"<pre>{escaped}</pre>"
+
 
 class InspectionSummary(ElementSummary):
     """
@@ -174,8 +174,8 @@ class DirectionalMixin:
         # Adjust cross boundaries if cross_size is 'element'
         if cross_size == "element":
             if is_horizontal:  # Adjust y0, y1
-                y0 = min(y0, self.y0)
-                y1 = max(y1, self.y1)
+                y0 = min(y0, self.top)
+                y1 = max(y1, self.bottom)
             else:  # Adjust x0, x1
                 x0 = min(x0, self.x0)
                 x1 = max(x1, self.x1)
@@ -290,7 +290,13 @@ class ElementCollection(
 
         return ElementCollection(filtered)
 
-    def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
+    def extract_text(
+        self,
+        preserve_whitespace: bool = True,
+        use_exclusions: bool = True,
+        strip: Optional[bool] = None,
+        **kwargs,
+    ) -> str:
         """
         Extract text from all TextElements in the collection, optionally using
         pdfplumber's layout engine if layout=True is specified.
@@ -303,6 +309,7 @@ class ElementCollection(
                 `chars_to_textmap` function ONLY if `layout=True` is passed.
                 See Page.extract_text docstring for common parameters.
                 If `layout=False` or omitted, performs a simple join.
+            strip: Whether to strip whitespace from the extracted text.
 
         Returns:
             Combined text from elements, potentially with layout-based spacing.
@@ -399,6 +406,12 @@ class ElementCollection(
             result = "".join(c.get("text", "") for c in all_char_dicts)
             # Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
 
+        # Determine final strip flag – same rule as global helper unless caller overrides
+        strip_text = strip if strip is not None else (not use_layout)
+
+        if strip_text and isinstance(result, str):
+            result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
+
         return result
 
     def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
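
A hedged sketch of how the new strip flag interacts with layout mode, following the rule above (file name and selector illustrative):

    from natural_pdf import PDF

    words = PDF("sample.pdf").pages[0].find_all("text")

    words.extract_text()               # default: stripped, because layout is off
    words.extract_text(layout=True)    # layout mode: trailing whitespace kept by default
    words.extract_text(strip=False)    # an explicit value always wins
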
@@ -1820,8 +1833,40 @@ class ElementCollection(
             # Mix object bounds with specific overrides
             clipped_elements = collection.clip(obj=container, bottom=page.height/2)
         """
+        # --- NEW BEHAVIOUR: support per-element clipping with sequences --- #
+        from collections.abc import Sequence  # Local import to avoid top-level issues
+
+        # Detect if *obj* is a sequence meant to map one-to-one with the elements
+        clip_objs = None  # type: Optional[List[Any]]
+        if isinstance(obj, ElementCollection):
+            clip_objs = obj.elements
+        elif isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)):
+            clip_objs = list(obj)
+
+        if clip_objs is not None:
+            if len(clip_objs) != len(self._elements):
+                raise ValueError(
+                    f"Number of clipping objects ({len(clip_objs)}) does not match number of "
+                    f"elements in collection ({len(self._elements)})."
+                )
+
+            clipped_elements = [
+                el.clip(
+                    obj=clip_obj,
+                    left=left,
+                    top=top,
+                    right=right,
+                    bottom=bottom,
+                )
+                for el, clip_obj in zip(self._elements, clip_objs)
+            ]
+            return ElementCollection(clipped_elements)
+
+        # Fallback to original behaviour: apply same clipping parameters to all elements
         return self.apply(
-            lambda element: element.clip(obj=obj, left=left, top=top, right=right, bottom=bottom)
+            lambda element: element.clip(
+                obj=obj, left=left, top=top, right=right, bottom=bottom
+            )
         )
 
 
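
A hedged usage sketch of the new per-element form (file name and selectors illustrative; the length check and broadcast fallback come from this hunk):

    from natural_pdf import PDF

    page = PDF("sample.pdf").pages[0]
    words = page.find_all("text")
    rects = page.find_all("rect")   # must match len(words), otherwise ValueError

    # One clipping object per element; scalar overrides still apply to every element
    clipped = words.clip(obj=rects, bottom=page.height / 2)

    # The old broadcast form is unchanged
    clipped_all = words.clip(obj=page.find("rect"), top=100)
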
@@ -1860,13 +1905,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         """Return a string representation showing the page count."""
         return f"<PageCollection(count={len(self)})>"
 
-    def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
+    def extract_text(
+        self,
+        keep_blank_chars: bool = True,
+        apply_exclusions: bool = True,
+        strip: Optional[bool] = None,
+        **kwargs,
+    ) -> str:
         """
         Extract text from all pages in the collection.
 
         Args:
             keep_blank_chars: Whether to keep blank characters (default: True)
             apply_exclusions: Whether to apply exclusion regions (default: True)
+            strip: Whether to strip whitespace from the extracted text.
             **kwargs: Additional extraction parameters
 
         Returns:
@@ -1875,11 +1927,22 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         texts = []
         for page in self.pages:
             text = page.extract_text(
-                keep_blank_chars=keep_blank_chars, apply_exclusions=apply_exclusions, **kwargs
+                keep_blank_chars=keep_blank_chars,
+                apply_exclusions=apply_exclusions,
+                **kwargs,
             )
             texts.append(text)
 
-        return "\n".join(texts)
+        combined = "\n".join(texts)
+
+        # Default strip behaviour: if caller picks, honour; else respect layout flag passed via kwargs.
+        use_layout = kwargs.get("layout", False)
+        strip_final = strip if strip is not None else (not use_layout)
+
+        if strip_final:
+            combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()
+
+        return combined
 
     def apply_ocr(
         self,
@@ -2275,6 +2338,44 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
         # Generate sections
         sections = []
+
+        # --- Helper: build a FlowRegion spanning multiple pages ---
+        def _build_flow_region(start_el, end_el):
+            """Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
+            If *end_el* is None, the region continues to the bottom of the last
+            page in this PageCollection."""
+            # Local imports to avoid top-level cycles
+            from natural_pdf.elements.region import Region
+            from natural_pdf.flows.flow import Flow
+            from natural_pdf.flows.element import FlowElement
+            from natural_pdf.flows.region import FlowRegion
+
+            start_pg = start_el.page
+            end_pg = end_el.page if end_el is not None else self.pages[-1]
+
+            parts: list[Region] = []
+            # Slice of first page
+            parts.append(Region(start_pg, (0, start_el.top, start_pg.width, start_pg.height)))
+
+            # Full middle pages
+            for pg_idx in range(start_pg.index + 1, end_pg.index):
+                mid_pg = self.pages[pg_idx]
+                parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))
+
+            # Slice of last page (if distinct)
+            if end_pg is not start_pg:
+                bottom = end_el.bottom if end_el is not None else end_pg.height
+                parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
+
+            flow = Flow(segments=parts, arrangement="vertical")
+            src_fe = FlowElement(physical_object=start_el, flow=flow)
+            return FlowRegion(flow=flow,
+                              constituent_regions=parts,
+                              source_flow_element=src_fe,
+                              boundary_element_found=end_el)
+
+        # ------------------------------------------------------------------
+
         current_start = None
 
         for i, boundary in enumerate(section_boundaries):
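
Downstream, page-spanning sections now come back as FlowRegions rather than Region objects carrying private _spans_pages/_multi_page_elements state. A hedged sketch of what callers see, assuming the sectioning entry point is PageCollection.get_sections (the selector and boundary value are illustrative; boundary_inclusion and start_elements appear elsewhere in this diff):

    from natural_pdf import PDF

    pages = PDF("report.pdf").pages
    sections = pages.get_sections(
        start_elements="text[size>=14]",   # illustrative heading selector
        boundary_inclusion="start",
    )

    for section in sections:
        # Same-page sections remain Regions; page-spanning ones are FlowRegions
        print(type(section).__name__)
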
@@ -2295,50 +2396,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                     )
                     sections.append(section)
                 else:
-                    # Create a multi-page section
-                    from natural_pdf.elements.region import Region
-
-                    # Get the start and end pages
-                    start_page = start_element.page
-                    end_page = end_element.page
-
-                    # Create a combined region
-                    combined_region = Region(
-                        start_page, (0, start_element.top, start_page.width, start_page.height)
-                    )
-                    combined_region._spans_pages = True
-                    combined_region._page_range = (start_page.index, end_page.index)
-                    combined_region.start_element = start_element
-                    combined_region.end_element = end_element
-
-                    # Get all elements that fall within this multi-page region
-                    combined_elements = []
-
-                    # Get elements from the first page
-                    first_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == start_page and e.top >= start_element.top
-                    ]
-                    combined_elements.extend(first_page_elements)
-
-                    # Get elements from middle pages (if any)
-                    for page_idx in range(start_page.index + 1, end_page.index):
-                        middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
-                        combined_elements.extend(middle_page_elements)
-
-                    # Get elements from the last page
-                    last_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == end_page and e.bottom <= end_element.bottom
-                    ]
-                    combined_elements.extend(last_page_elements)
-
-                    # Store the elements in the combined region
-                    combined_region._multi_page_elements = combined_elements
-
-                    sections.append(combined_region)
+                    # Create FlowRegion spanning pages
+                    flow_region = _build_flow_region(start_element, end_element)
+                    sections.append(flow_region)
 
                 current_start = None
 
@@ -2394,54 +2454,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 last_page_elements.sort(key=lambda e: (e.top, e.x0))
                 end_element = last_page_elements[-1] if last_page_elements else None
 
-                # Create a multi-page section
-                from natural_pdf.elements.region import Region
-
-                if start_page == last_page:
-                    # Simple case - both on same page
-                    section = start_page.get_section_between(
-                        start_element, end_element, boundary_inclusion
-                    )
-                    sections.append(section)
-                else:
-                    # Create a multi-page section
-                    combined_region = Region(
-                        start_page, (0, start_element.top, start_page.width, start_page.height)
-                    )
-                    combined_region._spans_pages = True
-                    combined_region._page_range = (start_page.index, last_page.index)
-                    combined_region.start_element = start_element
-                    combined_region.end_element = end_element
-
-                    # Get all elements that fall within this multi-page region
-                    combined_elements = []
-
-                    # Get elements from the first page
-                    first_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == start_page and e.top >= start_element.top
-                    ]
-                    combined_elements.extend(first_page_elements)
-
-                    # Get elements from middle pages (if any)
-                    for page_idx in range(start_page.index + 1, last_page.index):
-                        middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
-                        combined_elements.extend(middle_page_elements)
-
-                    # Get elements from the last page
-                    last_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == last_page
-                        and (end_element is None or e.bottom <= end_element.bottom)
-                    ]
-                    combined_elements.extend(last_page_elements)
-
-                    # Store the elements in the combined region
-                    combined_region._multi_page_elements = combined_elements
-
-                    sections.append(combined_region)
+                # Create FlowRegion spanning multiple pages using helper
+                flow_region = _build_flow_region(start_element, end_element)
+                sections.append(flow_region)
             else:
                 # With start_elements only, create a section to the end of the current page
                 from natural_pdf.elements.region import Region
@@ -2629,13 +2644,13 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
     def to_image(
         self,
-        page_width: int = 300,
+        page_width: Optional[int] = None,
         cols: Optional[int] = 4,
         rows: Optional[int] = None,
         max_pages: Optional[int] = None,
         spacing: int = 10,
-        add_labels: bool = True,
-        show_category: bool = False,  # Add new flag
+        add_labels: bool = True,  # Add new flag
+        show_category: bool = False,
     ) -> Optional["Image.Image"]:
         """
         Generate a grid of page images for this collection.
@@ -2652,6 +2667,16 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         Returns:
             PIL Image of the page grid or None if no pages
         """
+        # Determine default page width from global options if not explicitly provided
+        if page_width is None:
+            try:
+                import natural_pdf
+
+                page_width = natural_pdf.options.image.width or 300
+            except Exception:
+                # Fallback if natural_pdf import fails in some edge context
+                page_width = 300
+
         # Ensure PIL is imported, handle potential ImportError if not done globally/lazily
         try:
             from PIL import Image, ImageDraw, ImageFont
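
A hedged sketch of the new default sizing, assuming natural_pdf.options.image.width is settable at runtime, as the fallback above implies (file name illustrative):

    import natural_pdf

    pages = natural_pdf.PDF("sample.pdf").pages

    grid_default = pages.to_image()                  # uses options.image.width, else 300
    natural_pdf.options.image.width = 450
    grid_wide = pages.to_image()                     # picks up the new global default
    grid_explicit = pages.to_image(page_width=200)   # an explicit argument still wins
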
@@ -2949,3 +2974,17 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             # Re-raise the exception caught from the exporter
             raise e  # Keep the original exception type (ValueError, RuntimeError, etc.)
         # <--- END MODIFIED
+
+    # Alias .to_image() to .show() for convenience
+    def show(
+        self,
+        *args,
+        **kwargs,
+    ) -> Optional["Image.Image"]:
+        """Display pages similarly to ``to_image``.
+
+        This is a thin wrapper around :py:meth:`to_image` so that the API mirrors
+        ElementCollection, where ``show()`` already exists. It forwards all
+        arguments and returns the resulting ``PIL.Image`` instance.
+        """
+        return self.to_image(*args, **kwargs)
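
With the alias in place, the two calls below should be interchangeable, since all arguments are forwarded verbatim (file name illustrative):

    from natural_pdf import PDF

    pages = PDF("sample.pdf").pages
    img_a = pages.show(cols=2, max_pages=4)      # thin wrapper...
    img_b = pages.to_image(cols=2, max_pages=4)  # ...over the same implementation
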
@@ -94,6 +94,14 @@ class LineElement(Element):
         # Vertical if x-change is within tolerance and y-change is significant
         return dx <= tolerance and dy > tolerance
 
+    @property
+    def orientation(self) -> str:
+        """Get the orientation of the line ('horizontal', 'vertical', or 'diagonal')."""
+        if self.is_horizontal:
+            return "horizontal"
+        elif self.is_vertical:
+            return "vertical"
+
     def text_above(self, distance: float = 5, **kwargs) -> Any:
         """
         Get text elements above this line.
@@ -142,7 +150,4 @@ class LineElement(Element):
 
     def __repr__(self) -> str:
         """String representation of the line element."""
-        line_type = (
-            "horizontal" if self.is_horizontal else "vertical" if self.is_vertical else "diagonal"
-        )
-        return f"<LineElement type={line_type} width={self.width:.1f} bbox={self.bbox}>"
+        return f"<LineElement type={self.orientation} width={self.width:.1f} bbox={self.bbox}>"