natural-pdf 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1490,6 +1490,45 @@ class ShapeDetectionMixin:
 
         element_manager = page_object_for_elements._element_mgr
 
+        # ------------------------------------------------------------------
+        # CLEAN-UP existing table-related regions from earlier runs to avoid duplicates
+        # ------------------------------------------------------------------
+        try:
+            _purge_types = {"table", "table_row", "table_column", "table_cell"}
+
+            if (
+                hasattr(element_manager, "_elements")
+                and "regions" in element_manager._elements
+            ):
+                _orig_len = len(element_manager._elements["regions"])
+                element_manager._elements["regions"] = [
+                    r
+                    for r in element_manager._elements["regions"]
+                    if not (
+                        getattr(r, "source", None) == source_label
+                        and getattr(r, "region_type", None) in _purge_types
+                    )
+                ]
+                _removed = _orig_len - len(element_manager._elements["regions"])
+                if _removed:
+                    logger.info(
+                        f"Removed {_removed} previous table-related regions (source='{source_label}') before regeneration."
+                    )
+
+            if hasattr(page_object_for_elements, "_regions") and "detected" in page_object_for_elements._regions:
+                page_object_for_elements._regions["detected"] = [
+                    r
+                    for r in page_object_for_elements._regions["detected"]
+                    if not (
+                        getattr(r, "source", None) == source_label
+                        and getattr(r, "region_type", None) in _purge_types
+                    )
+                ]
+        except Exception as _cleanup_err:
+            logger.warning(
+                f"Table-region cleanup failed: {_cleanup_err}", exc_info=True
+            )
+
         # Get lines with the specified source
         all_lines = element_manager.lines  # Access lines from the correct element manager
         filtered_lines = [
@@ -1724,6 +1763,7 @@ class ShapeDetectionMixin:
         logger.info(
             f"Created {tables_created} table, {rows_created} rows, {cols_created} columns, and {cells_created} table cells from detected lines (source: '{source_label}') for {self}."
         )
+
         return self
 
 
@@ -863,10 +863,10 @@ class HighlightingService:
         if crop_bbox is not None:
             cb_x0, cb_top, cb_x1, cb_bottom = crop_bbox
             # Convert to pixel coordinates using actual scales
-            left_px = int(cb_x0 * actual_scale_x) - 2
-            top_px = int(cb_top * actual_scale_y) - 2
-            right_px = int(cb_x1 * actual_scale_x) + 2
-            bottom_px = int(cb_bottom * actual_scale_y) + 2
+            left_px = int(cb_x0 * actual_scale_x) - 1
+            top_px = int(cb_top * actual_scale_y) - 1
+            right_px = int(cb_x1 * actual_scale_x) + 1
+            bottom_px = int(cb_bottom * actual_scale_y) + 1
 
             # Safeguard coordinates within bounds
             left_px = max(0, min(left_px, rendered_image.width - 1))
natural_pdf/core/page.py CHANGED
@@ -2235,12 +2235,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
 
     def ask(
         self,
-        question: str,
+        question: Union[str, List[str], Tuple[str, ...]],
         min_confidence: float = 0.1,
         model: str = None,
         debug: bool = False,
         **kwargs,
-    ) -> Dict[str, Any]:
+    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
         """
         Ask a question about the page content using document QA.
         """
@@ -2824,3 +2824,17 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         if not hasattr(self, "metadata") or self.metadata is None:
             self.metadata = {}
         self.metadata["analysis"] = value
+
+    def inspect(self, limit: int = 30) -> "InspectionSummary":
+        """
+        Inspect all elements on this page with detailed tabular view.
+        Equivalent to page.find_all('*').inspect().
+
+        Args:
+            limit: Maximum elements per type to show (default: 30)
+
+        Returns:
+            InspectionSummary with element tables showing coordinates,
+            properties, and other details for each element
+        """
+        return self.find_all('*').inspect(limit=limit)
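Two additions to Page shown above are easiest to see side by side: ask() now accepts a single question or a list/tuple of questions (returning one result dict or a list of them), and inspect() is a thin wrapper over page.find_all('*').inspect(). A minimal usage sketch (the file name is illustrative, and the keys inside each QA result dict are not shown in this diff):

    from natural_pdf import PDF

    pdf = PDF("invoice.pdf")  # hypothetical input file
    page = pdf.pages[0]

    # Single question -> one result dict; list of questions -> list of result dicts
    single = page.ask("What is the invoice number?")
    batch = page.ask(["What is the invoice number?", "What is the total amount?"])
    for result in batch:
        print(result)

    # Tabular overview of every element on the page, capped at 10 rows per element type
    summary = page.inspect(limit=10)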
@@ -269,7 +269,7 @@ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str
     base_columns = ['x0', 'top', 'x1', 'bottom']
 
     if element_type == 'word':
-        columns = ['text'] + base_columns + ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']
+        columns = ['text'] + base_columns + ['font_family', 'font_variant', 'size', 'bold', 'italic', 'source', 'confidence']
         # Add color for text elements
         columns.append('color')
     elif element_type == 'rect':
@@ -315,6 +315,16 @@ def _extract_element_value(element: "Element", column: str) -> Any:
         # Fallback to fontname
         return getattr(element, 'fontname', '')
 
+    elif column == 'font_variant':
+        variant = getattr(element, 'font_variant', None)
+        if variant:
+            return variant
+        # Fallback – try to derive from fontname if property missing
+        fontname = getattr(element, 'fontname', '')
+        if "+" in fontname:
+            return fontname.split("+", 1)[0]
+        return ''
+
     elif column in ['bold', 'italic']:
         value = getattr(element, column, False)
         return value if isinstance(value, bool) else False
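The font_variant fallback derives the value from the subset prefix that PDF producers prepend to embedded subset fonts; the same derivation in isolation, using an illustrative font name:

    fontname = "ABCDEF+Helvetica"  # typical subset-font name found in a PDF
    variant = fontname.split("+", 1)[0] if "+" in fontname else ""
    # variant == "ABCDEF"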
@@ -128,6 +128,32 @@ class ElementSummary:
             ""
         ]
 
+    # Added for better VS Code and other frontends support
+    def _repr_html_(self) -> str:  # type: ignore
+        """Return HTML representation so rich rendering works in more frontends.
+
+        Many notebook frontends (including VS Code) give priority to the
+        ``_repr_html_`` method over Markdown. When available, we convert the
+        generated Markdown to HTML using the *markdown* library. If the
+        library is not installed we simply wrap the Markdown in a ``<pre>``
+        block so that at least the plain-text representation is visible.
+        """
+        md_source = self._to_markdown()
+        try:
+            import markdown as _markdown  # pylint: disable=import-error
+
+            # Convert markdown to HTML. We explicitly enable tables so the
+            # element and inspection summaries render nicely.
+            return _markdown.markdown(md_source, extensions=["tables"])
+        except Exception:  # noqa: BLE001, broad-except
+            # Fallback: present the Markdown as-is inside a <pre> block.
+            escaped = (
+                md_source.replace("&", "&amp;")
+                .replace("<", "&lt;")
+                .replace(">", "&gt;")
+            )
+            return f"<pre>{escaped}</pre>"
+
 
 class InspectionSummary(ElementSummary):
     """
@@ -174,8 +174,8 @@ class DirectionalMixin:
         # Adjust cross boundaries if cross_size is 'element'
         if cross_size == "element":
             if is_horizontal:  # Adjust y0, y1
-                y0 = min(y0, self.y0)
-                y1 = max(y1, self.y1)
+                y0 = min(y0, self.top)
+                y1 = max(y1, self.bottom)
             else:  # Adjust x0, x1
                 x0 = min(x0, self.x0)
                 x1 = max(x1, self.x1)
@@ -290,7 +290,13 @@ class ElementCollection(
 
         return ElementCollection(filtered)
 
-    def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
+    def extract_text(
+        self,
+        preserve_whitespace: bool = True,
+        use_exclusions: bool = True,
+        strip: Optional[bool] = None,
+        **kwargs,
+    ) -> str:
         """
         Extract text from all TextElements in the collection, optionally using
         pdfplumber's layout engine if layout=True is specified.
@@ -303,6 +309,7 @@ class ElementCollection(
                 `chars_to_textmap` function ONLY if `layout=True` is passed.
                 See Page.extract_text docstring for common parameters.
                 If `layout=False` or omitted, performs a simple join.
+            strip: Whether to strip whitespace from the extracted text.
 
         Returns:
             Combined text from elements, potentially with layout-based spacing.
@@ -399,6 +406,12 @@ class ElementCollection(
             result = "".join(c.get("text", "") for c in all_char_dicts)
             # Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
 
+        # Determine final strip flag – same rule as global helper unless caller overrides
+        strip_text = strip if strip is not None else (not use_layout)
+
+        if strip_text and isinstance(result, str):
+            result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
+
         return result
 
     def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
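The strip default follows the layout flag: plain joins are stripped, layout-preserving extraction is left untouched unless the caller overrides it. A short sketch against an arbitrary collection of text elements:

    words = page.find_all('text')

    words.extract_text()                         # simple join, stripped by default
    words.extract_text(layout=True)              # layout mode, left unstripped by default
    words.extract_text(layout=True, strip=True)  # explicit strip always wins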
@@ -1820,8 +1833,40 @@ class ElementCollection(
             # Mix object bounds with specific overrides
             clipped_elements = collection.clip(obj=container, bottom=page.height/2)
         """
+        # --- NEW BEHAVIOUR: support per-element clipping with sequences --- #
+        from collections.abc import Sequence  # Local import to avoid top-level issues
+
+        # Detect if *obj* is a sequence meant to map one-to-one with the elements
+        clip_objs = None  # type: Optional[List[Any]]
+        if isinstance(obj, ElementCollection):
+            clip_objs = obj.elements
+        elif isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)):
+            clip_objs = list(obj)
+
+        if clip_objs is not None:
+            if len(clip_objs) != len(self._elements):
+                raise ValueError(
+                    f"Number of clipping objects ({len(clip_objs)}) does not match number of "
+                    f"elements in collection ({len(self._elements)})."
+                )
+
+            clipped_elements = [
+                el.clip(
+                    obj=clip_obj,
+                    left=left,
+                    top=top,
+                    right=right,
+                    bottom=bottom,
+                )
+                for el, clip_obj in zip(self._elements, clip_objs)
+            ]
+            return ElementCollection(clipped_elements)
+
+        # Fallback to original behaviour: apply same clipping parameters to all elements
         return self.apply(
-            lambda element: element.clip(obj=obj, left=left, top=top, right=right, bottom=bottom)
+            lambda element: element.clip(
+                obj=obj, left=left, top=top, right=right, bottom=bottom
+            )
         )
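clip() now also accepts an ElementCollection or a plain sequence holding exactly one clipping object per element in the collection; a hedged sketch, where column_regions is a hypothetical list of regions built elsewhere:

    words = page.find_all('text')

    # One clipping region per word; mismatched lengths raise ValueError
    clipped = words.clip(obj=column_regions)

    # Scalar overrides still combine with each per-element object
    clipped = words.clip(obj=column_regions, bottom=page.height / 2)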
@@ -1860,13 +1905,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         """Return a string representation showing the page count."""
         return f"<PageCollection(count={len(self)})>"
 
-    def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
+    def extract_text(
+        self,
+        keep_blank_chars: bool = True,
+        apply_exclusions: bool = True,
+        strip: Optional[bool] = None,
+        **kwargs,
+    ) -> str:
         """
         Extract text from all pages in the collection.
 
         Args:
             keep_blank_chars: Whether to keep blank characters (default: True)
             apply_exclusions: Whether to apply exclusion regions (default: True)
+            strip: Whether to strip whitespace from the extracted text.
             **kwargs: Additional extraction parameters
 
         Returns:
@@ -1875,11 +1927,22 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         texts = []
         for page in self.pages:
             text = page.extract_text(
-                keep_blank_chars=keep_blank_chars, apply_exclusions=apply_exclusions, **kwargs
+                keep_blank_chars=keep_blank_chars,
+                apply_exclusions=apply_exclusions,
+                **kwargs,
             )
             texts.append(text)
 
-        return "\n".join(texts)
+        combined = "\n".join(texts)
+
+        # Default strip behaviour: if caller picks, honour; else respect layout flag passed via kwargs.
+        use_layout = kwargs.get("layout", False)
+        strip_final = strip if strip is not None else (not use_layout)
+
+        if strip_final:
+            combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()
+
+        return combined
 
     def apply_ocr(
         self,
@@ -2275,6 +2338,44 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
         # Generate sections
         sections = []
+
+        # --- Helper: build a FlowRegion spanning multiple pages ---
+        def _build_flow_region(start_el, end_el):
+            """Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
+            If *end_el* is None, the region continues to the bottom of the last
+            page in this PageCollection."""
+            # Local imports to avoid top-level cycles
+            from natural_pdf.elements.region import Region
+            from natural_pdf.flows.flow import Flow
+            from natural_pdf.flows.element import FlowElement
+            from natural_pdf.flows.region import FlowRegion
+
+            start_pg = start_el.page
+            end_pg = end_el.page if end_el is not None else self.pages[-1]
+
+            parts: list[Region] = []
+            # Slice of first page
+            parts.append(Region(start_pg, (0, start_el.top, start_pg.width, start_pg.height)))
+
+            # Full middle pages
+            for pg_idx in range(start_pg.index + 1, end_pg.index):
+                mid_pg = self.pages[pg_idx]
+                parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))
+
+            # Slice of last page (if distinct)
+            if end_pg is not start_pg:
+                bottom = end_el.bottom if end_el is not None else end_pg.height
+                parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
+
+            flow = Flow(segments=parts, arrangement="vertical")
+            src_fe = FlowElement(physical_object=start_el, flow=flow)
+            return FlowRegion(flow=flow,
+                              constituent_regions=parts,
+                              source_flow_element=src_fe,
+                              boundary_element_found=end_el)
+
+        # ------------------------------------------------------------------
+
         current_start = None
 
         for i, boundary in enumerate(section_boundaries):
@@ -2295,50 +2396,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                     )
                     sections.append(section)
                 else:
-                    # Create a multi-page section
-                    from natural_pdf.elements.region import Region
-
-                    # Get the start and end pages
-                    start_page = start_element.page
-                    end_page = end_element.page
-
-                    # Create a combined region
-                    combined_region = Region(
-                        start_page, (0, start_element.top, start_page.width, start_page.height)
-                    )
-                    combined_region._spans_pages = True
-                    combined_region._page_range = (start_page.index, end_page.index)
-                    combined_region.start_element = start_element
-                    combined_region.end_element = end_element
-
-                    # Get all elements that fall within this multi-page region
-                    combined_elements = []
-
-                    # Get elements from the first page
-                    first_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == start_page and e.top >= start_element.top
-                    ]
-                    combined_elements.extend(first_page_elements)
-
-                    # Get elements from middle pages (if any)
-                    for page_idx in range(start_page.index + 1, end_page.index):
-                        middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
-                        combined_elements.extend(middle_page_elements)
-
-                    # Get elements from the last page
-                    last_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == end_page and e.bottom <= end_element.bottom
-                    ]
-                    combined_elements.extend(last_page_elements)
-
-                    # Store the elements in the combined region
-                    combined_region._multi_page_elements = combined_elements
-
-                    sections.append(combined_region)
+                    # Create FlowRegion spanning pages
+                    flow_region = _build_flow_region(start_element, end_element)
+                    sections.append(flow_region)
 
                 current_start = None
 
@@ -2394,54 +2454,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 last_page_elements.sort(key=lambda e: (e.top, e.x0))
                 end_element = last_page_elements[-1] if last_page_elements else None
 
-                # Create a multi-page section
-                from natural_pdf.elements.region import Region
-
-                if start_page == last_page:
-                    # Simple case - both on same page
-                    section = start_page.get_section_between(
-                        start_element, end_element, boundary_inclusion
-                    )
-                    sections.append(section)
-                else:
-                    # Create a multi-page section
-                    combined_region = Region(
-                        start_page, (0, start_element.top, start_page.width, start_page.height)
-                    )
-                    combined_region._spans_pages = True
-                    combined_region._page_range = (start_page.index, last_page.index)
-                    combined_region.start_element = start_element
-                    combined_region.end_element = end_element
-
-                    # Get all elements that fall within this multi-page region
-                    combined_elements = []
-
-                    # Get elements from the first page
-                    first_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == start_page and e.top >= start_element.top
-                    ]
-                    combined_elements.extend(first_page_elements)
-
-                    # Get elements from middle pages (if any)
-                    for page_idx in range(start_page.index + 1, last_page.index):
-                        middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
-                        combined_elements.extend(middle_page_elements)
-
-                    # Get elements from the last page
-                    last_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == last_page
-                        and (end_element is None or e.bottom <= end_element.bottom)
-                    ]
-                    combined_elements.extend(last_page_elements)
-
-                    # Store the elements in the combined region
-                    combined_region._multi_page_elements = combined_elements
-
-                    sections.append(combined_region)
+                # Create FlowRegion spanning multiple pages using helper
+                flow_region = _build_flow_region(start_element, end_element)
+                sections.append(flow_region)
             else:
                 # With start_elements only, create a section to the end of the current page
                 from natural_pdf.elements.region import Region
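Sections that cross page boundaries are now returned as FlowRegion objects assembled from per-page Region slices, rather than a single Region carrying private _spans_pages state. A hedged sketch, assuming the enclosing method is the collection's get_sections() entry point (the public method name and the selector are not shown in this diff):

    sections = pdf.pages.get_sections(
        start_elements='text:contains("Section")',
        boundary_inclusion="start",
    )
    for section in sections:
        # Sections spanning several pages arrive as FlowRegion objects
        print(type(section).__name__)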
@@ -2629,13 +2644,13 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
     def to_image(
        self,
-        page_width: int = 300,
+        page_width: Optional[int] = None,
         cols: Optional[int] = 4,
         rows: Optional[int] = None,
         max_pages: Optional[int] = None,
         spacing: int = 10,
-        add_labels: bool = True,
-        show_category: bool = False,  # Add new flag
+        add_labels: bool = True,  # Add new flag
+        show_category: bool = False,
     ) -> Optional["Image.Image"]:
         """
         Generate a grid of page images for this collection.
@@ -2652,6 +2667,16 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         Returns:
             PIL Image of the page grid or None if no pages
         """
+        # Determine default page width from global options if not explicitly provided
+        if page_width is None:
+            try:
+                import natural_pdf
+
+                page_width = natural_pdf.options.image.width or 300
+            except Exception:
+                # Fallback if natural_pdf import fails in some edge context
+                page_width = 300
+
         # Ensure PIL is imported, handle potential ImportError if not done globally/lazily
         try:
             from PIL import Image, ImageDraw, ImageFont
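With page_width defaulting to None, the grid cell width can now be set once through the global options object referenced above; a short sketch:

    import natural_pdf

    natural_pdf.options.image.width = 400  # picked up whenever page_width is None
    grid = pdf.pages.to_image(cols=3)

    # An explicit argument still takes precedence over the global option
    grid = pdf.pages.to_image(page_width=250, cols=3)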
@@ -2949,3 +2974,17 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             # Re-raise the exception caught from the exporter
             raise e  # Keep the original exception type (ValueError, RuntimeError, etc.)
         # <--- END MODIFIED
+
+    # Alias .to_image() to .show() for convenience
+    def show(
+        self,
+        *args,
+        **kwargs,
+    ) -> Optional["Image.Image"]:
+        """Display pages similarly to ``to_image``.
+
+        This is a thin wrapper around :py:meth:`to_image` so that the API mirrors
+        ElementCollection, where ``show()`` already exists. It forwards all
+        arguments and returns the resulting ``PIL.Image`` instance.
+        """
+        return self.to_image(*args, **kwargs)
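Because show() simply forwards to to_image(), the two calls below are interchangeable on a page collection:

    img = pdf.pages.show(cols=4, max_pages=8, spacing=10)
    img = pdf.pages.to_image(cols=4, max_pages=8, spacing=10)  # equivalent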