natural-pdf 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. natural_pdf/analyzers/shape_detection_mixin.py +43 -3
  2. natural_pdf/classification/manager.py +1 -1
  3. natural_pdf/classification/mixin.py +35 -14
  4. natural_pdf/classification/results.py +16 -1
  5. natural_pdf/cli.py +1 -0
  6. natural_pdf/core/highlighting_service.py +23 -0
  7. natural_pdf/core/page.py +32 -2
  8. natural_pdf/core/pdf.py +24 -4
  9. natural_pdf/describe/base.py +11 -1
  10. natural_pdf/describe/summary.py +26 -0
  11. natural_pdf/elements/base.py +81 -3
  12. natural_pdf/elements/collections.py +162 -101
  13. natural_pdf/elements/region.py +187 -160
  14. natural_pdf/elements/text.py +15 -7
  15. natural_pdf/exporters/paddleocr.py +1 -1
  16. natural_pdf/extraction/manager.py +2 -2
  17. natural_pdf/extraction/mixin.py +295 -11
  18. natural_pdf/extraction/result.py +28 -1
  19. natural_pdf/flows/region.py +117 -2
  20. natural_pdf/ocr/engine_surya.py +25 -5
  21. natural_pdf/qa/__init__.py +2 -1
  22. natural_pdf/qa/document_qa.py +166 -113
  23. natural_pdf/qa/qa_result.py +55 -0
  24. natural_pdf/selectors/parser.py +22 -0
  25. natural_pdf/utils/text_extraction.py +34 -14
  26. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/METADATA +22 -13
  27. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/RECORD +31 -30
  28. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/WHEEL +0 -0
  29. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/entry_points.txt +0 -0
  30. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/licenses/LICENSE +0 -0
  31. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/top_level.txt +0 -0
@@ -290,7 +290,13 @@ class ElementCollection(
 
         return ElementCollection(filtered)
 
-    def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
+    def extract_text(
+        self,
+        preserve_whitespace: bool = True,
+        use_exclusions: bool = True,
+        strip: Optional[bool] = None,
+        **kwargs,
+    ) -> str:
         """
         Extract text from all TextElements in the collection, optionally using
         pdfplumber's layout engine if layout=True is specified.
@@ -303,6 +309,7 @@ class ElementCollection(
                 `chars_to_textmap` function ONLY if `layout=True` is passed.
                 See Page.extract_text docstring for common parameters.
                 If `layout=False` or omitted, performs a simple join.
+            strip: Whether to strip whitespace from the extracted text.
 
         Returns:
             Combined text from elements, potentially with layout-based spacing.
@@ -399,6 +406,12 @@ class ElementCollection(
             result = "".join(c.get("text", "") for c in all_char_dicts)
             # Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
 
+            # Determine final strip flag – same rule as global helper unless caller overrides
+            strip_text = strip if strip is not None else (not use_layout)
+
+            if strip_text and isinstance(result, str):
+                result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
+
             return result
 
     def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
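The new `strip` parameter threads through the extraction path above: when left as `None`, output is stripped only for plain joins (`layout=False`), while `layout=True` keeps its whitespace. A minimal usage sketch, assuming the usual `natural_pdf.PDF` entry point and a `find_all("text")` selector (file name and selector are illustrative):

```python
from natural_pdf import PDF

pdf = PDF("report.pdf")                     # hypothetical input file
words = pdf.pages[0].find_all("text")       # an ElementCollection of text elements

plain = words.extract_text()                # simple join; stripped by default
layout = words.extract_text(layout=True)    # layout mode; whitespace kept by default
raw = words.extract_text(strip=False)       # explicit override: keep trailing whitespace
```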
@@ -852,6 +865,7 @@ class ElementCollection(
         render_ocr: bool = False,
         width: Optional[int] = None,  # Add width parameter
         page: Optional[Any] = None,  # NEW: Optional page parameter for empty collections
+        crop: bool = False,  # NEW: If True, crop output to element bounds
     ) -> Optional["Image.Image"]:
         """
         Generates a temporary preview image highlighting elements in this collection
@@ -875,6 +889,9 @@ class ElementCollection(
             legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
             render_ocr: Whether to render OCR text.
             width: Optional width for the output image in pixels.
+            crop: If True, crop the resulting image to the tight bounding box
+                  containing all elements in the collection. The elements are
+                  still highlighted first, then the image is cropped.
 
         Returns:
             PIL Image object of the temporary preview, or None if rendering fails or
@@ -931,7 +948,23 @@ class ElementCollection(
 
         # 2. Call render_preview on the HighlightingService
         try:
-            return service.render_preview(
+            # Calculate crop bounding box in PDF coordinates if crop is requested
+            crop_bbox = None
+            if crop:
+                try:
+                    crop_bbox = (
+                        min(el.x0 for el in self._elements),
+                        min(el.top for el in self._elements),
+                        max(el.x1 for el in self._elements),
+                        max(el.bottom for el in self._elements),
+                    )
+                except Exception as bbox_err:
+                    logger.error(
+                        f"Error determining crop bbox for collection show: {bbox_err}",
+                        exc_info=True,
+                    )
+
+            img = service.render_preview(
                 page_index=page.index,
                 temporary_highlights=highlight_data_list,
                 scale=scale,
@@ -939,7 +972,9 @@ class ElementCollection(
                 labels=labels,  # Use 'labels'
                 legend_position=legend_position,
                 render_ocr=render_ocr,
+                crop_bbox=crop_bbox,
             )
+            return img
         except Exception as e:
             logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
             return None
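Taken together, the two hunks above add a `crop` flag to `ElementCollection.show()`: the highlights are rendered first, then the preview is cropped to the union of the element bounding boxes via the new `crop_bbox` argument to `render_preview`. A short sketch, continuing the hypothetical `pdf` object from the earlier example:

```python
# Highlight every rect on the first page, then crop the preview to their bounds.
rects = pdf.pages[0].find_all("rect")        # selector is illustrative
preview = rects.show(crop=True, labels=False)
if preview is not None:                      # show() returns None if rendering fails
    preview.save("rects_preview.png")
```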
@@ -1798,8 +1833,40 @@ class ElementCollection(
             # Mix object bounds with specific overrides
             clipped_elements = collection.clip(obj=container, bottom=page.height/2)
         """
+        # --- NEW BEHAVIOUR: support per-element clipping with sequences --- #
+        from collections.abc import Sequence  # Local import to avoid top-level issues
+
+        # Detect if *obj* is a sequence meant to map one-to-one with the elements
+        clip_objs = None  # type: Optional[List[Any]]
+        if isinstance(obj, ElementCollection):
+            clip_objs = obj.elements
+        elif isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)):
+            clip_objs = list(obj)
+
+        if clip_objs is not None:
+            if len(clip_objs) != len(self._elements):
+                raise ValueError(
+                    f"Number of clipping objects ({len(clip_objs)}) does not match number of "
+                    f"elements in collection ({len(self._elements)})."
+                )
+
+            clipped_elements = [
+                el.clip(
+                    obj=clip_obj,
+                    left=left,
+                    top=top,
+                    right=right,
+                    bottom=bottom,
+                )
+                for el, clip_obj in zip(self._elements, clip_objs)
+            ]
+            return ElementCollection(clipped_elements)
+
+        # Fallback to original behaviour: apply same clipping parameters to all elements
         return self.apply(
-            lambda element: element.clip(obj=obj, left=left, top=top, right=right, bottom=bottom)
+            lambda element: element.clip(
+                obj=obj, left=left, top=top, right=right, bottom=bottom
+            )
         )
 
 
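`clip()` now also accepts a sequence of clipping objects (or another `ElementCollection`) matched one-to-one with the elements; a length mismatch raises `ValueError`, and a single object or plain bounds still apply to every element as before. A hedged sketch, with illustrative selectors:

```python
# Per-element clipping: element i is clipped against rows[i].
cells = pdf.pages[0].find_all("region[type=table_cell]")   # hypothetical selector
rows = pdf.pages[0].find_all("region[type=table_row]")     # must match len(cells)

clipped = cells.clip(obj=rows)                   # one clipping region per element

# Scalar overrides are unchanged: the same bound applies to every element.
half = cells.clip(bottom=pdf.pages[0].height / 2)
```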
@@ -1838,13 +1905,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         """Return a string representation showing the page count."""
         return f"<PageCollection(count={len(self)})>"
 
-    def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
+    def extract_text(
+        self,
+        keep_blank_chars: bool = True,
+        apply_exclusions: bool = True,
+        strip: Optional[bool] = None,
+        **kwargs,
+    ) -> str:
         """
         Extract text from all pages in the collection.
 
         Args:
             keep_blank_chars: Whether to keep blank characters (default: True)
             apply_exclusions: Whether to apply exclusion regions (default: True)
+            strip: Whether to strip whitespace from the extracted text.
             **kwargs: Additional extraction parameters
 
         Returns:
@@ -1853,11 +1927,22 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         texts = []
         for page in self.pages:
             text = page.extract_text(
-                keep_blank_chars=keep_blank_chars, apply_exclusions=apply_exclusions, **kwargs
+                keep_blank_chars=keep_blank_chars,
+                apply_exclusions=apply_exclusions,
+                **kwargs,
             )
             texts.append(text)
 
-        return "\n".join(texts)
+        combined = "\n".join(texts)
+
+        # Default strip behaviour: if caller picks, honour; else respect layout flag passed via kwargs.
+        use_layout = kwargs.get("layout", False)
+        strip_final = strip if strip is not None else (not use_layout)
+
+        if strip_final:
+            combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()
+
+        return combined
 
     def apply_ocr(
         self,
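`PageCollection.extract_text()` follows the same convention: the per-page text is joined with newlines, and unless the caller passes `strip`, the combined string is stripped only when `layout` is not requested through `**kwargs`. A sketch, assuming slicing `pdf.pages` yields a `PageCollection`:

```python
pages = pdf.pages[:3]                                   # a PageCollection slice (assumed)
text = pages.extract_text()                             # stripped by default
layout_text = pages.extract_text(layout=True)           # layout spacing preserved
forced = pages.extract_text(layout=True, strip=True)    # override: strip anyway
```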
@@ -2253,6 +2338,44 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
         # Generate sections
         sections = []
+
+        # --- Helper: build a FlowRegion spanning multiple pages ---
+        def _build_flow_region(start_el, end_el):
+            """Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
+            If *end_el* is None, the region continues to the bottom of the last
+            page in this PageCollection."""
+            # Local imports to avoid top-level cycles
+            from natural_pdf.elements.region import Region
+            from natural_pdf.flows.flow import Flow
+            from natural_pdf.flows.element import FlowElement
+            from natural_pdf.flows.region import FlowRegion
+
+            start_pg = start_el.page
+            end_pg = end_el.page if end_el is not None else self.pages[-1]
+
+            parts: list[Region] = []
+            # Slice of first page
+            parts.append(Region(start_pg, (0, start_el.top, start_pg.width, start_pg.height)))
+
+            # Full middle pages
+            for pg_idx in range(start_pg.index + 1, end_pg.index):
+                mid_pg = self.pages[pg_idx]
+                parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))
+
+            # Slice of last page (if distinct)
+            if end_pg is not start_pg:
+                bottom = end_el.bottom if end_el is not None else end_pg.height
+                parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
+
+            flow = Flow(segments=parts, arrangement="vertical")
+            src_fe = FlowElement(physical_object=start_el, flow=flow)
+            return FlowRegion(flow=flow,
+                              constituent_regions=parts,
+                              source_flow_element=src_fe,
+                              boundary_element_found=end_el)
+
+        # ------------------------------------------------------------------
+
         current_start = None
 
         for i, boundary in enumerate(section_boundaries):
@@ -2273,50 +2396,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                     )
                     sections.append(section)
                 else:
-                    # Create a multi-page section
-                    from natural_pdf.elements.region import Region
-
-                    # Get the start and end pages
-                    start_page = start_element.page
-                    end_page = end_element.page
-
-                    # Create a combined region
-                    combined_region = Region(
-                        start_page, (0, start_element.top, start_page.width, start_page.height)
-                    )
-                    combined_region._spans_pages = True
-                    combined_region._page_range = (start_page.index, end_page.index)
-                    combined_region.start_element = start_element
-                    combined_region.end_element = end_element
-
-                    # Get all elements that fall within this multi-page region
-                    combined_elements = []
-
-                    # Get elements from the first page
-                    first_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == start_page and e.top >= start_element.top
-                    ]
-                    combined_elements.extend(first_page_elements)
-
-                    # Get elements from middle pages (if any)
-                    for page_idx in range(start_page.index + 1, end_page.index):
-                        middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
-                        combined_elements.extend(middle_page_elements)
-
-                    # Get elements from the last page
-                    last_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == end_page and e.bottom <= end_element.bottom
-                    ]
-                    combined_elements.extend(last_page_elements)
-
-                    # Store the elements in the combined region
-                    combined_region._multi_page_elements = combined_elements
-
-                    sections.append(combined_region)
+                    # Create FlowRegion spanning pages
+                    flow_region = _build_flow_region(start_element, end_element)
+                    sections.append(flow_region)
 
                 current_start = None
 
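Cross-page sections are now returned as `FlowRegion` objects built by `_build_flow_region`, stitched from one `Region` slice per physical page, rather than a single `Region` carrying private `_spans_pages` / `_multi_page_elements` attributes. A hedged sketch of what a caller might see; the `get_sections` name and the selector are assumptions based on the surrounding code, and `constituent_regions` is assumed to be exposed by `FlowRegion`:

```python
sections = pdf.pages.get_sections(                      # method name assumed
    start_elements="text:contains('Chapter')",          # illustrative selector
    boundary_inclusion="start",
)

for section in sections:
    if type(section).__name__ == "FlowRegion":
        # One constituent Region per physical page slice (see _build_flow_region above).
        print(len(section.constituent_regions), "page slices")
```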
@@ -2372,54 +2454,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 last_page_elements.sort(key=lambda e: (e.top, e.x0))
                 end_element = last_page_elements[-1] if last_page_elements else None
 
-                # Create a multi-page section
-                from natural_pdf.elements.region import Region
-
-                if start_page == last_page:
-                    # Simple case - both on same page
-                    section = start_page.get_section_between(
-                        start_element, end_element, boundary_inclusion
-                    )
-                    sections.append(section)
-                else:
-                    # Create a multi-page section
-                    combined_region = Region(
-                        start_page, (0, start_element.top, start_page.width, start_page.height)
-                    )
-                    combined_region._spans_pages = True
-                    combined_region._page_range = (start_page.index, last_page.index)
-                    combined_region.start_element = start_element
-                    combined_region.end_element = end_element
-
-                    # Get all elements that fall within this multi-page region
-                    combined_elements = []
-
-                    # Get elements from the first page
-                    first_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == start_page and e.top >= start_element.top
-                    ]
-                    combined_elements.extend(first_page_elements)
-
-                    # Get elements from middle pages (if any)
-                    for page_idx in range(start_page.index + 1, last_page.index):
-                        middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
-                        combined_elements.extend(middle_page_elements)
-
-                    # Get elements from the last page
-                    last_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == last_page
-                        and (end_element is None or e.bottom <= end_element.bottom)
-                    ]
-                    combined_elements.extend(last_page_elements)
-
-                    # Store the elements in the combined region
-                    combined_region._multi_page_elements = combined_elements
-
-                    sections.append(combined_region)
+                # Create FlowRegion spanning multiple pages using helper
+                flow_region = _build_flow_region(start_element, end_element)
+                sections.append(flow_region)
             else:
                 # With start_elements only, create a section to the end of the current page
                 from natural_pdf.elements.region import Region
@@ -2607,13 +2644,13 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
     def to_image(
         self,
-        page_width: int = 300,
+        page_width: Optional[int] = None,
         cols: Optional[int] = 4,
         rows: Optional[int] = None,
         max_pages: Optional[int] = None,
        spacing: int = 10,
-        add_labels: bool = True,
-        show_category: bool = False,  # Add new flag
+        add_labels: bool = True,  # Add new flag
+        show_category: bool = False,
     ) -> Optional["Image.Image"]:
         """
         Generate a grid of page images for this collection.
@@ -2630,6 +2667,16 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         Returns:
             PIL Image of the page grid or None if no pages
         """
+        # Determine default page width from global options if not explicitly provided
+        if page_width is None:
+            try:
+                import natural_pdf
+
+                page_width = natural_pdf.options.image.width or 300
+            except Exception:
+                # Fallback if natural_pdf import fails in some edge context
+                page_width = 300
+
         # Ensure PIL is imported, handle potential ImportError if not done globally/lazily
         try:
             from PIL import Image, ImageDraw, ImageFont
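Because `page_width` now defaults to the global image option (falling back to 300), the grid width can be set once per session instead of per call. A sketch using the `natural_pdf.options.image.width` setting referenced in the hunk:

```python
import natural_pdf

natural_pdf.options.image.width = 240   # session-wide default for page thumbnails
grid = pdf.pages.to_image(cols=3)       # no page_width argument needed
```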
@@ -2927,3 +2974,17 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 # Re-raise the exception caught from the exporter
                 raise e  # Keep the original exception type (ValueError, RuntimeError, etc.)
         # <--- END MODIFIED
+
+    # Alias .to_image() to .show() for convenience
+    def show(
+        self,
+        *args,
+        **kwargs,
+    ) -> Optional["Image.Image"]:
+        """Display pages similarly to ``to_image``.
+
+        This is a thin wrapper around :py:meth:`to_image` so that the API mirrors
+        ElementCollection, where ``show()`` already exists. It forwards all
+        arguments and returns the resulting ``PIL.Image`` instance.
+        """
+        return self.to_image(*args, **kwargs)
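Since `show()` simply forwards to `to_image()`, the two calls below are interchangeable, mirroring `ElementCollection.show()`:

```python
grid_a = pdf.pages.to_image(cols=3)
grid_b = pdf.pages.show(cols=3)   # same result via the new alias
```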