natural-pdf 0.2.12__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -92,6 +92,16 @@ class HighlightRenderer:
92
92
 
93
93
  def _draw_highlights(self):
94
94
  """Draws all highlight shapes, borders, vertices, and attributes."""
95
+ # Get the pdfplumber page offset for coordinate translation
96
+ page_offset_x = 0
97
+ page_offset_y = 0
98
+
99
+ if hasattr(self.page, "_page") and hasattr(self.page._page, "bbox"):
100
+ # PDFPlumber page bbox might have negative offsets
101
+ page_offset_x = -self.page._page.bbox[0]
102
+ page_offset_y = -self.page._page.bbox[1]
103
+ logger.debug(f"Applying highlight offset: x={page_offset_x}, y={page_offset_y}")
104
+
95
105
  for highlight in self.highlights:
96
106
  # Create a transparent overlay for this single highlight
97
107
  overlay = Image.new("RGBA", self.base_image.size, (0, 0, 0, 0))
@@ -101,7 +111,11 @@ class HighlightRenderer:
101
111
 
102
112
  if highlight.is_polygon:
103
113
  scaled_polygon = [
104
- (p[0] * self.scale_factor, p[1] * self.scale_factor) for p in highlight.polygon
114
+ (
115
+ (p[0] + page_offset_x) * self.scale_factor,
116
+ (p[1] + page_offset_y) * self.scale_factor,
117
+ )
118
+ for p in highlight.polygon
105
119
  ]
106
120
  # Draw polygon fill and border
107
121
  draw.polygon(
@@ -117,11 +131,16 @@ class HighlightRenderer:
117
131
  else: # Rectangle
118
132
  x0, top, x1, bottom = highlight.bbox
119
133
  x0_s, top_s, x1_s, bottom_s = (
120
- x0 * self.scale_factor,
121
- top * self.scale_factor,
122
- x1 * self.scale_factor,
123
- bottom * self.scale_factor,
134
+ (x0 + page_offset_x) * self.scale_factor,
135
+ (top + page_offset_y) * self.scale_factor,
136
+ (x1 + page_offset_x) * self.scale_factor,
137
+ (bottom + page_offset_y) * self.scale_factor,
124
138
  )
139
+ logger.debug(f"Original bbox: ({x0}, {top}, {x1}, {bottom})")
140
+ logger.debug(
141
+ f"Offset bbox: ({x0 + page_offset_x}, {top + page_offset_y}, {x1 + page_offset_x}, {bottom + page_offset_y})"
142
+ )
143
+ logger.debug(f"Scaled bbox: ({x0_s}, {top_s}, {x1_s}, {bottom_s})")
125
144
  scaled_bbox = [x0_s, top_s, x1_s, bottom_s]
126
145
  # Draw rectangle fill and border
127
146
  draw.rectangle(
@@ -1482,11 +1501,22 @@ class HighlightingService:
1482
1501
  offset_x = crop_offset[0] * scale_factor
1483
1502
  offset_y = crop_offset[1] * scale_factor
1484
1503
 
1504
+ # Add pdfplumber page offset for coordinate translation
1505
+ page_offset_x = 0
1506
+ page_offset_y = 0
1507
+ if hasattr(page, "_page") and hasattr(page._page, "bbox"):
1508
+ # PDFPlumber page bbox might have negative offsets
1509
+ page_offset_x = -page._page.bbox[0]
1510
+ page_offset_y = -page._page.bbox[1]
1511
+
1485
1512
  # Draw the highlight
1486
1513
  if polygon:
1487
1514
  # Scale polygon points and apply offset
1488
1515
  scaled_polygon = [
1489
- (p[0] * scale_factor - offset_x, p[1] * scale_factor - offset_y)
1516
+ (
1517
+ (p[0] + page_offset_x) * scale_factor - offset_x,
1518
+ (p[1] + page_offset_y) * scale_factor - offset_y,
1519
+ )
1490
1520
  for p in polygon
1491
1521
  ]
1492
1522
  draw.polygon(
@@ -1496,10 +1526,10 @@ class HighlightingService:
1496
1526
  # Scale bbox and apply offset
1497
1527
  x0, y0, x1, y1 = bbox
1498
1528
  scaled_bbox = [
1499
- x0 * scale_factor - offset_x,
1500
- y0 * scale_factor - offset_y,
1501
- x1 * scale_factor - offset_x,
1502
- y1 * scale_factor - offset_y,
1529
+ (x0 + page_offset_x) * scale_factor - offset_x,
1530
+ (y0 + page_offset_y) * scale_factor - offset_y,
1531
+ (x1 + page_offset_x) * scale_factor - offset_x,
1532
+ (y1 + page_offset_y) * scale_factor - offset_y,
1503
1533
  ]
1504
1534
  draw.rectangle(
1505
1535
  scaled_bbox, fill=color, outline=(color[0], color[1], color[2], BORDER_ALPHA)
@@ -106,6 +106,7 @@ class DirectionalMixin:
106
106
  include_source: bool = False,
107
107
  until: Optional[str] = None,
108
108
  include_endpoint: bool = True,
109
+ offset: float = 0.0,
109
110
  **kwargs,
110
111
  ) -> "Region":
111
112
  """
@@ -118,6 +119,7 @@ class DirectionalMixin:
118
119
  include_source: Whether to include this element/region's area in the result
119
120
  until: Optional selector string to specify a boundary element
120
121
  include_endpoint: Whether to include the boundary element found by 'until'
122
+ offset: Pixel offset when excluding source/endpoint (default: 0.0)
121
123
  **kwargs: Additional parameters for the 'until' selector search
122
124
 
123
125
  Returns:
@@ -127,7 +129,7 @@ class DirectionalMixin:
127
129
 
128
130
  is_horizontal = direction in ("left", "right")
129
131
  is_positive = direction in ("right", "below") # right/below are positive directions
130
- pixel_offset = 1 # Offset for excluding elements/endpoints
132
+ pixel_offset = offset # Use provided offset for excluding elements/endpoints
131
133
 
132
134
  # 1. Determine initial boundaries based on direction and include_source
133
135
  if is_horizontal:
@@ -260,6 +262,7 @@ class DirectionalMixin:
260
262
  include_source: bool = False,
261
263
  until: Optional[str] = None,
262
264
  include_endpoint: bool = True,
265
+ offset: float = 0.1,
263
266
  **kwargs,
264
267
  ) -> "Region":
265
268
  """
@@ -271,6 +274,7 @@ class DirectionalMixin:
271
274
  include_source: Whether to include this element/region in the result (default: False)
272
275
  until: Optional selector string to specify an upper boundary element
273
276
  include_endpoint: Whether to include the boundary element in the region (default: True)
277
+ offset: Pixel offset when excluding source/endpoint (default: 0.1)
274
278
  **kwargs: Additional parameters
275
279
 
276
280
  Returns:
@@ -295,6 +299,7 @@ class DirectionalMixin:
295
299
  include_source=include_source,
296
300
  until=until,
297
301
  include_endpoint=include_endpoint,
302
+ offset=offset,
298
303
  **kwargs,
299
304
  )
300
305
 
@@ -305,6 +310,7 @@ class DirectionalMixin:
305
310
  include_source: bool = False,
306
311
  until: Optional[str] = None,
307
312
  include_endpoint: bool = True,
313
+ offset: float = 0.1,
308
314
  **kwargs,
309
315
  ) -> "Region":
310
316
  """
@@ -316,6 +322,7 @@ class DirectionalMixin:
316
322
  include_source: Whether to include this element/region in the result (default: False)
317
323
  until: Optional selector string to specify a lower boundary element
318
324
  include_endpoint: Whether to include the boundary element in the region (default: True)
325
+ offset: Pixel offset when excluding source/endpoint (default: 0.1)
319
326
  **kwargs: Additional parameters
320
327
 
321
328
  Returns:
@@ -340,6 +347,7 @@ class DirectionalMixin:
340
347
  include_source=include_source,
341
348
  until=until,
342
349
  include_endpoint=include_endpoint,
350
+ offset=offset,
343
351
  **kwargs,
344
352
  )
345
353
 
@@ -350,6 +358,7 @@ class DirectionalMixin:
350
358
  include_source: bool = False,
351
359
  until: Optional[str] = None,
352
360
  include_endpoint: bool = True,
361
+ offset: float = 0.1,
353
362
  **kwargs,
354
363
  ) -> "Region":
355
364
  """
@@ -361,6 +370,7 @@ class DirectionalMixin:
361
370
  include_source: Whether to include this element/region in the result (default: False)
362
371
  until: Optional selector string to specify a left boundary element
363
372
  include_endpoint: Whether to include the boundary element in the region (default: True)
373
+ offset: Pixel offset when excluding source/endpoint (default: 0.1)
364
374
  **kwargs: Additional parameters
365
375
 
366
376
  Returns:
@@ -385,6 +395,7 @@ class DirectionalMixin:
385
395
  include_source=include_source,
386
396
  until=until,
387
397
  include_endpoint=include_endpoint,
398
+ offset=offset,
388
399
  **kwargs,
389
400
  )
390
401
 
@@ -395,6 +406,7 @@ class DirectionalMixin:
395
406
  include_source: bool = False,
396
407
  until: Optional[str] = None,
397
408
  include_endpoint: bool = True,
409
+ offset: float = 0.1,
398
410
  **kwargs,
399
411
  ) -> "Region":
400
412
  """
@@ -406,6 +418,7 @@ class DirectionalMixin:
406
418
  include_source: Whether to include this element/region in the result (default: False)
407
419
  until: Optional selector string to specify a right boundary element
408
420
  include_endpoint: Whether to include the boundary element in the region (default: True)
421
+ offset: Pixel offset when excluding source/endpoint (default: 0.1)
409
422
  **kwargs: Additional parameters
410
423
 
411
424
  Returns:
@@ -430,6 +443,7 @@ class DirectionalMixin:
430
443
  include_source=include_source,
431
444
  until=until,
432
445
  include_endpoint=include_endpoint,
446
+ offset=offset,
433
447
  **kwargs,
434
448
  )
435
449
 
@@ -1195,6 +1209,9 @@ class Element(
1195
1209
 
1196
1210
  return self
1197
1211
 
1212
+ def exclude(self):
1213
+ self.page.add_exclusion(self)
1214
+
1198
1215
  def _get_render_specs(
1199
1216
  self,
1200
1217
  mode: Literal["show", "render"] = "show",
@@ -888,6 +888,9 @@ class ElementCollection(
888
888
  self._elements.sort(key=key, reverse=reverse)
889
889
  return self
890
890
 
891
+ def exclude(self):
892
+ self.page.add_exclusion(self)
893
+
891
894
  def highlight(
892
895
  self,
893
896
  label: Optional[str] = None,
@@ -1902,13 +1905,87 @@ class ElementCollection(
1902
1905
 
1903
1906
  return ElementCollection(all_found_elements)
1904
1907
 
1905
- def extract_each_text(self, **kwargs) -> List[str]:
1906
- """
1907
- Extract text from each element in this region.
1908
+ def extract_each_text(
1909
+ self,
1910
+ order: Optional[Union[str, Callable[[T], Any]]] = None,
1911
+ *,
1912
+ newlines: bool = True,
1913
+ **kwargs,
1914
+ ) -> List[str]:
1915
+ """Return a list with the extracted text for every element.
1916
+
1917
+ Parameters
1918
+ ----------
1919
+ order
1920
+ Controls the ordering of elements **before** extraction:
1921
+
1922
+ * ``None`` (default) – keep the collection's current order.
1923
+ * ``callable`` – a function that will be used as ``key`` for :py:func:`sorted`.
1924
+ * ``"ltr"`` – left-to-right ordering (x0, then y-top).
1925
+ * ``"rtl"`` – right-to-left ordering (−x0, then y-top).
1926
+ * ``"natural"`` – natural reading order (y-top, then x0).
1927
+
1928
+ Remaining keyword arguments are forwarded to each element's
1929
+ :py:meth:`extract_text` method.
1908
1930
  """
1909
- return self.apply(
1910
- lambda element: element.extract_text(**kwargs) if element is not None else None
1911
- )
1931
+
1932
+ # -- Determine ordering --------------------------------------------------
1933
+ elements: List[T] = list(self._elements) # make a shallow copy we can sort
1934
+
1935
+ if order is not None and len(elements) > 1:
1936
+ try:
1937
+ if callable(order):
1938
+ elements.sort(key=order)
1939
+ elif isinstance(order, str):
1940
+ preset = order.lower()
1941
+ if preset in {"ltr", "left-to-right"}:
1942
+ elements.sort(
1943
+ key=lambda el: (
1944
+ (
1945
+ getattr(el, "page", None).index
1946
+ if hasattr(el, "page") and el.page
1947
+ else 0
1948
+ ),
1949
+ getattr(el, "x0", 0),
1950
+ getattr(el, "top", 0),
1951
+ )
1952
+ )
1953
+ elif preset in {"rtl", "right-to-left"}:
1954
+ elements.sort(
1955
+ key=lambda el: (
1956
+ (
1957
+ getattr(el, "page", None).index
1958
+ if hasattr(el, "page") and el.page
1959
+ else 0
1960
+ ),
1961
+ -getattr(el, "x0", 0),
1962
+ getattr(el, "top", 0),
1963
+ )
1964
+ )
1965
+ elif preset in {"natural", "tdlr", "top-down"}:
1966
+ elements.sort(
1967
+ key=lambda el: (
1968
+ (
1969
+ getattr(el, "page", None).index
1970
+ if hasattr(el, "page") and el.page
1971
+ else 0
1972
+ ),
1973
+ getattr(el, "top", 0),
1974
+ getattr(el, "x0", 0),
1975
+ )
1976
+ )
1977
+ else:
1978
+ # Unknown preset – silently ignore to keep original order
1979
+ pass
1980
+ except Exception:
1981
+ # If anything goes wrong, fall back to original order
1982
+ pass
1983
+
1984
+ # -- Extract ----------------------------------------------------------------
1985
+ return [
1986
+ el.extract_text(newlines=newlines, **kwargs) if el is not None else None # type: ignore[arg-type]
1987
+ for el in elements
1988
+ ]
1912
1989
 
1913
1990
  def correct_ocr(
1914
1991
  self,
@@ -2673,10 +2750,17 @@ class ElementCollection(
2673
2750
  else:
2674
2751
  v_dist = 0 # Vertically overlapping
2675
2752
 
2676
- # Use Chebyshev distance (max of horizontal and vertical)
2677
- # This creates a square proximity zone
2678
- distance = max(h_dist, v_dist)
2753
+ # ------------------------------------------------------------------
2754
+ # Decide connection logic based on vertical_gap parameter
2755
+ # ------------------------------------------------------------------
2756
+ if vertical_gap is not None:
2757
+ # Consider elements connected when they vertically stack within
2758
+ # the allowed gap **and** have some horizontal overlap
2759
+ horizontal_overlap = not (h_dist > 0)
2760
+ return horizontal_overlap and v_dist <= vertical_gap
2679
2761
 
2762
+ # Fallback to legacy Chebyshev distance using ``threshold``
2763
+ distance = max(h_dist, v_dist)
2680
2764
  return distance <= threshold
2681
2765
 
2682
2766
  def _merge_region_group(
@@ -2752,6 +2836,9 @@ class ElementCollection(
2752
2836
  def dissolve(
2753
2837
  self,
2754
2838
  padding: float = 2.0,
2839
+ *,
2840
+ vertical_gap: Optional[float] = None,
2841
+ vertical: Optional[bool] = False,
2755
2842
  geometry: Literal["rect", "polygon"] = "rect",
2756
2843
  group_by: List[str] = None,
2757
2844
  ) -> "ElementCollection":
@@ -2764,8 +2851,19 @@ class ElementCollection(
2764
2851
  bounding boxes.
2765
2852
 
2766
2853
  Args:
2767
- padding: Maximum distance in points between elements to consider
2768
- them connected. Default is 2.0 points.
2854
+ padding: Maximum Chebyshev distance (in any direction) between
2855
+ elements to consider them connected **when ``vertical_gap`` is
2856
+ not provided**. Default 2.0 pt.
2857
+
2858
+ vertical_gap: If given, switches to *stack-aware* dissolve:
2859
+ two elements are connected when their horizontal projections
2860
+ overlap (any amount) **and** the vertical distance between them
2861
+ is ≤ ``vertical_gap``. This lets you combine multi-line labels
2862
+ that share the same column but have blank space between lines.
2863
+
2864
+ vertical: If True, sets vertical_gap to infinity (overriding any value
2865
+ passed for vertical_gap) to allow for easy vertical stacking.
2866
+
2769
2867
  geometry: Type of geometry to use for merged regions. Currently only
2770
2868
  "rect" (bounding box) is supported. "polygon" will raise
2771
2869
  NotImplementedError.
@@ -2807,6 +2905,9 @@ class ElementCollection(
2807
2905
  if geometry not in ["rect", "polygon"]:
2808
2906
  raise ValueError(f"Invalid geometry type: {geometry}. Must be 'rect' or 'polygon'")
2809
2907
 
2908
+ if vertical:
2909
+ vertical_gap = float("inf")
2910
+
2810
2911
  from natural_pdf.elements.region import Region
2811
2912
 
2812
2913
  # Filter to elements with bbox (all elements that can be dissolved)
@@ -2835,7 +2936,9 @@ class ElementCollection(
2835
2936
  logger.debug(f"Processing group {group_key} with {len(group_elements)} elements")
2836
2937
 
2837
2938
  # Find connected components within this group
2838
- components = self._find_connected_components_elements(group_elements, padding)
2939
+ components = self._find_connected_components_elements(
2940
+ group_elements, padding, vertical_gap
2941
+ )
2839
2942
 
2840
2943
  # Merge each component
2841
2944
  for component_elements in components:
@@ -2894,7 +2997,7 @@ class ElementCollection(
2894
2997
  return groups
2895
2998
 
2896
2999
  def _find_connected_components_elements(
2897
- self, elements: List["Element"], padding: float
3000
+ self, elements: List["Element"], padding: float, vertical_gap: Optional[float] = None
2898
3001
  ) -> List[List["Element"]]:
2899
3002
  """Find connected components among elements using union-find."""
2900
3003
  if not elements:
@@ -2919,7 +3022,7 @@ class ElementCollection(
2919
3022
  # Check all pairs of elements for connectivity
2920
3023
  for i in range(len(elements)):
2921
3024
  for j in range(i + 1, len(elements)):
2922
- if self._are_elements_connected(elements[i], elements[j], padding):
3025
+ if self._are_elements_connected(elements[i], elements[j], padding, vertical_gap):
2923
3026
  union(i, j)
2924
3027
 
2925
3028
  # Group elements by their connected component
@@ -3004,7 +3107,9 @@ class ElementCollection(
3004
3107
 
3005
3108
  return merged_region
3006
3109
 
3007
- def _are_elements_connected(self, elem1: "Element", elem2: "Element", threshold: float) -> bool:
3110
+ def _are_elements_connected(
3111
+ self, elem1: "Element", elem2: "Element", threshold: float, vertical_gap: float | None
3112
+ ) -> bool:
3008
3113
  """Check if two elements are connected (adjacent or overlapping)."""
3009
3114
  # Check if elements are on the same page
3010
3115
  # Handle edge cases where elements might not have a page attribute
@@ -3057,6 +3162,12 @@ class ElementCollection(
3057
3162
  # This creates a square proximity zone
3058
3163
  distance = max(h_dist, v_dist)
3059
3164
 
3165
+ if vertical_gap is not None:
3166
+ # 1. vertical distance ≤ vertical_gap
3167
+ # 2. horizontal ranges overlap OR touch
3168
+ h_overlap = (min(x1_1, x1_2) - max(x0_1, x0_2)) >= 0
3169
+ return h_overlap and v_dist <= vertical_gap
3170
+
3060
3171
  return distance <= threshold
3061
3172
 
3062
3173
  def _copy_element_attributes_to_region(
@@ -3163,3 +3274,30 @@ class ElementCollection(
3163
3274
  return self
3164
3275
 
3165
3276
  # ------------------------------------------------------------------
3277
+
3278
+ # ------------------------------------------------------------------
3279
+ # Public alias: combine
3280
+ # ------------------------------------------------------------------
3281
+ def combine(
3282
+ self,
3283
+ padding: float = 2.0,
3284
+ *,
3285
+ vertical_gap: Optional[float] = None,
3286
+ vertical: Optional[bool] = False,
3287
+ geometry: Literal["rect", "polygon"] = "rect",
3288
+ group_by: List[str] = None,
3289
+ ) -> "ElementCollection":
3290
+ """Alias for :py:meth:`dissolve` – retained for discoverability.
3291
+
3292
+ Many users find the verb *combine* more intuitive than *dissolve* when
3293
+ merging nearby or stacked elements into unified Regions. The parameters
3294
+ are identical; see :py:meth:`dissolve` for full documentation.
3295
+ """
3296
+
3297
+ return self.dissolve(
3298
+ padding=padding,
3299
+ vertical_gap=vertical_gap,
3300
+ vertical=vertical,
3301
+ geometry=geometry,
3302
+ group_by=group_by,
3303
+ )
@@ -88,6 +88,40 @@ class RectangleElement(Element):
88
88
  """Get the stroke width of the rectangle."""
89
89
  return self._obj.get("linewidth", 0)
90
90
 
91
+ @property
92
+ def is_horizontal(self) -> bool:
93
+ """Check if this is a horizontal line based on coordinates."""
94
+ # Calculate absolute difference in coordinates
95
+ dx = abs(self.x1 - self.x0)
96
+ dy = abs(self.top - self.bottom)
97
+
98
+ # Define a tolerance for near-horizontal lines (e.g., 1 point)
99
+ tolerance = 1.0
100
+
101
+ # Horizontal if y-change is within tolerance and x-change is significant
102
+ return dy <= tolerance and dx > tolerance
103
+
104
+ @property
105
+ def is_vertical(self) -> bool:
106
+ """Check if this is a vertical line based on coordinates."""
107
+ # Calculate absolute difference in coordinates
108
+ dx = abs(self.x1 - self.x0)
109
+ dy = abs(self.top - self.bottom)
110
+
111
+ # Define a tolerance for near-vertical lines (e.g., 1 point)
112
+ tolerance = 1.0
113
+
114
+ # Vertical if x-change is within tolerance and y-change is significant
115
+ return dx <= tolerance and dy > tolerance
116
+
117
+ @property
118
+ def orientation(self) -> str:
119
+ """Get the orientation of this element: 'horizontal', 'vertical', or None when it is neither."""
120
+ if self.is_horizontal:
121
+ return "horizontal"
122
+ elif self.is_vertical:
123
+ return "vertical"
124
+
91
125
  def extract_text(self, **kwargs) -> str:
92
126
  """
93
127
  Extract text from inside this rectangle.
@@ -45,6 +45,7 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
45
45
 
46
46
  # Import new utils
47
47
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
48
+ from natural_pdf.vision.mixin import VisualSearchMixin
48
49
 
49
50
  # Import viewer widget support
50
51
  from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
@@ -80,6 +81,7 @@ class Region(
80
81
  ExtractionMixin,
81
82
  ShapeDetectionMixin,
82
83
  DescribeMixin,
84
+ VisualSearchMixin,
83
85
  Visualizable,
84
86
  ):
85
87
  """Represents a rectangular region on a page.
@@ -736,6 +738,9 @@ class Region(
736
738
  and self.bottom > element.top
737
739
  )
738
740
 
741
+ def exclude(self):
742
+ self.page.add_exclusion(self)
743
+
739
744
  def highlight(
740
745
  self,
741
746
  label: Optional[str] = None,
@@ -1227,7 +1232,13 @@ class Region(
1227
1232
  return [e for e in page_elements if self._is_element_in_region(e)]
1228
1233
 
1229
1234
  def extract_text(
1230
- self, apply_exclusions=True, debug=False, content_filter=None, **kwargs
1235
+ self,
1236
+ apply_exclusions: bool = True,
1237
+ debug: bool = False,
1238
+ *,
1239
+ newlines: Union[bool, str] = True,
1240
+ content_filter=None,
1241
+ **kwargs,
1231
1242
  ) -> str:
1232
1243
  """
1233
1244
  Extract text from this region, respecting page exclusions and using pdfplumber's
@@ -1236,6 +1247,7 @@ class Region(
1236
1247
  Args:
1237
1248
  apply_exclusions: Whether to apply exclusion regions defined on the parent page.
1238
1249
  debug: Enable verbose debugging output for filtering steps.
1250
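+ newlines: Newline handling. True (default) keeps newlines as-is; False replaces each newline or carriage-return character with a single space; a string replaces each of them with that string.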
1239
1251
  content_filter: Optional content filter to exclude specific text patterns. Can be:
1240
1252
  - A regex pattern string (characters matching the pattern are EXCLUDED)
1241
1253
  - A callable that takes text and returns True to KEEP the character
@@ -1309,6 +1321,18 @@ class Region(
1309
1321
  user_kwargs=final_kwargs, # Pass kwargs including content_filter
1310
1322
  )
1311
1323
 
1324
+ # Flexible newline handling (same logic as TextElement)
1325
+ if isinstance(newlines, bool):
1326
+ if newlines is False:
1327
+ replacement = " "
1328
+ else:
1329
+ replacement = None
1330
+ else:
1331
+ replacement = str(newlines)
1332
+
1333
+ if replacement is not None:
1334
+ result = result.replace("\n", replacement).replace("\r", replacement)
1335
+
1312
1336
  logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
1313
1337
  return result
1314
1338
 
@@ -1692,7 +1716,21 @@ class Region(
1692
1716
  else:
1693
1717
  filtered_page = base_plumber_page
1694
1718
 
1695
- cropped = filtered_page.crop(self.bbox)
1719
+ # Ensure bbox is within pdfplumber page bounds
1720
+ page_bbox = filtered_page.bbox
1721
+ clipped_bbox = (
1722
+ max(self.bbox[0], page_bbox[0]), # x0
1723
+ max(self.bbox[1], page_bbox[1]), # y0
1724
+ min(self.bbox[2], page_bbox[2]), # x1
1725
+ min(self.bbox[3], page_bbox[3]), # y1
1726
+ )
1727
+
1728
+ # Only crop if the clipped bbox is valid (has positive width and height)
1729
+ if clipped_bbox[2] > clipped_bbox[0] and clipped_bbox[3] > clipped_bbox[1]:
1730
+ cropped = filtered_page.crop(clipped_bbox)
1731
+ else:
1732
+ # If the region is completely outside the page bounds, return empty list
1733
+ return []
1696
1734
 
1697
1735
  # Extract all tables from the cropped area
1698
1736
  tables = cropped.extract_tables(table_settings)
@@ -1786,7 +1824,21 @@ class Region(
1786
1824
  filtered_page = base_plumber_page
1787
1825
 
1788
1826
  # Now crop the (possibly filtered) page to the region bbox
1789
- cropped = filtered_page.crop(self.bbox)
1827
+ # Ensure bbox is within pdfplumber page bounds
1828
+ page_bbox = filtered_page.bbox
1829
+ clipped_bbox = (
1830
+ max(self.bbox[0], page_bbox[0]), # x0
1831
+ max(self.bbox[1], page_bbox[1]), # y0
1832
+ min(self.bbox[2], page_bbox[2]), # x1
1833
+ min(self.bbox[3], page_bbox[3]), # y1
1834
+ )
1835
+
1836
+ # Only crop if the clipped bbox is valid (has positive width and height)
1837
+ if clipped_bbox[2] > clipped_bbox[0] and clipped_bbox[3] > clipped_bbox[1]:
1838
+ cropped = filtered_page.crop(clipped_bbox)
1839
+ else:
1840
+ # If the region is completely outside the page bounds, return empty table
1841
+ return []
1790
1842
 
1791
1843
  # Extract the single largest table from the cropped area
1792
1844
  table = cropped.extract_table(table_settings)
@@ -2,7 +2,7 @@
2
2
  Text element classes for natural-pdf.
3
3
  """
4
4
 
5
- from typing import TYPE_CHECKING, Any, Dict, Optional
5
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Union
6
6
 
7
7
  from natural_pdf.elements.base import Element
8
8
 
@@ -236,7 +236,13 @@ class TextElement(Element):
236
236
  return (0, 0, 0)
237
237
 
238
238
  def extract_text(
239
- self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs
239
+ self,
240
+ keep_blank_chars: bool = True,
241
+ strip: Optional[bool] = True,
242
+ *,
243
+ newlines: Union[bool, str] = True,
244
+ content_filter=None,
245
+ **kwargs,
240
246
  ) -> str:
241
247
  """
242
248
  Extract text from this element.
@@ -292,6 +298,18 @@ class TextElement(Element):
292
298
  if strip:
293
299
  result = result.strip()
294
300
 
301
+ # Flexible newline handling
302
+ if isinstance(newlines, bool):
303
+ if newlines is False:
304
+ replacement = " " # single space when False
305
+ else:
306
+ replacement = None # keep as-is when True
307
+ else:
308
+ replacement = str(newlines)
309
+
310
+ if replacement is not None:
311
+ result = result.replace("\n", replacement).replace("\r", replacement)
312
+
295
313
  return result
296
314
 
297
315
  def contains(self, substring: str, case_sensitive: bool = True) -> bool: