natural-pdf 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -128,19 +128,59 @@ class GuidesList(UserList):
128
128
  """A list of guide coordinates that also provides methods for creating guides."""
129
129
 
130
130
  def __init__(self, parent_guides: "Guides", axis: Literal["vertical", "horizontal"], data=None):
131
- super().__init__(data or [])
131
+ # Always sort the initial data
132
+ super().__init__(sorted(data) if data else [])
132
133
  self._parent = parent_guides
133
134
  self._axis = axis
134
135
 
135
136
  def __getitem__(self, i):
136
- """Override to handle slicing properly."""
137
+ """Override to handle slicing and negative indexing properly."""
137
138
  if isinstance(i, slice):
138
139
  # Return a new GuidesList with the sliced data
139
140
  return self.__class__(self._parent, self._axis, self.data[i])
140
141
  else:
141
- # For single index, return the value directly
142
+ # For single index, handle negative indices properly
143
+ if i < 0:
144
+ # Convert negative index to positive
145
+ i = len(self.data) + i
142
146
  return self.data[i]
143
147
 
148
+ def __setitem__(self, i, item):
149
+ """Override to maintain sorted order."""
150
+ self.data[i] = item
151
+ self.data.sort()
152
+
153
+ def append(self, item):
154
+ """Override to maintain sorted order."""
155
+ self.data.append(item)
156
+ self.data.sort()
157
+
158
+ def extend(self, other):
159
+ """Override to maintain sorted order."""
160
+ self.data.extend(other)
161
+ self.data.sort()
162
+
163
+ def insert(self, i, item):
164
+ """Override to maintain sorted order."""
165
+ self.data.append(item) # Just append and sort
166
+ self.data.sort()
167
+
168
+ def __iadd__(self, other):
169
+ """Override to maintain sorted order."""
170
+ self.data.extend(other)
171
+ self.data.sort()
172
+ return self
173
+
174
+ @property
175
+ def data(self):
176
+ """Get the data list."""
177
+ return self._data
178
+
179
+ @data.setter
180
+ def data(self, value):
181
+ """Set the data list, always keeping it sorted."""
182
+ self._data = sorted(value) if value else []
183
+
144
184
  def from_content(
145
185
  self,
146
186
  markers: Union[str, List[str], "ElementCollection", Callable, None],
@@ -1842,6 +1882,370 @@ class Guides:
1842
1882
  self.horizontal.pop(index)
1843
1883
  return self
1844
1884
 
1885
+ # -------------------------------------------------------------------------
1886
+ # Region extraction properties
1887
+ # -------------------------------------------------------------------------
1888
+
1889
+ @property
1890
+ def columns(self):
1891
+ """Access columns by index like guides.columns[0]."""
1892
+ return _ColumnAccessor(self)
1893
+
1894
+ @property
1895
+ def rows(self):
1896
+ """Access rows by index like guides.rows[0]."""
1897
+ return _RowAccessor(self)
1898
+
1899
+ @property
1900
+ def cells(self):
1901
+ """Access cells by index like guides.cells[row][col] or guides.cells[row, col]."""
1902
+ return _CellAccessor(self)
1903
+
1904
+ # -------------------------------------------------------------------------
1905
+ # Region extraction methods (alternative API)
1906
+ # -------------------------------------------------------------------------
1907
+
1908
+ def column(self, index: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
1909
+ """
1910
+ Get a column region from the guides.
1911
+
1912
+ Args:
1913
+ index: Column index (0-based)
1914
+ obj: Page or Region to create the column on (uses self.context if None)
1915
+
1916
+ Returns:
1917
+ Region representing the specified column
1918
+
1919
+ Raises:
1920
+ IndexError: If column index is out of range
1921
+ """
1922
+ target = obj or self.context
1923
+ if target is None:
1924
+ raise ValueError("No context available for region creation")
1925
+
1926
+ if not self.vertical or index < 0 or index >= len(self.vertical) - 1:
1927
+ raise IndexError(
1928
+ f"Column index {index} out of range (have {len(self.vertical)-1} columns)"
1929
+ )
1930
+
1931
+ # Get bounds from context
1932
+ bounds = self._get_context_bounds()
1933
+ if not bounds:
1934
+ raise ValueError("Could not determine bounds")
1935
+ _, y0, _, y1 = bounds
1936
+
1937
+ # Get column boundaries
1938
+ x0 = self.vertical[index]
1939
+ x1 = self.vertical[index + 1]
1940
+
1941
+ # Create region using absolute coordinates
1942
+ if hasattr(target, "region"):
1943
+ # Target has a region method (Page)
1944
+ return target.region(x0, y0, x1, y1)
1945
+ elif hasattr(target, "page"):
1946
+ # Target is a Region, use its parent page
1947
+ # The coordinates from guides are already absolute
1948
+ return target.page.region(x0, y0, x1, y1)
1949
+ else:
1950
+ raise TypeError(f"Cannot create region on {type(target)}")
1951
+
1952
+ def row(self, index: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
1953
+ """
1954
+ Get a row region from the guides.
1955
+
1956
+ Args:
1957
+ index: Row index (0-based)
1958
+ obj: Page or Region to create the row on (uses self.context if None)
1959
+
1960
+ Returns:
1961
+ Region representing the specified row
1962
+
1963
+ Raises:
1964
+ IndexError: If row index is out of range
1965
+ """
1966
+ target = obj or self.context
1967
+ if target is None:
1968
+ raise ValueError("No context available for region creation")
1969
+
1970
+ if not self.horizontal or index < 0 or index >= len(self.horizontal) - 1:
1971
+ raise IndexError(f"Row index {index} out of range (have {len(self.horizontal)-1} rows)")
1972
+
1973
+ # Get bounds from context
1974
+ bounds = self._get_context_bounds()
1975
+ if not bounds:
1976
+ raise ValueError("Could not determine bounds")
1977
+ x0, _, x1, _ = bounds
1978
+
1979
+ # Get row boundaries
1980
+ y0 = self.horizontal[index]
1981
+ y1 = self.horizontal[index + 1]
1982
+
1983
+ # Create region using absolute coordinates
1984
+ if hasattr(target, "region"):
1985
+ # Target has a region method (Page)
1986
+ return target.region(x0, y0, x1, y1)
1987
+ elif hasattr(target, "page"):
1988
+ # Target is a Region, use its parent page
1989
+ # The coordinates from guides are already absolute
1990
+ return target.page.region(x0, y0, x1, y1)
1991
+ else:
1992
+ raise TypeError(f"Cannot create region on {type(target)}")
1993
+
1994
+ def cell(self, row: int, col: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
1995
+ """
1996
+ Get a cell region from the guides.
1997
+
1998
+ Args:
1999
+ row: Row index (0-based)
2000
+ col: Column index (0-based)
2001
+ obj: Page or Region to create the cell on (uses self.context if None)
2002
+
2003
+ Returns:
2004
+ Region representing the specified cell
2005
+
2006
+ Raises:
2007
+ IndexError: If row or column index is out of range
2008
+ """
2009
+ target = obj or self.context
2010
+ if target is None:
2011
+ raise ValueError("No context available for region creation")
2012
+
2013
+ if not self.vertical or col < 0 or col >= len(self.vertical) - 1:
2014
+ raise IndexError(
2015
+ f"Column index {col} out of range (have {len(self.vertical)-1} columns)"
2016
+ )
2017
+ if not self.horizontal or row < 0 or row >= len(self.horizontal) - 1:
2018
+ raise IndexError(f"Row index {row} out of range (have {len(self.horizontal)-1} rows)")
2019
+
2020
+ # Get cell boundaries
2021
+ x0 = self.vertical[col]
2022
+ x1 = self.vertical[col + 1]
2023
+ y0 = self.horizontal[row]
2024
+ y1 = self.horizontal[row + 1]
2025
+
2026
+ # Create region using absolute coordinates
2027
+ if hasattr(target, "region"):
2028
+ # Target has a region method (Page)
2029
+ return target.region(x0, y0, x1, y1)
2030
+ elif hasattr(target, "page"):
2031
+ # Target is a Region, use its parent page
2032
+ # The coordinates from guides are already absolute
2033
+ return target.page.region(x0, y0, x1, y1)
2034
+ else:
2035
+ raise TypeError(f"Cannot create region on {type(target)}")
2036
+
2037
+ def left_of(self, guide_index: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
2038
+ """
2039
+ Get a region to the left of a vertical guide.
2040
+
2041
+ Args:
2042
+ guide_index: Vertical guide index
2043
+ obj: Page or Region to create the region on (uses self.context if None)
2044
+
2045
+ Returns:
2046
+ Region to the left of the specified guide
2047
+ """
2048
+ target = obj or self.context
2049
+ if target is None:
2050
+ raise ValueError("No context available for region creation")
2051
+
2052
+ if not self.vertical or guide_index < 0 or guide_index >= len(self.vertical):
2053
+ raise IndexError(f"Guide index {guide_index} out of range")
2054
+
2055
+ # Get bounds from context
2056
+ bounds = self._get_context_bounds()
2057
+ if not bounds:
2058
+ raise ValueError("Could not determine bounds")
2059
+ x0, y0, _, y1 = bounds
2060
+
2061
+ # Create region from left edge to guide
2062
+ x1 = self.vertical[guide_index]
2063
+
2064
+ if hasattr(target, "region"):
2065
+ return target.region(x0, y0, x1, y1)
2066
+ else:
2067
+ raise TypeError(f"Cannot create region on {type(target)}")
2068
+
2069
+ def right_of(self, guide_index: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
2070
+ """
2071
+ Get a region to the right of a vertical guide.
2072
+
2073
+ Args:
2074
+ guide_index: Vertical guide index
2075
+ obj: Page or Region to create the region on (uses self.context if None)
2076
+
2077
+ Returns:
2078
+ Region to the right of the specified guide
2079
+ """
2080
+ target = obj or self.context
2081
+ if target is None:
2082
+ raise ValueError("No context available for region creation")
2083
+
2084
+ if not self.vertical or guide_index < 0 or guide_index >= len(self.vertical):
2085
+ raise IndexError(f"Guide index {guide_index} out of range")
2086
+
2087
+ # Get bounds from context
2088
+ bounds = self._get_context_bounds()
2089
+ if not bounds:
2090
+ raise ValueError("Could not determine bounds")
2091
+ _, y0, x1, y1 = bounds
2092
+
2093
+ # Create region from guide to right edge
2094
+ x0 = self.vertical[guide_index]
2095
+
2096
+ if hasattr(target, "region"):
2097
+ return target.region(x0, y0, x1, y1)
2098
+ else:
2099
+ raise TypeError(f"Cannot create region on {type(target)}")
2100
+
2101
+ def above(self, guide_index: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
2102
+ """
2103
+ Get a region above a horizontal guide.
2104
+
2105
+ Args:
2106
+ guide_index: Horizontal guide index
2107
+ obj: Page or Region to create the region on (uses self.context if None)
2108
+
2109
+ Returns:
2110
+ Region above the specified guide
2111
+ """
2112
+ target = obj or self.context
2113
+ if target is None:
2114
+ raise ValueError("No context available for region creation")
2115
+
2116
+ if not self.horizontal or guide_index < 0 or guide_index >= len(self.horizontal):
2117
+ raise IndexError(f"Guide index {guide_index} out of range")
2118
+
2119
+ # Get bounds from context
2120
+ bounds = self._get_context_bounds()
2121
+ if not bounds:
2122
+ raise ValueError("Could not determine bounds")
2123
+ x0, y0, x1, _ = bounds
2124
+
2125
+ # Create region from top edge to guide
2126
+ y1 = self.horizontal[guide_index]
2127
+
2128
+ if hasattr(target, "region"):
2129
+ return target.region(x0, y0, x1, y1)
2130
+ else:
2131
+ raise TypeError(f"Cannot create region on {type(target)}")
2132
+
2133
+ def below(self, guide_index: int, obj: Optional[Union["Page", "Region"]] = None) -> "Region":
2134
+ """
2135
+ Get a region below a horizontal guide.
2136
+
2137
+ Args:
2138
+ guide_index: Horizontal guide index
2139
+ obj: Page or Region to create the region on (uses self.context if None)
2140
+
2141
+ Returns:
2142
+ Region below the specified guide
2143
+ """
2144
+ target = obj or self.context
2145
+ if target is None:
2146
+ raise ValueError("No context available for region creation")
2147
+
2148
+ if not self.horizontal or guide_index < 0 or guide_index >= len(self.horizontal):
2149
+ raise IndexError(f"Guide index {guide_index} out of range")
2150
+
2151
+ # Get bounds from context
2152
+ bounds = self._get_context_bounds()
2153
+ if not bounds:
2154
+ raise ValueError("Could not determine bounds")
2155
+ x0, _, x1, y1 = bounds
2156
+
2157
+ # Create region from guide to bottom edge
2158
+ y0 = self.horizontal[guide_index]
2159
+
2160
+ if hasattr(target, "region"):
2161
+ return target.region(x0, y0, x1, y1)
2162
+ else:
2163
+ raise TypeError(f"Cannot create region on {type(target)}")
2164
+
2165
+ def between_vertical(
2166
+ self, start_index: int, end_index: int, obj: Optional[Union["Page", "Region"]] = None
2167
+ ) -> "Region":
2168
+ """
2169
+ Get a region between two vertical guides.
2170
+
2171
+ Args:
2172
+ start_index: Starting vertical guide index
2173
+ end_index: Ending vertical guide index
2174
+ obj: Page or Region to create the region on (uses self.context if None)
2175
+
2176
+ Returns:
2177
+ Region between the specified guides
2178
+ """
2179
+ target = obj or self.context
2180
+ if target is None:
2181
+ raise ValueError("No context available for region creation")
2182
+
2183
+ if not self.vertical:
2184
+ raise ValueError("No vertical guides available")
2185
+ if start_index < 0 or start_index >= len(self.vertical):
2186
+ raise IndexError(f"Start index {start_index} out of range")
2187
+ if end_index < 0 or end_index >= len(self.vertical):
2188
+ raise IndexError(f"End index {end_index} out of range")
2189
+ if start_index >= end_index:
2190
+ raise ValueError("Start index must be less than end index")
2191
+
2192
+ # Get bounds from context
2193
+ bounds = self._get_context_bounds()
2194
+ if not bounds:
2195
+ raise ValueError("Could not determine bounds")
2196
+ _, y0, _, y1 = bounds
2197
+
2198
+ # Get horizontal boundaries
2199
+ x0 = self.vertical[start_index]
2200
+ x1 = self.vertical[end_index]
2201
+
2202
+ if hasattr(target, "region"):
2203
+ return target.region(x0, y0, x1, y1)
2204
+ else:
2205
+ raise TypeError(f"Cannot create region on {type(target)}")
2206
+
2207
+ def between_horizontal(
2208
+ self, start_index: int, end_index: int, obj: Optional[Union["Page", "Region"]] = None
2209
+ ) -> "Region":
2210
+ """
2211
+ Get a region between two horizontal guides.
2212
+
2213
+ Args:
2214
+ start_index: Starting horizontal guide index
2215
+ end_index: Ending horizontal guide index
2216
+ obj: Page or Region to create the region on (uses self.context if None)
2217
+
2218
+ Returns:
2219
+ Region between the specified guides
2220
+ """
2221
+ target = obj or self.context
2222
+ if target is None:
2223
+ raise ValueError("No context available for region creation")
2224
+
2225
+ if not self.horizontal:
2226
+ raise ValueError("No horizontal guides available")
2227
+ if start_index < 0 or start_index >= len(self.horizontal):
2228
+ raise IndexError(f"Start index {start_index} out of range")
2229
+ if end_index < 0 or end_index >= len(self.horizontal):
2230
+ raise IndexError(f"End index {end_index} out of range")
2231
+ if start_index >= end_index:
2232
+ raise ValueError("Start index must be less than end index")
2233
+
2234
+ # Get bounds from context
2235
+ bounds = self._get_context_bounds()
2236
+ if not bounds:
2237
+ raise ValueError("Could not determine bounds")
2238
+ x0, _, x1, _ = bounds
2239
+
2240
+ # Get vertical boundaries
2241
+ y0 = self.horizontal[start_index]
2242
+ y1 = self.horizontal[end_index]
2243
+
2244
+ if hasattr(target, "region"):
2245
+ return target.region(x0, y0, x1, y1)
2246
+ else:
2247
+ raise TypeError(f"Cannot create region on {type(target)}")
2248
+
1845
2249
  # -------------------------------------------------------------------------
1846
2250
  # Operations
1847
2251
  # -------------------------------------------------------------------------
@@ -3825,3 +4229,95 @@ class Guides:
3825
4229
  return "vertical"
3826
4230
  else:
3827
4231
  return "horizontal"
4232
+
4233
+
4234
+ # -------------------------------------------------------------------------
4235
+ # Accessor classes for property-based access
4236
+ # -------------------------------------------------------------------------
4237
+
4238
+
4239
+ class _ColumnAccessor:
4240
+ """Provides indexed access to columns via guides.columns[index]."""
4241
+
4242
+ def __init__(self, guides: "Guides"):
4243
+ self._guides = guides
4244
+
4245
+ def __len__(self):
4246
+ """Return number of columns (vertical guides - 1)."""
4247
+ return max(0, len(self._guides.vertical) - 1)
4248
+
4249
+ def __getitem__(self, index: int) -> "Region":
4250
+ """Get column at the specified index."""
4251
+ # Handle negative indexing
4252
+ if index < 0:
4253
+ index = len(self) + index
4254
+ return self._guides.column(index)
4255
+
4256
+
4257
+ class _RowAccessor:
4258
+ """Provides indexed access to rows via guides.rows[index]."""
4259
+
4260
+ def __init__(self, guides: "Guides"):
4261
+ self._guides = guides
4262
+
4263
+ def __len__(self):
4264
+ """Return number of rows (horizontal guides - 1)."""
4265
+ return max(0, len(self._guides.horizontal) - 1)
4266
+
4267
+ def __getitem__(self, index: int) -> "Region":
4268
+ """Get row at the specified index."""
4269
+ # Handle negative indexing
4270
+ if index < 0:
4271
+ index = len(self) + index
4272
+ return self._guides.row(index)
4273
+
4274
+
4275
+ class _CellAccessor:
4276
+ """Provides indexed access to cells via guides.cells[row][col] or guides.cells[row, col]."""
4277
+
4278
+ def __init__(self, guides: "Guides"):
4279
+ self._guides = guides
4280
+
4281
+ def __getitem__(self, key) -> Union["Region", "_CellRowAccessor"]:
4282
+ """
4283
+ Get cell(s) at the specified position.
4284
+
4285
+ Supports:
4286
+ - guides.cells[row, col] - tuple indexing
4287
+ - guides.cells[row][col] - nested indexing
4288
+ """
4289
+ if isinstance(key, tuple) and len(key) == 2:
4290
+ # Direct tuple access: guides.cells[row, col]
4291
+ row, col = key
4292
+ # Handle negative indexing for both row and col
4293
+ if row < 0:
4294
+ row = len(self._guides.rows) + row
4295
+ if col < 0:
4296
+ col = len(self._guides.columns) + col
4297
+ return self._guides.cell(row, col)
4298
+ elif isinstance(key, int):
4299
+ # First level of nested access: guides.cells[row]
4300
+ # Handle negative indexing for row
4301
+ if key < 0:
4302
+ key = len(self._guides.rows) + key
4303
+ # Return a row accessor that allows [col] indexing
4304
+ return _CellRowAccessor(self._guides, key)
4305
+ else:
4306
+ raise TypeError(
4307
+ f"Cell indices must be integers or tuple of two integers, got {type(key)}"
4308
+ )
4309
+
4310
+
4311
+ class _CellRowAccessor:
4312
+ """Provides column access for a specific row in nested cell indexing."""
4313
+
4314
+ def __init__(self, guides: "Guides", row: int):
4315
+ self._guides = guides
4316
+ self._row = row
4317
+
4318
+ def __getitem__(self, col: int) -> "Region":
4319
+ """Get cell at [row][col]."""
4320
+ # Handle negative indexing for column
4321
+ if col < 0:
4322
+ col = len(self._guides.columns) + col
4323
+ return self._guides.cell(self._row, col)
natural_pdf/cli.py CHANGED
@@ -16,7 +16,7 @@ INSTALL_RECIPES: Dict[str, list[str]] = {
16
16
  "paddle": ["paddlepaddle>=3.0.0", "paddleocr>=3.0.1", "paddlex>=3.0.2", "pandas>=2.2.0"],
17
17
  "numpy-high": ["numpy>=2.0"],
18
18
  "numpy-low": ["numpy<1.27"],
19
- "surya": ["surya-ocr>=0.13.0"],
19
+ "surya": ["surya-ocr<0.15"],
20
20
  "yolo": ["doclayout_yolo", "huggingface_hub>=0.29.3"],
21
21
  "docling": ["docling"],
22
22
  # light helpers
@@ -633,9 +633,7 @@ class ElementCollection(
633
633
  pdfplumber's layout engine if layout=True is specified.
634
634
 
635
635
  Args:
636
- separator: String to insert between text from different elements when
637
- using simple joining (layout=False). Default is a single space.
638
- Ignored when layout=True as the layout engine handles spacing.
636
+ separator: String to join text from elements. Default is a single space.
639
637
  preserve_whitespace: Deprecated. Use layout=False for simple joining.
640
638
  use_exclusions: Deprecated. Exclusions should be applied *before* creating
641
639
  the collection or by filtering the collection itself.
@@ -652,15 +650,49 @@ class ElementCollection(
652
650
  Returns:
653
651
  Combined text from elements, potentially with layout-based spacing.
654
652
  """
655
- # Filter to just TextElements that likely have _char_dicts
656
- text_elements = [
653
+ # Check if we have any elements at all
654
+ if not self._elements:
655
+ return ""
656
+
657
+ # Check if all elements are TextElements with character data
658
+ text_elements_with_chars = [
657
659
  el
658
660
  for el in self._elements
659
- if isinstance(el, TextElement) and hasattr(el, "_char_dicts")
661
+ if isinstance(el, TextElement) and hasattr(el, "_char_dicts") and el._char_dicts
660
662
  ]
661
663
 
662
- if not text_elements:
663
- return ""
664
+ # If we have a mixed collection (Regions, TextElements without chars, etc),
665
+ # use a simpler approach: call extract_text on each element
666
+ if len(text_elements_with_chars) < len(self._elements):
667
+ # Mixed collection - extract text from each element
668
+ element_texts = []
669
+
670
+ # Sort elements by position first
671
+ sorted_elements = sorted(
672
+ self._elements,
673
+ key=lambda el: (
674
+ el.page.index if hasattr(el, "page") else 0,
675
+ el.top if hasattr(el, "top") else 0,
676
+ el.x0 if hasattr(el, "x0") else 0,
677
+ ),
678
+ )
679
+
680
+ for el in sorted_elements:
681
+ if hasattr(el, "extract_text"):
682
+ # Call extract_text on the element (works for TextElement, Region, etc)
683
+ text = el.extract_text(**kwargs)
684
+ if text:
685
+ element_texts.append(text)
686
+ elif hasattr(el, "text"):
687
+ # Fallback to text property if available
688
+ text = getattr(el, "text", "")
689
+ if text:
690
+ element_texts.append(text)
691
+
692
+ return separator.join(element_texts)
693
+
694
+ # All elements are TextElements with char data - use the original approach
695
+ text_elements = text_elements_with_chars
664
696
 
665
697
  # Collect all character dictionaries
666
698
  all_char_dicts = []
@@ -669,11 +701,20 @@ class ElementCollection(
669
701
 
670
702
  if not all_char_dicts:
671
703
  # Handle case where elements exist but have no char dicts
672
- logger.warning(
704
+ logger.debug(
673
705
  "ElementCollection.extract_text: No character dictionaries found in TextElements."
674
706
  )
707
+ # Sort elements by position before joining
708
+ sorted_text_elements = sorted(
709
+ text_elements,
710
+ key=lambda el: (
711
+ el.page.index if hasattr(el, "page") else 0,
712
+ el.top if hasattr(el, "top") else 0,
713
+ el.x0 if hasattr(el, "x0") else 0,
714
+ ),
715
+ )
675
716
  return separator.join(
676
- getattr(el, "text", "") for el in text_elements
717
+ getattr(el, "text", "") for el in sorted_text_elements
677
718
  ) # Fallback to simple join of word text
678
719
 
679
720
  # Apply content filtering if provided
@@ -737,33 +778,20 @@ class ElementCollection(
737
778
  all_char_dicts.sort(
738
779
  key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
739
780
  )
740
- result = separator.join(c.get("text", "") for c in all_char_dicts)
781
+ result = " ".join(c.get("text", "") for c in all_char_dicts)
741
782
 
742
783
  else:
784
+ print("JOIN WITHOUT LAYOUT")
743
785
  # Default: Simple join without layout
744
786
  logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
745
-
746
- # Instead of joining all characters individually, we need to:
747
- # 1. Extract text from each element
748
- # 2. Join the element texts with the separator
749
-
750
- # Sort elements by document order (page, top, x0)
751
- sorted_elements = sorted(
752
- text_elements,
753
- key=lambda el: (
754
- el.page.index if hasattr(el, "page") else 0,
755
- el.top if hasattr(el, "top") else 0,
756
- el.x0 if hasattr(el, "x0") else 0,
757
- ),
758
- )
759
-
760
- # Extract text from each element
761
- element_texts = []
762
- for el in sorted_elements:
763
- if hasattr(el, "text") and el.text:
764
- element_texts.append(el.text)
765
-
766
- result = separator.join(element_texts)
787
+ result = separator.join(el.extract_text() for el in text_elements)
788
+
789
+ # # Sort chars by document order (page, top, x0)
790
+ # all_char_dicts.sort(
791
+ # key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
792
+ # )
793
+ # # Simple join of character text
794
+ # result = "".join(c.get("text", "") for c in all_char_dicts)
767
795
 
768
796
  # Determine final strip flag – same rule as global helper unless caller overrides
769
797
  strip_text = strip if strip is not None else (not use_layout)
@@ -1138,6 +1138,67 @@ class Region(
1138
1138
  )
1139
1139
  return clipped_region
1140
1140
 
1141
+ def region(
1142
+ self,
1143
+ left: float = None,
1144
+ top: float = None,
1145
+ right: float = None,
1146
+ bottom: float = None,
1147
+ width: Union[str, float, None] = None,
1148
+ height: Optional[float] = None,
1149
+ relative: bool = False,
1150
+ ) -> "Region":
1151
+ """
1152
+ Create a sub-region within this region using the same API as Page.region().
1153
+
1154
+ By default, coordinates are absolute (relative to the page), matching Page.region().
1155
+ Set relative=True to use coordinates relative to this region's top-left corner.
1156
+
1157
+ Args:
1158
+ left: Left x-coordinate (absolute by default, or relative to region if relative=True)
1159
+ top: Top y-coordinate (absolute by default, or relative to region if relative=True)
1160
+ right: Right x-coordinate (absolute by default, or relative to region if relative=True)
1161
+ bottom: Bottom y-coordinate (absolute by default, or relative to region if relative=True)
1162
+ width: Width definition (same as Page.region())
1163
+ height: Height of the region (same as Page.region())
1164
+ relative: If True, coordinates are relative to this region's top-left (0,0).
1165
+ If False (default), coordinates are absolute page coordinates.
1166
+
1167
+ Returns:
1168
+ Region object for the specified coordinates, clipped to this region's bounds
1169
+
1170
+ Examples:
1171
+ # Absolute coordinates (default) - same as page.region()
1172
+ sub = region.region(left=100, top=200, width=50, height=30)
1173
+
1174
+ # Relative to region's top-left
1175
+ sub = region.region(left=10, top=10, width=50, height=30, relative=True)
1176
+
1177
+ # Mix relative positioning with this region's bounds
1178
+ sub = region.region(left=region.x0 + 10, width=50, height=30)
1179
+ """
1180
+ # If relative coordinates requested, convert to absolute
1181
+ if relative:
1182
+ if left is not None:
1183
+ left = self.x0 + left
1184
+ if top is not None:
1185
+ top = self.top + top
1186
+ if right is not None:
1187
+ right = self.x0 + right
1188
+ if bottom is not None:
1189
+ bottom = self.top + bottom
1190
+
1191
+ # For numeric width/height with relative coords, we need to handle the calculation
1192
+ # in the context of absolute positioning
1193
+
1194
+ # Use the parent page's region method to create the region with all its logic
1195
+ new_region = self.page.region(
1196
+ left=left, top=top, right=right, bottom=bottom, width=width, height=height
1197
+ )
1198
+
1199
+ # Clip the new region to this region's bounds
1200
+ return new_region.clip(self)
1201
+
1141
1202
  def get_elements(
1142
1203
  self, selector: Optional[str] = None, apply_exclusions=True, **kwargs
1143
1204
  ) -> List["Element"]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.8
3
+ Version: 0.2.10
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -1,8 +1,8 @@
1
1
  natural_pdf/__init__.py,sha256=N4pR0LbuPEnUYFZqbdVqc_FGKldgwPQc1wjJhYKTBBM,3417
2
- natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
2
+ natural_pdf/cli.py,sha256=0zO9ZoRiP8JmyGBaVavrMATnvbARWTl7WD2PEefu9BM,4061
3
3
  natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
4
4
  natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
5
- natural_pdf/analyzers/guides.py,sha256=RHFTc2n6kzKrjsd2pk-1MfG1esuEpnTJr8GrsTqlF3A,160441
5
+ natural_pdf/analyzers/guides.py,sha256=O3MaeVLgH5l1qbj2fpJCUsDKF44350CiUb_T1J6HTiQ,178025
6
6
  natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLSrXh9BB2AmLxUoHmw0,62584
7
7
  natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
8
8
  natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
@@ -40,11 +40,11 @@ natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ
40
40
  natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
41
41
  natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
42
42
  natural_pdf/elements/base.py,sha256=92ukTtRCQFsa5KvKflChCt4mt0ZGS4ecGYCQTNMO4zU,58907
43
- natural_pdf/elements/element_collection.py,sha256=-piFQGiDPiqmnl-Cpoi3PGPmGe4AYvpl0IqaJGxBsBc,129405
43
+ natural_pdf/elements/element_collection.py,sha256=idM_BUWEfbCJ5Sq0Ae_KfbVHy8TdkNfzs7iWkFe_j2I,130707
44
44
  natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
45
45
  natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
46
46
  natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
47
- natural_pdf/elements/region.py,sha256=XLbaMEQ-DXzbh4Xnv72ebS1ZlT5EuWpistz0O6bOSag,162583
47
+ natural_pdf/elements/region.py,sha256=hCpbKg0R5TGfWEskZ6P-o_ZXPKhU4keaYjWIVX0Y7F4,165244
48
48
  natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
49
49
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
50
50
  natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
@@ -107,7 +107,7 @@ natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs
107
107
  natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
108
108
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
109
109
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
110
- natural_pdf-0.2.8.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
110
+ natural_pdf-0.2.10.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
111
111
  optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
112
112
  optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
113
113
  optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
@@ -124,8 +124,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
124
124
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
125
125
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
126
126
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
127
- natural_pdf-0.2.8.dist-info/METADATA,sha256=tuWXV-mY9zU0qsVsXhrrp3aGBfSxlklUxS_Dlllqmp4,6959
128
- natural_pdf-0.2.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
129
- natural_pdf-0.2.8.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
130
- natural_pdf-0.2.8.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
131
- natural_pdf-0.2.8.dist-info/RECORD,,
127
+ natural_pdf-0.2.10.dist-info/METADATA,sha256=DnjO1O3cHjhT-sXzLjIHQ286xqBAzeSJdPR50yol4D4,6960
128
+ natural_pdf-0.2.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
129
+ natural_pdf-0.2.10.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
130
+ natural_pdf-0.2.10.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
131
+ natural_pdf-0.2.10.dist-info/RECORD,,