natural-pdf 0.2.1.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -260,7 +260,7 @@ class DirectionalMixin:
260
260
 
261
261
  Args:
262
262
  height: Height of the region above, in points
263
- width: Width mode - "full" for full page width or "element" for element width
263
+ width: Width mode - "full" (default) for full page width or "element" for element width
264
264
  include_source: Whether to include this element/region in the result (default: False)
265
265
  until: Optional selector string to specify an upper boundary element
266
266
  include_endpoint: Whether to include the boundary element in the region (default: True)
@@ -268,6 +268,18 @@ class DirectionalMixin:
268
268
 
269
269
  Returns:
270
270
  Region object representing the area above
271
+
272
+ Examples:
273
+ ```python
274
+ # Default: full page width
275
+ signature.above() # Gets everything above across full page width
276
+
277
+ # Match element width
278
+ signature.above(width='element') # Gets region above matching signature width
279
+
280
+ # Stop at specific element
281
+ signature.above(until='text:contains("Date")') # Region from date to signature
282
+ ```
271
283
  """
272
284
  return self._direction(
273
285
  direction="above",
@@ -293,7 +305,7 @@ class DirectionalMixin:
293
305
 
294
306
  Args:
295
307
  height: Height of the region below, in points
296
- width: Width mode - "full" for full page width or "element" for element width
308
+ width: Width mode - "full" (default) for full page width or "element" for element width
297
309
  include_source: Whether to include this element/region in the result (default: False)
298
310
  until: Optional selector string to specify a lower boundary element
299
311
  include_endpoint: Whether to include the boundary element in the region (default: True)
@@ -301,6 +313,18 @@ class DirectionalMixin:
301
313
 
302
314
  Returns:
303
315
  Region object representing the area below
316
+
317
+ Examples:
318
+ ```python
319
+ # Default: full page width
320
+ header.below() # Gets everything below across full page width
321
+
322
+ # Match element width
323
+ header.below(width='element') # Gets region below matching header width
324
+
325
+ # Limited height
326
+ header.below(height=200) # Gets 200pt tall region below header
327
+ ```
304
328
  """
305
329
  return self._direction(
306
330
  direction="below",
@@ -315,7 +339,7 @@ class DirectionalMixin:
315
339
  def left(
316
340
  self,
317
341
  width: Optional[float] = None,
318
- height: str = "full",
342
+ height: str = "element",
319
343
  include_source: bool = False,
320
344
  until: Optional[str] = None,
321
345
  include_endpoint: bool = True,
@@ -326,7 +350,7 @@ class DirectionalMixin:
326
350
 
327
351
  Args:
328
352
  width: Width of the region to the left, in points
329
- height: Height mode - "full" for full page height or "element" for element height
353
+ height: Height mode - "element" (default) for element height or "full" for full page height
330
354
  include_source: Whether to include this element/region in the result (default: False)
331
355
  until: Optional selector string to specify a left boundary element
332
356
  include_endpoint: Whether to include the boundary element in the region (default: True)
@@ -334,6 +358,18 @@ class DirectionalMixin:
334
358
 
335
359
  Returns:
336
360
  Region object representing the area to the left
361
+
362
+ Examples:
363
+ ```python
364
+ # Default: matches element height
365
+ table.left() # Gets region to the left at same height as table
366
+
367
+ # Full page height
368
+ table.left(height='full') # Gets entire left side of page
369
+
370
+ # Custom height
371
+ table.left(height=100) # Gets 100pt tall region to the left
372
+ ```
337
373
  """
338
374
  return self._direction(
339
375
  direction="left",
@@ -348,7 +384,7 @@ class DirectionalMixin:
348
384
  def right(
349
385
  self,
350
386
  width: Optional[float] = None,
351
- height: str = "full",
387
+ height: str = "element",
352
388
  include_source: bool = False,
353
389
  until: Optional[str] = None,
354
390
  include_endpoint: bool = True,
@@ -359,7 +395,7 @@ class DirectionalMixin:
359
395
 
360
396
  Args:
361
397
  width: Width of the region to the right, in points
362
- height: Height mode - "full" for full page height or "element" for element height
398
+ height: Height mode - "element" (default) for element height or "full" for full page height
363
399
  include_source: Whether to include this element/region in the result (default: False)
364
400
  until: Optional selector string to specify a right boundary element
365
401
  include_endpoint: Whether to include the boundary element in the region (default: True)
@@ -367,6 +403,18 @@ class DirectionalMixin:
367
403
 
368
404
  Returns:
369
405
  Region object representing the area to the right
406
+
407
+ Examples:
408
+ ```python
409
+ # Default: matches element height
410
+ label.right() # Gets region to the right at same height as label
411
+
412
+ # Full page height
413
+ label.right(height='full') # Gets entire right side of page
414
+
415
+ # Custom height
416
+ label.right(height=50) # Gets 50pt tall region to the right
417
+ ```
370
418
  """
371
419
  return self._direction(
372
420
  direction="right",
@@ -381,8 +429,28 @@ class DirectionalMixin:
381
429
  def to_region(self):
382
430
  return self.expand()
383
431
 
432
+ @overload
433
+ def expand(self, amount: float) -> "Region":
434
+ """Expand in all directions by the same amount."""
435
+ ...
436
+
437
+ @overload
438
+ def expand(
439
+ self,
440
+ *,
441
+ left: float = 0,
442
+ right: float = 0,
443
+ top: float = 0,
444
+ bottom: float = 0,
445
+ width_factor: float = 1.0,
446
+ height_factor: float = 1.0,
447
+ ) -> "Region":
448
+ """Expand by different amounts in each direction."""
449
+ ...
450
+
384
451
  def expand(
385
452
  self,
453
+ amount: Optional[float] = None,
386
454
  left: float = 0,
387
455
  right: float = 0,
388
456
  top: float = 0,
@@ -394,6 +462,7 @@ class DirectionalMixin:
394
462
  Create a new region expanded from this element/region.
395
463
 
396
464
  Args:
465
+ amount: If provided as the first positional argument, expand all edges by this amount
397
466
  left: Amount to expand left edge (positive value expands leftwards)
398
467
  right: Amount to expand right edge (positive value expands rightwards)
399
468
  top: Amount to expand top edge (positive value expands upwards)
@@ -403,7 +472,20 @@ class DirectionalMixin:
403
472
 
404
473
  Returns:
405
474
  New expanded Region object
475
+
476
+ Examples:
477
+ # Expand 5 points in all directions
478
+ expanded = element.expand(5)
479
+
480
+ # Expand by different amounts in each direction
481
+ expanded = element.expand(left=10, right=5, top=3, bottom=7)
482
+
483
+ # Use width/height factors
484
+ expanded = element.expand(width_factor=1.5, height_factor=2.0)
406
485
  """
486
+ # If amount is provided as first positional argument, use it for all directions
487
+ if amount is not None:
488
+ left = right = top = bottom = amount
407
489
  # Start with current coordinates
408
490
  new_x0 = self.x0
409
491
  new_x1 = self.x1
@@ -1212,7 +1294,7 @@ class Element(
1212
1294
  self,
1213
1295
  *,
1214
1296
  text: str,
1215
- contains: str = "all",
1297
+ overlap: str = "full",
1216
1298
  apply_exclusions: bool = True,
1217
1299
  regex: bool = False,
1218
1300
  case: bool = True,
@@ -1224,7 +1306,7 @@ class Element(
1224
1306
  self,
1225
1307
  selector: str,
1226
1308
  *,
1227
- contains: str = "all",
1309
+ overlap: str = "full",
1228
1310
  apply_exclusions: bool = True,
1229
1311
  regex: bool = False,
1230
1312
  case: bool = True,
@@ -1236,7 +1318,7 @@ class Element(
1236
1318
  selector: Optional[str] = None,
1237
1319
  *,
1238
1320
  text: Optional[str] = None,
1239
- contains: str = "all",
1321
+ overlap: str = "full",
1240
1322
  apply_exclusions: bool = True,
1241
1323
  regex: bool = False,
1242
1324
  case: bool = True,
@@ -1251,9 +1333,9 @@ class Element(
1251
1333
  Args:
1252
1334
  selector: CSS-like selector string.
1253
1335
  text: Text content to search for (equivalent to 'text:contains(...)').
1254
- contains: How to determine if elements are inside: 'all' (fully inside),
1255
- 'any' (any overlap), or 'center' (center point inside).
1256
- (default: "all")
1336
+ overlap: How to determine if elements overlap with this element: 'full' (fully inside),
1337
+ 'partial' (any overlap), or 'center' (center point inside).
1338
+ (default: "full")
1257
1339
  apply_exclusions: Whether to apply exclusion regions (default: True).
1258
1340
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1259
1341
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1270,7 +1352,7 @@ class Element(
1270
1352
  return temp_region.find(
1271
1353
  selector=selector,
1272
1354
  text=text,
1273
- contains=contains,
1355
+ overlap=overlap,
1274
1356
  apply_exclusions=apply_exclusions,
1275
1357
  regex=regex,
1276
1358
  case=case,
@@ -1282,7 +1364,7 @@ class Element(
1282
1364
  self,
1283
1365
  *,
1284
1366
  text: str,
1285
- contains: str = "all",
1367
+ overlap: str = "full",
1286
1368
  apply_exclusions: bool = True,
1287
1369
  regex: bool = False,
1288
1370
  case: bool = True,
@@ -1294,7 +1376,7 @@ class Element(
1294
1376
  self,
1295
1377
  selector: str,
1296
1378
  *,
1297
- contains: str = "all",
1379
+ overlap: str = "full",
1298
1380
  apply_exclusions: bool = True,
1299
1381
  regex: bool = False,
1300
1382
  case: bool = True,
@@ -1306,7 +1388,7 @@ class Element(
1306
1388
  selector: Optional[str] = None,
1307
1389
  *,
1308
1390
  text: Optional[str] = None,
1309
- contains: str = "all",
1391
+ overlap: str = "full",
1310
1392
  apply_exclusions: bool = True,
1311
1393
  regex: bool = False,
1312
1394
  case: bool = True,
@@ -1321,9 +1403,9 @@ class Element(
1321
1403
  Args:
1322
1404
  selector: CSS-like selector string.
1323
1405
  text: Text content to search for (equivalent to 'text:contains(...)').
1324
- contains: How to determine if elements are inside: 'all' (fully inside),
1325
- 'any' (any overlap), or 'center' (center point inside).
1326
- (default: "all")
1406
+ overlap: How to determine if elements overlap with this element: 'full' (fully inside),
1407
+ 'partial' (any overlap), or 'center' (center point inside).
1408
+ (default: "full")
1327
1409
  apply_exclusions: Whether to apply exclusion regions (default: True).
1328
1410
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1329
1411
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1340,7 +1422,7 @@ class Element(
1340
1422
  return temp_region.find_all(
1341
1423
  selector=selector,
1342
1424
  text=text,
1343
- contains=contains,
1425
+ overlap=overlap,
1344
1426
  apply_exclusions=apply_exclusions,
1345
1427
  regex=regex,
1346
1428
  case=case,
@@ -891,6 +891,7 @@ class ElementCollection(
891
891
  label_format: Optional[str] = None,
892
892
  annotate: Optional[List[str]] = None,
893
893
  bins: Optional[Union[int, List[float]]] = None,
894
+ **kwargs,
894
895
  ) -> List[Dict]:
895
896
  """
896
897
  Determines the parameters for highlighting each element based on the strategy.
@@ -1672,9 +1673,9 @@ class ElementCollection(
1672
1673
 
1673
1674
  Args:
1674
1675
  selector: CSS-like selector string
1675
- contains: How to determine if elements are inside: 'all' (fully inside),
1676
- 'any' (any overlap), or 'center' (center point inside).
1677
- (default: "all")
1676
+ overlap: How to determine if elements overlap: 'full' (fully inside),
1677
+ 'partial' (any overlap), or 'center' (center point inside).
1678
+ (default: "full")
1678
1679
  apply_exclusions: Whether to exclude elements in exclusion regions
1679
1680
  """
1680
1681
  return self.apply(lambda element: element.find(selector, **kwargs))
@@ -1684,7 +1685,7 @@ class ElementCollection(
1684
1685
  self,
1685
1686
  *,
1686
1687
  text: str,
1687
- contains: str = "all",
1688
+ overlap: str = "full",
1688
1689
  apply_exclusions: bool = True,
1689
1690
  regex: bool = False,
1690
1691
  case: bool = True,
@@ -1696,7 +1697,7 @@ class ElementCollection(
1696
1697
  self,
1697
1698
  selector: str,
1698
1699
  *,
1699
- contains: str = "all",
1700
+ overlap: str = "full",
1700
1701
  apply_exclusions: bool = True,
1701
1702
  regex: bool = False,
1702
1703
  case: bool = True,
@@ -1708,7 +1709,7 @@ class ElementCollection(
1708
1709
  selector: Optional[str] = None,
1709
1710
  *,
1710
1711
  text: Optional[str] = None,
1711
- contains: str = "all",
1712
+ overlap: str = "full",
1712
1713
  apply_exclusions: bool = True,
1713
1714
  regex: bool = False,
1714
1715
  case: bool = True,
@@ -1723,9 +1724,9 @@ class ElementCollection(
1723
1724
  Args:
1724
1725
  selector: CSS-like selector string.
1725
1726
  text: Text content to search for (equivalent to 'text:contains(...)').
1726
- contains: How to determine if elements are inside: 'all' (fully inside),
1727
- 'any' (any overlap), or 'center' (center point inside).
1728
- (default: "all")
1727
+ overlap: How to determine if elements overlap: 'full' (fully inside),
1728
+ 'partial' (any overlap), or 'center' (center point inside).
1729
+ (default: "full")
1729
1730
  apply_exclusions: Whether to apply exclusion regions (default: True).
1730
1731
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1731
1732
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1747,7 +1748,7 @@ class ElementCollection(
1747
1748
  found_in_element: "ElementCollection" = element.find_all(
1748
1749
  selector=selector,
1749
1750
  text=text,
1750
- contains=contains,
1751
+ overlap=overlap,
1751
1752
  apply_exclusions=apply_exclusions,
1752
1753
  regex=regex,
1753
1754
  case=case,
@@ -960,7 +960,7 @@ class Region(
960
960
  right_content_col = min(width - 1, content_col_indices[-1] + padding)
961
961
 
962
962
  # Convert trimmed pixel coordinates back to PDF coordinates
963
- scale_factor = resolution / 72.0 # Scale factor used in to_image()
963
+ scale_factor = resolution / 72.0 # Scale factor used in render()
964
964
 
965
965
  # Calculate new PDF coordinates and ensure they are Python floats
966
966
  trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
@@ -1982,7 +1982,7 @@ class Region(
1982
1982
  self,
1983
1983
  *,
1984
1984
  text: str,
1985
- contains: str = "all",
1985
+ overlap: str = "full",
1986
1986
  apply_exclusions: bool = True,
1987
1987
  regex: bool = False,
1988
1988
  case: bool = True,
@@ -1994,7 +1994,7 @@ class Region(
1994
1994
  self,
1995
1995
  selector: str,
1996
1996
  *,
1997
- contains: str = "all",
1997
+ overlap: str = "full",
1998
1998
  apply_exclusions: bool = True,
1999
1999
  regex: bool = False,
2000
2000
  case: bool = True,
@@ -2006,7 +2006,7 @@ class Region(
2006
2006
  selector: Optional[str] = None, # Now optional
2007
2007
  *,
2008
2008
  text: Optional[str] = None, # New text parameter
2009
- contains: str = "all", # New parameter for containment behavior
2009
+ overlap: str = "full", # How elements overlap with the region
2010
2010
  apply_exclusions: bool = True,
2011
2011
  regex: bool = False,
2012
2012
  case: bool = True,
@@ -2020,9 +2020,9 @@ class Region(
2020
2020
  Args:
2021
2021
  selector: CSS-like selector string.
2022
2022
  text: Text content to search for (equivalent to 'text:contains(...)').
2023
- contains: How to determine if elements are inside: 'all' (fully inside),
2024
- 'any' (any overlap), or 'center' (center point inside).
2025
- (default: "all")
2023
+ overlap: How to determine if elements overlap with the region: 'full' (fully inside),
2024
+ 'partial' (any overlap), or 'center' (center point inside).
2025
+ (default: "full")
2026
2026
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
2027
2027
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
2028
2028
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -2035,7 +2035,7 @@ class Region(
2035
2035
  elements = self.find_all(
2036
2036
  selector=selector,
2037
2037
  text=text,
2038
- contains=contains,
2038
+ overlap=overlap,
2039
2039
  apply_exclusions=apply_exclusions,
2040
2040
  regex=regex,
2041
2041
  case=case,
@@ -2048,7 +2048,7 @@ class Region(
2048
2048
  self,
2049
2049
  *,
2050
2050
  text: str,
2051
- contains: str = "all",
2051
+ overlap: str = "full",
2052
2052
  apply_exclusions: bool = True,
2053
2053
  regex: bool = False,
2054
2054
  case: bool = True,
@@ -2060,7 +2060,7 @@ class Region(
2060
2060
  self,
2061
2061
  selector: str,
2062
2062
  *,
2063
- contains: str = "all",
2063
+ overlap: str = "full",
2064
2064
  apply_exclusions: bool = True,
2065
2065
  regex: bool = False,
2066
2066
  case: bool = True,
@@ -2072,7 +2072,7 @@ class Region(
2072
2072
  selector: Optional[str] = None, # Now optional
2073
2073
  *,
2074
2074
  text: Optional[str] = None, # New text parameter
2075
- contains: str = "all", # New parameter to control inside/overlap behavior
2075
+ overlap: str = "full", # How elements overlap with the region
2076
2076
  apply_exclusions: bool = True,
2077
2077
  regex: bool = False,
2078
2078
  case: bool = True,
@@ -2086,9 +2086,9 @@ class Region(
2086
2086
  Args:
2087
2087
  selector: CSS-like selector string.
2088
2088
  text: Text content to search for (equivalent to 'text:contains(...)').
2089
- contains: How to determine if elements are inside: 'all' (fully inside),
2090
- 'any' (any overlap), or 'center' (center point inside).
2091
- (default: "all")
2089
+ overlap: How to determine if elements overlap with the region: 'full' (fully inside),
2090
+ 'partial' (any overlap), or 'center' (center point inside).
2091
+ (default: "full")
2092
2092
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
2093
2093
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
2094
2094
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -2104,10 +2104,10 @@ class Region(
2104
2104
  if selector is None and text is None:
2105
2105
  raise ValueError("Provide either 'selector' or 'text'.")
2106
2106
 
2107
- # Validate contains parameter
2108
- if contains not in ["all", "any", "center"]:
2107
+ # Validate overlap parameter
2108
+ if overlap not in ["full", "partial", "center"]:
2109
2109
  raise ValueError(
2110
- f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'"
2110
+ f"Invalid overlap value: {overlap}. Must be 'full', 'partial', or 'center'"
2111
2111
  )
2112
2112
 
2113
2113
  # Construct selector if 'text' is provided
@@ -2142,7 +2142,7 @@ class Region(
2142
2142
  region_bbox = self.bbox
2143
2143
  matching_elements = []
2144
2144
 
2145
- if contains == "all": # Fully inside (strict)
2145
+ if overlap == "full": # Fully inside (strict)
2146
2146
  matching_elements = [
2147
2147
  el
2148
2148
  for el in potential_elements
@@ -2151,9 +2151,9 @@ class Region(
2151
2151
  and el.x1 <= region_bbox[2]
2152
2152
  and el.bottom <= region_bbox[3]
2153
2153
  ]
2154
- elif contains == "any": # Any overlap
2154
+ elif overlap == "partial": # Any overlap
2155
2155
  matching_elements = [el for el in potential_elements if self.intersects(el)]
2156
- elif contains == "center": # Center point inside
2156
+ elif overlap == "center": # Center point inside
2157
2157
  matching_elements = [
2158
2158
  el for el in potential_elements if self.is_element_center_inside(el)
2159
2159
  ]
@@ -3437,7 +3437,7 @@ class Region(
3437
3437
  r_idx = int(cell.metadata.get("row_index"))
3438
3438
  c_idx = int(cell.metadata.get("col_index"))
3439
3439
  text_val = cell.extract_text(
3440
- layout=False, apply_exclusions=False, content_filter=content_filter
3440
+ layout=False, apply_exclusions=True, content_filter=content_filter
3441
3441
  ).strip()
3442
3442
  table_grid[r_idx][c_idx] = text_val if text_val else None
3443
3443
  except Exception as _err:
@@ -215,6 +215,11 @@ class TextElement(Element):
215
215
  if isinstance(color, (int, float)):
216
216
  return (color, color, color)
217
217
 
218
+ # If it's a single-value tuple, treat it as grayscale and expand it to an RGB triple
219
+ if isinstance(color, tuple) and len(color) == 1:
220
+ gray = color[0]
221
+ return (gray, gray, gray)
222
+
218
223
  # If it's a tuple of 3 values, treat as RGB
219
224
  if isinstance(color, tuple) and len(color) == 3:
220
225
  return color
@@ -119,17 +119,11 @@ class StructuredDataManager:
119
119
  )
120
120
  messages = self._prepare_llm_messages(content, prompt, using, schema)
121
121
 
122
- try:
123
- logger.debug(f"Extracting with model '{selected_model}'")
124
- completion = client.beta.chat.completions.parse(
125
- model=selected_model, messages=messages, response_format=schema, **kwargs
126
- )
127
- parsed_data = completion.choices[0].message.parsed
128
- return StructuredDataResult(
129
- data=parsed_data, success=True, error_message=None, model_used=selected_model
130
- )
131
- except Exception as e:
132
- logger.error(f"Extraction failed: {str(e)}")
133
- return StructuredDataResult(
134
- data=None, success=False, error_message=str(e), model_used=selected_model
135
- )
122
+ logger.debug(f"Extracting with model '{selected_model}'")
123
+ completion = client.beta.chat.completions.parse(
124
+ model=selected_model, messages=messages, response_format=schema, **kwargs
125
+ )
126
+ parsed_data = completion.choices[0].message.parsed
127
+ return StructuredDataResult(
128
+ data=parsed_data, success=True, error_message=None, model_used=selected_model
129
+ )
@@ -35,7 +35,7 @@ class ExtractionMixin(ABC):
35
35
 
36
36
  Host class requirements:
37
37
  - Must implement extract_text(**kwargs) -> str
38
- - Must implement to_image(**kwargs) -> PIL.Image
38
+ - Must implement render(**kwargs) -> PIL.Image
39
39
  - Must have access to StructuredDataManager (usually via parent PDF)
40
40
 
41
41
  Example:
@@ -72,25 +72,24 @@ class ExtractionMixin(ABC):
72
72
 
73
73
  Args:
74
74
  using: 'text' or 'vision'
75
- **kwargs: Additional arguments passed to extract_text or to_image
75
+ **kwargs: Additional arguments passed to extract_text or render
76
76
 
77
77
  Returns:
78
78
  str: Extracted text if using='text'
79
79
  PIL.Image.Image: Rendered image if using='vision'
80
80
  None: If content cannot be retrieved
81
81
  """
82
- if not hasattr(self, "extract_text") or not callable(self.extract_text):
83
- logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
84
- return None
85
- if not hasattr(self, "to_image") or not callable(self.to_image):
86
- logger.error(f"ExtractionMixin requires 'to_image' method on {self!r}")
87
- return None
88
-
89
82
  try:
90
83
  if using == "text":
84
+ if not hasattr(self, "extract_text") or not callable(self.extract_text):
85
+ logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
86
+ return None
91
87
  layout = kwargs.pop("layout", True)
92
88
  return self.extract_text(layout=layout, **kwargs)
93
89
  elif using == "vision":
90
+ if not hasattr(self, "render") or not callable(self.render):
91
+ logger.error(f"ExtractionMixin requires 'render' method on {self!r}")
92
+ return None
94
93
  resolution = kwargs.pop("resolution", 72)
95
94
  include_highlights = kwargs.pop("include_highlights", False)
96
95
  labels = kwargs.pop("labels", False)
@@ -102,8 +101,13 @@ class ExtractionMixin(ABC):
102
101
  logger.error(f"Unsupported value for 'using': {using}")
103
102
  return None
104
103
  except Exception as e:
105
- logger.error(f"Error getting {using} content from {self!r}: {e}")
106
- return None
104
+ import warnings
105
+
106
+ warnings.warn(
107
+ f"Error getting {using} content from {self!r}: {e}",
108
+ RuntimeWarning,
109
+ )
110
+ raise
107
111
 
108
112
  def extract(
109
113
  self: Any,
@@ -275,10 +279,7 @@ class ExtractionMixin(ABC):
275
279
  raise RuntimeError("StructuredDataManager is not available")
276
280
 
277
281
  # Get content
278
- layout_for_text = kwargs.pop("layout", True)
279
- content = self._get_extraction_content(
280
- using=using, layout=layout_for_text, **kwargs
281
- ) # Pass kwargs
282
+ content = self._get_extraction_content(using=using, **kwargs) # Pass kwargs
282
283
 
283
284
  if content is None or (
284
285
  using == "text" and isinstance(content, str) and not content.strip()
@@ -359,10 +360,11 @@ class ExtractionMixin(ABC):
359
360
  )
360
361
 
361
362
  if not result.success:
362
- raise ValueError(
363
- f"Stored result for '{target_key}' indicates a failed extraction attempt. "
364
- f"Error: {result.error_message}"
363
+ # Return None for failed extractions to allow batch processing to continue
364
+ logger.warning(
365
+ f"Extraction '{target_key}' failed: {result.error_message}. Returning None."
365
366
  )
367
+ return None
366
368
 
367
369
  if result.data is None:
368
370
  # This case might occur if success=True but data is somehow None
@@ -591,16 +593,28 @@ class ExtractionMixin(ABC):
591
593
  raise RuntimeError("StructuredDataManager is not available")
592
594
 
593
595
  # Content preparation
594
- layout_for_text = kwargs.pop("layout", True)
595
- content = self._get_extraction_content(using=using, layout=layout_for_text, **kwargs)
596
+ content = self._get_extraction_content(using=using, **kwargs)
597
+
598
+ import warnings
596
599
 
597
600
  if content is None or (
598
601
  using == "text" and isinstance(content, str) and not content.strip()
599
602
  ):
603
+ preview = None
604
+ if isinstance(content, str):
605
+ preview = content[:120]
606
+ msg = (
607
+ f"No content available for extraction (using='{using}'). "
608
+ "Ensure the page has a text layer or render() returns an image. "
609
+ "For scanned PDFs run apply_ocr() or switch to using='vision'. "
610
+ f"Content preview: {preview!r}"
611
+ )
612
+ warnings.warn(msg, RuntimeWarning)
613
+
600
614
  result = StructuredDataResult(
601
615
  data=None,
602
616
  success=False,
603
- error_message=f"No content available for extraction (using='{using}')",
617
+ error_message=msg,
604
618
  model_used=model,
605
619
  )
606
620
  else:
@@ -721,8 +721,8 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
721
721
  # Start with a base name, modify for specifics like :not
722
722
  filter_name = f"pseudo-class :{name}"
723
723
 
724
- # Relational pseudo-classes are handled separately by the caller
725
- if name in ("above", "below", "near", "left-of", "right-of"):
724
+ # Relational pseudo-classes and collection-level pseudo-classes are handled separately by the caller
725
+ if name in ("above", "below", "near", "left-of", "right-of", "first", "last"):
726
726
  continue
727
727
 
728
728
  # --- Handle :not() ---