natural-pdf 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -383,7 +383,7 @@ class HighlightingService:
383
383
  def add(
384
384
  self,
385
385
  page_index: int,
386
- bbox: Tuple[float, float, float, float],
386
+ bbox: Union[Tuple[float, float, float, float], Any], # Relax input type hint
387
387
  color: Optional[Union[Tuple, str]] = None,
388
388
  label: Optional[str] = None,
389
389
  use_color_cycling: bool = False,
@@ -392,9 +392,32 @@ class HighlightingService:
392
392
  existing: str = 'append'
393
393
  ):
394
394
  """Adds a rectangular highlight."""
395
+
396
+ processed_bbox: Tuple[float, float, float, float]
397
+ # Check if bbox is an object with expected attributes (likely a Region)
398
+ # Assuming Region object has x0, top, x1, bottom attributes based on error context
399
+ if (hasattr(bbox, 'x0') and hasattr(bbox, 'top') and
400
+ hasattr(bbox, 'x1') and hasattr(bbox, 'bottom')):
401
+ try:
402
+ # Ensure attributes are numeric before creating tuple
403
+ processed_bbox = (float(bbox.x0), float(bbox.top), float(bbox.x1), float(bbox.bottom))
404
+ except (ValueError, TypeError):
405
+ logger.error(f"Invalid attribute types in bbox object for page {page_index}: {bbox}. Expected numeric values.")
406
+ return
407
+ elif isinstance(bbox, (list, tuple)) and len(bbox) == 4:
408
+ try:
409
+ # Ensure elements are numeric and convert to tuple
410
+ processed_bbox = tuple(float(v) for v in bbox)
411
+ except (ValueError, TypeError):
412
+ logger.error(f"Invalid values in bbox sequence for page {page_index}: {bbox}. Expected numeric values.")
413
+ return
414
+ else:
415
+ logger.error(f"Invalid bbox type or structure provided for page {page_index}: {type(bbox)} - {bbox}. Expected tuple/list of 4 numbers or Region-like object.")
416
+ return # Don't proceed if bbox is invalid
417
+
395
418
  self._add_internal(
396
419
  page_index=page_index,
397
- bbox=bbox,
420
+ bbox=processed_bbox, # Use the processed tuple
398
421
  polygon=None,
399
422
  color_input=color,
400
423
  label=label,
@@ -526,6 +549,7 @@ class HighlightingService:
526
549
  ) -> Optional[Image.Image]:
527
550
  """
528
551
  Renders a specific page with its highlights.
552
+ Legend is now generated based only on highlights present on this page.
529
553
 
530
554
  Args:
531
555
  page_index: The 0-based index of the page to render.
@@ -545,23 +569,19 @@ class HighlightingService:
545
569
  return None
546
570
 
547
571
  page = self._pdf[page_index]
548
- highlights_on_page = self.get_highlights_for_page(page_index)
572
+ highlights_on_page = self.get_highlights_for_page(page_index) # This list will be empty if clear_page was called
549
573
 
550
574
  # --- Get Base Image ---
551
575
  try:
552
576
  render_resolution = resolution if resolution is not None else scale * 72
553
- # Use the underlying pdfplumber page object for base rendering
554
577
  img_object = page._page.to_image(resolution=render_resolution, **kwargs)
555
- # Access the PIL image directly
556
- base_image = img_object.annotated # .annotated usually holds the PIL Image
578
+ base_image = img_object.annotated
557
579
  if not isinstance(base_image, Image.Image):
558
- # Fallback for different pdfplumber versions/outputs
559
580
  png_data = img_object._repr_png_()
560
581
  if png_data:
561
582
  base_image = Image.open(io.BytesIO(png_data)).convert('RGB')
562
583
  else:
563
584
  raise ValueError("Could not extract base PIL image from pdfplumber.")
564
- # Convert to RGBA for compositing
565
585
  base_image = base_image.convert('RGBA')
566
586
  logger.debug(f"Base image for page {page_index} rendered with resolution {render_resolution}.")
567
587
  except Exception as e:
@@ -569,6 +589,7 @@ class HighlightingService:
569
589
  return None
570
590
 
571
591
  # --- Render Highlights ---
592
+ rendered_image: Image.Image
572
593
  if highlights_on_page:
573
594
  renderer = HighlightRenderer(
574
595
  page=page,
@@ -579,21 +600,31 @@ class HighlightingService:
579
600
  )
580
601
  rendered_image = renderer.render()
581
602
  else:
582
- # If no highlights, still need to potentially render OCR if requested
583
603
  if render_ocr:
604
+ # Still render OCR even if no highlights
584
605
  renderer = HighlightRenderer(page, base_image, [], scale, True)
585
- rendered_image = renderer.render() # Will only call _render_ocr_text
606
+ rendered_image = renderer.render()
586
607
  else:
587
608
  rendered_image = base_image # No highlights, no OCR requested
588
609
 
589
- # --- Add Legend ---
610
+ # --- Add Legend (Based ONLY on this page's highlights) ---
590
611
  if labels:
591
- label_colors = self.get_labels_and_colors()
592
- if label_colors:
593
- legend = create_legend(label_colors)
594
- rendered_image = merge_images_with_legend(rendered_image, legend, legend_position)
595
- logger.debug(f"Added legend with {len(label_colors)} labels to page {page_index}.")
596
-
612
+ # CHANGE: Create label_colors map only from highlights_on_page
613
+ labels_colors_on_page: Dict[str, Tuple[int, int, int, int]] = {}
614
+ for hl in highlights_on_page:
615
+ if hl.label and hl.label not in labels_colors_on_page:
616
+ labels_colors_on_page[hl.label] = hl.color
617
+
618
+ if labels_colors_on_page: # Only add legend if there are labels on this page
619
+ legend = create_legend(labels_colors_on_page)
620
+ if legend: # Ensure create_legend didn't return None
621
+ rendered_image = merge_images_with_legend(rendered_image, legend, legend_position)
622
+ logger.debug(f"Added legend with {len(labels_colors_on_page)} labels for page {page_index}.")
623
+ else:
624
+ logger.debug(f"Legend creation returned None for page {page_index}.")
625
+ else:
626
+ logger.debug(f"No labels found on page {page_index}, skipping legend.")
627
+
597
628
  return rendered_image
598
629
 
599
630
  def render_preview(
natural_pdf/core/page.py CHANGED
@@ -9,6 +9,7 @@ import io
9
9
  import json
10
10
 
11
11
  from natural_pdf.elements.collections import ElementCollection
12
+ from natural_pdf.elements.region import Region
12
13
 
13
14
  if TYPE_CHECKING:
14
15
  import pdfplumber
@@ -17,7 +18,6 @@ if TYPE_CHECKING:
17
18
  from natural_pdf.core.highlighting_service import HighlightingService
18
19
  from natural_pdf.elements.base import Element
19
20
 
20
- from natural_pdf.elements.region import Region
21
21
  from natural_pdf.elements.text import TextElement
22
22
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
23
23
  from natural_pdf.analyzers.layout.layout_options import LayoutOptions
@@ -120,18 +120,50 @@ class Page:
120
120
  raise AttributeError("Parent PDF object does not have a 'highlighter' attribute.")
121
121
  return self._parent.highlighter
122
122
 
123
- def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region]) -> 'Page':
123
+ def clear_exclusions(self) -> 'Page':
124
+ """
125
+ Clear all exclusions from the page.
126
+ """
127
+ self._exclusions = []
128
+ return self
129
+
130
+ def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any]) -> 'Page':
124
131
  """
125
132
  Add an exclusion to the page. Text from these regions will be excluded from extraction.
133
+ Ensures non-callable items are stored as Region objects if possible.
126
134
 
127
135
  Args:
128
- exclusion_func_or_region: Either a Region object or a function that takes a Page
129
- and returns a Region to exclude
136
+ exclusion_func_or_region: Either a callable function returning a Region,
137
+ a Region object, or another object with a valid .bbox attribute.
130
138
 
131
139
  Returns:
132
140
  Self for method chaining
133
- """
134
- self._exclusions.append(exclusion_func_or_region)
141
+
142
+ Raises:
143
+ TypeError: If a non-callable, non-Region object without a valid bbox is provided.
144
+ """
145
+ if callable(exclusion_func_or_region):
146
+ # Store callable functions directly
147
+ self._exclusions.append(exclusion_func_or_region)
148
+ logger.debug(f"Page {self.index}: Added callable exclusion: {exclusion_func_or_region}")
149
+ elif isinstance(exclusion_func_or_region, Region):
150
+ # Store Region objects directly
151
+ self._exclusions.append(exclusion_func_or_region)
152
+ logger.debug(f"Page {self.index}: Added Region exclusion: {exclusion_func_or_region}")
153
+ elif hasattr(exclusion_func_or_region, 'bbox') and isinstance(getattr(exclusion_func_or_region, 'bbox', None), (tuple, list)) and len(exclusion_func_or_region.bbox) == 4:
154
+ # Convert objects with a valid bbox to a Region before storing
155
+ try:
156
+ bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
157
+ region_to_add = Region(self, bbox_coords)
158
+ self._exclusions.append(region_to_add)
159
+ logger.debug(f"Page {self.index}: Added exclusion converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
160
+ except (ValueError, TypeError, Exception) as e:
161
+ # Raise an error if conversion fails
162
+ raise TypeError(f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}") from e
163
+ else:
164
+ # Reject invalid types
165
+ raise TypeError(f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute.")
166
+
135
167
  return self
136
168
 
137
169
  def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
@@ -190,6 +222,7 @@ class Page:
190
222
  def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
191
223
  """
192
224
  Get all exclusion regions for this page.
225
+ Assumes self._exclusions contains only callables or Region objects.
193
226
 
194
227
  Args:
195
228
  include_callable: Whether to evaluate callable exclusion functions
@@ -207,15 +240,14 @@ class Page:
207
240
  for i, exclusion in enumerate(self._exclusions):
208
241
  # Get exclusion label if it's a tuple from PDF level
209
242
  exclusion_label = f"exclusion {i}"
210
- original_exclusion = exclusion
211
-
212
- # Check if it's a tuple from PDF.add_exclusion
243
+ original_exclusion = exclusion # Keep track for debugging
244
+
245
+ # Check if it's a tuple from PDF.add_exclusion (should still be handled if PDF adds labels)
213
246
  if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
214
- # This is likely from PDF.add_exclusion with (func, label)
215
247
  exclusion_func, label = exclusion
216
248
  if label:
217
249
  exclusion_label = label
218
- exclusion = exclusion_func
250
+ exclusion = exclusion_func # Use the function part
219
251
 
220
252
  # Process callable exclusion functions
221
253
  if callable(exclusion) and include_callable:
@@ -224,40 +256,45 @@ class Page:
224
256
  if debug:
225
257
  print(f" - Evaluating callable {exclusion_label}...")
226
258
 
227
- # Create a temporary copy of exclusions to avoid recursion
228
- original_exclusions = self._exclusions
229
- self._exclusions = [] # Temporarily clear exclusions
259
+ # Temporarily clear exclusions to avoid potential recursion if the callable uses exclusions itself
260
+ # This might be overly cautious depending on use case, but safer.
261
+ temp_original_exclusions = self._exclusions
262
+ self._exclusions = []
230
263
 
231
- # Call the function
232
- region = exclusion(self)
264
+ # Call the function - Expects it to return a Region or None
265
+ region_result = exclusion(self)
233
266
 
234
267
  # Restore exclusions
235
- self._exclusions = original_exclusions
268
+ self._exclusions = temp_original_exclusions
236
269
 
237
- if region:
238
- regions.append(region)
270
+ if isinstance(region_result, Region):
271
+ regions.append(region_result)
239
272
  if debug:
240
- print(f" ✓ Added region: {region}")
273
+ print(f" ✓ Added region from callable: {region_result}")
274
+ elif region_result:
275
+ # Log warning if callable returned something other than Region/None
276
+ logger.warning(f"Callable exclusion {exclusion_label} returned non-Region object: {type(region_result)}. Skipping.")
277
+ if debug:
278
+ print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
241
279
  else:
242
280
  if debug:
243
- print(f" ✗ Function returned None, no region added")
281
+ print(f" ✗ Callable returned None, no region added")
244
282
 
245
283
  except Exception as e:
246
- error_msg = f"Error in {exclusion_label} for page {self.index}: {e}"
284
+ error_msg = f"Error evaluating callable exclusion {exclusion_label} for page {self.index}: {e}"
247
285
  print(error_msg)
248
- # Print more detailed traceback for debugging
249
286
  import traceback
250
287
  print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
251
288
 
252
- # Process direct Region objects
253
- elif not callable(exclusion):
254
- # It's already a Region object
289
+ # Process direct Region objects (already validated by add_exclusion)
290
+ elif isinstance(exclusion, Region):
255
291
  regions.append(exclusion)
256
292
  if debug:
257
293
  print(f" - Added direct region: {exclusion}")
294
+ # No else needed, add_exclusion should prevent invalid types
258
295
 
259
296
  if debug:
260
- print(f"Page {self.index}: Found {len(regions)} valid exclusion regions")
297
+ print(f"Page {self.index}: Found {len(regions)} valid exclusion regions to apply")
261
298
 
262
299
  return regions
263
300
 
@@ -1178,6 +1215,34 @@ class Page:
1178
1215
 
1179
1216
  return ElementCollection(detected_regions)
1180
1217
 
1218
+ def clear_detected_layout_regions(self) -> 'Page':
1219
+ """
1220
+ Removes all regions from this page that were added by layout analysis
1221
+ (i.e., regions where `source` attribute is 'detected').
1222
+
1223
+ This clears the regions both from the page's internal `_regions['detected']` list
1224
+ and from the ElementManager's internal list of regions.
1225
+
1226
+ Returns:
1227
+ Self for method chaining.
1228
+ """
1229
+ if not hasattr(self._element_mgr, 'regions') or not hasattr(self._element_mgr, '_elements') or 'regions' not in self._element_mgr._elements:
1230
+ logger.debug(f"Page {self.index}: No regions found in ElementManager, nothing to clear.")
1231
+ self._regions['detected'] = [] # Ensure page's list is also clear
1232
+ return self
1233
+
1234
+ # Filter ElementManager's list to keep only non-detected regions
1235
+ original_count = len(self._element_mgr.regions)
1236
+ self._element_mgr._elements['regions'] = [r for r in self._element_mgr.regions if getattr(r, 'source', None) != 'detected']
1237
+ new_count = len(self._element_mgr.regions)
1238
+ removed_count = original_count - new_count
1239
+
1240
+ # Clear the page's specific list of detected regions
1241
+ self._regions['detected'] = []
1242
+
1243
+ logger.info(f"Page {self.index}: Cleared {removed_count} detected layout regions.")
1244
+ return self
1245
+
1181
1246
  def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both') -> Optional[Region]: # Return Optional
1182
1247
  """
1183
1248
  Get a section between two elements on this page.
natural_pdf/core/pdf.py CHANGED
@@ -125,6 +125,17 @@ class PDF:
125
125
  from natural_pdf.elements.collections import PageCollection
126
126
  return PageCollection(self._pages)
127
127
 
128
+ def clear_exclusions(self) -> 'PDF':
129
+ """
130
+ Clear all exclusion functions from the PDF.
131
+
132
+ Returns:
133
+ Self for method chaining
134
+ """
135
+
136
+ self._exclusions = []
137
+ return self
138
+
128
139
  def add_exclusion(self, exclusion_func: Callable[[Page], Region], label: str = None) -> 'PDF':
129
140
  """
130
141
  Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
@@ -7,7 +7,8 @@ from PIL import Image
7
7
  if TYPE_CHECKING:
8
8
  from natural_pdf.core.page import Page
9
9
  from natural_pdf.elements.region import Region
10
- from natural_pdf.elements.base import Element, DirectionalMixin
10
+ from natural_pdf.elements.base import Element
11
+ from natural_pdf.elements.collections import ElementCollection
11
12
 
12
13
 
13
14
  class DirectionalMixin:
@@ -17,7 +18,7 @@ class DirectionalMixin:
17
18
 
18
19
  def _direction(self, direction: str, size: Optional[float] = None,
19
20
  cross_size: str = "full", include_element: bool = False,
20
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
21
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
21
22
  """
22
23
  Protected helper method to create a region in a specified direction relative to this element/region.
23
24
 
@@ -154,7 +155,7 @@ class DirectionalMixin:
154
155
  return result
155
156
 
156
157
  def above(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
157
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
158
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
158
159
  """
159
160
  Select region above this element/region.
160
161
 
@@ -180,7 +181,7 @@ class DirectionalMixin:
180
181
  )
181
182
 
182
183
  def below(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
183
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
184
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
184
185
  """
185
186
  Select region below this element/region.
186
187
 
@@ -206,7 +207,7 @@ class DirectionalMixin:
206
207
  )
207
208
 
208
209
  def left(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
209
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
210
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
210
211
  """
211
212
  Select region to the left of this element/region.
212
213
 
@@ -232,7 +233,7 @@ class DirectionalMixin:
232
233
  )
233
234
 
234
235
  def right(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
235
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
236
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
236
237
  """
237
238
  Select region to the right of this element/region.
238
239
 
@@ -257,6 +258,86 @@ class DirectionalMixin:
257
258
  **kwargs
258
259
  )
259
260
 
261
+ def expand(self,
262
+ left: float = 0,
263
+ right: float = 0,
264
+ top_expand: float = 0, # Renamed to avoid conflict
265
+ bottom_expand: float = 0, # Renamed to avoid conflict
266
+ width_factor: float = 1.0,
267
+ height_factor: float = 1.0,
268
+ # Keep original parameter names for backward compatibility
269
+ top: float = None,
270
+ bottom: float = None) -> 'Region':
271
+ """
272
+ Create a new region expanded from this element/region.
273
+
274
+ Args:
275
+ left: Amount to expand left edge (positive value expands leftwards)
276
+ right: Amount to expand right edge (positive value expands rightwards)
277
+ top_expand: Amount to expand top edge (positive value expands upwards)
278
+ bottom_expand: Amount to expand bottom edge (positive value expands downwards)
279
+ width_factor: Factor to multiply width by (applied after absolute expansion)
280
+ height_factor: Factor to multiply height by (applied after absolute expansion)
281
+ top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
282
+ bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
283
+
284
+ Returns:
285
+ New expanded Region object
286
+ """
287
+ # Start with current coordinates
288
+ new_x0 = self.x0
289
+ new_x1 = self.x1
290
+ new_top = self.top
291
+ new_bottom = self.bottom
292
+
293
+ # Handle the deprecated parameter names for backward compatibility
294
+ if top is not None:
295
+ top_expand = top
296
+ if bottom is not None:
297
+ bottom_expand = bottom
298
+
299
+ # Apply absolute expansions first
300
+ new_x0 -= left
301
+ new_x1 += right
302
+ new_top -= top_expand # Expand upward (decrease top coordinate)
303
+ new_bottom += bottom_expand # Expand downward (increase bottom coordinate)
304
+
305
+ # Apply percentage factors if provided
306
+ if width_factor != 1.0 or height_factor != 1.0:
307
+ # Calculate center point *after* absolute expansion
308
+ center_x = (new_x0 + new_x1) / 2
309
+ center_y = (new_top + new_bottom) / 2
310
+
311
+ # Calculate current width and height *after* absolute expansion
312
+ current_width = new_x1 - new_x0
313
+ current_height = new_bottom - new_top
314
+
315
+ # Calculate new width and height
316
+ new_width = current_width * width_factor
317
+ new_height = current_height * height_factor
318
+
319
+ # Adjust coordinates based on the new dimensions, keeping the center
320
+ new_x0 = center_x - new_width / 2
321
+ new_x1 = center_x + new_width / 2
322
+ new_top = center_y - new_height / 2
323
+ new_bottom = center_y + new_height / 2
324
+
325
+ # Clamp coordinates to page boundaries
326
+ new_x0 = max(0, new_x0)
327
+ new_top = max(0, new_top)
328
+ new_x1 = min(self.page.width, new_x1)
329
+ new_bottom = min(self.page.height, new_bottom)
330
+
331
+ # Ensure coordinates are valid (x0 <= x1, top <= bottom)
332
+ if new_x0 > new_x1: new_x0 = new_x1 = (new_x0 + new_x1) / 2
333
+ if new_top > new_bottom: new_top = new_bottom = (new_top + new_bottom) / 2
334
+
335
+ # Create new region with expanded bbox
336
+ from natural_pdf.elements.region import Region
337
+ new_region = Region(self.page, (new_x0, new_top, new_x1, new_bottom))
338
+
339
+ return new_region
340
+
260
341
 
261
342
  class Element(DirectionalMixin):
262
343
  """
@@ -415,7 +496,8 @@ class Element(DirectionalMixin):
415
496
  candidates = candidates[:limit] if limit else candidates
416
497
 
417
498
  # Find matching elements
418
- matches = self.page.filter_elements(candidates, selector, **kwargs)
499
+ from natural_pdf.elements.collections import ElementCollection
500
+ matches = ElementCollection(candidates).find_all(selector, **kwargs)
419
501
  return matches[0] if matches else None
420
502
  elif idx + 1 < len(all_elements):
421
503
  # No selector, just return the next element
@@ -449,16 +531,17 @@ class Element(DirectionalMixin):
449
531
 
450
532
  # Search for previous matching element
451
533
  if selector:
452
- # Filter elements before this one
534
+ # Select elements before this one
453
535
  candidates = all_elements[:idx]
454
- # Reverse to start from closest to this element
536
+ # Reverse to search backwards from the current element
455
537
  candidates = candidates[::-1]
456
538
  # Limit search range for performance
457
539
  candidates = candidates[:limit] if limit else candidates
458
540
 
459
- # Find matching elements
460
- matches = self.page.filter_elements(candidates, selector, **kwargs)
461
- return matches[0] if matches else None
541
+ # Find matching elements using ElementCollection
542
+ from natural_pdf.elements.collections import ElementCollection
543
+ matches = ElementCollection(candidates).find_all(selector, **kwargs)
544
+ return matches[0] if matches else None # find_all returns a collection
462
545
  elif idx > 0:
463
546
  # No selector, just return the previous element
464
547
  return all_elements[idx - 1]
@@ -737,8 +820,9 @@ class Element(DirectionalMixin):
737
820
  Returns:
738
821
  First matching element or None
739
822
  """
740
- # Create a temporary region from this element's bounds
741
823
  from natural_pdf.elements.region import Region
824
+
825
+ # Create a temporary region from this element's bounds
742
826
  temp_region = Region(self.page, self.bbox)
743
827
  return temp_region.find(selector, apply_exclusions=apply_exclusions, **kwargs)
744
828
 
@@ -755,7 +839,8 @@ class Element(DirectionalMixin):
755
839
  Returns:
756
840
  ElementCollection with matching elements
757
841
  """
758
- # Create a temporary region from this element's bounds
759
842
  from natural_pdf.elements.region import Region
843
+
844
+ # Create a temporary region from this element's bounds
760
845
  temp_region = Region(self.page, self.bbox)
761
846
  return temp_region.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
@@ -2,6 +2,7 @@ import logging
2
2
 
3
3
  from typing import List, Optional, Dict, Any, Union, Callable, TypeVar, Generic, Iterator, Tuple, TYPE_CHECKING
4
4
  from natural_pdf.ocr import OCROptions
5
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
5
6
 
6
7
  logger = logging.getLogger(__name__)
7
8
 
@@ -882,6 +883,61 @@ class ElementCollection(Generic[T]):
882
883
  logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
883
884
  return None
884
885
 
886
+ def find_all(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> 'ElementCollection[T]':
887
+ """
888
+ Filter elements within this collection matching the selector.
889
+
890
+ Args:
891
+ selector: CSS-like selector string.
892
+ regex: Whether to use regex for text search in :contains (default: False).
893
+ case: Whether to do case-sensitive text search (default: True).
894
+ **kwargs: Additional filter parameters passed to the selector function.
895
+
896
+ Returns:
897
+ A new ElementCollection containing only the matching elements from this collection.
898
+ """
899
+ if not self._elements:
900
+ return ElementCollection([])
901
+
902
+ try:
903
+ selector_obj = parse_selector(selector)
904
+ except Exception as e:
905
+ logger.error(f"Error parsing selector '{selector}': {e}")
906
+ return ElementCollection([]) # Return empty on parse error
907
+
908
+ # Pass regex and case flags to selector function generator
909
+ kwargs['regex'] = regex
910
+ kwargs['case'] = case
911
+
912
+ try:
913
+ filter_func = selector_to_filter_func(selector_obj, **kwargs)
914
+ except Exception as e:
915
+ logger.error(f"Error creating filter function for selector '{selector}': {e}")
916
+ return ElementCollection([]) # Return empty on filter creation error
917
+
918
+ matching_elements = [element for element in self._elements if filter_func(element)]
919
+
920
+ # Note: Unlike Page.find_all, this doesn't re-sort.
921
+ # Sorting should be done explicitly on the collection if needed.
922
+
923
+ return ElementCollection(matching_elements)
924
+
925
+ def find(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> Optional[T]:
926
+ """
927
+ Find the first element within this collection matching the selector.
928
+
929
+ Args:
930
+ selector: CSS-like selector string.
931
+ regex: Whether to use regex for text search in :contains (default: False).
932
+ case: Whether to do case-sensitive text search (default: True).
933
+ **kwargs: Additional filter parameters passed to the selector function.
934
+
935
+ Returns:
936
+ The first matching element or None.
937
+ """
938
+ results = self.find_all(selector, regex=regex, case=case, **kwargs)
939
+ return results.first
940
+
885
941
  class PageCollection(Generic[P]):
886
942
  """
887
943
  A collection of PDF pages with cross-page operations.