natural-pdf 0.2.1.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@
3
3
  import json
4
4
  import logging
5
5
  from collections import UserList
6
- from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
6
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union
7
7
 
8
8
  import numpy as np
9
9
  from PIL import Image, ImageDraw
@@ -16,6 +16,7 @@ if TYPE_CHECKING:
16
16
  from natural_pdf.elements.element_collection import ElementCollection
17
17
  from natural_pdf.elements.region import Region
18
18
  from natural_pdf.flows.region import FlowRegion
19
+ from natural_pdf.tables.result import TableResult
19
20
 
20
21
  logger = logging.getLogger(__name__)
21
22
 
@@ -131,6 +132,15 @@ class GuidesList(UserList):
131
132
  self._parent = parent_guides
132
133
  self._axis = axis
133
134
 
135
+ def __getitem__(self, i):
136
+ """Override to handle slicing properly."""
137
+ if isinstance(i, slice):
138
+ # Return a new GuidesList with the sliced data
139
+ return self.__class__(self._parent, self._axis, self.data[i])
140
+ else:
141
+ # For single index, return the value directly
142
+ return self.data[i]
143
+
134
144
  def from_content(
135
145
  self,
136
146
  markers: Union[str, List[str], "ElementCollection", None],
@@ -140,6 +150,7 @@ class GuidesList(UserList):
140
150
  tolerance: float = 5,
141
151
  *,
142
152
  append: bool = False,
153
+ apply_exclusions: bool = True,
143
154
  ) -> "Guides":
144
155
  """
145
156
  Create guides from content markers and add to this axis.
@@ -154,6 +165,7 @@ class GuidesList(UserList):
154
165
  align: How to align guides relative to found elements
155
166
  outer: Whether to add outer boundary guides
156
167
  tolerance: Tolerance for snapping to element edges
168
+ apply_exclusions: Whether to apply exclusion zones when searching for text
157
169
 
158
170
  Returns:
159
171
  Parent Guides object for chaining
@@ -178,6 +190,7 @@ class GuidesList(UserList):
178
190
  align=align,
179
191
  outer=outer,
180
192
  tolerance=tolerance,
193
+ apply_exclusions=apply_exclusions,
181
194
  )
182
195
 
183
196
  # Collect guides from this region
@@ -260,6 +273,7 @@ class GuidesList(UserList):
260
273
  align=align,
261
274
  outer=outer,
262
275
  tolerance=tolerance,
276
+ apply_exclusions=apply_exclusions,
263
277
  )
264
278
 
265
279
  # Replace or append based on parameter
@@ -1398,6 +1412,7 @@ class Guides:
1398
1412
  align: Literal["left", "right", "center", "between"] = "left",
1399
1413
  outer: bool = True,
1400
1414
  tolerance: float = 5,
1415
+ apply_exclusions: bool = True,
1401
1416
  ) -> "Guides":
1402
1417
  """
1403
1418
  Create guides based on text content positions.
@@ -1413,6 +1428,7 @@ class Guides:
1413
1428
  align: Where to place guides relative to found text
1414
1429
  outer: Whether to add guides at the boundaries
1415
1430
  tolerance: Maximum distance to search for text
1431
+ apply_exclusions: Whether to apply exclusion zones when searching for text
1416
1432
 
1417
1433
  Returns:
1418
1434
  New Guides object aligned to text content
@@ -1431,6 +1447,7 @@ class Guides:
1431
1447
  align=align,
1432
1448
  outer=outer,
1433
1449
  tolerance=tolerance,
1450
+ apply_exclusions=apply_exclusions,
1434
1451
  )
1435
1452
 
1436
1453
  # Store in flow guides
@@ -1469,7 +1486,7 @@ class Guides:
1469
1486
  # Find each marker and determine guide position
1470
1487
  for marker in marker_texts:
1471
1488
  if hasattr(obj, "find"):
1472
- element = obj.find(f'text:contains("{marker}")')
1489
+ element = obj.find(f'text:contains("{marker}")', apply_exclusions=apply_exclusions)
1473
1490
  if element:
1474
1491
  if axis == "vertical":
1475
1492
  if align == "left":
@@ -1498,7 +1515,9 @@ class Guides:
1498
1515
  marker_bounds = []
1499
1516
  for marker in marker_texts:
1500
1517
  if hasattr(obj, "find"):
1501
- element = obj.find(f'text:contains("{marker}")')
1518
+ element = obj.find(
1519
+ f'text:contains("{marker}")', apply_exclusions=apply_exclusions
1520
+ )
1502
1521
  if element:
1503
1522
  if axis == "vertical":
1504
1523
  marker_bounds.append((element.x0, element.x1))
@@ -3285,6 +3304,7 @@ class Guides:
3285
3304
  align: Literal["left", "right", "center", "between"] = "left",
3286
3305
  outer: bool = True,
3287
3306
  tolerance: float = 5,
3307
+ apply_exclusions: bool = True,
3288
3308
  ) -> "Guides":
3289
3309
  """
3290
3310
  Instance method: Add guides from content, allowing chaining.
@@ -3301,6 +3321,7 @@ class Guides:
3301
3321
  align: How to align guides relative to found elements
3302
3322
  outer: Whether to add outer boundary guides
3303
3323
  tolerance: Tolerance for snapping to element edges
3324
+ apply_exclusions: Whether to apply exclusion zones when searching for text
3304
3325
 
3305
3326
  Returns:
3306
3327
  Self for method chaining
@@ -3318,6 +3339,7 @@ class Guides:
3318
3339
  align=align,
3319
3340
  outer=outer,
3320
3341
  tolerance=tolerance,
3342
+ apply_exclusions=apply_exclusions,
3321
3343
  )
3322
3344
 
3323
3345
  # Add the appropriate coordinates to this object
@@ -3421,6 +3443,140 @@ class Guides:
3421
3443
 
3422
3444
  return self
3423
3445
 
3446
+ def extract_table(
3447
+ self,
3448
+ target: Optional[Union["Page", "Region"]] = None,
3449
+ source: str = "guides_temp",
3450
+ cell_padding: float = 0.5,
3451
+ include_outer_boundaries: bool = False,
3452
+ method: Optional[str] = None,
3453
+ table_settings: Optional[dict] = None,
3454
+ use_ocr: bool = False,
3455
+ ocr_config: Optional[dict] = None,
3456
+ text_options: Optional[Dict] = None,
3457
+ cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
3458
+ show_progress: bool = False,
3459
+ content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
3460
+ *,
3461
+ multi_page: Literal["auto", True, False] = "auto",
3462
+ ) -> "TableResult":
3463
+ """
3464
+ Extract table data directly from guides without leaving temporary regions.
3465
+
3466
+ This method:
3467
+ 1. Creates table structure using build_grid()
3468
+ 2. Extracts table data from the created table region
3469
+ 3. Cleans up all temporary regions
3470
+ 4. Returns the TableResult
3471
+
3472
+ Args:
3473
+ target: Page or Region to create regions on (uses self.context if None)
3474
+ source: Source label for temporary regions (will be cleaned up)
3475
+ cell_padding: Internal padding for cell regions in points
3476
+ include_outer_boundaries: Whether to add boundaries at edges if missing
3477
+ method: Table extraction method ('tatr', 'pdfplumber', 'text', etc.)
3478
+ table_settings: Settings for pdfplumber table extraction
3479
+ use_ocr: Whether to use OCR for text extraction
3480
+ ocr_config: OCR configuration parameters
3481
+ text_options: Dictionary of options for the 'text' method
3482
+ cell_extraction_func: Optional callable for custom cell text extraction
3483
+ show_progress: Controls progress bar for text method
3484
+ content_filter: Content filtering function or patterns
3485
+ multi_page: Controls multi-region table creation for FlowRegions
3486
+
3487
+ Returns:
3488
+ TableResult: Extracted table data
3489
+
3490
+ Raises:
3491
+ ValueError: If no table region is created from the guides
3492
+
3493
+ Example:
3494
+ ```python
3495
+ from natural_pdf.analyzers import Guides
3496
+
3497
+ # Create guides from detected lines
3498
+ guides = Guides.from_lines(page, source_label="detected")
3499
+
3500
+ # Extract table directly - no temporary regions left behind
3501
+ table_data = guides.extract_table()
3502
+
3503
+ # Convert to pandas DataFrame
3504
+ df = table_data.to_df()
3505
+ ```
3506
+ """
3507
+ target_obj = target or self.context
3508
+ if not target_obj:
3509
+ raise ValueError("No target object available. Provide target parameter or context.")
3510
+
3511
+ # Get the page for cleanup later
3512
+ if hasattr(target_obj, "x0") and hasattr(target_obj, "top"): # Region
3513
+ page = target_obj._page
3514
+ element_manager = page._element_mgr
3515
+ elif hasattr(target_obj, "_element_mgr"): # Page
3516
+ page = target_obj
3517
+ element_manager = page._element_mgr
3518
+ else:
3519
+ raise ValueError(f"Target object {target_obj} is not a Page or Region")
3520
+
3521
+ try:
3522
+ # Step 1: Build grid structure (creates temporary regions)
3523
+ grid_result = self.build_grid(
3524
+ target=target_obj,
3525
+ source=source,
3526
+ cell_padding=cell_padding,
3527
+ include_outer_boundaries=include_outer_boundaries,
3528
+ multi_page=multi_page,
3529
+ )
3530
+
3531
+ # Step 2: Get the table region and extract table data
3532
+ table_region = grid_result["regions"]["table"]
3533
+ if table_region is None:
3534
+ raise ValueError(
3535
+ "No table region was created from the guides. Check that you have both vertical and horizontal guides."
3536
+ )
3537
+
3538
+ # Handle multi-page case where table_region might be a list
3539
+ if isinstance(table_region, list):
3540
+ if not table_region:
3541
+ raise ValueError("No table regions were created from the guides.")
3542
+ # Use the first table region for extraction
3543
+ table_region = table_region[0]
3544
+
3545
+ # Step 3: Extract table data using the region's extract_table method
3546
+ table_result = table_region.extract_table(
3547
+ method=method,
3548
+ table_settings=table_settings,
3549
+ use_ocr=use_ocr,
3550
+ ocr_config=ocr_config,
3551
+ text_options=text_options,
3552
+ cell_extraction_func=cell_extraction_func,
3553
+ show_progress=show_progress,
3554
+ content_filter=content_filter,
3555
+ )
3556
+
3557
+ return table_result
3558
+
3559
+ finally:
3560
+ # Step 4: Clean up all temporary regions created by build_grid
3561
+ # This ensures no regions are left behind regardless of success/failure
3562
+ try:
3563
+ regions_to_remove = [
3564
+ r
3565
+ for r in element_manager.regions
3566
+ if getattr(r, "source", None) == source
3567
+ and getattr(r, "region_type", None)
3568
+ in {"table", "table_row", "table_column", "table_cell"}
3569
+ ]
3570
+
3571
+ for region in regions_to_remove:
3572
+ element_manager.remove_element(region, element_type="regions")
3573
+
3574
+ if regions_to_remove:
3575
+ logger.debug(f"Cleaned up {len(regions_to_remove)} temporary regions")
3576
+
3577
+ except Exception as cleanup_err:
3578
+ logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
3579
+
3424
3580
  def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
3425
3581
  """Determines if a FlowRegion's constituent parts are arranged vertically or horizontally."""
3426
3582
  if not self.is_flow_region or len(self.context.constituent_regions) < 2:
@@ -29,9 +29,22 @@ class DirectionalCollectionMixin:
29
29
  """Find regions to the right of all elements in this collection."""
30
30
  return self.apply(lambda element: element.right(**kwargs))
31
31
 
32
- def expand(self, **kwargs) -> "ElementCollection":
33
- """Expand all elements in this collection."""
34
- return self.apply(lambda element: element.expand(**kwargs))
32
+ def expand(self, *args, **kwargs) -> "ElementCollection":
33
+ """Expand all elements in this collection.
34
+
35
+ Args:
36
+ *args: If a single positional argument is provided, expands all elements
37
+ by that amount in all directions.
38
+ **kwargs: Keyword arguments for directional expansion (left, right, top, bottom, etc.)
39
+
40
+ Examples:
41
+ # Expand all elements by 5 pixels in all directions
42
+ collection.expand(5)
43
+
44
+ # Expand with different amounts in each direction
45
+ collection.expand(left=10, right=5, top=3, bottom=7)
46
+ """
47
+ return self.apply(lambda element: element.expand(*args, **kwargs))
35
48
 
36
49
 
37
50
  class ApplyMixin:
@@ -335,6 +335,7 @@ class HighlightContext:
335
335
  self.show_on_exit = show_on_exit
336
336
  self.highlight_groups = []
337
337
  self._color_manager = ColorManager()
338
+ self._exit_image = None # Store image for Jupyter display
338
339
 
339
340
  def add(
340
341
  self,
@@ -421,6 +422,11 @@ class HighlightContext:
421
422
  )
422
423
  return None
423
424
 
425
+ @property
426
+ def image(self) -> Optional[Image.Image]:
427
+ """Get the last generated image (useful after context exit)."""
428
+ return self._exit_image
429
+
424
430
  def __enter__(self) -> "HighlightContext":
425
431
  """Enter the context."""
426
432
  return self
@@ -428,7 +434,25 @@ class HighlightContext:
428
434
  def __exit__(self, exc_type, exc_val, exc_tb):
429
435
  """Exit the context, optionally showing highlights."""
430
436
  if self.show_on_exit and not exc_type:
431
- self.show()
437
+ self._exit_image = self.show()
438
+
439
+ # Check if we're in a Jupyter/IPython environment
440
+ try:
441
+ # Try to get IPython instance
442
+ from IPython import get_ipython
443
+
444
+ ipython = get_ipython()
445
+ if ipython is not None:
446
+ # We're in IPython/Jupyter
447
+ from IPython.display import display
448
+
449
+ if self._exit_image is not None:
450
+ display(self._exit_image)
451
+ except (ImportError, NameError):
452
+ # Not in Jupyter or IPython not available - that's OK
453
+ pass
454
+
455
+ # __exit__ must return False to not suppress exceptions
432
456
  return False
433
457
 
434
458
 
@@ -689,7 +713,7 @@ class HighlightingService:
689
713
  logger.debug(f"Added highlight to page {page_index}: {highlight}")
690
714
 
691
715
  # --- Invalidate page-level image cache --------------------------------
692
- # The Page.to_image method maintains an internal cache keyed by rendering
716
+ # The Page.render method maintains an internal cache keyed by rendering
693
717
  # parameters. Because the cache key currently does **not** incorporate
694
718
  # any information about the highlights themselves, it can return stale
695
719
  # images after highlights are added or removed. To ensure the next
@@ -700,11 +724,11 @@ class HighlightingService:
700
724
  if hasattr(page_obj, "_to_image_cache"):
701
725
  page_obj._to_image_cache.clear()
702
726
  logger.debug(
703
- f"Cleared cached to_image renders for page {page_index} after adding a highlight."
727
+ f"Cleared cached render images for page {page_index} after adding a highlight."
704
728
  )
705
729
  except Exception as cache_err: # pragma: no cover – never fail highlight creation
706
730
  logger.warning(
707
- f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
731
+ f"Failed to invalidate render cache for page {page_index}: {cache_err}",
708
732
  exc_info=True,
709
733
  )
710
734
 
@@ -737,11 +761,11 @@ class HighlightingService:
737
761
  if hasattr(page_obj, "_to_image_cache"):
738
762
  page_obj._to_image_cache.clear()
739
763
  logger.debug(
740
- f"Cleared cached to_image renders for page {page_index} after removing highlights."
764
+ f"Cleared cached render images for page {page_index} after removing highlights."
741
765
  )
742
766
  except Exception as cache_err: # pragma: no cover
743
767
  logger.warning(
744
- f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
768
+ f"Failed to invalidate render cache for page {page_index}: {cache_err}",
745
769
  exc_info=True,
746
770
  )
747
771
 
@@ -760,7 +784,7 @@ class HighlightingService:
760
784
  labels: bool = True,
761
785
  legend_position: str = "right",
762
786
  render_ocr: bool = False,
763
- **kwargs, # Pass other args to pdfplumber.page.to_image if needed
787
+ **kwargs, # Pass other args to pdfplumber.page.to_image if needed (internal API)
764
788
  ) -> Optional[Image.Image]:
765
789
  """
766
790
  Renders a specific page with its highlights.
@@ -773,7 +797,7 @@ class HighlightingService:
773
797
  labels: Whether to include a legend for highlights.
774
798
  legend_position: Position of the legend.
775
799
  render_ocr: Whether to render OCR text on the image.
776
- kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
800
+ kwargs: Additional keyword arguments for pdfplumber's internal page.to_image (e.g., width, height).
777
801
 
778
802
  Returns:
779
803
  A PIL Image object of the rendered page, or None if rendering fails.
@@ -957,7 +981,7 @@ class HighlightingService:
957
981
  crop_bbox: Optional bounding box (x0, top, x1, bottom) in PDF coordinate
958
982
  space to crop the output image to, before legends or other overlays are
959
983
  applied. If None, no cropping is performed.
960
- **kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
984
+ **kwargs: Additional args for pdfplumber's internal to_image (e.g., width, height).
961
985
 
962
986
  Returns:
963
987
  PIL Image of the preview, or None if rendering fails.
natural_pdf/core/page.py CHANGED
@@ -341,6 +341,26 @@ class Page(
341
341
  for elem in elements:
342
342
  spec.add_highlight(element=elem, color=group_color, label=group_label)
343
343
 
344
+ # Handle exclusions visualization
345
+ exclusions_param = kwargs.get("exclusions")
346
+ if exclusions_param:
347
+ # Get exclusion regions
348
+ exclusion_regions = self._get_exclusion_regions(include_callable=True)
349
+
350
+ if exclusion_regions:
351
+ # Determine color for exclusions
352
+ exclusion_color = (
353
+ exclusions_param if isinstance(exclusions_param, str) else "red"
354
+ )
355
+
356
+ # Add exclusion regions as highlights
357
+ for region in exclusion_regions:
358
+ spec.add_highlight(
359
+ element=region,
360
+ color=exclusion_color,
361
+ label=f"Exclusion: {region.label or 'unnamed'}",
362
+ )
363
+
344
364
  return [spec]
345
365
 
346
366
  @property
@@ -391,7 +411,9 @@ class Page(
391
411
 
392
412
  def add_exclusion(
393
413
  self,
394
- exclusion_func_or_region: Union[Callable[["Page"], "Region"], "Region", Any],
414
+ exclusion_func_or_region: Union[
415
+ Callable[["Page"], "Region"], "Region", List[Any], Tuple[Any, ...], Any
416
+ ],
395
417
  label: Optional[str] = None,
396
418
  method: str = "region",
397
419
  ) -> "Page":
@@ -401,7 +423,8 @@ class Page(
401
423
 
402
424
  Args:
403
425
  exclusion_func_or_region: Either a callable function returning a Region,
404
- a Region object, or another object with a valid .bbox attribute.
426
+ a Region object, a list/tuple of regions or elements,
427
+ or another object with a valid .bbox attribute.
405
428
  label: Optional label for this exclusion (e.g., 'header', 'footer').
406
429
  method: Exclusion method - 'region' (exclude all elements in bounding box) or
407
430
  'element' (exclude only the specific elements). Default: 'region'.
@@ -551,10 +574,53 @@ class Page(
551
574
  raise TypeError(
552
575
  f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}"
553
576
  ) from e
577
+ elif isinstance(exclusion_func_or_region, (list, tuple)):
578
+ # Handle lists/tuples of regions or elements
579
+ if not exclusion_func_or_region:
580
+ logger.warning(f"Page {self.index}: Empty list provided for exclusion, ignoring.")
581
+ return self
582
+
583
+ if method == "element":
584
+ # Store each element directly
585
+ for item in exclusion_func_or_region:
586
+ if hasattr(item, "bbox") and len(getattr(item, "bbox", [])) == 4:
587
+ self._exclusions.append((item, label, method))
588
+ logger.debug(
589
+ f"Page {self.index}: Added element exclusion from list -> {item}"
590
+ )
591
+ else:
592
+ logger.warning(
593
+ f"Page {self.index}: Skipping item without valid bbox in list: {item}"
594
+ )
595
+ else: # method == "region"
596
+ # Convert each item to a Region and add
597
+ for item in exclusion_func_or_region:
598
+ try:
599
+ if isinstance(item, Region):
600
+ item.label = label
601
+ self._exclusions.append((item, label, method))
602
+ logger.debug(f"Page {self.index}: Added Region from list: {item}")
603
+ elif hasattr(item, "bbox") and len(getattr(item, "bbox", [])) == 4:
604
+ bbox_coords = tuple(float(v) for v in item.bbox)
605
+ region = Region(self, bbox_coords, label=label)
606
+ self._exclusions.append((region, label, method))
607
+ logger.debug(
608
+ f"Page {self.index}: Added exclusion region from list item {bbox_coords}"
609
+ )
610
+ else:
611
+ logger.warning(
612
+ f"Page {self.index}: Skipping item without valid bbox in list: {item}"
613
+ )
614
+ except Exception as e:
615
+ logger.error(
616
+ f"Page {self.index}: Failed to convert list item to Region: {e}"
617
+ )
618
+ continue
619
+ return self
554
620
  else:
555
621
  # Reject invalid types
556
622
  raise TypeError(
557
- f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute."
623
+ f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, list/tuple of regions/elements, or have a valid .bbox attribute."
558
624
  )
559
625
 
560
626
  # Append the stored data (tuple of object/callable, label, and method)
@@ -668,6 +734,46 @@ class Page(
668
734
  regions.append(region_result)
669
735
  if debug:
670
736
  print(f" ✓ Added region from callable '{label}': {region_result}")
737
+ elif hasattr(region_result, "__iter__") and hasattr(region_result, "__len__"):
738
+ # Handle ElementCollection or other iterables
739
+ from natural_pdf.elements.element_collection import ElementCollection
740
+
741
+ if isinstance(region_result, ElementCollection) or (
742
+ hasattr(region_result, "__iter__") and region_result
743
+ ):
744
+ if debug:
745
+ print(
746
+ f" Converting {type(region_result)} with {len(region_result)} elements to regions..."
747
+ )
748
+
749
+ # Convert each element to a region
750
+ for elem in region_result:
751
+ try:
752
+ if hasattr(elem, "bbox") and len(elem.bbox) == 4:
753
+ bbox_coords = tuple(float(v) for v in elem.bbox)
754
+ region = Region(self, bbox_coords, label=label)
755
+ regions.append(region)
756
+ if debug:
757
+ print(
758
+ f" ✓ Added region from element: {bbox_coords}"
759
+ )
760
+ else:
761
+ if debug:
762
+ print(
763
+ f" ✗ Skipping element without valid bbox: {elem}"
764
+ )
765
+ except Exception as e:
766
+ if debug:
767
+ print(f" ✗ Failed to convert element to region: {e}")
768
+ continue
769
+
770
+ if debug and len(region_result) > 0:
771
+ print(
772
+ f" ✓ Converted {len(region_result)} elements from callable '{label}'"
773
+ )
774
+ else:
775
+ if debug:
776
+ print(f" ✗ Empty iterable returned from callable '{label}'")
671
777
  elif region_result:
672
778
  logger.warning(
673
779
  f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping."
@@ -1013,6 +1119,22 @@ class Page(
1013
1119
  "Cannot sort elements in reading order: Missing required attributes (top, x0)."
1014
1120
  )
1015
1121
 
1122
+ # Handle collection-level pseudo-classes (:first, :last) for OR selectors
1123
+ # Note: We only apply :first/:last if they appear in any of the sub-selectors
1124
+ has_first = False
1125
+ has_last = False
1126
+ for sub_selector in selector_obj.get("selectors", []):
1127
+ for pseudo in sub_selector.get("pseudo_classes", []):
1128
+ if pseudo.get("name") == "first":
1129
+ has_first = True
1130
+ elif pseudo.get("name") == "last":
1131
+ has_last = True
1132
+
1133
+ if has_first:
1134
+ matching_elements = matching_elements[:1] if matching_elements else []
1135
+ elif has_last:
1136
+ matching_elements = matching_elements[-1:] if matching_elements else []
1137
+
1016
1138
  # Return result collection
1017
1139
  return ElementCollection(matching_elements)
1018
1140
 
@@ -1134,6 +1256,15 @@ class Page(
1134
1256
  "Cannot sort elements in reading order: Missing required attributes (top, x0)."
1135
1257
  )
1136
1258
 
1259
+ # Handle collection-level pseudo-classes (:first, :last)
1260
+ for pseudo in selector_obj.get("pseudo_classes", []):
1261
+ name = pseudo.get("name")
1262
+
1263
+ if name == "first":
1264
+ matching_elements = matching_elements[:1] if matching_elements else []
1265
+ elif name == "last":
1266
+ matching_elements = matching_elements[-1:] if matching_elements else []
1267
+
1137
1268
  # Create result collection - exclusions are handled by the calling methods (find, find_all)
1138
1269
  result = ElementCollection(matching_elements)
1139
1270
 
@@ -1845,7 +1976,7 @@ class Page(
1845
1976
  """Get all line elements on this page."""
1846
1977
  return self._element_mgr.lines
1847
1978
 
1848
- def highlight(
1979
+ def add_highlight(
1849
1980
  self,
1850
1981
  bbox: Optional[Tuple[float, float, float, float]] = None,
1851
1982
  color: Optional[Union[Tuple, str]] = None,
@@ -1856,7 +1987,7 @@ class Page(
1856
1987
  existing: str = "append",
1857
1988
  ) -> "Page":
1858
1989
  """
1859
- Highlight a bounding box or the entire page.
1990
+ Add a highlight to a bounding box or the entire page.
1860
1991
  Delegates to the central HighlightingService.
1861
1992
 
1862
1993
  Args:
@@ -1884,7 +2015,7 @@ class Page(
1884
2015
  )
1885
2016
  return self
1886
2017
 
1887
- def highlight_polygon(
2018
+ def add_highlight_polygon(
1888
2019
  self,
1889
2020
  polygon: List[Tuple[float, float]],
1890
2021
  color: Optional[Union[Tuple, str]] = None,
@@ -1944,7 +2075,7 @@ class Page(
1944
2075
  render_ocr: Whether to render OCR text.
1945
2076
  include_highlights: Whether to render highlights.
1946
2077
  resolution: Resolution in DPI for base image rendering (default: 144 DPI, equivalent to previous scale=2.0).
1947
- **kwargs: Additional args for pdfplumber's to_image.
2078
+ **kwargs: Additional args for pdfplumber's internal to_image.
1948
2079
 
1949
2080
  Returns:
1950
2081
  Self for method chaining.