natural-pdf 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@
3
3
  import json
4
4
  import logging
5
5
  from collections import UserList
6
- from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
6
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union
7
7
 
8
8
  import numpy as np
9
9
  from PIL import Image, ImageDraw
@@ -16,6 +16,7 @@ if TYPE_CHECKING:
16
16
  from natural_pdf.elements.element_collection import ElementCollection
17
17
  from natural_pdf.elements.region import Region
18
18
  from natural_pdf.flows.region import FlowRegion
19
+ from natural_pdf.tables.result import TableResult
19
20
 
20
21
  logger = logging.getLogger(__name__)
21
22
 
@@ -131,6 +132,15 @@ class GuidesList(UserList):
131
132
  self._parent = parent_guides
132
133
  self._axis = axis
133
134
 
135
def __getitem__(self, i):
    """
    Index or slice the guide coordinates held in ``self.data``.

    Args:
        i: An integer index or a ``slice`` object.

    Returns:
        For a slice, a new instance of this same class built from this
        list's parent and axis plus the sliced data. For any other index,
        the stored value itself.
    """
    if not isinstance(i, slice):
        # Plain index: hand back the stored value unchanged.
        return self.data[i]

    # Slice: wrap the selected values in a new list tied to the same parent/axis.
    sliced_values = self.data[i]
    return self.__class__(self._parent, self._axis, sliced_values)
143
+
134
144
  def from_content(
135
145
  self,
136
146
  markers: Union[str, List[str], "ElementCollection", None],
@@ -140,6 +150,7 @@ class GuidesList(UserList):
140
150
  tolerance: float = 5,
141
151
  *,
142
152
  append: bool = False,
153
+ apply_exclusions: bool = True,
143
154
  ) -> "Guides":
144
155
  """
145
156
  Create guides from content markers and add to this axis.
@@ -154,6 +165,7 @@ class GuidesList(UserList):
154
165
  align: How to align guides relative to found elements
155
166
  outer: Whether to add outer boundary guides
156
167
  tolerance: Tolerance for snapping to element edges
168
+ apply_exclusions: Whether to apply exclusion zones when searching for text
157
169
 
158
170
  Returns:
159
171
  Parent Guides object for chaining
@@ -178,6 +190,7 @@ class GuidesList(UserList):
178
190
  align=align,
179
191
  outer=outer,
180
192
  tolerance=tolerance,
193
+ apply_exclusions=apply_exclusions,
181
194
  )
182
195
 
183
196
  # Collect guides from this region
@@ -260,6 +273,7 @@ class GuidesList(UserList):
260
273
  align=align,
261
274
  outer=outer,
262
275
  tolerance=tolerance,
276
+ apply_exclusions=apply_exclusions,
263
277
  )
264
278
 
265
279
  # Replace or append based on parameter
@@ -1398,6 +1412,7 @@ class Guides:
1398
1412
  align: Literal["left", "right", "center", "between"] = "left",
1399
1413
  outer: bool = True,
1400
1414
  tolerance: float = 5,
1415
+ apply_exclusions: bool = True,
1401
1416
  ) -> "Guides":
1402
1417
  """
1403
1418
  Create guides based on text content positions.
@@ -1413,6 +1428,7 @@ class Guides:
1413
1428
  align: Where to place guides relative to found text
1414
1429
  outer: Whether to add guides at the boundaries
1415
1430
  tolerance: Maximum distance to search for text
1431
+ apply_exclusions: Whether to apply exclusion zones when searching for text
1416
1432
 
1417
1433
  Returns:
1418
1434
  New Guides object aligned to text content
@@ -1431,6 +1447,7 @@ class Guides:
1431
1447
  align=align,
1432
1448
  outer=outer,
1433
1449
  tolerance=tolerance,
1450
+ apply_exclusions=apply_exclusions,
1434
1451
  )
1435
1452
 
1436
1453
  # Store in flow guides
@@ -1469,7 +1486,7 @@ class Guides:
1469
1486
  # Find each marker and determine guide position
1470
1487
  for marker in marker_texts:
1471
1488
  if hasattr(obj, "find"):
1472
- element = obj.find(f'text:contains("{marker}")')
1489
+ element = obj.find(f'text:contains("{marker}")', apply_exclusions=apply_exclusions)
1473
1490
  if element:
1474
1491
  if axis == "vertical":
1475
1492
  if align == "left":
@@ -1498,7 +1515,9 @@ class Guides:
1498
1515
  marker_bounds = []
1499
1516
  for marker in marker_texts:
1500
1517
  if hasattr(obj, "find"):
1501
- element = obj.find(f'text:contains("{marker}")')
1518
+ element = obj.find(
1519
+ f'text:contains("{marker}")', apply_exclusions=apply_exclusions
1520
+ )
1502
1521
  if element:
1503
1522
  if axis == "vertical":
1504
1523
  marker_bounds.append((element.x0, element.x1))
@@ -3285,6 +3304,7 @@ class Guides:
3285
3304
  align: Literal["left", "right", "center", "between"] = "left",
3286
3305
  outer: bool = True,
3287
3306
  tolerance: float = 5,
3307
+ apply_exclusions: bool = True,
3288
3308
  ) -> "Guides":
3289
3309
  """
3290
3310
  Instance method: Add guides from content, allowing chaining.
@@ -3301,6 +3321,7 @@ class Guides:
3301
3321
  align: How to align guides relative to found elements
3302
3322
  outer: Whether to add outer boundary guides
3303
3323
  tolerance: Tolerance for snapping to element edges
3324
+ apply_exclusions: Whether to apply exclusion zones when searching for text
3304
3325
 
3305
3326
  Returns:
3306
3327
  Self for method chaining
@@ -3318,6 +3339,7 @@ class Guides:
3318
3339
  align=align,
3319
3340
  outer=outer,
3320
3341
  tolerance=tolerance,
3342
+ apply_exclusions=apply_exclusions,
3321
3343
  )
3322
3344
 
3323
3345
  # Add the appropriate coordinates to this object
@@ -3421,6 +3443,140 @@ class Guides:
3421
3443
 
3422
3444
  return self
3423
3445
 
3446
def extract_table(
    self,
    target: Optional[Union["Page", "Region"]] = None,
    source: str = "guides_temp",
    cell_padding: float = 0.5,
    include_outer_boundaries: bool = False,
    method: Optional[str] = None,
    table_settings: Optional[dict] = None,
    use_ocr: bool = False,
    ocr_config: Optional[dict] = None,
    text_options: Optional[Dict] = None,
    cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
    show_progress: bool = False,
    content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
    *,
    multi_page: Literal["auto", True, False] = "auto",
) -> "TableResult":
    """
    Extract table data directly from guides without leaving temporary regions.

    This method:
    1. Creates table structure using build_grid()
    2. Extracts table data from the created table region
    3. Removes the temporary regions that this call created
    4. Returns the TableResult

    Regions that were already registered on the page before this call are
    never removed, even if they carry the same ``source`` label.

    Args:
        target: Page or Region to create regions on (uses self.context if None)
        source: Source label for temporary regions (will be cleaned up)
        cell_padding: Internal padding for cell regions in points
        include_outer_boundaries: Whether to add boundaries at edges if missing
        method: Table extraction method ('tatr', 'pdfplumber', 'text', etc.)
        table_settings: Settings for pdfplumber table extraction
        use_ocr: Whether to use OCR for text extraction
        ocr_config: OCR configuration parameters
        text_options: Dictionary of options for the 'text' method
        cell_extraction_func: Optional callable for custom cell text extraction
        show_progress: Controls progress bar for text method
        content_filter: Content filtering function or patterns
        multi_page: Controls multi-region table creation for FlowRegions

    Returns:
        TableResult: Extracted table data

    Raises:
        ValueError: If no target is available, the target is neither a Page
            nor a Region, or no table region is created from the guides

    Example:
        ```python
        from natural_pdf.analyzers import Guides

        # Create guides from detected lines
        guides = Guides.from_lines(page, source_label="detected")

        # Extract table directly - no temporary regions left behind
        table_data = guides.extract_table()

        # Convert to pandas DataFrame
        df = table_data.to_df()
        ```
    """
    # Compare against None explicitly: the truthiness of project objects is
    # not guaranteed, so a valid target must never be discarded for being falsy.
    target_obj = target if target is not None else self.context
    if target_obj is None:
        raise ValueError("No target object available. Provide target parameter or context.")

    # Resolve the page whose element manager holds the temporary regions.
    if hasattr(target_obj, "x0") and hasattr(target_obj, "top"):  # Region
        page = target_obj._page
    elif hasattr(target_obj, "_element_mgr"):  # Page
        page = target_obj
    else:
        raise ValueError(f"Target object {target_obj} is not a Page or Region")
    element_manager = page._element_mgr

    temporary_types = {"table", "table_row", "table_column", "table_cell"}

    # Snapshot the regions that exist before build_grid runs so cleanup only
    # touches what this call created. The list keeps the objects alive, which
    # guarantees their id() values cannot be reused by new regions meanwhile.
    try:
        preexisting_regions = list(element_manager.regions)
    except Exception as snapshot_err:
        # Best effort: without a snapshot, cleanup falls back to label matching.
        logger.debug(f"Could not snapshot existing regions: {snapshot_err}")
        preexisting_regions = []
    preexisting_ids = {id(region) for region in preexisting_regions}

    try:
        # Step 1: Build grid structure (creates temporary regions)
        grid_result = self.build_grid(
            target=target_obj,
            source=source,
            cell_padding=cell_padding,
            include_outer_boundaries=include_outer_boundaries,
            multi_page=multi_page,
        )

        # Step 2: Get the table region and extract table data
        table_region = grid_result["regions"]["table"]
        if table_region is None:
            raise ValueError(
                "No table region was created from the guides. Check that you have both vertical and horizontal guides."
            )

        # Handle multi-page case where table_region might be a list
        if isinstance(table_region, list):
            if not table_region:
                raise ValueError("No table regions were created from the guides.")
            if len(table_region) > 1:
                # Make the truncation visible instead of silently dropping data.
                logger.warning(
                    f"build_grid returned {len(table_region)} table regions; "
                    "only the first one is used for extraction."
                )
            # Use the first table region for extraction
            table_region = table_region[0]

        # Step 3: Extract table data using the region's extract_table method
        return table_region.extract_table(
            method=method,
            table_settings=table_settings,
            use_ocr=use_ocr,
            ocr_config=ocr_config,
            text_options=text_options,
            cell_extraction_func=cell_extraction_func,
            show_progress=show_progress,
            content_filter=content_filter,
        )

    finally:
        # Step 4: Clean up the temporary regions created by build_grid.
        # Runs on success and on failure; cleanup errors are logged, not raised,
        # so they never mask the extraction result or the original exception.
        try:
            regions_to_remove = [
                r
                for r in element_manager.regions
                if id(r) not in preexisting_ids
                and getattr(r, "source", None) == source
                and getattr(r, "region_type", None) in temporary_types
            ]

            for region in regions_to_remove:
                element_manager.remove_element(region, element_type="regions")

            if regions_to_remove:
                logger.debug(f"Cleaned up {len(regions_to_remove)} temporary regions")

        except Exception as cleanup_err:
            logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
3579
+
3424
3580
  def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
3425
3581
  """Determines if a FlowRegion's constituent parts are arranged vertically or horizontally."""
3426
3582
  if not self.is_flow_region or len(self.context.constituent_regions) < 2:
@@ -689,7 +689,7 @@ class HighlightingService:
689
689
  logger.debug(f"Added highlight to page {page_index}: {highlight}")
690
690
 
691
691
  # --- Invalidate page-level image cache --------------------------------
692
- # The Page.to_image method maintains an internal cache keyed by rendering
692
+ # The Page.render method maintains an internal cache keyed by rendering
693
693
  # parameters. Because the cache key currently does **not** incorporate
694
694
  # any information about the highlights themselves, it can return stale
695
695
  # images after highlights are added or removed. To ensure the next
@@ -700,11 +700,11 @@ class HighlightingService:
700
700
  if hasattr(page_obj, "_to_image_cache"):
701
701
  page_obj._to_image_cache.clear()
702
702
  logger.debug(
703
- f"Cleared cached to_image renders for page {page_index} after adding a highlight."
703
+ f"Cleared cached render images for page {page_index} after adding a highlight."
704
704
  )
705
705
  except Exception as cache_err: # pragma: no cover – never fail highlight creation
706
706
  logger.warning(
707
- f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
707
+ f"Failed to invalidate render cache for page {page_index}: {cache_err}",
708
708
  exc_info=True,
709
709
  )
710
710
 
@@ -737,11 +737,11 @@ class HighlightingService:
737
737
  if hasattr(page_obj, "_to_image_cache"):
738
738
  page_obj._to_image_cache.clear()
739
739
  logger.debug(
740
- f"Cleared cached to_image renders for page {page_index} after removing highlights."
740
+ f"Cleared cached render images for page {page_index} after removing highlights."
741
741
  )
742
742
  except Exception as cache_err: # pragma: no cover
743
743
  logger.warning(
744
- f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
744
+ f"Failed to invalidate render cache for page {page_index}: {cache_err}",
745
745
  exc_info=True,
746
746
  )
747
747
 
@@ -760,7 +760,7 @@ class HighlightingService:
760
760
  labels: bool = True,
761
761
  legend_position: str = "right",
762
762
  render_ocr: bool = False,
763
- **kwargs, # Pass other args to pdfplumber.page.to_image if needed
763
+ **kwargs, # Pass other args to pdfplumber.page.to_image if needed (internal API)
764
764
  ) -> Optional[Image.Image]:
765
765
  """
766
766
  Renders a specific page with its highlights.
@@ -773,7 +773,7 @@ class HighlightingService:
773
773
  labels: Whether to include a legend for highlights.
774
774
  legend_position: Position of the legend.
775
775
  render_ocr: Whether to render OCR text on the image.
776
- kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
776
+ kwargs: Additional keyword arguments for pdfplumber's internal page.to_image (e.g., width, height).
777
777
 
778
778
  Returns:
779
779
  A PIL Image object of the rendered page, or None if rendering fails.
@@ -957,7 +957,7 @@ class HighlightingService:
957
957
  crop_bbox: Optional bounding box (x0, top, x1, bottom) in PDF coordinate
958
958
  space to crop the output image to, before legends or other overlays are
959
959
  applied. If None, no cropping is performed.
960
- **kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
960
+ **kwargs: Additional args for pdfplumber's internal to_image (e.g., width, height).
961
961
 
962
962
  Returns:
963
963
  PIL Image of the preview, or None if rendering fails.
natural_pdf/core/page.py CHANGED
@@ -341,6 +341,26 @@ class Page(
341
341
  for elem in elements:
342
342
  spec.add_highlight(element=elem, color=group_color, label=group_label)
343
343
 
344
+ # Handle exclusions visualization
345
+ exclusions_param = kwargs.get("exclusions")
346
+ if exclusions_param:
347
+ # Get exclusion regions
348
+ exclusion_regions = self._get_exclusion_regions(include_callable=True)
349
+
350
+ if exclusion_regions:
351
+ # Determine color for exclusions
352
+ exclusion_color = (
353
+ exclusions_param if isinstance(exclusions_param, str) else "red"
354
+ )
355
+
356
+ # Add exclusion regions as highlights
357
+ for region in exclusion_regions:
358
+ spec.add_highlight(
359
+ element=region,
360
+ color=exclusion_color,
361
+ label=f"Exclusion: {region.label or 'unnamed'}",
362
+ )
363
+
344
364
  return [spec]
345
365
 
346
366
  @property
@@ -391,7 +411,9 @@ class Page(
391
411
 
392
412
  def add_exclusion(
393
413
  self,
394
- exclusion_func_or_region: Union[Callable[["Page"], "Region"], "Region", Any],
414
+ exclusion_func_or_region: Union[
415
+ Callable[["Page"], "Region"], "Region", List[Any], Tuple[Any, ...], Any
416
+ ],
395
417
  label: Optional[str] = None,
396
418
  method: str = "region",
397
419
  ) -> "Page":
@@ -401,7 +423,8 @@ class Page(
401
423
 
402
424
  Args:
403
425
  exclusion_func_or_region: Either a callable function returning a Region,
404
- a Region object, or another object with a valid .bbox attribute.
426
+ a Region object, a list/tuple of regions or elements,
427
+ or another object with a valid .bbox attribute.
405
428
  label: Optional label for this exclusion (e.g., 'header', 'footer').
406
429
  method: Exclusion method - 'region' (exclude all elements in bounding box) or
407
430
  'element' (exclude only the specific elements). Default: 'region'.
@@ -551,10 +574,53 @@ class Page(
551
574
  raise TypeError(
552
575
  f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}"
553
576
  ) from e
577
+ elif isinstance(exclusion_func_or_region, (list, tuple)):
578
+ # Handle lists/tuples of regions or elements
579
+ if not exclusion_func_or_region:
580
+ logger.warning(f"Page {self.index}: Empty list provided for exclusion, ignoring.")
581
+ return self
582
+
583
+ if method == "element":
584
+ # Store each element directly
585
+ for item in exclusion_func_or_region:
586
+ if hasattr(item, "bbox") and len(getattr(item, "bbox", [])) == 4:
587
+ self._exclusions.append((item, label, method))
588
+ logger.debug(
589
+ f"Page {self.index}: Added element exclusion from list -> {item}"
590
+ )
591
+ else:
592
+ logger.warning(
593
+ f"Page {self.index}: Skipping item without valid bbox in list: {item}"
594
+ )
595
+ else: # method == "region"
596
+ # Convert each item to a Region and add
597
+ for item in exclusion_func_or_region:
598
+ try:
599
+ if isinstance(item, Region):
600
+ item.label = label
601
+ self._exclusions.append((item, label, method))
602
+ logger.debug(f"Page {self.index}: Added Region from list: {item}")
603
+ elif hasattr(item, "bbox") and len(getattr(item, "bbox", [])) == 4:
604
+ bbox_coords = tuple(float(v) for v in item.bbox)
605
+ region = Region(self, bbox_coords, label=label)
606
+ self._exclusions.append((region, label, method))
607
+ logger.debug(
608
+ f"Page {self.index}: Added exclusion region from list item {bbox_coords}"
609
+ )
610
+ else:
611
+ logger.warning(
612
+ f"Page {self.index}: Skipping item without valid bbox in list: {item}"
613
+ )
614
+ except Exception as e:
615
+ logger.error(
616
+ f"Page {self.index}: Failed to convert list item to Region: {e}"
617
+ )
618
+ continue
619
+ return self
554
620
  else:
555
621
  # Reject invalid types
556
622
  raise TypeError(
557
- f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute."
623
+ f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, list/tuple of regions/elements, or have a valid .bbox attribute."
558
624
  )
559
625
 
560
626
  # Append the stored data (tuple of object/callable, label, and method)
@@ -668,6 +734,46 @@ class Page(
668
734
  regions.append(region_result)
669
735
  if debug:
670
736
  print(f" ✓ Added region from callable '{label}': {region_result}")
737
+ elif hasattr(region_result, "__iter__") and hasattr(region_result, "__len__"):
738
+ # Handle ElementCollection or other iterables
739
+ from natural_pdf.elements.element_collection import ElementCollection
740
+
741
+ if isinstance(region_result, ElementCollection) or (
742
+ hasattr(region_result, "__iter__") and region_result
743
+ ):
744
+ if debug:
745
+ print(
746
+ f" Converting {type(region_result)} with {len(region_result)} elements to regions..."
747
+ )
748
+
749
+ # Convert each element to a region
750
+ for elem in region_result:
751
+ try:
752
+ if hasattr(elem, "bbox") and len(elem.bbox) == 4:
753
+ bbox_coords = tuple(float(v) for v in elem.bbox)
754
+ region = Region(self, bbox_coords, label=label)
755
+ regions.append(region)
756
+ if debug:
757
+ print(
758
+ f" ✓ Added region from element: {bbox_coords}"
759
+ )
760
+ else:
761
+ if debug:
762
+ print(
763
+ f" ✗ Skipping element without valid bbox: {elem}"
764
+ )
765
+ except Exception as e:
766
+ if debug:
767
+ print(f" ✗ Failed to convert element to region: {e}")
768
+ continue
769
+
770
+ if debug and len(region_result) > 0:
771
+ print(
772
+ f" ✓ Converted {len(region_result)} elements from callable '{label}'"
773
+ )
774
+ else:
775
+ if debug:
776
+ print(f" ✗ Empty iterable returned from callable '{label}'")
671
777
  elif region_result:
672
778
  logger.warning(
673
779
  f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping."
@@ -1013,6 +1119,22 @@ class Page(
1013
1119
  "Cannot sort elements in reading order: Missing required attributes (top, x0)."
1014
1120
  )
1015
1121
 
1122
+ # Handle collection-level pseudo-classes (:first, :last) for OR selectors
1123
+ # Note: We only apply :first/:last if they appear in any of the sub-selectors
1124
+ has_first = False
1125
+ has_last = False
1126
+ for sub_selector in selector_obj.get("selectors", []):
1127
+ for pseudo in sub_selector.get("pseudo_classes", []):
1128
+ if pseudo.get("name") == "first":
1129
+ has_first = True
1130
+ elif pseudo.get("name") == "last":
1131
+ has_last = True
1132
+
1133
+ if has_first:
1134
+ matching_elements = matching_elements[:1] if matching_elements else []
1135
+ elif has_last:
1136
+ matching_elements = matching_elements[-1:] if matching_elements else []
1137
+
1016
1138
  # Return result collection
1017
1139
  return ElementCollection(matching_elements)
1018
1140
 
@@ -1134,6 +1256,15 @@ class Page(
1134
1256
  "Cannot sort elements in reading order: Missing required attributes (top, x0)."
1135
1257
  )
1136
1258
 
1259
+ # Handle collection-level pseudo-classes (:first, :last)
1260
+ for pseudo in selector_obj.get("pseudo_classes", []):
1261
+ name = pseudo.get("name")
1262
+
1263
+ if name == "first":
1264
+ matching_elements = matching_elements[:1] if matching_elements else []
1265
+ elif name == "last":
1266
+ matching_elements = matching_elements[-1:] if matching_elements else []
1267
+
1137
1268
  # Create result collection - exclusions are handled by the calling methods (find, find_all)
1138
1269
  result = ElementCollection(matching_elements)
1139
1270
 
@@ -1944,7 +2075,7 @@ class Page(
1944
2075
  render_ocr: Whether to render OCR text.
1945
2076
  include_highlights: Whether to render highlights.
1946
2077
  resolution: Resolution in DPI for base image rendering (default: 144 DPI, equivalent to previous scale=2.0).
1947
- **kwargs: Additional args for pdfplumber's to_image.
2078
+ **kwargs: Additional args for pdfplumber's internal to_image.
1948
2079
 
1949
2080
  Returns:
1950
2081
  Self for method chaining.
@@ -1247,3 +1247,40 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
1247
1247
  from natural_pdf.core.highlighting_service import HighlightContext
1248
1248
 
1249
1249
  return HighlightContext(self, show_on_exit=show)
1250
+
1251
def groupby(self, by: Union[str, Callable], *, show_progress: bool = True) -> "PageGroupBy":
    """
    Build a PageGroupBy over this collection, keyed by selector text or a callable.

    Args:
        by: CSS selector string or callable function used to derive each page's group
        show_progress: Whether to show a progress bar during computation (default: True)

    Returns:
        PageGroupBy object; the usage below shows iteration, positional
        indexing and dict-like lookup

    Examples:
        # Iterate over groups keyed by header text
        for title, pages in pdf.pages.groupby('text[size=16]'):
            print(f"Section: {title}")

        # Derive the key with a callable
        for city, pages in pdf.pages.groupby(lambda p: p.find('text:contains("CITY")').extract_text()):
            process_city_pages(pages)

        # Quick exploration with indexing
        grouped = pdf.pages.groupby('text[size=16]')
        grouped.info()              # Show all groups
        first_section = grouped[0]  # First group
        last_section = grouped[-1]  # Last group

        # Dict-like access by name
        madison_pages = grouped.get('CITY OF MADISON')
        madison_pages = grouped['CITY OF MADISON']  # Alternative

        # Disable progress bar for small collections
        grouped = pdf.pages.groupby('text[size=16]', show_progress=False)
    """
    # Imported at call time rather than at module load
    # (presumably to avoid a circular import - TODO confirm).
    from natural_pdf.core.page_groupby import PageGroupBy

    grouped_pages = PageGroupBy(self, by, show_progress=show_progress)
    return grouped_pages