natural-pdf 0.2.1.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -259,7 +259,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
259
259
  self,
260
260
  *,
261
261
  text: str,
262
- contains: str = "all",
262
+ overlap: str = "full",
263
263
  apply_exclusions: bool = True,
264
264
  regex: bool = False,
265
265
  case: bool = True,
@@ -271,7 +271,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
271
271
  self,
272
272
  selector: str,
273
273
  *,
274
- contains: str = "all",
274
+ overlap: str = "full",
275
275
  apply_exclusions: bool = True,
276
276
  regex: bool = False,
277
277
  case: bool = True,
@@ -283,7 +283,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
283
283
  selector: Optional[str] = None,
284
284
  *,
285
285
  text: Optional[str] = None,
286
- contains: str = "all",
286
+ overlap: str = "full",
287
287
  apply_exclusions: bool = True,
288
288
  regex: bool = False,
289
289
  case: bool = True,
@@ -297,9 +297,9 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
297
297
  Args:
298
298
  selector: CSS-like selector string.
299
299
  text: Text content to search for (equivalent to 'text:contains(...)').
300
- contains: How to determine if elements are inside: 'all' (fully inside),
301
- 'any' (any overlap), or 'center' (center point inside).
302
- (default: "all")
300
+ overlap: How to determine if elements overlap: 'full' (fully inside),
301
+ 'partial' (any overlap), or 'center' (center point inside).
302
+ (default: "full")
303
303
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
304
304
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
305
305
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -313,7 +313,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
313
313
  element = page.find(
314
314
  selector=selector,
315
315
  text=text,
316
- contains=contains,
316
+ overlap=overlap,
317
317
  apply_exclusions=apply_exclusions,
318
318
  regex=regex,
319
319
  case=case,
@@ -328,7 +328,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
328
328
  self,
329
329
  *,
330
330
  text: str,
331
- contains: str = "all",
331
+ overlap: str = "full",
332
332
  apply_exclusions: bool = True,
333
333
  regex: bool = False,
334
334
  case: bool = True,
@@ -340,7 +340,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
340
340
  self,
341
341
  selector: str,
342
342
  *,
343
- contains: str = "all",
343
+ overlap: str = "full",
344
344
  apply_exclusions: bool = True,
345
345
  regex: bool = False,
346
346
  case: bool = True,
@@ -352,7 +352,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
352
352
  selector: Optional[str] = None,
353
353
  *,
354
354
  text: Optional[str] = None,
355
- contains: str = "all",
355
+ overlap: str = "full",
356
356
  apply_exclusions: bool = True,
357
357
  regex: bool = False,
358
358
  case: bool = True,
@@ -366,9 +366,9 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
366
366
  Args:
367
367
  selector: CSS-like selector string.
368
368
  text: Text content to search for (equivalent to 'text:contains(...)').
369
- contains: How to determine if elements are inside: 'all' (fully inside),
370
- 'any' (any overlap), or 'center' (center point inside).
371
- (default: "all")
369
+ overlap: How to determine if elements overlap: 'full' (fully inside),
370
+ 'partial' (any overlap), or 'center' (center point inside).
371
+ (default: "full")
372
372
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
373
373
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
374
374
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -383,7 +383,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
383
383
  elements = page.find_all(
384
384
  selector=selector,
385
385
  text=text,
386
- contains=contains,
386
+ overlap=overlap,
387
387
  apply_exclusions=apply_exclusions,
388
388
  regex=regex,
389
389
  case=case,
@@ -1247,3 +1247,40 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
1247
1247
  from natural_pdf.core.highlighting_service import HighlightContext
1248
1248
 
1249
1249
  return HighlightContext(self, show_on_exit=show)
1250
+
1251
def groupby(self, by: Union[str, Callable], *, show_progress: bool = True) -> "PageGroupBy":
    """
    Create a PageGroupBy that groups this collection's pages by `by`.

    This method only constructs the grouping object; `by` and
    `show_progress` are handed to PageGroupBy unchanged.

    Args:
        by: CSS-like selector string, or a callable that receives a page
            and returns the value to group on.
        show_progress: Forwarded to PageGroupBy as `show_progress`
            (default: True).

    Returns:
        PageGroupBy built as PageGroupBy(self, by, show_progress=show_progress).

    Examples:
        # Group by header text
        for title, pages in pdf.pages.groupby('text[size=16]'):
            print(f"Section: {title}")

        # Group by the result of a callable
        for city, pages in pdf.pages.groupby(
            lambda p: p.find('text:contains("CITY")').extract_text()
        ):
            process_city_pages(pages)

        # Explore the groups
        grouped = pdf.pages.groupby('text[size=16]')
        grouped.info()                               # list every group
        first_section = grouped[0]                   # by position
        last_section = grouped[-1]
        madison_pages = grouped.get('CITY OF MADISON')   # by key
        madison_pages = grouped['CITY OF MADISON']

        # Turn the progress bar off
        grouped = pdf.pages.groupby('text[size=16]', show_progress=False)
    """
    # Imported at call time rather than module level — presumably to avoid a
    # circular import between page_collection and page_groupby (TODO confirm).
    from natural_pdf.core.page_groupby import PageGroupBy

    grouping = PageGroupBy(self, by, show_progress=show_progress)
    return grouping
@@ -0,0 +1,229 @@
1
+ """
2
+ PageGroupBy class for grouping pages by selector text or callable results.
3
+ """
4
+
5
+ from collections import defaultdict
6
+ from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
7
+
8
+ from tqdm.auto import tqdm
9
+
10
+ if TYPE_CHECKING:
11
+ from natural_pdf.core.page import Page
12
+ from natural_pdf.core.page_collection import PageCollection
13
+
14
+
15
class PageGroupBy:
    """
    A groupby object for PageCollections that supports iteration and dict-like access.

    Pages are grouped by the result of applying a selector string or a callable
    to each page. Grouping is computed on first use and then cached on the
    instance, so every accessor below (including ``len()`` and ``repr()``)
    triggers the computation if it has not happened yet.

    Supports:
    - Direct iteration: for key, pages in grouped:
    - Dict-like access: grouped.get(key), grouped.get_group(key), grouped[key]
    - Positional access: grouped[0], grouped[-1]
    - Batch operations: grouped.apply(func)
    """

    def __init__(
        self,
        page_collection: "PageCollection",
        by: Union[str, Callable],
        *,
        show_progress: bool = True,
    ):
        """
        Initialize the PageGroupBy object.

        Args:
            page_collection: The PageCollection to group
            by: CSS selector string or callable function for grouping
            show_progress: Whether to show progress bar during computation (default: True)
        """
        self.page_collection = page_collection
        self.by = by
        self.show_progress = show_progress
        # Cache for the computed groups; None means "not computed yet".
        self._groups: Optional[Dict[Any, "PageCollection"]] = None

    def _compute_groups(self) -> Dict[Any, "PageCollection"]:
        """
        Compute (once) the groups by applying the selector/callable to each page.

        For a callable ``by`` the key is whatever the callable returns for the
        page. For a selector string the key is the extracted text of the
        element returned by ``page.find(by)``, or ``None`` when that result is
        falsy. Groups keep the order in which their keys were first seen.

        Returns:
            Dictionary mapping group keys to PageCollection objects
        """
        if self._groups is not None:
            return self._groups

        groups = defaultdict(list)

        pages_iterator = self.page_collection.pages
        total_pages = len(self.page_collection)

        # A progress bar is only shown when there is more than one page.
        if self.show_progress and total_pages > 1:
            desc = f"Grouping by {'selector' if isinstance(self.by, str) else 'function'}"
            pages_iterator = tqdm(pages_iterator, desc=desc, unit="pages", total=total_pages)

        # The kind of `by` does not change while iterating; decide once.
        by_is_callable = callable(self.by)

        for page in pages_iterator:
            if by_is_callable:
                key = self.by(page)
            else:
                element = page.find(self.by)
                key = element.extract_text() if element else None
            groups[key].append(page)

        # Imported here rather than at module level; the module only imports
        # PageCollection under TYPE_CHECKING.
        from natural_pdf.core.page_collection import PageCollection

        self._groups = {key: PageCollection(pages) for key, pages in groups.items()}
        return self._groups

    def __iter__(self) -> Iterator[Tuple[Any, "PageCollection"]]:
        """
        Support direct iteration: for key, pages in grouped:

        Yields:
            Tuples of (group_key, PageCollection)
        """
        return iter(self._compute_groups().items())

    def get(
        self, key: Any, default: Optional["PageCollection"] = None
    ) -> Optional["PageCollection"]:
        """
        Dict-like access to get a specific group.

        Args:
            key: The group key to look up
            default: Value to return if key is not found

        Returns:
            PageCollection for the group, or default if not found
        """
        return self._compute_groups().get(key, default)

    def get_group(self, key: Any) -> "PageCollection":
        """
        Pandas-style access to get a specific group.

        The key is always treated as a key, never as a position, so this is
        the unambiguous way to fetch a group whose key is an integer.

        Args:
            key: The group key to look up

        Returns:
            PageCollection for the group

        Raises:
            KeyError: If the group key is not found
        """
        groups = self._compute_groups()
        try:
            return groups[key]
        except KeyError:
            raise KeyError(f"Group key '{key}' not found") from None

    def keys(self) -> List[Any]:
        """
        Get all group keys.

        Returns:
            List of all group keys, in first-seen order
        """
        return list(self._compute_groups())

    def __getitem__(self, index: Union[int, Any]) -> "PageCollection":
        """
        Access groups by position or by key.

        Plain integers are positions (negative values count from the end).
        Everything else, including ``True``/``False``, is looked up as a group
        key. Use ``get_group()`` for groups whose key is itself an integer.

        Args:
            index: Integer position (0-based) or group key

        Returns:
            PageCollection for the specified group

        Raises:
            IndexError: If an integer position is out of range
            KeyError: If a key is not found

        Examples:
            grouped = pages.groupby('text[size=16]')

            first_group = grouped[0]    # First group by order
            last_group = grouped[-1]    # Last group

            madison = grouped['CITY OF MADISON']   # Same as .get_group()
        """
        groups = self._compute_groups()

        # bool is a subclass of int. Without this exclusion a True/False group
        # key (e.g. from a predicate callable) would be read as position 1/0
        # and could return the wrong group.
        if isinstance(index, int) and not isinstance(index, bool):
            keys_list = list(groups)
            try:
                key = keys_list[index]  # native indexing handles negatives
            except IndexError:
                raise IndexError(f"Group index {index} out of range") from None
            return groups[key]

        return self.get_group(index)

    def apply(self, func: Callable[["PageCollection"], Any]) -> Dict[Any, Any]:
        """
        Apply a function to each group.

        Args:
            func: Function to apply to each PageCollection group

        Returns:
            Dictionary mapping group keys to function results
        """
        return {key: func(pages) for key, pages in self._compute_groups().items()}

    def show(self, **kwargs):
        """
        Show each group separately with headers.

        A header line is printed for each group, then that group's own
        ``show()`` is called.

        Args:
            **kwargs: Arguments passed to each group's show() method
        """
        for key, pages in self._compute_groups().items():
            print(f"\n--- Group: {key} ({len(pages)} pages) ---")
            pages.show(**kwargs)

    def __len__(self) -> int:
        """Return the number of groups."""
        return len(self._compute_groups())

    def info(self) -> None:
        """
        Print information about all groups.

        Each line shows the positional index usable with ``grouped[i]``, the
        key, and the number of pages in the group.
        """
        groups = self._compute_groups()
        print(f"PageGroupBy with {len(groups)} groups:")
        print("-" * 40)

        for i, (key, pages) in enumerate(groups.items()):
            key_display = f"'{key}'" if key is not None else "None"
            print(f"[{i}] {key_display}: {len(pages)} pages")

    def __repr__(self) -> str:
        """String representation showing group count (computes groups if needed)."""
        return f"<PageGroupBy(groups={len(self._compute_groups())})>"
@@ -92,6 +92,50 @@ class Visualizable:
92
92
  _get_render_specs() to gain full image generation capabilities.
93
93
  """
94
94
 
95
def highlight(self, *elements, **kwargs):
    """
    One-call highlighting helper intended for notebook use.

    Builds a HighlightContext for this object (with show_on_exit=False),
    registers every positional argument with it, and returns the result of
    the context's show() call.

    Args:
        *elements: Things to highlight. Each argument may be:
            - a 2-tuple ``(element, color)``,
            - a 3-tuple ``(element, color, label)``,
            - anything else, which is added as-is (this includes tuples of
              any other length).
        **kwargs: Additional parameters passed to show()

    Returns:
        Whatever the context's show() returns (a PIL Image with highlights,
        per the original documentation of this method).

    Example:
        # Simple one-liner highlighting
        page.highlight(left, mid, right)

        # With custom colors
        page.highlight(
            (tables, 'blue'),
            (headers, 'red'),
            (footers, 'green')
        )
    """
    from natural_pdf.core.highlighting_service import HighlightContext

    context = HighlightContext(self, show_on_exit=False)

    for item in elements:
        # Only real tuples of length 2 or 3 are unpacked; 0 means "not a spec".
        spec_len = len(item) if isinstance(item, tuple) else 0

        if spec_len == 2:
            target, color = item
            context.add(target, color=color)
        elif spec_len == 3:
            target, color, label = item
            context.add(target, color=color, label=label)
        else:
            context.add(item)

    return context.show(**kwargs)
138
+
95
139
  def _get_render_specs(
96
140
  self, mode: Literal["show", "render"] = "show", **kwargs
97
141
  ) -> List[RenderSpec]:
@@ -146,10 +190,11 @@ class Visualizable:
146
190
  legend_position: str = "right",
147
191
  annotate: Optional[Union[str, List[str]]] = None,
148
192
  # Layout options for multi-page/region
149
- layout: Literal["stack", "grid", "single"] = "stack",
193
+ layout: Optional[Literal["stack", "grid", "single"]] = None,
150
194
  stack_direction: Literal["vertical", "horizontal"] = "vertical",
151
195
  gap: int = 5,
152
- columns: Optional[int] = None, # For grid layout
196
+ columns: Optional[int] = 6, # For grid layout, defaults to 6 columns
197
+ limit: Optional[int] = 30, # Max pages to show (default 30)
153
198
  # Cropping options
154
199
  crop: Union[bool, Literal["content"]] = False,
155
200
  crop_bbox: Optional[Tuple[float, float, float, float]] = None,
@@ -169,10 +214,11 @@ class Visualizable:
169
214
  highlights: Additional highlight groups to show
170
215
  legend_position: Position of legend/colorbar ('right', 'left', 'top', 'bottom')
171
216
  annotate: Attribute name(s) to display on highlights (string or list)
172
- layout: How to arrange multiple pages/regions
217
+ layout: How to arrange multiple pages/regions (defaults to 'grid' for multi-page, 'single' for single page)
173
218
  stack_direction: Direction for stack layout
174
219
  gap: Pixels between stacked images
175
- columns: Number of columns for grid layout
220
+ columns: Number of columns for grid layout (defaults to 6)
221
+ limit: Maximum number of pages to display (default 30, None for all)
176
222
  crop: Whether to crop (True, False, or 'content' for bbox of elements)
177
223
  crop_bbox: Explicit crop bounds
178
224
  **kwargs: Additional parameters passed to rendering
@@ -184,6 +230,10 @@ class Visualizable:
184
230
  if isinstance(annotate, str):
185
231
  annotate = [annotate]
186
232
 
233
+ # Pass limit as max_pages to _get_render_specs
234
+ if limit is not None:
235
+ kwargs["max_pages"] = limit
236
+
187
237
  specs = self._get_render_specs(
188
238
  mode="show",
189
239
  color=color,
@@ -198,6 +248,14 @@ class Visualizable:
198
248
  logger.warning(f"{self.__class__.__name__}.show() generated no render specs")
199
249
  return None
200
250
 
251
+ # Determine default layout based on content and parameters
252
+ if layout is None:
253
+ # For PDFs and multi-page collections, default to grid with 6 columns
254
+ if len(specs) > 1:
255
+ layout = "grid"
256
+ else:
257
+ layout = "single"
258
+
201
259
  highlighter = self._get_highlighter()
202
260
  return highlighter.unified_render(
203
261
  specs=specs,