natural-pdf 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,458 @@
1
+ import logging
2
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
3
+
4
+ from pdfplumber.utils.geometry import objects_to_bbox # For calculating combined bbox
5
+
6
+ # For runtime image manipulation
7
+ from PIL import Image as PIL_Image_Runtime
8
+
9
+ if TYPE_CHECKING:
10
+ from PIL.Image import Image as PIL_Image # For type hints
11
+ from natural_pdf.elements.base import Element as PhysicalElement
12
+ from natural_pdf.elements.region import Region as PhysicalRegion
13
+ from natural_pdf.elements.collections import ElementCollection
14
+ from natural_pdf.core.page import Page as PhysicalPage
15
+ from .flow import Flow
16
+ from .element import FlowElement
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class FlowRegion:
    """
    Represents a selected area within a Flow, potentially composed of multiple
    physical Region objects (constituent_regions) that might span across
    different original pages or disjoint physical regions defined in the Flow.

    A FlowRegion is the result of a directional operation (e.g., .below(), .above())
    on a FlowElement.
    """

    def __init__(
        self,
        flow: "Flow",
        constituent_regions: List["PhysicalRegion"],
        source_flow_element: "FlowElement",
        boundary_element_found: Optional["PhysicalElement"] = None,
    ):
        """
        Initializes a FlowRegion.

        Args:
            flow: The Flow instance this region belongs to.
            constituent_regions: A list of physical natural_pdf.elements.region.Region
                                 objects that make up this FlowRegion.
            source_flow_element: The FlowElement that created this FlowRegion.
            boundary_element_found: The physical element that stopped an 'until' search,
                                    if applicable.
        """
        self.flow: "Flow" = flow
        self.constituent_regions: List["PhysicalRegion"] = constituent_regions
        self.source_flow_element: "FlowElement" = source_flow_element
        self.boundary_element_found: Optional["PhysicalElement"] = boundary_element_found

        # Caches for expensive operations. They are only populated by calls
        # that use the default arguments (see extract_text / elements).
        self._cached_text: Optional[str] = None
        self._cached_elements: Optional["ElementCollection"] = None
        self._cached_bbox: Optional[Tuple[float, float, float, float]] = None

    @property
    def bbox(self) -> Optional[Tuple[float, float, float, float]]:
        """
        Conceptual bounding box that encompasses all constituent physical regions.

        This is the union of the constituent regions' ``bbox`` tuples in their
        original physical coordinates, so it is only a rough indicator when the
        regions live on different pages.

        Returns:
            ``(x0, top, x1, bottom)``, or None if there are no constituent
            regions (or none of them exposes a bbox).
        """
        if self._cached_bbox is not None:
            return self._cached_bbox
        if not self.constituent_regions:
            return None

        # Merge the (x0, top, x1, bottom) tuples directly. pdfplumber's
        # objects_to_bbox() looks values up by key (obj["x0"], ...), which
        # attribute-based region objects do not support.
        boxes = [
            region.bbox
            for region in self.constituent_regions
            if getattr(region, "bbox", None) is not None
        ]
        if not boxes:
            return None

        self._cached_bbox = (
            min(box[0] for box in boxes),
            min(box[1] for box in boxes),
            max(box[2] for box in boxes),
            max(box[3] for box in boxes),
        )
        return self._cached_bbox

    @property
    def x0(self) -> Optional[float]:
        """Left edge of the merged bounding box, or None when there is no bbox."""
        box = self.bbox
        return box[0] if box is not None else None

    @property
    def top(self) -> Optional[float]:
        """Top edge of the merged bounding box, or None when there is no bbox."""
        box = self.bbox
        return box[1] if box is not None else None

    @property
    def x1(self) -> Optional[float]:
        """Right edge of the merged bounding box, or None when there is no bbox."""
        box = self.bbox
        return box[2] if box is not None else None

    @property
    def bottom(self) -> Optional[float]:
        """Bottom edge of the merged bounding box, or None when there is no bbox."""
        box = self.bbox
        return box[3] if box is not None else None

    @property
    def width(self) -> Optional[float]:
        """Width of the merged bounding box, or None when there is no bbox."""
        box = self.bbox
        return box[2] - box[0] if box is not None else None

    @property
    def height(self) -> Optional[float]:
        """Height of the merged bounding box, or None when there is no bbox."""
        box = self.bbox
        return box[3] - box[1] if box is not None else None

    def extract_text(self, apply_exclusions: bool = True, **kwargs) -> str:
        """
        Extracts and concatenates text from all constituent physical regions.
        The order of concatenation follows the order of ``constituent_regions``.

        Args:
            apply_exclusions: Whether to respect PDF exclusion zones within each
                              constituent physical region during text extraction.
            **kwargs: Additional arguments passed to the underlying extract_text method
                      of each constituent region.

        Returns:
            The combined text content as a string ("" when there are no regions).
        """
        # Extra kwargs may change what the regions return, so the cache is only
        # consulted and populated for the plain default call.
        cacheable = apply_exclusions and not kwargs
        if cacheable and self._cached_text is not None:
            return self._cached_text

        if not self.constituent_regions:
            return ""

        texts: List[str] = [
            region.extract_text(apply_exclusions=apply_exclusions, **kwargs)
            for region in self.constituent_regions
        ]

        # Newline between parts for vertical flows, space otherwise. This is a
        # simplification; it does not look at the actual layout.
        joiner = "\n" if self.flow.arrangement == "vertical" else " "  # TODO: Make this smarter, consider segment_gap
        extracted = joiner.join(t for t in texts if t)

        if cacheable:
            self._cached_text = extracted
        return extracted

    def elements(self, apply_exclusions: bool = True) -> "ElementCollection":
        """
        Collects all unique physical elements from all constituent physical regions.

        Args:
            apply_exclusions: Whether to respect PDF exclusion zones within each
                              constituent physical region when gathering elements.

        Returns:
            An ElementCollection containing all unique elements, sorted by
            (page index, top, x0) when those attributes are available.
        """
        from natural_pdf.elements.collections import ElementCollection as RuntimeElementCollection  # Local import

        if self._cached_elements is not None and apply_exclusions:
            return self._cached_elements

        if not self.constituent_regions:
            return RuntimeElementCollection([])

        all_physical_elements: List["PhysicalElement"] = []
        # Membership uses the elements' own hash/equality, so elements shared
        # between overlapping regions are only kept once.
        seen_elements = set()

        for region in self.constituent_regions:
            for elem in region.get_elements(apply_exclusions=apply_exclusions):
                if elem not in seen_elements:
                    all_physical_elements.append(elem)
                    seen_elements.add(elem)

        # Basic reading order sort based on original page and coordinates.
        def get_sort_key(phys_elem: "PhysicalElement"):
            page_idx = -1
            if hasattr(phys_elem, "page") and hasattr(phys_elem.page, "index"):
                page_idx = phys_elem.page.index
            return (page_idx, phys_elem.top, phys_elem.x0)

        try:
            sorted_physical_elements = sorted(all_physical_elements, key=get_sort_key)
        except AttributeError:
            logger.warning("Could not sort elements in FlowRegion by reading order; some elements might be missing page, top or x0 attributes.")
            sorted_physical_elements = all_physical_elements

        result_collection = RuntimeElementCollection(sorted_physical_elements)
        if apply_exclusions:
            self._cached_elements = result_collection
        return result_collection

    def find(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> Optional["PhysicalElement"]:
        """
        Finds the first physical element within this FlowRegion that matches the selector or text.

        The search runs over ``self.elements()``; ``apply_exclusions`` may be
        given in kwargs (default True) and all kwargs are forwarded to
        ``ElementCollection.find``.
        """
        all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
        return all_elems.find(selector=selector, text=text, **kwargs)

    def find_all(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> "ElementCollection":
        """
        Finds all physical elements within this FlowRegion that match the selector or text.

        The search runs over ``self.elements()``; ``apply_exclusions`` may be
        given in kwargs (default True) and all kwargs are forwarded to
        ``ElementCollection.find_all``.
        """
        all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
        return all_elems.find_all(selector=selector, text=text, **kwargs)

    def highlight(self, label: Optional[str] = None, color: Optional[Union[Tuple, str]] = None, **kwargs) -> "FlowRegion":
        """
        Highlights all constituent physical regions on their respective pages.

        Args:
            label: A base label for the highlights. When there is more than one
                   constituent region each one gets an indexed label
                   (``<label>_1``, ``<label>_2``, ...). Defaults to "FlowRegionPart".
            color: Color for the highlight.
            **kwargs: Additional arguments for the underlying highlight method.

        Returns:
            Self for method chaining.
        """
        if not self.constituent_regions:
            return self

        base_label = label if label else "FlowRegionPart"
        multiple_parts = len(self.constituent_regions) > 1
        for i, region in enumerate(self.constituent_regions):
            current_label = f"{base_label}_{i+1}" if multiple_parts else base_label
            region.highlight(label=current_label, color=color, **kwargs)
        return self

    @staticmethod
    def _stack_images(images, direction: str, gap: int, background_color, centered: bool):
        """
        Pastes ``images`` one after another onto a single RGB canvas.

        Args:
            images: Non-empty list of PIL images.
            direction: "vertical" stacks top-to-bottom; any other value stacks
                       left-to-right (callers validate the value beforehand).
            gap: Pixels inserted between consecutive images.
            background_color: Fill color of the canvas.
            centered: Center each image on the cross axis when True, otherwise
                      align it to the top/left edge.

        Returns:
            The composite image, or None if the canvas would have zero width or height.
        """
        vertical = direction == "vertical"
        total_gap = (len(images) - 1) * gap
        if vertical:
            canvas_width = max(img.width for img in images)
            canvas_height = sum(img.height for img in images) + total_gap
        else:
            canvas_width = sum(img.width for img in images) + total_gap
            canvas_height = max(img.height for img in images)

        if canvas_width == 0 or canvas_height == 0:
            return None

        canvas = PIL_Image_Runtime.new("RGB", (canvas_width, canvas_height), background_color)
        offset = 0
        for img in images:
            if vertical:
                cross = (canvas_width - img.width) // 2 if centered else 0
                canvas.paste(img, (cross, offset))
                offset += img.height + gap
            else:
                cross = (canvas_height - img.height) // 2 if centered else 0
                canvas.paste(img, (offset, cross))
                offset += img.width + gap
        return canvas

    def show(
        self,
        scale: float = 2.0,
        labels: bool = True,
        legend_position: str = "right",
        color: Optional[Union[Tuple, str]] = "fuchsia",
        label_prefix: Optional[str] = "FlowPart",
        width: Optional[int] = None,
        stack_direction: str = "vertical",
        stack_gap: int = 5,
        stack_background_color: Tuple[int, int, int] = (255, 255, 255),
        **kwargs
    ) -> Optional["PIL_Image"]:
        """
        Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
        If multiple pages are involved, they are stacked into a single image.

        Args:
            scale: Passed to the highlighter's render_preview.
            labels: Whether labels are rendered; also gates ``label_prefix``.
            legend_position: Passed to the highlighter's render_preview.
            color: Highlight color used for every constituent region.
            label_prefix: Base label; parts are numbered by their position in
                          ``constituent_regions`` when there is more than one.
            width: Passed to the highlighter's render_preview.
            stack_direction: "vertical" or "horizontal"; only consulted when
                             more than one page image was produced.
            stack_gap: Pixels between stacked page images.
            stack_background_color: Background of the stacked canvas.
            **kwargs: Forwarded to the highlighter's render_preview.

        Returns:
            The rendered image, or None if nothing could be rendered.

        Raises:
            ValueError: If a constituent region has no page, no highlighter is
                available, ``stack_direction`` is invalid, or the stacked image
                would have zero size.
        """
        if not self.constituent_regions:
            logger.info("FlowRegion.show() called with no constituent regions.")
            return None

        # 1. Group constituent regions by their physical page
        regions_by_page: Dict["PhysicalPage", List["PhysicalRegion"]] = {}
        for region in self.constituent_regions:
            if not region.page:
                raise ValueError(f"Constituent region {region.bbox} has no page.")
            regions_by_page.setdefault(region.page, []).append(region)

        # 2. Get a highlighter service from the first page involved
        first_page_with_regions = next(iter(regions_by_page))
        highlighter_service = getattr(first_page_with_regions, "_highlighter", None)
        if not highlighter_service:
            raise ValueError(
                "Cannot get highlighter service for FlowRegion.show(). "
                "Ensure constituent regions' pages are initialized with a highlighter."
            )

        def zero_based_index(page) -> int:
            # Prefer the page's own index; otherwise derive it from a 1-based page_number.
            if hasattr(page, "index"):
                return page.index
            return getattr(page, "page_number", 1) - 1

        # Sort pages by index for consistent output order
        sorted_pages = sorted(regions_by_page.keys(), key=lambda p: p.index if hasattr(p, 'index') else getattr(p, 'page_number', 0))

        multiple_parts = len(self.constituent_regions) > 1
        output_page_images: List["PIL_Image"] = []

        # 3. Render each page with its relevant constituent regions highlighted
        for page_idx, page_obj in enumerate(sorted_pages):
            page_index = zero_based_index(page_obj)
            temp_highlights_for_page = []
            for i, region_part in enumerate(regions_by_page[page_obj]):
                part_label = None
                if labels and label_prefix:  # labels must be True for label_prefix to apply
                    count_indicator = ""
                    if multiple_parts:
                        # Number parts by their position in the whole FlowRegion
                        try:
                            global_idx = self.constituent_regions.index(region_part)
                            count_indicator = f"_{global_idx + 1}"
                        except ValueError:  # Should not happen if region_part is from the list
                            count_indicator = f"_p{page_idx}i{i+1}"  # fallback local index
                    part_label = f"{label_prefix}{count_indicator}"

                temp_highlights_for_page.append({
                    "page_index": page_index,
                    "bbox": region_part.bbox,
                    "polygon": region_part.polygon if region_part.has_polygon else None,
                    "color": color,  # Use the passed color
                    "label": part_label,
                    "use_color_cycling": False,  # Keep specific color
                })

            page_image = highlighter_service.render_preview(
                page_index=page_index,
                temporary_highlights=temp_highlights_for_page,
                scale=scale,
                width=width,
                labels=labels,
                legend_position=legend_position,
                **kwargs
            )
            if page_image:
                output_page_images.append(page_image)

        # 4. Stack the generated page images if multiple
        if not output_page_images:
            logger.info("FlowRegion.show() produced no page images to concatenate.")
            return None

        if len(output_page_images) == 1:
            return output_page_images[0]

        if stack_direction not in ("vertical", "horizontal"):
            raise ValueError(f"Invalid stack_direction '{stack_direction}' for FlowRegion.show(). Must be 'vertical' or 'horizontal'.")

        concatenated_image = self._stack_images(
            output_page_images,
            stack_direction,
            gap=stack_gap,
            background_color=stack_background_color,
            centered=True,
        )
        if concatenated_image is None:
            raise ValueError("Cannot create concatenated image with zero width or height.")
        return concatenated_image

    def to_images(
        self,
        resolution: float = 150,
        **kwargs,
    ) -> List["PIL_Image"]:
        """
        Generates and returns a list of cropped PIL Images,
        one for each constituent physical region of this FlowRegion.

        Regions whose rendering raises are logged and skipped, so the result
        may contain fewer images than there are constituent regions.

        Args:
            resolution: Passed to each region's to_image().
            **kwargs: Additional arguments passed to each region's to_image().
        """
        if not self.constituent_regions:
            logger.info("FlowRegion.to_images() called on an empty FlowRegion.")
            return []

        cropped_images: List["PIL_Image"] = []
        for region_part in self.constituent_regions:
            try:
                img = region_part.to_image(
                    resolution=resolution,
                    crop_only=True,
                    include_highlights=False,
                    **kwargs
                )
                if img:
                    cropped_images.append(img)
            except Exception as e:
                # Deliberate best effort: one failing part must not lose the others.
                logger.error(f"Error generating image for constituent region {region_part.bbox}: {e}", exc_info=True)

        return cropped_images

    def to_image(self, background_color=(255,255,255), **kwargs) -> Optional["PIL_Image"]:
        """
        Creates a single composite image by stacking the images of its constituent regions.
        Stacking direction is based on the Flow's arrangement; parts are aligned
        to the left (vertical) or top (horizontal) edge with no gap between them.

        Args:
            background_color: Tuple for RGB background color of the composite image.
            **kwargs: Additional arguments passed to to_images() for rendering individual parts
                      (e.g., resolution).

        Returns:
            A single PIL.Image.Image object, or None if there are no constituent
            images, the composite would have zero size, or the flow arrangement
            is not "vertical"/"horizontal".
        """
        images = self.to_images(**kwargs)
        if not images:
            return None
        if len(images) == 1:
            return images[0]

        if self.flow.arrangement not in ("vertical", "horizontal"):
            # Should not happen if flow.arrangement is validated
            logger.warning(f"Unknown flow arrangement: {self.flow.arrangement}. Cannot stack images.")
            return None

        return self._stack_images(
            images,
            self.flow.arrangement,
            gap=0,
            background_color=background_color,
            centered=False,
        )

    def __repr__(self) -> str:
        return (
            f"<FlowRegion constituents={len(self.constituent_regions)}, flow={self.flow}, "
            f"source_bbox={self.source_flow_element.bbox if self.source_flow_element else 'N/A'}>"
        )

    @property
    def is_empty(self) -> bool:
        """
        Checks if the FlowRegion contains no constituent regions or if all are empty.

        A FlowRegion with regions counts as empty when, ignoring exclusions, it
        yields neither text nor elements.
        """
        if not self.constituent_regions:
            return True
        try:
            return not bool(self.extract_text(apply_exclusions=False).strip()) and not bool(self.elements(apply_exclusions=False))
        except Exception:
            return True  # If error during check, assume empty to be safe
@@ -71,6 +71,91 @@ def safe_parse_color(value_str: str) -> tuple:
71
71
  return (0, 0, 0)
72
72
 
73
73
 
74
+ def _split_top_level_or(selector: str) -> List[str]:
75
+ """
76
+ Split a selector string on top-level OR operators (| or ,) only.
77
+
78
+ Respects parsing contexts and does not split when | or , appear inside:
79
+ - Quoted strings (both single and double quotes)
80
+ - Parentheses (for pseudo-class arguments like :not(...))
81
+ - Square brackets (for attribute selectors like [attr="value"])
82
+
83
+ Args:
84
+ selector: The selector string to split
85
+
86
+ Returns:
87
+ List of selector parts. If no top-level OR operators found, returns [selector].
88
+
89
+ Examples:
90
+ >>> _split_top_level_or('text:contains("a|b")|text:bold')
91
+ ['text:contains("a|b")', 'text:bold']
92
+
93
+ >>> _split_top_level_or('text:contains("hello,world")')
94
+ ['text:contains("hello,world")']
95
+ """
96
+ if not selector or not isinstance(selector, str):
97
+ return [selector] if selector else []
98
+
99
+ parts = []
100
+ current_part = ""
101
+ i = 0
102
+
103
+ # Parsing state
104
+ in_double_quotes = False
105
+ in_single_quotes = False
106
+ paren_depth = 0
107
+ bracket_depth = 0
108
+
109
+ while i < len(selector):
110
+ char = selector[i]
111
+
112
+ # Handle escape sequences in quotes
113
+ if i > 0 and selector[i-1] == '\\':
114
+ current_part += char
115
+ i += 1
116
+ continue
117
+
118
+ # Handle quote state changes
119
+ if char == '"' and not in_single_quotes:
120
+ in_double_quotes = not in_double_quotes
121
+ elif char == "'" and not in_double_quotes:
122
+ in_single_quotes = not in_single_quotes
123
+
124
+ # Handle parentheses and brackets only when not in quotes
125
+ elif not in_double_quotes and not in_single_quotes:
126
+ if char == '(':
127
+ paren_depth += 1
128
+ elif char == ')':
129
+ paren_depth -= 1
130
+ elif char == '[':
131
+ bracket_depth += 1
132
+ elif char == ']':
133
+ bracket_depth -= 1
134
+
135
+ # Check for top-level OR operators
136
+ elif (char == '|' or char == ',') and paren_depth == 0 and bracket_depth == 0:
137
+ # Found a top-level OR operator
138
+ part = current_part.strip()
139
+ if part: # Only add non-empty parts
140
+ parts.append(part)
141
+ current_part = ""
142
+ i += 1
143
+ continue
144
+
145
+ # Add character to current part
146
+ current_part += char
147
+ i += 1
148
+
149
+ # Add the final part
150
+ final_part = current_part.strip()
151
+ if final_part:
152
+ parts.append(final_part)
153
+
154
+ # If we only found one part, return it as a single-element list
155
+ # If we found multiple parts, those are the OR-separated parts
156
+ return parts if parts else [selector]
157
+
158
+
74
159
  def parse_selector(selector: str) -> Dict[str, Any]:
75
160
  """
76
161
  Parse a CSS-like selector string into a structured selector object.
@@ -80,12 +165,28 @@ def parse_selector(selector: str) -> Dict[str, Any]:
80
165
  - Attribute presence (e.g., '[data-id]')
81
166
  - Attribute value checks with various operators (e.g., '[count=5]', '[name*="bold"]'')
82
167
  - Pseudo-classes (e.g., ':contains("Total")', ':empty', ':not(...)')
168
+ - OR operators (e.g., 'text:contains("A")|text:bold', 'sel1,sel2')
83
169
 
84
170
  Args:
85
171
  selector: CSS-like selector string
86
172
 
87
173
  Returns:
88
- Dict representing the parsed selector
174
+ Dict representing the parsed selector, or compound selector with OR logic
175
+
176
+ Examples:
177
+ >>> parse_selector('text:contains("hello")') # Single selector
178
+ {'type': 'text', 'pseudo_classes': [{'name': 'contains', 'args': 'hello'}], ...}
179
+
180
+ >>> parse_selector('text:contains("A")|text:bold') # OR with pipe
181
+ {'type': 'or', 'selectors': [...]}
182
+
183
+ >>> parse_selector('text:contains("A"),line[width>5]') # OR with comma
184
+ {'type': 'or', 'selectors': [...]}
185
+
186
+ Note:
187
+ OR operators work with all selector types except spatial pseudo-classes
188
+ (:above, :below, :near, :left-of, :right-of) which require page context.
189
+ Spatial relationships within OR selectors are not currently supported.
89
190
  """
90
191
  result = {
91
192
  "type": "any",
@@ -100,6 +201,36 @@ def parse_selector(selector: str) -> Dict[str, Any]:
100
201
 
101
202
  selector = selector.strip()
102
203
 
204
+ # --- Handle OR operators first (| or ,) ---
205
+ # Check if selector contains OR operators at the top level only
206
+ # (not inside quotes, parentheses, or brackets)
207
+ or_parts = _split_top_level_or(selector)
208
+
209
+ # If we found OR parts, parse each one recursively and return compound selector
210
+ if len(or_parts) > 1:
211
+ parsed_selectors = []
212
+ for part in or_parts:
213
+ try:
214
+ parsed_selectors.append(parse_selector(part))
215
+ except (ValueError, TypeError) as e:
216
+ logger.warning(f"Skipping invalid OR selector part '{part}': {e}")
217
+ continue
218
+
219
+ if len(parsed_selectors) > 1:
220
+ return {
221
+ "type": "or",
222
+ "selectors": parsed_selectors
223
+ }
224
+ elif len(parsed_selectors) == 1:
225
+ # Only one valid part, return it directly
226
+ return parsed_selectors[0]
227
+ else:
228
+ # No valid parts, return default
229
+ logger.warning(f"No valid parts found in OR selector '{original_selector_for_error}', returning default selector")
230
+ return result
231
+
232
+ # --- Continue with single selector parsing (existing logic) ---
233
+
103
234
  # --- Handle wildcard selector explicitly ---
104
235
  if selector == "*":
105
236
  # Wildcard matches any type, already the default.
@@ -109,12 +240,6 @@ def parse_selector(selector: str) -> Dict[str, Any]:
109
240
 
110
241
  # 1. Extract type (optional, at the beginning)
111
242
  # Only run if selector wasn't '*'
112
- if selector:
113
- type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
114
- if type_match:
115
- result["type"] = type_match.group(1).lower()
116
- selector = selector[len(type_match.group(0)) :].strip()
117
- # Only run if selector wasn't '*'
118
243
  if selector:
119
244
  type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
120
245
  if type_match:
@@ -597,12 +722,42 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
597
722
  To inspect the individual filters, call `_build_filter_list` directly.
598
723
 
599
724
  Args:
600
- selector: Parsed selector dictionary
725
+ selector: Parsed selector dictionary (single or compound OR selector)
601
726
  **kwargs: Additional filter parameters (e.g., regex, case).
602
727
 
603
728
  Returns:
604
729
  Function that takes an element and returns True if it matches the selector.
605
730
  """
731
+ # Handle compound OR selectors
732
+ if selector.get("type") == "or":
733
+ sub_selectors = selector.get("selectors", [])
734
+ if not sub_selectors:
735
+ # Empty OR selector, return a function that never matches
736
+ return lambda element: False
737
+
738
+ # Create filter functions for each sub-selector
739
+ sub_filter_funcs = []
740
+ for sub_selector in sub_selectors:
741
+ sub_filter_funcs.append(selector_to_filter_func(sub_selector, **kwargs))
742
+
743
+ if logger.isEnabledFor(logging.DEBUG):
744
+ logger.debug(f"Creating OR filter with {len(sub_filter_funcs)} sub-selectors")
745
+
746
+ # Return OR combination - element matches if ANY sub-selector matches
747
+ def or_filter(element):
748
+ for func in sub_filter_funcs:
749
+ try:
750
+ if func(element):
751
+ return True
752
+ except Exception as e:
753
+ logger.error(f"Error applying OR sub-filter to element: {e}", exc_info=True)
754
+ # Continue to next sub-filter on error
755
+ continue
756
+ return False
757
+
758
+ return or_filter
759
+
760
+ # Handle single selectors (existing logic)
606
761
  filter_list = _build_filter_list(selector, **kwargs)
607
762
 
608
763
  if logger.isEnabledFor(logging.DEBUG):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.12
3
+ Version: 0.1.13
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -20,6 +20,7 @@ Requires-Dist: tqdm
20
20
  Requires-Dist: pydantic
21
21
  Requires-Dist: jenkspy
22
22
  Requires-Dist: pikepdf>=9.7.0
23
+ Requires-Dist: scipy
23
24
  Provides-Extra: viewer
24
25
  Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "viewer"
25
26
  Provides-Extra: easyocr