natural-pdf 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/__init__.py CHANGED
@@ -47,7 +47,7 @@ try:
47
47
  except ImportError:
48
48
  HAS_QA = False
49
49
 
50
- __version__ = "0.1.0"
50
+ __version__ = "0.1.1"
51
51
 
52
52
  if HAS_QA:
53
53
  __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging", "DocumentQA", "get_qa_engine"]
@@ -383,7 +383,7 @@ class HighlightingService:
383
383
  def add(
384
384
  self,
385
385
  page_index: int,
386
- bbox: Tuple[float, float, float, float],
386
+ bbox: Union[Tuple[float, float, float, float], Any], # Relax input type hint
387
387
  color: Optional[Union[Tuple, str]] = None,
388
388
  label: Optional[str] = None,
389
389
  use_color_cycling: bool = False,
@@ -392,9 +392,32 @@ class HighlightingService:
392
392
  existing: str = 'append'
393
393
  ):
394
394
  """Adds a rectangular highlight."""
395
+
396
+ processed_bbox: Tuple[float, float, float, float]
397
+ # Check if bbox is an object with expected attributes (likely a Region)
398
+ # Assuming Region object has x0, top, x1, bottom attributes based on error context
399
+ if (hasattr(bbox, 'x0') and hasattr(bbox, 'top') and
400
+ hasattr(bbox, 'x1') and hasattr(bbox, 'bottom')):
401
+ try:
402
+ # Ensure attributes are numeric before creating tuple
403
+ processed_bbox = (float(bbox.x0), float(bbox.top), float(bbox.x1), float(bbox.bottom))
404
+ except (ValueError, TypeError):
405
+ logger.error(f"Invalid attribute types in bbox object for page {page_index}: {bbox}. Expected numeric values.")
406
+ return
407
+ elif isinstance(bbox, (list, tuple)) and len(bbox) == 4:
408
+ try:
409
+ # Ensure elements are numeric and convert to tuple
410
+ processed_bbox = tuple(float(v) for v in bbox)
411
+ except (ValueError, TypeError):
412
+ logger.error(f"Invalid values in bbox sequence for page {page_index}: {bbox}. Expected numeric values.")
413
+ return
414
+ else:
415
+ logger.error(f"Invalid bbox type or structure provided for page {page_index}: {type(bbox)} - {bbox}. Expected tuple/list of 4 numbers or Region-like object.")
416
+ return # Don't proceed if bbox is invalid
417
+
395
418
  self._add_internal(
396
419
  page_index=page_index,
397
- bbox=bbox,
420
+ bbox=processed_bbox, # Use the processed tuple
398
421
  polygon=None,
399
422
  color_input=color,
400
423
  label=label,
@@ -526,6 +549,7 @@ class HighlightingService:
526
549
  ) -> Optional[Image.Image]:
527
550
  """
528
551
  Renders a specific page with its highlights.
552
+ Legend is now generated based only on highlights present on this page.
529
553
 
530
554
  Args:
531
555
  page_index: The 0-based index of the page to render.
@@ -545,23 +569,19 @@ class HighlightingService:
545
569
  return None
546
570
 
547
571
  page = self._pdf[page_index]
548
- highlights_on_page = self.get_highlights_for_page(page_index)
572
+ highlights_on_page = self.get_highlights_for_page(page_index) # This list will be empty if clear_page was called
549
573
 
550
574
  # --- Get Base Image ---
551
575
  try:
552
576
  render_resolution = resolution if resolution is not None else scale * 72
553
- # Use the underlying pdfplumber page object for base rendering
554
577
  img_object = page._page.to_image(resolution=render_resolution, **kwargs)
555
- # Access the PIL image directly
556
- base_image = img_object.annotated # .annotated usually holds the PIL Image
578
+ base_image = img_object.annotated
557
579
  if not isinstance(base_image, Image.Image):
558
- # Fallback for different pdfplumber versions/outputs
559
580
  png_data = img_object._repr_png_()
560
581
  if png_data:
561
582
  base_image = Image.open(io.BytesIO(png_data)).convert('RGB')
562
583
  else:
563
584
  raise ValueError("Could not extract base PIL image from pdfplumber.")
564
- # Convert to RGBA for compositing
565
585
  base_image = base_image.convert('RGBA')
566
586
  logger.debug(f"Base image for page {page_index} rendered with resolution {render_resolution}.")
567
587
  except Exception as e:
@@ -569,6 +589,7 @@ class HighlightingService:
569
589
  return None
570
590
 
571
591
  # --- Render Highlights ---
592
+ rendered_image: Image.Image
572
593
  if highlights_on_page:
573
594
  renderer = HighlightRenderer(
574
595
  page=page,
@@ -579,21 +600,31 @@ class HighlightingService:
579
600
  )
580
601
  rendered_image = renderer.render()
581
602
  else:
582
- # If no highlights, still need to potentially render OCR if requested
583
603
  if render_ocr:
604
+ # Still render OCR even if no highlights
584
605
  renderer = HighlightRenderer(page, base_image, [], scale, True)
585
- rendered_image = renderer.render() # Will only call _render_ocr_text
606
+ rendered_image = renderer.render()
586
607
  else:
587
608
  rendered_image = base_image # No highlights, no OCR requested
588
609
 
589
- # --- Add Legend ---
610
+ # --- Add Legend (Based ONLY on this page's highlights) ---
590
611
  if labels:
591
- label_colors = self.get_labels_and_colors()
592
- if label_colors:
593
- legend = create_legend(label_colors)
594
- rendered_image = merge_images_with_legend(rendered_image, legend, legend_position)
595
- logger.debug(f"Added legend with {len(label_colors)} labels to page {page_index}.")
596
-
612
+ # CHANGE: Create label_colors map only from highlights_on_page
613
+ labels_colors_on_page: Dict[str, Tuple[int, int, int, int]] = {}
614
+ for hl in highlights_on_page:
615
+ if hl.label and hl.label not in labels_colors_on_page:
616
+ labels_colors_on_page[hl.label] = hl.color
617
+
618
+ if labels_colors_on_page: # Only add legend if there are labels on this page
619
+ legend = create_legend(labels_colors_on_page)
620
+ if legend: # Ensure create_legend didn't return None
621
+ rendered_image = merge_images_with_legend(rendered_image, legend, legend_position)
622
+ logger.debug(f"Added legend with {len(labels_colors_on_page)} labels for page {page_index}.")
623
+ else:
624
+ logger.debug(f"Legend creation returned None for page {page_index}.")
625
+ else:
626
+ logger.debug(f"No labels found on page {page_index}, skipping legend.")
627
+
597
628
  return rendered_image
598
629
 
599
630
  def render_preview(
natural_pdf/core/page.py CHANGED
@@ -9,6 +9,7 @@ import io
9
9
  import json
10
10
 
11
11
  from natural_pdf.elements.collections import ElementCollection
12
+ from natural_pdf.elements.region import Region
12
13
 
13
14
  if TYPE_CHECKING:
14
15
  import pdfplumber
@@ -17,7 +18,6 @@ if TYPE_CHECKING:
17
18
  from natural_pdf.core.highlighting_service import HighlightingService
18
19
  from natural_pdf.elements.base import Element
19
20
 
20
- from natural_pdf.elements.region import Region
21
21
  from natural_pdf.elements.text import TextElement
22
22
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
23
23
  from natural_pdf.analyzers.layout.layout_options import LayoutOptions
@@ -120,18 +120,50 @@ class Page:
120
120
  raise AttributeError("Parent PDF object does not have a 'highlighter' attribute.")
121
121
  return self._parent.highlighter
122
122
 
123
- def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region]) -> 'Page':
123
+ def clear_exclusions(self) -> 'Page':
124
+ """
125
+ Clear all exclusions from the page.
126
+ """
127
+ self._exclusions = []
128
+ return self
129
+
130
+ def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any]) -> 'Page':
124
131
  """
125
132
  Add an exclusion to the page. Text from these regions will be excluded from extraction.
133
+ Ensures non-callable items are stored as Region objects if possible.
126
134
 
127
135
  Args:
128
- exclusion_func_or_region: Either a Region object or a function that takes a Page
129
- and returns a Region to exclude
136
+ exclusion_func_or_region: Either a callable function returning a Region,
137
+ a Region object, or another object with a valid .bbox attribute.
130
138
 
131
139
  Returns:
132
140
  Self for method chaining
133
- """
134
- self._exclusions.append(exclusion_func_or_region)
141
+
142
+ Raises:
143
+ TypeError: If a non-callable, non-Region object without a valid bbox is provided.
144
+ """
145
+ if callable(exclusion_func_or_region):
146
+ # Store callable functions directly
147
+ self._exclusions.append(exclusion_func_or_region)
148
+ logger.debug(f"Page {self.index}: Added callable exclusion: {exclusion_func_or_region}")
149
+ elif isinstance(exclusion_func_or_region, Region):
150
+ # Store Region objects directly
151
+ self._exclusions.append(exclusion_func_or_region)
152
+ logger.debug(f"Page {self.index}: Added Region exclusion: {exclusion_func_or_region}")
153
+ elif hasattr(exclusion_func_or_region, 'bbox') and isinstance(getattr(exclusion_func_or_region, 'bbox', None), (tuple, list)) and len(exclusion_func_or_region.bbox) == 4:
154
+ # Convert objects with a valid bbox to a Region before storing
155
+ try:
156
+ bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
157
+ region_to_add = Region(self, bbox_coords)
158
+ self._exclusions.append(region_to_add)
159
+ logger.debug(f"Page {self.index}: Added exclusion converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
160
+ except (ValueError, TypeError, Exception) as e:
161
+ # Raise an error if conversion fails
162
+ raise TypeError(f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}") from e
163
+ else:
164
+ # Reject invalid types
165
+ raise TypeError(f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute.")
166
+
135
167
  return self
136
168
 
137
169
  def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
@@ -190,6 +222,7 @@ class Page:
190
222
  def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
191
223
  """
192
224
  Get all exclusion regions for this page.
225
+ Assumes self._exclusions contains only callables or Region objects.
193
226
 
194
227
  Args:
195
228
  include_callable: Whether to evaluate callable exclusion functions
@@ -207,15 +240,14 @@ class Page:
207
240
  for i, exclusion in enumerate(self._exclusions):
208
241
  # Get exclusion label if it's a tuple from PDF level
209
242
  exclusion_label = f"exclusion {i}"
210
- original_exclusion = exclusion
211
-
212
- # Check if it's a tuple from PDF.add_exclusion
243
+ original_exclusion = exclusion # Keep track for debugging
244
+
245
+ # Check if it's a tuple from PDF.add_exclusion (should still be handled if PDF adds labels)
213
246
  if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
214
- # This is likely from PDF.add_exclusion with (func, label)
215
247
  exclusion_func, label = exclusion
216
248
  if label:
217
249
  exclusion_label = label
218
- exclusion = exclusion_func
250
+ exclusion = exclusion_func # Use the function part
219
251
 
220
252
  # Process callable exclusion functions
221
253
  if callable(exclusion) and include_callable:
@@ -224,40 +256,45 @@ class Page:
224
256
  if debug:
225
257
  print(f" - Evaluating callable {exclusion_label}...")
226
258
 
227
- # Create a temporary copy of exclusions to avoid recursion
228
- original_exclusions = self._exclusions
229
- self._exclusions = [] # Temporarily clear exclusions
259
+ # Temporarily clear exclusions to avoid potential recursion if the callable uses exclusions itself
260
+ # This might be overly cautious depending on use case, but safer.
261
+ temp_original_exclusions = self._exclusions
262
+ self._exclusions = []
230
263
 
231
- # Call the function
232
- region = exclusion(self)
264
+ # Call the function - Expects it to return a Region or None
265
+ region_result = exclusion(self)
233
266
 
234
267
  # Restore exclusions
235
- self._exclusions = original_exclusions
268
+ self._exclusions = temp_original_exclusions
236
269
 
237
- if region:
238
- regions.append(region)
270
+ if isinstance(region_result, Region):
271
+ regions.append(region_result)
239
272
  if debug:
240
- print(f" ✓ Added region: {region}")
273
+ print(f" ✓ Added region from callable: {region_result}")
274
+ elif region_result:
275
+ # Log warning if callable returned something other than Region/None
276
+ logger.warning(f"Callable exclusion {exclusion_label} returned non-Region object: {type(region_result)}. Skipping.")
277
+ if debug:
278
+ print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
241
279
  else:
242
280
  if debug:
243
- print(f" ✗ Function returned None, no region added")
281
+ print(f" ✗ Callable returned None, no region added")
244
282
 
245
283
  except Exception as e:
246
- error_msg = f"Error in {exclusion_label} for page {self.index}: {e}"
284
+ error_msg = f"Error evaluating callable exclusion {exclusion_label} for page {self.index}: {e}"
247
285
  print(error_msg)
248
- # Print more detailed traceback for debugging
249
286
  import traceback
250
287
  print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
251
288
 
252
- # Process direct Region objects
253
- elif not callable(exclusion):
254
- # It's already a Region object
289
+ # Process direct Region objects (already validated by add_exclusion)
290
+ elif isinstance(exclusion, Region):
255
291
  regions.append(exclusion)
256
292
  if debug:
257
293
  print(f" - Added direct region: {exclusion}")
294
+ # No else needed, add_exclusion should prevent invalid types
258
295
 
259
296
  if debug:
260
- print(f"Page {self.index}: Found {len(regions)} valid exclusion regions")
297
+ print(f"Page {self.index}: Found {len(regions)} valid exclusion regions to apply")
261
298
 
262
299
  return regions
263
300
 
@@ -1178,6 +1215,34 @@ class Page:
1178
1215
 
1179
1216
  return ElementCollection(detected_regions)
1180
1217
 
1218
+ def clear_detected_layout_regions(self) -> 'Page':
1219
+ """
1220
+ Removes all regions from this page that were added by layout analysis
1221
+ (i.e., regions where `source` attribute is 'detected').
1222
+
1223
+ This clears the regions both from the page's internal `_regions['detected']` list
1224
+ and from the ElementManager's internal list of regions.
1225
+
1226
+ Returns:
1227
+ Self for method chaining.
1228
+ """
1229
+ if not hasattr(self._element_mgr, 'regions') or not hasattr(self._element_mgr, '_elements') or 'regions' not in self._element_mgr._elements:
1230
+ logger.debug(f"Page {self.index}: No regions found in ElementManager, nothing to clear.")
1231
+ self._regions['detected'] = [] # Ensure page's list is also clear
1232
+ return self
1233
+
1234
+ # Filter ElementManager's list to keep only non-detected regions
1235
+ original_count = len(self._element_mgr.regions)
1236
+ self._element_mgr._elements['regions'] = [r for r in self._element_mgr.regions if getattr(r, 'source', None) != 'detected']
1237
+ new_count = len(self._element_mgr.regions)
1238
+ removed_count = original_count - new_count
1239
+
1240
+ # Clear the page's specific list of detected regions
1241
+ self._regions['detected'] = []
1242
+
1243
+ logger.info(f"Page {self.index}: Cleared {removed_count} detected layout regions.")
1244
+ return self
1245
+
1181
1246
  def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both') -> Optional[Region]: # Return Optional
1182
1247
  """
1183
1248
  Get a section between two elements on this page.
natural_pdf/core/pdf.py CHANGED
@@ -125,6 +125,17 @@ class PDF:
125
125
  from natural_pdf.elements.collections import PageCollection
126
126
  return PageCollection(self._pages)
127
127
 
128
+ def clear_exclusions(self) -> 'PDF':
129
+ """
130
+ Clear all exclusion functions from the PDF.
131
+
132
+ Returns:
133
+ Self for method chaining
134
+ """
135
+
136
+ self._exclusions = []
137
+ return self
138
+
128
139
  def add_exclusion(self, exclusion_func: Callable[[Page], Region], label: str = None) -> 'PDF':
129
140
  """
130
141
  Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
@@ -7,7 +7,8 @@ from PIL import Image
7
7
  if TYPE_CHECKING:
8
8
  from natural_pdf.core.page import Page
9
9
  from natural_pdf.elements.region import Region
10
- from natural_pdf.elements.base import Element, DirectionalMixin
10
+ from natural_pdf.elements.base import Element
11
+ from natural_pdf.elements.collections import ElementCollection
11
12
 
12
13
 
13
14
  class DirectionalMixin:
@@ -17,7 +18,7 @@ class DirectionalMixin:
17
18
 
18
19
  def _direction(self, direction: str, size: Optional[float] = None,
19
20
  cross_size: str = "full", include_element: bool = False,
20
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
21
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
21
22
  """
22
23
  Protected helper method to create a region in a specified direction relative to this element/region.
23
24
 
@@ -154,7 +155,7 @@ class DirectionalMixin:
154
155
  return result
155
156
 
156
157
  def above(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
157
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
158
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
158
159
  """
159
160
  Select region above this element/region.
160
161
 
@@ -180,7 +181,7 @@ class DirectionalMixin:
180
181
  )
181
182
 
182
183
  def below(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
183
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
184
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
184
185
  """
185
186
  Select region below this element/region.
186
187
 
@@ -206,7 +207,7 @@ class DirectionalMixin:
206
207
  )
207
208
 
208
209
  def left(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
209
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
210
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
210
211
  """
211
212
  Select region to the left of this element/region.
212
213
 
@@ -232,7 +233,7 @@ class DirectionalMixin:
232
233
  )
233
234
 
234
235
  def right(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
235
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
236
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
236
237
  """
237
238
  Select region to the right of this element/region.
238
239
 
@@ -257,6 +258,86 @@ class DirectionalMixin:
257
258
  **kwargs
258
259
  )
259
260
 
261
+ def expand(self,
262
+ left: float = 0,
263
+ right: float = 0,
264
+ top_expand: float = 0, # Renamed to avoid conflict
265
+ bottom_expand: float = 0, # Renamed to avoid conflict
266
+ width_factor: float = 1.0,
267
+ height_factor: float = 1.0,
268
+ # Keep original parameter names for backward compatibility
269
+ top: float = None,
270
+ bottom: float = None) -> 'Region':
271
+ """
272
+ Create a new region expanded from this element/region.
273
+
274
+ Args:
275
+ left: Amount to expand left edge (positive value expands leftwards)
276
+ right: Amount to expand right edge (positive value expands rightwards)
277
+ top_expand: Amount to expand top edge (positive value expands upwards)
278
+ bottom_expand: Amount to expand bottom edge (positive value expands downwards)
279
+ width_factor: Factor to multiply width by (applied after absolute expansion)
280
+ height_factor: Factor to multiply height by (applied after absolute expansion)
281
+ top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
282
+ bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
283
+
284
+ Returns:
285
+ New expanded Region object
286
+ """
287
+ # Start with current coordinates
288
+ new_x0 = self.x0
289
+ new_x1 = self.x1
290
+ new_top = self.top
291
+ new_bottom = self.bottom
292
+
293
+ # Handle the deprecated parameter names for backward compatibility
294
+ if top is not None:
295
+ top_expand = top
296
+ if bottom is not None:
297
+ bottom_expand = bottom
298
+
299
+ # Apply absolute expansions first
300
+ new_x0 -= left
301
+ new_x1 += right
302
+ new_top -= top_expand # Expand upward (decrease top coordinate)
303
+ new_bottom += bottom_expand # Expand downward (increase bottom coordinate)
304
+
305
+ # Apply percentage factors if provided
306
+ if width_factor != 1.0 or height_factor != 1.0:
307
+ # Calculate center point *after* absolute expansion
308
+ center_x = (new_x0 + new_x1) / 2
309
+ center_y = (new_top + new_bottom) / 2
310
+
311
+ # Calculate current width and height *after* absolute expansion
312
+ current_width = new_x1 - new_x0
313
+ current_height = new_bottom - new_top
314
+
315
+ # Calculate new width and height
316
+ new_width = current_width * width_factor
317
+ new_height = current_height * height_factor
318
+
319
+ # Adjust coordinates based on the new dimensions, keeping the center
320
+ new_x0 = center_x - new_width / 2
321
+ new_x1 = center_x + new_width / 2
322
+ new_top = center_y - new_height / 2
323
+ new_bottom = center_y + new_height / 2
324
+
325
+ # Clamp coordinates to page boundaries
326
+ new_x0 = max(0, new_x0)
327
+ new_top = max(0, new_top)
328
+ new_x1 = min(self.page.width, new_x1)
329
+ new_bottom = min(self.page.height, new_bottom)
330
+
331
+ # Ensure coordinates are valid (x0 <= x1, top <= bottom)
332
+ if new_x0 > new_x1: new_x0 = new_x1 = (new_x0 + new_x1) / 2
333
+ if new_top > new_bottom: new_top = new_bottom = (new_top + new_bottom) / 2
334
+
335
+ # Create new region with expanded bbox
336
+ from natural_pdf.elements.region import Region
337
+ new_region = Region(self.page, (new_x0, new_top, new_x1, new_bottom))
338
+
339
+ return new_region
340
+
260
341
 
261
342
  class Element(DirectionalMixin):
262
343
  """
@@ -415,7 +496,8 @@ class Element(DirectionalMixin):
415
496
  candidates = candidates[:limit] if limit else candidates
416
497
 
417
498
  # Find matching elements
418
- matches = self.page.filter_elements(candidates, selector, **kwargs)
499
+ from natural_pdf.elements.collections import ElementCollection
500
+ matches = ElementCollection(candidates).find_all(selector, **kwargs)
419
501
  return matches[0] if matches else None
420
502
  elif idx + 1 < len(all_elements):
421
503
  # No selector, just return the next element
@@ -449,16 +531,17 @@ class Element(DirectionalMixin):
449
531
 
450
532
  # Search for previous matching element
451
533
  if selector:
452
- # Filter elements before this one
534
+ # Select elements before this one
453
535
  candidates = all_elements[:idx]
454
- # Reverse to start from closest to this element
536
+ # Reverse to search backwards from the current element
455
537
  candidates = candidates[::-1]
456
538
  # Limit search range for performance
457
539
  candidates = candidates[:limit] if limit else candidates
458
540
 
459
- # Find matching elements
460
- matches = self.page.filter_elements(candidates, selector, **kwargs)
461
- return matches[0] if matches else None
541
+ # Find matching elements using ElementCollection
542
+ from natural_pdf.elements.collections import ElementCollection
543
+ matches = ElementCollection(candidates).find_all(selector, **kwargs)
544
+ return matches[0] if matches else None # find_all returns a collection
462
545
  elif idx > 0:
463
546
  # No selector, just return the previous element
464
547
  return all_elements[idx - 1]
@@ -737,8 +820,9 @@ class Element(DirectionalMixin):
737
820
  Returns:
738
821
  First matching element or None
739
822
  """
740
- # Create a temporary region from this element's bounds
741
823
  from natural_pdf.elements.region import Region
824
+
825
+ # Create a temporary region from this element's bounds
742
826
  temp_region = Region(self.page, self.bbox)
743
827
  return temp_region.find(selector, apply_exclusions=apply_exclusions, **kwargs)
744
828
 
@@ -755,7 +839,8 @@ class Element(DirectionalMixin):
755
839
  Returns:
756
840
  ElementCollection with matching elements
757
841
  """
758
- # Create a temporary region from this element's bounds
759
842
  from natural_pdf.elements.region import Region
843
+
844
+ # Create a temporary region from this element's bounds
760
845
  temp_region = Region(self.page, self.bbox)
761
846
  return temp_region.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
@@ -2,6 +2,7 @@ import logging
2
2
 
3
3
  from typing import List, Optional, Dict, Any, Union, Callable, TypeVar, Generic, Iterator, Tuple, TYPE_CHECKING
4
4
  from natural_pdf.ocr import OCROptions
5
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
5
6
 
6
7
  logger = logging.getLogger(__name__)
7
8
 
@@ -882,6 +883,61 @@ class ElementCollection(Generic[T]):
882
883
  logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
883
884
  return None
884
885
 
886
+ def find_all(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> 'ElementCollection[T]':
887
+ """
888
+ Filter elements within this collection matching the selector.
889
+
890
+ Args:
891
+ selector: CSS-like selector string.
892
+ regex: Whether to use regex for text search in :contains (default: False).
893
+ case: Whether to do case-sensitive text search (default: True).
894
+ **kwargs: Additional filter parameters passed to the selector function.
895
+
896
+ Returns:
897
+ A new ElementCollection containing only the matching elements from this collection.
898
+ """
899
+ if not self._elements:
900
+ return ElementCollection([])
901
+
902
+ try:
903
+ selector_obj = parse_selector(selector)
904
+ except Exception as e:
905
+ logger.error(f"Error parsing selector '{selector}': {e}")
906
+ return ElementCollection([]) # Return empty on parse error
907
+
908
+ # Pass regex and case flags to selector function generator
909
+ kwargs['regex'] = regex
910
+ kwargs['case'] = case
911
+
912
+ try:
913
+ filter_func = selector_to_filter_func(selector_obj, **kwargs)
914
+ except Exception as e:
915
+ logger.error(f"Error creating filter function for selector '{selector}': {e}")
916
+ return ElementCollection([]) # Return empty on filter creation error
917
+
918
+ matching_elements = [element for element in self._elements if filter_func(element)]
919
+
920
+ # Note: Unlike Page.find_all, this doesn't re-sort.
921
+ # Sorting should be done explicitly on the collection if needed.
922
+
923
+ return ElementCollection(matching_elements)
924
+
925
+ def find(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> Optional[T]:
926
+ """
927
+ Find the first element within this collection matching the selector.
928
+
929
+ Args:
930
+ selector: CSS-like selector string.
931
+ regex: Whether to use regex for text search in :contains (default: False).
932
+ case: Whether to do case-sensitive text search (default: True).
933
+ **kwargs: Additional filter parameters passed to the selector function.
934
+
935
+ Returns:
936
+ The first matching element or None.
937
+ """
938
+ results = self.find_all(selector, regex=regex, case=case, **kwargs)
939
+ return results.first
940
+
885
941
  class PageCollection(Generic[P]):
886
942
  """
887
943
  A collection of PDF pages with cross-page operations.
@@ -761,8 +761,6 @@ class Region(DirectionalMixin):
761
761
  exclusion_regions = self._page._get_exclusion_regions(include_callable=True)
762
762
 
763
763
  if debug:
764
- import logging
765
- logger = logging.getLogger("natural_pdf.elements.region")
766
764
  logger.debug(f"Region {self.bbox} with {len(exclusion_regions)} exclusion regions")
767
765
 
768
766
  # IMPROVEMENT 1: Check if the region intersects with any exclusion zone
@@ -777,16 +775,12 @@ class Region(DirectionalMixin):
777
775
  if overlap:
778
776
  has_intersection = True
779
777
  if debug:
780
- import logging
781
- logger = logging.getLogger("natural_pdf.elements.region")
782
778
  logger.debug(f" Region intersects with exclusion {i}: {exclusion.bbox}")
783
779
  break
784
780
 
785
781
  # If no intersection, process without exclusions
786
782
  if not has_intersection:
787
783
  if debug:
788
- import logging
789
- logger = logging.getLogger("natural_pdf.elements.region")
790
784
  logger.debug(f" No intersection with any exclusion, ignoring exclusions")
791
785
  apply_exclusions = False
792
786
  exclusion_regions = []
@@ -809,8 +803,6 @@ class Region(DirectionalMixin):
809
803
  abs(exclusion.x1 - self.page.width) < 5)
810
804
 
811
805
  if debug:
812
- import logging
813
- logger = logging.getLogger("natural_pdf.elements.region")
814
806
  logger.debug(f" Exclusion {i}: {exclusion.bbox}, full width: {full_width}")
815
807
 
816
808
  if full_width:
@@ -827,8 +819,6 @@ class Region(DirectionalMixin):
827
819
  bottom_bound = self.bottom
828
820
 
829
821
  if debug:
830
- import logging
831
- logger = logging.getLogger("natural_pdf.elements.region")
832
822
  logger.debug(f" Using cropping approach, initial bounds: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
833
823
 
834
824
  # Process only header/footer exclusions for cropping
@@ -838,8 +828,6 @@ class Region(DirectionalMixin):
838
828
  # Move top bound to exclude the header
839
829
  top_bound = max(top_bound, exclusion.bottom)
840
830
  if debug:
841
- import logging
842
- logger = logging.getLogger("natural_pdf.elements.region")
843
831
  logger.debug(f" Adjusted top bound to {top_bound} due to header exclusion")
844
832
 
845
833
  # If exclusion is at the bottom of our region
@@ -847,14 +835,10 @@ class Region(DirectionalMixin):
847
835
  # Move bottom bound to exclude the footer
848
836
  bottom_bound = min(bottom_bound, exclusion.top)
849
837
  if debug:
850
- import logging
851
- logger = logging.getLogger("natural_pdf.elements.region")
852
838
  logger.debug(f" Adjusted bottom bound to {bottom_bound} due to footer exclusion")
853
839
 
854
840
 
855
841
  if debug:
856
- import logging
857
- logger = logging.getLogger("natural_pdf.elements.region")
858
842
  logger.debug(f" Final bounds after exclusion adjustment: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
859
843
 
860
844
  # If we still have a valid region after exclusions
@@ -865,8 +849,6 @@ class Region(DirectionalMixin):
865
849
  result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
866
850
 
867
851
  if debug:
868
- import logging
869
- logger = logging.getLogger("natural_pdf.elements.region")
870
852
  logger.debug(f" Successfully extracted text using crop, got {len(result)} characters")
871
853
 
872
854
  # Skip the complex filtering approach
@@ -874,16 +856,12 @@ class Region(DirectionalMixin):
874
856
  else:
875
857
  # This would only happen if the region is entirely inside an exclusion zone
876
858
  # or if both top and bottom of the region are excluded leaving no valid area
877
- import logging
878
- logger = logging.getLogger("natural_pdf.elements.region")
879
859
  logger.debug(f"Region {self.bbox} completely covered by exclusions, returning empty string")
880
860
  return ""
881
861
  # We have exclusions, but not all are headers/footers,
882
862
  # or we have a non-rectangular region
883
863
  else:
884
864
  if debug:
885
- import logging
886
- logger = logging.getLogger("natural_pdf.elements.region")
887
865
  logger.debug(f" Mixed exclusion types or non-rectangular region, switching to filtering")
888
866
 
889
867
  # Don't use crop for mixed exclusion types
@@ -902,16 +880,13 @@ class Region(DirectionalMixin):
902
880
  return result
903
881
 
904
882
  # For all other cases (complex exclusions, polygons), we use element filtering
905
- import warnings
906
- import logging
907
- logger = logging.getLogger("natural_pdf.elements.region")
908
-
909
883
  if debug:
910
884
  logger.debug(f"Using element filtering approach for region {self.bbox}")
911
885
 
912
- # Get all elements in this region first
913
- all_elements = self.get_elements(apply_exclusions=False)
914
-
886
+ # Get only word elements in this region first (instead of ALL elements)
887
+ # This prevents duplication from joining both char and word text
888
+ all_elements = [e for e in self.page.words if self._is_element_in_region(e)]
889
+
915
890
  if apply_exclusions and exclusion_regions:
916
891
  if debug:
917
892
  logger.debug(f"Filtering with {len(exclusion_regions)} exclusion zones")
@@ -1325,83 +1300,6 @@ class Region(DirectionalMixin):
1325
1300
 
1326
1301
  return elements
1327
1302
 
1328
- def expand(self,
1329
- left: float = 0,
1330
- right: float = 0,
1331
- top_expand: float = 0, # Renamed to avoid conflict
1332
- bottom_expand: float = 0, # Renamed to avoid conflict
1333
- width_factor: float = 1.0,
1334
- height_factor: float = 1.0,
1335
- # Keep original parameter names for backward compatibility
1336
- top: float = None,
1337
- bottom: float = None) -> 'Region':
1338
- """
1339
- Create a new region expanded from this one.
1340
-
1341
- Args:
1342
- left: Amount to expand left edge
1343
- right: Amount to expand right edge
1344
- top_expand: Amount to expand top edge (upward)
1345
- bottom_expand: Amount to expand bottom edge (downward)
1346
- width_factor: Factor to multiply width by
1347
- height_factor: Factor to multiply height by
1348
- top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
1349
- bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
1350
-
1351
- Returns:
1352
- New expanded Region
1353
- """
1354
- # Start with current coordinates
1355
- new_x0 = self.x0
1356
- new_x1 = self.x1
1357
- new_top = self.top
1358
- new_bottom = self.bottom
1359
-
1360
- # Handle the deprecated parameter names for backward compatibility
1361
- if top is not None:
1362
- top_expand = top
1363
- if bottom is not None:
1364
- bottom_expand = bottom
1365
-
1366
- # Apply absolute expansions first
1367
- new_x0 -= left
1368
- new_x1 += right
1369
- new_top -= top_expand # Expand upward (decrease top coordinate)
1370
- new_bottom += bottom_expand # Expand downward (increase bottom coordinate)
1371
-
1372
- # Apply percentage factors if provided
1373
- if width_factor != 1.0 or height_factor != 1.0:
1374
- # Current width and height
1375
- current_width = new_x1 - new_x0
1376
- current_height = new_bottom - new_top
1377
-
1378
- # Calculate new width and height
1379
- new_width = current_width * width_factor
1380
- new_height = current_height * height_factor
1381
-
1382
- # Calculate width and height differences
1383
- width_diff = new_width - current_width
1384
- height_diff = new_height - current_height
1385
-
1386
- # Adjust coordinates to maintain center point
1387
- new_x0 -= width_diff / 2
1388
- new_x1 += width_diff / 2
1389
- new_top -= height_diff / 2
1390
- new_bottom += height_diff / 2
1391
-
1392
- # Create new region with expanded bbox
1393
- new_region = Region(self.page, (new_x0, new_top, new_x1, new_bottom))
1394
-
1395
- # Copy multi-page properties if present
1396
- if self._spans_pages:
1397
- new_region._spans_pages = True
1398
- new_region._multi_page_elements = self._multi_page_elements
1399
- new_region._page_range = self._page_range
1400
- new_region.start_element = self.start_element
1401
- new_region.end_element = self.end_element
1402
-
1403
- return new_region
1404
-
1405
1303
  def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both'):
1406
1304
  """
1407
1305
  Get a section between two elements within this region.
@@ -5,6 +5,7 @@ from PIL import Image, ImageDraw
5
5
  import os
6
6
  import tempfile
7
7
  import json
8
+ from natural_pdf.elements.collections import ElementCollection
8
9
 
9
10
  logger = logging.getLogger("natural_pdf.qa.document_qa")
10
11
 
@@ -304,8 +305,8 @@ class DocumentQA:
304
305
  # Remove from matched texts to avoid duplicates
305
306
  if element.text in matched_texts:
306
307
  matched_texts.remove(element.text)
307
-
308
- result["source_elements"] = source_elements
308
+
309
+ result["source_elements"] = ElementCollection(source_elements)
309
310
 
310
311
  return result
311
312
 
@@ -386,7 +387,7 @@ class DocumentQA:
386
387
  if element.text in matched_texts:
387
388
  matched_texts.remove(element.text)
388
389
 
389
- result["source_elements"] = source_elements
390
+ result["source_elements"] = ElementCollection(source_elements)
390
391
 
391
392
  return result
392
393
 
@@ -351,4 +351,218 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
351
351
  return abs(value1 - value2) <= tolerance
352
352
 
353
353
  # Default to exact match for other types
354
- return value1 == value2
354
+ return value1 == value2
355
+
356
+
357
def _is_child_at(el, index):
    """Return True when *el* is the child at position *index* of its parent.

    Elements without a parent, parents without a ``children`` attribute and
    parents with no children all yield False instead of raising.
    """
    parent = getattr(el, 'parent', None)
    if not parent:
        return False
    children = getattr(parent, 'children', None)
    if not children:
        return False
    return bool(children[index] == el)


# Simple pseudo-classes that can be decided by looking at a single element.
# Each entry maps the pseudo-class name to a predicate taking the element and
# returning a truthy value when the element matches.
PSEUDO_CLASS_FUNCTIONS = {
    'bold': lambda el: hasattr(el, 'bold') and el.bold,
    'italic': lambda el: hasattr(el, 'italic') and el.italic,
    # Positional pseudo-classes; guarded so a missing/empty children list
    # means "no match" rather than an IndexError/AttributeError.
    'first-child': lambda el: _is_child_at(el, 0),
    'last-child': lambda el: _is_child_at(el, -1),
    # Negations only match elements that actually expose the attribute.
    'not-bold': lambda el: hasattr(el, 'bold') and not el.bold,
    'not-italic': lambda el: hasattr(el, 'italic') and not el.italic,
}
366
+
367
+
368
+ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
369
+ """
370
+ Convert a parsed selector to a filter function.
371
+
372
+ Args:
373
+ selector: Parsed selector dictionary
374
+ **kwargs: Additional filter parameters including:
375
+ - regex: Whether to use regex for text search
376
+ - case: Whether to do case-sensitive text search
377
+
378
+ Returns:
379
+ Function that takes an element and returns True if it matches
380
+ """
381
+ def filter_func(element):
382
+ # Check element type
383
+ if selector['type'] != 'any':
384
+ # Special handling for 'text' type to match both 'text', 'char', and 'word'
385
+ if selector['type'] == 'text':
386
+ if element.type not in ['text', 'char', 'word']:
387
+ return False
388
+ # Special handling for 'region' type to check for detected layout regions
389
+ elif selector['type'] == 'region':
390
+ # Check if this is a Region with region_type property
391
+ if not hasattr(element, 'region_type'):
392
+ return False
393
+
394
+ # If 'type' attribute specified, it will be checked in the attributes section
395
+ # Check for Docling-specific types (section-header, etc.)
396
+ elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
397
+ # This is a direct match with a Docling region type
398
+ pass
399
+ # Otherwise, require exact match with the element's type attribute
400
+ elif not hasattr(element, 'type') or element.type != selector['type']:
401
+ return False
402
+
403
+ # Check attributes
404
+ for name, attr_info in selector['attributes'].items():
405
+ op = attr_info['op']
406
+ value = attr_info['value']
407
+
408
+ # Special case for fontname attribute - allow matching part of the name
409
+ if name == 'fontname' and op == '*=':
410
+ element_value = getattr(element, name, None)
411
+ if element_value is None or value.lower() not in element_value.lower():
412
+ return False
413
+ continue
414
+
415
+ # Convert hyphenated attribute names to underscore for Python properties
416
+ python_name = name.replace('-', '_')
417
+
418
+ # Special case for region attributes
419
+ if selector['type'] == 'region':
420
+ if name == 'type':
421
+ # Use normalized_type for comparison if available
422
+ if hasattr(element, 'normalized_type') and element.normalized_type:
423
+ element_value = element.normalized_type
424
+ else:
425
+ # Convert spaces to hyphens for consistency with the normalized format
426
+ element_value = getattr(element, 'region_type', '').lower().replace(' ', '-')
427
+ elif name == 'model':
428
+ # Special handling for model attribute in regions
429
+ element_value = getattr(element, 'model', None)
430
+ else:
431
+ # Get the attribute value from the element normally
432
+ element_value = getattr(element, python_name, None)
433
+ else:
434
+ # Get the attribute value from the element normally for non-region elements
435
+ element_value = getattr(element, python_name, None)
436
+
437
+ if element_value is None:
438
+ return False
439
+
440
+ # Apply operator
441
+ if op == '=':
442
+ if element_value != value:
443
+ return False
444
+ elif op == '~=':
445
+ # Approximate match (e.g., for colors)
446
+ if not _is_approximate_match(element_value, value):
447
+ return False
448
+ elif op == '>=':
449
+ # Greater than or equal (element value must be >= specified value)
450
+ if not (isinstance(element_value, (int, float)) and
451
+ isinstance(value, (int, float)) and
452
+ element_value >= value):
453
+ return False
454
+ elif op == '<=':
455
+ # Less than or equal (element value must be <= specified value)
456
+ if not (isinstance(element_value, (int, float)) and
457
+ isinstance(value, (int, float)) and
458
+ element_value <= value):
459
+ return False
460
+ elif op == '>':
461
+ # Greater than (element value must be > specified value)
462
+ if not (isinstance(element_value, (int, float)) and
463
+ isinstance(value, (int, float)) and
464
+ element_value > value):
465
+ return False
466
+ elif op == '<':
467
+ # Less than (element value must be < specified value)
468
+ if not (isinstance(element_value, (int, float)) and
469
+ isinstance(value, (int, float)) and
470
+ element_value < value):
471
+ return False
472
+
473
+ # Check pseudo-classes
474
+ for pseudo in selector['pseudo_classes']:
475
+ name = pseudo['name']
476
+ args = pseudo['args']
477
+
478
+ # Handle various pseudo-classes
479
+ if name == 'contains' and hasattr(element, 'text'):
480
+ use_regex = kwargs.get('regex', False)
481
+ ignore_case = not kwargs.get('case', True)
482
+
483
+ if use_regex:
484
+ import re
485
+ if not element.text:
486
+ return False
487
+ try:
488
+ pattern = re.compile(args, re.IGNORECASE if ignore_case else 0)
489
+ if not pattern.search(element.text):
490
+ return False
491
+ except re.error:
492
+ # If regex is invalid, fall back to literal text search
493
+ element_text = element.text
494
+ search_text = args
495
+
496
+ if ignore_case:
497
+ element_text = element_text.lower()
498
+ search_text = search_text.lower()
499
+
500
+ if search_text not in element_text:
501
+ return False
502
+ else:
503
+ # String comparison with case sensitivity option
504
+ if not element.text:
505
+ return False
506
+
507
+ element_text = element.text
508
+ search_text = args
509
+
510
+ if ignore_case:
511
+ element_text = element_text.lower()
512
+ search_text = search_text.lower()
513
+
514
+ if search_text not in element_text:
515
+ return False
516
+ elif name == 'starts-with' and hasattr(element, 'text'):
517
+ if not element.text or not element.text.startswith(args):
518
+ return False
519
+ elif name == 'ends-with' and hasattr(element, 'text'):
520
+ if not element.text or not element.text.endswith(args):
521
+ return False
522
+ elif name == 'bold':
523
+ if not (hasattr(element, 'bold') and element.bold):
524
+ return False
525
+ elif name == 'italic':
526
+ if not (hasattr(element, 'italic') and element.italic):
527
+ return False
528
+ elif name == 'horizontal':
529
+ if not (hasattr(element, 'is_horizontal') and element.is_horizontal):
530
+ return False
531
+ elif name == 'vertical':
532
+ if not (hasattr(element, 'is_vertical') and element.is_vertical):
533
+ return False
534
+ else:
535
+ # Check pseudo-classes (basic ones like :bold, :italic)
536
+ if name in PSEUDO_CLASS_FUNCTIONS:
537
+ if not PSEUDO_CLASS_FUNCTIONS[name](element):
538
+ return False
539
+ elif name == 'contains':
540
+ if not hasattr(element, 'text') or not element.text:
541
+ return False
542
+ text_to_check = element.text
543
+ search_term = args
544
+ if not kwargs.get('case', True): # Check case flag from kwargs
545
+ text_to_check = text_to_check.lower()
546
+ search_term = search_term.lower()
547
+
548
+ if kwargs.get('regex', False): # Check regex flag from kwargs
549
+ try:
550
+ if not re.search(search_term, text_to_check):
551
+ return False
552
+ except re.error as e:
553
+ logger.warning(f"Invalid regex in :contains selector '{search_term}': {e}")
554
+ return False # Invalid regex cannot match
555
+ else:
556
+ if search_term not in text_to_check:
557
+ return False
558
+ # Skip complex pseudo-classes like :near, :above here, handled later
559
+ elif name in ('above', 'below', 'near', 'left-of', 'right-of'):
560
+ pass # Handled separately after initial filtering
561
+ else:
562
+ # Optionally log unknown pseudo-classes
563
+ # logger.warning(f"Unknown pseudo-class: {name}")
564
+ pass
565
+
566
+ return True # Element passes all attribute and simple pseudo-class filters
567
+
568
+ return filter_func
@@ -127,10 +127,10 @@ def create_legend(labels_colors: Dict[str, Tuple[int, int, int, int]],
127
127
  # Try to load a font, use default if not available
128
128
  try:
129
129
  # Use a commonly available font, adjust size
130
- font = ImageFont.truetype("DejaVuSans.ttf", 12)
130
+ font = ImageFont.truetype("DejaVuSans.ttf", 14)
131
131
  except IOError:
132
132
  try:
133
- font = ImageFont.truetype("Arial.ttf", 12)
133
+ font = ImageFont.truetype("Arial.ttf", 14)
134
134
  except IOError:
135
135
  font = ImageFont.load_default()
136
136
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -48,7 +48,7 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
48
48
  Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
49
49
 
50
50
  - [Complete documentation here](https://jsoma.github.io/natural-pdf)
51
- - [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
51
+ - [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
52
52
 
53
53
  ## Features
54
54
 
@@ -74,18 +74,16 @@ pip install natural-pdf
74
74
 
75
75
  # Installs the core library along with required AI dependencies (PyTorch, Transformers)
76
76
  ```bash
77
- # Install with support for specific OCR engines
78
- pip install natural-pdf[easyocr] # EasyOCR engine
79
- pip install natural-pdf[paddle] # PaddleOCR engine (requires paddlepaddle)
80
- pip install natural-pdf[surya] # Surya OCR engine
81
-
82
- # Install with support for YOLO layout detection model
77
+ # Install with support for specific OCR and layout engines
78
+ pip install natural-pdf[easyocr]
79
+ pip install natural-pdf[paddle]
80
+ pip install natural-pdf[surya]
83
81
  pip install natural-pdf[layout_yolo]
84
82
 
85
83
  # Install with support for the interactive Jupyter widget
86
84
  pip install natural-pdf[interactive]
87
85
 
88
- # Install everything
86
+ # Just install everything
89
87
  pip install natural-pdf[all]
90
88
  ```
91
89
 
@@ -119,6 +117,8 @@ clean_text = page.extract_text()
119
117
  print(clean_text)
120
118
  ```
121
119
 
120
+ - [Complete documentation here](https://jsoma.github.io/natural-pdf)
121
+
122
122
  ## Selectors
123
123
 
124
124
  The library supports CSS-like selectors for finding elements:
@@ -185,7 +185,7 @@ Exclusions work efficiently with different region types:
185
185
 
186
186
  ## OCR Integration
187
187
 
188
- Extract text from scanned documents using OCR, with support for multiple engines (EasyOCR, PaddleOCR, Surya):
188
+ Extract text from scanned documents using OCR, with support for multiple engines ([EasyOCR](https://www.jaided.ai/easyocr/), [PaddleOCR](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html), [Surya](https://github.com/VikParuchuri/surya)):
189
189
 
190
190
  ```python
191
191
  # Apply OCR using a specific engine (e.g., PaddleOCR)
@@ -1,4 +1,4 @@
1
- natural_pdf/__init__.py,sha256=kKHL7SWzk0_ydDDX12X5W3s9-vEKgVYOBubXzp_SCdM,1784
1
+ natural_pdf/__init__.py,sha256=hsSosbPnvDRCfyYAL9bf1haVS6oBxLAl7cbKTWRTHkU,1784
2
2
  natural_pdf/analyzers/__init__.py,sha256=BkSmEqw5J76C2fvYHF86EXQJQWWFNIvjSwRMwfW-Ht0,140
3
3
  natural_pdf/analyzers/text_options.py,sha256=9IGRoem1O2mc1ZNGiM5-VPRZ3c8LLwEk1B3is9UxMoE,2777
4
4
  natural_pdf/analyzers/text_structure.py,sha256=e4G6v0bD7ZJCdo6DcuDD3iZt8KAwBfALMduwZHGh0wI,12415
@@ -15,15 +15,15 @@ natural_pdf/analyzers/layout/tatr.py,sha256=H0Xygk9jA46-vlPleoal94cuDyz-LHTSxVb3
15
15
  natural_pdf/analyzers/layout/yolo.py,sha256=NSQK3TcS1qN8D2MDxCvcwTpS_kvzGy3I2LepJDUceoQ,7699
16
16
  natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
17
17
  natural_pdf/core/element_manager.py,sha256=H1896JSt48ASLSmG22xEXMY-xSKcpYsUlYmYMD48i6Q,17117
18
- natural_pdf/core/highlighting_service.py,sha256=gcWZnvlscg32anJrh0m3gVgIyrRKTMDHIL5Ft8OOTjA,29454
19
- natural_pdf/core/page.py,sha256=rKXxdnG4cl8qjRoKEBxXL9ncLWvujDoVWQ9_D9ouHxc,64428
20
- natural_pdf/core/pdf.py,sha256=VAAe-BU8bcbCTiQ43fp8lsVy8q8KSfN9eAbFp9mJOWw,28296
18
+ natural_pdf/core/highlighting_service.py,sha256=a-40UMohOglYrw4klW1GuQ_p3jZOxnAfPOXPORThr4U,31476
19
+ natural_pdf/core/page.py,sha256=tnxG-5OhFVuFHt0p-a9YSLU-nXjA8fftg5ViQdH5sOU,68512
20
+ natural_pdf/core/pdf.py,sha256=UzxVfVeCnhSN7rxdJresUj_UNFkcFkeaEjLvwZMJS-c,28532
21
21
  natural_pdf/elements/__init__.py,sha256=6FGHZm2oONd8zErahMEawuB4AvJR5jOZPt4KtEwbj80,40
22
- natural_pdf/elements/base.py,sha256=Bi6hylE1N252d-GSPZy1mFMvnWh18b9dEGbIRXthq88,32057
23
- natural_pdf/elements/collections.py,sha256=qkpUZuf08n-NPhCrOE40cRg-T2F5jpba1Xhuo2CKr-c,59982
22
+ natural_pdf/elements/base.py,sha256=9SQ-O2qbQe9Avbf9JI-p6vWlyThZVch-p1yqXWSrBHw,35750
23
+ natural_pdf/elements/collections.py,sha256=RJf4cBZeLfCtfS0-SjzYFRCtbzYjWsgk3LrcTwJAYMs,62392
24
24
  natural_pdf/elements/line.py,sha256=QvVdhf_K6rwJkq3q67JmgdZpDhrBgWuSMF-Q25malP4,4783
25
25
  natural_pdf/elements/rect.py,sha256=dls9g-R213O78HvfAJMak3_eV14Zh654Zw7hqTTXxDQ,3949
26
- natural_pdf/elements/region.py,sha256=GVenh3ICfojVpSpwKMEayUBBesywowPTTk7y44MLo6g,76835
26
+ natural_pdf/elements/region.py,sha256=MXQK00LLMvwuq94NigeeCVFoGov_RWFe9ZylnIMpzB0,72453
27
27
  natural_pdf/elements/text.py,sha256=OAuy0ozaemj6yjMwhXPsJ76VZtRPeJbmrFTzpDJA2_U,11017
28
28
  natural_pdf/ocr/__init__.py,sha256=mbUUsCfeU6yRsEqNn3I4Len-XY6FfjfKhTAoWDLA1f4,1943
29
29
  natural_pdf/ocr/engine.py,sha256=xDnvhnm4Lr7d83ezglDqOtl9xfx74zOOTyYW-fZHQEQ,4183
@@ -33,20 +33,20 @@ natural_pdf/ocr/engine_surya.py,sha256=gWV_BEuLMqmJcKVlag9i45SsO2uLAtI-dayBm1xbD
33
33
  natural_pdf/ocr/ocr_manager.py,sha256=mAyCntdAnrNv8TIvGYlGs40G2tDAdMQ_Jqb3owiPWW8,9934
34
34
  natural_pdf/ocr/ocr_options.py,sha256=A2CQV172id-90zMpPZWb8CD09ZP0BuQnnCZGEFP4SaQ,3787
35
35
  natural_pdf/qa/__init__.py,sha256=kagdfqNMpTnyzjC2EFy_PBX5us38NnJL548ESSQVzfI,107
36
- natural_pdf/qa/document_qa.py,sha256=x_AYE0kbs7_4n5NC7zWcxQpHFh0vxP3g3q-l_w4RgSU,15845
36
+ natural_pdf/qa/document_qa.py,sha256=QYKKor0RqUQcEdFEBEUdq7L0ktq1WSMfQ-ynTc64cPU,15926
37
37
  natural_pdf/selectors/__init__.py,sha256=Jfk-JBZEpQ7V5FWVGuLJQLH-qOfqNLC2AdicncMhrmY,121
38
- natural_pdf/selectors/parser.py,sha256=scYuM0Kp-Bidc2KaYwOMiMYSeR-6q970-2Xwy5zsdNE,13784
38
+ natural_pdf/selectors/parser.py,sha256=JK1zDVISACkUhzmzWfQMMW8hvsV422lRBFKgDBWOWC4,24108
39
39
  natural_pdf/templates/__init__.py,sha256=i7N8epDxZoDDsK4p2iUiMwzKVs97i_KtNk8ATArqlC4,19
40
40
  natural_pdf/templates/ocr_debug.html,sha256=Zy9StzBeHFQU8ity6cjFSZLe3TY0QOabUux4c5WQUzs,19171
41
41
  natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
42
42
  natural_pdf/utils/highlighting.py,sha256=9H8vbWhwgxzjrL7MhAePXUWZZctLPboNocJzy-1TE_g,675
43
43
  natural_pdf/utils/reading_order.py,sha256=1oihH9ZTqQvIVDYc2oVEYqIXyPzi94ERtelp6TyzmWU,7594
44
- natural_pdf/utils/visualization.py,sha256=p2855QGyRXUFNH8rzgrIVzCSbuf8WXwV_j1YgP518uo,8876
44
+ natural_pdf/utils/visualization.py,sha256=14BM-K4ovDqHniNbxbP_y9KaEYNlkbpELGAv9_8aOZ4,8876
45
45
  natural_pdf/widgets/__init__.py,sha256=qckw3DjdVTsASPLJ8uUrGKg3MFhvzHndUpeNGlqwg6A,215
46
46
  natural_pdf/widgets/viewer.py,sha256=h_amj_uvf-vRqEsFg4P00fgKxawLAd9jjC1ohUza4BY,37479
47
47
  natural_pdf/widgets/frontend/viewer.js,sha256=w8ywfz_IOAAv2nP_qaf2VBUkF1KhjT3zorhJxM1-CfU,4371
48
- natural_pdf-0.1.0.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
49
- natural_pdf-0.1.0.dist-info/METADATA,sha256=_RZKF1mkrpUxpdOO0oMy_HGjj0ZlrvAGIblwmheKCQQ,9960
50
- natural_pdf-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
51
- natural_pdf-0.1.0.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
52
- natural_pdf-0.1.0.dist-info/RECORD,,
48
+ natural_pdf-0.1.1.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
49
+ natural_pdf-0.1.1.dist-info/METADATA,sha256=8o22GEPtEqlSqexFQxy6tVoHTB35LmT63sjbjbjORRE,10009
50
+ natural_pdf-0.1.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
51
+ natural_pdf-0.1.1.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
52
+ natural_pdf-0.1.1.dist-info/RECORD,,