natural-pdf 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the contents of each version as they appear in their respective public registries.
natural_pdf/core/page.py CHANGED
@@ -7,6 +7,8 @@ from PIL import Image
7
7
  import base64
8
8
  import io
9
9
  import json
10
+ import re
11
+ import hashlib
10
12
 
11
13
  from natural_pdf.elements.collections import ElementCollection
12
14
  from natural_pdf.elements.region import Region
@@ -96,6 +98,11 @@ class Page:
96
98
  """Get page number (1-based)."""
97
99
  return self._page.page_number
98
100
 
101
+ @property
102
+ def page_number(self) -> int:
103
+ """Get page number (1-based)."""
104
+ return self._page.page_number
105
+
99
106
  @property
100
107
  def index(self) -> int:
101
108
  """Get page index (0-based)."""
@@ -127,7 +134,7 @@ class Page:
127
134
  self._exclusions = []
128
135
  return self
129
136
 
130
- def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any]) -> 'Page':
137
+ def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any], label: Optional[str] = None) -> 'Page':
131
138
  """
132
139
  Add an exclusion to the page. Text from these regions will be excluded from extraction.
133
140
  Ensures non-callable items are stored as Region objects if possible.
@@ -135,6 +142,7 @@ class Page:
135
142
  Args:
136
143
  exclusion_func_or_region: Either a callable function returning a Region,
137
144
  a Region object, or another object with a valid .bbox attribute.
145
+ label: Optional label for this exclusion (e.g., 'header', 'footer').
138
146
 
139
147
  Returns:
140
148
  Self for method chaining
@@ -142,28 +150,36 @@ class Page:
142
150
  Raises:
143
151
  TypeError: If a non-callable, non-Region object without a valid bbox is provided.
144
152
  """
153
+ exclusion_data = None # Initialize exclusion data
154
+
145
155
  if callable(exclusion_func_or_region):
146
- # Store callable functions directly
147
- self._exclusions.append(exclusion_func_or_region)
148
- logger.debug(f"Page {self.index}: Added callable exclusion: {exclusion_func_or_region}")
156
+ # Store callable functions along with their label
157
+ exclusion_data = (exclusion_func_or_region, label)
158
+ logger.debug(f"Page {self.index}: Added callable exclusion '{label}': {exclusion_func_or_region}")
149
159
  elif isinstance(exclusion_func_or_region, Region):
150
- # Store Region objects directly
151
- self._exclusions.append(exclusion_func_or_region)
152
- logger.debug(f"Page {self.index}: Added Region exclusion: {exclusion_func_or_region}")
160
+ # Store Region objects directly, assigning the label
161
+ exclusion_func_or_region.label = label # Assign label
162
+ exclusion_data = (exclusion_func_or_region, label) # Store as tuple for consistency
163
+ logger.debug(f"Page {self.index}: Added Region exclusion '{label}': {exclusion_func_or_region}")
153
164
  elif hasattr(exclusion_func_or_region, 'bbox') and isinstance(getattr(exclusion_func_or_region, 'bbox', None), (tuple, list)) and len(exclusion_func_or_region.bbox) == 4:
154
165
  # Convert objects with a valid bbox to a Region before storing
155
166
  try:
156
167
  bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
157
- region_to_add = Region(self, bbox_coords)
158
- self._exclusions.append(region_to_add)
159
- logger.debug(f"Page {self.index}: Added exclusion converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
168
+ # Pass the label to the Region constructor
169
+ region_to_add = Region(self, bbox_coords, label=label)
170
+ exclusion_data = (region_to_add, label) # Store as tuple
171
+ logger.debug(f"Page {self.index}: Added exclusion '{label}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
160
172
  except (ValueError, TypeError, Exception) as e:
161
173
  # Raise an error if conversion fails
162
174
  raise TypeError(f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}") from e
163
175
  else:
164
176
  # Reject invalid types
165
177
  raise TypeError(f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute.")
166
-
178
+
179
+ # Append the stored data (tuple of object/callable and label)
180
+ if exclusion_data:
181
+ self._exclusions.append(exclusion_data)
182
+
167
183
  return self
168
184
 
169
185
  def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
@@ -222,75 +238,66 @@ class Page:
222
238
  def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
223
239
  """
224
240
  Get all exclusion regions for this page.
225
- Assumes self._exclusions contains only callables or Region objects.
241
+ Assumes self._exclusions contains tuples of (callable/Region, label).
226
242
 
227
243
  Args:
228
244
  include_callable: Whether to evaluate callable exclusion functions
229
245
  debug: Enable verbose debug logging for exclusion evaluation
230
246
 
231
247
  Returns:
232
- List of Region objects to exclude
248
+ List of Region objects to exclude, with labels assigned.
233
249
  """
234
250
  regions = []
235
251
 
236
- # Track exclusion results for debugging
237
252
  if debug:
238
253
  print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
239
-
240
- for i, exclusion in enumerate(self._exclusions):
241
- # Get exclusion label if it's a tuple from PDF level
242
- exclusion_label = f"exclusion {i}"
243
- original_exclusion = exclusion # Keep track for debugging
244
-
245
- # Check if it's a tuple from PDF.add_exclusion (should still be handled if PDF adds labels)
246
- if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
247
- exclusion_func, label = exclusion
248
- if label:
249
- exclusion_label = label
250
- exclusion = exclusion_func # Use the function part
251
-
254
+
255
+ for i, exclusion_data in enumerate(self._exclusions):
256
+ # Unpack the exclusion object/callable and its label
257
+ exclusion_item, label = exclusion_data
258
+ exclusion_label = label if label else f"exclusion {i}"
259
+
252
260
  # Process callable exclusion functions
253
- if callable(exclusion) and include_callable:
254
- # It's a function, call it with this page
261
+ if callable(exclusion_item) and include_callable:
255
262
  try:
256
263
  if debug:
257
- print(f" - Evaluating callable {exclusion_label}...")
258
-
259
- # Temporarily clear exclusions to avoid potential recursion if the callable uses exclusions itself
260
- # This might be overly cautious depending on use case, but safer.
264
+ print(f" - Evaluating callable '{exclusion_label}'...")
265
+
266
+ # Temporarily clear exclusions (consider if really needed)
261
267
  temp_original_exclusions = self._exclusions
262
- self._exclusions = []
263
-
268
+ self._exclusions = []
269
+
264
270
  # Call the function - Expects it to return a Region or None
265
- region_result = exclusion(self)
266
-
271
+ region_result = exclusion_item(self)
272
+
267
273
  # Restore exclusions
268
274
  self._exclusions = temp_original_exclusions
269
-
275
+
270
276
  if isinstance(region_result, Region):
277
+ # Assign the label to the returned region
278
+ region_result.label = label
271
279
  regions.append(region_result)
272
280
  if debug:
273
- print(f" ✓ Added region from callable: {region_result}")
281
+ print(f" ✓ Added region from callable '{label}': {region_result}")
274
282
  elif region_result:
275
- # Log warning if callable returned something other than Region/None
276
- logger.warning(f"Callable exclusion {exclusion_label} returned non-Region object: {type(region_result)}. Skipping.")
283
+ logger.warning(f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping.")
277
284
  if debug:
278
285
  print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
279
286
  else:
280
287
  if debug:
281
- print(f" ✗ Callable returned None, no region added")
282
-
288
+ print(f" ✗ Callable '{exclusion_label}' returned None, no region added")
289
+
283
290
  except Exception as e:
284
- error_msg = f"Error evaluating callable exclusion {exclusion_label} for page {self.index}: {e}"
291
+ error_msg = f"Error evaluating callable exclusion '{exclusion_label}' for page {self.index}: {e}"
285
292
  print(error_msg)
286
293
  import traceback
287
294
  print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
288
-
289
- # Process direct Region objects (already validated by add_exclusion)
290
- elif isinstance(exclusion, Region):
291
- regions.append(exclusion)
295
+
296
+ # Process direct Region objects (label was assigned in add_exclusion)
297
+ elif isinstance(exclusion_item, Region):
298
+ regions.append(exclusion_item) # Label is already on the Region object
292
299
  if debug:
293
- print(f" - Added direct region: {exclusion}")
300
+ print(f" - Added direct region '{label}': {exclusion_item}")
294
301
  # No else needed, add_exclusion should prevent invalid types
295
302
 
296
303
  if debug:
@@ -1485,25 +1492,46 @@ class Page:
1485
1492
  RuntimeError: If required dependencies (ipywidgets) are missing.
1486
1493
  ValueError: If image rendering or data preparation fails within from_page.
1487
1494
  """
1488
- # Import the widget class (might need to be moved to top if used elsewhere)
1489
- from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
1495
+ # Dynamically import here if needed, or ensure it's globally available
1496
+ try:
1497
+ from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
1498
+ except ImportError:
1499
+ logger.error("Interactive viewer requires optional dependencies. Install with `pip install natural-pdf[widgets]`")
1500
+ raise
1501
+
1502
+ # Pass self (the Page object) to the factory method
1503
+ return SimpleInteractiveViewerWidget.from_page(self)
1504
+
1505
+ # --- Indexable Protocol Methods ---
1506
+ def get_id(self) -> str:
1507
+ """Returns a unique identifier for the page (required by Indexable protocol)."""
1508
+ # Ensure path is safe for use in IDs (replace problematic chars)
1509
+ safe_path = re.sub(r'[^a-zA-Z0-9_-]', '_', str(self.pdf.path))
1510
+ return f"pdf_{safe_path}_page_{self.page_number}"
1511
+
1512
+ def get_metadata(self) -> Dict[str, Any]:
1513
+ """Returns metadata associated with the page (required by Indexable protocol)."""
1514
+ # Add content hash here for sync
1515
+ metadata = {
1516
+ "pdf_path": str(self.pdf.path),
1517
+ "page_number": self.page_number,
1518
+ "width": self.width,
1519
+ "height": self.height,
1520
+ "content_hash": self.get_content_hash() # Include the hash
1521
+ }
1522
+ return metadata
1490
1523
 
1491
- logger.info(f"Generating interactive viewer for Page {self.number} using SimpleInteractiveViewerWidget.from_page...")
1524
+ def get_content(self) -> 'Page':
1525
+ """
1526
+ Returns the primary content object (self) for indexing (required by Indexable protocol).
1527
+ SearchService implementations decide how to process this (e.g., call extract_text).
1528
+ """
1529
+ return self # Return the Page object itself
1492
1530
 
1493
- try:
1494
- # Delegate creation entirely to the from_page class method
1495
- viewer_widget = SimpleInteractiveViewerWidget.from_page(self)
1496
- if viewer_widget is None:
1497
- # This case might happen if from_page had error handling to return None, though we removed most.
1498
- # Keeping a check here just in case.
1499
- raise RuntimeError("SimpleInteractiveViewerWidget.from_page returned None, indicating an issue during widget creation.")
1500
-
1501
- logger.info("Interactive viewer widget created successfully.")
1502
- return viewer_widget
1503
- except ImportError as e:
1504
- logger.error("Failed to import SimpleInteractiveViewerWidget. Ensure natural_pdf.widgets and ipywidgets are installed.")
1505
- raise RuntimeError("Widget class not found. ipywidgets or natural_pdf.widgets might be missing or setup incorrect.") from e
1506
- except Exception as e:
1507
- logger.error(f"Failed to create interactive viewer: {e}", exc_info=True)
1508
- # Re-raise the exception to make it visible to the user
1509
- raise RuntimeError(f"Failed to create interactive viewer: {e}") from e
1531
+ def get_content_hash(self) -> str:
1532
+ """Returns a SHA256 hash of the extracted text content (required by Indexable for sync)."""
1533
+ # Hash the extracted text (without exclusions for consistency)
1534
+ # Consider if exclusions should be part of the hash? For now, hash raw text.
1535
+ # Using extract_text directly might be slow if called repeatedly. Cache? TODO: Optimization
1536
+ text_content = self.extract_text(use_exclusions=False, preserve_whitespace=False) # Normalize whitespace?
1537
+ return hashlib.sha256(text_content.encode('utf-8')).hexdigest()