natural-pdf 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +33 -1
- natural_pdf/collections/pdf_collection.py +259 -0
- natural_pdf/core/page.py +97 -69
- natural_pdf/core/pdf.py +382 -171
- natural_pdf/elements/region.py +3 -1
- natural_pdf/exporters/__init__.py +1 -0
- natural_pdf/exporters/searchable_pdf.py +252 -0
- natural_pdf/search/__init__.py +94 -0
- natural_pdf/search/haystack_search_service.py +520 -0
- natural_pdf/search/haystack_utils.py +386 -0
- natural_pdf/search/search_options.py +72 -0
- natural_pdf/search/search_service_protocol.py +189 -0
- natural_pdf/search/searchable_mixin.py +464 -0
- {natural_pdf-0.1.2.dist-info → natural_pdf-0.1.3.dist-info}/METADATA +14 -1
- {natural_pdf-0.1.2.dist-info → natural_pdf-0.1.3.dist-info}/RECORD +18 -9
- {natural_pdf-0.1.2.dist-info → natural_pdf-0.1.3.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.2.dist-info → natural_pdf-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.2.dist-info → natural_pdf-0.1.3.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py
CHANGED
@@ -52,4 +52,36 @@ __version__ = "0.1.1"
|
|
52
52
|
if HAS_QA:
|
53
53
|
__all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging", "DocumentQA", "get_qa_engine"]
|
54
54
|
else:
|
55
|
-
__all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
|
55
|
+
__all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
|
56
|
+
|
57
|
+
# Core classes
|
58
|
+
from .core.pdf import PDF
|
59
|
+
from .collections.pdf_collection import PDFCollection
|
60
|
+
from .elements.region import Region
|
61
|
+
|
62
|
+
# Search options (if extras installed)
|
63
|
+
try:
|
64
|
+
from .search.search_options import TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
|
65
|
+
except ImportError:
|
66
|
+
# Define dummy classes if extras not installed, so imports don't break
|
67
|
+
# but using them will raise the ImportError from check_haystack_availability
|
68
|
+
class TextSearchOptions:
|
69
|
+
def __init__(self, *args, **kwargs): pass
|
70
|
+
class MultiModalSearchOptions:
|
71
|
+
def __init__(self, *args, **kwargs): pass
|
72
|
+
class BaseSearchOptions:
|
73
|
+
def __init__(self, *args, **kwargs): pass
|
74
|
+
|
75
|
+
# Expose logging setup? (Optional)
|
76
|
+
# from . import logging_config
|
77
|
+
# logging_config.setup_logging()
|
78
|
+
|
79
|
+
# Explicitly define what gets imported with 'from natural_pdf import *'.
# Extend the __all__ assembled earlier in this module (which already lists
# Page, ElementCollection, configure_logging and, when the QA extras are
# available, DocumentQA / get_qa_engine) instead of overwriting it, so those
# names keep being star-exported. dict.fromkeys drops duplicates such as
# 'PDF' and 'Region' while preserving the original ordering.
__all__ = list(dict.fromkeys([
    *__all__,
    'PDF',
    'PDFCollection',
    'Region',
    'TextSearchOptions',  # Include search options
    'MultiModalSearchOptions',
    'BaseSearchOptions',
]))
|
@@ -0,0 +1,259 @@
|
|
1
|
+
import os
|
2
|
+
import glob as py_glob
|
3
|
+
import logging
|
4
|
+
from typing import List, Optional, Dict, Any, Union, Iterable, Set, TYPE_CHECKING, Type
|
5
|
+
from pathlib import Path
|
6
|
+
from PIL import Image
|
7
|
+
import re # Added for safe path generation
|
8
|
+
import copy # Added for copying options
|
9
|
+
from tqdm import tqdm
|
10
|
+
|
11
|
+
# Set up logger early
|
12
|
+
logger = logging.getLogger(__name__)
|
13
|
+
|
14
|
+
from natural_pdf.core.pdf import PDF
|
15
|
+
from natural_pdf.elements.region import Region
|
16
|
+
|
17
|
+
# --- Search Imports ---
|
18
|
+
try:
    from natural_pdf.search.search_service_protocol import (
        SearchServiceProtocol, SearchOptions, Indexable
    )
    from natural_pdf.search.searchable_mixin import SearchableMixin
except ImportError as e:
    # The search components could not be imported: log the failure and fall
    # back to inert placeholders so this module (and PDFCollection) can
    # still be imported without search functionality.
    logger_init = logging.getLogger(__name__)
    logger_init.error(f"Failed to import search components. Search functionality disabled. Error: {e}", exc_info=True)

    # Dummy definitions
    class SearchableMixin:
        """Placeholder base class used when the search components are unavailable."""
        pass

    SearchServiceProtocol, SearchOptions, Indexable = object, object, object

# NOTE: SearchableMixin must NOT be imported again unconditionally here.
# Doing so would re-raise the ImportError handled above and defeat the
# fallback definitions.
|
31
|
+
|
32
|
+
class PDFCollection(SearchableMixin):  # Inherit from the mixin
    """
    An ordered collection of PDF objects built from PDFs, paths, URLs,
    directories or glob patterns.

    Supports len(), iteration, integer indexing (including negative indices)
    and slicing (which returns a new PDFCollection sharing the same PDF
    objects).
    """

    def __init__(self,
                 source: Union[str, Iterable[Union[str, 'PDF']]],
                 recursive: bool = True,
                 **pdf_options: Any):
        """
        Initializes a collection of PDF documents from various sources.

        Args:
            source: The source of PDF documents. Can be:
                - An iterable (e.g., list) of existing PDF objects.
                - An iterable (e.g., list) of file paths/URLs/globs (strings).
                - A single file path/URL/directory/glob string.
            recursive: If source involves directories or glob patterns,
                       whether to search recursively (default: True).
            **pdf_options: Keyword arguments passed to the PDF constructor.

        Raises:
            TypeError: If an iterable source starts with a PDF object but
                also contains non-PDF items.
        """
        self._pdfs: List['PDF'] = []
        self._pdf_options = pdf_options  # Store options for potential slicing later
        self._recursive = recursive  # Store setting for potential slicing

        # Initialise these before any early return below, so that every
        # construction path (empty source, list of PDF objects, paths)
        # yields an instance with the same attributes.
        self._iter_index = 0
        self._search_service: Optional[SearchServiceProtocol] = None

        # Dynamically import PDF class within methods to avoid circular import at module load time
        PDF = self._get_pdf_class()

        if hasattr(source, '__iter__') and not isinstance(source, str):
            source_list = list(source)
            if not source_list:
                return  # Empty iterable source
            if isinstance(source_list[0], PDF):
                if not all(isinstance(item, PDF) for item in source_list):
                    raise TypeError("Iterable source has mixed PDF/non-PDF objects.")
                self._pdfs = source_list  # Direct assignment
                return
            # Keep working with the materialised list: `source` may have been
            # a one-shot iterator that list() has just exhausted.
            source = source_list

        # Resolve string, iterable of strings, or single string source to paths/URLs
        resolved_paths_or_urls = self._resolve_sources_to_paths(source)
        self._initialize_pdfs(resolved_paths_or_urls, PDF)  # Pass PDF class

    @staticmethod
    def _get_pdf_class():
        """
        Helper method to dynamically import the PDF class.

        Raises:
            ImportError: If natural_pdf.core.pdf.PDF cannot be imported.
        """
        try:
            # Import needs to resolve path correctly
            from natural_pdf.core.pdf import PDF
            return PDF
        except ImportError as e:
            logger.error("Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime.")
            raise ImportError("PDF class is required but could not be imported.") from e

    # --- Internal Helpers ---

    def _is_url(self, s: str) -> bool:
        """Return True if `s` starts with an http:// or https:// scheme."""
        return s.startswith(('http://', 'https://'))

    def _has_glob_magic(self, s: str) -> bool:
        """Return True if `s` contains glob wildcard characters."""
        return py_glob.has_magic(s)

    def _execute_glob(self, pattern: str) -> Set[str]:
        """
        Glob for paths and return a set of valid PDF paths.

        Only existing files with a ".pdf" suffix (case-insensitive) are kept,
        stored as resolved absolute paths. Errors raised while globbing are
        logged and whatever was collected so far is returned (best effort).
        """
        found_paths = set()
        try:
            # Use iglob for potentially large directories/matches
            for path_str in py_glob.iglob(pattern, recursive=self._recursive):
                p = Path(path_str)
                if p.is_file() and p.suffix.lower() == ".pdf":
                    found_paths.add(str(p.resolve()))  # Store resolved absolute path
        except Exception as e:
            logger.error(f"Error processing glob pattern '{pattern}': {e}")
        return found_paths

    def _resolve_sources_to_paths(self, source: Union[str, Iterable[str]]) -> List[str]:
        """
        Resolves various source types into a sorted list of unique PDF paths/URLs.

        URLs are kept verbatim; globs and directories are expanded; plain
        ".pdf" files are resolved to absolute paths. Non-string items and
        unrecognised strings are skipped with a warning.

        Raises:
            TypeError: If `source` is neither a string nor an iterable.
        """
        final_paths = set()

        if isinstance(source, str):
            sources_to_process = [source]
        elif hasattr(source, '__iter__'):
            sources_to_process = list(source)
        else:  # Should not happen based on __init__ checks, but safeguard
            raise TypeError(f"Unexpected source type in _resolve_sources_to_paths: {type(source)}")

        for item in sources_to_process:
            if not isinstance(item, str):
                logger.warning(f"Skipping non-string item in source list: {type(item)}")
                continue

            item_path = Path(item)

            if self._is_url(item):
                final_paths.add(item)  # Add URL directly
            elif self._has_glob_magic(item):
                final_paths.update(self._execute_glob(item))
            elif item_path.is_dir():
                # Use glob to find PDFs in directory, respecting recursive flag
                dir_pattern = str(item_path / "**" / "*.pdf") if self._recursive else str(item_path / "*.pdf")
                final_paths.update(self._execute_glob(dir_pattern))
            elif item_path.is_file() and item_path.suffix.lower() == ".pdf":
                final_paths.add(str(item_path.resolve()))  # Add resolved file path
            else:
                logger.warning(f"Source item ignored (not a valid URL, directory, file, or glob): {item}")

        return sorted(final_paths)

    def _initialize_pdfs(self, paths_or_urls: List[str], PDF_cls: Type):
        """
        Initializes PDF objects from a list of paths/URLs.

        Each item is passed to `PDF_cls` together with the stored pdf options.
        Failures are logged and counted rather than raised, so one bad file
        does not abort loading the rest.
        """
        logger.info(f"Initializing {len(paths_or_urls)} PDF objects...")
        failed_count = 0
        for path_or_url in tqdm(paths_or_urls, desc="Loading PDFs"):
            try:
                pdf_instance = PDF_cls(path_or_url, **self._pdf_options)
                self._pdfs.append(pdf_instance)
            except Exception as e:
                logger.error(f"Failed to load PDF: {path_or_url}. Error: {e}", exc_info=False)  # Keep log concise
                failed_count += 1
        logger.info(f"Successfully initialized {len(self._pdfs)} PDFs. Failed: {failed_count}")

    # --- Public Factory Class Methods (Simplified) ---

    @classmethod
    def from_paths(cls, paths_or_urls: List[str], **pdf_options: Any) -> 'PDFCollection':
        """Creates a PDFCollection explicitly from a list of file paths or URLs."""
        # __init__ can handle List[str] directly now
        return cls(paths_or_urls, **pdf_options)

    @classmethod
    def from_glob(cls, pattern: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
        """Creates a PDFCollection explicitly from a single glob pattern."""
        # __init__ can handle single glob string directly
        return cls(pattern, recursive=recursive, **pdf_options)

    @classmethod
    def from_globs(cls, patterns: List[str], recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
        """Creates a PDFCollection explicitly from a list of glob patterns."""
        # __init__ can handle List[str] containing globs directly
        return cls(patterns, recursive=recursive, **pdf_options)

    @classmethod
    def from_directory(cls, directory_path: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
        """Creates a PDFCollection explicitly from PDF files within a directory."""
        # __init__ can handle single directory string directly
        return cls(directory_path, recursive=recursive, **pdf_options)

    # --- Core Collection Methods ---
    def __len__(self) -> int:
        """Return the number of PDFs in the collection."""
        return len(self._pdfs)

    def __getitem__(self, key) -> Union['PDF', 'PDFCollection']:
        """
        Return one PDF (integer key) or a new PDFCollection (slice key).

        Negative integers index from the end, as with a list.

        Raises:
            IndexError: If an integer key is out of range.
            TypeError: If the key is neither an int nor a slice.
        """
        if isinstance(key, slice):
            # Create a new collection with the sliced PDFs and original options.
            # __init__ is bypassed, so every attribute it would set is
            # assigned explicitly here.
            new_collection = PDFCollection.__new__(PDFCollection)  # Create blank instance
            new_collection._pdfs = self._pdfs[key]
            new_collection._pdf_options = self._pdf_options
            new_collection._recursive = self._recursive
            new_collection._iter_index = 0
            # Search context is not copied/inherited anymore
            new_collection._search_service = None
            return new_collection
        if isinstance(key, int):
            try:
                return self._pdfs[key]
            except IndexError:
                raise IndexError(
                    f"PDF index {key} out of range for collection of size {len(self._pdfs)}."
                ) from None
        raise TypeError(f"PDF indices must be integers or slices, not {type(key)}.")

    def __iter__(self):
        """Iterate over the PDF objects in order."""
        return iter(self._pdfs)

    def __repr__(self) -> str:
        return f"<PDFCollection(count={len(self)})>"

    @property
    def pdfs(self) -> List['PDF']:
        """Returns the list of PDF objects held by the collection."""
        return self._pdfs

    # --- Other Methods (e.g., apply_ocr_to_pages - could leverage service in future?) ---
    def apply_ocr_to_pages(self, *args, **kwargs):
        """
        Call apply_ocr_to_pages(*args, **kwargs) on every PDF in the collection.

        A failure on one PDF is logged (with traceback) and does not stop
        processing of the remaining PDFs.

        Returns:
            Self for method chaining.
        """
        # Delegate to individual PDF objects
        logger.info("Applying OCR to relevant PDFs in collection...")
        for pdf in self._pdfs:
            # We need to figure out which pages belong to which PDF if batching here
            # For now, simpler to call on each PDF
            try:
                # Assume apply_ocr_to_pages exists on PDF and accepts similar args
                pdf.apply_ocr_to_pages(*args, **kwargs)
            except Exception as e:
                logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
        return self

    # --- Advanced Method Placeholders ---

    def categorize(self, categories: List[str], **kwargs):
        """
        Categorizes PDFs in the collection based on content or features.

        Not implemented yet; always raises NotImplementedError.
        """
        # Implementation requires integrating with classification models or logic
        raise NotImplementedError("categorize requires classification implementation.")

    # --- Mixin Required Implementation ---
    def get_indexable_items(self) -> Iterable[Indexable]:
        """
        Yields Page objects from the collection, conforming to Indexable.

        PDFs whose `pages` is empty are skipped with a warning.
        """
        for pdf in self._pdfs:
            if not pdf.pages:  # Handle case where a PDF might have 0 pages after loading
                logger.warning(f"PDF '{pdf.path}' has no pages. Skipping.")
                continue
            # Assuming Page object conforms to Indexable
            for page in pdf.pages:
                yield page
|
natural_pdf/core/page.py
CHANGED
@@ -7,6 +7,8 @@ from PIL import Image
|
|
7
7
|
import base64
|
8
8
|
import io
|
9
9
|
import json
|
10
|
+
import re
|
11
|
+
import hashlib
|
10
12
|
|
11
13
|
from natural_pdf.elements.collections import ElementCollection
|
12
14
|
from natural_pdf.elements.region import Region
|
@@ -96,6 +98,11 @@ class Page:
|
|
96
98
|
"""Get page number (1-based)."""
|
97
99
|
return self._page.page_number
|
98
100
|
|
101
|
+
@property
def page_number(self) -> int:
    """1-based page number, read from the wrapped `_page` object."""
    wrapped_page = self._page
    return wrapped_page.page_number
|
105
|
+
|
99
106
|
@property
|
100
107
|
def index(self) -> int:
|
101
108
|
"""Get page index (0-based)."""
|
@@ -127,7 +134,7 @@ class Page:
|
|
127
134
|
self._exclusions = []
|
128
135
|
return self
|
129
136
|
|
130
|
-
def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any]) -> 'Page':
|
137
|
+
def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any], label: Optional[str] = None) -> 'Page':
|
131
138
|
"""
|
132
139
|
Add an exclusion to the page. Text from these regions will be excluded from extraction.
|
133
140
|
Ensures non-callable items are stored as Region objects if possible.
|
@@ -135,6 +142,7 @@ class Page:
|
|
135
142
|
Args:
|
136
143
|
exclusion_func_or_region: Either a callable function returning a Region,
|
137
144
|
a Region object, or another object with a valid .bbox attribute.
|
145
|
+
label: Optional label for this exclusion (e.g., 'header', 'footer').
|
138
146
|
|
139
147
|
Returns:
|
140
148
|
Self for method chaining
|
@@ -142,28 +150,36 @@ class Page:
|
|
142
150
|
Raises:
|
143
151
|
TypeError: If a non-callable, non-Region object without a valid bbox is provided.
|
144
152
|
"""
|
153
|
+
exclusion_data = None # Initialize exclusion data
|
154
|
+
|
145
155
|
if callable(exclusion_func_or_region):
|
146
|
-
# Store callable functions
|
147
|
-
|
148
|
-
logger.debug(f"Page {self.index}: Added callable exclusion: {exclusion_func_or_region}")
|
156
|
+
# Store callable functions along with their label
|
157
|
+
exclusion_data = (exclusion_func_or_region, label)
|
158
|
+
logger.debug(f"Page {self.index}: Added callable exclusion '{label}': {exclusion_func_or_region}")
|
149
159
|
elif isinstance(exclusion_func_or_region, Region):
|
150
|
-
# Store Region objects directly
|
151
|
-
|
152
|
-
|
160
|
+
# Store Region objects directly, assigning the label
|
161
|
+
exclusion_func_or_region.label = label # Assign label
|
162
|
+
exclusion_data = (exclusion_func_or_region, label) # Store as tuple for consistency
|
163
|
+
logger.debug(f"Page {self.index}: Added Region exclusion '{label}': {exclusion_func_or_region}")
|
153
164
|
elif hasattr(exclusion_func_or_region, 'bbox') and isinstance(getattr(exclusion_func_or_region, 'bbox', None), (tuple, list)) and len(exclusion_func_or_region.bbox) == 4:
|
154
165
|
# Convert objects with a valid bbox to a Region before storing
|
155
166
|
try:
|
156
167
|
bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
|
157
|
-
|
158
|
-
self
|
159
|
-
|
168
|
+
# Pass the label to the Region constructor
|
169
|
+
region_to_add = Region(self, bbox_coords, label=label)
|
170
|
+
exclusion_data = (region_to_add, label) # Store as tuple
|
171
|
+
logger.debug(f"Page {self.index}: Added exclusion '{label}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
|
160
172
|
except (ValueError, TypeError, Exception) as e:
|
161
173
|
# Raise an error if conversion fails
|
162
174
|
raise TypeError(f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}") from e
|
163
175
|
else:
|
164
176
|
# Reject invalid types
|
165
177
|
raise TypeError(f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute.")
|
166
|
-
|
178
|
+
|
179
|
+
# Append the stored data (tuple of object/callable and label)
|
180
|
+
if exclusion_data:
|
181
|
+
self._exclusions.append(exclusion_data)
|
182
|
+
|
167
183
|
return self
|
168
184
|
|
169
185
|
def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
|
@@ -222,75 +238,66 @@ class Page:
|
|
222
238
|
def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
|
223
239
|
"""
|
224
240
|
Get all exclusion regions for this page.
|
225
|
-
Assumes self._exclusions contains
|
241
|
+
Assumes self._exclusions contains tuples of (callable/Region, label).
|
226
242
|
|
227
243
|
Args:
|
228
244
|
include_callable: Whether to evaluate callable exclusion functions
|
229
245
|
debug: Enable verbose debug logging for exclusion evaluation
|
230
246
|
|
231
247
|
Returns:
|
232
|
-
List of Region objects to exclude
|
248
|
+
List of Region objects to exclude, with labels assigned.
|
233
249
|
"""
|
234
250
|
regions = []
|
235
251
|
|
236
|
-
# Track exclusion results for debugging
|
237
252
|
if debug:
|
238
253
|
print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
|
239
|
-
|
240
|
-
for i,
|
241
|
-
#
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
# Check if it's a tuple from PDF.add_exclusion (should still be handled if PDF adds labels)
|
246
|
-
if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
|
247
|
-
exclusion_func, label = exclusion
|
248
|
-
if label:
|
249
|
-
exclusion_label = label
|
250
|
-
exclusion = exclusion_func # Use the function part
|
251
|
-
|
254
|
+
|
255
|
+
for i, exclusion_data in enumerate(self._exclusions):
|
256
|
+
# Unpack the exclusion object/callable and its label
|
257
|
+
exclusion_item, label = exclusion_data
|
258
|
+
exclusion_label = label if label else f"exclusion {i}"
|
259
|
+
|
252
260
|
# Process callable exclusion functions
|
253
|
-
if callable(
|
254
|
-
# It's a function, call it with this page
|
261
|
+
if callable(exclusion_item) and include_callable:
|
255
262
|
try:
|
256
263
|
if debug:
|
257
|
-
print(f" - Evaluating callable {exclusion_label}...")
|
258
|
-
|
259
|
-
# Temporarily clear exclusions
|
260
|
-
# This might be overly cautious depending on use case, but safer.
|
264
|
+
print(f" - Evaluating callable '{exclusion_label}'...")
|
265
|
+
|
266
|
+
# Temporarily clear exclusions (consider if really needed)
|
261
267
|
temp_original_exclusions = self._exclusions
|
262
|
-
self._exclusions = []
|
263
|
-
|
268
|
+
self._exclusions = []
|
269
|
+
|
264
270
|
# Call the function - Expects it to return a Region or None
|
265
|
-
region_result =
|
266
|
-
|
271
|
+
region_result = exclusion_item(self)
|
272
|
+
|
267
273
|
# Restore exclusions
|
268
274
|
self._exclusions = temp_original_exclusions
|
269
|
-
|
275
|
+
|
270
276
|
if isinstance(region_result, Region):
|
277
|
+
# Assign the label to the returned region
|
278
|
+
region_result.label = label
|
271
279
|
regions.append(region_result)
|
272
280
|
if debug:
|
273
|
-
print(f" ✓ Added region from callable: {region_result}")
|
281
|
+
print(f" ✓ Added region from callable '{label}': {region_result}")
|
274
282
|
elif region_result:
|
275
|
-
|
276
|
-
logger.warning(f"Callable exclusion {exclusion_label} returned non-Region object: {type(region_result)}. Skipping.")
|
283
|
+
logger.warning(f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping.")
|
277
284
|
if debug:
|
278
285
|
print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
|
279
286
|
else:
|
280
287
|
if debug:
|
281
|
-
print(f" ✗ Callable returned None, no region added")
|
282
|
-
|
288
|
+
print(f" ✗ Callable '{exclusion_label}' returned None, no region added")
|
289
|
+
|
283
290
|
except Exception as e:
|
284
|
-
error_msg = f"Error evaluating callable exclusion {exclusion_label} for page {self.index}: {e}"
|
291
|
+
error_msg = f"Error evaluating callable exclusion '{exclusion_label}' for page {self.index}: {e}"
|
285
292
|
print(error_msg)
|
286
293
|
import traceback
|
287
294
|
print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
|
288
|
-
|
289
|
-
# Process direct Region objects (
|
290
|
-
elif isinstance(
|
291
|
-
regions.append(
|
295
|
+
|
296
|
+
# Process direct Region objects (label was assigned in add_exclusion)
|
297
|
+
elif isinstance(exclusion_item, Region):
|
298
|
+
regions.append(exclusion_item) # Label is already on the Region object
|
292
299
|
if debug:
|
293
|
-
print(f" - Added direct region: {
|
300
|
+
print(f" - Added direct region '{label}': {exclusion_item}")
|
294
301
|
# No else needed, add_exclusion should prevent invalid types
|
295
302
|
|
296
303
|
if debug:
|
@@ -1485,25 +1492,46 @@ class Page:
|
|
1485
1492
|
RuntimeError: If required dependencies (ipywidgets) are missing.
|
1486
1493
|
ValueError: If image rendering or data preparation fails within from_page.
|
1487
1494
|
"""
|
1488
|
-
#
|
1489
|
-
|
1495
|
+
# Dynamically import here if needed, or ensure it's globally available
|
1496
|
+
try:
|
1497
|
+
from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
|
1498
|
+
except ImportError:
|
1499
|
+
logger.error("Interactive viewer requires optional dependencies. Install with `pip install natural-pdf[widgets]`")
|
1500
|
+
raise
|
1501
|
+
|
1502
|
+
# Pass self (the Page object) to the factory method
|
1503
|
+
return SimpleInteractiveViewerWidget.from_page(self)
|
1504
|
+
|
1505
|
+
# --- Indexable Protocol Methods ---
|
1506
|
+
def get_id(self) -> str:
    """
    Build the identifier string for this page (Indexable protocol).

    The PDF path is sanitised by replacing every character outside
    [a-zA-Z0-9_-] with an underscore, then combined with the page number.
    """
    raw_path = str(self.pdf.path)
    sanitized = re.sub(r'[^a-zA-Z0-9_-]', '_', raw_path)
    return "pdf_" + sanitized + "_page_" + f"{self.page_number}"
|
1511
|
+
|
1512
|
+
def get_metadata(self) -> Dict[str, Any]:
    """
    Collect the metadata dict for this page (Indexable protocol).

    Keys: "pdf_path", "page_number", "width", "height" and "content_hash"
    (the value of get_content_hash(), included for sync).
    """
    info: Dict[str, Any] = {}
    info["pdf_path"] = str(self.pdf.path)
    info["page_number"] = self.page_number
    info["width"] = self.width
    info["height"] = self.height
    info["content_hash"] = self.get_content_hash()  # Include the hash
    return info
|
1490
1523
|
|
1491
|
-
|
1524
|
+
def get_content(self) -> 'Page':
    """
    Return the object to be indexed (Indexable protocol): the page itself.

    How the page is processed (e.g., by calling extract_text) is left to the
    SearchService implementation.
    """
    content = self  # The Page object is its own content
    return content
|
1492
1530
|
|
1493
|
-
|
1494
|
-
|
1495
|
-
|
1496
|
-
|
1497
|
-
|
1498
|
-
|
1499
|
-
|
1500
|
-
|
1501
|
-
logger.info("Interactive viewer widget created successfully.")
|
1502
|
-
return viewer_widget
|
1503
|
-
except ImportError as e:
|
1504
|
-
logger.error("Failed to import SimpleInteractiveViewerWidget. Ensure natural_pdf.widgets and ipywidgets are installed.")
|
1505
|
-
raise RuntimeError("Widget class not found. ipywidgets or natural_pdf.widgets might be missing or setup incorrect.") from e
|
1506
|
-
except Exception as e:
|
1507
|
-
logger.error(f"Failed to create interactive viewer: {e}", exc_info=True)
|
1508
|
-
# Re-raise the exception to make it visible to the user
|
1509
|
-
raise RuntimeError(f"Failed to create interactive viewer: {e}") from e
|
1531
|
+
def get_content_hash(self) -> str:
    """
    Return the SHA256 hex digest of the page's extracted text (used for sync).

    The text is extracted with use_exclusions=False and
    preserve_whitespace=False, then UTF-8 encoded before hashing.
    """
    # TODO: Optimization - extract_text may be slow if this is called repeatedly; consider caching.
    raw_text = self.extract_text(use_exclusions=False, preserve_whitespace=False)
    hasher = hashlib.sha256()
    hasher.update(raw_text.encode('utf-8'))
    return hasher.hexdigest()
|