natural-pdf 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,7 @@ import logging
  import importlib.util
  import os
  import tempfile
+ import copy
  from typing import List, Dict, Any, Optional, Tuple
  from PIL import Image
 
@@ -11,20 +12,23 @@ from .layout_options import SuryaLayoutOptions, BaseLayoutOptions
 
  logger = logging.getLogger(__name__)
 
- # Check for dependency
+ # Check for dependencies
  surya_spec = importlib.util.find_spec("surya")
  LayoutPredictor = None
+ TableRecPredictor = None
+
  if surya_spec:
      try:
          from surya.layout import LayoutPredictor
+         from surya.table_rec import TableRecPredictor
      except ImportError as e:
-         logger.warning(f"Could not import Surya dependencies: {e}")
+         logger.warning(f"Could not import Surya dependencies (layout and/or table_rec): {e}")
  else:
      logger.warning("surya not found. SuryaLayoutDetector will not be available.")
 
 
  class SuryaLayoutDetector(LayoutDetector):
-     """Document layout detector using Surya models."""
+     """Document layout and table structure detector using Surya models."""
 
      def __init__(self):
          super().__init__()
@@ -32,120 +36,224 @@ class SuryaLayoutDetector(LayoutDetector):
              'text', 'pageheader', 'pagefooter', 'sectionheader',
              'table', 'tableofcontents', 'picture', 'caption',
              'heading', 'title', 'list', 'listitem', 'code',
-             'textinlinemath', 'mathformula', 'form'
+             'textinlinemath', 'mathformula', 'form',
+             'table-row', 'table-column'
          }
-         # Predictor instance is cached via _get_model
+         self._page_ref = None # To store page reference from options
 
      def is_available(self) -> bool:
-         """Check if surya is installed."""
-         return LayoutPredictor is not None
+         return LayoutPredictor is not None and TableRecPredictor is not None
 
      def _get_cache_key(self, options: BaseLayoutOptions) -> str:
-         """Generate cache key based on model name and device."""
          if not isinstance(options, SuryaLayoutOptions):
-             options = SuryaLayoutOptions(device=options.device) # Use base device
-
+             options = SuryaLayoutOptions(device=options.device)
          device_key = str(options.device).lower() if options.device else 'default_device'
-         # Include model_name if it affects loading, otherwise device might be enough
         model_key = options.model_name
          return f"{self.__class__.__name__}_{device_key}_{model_key}"
 
-     def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
-         """Load the Surya LayoutPredictor model."""
+     def _load_model_from_options(self, options: BaseLayoutOptions) -> Dict[str, Any]:
          if not self.is_available():
-             raise RuntimeError("Surya dependency (surya-ocr) not installed.")
-
+             raise RuntimeError("Surya dependencies (surya.layout and surya.table_rec) not installed.")
          if not isinstance(options, SuryaLayoutOptions):
              raise TypeError("Incorrect options type provided for Surya model loading.")
-
-         self.logger.info(f"Loading Surya LayoutPredictor (device={options.device})...")
-         try:
-             # Pass device and potentially other init args from options.extra_args
-             predictor_args = {'device': options.device} if options.device else {}
-             predictor_args.update(options.extra_args) # Add any extra init args
-
-             predictor = LayoutPredictor(**predictor_args)
-             self.logger.info("Surya LayoutPredictor loaded.")
-             return predictor
+         self.logger.info(f"Loading Surya models (device={options.device})...")
+         models = {}
+         try:
+             models['layout'] = LayoutPredictor()
+             models['table_rec'] = TableRecPredictor()
+             self.logger.info("Surya LayoutPredictor and TableRecPredictor loaded.")
+             return models
          except Exception as e:
-             self.logger.error(f"Failed to load Surya LayoutPredictor: {e}", exc_info=True)
+             self.logger.error(f"Failed to load Surya models: {e}", exc_info=True)
              raise
+
+     def _expand_bbox(self, bbox: Tuple[float, float, float, float],
+                      padding: int, max_width: int, max_height: int) -> Tuple[int, int, int, int]:
+         """Expand bbox by padding, clamping to max dimensions."""
+         x0, y0, x1, y1 = bbox
+         x0 = max(0, int(x0 - padding))
+         y0 = max(0, int(y0 - padding))
+         x1 = min(max_width, int(x1 + padding))
+         y1 = min(max_height, int(y1 + padding))
+         return x0, y0, x1, y1
 
      def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
-         """Detect layout elements in an image using Surya."""
+         """Detect layout elements and optionally table structure in an image using Surya."""
          if not self.is_available():
-             raise RuntimeError("Surya dependency (surya-ocr) not installed.")
+             raise RuntimeError("Surya dependencies (layout and table_rec) not installed.")
 
          if not isinstance(options, SuryaLayoutOptions):
              self.logger.warning("Received BaseLayoutOptions, expected SuryaLayoutOptions. Using defaults.")
              options = SuryaLayoutOptions(
                  confidence=options.confidence, classes=options.classes,
                  exclude_classes=options.exclude_classes, device=options.device,
-                 extra_args=options.extra_args
+                 extra_args=options.extra_args,
+                 recognize_table_structure=True
              )
+
+         # Extract page reference and scaling factors from extra_args (passed by LayoutAnalyzer)
+         self._page_ref = options.extra_args.get('_page_ref')
+         img_scale_x = options.extra_args.get('_img_scale_x')
+         img_scale_y = options.extra_args.get('_img_scale_y')
+
+         # We still need this check, otherwise later steps that need these vars will fail
+         can_do_table_rec = options.recognize_table_structure and self._page_ref and img_scale_x is not None and img_scale_y is not None
+         if options.recognize_table_structure and not can_do_table_rec:
+             logger.warning("Surya table recognition cannot proceed without page reference and scaling factors. Disabling.")
+             options.recognize_table_structure = False
 
88
- if options.exclude_classes:
89
- self.validate_classes(options.exclude_classes)
90
-
91
- # Get the cached/loaded predictor instance
92
- layout_predictor = self._get_model(options)
93
-
94
- # Surya predictor takes a list of images
95
- input_image_list = [image.convert("RGB")] # Ensure RGB
96
-
97
- detections = []
98
- try:
99
- self.logger.debug("Running Surya layout prediction...")
100
- # Call the predictor (returns a list of LayoutResult objects)
101
- layout_predictions = layout_predictor(input_image_list)
102
- self.logger.debug(f"Surya prediction returned {len(layout_predictions)} results.")
103
-
104
- if not layout_predictions:
105
- self.logger.warning("Surya returned empty predictions list.")
106
- return []
107
-
108
- # Process results for the first (and only) image
109
- prediction = layout_predictions[0] # LayoutResult object
110
-
111
- # Prepare normalized class filters once
112
- normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
113
- normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
114
-
115
- for layout_box in prediction.bboxes:
116
- # Extract the class name and normalize it
117
- class_name_orig = layout_box.label
118
- normalized_class = self._normalize_class_name(class_name_orig)
119
- score = float(layout_box.confidence)
120
-
121
- # Apply confidence threshold
122
- if score < options.confidence: continue
123
-
124
- # Apply class filtering
125
- if normalized_classes_req and normalized_class not in normalized_classes_req: continue
126
- if normalized_class in normalized_classes_excl: continue
127
-
128
- # Extract bbox coordinates (Surya provides [x_min, y_min, x_max, y_max])
129
- x_min, y_min, x_max, y_max = map(float, layout_box.bbox)
130
-
131
- # Add detection
132
- detection_data = {
133
- 'bbox': (x_min, y_min, x_max, y_max),
134
- 'class': class_name_orig,
135
- 'confidence': score,
136
- 'normalized_class': normalized_class,
137
- 'source': 'layout',
138
- 'model': 'surya'
139
- # Add polygon etc. if needed, check attributes on layout_box
140
- # 'polygon': layout_box.polygon if hasattr(layout_box, 'polygon') else None,
141
- }
142
- detections.append(detection_data)
143
-
144
- self.logger.info(f"Surya detected {len(detections)} layout elements matching criteria.")
105
+ # Validate classes
106
+ if options.classes: self.validate_classes(options.classes)
107
+ if options.exclude_classes: self.validate_classes(options.exclude_classes)
145
108
 
146
-         except Exception as e:
-             self.logger.error(f"Error during Surya layout detection: {e}", exc_info=True)
-             raise
+         models = self._get_model(options)
+         layout_predictor = models['layout']
+         table_rec_predictor = models['table_rec']
+
+         input_image = image.convert("RGB")
+         input_image_list = [input_image]
+
+         initial_layout_detections = [] # Detections relative to input_image
+         tables_to_process = []
+
+         # --- Initial Layout Detection ---
+         self.logger.debug("Running Surya layout prediction...")
+         layout_predictions = layout_predictor(input_image_list)
+         self.logger.debug(f"Surya prediction returned {len(layout_predictions)} results.")
+         if not layout_predictions: return []
+         prediction = layout_predictions[0]
+
+         normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
+         normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
+
+         for layout_box in prediction.bboxes:
+             class_name_orig = layout_box.label
+             normalized_class = self._normalize_class_name(class_name_orig)
+             score = float(layout_box.confidence)
+
+             if score < options.confidence: continue
+             if normalized_classes_req and normalized_class not in normalized_classes_req: continue
+             if normalized_class in normalized_classes_excl: continue
+
+             x_min, y_min, x_max, y_max = map(float, layout_box.bbox)
+             detection_data = {
+                 'bbox': (x_min, y_min, x_max, y_max),
+                 'class': class_name_orig,
+                 'confidence': score,
+                 'normalized_class': normalized_class,
+                 'source': 'layout',
+                 'model': 'surya'
+             }
+             initial_layout_detections.append(detection_data)
+
+             if options.recognize_table_structure and normalized_class in ('table', 'tableofcontents'):
+                 tables_to_process.append(detection_data)
+
+         self.logger.info(f"Surya initially detected {len(initial_layout_detections)} layout elements matching criteria.")
+
155
+ if not options.recognize_table_structure or not tables_to_process:
156
+ self.logger.debug("Skipping Surya table structure recognition (disabled or no tables found).")
157
+ return initial_layout_detections
158
+
159
+ self.logger.info(f"Attempting Surya table structure recognition for {len(tables_to_process)} tables...")
160
+ high_res_crops = []
161
+ pdf_offsets = [] # Store (pdf_x0, pdf_y0) for each crop
162
+
163
+ high_res_dpi = getattr(self._page_ref._parent, '_config', {}).get('surya_table_rec_dpi', 192)
164
+ bbox_padding = getattr(self._page_ref._parent, '_config', {}).get('surya_table_bbox_padding', 10)
165
+ pdf_to_highres_scale = high_res_dpi / 72.0
166
+
167
+ # Render high-res page ONCE
168
+ self.logger.debug(f"Rendering page {self._page_ref.number} at {high_res_dpi} DPI for table recognition...")
169
+ high_res_page_image = self._page_ref.to_image(resolution=high_res_dpi, include_highlights=False)
170
+ if not high_res_page_image:
171
+ raise RuntimeError(f"Failed to render page {self._page_ref.number} at high resolution.")
172
+ self.logger.debug(f" High-res image size: {high_res_page_image.width}x{high_res_page_image.height}")
173
+
174
+ for i, table_detection in enumerate(tables_to_process):
175
+ img_x0, img_y0, img_x1, img_y1 = table_detection['bbox']
176
+
177
+ # PDF coords
178
+ pdf_x0 = img_x0 * img_scale_x
179
+ pdf_y0 = img_y0 * img_scale_y
180
+ pdf_x1 = img_x1 * img_scale_x
181
+ pdf_y1 = img_y1 * img_scale_y
182
+ pdf_x0 = max(0, pdf_x0)
183
+ pdf_y0 = max(0, pdf_y0)
184
+ pdf_x1 = min(self._page_ref.width, pdf_x1)
185
+ pdf_y1 = min(self._page_ref.height, pdf_y1)
186
+
187
+ # High-res image coords
188
+ hr_x0 = pdf_x0 * pdf_to_highres_scale
189
+ hr_y0 = pdf_y0 * pdf_to_highres_scale
190
+ hr_x1 = pdf_x1 * pdf_to_highres_scale
191
+ hr_y1 = pdf_y1 * pdf_to_highres_scale
192
+
193
+ # Expand high-res bbox
194
+ hr_x0_exp, hr_y0_exp, hr_x1_exp, hr_y1_exp = self._expand_bbox(
195
+ (hr_x0, hr_y0, hr_x1, hr_y1),
196
+ padding=bbox_padding,
197
+ max_width=high_res_page_image.width,
198
+ max_height=high_res_page_image.height
199
+ )
200
+
201
+ crop = high_res_page_image.crop((hr_x0_exp, hr_y0_exp, hr_x1_exp, hr_y1_exp))
202
+ high_res_crops.append(crop)
203
+ pdf_offsets.append((pdf_x0, pdf_y0))
204
+
205
+ if not high_res_crops:
206
+ self.logger.info("No valid high-resolution table crops generated.")
207
+ return initial_layout_detections
208
+
209
+ structure_detections = [] # Detections relative to std_res input_image
210
+
211
+ # --- Run Table Recognition (will raise error on failure) ---
212
+ self.logger.debug(f"Running Surya table recognition on {len(high_res_crops)} high-res images...")
213
+ table_predictions = table_rec_predictor(high_res_crops)
214
+ self.logger.debug(f"Surya table recognition returned {len(table_predictions)} results.")
215
+
216
+ # --- Process Results ---
217
+ if len(table_predictions) != len(pdf_offsets):
218
+ # This case is less likely if predictor didn't error, but good sanity check
219
+ raise RuntimeError(f"Mismatch between table inputs ({len(pdf_offsets)}) and predictions ({len(table_predictions)}).")
220
+
221
+ for table_pred, (offset_pdf_x0, offset_pdf_y0) in zip(table_predictions, pdf_offsets):
222
+ # Process Rows
223
+ for row_box in table_pred.rows:
224
+ crop_rx0, crop_ry0, crop_rx1, crop_ry1 = map(float, row_box.bbox)
225
+ pdf_row_x0 = offset_pdf_x0 + crop_rx0 / pdf_to_highres_scale
226
+ pdf_row_y0 = offset_pdf_y0 + crop_ry0 / pdf_to_highres_scale
227
+ pdf_row_x1 = offset_pdf_x0 + crop_rx1 / pdf_to_highres_scale
228
+ pdf_row_y1 = offset_pdf_y0 + crop_ry1 / pdf_to_highres_scale
229
+ img_row_x0 = pdf_row_x0 / img_scale_x
230
+ img_row_y0 = pdf_row_y0 / img_scale_y
231
+ img_row_x1 = pdf_row_x1 / img_scale_x
232
+ img_row_y1 = pdf_row_y1 / img_scale_y
233
+ structure_detections.append({
234
+ 'bbox': (img_row_x0, img_row_y0, img_row_x1, img_row_y1),
235
+ 'class': 'table-row', 'confidence': 1.0, 'normalized_class': 'table-row',
236
+ 'source': 'layout', 'model': 'surya'
237
+ })
238
+
239
+ # Process Columns
240
+ for col_box in table_pred.cols:
241
+ crop_cx0, crop_cy0, crop_cx1, crop_cy1 = map(float, col_box.bbox)
242
+ pdf_col_x0 = offset_pdf_x0 + crop_cx0 / pdf_to_highres_scale
243
+ pdf_col_y0 = offset_pdf_y0 + crop_cy0 / pdf_to_highres_scale
244
+ pdf_col_x1 = offset_pdf_x0 + crop_cx1 / pdf_to_highres_scale
245
+ pdf_col_y1 = offset_pdf_y0 + crop_cy1 / pdf_to_highres_scale
246
+ img_col_x0 = pdf_col_x0 / img_scale_x
247
+ img_col_y0 = pdf_col_y0 / img_scale_y
248
+ img_col_x1 = pdf_col_x1 / img_scale_x
249
+ img_col_y1 = pdf_col_y1 / img_scale_y
250
+ structure_detections.append({
251
+ 'bbox': (img_col_x0, img_col_y0, img_col_x1, img_col_y1),
252
+ 'class': 'table-column', 'confidence': 1.0, 'normalized_class': 'table-column',
253
+ 'source': 'layout', 'model': 'surya'
254
+ })
255
+
256
+ self.logger.info(f"Added {len(structure_detections)} table structure elements.")
149
257
 
150
- return detections
258
+ return initial_layout_detections + structure_detections
151
259
 
@@ -0,0 +1,259 @@
+ import os
+ import glob as py_glob
+ import logging
+ from typing import List, Optional, Dict, Any, Union, Iterable, Set, TYPE_CHECKING, Type
+ from pathlib import Path
+ from PIL import Image
+ import re # Added for safe path generation
+ import copy # Added for copying options
+ from tqdm import tqdm
+
+ # Set up logger early
+ logger = logging.getLogger(__name__)
+
+ from natural_pdf.core.pdf import PDF
+ from natural_pdf.elements.region import Region
+
+ # --- Search Imports ---
+ try:
+     from natural_pdf.search.search_service_protocol import (
+         SearchServiceProtocol, SearchOptions, Indexable
+     )
+     from natural_pdf.search.searchable_mixin import SearchableMixin
+ except ImportError as e:
+     logger_init = logging.getLogger(__name__)
+     logger_init.error(f"Failed to import search components. Search functionality disabled. Error: {e}", exc_info=True)
+     # Dummy definitions
+     class SearchableMixin: pass
+     SearchServiceProtocol, SearchOptions, Indexable = object, object, object
+
+ from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
+
+ class PDFCollection(SearchableMixin): # Inherit from the mixin
+     def __init__(self,
+                  source: Union[str, Iterable[Union[str, 'PDF']]],
+                  recursive: bool = True,
+                  **pdf_options: Any):
+         """
+         Initializes a collection of PDF documents from various sources.
+
+         Args:
+             source: The source of PDF documents. Can be:
+                 - An iterable (e.g., list) of existing PDF objects.
+                 - An iterable (e.g., list) of file paths/URLs/globs (strings).
+                 - A single file path/URL/directory/glob string.
+             recursive: If source involves directories or glob patterns,
+                 whether to search recursively (default: True).
+             **pdf_options: Keyword arguments passed to the PDF constructor.
+         """
+         self._pdfs: List['PDF'] = []
+         self._pdf_options = pdf_options # Store options for potential slicing later
+         self._recursive = recursive # Store setting for potential slicing
+
+         # Dynamically import PDF class within methods to avoid circular import at module load time
+         PDF = self._get_pdf_class()
+
+         if hasattr(source, '__iter__') and not isinstance(source, str):
+             source_list = list(source)
+             if not source_list: return # Empty list source
+             if isinstance(source_list[0], PDF):
+                 if all(isinstance(item, PDF) for item in source_list):
+                     self._pdfs = source_list # Direct assignment
+                     # Don't adopt search context anymore
+                     return
+                 else: raise TypeError("Iterable source has mixed PDF/non-PDF objects.")
+             # If it's an iterable but not PDFs, fall through to resolve sources
+
+         # Resolve string, iterable of strings, or single string source to paths/URLs
+         resolved_paths_or_urls = self._resolve_sources_to_paths(source)
+         self._initialize_pdfs(resolved_paths_or_urls, PDF) # Pass PDF class
+
+         self._iter_index = 0
+
+         # Initialize internal search service reference
+         self._search_service: Optional[SearchServiceProtocol] = None
+
+     @staticmethod
+     def _get_pdf_class():
+         """Helper method to dynamically import the PDF class."""
+         try:
+             # Import needs to resolve path correctly
+             from natural_pdf.core.pdf import PDF
+             return PDF
+         except ImportError as e:
+             logger.error("Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime.")
+             raise ImportError("PDF class is required but could not be imported.") from e
+
+     # --- Internal Helpers ---
+
+     def _is_url(self, s: str) -> bool: return s.startswith(('http://', 'https://'))
+     def _has_glob_magic(self, s: str) -> bool: return py_glob.has_magic(s)
+
+     def _execute_glob(self, pattern: str) -> Set[str]:
+         """Glob for paths and return a set of valid PDF paths."""
+         found_paths = set()
+         try:
+             # Use iglob for potentially large directories/matches
+             paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
+             for path_str in paths_iter:
+                 # Use Path object for easier checking
+                 p = Path(path_str)
+                 if p.is_file() and p.suffix.lower() == ".pdf":
+                     found_paths.add(str(p.resolve())) # Store resolved absolute path
+         except Exception as e:
+             logger.error(f"Error processing glob pattern '{pattern}': {e}")
+         return found_paths
+
+     def _resolve_sources_to_paths(self, source: Union[str, Iterable[str]]) -> List[str]:
+         """Resolves various source types into a list of unique PDF paths/URLs."""
+         final_paths = set()
+         sources_to_process = []
+
+         if isinstance(source, str):
+             sources_to_process.append(source)
+         elif hasattr(source, '__iter__'):
+             sources_to_process.extend(list(source))
+         else: # Should not happen based on __init__ checks, but safeguard
+             raise TypeError(f"Unexpected source type in _resolve_sources_to_paths: {type(source)}")
+
+         for item in sources_to_process:
+             if not isinstance(item, str):
+                 logger.warning(f"Skipping non-string item in source list: {type(item)}")
+                 continue
+
+             item_path = Path(item)
+
+             if self._is_url(item):
+                 final_paths.add(item) # Add URL directly
+             elif self._has_glob_magic(item):
+                 glob_results = self._execute_glob(item)
+                 final_paths.update(glob_results)
+             elif item_path.is_dir():
+                 # Use glob to find PDFs in directory, respecting recursive flag
+                 dir_pattern = str(item_path / "**" / "*.pdf") if self._recursive else str(item_path / "*.pdf")
+                 dir_glob_results = self._execute_glob(dir_pattern)
+                 final_paths.update(dir_glob_results)
+             elif item_path.is_file() and item_path.suffix.lower() == ".pdf":
+                 final_paths.add(str(item_path.resolve())) # Add resolved file path
+             else:
+                 logger.warning(f"Source item ignored (not a valid URL, directory, file, or glob): {item}")
+
+         return sorted(list(final_paths))
+
+     def _initialize_pdfs(self, paths_or_urls: List[str], PDF_cls: Type):
+         """Initializes PDF objects from a list of paths/URLs."""
+         logger.info(f"Initializing {len(paths_or_urls)} PDF objects...")
+         failed_count = 0
+         for path_or_url in tqdm(paths_or_urls, desc="Loading PDFs"):
+             try:
+                 pdf_instance = PDF_cls(path_or_url, **self._pdf_options)
+                 self._pdfs.append(pdf_instance)
+             except Exception as e:
+                 logger.error(f"Failed to load PDF: {path_or_url}. Error: {e}", exc_info=False) # Keep log concise
+                 failed_count += 1
+         logger.info(f"Successfully initialized {len(self._pdfs)} PDFs. Failed: {failed_count}")
+
+     # --- Public Factory Class Methods (Simplified) ---
+
+     @classmethod
+     def from_paths(cls, paths_or_urls: List[str], **pdf_options: Any) -> 'PDFCollection':
+         """Creates a PDFCollection explicitly from a list of file paths or URLs."""
+         # __init__ can handle List[str] directly now
+         return cls(paths_or_urls, **pdf_options)
+
+     @classmethod
+     def from_glob(cls, pattern: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+         """Creates a PDFCollection explicitly from a single glob pattern."""
+         # __init__ can handle single glob string directly
+         return cls(pattern, recursive=recursive, **pdf_options)
+
+     @classmethod
+     def from_globs(cls, patterns: List[str], recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+         """Creates a PDFCollection explicitly from a list of glob patterns."""
+         # __init__ can handle List[str] containing globs directly
+         return cls(patterns, recursive=recursive, **pdf_options)
+
+     @classmethod
+     def from_directory(cls, directory_path: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+         """Creates a PDFCollection explicitly from PDF files within a directory."""
+         # __init__ can handle single directory string directly
+         return cls(directory_path, recursive=recursive, **pdf_options)
+
+     # --- Core Collection Methods ---
+     def __len__(self) -> int:
+         return len(self._pdfs)
+
+     def __getitem__(self, key) -> Union['PDF', 'PDFCollection']:
+         # Use dynamic import here as well
+         PDF = self._get_pdf_class()
+         if isinstance(key, slice):
+             # Create a new collection with the sliced PDFs and original options
+             new_collection = PDFCollection.__new__(PDFCollection) # Create blank instance
+             new_collection._pdfs = self._pdfs[key]
+             new_collection._pdf_options = self._pdf_options
+             new_collection._recursive = self._recursive
+             # Search context is not copied/inherited anymore
+             return new_collection
+         elif isinstance(key, int):
+             # Check bounds
+             if 0 <= key < len(self._pdfs):
+                 return self._pdfs[key]
+             else:
+                 raise IndexError(f"PDF index {key} out of range (0-{len(self._pdfs)-1}).")
+         else:
+             raise TypeError(f"PDF indices must be integers or slices, not {type(key)}.")
+
+     def __iter__(self):
+         return iter(self._pdfs)
+
+     def __repr__(self) -> str:
+         # Removed search status
+         return f"<PDFCollection(count={len(self)})>"
+
+     @property
+     def pdfs(self) -> List['PDF']:
+         """Returns the list of PDF objects held by the collection."""
+         return self._pdfs
+
+     # --- Other Methods (e.g., apply_ocr_to_pages - could leverage service in future?) ---
+     def apply_ocr_to_pages(self, *args, **kwargs):
+         PDF = self._get_pdf_class()
+         # Delegate to individual PDF objects
+         logger.info("Applying OCR to relevant PDFs in collection...")
+         results = []
+         for pdf in self._pdfs:
+             # We need to figure out which pages belong to which PDF if batching here
+             # For now, simpler to call on each PDF
+             try:
+                 # Assume apply_ocr_to_pages exists on PDF and accepts similar args
+                 pdf.apply_ocr_to_pages(*args, **kwargs)
+             except Exception as e:
+                 logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
+         return self
+
+     # --- Advanced Method Placeholders ---
+     # Placeholder for categorize removed as find_relevant is now implemented
+
+     def categorize(self, categories: List[str], **kwargs):
+         """Categorizes PDFs in the collection based on content or features."""
+         # Implementation requires integrating with classification models or logic
+         raise NotImplementedError("categorize requires classification implementation.")
+
+     # --- Mixin Required Implementation ---
+     def get_indexable_items(self) -> Iterable[Indexable]:
+         """Yields Page objects from the collection, conforming to Indexable."""
+         if not self._pdfs:
+             return # Return empty iterator if no PDFs
+
+         for pdf in self._pdfs:
+             if not pdf.pages: # Handle case where a PDF might have 0 pages after loading
+                 logger.warning(f"PDF '{pdf.path}' has no pages. Skipping.")
+                 continue
+             for page in pdf.pages:
+                 # Optional: Add filtering here if needed (e.g., skip empty pages)
+                 # Assuming Page object conforms to Indexable
+                 # We might still want the empty page check here for efficiency
+                 # if not page.extract_text(use_exclusions=False).strip():
+                 #     logger.debug(f"Skipping empty page {page.page_number} from PDF '{pdf.path}'.")
+                 #     continue
+                 yield page
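Taken together, the constructor and factory methods above mean a PDFCollection can be built from a list of PDF objects, file paths or URLs, glob patterns, or a directory. A brief usage sketch; the import path and the "reports/" locations are placeholders, not taken from the diff:

    # Sketch only: adjust the import to wherever PDFCollection is exposed in the
    # installed package; the paths below are illustrative.
    from natural_pdf import PDFCollection

    # Equivalent ways to build the same collection:
    reports = PDFCollection("reports/**/*.pdf", recursive=True)
    # reports = PDFCollection.from_directory("reports/")
    # reports = PDFCollection.from_paths(["a.pdf", "https://example.com/b.pdf"])

    print(len(reports))    # number of PDFs that loaded successfully
    subset = reports[:2]   # slicing returns a new PDFCollection
    for pdf in subset:     # iterating yields the underlying PDF objects
        print(pdf.path)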