natural-pdf 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +33 -1
- natural_pdf/analyzers/layout/layout_analyzer.py +133 -44
- natural_pdf/analyzers/layout/layout_manager.py +9 -6
- natural_pdf/analyzers/layout/layout_options.py +2 -4
- natural_pdf/analyzers/layout/surya.py +199 -91
- natural_pdf/collections/pdf_collection.py +259 -0
- natural_pdf/core/page.py +97 -69
- natural_pdf/core/pdf.py +382 -171
- natural_pdf/elements/region.py +55 -26
- natural_pdf/exporters/__init__.py +1 -0
- natural_pdf/exporters/searchable_pdf.py +252 -0
- natural_pdf/search/__init__.py +94 -0
- natural_pdf/search/haystack_search_service.py +520 -0
- natural_pdf/search/haystack_utils.py +386 -0
- natural_pdf/search/search_options.py +72 -0
- natural_pdf/search/search_service_protocol.py +189 -0
- natural_pdf/search/searchable_mixin.py +464 -0
- natural_pdf-0.1.3.dist-info/METADATA +137 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/RECORD +22 -13
- natural_pdf-0.1.1.dist-info/METADATA +0 -295
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/top_level.txt +0 -0
--- natural_pdf/analyzers/layout/surya.py (0.1.1)
+++ natural_pdf/analyzers/layout/surya.py (0.1.3)
@@ -3,6 +3,7 @@ import logging
 import importlib.util
 import os
 import tempfile
+import copy
 from typing import List, Dict, Any, Optional, Tuple
 from PIL import Image
 
@@ -11,20 +12,23 @@ from .layout_options import SuryaLayoutOptions, BaseLayoutOptions
 
 logger = logging.getLogger(__name__)
 
-# Check for
+# Check for dependencies
 surya_spec = importlib.util.find_spec("surya")
 LayoutPredictor = None
+TableRecPredictor = None
+
 if surya_spec:
     try:
         from surya.layout import LayoutPredictor
+        from surya.table_rec import TableRecPredictor
     except ImportError as e:
-        logger.warning(f"Could not import Surya dependencies: {e}")
+        logger.warning(f"Could not import Surya dependencies (layout and/or table_rec): {e}")
 else:
     logger.warning("surya not found. SuryaLayoutDetector will not be available.")
 
 
 class SuryaLayoutDetector(LayoutDetector):
-    """Document layout detector using Surya models."""
+    """Document layout and table structure detector using Surya models."""
 
     def __init__(self):
         super().__init__()
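Note: the hunk above extends the optional-dependency guard so that availability can later require both `LayoutPredictor` and `TableRecPredictor`. For reference, a minimal generic sketch of this probe-then-import pattern (the package and class names below are hypothetical placeholders, not part of natural-pdf):

import importlib.util
import logging

logger = logging.getLogger(__name__)

# Probe for the distribution first so a missing optional dependency never
# breaks module import; leave None sentinels if anything is unavailable.
OptionalPredictor = None
if importlib.util.find_spec("optional_pkg"):  # hypothetical package name
    try:
        from optional_pkg import OptionalPredictor
    except ImportError as e:
        logger.warning(f"Could not import optional_pkg: {e}")
else:
    logger.warning("optional_pkg not found; the feature will be disabled.")

def is_available() -> bool:
    # Mirrors the detector's check: every required symbol must have imported.
    return OptionalPredictor is not None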
@@ -32,120 +36,224 @@ class SuryaLayoutDetector(LayoutDetector):
             'text', 'pageheader', 'pagefooter', 'sectionheader',
             'table', 'tableofcontents', 'picture', 'caption',
             'heading', 'title', 'list', 'listitem', 'code',
-            'textinlinemath', 'mathformula', 'form'
+            'textinlinemath', 'mathformula', 'form',
+            'table-row', 'table-column'
         }
-        #
+        self._page_ref = None # To store page reference from options
 
     def is_available(self) -> bool:
-
-        return LayoutPredictor is not None
+        return LayoutPredictor is not None and TableRecPredictor is not None
 
     def _get_cache_key(self, options: BaseLayoutOptions) -> str:
-        """Generate cache key based on model name and device."""
         if not isinstance(options, SuryaLayoutOptions):
-            options = SuryaLayoutOptions(device=options.device)
-
+            options = SuryaLayoutOptions(device=options.device)
         device_key = str(options.device).lower() if options.device else 'default_device'
-        # Include model_name if it affects loading, otherwise device might be enough
         model_key = options.model_name
        return f"{self.__class__.__name__}_{device_key}_{model_key}"
 
-    def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
-        """Load the Surya LayoutPredictor model."""
+    def _load_model_from_options(self, options: BaseLayoutOptions) -> Dict[str, Any]:
         if not self.is_available():
-            raise RuntimeError("Surya
-
+            raise RuntimeError("Surya dependencies (surya.layout and surya.table_rec) not installed.")
         if not isinstance(options, SuryaLayoutOptions):
             raise TypeError("Incorrect options type provided for Surya model loading.")
-
-
-        try:
-
-
-
-
-            predictor = LayoutPredictor(**predictor_args)
-            self.logger.info("Surya LayoutPredictor loaded.")
-            return predictor
+        self.logger.info(f"Loading Surya models (device={options.device})...")
+        models = {}
+        try:
+            models['layout'] = LayoutPredictor()
+            models['table_rec'] = TableRecPredictor()
+            self.logger.info("Surya LayoutPredictor and TableRecPredictor loaded.")
+            return models
         except Exception as e:
-            self.logger.error(f"Failed to load Surya
+            self.logger.error(f"Failed to load Surya models: {e}", exc_info=True)
             raise
+
+    def _expand_bbox(self, bbox: Tuple[float, float, float, float],
+                     padding: int, max_width: int, max_height: int) -> Tuple[int, int, int, int]:
+        """Expand bbox by padding, clamping to max dimensions."""
+        x0, y0, x1, y1 = bbox
+        x0 = max(0, int(x0 - padding))
+        y0 = max(0, int(y0 - padding))
+        x1 = min(max_width, int(x1 + padding))
+        y1 = min(max_height, int(y1 + padding))
+        return x0, y0, x1, y1
 
     def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
-        """Detect layout elements in an image using Surya."""
+        """Detect layout elements and optionally table structure in an image using Surya."""
         if not self.is_available():
-            raise RuntimeError("Surya
+            raise RuntimeError("Surya dependencies (layout and table_rec) not installed.")
 
         if not isinstance(options, SuryaLayoutOptions):
             self.logger.warning("Received BaseLayoutOptions, expected SuryaLayoutOptions. Using defaults.")
             options = SuryaLayoutOptions(
                 confidence=options.confidence, classes=options.classes,
                 exclude_classes=options.exclude_classes, device=options.device,
-                extra_args=options.extra_args
+                extra_args=options.extra_args,
+                recognize_table_structure=True
             )
+
+        # Extract page reference and scaling factors from extra_args (passed by LayoutAnalyzer)
+        self._page_ref = options.extra_args.get('_page_ref')
+        img_scale_x = options.extra_args.get('_img_scale_x')
+        img_scale_y = options.extra_args.get('_img_scale_y')
+
+        # We still need this check, otherwise later steps that need these vars will fail
+        can_do_table_rec = options.recognize_table_structure and self._page_ref and img_scale_x is not None and img_scale_y is not None
+        if options.recognize_table_structure and not can_do_table_rec:
+            logger.warning("Surya table recognition cannot proceed without page reference and scaling factors. Disabling.")
+            options.recognize_table_structure = False
 
-
-        if options.
-
-
-        # Get the cached/loaded predictor instance
-        layout_predictor = self._get_model(options)
-
-        # Surya predictor takes a list of images
-        input_image_list = [image.convert("RGB")] # Ensure RGB
-
-        detections = []
-        try:
-            self.logger.debug("Running Surya layout prediction...")
-            # Call the predictor (returns a list of LayoutResult objects)
-            layout_predictions = layout_predictor(input_image_list)
-            self.logger.debug(f"Surya prediction returned {len(layout_predictions)} results.")
-
-            if not layout_predictions:
-                self.logger.warning("Surya returned empty predictions list.")
-                return []
-
-            # Process results for the first (and only) image
-            prediction = layout_predictions[0] # LayoutResult object
-
-            # Prepare normalized class filters once
-            normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
-            normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
-
-            for layout_box in prediction.bboxes:
-                # Extract the class name and normalize it
-                class_name_orig = layout_box.label
-                normalized_class = self._normalize_class_name(class_name_orig)
-                score = float(layout_box.confidence)
-
-                # Apply confidence threshold
-                if score < options.confidence: continue
-
-                # Apply class filtering
-                if normalized_classes_req and normalized_class not in normalized_classes_req: continue
-                if normalized_class in normalized_classes_excl: continue
-
-                # Extract bbox coordinates (Surya provides [x_min, y_min, x_max, y_max])
-                x_min, y_min, x_max, y_max = map(float, layout_box.bbox)
-
-                # Add detection
-                detection_data = {
-                    'bbox': (x_min, y_min, x_max, y_max),
-                    'class': class_name_orig,
-                    'confidence': score,
-                    'normalized_class': normalized_class,
-                    'source': 'layout',
-                    'model': 'surya'
-                    # Add polygon etc. if needed, check attributes on layout_box
-                    # 'polygon': layout_box.polygon if hasattr(layout_box, 'polygon') else None,
-                }
-                detections.append(detection_data)
-
-            self.logger.info(f"Surya detected {len(detections)} layout elements matching criteria.")
+        # Validate classes
+        if options.classes: self.validate_classes(options.classes)
+        if options.exclude_classes: self.validate_classes(options.exclude_classes)
 
-
-
-
+        models = self._get_model(options)
+        layout_predictor = models['layout']
+        table_rec_predictor = models['table_rec']
+
+        input_image = image.convert("RGB")
+        input_image_list = [input_image]
+
+        initial_layout_detections = [] # Detections relative to input_image
+        tables_to_process = []
+
+        # --- Initial Layout Detection ---
+        self.logger.debug("Running Surya layout prediction...")
+        layout_predictions = layout_predictor(input_image_list)
+        self.logger.debug(f"Surya prediction returned {len(layout_predictions)} results.")
+        if not layout_predictions: return []
+        prediction = layout_predictions[0]
+
+        normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
+        normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
+
+        for layout_box in prediction.bboxes:
+            class_name_orig = layout_box.label
+            normalized_class = self._normalize_class_name(class_name_orig)
+            score = float(layout_box.confidence)
+
+            if score < options.confidence: continue
+            if normalized_classes_req and normalized_class not in normalized_classes_req: continue
+            if normalized_class in normalized_classes_excl: continue
+
+            x_min, y_min, x_max, y_max = map(float, layout_box.bbox)
+            detection_data = {
+                'bbox': (x_min, y_min, x_max, y_max),
+                'class': class_name_orig,
+                'confidence': score,
+                'normalized_class': normalized_class,
+                'source': 'layout',
+                'model': 'surya'
+            }
+            initial_layout_detections.append(detection_data)
+
+            if options.recognize_table_structure and normalized_class in ('table', 'tableofcontents'):
+                tables_to_process.append(detection_data)
+
+        self.logger.info(f"Surya initially detected {len(initial_layout_detections)} layout elements matching criteria.")
+
+        # --- Table Structure Recognition (Optional) ---
+        if not options.recognize_table_structure or not tables_to_process:
+            self.logger.debug("Skipping Surya table structure recognition (disabled or no tables found).")
+            return initial_layout_detections
+
+        self.logger.info(f"Attempting Surya table structure recognition for {len(tables_to_process)} tables...")
+        high_res_crops = []
+        pdf_offsets = [] # Store (pdf_x0, pdf_y0) for each crop
+
+        high_res_dpi = getattr(self._page_ref._parent, '_config', {}).get('surya_table_rec_dpi', 192)
+        bbox_padding = getattr(self._page_ref._parent, '_config', {}).get('surya_table_bbox_padding', 10)
+        pdf_to_highres_scale = high_res_dpi / 72.0
+
+        # Render high-res page ONCE
+        self.logger.debug(f"Rendering page {self._page_ref.number} at {high_res_dpi} DPI for table recognition...")
+        high_res_page_image = self._page_ref.to_image(resolution=high_res_dpi, include_highlights=False)
+        if not high_res_page_image:
+            raise RuntimeError(f"Failed to render page {self._page_ref.number} at high resolution.")
+        self.logger.debug(f" High-res image size: {high_res_page_image.width}x{high_res_page_image.height}")
+
+        for i, table_detection in enumerate(tables_to_process):
+            img_x0, img_y0, img_x1, img_y1 = table_detection['bbox']
+
+            # PDF coords
+            pdf_x0 = img_x0 * img_scale_x
+            pdf_y0 = img_y0 * img_scale_y
+            pdf_x1 = img_x1 * img_scale_x
+            pdf_y1 = img_y1 * img_scale_y
+            pdf_x0 = max(0, pdf_x0)
+            pdf_y0 = max(0, pdf_y0)
+            pdf_x1 = min(self._page_ref.width, pdf_x1)
+            pdf_y1 = min(self._page_ref.height, pdf_y1)
+
+            # High-res image coords
+            hr_x0 = pdf_x0 * pdf_to_highres_scale
+            hr_y0 = pdf_y0 * pdf_to_highres_scale
+            hr_x1 = pdf_x1 * pdf_to_highres_scale
+            hr_y1 = pdf_y1 * pdf_to_highres_scale
+
+            # Expand high-res bbox
+            hr_x0_exp, hr_y0_exp, hr_x1_exp, hr_y1_exp = self._expand_bbox(
+                (hr_x0, hr_y0, hr_x1, hr_y1),
+                padding=bbox_padding,
+                max_width=high_res_page_image.width,
+                max_height=high_res_page_image.height
+            )
+
+            crop = high_res_page_image.crop((hr_x0_exp, hr_y0_exp, hr_x1_exp, hr_y1_exp))
+            high_res_crops.append(crop)
+            pdf_offsets.append((pdf_x0, pdf_y0))
+
+        if not high_res_crops:
+            self.logger.info("No valid high-resolution table crops generated.")
+            return initial_layout_detections
+
+        structure_detections = [] # Detections relative to std_res input_image
+
+        # --- Run Table Recognition (will raise error on failure) ---
+        self.logger.debug(f"Running Surya table recognition on {len(high_res_crops)} high-res images...")
+        table_predictions = table_rec_predictor(high_res_crops)
+        self.logger.debug(f"Surya table recognition returned {len(table_predictions)} results.")
+
+        # --- Process Results ---
+        if len(table_predictions) != len(pdf_offsets):
+            # This case is less likely if predictor didn't error, but good sanity check
+            raise RuntimeError(f"Mismatch between table inputs ({len(pdf_offsets)}) and predictions ({len(table_predictions)}).")
+
+        for table_pred, (offset_pdf_x0, offset_pdf_y0) in zip(table_predictions, pdf_offsets):
+            # Process Rows
+            for row_box in table_pred.rows:
+                crop_rx0, crop_ry0, crop_rx1, crop_ry1 = map(float, row_box.bbox)
+                pdf_row_x0 = offset_pdf_x0 + crop_rx0 / pdf_to_highres_scale
+                pdf_row_y0 = offset_pdf_y0 + crop_ry0 / pdf_to_highres_scale
+                pdf_row_x1 = offset_pdf_x0 + crop_rx1 / pdf_to_highres_scale
+                pdf_row_y1 = offset_pdf_y0 + crop_ry1 / pdf_to_highres_scale
+                img_row_x0 = pdf_row_x0 / img_scale_x
+                img_row_y0 = pdf_row_y0 / img_scale_y
+                img_row_x1 = pdf_row_x1 / img_scale_x
+                img_row_y1 = pdf_row_y1 / img_scale_y
+                structure_detections.append({
+                    'bbox': (img_row_x0, img_row_y0, img_row_x1, img_row_y1),
+                    'class': 'table-row', 'confidence': 1.0, 'normalized_class': 'table-row',
+                    'source': 'layout', 'model': 'surya'
+                })
+
+            # Process Columns
+            for col_box in table_pred.cols:
+                crop_cx0, crop_cy0, crop_cx1, crop_cy1 = map(float, col_box.bbox)
+                pdf_col_x0 = offset_pdf_x0 + crop_cx0 / pdf_to_highres_scale
+                pdf_col_y0 = offset_pdf_y0 + crop_cy0 / pdf_to_highres_scale
+                pdf_col_x1 = offset_pdf_x0 + crop_cx1 / pdf_to_highres_scale
+                pdf_col_y1 = offset_pdf_y0 + crop_cy1 / pdf_to_highres_scale
+                img_col_x0 = pdf_col_x0 / img_scale_x
+                img_col_y0 = pdf_col_y0 / img_scale_y
+                img_col_x1 = pdf_col_x1 / img_scale_x
+                img_col_y1 = pdf_col_y1 / img_scale_y
+                structure_detections.append({
+                    'bbox': (img_col_x0, img_col_y0, img_col_x1, img_col_y1),
+                    'class': 'table-column', 'confidence': 1.0, 'normalized_class': 'table-column',
+                    'source': 'layout', 'model': 'surya'
+                })
+
+        self.logger.info(f"Added {len(structure_detections)} table structure elements.")
 
-        return
+        return initial_layout_detections + structure_detections
 
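Note on the coordinate handling above: `detect()` now moves between three coordinate spaces, the standard-resolution layout image, PDF points, and a high-resolution render used for table crops. A minimal standalone sketch of the same arithmetic, with illustrative sizes (only the 192 DPI default and the `img_scale_x`/`pdf_to_highres_scale` names come from the hunk; the numbers are made up for illustration):

# Illustrative setup: a 612 pt wide page rendered as a 1224 px layout image.
img_scale_x = 612 / 1224           # PDF points per layout-image pixel -> 0.5
pdf_to_highres_scale = 192 / 72.0  # high-res pixels per PDF point -> ~2.667

# Forward: layout-image pixel -> PDF point -> high-res pixel.
img_x0 = 512.0                         # table edge reported by the layout model
pdf_x0 = img_x0 * img_scale_x          # 256.0 pt on the page
hr_x0 = pdf_x0 * pdf_to_highres_scale  # ~682.7 px in the high-res render

# Inverse: table-rec boxes come back relative to the crop, so add the crop's
# stored PDF offset, then divide back down to layout-image pixels.
crop_rx0 = 40.0                                        # row edge inside the crop
pdf_row_x0 = pdf_x0 + crop_rx0 / pdf_to_highres_scale  # 271.0 pt
img_row_x0 = pdf_row_x0 / img_scale_x                  # 542.0 px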
--- /dev/null
+++ natural_pdf/collections/pdf_collection.py (0.1.3)
@@ -0,0 +1,259 @@
+import os
+import glob as py_glob
+import logging
+from typing import List, Optional, Dict, Any, Union, Iterable, Set, TYPE_CHECKING, Type
+from pathlib import Path
+from PIL import Image
+import re # Added for safe path generation
+import copy # Added for copying options
+from tqdm import tqdm
+
+# Set up logger early
+logger = logging.getLogger(__name__)
+
+from natural_pdf.core.pdf import PDF
+from natural_pdf.elements.region import Region
+
+# --- Search Imports ---
+try:
+    from natural_pdf.search.search_service_protocol import (
+        SearchServiceProtocol, SearchOptions, Indexable
+    )
+    from natural_pdf.search.searchable_mixin import SearchableMixin
+except ImportError as e:
+    logger_init = logging.getLogger(__name__)
+    logger_init.error(f"Failed to import search components. Search functionality disabled. Error: {e}", exc_info=True)
+    # Dummy definitions
+    class SearchableMixin: pass
+    SearchServiceProtocol, SearchOptions, Indexable = object, object, object
+
+from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
+
+class PDFCollection(SearchableMixin): # Inherit from the mixin
+    def __init__(self,
+                 source: Union[str, Iterable[Union[str, 'PDF']]],
+                 recursive: bool = True,
+                 **pdf_options: Any):
+        """
+        Initializes a collection of PDF documents from various sources.
+
+        Args:
+            source: The source of PDF documents. Can be:
+                - An iterable (e.g., list) of existing PDF objects.
+                - An iterable (e.g., list) of file paths/URLs/globs (strings).
+                - A single file path/URL/directory/glob string.
+            recursive: If source involves directories or glob patterns,
+                       whether to search recursively (default: True).
+            **pdf_options: Keyword arguments passed to the PDF constructor.
+        """
+        self._pdfs: List['PDF'] = []
+        self._pdf_options = pdf_options # Store options for potential slicing later
+        self._recursive = recursive # Store setting for potential slicing
+
+        # Dynamically import PDF class within methods to avoid circular import at module load time
+        PDF = self._get_pdf_class()
+
+        if hasattr(source, '__iter__') and not isinstance(source, str):
+            source_list = list(source)
+            if not source_list: return # Empty list source
+            if isinstance(source_list[0], PDF):
+                if all(isinstance(item, PDF) for item in source_list):
+                    self._pdfs = source_list # Direct assignment
+                    # Don't adopt search context anymore
+                    return
+                else: raise TypeError("Iterable source has mixed PDF/non-PDF objects.")
+            # If it's an iterable but not PDFs, fall through to resolve sources
+
+        # Resolve string, iterable of strings, or single string source to paths/URLs
+        resolved_paths_or_urls = self._resolve_sources_to_paths(source)
+        self._initialize_pdfs(resolved_paths_or_urls, PDF) # Pass PDF class
+
+        self._iter_index = 0
+
+        # Initialize internal search service reference
+        self._search_service: Optional[SearchServiceProtocol] = None
+
+    @staticmethod
+    def _get_pdf_class():
+        """Helper method to dynamically import the PDF class."""
+        try:
+            # Import needs to resolve path correctly
+            from natural_pdf.core.pdf import PDF
+            return PDF
+        except ImportError as e:
+            logger.error("Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime.")
+            raise ImportError("PDF class is required but could not be imported.") from e
+
+    # --- Internal Helpers ---
+
+    def _is_url(self, s: str) -> bool: return s.startswith(('http://', 'https://'))
+    def _has_glob_magic(self, s: str) -> bool: return py_glob.has_magic(s)
+
+    def _execute_glob(self, pattern: str) -> Set[str]:
+        """Glob for paths and return a set of valid PDF paths."""
+        found_paths = set()
+        try:
+            # Use iglob for potentially large directories/matches
+            paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
+            for path_str in paths_iter:
+                # Use Path object for easier checking
+                p = Path(path_str)
+                if p.is_file() and p.suffix.lower() == ".pdf":
+                    found_paths.add(str(p.resolve())) # Store resolved absolute path
+        except Exception as e:
+            logger.error(f"Error processing glob pattern '{pattern}': {e}")
+        return found_paths
+
+    def _resolve_sources_to_paths(self, source: Union[str, Iterable[str]]) -> List[str]:
+        """Resolves various source types into a list of unique PDF paths/URLs."""
+        final_paths = set()
+        sources_to_process = []
+
+        if isinstance(source, str):
+            sources_to_process.append(source)
+        elif hasattr(source, '__iter__'):
+            sources_to_process.extend(list(source))
+        else: # Should not happen based on __init__ checks, but safeguard
+            raise TypeError(f"Unexpected source type in _resolve_sources_to_paths: {type(source)}")
+
+        for item in sources_to_process:
+            if not isinstance(item, str):
+                logger.warning(f"Skipping non-string item in source list: {type(item)}")
+                continue
+
+            item_path = Path(item)
+
+            if self._is_url(item):
+                final_paths.add(item) # Add URL directly
+            elif self._has_glob_magic(item):
+                glob_results = self._execute_glob(item)
+                final_paths.update(glob_results)
+            elif item_path.is_dir():
+                # Use glob to find PDFs in directory, respecting recursive flag
+                dir_pattern = str(item_path / "**" / "*.pdf") if self._recursive else str(item_path / "*.pdf")
+                dir_glob_results = self._execute_glob(dir_pattern)
+                final_paths.update(dir_glob_results)
+            elif item_path.is_file() and item_path.suffix.lower() == ".pdf":
+                final_paths.add(str(item_path.resolve())) # Add resolved file path
+            else:
+                logger.warning(f"Source item ignored (not a valid URL, directory, file, or glob): {item}")
+
+        return sorted(list(final_paths))
+
+    def _initialize_pdfs(self, paths_or_urls: List[str], PDF_cls: Type):
+        """Initializes PDF objects from a list of paths/URLs."""
+        logger.info(f"Initializing {len(paths_or_urls)} PDF objects...")
+        failed_count = 0
+        for path_or_url in tqdm(paths_or_urls, desc="Loading PDFs"):
+            try:
+                pdf_instance = PDF_cls(path_or_url, **self._pdf_options)
+                self._pdfs.append(pdf_instance)
+            except Exception as e:
+                logger.error(f"Failed to load PDF: {path_or_url}. Error: {e}", exc_info=False) # Keep log concise
+                failed_count += 1
+        logger.info(f"Successfully initialized {len(self._pdfs)} PDFs. Failed: {failed_count}")
+
+    # --- Public Factory Class Methods (Simplified) ---
+
+    @classmethod
+    def from_paths(cls, paths_or_urls: List[str], **pdf_options: Any) -> 'PDFCollection':
+        """Creates a PDFCollection explicitly from a list of file paths or URLs."""
+        # __init__ can handle List[str] directly now
+        return cls(paths_or_urls, **pdf_options)
+
+    @classmethod
+    def from_glob(cls, pattern: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+        """Creates a PDFCollection explicitly from a single glob pattern."""
+        # __init__ can handle single glob string directly
+        return cls(pattern, recursive=recursive, **pdf_options)
+
+    @classmethod
+    def from_globs(cls, patterns: List[str], recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+        """Creates a PDFCollection explicitly from a list of glob patterns."""
+        # __init__ can handle List[str] containing globs directly
+        return cls(patterns, recursive=recursive, **pdf_options)
+
+    @classmethod
+    def from_directory(cls, directory_path: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+        """Creates a PDFCollection explicitly from PDF files within a directory."""
+        # __init__ can handle single directory string directly
+        return cls(directory_path, recursive=recursive, **pdf_options)
+
+    # --- Core Collection Methods ---
+    def __len__(self) -> int:
+        return len(self._pdfs)
+
+    def __getitem__(self, key) -> Union['PDF', 'PDFCollection']:
+        # Use dynamic import here as well
+        PDF = self._get_pdf_class()
+        if isinstance(key, slice):
+            # Create a new collection with the sliced PDFs and original options
+            new_collection = PDFCollection.__new__(PDFCollection) # Create blank instance
+            new_collection._pdfs = self._pdfs[key]
+            new_collection._pdf_options = self._pdf_options
+            new_collection._recursive = self._recursive
+            # Search context is not copied/inherited anymore
+            return new_collection
+        elif isinstance(key, int):
+            # Check bounds
+            if 0 <= key < len(self._pdfs):
+                return self._pdfs[key]
+            else:
+                raise IndexError(f"PDF index {key} out of range (0-{len(self._pdfs)-1}).")
+        else:
+            raise TypeError(f"PDF indices must be integers or slices, not {type(key)}.")
+
+    def __iter__(self):
+        return iter(self._pdfs)
+
+    def __repr__(self) -> str:
+        # Removed search status
+        return f"<PDFCollection(count={len(self)})>"
+
+    @property
+    def pdfs(self) -> List['PDF']:
+        """Returns the list of PDF objects held by the collection."""
+        return self._pdfs
+
+    # --- Other Methods (e.g., apply_ocr_to_pages - could leverage service in future?) ---
+    def apply_ocr_to_pages(self, *args, **kwargs):
+        PDF = self._get_pdf_class()
+        # Delegate to individual PDF objects
+        logger.info("Applying OCR to relevant PDFs in collection...")
+        results = []
+        for pdf in self._pdfs:
+            # We need to figure out which pages belong to which PDF if batching here
+            # For now, simpler to call on each PDF
+            try:
+                # Assume apply_ocr_to_pages exists on PDF and accepts similar args
+                pdf.apply_ocr_to_pages(*args, **kwargs)
+            except Exception as e:
+                logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
+        return self
+
+    # --- Advanced Method Placeholders ---
+    # Placeholder for categorize removed as find_relevant is now implemented
+
+    def categorize(self, categories: List[str], **kwargs):
+        """Categorizes PDFs in the collection based on content or features."""
+        # Implementation requires integrating with classification models or logic
+        raise NotImplementedError("categorize requires classification implementation.")
+
+    # --- Mixin Required Implementation ---
+    def get_indexable_items(self) -> Iterable[Indexable]:
+        """Yields Page objects from the collection, conforming to Indexable."""
+        if not self._pdfs:
+            return # Return empty iterator if no PDFs
+
+        for pdf in self._pdfs:
+            if not pdf.pages: # Handle case where a PDF might have 0 pages after loading
+                logger.warning(f"PDF '{pdf.path}' has no pages. Skipping.")
+                continue
+            for page in pdf.pages:
+                # Optional: Add filtering here if needed (e.g., skip empty pages)
+                # Assuming Page object conforms to Indexable
+                # We might still want the empty page check here for efficiency
+                # if not page.extract_text(use_exclusions=False).strip():
+                #     logger.debug(f"Skipping empty page {page.page_number} from PDF '{pdf.path}'.")
+                #     continue
+                yield page
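For orientation, a hedged usage sketch of the new PDFCollection API, based only on the constructor, factory methods, and dunder methods shown in the hunk above (the glob pattern and count are illustrative, not from the package):

from natural_pdf.collections.pdf_collection import PDFCollection

# A glob string source; directories, URLs, lists of paths, or existing PDF
# objects are also accepted by __init__ per the resolution logic above.
docs = PDFCollection("reports/**/*.pdf", recursive=True)  # illustrative path
print(len(docs), repr(docs))  # e.g. 12 <PDFCollection(count=12)>

first = docs[0]     # integer index -> a single PDF object
subset = docs[1:4]  # slice -> a new PDFCollection sharing the original options

for pdf in subset:  # __iter__ yields the underlying PDF objects
    print(pdf.path)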