abstractcore 2.4.4__py3-none-any.whl → 2.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/cli/__init__.py +9 -0
- abstractcore/cli/main.py +759 -0
- abstractcore/cli/vision_config.py +491 -0
- abstractcore/core/interface.py +7 -0
- abstractcore/core/session.py +27 -2
- abstractcore/media/handlers/__init__.py +16 -0
- abstractcore/media/handlers/anthropic_handler.py +326 -0
- abstractcore/media/handlers/local_handler.py +541 -0
- abstractcore/media/handlers/openai_handler.py +281 -0
- abstractcore/media/processors/__init__.py +13 -0
- abstractcore/media/processors/image_processor.py +610 -0
- abstractcore/media/processors/office_processor.py +490 -0
- abstractcore/media/processors/pdf_processor.py +485 -0
- abstractcore/media/processors/text_processor.py +557 -0
- abstractcore/media/utils/__init__.py +22 -0
- abstractcore/media/utils/image_scaler.py +306 -0
- abstractcore/providers/anthropic_provider.py +14 -2
- abstractcore/providers/base.py +24 -0
- abstractcore/providers/huggingface_provider.py +23 -9
- abstractcore/providers/lmstudio_provider.py +6 -1
- abstractcore/providers/mlx_provider.py +20 -7
- abstractcore/providers/ollama_provider.py +6 -1
- abstractcore/providers/openai_provider.py +6 -2
- abstractcore/tools/common_tools.py +651 -1
- abstractcore/utils/version.py +1 -1
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/METADATA +59 -9
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/RECORD +31 -17
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/entry_points.txt +2 -0
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/WHEEL +0 -0
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,485 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDF processor using PyMuPDF4LLM for optimized LLM processing.
|
|
3
|
+
|
|
4
|
+
This module provides comprehensive PDF processing capabilities using PyMuPDF4LLM,
|
|
5
|
+
optimized for LLM consumption with excellent markdown output and structure preservation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional, Dict, Any, List, Union, Tuple
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import pymupdf4llm
|
|
13
|
+
PYMUPDF4LLM_AVAILABLE = True
|
|
14
|
+
except ImportError:
|
|
15
|
+
PYMUPDF4LLM_AVAILABLE = False
|
|
16
|
+
pymupdf4llm = None
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
import pymupdf as fitz
|
|
20
|
+
PYMUPDF_AVAILABLE = True
|
|
21
|
+
except ImportError:
|
|
22
|
+
PYMUPDF_AVAILABLE = False
|
|
23
|
+
fitz = None
|
|
24
|
+
|
|
25
|
+
from ..base import BaseMediaHandler, MediaProcessingError
|
|
26
|
+
from ..types import MediaContent, MediaType, ContentFormat
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class PDFProcessor(BaseMediaHandler):
|
|
30
|
+
"""
|
|
31
|
+
PDF processor using PyMuPDF4LLM for LLM-optimized document processing.
|
|
32
|
+
|
|
33
|
+
Provides high-quality text extraction, structure preservation, table detection,
|
|
34
|
+
and image extraction from PDF documents.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
def __init__(self, **kwargs):
    """
    Create a PDF processor and record its configuration.

    Args:
        **kwargs: Configuration parameters. They are forwarded unchanged to
            the base handler and the following keys are also read here:
            - extract_images: Whether to extract embedded images (default False)
            - preserve_tables: Whether to preserve table formatting (default True)
            - markdown_output: Whether to output as markdown (default True)
            - page_range: Tuple of (start_page, end_page) or None for all pages
            - extract_metadata: Whether to extract PDF metadata (default True)

    Raises:
        ImportError: If the pymupdf4llm package could not be imported.
    """
    if not PYMUPDF4LLM_AVAILABLE:
        install_hint = (
            "PyMuPDF4LLM is required for PDF processing. "
            'Install with: pip install "abstractcore[media]"'
        )
        raise ImportError(install_hint)

    super().__init__(**kwargs)

    # PDF processing configuration: (option name, value used when not supplied)
    option_defaults = (
        ('extract_images', False),
        ('preserve_tables', True),
        ('markdown_output', True),
        ('page_range', None),
        ('extract_metadata', True),
    )
    for option_name, fallback in option_defaults:
        setattr(self, option_name, kwargs.get(option_name, fallback))

    # Advertise what this handler can do; vision support follows extract_images
    from ..types import MediaCapabilities
    capability_settings = {
        'vision_support': self.extract_images,
        'audio_support': False,
        'video_support': False,
        'document_support': True,
        'supported_document_formats': ['pdf'],
        'max_file_size': self.max_file_size,
    }
    self.capabilities = MediaCapabilities(**capability_settings)

    self.logger.debug(
        f"Initialized PDFProcessor with extract_images={self.extract_images}, "
        f"preserve_tables={self.preserve_tables}, "
        f"markdown_output={self.markdown_output}"
    )
|
|
79
|
+
|
|
80
|
+
def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
    """
    Turn a PDF file into a MediaContent object suitable for LLM input.

    Args:
        file_path: Path to the PDF file
        media_type: Detected media type; anything other than
            MediaType.DOCUMENT is rejected
        **kwargs: Per-call overrides:
            - page_range: Replaces the instance's page range
            - extract_images: Replaces the instance's image extraction flag
            - output_format: 'markdown', 'text', or 'structured'
              (defaults to 'markdown' when markdown_output is set, else 'text')
            - dpi: DPI for image extraction (default: 150)

    Returns:
        MediaContent built from the extracted content and its metadata

    Raises:
        MediaProcessingError: If the media type is not a document or if
            any step of the processing fails
    """
    if media_type != MediaType.DOCUMENT:
        raise MediaProcessingError(f"PDFProcessor only handles document types, got {media_type}")

    try:
        # Per-call settings take precedence over the instance configuration
        fallback_format = 'markdown' if self.markdown_output else 'text'
        output_format = kwargs.get('output_format', fallback_format)
        page_range = kwargs.get('page_range', self.page_range)
        extract_images = kwargs.get('extract_images', self.extract_images)
        dpi = kwargs.get('dpi', 150)

        content, metadata = self._extract_pdf_content(
            file_path, page_range, extract_images, output_format, dpi
        )

        # The MIME type only depends on the requested output format;
        # any unrecognised format is reported as plain text
        mime_type = (
            'text/markdown' if output_format == 'markdown'
            else 'application/json' if output_format == 'structured'
            else 'text/plain'
        )

        return self._create_media_content(
            content=content,
            file_path=file_path,
            media_type=MediaType.DOCUMENT,
            content_format=ContentFormat.TEXT,
            mime_type=mime_type,
            **metadata
        )

    except Exception as e:
        raise MediaProcessingError(f"Failed to process PDF {file_path}: {str(e)}") from e
|
|
133
|
+
|
|
134
|
+
def _extract_pdf_content(self, file_path: Path, page_range: Optional[Tuple[int, int]],
                         extract_images: bool, output_format: str, dpi: int) -> Tuple[str, Dict[str, Any]]:
    """
    Extract content from a PDF, preferring PyMuPDF4LLM.

    Markdown output always goes through PyMuPDF4LLM. Any other output format
    uses plain PyMuPDF when it is installed, and otherwise falls back to
    PyMuPDF4LLM markdown that is then reduced to plain text.

    Args:
        file_path: Path to the PDF file
        page_range: Optional inclusive, 0-based (start_page, end_page) tuple;
            a falsy value means all pages
        extract_images: Whether to extract images
        output_format: Output format ('markdown', 'text', 'structured')
        dpi: DPI for image extraction

    Returns:
        Tuple of (content, metadata). The metadata combines the document
        metadata (when extract_metadata is enabled), whatever the extraction
        backend reported, and a description of this processing run.

    Raises:
        MediaProcessingError: If extraction fails for any reason
    """
    try:
        # pymupdf4llm's `pages` option takes a sequence of 0-based page
        # numbers, not a (start, end) pair, so the inclusive range has to
        # be expanded; passing the raw tuple would select only two pages.
        pages = None
        if page_range:
            first_page, last_page = page_range
            pages = list(range(first_page, last_page + 1))

        # Configure PyMuPDF4LLM options
        extraction_options = {
            'pages': pages,
            'write_images': extract_images,
            'image_format': 'png',
            'dpi': dpi,
            'table_strategy': 'lines_strict' if self.preserve_tables else 'lines'
        }

        # Remove None values from options so library defaults apply
        extraction_options = {k: v for k, v in extraction_options.items() if v is not None}

        backend_metadata: Dict[str, Any] = {}
        extraction_method = 'pymupdf4llm'

        if output_format == 'markdown':
            content = pymupdf4llm.to_markdown(str(file_path), **extraction_options)
        elif PYMUPDF_AVAILABLE:
            # Plain PyMuPDF text extraction; keep what it reports
            # (processed pages, image details) instead of discarding it.
            content, backend_metadata = self._extract_with_pymupdf(
                file_path, page_range, extract_images
            )
            extraction_method = 'pymupdf'
        else:
            # Fallback: markdown from PyMuPDF4LLM reduced to plain text
            md_text = pymupdf4llm.to_markdown(str(file_path), **extraction_options)
            content = self._markdown_to_text(md_text)

        # Document-level metadata is optional (extract_metadata option)
        metadata = self._extract_pdf_metadata(file_path) if self.extract_metadata else {}
        metadata.update(backend_metadata)

        # Add processing metadata describing this run
        metadata.update({
            'extraction_method': extraction_method,
            'output_format': output_format,
            'page_range': page_range,
            'images_extracted': extract_images,
            'tables_preserved': self.preserve_tables,
            'content_length': len(content)
        })

        return content, metadata

    except Exception as e:
        raise MediaProcessingError(f"PyMuPDF4LLM extraction failed: {str(e)}") from e
|
|
193
|
+
|
|
194
|
+
def _extract_with_pymupdf(self, file_path: Path, page_range: Optional[Tuple[int, int]],
                          extract_images: bool) -> Tuple[str, Dict[str, Any]]:
    """
    Extract plain text (and optionally image details) using regular PyMuPDF.

    Args:
        file_path: Path to the PDF file
        page_range: Optional inclusive, 0-based (start_page, end_page) tuple;
            a falsy value means all pages. The range is clamped to the
            pages that actually exist in the document.
        extract_images: Whether to collect image information per page

    Returns:
        Tuple of (content, metadata). Each non-empty page contributes a
        "# Page N" section (N is 1-based) to the content. The metadata holds
        'page_count', 'processed_pages', 'images_found', 'extraction_method'
        and, when any images were found, 'images'.
    """
    doc = fitz.open(str(file_path))
    content_parts = []
    images = []

    try:
        last_index = doc.page_count - 1

        # Determine page range
        if page_range:
            start_page, end_page = page_range[0], page_range[1]
        else:
            start_page, end_page = 0, last_index

        # Clamp to the document: a negative start would otherwise index
        # pages from the end, and an oversized end would raise.
        start_page = max(start_page, 0)
        end_page = min(end_page, last_index)

        # An empty range (start beyond end) simply yields no pages
        page_numbers = range(start_page, end_page + 1)

        for page_num in page_numbers:
            page = doc[page_num]

            # Extract text; pages with only whitespace are skipped
            page_text = page.get_text()
            if page_text.strip():
                content_parts.append(f"# Page {page_num + 1}\n\n{page_text}\n")

            # Extract images if requested
            if extract_images:
                images.extend(self._extract_page_images(page, page_num))

        content = "\n".join(content_parts)

        metadata = {
            'page_count': doc.page_count,
            # len() of a range is never negative, unlike end - start + 1
            'processed_pages': len(page_numbers),
            'images_found': len(images),
            'extraction_method': 'pymupdf'
        }

        if images:
            metadata['images'] = images

        return content, metadata

    finally:
        doc.close()
|
|
246
|
+
|
|
247
|
+
def _extract_page_images(self, page, page_num: int) -> List[Dict[str, Any]]:
|
|
248
|
+
"""
|
|
249
|
+
Extract images from a PDF page.
|
|
250
|
+
|
|
251
|
+
Args:
|
|
252
|
+
page: PyMuPDF page object
|
|
253
|
+
page_num: Page number
|
|
254
|
+
|
|
255
|
+
Returns:
|
|
256
|
+
List of image metadata dictionaries
|
|
257
|
+
"""
|
|
258
|
+
images = []
|
|
259
|
+
|
|
260
|
+
try:
|
|
261
|
+
# Get image list
|
|
262
|
+
image_list = page.get_images()
|
|
263
|
+
|
|
264
|
+
for img_index, img in enumerate(image_list):
|
|
265
|
+
# Extract image
|
|
266
|
+
xref = img[0]
|
|
267
|
+
pix = fitz.Pixmap(page.parent, xref)
|
|
268
|
+
|
|
269
|
+
if pix.n - pix.alpha < 4: # GRAY or RGB
|
|
270
|
+
# Convert to PNG bytes
|
|
271
|
+
img_data = pix.tobytes("png")
|
|
272
|
+
|
|
273
|
+
# Create image metadata
|
|
274
|
+
image_info = {
|
|
275
|
+
'page': page_num + 1,
|
|
276
|
+
'index': img_index,
|
|
277
|
+
'width': pix.width,
|
|
278
|
+
'height': pix.height,
|
|
279
|
+
'colorspace': pix.colorspace.name if pix.colorspace else 'Unknown',
|
|
280
|
+
'size_bytes': len(img_data),
|
|
281
|
+
'format': 'png'
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
images.append(image_info)
|
|
285
|
+
|
|
286
|
+
pix = None # Free memory
|
|
287
|
+
|
|
288
|
+
except Exception as e:
|
|
289
|
+
self.logger.warning(f"Failed to extract images from page {page_num}: {e}")
|
|
290
|
+
|
|
291
|
+
return images
|
|
292
|
+
|
|
293
|
+
def _extract_pdf_metadata(self, file_path: Path) -> Dict[str, Any]:
    """
    Extract document-level metadata from a PDF file.

    This is best effort: any failure is logged as a warning and recorded
    under 'metadata_extraction_error' instead of being raised. Without
    PyMuPDF an empty dictionary is returned.

    Args:
        file_path: Path to the PDF file

    Returns:
        Dictionary of PDF metadata. Entries whose value is falsy (empty
        strings, but also False and 0) are left out.
    """
    metadata = {}

    try:
        if PYMUPDF_AVAILABLE:
            doc = fitz.open(str(file_path))
            try:
                # PyMuPDF documents metadata as a dict or None
                # (e.g. for an encrypted file that still needs a password).
                pdf_metadata = doc.metadata or {}

                # NOTE(review): Document.pdf_version() does not appear to be
                # available in every PyMuPDF release - confirm. Fall back to
                # the 'format' metadata entry so that one missing accessor
                # does not throw away all of the other fields.
                version_getter = getattr(doc, 'pdf_version', None)
                if callable(version_getter):
                    pdf_version = version_getter()
                else:
                    pdf_version = pdf_metadata.get('format', '')

                # Extract useful metadata
                metadata.update({
                    'title': pdf_metadata.get('title', ''),
                    'author': pdf_metadata.get('author', ''),
                    'subject': pdf_metadata.get('subject', ''),
                    'creator': pdf_metadata.get('creator', ''),
                    'producer': pdf_metadata.get('producer', ''),
                    'creation_date': pdf_metadata.get('creationDate', ''),
                    'modification_date': pdf_metadata.get('modDate', ''),
                    'page_count': doc.page_count,
                    'encrypted': doc.needs_pass,
                    'pdf_version': pdf_version
                })

                # Clean up empty values
                metadata = {k: v for k, v in metadata.items() if v}

            finally:
                doc.close()

    except Exception as e:
        self.logger.warning(f"Failed to extract PDF metadata: {e}")
        metadata['metadata_extraction_error'] = str(e)

    return metadata
|
|
336
|
+
|
|
337
|
+
def _markdown_to_text(self, markdown_content: str) -> str:
|
|
338
|
+
"""
|
|
339
|
+
Convert markdown content to plain text (basic conversion).
|
|
340
|
+
|
|
341
|
+
Args:
|
|
342
|
+
markdown_content: Markdown content
|
|
343
|
+
|
|
344
|
+
Returns:
|
|
345
|
+
Plain text content
|
|
346
|
+
"""
|
|
347
|
+
import re
|
|
348
|
+
|
|
349
|
+
# Remove markdown formatting
|
|
350
|
+
text = markdown_content
|
|
351
|
+
|
|
352
|
+
# Remove headers
|
|
353
|
+
text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
|
|
354
|
+
|
|
355
|
+
# Remove bold/italic
|
|
356
|
+
text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
|
|
357
|
+
text = re.sub(r'\*([^*]+)\*', r'\1', text)
|
|
358
|
+
|
|
359
|
+
# Remove links but keep text
|
|
360
|
+
text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
|
|
361
|
+
|
|
362
|
+
# Remove inline code
|
|
363
|
+
text = re.sub(r'`([^`]+)`', r'\1', text)
|
|
364
|
+
|
|
365
|
+
# Remove code blocks
|
|
366
|
+
text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
|
|
367
|
+
|
|
368
|
+
# Clean up extra whitespace
|
|
369
|
+
text = re.sub(r'\n\s*\n', '\n\n', text)
|
|
370
|
+
|
|
371
|
+
return text.strip()
|
|
372
|
+
|
|
373
|
+
def get_pdf_info(self, file_path: Union[str, Path]) -> Dict[str, Any]:
    """
    Get summary information about a PDF without full processing.

    This method never raises: on any failure it returns a dictionary with
    'filename', 'error' and 'file_size' (0 when the file does not exist).

    Args:
        file_path: Path to the PDF file

    Returns:
        Dictionary with PDF information. With PyMuPDF installed it contains
        'filename', 'file_size', 'page_count', 'encrypted', 'pdf_version',
        'metadata' and, for non-empty documents, 'page_size' and
        'first_page_text_length'. Without PyMuPDF only 'filename',
        'file_size' and 'pymupdf_not_available' are reported.
    """
    file_path = Path(file_path)

    try:
        if not PYMUPDF_AVAILABLE:
            # Basic file info only
            return {
                'filename': file_path.name,
                'file_size': file_path.stat().st_size,
                'pymupdf_not_available': True
            }

        doc = fitz.open(str(file_path))
        try:
            doc_metadata = doc.metadata

            # NOTE(review): Document.pdf_version() does not appear to be
            # available in every PyMuPDF release - confirm. Fall back to
            # the 'format' metadata entry so a missing accessor does not
            # turn the whole result into an error report.
            version_getter = getattr(doc, 'pdf_version', None)
            if callable(version_getter):
                pdf_version = version_getter()
            else:
                pdf_version = (doc_metadata or {}).get('format', '')

            info = {
                'filename': file_path.name,
                'file_size': file_path.stat().st_size,
                'page_count': doc.page_count,
                'encrypted': doc.needs_pass,
                'pdf_version': pdf_version,
                'metadata': doc_metadata
            }

            # Get first page info
            if doc.page_count > 0:
                first_page = doc[0]
                info['page_size'] = first_page.rect
                info['first_page_text_length'] = len(first_page.get_text())

            return info

        finally:
            doc.close()

    except Exception as e:
        return {
            'filename': file_path.name,
            'error': str(e),
            'file_size': file_path.stat().st_size if file_path.exists() else 0
        }
|
|
422
|
+
|
|
423
|
+
def extract_text_from_pages(self, file_path: Union[str, Path],
                            start_page: int, end_page: int) -> str:
    """
    Extract text from a contiguous range of pages of a PDF.

    Args:
        file_path: Path to the PDF file
        start_page: Starting page number (1-based, inclusive)
        end_page: Ending page number (1-based, inclusive)

    Returns:
        Extracted content for the requested pages: markdown when
        markdown_output is enabled, otherwise plain text

    Raises:
        MediaProcessingError: If the page range is invalid or extraction fails
    """
    file_path = Path(file_path)

    try:
        if start_page < 1 or end_page < start_page:
            raise ValueError(
                f"invalid page range {start_page}-{end_page}: pages are 1-based "
                f"and end_page must not be smaller than start_page"
            )

        # pymupdf4llm's `pages` option takes a sequence of 0-based page
        # numbers, so expand the inclusive 1-based range; passing a
        # (start, end) pair would select only those two pages.
        pages = list(range(start_page - 1, end_page))

        # Use PyMuPDF4LLM for extraction
        extraction_options = {
            'pages': pages,
            'write_images': False,
            'table_strategy': 'lines_strict' if self.preserve_tables else 'lines'
        }

        content = pymupdf4llm.to_markdown(str(file_path), **extraction_options)

        if not self.markdown_output:
            # Plain text requested: reduce the markdown to text
            content = self._markdown_to_text(content)

        return content

    except Exception as e:
        raise MediaProcessingError(f"Failed to extract text from pages {start_page}-{end_page}: {str(e)}") from e
|
|
460
|
+
|
|
461
|
+
def get_processing_info(self) -> Dict[str, Any]:
    """
    Describe this processor: its type, formats, settings and dependencies.

    Returns:
        Dictionary with 'processor_type', 'supported_formats',
        'capabilities' (the current option values plus fixed feature flags)
        and 'dependencies' (whether each optional PDF library was importable)
    """
    # Values that reflect how this instance was configured
    capabilities: Dict[str, Any] = {
        'extract_images': self.extract_images,
        'preserve_tables': self.preserve_tables,
        'markdown_output': self.markdown_output,
        'page_range_support': True,
        'metadata_extraction': self.extract_metadata,
    }
    # Features that are always reported as present
    for always_on in ('pymupdf4llm_integration', 'text_extraction', 'structure_preservation'):
        capabilities[always_on] = True

    dependencies = {
        'pymupdf4llm': PYMUPDF4LLM_AVAILABLE,
        'pymupdf': PYMUPDF_AVAILABLE,
    }

    processor_info: Dict[str, Any] = {
        'processor_type': 'PDFProcessor',
        'supported_formats': ['pdf'],
        'capabilities': capabilities,
        'dependencies': dependencies,
    }
    return processor_info
|