abstractcore 2.4.4__py3-none-any.whl → 2.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. abstractcore/cli/__init__.py +9 -0
  2. abstractcore/cli/main.py +759 -0
  3. abstractcore/cli/vision_config.py +491 -0
  4. abstractcore/core/interface.py +7 -0
  5. abstractcore/core/session.py +27 -2
  6. abstractcore/media/handlers/__init__.py +16 -0
  7. abstractcore/media/handlers/anthropic_handler.py +326 -0
  8. abstractcore/media/handlers/local_handler.py +541 -0
  9. abstractcore/media/handlers/openai_handler.py +281 -0
  10. abstractcore/media/processors/__init__.py +13 -0
  11. abstractcore/media/processors/image_processor.py +610 -0
  12. abstractcore/media/processors/office_processor.py +490 -0
  13. abstractcore/media/processors/pdf_processor.py +485 -0
  14. abstractcore/media/processors/text_processor.py +557 -0
  15. abstractcore/media/utils/__init__.py +22 -0
  16. abstractcore/media/utils/image_scaler.py +306 -0
  17. abstractcore/providers/anthropic_provider.py +14 -2
  18. abstractcore/providers/base.py +24 -0
  19. abstractcore/providers/huggingface_provider.py +23 -9
  20. abstractcore/providers/lmstudio_provider.py +6 -1
  21. abstractcore/providers/mlx_provider.py +20 -7
  22. abstractcore/providers/ollama_provider.py +6 -1
  23. abstractcore/providers/openai_provider.py +6 -2
  24. abstractcore/tools/common_tools.py +651 -1
  25. abstractcore/utils/version.py +1 -1
  26. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/METADATA +59 -9
  27. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/RECORD +31 -17
  28. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/entry_points.txt +2 -0
  29. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/WHEEL +0 -0
  30. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/licenses/LICENSE +0 -0
  31. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,485 @@
1
+ """
2
+ PDF processor using PyMuPDF4LLM for optimized LLM processing.
3
+
4
+ This module provides comprehensive PDF processing capabilities using PyMuPDF4LLM,
5
+ optimized for LLM consumption with excellent markdown output and structure preservation.
6
+ """
7
+
8
+ from pathlib import Path
9
+ from typing import Optional, Dict, Any, List, Union, Tuple
10
+
11
+ try:
12
+ import pymupdf4llm
13
+ PYMUPDF4LLM_AVAILABLE = True
14
+ except ImportError:
15
+ PYMUPDF4LLM_AVAILABLE = False
16
+ pymupdf4llm = None
17
+
18
+ try:
19
+ import pymupdf as fitz
20
+ PYMUPDF_AVAILABLE = True
21
+ except ImportError:
22
+ PYMUPDF_AVAILABLE = False
23
+ fitz = None
24
+
25
+ from ..base import BaseMediaHandler, MediaProcessingError
26
+ from ..types import MediaContent, MediaType, ContentFormat
27
+
28
+
29
+ class PDFProcessor(BaseMediaHandler):
30
+ """
31
+ PDF processor using PyMuPDF4LLM for LLM-optimized document processing.
32
+
33
+ Provides high-quality text extraction, structure preservation, table detection,
34
+ and image extraction from PDF documents.
35
+ """
36
+
37
def __init__(self, **kwargs):
    """
    Set up the PDF processor and record its configuration.

    Args:
        **kwargs: Configuration parameters. They are forwarded unchanged to
            the base handler and additionally read here for:
            - extract_images: Whether to extract embedded images (default: False)
            - preserve_tables: Whether to preserve table formatting (default: True)
            - markdown_output: Whether to output as markdown (default: True)
            - page_range: Tuple of (start_page, end_page) or None for all pages
            - extract_metadata: Whether to extract PDF metadata (default: True)

    Raises:
        ImportError: If the pymupdf4llm package could not be imported.
    """
    if not PYMUPDF4LLM_AVAILABLE:
        raise ImportError(
            "PyMuPDF4LLM is required for PDF processing. "
            "Install with: pip install \"abstractcore[media]\""
        )

    super().__init__(**kwargs)

    # (option name, value used when the caller did not supply the option)
    option_defaults = (
        ('extract_images', False),
        ('preserve_tables', True),
        ('markdown_output', True),
        ('page_range', None),
        ('extract_metadata', True),
    )
    for option_name, fallback in option_defaults:
        setattr(self, option_name, kwargs.get(option_name, fallback))

    # Advertise what this handler can do; vision support mirrors the
    # image-extraction setting chosen above.
    # NOTE(review): self.max_file_size is presumably set by the base
    # initializer - confirm against BaseMediaHandler.
    from ..types import MediaCapabilities
    self.capabilities = MediaCapabilities(
        vision_support=self.extract_images,
        audio_support=False,
        video_support=False,
        document_support=True,
        supported_document_formats=['pdf'],
        max_file_size=self.max_file_size
    )

    self.logger.debug(
        f"Initialized PDFProcessor with extract_images={self.extract_images}, "
        f"preserve_tables={self.preserve_tables}, markdown_output={self.markdown_output}"
    )
79
+
80
def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
    """
    Turn one PDF file into a MediaContent object for LLM consumption.

    Args:
        file_path: Path to the PDF file
        media_type: Detected media type; anything other than
            MediaType.DOCUMENT is rejected
        **kwargs: Per-call overrides:
            - page_range: Replaces the processor-wide page range
            - extract_images: Replaces the processor-wide image setting
            - output_format: 'markdown', 'text', or 'structured'
            - dpi: DPI for image extraction (default: 150)

    Returns:
        MediaContent holding the extracted content and its metadata

    Raises:
        MediaProcessingError: If the media type is not a document or if any
            step of the extraction fails
    """
    if media_type != MediaType.DOCUMENT:
        raise MediaProcessingError(f"PDFProcessor only handles document types, got {media_type}")

    try:
        # Per-call values win over the settings stored at construction time.
        fallback_format = 'markdown' if self.markdown_output else 'text'
        output_format = kwargs.get('output_format', fallback_format)

        content, metadata = self._extract_pdf_content(
            file_path,
            kwargs.get('page_range', self.page_range),
            kwargs.get('extract_images', self.extract_images),
            output_format,
            kwargs.get('dpi', 150),
        )

        # The MIME type follows the requested output format; anything that
        # is neither markdown nor structured is reported as plain text.
        mime_type = (
            'text/markdown' if output_format == 'markdown'
            else 'application/json' if output_format == 'structured'
            else 'text/plain'
        )

        return self._create_media_content(
            content=content,
            file_path=file_path,
            media_type=MediaType.DOCUMENT,
            content_format=ContentFormat.TEXT,
            mime_type=mime_type,
            **metadata
        )

    except Exception as e:
        raise MediaProcessingError(f"Failed to process PDF {file_path}: {str(e)}") from e
133
+
134
def _extract_pdf_content(self, file_path: Path, page_range: Optional[Tuple[int, int]],
                         extract_images: bool, output_format: str, dpi: int) -> Tuple[str, Dict[str, Any]]:
    """
    Extract content and metadata from a PDF.

    Markdown output is produced by PyMuPDF4LLM. Any other output format uses
    plain PyMuPDF when it is importable, and otherwise falls back to
    PyMuPDF4LLM markdown that is then reduced to plain text.

    Args:
        file_path: Path to the PDF file
        page_range: Optional (start, end) pair of 0-based page indices, both
            inclusive; None or an empty value means all pages
        extract_images: Whether to extract images
        output_format: Output format ('markdown', 'text', 'structured')
        dpi: DPI for image extraction

    Returns:
        Tuple of (content, metadata). The metadata merges, in increasing
        priority: document metadata (only when ``extract_metadata`` is
        enabled), metadata reported by the PyMuPDF text extractor, and the
        processing summary built here.

    Raises:
        MediaProcessingError: If any step of the extraction fails
    """
    try:
        # pymupdf4llm's ``pages`` argument is a sequence of individual
        # 0-based page numbers, not a (start, end) pair. Passing the pair
        # through unchanged would convert only those two pages, so expand it
        # to the full inclusive run. The upper bound is not clamped to the
        # document length here.
        pages = None
        if page_range:
            first_page, last_page = page_range
            pages = list(range(first_page, last_page + 1))

        extraction_options = {
            'pages': pages,
            'write_images': extract_images,
            'image_format': 'png',
            'dpi': dpi,
            'table_strategy': 'lines_strict' if self.preserve_tables else 'lines'
        }

        # Omit unset options so the library applies its own defaults
        extraction_options = {k: v for k, v in extraction_options.items() if v is not None}

        extraction_metadata: Dict[str, Any] = {}
        extraction_method = 'pymupdf4llm'

        if output_format == 'markdown':
            content = pymupdf4llm.to_markdown(str(file_path), **extraction_options)
        elif PYMUPDF_AVAILABLE:
            # Keep what the PyMuPDF extractor reports (processed pages,
            # image list, ...) instead of discarding it.
            content, extraction_metadata = self._extract_with_pymupdf(
                file_path, page_range, extract_images
            )
            extraction_method = extraction_metadata.get('extraction_method', 'pymupdf')
        else:
            # No plain PyMuPDF: extract markdown and strip the markup
            md_text = pymupdf4llm.to_markdown(str(file_path), **extraction_options)
            content = self._markdown_to_text(md_text)

        # Document-level metadata is optional and controlled by the
        # ``extract_metadata`` setting from the constructor.
        metadata = self._extract_pdf_metadata(file_path) if self.extract_metadata else {}
        metadata.update(extraction_metadata)

        # Processing summary; written last so it reflects what actually ran
        metadata.update({
            'extraction_method': extraction_method,
            'output_format': output_format,
            'page_range': page_range,
            'images_extracted': extract_images,
            'tables_preserved': self.preserve_tables,
            'content_length': len(content)
        })

        return content, metadata

    except Exception as e:
        raise MediaProcessingError(f"PyMuPDF4LLM extraction failed: {str(e)}") from e
193
+
194
def _extract_with_pymupdf(self, file_path: Path, page_range: Optional[Tuple[int, int]],
                          extract_images: bool) -> Tuple[str, Dict[str, Any]]:
    """
    Extract plain text (and optionally image metadata) using regular PyMuPDF.

    Args:
        file_path: Path to the PDF file
        page_range: Optional (start, end) pair of 0-based page indices, both
            inclusive; a falsy value selects every page. The range is
            clamped to the pages that exist in the document.
        extract_images: Whether to collect image metadata for each page

    Returns:
        Tuple of (content, metadata). ``content`` joins one
        "# Page N" section per page that contains text. ``metadata`` holds
        'page_count', 'processed_pages', 'images_found',
        'extraction_method' and, when any images were found, 'images'.
    """
    doc = fitz.open(str(file_path))
    content_parts = []
    images = []

    try:
        last_index = doc.page_count - 1
        if page_range:
            start_page, end_page = page_range[0], page_range[1]
        else:
            start_page, end_page = 0, last_index

        # Clamp both ends to the document. A start beyond the last page
        # yields an empty range rather than a negative page count.
        start_page = max(start_page, 0)
        end_page = min(end_page, last_index)
        page_numbers = range(start_page, end_page + 1)

        for page_num in page_numbers:
            page = doc[page_num]

            # Pages without any text get no section in the output
            page_text = page.get_text()
            if page_text.strip():
                content_parts.append(f"# Page {page_num + 1}\n\n{page_text}\n")

            if extract_images:
                images.extend(self._extract_page_images(page, page_num))

        content = "\n".join(content_parts)

        metadata = {
            'page_count': doc.page_count,
            # len() of an empty range is 0, so this is never negative
            'processed_pages': len(page_numbers),
            'images_found': len(images),
            'extraction_method': 'pymupdf'
        }

        if images:
            metadata['images'] = images

        return content, metadata

    finally:
        doc.close()
246
+
247
def _extract_page_images(self, page, page_num: int) -> List[Dict[str, Any]]:
    """
    Collect metadata for the images embedded in a PDF page.

    Each image is rendered to PNG only to measure its encoded size; the
    bytes themselves are not returned. Extraction is best-effort: a failure
    is logged as a warning and the affected image (or, if the image list
    cannot be read, the whole page) is skipped.

    Args:
        page: PyMuPDF page object
        page_num: 0-based page number; reported 1-based in the result

    Returns:
        List of image metadata dictionaries with the keys 'page', 'index',
        'width', 'height', 'colorspace', 'size_bytes' and 'format'
    """
    images = []

    try:
        image_list = page.get_images()
    except Exception as e:
        self.logger.warning(f"Failed to extract images from page {page_num}: {e}")
        return images

    for img_index, img in enumerate(image_list):
        pix = None
        try:
            # The first entry of each image tuple is the xref of the image
            xref = img[0]
            pix = fitz.Pixmap(page.parent, xref)

            # Record the colorspace of the embedded image before any
            # conversion below replaces the pixmap.
            colorspace = pix.colorspace.name if pix.colorspace else 'Unknown'

            # Pixmaps with four or more colour components (e.g. CMYK) are
            # converted to RGB before PNG encoding instead of being
            # silently dropped from the result.
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)

            img_data = pix.tobytes("png")

            images.append({
                'page': page_num + 1,
                'index': img_index,
                'width': pix.width,
                'height': pix.height,
                'colorspace': colorspace,
                'size_bytes': len(img_data),
                'format': 'png'
            })

        except Exception as e:
            # One unreadable image must not discard the rest of the page
            self.logger.warning(f"Failed to extract images from page {page_num}: {e}")

        finally:
            pix = None  # Free memory

    return images
292
+
293
def _extract_pdf_metadata(self, file_path: Path) -> Dict[str, Any]:
    """
    Extract document-level metadata from a PDF file.

    Args:
        file_path: Path to the PDF file

    Returns:
        Dictionary of PDF metadata. Entries whose value is None or an empty
        string are omitted. The dictionary is empty when PyMuPDF is not
        available, and carries a 'metadata_extraction_error' entry when
        reading the document failed.
    """
    metadata = {}

    try:
        if PYMUPDF_AVAILABLE:
            doc = fitz.open(str(file_path))
            try:
                # PyMuPDF documents ``metadata`` as possibly None (for
                # password-protected files), so fall back to an empty dict.
                pdf_metadata = doc.metadata or {}

                # NOTE(review): confirm that the installed PyMuPDF exposes
                # Document.pdf_version(); if it does not, the resulting
                # AttributeError is caught below and every field is lost.
                metadata.update({
                    'title': pdf_metadata.get('title', ''),
                    'author': pdf_metadata.get('author', ''),
                    'subject': pdf_metadata.get('subject', ''),
                    'creator': pdf_metadata.get('creator', ''),
                    'producer': pdf_metadata.get('producer', ''),
                    'creation_date': pdf_metadata.get('creationDate', ''),
                    'modification_date': pdf_metadata.get('modDate', ''),
                    'page_count': doc.page_count,
                    'encrypted': doc.needs_pass,
                    'pdf_version': doc.pdf_version()
                })

                # Drop only genuinely empty entries. A plain truthiness test
                # would also discard meaningful values such as
                # ``encrypted=False``.
                metadata = {
                    k: v for k, v in metadata.items()
                    if v is not None and v != ''
                }

            finally:
                doc.close()

    except Exception as e:
        self.logger.warning(f"Failed to extract PDF metadata: {e}")
        metadata['metadata_extraction_error'] = str(e)

    return metadata
336
+
337
+ def _markdown_to_text(self, markdown_content: str) -> str:
338
+ """
339
+ Convert markdown content to plain text (basic conversion).
340
+
341
+ Args:
342
+ markdown_content: Markdown content
343
+
344
+ Returns:
345
+ Plain text content
346
+ """
347
+ import re
348
+
349
+ # Remove markdown formatting
350
+ text = markdown_content
351
+
352
+ # Remove headers
353
+ text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
354
+
355
+ # Remove bold/italic
356
+ text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
357
+ text = re.sub(r'\*([^*]+)\*', r'\1', text)
358
+
359
+ # Remove links but keep text
360
+ text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
361
+
362
+ # Remove inline code
363
+ text = re.sub(r'`([^`]+)`', r'\1', text)
364
+
365
+ # Remove code blocks
366
+ text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
367
+
368
+ # Clean up extra whitespace
369
+ text = re.sub(r'\n\s*\n', '\n\n', text)
370
+
371
+ return text.strip()
372
+
373
def get_pdf_info(self, file_path: Union[str, Path]) -> Dict[str, Any]:
    """
    Summarise a PDF without running the full extraction pipeline.

    Args:
        file_path: Path to the PDF file

    Returns:
        Dictionary with PDF information. With PyMuPDF available it holds the
        file name and size, page count, encryption flag, PDF version and
        raw metadata, plus the size and text length of the first page when
        the document has pages. Without PyMuPDF only the file name and size
        are reported, together with a 'pymupdf_not_available' marker. Any
        failure produces a dictionary with an 'error' entry instead of
        raising.
    """
    file_path = Path(file_path)

    try:
        if not PYMUPDF_AVAILABLE:
            # Basic file info only
            return {
                'filename': file_path.name,
                'file_size': file_path.stat().st_size,
                'pymupdf_not_available': True
            }

        pdf = fitz.open(str(file_path))
        try:
            summary = {
                'filename': file_path.name,
                'file_size': file_path.stat().st_size,
                'page_count': pdf.page_count,
                'encrypted': pdf.needs_pass,
                'pdf_version': pdf.pdf_version(),
                'metadata': pdf.metadata
            }

            # Describe the first page when the document has one
            if pdf.page_count > 0:
                opening_page = pdf[0]
                summary['page_size'] = opening_page.rect
                summary['first_page_text_length'] = len(opening_page.get_text())

            return summary

        finally:
            pdf.close()

    except Exception as e:
        # Report the problem to the caller rather than raising
        return {
            'filename': file_path.name,
            'error': str(e),
            'file_size': file_path.stat().st_size if file_path.exists() else 0
        }
422
+
423
def extract_text_from_pages(self, file_path: Union[str, Path],
                            start_page: int, end_page: int) -> str:
    """
    Extract text from a contiguous run of pages of a PDF.

    Args:
        file_path: Path to the PDF file
        start_page: Starting page number (1-based, inclusive)
        end_page: Ending page number (1-based, inclusive)

    Returns:
        Extracted content for the requested pages: markdown when
        ``markdown_output`` is enabled, otherwise markdown reduced to
        plain text

    Raises:
        MediaProcessingError: If the page range is invalid or the
            extraction fails
    """
    file_path = Path(file_path)

    try:
        if start_page < 1 or end_page < start_page:
            raise ValueError(
                f"Invalid page range {start_page}-{end_page}: pages are 1-based "
                f"and end_page must not precede start_page"
            )

        # pymupdf4llm's ``pages`` argument takes individual 0-based page
        # numbers. A (start, end) pair would select only those two pages,
        # so expand the 1-based inclusive range into the full list.
        pages = list(range(start_page - 1, end_page))

        extraction_options = {
            'pages': pages,
            'write_images': False,
            'table_strategy': 'lines_strict' if self.preserve_tables else 'lines'
        }

        content = pymupdf4llm.to_markdown(str(file_path), **extraction_options)

        if not self.markdown_output:
            # Plain text was requested: strip the markdown markup
            content = self._markdown_to_text(content)

        return content

    except Exception as e:
        raise MediaProcessingError(f"Failed to extract text from pages {start_page}-{end_page}: {str(e)}") from e
460
+
461
def get_processing_info(self) -> Dict[str, Any]:
    """
    Describe this processor: its type, supported formats, configured
    capabilities and which optional dependencies were importable.

    Returns:
        Dictionary with processor information
    """
    # Configured options first, followed by the fixed feature flags
    capability_flags = {
        'extract_images': self.extract_images,
        'preserve_tables': self.preserve_tables,
        'markdown_output': self.markdown_output,
        'page_range_support': True,
        'metadata_extraction': self.extract_metadata,
        'pymupdf4llm_integration': True,
        'text_extraction': True,
        'structure_preservation': True
    }

    # Import results recorded when the module was loaded
    dependency_status = {
        'pymupdf4llm': PYMUPDF4LLM_AVAILABLE,
        'pymupdf': PYMUPDF_AVAILABLE
    }

    processor_info = {
        'processor_type': 'PDFProcessor',
        'supported_formats': ['pdf'],
    }
    processor_info['capabilities'] = capability_flags
    processor_info['dependencies'] = dependency_status
    return processor_info