abstractcore 2.4.4__py3-none-any.whl → 2.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. abstractcore/cli/__init__.py +9 -0
  2. abstractcore/cli/main.py +759 -0
  3. abstractcore/cli/vision_config.py +491 -0
  4. abstractcore/core/interface.py +7 -0
  5. abstractcore/core/session.py +27 -2
  6. abstractcore/media/handlers/__init__.py +16 -0
  7. abstractcore/media/handlers/anthropic_handler.py +326 -0
  8. abstractcore/media/handlers/local_handler.py +541 -0
  9. abstractcore/media/handlers/openai_handler.py +281 -0
  10. abstractcore/media/processors/__init__.py +13 -0
  11. abstractcore/media/processors/image_processor.py +610 -0
  12. abstractcore/media/processors/office_processor.py +490 -0
  13. abstractcore/media/processors/pdf_processor.py +485 -0
  14. abstractcore/media/processors/text_processor.py +557 -0
  15. abstractcore/media/utils/__init__.py +22 -0
  16. abstractcore/media/utils/image_scaler.py +306 -0
  17. abstractcore/providers/anthropic_provider.py +14 -2
  18. abstractcore/providers/base.py +24 -0
  19. abstractcore/providers/huggingface_provider.py +23 -9
  20. abstractcore/providers/lmstudio_provider.py +6 -1
  21. abstractcore/providers/mlx_provider.py +20 -7
  22. abstractcore/providers/ollama_provider.py +6 -1
  23. abstractcore/providers/openai_provider.py +6 -2
  24. abstractcore/tools/common_tools.py +651 -1
  25. abstractcore/utils/version.py +1 -1
  26. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/METADATA +59 -9
  27. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/RECORD +31 -17
  28. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/entry_points.txt +2 -0
  29. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/WHEEL +0 -0
  30. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/licenses/LICENSE +0 -0
  31. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,490 @@
1
+ """
2
+ Office document processor using unstructured library for SOTA document processing.
3
+
4
+ This module provides comprehensive processing capabilities for Microsoft Office documents
5
+ (DOCX, XLSX, PPTX) using the unstructured library, which is the SOTA solution for
6
+ document processing in 2025.
7
+ """
8
+
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import Optional, Dict, Any, List, Union, Tuple
12
+ import json
13
+
14
+ from ..base import BaseMediaHandler, MediaProcessingError
15
+ from ..types import MediaContent, MediaType, ContentFormat, MediaProcessingResult
16
+
17
+
18
+ class OfficeProcessor(BaseMediaHandler):
19
+ """
20
+ Office document processor using unstructured library.
21
+
22
+ Supports processing of:
23
+ - DOCX (Word documents)
24
+ - XLSX (Excel spreadsheets)
25
+ - PPTX (PowerPoint presentations)
26
+
27
+ Uses the unstructured library for SOTA document processing with intelligent
28
+ element detection, table extraction, and structure preservation.
29
+ """
30
+
31
def __init__(self, **kwargs):
    """
    Build an Office processor and record its processing options.

    Args:
        **kwargs: Passed through to the base handler. The keys
            ``extract_tables``, ``preserve_structure``, ``extract_images``,
            ``markdown_output``, ``include_metadata``, ``chunk_size`` and
            ``chunk_overlap`` are also read here; a missing key falls back
            to the default listed in the table below.
    """
    super().__init__(**kwargs)
    self.logger = logging.getLogger(__name__)

    # Option name -> value used when the caller does not supply one.
    option_defaults = {
        'extract_tables': True,
        'preserve_structure': True,
        'extract_images': False,  # images embedded in Office docs
        'markdown_output': True,
        'include_metadata': True,
        'chunk_size': None,  # no chunking by default
        'chunk_overlap': 0,
    }
    for option_name, fallback in option_defaults.items():
        setattr(self, option_name, kwargs.get(option_name, fallback))

    # Records in self._unstructured_available whether unstructured imports.
    self._check_dependencies()

    # Advertise document-only capabilities for the three Office formats.
    from ..types import MediaCapabilities
    office_formats = ['docx', 'xlsx', 'pptx']
    self.capabilities = MediaCapabilities(
        vision_support=False,
        audio_support=False,
        video_support=False,
        document_support=True,
        supported_document_formats=office_formats,
        max_file_size=self.max_file_size
    )
65
+
66
def _check_dependencies(self):
    """
    Probe for the unstructured library and remember the outcome.

    Sets ``self._unstructured_available`` to True when ``unstructured`` and
    its auto, docx, xlsx and pptx partition modules import cleanly. On an
    ImportError the flag is set to False and the error is logged as a warning.
    """
    try:
        # The imported names are not used; importing is the availability test.
        import unstructured  # noqa: F401
        from unstructured.partition.auto import partition  # noqa: F401
        from unstructured.partition.docx import partition_docx  # noqa: F401
        from unstructured.partition.xlsx import partition_xlsx  # noqa: F401
        from unstructured.partition.pptx import partition_pptx  # noqa: F401
    except ImportError as import_error:
        self._unstructured_available = False
        self.logger.warning(f"Unstructured library not available: {import_error}")
        return

    self._unstructured_available = True
    self.logger.debug("Unstructured library available for Office document processing")
79
+
80
def can_process(self, file_path: Path) -> bool:
    """
    Report whether this processor is able to handle ``file_path``.

    Args:
        file_path: Path whose suffix is inspected (case-insensitively)

    Returns:
        True only when the unstructured library is available and the
        suffix is .docx, .xlsx or .pptx
    """
    if self._unstructured_available:
        office_suffixes = ('.docx', '.xlsx', '.pptx')
        return file_path.suffix.lower() in office_suffixes
    return False
95
+
96
def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
    """
    Internal processing method for Office documents.

    Args:
        file_path: Path to the Office document
        media_type: Detected media type (must be MediaType.DOCUMENT)
        **kwargs: Additional processing options, forwarded to the
            format-specific extractor

    Returns:
        MediaContent with processed Office document content

    Raises:
        MediaProcessingError: If the media type is not DOCUMENT, the
            unstructured library is unavailable, the file extension is
            not supported, or extraction fails. Unexpected errors are
            wrapped and keep the original exception as ``__cause__``.
    """
    if media_type != MediaType.DOCUMENT:
        raise MediaProcessingError(f"OfficeProcessor only handles document types, got {media_type}")

    if not self._unstructured_available:
        raise MediaProcessingError(
            "Unstructured library not available. Install with: pip install \"abstractcore[media]\""
        )

    try:
        file_extension = file_path.suffix.lower()

        # Suffix -> extractor; each returns a (content, metadata) tuple.
        extractors = {
            '.docx': self._process_docx,
            '.xlsx': self._process_xlsx,
            '.pptx': self._process_pptx,
        }
        extractor = extractors.get(file_extension)
        if extractor is None:
            raise MediaProcessingError(f"Unsupported Office file type: {file_extension}")

        content, metadata = extractor(file_path, **kwargs)

        return self._create_media_content(
            content=content,
            media_type=MediaType.DOCUMENT,
            content_format=ContentFormat.TEXT,
            mime_type=self._get_mime_type(file_extension),
            file_path=file_path,
            metadata=metadata
        )

    except MediaProcessingError:
        # Already the documented error type: re-raise untouched instead of
        # wrapping it a second time and doubling the message prefix.
        raise
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise MediaProcessingError(f"Office document processing failed: {str(e)}") from e
144
+
145
def process_file(self, file_path: Path, **kwargs) -> MediaProcessingResult:
    """
    Process an Office document file.

    Errors raised while extracting content are caught, logged and
    reported through the returned result rather than propagated.

    Args:
        file_path: Path to the Office document
        **kwargs: Additional processing options, forwarded to the
            format-specific extractor

    Returns:
        MediaProcessingResult with the extracted content on success, or
        with ``success=False`` and an ``error_message`` otherwise
    """
    import time  # local import: only needed to time the extraction

    if not self._unstructured_available:
        return MediaProcessingResult(
            success=False,
            error_message="Unstructured library not available. Install with: pip install \"abstractcore[media]\""
        )

    if not self.can_process(file_path):
        return MediaProcessingResult(
            success=False,
            error_message=f"Unsupported Office file type: {file_path.suffix}"
        )

    started = time.perf_counter()
    try:
        self.logger.info(f"Processing Office document: {file_path}")

        file_extension = file_path.suffix.lower()

        # Suffix -> extractor; each returns a (content, metadata) tuple.
        extractors = {
            '.docx': self._process_docx,
            '.xlsx': self._process_xlsx,
            '.pptx': self._process_pptx,
        }
        extractor = extractors.get(file_extension)
        if extractor is None:
            # Defensive: can_process() above already rejects other suffixes,
            # so this is only reachable if can_process() is overridden.
            return MediaProcessingResult(
                success=False,
                error_message=f"Unsupported file extension: {file_extension}"
            )

        content, metadata = extractor(file_path, **kwargs)

        media_content = MediaContent(
            media_type=MediaType.DOCUMENT,
            content=content,
            content_format=ContentFormat.TEXT,
            mime_type=self._get_mime_type(file_extension),
            file_path=str(file_path),
            metadata=metadata
        )

        return MediaProcessingResult(
            success=True,
            media_content=media_content,
            # Elapsed wall-clock seconds.
            # NOTE(review): assumes processing_time is expressed in seconds,
            # like estimate_processing_time() -- confirm against the type.
            processing_time=time.perf_counter() - started
        )

    except Exception as e:
        self.logger.error(f"Error processing Office document {file_path}: {e}")
        return MediaProcessingResult(
            success=False,
            error_message=f"Office document processing failed: {str(e)}"
        )
208
+
209
def _process_docx(self, file_path: Path, **kwargs) -> Tuple[str, Dict[str, Any]]:
    """
    Extract text from a DOCX document via unstructured's ``partition_docx``.

    Args:
        file_path: Path to DOCX file
        **kwargs: Processing options (accepted but not read here)

    Returns:
        Tuple of (content, metadata): the rendered text and a dict with
        file facts, element/table/image counts and, when metadata is
        enabled and the first element has any, author/creation/modified
        values read from that first element
    """
    from unstructured.partition.docx import partition_docx

    image_block_types = ["Image"] if self.extract_images else []
    elements = partition_docx(
        filename=str(file_path),
        include_metadata=self.include_metadata,
        extract_image_block_types=image_block_types
    )

    rendered = []
    table_total = 0
    image_total = 0

    for element in elements:
        # Elements are classified by class name; their text is str(element).
        kind = type(element).__name__
        text = str(element)

        if kind == 'Table' and self.extract_tables:
            table_total += 1
            if self.markdown_output:
                rendered.append(f"\n**Table:**\n{text}\n")
            else:
                rendered.append(f"\nTable: {text}\n")
            continue

        if kind == 'Image' and self.extract_images:
            image_total += 1
            rendered.append(f"\n[Image: {text}]\n")
            continue

        # Everything else (including tables/images whose extraction is
        # switched off) is emitted as text, unless it is blank.
        if not text.strip():
            continue

        if self.markdown_output and kind in ['Title', 'Header']:
            rendered.append(f"\n## {text}\n")
        else:
            rendered.append(text)

    content = '\n'.join(rendered) if rendered else "No text content found"

    metadata = {
        'file_name': file_path.name,
        'file_type': 'docx',
        'file_size': file_path.stat().st_size,
        'element_count': len(elements),
        'table_count': table_total,
        'image_count': image_total,
        'processing_method': 'unstructured-docx'
    }

    if self.include_metadata and elements:
        # Document-level fields are read from the first element's metadata.
        document_meta = getattr(elements[0], 'metadata', None)
        if document_meta:
            metadata.update({
                'author': getattr(document_meta, 'author', None),
                'creation_date': getattr(document_meta, 'creation_date', None),
                'last_modified': getattr(document_meta, 'last_modified', None)
            })

    return content, metadata
283
+
284
def _process_xlsx(self, file_path: Path, **kwargs) -> Tuple[str, Dict[str, Any]]:
    """
    Process an XLSX spreadsheet using unstructured.

    Every non-blank element is emitted under the heading of the sheet it
    belongs to; nothing is truncated. The sheet name is taken from the
    element metadata (``sheet_name`` if present, otherwise ``page_name``)
    and defaults to ``'Sheet1'``.

    Args:
        file_path: Path to XLSX file
        **kwargs: Processing options (accepted but not read here)

    Returns:
        Tuple of (content, metadata). ``metadata['total_cells']`` counts
        the non-blank extracted elements, not individual cells.
    """
    from unstructured.partition.xlsx import partition_xlsx

    elements = partition_xlsx(
        filename=str(file_path),
        include_metadata=self.include_metadata
    )

    content_parts = []  # sequential rendering, used for plain-text output
    sheet_data = {}     # sheet name -> non-blank element texts, in first-seen order
    current_sheet = None

    for element in elements:
        text_content = str(element)

        sheet_name = 'Sheet1'  # Default sheet name
        element_meta = getattr(element, 'metadata', None)
        if element_meta:
            # unstructured reports the worksheet name as ``page_name``;
            # ``sheet_name`` is still honoured first for compatibility.
            sheet_name = (
                getattr(element_meta, 'sheet_name', None)
                or getattr(element_meta, 'page_name', None)
                or 'Sheet1'
            )

        if sheet_name != current_sheet:
            if self.markdown_output:
                content_parts.append(f"\n## Sheet: {sheet_name}\n")
            else:
                content_parts.append(f"\nSheet: {sheet_name}\n")
            current_sheet = sheet_name
            # setdefault: a sheet that reappears later keeps its earlier data.
            sheet_data.setdefault(sheet_name, [])

        if text_content.strip():
            sheet_data[sheet_name].append(text_content)
            content_parts.append(text_content)

    if self.markdown_output and sheet_data:
        # Group by sheet. Each entry is a whole extracted element rather
        # than a single cell, so entries are listed one per line instead
        # of being forced into table columns.
        formatted_content = []
        for name, texts in sheet_data.items():
            formatted_content.append(f"\n## {name}\n")
            formatted_content.extend(texts)
        content = '\n'.join(formatted_content)
    else:
        content = '\n'.join(content_parts) if content_parts else "No data found"

    metadata = {
        'file_name': file_path.name,
        'file_type': 'xlsx',
        'file_size': file_path.stat().st_size,
        'sheet_count': len(sheet_data),
        'sheet_names': list(sheet_data.keys()),
        'total_cells': sum(len(texts) for texts in sheet_data.values()),
        'processing_method': 'unstructured-xlsx'
    }

    return content, metadata
363
+
364
def _process_pptx(self, file_path: Path, **kwargs) -> Tuple[str, Dict[str, Any]]:
    """
    Process a PPTX presentation using unstructured.

    Slide boundaries come from each element's metadata: the
    ``slide_number`` attribute when present, otherwise ``page_number``.
    Elements that carry no slide information stay on the current slide,
    and each slide heading shows the slide's real number.

    Args:
        file_path: Path to PPTX file
        **kwargs: Processing options (accepted but not read here)

    Returns:
        Tuple of (content, metadata). ``metadata['slide_count']`` is the
        number of slide headings emitted.
    """
    from unstructured.partition.pptx import partition_pptx

    elements = partition_pptx(
        filename=str(file_path),
        include_metadata=self.include_metadata
    )

    content_parts = []
    slide_count = 0
    current_slide = None

    for element in elements:
        text_content = str(element)
        element_type = type(element).__name__

        slide_number = None
        element_meta = getattr(element, 'metadata', None)
        if element_meta:
            # unstructured reports the slide index as ``page_number``;
            # ``slide_number`` is still honoured first for compatibility.
            slide_number = getattr(element_meta, 'slide_number', None)
            if slide_number is None:
                slide_number = getattr(element_meta, 'page_number', None)

        # Only a known, different slide number opens a new slide section;
        # elements without one (e.g. page breaks) must not start a slide.
        if slide_number is not None and slide_number != current_slide:
            slide_count += 1
            if self.markdown_output:
                content_parts.append(f"\n## Slide {slide_number}\n")
            else:
                content_parts.append(f"\nSlide {slide_number}:\n")
            current_slide = slide_number

        if text_content.strip():
            if self.markdown_output and element_type == 'Title':
                content_parts.append(f"### {text_content}\n")
            elif element_type == 'ListItem':
                content_parts.append(f"- {text_content}")
            else:
                content_parts.append(text_content)

    content = '\n'.join(content_parts) if content_parts else "No text content found"

    metadata = {
        'file_name': file_path.name,
        'file_type': 'pptx',
        'file_size': file_path.stat().st_size,
        'slide_count': slide_count,
        'element_count': len(elements),
        'processing_method': 'unstructured-pptx'
    }

    if self.include_metadata and elements:
        # Presentation-level fields are read from the first element's metadata.
        first_meta = getattr(elements[0], 'metadata', None)
        if first_meta:
            metadata.update({
                'author': getattr(first_meta, 'author', None),
                'creation_date': getattr(first_meta, 'creation_date', None),
                'last_modified': getattr(first_meta, 'last_modified', None)
            })

    return content, metadata
436
+
437
def _get_mime_type(self, file_extension: str) -> str:
    """Map an Office file suffix (case-insensitive, with dot) to its MIME type.

    Unrecognised suffixes yield 'application/octet-stream'.
    """
    suffix = file_extension.lower()
    if suffix == '.docx':
        return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    if suffix == '.xlsx':
        return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    if suffix == '.pptx':
        return 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    return 'application/octet-stream'
445
+
446
def get_supported_formats(self) -> List[str]:
    """Return the handled format names, or an empty list without unstructured."""
    return ['docx', 'xlsx', 'pptx'] if self._unstructured_available else []
451
+
452
def estimate_processing_time(self, file_path: Path) -> float:
    """
    Estimate processing time for a file.

    Args:
        file_path: Path to the file

    Returns:
        Estimated processing time in seconds: 0.0 when the file cannot be
        stat-ed (e.g. it does not exist), otherwise at least 1.0
    """
    # Ask for the size directly instead of exists() followed by stat():
    # one system call, and no window for the file to vanish in between.
    try:
        size_bytes = file_path.stat().st_size
    except (OSError, ValueError):
        return 0.0

    # Rough estimation based on file size
    file_size_mb = size_bytes / (1024 * 1024)

    # Office documents generally process at ~2MB/second with unstructured
    return max(1.0, file_size_mb / 2.0)
470
+
471
def supports_chunking(self) -> bool:
    """Report chunking support; this processor always answers True.

    NOTE(review): no chunking logic is visible in this method -- presumably
    it lives elsewhere (base handler or caller); confirm.
    """
    return True
474
+
475
def get_processing_info(self) -> Dict[str, Any]:
    """
    Describe this processor as a plain dictionary.

    Returns:
        Dict with the processor name, its currently supported formats,
        the backing library and its availability, and a ``features``
        sub-dict mirroring the configured options
    """
    info = {
        'name': 'OfficeProcessor',
        'supported_formats': self.get_supported_formats(),
        'library': 'unstructured',
        'library_available': self._unstructured_available,
    }
    # Feature flags are read from the instance's configuration attributes.
    info['features'] = {
        'table_extraction': self.extract_tables,
        'structure_preservation': self.preserve_structure,
        'image_extraction': self.extract_images,
        'markdown_output': self.markdown_output,
        'metadata_extraction': self.include_metadata,
        'chunking_support': self.supports_chunking(),
    }
    return info