abstractcore 2.4.3__py3-none-any.whl → 2.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/cli/__init__.py +9 -0
- abstractcore/cli/main.py +759 -0
- abstractcore/cli/vision_config.py +491 -0
- abstractcore/media/handlers/__init__.py +16 -0
- abstractcore/media/handlers/anthropic_handler.py +326 -0
- abstractcore/media/handlers/local_handler.py +541 -0
- abstractcore/media/handlers/openai_handler.py +281 -0
- abstractcore/media/processors/__init__.py +13 -0
- abstractcore/media/processors/image_processor.py +610 -0
- abstractcore/media/processors/office_processor.py +490 -0
- abstractcore/media/processors/pdf_processor.py +485 -0
- abstractcore/media/processors/text_processor.py +557 -0
- abstractcore/media/utils/__init__.py +22 -0
- abstractcore/media/utils/image_scaler.py +306 -0
- abstractcore/providers/base.py +97 -0
- abstractcore/providers/huggingface_provider.py +17 -8
- abstractcore/utils/version.py +1 -1
- {abstractcore-2.4.3.dist-info → abstractcore-2.4.5.dist-info}/METADATA +1 -1
- {abstractcore-2.4.3.dist-info → abstractcore-2.4.5.dist-info}/RECORD +23 -9
- {abstractcore-2.4.3.dist-info → abstractcore-2.4.5.dist-info}/entry_points.txt +2 -0
- {abstractcore-2.4.3.dist-info → abstractcore-2.4.5.dist-info}/WHEEL +0 -0
- {abstractcore-2.4.3.dist-info → abstractcore-2.4.5.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.4.3.dist-info → abstractcore-2.4.5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,490 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Office document processor using unstructured library for SOTA document processing.
|
|
3
|
+
|
|
4
|
+
This module provides comprehensive processing capabilities for Microsoft Office documents
|
|
5
|
+
(DOCX, XLSX, PPTX) using the unstructured library, which is the SOTA solution for
|
|
6
|
+
document processing in 2025.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Optional, Dict, Any, List, Union, Tuple
|
|
12
|
+
import json
|
|
13
|
+
|
|
14
|
+
from ..base import BaseMediaHandler, MediaProcessingError
|
|
15
|
+
from ..types import MediaContent, MediaType, ContentFormat, MediaProcessingResult
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class OfficeProcessor(BaseMediaHandler):
    """
    Office document processor using unstructured library.

    Supports processing of:
    - DOCX (Word documents)
    - XLSX (Excel spreadsheets)
    - PPTX (PowerPoint presentations)

    Each document is partitioned into elements by the unstructured library and
    flattened into a single text (optionally markdown-flavoured) string plus a
    metadata dictionary describing the file.
    """

    # Extension -> MIME type. The keys double as the set of supported extensions.
    _MIME_TYPES = {
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }

    # Format names (no leading dot) reported to callers.
    _SUPPORTED_FORMATS = ('docx', 'xlsx', 'pptx')

    # Shared by the raising and the result-returning entry points.
    _MISSING_LIBRARY_MESSAGE = (
        "Unstructured library not available. Install with: pip install \"abstractcore[media]\""
    )

    # Number of rows shown after the header row in the markdown preview of a sheet.
    _XLSX_PREVIEW_ROWS = 5

    def __init__(self, **kwargs):
        """
        Initialize Office processor.

        Args:
            **kwargs: Additional configuration options. Recognised keys:
                extract_tables (default True), preserve_structure (default True),
                extract_images (default False), markdown_output (default True),
                include_metadata (default True), chunk_size (default None),
                chunk_overlap (default 0). All kwargs are also forwarded to the
                base handler.
        """
        super().__init__(**kwargs)
        self.logger = logging.getLogger(__name__)

        # Configuration options
        self.extract_tables = kwargs.get('extract_tables', True)
        self.preserve_structure = kwargs.get('preserve_structure', True)
        self.extract_images = kwargs.get('extract_images', False)  # Images in Office docs
        self.markdown_output = kwargs.get('markdown_output', True)
        self.include_metadata = kwargs.get('include_metadata', True)

        # Chunking options for large documents
        # NOTE(review): no method in this class reads these two values; presumably
        # they are consumed elsewhere -- confirm.
        self.chunk_size = kwargs.get('chunk_size', None)  # No chunking by default
        self.chunk_overlap = kwargs.get('chunk_overlap', 0)

        # Must run after self.logger is set: the check logs its outcome.
        self._check_dependencies()

        # Set capabilities for office processing.
        # Imported here as in the original; presumably to avoid an import cycle -- confirm.
        from ..types import MediaCapabilities
        self.capabilities = MediaCapabilities(
            vision_support=False,
            audio_support=False,
            video_support=False,
            document_support=True,
            supported_document_formats=list(self._SUPPORTED_FORMATS),
            max_file_size=self.max_file_size
        )

    def _check_dependencies(self):
        """Record in ``self._unstructured_available`` whether unstructured can be imported."""
        try:
            import unstructured  # noqa: F401
            from unstructured.partition.auto import partition  # noqa: F401
            from unstructured.partition.docx import partition_docx  # noqa: F401
            from unstructured.partition.xlsx import partition_xlsx  # noqa: F401
            from unstructured.partition.pptx import partition_pptx  # noqa: F401
        except ImportError as e:
            self._unstructured_available = False
            self.logger.warning(f"Unstructured library not available: {e}")
        else:
            self._unstructured_available = True
            self.logger.debug("Unstructured library available for Office document processing")

    def can_process(self, file_path: Path) -> bool:
        """
        Check if this processor can handle the file.

        Args:
            file_path: Path to the file

        Returns:
            True if unstructured is importable and the (case-insensitive)
            extension is .docx, .xlsx or .pptx
        """
        if not self._unstructured_available:
            return False

        return file_path.suffix.lower() in self._MIME_TYPES

    def _extract_content(self, file_path: Path, **kwargs) -> Tuple[str, Dict[str, Any]]:
        """
        Dispatch to the format-specific extractor based on the file extension.

        Args:
            file_path: Path to the Office document
            **kwargs: Processing options forwarded to the extractor

        Returns:
            Tuple of (content, metadata)

        Raises:
            MediaProcessingError: If the extension is not .docx, .xlsx or .pptx
        """
        file_extension = file_path.suffix.lower()
        extractors = {
            '.docx': self._process_docx,
            '.xlsx': self._process_xlsx,
            '.pptx': self._process_pptx
        }
        extractor = extractors.get(file_extension)
        if extractor is None:
            raise MediaProcessingError(f"Unsupported Office file type: {file_extension}")
        return extractor(file_path, **kwargs)

    def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
        """
        Internal processing method for Office documents.

        Args:
            file_path: Path to the Office document
            media_type: Detected media type (must be MediaType.DOCUMENT)
            **kwargs: Additional processing options

        Returns:
            MediaContent with processed Office document content

        Raises:
            MediaProcessingError: If the media type or extension is unsupported,
                unstructured is unavailable, or extraction fails
        """
        if media_type != MediaType.DOCUMENT:
            raise MediaProcessingError(f"OfficeProcessor only handles document types, got {media_type}")

        if not self._unstructured_available:
            raise MediaProcessingError(self._MISSING_LIBRARY_MESSAGE)

        try:
            content, metadata = self._extract_content(file_path, **kwargs)

            return self._create_media_content(
                content=content,
                media_type=MediaType.DOCUMENT,
                content_format=ContentFormat.TEXT,
                mime_type=self._get_mime_type(file_path.suffix),
                file_path=file_path,
                metadata=metadata
            )
        except MediaProcessingError:
            # Already the right type with a specific message; do not wrap it again.
            raise
        except Exception as e:
            # Chain the cause so the underlying traceback is preserved.
            raise MediaProcessingError(f"Office document processing failed: {str(e)}") from e

    def process_file(self, file_path: Path, **kwargs) -> MediaProcessingResult:
        """
        Process an Office document file.

        Unlike ``_process_internal`` this never raises for processing problems:
        failures are reported through ``success=False`` and ``error_message``.

        Args:
            file_path: Path to the Office document
            **kwargs: Additional processing options

        Returns:
            MediaProcessingResult with extracted content
        """
        import time

        if not self._unstructured_available:
            return MediaProcessingResult(
                success=False,
                error_message=self._MISSING_LIBRARY_MESSAGE
            )

        if not self.can_process(file_path):
            return MediaProcessingResult(
                success=False,
                error_message=f"Unsupported Office file type: {file_path.suffix}"
            )

        started = time.perf_counter()
        try:
            self.logger.info(f"Processing Office document: {file_path}")

            content, metadata = self._extract_content(file_path, **kwargs)

            media_content = MediaContent(
                media_type=MediaType.DOCUMENT,
                content=content,
                content_format=ContentFormat.TEXT,
                mime_type=self._get_mime_type(file_path.suffix),
                file_path=str(file_path),
                metadata=metadata
            )

            return MediaProcessingResult(
                success=True,
                media_content=media_content,
                # Elapsed wall-clock time; assumes the field is in seconds -- TODO confirm.
                processing_time=time.perf_counter() - started
            )

        except Exception as e:
            self.logger.error(f"Error processing Office document {file_path}: {e}")
            return MediaProcessingResult(
                success=False,
                error_message=f"Office document processing failed: {str(e)}"
            )

    def _document_properties(self, elements: List[Any]) -> Dict[str, Any]:
        """
        Read document-level properties from the first element's metadata.

        Args:
            elements: Elements returned by an unstructured partition function

        Returns:
            Dict with 'author', 'creation_date' and 'last_modified' (each None
            when the metadata object lacks it), or an empty dict when metadata
            is disabled, there are no elements, or the first element carries
            no metadata
        """
        if not (self.include_metadata and elements):
            return {}

        element_metadata = getattr(elements[0], 'metadata', None)
        if not element_metadata:
            return {}

        return {
            key: getattr(element_metadata, key, None)
            for key in ('author', 'creation_date', 'last_modified')
        }

    def _process_docx(self, file_path: Path, **kwargs) -> Tuple[str, Dict[str, Any]]:
        """
        Process a DOCX document using unstructured.

        Args:
            file_path: Path to DOCX file
            **kwargs: Processing options (currently unused)

        Returns:
            Tuple of (content, metadata)
        """
        from unstructured.partition.docx import partition_docx

        # Partition the document
        elements = partition_docx(
            filename=str(file_path),
            include_metadata=self.include_metadata,
            extract_image_block_types=["Image"] if self.extract_images else []
        )

        content_parts = []
        table_count = 0
        image_count = 0

        for element in elements:
            # The element's class name is used as its type tag.
            element_type = type(element).__name__
            text_content = str(element)

            if element_type == 'Table' and self.extract_tables:
                table_count += 1
                if self.markdown_output:
                    content_parts.append(f"\n**Table:**\n{text_content}\n")
                else:
                    content_parts.append(f"\nTable: {text_content}\n")

            elif element_type == 'Image' and self.extract_images:
                image_count += 1
                content_parts.append(f"\n[Image: {text_content}]\n")

            elif text_content.strip():
                if self.markdown_output and element_type in ['Title', 'Header']:
                    # Format headers in markdown
                    content_parts.append(f"\n## {text_content}\n")
                else:
                    content_parts.append(text_content)

        content = '\n'.join(content_parts) if content_parts else "No text content found"

        metadata = {
            'file_name': file_path.name,
            'file_type': 'docx',
            'file_size': file_path.stat().st_size,
            'element_count': len(elements),
            'table_count': table_count,
            'image_count': image_count,
            'processing_method': 'unstructured-docx'
        }
        metadata.update(self._document_properties(elements))

        return content, metadata

    @staticmethod
    def _sheet_name_of(element: Any) -> str:
        """
        Return the sheet an XLSX element belongs to, or 'Sheet1' if unknown.

        unstructured documents the sheet name under ``page_name``;
        ``sheet_name`` is still consulted for backward compatibility.
        """
        element_metadata = getattr(element, 'metadata', None)
        if element_metadata:
            for attribute in ('page_name', 'sheet_name'):
                name = getattr(element_metadata, attribute, None)
                if name:
                    return name
        return 'Sheet1'

    @staticmethod
    def _markdown_cell(text: str) -> str:
        """Collapse whitespace and escape pipes so *text* fits in one markdown table cell."""
        return ' '.join(str(text).split()).replace('|', '\\|')

    def _process_xlsx(self, file_path: Path, **kwargs) -> Tuple[str, Dict[str, Any]]:
        """
        Process an XLSX spreadsheet using unstructured.

        Args:
            file_path: Path to XLSX file
            **kwargs: Processing options (currently unused)

        Returns:
            Tuple of (content, metadata)
        """
        from unstructured.partition.xlsx import partition_xlsx

        # Partition the spreadsheet
        elements = partition_xlsx(
            filename=str(file_path),
            include_metadata=self.include_metadata
        )

        content_parts = []  # Plain rendering, used when markdown is off
        sheet_data = {}     # sheet name -> non-empty element texts, in order
        current_sheet = None

        for element in elements:
            text_content = str(element)
            sheet_name = self._sheet_name_of(element)

            if sheet_name != current_sheet:
                if self.markdown_output:
                    content_parts.append(f"\n## Sheet: {sheet_name}\n")
                else:
                    content_parts.append(f"\nSheet: {sheet_name}\n")
                current_sheet = sheet_name

            # setdefault (not assignment) so a sheet that reappears keeps the
            # entries collected earlier.
            sheet_entries = sheet_data.setdefault(sheet_name, [])

            if text_content.strip():
                sheet_entries.append(text_content)
                content_parts.append(text_content)

        if self.markdown_output and sheet_data:
            formatted_content = []
            for sheet_name, data in sheet_data.items():
                formatted_content.append(f"\n## {sheet_name}\n")

                if len(data) > 1:
                    # Single-column preview table: first entry is the header,
                    # followed by a limited number of rows.
                    # NOTE(review): entries are whatever partition_xlsx yields,
                    # not necessarily spreadsheet rows -- confirm the intent.
                    header, *rows = data
                    formatted_content.append(f"| {self._markdown_cell(header)} |")
                    formatted_content.append("|---|")
                    for row in rows[:self._XLSX_PREVIEW_ROWS]:
                        formatted_content.append(f"| {self._markdown_cell(row)} |")
                    if len(rows) > self._XLSX_PREVIEW_ROWS:
                        formatted_content.append("... (additional rows truncated)")
                else:
                    formatted_content.extend(data)

            content = '\n'.join(formatted_content)
        else:
            content = '\n'.join(content_parts) if content_parts else "No data found"

        metadata = {
            'file_name': file_path.name,
            'file_type': 'xlsx',
            'file_size': file_path.stat().st_size,
            'sheet_count': len(sheet_data),
            'sheet_names': list(sheet_data.keys()),
            # Counts collected element texts, not individual spreadsheet cells.
            'total_cells': sum(len(data) for data in sheet_data.values()),
            'processing_method': 'unstructured-xlsx'
        }

        return content, metadata

    @staticmethod
    def _slide_number_of(element: Any) -> Optional[int]:
        """
        Return the slide a PPTX element belongs to, or None if unknown.

        unstructured documents the slide index under ``page_number``;
        ``slide_number`` is still consulted for backward compatibility.
        """
        element_metadata = getattr(element, 'metadata', None)
        if element_metadata:
            for attribute in ('page_number', 'slide_number'):
                number = getattr(element_metadata, attribute, None)
                if number is not None:
                    return number
        return None

    def _process_pptx(self, file_path: Path, **kwargs) -> Tuple[str, Dict[str, Any]]:
        """
        Process a PPTX presentation using unstructured.

        Args:
            file_path: Path to PPTX file
            **kwargs: Processing options (currently unused)

        Returns:
            Tuple of (content, metadata). ``metadata['slide_count']`` is the
            number of distinct slides that produced at least one element.
        """
        from unstructured.partition.pptx import partition_pptx

        # Partition the presentation
        elements = partition_pptx(
            filename=str(file_path),
            include_metadata=self.include_metadata
        )

        content_parts = []
        seen_slides = set()
        current_slide = None

        for element in elements:
            text_content = str(element)
            element_type = type(element).__name__

            slide_number = self._slide_number_of(element)

            # Elements without a slide number stay attached to the current
            # slide instead of starting a new one.
            if slide_number is not None and slide_number != current_slide:
                current_slide = slide_number
                seen_slides.add(slide_number)
                if self.markdown_output:
                    content_parts.append(f"\n## Slide {slide_number}\n")
                else:
                    content_parts.append(f"\nSlide {slide_number}:\n")

            if text_content.strip():
                if self.markdown_output and element_type == 'Title':
                    content_parts.append(f"### {text_content}\n")
                elif element_type == 'ListItem':
                    content_parts.append(f"- {text_content}")
                else:
                    content_parts.append(text_content)

        content = '\n'.join(content_parts) if content_parts else "No text content found"

        metadata = {
            'file_name': file_path.name,
            'file_type': 'pptx',
            'file_size': file_path.stat().st_size,
            'slide_count': len(seen_slides),
            'element_count': len(elements),
            'processing_method': 'unstructured-pptx'
        }
        metadata.update(self._document_properties(elements))

        return content, metadata

    def _get_mime_type(self, file_extension: str) -> str:
        """
        Get MIME type for Office file extension.

        Args:
            file_extension: Extension including the leading dot, any case

        Returns:
            The matching MIME type, or 'application/octet-stream' if unknown
        """
        return self._MIME_TYPES.get(file_extension.lower(), 'application/octet-stream')

    def get_supported_formats(self) -> List[str]:
        """Get list of supported file formats (empty if unstructured is unavailable)."""
        if self._unstructured_available:
            return list(self._SUPPORTED_FORMATS)
        return []

    def estimate_processing_time(self, file_path: Path) -> float:
        """
        Estimate processing time for a file.

        Args:
            file_path: Path to the file

        Returns:
            Estimated processing time in seconds: 0.0 if the file does not
            exist, otherwise size / 2 MB-per-second with a floor of 1.0
        """
        if not file_path.exists():
            return 0.0

        # Rough estimation based on file size
        file_size_mb = file_path.stat().st_size / (1024 * 1024)

        # Rule of thumb carried over from the original: ~2MB/second
        return max(1.0, file_size_mb / 2.0)

    def supports_chunking(self) -> bool:
        """Check if this processor supports document chunking (always True)."""
        return True

    def get_processing_info(self) -> Dict[str, Any]:
        """Get information about this processor: name, formats, library status and feature flags."""
        return {
            'name': 'OfficeProcessor',
            'supported_formats': self.get_supported_formats(),
            'library': 'unstructured',
            'library_available': self._unstructured_available,
            'features': {
                'table_extraction': self.extract_tables,
                'structure_preservation': self.preserve_structure,
                'image_extraction': self.extract_images,
                'markdown_output': self.markdown_output,
                'metadata_extraction': self.include_metadata,
                'chunking_support': self.supports_chunking()
            }
        }
|