aiecs 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aiecs might be problematic. Click here for more details.

Files changed (90) hide show
  1. aiecs/__init__.py +75 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +295 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +341 -0
  7. aiecs/config/__init__.py +15 -0
  8. aiecs/config/config.py +117 -0
  9. aiecs/config/registry.py +19 -0
  10. aiecs/core/__init__.py +46 -0
  11. aiecs/core/interface/__init__.py +34 -0
  12. aiecs/core/interface/execution_interface.py +150 -0
  13. aiecs/core/interface/storage_interface.py +214 -0
  14. aiecs/domain/__init__.py +20 -0
  15. aiecs/domain/context/__init__.py +28 -0
  16. aiecs/domain/context/content_engine.py +982 -0
  17. aiecs/domain/context/conversation_models.py +306 -0
  18. aiecs/domain/execution/__init__.py +12 -0
  19. aiecs/domain/execution/model.py +49 -0
  20. aiecs/domain/task/__init__.py +13 -0
  21. aiecs/domain/task/dsl_processor.py +460 -0
  22. aiecs/domain/task/model.py +50 -0
  23. aiecs/domain/task/task_context.py +257 -0
  24. aiecs/infrastructure/__init__.py +26 -0
  25. aiecs/infrastructure/messaging/__init__.py +13 -0
  26. aiecs/infrastructure/messaging/celery_task_manager.py +341 -0
  27. aiecs/infrastructure/messaging/websocket_manager.py +289 -0
  28. aiecs/infrastructure/monitoring/__init__.py +12 -0
  29. aiecs/infrastructure/monitoring/executor_metrics.py +138 -0
  30. aiecs/infrastructure/monitoring/structured_logger.py +50 -0
  31. aiecs/infrastructure/monitoring/tracing_manager.py +376 -0
  32. aiecs/infrastructure/persistence/__init__.py +12 -0
  33. aiecs/infrastructure/persistence/database_manager.py +286 -0
  34. aiecs/infrastructure/persistence/file_storage.py +671 -0
  35. aiecs/infrastructure/persistence/redis_client.py +162 -0
  36. aiecs/llm/__init__.py +54 -0
  37. aiecs/llm/base_client.py +99 -0
  38. aiecs/llm/client_factory.py +339 -0
  39. aiecs/llm/custom_callbacks.py +228 -0
  40. aiecs/llm/openai_client.py +125 -0
  41. aiecs/llm/vertex_client.py +186 -0
  42. aiecs/llm/xai_client.py +184 -0
  43. aiecs/main.py +351 -0
  44. aiecs/scripts/DEPENDENCY_SYSTEM_SUMMARY.md +241 -0
  45. aiecs/scripts/README_DEPENDENCY_CHECKER.md +309 -0
  46. aiecs/scripts/README_WEASEL_PATCH.md +126 -0
  47. aiecs/scripts/__init__.py +3 -0
  48. aiecs/scripts/dependency_checker.py +825 -0
  49. aiecs/scripts/dependency_fixer.py +348 -0
  50. aiecs/scripts/download_nlp_data.py +348 -0
  51. aiecs/scripts/fix_weasel_validator.py +121 -0
  52. aiecs/scripts/fix_weasel_validator.sh +82 -0
  53. aiecs/scripts/patch_weasel_library.sh +188 -0
  54. aiecs/scripts/quick_dependency_check.py +269 -0
  55. aiecs/scripts/run_weasel_patch.sh +41 -0
  56. aiecs/scripts/setup_nlp_data.sh +217 -0
  57. aiecs/tasks/__init__.py +2 -0
  58. aiecs/tasks/worker.py +111 -0
  59. aiecs/tools/__init__.py +196 -0
  60. aiecs/tools/base_tool.py +202 -0
  61. aiecs/tools/langchain_adapter.py +361 -0
  62. aiecs/tools/task_tools/__init__.py +82 -0
  63. aiecs/tools/task_tools/chart_tool.py +704 -0
  64. aiecs/tools/task_tools/classfire_tool.py +901 -0
  65. aiecs/tools/task_tools/image_tool.py +397 -0
  66. aiecs/tools/task_tools/office_tool.py +600 -0
  67. aiecs/tools/task_tools/pandas_tool.py +565 -0
  68. aiecs/tools/task_tools/report_tool.py +499 -0
  69. aiecs/tools/task_tools/research_tool.py +363 -0
  70. aiecs/tools/task_tools/scraper_tool.py +548 -0
  71. aiecs/tools/task_tools/search_api.py +7 -0
  72. aiecs/tools/task_tools/stats_tool.py +513 -0
  73. aiecs/tools/temp_file_manager.py +126 -0
  74. aiecs/tools/tool_executor/__init__.py +35 -0
  75. aiecs/tools/tool_executor/tool_executor.py +518 -0
  76. aiecs/utils/LLM_output_structor.py +409 -0
  77. aiecs/utils/__init__.py +23 -0
  78. aiecs/utils/base_callback.py +50 -0
  79. aiecs/utils/execution_utils.py +158 -0
  80. aiecs/utils/logging.py +1 -0
  81. aiecs/utils/prompt_loader.py +13 -0
  82. aiecs/utils/token_usage_repository.py +279 -0
  83. aiecs/ws/__init__.py +0 -0
  84. aiecs/ws/socket_server.py +41 -0
  85. aiecs-1.0.0.dist-info/METADATA +610 -0
  86. aiecs-1.0.0.dist-info/RECORD +90 -0
  87. aiecs-1.0.0.dist-info/WHEEL +5 -0
  88. aiecs-1.0.0.dist-info/entry_points.txt +7 -0
  89. aiecs-1.0.0.dist-info/licenses/LICENSE +225 -0
  90. aiecs-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,600 @@
1
+ import os
2
+ import logging
3
+ from typing import List, Dict, Optional, Any
4
+
5
+ import pandas as pd
6
+ import pdfplumber
7
+ import pytesseract
8
+ from PIL import Image
9
+ from tika import parser
10
+ from docx import Document as DocxDocument
11
+ from docx.shared import Pt
12
+ from pptx import Presentation
13
+ from pptx.util import Inches
14
+ from pydantic import BaseModel, field_validator, ValidationError, ConfigDict
15
+ from pydantic_settings import BaseSettings
16
+
17
+ from aiecs.tools.base_tool import BaseTool
18
+ from aiecs.tools import register_tool
19
+
20
# Configuration for OfficeTool
class OfficeSettings(BaseSettings):
    """
    Runtime configuration for :class:`OfficeTool`.

    Values may be overridden via environment variables using the
    ``OFFICE_TOOL_`` prefix (e.g. ``OFFICE_TOOL_MAX_FILE_SIZE_MB=50``).

    Attributes:
        max_file_size_mb (int): Largest accepted input file size, in megabytes.
        default_font (str): Font family applied to generated documents.
        default_font_size (int): Font size in points for generated documents.
        allowed_extensions (List[str]): File extensions the tool will accept.
        env_prefix (str): Environment-variable prefix (kept as a plain field
            for backward compatibility with existing config dumps).
    """
    max_file_size_mb: int = 100
    default_font: str = "Arial"
    default_font_size: int = 12
    allowed_extensions: List[str] = [
        '.docx', '.pptx', '.xlsx', '.pdf',
        '.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif',
    ]
    env_prefix: str = 'OFFICE_TOOL_'

    model_config = ConfigDict(env_prefix='OFFICE_TOOL_')
39
+
40
# Exceptions
class OfficeToolError(Exception):
    """Root of the OfficeTool exception hierarchy."""


class InputValidationError(OfficeToolError):
    """Signals that operation input failed validation."""


class FileOperationError(OfficeToolError):
    """Signals a failed read or write on the filesystem."""


class SecurityError(OfficeToolError):
    """Signals a security violation (e.g. path traversal)."""


class ContentValidationError(OfficeToolError):
    """Signals that a document's internal structure is invalid."""
60
+
61
# Base schema for common fields
class BaseFileSchema(BaseModel):
    """Shared path fields plus path-security validation for all operation schemas."""
    # All three are optional here; concrete schemas re-declare the ones they
    # require as non-optional (e.g. ReadDocxSchema.file_path: str).
    file_path: Optional[str] = None
    output_path: Optional[str] = None
    image_path: Optional[str] = None

    @field_validator('file_path', 'output_path', 'image_path')
    def validate_path(cls, v: Optional[str], field) -> Optional[str]:
        """Validate file paths for existence, size, extension, and path traversal."""
        # Empty/None paths pass through untouched; required-ness is enforced
        # by the field declarations in the subclasses.
        if not v:
            return v
        # NOTE(review): a fresh OfficeSettings is constructed per validation
        # call; acceptable for this usage, but a module-level instance would
        # avoid repeated env parsing.
        settings = OfficeSettings()
        abs_path = os.path.abspath(os.path.normpath(v))
        # Check for path traversal
        # Rejects raw '..', '~' and '%' BEFORE normalization so encoded or
        # home-relative escapes never reach the directory check below.
        if '..' in v or '~' in v or '%' in v:
            raise SecurityError(f"Path traversal attempt detected: {v}")
        # Ensure path is in allowed directories
        # './data' and './uploads' resolve against the CURRENT working
        # directory — assumes the process cwd is the project root; TODO confirm.
        base_dir = os.path.abspath(os.getcwd())
        allowed_dirs = [os.path.abspath(os.path.normpath(d)) for d in ['/tmp', './data', './uploads']]
        if not abs_path.startswith(base_dir) and not any(abs_path.startswith(d) for d in allowed_dirs):
            raise SecurityError(f"Path not in allowed directories: {abs_path}")
        # Check extension
        ext = os.path.splitext(abs_path)[1].lower()
        if ext not in settings.allowed_extensions:
            raise SecurityError(f"Extension '{ext}' not allowed for '{field.field_name}', expected {settings.allowed_extensions}")
        # Check file existence and size for input paths
        # NOTE(review): in pydantic v2 the third validator argument is a
        # ValidationInfo; `.field_name` is the attribute read below — confirm
        # the installed pydantic version supplies it.
        if field.field_name == 'file_path':
            if not os.path.isfile(abs_path):
                raise FileOperationError(f"{field.field_name}: File not found: {abs_path}")
            size_mb = os.path.getsize(abs_path) / (1024 * 1024)
            if size_mb > settings.max_file_size_mb:
                raise FileOperationError(f"{field.field_name}: File too large: {size_mb:.1f}MB, max {settings.max_file_size_mb}MB")
        # Check for existing output paths
        # Refuses to overwrite: write operations must target a fresh path.
        elif field.field_name == 'output_path' and os.path.exists(abs_path):
            raise FileOperationError(f"{field.field_name}: File already exists: {abs_path}")
        # Validators return the normalized absolute path, so downstream code
        # always sees a canonical path.
        return abs_path
97
+
98
# Schemas for operations
class ReadDocxSchema(BaseFileSchema):
    """Parameters for the read_docx operation."""
    file_path: str
    include_tables: bool = False


class WriteDocxSchema(BaseFileSchema):
    """Parameters for the write_docx operation."""
    text: str
    output_path: str
    table_data: Optional[List[List[str]]] = None


class ReadPptxSchema(BaseFileSchema):
    """Parameters for the read_pptx operation."""
    file_path: str


class WritePptxSchema(BaseFileSchema):
    """Parameters for the write_pptx operation."""
    slides: List[str]
    output_path: str
    image_path: Optional[str] = None


class ReadXlsxSchema(BaseFileSchema):
    """Parameters for the read_xlsx operation."""
    file_path: str
    sheet_name: Optional[str] = None


class WriteXlsxSchema(BaseFileSchema):
    """Parameters for the write_xlsx operation."""
    data: List[Dict]
    output_path: str
    sheet_name: str = 'Sheet1'


class ExtractTextSchema(BaseFileSchema):
    """Parameters for the extract_text operation."""
    file_path: str
134
+
135
@register_tool('office')
class OfficeTool(BaseTool):
    """
    Office document processing tool.

    Operations:
    - read_docx: Read content from DOCX files.
    - write_docx: Write content to DOCX files.
    - read_pptx: Read content from PPTX files.
    - write_pptx: Write content to PPTX files.
    - read_xlsx: Read content from XLSX files.
    - write_xlsx: Write content to XLSX files.
    - extract_text: Extract text from various file formats.

    Inherits from BaseTool to leverage ToolExecutor for caching, concurrency,
    and error handling.
    """
    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the tool, merging any config overrides into OfficeSettings.

        Args:
            config (Dict, optional): Overrides applied on top of the default
                (environment-derived) OfficeSettings values.

        Raises:
            ValueError: If the merged configuration fails validation.
        """
        super().__init__(config)
        self.settings = OfficeSettings()
        if config:
            try:
                merged = {**self.settings.model_dump(), **config}
                self.settings = self.settings.model_validate(merged)
            except ValidationError as e:
                raise ValueError(f"Invalid configuration: {e}")
        self.logger = logging.getLogger(__name__)
        # Attach a stream handler only once so repeated instantiation does not
        # duplicate log output.
        if not self.logger.handlers:
            stream_handler = logging.StreamHandler()
            stream_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
            self.logger.addHandler(stream_handler)
        self.logger.setLevel(logging.INFO)
172
+
173
+ def _validate_document(self, file_path: str, file_type: str) -> None:
174
+ """
175
+ Validate document structure before processing.
176
+
177
+ Args:
178
+ file_path (str): Path to the document file.
179
+ file_type (str): Type of document ('docx', 'pptx', 'xlsx', 'pdf', 'image').
180
+
181
+ Raises:
182
+ ContentValidationError: If document structure is invalid.
183
+ """
184
+ try:
185
+ if file_type == 'docx':
186
+ doc = DocxDocument(file_path)
187
+ if not hasattr(doc, 'paragraphs'):
188
+ raise ContentValidationError("Invalid DOCX structure")
189
+ elif file_type == 'pptx':
190
+ prs = Presentation(file_path)
191
+ if not hasattr(prs, 'slides'):
192
+ raise ContentValidationError("Invalid PPTX structure")
193
+ elif file_type == 'xlsx':
194
+ # Just validate that file can be read - don't care about return type
195
+ pd.read_excel(file_path, nrows=5)
196
+ elif file_type == 'pdf':
197
+ with pdfplumber.open(file_path) as pdf:
198
+ if len(pdf.pages) == 0:
199
+ raise ContentValidationError("PDF has no pages")
200
+ elif file_type == 'image':
201
+ img = Image.open(file_path)
202
+ img.verify() # Verify it's a valid image
203
+ else:
204
+ # Use tika as fallback for other formats
205
+ parsed = parser.from_file(file_path)
206
+ if not parsed or not parsed.get('content'):
207
+ raise ContentValidationError("Unable to parse file content")
208
+ except Exception as e:
209
+ raise ContentValidationError(f"Invalid {file_type.upper()} file: {str(e)}")
210
+
211
+ def _sanitize_text(self, text: str) -> str:
212
+ """
213
+ Sanitize text to remove potentially harmful control characters.
214
+
215
+ Args:
216
+ text (str): Input text.
217
+
218
+ Returns:
219
+ str: Sanitized text.
220
+ """
221
+ if not text:
222
+ return ""
223
+ return ''.join(char for char in text if ord(char) >= 32 or char in '\n\r\t')
224
+
225
+ def _sanitize_table_data(self, table_data: Optional[List[List[str]]]) -> Optional[List[List[str]]]:
226
+ """
227
+ Sanitize table data to remove harmful content.
228
+
229
+ Args:
230
+ table_data (Optional[List[List[str]]]): Table data to sanitize.
231
+
232
+ Returns:
233
+ Optional[List[List[str]]]: Sanitized table data.
234
+ """
235
+ if not table_data:
236
+ return None
237
+ return [[self._sanitize_text(str(cell)) for cell in row] for row in table_data]
238
+
239
+ def _sanitize_data(self, data_list: List[Dict]) -> List[Dict]:
240
+ """
241
+ Sanitize Excel data to remove harmful content and enforce limits.
242
+
243
+ Args:
244
+ data_list (List[Dict]): List of dictionaries to sanitize.
245
+
246
+ Returns:
247
+ List[Dict]: Sanitized data.
248
+ """
249
+ if not data_list:
250
+ return []
251
+ sanitized = []
252
+ for item in data_list:
253
+ clean_item = {}
254
+ for k, v in item.items():
255
+ clean_key = self._sanitize_text(str(k))[:255] # Excel key limit with sanitization
256
+ if isinstance(v, str):
257
+ clean_value = self._sanitize_text(v)[:32767] # Excel cell limit
258
+ else:
259
+ clean_value = v
260
+ clean_item[clean_key] = clean_value
261
+ sanitized.append(clean_item)
262
+ return sanitized
263
+
264
+ def _extract_pdf_text(self, file_path: str) -> str:
265
+ """
266
+ Extract text from PDF using pdfplumber.
267
+
268
+ Args:
269
+ file_path (str): Path to the PDF file.
270
+
271
+ Returns:
272
+ str: Extracted text content.
273
+
274
+ Raises:
275
+ FileOperationError: If PDF text extraction fails.
276
+ """
277
+ try:
278
+ text_content = []
279
+ with pdfplumber.open(file_path) as pdf:
280
+ for page in pdf.pages:
281
+ page_text = page.extract_text()
282
+ if page_text:
283
+ text_content.append(page_text)
284
+ return '\n'.join(text_content)
285
+ except Exception as e:
286
+ raise FileOperationError(f"Failed to extract PDF text: {str(e)}")
287
+
288
+ def _extract_image_text(self, file_path: str) -> str:
289
+ """
290
+ Extract text from image using pytesseract OCR.
291
+
292
+ Args:
293
+ file_path (str): Path to the image file.
294
+
295
+ Returns:
296
+ str: Extracted text content.
297
+
298
+ Raises:
299
+ FileOperationError: If image text extraction fails.
300
+ """
301
+ try:
302
+ image = Image.open(file_path)
303
+ # Convert to RGB if necessary
304
+ if image.mode != 'RGB':
305
+ image = image.convert('RGB')
306
+ text = pytesseract.image_to_string(image, lang='eng+chi_sim')
307
+ return text.strip()
308
+ except Exception as e:
309
+ raise FileOperationError(f"Failed to extract image text: {str(e)}")
310
+
311
+ def _extract_tika_text(self, file_path: str) -> str:
312
+ """
313
+ Extract text using Apache Tika as fallback.
314
+
315
+ Args:
316
+ file_path (str): Path to the file.
317
+
318
+ Returns:
319
+ str: Extracted text content.
320
+
321
+ Raises:
322
+ FileOperationError: If Tika text extraction fails.
323
+ """
324
+ try:
325
+ parsed = parser.from_file(file_path)
326
+ content = parsed.get('content', '')
327
+ return content.strip() if content else ""
328
+ except Exception as e:
329
+ raise FileOperationError(f"Failed to extract text with Tika: {str(e)}")
330
+
331
+ def read_docx(self, file_path: str, include_tables: bool = False) -> Dict[str, Any]:
332
+ """
333
+ Read content from a DOCX file.
334
+
335
+ Args:
336
+ file_path (str): Path to the DOCX file.
337
+ include_tables (bool): Whether to include table data.
338
+
339
+ Returns:
340
+ Dict[str, Any]: Document content {'paragraphs': List[str], 'tables': Optional[List[List[List[str]]]]}.
341
+
342
+ Raises:
343
+ FileOperationError: If file cannot be read.
344
+ ContentValidationError: If document structure is invalid.
345
+ """
346
+ try:
347
+ self._validate_document(file_path, 'docx')
348
+ doc = DocxDocument(file_path)
349
+ paras = [p.text for p in doc.paragraphs if p.text.strip()]
350
+ tables = None
351
+ if include_tables:
352
+ tables = [[[cell.text for cell in row.cells] for row in table.rows] for table in doc.tables]
353
+ return {'paragraphs': paras, 'tables': tables}
354
+ except ContentValidationError:
355
+ raise
356
+ except Exception as e:
357
+ raise FileOperationError(f"Failed to read DOCX: {str(e)}")
358
+
359
+ def write_docx(self, text: str, output_path: str, table_data: Optional[List[List[str]]] = None) -> Dict[str, Any]:
360
+ """
361
+ Write content to a DOCX file.
362
+
363
+ Args:
364
+ text (str): Text content to write.
365
+ output_path (str): Path to save the DOCX file.
366
+ table_data (Optional[List[List[str]]]): Table data to include.
367
+
368
+ Returns:
369
+ Dict[str, Any]: Status {'success': bool, 'file_path': str}.
370
+
371
+ Raises:
372
+ FileOperationError: If file cannot be written.
373
+ """
374
+ try:
375
+ sanitized_text = self._sanitize_text(text)
376
+ sanitized_table_data = self._sanitize_table_data(table_data)
377
+ doc = DocxDocument()
378
+ style = doc.styles['Normal']
379
+ style.font.name = self.settings.default_font
380
+ style.font.size = Pt(self.settings.default_font_size)
381
+ for line in sanitized_text.splitlines():
382
+ doc.add_paragraph(line)
383
+ if sanitized_table_data and sanitized_table_data[0]:
384
+ # Find maximum number of columns to handle irregular table data
385
+ max_cols = max(len(row) for row in sanitized_table_data)
386
+ table = doc.add_table(rows=len(sanitized_table_data), cols=max_cols)
387
+ for i, row in enumerate(sanitized_table_data):
388
+ for j in range(max_cols):
389
+ if j < len(row):
390
+ table.rows[i].cells[j].text = str(row[j])
391
+ else:
392
+ table.rows[i].cells[j].text = "" # Empty cell for missing data
393
+ doc.save(output_path)
394
+ return {'success': True, 'file_path': output_path}
395
+ except Exception as e:
396
+ raise FileOperationError(f"Failed to write DOCX: {str(e)}")
397
+
398
+ def read_pptx(self, file_path: str) -> List[str]:
399
+ """
400
+ Read content from a PPTX file.
401
+
402
+ Args:
403
+ file_path (str): Path to the PPTX file.
404
+
405
+ Returns:
406
+ List[str]: List of text content from slides.
407
+
408
+ Raises:
409
+ FileOperationError: If file cannot be read.
410
+ ContentValidationError: If document structure is invalid.
411
+ """
412
+ try:
413
+ self._validate_document(file_path, 'pptx')
414
+ prs = Presentation(file_path)
415
+ texts = []
416
+ for slide in prs.slides:
417
+ for shape in slide.shapes:
418
+ if hasattr(shape, 'text'):
419
+ txt = shape.text.strip()
420
+ if txt:
421
+ texts.append(txt)
422
+ return texts
423
+ except ContentValidationError:
424
+ raise
425
+ except Exception as e:
426
+ raise FileOperationError(f"Failed to read PPTX: {str(e)}")
427
+
428
+ def write_pptx(self, slides: List[str], output_path: str, image_path: Optional[str] = None) -> Dict[str, Any]:
429
+ """
430
+ Write content to a PPTX file.
431
+
432
+ Args:
433
+ slides (List[str]): List of slide contents.
434
+ output_path (str): Path to save the PPTX file.
435
+ image_path (Optional[str]): Path to an image to include on the first slide.
436
+
437
+ Returns:
438
+ Dict[str, Any]: Status {'success': bool, 'file_path': str}.
439
+
440
+ Raises:
441
+ FileOperationError: If file cannot be written.
442
+ """
443
+ try:
444
+ sanitized_slides = [self._sanitize_text(slide) for slide in slides]
445
+ prs = Presentation()
446
+ blank = prs.slide_layouts[6]
447
+ for idx, content in enumerate(sanitized_slides):
448
+ slide = prs.slides.add_slide(blank)
449
+ box = slide.shapes.add_textbox(Inches(1), Inches(1), Inches(8), Inches(5))
450
+ tf = box.text_frame
451
+ lines = content.splitlines()
452
+ if lines:
453
+ # Set text for the first paragraph (which already exists)
454
+ tf.text = lines[0]
455
+ # Add additional paragraphs for remaining lines
456
+ for line in lines[1:]:
457
+ p = tf.add_paragraph()
458
+ p.text = line
459
+ if idx == 0 and image_path:
460
+ try:
461
+ slide.shapes.add_picture(image_path, Inches(1), Inches(6), Inches(4))
462
+ except Exception as img_err:
463
+ self.logger.warning(f"Could not add image to slide: {img_err}")
464
+ prs.save(output_path)
465
+ return {'success': True, 'file_path': output_path}
466
+ except Exception as e:
467
+ raise FileOperationError(f"Failed to write PPTX: {str(e)}")
468
+
469
+ def read_xlsx(self, file_path: str, sheet_name: Optional[str] = None) -> List[Dict]:
470
+ """
471
+ Read content from an XLSX file.
472
+
473
+ Args:
474
+ file_path (str): Path to the XLSX file.
475
+ sheet_name (Optional[str]): Name of the sheet to read.
476
+
477
+ Returns:
478
+ List[Dict]: List of dictionaries representing Excel data.
479
+
480
+ Raises:
481
+ FileOperationError: If file cannot be read.
482
+ ContentValidationError: If document structure is invalid.
483
+ """
484
+ try:
485
+ self._validate_document(file_path, 'xlsx')
486
+ data = pd.read_excel(file_path, sheet_name=sheet_name)
487
+
488
+ # Handle different return types from pd.read_excel()
489
+ if isinstance(data, pd.DataFrame):
490
+ # Single sheet or specific sheet requested
491
+ return data.to_dict(orient='records')
492
+ elif isinstance(data, dict):
493
+ # Multiple sheets returned as dict - use the first sheet
494
+ first_sheet_name = list(data.keys())[0]
495
+ first_df = data[first_sheet_name]
496
+ return first_df.to_dict(orient='records')
497
+ else:
498
+ raise FileOperationError("Unexpected data type returned from Excel file")
499
+
500
+ except ContentValidationError:
501
+ raise
502
+ except Exception as e:
503
+ raise FileOperationError(f"Failed to read XLSX: {str(e)}")
504
+
505
+ def write_xlsx(self, data: List[Dict], output_path: str, sheet_name: str = 'Sheet1') -> Dict[str, Any]:
506
+ """
507
+ Write content to an XLSX file.
508
+
509
+ Args:
510
+ data (List[Dict]): Data to write.
511
+ output_path (str): Path to save the XLSX file.
512
+ sheet_name (str): Name of the sheet.
513
+
514
+ Returns:
515
+ Dict[str, Any]: Status {'success': bool, 'file_path': str}.
516
+
517
+ Raises:
518
+ FileOperationError: If file cannot be written.
519
+ """
520
+ try:
521
+ sanitized_data = self._sanitize_data(data)
522
+ if not sanitized_data:
523
+ pd.DataFrame().to_excel(output_path, index=False, sheet_name=sheet_name)
524
+ else:
525
+ pd.DataFrame(sanitized_data).to_excel(output_path, index=False, sheet_name=sheet_name)
526
+ return {'success': True, 'file_path': output_path}
527
+ except Exception as e:
528
+ raise FileOperationError(f"Failed to write XLSX: {str(e)}")
529
+
530
+ def extract_text(self, file_path: str) -> str:
531
+ """
532
+ Extract text from various file formats using combination library approach.
533
+
534
+ Args:
535
+ file_path (str): Path to the file.
536
+
537
+ Returns:
538
+ str: Extracted text content.
539
+
540
+ Raises:
541
+ FileOperationError: If text extraction fails.
542
+ ContentValidationError: If document structure is invalid.
543
+ """
544
+ try:
545
+ file_ext = os.path.splitext(file_path)[1].lower()
546
+
547
+ # Determine file type and validate
548
+ if file_ext == '.pdf':
549
+ file_type = 'pdf'
550
+ elif file_ext == '.docx':
551
+ file_type = 'docx'
552
+ elif file_ext == '.pptx':
553
+ file_type = 'pptx'
554
+ elif file_ext == '.xlsx':
555
+ file_type = 'xlsx'
556
+ elif file_ext in ['.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif']:
557
+ file_type = 'image'
558
+ else:
559
+ file_type = 'other'
560
+
561
+ # Validate document structure
562
+ self._validate_document(file_path, file_type)
563
+
564
+ # Extract text based on file type
565
+ if file_type == 'pdf':
566
+ return self._sanitize_text(self._extract_pdf_text(file_path))
567
+ elif file_type == 'docx':
568
+ doc = DocxDocument(file_path)
569
+ paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
570
+ return self._sanitize_text('\n'.join(paragraphs))
571
+ elif file_type == 'pptx':
572
+ prs = Presentation(file_path)
573
+ texts = []
574
+ for slide in prs.slides:
575
+ for shape in slide.shapes:
576
+ if hasattr(shape, 'text') and shape.text.strip():
577
+ texts.append(shape.text)
578
+ return self._sanitize_text('\n'.join(texts))
579
+ elif file_type == 'xlsx':
580
+ data = pd.read_excel(file_path)
581
+ # Handle different return types from pd.read_excel()
582
+ if isinstance(data, pd.DataFrame):
583
+ return self._sanitize_text(data.to_string(index=False))
584
+ elif isinstance(data, dict):
585
+ # Multiple sheets returned as dict - use the first sheet
586
+ first_sheet_name = list(data.keys())[0]
587
+ first_df = data[first_sheet_name]
588
+ return self._sanitize_text(first_df.to_string(index=False))
589
+ else:
590
+ return self._sanitize_text("") # Fallback for unexpected data types
591
+ elif file_type == 'image':
592
+ return self._sanitize_text(self._extract_image_text(file_path))
593
+ else:
594
+ # Use Tika as fallback for other formats
595
+ return self._sanitize_text(self._extract_tika_text(file_path))
596
+
597
+ except ContentValidationError:
598
+ raise
599
+ except Exception as e:
600
+ raise FileOperationError(f"Failed to extract text: {str(e)}")