aiecs-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aiecs might be problematic. Click here for more details.

Files changed (90) hide show
  1. aiecs/__init__.py +75 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +295 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +341 -0
  7. aiecs/config/__init__.py +15 -0
  8. aiecs/config/config.py +117 -0
  9. aiecs/config/registry.py +19 -0
  10. aiecs/core/__init__.py +46 -0
  11. aiecs/core/interface/__init__.py +34 -0
  12. aiecs/core/interface/execution_interface.py +150 -0
  13. aiecs/core/interface/storage_interface.py +214 -0
  14. aiecs/domain/__init__.py +20 -0
  15. aiecs/domain/context/__init__.py +28 -0
  16. aiecs/domain/context/content_engine.py +982 -0
  17. aiecs/domain/context/conversation_models.py +306 -0
  18. aiecs/domain/execution/__init__.py +12 -0
  19. aiecs/domain/execution/model.py +49 -0
  20. aiecs/domain/task/__init__.py +13 -0
  21. aiecs/domain/task/dsl_processor.py +460 -0
  22. aiecs/domain/task/model.py +50 -0
  23. aiecs/domain/task/task_context.py +257 -0
  24. aiecs/infrastructure/__init__.py +26 -0
  25. aiecs/infrastructure/messaging/__init__.py +13 -0
  26. aiecs/infrastructure/messaging/celery_task_manager.py +341 -0
  27. aiecs/infrastructure/messaging/websocket_manager.py +289 -0
  28. aiecs/infrastructure/monitoring/__init__.py +12 -0
  29. aiecs/infrastructure/monitoring/executor_metrics.py +138 -0
  30. aiecs/infrastructure/monitoring/structured_logger.py +50 -0
  31. aiecs/infrastructure/monitoring/tracing_manager.py +376 -0
  32. aiecs/infrastructure/persistence/__init__.py +12 -0
  33. aiecs/infrastructure/persistence/database_manager.py +286 -0
  34. aiecs/infrastructure/persistence/file_storage.py +671 -0
  35. aiecs/infrastructure/persistence/redis_client.py +162 -0
  36. aiecs/llm/__init__.py +54 -0
  37. aiecs/llm/base_client.py +99 -0
  38. aiecs/llm/client_factory.py +339 -0
  39. aiecs/llm/custom_callbacks.py +228 -0
  40. aiecs/llm/openai_client.py +125 -0
  41. aiecs/llm/vertex_client.py +186 -0
  42. aiecs/llm/xai_client.py +184 -0
  43. aiecs/main.py +351 -0
  44. aiecs/scripts/DEPENDENCY_SYSTEM_SUMMARY.md +241 -0
  45. aiecs/scripts/README_DEPENDENCY_CHECKER.md +309 -0
  46. aiecs/scripts/README_WEASEL_PATCH.md +126 -0
  47. aiecs/scripts/__init__.py +3 -0
  48. aiecs/scripts/dependency_checker.py +825 -0
  49. aiecs/scripts/dependency_fixer.py +348 -0
  50. aiecs/scripts/download_nlp_data.py +348 -0
  51. aiecs/scripts/fix_weasel_validator.py +121 -0
  52. aiecs/scripts/fix_weasel_validator.sh +82 -0
  53. aiecs/scripts/patch_weasel_library.sh +188 -0
  54. aiecs/scripts/quick_dependency_check.py +269 -0
  55. aiecs/scripts/run_weasel_patch.sh +41 -0
  56. aiecs/scripts/setup_nlp_data.sh +217 -0
  57. aiecs/tasks/__init__.py +2 -0
  58. aiecs/tasks/worker.py +111 -0
  59. aiecs/tools/__init__.py +196 -0
  60. aiecs/tools/base_tool.py +202 -0
  61. aiecs/tools/langchain_adapter.py +361 -0
  62. aiecs/tools/task_tools/__init__.py +82 -0
  63. aiecs/tools/task_tools/chart_tool.py +704 -0
  64. aiecs/tools/task_tools/classfire_tool.py +901 -0
  65. aiecs/tools/task_tools/image_tool.py +397 -0
  66. aiecs/tools/task_tools/office_tool.py +600 -0
  67. aiecs/tools/task_tools/pandas_tool.py +565 -0
  68. aiecs/tools/task_tools/report_tool.py +499 -0
  69. aiecs/tools/task_tools/research_tool.py +363 -0
  70. aiecs/tools/task_tools/scraper_tool.py +548 -0
  71. aiecs/tools/task_tools/search_api.py +7 -0
  72. aiecs/tools/task_tools/stats_tool.py +513 -0
  73. aiecs/tools/temp_file_manager.py +126 -0
  74. aiecs/tools/tool_executor/__init__.py +35 -0
  75. aiecs/tools/tool_executor/tool_executor.py +518 -0
  76. aiecs/utils/LLM_output_structor.py +409 -0
  77. aiecs/utils/__init__.py +23 -0
  78. aiecs/utils/base_callback.py +50 -0
  79. aiecs/utils/execution_utils.py +158 -0
  80. aiecs/utils/logging.py +1 -0
  81. aiecs/utils/prompt_loader.py +13 -0
  82. aiecs/utils/token_usage_repository.py +279 -0
  83. aiecs/ws/__init__.py +0 -0
  84. aiecs/ws/socket_server.py +41 -0
  85. aiecs-1.0.0.dist-info/METADATA +610 -0
  86. aiecs-1.0.0.dist-info/RECORD +90 -0
  87. aiecs-1.0.0.dist-info/WHEEL +5 -0
  88. aiecs-1.0.0.dist-info/entry_points.txt +7 -0
  89. aiecs-1.0.0.dist-info/licenses/LICENSE +225 -0
  90. aiecs-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,704 @@
1
+ import os
2
+ import json
3
+ import csv
4
+ import tempfile
5
+ import logging
6
+ from typing import Dict, Any, List, Optional, Union, Tuple
7
+ from enum import Enum
8
+
9
+ from pydantic import BaseModel, Field, field_validator, ConfigDict
10
+ import numpy as np
11
+ import pandas as pd
12
+ import matplotlib.pyplot as plt
13
+ import seaborn as sns
14
+
15
+ from aiecs.tools import register_tool
16
+ from aiecs.tools.base_tool import BaseTool
17
+ from aiecs.tools.tool_executor import measure_execution_time
18
+
19
+ # Enums for configuration options
20
class ExportFormat(str, Enum):
    """File formats that results and data can be exported to.

    Members are also plain strings, so ``ExportFormat.JSON == "json"``.
    """

    # Structured / tabular formats
    JSON = "json"
    CSV = "csv"
    EXCEL = "excel"

    # Human-readable report formats
    HTML = "html"
    MARKDOWN = "markdown"
26
+
27
class VisualizationType(str, Enum):
    """Kinds of plot the chart tool can draw.

    Members are also plain strings, so ``VisualizationType.BAR == "bar"``.
    """

    # Single-variable distributions
    HISTOGRAM = "histogram"
    BOXPLOT = "boxplot"

    # Relationships between two variables
    SCATTER = "scatter"
    BAR = "bar"
    LINE = "line"

    # Multi-variable overviews
    HEATMAP = "heatmap"
    PAIR = "pair"
35
+
36
+ @register_tool('chart')
37
+ class ChartTool(BaseTool):
38
+ """Chart and visualization tool: creates charts and exports data in various formats."""
39
+
40
+ # Configuration schema
41
class Config(BaseModel):
    """Configuration for the chart tool.

    The docstring is the first statement of the class so that it is the
    real ``__doc__``; a string placed after ``model_config`` is only a
    discarded expression.
    """

    model_config = ConfigDict()

    # Directory that relative export/plot paths are resolved against.
    export_dir: str = Field(
        default=os.path.join(tempfile.gettempdir(), 'chart_exports'),
        description="Directory to export files to"
    )
    # Resolution used when a call does not pass its own ``dpi``.
    plot_dpi: int = Field(
        default=100,
        description="DPI for plot exports"
    )
    # Figure size used when a call does not pass its own ``figsize``.
    plot_figsize: Tuple[int, int] = Field(
        default=(10, 6),
        description="Default figure size (width, height) in inches"
    )
    # Lower-case extensions (with leading dot) the public methods accept.
    allowed_extensions: List[str] = Field(
        default=['.csv', '.xlsx', '.xls', '.json', '.parquet', '.feather', '.sav', '.sas7bdat', '.por'],
        description="Allowed file extensions"
    )
60
+
61
+ # Input schemas for operations
62
class ReadDataSchema(BaseModel):
    """Schema for reading data files."""

    file_path: str = Field(
        description="Path to the data file"
    )
    nrows: Optional[int] = Field(
        default=None,
        description="Number of rows to read"
    )
    sheet_name: Optional[Union[str, int]] = Field(
        default=0,
        description="Sheet name or index for Excel files"
    )
    # NOTE: export_format must stay declared before export_path so that it
    # is already present in ``info.data`` when export_path is validated.
    export_format: Optional[ExportFormat] = Field(
        default=None,
        description="Format to export results in"
    )
    export_path: Optional[str] = Field(
        default=None,
        description="Path to export results to"
    )

    @field_validator('file_path')
    @classmethod
    def validate_file_path(cls, v):
        """Reject paths that do not point at an existing regular file."""
        if not os.path.isfile(v):
            raise ValueError(f"File not found: {v}")
        return v

    @field_validator('export_path')
    @classmethod
    def validate_export_path(cls, v, info):
        """Require ``export_format`` whenever an export path is given.

        An omitted ``export_format`` is still present in ``info.data``
        with its ``None`` default, so a key-membership test never fires;
        the value itself has to be checked.
        """
        if v and info.data.get('export_format') is None:
            raise ValueError("export_format must be specified when export_path is provided")
        return v
97
+
98
class VisualizationSchema(BaseModel):
    """Schema for data visualization."""

    file_path: str = Field(
        description="Path to the data file"
    )
    plot_type: VisualizationType = Field(
        description="Type of visualization to create"
    )
    x: Optional[str] = Field(
        default=None,
        description="Column to use for x-axis"
    )
    y: Optional[str] = Field(
        default=None,
        description="Column to use for y-axis"
    )
    hue: Optional[str] = Field(
        default=None,
        description="Column to use for color encoding"
    )
    variables: Optional[List[str]] = Field(
        default=None,
        description="List of variables to include in the visualization"
    )
    title: Optional[str] = Field(
        default=None,
        description="Title for the visualization"
    )
    figsize: Optional[Tuple[int, int]] = Field(
        default=None,
        description="Figure size (width, height) in inches"
    )
    output_path: Optional[str] = Field(
        default=None,
        description="Path to save the visualization"
    )
    dpi: Optional[int] = Field(
        default=None,
        description="DPI for the visualization"
    )
    # NOTE: export_format must stay declared before export_path so that it
    # is already present in ``info.data`` when export_path is validated.
    export_format: Optional[ExportFormat] = Field(
        default=None,
        description="Format to export results in"
    )
    export_path: Optional[str] = Field(
        default=None,
        description="Path to export results to"
    )

    @field_validator('file_path')
    @classmethod
    def validate_file_path(cls, v):
        """Reject paths that do not point at an existing regular file."""
        if not os.path.isfile(v):
            raise ValueError(f"File not found: {v}")
        return v

    @field_validator('export_path')
    @classmethod
    def validate_export_path(cls, v, info):
        """Require ``export_format`` whenever an export path is given.

        An omitted ``export_format`` is still present in ``info.data``
        with its ``None`` default, so a key-membership test never fires;
        the value itself has to be checked.
        """
        if v and info.data.get('export_format') is None:
            raise ValueError("export_format must be specified when export_path is provided")
        return v
160
+
161
class ExportDataSchema(BaseModel):
    """Schema for exporting data."""

    file_path: str = Field(
        description="Path to the data file"
    )
    variables: Optional[List[str]] = Field(
        default=None,
        description="List of variables to include in the export"
    )
    # Required: the format the data itself is written in.
    format: ExportFormat = Field(
        description="Format to export data in"
    )
    # Destination of the exported data; it is paired with ``format``.
    export_path: Optional[str] = Field(
        default=None,
        description="Path to save the exported data"
    )
    # Optional, independent format for an additional result summary.
    export_format: Optional[ExportFormat] = Field(
        default=None,
        description="Format to export results in"
    )

    @field_validator('file_path')
    @classmethod
    def validate_file_path(cls, v):
        """Reject paths that do not point at an existing regular file."""
        if not os.path.isfile(v):
            raise ValueError(f"File not found: {v}")
        return v

    @field_validator('export_path')
    @classmethod
    def validate_export_path(cls, v, info):
        """Accept any export path.

        This schema must not demand ``export_format`` here: the path is the
        destination for data written in the required ``format``. In addition
        ``export_format`` is declared after ``export_path``, so it is not
        yet in ``info.data`` at this point and such a check would reject
        every non-empty path.
        """
        return v
195
+
196
def __init__(self, config: Optional[Dict[str, Any]] = None):
    """
    Initialize the chart tool.

    Args:
        config: Optional mapping of ``Config`` field names to values.
    """
    super().__init__(config)

    # Set the logger up first so later steps can report problems.
    self.logger = logging.getLogger(__name__)

    # Parse configuration
    self.config = self.Config(**(config or {}))

    # Create export directory if it doesn't exist
    os.makedirs(self.config.export_dir, exist_ok=True)

    # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
    # try the new name first, then the legacy one, and keep the default
    # style (instead of failing construction) when neither is available.
    for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
        try:
            plt.style.use(style_name)
            break
        except (OSError, ValueError):
            continue
    else:
        self.logger.warning(
            "No seaborn whitegrid matplotlib style available; using the default style"
        )
216
+
217
+ def _load_data(self, file_path: str, nrows: Optional[int] = None, sheet_name: Optional[Union[str, int]] = 0) -> pd.DataFrame:
218
+ """
219
+ Load data from various file formats into a pandas DataFrame
220
+
221
+ Args:
222
+ file_path: Path to the data file
223
+ nrows: Number of rows to read
224
+ sheet_name: Sheet name or index for Excel files
225
+
226
+ Returns:
227
+ Loaded DataFrame
228
+ """
229
+ # Determine file type and read accordingly
230
+ ext = os.path.splitext(file_path)[1].lower()
231
+
232
+ try:
233
+ if ext == '.sav':
234
+ import pyreadstat
235
+ df, meta = pyreadstat.read_sav(file_path)
236
+ return df
237
+ elif ext == '.sas7bdat':
238
+ import pyreadstat
239
+ df, meta = pyreadstat.read_sas7bdat(file_path)
240
+ return df
241
+ elif ext == '.por':
242
+ import pyreadstat
243
+ df, meta = pyreadstat.read_por(file_path)
244
+ return df
245
+ elif ext == '.csv':
246
+ return pd.read_csv(file_path, nrows=nrows)
247
+ elif ext in ['.xlsx', '.xls']:
248
+ return pd.read_excel(file_path, sheet_name=sheet_name, nrows=nrows)
249
+ elif ext == '.json':
250
+ return pd.read_json(file_path)
251
+ elif ext == '.parquet':
252
+ return pd.read_parquet(file_path)
253
+ elif ext == '.feather':
254
+ return pd.read_feather(file_path)
255
+ else:
256
+ raise ValueError(f"Unsupported file format: {ext}")
257
+ except Exception as e:
258
+ raise ValueError(f"Error reading file {file_path}: {str(e)}")
259
+
260
+ def _export_result(self, result: Dict[str, Any], path: str, format: ExportFormat) -> None:
261
+ """
262
+ Export results to the specified format
263
+
264
+ Args:
265
+ result: Result to export
266
+ path: Path to save the exported result
267
+ format: Format to export in
268
+ """
269
+ os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
270
+
271
+ try:
272
+ if format == ExportFormat.JSON:
273
+ # Convert numpy types to Python native types
274
+ def json_serialize(obj):
275
+ if isinstance(obj, (np.integer, np.int64)):
276
+ return int(obj)
277
+ elif isinstance(obj, (np.floating, np.float64)):
278
+ return float(obj)
279
+ elif isinstance(obj, np.ndarray):
280
+ return obj.tolist()
281
+ elif isinstance(obj, pd.DataFrame):
282
+ return obj.to_dict(orient='records')
283
+ return str(obj)
284
+
285
+ with open(path, 'w') as f:
286
+ json.dump(result, f, default=json_serialize, indent=2)
287
+
288
+ elif format == ExportFormat.CSV:
289
+ # Find the first dict or DataFrame in the result
290
+ data_to_export = None
291
+ for key, value in result.items():
292
+ if isinstance(value, dict) and value:
293
+ data_to_export = pd.DataFrame(value)
294
+ break
295
+ elif isinstance(value, pd.DataFrame):
296
+ data_to_export = value
297
+ break
298
+
299
+ if data_to_export is not None:
300
+ data_to_export.to_csv(path, index=False)
301
+ else:
302
+ # Fallback: convert the entire result to a flat structure
303
+ flat_data = {}
304
+ for k, v in result.items():
305
+ if not isinstance(v, (dict, list, pd.DataFrame)):
306
+ flat_data[k] = v
307
+
308
+ with open(path, 'w', newline='') as f:
309
+ writer = csv.writer(f)
310
+ writer.writerow(flat_data.keys())
311
+ writer.writerow(flat_data.values())
312
+
313
+ elif format == ExportFormat.HTML:
314
+ # Convert to HTML table
315
+ html_content = "<html><body><h1>Chart Results</h1>"
316
+ for key, value in result.items():
317
+ html_content += f"<h2>{key}</h2>"
318
+ if isinstance(value, pd.DataFrame):
319
+ html_content += value.to_html()
320
+ elif isinstance(value, dict):
321
+ html_content += "<table border='1'><tr><th>Parameter</th><th>Value</th></tr>"
322
+ for k, v in value.items():
323
+ html_content += f"<tr><td>{k}</td><td>{v}</td></tr>"
324
+ html_content += "</table>"
325
+ else:
326
+ html_content += f"<p>{value}</p>"
327
+ html_content += "</body></html>"
328
+
329
+ with open(path, 'w') as f:
330
+ f.write(html_content)
331
+
332
+ elif format == ExportFormat.EXCEL:
333
+ with pd.ExcelWriter(path) as writer:
334
+ for key, value in result.items():
335
+ if isinstance(value, pd.DataFrame):
336
+ value.to_excel(writer, sheet_name=key[:31]) # Excel sheet names limited to 31 chars
337
+ elif isinstance(value, dict):
338
+ pd.DataFrame(value, index=[0]).to_excel(writer, sheet_name=key[:31])
339
+ else:
340
+ pd.DataFrame({key: [value]}).to_excel(writer, sheet_name='Summary')
341
+
342
+ elif format == ExportFormat.MARKDOWN:
343
+ with open(path, 'w') as f:
344
+ f.write("# Chart Results\n\n")
345
+ for key, value in result.items():
346
+ f.write(f"## {key}\n\n")
347
+ if isinstance(value, pd.DataFrame):
348
+ f.write(value.to_markdown())
349
+ elif isinstance(value, dict):
350
+ f.write("| Parameter | Value |\n|-----------|-------|\n")
351
+ for k, v in value.items():
352
+ f.write(f"| {k} | {v} |\n")
353
+ else:
354
+ f.write(f"{value}\n\n")
355
+
356
+ return path
357
+ except Exception as e:
358
+ raise ValueError(f"Error exporting to {format}: {str(e)}")
359
+
360
def _create_visualization(self, df: pd.DataFrame, plot_type: VisualizationType,
                          x: Optional[str] = None, y: Optional[str] = None,
                          hue: Optional[str] = None, variables: Optional[List[str]] = None,
                          title: Optional[str] = None, figsize: Optional[Tuple[int, int]] = None,
                          output_path: Optional[str] = None, dpi: Optional[int] = None) -> str:
    """
    Create a visualization and return the path to the saved image.

    Args:
        df: DataFrame to visualize
        plot_type: Type of visualization to create
        x: Column to use for x-axis
        y: Column to use for y-axis
        hue: Column to use for color encoding
        variables: List of variables to include in the visualization
        title: Title for the visualization
        figsize: Figure size (width, height) in inches; defaults to the
            configured ``plot_figsize`` (not applied to pair plots, whose
            figure is created by seaborn)
        output_path: Path to save the visualization; relative paths are
            resolved against the configured export directory and a random
            PNG name is generated when omitted
        dpi: DPI for the visualization; defaults to the configured ``plot_dpi``

    Returns:
        Path to the saved visualization

    Raises:
        ValueError: If the plot type is unknown or plotting/saving fails.
    """
    if not output_path:
        output_path = os.path.join(self.config.export_dir, f"plot_{os.urandom(4).hex()}.png")
    elif not os.path.isabs(output_path):
        output_path = os.path.join(self.config.export_dir, output_path)

    os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)

    figsize = figsize or self.config.plot_figsize
    dpi = dpi or self.config.plot_dpi

    fig = None
    try:
        if plot_type == VisualizationType.PAIR:
            # pairplot builds its own figure, so no blank one is created here.
            if variables:
                plot_vars = list(variables)
                # Avoid a duplicated column when hue is already listed.
                if hue and hue not in plot_vars:
                    plot_vars.append(hue)
                grid = sns.pairplot(df[plot_vars], hue=hue)
            else:
                grid = sns.pairplot(df, hue=hue)
            # Newer seaborn exposes ``figure``; older releases only ``fig``.
            fig = getattr(grid, "figure", None) or grid.fig
            if title:
                fig.suptitle(title)
        else:
            fig = plt.figure(figsize=figsize)

            if plot_type == VisualizationType.HISTOGRAM:
                if variables:
                    for var in variables:
                        sns.histplot(data=df, x=var, kde=True, label=var)
                    plt.legend()
                else:
                    sns.histplot(data=df, x=x, hue=hue)

            elif plot_type == VisualizationType.BOXPLOT:
                sns.boxplot(data=df, x=x, y=y, hue=hue)

            elif plot_type == VisualizationType.SCATTER:
                sns.scatterplot(data=df, x=x, y=y, hue=hue)

            elif plot_type == VisualizationType.BAR:
                sns.barplot(data=df, x=x, y=y, hue=hue)

            elif plot_type == VisualizationType.LINE:
                sns.lineplot(data=df, x=x, y=y, hue=hue)

            elif plot_type == VisualizationType.HEATMAP:
                subset = df[variables] if variables else df
                # Correlation is only defined for numeric columns; recent
                # pandas raises on non-numeric ones instead of skipping them.
                numeric = subset.select_dtypes(include='number')
                if numeric.shape[1] == 0:
                    raise ValueError("No numeric columns available for a correlation heatmap")
                sns.heatmap(numeric.corr(), annot=True, cmap='coolwarm', fmt=".2f")

            else:
                raise ValueError(f"Unsupported plot type: {plot_type}")

            if title:
                plt.title(title)

        fig.tight_layout()
        fig.savefig(output_path, dpi=dpi)
        return output_path
    except Exception as e:
        raise ValueError(f"Error creating visualization: {str(e)}") from e
    finally:
        # Release the figure on success and on failure alike.
        if fig is not None:
            plt.close(fig)
440
+
441
+ def _validate_variables(self, df: pd.DataFrame, variables: List[str]) -> None:
442
+ """
443
+ Validate that variables exist in the DataFrame
444
+
445
+ Args:
446
+ df: DataFrame to check
447
+ variables: List of variables to validate
448
+
449
+ Raises:
450
+ ValueError: If any variables are not found in the DataFrame
451
+ """
452
+ if not variables:
453
+ return
454
+
455
+ available_columns = set(df.columns)
456
+ missing = [col for col in variables if col not in available_columns]
457
+ if missing:
458
+ raise ValueError(
459
+ f"Variables not found in dataset: {', '.join(missing)}. Available columns: {list(available_columns)}"
460
+ )
461
+
462
+ def _to_json_serializable(self, result: Union[pd.DataFrame, pd.Series, Dict]) -> Union[List[Dict], Dict]:
463
+ """
464
+ Convert result to JSON serializable format
465
+
466
+ Args:
467
+ result: Result to convert
468
+
469
+ Returns:
470
+ JSON serializable result
471
+ """
472
+ if isinstance(result, pd.DataFrame):
473
+ # Handle datetime columns
474
+ for col in result.select_dtypes(include=['datetime64']).columns:
475
+ result[col] = result[col].dt.strftime('%Y-%m-%d %H:%M:%S')
476
+ return result.to_dict(orient="records")
477
+ elif isinstance(result, pd.Series):
478
+ if pd.api.types.is_datetime64_any_dtype(result):
479
+ result = result.dt.strftime('%Y-%m-%d %H:%M:%S')
480
+ return result.to_dict()
481
+ elif isinstance(result, dict):
482
+ # Handle numpy types and datetime objects
483
+ def convert_value(v):
484
+ if isinstance(v, (np.floating, np.integer)):
485
+ return float(v)
486
+ elif isinstance(v, np.bool_):
487
+ return bool(v)
488
+ elif isinstance(v, (pd.Timestamp, np.datetime64)):
489
+ return str(v)
490
+ elif isinstance(v, np.ndarray):
491
+ return v.tolist()
492
+ elif pd.isna(v):
493
+ return None
494
+ return v
495
+
496
+ return {k: convert_value(v) for k, v in result.items()}
497
+ return result
498
+
499
@measure_execution_time
def read_data(self, file_path: str, nrows: Optional[int] = None,
              sheet_name: Optional[Union[str, int]] = 0,
              export_format: Optional[ExportFormat] = None,
              export_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Read data from various file formats and summarize it.

    Args:
        file_path: Path to the data file
        nrows: Number of rows to read
        sheet_name: Sheet name or index for Excel files
        export_format: Format to export results in
        export_path: Path to export results to; relative paths are resolved
            against the configured export directory. The summary is only
            exported when both export_format and export_path are given.

    Returns:
        Dictionary with the column names, row count, dtypes, memory usage
        in MB and a preview of the first five rows (plus ``exported_to``
        when the summary was exported).

    Raises:
        ValueError: If the file is missing, its extension is not allowed,
            or it cannot be read or exported.
    """
    # Validate file path
    if not os.path.isfile(file_path):
        raise ValueError(f"File not found: {file_path}")

    # Check file extension
    ext = os.path.splitext(file_path)[1].lower()
    if ext not in self.config.allowed_extensions:
        raise ValueError(f"Extension '{ext}' not allowed. Supported formats: {', '.join(self.config.allowed_extensions)}")

    # Load data
    df = self._load_data(file_path, nrows, sheet_name)

    # Create result using plain Python values so it can be serialized as-is.
    memory_mb = float(df.memory_usage(deep=True).sum()) / (1024 * 1024)
    result = {
        'variables': df.columns.tolist(),
        'observations': len(df),
        'dtypes': {col: str(dtype) for col, dtype in df.dtypes.items()},
        'memory_usage': memory_mb,  # MB
        # head() returns a new frame; the helper formats datetime columns
        # as strings before converting the rows to dicts.
        'preview': self._to_json_serializable(df.head(5))
    }

    # Handle export if requested
    if export_format and export_path:
        if not os.path.isabs(export_path):
            export_path = os.path.join(self.config.export_dir, export_path)

        self._export_result(result, export_path, export_format)
        result['exported_to'] = export_path

    return result
547
+
548
@measure_execution_time
def visualize(self, file_path: str, plot_type: VisualizationType,
              x: Optional[str] = None, y: Optional[str] = None,
              hue: Optional[str] = None, variables: Optional[List[str]] = None,
              title: Optional[str] = None, figsize: Optional[Tuple[int, int]] = None,
              output_path: Optional[str] = None, dpi: Optional[int] = None,
              export_format: Optional[ExportFormat] = None,
              export_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Create data visualizations.

    Args:
        file_path: Path to the data file
        plot_type: Type of visualization to create (a VisualizationType
            member or its string value)
        x: Column to use for x-axis
        y: Column to use for y-axis
        hue: Column to use for color encoding
        variables: List of variables to include in the visualization
        title: Title for the visualization
        figsize: Figure size (width, height) in inches
        output_path: Path to save the visualization
        dpi: DPI for the visualization
        export_format: Format to export results in
        export_path: Path to export results to; relative paths are resolved
            against the configured export directory. The summary is only
            exported when both export_format and export_path are given.

    Returns:
        Dictionary with the plot type, image path, the columns used and
        the title (plus ``exported_to`` when the summary was exported).

    Raises:
        ValueError: If the file is missing, its extension is not allowed,
            the plot type is unknown, a column does not exist, or
            plotting/exporting fails.
    """
    # Validate file path
    if not os.path.isfile(file_path):
        raise ValueError(f"File not found: {file_path}")

    # Check file extension
    ext = os.path.splitext(file_path)[1].lower()
    if ext not in self.config.allowed_extensions:
        raise ValueError(f"Extension '{ext}' not allowed. Supported formats: {', '.join(self.config.allowed_extensions)}")

    # Accept plain strings too; an unknown value raises ValueError here
    # rather than producing an empty image later.
    plot_type = VisualizationType(plot_type)

    # Load data
    df = self._load_data(file_path)

    # Every column the plot refers to must exist in the data.
    vars_to_check = list(variables) if variables else []
    vars_to_check.extend(col for col in (x, y, hue) if col)

    self._validate_variables(df, vars_to_check)

    # Create visualization
    output_path = self._create_visualization(
        df, plot_type, x, y, hue, variables, title, figsize, output_path, dpi
    )

    # Create result; unused axis arguments are left out rather than
    # reported as None entries.
    result = {
        'plot_type': plot_type,
        'output_path': output_path,
        'variables': variables or [col for col in (x, y, hue) if col],
        'title': title or f"{plot_type.value.capitalize()} Plot"
    }

    # Handle export if requested
    if export_format and export_path:
        if not os.path.isabs(export_path):
            export_path = os.path.join(self.config.export_dir, export_path)

        self._export_result(result, export_path, export_format)
        result['exported_to'] = export_path

    return result
623
+
624
@measure_execution_time
def export_data(self, file_path: str, format: ExportFormat,
                variables: Optional[List[str]] = None,
                export_path: Optional[str] = None,
                export_format: Optional[ExportFormat] = None) -> Dict[str, Any]:
    """
    Export data to various formats.

    Args:
        file_path: Path to the data file
        format: Format to export data in (an ExportFormat member or its
            string value)
        variables: List of variables to include in the export
        export_path: Path to save the exported data; relative paths are
            resolved against the configured export directory and a random
            file name is generated when omitted
        export_format: Optional format for an additional summary of this
            export. The summary is written next to the data file as
            ``<name>_summary.<ext>`` so it does not overwrite the data.

    Returns:
        Dictionary with the format, data path, row/column counts and column
        names (plus ``exported_to`` with the summary path when a summary
        was written).

    Raises:
        ValueError: If the file is missing, its extension is not allowed,
            the format is unknown, a variable does not exist, or the
            export fails.
    """
    # Validate file path
    if not os.path.isfile(file_path):
        raise ValueError(f"File not found: {file_path}")

    # Check file extension
    ext = os.path.splitext(file_path)[1].lower()
    if ext not in self.config.allowed_extensions:
        raise ValueError(f"Extension '{ext}' not allowed. Supported formats: {', '.join(self.config.allowed_extensions)}")

    # Accept plain strings too; unknown values raise ValueError here.
    format = ExportFormat(format)

    # Load data
    df = self._load_data(file_path)

    # Validate variables
    if variables:
        self._validate_variables(df, variables)
        df = df[variables]

    def extension_for(fmt):
        # Excel files need the real '.xlsx' suffix, not '.excel'.
        return ".xlsx" if fmt == ExportFormat.EXCEL else "." + fmt.value

    # Determine export path
    if not export_path:
        export_path = os.path.join(
            self.config.export_dir, f"export_{os.urandom(4).hex()}{extension_for(format)}"
        )
    elif not os.path.isabs(export_path):
        export_path = os.path.join(self.config.export_dir, export_path)

    # Create export directory if it doesn't exist
    os.makedirs(os.path.dirname(os.path.abspath(export_path)), exist_ok=True)

    # Export data
    try:
        if format == ExportFormat.JSON:
            df.to_json(export_path, orient='records', indent=2)
        elif format == ExportFormat.CSV:
            df.to_csv(export_path, index=False)
        elif format == ExportFormat.HTML:
            df.to_html(export_path)
        elif format == ExportFormat.EXCEL:
            df.to_excel(export_path, index=False)
        elif format == ExportFormat.MARKDOWN:
            with open(export_path, 'w', encoding='utf-8') as f:
                f.write(df.to_markdown())
    except Exception as e:
        raise ValueError(f"Error exporting to {format}: {str(e)}") from e

    # Create result
    result = {
        'format': format,
        'path': export_path,
        'rows': len(df),
        'columns': len(df.columns),
        'variables': df.columns.tolist()
    }

    # Handle summary export if requested. It must not reuse export_path:
    # that file holds the data that was just written.
    if export_format:
        summary_format = ExportFormat(export_format)
        stem, _ = os.path.splitext(export_path)
        summary_path = f"{stem}_summary{extension_for(summary_format)}"

        self._export_result(result, summary_path, summary_format)
        result['exported_to'] = summary_path

    return result