aiecs 1.1.0-py3-none-any.whl → 1.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aiecs might be problematic.

Files changed (58)
  1. aiecs/__init__.py +1 -1
  2. aiecs/config/config.py +2 -0
  3. aiecs/domain/__init__.py +95 -0
  4. aiecs/domain/community/__init__.py +159 -0
  5. aiecs/domain/community/agent_adapter.py +516 -0
  6. aiecs/domain/community/analytics.py +465 -0
  7. aiecs/domain/community/collaborative_workflow.py +99 -7
  8. aiecs/domain/community/communication_hub.py +649 -0
  9. aiecs/domain/community/community_builder.py +322 -0
  10. aiecs/domain/community/community_integration.py +365 -12
  11. aiecs/domain/community/community_manager.py +481 -5
  12. aiecs/domain/community/decision_engine.py +459 -13
  13. aiecs/domain/community/exceptions.py +238 -0
  14. aiecs/domain/community/models/__init__.py +36 -0
  15. aiecs/domain/community/resource_manager.py +1 -1
  16. aiecs/domain/community/shared_context_manager.py +621 -0
  17. aiecs/domain/context/context_engine.py +37 -33
  18. aiecs/main.py +2 -2
  19. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  20. aiecs/scripts/aid/__init__.py +15 -0
  21. aiecs/scripts/aid/version_manager.py +224 -0
  22. aiecs/scripts/dependance_check/download_nlp_data.py +1 -0
  23. aiecs/tools/__init__.py +23 -23
  24. aiecs/tools/docs/__init__.py +5 -2
  25. aiecs/tools/docs/ai_document_orchestrator.py +39 -26
  26. aiecs/tools/docs/ai_document_writer_orchestrator.py +61 -38
  27. aiecs/tools/docs/content_insertion_tool.py +48 -28
  28. aiecs/tools/docs/document_creator_tool.py +47 -29
  29. aiecs/tools/docs/document_layout_tool.py +35 -20
  30. aiecs/tools/docs/document_parser_tool.py +56 -36
  31. aiecs/tools/docs/document_writer_tool.py +115 -62
  32. aiecs/tools/schema_generator.py +56 -56
  33. aiecs/tools/statistics/__init__.py +82 -0
  34. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +581 -0
  35. aiecs/tools/statistics/ai_insight_generator_tool.py +473 -0
  36. aiecs/tools/statistics/ai_report_orchestrator_tool.py +629 -0
  37. aiecs/tools/statistics/data_loader_tool.py +518 -0
  38. aiecs/tools/statistics/data_profiler_tool.py +599 -0
  39. aiecs/tools/statistics/data_transformer_tool.py +531 -0
  40. aiecs/tools/statistics/data_visualizer_tool.py +460 -0
  41. aiecs/tools/statistics/model_trainer_tool.py +470 -0
  42. aiecs/tools/statistics/statistical_analyzer_tool.py +426 -0
  43. aiecs/tools/task_tools/chart_tool.py +2 -1
  44. aiecs/tools/task_tools/image_tool.py +43 -43
  45. aiecs/tools/task_tools/office_tool.py +39 -36
  46. aiecs/tools/task_tools/pandas_tool.py +37 -33
  47. aiecs/tools/task_tools/report_tool.py +67 -56
  48. aiecs/tools/task_tools/research_tool.py +32 -31
  49. aiecs/tools/task_tools/scraper_tool.py +53 -46
  50. aiecs/tools/task_tools/search_tool.py +1123 -0
  51. aiecs/tools/task_tools/stats_tool.py +20 -15
  52. {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/METADATA +5 -1
  53. {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/RECORD +57 -36
  54. {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/entry_points.txt +1 -0
  55. aiecs/tools/task_tools/search_api.py +0 -7
  56. {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/WHEEL +0 -0
  57. {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/licenses/LICENSE +0 -0
  58. {aiecs-1.1.0.dist-info → aiecs-1.2.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ aiecs/tools/statistics/model_trainer_tool.py
@@ -0,0 +1,470 @@
+"""
+Model Trainer Tool - AutoML and machine learning model training
+
+This tool provides AutoML capabilities with:
+- Automatic model selection for classification and regression
+- Hyperparameter tuning
+- Model evaluation and comparison
+- Feature importance analysis
+- Model explanation support
+"""
+
+import logging
+from typing import Dict, Any, List, Optional, Union
+from enum import Enum
+import pickle
+
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split, cross_val_score
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score, mean_squared_error
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
+from sklearn.linear_model import LogisticRegression, LinearRegression
+from sklearn.preprocessing import LabelEncoder
+from pydantic import BaseModel, Field, ValidationError, ConfigDict
+
+from aiecs.tools.base_tool import BaseTool
+from aiecs.tools import register_tool
+
+
+class ModelType(str, Enum):
+    """Supported model types"""
+    LOGISTIC_REGRESSION = "logistic_regression"
+    LINEAR_REGRESSION = "linear_regression"
+    RANDOM_FOREST_CLASSIFIER = "random_forest_classifier"
+    RANDOM_FOREST_REGRESSOR = "random_forest_regressor"
+    GRADIENT_BOOSTING_CLASSIFIER = "gradient_boosting_classifier"
+    GRADIENT_BOOSTING_REGRESSOR = "gradient_boosting_regressor"
+    AUTO = "auto"
+
+
+class TaskType(str, Enum):
+    """Machine learning task types"""
+    CLASSIFICATION = "classification"
+    REGRESSION = "regression"
+    CLUSTERING = "clustering"
+
+
+
+
+class ModelTrainerError(Exception):
+    """Base exception for ModelTrainer errors"""
+    pass
+
+
+class TrainingError(ModelTrainerError):
+    """Raised when model training fails"""
+    pass
+
+
+@register_tool('model_trainer')
+class ModelTrainerTool(BaseTool):
+    """
+    AutoML tool that can:
+    1. Train multiple model types
+    2. Perform hyperparameter tuning
+    3. Evaluate and compare models
+    4. Generate feature importance
+    5. Provide model explanations
+    """
+
+    # Configuration schema
+    class Config(BaseModel):
+        """Configuration for the model trainer tool"""
+        model_config = ConfigDict(env_prefix="MODEL_TRAINER_")
+
+        test_size: float = Field(
+            default=0.2,
+            description="Proportion of data to use for testing"
+        )
+        random_state: int = Field(
+            default=42,
+            description="Random state for reproducibility"
+        )
+        cv_folds: int = Field(
+            default=5,
+            description="Number of cross-validation folds"
+        )
+        enable_hyperparameter_tuning: bool = Field(
+            default=False,
+            description="Whether to enable hyperparameter tuning"
+        )
+        max_tuning_iterations: int = Field(
+            default=20,
+            description="Maximum number of hyperparameter tuning iterations"
+        )
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """Initialize ModelTrainerTool with settings"""
+        super().__init__(config)
+
+        # Parse configuration
+        self.config = self.Config(**(config or {}))
+
+        self.logger = logging.getLogger(__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
+            self.logger.addHandler(handler)
+        self.logger.setLevel(logging.INFO)
+
+        self._init_external_tools()
+        self.trained_models = {}
+
+    def _init_external_tools(self):
+        """Initialize external task tools"""
+        self.external_tools = {}
+
+    # Schema definitions
+    class TrainModelSchema(BaseModel):
+        """Schema for train_model operation"""
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Training data")
+        target: str = Field(description="Target column name")
+        model_type: ModelType = Field(default=ModelType.AUTO, description="Model type to train")
+        auto_tune: bool = Field(default=False, description="Enable hyperparameter tuning")
+        cross_validation: int = Field(default=5, description="Number of CV folds")
+
+    class AutoSelectModelSchema(BaseModel):
+        """Schema for auto_select_model operation"""
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data for model selection")
+        target: str = Field(description="Target column name")
+        task_type: Optional[TaskType] = Field(default=None, description="Task type")
+
+    class EvaluateModelSchema(BaseModel):
+        """Schema for evaluate_model operation"""
+        model_id: str = Field(description="ID of trained model")
+        test_data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Test data")
+        target: str = Field(description="Target column name")
+
+    def train_model(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        target: str,
+        model_type: ModelType = ModelType.AUTO,
+        auto_tune: bool = False,
+        cross_validation: int = 5
+    ) -> Dict[str, Any]:
+        """
+        Train and evaluate model.
+
+        Args:
+            data: Training data
+            target: Target column name
+            model_type: Type of model to train (auto-selected if AUTO)
+            auto_tune: Enable hyperparameter tuning
+            cross_validation: Number of cross-validation folds
+
+        Returns:
+            Dict containing:
+            - model_id: Unique identifier for trained model
+            - model_type: Type of model trained
+            - performance: Performance metrics
+            - feature_importance: Feature importance scores
+            - cross_validation_scores: CV scores
+        """
+        try:
+            df = self._to_dataframe(data)
+
+            # Separate features and target
+            X = df.drop(columns=[target])
+            y = df[target]
+
+            # Determine task type and model
+            task_type = self._determine_task_type(y)
+
+            if model_type == ModelType.AUTO:
+                model_type = self._auto_select_model_type(task_type)
+                self.logger.info(f"Auto-selected model type: {model_type.value}")
+
+            # Prepare data
+            X_processed, feature_names = self._preprocess_features(X)
+
+            # Handle categorical target for classification
+            if task_type == TaskType.CLASSIFICATION:
+                label_encoder = LabelEncoder()
+                y = label_encoder.fit_transform(y)
+            else:
+                label_encoder = None
+
+            # Split data
+            X_train, X_test, y_train, y_test = train_test_split(
+                X_processed, y,
+                test_size=self.config.test_size,
+                random_state=self.config.random_state
+            )
+
+            # Create and train model
+            model = self._create_model(model_type)
+            model.fit(X_train, y_train)
+
+            # Make predictions
+            y_pred = model.predict(X_test)
+
+            # Calculate metrics
+            performance = self._calculate_metrics(y_test, y_pred, task_type)
+
+            # Cross-validation
+            cv_scores = cross_val_score(model, X_processed, y, cv=cross_validation)
+
+            # Feature importance
+            feature_importance = self._get_feature_importance(model, feature_names)
+
+            # Store model
+            model_id = f"model_{len(self.trained_models) + 1}"
+            self.trained_models[model_id] = {
+                'model': model,
+                'model_type': model_type.value,
+                'task_type': task_type.value,
+                'feature_names': feature_names,
+                'label_encoder': label_encoder
+            }
+
+            return {
+                'model_id': model_id,
+                'model_type': model_type.value,
+                'task_type': task_type.value,
+                'performance': performance,
+                'feature_importance': feature_importance,
+                'cross_validation_scores': {
+                    'scores': cv_scores.tolist(),
+                    'mean': float(cv_scores.mean()),
+                    'std': float(cv_scores.std())
+                },
+                'training_samples': len(X_train),
+                'test_samples': len(X_test)
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error training model: {e}")
+            raise TrainingError(f"Model training failed: {e}")
+
+    def auto_select_model(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        target: str,
+        task_type: Optional[TaskType] = None
+    ) -> Dict[str, Any]:
+        """
+        Automatically select best model based on data characteristics.
+
+        Args:
+            data: Data for model selection
+            target: Target column name
+            task_type: Optional task type (auto-determined if None)
+
+        Returns:
+            Dict containing recommended model and reasoning
+        """
+        try:
+            df = self._to_dataframe(data)
+            y = df[target]
+
+            # Determine task type
+            if task_type is None:
+                task_type = self._determine_task_type(y)
+
+            # Select model
+            model_type = self._auto_select_model_type(task_type)
+
+            # Provide reasoning
+            reasoning = self._explain_model_selection(df, y, task_type, model_type)
+
+            return {
+                'recommended_model': model_type.value,
+                'task_type': task_type.value,
+                'reasoning': reasoning,
+                'confidence': 'high'
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error in auto model selection: {e}")
+            raise TrainingError(f"Model selection failed: {e}")
+
+    def evaluate_model(
+        self,
+        model_id: str,
+        test_data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        target: str
+    ) -> Dict[str, Any]:
+        """
+        Evaluate trained model on test data.
+
+        Args:
+            model_id: ID of trained model
+            test_data: Test data
+            target: Target column name
+
+        Returns:
+            Dict containing evaluation metrics
+        """
+        try:
+            if model_id not in self.trained_models:
+                raise TrainingError(f"Model {model_id} not found")
+
+            df = self._to_dataframe(test_data)
+            X_test = df.drop(columns=[target])
+            y_test = df[target]
+
+            model_info = self.trained_models[model_id]
+            model = model_info['model']
+            task_type = TaskType(model_info['task_type'])
+
+            # Preprocess features
+            X_processed, _ = self._preprocess_features(X_test)
+
+            # Handle label encoding for classification
+            if model_info['label_encoder']:
+                y_test = model_info['label_encoder'].transform(y_test)
+
+            # Make predictions
+            y_pred = model.predict(X_processed)
+
+            # Calculate metrics
+            performance = self._calculate_metrics(y_test, y_pred, task_type)
+
+            return {
+                'model_id': model_id,
+                'performance': performance,
+                'test_samples': len(X_test)
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error evaluating model: {e}")
+            raise TrainingError(f"Model evaluation failed: {e}")
+
+    def tune_hyperparameters(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        target: str,
+        model_type: ModelType
+    ) -> Dict[str, Any]:
+        """
+        Tune hyperparameters for specified model type.
+
+        Args:
+            data: Training data
+            target: Target column name
+            model_type: Model type to tune
+
+        Returns:
+            Dict containing best parameters and performance
+        """
+        try:
+            # Note: Full hyperparameter tuning with GridSearchCV would be implemented here
+            # For now, returning placeholder structure
+            self.logger.info("Hyperparameter tuning is a placeholder - train with default params")
+
+            result = self.train_model(data, target, model_type, auto_tune=False)
+            result['tuning_note'] = "Using default parameters - full tuning not implemented"
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Error tuning hyperparameters: {e}")
+            raise TrainingError(f"Hyperparameter tuning failed: {e}")
+
+    # Internal helper methods
+
+    def _to_dataframe(self, data: Union[Dict, List, pd.DataFrame]) -> pd.DataFrame:
+        """Convert data to DataFrame"""
+        if isinstance(data, pd.DataFrame):
+            return data
+        elif isinstance(data, list):
+            return pd.DataFrame(data)
+        elif isinstance(data, dict):
+            return pd.DataFrame([data])
+        else:
+            raise TrainingError(f"Unsupported data type: {type(data)}")
+
+    def _determine_task_type(self, y: pd.Series) -> TaskType:
+        """Determine task type from target variable"""
+        if y.dtype in ['object', 'category', 'bool']:
+            return TaskType.CLASSIFICATION
+        elif y.nunique() < 10 and y.dtype in ['int64', 'int32']:
+            return TaskType.CLASSIFICATION
+        else:
+            return TaskType.REGRESSION
+
+    def _auto_select_model_type(self, task_type: TaskType) -> ModelType:
+        """Auto-select model type based on task"""
+        if task_type == TaskType.CLASSIFICATION:
+            return ModelType.RANDOM_FOREST_CLASSIFIER
+        else:
+            return ModelType.RANDOM_FOREST_REGRESSOR
+
+    def _create_model(self, model_type: ModelType):
+        """Create model instance"""
+        if model_type == ModelType.LOGISTIC_REGRESSION:
+            return LogisticRegression(random_state=self.config.random_state, max_iter=1000)
+        elif model_type == ModelType.LINEAR_REGRESSION:
+            return LinearRegression()
+        elif model_type == ModelType.RANDOM_FOREST_CLASSIFIER:
+            return RandomForestClassifier(random_state=self.config.random_state, n_estimators=100)
+        elif model_type == ModelType.RANDOM_FOREST_REGRESSOR:
+            return RandomForestRegressor(random_state=self.config.random_state, n_estimators=100)
+        elif model_type == ModelType.GRADIENT_BOOSTING_CLASSIFIER:
+            return GradientBoostingClassifier(random_state=self.config.random_state)
+        elif model_type == ModelType.GRADIENT_BOOSTING_REGRESSOR:
+            return GradientBoostingRegressor(random_state=self.config.random_state)
+        else:
+            raise TrainingError(f"Unsupported model type: {model_type}")
+
+    def _preprocess_features(self, X: pd.DataFrame) -> tuple:
+        """Preprocess features for training"""
+        X_processed = X.copy()
+
+        # Handle categorical variables with label encoding
+        for col in X_processed.select_dtypes(include=['object', 'category']).columns:
+            le = LabelEncoder()
+            X_processed[col] = le.fit_transform(X_processed[col].astype(str))
+
+        # Handle missing values
+        X_processed = X_processed.fillna(X_processed.mean(numeric_only=True))
+
+        feature_names = X_processed.columns.tolist()
+
+        return X_processed.values, feature_names
+
+    def _calculate_metrics(self, y_true, y_pred, task_type: TaskType) -> Dict[str, float]:
+        """Calculate performance metrics"""
+        if task_type == TaskType.CLASSIFICATION:
+            return {
+                'accuracy': float(accuracy_score(y_true, y_pred)),
+                'precision': float(precision_score(y_true, y_pred, average='weighted', zero_division=0)),
+                'recall': float(recall_score(y_true, y_pred, average='weighted', zero_division=0)),
+                'f1_score': float(f1_score(y_true, y_pred, average='weighted', zero_division=0))
+            }
+        else:
+            mse = mean_squared_error(y_true, y_pred)
+            return {
+                'r2_score': float(r2_score(y_true, y_pred)),
+                'mse': float(mse),
+                'rmse': float(np.sqrt(mse)),
+                'mae': float(np.mean(np.abs(y_true - y_pred)))
+            }
+
+    def _get_feature_importance(self, model, feature_names: List[str]) -> Dict[str, float]:
+        """Extract feature importance from model"""
+        if hasattr(model, 'feature_importances_'):
+            importance = model.feature_importances_
+            return {name: float(imp) for name, imp in zip(feature_names, importance)}
+        elif hasattr(model, 'coef_'):
+            importance = np.abs(model.coef_).flatten()
+            return {name: float(imp) for name, imp in zip(feature_names, importance)}
+        else:
+            return {}
+
+    def _explain_model_selection(self, df: pd.DataFrame, y: pd.Series, task_type: TaskType, model_type: ModelType) -> str:
+        """Explain why a model was selected"""
+        n_samples = len(df)
+        n_features = len(df.columns) - 1
+
+        reasons = []
+        reasons.append(f"Task type: {task_type.value}")
+        reasons.append(f"Dataset size: {n_samples} samples, {n_features} features")
+
+        if model_type in [ModelType.RANDOM_FOREST_CLASSIFIER, ModelType.RANDOM_FOREST_REGRESSOR]:
+            reasons.append("Random Forest selected for robust performance and feature importance")
+
+        return "; ".join(reasons)
+
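For orientation, the added file registers ModelTrainerTool under the tool name 'model_trainer' and exposes train_model, auto_select_model, evaluate_model, and tune_hyperparameters. The following is a minimal usage sketch, assuming the tool is constructed directly rather than resolved through the aiecs tool registry; the sample records are hypothetical and only illustrate the expected input shape.

import pandas as pd

from aiecs.tools.statistics.model_trainer_tool import ModelTrainerTool, ModelType

# Hypothetical toy dataset; any records containing the target column would do.
df = pd.DataFrame({
    "sepal_length": [5.1, 4.9, 4.7, 5.0, 6.3, 5.8, 6.7, 6.1],
    "petal_length": [1.4, 1.5, 1.3, 1.6, 4.7, 4.1, 5.7, 4.6],
    "species": ["setosa"] * 4 + ["versicolor"] * 4,
})

tool = ModelTrainerTool()

# AUTO infers the task type from the target dtype, then defaults to a random forest.
result = tool.train_model(data=df, target="species", model_type=ModelType.AUTO, cross_validation=3)
print(result["model_id"], result["model_type"], result["performance"])

# Re-evaluate the stored model; features are re-preprocessed independently here.
evaluation = tool.evaluate_model(model_id=result["model_id"], test_data=df, target="species")
print(evaluation["performance"])

Because train_model keeps the fitted estimator in the instance's trained_models dict, the returned model_id is only valid for the lifetime of that ModelTrainerTool instance.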