aiecs 1.0.8__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of aiecs has been flagged as possibly problematic.

Files changed (81)
  1. aiecs/__init__.py +1 -1
  2. aiecs/aiecs_client.py +159 -1
  3. aiecs/config/config.py +6 -0
  4. aiecs/domain/__init__.py +95 -0
  5. aiecs/domain/community/__init__.py +159 -0
  6. aiecs/domain/community/agent_adapter.py +516 -0
  7. aiecs/domain/community/analytics.py +465 -0
  8. aiecs/domain/community/collaborative_workflow.py +99 -7
  9. aiecs/domain/community/communication_hub.py +649 -0
  10. aiecs/domain/community/community_builder.py +322 -0
  11. aiecs/domain/community/community_integration.py +365 -12
  12. aiecs/domain/community/community_manager.py +481 -5
  13. aiecs/domain/community/decision_engine.py +459 -13
  14. aiecs/domain/community/exceptions.py +238 -0
  15. aiecs/domain/community/models/__init__.py +36 -0
  16. aiecs/domain/community/resource_manager.py +1 -1
  17. aiecs/domain/community/shared_context_manager.py +621 -0
  18. aiecs/domain/context/__init__.py +24 -0
  19. aiecs/domain/context/context_engine.py +37 -33
  20. aiecs/main.py +20 -2
  21. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  22. aiecs/scripts/aid/__init__.py +15 -0
  23. aiecs/scripts/aid/version_manager.py +224 -0
  24. aiecs/scripts/dependance_check/__init__.py +18 -0
  25. aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +51 -8
  26. aiecs/scripts/dependance_patch/__init__.py +8 -0
  27. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +12 -0
  28. aiecs/scripts/tools_develop/README.md +340 -0
  29. aiecs/scripts/tools_develop/__init__.py +16 -0
  30. aiecs/scripts/tools_develop/check_type_annotations.py +263 -0
  31. aiecs/scripts/tools_develop/validate_tool_schemas.py +346 -0
  32. aiecs/tools/__init__.py +53 -34
  33. aiecs/tools/docs/__init__.py +106 -0
  34. aiecs/tools/docs/ai_document_orchestrator.py +556 -0
  35. aiecs/tools/docs/ai_document_writer_orchestrator.py +2222 -0
  36. aiecs/tools/docs/content_insertion_tool.py +1234 -0
  37. aiecs/tools/docs/document_creator_tool.py +1179 -0
  38. aiecs/tools/docs/document_layout_tool.py +1105 -0
  39. aiecs/tools/docs/document_parser_tool.py +924 -0
  40. aiecs/tools/docs/document_writer_tool.py +1636 -0
  41. aiecs/tools/langchain_adapter.py +102 -51
  42. aiecs/tools/schema_generator.py +265 -0
  43. aiecs/tools/statistics/__init__.py +82 -0
  44. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +581 -0
  45. aiecs/tools/statistics/ai_insight_generator_tool.py +473 -0
  46. aiecs/tools/statistics/ai_report_orchestrator_tool.py +629 -0
  47. aiecs/tools/statistics/data_loader_tool.py +518 -0
  48. aiecs/tools/statistics/data_profiler_tool.py +599 -0
  49. aiecs/tools/statistics/data_transformer_tool.py +531 -0
  50. aiecs/tools/statistics/data_visualizer_tool.py +460 -0
  51. aiecs/tools/statistics/model_trainer_tool.py +470 -0
  52. aiecs/tools/statistics/statistical_analyzer_tool.py +426 -0
  53. aiecs/tools/task_tools/chart_tool.py +2 -1
  54. aiecs/tools/task_tools/image_tool.py +43 -43
  55. aiecs/tools/task_tools/office_tool.py +48 -36
  56. aiecs/tools/task_tools/pandas_tool.py +37 -33
  57. aiecs/tools/task_tools/report_tool.py +67 -56
  58. aiecs/tools/task_tools/research_tool.py +32 -31
  59. aiecs/tools/task_tools/scraper_tool.py +53 -46
  60. aiecs/tools/task_tools/search_tool.py +1123 -0
  61. aiecs/tools/task_tools/stats_tool.py +20 -15
  62. {aiecs-1.0.8.dist-info → aiecs-1.2.0.dist-info}/METADATA +5 -1
  63. aiecs-1.2.0.dist-info/RECORD +135 -0
  64. aiecs-1.2.0.dist-info/entry_points.txt +10 -0
  65. aiecs/tools/task_tools/search_api.py +0 -7
  66. aiecs-1.0.8.dist-info/RECORD +0 -98
  67. aiecs-1.0.8.dist-info/entry_points.txt +0 -7
  68. /aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +0 -0
  69. /aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +0 -0
  70. /aiecs/scripts/{dependency_checker.py → dependance_check/dependency_checker.py} +0 -0
  71. /aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +0 -0
  72. /aiecs/scripts/{quick_dependency_check.py → dependance_check/quick_dependency_check.py} +0 -0
  73. /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
  74. /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
  75. /aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +0 -0
  76. /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
  77. /aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +0 -0
  78. /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
  79. {aiecs-1.0.8.dist-info → aiecs-1.2.0.dist-info}/WHEEL +0 -0
  80. {aiecs-1.0.8.dist-info → aiecs-1.2.0.dist-info}/licenses/LICENSE +0 -0
  81. {aiecs-1.0.8.dist-info → aiecs-1.2.0.dist-info}/top_level.txt +0 -0
aiecs/tools/statistics/data_transformer_tool.py (new file)
@@ -0,0 +1,531 @@
+"""
+Data Transformer Tool - Data cleaning, transformation, and feature engineering
+
+This tool provides comprehensive data transformation capabilities with:
+- Data cleaning and preprocessing
+- Feature engineering and encoding
+- Normalization and standardization
+- Transformation pipelines
+- Missing value handling
+"""
+
+import logging
+from typing import Dict, Any, List, Optional, Union
+from enum import Enum
+
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder
+from sklearn.impute import SimpleImputer
+from pydantic import BaseModel, Field, ValidationError, ConfigDict
+
+from aiecs.tools.base_tool import BaseTool
+from aiecs.tools import register_tool
+
+
+class TransformationType(str, Enum):
+    """Types of transformations"""
+    # Cleaning operations
+    REMOVE_DUPLICATES = "remove_duplicates"
+    FILL_MISSING = "fill_missing"
+    REMOVE_OUTLIERS = "remove_outliers"
+
+    # Transformation operations
+    NORMALIZE = "normalize"
+    STANDARDIZE = "standardize"
+    LOG_TRANSFORM = "log_transform"
+    BOX_COX = "box_cox"
+
+    # Encoding operations
+    ONE_HOT_ENCODE = "one_hot_encode"
+    LABEL_ENCODE = "label_encode"
+    TARGET_ENCODE = "target_encode"
+
+    # Feature engineering
+    POLYNOMIAL_FEATURES = "polynomial_features"
+    INTERACTION_FEATURES = "interaction_features"
+    BINNING = "binning"
+    AGGREGATION = "aggregation"
+
+
+class MissingValueStrategy(str, Enum):
+    """Strategies for handling missing values"""
+    DROP = "drop"
+    MEAN = "mean"
+    MEDIAN = "median"
+    MODE = "mode"
+    FORWARD_FILL = "forward_fill"
+    BACKWARD_FILL = "backward_fill"
+    INTERPOLATE = "interpolate"
+    CONSTANT = "constant"
+
+
+class DataTransformerError(Exception):
+    """Base exception for DataTransformer errors"""
+    pass
+
+
+class TransformationError(DataTransformerError):
+    """Raised when transformation fails"""
+    pass
+
+
+@register_tool('data_transformer')
+class DataTransformerTool(BaseTool):
+    """
+    Advanced data transformation tool that can:
+    1. Clean and preprocess data
+    2. Engineer features
+    3. Transform and normalize data
+    4. Build transformation pipelines
+
+    Integrates with pandas_tool for core operations.
+    """
+
+    # Configuration schema
+    class Config(BaseModel):
+        """Configuration for the data transformer tool"""
+        model_config = ConfigDict(env_prefix="DATA_TRANSFORMER_")
+
+        outlier_std_threshold: float = Field(
+            default=3.0,
+            description="Standard deviation threshold for outlier detection"
+        )
+        default_missing_strategy: str = Field(
+            default="mean",
+            description="Default strategy for handling missing values"
+        )
+        enable_pipeline_caching: bool = Field(
+            default=True,
+            description="Whether to enable transformation pipeline caching"
+        )
+        max_one_hot_categories: int = Field(
+            default=10,
+            description="Maximum number of categories for one-hot encoding"
+        )
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize DataTransformerTool with settings.
+
+        Args:
+            config: Optional configuration overrides
+        """
+        super().__init__(config)
+
+        # Parse configuration
+        self.config = self.Config(**(config or {}))
+
+        self.logger = logging.getLogger(__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
+            self.logger.addHandler(handler)
+        self.logger.setLevel(logging.INFO)
+
+        # Initialize external tools
+        self._init_external_tools()
+
+        # Initialize transformation pipeline cache
+        self.pipeline_cache = {}
+
+    def _init_external_tools(self):
+        """Initialize external task tools"""
+        self.external_tools = {}
+
+        # Initialize PandasTool for data operations
+        try:
+            from aiecs.tools.task_tools.pandas_tool import PandasTool
+            self.external_tools['pandas'] = PandasTool()
+            self.logger.info("PandasTool initialized successfully")
+        except ImportError:
+            self.logger.warning("PandasTool not available")
+            self.external_tools['pandas'] = None
+
+    # Schema definitions
+    class TransformDataSchema(BaseModel):
+        """Schema for transform_data operation"""
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to transform")
+        transformations: List[Dict[str, Any]] = Field(description="List of transformation steps")
+        validate: bool = Field(default=True, description="Validate transformations")
+
+    class AutoTransformSchema(BaseModel):
+        """Schema for auto_transform operation"""
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to transform")
+        target_column: Optional[str] = Field(default=None, description="Target column name")
+        task_type: Optional[str] = Field(default=None, description="Task type: classification or regression")
+
+    class HandleMissingValuesSchema(BaseModel):
+        """Schema for handle_missing_values operation"""
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data with missing values")
+        strategy: MissingValueStrategy = Field(default=MissingValueStrategy.MEAN, description="Strategy for handling missing values")
+        columns: Optional[List[str]] = Field(default=None, description="Specific columns to handle")
+        fill_value: Optional[Any] = Field(default=None, description="Value for constant strategy")
+
+    class EncodeFeaturesSchema(BaseModel):
+        """Schema for encode_features operation"""
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to encode")
+        columns: List[str] = Field(description="Columns to encode")
+        method: str = Field(default="one_hot", description="Encoding method: one_hot or label")
+
+    def transform_data(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        transformations: List[Dict[str, Any]],
+        validate: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Apply transformation pipeline to data.
+
+        Args:
+            data: Data to transform
+            transformations: List of transformation steps, each containing:
+                - type: TransformationType
+                - columns: List of columns (optional)
+                - params: Additional parameters
+            validate: Whether to validate transformations
+
+        Returns:
+            Dict containing:
+                - transformed_data: Transformed DataFrame
+                - transformation_log: Log of applied transformations
+                - quality_improvement: Quality metrics comparison
+
+        Raises:
+            TransformationError: If transformation fails
+        """
+        try:
+            df = self._to_dataframe(data)
+            original_df = df.copy()
+
+            transformation_log = []
+
+            for i, transform in enumerate(transformations):
+                trans_type = transform.get('type')
+                columns = transform.get('columns')
+                params = transform.get('params', {})
+
+                self.logger.info(f"Applying transformation {i+1}/{len(transformations)}: {trans_type}")
+
+                # Apply transformation
+                df = self._apply_single_transformation(df, trans_type, columns, params)
+
+                transformation_log.append({
+                    'step': i + 1,
+                    'type': trans_type,
+                    'columns': columns,
+                    'params': params,
+                    'status': 'success'
+                })
+
+            # Calculate quality improvement
+            quality_improvement = self._calculate_quality_improvement(original_df, df)
+
+            return {
+                'transformed_data': df,
+                'transformation_log': transformation_log,
+                'quality_improvement': quality_improvement,
+                'original_shape': original_df.shape,
+                'new_shape': df.shape
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error in transformation pipeline: {e}")
+            raise TransformationError(f"Transformation failed: {e}")
+
+    def auto_transform(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        target_column: Optional[str] = None,
+        task_type: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """
+        Automatically determine and apply optimal transformations.
+
+        Args:
+            data: Data to transform
+            target_column: Target column for ML tasks
+            task_type: Type of task (classification or regression)
+
+        Returns:
+            Dict containing transformed data and applied transformations
+        """
+        try:
+            df = self._to_dataframe(data)
+
+            # Determine transformations needed
+            transformations = self._determine_transformations(df, target_column, task_type)
+
+            # Apply transformations
+            result = self.transform_data(df, transformations, validate=True)
+            result['auto_detected_transformations'] = transformations
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Error in auto transform: {e}")
+            raise TransformationError(f"Auto transform failed: {e}")
+
+    def handle_missing_values(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        strategy: MissingValueStrategy = MissingValueStrategy.MEAN,
+        columns: Optional[List[str]] = None,
+        fill_value: Optional[Any] = None
+    ) -> Dict[str, Any]:
+        """
+        Handle missing values in data.
+
+        Args:
+            data: Data with missing values
+            strategy: Strategy for handling missing values
+            columns: Specific columns to handle (None for all)
+            fill_value: Value for constant strategy
+
+        Returns:
+            Dict containing data with handled missing values
+        """
+        try:
+            df = self._to_dataframe(data)
+            original_missing = df.isnull().sum().sum()
+
+            # Select columns to handle
+            cols_to_handle = columns if columns else df.columns.tolist()
+
+            # Apply strategy
+            if strategy == MissingValueStrategy.DROP:
+                df = df.dropna(subset=cols_to_handle)
+            elif strategy == MissingValueStrategy.MEAN:
+                for col in cols_to_handle:
+                    if df[col].dtype in ['int64', 'float64']:
+                        df[col] = df[col].fillna(df[col].mean())
+            elif strategy == MissingValueStrategy.MEDIAN:
+                for col in cols_to_handle:
+                    if df[col].dtype in ['int64', 'float64']:
+                        df[col] = df[col].fillna(df[col].median())
+            elif strategy == MissingValueStrategy.MODE:
+                for col in cols_to_handle:
+                    if not df[col].mode().empty:
+                        df[col] = df[col].fillna(df[col].mode()[0])
+            elif strategy == MissingValueStrategy.FORWARD_FILL:
+                df[cols_to_handle] = df[cols_to_handle].ffill()
+            elif strategy == MissingValueStrategy.BACKWARD_FILL:
+                df[cols_to_handle] = df[cols_to_handle].bfill()
+            elif strategy == MissingValueStrategy.INTERPOLATE:
+                for col in cols_to_handle:
+                    if df[col].dtype in ['int64', 'float64']:
+                        df[col] = df[col].interpolate()
+            elif strategy == MissingValueStrategy.CONSTANT:
+                df[cols_to_handle] = df[cols_to_handle].fillna(fill_value)
+
+            final_missing = df.isnull().sum().sum()
+
+            return {
+                'data': df,
+                'original_missing': int(original_missing),
+                'final_missing': int(final_missing),
+                'missing_handled': int(original_missing - final_missing),
+                'strategy': strategy.value
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error handling missing values: {e}")
+            raise TransformationError(f"Failed to handle missing values: {e}")
+
+    def encode_features(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        columns: List[str],
+        method: str = "one_hot"
+    ) -> Dict[str, Any]:
+        """
+        Encode categorical features.
+
+        Args:
+            data: Data to encode
+            columns: Columns to encode
+            method: Encoding method (one_hot or label)
+
+        Returns:
+            Dict containing encoded data
+        """
+        try:
+            df = self._to_dataframe(data)
+
+            if method == "one_hot":
+                # One-hot encoding
+                df_encoded = pd.get_dummies(df, columns=columns, prefix=columns)
+                encoding_info = {
+                    'method': 'one_hot',
+                    'original_columns': columns,
+                    'new_columns': [col for col in df_encoded.columns if col not in df.columns]
+                }
+            elif method == "label":
+                # Label encoding
+                df_encoded = df.copy()
+                encoders = {}
+                for col in columns:
+                    le = LabelEncoder()
+                    df_encoded[col] = le.fit_transform(df[col].astype(str))
+                    encoders[col] = le
+                encoding_info = {
+                    'method': 'label',
+                    'columns': columns,
+                    'encoders': encoders
+                }
+            else:
+                raise TransformationError(f"Unsupported encoding method: {method}")
+
+            return {
+                'data': df_encoded,
+                'encoding_info': encoding_info,
+                'original_shape': df.shape,
+                'new_shape': df_encoded.shape
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error encoding features: {e}")
+            raise TransformationError(f"Feature encoding failed: {e}")
+
+    # Internal helper methods
+
+    def _to_dataframe(self, data: Union[Dict, List, pd.DataFrame]) -> pd.DataFrame:
+        """Convert data to DataFrame"""
+        if isinstance(data, pd.DataFrame):
+            return data
+        elif isinstance(data, list):
+            return pd.DataFrame(data)
+        elif isinstance(data, dict):
+            return pd.DataFrame([data])
+        else:
+            raise TransformationError(f"Unsupported data type: {type(data)}")
+
+    def _apply_single_transformation(self, df: pd.DataFrame, trans_type: str, columns: Optional[List[str]], params: Dict[str, Any]) -> pd.DataFrame:
+        """Apply a single transformation"""
+        if trans_type == TransformationType.REMOVE_DUPLICATES.value:
+            return df.drop_duplicates()
+
+        elif trans_type == TransformationType.FILL_MISSING.value:
+            strategy = params.get('strategy', 'mean')
+            for col in (columns or df.columns):
+                if df[col].isnull().any():
+                    if strategy == 'mean' and df[col].dtype in ['int64', 'float64']:
+                        df[col] = df[col].fillna(df[col].mean())
+                    elif strategy == 'median' and df[col].dtype in ['int64', 'float64']:
+                        df[col] = df[col].fillna(df[col].median())
+                    elif strategy == 'mode':
+                        if not df[col].mode().empty:
+                            df[col] = df[col].fillna(df[col].mode()[0])
+            return df
+
+        elif trans_type == TransformationType.REMOVE_OUTLIERS.value:
+            for col in (columns or df.select_dtypes(include=[np.number]).columns):
+                if df[col].std() > 0:
+                    z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
+                    df = df[z_scores < self.config.outlier_std_threshold]
+            return df
+
+        elif trans_type == TransformationType.STANDARDIZE.value:
+            scaler = StandardScaler()
+            cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
+            df[cols] = scaler.fit_transform(df[cols])
+            return df
+
+        elif trans_type == TransformationType.NORMALIZE.value:
+            scaler = MinMaxScaler()
+            cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
+            df[cols] = scaler.fit_transform(df[cols])
+            return df
+
+        elif trans_type == TransformationType.LOG_TRANSFORM.value:
+            cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
+            for col in cols:
+                if (df[col] > 0).all():
+                    df[col] = np.log(df[col])
+            return df
+
+        elif trans_type == TransformationType.ONE_HOT_ENCODE.value:
+            cols = columns or df.select_dtypes(include=['object']).columns.tolist()
+            return pd.get_dummies(df, columns=cols)
+
+        elif trans_type == TransformationType.LABEL_ENCODE.value:
+            cols = columns or df.select_dtypes(include=['object']).columns.tolist()
+            for col in cols:
+                le = LabelEncoder()
+                df[col] = le.fit_transform(df[col].astype(str))
+            return df
+
+        else:
+            self.logger.warning(f"Transformation type {trans_type} not implemented, skipping")
+            return df
+
+    def _determine_transformations(self, df: pd.DataFrame, target_column: Optional[str], task_type: Optional[str]) -> List[Dict[str, Any]]:
+        """Determine transformations needed for data"""
+        transformations = []
+
+        # Remove duplicates if present
+        if df.duplicated().sum() > 0:
+            transformations.append({
+                'type': TransformationType.REMOVE_DUPLICATES.value,
+                'columns': None,
+                'params': {}
+            })
+
+        # Handle missing values
+        if df.isnull().sum().sum() > 0:
+            transformations.append({
+                'type': TransformationType.FILL_MISSING.value,
+                'columns': None,
+                'params': {'strategy': 'mean'}
+            })
+
+        # Encode categorical variables
+        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
+        if target_column and target_column in categorical_cols:
+            categorical_cols.remove(target_column)
+
+        if len(categorical_cols) > 0:
+            # Use label encoding if too many categories, otherwise one-hot
+            for col in categorical_cols:
+                if df[col].nunique() > self.config.max_one_hot_categories:
+                    transformations.append({
+                        'type': TransformationType.LABEL_ENCODE.value,
+                        'columns': [col],
+                        'params': {}
+                    })
+                else:
+                    transformations.append({
+                        'type': TransformationType.ONE_HOT_ENCODE.value,
+                        'columns': [col],
+                        'params': {}
+                    })
+
+        # Standardize numeric features
+        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+        if target_column and target_column in numeric_cols:
+            numeric_cols.remove(target_column)
+
+        if len(numeric_cols) > 0:
+            transformations.append({
+                'type': TransformationType.STANDARDIZE.value,
+                'columns': numeric_cols,
+                'params': {}
+            })
+
+        return transformations
+
+    def _calculate_quality_improvement(self, original_df: pd.DataFrame, transformed_df: pd.DataFrame) -> Dict[str, Any]:
+        """Calculate quality improvement metrics"""
+        return {
+            'missing_before': int(original_df.isnull().sum().sum()),
+            'missing_after': int(transformed_df.isnull().sum().sum()),
+            'duplicates_before': int(original_df.duplicated().sum()),
+            'duplicates_after': int(transformed_df.duplicated().sum()),
+            'rows_before': len(original_df),
+            'rows_after': len(transformed_df),
+            'columns_before': len(original_df.columns),
+            'columns_after': len(transformed_df.columns)
+        }
+
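
For context, a minimal usage sketch of the new tool follows, inferred only from the signatures visible in this diff. The import path mirrors the file list above; the default construction and the behavior of BaseTool are assumptions, not documented API.

    # Hypothetical usage sketch for DataTransformerTool (assumed import path).
    from aiecs.tools.statistics.data_transformer_tool import DataTransformerTool

    tool = DataTransformerTool()

    rows = [
        {"age": 34, "income": 72000.0, "city": "Oslo"},
        {"age": None, "income": 51000.0, "city": "Bergen"},
        {"age": 29, "income": None, "city": "Oslo"},
    ]

    # Fill numeric gaps with column means, then standardize the numeric columns.
    result = tool.transform_data(
        data=rows,
        transformations=[
            {"type": "fill_missing", "columns": ["age", "income"], "params": {"strategy": "mean"}},
            {"type": "standardize", "columns": ["age", "income"], "params": {}},
        ],
    )
    print(result["new_shape"], result["transformation_log"])

    # Or let the tool infer a pipeline (duplicate removal, imputation, encoding, scaling).
    auto = tool.auto_transform(data=rows, target_column="income", task_type="regression")
    print(auto["auto_detected_transformations"])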