mcli-framework 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcli-framework might be problematic. Click here for more details.

Files changed (186) hide show
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,607 @@
1
+ """Ensemble feature engineering and feature interaction systems"""
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from datetime import datetime, timedelta
6
+ from typing import Any, Dict, List, Optional, Tuple, Union, Callable
7
+ from dataclasses import dataclass
8
+ import logging
9
+ from itertools import combinations
10
+ import warnings
11
+ from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
12
+ from sklearn.preprocessing import PolynomialFeatures
13
+ from sklearn.decomposition import PCA
14
+ from sklearn.cluster import KMeans
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
@dataclass
class EnsembleFeatureConfig:
    """Configuration for ensemble feature engineering.

    The list-valued options default to ``None`` and are replaced with a fresh
    default list in ``__post_init__``, so instances never share a mutable
    default. Passing ``None`` explicitly therefore also selects the default.
    """

    # Feature interaction settings
    max_interaction_degree: int = 2
    max_features_for_interactions: int = 50
    interaction_selection_method: str = "mutual_info"  # "mutual_info", "f_test", "correlation"

    # Polynomial feature settings
    enable_polynomial_features: bool = True
    polynomial_degree: int = 2
    include_bias: bool = False

    # Clustering features
    enable_clustering_features: bool = True
    n_clusters: int = 5
    # Column names fed to K-means; None selects the defaults set in __post_init__.
    clustering_features: Optional[List[str]] = None

    # Feature selection settings
    feature_selection_k: int = 100
    selection_score_func: str = "f_regression"  # "f_regression", "mutual_info"

    # Rolling feature aggregations; None selects the defaults set in __post_init__.
    rolling_windows: Optional[List[int]] = None
    rolling_functions: Optional[List[str]] = None

    def __post_init__(self):
        """Fill in the default lists for any list option left as ``None``."""
        if self.clustering_features is None:
            self.clustering_features = [
                "total_influence",
                "transaction_amount_cleaned",
                "trading_frequency_score",
                "volatility_20",
                "rsi",
            ]

        if self.rolling_windows is None:
            self.rolling_windows = [5, 10, 20, 50]

        if self.rolling_functions is None:
            self.rolling_functions = ["mean", "std", "min", "max", "skew"]
61
+
62
+
63
class EnsembleFeatureBuilder:
    """Builds comprehensive feature sets for ensemble models.

    Starting from a frame of base features, the builder layers on rolling-window
    aggregates, pairwise interactions, polynomial terms, K-means cluster
    descriptors, statistical transforms and rank features. Every stage is
    best-effort: a failure for one feature is logged and processing continues.
    """

    def __init__(self, config: "Optional[EnsembleFeatureConfig]" = None):
        """Store the configuration, falling back to the default config."""
        self.config = config or EnsembleFeatureConfig()
        self.feature_importance_cache = {}
        self.interaction_cache = {}

    def build_ensemble_features(
        self,
        base_features: pd.DataFrame,
        target_column: Optional[str] = None,
        include_interactions: bool = True,
        include_clustering: bool = True,
        include_rolling: bool = True,
    ) -> pd.DataFrame:
        """Build comprehensive feature set for ensemble models.

        Args:
            base_features: Input frame; it is copied, never modified.
            target_column: Optional name of the label column. It is used to rank
                interaction candidates and is itself excluded from feature
                generation so no feature is derived from the label.
            include_interactions: Whether to add pairwise interaction terms.
            include_clustering: Whether to add K-means cluster features (also
                requires ``config.enable_clustering_features``).
            include_rolling: Whether to add rolling-window aggregates.

        Returns:
            A new frame containing the original columns plus generated ones.
            When a ``transaction_date_dt`` column is present and rolling features
            are enabled, rows are returned sorted by that column.
        """
        logger.info("Building ensemble features")
        df = base_features.copy()

        # Never derive features from the label itself: that would leak the
        # target into the feature matrix.
        numerical_features = [
            feature
            for feature in self._get_numerical_features(df)
            if feature != target_column
        ]
        logger.info(f"Processing {len(numerical_features)} numerical features")

        if include_rolling and numerical_features:
            df = self._add_rolling_features(df, numerical_features)

        if include_interactions and numerical_features:
            df = self._add_interaction_features(df, numerical_features, target_column)

        # Polynomial expansion grows quickly, so only a leading subset is used.
        if self.config.enable_polynomial_features and numerical_features:
            df = self._add_polynomial_features(df, numerical_features[:10])

        if include_clustering and self.config.enable_clustering_features:
            df = self._add_clustering_features(df)

        df = self._add_statistical_features(df, numerical_features)
        df = self._add_rank_features(df, numerical_features)

        logger.info(f"Final feature count: {len(df.columns)}")
        return df

    def _get_numerical_features(self, df: pd.DataFrame) -> List[str]:
        """Get list of numerical feature columns.

        A column qualifies when its dtype is ``int64`` or ``float64``, its name
        does not start with ``target_`` or end with ``_id``, it is not named
        ``index``, and it holds at least one non-null value.
        """
        numerical_features = []
        for col in df.columns:
            # Column labels are not guaranteed to be strings (e.g. integer
            # labels), so compare on their string form.
            name = str(col)
            if (
                df[col].dtype in ["int64", "float64"]
                and not name.startswith("target_")
                and not name.endswith("_id")
                and col not in ["index"]
                and df[col].notna().sum() > 0
            ):
                numerical_features.append(col)
        return numerical_features

    def _add_rolling_features(
        self, df: pd.DataFrame, numerical_features: List[str]
    ) -> pd.DataFrame:
        """Add rolling window aggregation features.

        For each configured window smaller than the frame and each of the first
        20 features, adds the rolling mean, std, percentile rank and z-score.
        """
        logger.info("Adding rolling aggregation features")

        if "transaction_date_dt" not in df.columns:
            # No date column: rows are used in their existing order. The
            # synthetic index column is kept in the output for compatibility.
            df["synthetic_time_index"] = range(len(df))
        else:
            df = df.sort_values("transaction_date_dt")

        # Cap the number of features to keep the output width manageable.
        features_for_rolling = numerical_features[:20]

        for window in self.config.rolling_windows:
            if window >= len(df):
                continue

            for feature in features_for_rolling:
                if feature not in df.columns:
                    continue

                try:
                    roller = df[feature].rolling(window=window, min_periods=1)
                    rolling_mean = roller.mean()
                    rolling_std = roller.std()
                    prefix = f"{feature}_rolling_{window}"

                    df[f"{prefix}_mean"] = rolling_mean
                    df[f"{prefix}_std"] = rolling_std

                    # Percentile rank of the current value within its window.
                    df[f"{prefix}_rank"] = roller.rank(pct=True)

                    # Epsilon keeps the z-score finite when the window is flat.
                    df[f"{prefix}_zscore"] = (df[feature] - rolling_mean) / (
                        rolling_std + 1e-8
                    )

                except Exception as e:
                    logger.warning(f"Failed to create rolling features for {feature}: {e}")

        return df

    def _select_interaction_candidates(
        self, df: pd.DataFrame, numerical_features: List[str], target_column: Optional[str]
    ) -> List[str]:
        """Pick the features allowed to take part in pairwise interactions.

        When there are more features than ``max_features_for_interactions``,
        keep the top ones ranked by absolute correlation with the target (if a
        target column is available) or otherwise by variance.
        """
        limit = self.config.max_features_for_interactions
        if len(numerical_features) <= limit:
            return numerical_features

        use_target = bool(target_column) and target_column in df.columns
        scored = []
        for feature in numerical_features:
            try:
                if use_target:
                    score = abs(df[feature].corr(df[target_column]))
                else:
                    score = df[feature].var()
            except Exception:
                score = 0
            # NaN scores (e.g. constant columns) would make the sort order
            # ill-defined, so rank them last.
            if pd.isna(score):
                score = 0
            scored.append((feature, score))

        scored.sort(key=lambda item: item[1], reverse=True)
        return [feature for feature, _ in scored[:limit]]

    def _add_interaction_features(
        self, df: pd.DataFrame, numerical_features: List[str], target_column: Optional[str]
    ) -> pd.DataFrame:
        """Add feature interaction terms.

        For each candidate pair adds product, ratio and difference columns, plus
        a conditional column when either name contains "influence". Generation
        stops once roughly 200 interaction columns have been created.
        """
        logger.info("Adding feature interaction terms")

        selected_features = self._select_interaction_candidates(
            df, numerical_features, target_column
        )

        interaction_count = 0
        max_interactions = 200  # Limit total interactions

        for feature1, feature2 in combinations(selected_features, 2):
            if interaction_count >= max_interactions:
                break

            if feature1 not in df.columns or feature2 not in df.columns:
                continue

            try:
                # Multiplicative interaction
                df[f"{feature1}_x_{feature2}"] = df[feature1] * df[feature2]

                # Ratio interaction; |feature2| + epsilon avoids division by zero.
                df[f"{feature1}_div_{feature2}"] = df[feature1] / (abs(df[feature2]) + 1e-8)

                # Difference interaction
                df[f"{feature1}_minus_{feature2}"] = df[feature1] - df[feature2]

                interaction_count += 3

                # NOTE(review): the condition is always built from feature1,
                # even when only feature2's name contains "influence" --
                # confirm this is intended.
                if "influence" in str(feature1).lower() or "influence" in str(feature2).lower():
                    high_influence = df[feature1] > df[feature1].quantile(0.7)
                    df[f"{feature2}_when_high_{feature1}"] = np.where(
                        high_influence, df[feature2], 0
                    )
                    interaction_count += 1

            except Exception as e:
                logger.warning(f"Failed to create interaction {feature1} x {feature2}: {e}")

        logger.info(f"Created {interaction_count} interaction features")
        return df

    def _add_polynomial_features(
        self, df: pd.DataFrame, selected_features: List[str]
    ) -> pd.DataFrame:
        """Add polynomial features for key variables.

        Only the first five of ``selected_features`` are expanded. The bias and
        degree-1 terms are skipped because they duplicate existing columns; the
        remaining terms are added with a ``poly_`` prefix.
        """
        logger.info("Adding polynomial features")

        # Limit to top features to avoid memory issues
        features_for_poly = selected_features[:5]

        try:
            poly = PolynomialFeatures(
                degree=self.config.polynomial_degree,
                include_bias=self.config.include_bias,
                interaction_only=False,
            )

            # Missing values are treated as zero for the expansion.
            poly_data = df[features_for_poly].fillna(0)

            if len(poly_data) > 0 and len(features_for_poly) > 0:
                poly_features = poly.fit_transform(poly_data)
                poly_feature_names = poly.get_feature_names_out(features_for_poly)

                # Output is ordered [bias (optional), degree-1 terms, higher
                # degrees]; skip the bias column as well when it is present.
                start_idx = len(features_for_poly) + int(bool(self.config.include_bias))
                for i, name in enumerate(poly_feature_names[start_idx:], start_idx):
                    df[f"poly_{name}"] = poly_features[:, i]

                logger.info(
                    f"Added {len(poly_feature_names) - start_idx} polynomial features"
                )

        except Exception as e:
            logger.warning(f"Failed to create polynomial features: {e}")

        return df

    def _add_clustering_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add clustering-based features.

        Runs K-means on the configured clustering columns present in ``df``
        (at least two are required) and adds the cluster label, the distance to
        the assigned centroid, one-hot cluster indicators and per-cluster
        mean/std of each clustering column.
        """
        logger.info("Adding clustering features")

        clustering_features = [
            feature for feature in self.config.clustering_features if feature in df.columns
        ]

        if len(clustering_features) < 2:
            logger.warning("Insufficient features for clustering")
            return df

        try:
            # Missing values are treated as zero for clustering.
            cluster_data = df[clustering_features].fillna(0)

            kmeans = KMeans(n_clusters=self.config.n_clusters, random_state=42, n_init=10)
            cluster_labels = kmeans.fit_predict(cluster_data)

            df["cluster_label"] = cluster_labels

            # Distance from each row to its own centroid. Labels are positional,
            # so centroids are looked up by position rather than by index label
            # (the frame may have been re-sorted or carry a non-range index).
            assigned_centers = kmeans.cluster_centers_[cluster_labels]
            df["cluster_distance"] = np.linalg.norm(
                cluster_data.to_numpy(dtype=float) - assigned_centers, axis=1
            )

            for cluster_id in range(self.config.n_clusters):
                df[f"is_cluster_{cluster_id}"] = (df["cluster_label"] == cluster_id).astype(int)

            # Per-cluster statistics, broadcast back to every member row.
            cluster_stats = df.groupby("cluster_label")[clustering_features].agg(["mean", "std"])

            for feature in clustering_features:
                for stat in ["mean", "std"]:
                    cluster_stat_dict = cluster_stats[(feature, stat)].to_dict()
                    df[f"cluster_{feature}_{stat}"] = df["cluster_label"].map(cluster_stat_dict)

            logger.info(f"Added clustering features with {self.config.n_clusters} clusters")

        except Exception as e:
            logger.warning(f"Failed to create clustering features: {e}")

        return df

    def _add_statistical_features(
        self, df: pd.DataFrame, numerical_features: List[str]
    ) -> pd.DataFrame:
        """Add statistical transformation features.

        For each of the first 15 features adds log, square-root, inverse,
        z-score and 5-bin columns where applicable. Missing values are treated
        as zero before transforming.
        """
        logger.info("Adding statistical features")

        # Select subset of features for statistical transforms
        stat_features = numerical_features[:15]

        for feature in stat_features:
            if feature not in df.columns:
                continue

            try:
                feature_data = df[feature].fillna(0)

                # Log transform only when every value is strictly positive.
                if (feature_data > 0).all():
                    df[f"{feature}_log"] = np.log1p(feature_data)

                # Square root transform only when no value is negative.
                if (feature_data >= 0).all():
                    df[f"{feature}_sqrt"] = np.sqrt(feature_data)

                # Inverse transform (epsilon avoids division by zero)
                df[f"{feature}_inv"] = 1 / (abs(feature_data) + 1e-8)

                # Standardized (z-score); skipped for constant columns.
                mean_val = feature_data.mean()
                std_val = feature_data.std()
                if std_val > 0:
                    df[f"{feature}_zscore"] = (feature_data - mean_val) / std_val

                # Equal-width bins
                df[f"{feature}_binned"] = pd.cut(feature_data, bins=5, labels=False)

            except Exception as e:
                logger.warning(f"Failed to create statistical features for {feature}: {e}")

        return df

    def _add_rank_features(self, df: pd.DataFrame, numerical_features: List[str]) -> pd.DataFrame:
        """Add rank-based features.

        For each of the first 10 features adds its percentile rank and its
        decile (fewer bins when duplicate edges are dropped).
        """
        logger.info("Adding rank features")

        # Select subset for ranking
        rank_features = numerical_features[:10]

        for feature in rank_features:
            if feature not in df.columns:
                continue

            try:
                # Percentile rank
                df[f"{feature}_pct_rank"] = df[feature].rank(pct=True)

                # Quantile binning
                df[f"{feature}_quantile"] = pd.qcut(
                    df[feature], q=10, labels=False, duplicates="drop"
                )

            except Exception as e:
                logger.warning(f"Failed to create rank features for {feature}: {e}")

        return df
417
+
418
+
419
class FeatureInteractionEngine:
    """Advanced feature interaction discovery and generation."""

    def __init__(self, config: "Optional[EnsembleFeatureConfig]" = None):
        """Store the configuration, falling back to the default config."""
        self.config = config or EnsembleFeatureConfig()

    def discover_interactions(
        self,
        df: pd.DataFrame,
        target_column: str,
        max_interactions: int = 50,
        min_correlation: float = 0.1,
    ) -> List[Tuple[str, str, float]]:
        """Discover important feature interactions based on target correlation.

        Every pair of numerical features is multiplied together and the absolute
        correlation of that product with the target is measured.

        Args:
            df: Frame holding the features and the target.
            target_column: Name of the target column. It is never used as one
                side of an interaction.
            max_interactions: Maximum number of interactions to return.
            min_correlation: Pairs whose absolute correlation is not strictly
                greater than this value are discarded.

        Returns:
            ``(feature1, feature2, abs_correlation)`` tuples sorted by
            correlation, strongest first. Empty when the target column is
            missing from ``df``.
        """
        if target_column not in df.columns:
            logger.warning(
                f"Target column {target_column!r} not found; no interactions discovered"
            )
            return []

        target = df[target_column]
        # Pairing the target with a feature would leak the label into the score.
        numerical_features = [
            feature
            for feature in self._get_numerical_features(df)
            if feature != target_column
        ]
        interactions = []

        logger.info(f"Discovering interactions among {len(numerical_features)} features")

        for feature1, feature2 in combinations(numerical_features, 2):
            try:
                interaction_term = df[feature1] * df[feature2]
                correlation = abs(interaction_term.corr(target))

                if not np.isnan(correlation) and correlation > min_correlation:
                    interactions.append((feature1, feature2, correlation))

            except Exception as e:
                # Best-effort scan: one bad pair must not abort the discovery.
                logger.debug(f"Skipping interaction {feature1} x {feature2}: {e}")
                continue

        # Sort by correlation strength
        interactions.sort(key=lambda x: x[2], reverse=True)

        logger.info(f"Discovered {len(interactions)} significant interactions")
        return interactions[:max_interactions]

    def _get_numerical_features(self, df: pd.DataFrame) -> List[str]:
        """Get numerical features for interaction discovery.

        A column qualifies when its dtype is ``int64`` or ``float64``, its name
        does not start with ``target_`` and it has at least one non-null value.
        """
        return [
            col
            for col in df.columns
            if df[col].dtype in ["int64", "float64"]
            # Column labels may be non-strings, so compare on their string form.
            and not str(col).startswith("target_")
            and df[col].notna().sum() > 0
        ]

    def generate_advanced_interactions(
        self, df: pd.DataFrame, feature_pairs: List[Tuple[str, str]]
    ) -> pd.DataFrame:
        """Generate advanced interaction terms for discovered feature pairs.

        Args:
            df: Source frame; it is copied, never modified.
            feature_pairs: ``(feature1, feature2)`` pairs. Longer tuples such as
                the ``(feature1, feature2, score)`` triples returned by
                ``discover_interactions`` are accepted; extra items are ignored.

        Returns:
            A copy of ``df`` with, for each pair whose columns both exist, two
            median-conditioned columns, a squared-product column and
            element-wise min/max columns.
        """
        df_enhanced = df.copy()

        for pair in feature_pairs:
            feature1, feature2 = pair[0], pair[1]
            if feature1 not in df.columns or feature2 not in df.columns:
                continue

            try:
                # Conditional interactions: keep a value only where the other
                # feature is above its median.
                df_enhanced[f"{feature1}_when_high_{feature2}"] = np.where(
                    df[feature2] > df[feature2].median(), df[feature1], 0
                )

                df_enhanced[f"{feature2}_when_high_{feature1}"] = np.where(
                    df[feature1] > df[feature1].median(), df[feature2], 0
                )

                # Non-linear interactions
                df_enhanced[f"{feature1}_squared_x_{feature2}"] = (df[feature1] ** 2) * df[feature2]

                # Min/max interactions
                df_enhanced[f"min_{feature1}_{feature2}"] = np.minimum(df[feature1], df[feature2])
                df_enhanced[f"max_{feature1}_{feature2}"] = np.maximum(df[feature1], df[feature2])

            except Exception as e:
                logger.warning(
                    f"Failed to create advanced interactions for {feature1}, {feature2}: {e}"
                )

        return df_enhanced
502
+
503
+
504
class DynamicFeatureSelector:
    """Dynamic feature selection based on multiple criteria."""

    def __init__(self, config: "Optional[EnsembleFeatureConfig]" = None):
        """Store the configuration, falling back to the default config."""
        self.config = config or EnsembleFeatureConfig()

    def select_features(
        self,
        df: pd.DataFrame,
        target_column: str,
        selection_methods: Optional[List[str]] = None,
    ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """Select features using multiple criteria.

        Each method ranks the candidate features and contributes its top 50.
        Candidates are then ordered by how many methods picked them and, on
        ties, by their best rank in any method; the first
        ``config.feature_selection_k`` are kept. The ordering is deterministic.

        Args:
            df: Frame holding the features and the target.
            target_column: Name of the target column.
            selection_methods: Method names understood by
                ``_apply_selection_method``; defaults to
                ``["variance", "correlation", "mutual_info"]``.

        Returns:
            A tuple of (frame with the target followed by the selected
            features, dictionary describing the selection).

        Raises:
            KeyError: If ``target_column`` is not a column of ``df``.
        """
        if selection_methods is None:
            selection_methods = ["variance", "correlation", "mutual_info"]

        feature_scores = {}

        # Candidate columns: everything except the target and target_* columns.
        feature_columns = [
            col
            for col in df.columns
            if col != target_column and not str(col).startswith("target_")
        ]

        logger.info(f"Selecting from {len(feature_columns)} features")

        votes: Dict[Any, int] = {}
        best_rank: Dict[Any, int] = {}

        for method in selection_methods:
            method_features = self._apply_selection_method(
                df[feature_columns], df[target_column], method
            )
            feature_scores[method] = method_features
            # Top 50 from each method take part in the vote.
            for rank, feature in enumerate(method_features[:50]):
                votes[feature] = votes.get(feature, 0) + 1
                best_rank[feature] = min(rank, best_rank.get(feature, rank))

        # Most votes first, then best rank; the stable sort keeps first-seen
        # order for exact ties, so repeated runs give identical results.
        ranked_candidates = sorted(
            votes, key=lambda feature: (-votes[feature], best_rank[feature])
        )
        final_features = ranked_candidates[: self.config.feature_selection_k]

        result_df = df[[target_column] + final_features].copy()

        selection_info = {
            "original_feature_count": len(feature_columns),
            "selected_feature_count": len(final_features),
            "selection_methods": selection_methods,
            "feature_scores": feature_scores,
            "selected_features": final_features,
        }

        logger.info(
            f"Selected {len(final_features)} features from {len(feature_columns)} original features"
        )

        return result_df, selection_info

    def _apply_selection_method(self, X: pd.DataFrame, y: pd.Series, method: str) -> List[str]:
        """Apply specific feature selection method.

        Only numeric and boolean columns of ``X`` are scored; other columns
        (strings, datetimes, ...) are ignored so they cannot make a method fail.

        Args:
            X: Candidate feature columns.
            y: Target values aligned with ``X``.
            method: One of ``"variance"``, ``"correlation"``, ``"mutual_info"``
                or ``"f_test"``.

        Returns:
            Feature names ordered best first. Empty when no column can be
            scored, the method name is unknown, or the method fails (the
            failure is logged).
        """
        try:
            numeric_X = X.select_dtypes(include=[np.number, "bool"]).astype("float64")
            if numeric_X.shape[1] == 0:
                return []

            if method == "variance":
                return numeric_X.var().sort_values(ascending=False).index.tolist()

            if method == "correlation":
                correlations = numeric_X.corrwith(y).abs()
                # Constant columns yield NaN correlations and are dropped.
                return correlations.sort_values(ascending=False).dropna().index.tolist()

            if method == "mutual_info":
                # Missing values are treated as zero; only the first 100
                # columns are scored to bound memory use.
                X_subset = numeric_X.fillna(0).iloc[:, :100]
                y_filled = y.fillna(0)

                mi_scores = mutual_info_regression(X_subset, y_filled, random_state=42)
                feature_scores = pd.Series(mi_scores, index=X_subset.columns)
                return feature_scores.sort_values(ascending=False).index.tolist()

            if method == "f_test":
                X_filled = numeric_X.fillna(0)
                y_filled = y.fillna(0)

                selector = SelectKBest(
                    score_func=f_regression, k=min(50, numeric_X.shape[1])
                )
                selector.fit(X_filled, y_filled)

                # Return the kept features ranked by F-score, like the other
                # methods, instead of in original column order.
                scores = pd.Series(selector.scores_, index=numeric_X.columns)
                kept = scores[selector.get_support()]
                return kept.sort_values(ascending=False).index.tolist()

            logger.warning(f"Unknown feature selection method: {method}")

        except Exception as e:
            logger.warning(f"Feature selection method {method} failed: {e}")
            return []

        return []