mcli-framework 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcli-framework might be problematic.
- mcli/app/chat_cmd.py +42 -0
- mcli/app/commands_cmd.py +226 -0
- mcli/app/completion_cmd.py +216 -0
- mcli/app/completion_helpers.py +288 -0
- mcli/app/cron_test_cmd.py +697 -0
- mcli/app/logs_cmd.py +419 -0
- mcli/app/main.py +492 -0
- mcli/app/model/model.py +1060 -0
- mcli/app/model_cmd.py +227 -0
- mcli/app/redis_cmd.py +269 -0
- mcli/app/video/video.py +1114 -0
- mcli/app/visual_cmd.py +303 -0
- mcli/chat/chat.py +2409 -0
- mcli/chat/command_rag.py +514 -0
- mcli/chat/enhanced_chat.py +652 -0
- mcli/chat/system_controller.py +1010 -0
- mcli/chat/system_integration.py +1016 -0
- mcli/cli.py +25 -0
- mcli/config.toml +20 -0
- mcli/lib/api/api.py +586 -0
- mcli/lib/api/daemon_client.py +203 -0
- mcli/lib/api/daemon_client_local.py +44 -0
- mcli/lib/api/daemon_decorator.py +217 -0
- mcli/lib/api/mcli_decorators.py +1032 -0
- mcli/lib/auth/auth.py +85 -0
- mcli/lib/auth/aws_manager.py +85 -0
- mcli/lib/auth/azure_manager.py +91 -0
- mcli/lib/auth/credential_manager.py +192 -0
- mcli/lib/auth/gcp_manager.py +93 -0
- mcli/lib/auth/key_manager.py +117 -0
- mcli/lib/auth/mcli_manager.py +93 -0
- mcli/lib/auth/token_manager.py +75 -0
- mcli/lib/auth/token_util.py +1011 -0
- mcli/lib/config/config.py +47 -0
- mcli/lib/discovery/__init__.py +1 -0
- mcli/lib/discovery/command_discovery.py +274 -0
- mcli/lib/erd/erd.py +1345 -0
- mcli/lib/erd/generate_graph.py +453 -0
- mcli/lib/files/files.py +76 -0
- mcli/lib/fs/fs.py +109 -0
- mcli/lib/lib.py +29 -0
- mcli/lib/logger/logger.py +611 -0
- mcli/lib/performance/optimizer.py +409 -0
- mcli/lib/performance/rust_bridge.py +502 -0
- mcli/lib/performance/uvloop_config.py +154 -0
- mcli/lib/pickles/pickles.py +50 -0
- mcli/lib/search/cached_vectorizer.py +479 -0
- mcli/lib/services/data_pipeline.py +460 -0
- mcli/lib/services/lsh_client.py +441 -0
- mcli/lib/services/redis_service.py +387 -0
- mcli/lib/shell/shell.py +137 -0
- mcli/lib/toml/toml.py +33 -0
- mcli/lib/ui/styling.py +47 -0
- mcli/lib/ui/visual_effects.py +634 -0
- mcli/lib/watcher/watcher.py +185 -0
- mcli/ml/api/app.py +215 -0
- mcli/ml/api/middleware.py +224 -0
- mcli/ml/api/routers/admin_router.py +12 -0
- mcli/ml/api/routers/auth_router.py +244 -0
- mcli/ml/api/routers/backtest_router.py +12 -0
- mcli/ml/api/routers/data_router.py +12 -0
- mcli/ml/api/routers/model_router.py +302 -0
- mcli/ml/api/routers/monitoring_router.py +12 -0
- mcli/ml/api/routers/portfolio_router.py +12 -0
- mcli/ml/api/routers/prediction_router.py +267 -0
- mcli/ml/api/routers/trade_router.py +12 -0
- mcli/ml/api/routers/websocket_router.py +76 -0
- mcli/ml/api/schemas.py +64 -0
- mcli/ml/auth/auth_manager.py +425 -0
- mcli/ml/auth/models.py +154 -0
- mcli/ml/auth/permissions.py +302 -0
- mcli/ml/backtesting/backtest_engine.py +502 -0
- mcli/ml/backtesting/performance_metrics.py +393 -0
- mcli/ml/cache.py +400 -0
- mcli/ml/cli/main.py +398 -0
- mcli/ml/config/settings.py +394 -0
- mcli/ml/configs/dvc_config.py +230 -0
- mcli/ml/configs/mlflow_config.py +131 -0
- mcli/ml/configs/mlops_manager.py +293 -0
- mcli/ml/dashboard/app.py +532 -0
- mcli/ml/dashboard/app_integrated.py +738 -0
- mcli/ml/dashboard/app_supabase.py +560 -0
- mcli/ml/dashboard/app_training.py +615 -0
- mcli/ml/dashboard/cli.py +51 -0
- mcli/ml/data_ingestion/api_connectors.py +501 -0
- mcli/ml/data_ingestion/data_pipeline.py +567 -0
- mcli/ml/data_ingestion/stream_processor.py +512 -0
- mcli/ml/database/migrations/env.py +94 -0
- mcli/ml/database/models.py +667 -0
- mcli/ml/database/session.py +200 -0
- mcli/ml/experimentation/ab_testing.py +845 -0
- mcli/ml/features/ensemble_features.py +607 -0
- mcli/ml/features/political_features.py +676 -0
- mcli/ml/features/recommendation_engine.py +809 -0
- mcli/ml/features/stock_features.py +573 -0
- mcli/ml/features/test_feature_engineering.py +346 -0
- mcli/ml/logging.py +85 -0
- mcli/ml/mlops/data_versioning.py +518 -0
- mcli/ml/mlops/experiment_tracker.py +377 -0
- mcli/ml/mlops/model_serving.py +481 -0
- mcli/ml/mlops/pipeline_orchestrator.py +614 -0
- mcli/ml/models/base_models.py +324 -0
- mcli/ml/models/ensemble_models.py +675 -0
- mcli/ml/models/recommendation_models.py +474 -0
- mcli/ml/models/test_models.py +487 -0
- mcli/ml/monitoring/drift_detection.py +676 -0
- mcli/ml/monitoring/metrics.py +45 -0
- mcli/ml/optimization/portfolio_optimizer.py +834 -0
- mcli/ml/preprocessing/data_cleaners.py +451 -0
- mcli/ml/preprocessing/feature_extractors.py +491 -0
- mcli/ml/preprocessing/ml_pipeline.py +382 -0
- mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
- mcli/ml/preprocessing/test_preprocessing.py +294 -0
- mcli/ml/scripts/populate_sample_data.py +200 -0
- mcli/ml/tasks.py +400 -0
- mcli/ml/tests/test_integration.py +429 -0
- mcli/ml/tests/test_training_dashboard.py +387 -0
- mcli/public/oi/oi.py +15 -0
- mcli/public/public.py +4 -0
- mcli/self/self_cmd.py +1246 -0
- mcli/workflow/daemon/api_daemon.py +800 -0
- mcli/workflow/daemon/async_command_database.py +681 -0
- mcli/workflow/daemon/async_process_manager.py +591 -0
- mcli/workflow/daemon/client.py +530 -0
- mcli/workflow/daemon/commands.py +1196 -0
- mcli/workflow/daemon/daemon.py +905 -0
- mcli/workflow/daemon/daemon_api.py +59 -0
- mcli/workflow/daemon/enhanced_daemon.py +571 -0
- mcli/workflow/daemon/process_cli.py +244 -0
- mcli/workflow/daemon/process_manager.py +439 -0
- mcli/workflow/daemon/test_daemon.py +275 -0
- mcli/workflow/dashboard/dashboard_cmd.py +113 -0
- mcli/workflow/docker/docker.py +0 -0
- mcli/workflow/file/file.py +100 -0
- mcli/workflow/gcloud/config.toml +21 -0
- mcli/workflow/gcloud/gcloud.py +58 -0
- mcli/workflow/git_commit/ai_service.py +328 -0
- mcli/workflow/git_commit/commands.py +430 -0
- mcli/workflow/lsh_integration.py +355 -0
- mcli/workflow/model_service/client.py +594 -0
- mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
- mcli/workflow/model_service/lightweight_embedder.py +397 -0
- mcli/workflow/model_service/lightweight_model_server.py +714 -0
- mcli/workflow/model_service/lightweight_test.py +241 -0
- mcli/workflow/model_service/model_service.py +1955 -0
- mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
- mcli/workflow/model_service/pdf_processor.py +386 -0
- mcli/workflow/model_service/test_efficient_runner.py +234 -0
- mcli/workflow/model_service/test_example.py +315 -0
- mcli/workflow/model_service/test_integration.py +131 -0
- mcli/workflow/model_service/test_new_features.py +149 -0
- mcli/workflow/openai/openai.py +99 -0
- mcli/workflow/politician_trading/commands.py +1790 -0
- mcli/workflow/politician_trading/config.py +134 -0
- mcli/workflow/politician_trading/connectivity.py +490 -0
- mcli/workflow/politician_trading/data_sources.py +395 -0
- mcli/workflow/politician_trading/database.py +410 -0
- mcli/workflow/politician_trading/demo.py +248 -0
- mcli/workflow/politician_trading/models.py +165 -0
- mcli/workflow/politician_trading/monitoring.py +413 -0
- mcli/workflow/politician_trading/scrapers.py +966 -0
- mcli/workflow/politician_trading/scrapers_california.py +412 -0
- mcli/workflow/politician_trading/scrapers_eu.py +377 -0
- mcli/workflow/politician_trading/scrapers_uk.py +350 -0
- mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
- mcli/workflow/politician_trading/supabase_functions.py +354 -0
- mcli/workflow/politician_trading/workflow.py +852 -0
- mcli/workflow/registry/registry.py +180 -0
- mcli/workflow/repo/repo.py +223 -0
- mcli/workflow/scheduler/commands.py +493 -0
- mcli/workflow/scheduler/cron_parser.py +238 -0
- mcli/workflow/scheduler/job.py +182 -0
- mcli/workflow/scheduler/monitor.py +139 -0
- mcli/workflow/scheduler/persistence.py +324 -0
- mcli/workflow/scheduler/scheduler.py +679 -0
- mcli/workflow/sync/sync_cmd.py +437 -0
- mcli/workflow/sync/test_cmd.py +314 -0
- mcli/workflow/videos/videos.py +242 -0
- mcli/workflow/wakatime/wakatime.py +11 -0
- mcli/workflow/workflow.py +37 -0
- mcli_framework-7.0.0.dist-info/METADATA +479 -0
- mcli_framework-7.0.0.dist-info/RECORD +186 -0
- mcli_framework-7.0.0.dist-info/WHEEL +5 -0
- mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
- mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
- mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,607 @@
"""Ensemble feature engineering and feature interaction systems"""

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple, Union, Callable
from dataclasses import dataclass
import logging
from itertools import combinations
import warnings
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

logger = logging.getLogger(__name__)


@dataclass
class EnsembleFeatureConfig:
    """Configuration for ensemble feature engineering"""

    # Feature interaction settings
    max_interaction_degree: int = 2
    max_features_for_interactions: int = 50
    interaction_selection_method: str = "mutual_info"  # "mutual_info", "f_test", "correlation"

    # Polynomial feature settings
    enable_polynomial_features: bool = True
    polynomial_degree: int = 2
    include_bias: bool = False

    # Clustering features
    enable_clustering_features: bool = True
    n_clusters: int = 5
    clustering_features: Optional[List[str]] = None

    # Feature selection settings
    feature_selection_k: int = 100
    selection_score_func: str = "f_regression"  # "f_regression", "mutual_info"

    # Rolling feature aggregations
    rolling_windows: Optional[List[int]] = None
    rolling_functions: Optional[List[str]] = None

    def __post_init__(self):
        if self.clustering_features is None:
            self.clustering_features = [
                "total_influence",
                "transaction_amount_cleaned",
                "trading_frequency_score",
                "volatility_20",
                "rsi",
            ]

        if self.rolling_windows is None:
            self.rolling_windows = [5, 10, 20, 50]

        if self.rolling_functions is None:
            self.rolling_functions = ["mean", "std", "min", "max", "skew"]

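# Illustrative sketch (not from the packaged module): constructing a custom
# config. Field names come from EnsembleFeatureConfig above; the override
# values are arbitrary examples.
#
#   config = EnsembleFeatureConfig(polynomial_degree=3, n_clusters=8)
#   config.rolling_windows       # [5, 10, 20, 50], filled in by __post_init__
#   config.clustering_features   # defaults to the five columns listed above
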
class EnsembleFeatureBuilder:
    """Builds comprehensive feature sets for ensemble models"""

    def __init__(self, config: Optional[EnsembleFeatureConfig] = None):
        self.config = config or EnsembleFeatureConfig()
        self.feature_importance_cache = {}
        self.interaction_cache = {}

    def build_ensemble_features(
        self,
        base_features: pd.DataFrame,
        target_column: Optional[str] = None,
        include_interactions: bool = True,
        include_clustering: bool = True,
        include_rolling: bool = True,
    ) -> pd.DataFrame:
        """Build comprehensive feature set for ensemble models"""

        logger.info("Building ensemble features")
        df = base_features.copy()

        # Get numerical features for processing
        numerical_features = self._get_numerical_features(df)
        logger.info(f"Processing {len(numerical_features)} numerical features")

        # Add rolling aggregations
        if include_rolling and len(numerical_features) > 0:
            df = self._add_rolling_features(df, numerical_features)

        # Add interaction features
        if include_interactions and len(numerical_features) > 0:
            df = self._add_interaction_features(df, numerical_features, target_column)

        # Add polynomial features (subset)
        if self.config.enable_polynomial_features and len(numerical_features) > 0:
            df = self._add_polynomial_features(
                df, numerical_features[:10]
            )  # Limit to avoid explosion

        # Add clustering features
        if include_clustering and self.config.enable_clustering_features:
            df = self._add_clustering_features(df)

        # Add statistical features
        df = self._add_statistical_features(df, numerical_features)

        # Add rank features
        df = self._add_rank_features(df, numerical_features)

        logger.info(f"Final feature count: {len(df.columns)}")
        return df

    def _get_numerical_features(self, df: pd.DataFrame) -> List[str]:
        """Get list of numerical feature columns"""
        numerical_features = []
        for col in df.columns:
            if (
                df[col].dtype in ["int64", "float64"]
                and not col.startswith("target_")
                and not col.endswith("_id")
                and col not in ["index"]
                and df[col].notna().sum() > 0
            ):
                numerical_features.append(col)
        return numerical_features

    def _add_rolling_features(
        self, df: pd.DataFrame, numerical_features: List[str]
    ) -> pd.DataFrame:
        """Add rolling window aggregation features"""
        logger.info("Adding rolling aggregation features")

        # Ensure we have date column for time-based rolling
        if "transaction_date_dt" not in df.columns:
            # Create synthetic time index if no date column
            df["synthetic_time_index"] = range(len(df))
            time_col = "synthetic_time_index"
        else:
            df = df.sort_values("transaction_date_dt")
            time_col = "transaction_date_dt"

        # Select top features for rolling (avoid too many features)
        features_for_rolling = numerical_features[:20]

        for window in self.config.rolling_windows:
            if window >= len(df):
                continue

            for feature in features_for_rolling:
                if feature not in df.columns:
                    continue

                try:
                    # Basic rolling aggregations
                    df[f"{feature}_rolling_{window}_mean"] = (
                        df[feature].rolling(window=window, min_periods=1).mean()
                    )
                    df[f"{feature}_rolling_{window}_std"] = (
                        df[feature].rolling(window=window, min_periods=1).std()
                    )

                    # Rolling rank (percentile within window)
                    df[f"{feature}_rolling_{window}_rank"] = (
                        df[feature].rolling(window=window, min_periods=1).rank(pct=True)
                    )

                    # Rolling z-score
                    rolling_mean = df[feature].rolling(window=window, min_periods=1).mean()
                    rolling_std = df[feature].rolling(window=window, min_periods=1).std()
                    df[f"{feature}_rolling_{window}_zscore"] = (df[feature] - rolling_mean) / (
                        rolling_std + 1e-8
                    )

                except Exception as e:
                    logger.warning(f"Failed to create rolling features for {feature}: {e}")

        return df

    def _add_interaction_features(
        self, df: pd.DataFrame, numerical_features: List[str], target_column: Optional[str]
    ) -> pd.DataFrame:
        """Add feature interaction terms"""
        logger.info("Adding feature interaction terms")

        # Limit features to avoid combinatorial explosion
        if len(numerical_features) > self.config.max_features_for_interactions:
            # Select top features based on correlation with target or variance
            if target_column and target_column in df.columns:
                feature_scores = []
                for feature in numerical_features:
                    try:
                        corr = abs(df[feature].corr(df[target_column]))
                        feature_scores.append((feature, corr))
                    except Exception:
                        feature_scores.append((feature, 0))

                feature_scores.sort(key=lambda x: x[1], reverse=True)
                selected_features = [
                    f[0] for f in feature_scores[: self.config.max_features_for_interactions]
                ]
            else:
                # Select by variance
                feature_vars = []
                for feature in numerical_features:
                    try:
                        var = df[feature].var()
                        feature_vars.append((feature, var))
                    except Exception:
                        feature_vars.append((feature, 0))

                feature_vars.sort(key=lambda x: x[1], reverse=True)
                selected_features = [
                    f[0] for f in feature_vars[: self.config.max_features_for_interactions]
                ]
        else:
            selected_features = numerical_features

        # Create pairwise interactions
        interaction_count = 0
        max_interactions = 200  # Limit total interactions

        for feature1, feature2 in combinations(selected_features, 2):
            if interaction_count >= max_interactions:
                break

            if feature1 not in df.columns or feature2 not in df.columns:
                continue

            try:
                # Multiplicative interaction
                df[f"{feature1}_x_{feature2}"] = df[feature1] * df[feature2]

                # Ratio interaction (avoid division by zero)
                df[f"{feature1}_div_{feature2}"] = df[feature1] / (abs(df[feature2]) + 1e-8)

                # Difference interaction
                df[f"{feature1}_minus_{feature2}"] = df[feature1] - df[feature2]

                interaction_count += 3

                # Add some conditional interactions for key features
                if "influence" in feature1.lower() or "influence" in feature2.lower():
                    # Conditional interactions based on influence
                    high_influence = df[feature1] > df[feature1].quantile(0.7)
                    df[f"{feature2}_when_high_{feature1}"] = np.where(
                        high_influence, df[feature2], 0
                    )
                    interaction_count += 1

            except Exception as e:
                logger.warning(f"Failed to create interaction {feature1} x {feature2}: {e}")

        logger.info(f"Created {interaction_count} interaction features")
        return df

    def _add_polynomial_features(
        self, df: pd.DataFrame, selected_features: List[str]
    ) -> pd.DataFrame:
        """Add polynomial features for key variables"""
        logger.info("Adding polynomial features")

        # Limit to top features to avoid memory issues
        features_for_poly = selected_features[:5]

        try:
            # Create polynomial features
            poly = PolynomialFeatures(
                degree=self.config.polynomial_degree,
                include_bias=self.config.include_bias,
                interaction_only=False,
            )

            # Prepare data (handle missing values)
            poly_data = df[features_for_poly].fillna(0)

            if len(poly_data) > 0 and len(features_for_poly) > 0:
                poly_features = poly.fit_transform(poly_data)

                # Get feature names
                poly_feature_names = poly.get_feature_names_out(features_for_poly)

                # Add polynomial features to dataframe (skip original features)
                start_idx = len(features_for_poly)
                for i, name in enumerate(poly_feature_names[start_idx:], start_idx):
                    df[f"poly_{name}"] = poly_features[:, i]

                logger.info(
                    f"Added {len(poly_feature_names) - len(features_for_poly)} polynomial features"
                )

        except Exception as e:
            logger.warning(f"Failed to create polynomial features: {e}")

        return df

    def _add_clustering_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add clustering-based features"""
        logger.info("Adding clustering features")

        # Select features for clustering
        clustering_features = []
        for feature in self.config.clustering_features:
            if feature in df.columns:
                clustering_features.append(feature)

        if len(clustering_features) < 2:
            logger.warning("Insufficient features for clustering")
            return df

        try:
            # Prepare clustering data
            cluster_data = df[clustering_features].fillna(0)

            # Apply K-means clustering
            kmeans = KMeans(n_clusters=self.config.n_clusters, random_state=42, n_init=10)
            cluster_labels = kmeans.fit_predict(cluster_data)

            df["cluster_label"] = cluster_labels

            # Distance to each row's assigned cluster center, computed
            # positionally so it stays correct for non-default indexes
            centers = kmeans.cluster_centers_[cluster_labels]
            df["cluster_distance"] = np.sqrt(
                np.sum((cluster_data.to_numpy() - centers) ** 2, axis=1)
            )

            # Add cluster-based features
            for cluster_id in range(self.config.n_clusters):
                df[f"is_cluster_{cluster_id}"] = (df["cluster_label"] == cluster_id).astype(int)

            # Cluster statistics
            cluster_stats = df.groupby("cluster_label")[clustering_features].agg(["mean", "std"])

            for feature in clustering_features:
                for stat in ["mean", "std"]:
                    cluster_stat_dict = cluster_stats[(feature, stat)].to_dict()
                    df[f"cluster_{feature}_{stat}"] = df["cluster_label"].map(cluster_stat_dict)

            logger.info(f"Added clustering features with {self.config.n_clusters} clusters")

        except Exception as e:
            logger.warning(f"Failed to create clustering features: {e}")

        return df

    def _add_statistical_features(
        self, df: pd.DataFrame, numerical_features: List[str]
    ) -> pd.DataFrame:
        """Add statistical transformation features"""
        logger.info("Adding statistical features")

        # Select subset of features for statistical transforms
        stat_features = numerical_features[:15]

        for feature in stat_features:
            if feature not in df.columns:
                continue

            try:
                feature_data = df[feature].fillna(0)

                # Log transform (for positive values)
                if (feature_data > 0).all():
                    df[f"{feature}_log"] = np.log1p(feature_data)

                # Square root transform
                if (feature_data >= 0).all():
                    df[f"{feature}_sqrt"] = np.sqrt(feature_data)

                # Inverse transform (avoid division by zero)
                df[f"{feature}_inv"] = 1 / (abs(feature_data) + 1e-8)

                # Standardized (z-score)
                mean_val = feature_data.mean()
                std_val = feature_data.std()
                if std_val > 0:
                    df[f"{feature}_zscore"] = (feature_data - mean_val) / std_val

                # Binned features
                df[f"{feature}_binned"] = pd.cut(feature_data, bins=5, labels=False)

            except Exception as e:
                logger.warning(f"Failed to create statistical features for {feature}: {e}")

        return df

    def _add_rank_features(self, df: pd.DataFrame, numerical_features: List[str]) -> pd.DataFrame:
        """Add rank-based features"""
        logger.info("Adding rank features")

        # Select subset for ranking
        rank_features = numerical_features[:10]

        for feature in rank_features:
            if feature not in df.columns:
                continue

            try:
                # Percentile rank
                df[f"{feature}_pct_rank"] = df[feature].rank(pct=True)

                # Quantile binning
                df[f"{feature}_quantile"] = pd.qcut(
                    df[feature], q=10, labels=False, duplicates="drop"
                )

            except Exception as e:
                logger.warning(f"Failed to create rank features for {feature}: {e}")

        return df

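# Illustrative end-to-end sketch for the builder. The frame below is toy data;
# the column names are hypothetical, chosen to line up with the clustering
# defaults above (total_influence, transaction_amount_cleaned):
#
#   rng = np.random.default_rng(0)
#   frame = pd.DataFrame({
#       "total_influence": rng.normal(size=200),
#       "transaction_amount_cleaned": rng.lognormal(size=200),
#       "target_return": rng.normal(size=200),
#   })
#   builder = EnsembleFeatureBuilder()
#   enriched = builder.build_ensemble_features(frame, target_column="target_return")
#   enriched.filter(like="_rolling_").columns  # mean/std/rank/zscore per window
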
class FeatureInteractionEngine:
    """Advanced feature interaction discovery and generation"""

    def __init__(self, config: Optional[EnsembleFeatureConfig] = None):
        self.config = config or EnsembleFeatureConfig()

    def discover_interactions(
        self, df: pd.DataFrame, target_column: str, max_interactions: int = 50
    ) -> List[Tuple[str, str, float]]:
        """Discover important feature interactions based on target correlation"""

        numerical_features = self._get_numerical_features(df)
        interactions = []

        logger.info(f"Discovering interactions among {len(numerical_features)} features")

        for feature1, feature2 in combinations(numerical_features, 2):
            if feature1 not in df.columns or feature2 not in df.columns:
                continue

            try:
                # Create interaction term
                interaction_term = df[feature1] * df[feature2]

                # Calculate correlation with target
                correlation = abs(interaction_term.corr(df[target_column]))

                if not np.isnan(correlation) and correlation > 0.1:
                    interactions.append((feature1, feature2, correlation))

            except Exception:
                continue

        # Sort by correlation strength
        interactions.sort(key=lambda x: x[2], reverse=True)

        logger.info(f"Discovered {len(interactions)} significant interactions")
        return interactions[:max_interactions]

    def _get_numerical_features(self, df: pd.DataFrame) -> List[str]:
        """Get numerical features for interaction discovery"""
        return [
            col
            for col in df.columns
            if df[col].dtype in ["int64", "float64"]
            and not col.startswith("target_")
            and df[col].notna().sum() > 0
        ]

    def generate_advanced_interactions(
        self, df: pd.DataFrame, feature_pairs: List[Tuple[str, str]]
    ) -> pd.DataFrame:
        """Generate advanced interaction terms for discovered feature pairs"""

        df_enhanced = df.copy()

        for feature1, feature2 in feature_pairs:
            if feature1 not in df.columns or feature2 not in df.columns:
                continue

            try:
                # Conditional interactions
                df_enhanced[f"{feature1}_when_high_{feature2}"] = np.where(
                    df[feature2] > df[feature2].median(), df[feature1], 0
                )

                df_enhanced[f"{feature2}_when_high_{feature1}"] = np.where(
                    df[feature1] > df[feature1].median(), df[feature2], 0
                )

                # Non-linear interactions
                df_enhanced[f"{feature1}_squared_x_{feature2}"] = (df[feature1] ** 2) * df[feature2]

                # Min/max interactions
                df_enhanced[f"min_{feature1}_{feature2}"] = np.minimum(df[feature1], df[feature2])
                df_enhanced[f"max_{feature1}_{feature2}"] = np.maximum(df[feature1], df[feature2])

            except Exception as e:
                logger.warning(
                    f"Failed to create advanced interactions for {feature1}, {feature2}: {e}"
                )

        return df_enhanced

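# Illustrative sketch: mining pairwise interactions against a target, then
# materializing the advanced terms (reuses the hypothetical `frame` from the
# builder sketch above):
#
#   engine = FeatureInteractionEngine()
#   top = engine.discover_interactions(frame, "target_return", max_interactions=10)
#   pairs = [(f1, f2) for f1, f2, _corr in top]
#   enhanced = engine.generate_advanced_interactions(frame, pairs)
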
class DynamicFeatureSelector:
    """Dynamic feature selection based on multiple criteria"""

    def __init__(self, config: Optional[EnsembleFeatureConfig] = None):
        self.config = config or EnsembleFeatureConfig()

    def select_features(
        self,
        df: pd.DataFrame,
        target_column: str,
        selection_methods: Optional[List[str]] = None,
    ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """Select features using multiple criteria"""

        if selection_methods is None:
            selection_methods = ["variance", "correlation", "mutual_info"]

        feature_scores = {}
        selected_features = set()

        # Get feature columns (exclude target)
        feature_columns = [
            col for col in df.columns if col != target_column and not col.startswith("target_")
        ]

        logger.info(f"Selecting from {len(feature_columns)} features")

        # Apply different selection methods
        for method in selection_methods:
            method_features = self._apply_selection_method(
                df[feature_columns], df[target_column], method
            )
            feature_scores[method] = method_features
            selected_features.update(method_features[:50])  # Top 50 from each method

        # Combine selections
        final_features = list(selected_features)[: self.config.feature_selection_k]

        # Create result dataframe
        result_df = df[[target_column] + final_features].copy()

        selection_info = {
            "original_feature_count": len(feature_columns),
            "selected_feature_count": len(final_features),
            "selection_methods": selection_methods,
            "feature_scores": feature_scores,
            "selected_features": final_features,
        }

        logger.info(
            f"Selected {len(final_features)} features from {len(feature_columns)} original features"
        )

        return result_df, selection_info

    def _apply_selection_method(self, X: pd.DataFrame, y: pd.Series, method: str) -> List[str]:
        """Apply specific feature selection method"""

        try:
            if method == "variance":
                # Variance-based selection
                variances = X.var()
                feature_scores = variances.sort_values(ascending=False)
                return feature_scores.index.tolist()

            elif method == "correlation":
                # Correlation-based selection
                correlations = X.corrwith(y).abs()
                feature_scores = correlations.sort_values(ascending=False)
                return feature_scores.dropna().index.tolist()

            elif method == "mutual_info":
                # Mutual information selection
                X_filled = X.fillna(0)
                y_filled = y.fillna(0)

                # Use a subset to avoid memory issues
                if len(X.columns) > 100:
                    selected_cols = X.columns[:100]
                    X_subset = X_filled[selected_cols]
                else:
                    X_subset = X_filled

                mi_scores = mutual_info_regression(X_subset, y_filled, random_state=42)
                feature_scores = pd.Series(mi_scores, index=X_subset.columns)
                feature_scores = feature_scores.sort_values(ascending=False)
                return feature_scores.index.tolist()

            elif method == "f_test":
                # F-test based selection
                X_filled = X.fillna(0)
                y_filled = y.fillna(0)

                selector = SelectKBest(score_func=f_regression, k=min(50, len(X.columns)))
                selector.fit(X_filled, y_filled)

                selected_indices = selector.get_support(indices=True)
                return X.columns[selected_indices].tolist()

        except Exception as e:
            logger.warning(f"Feature selection method {method} failed: {e}")
            return []

        return []
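
# Illustrative sketch: shrinking a wide frame down to the configured budget
# (`feature_selection_k=20` is an arbitrary example value; `frame` is the
# hypothetical toy data from the sketches above):
#
#   selector = DynamicFeatureSelector(EnsembleFeatureConfig(feature_selection_k=20))
#   reduced, info = selector.select_features(frame, target_column="target_return")
#   info["selected_feature_count"]  # <= 20, pooled from variance/correlation/mutual_info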