featcopilot-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featcopilot/__init__.py +29 -0
- featcopilot/core/__init__.py +13 -0
- featcopilot/core/base.py +195 -0
- featcopilot/core/feature.py +224 -0
- featcopilot/core/registry.py +128 -0
- featcopilot/engines/__init__.py +13 -0
- featcopilot/engines/relational.py +256 -0
- featcopilot/engines/tabular.py +293 -0
- featcopilot/engines/text.py +211 -0
- featcopilot/engines/timeseries.py +402 -0
- featcopilot/llm/__init__.py +16 -0
- featcopilot/llm/code_generator.py +295 -0
- featcopilot/llm/copilot_client.py +521 -0
- featcopilot/llm/explainer.py +200 -0
- featcopilot/llm/semantic_engine.py +379 -0
- featcopilot/selection/__init__.py +13 -0
- featcopilot/selection/importance.py +161 -0
- featcopilot/selection/redundancy.py +156 -0
- featcopilot/selection/statistical.py +199 -0
- featcopilot/selection/unified.py +172 -0
- featcopilot/transformers/__init__.py +11 -0
- featcopilot/transformers/sklearn_compat.py +401 -0
- featcopilot/utils/__init__.py +9 -0
- featcopilot/utils/cache.py +221 -0
- featcopilot/utils/parallel.py +109 -0
- featcopilot-0.1.0.dist-info/METADATA +218 -0
- featcopilot-0.1.0.dist-info/RECORD +29 -0
- featcopilot-0.1.0.dist-info/WHEEL +5 -0
- featcopilot-0.1.0.dist-info/top_level.txt +1 -0
featcopilot/selection/importance.py
@@ -0,0 +1,161 @@
"""Model-based feature importance selection."""

from typing import Optional, Union

import numpy as np
import pandas as pd

from featcopilot.core.base import BaseSelector


class ImportanceSelector(BaseSelector):
    """
    Feature selector based on model importance scores.

    Uses tree-based models to evaluate feature importance.

    Parameters
    ----------
    model : str, default='random_forest'
        Model to use ('random_forest', 'gradient_boosting', 'xgboost')
    max_features : int, optional
        Maximum features to select
    threshold : float, optional
        Minimum importance threshold

    Examples
    --------
    >>> selector = ImportanceSelector(model='random_forest', max_features=50)
    >>> X_selected = selector.fit_transform(X, y)
    """

    def __init__(
        self,
        model: str = "random_forest",
        max_features: Optional[int] = None,
        threshold: Optional[float] = None,
        n_estimators: int = 100,
        verbose: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.model_type = model
        self.max_features = max_features
        self.threshold = threshold
        self.n_estimators = n_estimators
        self.verbose = verbose
        self._model = None

    def fit(
        self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray], **kwargs
    ) -> "ImportanceSelector":
        """
        Fit selector using a tree model.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray
            Target variable

        Returns
        -------
        self : ImportanceSelector
        """
        X = self._validate_input(X)
        y = np.array(y)

        # Determine task type
        unique_y = len(np.unique(y))
        is_classification = unique_y < 20 and not np.issubdtype(y.dtype, np.floating)

        # Create model
        self._model = self._create_model(is_classification)

        # Fit model
        X_array = X.fillna(0).values
        self._model.fit(X_array, y)

        # Get importances
        importances = self._model.feature_importances_
        self._feature_scores = dict(zip(X.columns, importances))

        # Select features
        self._select_features()

        self._is_fitted = True
        return self

    def _create_model(self, is_classification: bool):
        """Create the appropriate model."""
        if self.model_type == "random_forest":
            if is_classification:
                from sklearn.ensemble import RandomForestClassifier

                return RandomForestClassifier(n_estimators=self.n_estimators, random_state=42, n_jobs=-1)
            else:
                from sklearn.ensemble import RandomForestRegressor

                return RandomForestRegressor(n_estimators=self.n_estimators, random_state=42, n_jobs=-1)

        elif self.model_type == "gradient_boosting":
            if is_classification:
                from sklearn.ensemble import GradientBoostingClassifier

                return GradientBoostingClassifier(n_estimators=self.n_estimators, random_state=42)
            else:
                from sklearn.ensemble import GradientBoostingRegressor

                return GradientBoostingRegressor(n_estimators=self.n_estimators, random_state=42)

        elif self.model_type == "xgboost":
            try:
                import xgboost as xgb

                if is_classification:
                    return xgb.XGBClassifier(n_estimators=self.n_estimators, random_state=42, n_jobs=-1)
                else:
                    return xgb.XGBRegressor(n_estimators=self.n_estimators, random_state=42, n_jobs=-1)
            except ImportError:
                if self.verbose:
                    print("XGBoost not available, falling back to RandomForest")
                return self._create_model_fallback(is_classification)

        else:
            raise ValueError(f"Unknown model type: {self.model_type}")

    def _create_model_fallback(self, is_classification: bool):
        """Fallback to RandomForest."""
        if is_classification:
            from sklearn.ensemble import RandomForestClassifier

            return RandomForestClassifier(n_estimators=self.n_estimators, random_state=42)
        else:
            from sklearn.ensemble import RandomForestRegressor

            return RandomForestRegressor(n_estimators=self.n_estimators, random_state=42)

    def _select_features(self) -> None:
        """Select features based on importance."""
        sorted_features = sorted(self._feature_scores.items(), key=lambda x: x[1], reverse=True)

        if self.threshold is not None:
            sorted_features = [(name, score) for name, score in sorted_features if score >= self.threshold]

        if self.max_features is not None:
            sorted_features = sorted_features[: self.max_features]

        self._selected_features = [name for name, _ in sorted_features]

        if self.verbose:
            print(f"ImportanceSelector: Selected {len(self._selected_features)} features")

    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """Select features from data."""
        if not self._is_fitted:
            raise RuntimeError("Selector must be fitted before transform")

        X = self._validate_input(X)
        available = [f for f in self._selected_features if f in X.columns]
        return X[available]
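
For orientation, a minimal usage sketch of the selector above (not part of the package): it assumes featcopilot 0.1.0 and scikit-learn are installed, and that `fit_transform` is inherited from BaseSelector as the class docstring indicates. The synthetic DataFrame and column names are illustrative only.

    import numpy as np
    import pandas as pd
    from featcopilot.selection.importance import ImportanceSelector

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(200, 10)), columns=[f"f{i}" for i in range(10)])
    # Integer labels with few unique values, so fit() takes the classification path
    y = (X["f0"] + 0.5 * X["f1"] > 0).astype(int)

    selector = ImportanceSelector(model="random_forest", max_features=3, verbose=True)
    X_selected = selector.fit_transform(X, y)
    print(X_selected.columns.tolist())  # top-3 columns by RandomForest importance
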
featcopilot/selection/redundancy.py
@@ -0,0 +1,156 @@
"""Redundancy elimination through correlation analysis."""

from typing import Optional, Union

import numpy as np
import pandas as pd

from featcopilot.core.base import BaseSelector


class RedundancyEliminator(BaseSelector):
    """
    Eliminate redundant features based on correlation.

    Removes highly correlated features, keeping the one with
    higher importance (if provided) or the first one.

    Parameters
    ----------
    correlation_threshold : float, default=0.95
        Correlation threshold for redundancy
    method : str, default='pearson'
        Correlation method ('pearson', 'spearman', 'kendall')

    Examples
    --------
    >>> eliminator = RedundancyEliminator(correlation_threshold=0.95)
    >>> X_reduced = eliminator.fit_transform(X, y)
    """

    def __init__(
        self,
        correlation_threshold: float = 0.95,
        method: str = "pearson",
        importance_scores: Optional[dict[str, float]] = None,
        verbose: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.correlation_threshold = correlation_threshold
        self.method = method
        self.importance_scores = importance_scores or {}
        self.verbose = verbose
        self._correlation_matrix: Optional[pd.DataFrame] = None

    def fit_transform(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """Fit and transform in one step (y is optional for this selector)."""
        return self.fit(X, y, **kwargs).transform(X, **kwargs)

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        importance_scores: Optional[dict[str, float]] = None,
        **kwargs,
    ) -> "RedundancyEliminator":
        """
        Fit eliminator by computing correlations.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray, optional
            Target variable (unused)
        importance_scores : dict, optional
            Pre-computed importance scores

        Returns
        -------
        self : RedundancyEliminator
        """
        X = self._validate_input(X)

        if importance_scores:
            self.importance_scores = importance_scores

        # Compute correlation matrix
        numeric_cols = X.select_dtypes(include=[np.number]).columns
        self._correlation_matrix = X[numeric_cols].corr(method=self.method)

        # Find redundant features
        self._find_redundant_features(numeric_cols)

        self._is_fitted = True
        return self

    def _find_redundant_features(self, columns: list[str]) -> None:
        """Identify and mark redundant features for removal."""
        to_remove: set[str] = set()
        checked_pairs: set[tuple] = set()

        for i, col1 in enumerate(columns):
            if col1 in to_remove:
                continue

            for col2 in columns[i + 1 :]:
                if col2 in to_remove:
                    continue

                pair = tuple(sorted([col1, col2]))
                if pair in checked_pairs:
                    continue
                checked_pairs.add(pair)

                # Get correlation
                corr = abs(self._correlation_matrix.loc[col1, col2])

                if corr >= self.correlation_threshold:
                    # Decide which to remove based on importance
                    imp1 = self.importance_scores.get(col1, 0)
                    imp2 = self.importance_scores.get(col2, 0)

                    if imp1 >= imp2:
                        to_remove.add(col2)
                        if self.verbose:
                            print(f"Removing {col2} (corr={corr:.3f} with {col1})")
                    else:
                        to_remove.add(col1)
                        if self.verbose:
                            print(f"Removing {col1} (corr={corr:.3f} with {col2})")
                        break  # col1 is removed, move to next

        # Selected features are those not removed
        self._selected_features = [c for c in columns if c not in to_remove]
        self._removed_features = list(to_remove)

        if self.verbose:
            print(f"RedundancyEliminator: Removed {len(to_remove)} redundant features")

    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """Remove redundant features."""
        if not self._is_fitted:
            raise RuntimeError("Eliminator must be fitted before transform")

        X = self._validate_input(X)

        # Keep selected features plus any non-numeric columns
        non_numeric = X.select_dtypes(exclude=[np.number]).columns.tolist()
        keep_cols = [c for c in self._selected_features if c in X.columns]
        keep_cols.extend([c for c in non_numeric if c not in keep_cols])

        return X[keep_cols]

    def get_removed_features(self) -> list[str]:
        """Get list of removed redundant features."""
        return getattr(self, "_removed_features", [])

    def get_correlation_matrix(self) -> Optional[pd.DataFrame]:
        """Get the computed correlation matrix."""
        return self._correlation_matrix
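
A minimal sketch of the tie-breaking behavior above (not part of the package): two near-duplicate columns are created, and `importance_scores` decides which survives. The data and scores are illustrative assumptions.

    import numpy as np
    import pandas as pd
    from featcopilot.selection.redundancy import RedundancyEliminator

    rng = np.random.default_rng(0)
    a = rng.normal(size=300)
    X = pd.DataFrame({"a": a, "a_copy": a + 1e-3 * rng.normal(size=300), "b": rng.normal(size=300)})

    eliminator = RedundancyEliminator(
        correlation_threshold=0.95,
        importance_scores={"a": 1.0, "a_copy": 0.2},
        verbose=True,
    )
    X_reduced = eliminator.fit_transform(X)  # y is optional for this selector
    print(eliminator.get_removed_features())  # expect ['a_copy'], since 'a' has higher importance
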
featcopilot/selection/statistical.py
@@ -0,0 +1,199 @@
"""Statistical feature selection methods."""

from typing import Optional, Union

import numpy as np
import pandas as pd

from featcopilot.core.base import BaseSelector


class StatisticalSelector(BaseSelector):
    """
    Feature selector based on statistical tests.

    Uses statistical tests to evaluate feature relevance:
    - Mutual information
    - Chi-square test (categorical)
    - F-test (ANOVA)
    - Correlation with target

    Parameters
    ----------
    method : str, default='mutual_info'
        Selection method ('mutual_info', 'f_test', 'chi2', 'correlation')
    max_features : int, optional
        Maximum features to select
    threshold : float, optional
        Minimum score threshold

    Examples
    --------
    >>> selector = StatisticalSelector(method='mutual_info', max_features=50)
    >>> X_selected = selector.fit_transform(X, y)
    """

    METHODS = ["mutual_info", "f_test", "chi2", "correlation"]

    def __init__(
        self,
        method: str = "mutual_info",
        max_features: Optional[int] = None,
        threshold: Optional[float] = None,
        verbose: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if method not in self.METHODS:
            raise ValueError(f"Method must be one of {self.METHODS}")

        self.method = method
        self.max_features = max_features
        self.threshold = threshold
        self.verbose = verbose

    def fit(
        self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray], **kwargs
    ) -> "StatisticalSelector":
        """
        Fit selector to compute feature scores.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray
            Target variable

        Returns
        -------
        self : StatisticalSelector
        """
        X = self._validate_input(X)
        y = np.array(y)

        # Compute scores based on method
        if self.method == "mutual_info":
            scores = self._compute_mutual_info(X, y)
        elif self.method == "f_test":
            scores = self._compute_f_test(X, y)
        elif self.method == "chi2":
            scores = self._compute_chi2(X, y)
        elif self.method == "correlation":
            scores = self._compute_correlation(X, y)
        else:
            raise ValueError(f"Unknown method: {self.method}")

        self._feature_scores = dict(zip(X.columns, scores))

        # Select features
        self._select_features()

        self._is_fitted = True
        return self

    def _compute_mutual_info(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
        """Compute mutual information scores."""
        from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

        # Determine if classification or regression
        unique_y = len(np.unique(y))
        is_classification = unique_y < 20 and y.dtype in [np.int32, np.int64, "object"]

        X_array = X.fillna(0).values

        if is_classification:
            scores = mutual_info_classif(X_array, y, random_state=42)
        else:
            scores = mutual_info_regression(X_array, y, random_state=42)

        return scores

    def _compute_f_test(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
        """Compute F-test scores."""
        from sklearn.feature_selection import f_classif, f_regression

        unique_y = len(np.unique(y))
        is_classification = unique_y < 20

        X_array = X.fillna(0).values

        if is_classification:
            scores, _ = f_classif(X_array, y)
        else:
            scores, _ = f_regression(X_array, y)

        # Handle NaN scores
        scores = np.nan_to_num(scores, nan=0.0)
        return scores

    def _compute_chi2(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
        """Compute chi-square scores (for non-negative features)."""
        from sklearn.feature_selection import chi2

        X_array = X.fillna(0).values

        # Chi2 requires non-negative values
        X_positive = X_array - X_array.min(axis=0) + 1e-8

        try:
            scores, _ = chi2(X_positive, y)
            scores = np.nan_to_num(scores, nan=0.0)
        except Exception:
            # Fallback to mutual information
            scores = self._compute_mutual_info(X, y)

        return scores

    def _compute_correlation(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
        """Compute absolute correlation with target."""
        scores = []
        for col in X.columns:
            try:
                corr = np.abs(np.corrcoef(X[col].fillna(0).values, y)[0, 1])
                scores.append(corr if not np.isnan(corr) else 0)
            except Exception:
                scores.append(0)

        return np.array(scores)

    def _select_features(self) -> None:
        """Select features based on scores."""
        # Sort features by score
        sorted_features = sorted(self._feature_scores.items(), key=lambda x: x[1], reverse=True)

        # Apply threshold
        if self.threshold is not None:
            sorted_features = [(name, score) for name, score in sorted_features if score >= self.threshold]

        # Apply max_features limit
        if self.max_features is not None:
            sorted_features = sorted_features[: self.max_features]

        self._selected_features = [name for name, _ in sorted_features]

        if self.verbose:
            print(f"StatisticalSelector: Selected {len(self._selected_features)} features")

    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """
        Select features from data.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features

        Returns
        -------
        X_selected : DataFrame
            Data with only selected features
        """
        if not self._is_fitted:
            raise RuntimeError("Selector must be fitted before transform")

        X = self._validate_input(X)

        # Keep only selected features that exist in X
        available = [f for f in self._selected_features if f in X.columns]
        return X[available]
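
A minimal usage sketch for the statistical selector (not part of the package): a continuous target routes `_compute_mutual_info` to the regression branch. The call to `get_feature_scores()` is assumed to exist on BaseSelector, since unified.py below invokes it; the synthetic data is illustrative.

    import numpy as np
    import pandas as pd
    from featcopilot.selection.statistical import StatisticalSelector

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(300, 5)), columns=list("abcde"))
    y = X["a"] * 2 + rng.normal(scale=0.1, size=300)  # float target -> regression path

    selector = StatisticalSelector(method="mutual_info", max_features=2)
    X_selected = selector.fit_transform(X, y)
    print(selector.get_feature_scores())  # per-column mutual information estimates
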
featcopilot/selection/unified.py
@@ -0,0 +1,172 @@
"""Unified feature selector combining multiple methods."""

from typing import Optional, Union

import numpy as np
import pandas as pd

from featcopilot.core.base import BaseSelector
from featcopilot.selection.importance import ImportanceSelector
from featcopilot.selection.redundancy import RedundancyEliminator
from featcopilot.selection.statistical import StatisticalSelector


class FeatureSelector(BaseSelector):
    """
    Unified feature selector combining multiple selection methods.

    Combines statistical tests, model importance, and redundancy
    elimination for comprehensive feature selection.

    Parameters
    ----------
    methods : list, default=['mutual_info', 'importance']
        Selection methods to use
    max_features : int, optional
        Maximum features to select
    correlation_threshold : float, default=0.95
        Threshold for redundancy elimination

    Examples
    --------
    >>> selector = FeatureSelector(
    ...     methods=['mutual_info', 'importance', 'correlation'],
    ...     max_features=50,
    ...     correlation_threshold=0.95
    ... )
    >>> X_selected = selector.fit_transform(X, y)
    """

    def __init__(
        self,
        methods: Optional[list[str]] = None,
        max_features: Optional[int] = None,
        correlation_threshold: float = 0.95,
        combination: str = "union",
        verbose: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.methods = methods or ["mutual_info", "importance"]
        self.max_features = max_features
        self.correlation_threshold = correlation_threshold
        self.combination = combination  # 'union' or 'intersection'
        self.verbose = verbose
        self._selectors: dict[str, BaseSelector] = {}
        self._method_scores: dict[str, dict[str, float]] = {}

    def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray], **kwargs) -> "FeatureSelector":
        """
        Fit all selection methods.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray
            Target variable

        Returns
        -------
        self : FeatureSelector
        """
        X = self._validate_input(X)
        y = np.array(y)

        # Initialize and fit each selector
        for method in self.methods:
            selector = self._create_selector(method)
            selector.fit(X, y)
            self._selectors[method] = selector
            self._method_scores[method] = selector.get_feature_scores()

        # Combine scores from all methods
        self._combine_scores(X.columns.tolist())

        # Apply redundancy elimination
        if self.correlation_threshold < 1.0:
            eliminator = RedundancyEliminator(
                correlation_threshold=self.correlation_threshold,
                importance_scores=self._feature_scores,
                verbose=self.verbose,
            )
            eliminator.fit(X)
            non_redundant = set(eliminator.get_selected_features())
            self._feature_scores = {k: v for k, v in self._feature_scores.items() if k in non_redundant}

        # Final selection
        self._final_selection()

        self._is_fitted = True
        return self

    def _create_selector(self, method: str) -> BaseSelector:
        """Create selector for a given method."""
        if method == "mutual_info":
            return StatisticalSelector(method="mutual_info", verbose=self.verbose)
        elif method == "f_test":
            return StatisticalSelector(method="f_test", verbose=self.verbose)
        elif method == "chi2":
            return StatisticalSelector(method="chi2", verbose=self.verbose)
        elif method == "correlation":
            return StatisticalSelector(method="correlation", verbose=self.verbose)
        elif method == "importance":
            return ImportanceSelector(model="random_forest", verbose=self.verbose)
        elif method == "xgboost":
            return ImportanceSelector(model="xgboost", verbose=self.verbose)
        else:
            raise ValueError(f"Unknown selection method: {method}")

    def _combine_scores(self, columns: list[str]) -> None:
        """Combine scores from multiple methods."""
        combined = {}

        for col in columns:
            scores = []
            for _, method_scores in self._method_scores.items():
                if col in method_scores:
                    # Normalize score to 0-1 range
                    all_scores = list(method_scores.values())
                    max_score = max(all_scores) if all_scores else 1
                    if max_score > 0:
                        normalized = method_scores[col] / max_score
                    else:
                        normalized = 0
                    scores.append(normalized)

            # Average normalized scores
            if scores:
                combined[col] = np.mean(scores)
            else:
                combined[col] = 0

        self._feature_scores = combined

    def _final_selection(self) -> None:
        """Make final feature selection."""
        sorted_features = sorted(self._feature_scores.items(), key=lambda x: x[1], reverse=True)

        if self.max_features is not None:
            sorted_features = sorted_features[: self.max_features]

        self._selected_features = [name for name, _ in sorted_features]

        if self.verbose:
            print(f"FeatureSelector: Selected {len(self._selected_features)} features")

    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """Select features from data."""
        if not self._is_fitted:
            raise RuntimeError("Selector must be fitted before transform")

        X = self._validate_input(X)
        available = [f for f in self._selected_features if f in X.columns]
        return X[available]

    def get_method_scores(self) -> dict[str, dict[str, float]]:
        """Get scores from each individual method."""
        return self._method_scores

    def get_ranking(self) -> list[tuple]:
        """Get feature ranking as list of (name, score) tuples."""
        return sorted(self._feature_scores.items(), key=lambda x: x[1], reverse=True)
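
A minimal end-to-end sketch of the unified pipeline (not part of the package): two methods are fitted, their normalized scores averaged, and a near-duplicate column exercises the redundancy-elimination step. `fit_transform` is assumed inherited from BaseSelector per the class docstring; the data is illustrative.

    import numpy as np
    import pandas as pd
    from featcopilot.selection.unified import FeatureSelector

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(300, 8)), columns=[f"f{i}" for i in range(8)])
    X["f7"] = X["f0"] + 1e-3 * rng.normal(size=300)  # near-duplicate to trigger elimination
    y = (X["f0"] - X["f1"] > 0).astype(int)

    selector = FeatureSelector(
        methods=["mutual_info", "importance"],
        max_features=4,
        correlation_threshold=0.95,
    )
    X_selected = selector.fit_transform(X, y)
    print(selector.get_ranking()[:4])  # (feature, combined score) pairs, best first
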