featcopilot 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff compares the contents of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
@@ -6,6 +6,9 @@ import numpy as np
 import pandas as pd
 
 from featcopilot.core.base import BaseSelector
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
 
 
 class StatisticalSelector(BaseSelector):
@@ -95,67 +98,137 @@ class StatisticalSelector(BaseSelector):
     def _compute_mutual_info(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
         """Compute mutual information scores."""
         from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
+        from sklearn.preprocessing import LabelEncoder
 
-        # Determine if classification or regression
-        unique_y = len(np.unique(y))
-        is_classification = unique_y < 20 and y.dtype in [np.int32, np.int64, "object"]
-
-        X_array = X.fillna(0).values
+        # Encode string labels if needed
+        y_encoded = y
+        if y.dtype == object or y.dtype.kind in ("U", "S"):
+            le = LabelEncoder()
+            y_encoded = le.fit_transform(y)
 
-        if is_classification:
-            scores = mutual_info_classif(X_array, y, random_state=42)
-        else:
-            scores = mutual_info_regression(X_array, y, random_state=42)
+        # Determine if classification or regression
+        # Check if target is categorical (object type) or has discrete integer values
+        unique_y = len(np.unique(y_encoded))
+        is_classification = (
+            y.dtype == object
+            or y.dtype.kind in ("U", "S")
+            or (np.issubdtype(y_encoded.dtype, np.integer) and unique_y <= len(y_encoded) * 0.1)
+        )
+
+        # Filter to numeric columns only
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+        scores = np.zeros(len(X.columns))
+
+        if numeric_cols:
+            X_numeric = X[numeric_cols].fillna(0).values
+            numeric_indices = [X.columns.get_loc(c) for c in numeric_cols]
+
+            if is_classification:
+                numeric_scores = mutual_info_classif(X_numeric, y_encoded, random_state=42)
+            else:
+                numeric_scores = mutual_info_regression(X_numeric, y_encoded, random_state=42)
+
+            for i, idx in enumerate(numeric_indices):
+                scores[idx] = numeric_scores[i]
 
         return scores
 
     def _compute_f_test(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
         """Compute F-test scores."""
         from sklearn.feature_selection import f_classif, f_regression
+        from sklearn.preprocessing import LabelEncoder
 
-        unique_y = len(np.unique(y))
-        is_classification = unique_y < 20
+        # Encode string labels if needed
+        y_encoded = y
+        if y.dtype == object or y.dtype.kind in ("U", "S"):
+            le = LabelEncoder()
+            y_encoded = le.fit_transform(y)
 
-        X_array = X.fillna(0).values
+        # Determine if classification or regression
+        unique_y = len(np.unique(y_encoded))
+        is_classification = (
+            y.dtype == object
+            or y.dtype.kind in ("U", "S")
+            or (np.issubdtype(y_encoded.dtype, np.integer) and unique_y <= len(y_encoded) * 0.1)
+        )
 
-        if is_classification:
-            scores, _ = f_classif(X_array, y)
-        else:
-            scores, _ = f_regression(X_array, y)
+        # Filter to numeric columns only
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+        scores = np.zeros(len(X.columns))
+
+        if numeric_cols:
+            X_numeric = X[numeric_cols].fillna(0).values
+            numeric_indices = [X.columns.get_loc(c) for c in numeric_cols]
+
+            if is_classification:
+                numeric_scores, _ = f_classif(X_numeric, y_encoded)
+            else:
+                numeric_scores, _ = f_regression(X_numeric, y_encoded)
+
+            # Handle NaN scores
+            numeric_scores = np.nan_to_num(numeric_scores, 0)
+
+            for i, idx in enumerate(numeric_indices):
+                scores[idx] = numeric_scores[i]
 
-        # Handle NaN scores
-        scores = np.nan_to_num(scores, 0)
         return scores
 
     def _compute_chi2(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
         """Compute chi-square scores (for non-negative features)."""
         from sklearn.feature_selection import chi2
+        from sklearn.preprocessing import LabelEncoder
 
-        X_array = X.fillna(0).values
+        # Encode string labels if needed
+        y_encoded = y
+        if y.dtype == object or y.dtype.kind in ("U", "S"):
+            le = LabelEncoder()
+            y_encoded = le.fit_transform(y)
 
-        # Chi2 requires non-negative values
-        X_positive = X_array - X_array.min(axis=0) + 1e-8
+        # Filter to numeric columns only
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+        scores = np.zeros(len(X.columns))
 
-        try:
-            scores, _ = chi2(X_positive, y)
-            scores = np.nan_to_num(scores, 0)
-        except Exception:
-            # Fallback to mutual information
-            scores = self._compute_mutual_info(X, y)
+        if numeric_cols:
+            X_numeric = X[numeric_cols].fillna(0).values
+            numeric_indices = [X.columns.get_loc(c) for c in numeric_cols]
+
+            # Chi2 requires non-negative values
+            X_positive = X_numeric - X_numeric.min(axis=0) + 1e-8
+
+            try:
+                numeric_scores, _ = chi2(X_positive, y_encoded)
+                numeric_scores = np.nan_to_num(numeric_scores, 0)
+            except Exception:
+                # Fallback to mutual information
+                return self._compute_mutual_info(X, y)
+
+            for i, idx in enumerate(numeric_indices):
+                scores[idx] = numeric_scores[i]
 
         return scores
 
     def _compute_correlation(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
         """Compute absolute correlation with target."""
-        scores = []
-        for col in X.columns:
+        from sklearn.preprocessing import LabelEncoder
+
+        # Encode string labels if needed
+        y_encoded = y
+        if y.dtype == object or y.dtype.kind in ("U", "S"):
+            le = LabelEncoder()
+            y_encoded = le.fit_transform(y).astype(float)
+
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+        scores = np.zeros(len(X.columns))
+
+        for col in numeric_cols:
             try:
-                corr = np.abs(np.corrcoef(X[col].fillna(0).values, y)[0, 1])
-                scores.append(corr if not np.isnan(corr) else 0)
+                idx = X.columns.get_loc(col)
+                corr = np.abs(np.corrcoef(X[col].fillna(0).values, y_encoded)[0, 1])
+                scores[idx] = corr if not np.isnan(corr) else 0
             except Exception:
-                scores.append(0)
+                pass
 
-        return np.array(scores)
+        return scores
 
     def _select_features(self) -> None:
         """Select features based on scores."""
@@ -173,7 +246,7 @@
         self._selected_features = [name for name, _ in sorted_features]
 
         if self.verbose:
-            print(f"StatisticalSelector: Selected {len(self._selected_features)} features")
+            logger.info(f"StatisticalSelector: Selected {len(self._selected_features)} features")
 
     def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
         """
@@ -9,6 +9,9 @@ from featcopilot.core.base import BaseSelector
 from featcopilot.selection.importance import ImportanceSelector
 from featcopilot.selection.redundancy import RedundancyEliminator
 from featcopilot.selection.statistical import StatisticalSelector
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
 
 
 class FeatureSelector(BaseSelector):
@@ -43,6 +46,7 @@ class FeatureSelector(BaseSelector):
         max_features: Optional[int] = None,
         correlation_threshold: float = 0.95,
         combination: str = "union",
+        original_features: Optional[set[str]] = None,
         verbose: bool = False,
         **kwargs,
     ):
@@ -51,6 +55,7 @@ class FeatureSelector(BaseSelector):
         self.max_features = max_features
         self.correlation_threshold = correlation_threshold
         self.combination = combination  # 'union' or 'intersection'
+        self.original_features = original_features or set()
         self.verbose = verbose
         self._selectors: dict[str, BaseSelector] = {}
         self._method_scores: dict[str, dict[str, float]] = {}
@@ -73,6 +78,10 @@
         X = self._validate_input(X)
         y = np.array(y)
 
+        # Identify categorical/text columns (can't be scored by numeric methods)
+        categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+
         # Initialize and fit each selector
         for method in self.methods:
             selector = self._create_selector(method)
@@ -83,11 +92,27 @@
         # Combine scores from all methods
         self._combine_scores(X.columns.tolist())
 
+        # Give categorical columns a minimum score so they're not filtered out
+        # Original categorical columns are important for models that can handle them
+        if categorical_cols:
+            # Get the median score of numeric features to use as baseline for categorical
+            numeric_scores = [v for k, v in self._feature_scores.items() if k in numeric_cols and v > 0]
+            if numeric_scores:
+                baseline_score = np.median(numeric_scores)
+            else:
+                baseline_score = 0.5  # Default if no numeric scores
+
+            for col in categorical_cols:
+                if col in self.original_features:
+                    # Original categorical columns get a baseline score
+                    self._feature_scores[col] = max(self._feature_scores.get(col, 0), baseline_score)
+
         # Apply redundancy elimination
         if self.correlation_threshold < 1.0:
             eliminator = RedundancyEliminator(
                 correlation_threshold=self.correlation_threshold,
                 importance_scores=self._feature_scores,
+                original_features=self.original_features,
                 verbose=self.verbose,
             )
             eliminator.fit(X)
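The baseline step above in numbers: the median of the positive numeric scores becomes the floor for original categorical columns, so columns that the numeric scorers left at zero still survive selection. A toy illustration:

```python
import numpy as np

feature_scores = {"age": 0.8, "income": 0.2, "city": 0.0}  # 'city' is categorical
numeric_cols = ["age", "income"]
original_features = {"age", "income", "city"}

positive = [v for k, v in feature_scores.items() if k in numeric_cols and v > 0]
baseline = np.median(positive) if positive else 0.5  # median of [0.8, 0.2] -> 0.5

for col in ["city"]:
    if col in original_features:
        feature_scores[col] = max(feature_scores[col], baseline)

print(feature_scores["city"])  # 0.5
```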
@@ -146,13 +171,42 @@
         """Make final feature selection."""
         sorted_features = sorted(self._feature_scores.items(), key=lambda x: x[1], reverse=True)
 
+        # Always include original features first
+        original_selected = []
+        derived_selected = []
+
+        for name, score in sorted_features:
+            if name in self.original_features:
+                original_selected.append(name)
+            else:
+                # Only include derived features with meaningful importance (> 1% of max)
+                max_score = max(self._feature_scores.values()) if self._feature_scores else 1.0
+                importance_threshold = max_score * 0.01  # 1% threshold
+                if score >= importance_threshold:
+                    derived_selected.append(name)
+                elif self.verbose:
+                    logger.debug(f"Excluding low-importance feature {name} (score={score:.4f})")
+
+        # Apply max_features limit only to derived features
         if self.max_features is not None:
-            sorted_features = sorted_features[: self.max_features]
+            # Reserve slots for original features, then fill with top derived
+            n_derived = max(0, self.max_features - len(original_selected))
+            derived_selected = derived_selected[:n_derived]
 
-        self._selected_features = [name for name, _ in sorted_features]
+        self._selected_features = original_selected + derived_selected
+
+        # Ensure we never have fewer features than original
+        if len(self._selected_features) < len(self.original_features):
+            # This should not happen, but add all original features as safety
+            for f in self.original_features:
+                if f not in self._selected_features:
+                    self._selected_features.append(f)
 
         if self.verbose:
-            print(f"FeatureSelector: Selected {len(self._selected_features)} features")
+            logger.info(
+                f"FeatureSelector: Selected {len(self._selected_features)} features "
+                f"({len(original_selected)} original + {len(derived_selected)} derived)"
+            )
 
     def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
         """Select features from data."""
@@ -0,0 +1,17 @@
+"""Feature store integrations for FeatCopilot.
+
+Provides interfaces to save and retrieve engineered features
+from popular feature stores like Feast, enabling feature reuse
+and serving in production ML systems.
+"""
+
+from featcopilot.stores.base import BaseFeatureStore, FeatureStoreConfig
+from featcopilot.stores.feast_store import FeastFeatureStore
+from featcopilot.stores.rule_store import TransformRuleStore
+
+__all__ = [
+    "BaseFeatureStore",
+    "FeatureStoreConfig",
+    "FeastFeatureStore",
+    "TransformRuleStore",
+]
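Since the new `__init__.py` re-exports the store classes, both import paths below should resolve to the same objects:

```python
from featcopilot.stores import FeastFeatureStore
from featcopilot.stores.feast_store import FeastFeatureStore as DirectFeast

assert FeastFeatureStore is DirectFeast
```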
@@ -0,0 +1,166 @@
+"""Base classes for feature store integrations."""
+
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+
+import pandas as pd
+from pydantic import BaseModel, Field
+
+from featcopilot.core.feature import FeatureSet
+
+
+class FeatureStoreConfig(BaseModel):
+    """Base configuration for feature stores."""
+
+    name: str = Field(description="Feature store name")
+    entity_columns: list[str] = Field(default_factory=list, description="Entity/key columns")
+    timestamp_column: Optional[str] = Field(default=None, description="Event timestamp column")
+    feature_prefix: str = Field(default="", description="Prefix for feature names")
+    tags: dict[str, str] = Field(default_factory=dict, description="Tags/labels for features")
+
+
+class BaseFeatureStore(ABC):
+    """
+    Abstract base class for feature store integrations.
+
+    Provides a unified interface for saving and retrieving
+    engineered features from various feature stores.
+
+    Parameters
+    ----------
+    config : FeatureStoreConfig
+        Configuration for the feature store
+
+    Examples
+    --------
+    >>> store = ConcreteFeatureStore(config)
+    >>> store.save_features(X_transformed, feature_set, feature_view_name='my_features')
+    >>> features = store.get_features(entity_df, feature_names=['feat1', 'feat2'])
+    """
+
+    def __init__(self, config: FeatureStoreConfig):
+        self.config = config
+        self._is_initialized = False
+
+    @abstractmethod
+    def initialize(self) -> None:
+        """
+        Initialize connection to the feature store.
+
+        This should be called before any other operations.
+        """
+        pass
+
+    @abstractmethod
+    def save_features(
+        self,
+        df: pd.DataFrame,
+        feature_set: Optional[FeatureSet] = None,
+        feature_view_name: str = "featcopilot_features",
+        description: Optional[str] = None,
+        **kwargs,
+    ) -> None:
+        """
+        Save features to the feature store.
+
+        Parameters
+        ----------
+        df : DataFrame
+            DataFrame containing features to save
+        feature_set : FeatureSet, optional
+            FeatCopilot FeatureSet with metadata
+        feature_view_name : str
+            Name for the feature view/table
+        description : str, optional
+            Description of the feature view
+        **kwargs
+            Additional store-specific options
+        """
+        pass
+
+    @abstractmethod
+    def get_features(
+        self,
+        entity_df: pd.DataFrame,
+        feature_names: list[str],
+        feature_view_name: str = "featcopilot_features",
+        **kwargs,
+    ) -> pd.DataFrame:
+        """
+        Retrieve features from the feature store.
+
+        Parameters
+        ----------
+        entity_df : DataFrame
+            DataFrame with entity keys and timestamps
+        feature_names : list
+            Names of features to retrieve
+        feature_view_name : str
+            Name of the feature view/table
+        **kwargs
+            Additional store-specific options
+
+        Returns
+        -------
+        DataFrame
+            DataFrame with requested features
+        """
+        pass
+
+    @abstractmethod
+    def list_feature_views(self) -> list[str]:
+        """
+        List all feature views in the store.
+
+        Returns
+        -------
+        list
+            Names of feature views
+        """
+        pass
+
+    @abstractmethod
+    def get_feature_view_schema(self, feature_view_name: str) -> dict[str, Any]:
+        """
+        Get schema/metadata for a feature view.
+
+        Parameters
+        ----------
+        feature_view_name : str
+            Name of the feature view
+
+        Returns
+        -------
+        dict
+            Schema information
+        """
+        pass
+
+    @abstractmethod
+    def delete_feature_view(self, feature_view_name: str) -> bool:
+        """
+        Delete a feature view.
+
+        Parameters
+        ----------
+        feature_view_name : str
+            Name of the feature view to delete
+
+        Returns
+        -------
+        bool
+            Whether deletion was successful
+        """
+        pass
+
+    def close(self) -> None:
+        """Close connection to the feature store."""
+        self._is_initialized = False
+
+    def __enter__(self):
+        self.initialize()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+        return False
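To make the contract concrete, here is a minimal sketch of a subclass backed by a plain dict, together with the context-manager usage that `__enter__`/`__exit__` enable. `InMemoryFeatureStore` is hypothetical, not part of the package:

```python
import pandas as pd

from featcopilot.stores.base import BaseFeatureStore, FeatureStoreConfig


class InMemoryFeatureStore(BaseFeatureStore):
    """Hypothetical store keeping feature views in a dict, for illustration."""

    def initialize(self) -> None:
        self._views: dict[str, pd.DataFrame] = {}
        self._is_initialized = True

    def save_features(self, df, feature_set=None,
                      feature_view_name="featcopilot_features",
                      description=None, **kwargs) -> None:
        self._views[feature_view_name] = df.copy()

    def get_features(self, entity_df, feature_names,
                     feature_view_name="featcopilot_features", **kwargs):
        # Join the requested features onto the entity frame by the entity columns
        view = self._views[feature_view_name]
        keys = self.config.entity_columns
        return entity_df.merge(view[keys + feature_names], on=keys, how="left")

    def list_feature_views(self) -> list[str]:
        return list(self._views)

    def get_feature_view_schema(self, feature_view_name):
        return {c: str(t) for c, t in self._views[feature_view_name].dtypes.items()}

    def delete_feature_view(self, feature_view_name) -> bool:
        return self._views.pop(feature_view_name, None) is not None


config = FeatureStoreConfig(name="demo", entity_columns=["user_id"])
with InMemoryFeatureStore(config) as store:  # __enter__ calls initialize()
    store.save_features(pd.DataFrame({"user_id": [1, 2], "feat1": [0.5, 0.9]}))
    print(store.get_features(pd.DataFrame({"user_id": [2]}), feature_names=["feat1"]))
    #    user_id  feat1
    # 0        2    0.9
```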