featcopilot 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -98,67 +98,137 @@ class StatisticalSelector(BaseSelector):
     def _compute_mutual_info(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
         """Compute mutual information scores."""
         from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
+        from sklearn.preprocessing import LabelEncoder

-        # Determine if classification or regression
-        unique_y = len(np.unique(y))
-        is_classification = unique_y < 20 and y.dtype in [np.int32, np.int64, "object"]
-
-        X_array = X.fillna(0).values
+        # Encode string labels if needed
+        y_encoded = y
+        if y.dtype == object or y.dtype.kind in ("U", "S"):
+            le = LabelEncoder()
+            y_encoded = le.fit_transform(y)

-        if is_classification:
-            scores = mutual_info_classif(X_array, y, random_state=42)
-        else:
-            scores = mutual_info_regression(X_array, y, random_state=42)
+        # Determine if classification or regression
+        # Check if target is categorical (object type) or has discrete integer values
+        unique_y = len(np.unique(y_encoded))
+        is_classification = (
+            y.dtype == object
+            or y.dtype.kind in ("U", "S")
+            or (np.issubdtype(y_encoded.dtype, np.integer) and unique_y <= len(y_encoded) * 0.1)
+        )
+
+        # Filter to numeric columns only
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+        scores = np.zeros(len(X.columns))
+
+        if numeric_cols:
+            X_numeric = X[numeric_cols].fillna(0).values
+            numeric_indices = [X.columns.get_loc(c) for c in numeric_cols]
+
+            if is_classification:
+                numeric_scores = mutual_info_classif(X_numeric, y_encoded, random_state=42)
+            else:
+                numeric_scores = mutual_info_regression(X_numeric, y_encoded, random_state=42)
+
+            for i, idx in enumerate(numeric_indices):
+                scores[idx] = numeric_scores[i]

         return scores

     def _compute_f_test(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
         """Compute F-test scores."""
         from sklearn.feature_selection import f_classif, f_regression
+        from sklearn.preprocessing import LabelEncoder
+
+        # Encode string labels if needed
+        y_encoded = y
+        if y.dtype == object or y.dtype.kind in ("U", "S"):
+            le = LabelEncoder()
+            y_encoded = le.fit_transform(y)

-        unique_y = len(np.unique(y))
-        is_classification = unique_y < 20
+        # Determine if classification or regression
+        unique_y = len(np.unique(y_encoded))
+        is_classification = (
+            y.dtype == object
+            or y.dtype.kind in ("U", "S")
+            or (np.issubdtype(y_encoded.dtype, np.integer) and unique_y <= len(y_encoded) * 0.1)
+        )

-        X_array = X.fillna(0).values
+        # Filter to numeric columns only
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+        scores = np.zeros(len(X.columns))

-        if is_classification:
-            scores, _ = f_classif(X_array, y)
-        else:
-            scores, _ = f_regression(X_array, y)
+        if numeric_cols:
+            X_numeric = X[numeric_cols].fillna(0).values
+            numeric_indices = [X.columns.get_loc(c) for c in numeric_cols]
+
+            if is_classification:
+                numeric_scores, _ = f_classif(X_numeric, y_encoded)
+            else:
+                numeric_scores, _ = f_regression(X_numeric, y_encoded)
+
+            # Handle NaN scores
+            numeric_scores = np.nan_to_num(numeric_scores, 0)
+
+            for i, idx in enumerate(numeric_indices):
+                scores[idx] = numeric_scores[i]

-        # Handle NaN scores
-        scores = np.nan_to_num(scores, 0)
         return scores

     def _compute_chi2(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
         """Compute chi-square scores (for non-negative features)."""
         from sklearn.feature_selection import chi2
+        from sklearn.preprocessing import LabelEncoder

-        X_array = X.fillna(0).values
+        # Encode string labels if needed
+        y_encoded = y
+        if y.dtype == object or y.dtype.kind in ("U", "S"):
+            le = LabelEncoder()
+            y_encoded = le.fit_transform(y)

-        # Chi2 requires non-negative values
-        X_positive = X_array - X_array.min(axis=0) + 1e-8
+        # Filter to numeric columns only
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+        scores = np.zeros(len(X.columns))

-        try:
-            scores, _ = chi2(X_positive, y)
-            scores = np.nan_to_num(scores, 0)
-        except Exception:
-            # Fallback to mutual information
-            scores = self._compute_mutual_info(X, y)
+        if numeric_cols:
+            X_numeric = X[numeric_cols].fillna(0).values
+            numeric_indices = [X.columns.get_loc(c) for c in numeric_cols]
+
+            # Chi2 requires non-negative values
+            X_positive = X_numeric - X_numeric.min(axis=0) + 1e-8
+
+            try:
+                numeric_scores, _ = chi2(X_positive, y_encoded)
+                numeric_scores = np.nan_to_num(numeric_scores, 0)
+            except Exception:
+                # Fallback to mutual information
+                return self._compute_mutual_info(X, y)
+
+            for i, idx in enumerate(numeric_indices):
+                scores[idx] = numeric_scores[i]

         return scores

     def _compute_correlation(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
         """Compute absolute correlation with target."""
-        scores = []
-        for col in X.columns:
+        from sklearn.preprocessing import LabelEncoder
+
+        # Encode string labels if needed
+        y_encoded = y
+        if y.dtype == object or y.dtype.kind in ("U", "S"):
+            le = LabelEncoder()
+            y_encoded = le.fit_transform(y).astype(float)
+
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+        scores = np.zeros(len(X.columns))
+
+        for col in numeric_cols:
             try:
-                corr = np.abs(np.corrcoef(X[col].fillna(0).values, y)[0, 1])
-                scores.append(corr if not np.isnan(corr) else 0)
+                idx = X.columns.get_loc(col)
+                corr = np.abs(np.corrcoef(X[col].fillna(0).values, y_encoded)[0, 1])
+                scores[idx] = corr if not np.isnan(corr) else 0
             except Exception:
-                scores.append(0)
+                pass

-        return np.array(scores)
+        return scores

     def _select_features(self) -> None:
         """Select features based on scores."""
@@ -46,6 +46,7 @@ class FeatureSelector(BaseSelector):
         max_features: Optional[int] = None,
         correlation_threshold: float = 0.95,
         combination: str = "union",
+        original_features: Optional[set[str]] = None,
         verbose: bool = False,
         **kwargs,
     ):
@@ -54,6 +55,7 @@ class FeatureSelector(BaseSelector):
         self.max_features = max_features
         self.correlation_threshold = correlation_threshold
         self.combination = combination  # 'union' or 'intersection'
+        self.original_features = original_features or set()
         self.verbose = verbose
         self._selectors: dict[str, BaseSelector] = {}
         self._method_scores: dict[str, dict[str, float]] = {}
@@ -76,6 +78,10 @@ class FeatureSelector(BaseSelector):
         X = self._validate_input(X)
         y = np.array(y)

+        # Identify categorical/text columns (can't be scored by numeric methods)
+        categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+
         # Initialize and fit each selector
         for method in self.methods:
             selector = self._create_selector(method)
@@ -86,11 +92,27 @@ class FeatureSelector(BaseSelector):
         # Combine scores from all methods
         self._combine_scores(X.columns.tolist())

+        # Give categorical columns a minimum score so they're not filtered out
+        # Original categorical columns are important for models that can handle them
+        if categorical_cols:
+            # Get the median score of numeric features to use as baseline for categorical
+            numeric_scores = [v for k, v in self._feature_scores.items() if k in numeric_cols and v > 0]
+            if numeric_scores:
+                baseline_score = np.median(numeric_scores)
+            else:
+                baseline_score = 0.5  # Default if no numeric scores
+
+            for col in categorical_cols:
+                if col in self.original_features:
+                    # Original categorical columns get a baseline score
+                    self._feature_scores[col] = max(self._feature_scores.get(col, 0), baseline_score)
+
         # Apply redundancy elimination
         if self.correlation_threshold < 1.0:
             eliminator = RedundancyEliminator(
                 correlation_threshold=self.correlation_threshold,
                 importance_scores=self._feature_scores,
+                original_features=self.original_features,
                 verbose=self.verbose,
             )
             eliminator.fit(X)
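Annotation: the baseline given to original categorical columns is the median of the positive numeric scores, falling back to 0.5. A tiny worked sketch of that computation (the score values and column names are invented):

    import numpy as np

    feature_scores = {"price": 0.8, "qty": 0.2, "city": 0.0}  # invented scores
    numeric_cols = ["price", "qty"]

    numeric_scores = [v for k, v in feature_scores.items() if k in numeric_cols and v > 0]
    baseline = np.median(numeric_scores) if numeric_scores else 0.5  # -> 0.5 here

    # An original categorical column like "city" is then raised to
    # max(feature_scores.get("city", 0), baseline) = 0.5, so it survives ranking.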
@@ -149,13 +171,42 @@ class FeatureSelector(BaseSelector):
         """Make final feature selection."""
         sorted_features = sorted(self._feature_scores.items(), key=lambda x: x[1], reverse=True)

+        # Always include original features first
+        original_selected = []
+        derived_selected = []
+
+        for name, score in sorted_features:
+            if name in self.original_features:
+                original_selected.append(name)
+            else:
+                # Only include derived features with meaningful importance (> 1% of max)
+                max_score = max(self._feature_scores.values()) if self._feature_scores else 1.0
+                importance_threshold = max_score * 0.01  # 1% threshold
+                if score >= importance_threshold:
+                    derived_selected.append(name)
+                elif self.verbose:
+                    logger.debug(f"Excluding low-importance feature {name} (score={score:.4f})")
+
+        # Apply max_features limit only to derived features
         if self.max_features is not None:
-            sorted_features = sorted_features[: self.max_features]
+            # Reserve slots for original features, then fill with top derived
+            n_derived = max(0, self.max_features - len(original_selected))
+            derived_selected = derived_selected[:n_derived]

-        self._selected_features = [name for name, _ in sorted_features]
+        self._selected_features = original_selected + derived_selected
+
+        # Ensure we never have fewer features than original
+        if len(self._selected_features) < len(self.original_features):
+            # This should not happen, but add all original features as safety
+            for f in self.original_features:
+                if f not in self._selected_features:
+                    self._selected_features.append(f)

         if self.verbose:
-            logger.info(f"FeatureSelector: Selected {len(self._selected_features)} features")
+            logger.info(
+                f"FeatureSelector: Selected {len(self._selected_features)} features "
+                f"({len(original_selected)} original + {len(derived_selected)} derived)"
+            )

     def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
         """Select features from data."""
@@ -7,9 +7,11 @@ and serving in production ML systems.

 from featcopilot.stores.base import BaseFeatureStore, FeatureStoreConfig
 from featcopilot.stores.feast_store import FeastFeatureStore
+from featcopilot.stores.rule_store import TransformRuleStore

 __all__ = [
     "BaseFeatureStore",
     "FeatureStoreConfig",
     "FeastFeatureStore",
+    "TransformRuleStore",
 ]
@@ -0,0 +1,343 @@
+"""Persistent storage for transform rules.
+
+Provides JSON-file based storage for saving, loading, and searching
+reusable transform rules.
+"""
+
+import json
+import os
+from pathlib import Path
+from typing import Optional
+
+from featcopilot.core.transform_rule import TransformRule
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class TransformRuleStore:
+    """
+    Persistent storage for transform rules.
+
+    Stores rules in a JSON file for reuse across sessions and datasets.
+    Supports searching by tags, description similarity, and column patterns.
+
+    Parameters
+    ----------
+    path : str, optional
+        Path to the JSON file for storage. Defaults to ~/.featcopilot/rules.json
+
+    Examples
+    --------
+    >>> store = TransformRuleStore()
+    >>> store.save_rule(rule)
+    >>> matching = store.find_matching_rules(columns=["price", "quantity"])
+    >>> all_rules = store.list_rules()
+    """
+
+    DEFAULT_PATH = "~/.featcopilot/rules.json"
+
+    def __init__(self, path: Optional[str] = None):
+        self.path = Path(os.path.expanduser(path or self.DEFAULT_PATH))
+        self._rules: dict[str, TransformRule] = {}
+        self._ensure_directory()
+        self._load()
+
+    def _ensure_directory(self) -> None:
+        """Ensure the storage directory exists."""
+        self.path.parent.mkdir(parents=True, exist_ok=True)
+
+    def _load(self) -> None:
+        """Load rules from storage file."""
+        if self.path.exists():
+            try:
+                with open(self.path, encoding="utf-8") as f:
+                    data = json.load(f)
+                self._rules = {rule_id: TransformRule.from_dict(rule_data) for rule_id, rule_data in data.items()}
+                logger.debug(f"Loaded {len(self._rules)} rules from {self.path}")
+            except (json.JSONDecodeError, KeyError) as e:
+                logger.warning(f"Failed to load rules from {self.path}: {e}")
+                self._rules = {}
+        else:
+            self._rules = {}
+
+    def _save(self) -> None:
+        """Save rules to storage file."""
+        try:
+            with open(self.path, "w", encoding="utf-8") as f:
+                data = {rule_id: rule.to_dict() for rule_id, rule in self._rules.items()}
+                json.dump(data, f, indent=2)
+            logger.debug(f"Saved {len(self._rules)} rules to {self.path}")
+        except OSError as e:
+            logger.error(f"Failed to save rules to {self.path}: {e}")
+            raise
+
+    def save_rule(self, rule: TransformRule) -> str:
+        """
+        Save a rule to the store.
+
+        Parameters
+        ----------
+        rule : TransformRule
+            The rule to save
+
+        Returns
+        -------
+        str
+            The rule's ID
+        """
+        self._rules[rule.id] = rule
+        self._save()
+        logger.info(f"Saved rule '{rule.name}' with ID {rule.id}")
+        return rule.id
+
+    def get_rule(self, rule_id: str) -> Optional[TransformRule]:
+        """
+        Get a rule by ID.
+
+        Parameters
+        ----------
+        rule_id : str
+            The rule's ID
+
+        Returns
+        -------
+        TransformRule or None
+            The rule if found, None otherwise
+        """
+        return self._rules.get(rule_id)
+
+    def get_rule_by_name(self, name: str) -> Optional[TransformRule]:
+        """
+        Get a rule by name.
+
+        Parameters
+        ----------
+        name : str
+            The rule's name
+
+        Returns
+        -------
+        TransformRule or None
+            The first rule matching the name, None if not found
+        """
+        for rule in self._rules.values():
+            if rule.name == name:
+                return rule
+        return None
+
+    def delete_rule(self, rule_id: str) -> bool:
+        """
+        Delete a rule by ID.
+
+        Parameters
+        ----------
+        rule_id : str
+            The rule's ID
+
+        Returns
+        -------
+        bool
+            True if deleted, False if not found
+        """
+        if rule_id in self._rules:
+            del self._rules[rule_id]
+            self._save()
+            logger.info(f"Deleted rule {rule_id}")
+            return True
+        return False
+
+    def list_rules(self, tags: Optional[list[str]] = None) -> list[TransformRule]:
+        """
+        List all rules, optionally filtered by tags.
+
+        Parameters
+        ----------
+        tags : list[str], optional
+            Filter rules that have all specified tags
+
+        Returns
+        -------
+        list[TransformRule]
+            List of matching rules
+        """
+        rules = list(self._rules.values())
+
+        if tags:
+            rules = [r for r in rules if all(t in r.tags for t in tags)]
+
+        return rules
+
+    def find_matching_rules(
+        self,
+        columns: Optional[list[str]] = None,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        min_usage: int = 0,
+    ) -> list[tuple[TransformRule, dict[str, str]]]:
+        """
+        Find rules that can be applied to the given context.
+
+        Parameters
+        ----------
+        columns : list[str], optional
+            Available column names to match against
+        description : str, optional
+            Description to search for (keyword matching)
+        tags : list[str], optional
+            Required tags
+        min_usage : int, default=0
+            Minimum usage count
+
+        Returns
+        -------
+        list[tuple[TransformRule, dict]]
+            List of (rule, column_mapping) tuples for applicable rules,
+            sorted by usage count (most used first)
+        """
+        results: list[tuple[TransformRule, dict[str, str]]] = []
+
+        for rule in self._rules.values():
+            # Filter by usage count
+            if rule.usage_count < min_usage:
+                continue
+
+            # Filter by tags
+            if tags and not all(t in rule.tags for t in tags):
+                continue
+
+            # Filter by description keywords
+            if description:
+                keywords = description.lower().split()
+                rule_text = f"{rule.name} {rule.description}".lower()
+                if not any(kw in rule_text for kw in keywords):
+                    continue
+
+            # Check column compatibility
+            mapping: dict[str, str] = {}
+            if columns:
+                matches, mapping = rule.matches_columns(columns)
+                if not matches:
+                    continue
+
+            results.append((rule, mapping))
+
+        # Sort by usage count (descending)
+        results.sort(key=lambda x: x[0].usage_count, reverse=True)
+
+        return results
+
+    def search_by_description(self, query: str, limit: int = 10) -> list[TransformRule]:
+        """
+        Search rules by description similarity.
+
+        Parameters
+        ----------
+        query : str
+            Search query
+        limit : int, default=10
+            Maximum number of results
+
+        Returns
+        -------
+        list[TransformRule]
+            Matching rules sorted by relevance
+        """
+        query_words = set(query.lower().split())
+        scored_rules: list[tuple[float, TransformRule]] = []
+
+        for rule in self._rules.values():
+            rule_words = set(f"{rule.name} {rule.description}".lower().split())
+
+            # Simple word overlap scoring
+            overlap = len(query_words & rule_words)
+            if overlap > 0:
+                score = overlap / len(query_words)
+                scored_rules.append((score, rule))
+
+        # Sort by score descending
+        scored_rules.sort(key=lambda x: x[0], reverse=True)
+
+        return [rule for _, rule in scored_rules[:limit]]
+
+    def import_rules(self, path: str, merge: bool = True) -> int:
+        """
+        Import rules from another JSON file.
+
+        Parameters
+        ----------
+        path : str
+            Path to import from
+        merge : bool, default=True
+            If True, merge with existing rules. If False, replace all.
+
+        Returns
+        -------
+        int
+            Number of rules imported
+        """
+        import_path = Path(os.path.expanduser(path))
+
+        if not import_path.exists():
+            raise FileNotFoundError(f"Import file not found: {path}")
+
+        with open(import_path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        if not merge:
+            self._rules = {}
+
+        count = 0
+        for _rule_id, rule_data in data.items():
+            rule = TransformRule.from_dict(rule_data)
+            self._rules[rule.id] = rule
+            count += 1
+
+        self._save()
+        logger.info(f"Imported {count} rules from {path}")
+
+        return count
+
+    def export_rules(self, path: str, tags: Optional[list[str]] = None) -> int:
+        """
+        Export rules to a JSON file.
+
+        Parameters
+        ----------
+        path : str
+            Path to export to
+        tags : list[str], optional
+            Only export rules with these tags
+
+        Returns
+        -------
+        int
+            Number of rules exported
+        """
+        export_path = Path(os.path.expanduser(path))
+        export_path.parent.mkdir(parents=True, exist_ok=True)
+
+        rules_to_export = self.list_rules(tags=tags)
+
+        with open(export_path, "w", encoding="utf-8") as f:
+            data = {r.id: r.to_dict() for r in rules_to_export}
+            json.dump(data, f, indent=2)
+
+        logger.info(f"Exported {len(rules_to_export)} rules to {path}")
+
+        return len(rules_to_export)
+
+    def clear(self) -> None:
+        """Remove all rules from the store."""
+        self._rules = {}
+        self._save()
+        logger.info("Cleared all rules")
+
+    def __len__(self) -> int:
+        return len(self._rules)
+
+    def __contains__(self, rule_id: str) -> bool:
+        return rule_id in self._rules
+
+    def __iter__(self):
+        return iter(self._rules.values())
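Annotation: the new module above (imported as featcopilot.stores.rule_store in the stores __init__ hunk) is a plain JSON-backed store. Everything it needs from TransformRule is visible in the code: `id`, `name`, `description`, `tags`, `usage_count`, `matches_columns()`, and the `to_dict`/`from_dict` pair. A round-trip sketch using only methods defined above (how `rule` itself is constructed depends on TransformRule's constructor, which this diff doesn't show):

    from featcopilot.stores.rule_store import TransformRuleStore

    store = TransformRuleStore(path="./team_rules.json")  # default is ~/.featcopilot/rules.json

    rule_id = store.save_rule(rule)   # persists the whole store to JSON immediately
    assert rule_id in store           # __contains__ checks by rule ID

    # Every filter is optional; results are (rule, column_mapping) pairs
    # sorted by usage_count, most used first
    hits = store.find_matching_rules(columns=["price", "quantity"], min_usage=1)
    for matched_rule, column_mapping in hits:
        print(matched_rule.name, column_mapping)

    store.export_rules("./shared_rules.json", tags=["pricing"])  # share a tagged subset
    store.import_rules("./shared_rules.json", merge=True)

One design note: `_save()` runs on every mutation, so changes are persisted immediately at the cost of a full-file rewrite per `save_rule`/`delete_rule`.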
@@ -90,7 +90,7 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
     Parameters
     ----------
     engines : list, default=['tabular']
-        Engines to use ('tabular', 'timeseries', 'text', 'llm')
+        Engines to use ('tabular', 'timeseries', 'relational', 'text', 'llm')
     max_features : int, optional
         Maximum features to generate/select
     selection_methods : list, default=['mutual_info', 'importance']
@@ -199,6 +199,8 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
             return TimeSeriesEngine(max_features=self.max_features, verbose=self.verbose)
         elif engine_name == "text":
             return TextEngine(max_features=self.max_features, verbose=self.verbose)
+        elif engine_name == "relational":
+            return RelationalEngine(max_features=self.max_features, verbose=self.verbose)
         elif engine_name == "llm":
             from featcopilot.llm.semantic_engine import SemanticEngine

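Annotation: 0.3.0 makes the 'relational' engine constructible through the same factory as the others. A minimal sketch (the top-level import path is assumed; the constructor arguments come from the docstring hunk above, and fit_transform is available via TransformerMixin):

    from featcopilot import AutoFeatureEngineer  # import path assumed

    afe = AutoFeatureEngineer(
        engines=["tabular", "relational"],  # 'relational' is newly accepted in 0.3.0
        max_features=100,
        verbose=True,
    )
    X_features = afe.fit_transform(X, y)

The import for RelationalEngine isn't visible in this hunk, so it is presumably added elsewhere in the module.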
@@ -294,12 +296,19 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
         self.fit(X, y, column_descriptions, task_description, **fit_params)
         result = self.transform(X)

+        # Track original features (input columns) vs derived features
+        if isinstance(X, np.ndarray):
+            original_features = {f"feature_{i}" for i in range(X.shape[1])}
+        else:
+            original_features = set(X.columns)
+
         # Apply feature selection if enabled and y is provided
         if apply_selection and y is not None and self.max_features:
             self._selector = FeatureSelector(
                 methods=self.selection_methods,
                 max_features=self.max_features,
                 correlation_threshold=self.correlation_threshold,
+                original_features=original_features,
                 verbose=self.verbose,
             )
             result = self._selector.fit_transform(result, y)
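Annotation: fit_transform now records which columns were inputs before selection runs. For DataFrame input that is simply `set(X.columns)`; for a bare ndarray the diff falls back to synthetic names, presumably mirroring how the engineered frame names its columns:

    import numpy as np

    X = np.random.rand(100, 4)  # invented example input
    original_features = {f"feature_{i}" for i in range(X.shape[1])}
    # -> {'feature_0', 'feature_1', 'feature_2', 'feature_3'}

Combined with the FeatureSelector changes above, a 0.3.0 fit_transform can therefore return more than `max_features` columns when the input itself is wide, since the cap now applies only to derived features.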