featcopilot-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,401 @@
+"""Scikit-learn compatible feature engineering transformers.
+
+Provides drop-in sklearn transformers for feature engineering pipelines.
+"""
+
+from typing import Any, Optional, Union
+
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+
+from featcopilot.core.feature import FeatureSet
+from featcopilot.engines.relational import RelationalEngine
+from featcopilot.engines.tabular import TabularEngine
+from featcopilot.engines.text import TextEngine
+from featcopilot.engines.timeseries import TimeSeriesEngine
+from featcopilot.selection.unified import FeatureSelector
+
+
+class FeatureEngineerTransformer(BaseEstimator, TransformerMixin):
+    """
+    Scikit-learn compatible feature engineering transformer.
+
+    Wraps individual engines for use in sklearn pipelines.
+
+    Parameters
+    ----------
+    engine : str, default='tabular'
+        Engine type ('tabular', 'timeseries', 'relational', 'text')
+    **engine_kwargs : dict
+        Arguments passed to the engine
+
+    Examples
+    --------
+    >>> from sklearn.pipeline import Pipeline
+    >>> pipe = Pipeline([
+    ...     ('features', FeatureEngineerTransformer(engine='tabular')),
+    ...     ('model', LogisticRegression())
+    ... ])
+    """
+
+    def __init__(self, engine: str = "tabular", **engine_kwargs):
+        self.engine = engine
+        self.engine_kwargs = engine_kwargs
+        self._engine_instance = None
+
+    def _create_engine(self):
+        """Create the appropriate engine instance."""
+        engines = {
+            "tabular": TabularEngine,
+            "timeseries": TimeSeriesEngine,
+            "relational": RelationalEngine,
+            "text": TextEngine,
+        }
+
+        if self.engine not in engines:
+            raise ValueError(f"Unknown engine: {self.engine}")
+
+        return engines[self.engine](**self.engine_kwargs)
+
+    def fit(self, X, y=None, **fit_params):
+        """Fit the transformer."""
+        self._engine_instance = self._create_engine()
+        self._engine_instance.fit(X, y, **fit_params)
+        return self
+
+    def transform(self, X, **transform_params):
+        """Transform data to generate features."""
+        if self._engine_instance is None:
+            raise RuntimeError("Transformer must be fitted before transform")
+        return self._engine_instance.transform(X, **transform_params)
+
+    def get_feature_names_out(self, input_features=None):
+        """Get output feature names."""
+        if self._engine_instance is None:
+            return []
+        return self._engine_instance.get_feature_names()
+
+
+class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
+    """
+    Main auto feature engineering class.
+
+    Combines multiple engines and selection methods for comprehensive
+    automated feature engineering with LLM capabilities.
+
+    Parameters
+    ----------
+    engines : list, default=['tabular']
+        Engines to use ('tabular', 'timeseries', 'text', 'llm')
+    max_features : int, optional
+        Maximum features to generate/select
+    selection_methods : list, default=['mutual_info', 'importance']
+        Feature selection methods
+    llm_config : dict, optional
+        Configuration for LLM engine
+    verbose : bool, default=False
+        Verbose output
+
+    Examples
+    --------
+    >>> engineer = AutoFeatureEngineer(
+    ...     engines=['tabular', 'llm'],
+    ...     max_features=100,
+    ...     llm_config={'model': 'gpt-5', 'enable_semantic': True}
+    ... )
+    >>> X_transformed = engineer.fit_transform(X, y)
+    """
+
+    def __init__(
+        self,
+        engines: Optional[list[str]] = None,
+        max_features: Optional[int] = None,
+        selection_methods: Optional[list[str]] = None,
+        correlation_threshold: float = 0.95,
+        llm_config: Optional[dict[str, Any]] = None,
+        verbose: bool = False,
+    ):
+        self.engines = engines or ["tabular"]
+        self.max_features = max_features
+        self.selection_methods = selection_methods or ["mutual_info", "importance"]
+        self.correlation_threshold = correlation_threshold
+        self.llm_config = llm_config or {}
+        self.verbose = verbose
+
+        self._engine_instances: dict[str, Any] = {}
+        self._selector: Optional[FeatureSelector] = None
+        self._feature_set = FeatureSet()
+        self._is_fitted = False
+        self._column_descriptions: dict[str, str] = {}
+        self._task_description: str = ""
+
+    def fit(
+        self,
+        X: Union[pd.DataFrame, np.ndarray],
+        y: Optional[Union[pd.Series, np.ndarray]] = None,
+        column_descriptions: Optional[dict[str, str]] = None,
+        task_description: str = "prediction task",
+        **fit_params,
+    ) -> "AutoFeatureEngineer":
+        """
+        Fit the auto feature engineer.
+
+        Parameters
+        ----------
+        X : DataFrame or ndarray
+            Input data
+        y : Series or ndarray, optional
+            Target variable
+        column_descriptions : dict, optional
+            Human-readable descriptions of columns (for LLM)
+        task_description : str
+            Description of the ML task (for LLM)
+        **fit_params : dict
+            Additional parameters
+
+        Returns
+        -------
+        self : AutoFeatureEngineer
+        """
+        # Convert to DataFrame if needed
+        if isinstance(X, np.ndarray):
+            X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
+
+        self._column_descriptions = column_descriptions or {}
+        self._task_description = task_description
+
+        # Fit each engine
+        for engine_name in self.engines:
+            engine = self._create_engine(engine_name)
+
+            if engine_name == "llm":
+                engine.fit(
+                    X,
+                    y,
+                    column_descriptions=column_descriptions,
+                    task_description=task_description,
+                    **fit_params,
+                )
+            else:
+                engine.fit(X, y, **fit_params)
+
+            self._engine_instances[engine_name] = engine
+
+            if self.verbose:
+                print(f"Fitted {engine_name} engine")
+
+        self._is_fitted = True
+        return self
+
+    def _create_engine(self, engine_name: str):
+        """Create an engine instance."""
+        if engine_name == "tabular":
+            return TabularEngine(max_features=self.max_features, verbose=self.verbose)
+        elif engine_name == "timeseries":
+            return TimeSeriesEngine(max_features=self.max_features, verbose=self.verbose)
+        elif engine_name == "text":
+            return TextEngine(max_features=self.max_features, verbose=self.verbose)
+        elif engine_name == "llm":
+            from featcopilot.llm.semantic_engine import SemanticEngine
+
+            return SemanticEngine(
+                model=self.llm_config.get("model", "gpt-5"),
+                max_suggestions=self.llm_config.get("max_suggestions", 20),
+                domain=self.llm_config.get("domain"),
+                verbose=self.verbose,
+            )
+        else:
+            raise ValueError(f"Unknown engine: {engine_name}")
+
+    def transform(self, X: Union[pd.DataFrame, np.ndarray], **transform_params) -> pd.DataFrame:
+        """
+        Transform data using fitted engines.
+
+        Parameters
+        ----------
+        X : DataFrame or ndarray
+            Input data
+        **transform_params : dict
+            Additional parameters
+
+        Returns
+        -------
+        X_transformed : DataFrame
+            Data with generated features
+        """
+        if not self._is_fitted:
+            raise RuntimeError("Must call fit before transform")
+
+        if isinstance(X, np.ndarray):
+            X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
+
+        result = X.copy()
+
+        # Transform with each engine
+        for engine_name, engine in self._engine_instances.items():
+            transformed = engine.transform(X, **transform_params)
+
+            # Add new features to result
+            new_cols = [c for c in transformed.columns if c not in result.columns]
+            for col in new_cols:
+                result[col] = transformed[col]
+
+            if self.verbose:
+                print(f"{engine_name}: Added {len(new_cols)} features")
+
+        # Handle infinities and NaNs
+        result = result.replace([np.inf, -np.inf], np.nan)
+
+        # Apply selection if selector was fitted
+        if self._selector is not None:
+            selected_features = self._selector.get_selected_features()
+            # Keep only selected features that exist in result
+            available = [f for f in selected_features if f in result.columns]
+            result = result[available]
+
+        return result
+
+    def fit_transform(
+        self,
+        X: Union[pd.DataFrame, np.ndarray],
+        y: Optional[Union[pd.Series, np.ndarray]] = None,
+        column_descriptions: Optional[dict[str, str]] = None,
+        task_description: str = "prediction task",
+        apply_selection: bool = True,
+        **fit_params,
+    ) -> pd.DataFrame:
+        """
+        Fit and transform in one step.
+
+        Parameters
+        ----------
+        X : DataFrame or ndarray
+            Input data
+        y : Series or ndarray, optional
+            Target variable
+        column_descriptions : dict, optional
+            Human-readable column descriptions
+        task_description : str
+            ML task description
+        apply_selection : bool, default=True
+            Whether to apply feature selection
+        **fit_params : dict
+            Additional parameters
+
+        Returns
+        -------
+        X_transformed : DataFrame
+            Transformed data with generated features
+        """
+        self.fit(X, y, column_descriptions, task_description, **fit_params)
+        result = self.transform(X)
+
+        # Apply feature selection if enabled and y is provided
+        if apply_selection and y is not None and self.max_features:
+            self._selector = FeatureSelector(
+                methods=self.selection_methods,
+                max_features=self.max_features,
+                correlation_threshold=self.correlation_threshold,
+                verbose=self.verbose,
+            )
+            result = self._selector.fit_transform(result, y)
+
+            if self.verbose:
+                print(f"Selected {len(self._selector.get_selected_features())} features")
+
+        return result
+
+    def get_feature_names(self) -> list[str]:
+        """Get names of all generated features."""
+        names = []
+        for engine in self._engine_instances.values():
+            names.extend(engine.get_feature_names())
+        return names
+
+    def get_feature_names_out(self, input_features=None) -> list[str]:
+        """Sklearn-compatible method for feature names."""
+        return self.get_feature_names()
+
+    def explain_features(self) -> dict[str, str]:
+        """
+        Get explanations for all features.
+
+        Returns
+        -------
+        explanations : dict
+            Mapping of feature names to explanations
+        """
+        explanations = {}
+
+        for _, engine in self._engine_instances.items():
+            if hasattr(engine, "get_feature_explanations"):
+                explanations.update(engine.get_feature_explanations())
+            elif hasattr(engine, "get_feature_set"):
+                feature_set = engine.get_feature_set()
+                explanations.update(feature_set.get_explanations())
+
+        return explanations
+
+    def get_feature_code(self) -> dict[str, str]:
+        """
+        Get code for generated features.
+
+        Returns
+        -------
+        code : dict
+            Mapping of feature names to Python code
+        """
+        code = {}
+
+        for _, engine in self._engine_instances.items():
+            if hasattr(engine, "get_feature_code"):
+                code.update(engine.get_feature_code())
+
+        return code
+
+    def generate_custom_features(self, prompt: str, n_features: int = 5) -> list[dict[str, Any]]:
+        """
+        Generate custom features via LLM prompt.
+
+        Parameters
+        ----------
+        prompt : str
+            Natural language description of desired features
+        n_features : int, default=5
+            Number of features to generate
+
+        Returns
+        -------
+        features : list
+            List of generated feature definitions
+        """
+        if "llm" not in self._engine_instances:
+            raise RuntimeError("LLM engine not enabled. Add 'llm' to engines list.")
+
+        llm_engine = self._engine_instances["llm"]
+        return llm_engine.suggest_more_features(prompt, n_features)
+
+    @property
+    def feature_importances_(self) -> Optional[dict[str, float]]:
+        """Get feature importance scores if selection was applied."""
+        if self._selector is not None:
+            return self._selector.get_feature_scores()
+        return None
+
+    def get_params(self, deep=True):
+        """Get parameters for sklearn compatibility."""
+        return {
+            "engines": self.engines,
+            "max_features": self.max_features,
+            "selection_methods": self.selection_methods,
+            "correlation_threshold": self.correlation_threshold,
+            "llm_config": self.llm_config,
+            "verbose": self.verbose,
+        }
+
+    def set_params(self, **params):
+        """Set parameters for sklearn compatibility."""
+        for key, value in params.items():
+            setattr(self, key, value)
+        return self
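The two classes above are meant to be used either as a pipeline step or standalone. A minimal usage sketch, assuming featcopilot 0.1.0 is installed; the import path featcopilot.sklearn, the toy DataFrame, and the column names are illustrative and not taken from the package:

    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    # NOTE: module path is assumed; the diff does not show the file name
    from featcopilot.sklearn import AutoFeatureEngineer, FeatureEngineerTransformer

    # Illustrative toy data
    X_df = pd.DataFrame({"age": [25, 32, 47, 51], "income": [40000, 52000, 88000, 61000]})
    y = pd.Series([0, 0, 1, 1])

    # As a drop-in sklearn pipeline step backed by the tabular engine
    pipe = Pipeline([
        ("features", FeatureEngineerTransformer(engine="tabular")),
        ("model", LogisticRegression()),
    ])
    pipe.fit(X_df, y)

    # Standalone: generate features with every configured engine, then select
    engineer = AutoFeatureEngineer(engines=["tabular"], max_features=50)
    X_new = engineer.fit_transform(X_df, y)
    print(engineer.get_feature_names_out()[:10])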
@@ -0,0 +1,9 @@
+"""Utility functions and classes."""
+
+from featcopilot.utils.cache import FeatureCache
+from featcopilot.utils.parallel import parallel_apply
+
+__all__ = [
+    "parallel_apply",
+    "FeatureCache",
+]
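Because this __init__ re-exports both utilities, downstream code can import them from the subpackage namespace directly, e.g.:

    from featcopilot.utils import FeatureCache, parallel_apply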
@@ -0,0 +1,221 @@
+"""Feature caching utilities."""
+
+import hashlib
+import json
+import pickle
+from pathlib import Path
+from typing import Any, Optional
+
+import pandas as pd
+
+
+class FeatureCache:
+    """
+    Cache for computed features.
+
+    Stores computed features to avoid recomputation.
+    Supports both in-memory and disk-based caching.
+
+    Parameters
+    ----------
+    cache_dir : str, optional
+        Directory for disk cache
+    max_memory_items : int, default=100
+        Maximum items in memory cache
+
+    Examples
+    --------
+    >>> cache = FeatureCache(cache_dir='.feature_cache')
+    >>> cache.set('my_feature', feature_data, metadata={'source': 'tabular'})
+    >>> data = cache.get('my_feature')
+    """
+
+    def __init__(self, cache_dir: Optional[str] = None, max_memory_items: int = 100):
+        self.cache_dir = Path(cache_dir) if cache_dir else None
+        self.max_memory_items = max_memory_items
+        self._memory_cache: dict[str, Any] = {}
+        self._metadata: dict[str, dict] = {}
+
+        if self.cache_dir:
+            self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+    def _get_cache_key(self, key: str, data_hash: Optional[str] = None) -> str:
+        """Generate cache key."""
+        if data_hash:
+            return f"{key}_{data_hash}"
+        return key
+
+    def _compute_data_hash(self, df: pd.DataFrame) -> str:
+        """Compute hash of DataFrame for cache invalidation."""
+        # Hash based on shape and sample of data
+        shape_str = f"{df.shape}"
+        sample = df.head(10).to_string()
+        combined = f"{shape_str}_{sample}"
+        return hashlib.md5(combined.encode()).hexdigest()[:16]
+
+    def get(self, key: str, data_hash: Optional[str] = None) -> Optional[Any]:
+        """
+        Get cached item.
+
+        Parameters
+        ----------
+        key : str
+            Cache key
+        data_hash : str, optional
+            Data hash for validation
+
+        Returns
+        -------
+        value : Any or None
+            Cached value or None if not found
+        """
+        cache_key = self._get_cache_key(key, data_hash)
+
+        # Check memory cache first
+        if cache_key in self._memory_cache:
+            return self._memory_cache[cache_key]
+
+        # Check disk cache
+        if self.cache_dir:
+            cache_path = self.cache_dir / f"{cache_key}.pkl"
+            if cache_path.exists():
+                try:
+                    with open(cache_path, "rb") as f:
+                        value = pickle.load(f)
+                    # Store in memory cache
+                    self._memory_cache[cache_key] = value
+                    return value
+                except Exception:
+                    pass
+
+        return None
+
+    def set(
+        self,
+        key: str,
+        value: Any,
+        data_hash: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        persist: bool = True,
+    ) -> None:
+        """
+        Set cached item.
+
+        Parameters
+        ----------
+        key : str
+            Cache key
+        value : Any
+            Value to cache
+        data_hash : str, optional
+            Data hash for validation
+        metadata : dict, optional
+            Additional metadata
+        persist : bool, default=True
+            Whether to persist to disk
+        """
+        cache_key = self._get_cache_key(key, data_hash)
+
+        # Add to memory cache
+        self._memory_cache[cache_key] = value
+        self._metadata[cache_key] = metadata or {}
+
+        # Evict if over limit
+        if len(self._memory_cache) > self.max_memory_items:
+            oldest_key = next(iter(self._memory_cache))
+            del self._memory_cache[oldest_key]
+            self._metadata.pop(oldest_key, None)
+
+        # Persist to disk
+        if persist and self.cache_dir:
+            cache_path = self.cache_dir / f"{cache_key}.pkl"
+            try:
+                with open(cache_path, "wb") as f:
+                    pickle.dump(value, f)
+
+                # Save metadata
+                meta_path = self.cache_dir / f"{cache_key}.meta.json"
+                with open(meta_path, "w") as f:
+                    json.dump(metadata or {}, f)
+            except Exception:
+                pass
+
+    def has(self, key: str, data_hash: Optional[str] = None) -> bool:
+        """Check if key exists in cache."""
+        return self.get(key, data_hash) is not None
+
+    def delete(self, key: str, data_hash: Optional[str] = None) -> bool:
+        """
+        Delete cached item.
+
+        Parameters
+        ----------
+        key : str
+            Cache key
+        data_hash : str, optional
+            Data hash
+
+        Returns
+        -------
+        deleted : bool
+            Whether item was deleted
+        """
+        cache_key = self._get_cache_key(key, data_hash)
+        deleted = False
+
+        if cache_key in self._memory_cache:
+            del self._memory_cache[cache_key]
+            self._metadata.pop(cache_key, None)
+            deleted = True
+
+        if self.cache_dir:
+            cache_path = self.cache_dir / f"{cache_key}.pkl"
+            meta_path = self.cache_dir / f"{cache_key}.meta.json"
+
+            if cache_path.exists():
+                cache_path.unlink()
+                deleted = True
+
+            if meta_path.exists():
+                meta_path.unlink()
+
+        return deleted
+
+    def clear(self) -> None:
+        """Clear all cached items."""
+        self._memory_cache.clear()
+        self._metadata.clear()
+
+        if self.cache_dir:
+            for f in self.cache_dir.glob("*.pkl"):
+                f.unlink()
+            for f in self.cache_dir.glob("*.meta.json"):
+                f.unlink()
+
+    def get_metadata(self, key: str, data_hash: Optional[str] = None) -> Optional[dict]:
+        """Get metadata for cached item."""
+        cache_key = self._get_cache_key(key, data_hash)
+
+        if cache_key in self._metadata:
+            return self._metadata[cache_key]
+
+        if self.cache_dir:
+            meta_path = self.cache_dir / f"{cache_key}.meta.json"
+            if meta_path.exists():
+                try:
+                    with open(meta_path) as f:
+                        return json.load(f)
+                except Exception:
+                    pass
+
+        return None
+
+    def list_keys(self) -> list:
+        """List all cached keys."""
+        keys = set(self._memory_cache.keys())
+
+        if self.cache_dir:
+            for f in self.cache_dir.glob("*.pkl"):
+                keys.add(f.stem)
+
+        return list(keys)
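A short sketch of how FeatureCache is intended to be used, keying entries to a hash of the input data so that a changed DataFrame misses the cache. The feature computation and key names below are illustrative, and _compute_data_hash is the class's own private helper:

    import pandas as pd
    from featcopilot.utils import FeatureCache

    cache = FeatureCache(cache_dir=".feature_cache", max_memory_items=50)

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
    features = df.assign(a_times_b=df["a"] * df["b"])  # stand-in for an expensive computation

    # Tie the cache entry to the current data; reuses the (private) hashing helper above
    data_hash = cache._compute_data_hash(df)
    cache.set("pairwise_products", features, data_hash=data_hash, metadata={"source": "tabular"})

    hit = cache.get("pairwise_products", data_hash=data_hash)  # cached DataFrame, or None on a miss
    assert cache.has("pairwise_products", data_hash=data_hash)
    print(cache.list_keys())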