featcopilot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,200 @@
1
+ """Feature explanation generator using LLM.
2
+
3
+ Generates human-readable explanations for features.
4
+ """
5
+
6
+ from typing import Optional
7
+
8
+ import pandas as pd
9
+
10
+ from featcopilot.core.feature import Feature, FeatureSet
11
+ from featcopilot.llm.copilot_client import SyncCopilotFeatureClient
12
+
13
+
14
class FeatureExplainer:
    """
    Generate human-readable explanations for features.

    Uses LLM to create interpretable explanations that can be
    understood by non-technical stakeholders.

    Parameters
    ----------
    model : str, default='gpt-5'
        LLM model to use
    verbose : bool, default=False
        If True, print a diagnostic when a feature cannot be explained

    Examples
    --------
    >>> explainer = FeatureExplainer()
    >>> explanations = explainer.explain_features(feature_set, task='predict churn')
    """

    def __init__(self, model: str = "gpt-5", verbose: bool = False):
        self.model = model
        self.verbose = verbose
        # Client is created lazily so constructing the explainer is cheap and
        # does not require LLM connectivity up front.
        self._client: Optional[SyncCopilotFeatureClient] = None

    def _ensure_client(self) -> None:
        """Ensure client is initialized (lazy, one-time)."""
        if self._client is None:
            self._client = SyncCopilotFeatureClient(model=self.model)
            self._client.start()

    def explain_feature(
        self,
        feature: Feature,
        column_descriptions: Optional[dict[str, str]] = None,
        task_description: Optional[str] = None,
    ) -> str:
        """
        Generate explanation for a single feature.

        Parameters
        ----------
        feature : Feature
            Feature to explain
        column_descriptions : dict, optional
            Descriptions of source columns
        task_description : str, optional
            ML task description

        Returns
        -------
        explanation : str
            Human-readable explanation
        """
        self._ensure_client()

        # Prefer the generated code as context; fall back to the
        # transformation description when no code is attached.
        explanation = self._client.explain_feature(
            feature_name=feature.name,
            feature_code=feature.code or feature.transformation,
            column_descriptions=column_descriptions,
            task_description=task_description,
        )

        return explanation

    def explain_features(
        self,
        features: FeatureSet,
        column_descriptions: Optional[dict[str, str]] = None,
        task_description: Optional[str] = None,
        batch_size: int = 5,
    ) -> dict[str, str]:
        """
        Generate explanations for multiple features.

        Parameters
        ----------
        features : FeatureSet
            Features to explain
        column_descriptions : dict, optional
            Descriptions of source columns
        task_description : str, optional
            ML task description
        batch_size : int, default=5
            Number of features to explain in each LLM call.
            NOTE(review): currently unused — each feature is explained in its
            own call; confirm whether batching is planned or drop the param.

        Returns
        -------
        explanations : dict
            Mapping of feature names to explanations
        """
        explanations: dict[str, str] = {}

        for feature in features:
            # Reuse a cached explanation if the feature already carries one.
            if feature.explanation:
                explanations[feature.name] = feature.explanation
                continue

            try:
                explanation = self.explain_feature(feature, column_descriptions, task_description)
                explanations[feature.name] = explanation
                # Cache on the feature object so later calls skip the LLM.
                feature.explanation = explanation

            except Exception as e:
                # Best-effort: one failing feature must not abort the batch;
                # fall back to a generic source-column description.
                if self.verbose:
                    print(f"Could not explain {feature.name}: {e}")
                explanations[feature.name] = f"Feature based on: {', '.join(feature.source_columns)}"

        return explanations

    def generate_feature_report(
        self,
        features: FeatureSet,
        X: pd.DataFrame,
        column_descriptions: Optional[dict[str, str]] = None,
        task_description: Optional[str] = None,
    ) -> str:
        """
        Generate a comprehensive report about features.

        Parameters
        ----------
        features : FeatureSet
            Features to report on
        X : DataFrame
            Data with features
        column_descriptions : dict, optional
            Descriptions of source columns
        task_description : str, optional
            ML task description

        Returns
        -------
        report : str
            Markdown-formatted report
        """
        explanations = self.explain_features(features, column_descriptions, task_description)

        report = "# Feature Engineering Report\n\n"

        if task_description:
            report += f"**Task:** {task_description}\n\n"

        report += f"**Total Features Generated:** {len(features)}\n\n"

        # Summary by origin
        report += "## Features by Origin\n\n"
        origins: dict[str, int] = {}
        for feature in features:
            origin = feature.origin.value
            origins[origin] = origins.get(origin, 0) + 1

        for origin, count in sorted(origins.items()):
            report += f"- {origin}: {count}\n"

        # Feature details
        report += "\n## Feature Details\n\n"

        for feature in features:
            report += f"### {feature.name}\n\n"
            report += f"- **Type:** {feature.dtype.value}\n"
            report += f"- **Origin:** {feature.origin.value}\n"
            report += f"- **Source Columns:** {', '.join(feature.source_columns)}\n"

            if feature.name in X.columns:
                col = X[feature.name]
                report += f"- **Non-null Values:** {col.notna().sum()}\n"
                # Use a dtype predicate rather than comparing against the
                # literal strings "float64"/"int64", which silently skipped
                # other numeric dtypes (int32, float32, nullable ints, ...).
                # Booleans are excluded to keep stats meaningful.
                if pd.api.types.is_numeric_dtype(col) and not pd.api.types.is_bool_dtype(col):
                    report += f"- **Mean:** {col.mean():.4f}\n"
                    report += f"- **Std:** {col.std():.4f}\n"

            explanation = explanations.get(feature.name, "")
            if explanation:
                report += f"\n**Explanation:** {explanation}\n"

            if feature.code:
                report += f"\n**Code:**\n```python\n{feature.code}\n```\n"

            report += "\n"

        return report

    def __del__(self):
        """Best-effort client shutdown on garbage collection."""
        # getattr guards against partially-constructed instances (__init__
        # raised before _client was assigned) and interpreter teardown.
        client = getattr(self, "_client", None)
        if client:
            try:
                client.stop()
            except Exception:
                # Destructors must never raise.
                pass
@@ -0,0 +1,379 @@
1
+ """LLM-powered semantic feature engineering engine.
2
+
3
+ Uses contextual understanding of data to generate meaningful features.
4
+ """
5
+
6
+ from typing import Any, Optional, Union
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ from pydantic import Field
11
+
12
+ from featcopilot.core.base import BaseEngine, EngineConfig
13
+ from featcopilot.core.feature import Feature, FeatureOrigin, FeatureSet, FeatureType
14
+ from featcopilot.llm.copilot_client import SyncCopilotFeatureClient
15
+
16
+
17
class SemanticEngineConfig(EngineConfig):
    """Configuration for semantic feature engine."""

    # Engine display name (inherited field from EngineConfig, overridden here).
    name: str = "SemanticEngine"
    model: str = Field(default="gpt-5", description="LLM model to use")
    max_suggestions: int = Field(default=20, description="Max features to suggest")
    validate_features: bool = Field(default=True, description="Validate generated code")
    domain: Optional[str] = Field(default=None, description="Domain context")
    # NOTE(review): temperature is declared but not visibly consumed by
    # SemanticEngine in this file (only `model` is passed to the client) —
    # confirm it is read elsewhere or wire it through.
    temperature: float = Field(default=0.3, description="LLM temperature")
26
+
27
+
28
class SemanticEngine(BaseEngine):
    """
    LLM-powered semantic feature engineering engine.

    Uses GitHub Copilot SDK to:
    - Understand column semantics from names and descriptions
    - Generate domain-aware features
    - Create interpretable features with explanations
    - Generate custom Python code for complex transformations

    This is the KEY DIFFERENTIATOR from existing libraries like CAAFE.

    Parameters
    ----------
    model : str, default='gpt-5'
        LLM model to use
    max_suggestions : int, default=20
        Maximum number of features to suggest
    validate_features : bool, default=True
        Whether to validate generated feature code
    domain : str, optional
        Domain context (e.g., 'healthcare', 'finance', 'retail')
    verbose : bool, default=False
        If True, print progress diagnostics

    Examples
    --------
    >>> engine = SemanticEngine(model='gpt-5', domain='healthcare')
    >>> X_features = engine.fit_transform(
    ...     X, y,
    ...     column_descriptions={'age': 'Patient age', 'bmi': 'Body mass index'},
    ...     task_description='Predict diabetes risk'
    ... )
    """

    def __init__(
        self,
        model: str = "gpt-5",
        max_suggestions: int = 20,
        validate_features: bool = True,
        domain: Optional[str] = None,
        verbose: bool = False,
        **kwargs,
    ):
        config = SemanticEngineConfig(
            model=model,
            max_suggestions=max_suggestions,
            validate_features=validate_features,
            domain=domain,
            verbose=verbose,
            **kwargs,
        )
        super().__init__(config=config)
        self.config: SemanticEngineConfig = config
        # Copilot client is created lazily in _ensure_client().
        self._client: Optional[SyncCopilotFeatureClient] = None
        # Raw suggestion dicts from the LLM (name/code/explanation/...).
        self._suggested_features: list[dict[str, Any]] = []
        self._feature_set = FeatureSet()
        # Column name -> coarse dtype label ("string"/"integer"/"float"/raw).
        self._column_info: dict[str, str] = {}
        self._column_descriptions: dict[str, str] = {}
        self._task_description: str = ""

    def _ensure_client(self) -> None:
        """Ensure Copilot client is initialized (lazy, one-time)."""
        if self._client is None:
            self._client = SyncCopilotFeatureClient(model=self.config.model)
            self._client.start()

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        column_descriptions: Optional[dict[str, str]] = None,
        task_description: str = "classification/regression task",
        **kwargs,
    ) -> "SemanticEngine":
        """
        Fit the engine by analyzing data and generating feature suggestions.

        Parameters
        ----------
        X : DataFrame
            Input data
        y : Series, optional
            Target variable (not used directly here; kept for API symmetry)
        column_descriptions : dict, optional
            Human-readable descriptions of columns
        task_description : str
            Description of the ML task

        Returns
        -------
        self : SemanticEngine
        """
        X = self._validate_input(X)
        self._ensure_client()

        # Store metadata for later suggest_more_features() calls.
        self._column_descriptions = column_descriptions or {}
        self._task_description = task_description

        # Build a coarse, LLM-friendly dtype label per column.
        self._column_info = {}
        for col in X.columns:
            dtype = str(X[col].dtype)
            if X[col].dtype == "object":
                dtype = "string"
            elif np.issubdtype(X[col].dtype, np.integer):
                dtype = "integer"
            elif np.issubdtype(X[col].dtype, np.floating):
                dtype = "float"
            self._column_info[col] = dtype

        # Get LLM suggestions
        if self.config.verbose:
            print("SemanticEngine: Requesting feature suggestions from LLM...")

        self._suggested_features = self._client.suggest_features(
            column_info=self._column_info,
            task_description=task_description,
            column_descriptions=column_descriptions,
            domain=self.config.domain,
            max_suggestions=self.config.max_suggestions,
        )

        if self.config.verbose:
            print(f"SemanticEngine: Received {len(self._suggested_features)} suggestions")

        # Validate features if enabled
        if self.config.validate_features:
            self._validate_suggestions(X)

        # Build feature set
        self._build_feature_set()

        self._is_fitted = True
        return self

    def _validate_suggestions(self, X: pd.DataFrame) -> None:
        """Validate suggested feature code, keeping only runnable suggestions."""
        valid_features = []
        # Only a 100-row sample is sent for validation to keep payloads small.
        sample_data = {col: X[col].head(100).tolist() for col in X.columns}

        for feature in self._suggested_features:
            code = feature.get("code", "")
            if not code:
                continue

            result = self._client.validate_feature_code(code, sample_data)

            if result["valid"]:
                valid_features.append(feature)
            elif self.config.verbose:
                print(
                    f"SemanticEngine: Invalid feature '{feature.get('name', 'unknown')}': {result.get('error', 'unknown error')}"
                )

        self._suggested_features = valid_features

        if self.config.verbose:
            print(f"SemanticEngine: {len(valid_features)} valid features after validation")

    def _build_feature_set(self) -> None:
        """Build FeatureSet metadata objects from the raw suggestions."""
        self._feature_set = FeatureSet()

        for suggestion in self._suggested_features:
            feature = Feature(
                name=suggestion.get("name", f"llm_feature_{len(self._feature_set)}"),
                dtype=FeatureType.NUMERIC,
                origin=FeatureOrigin.LLM_GENERATED,
                source_columns=suggestion.get("source_columns", []),
                transformation="llm_generated",
                explanation=suggestion.get("explanation", ""),
                code=suggestion.get("code", ""),
            )
            self._feature_set.add(feature)

    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """
        Generate LLM-suggested features.

        Parameters
        ----------
        X : DataFrame
            Input data

        Returns
        -------
        X_features : DataFrame
            Data with generated features

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if not self._is_fitted:
            raise RuntimeError("Engine must be fitted before transform")

        X = self._validate_input(X)
        result = X.copy()

        successful_features = []

        # SECURITY NOTE: suggestion code is executed with exec(). The
        # restricted __builtins__ mapping below is NOT a sandbox — it is
        # escapable via object introspection. Treat LLM output as
        # semi-trusted and review generated code before production use.
        allowed_builtins = {
            "len": len,
            "sum": sum,
            "max": max,
            "min": min,
            "abs": abs,
            "round": round,
            "int": int,
            "float": float,
            "str": str,
            "list": list,
            "dict": dict,
            "set": set,
        }

        for suggestion in self._suggested_features:
            name = suggestion.get("name", "")
            code = suggestion.get("code", "")

            # Skip suggestions that cannot yield a usable named column.
            if not code or not name:
                continue

            try:
                # Execute in a SINGLE namespace used as globals. With the
                # previous split globals/locals pair, top-level
                # comprehensions and lambdas in exec'd code failed with
                # NameError: they resolve free names (df, np, pd) through
                # globals, while those names lived only in locals.
                env: dict[str, Any] = {
                    "__builtins__": allowed_builtins,
                    "df": result,
                    "np": np,
                    "pd": pd,
                }
                exec(code, env)

                # Convention: generated code stores its output in `result`.
                if "result" in env:
                    feature_values = env["result"]

                    # Assign raw values to avoid index-alignment surprises
                    # when the generated Series carries a different index.
                    if isinstance(feature_values, pd.Series):
                        result[name] = feature_values.values
                    else:
                        result[name] = feature_values

                    successful_features.append(name)

            except Exception as e:
                # One bad suggestion must not abort the whole transform.
                if self.config.verbose:
                    print(f"SemanticEngine: Error computing '{name}': {e}")

        # Generated ratios/logs can produce infinities; normalize to NaN.
        result = result.replace([np.inf, -np.inf], np.nan)

        self._feature_names = successful_features

        if self.config.verbose:
            print(f"SemanticEngine: Successfully generated {len(successful_features)} features")

        return result

    def get_feature_explanations(self) -> dict[str, str]:
        """
        Get explanations for all generated features.

        Returns
        -------
        explanations : dict
            Mapping of feature names to explanations
        """
        return {s.get("name", ""): s.get("explanation", "") for s in self._suggested_features if s.get("name")}

    def get_feature_code(self) -> dict[str, str]:
        """
        Get code for all generated features.

        Returns
        -------
        code : dict
            Mapping of feature names to Python code
        """
        return {s.get("name", ""): s.get("code", "") for s in self._suggested_features if s.get("name")}

    def suggest_more_features(self, focus_area: str, n_features: int = 5) -> list[dict[str, Any]]:
        """
        Request additional feature suggestions in a specific area.

        Parameters
        ----------
        focus_area : str
            Area to focus on (e.g., 'interactions', 'ratios', 'time-based')
        n_features : int, default=5
            Number of additional features to suggest

        Returns
        -------
        suggestions : list
            New feature suggestions (NOT merged into the fitted set)
        """
        self._ensure_client()

        # Build focused prompt
        enhanced_task = f"{self._task_description}\n\nFocus specifically on: {focus_area}"

        new_suggestions = self._client.suggest_features(
            column_info=self._column_info,
            task_description=enhanced_task,
            column_descriptions=self._column_descriptions,
            domain=self.config.domain,
            max_suggestions=n_features,
        )

        return new_suggestions

    def generate_custom_feature(self, description: str, constraints: Optional[list[str]] = None) -> dict[str, Any]:
        """
        Generate a specific feature from natural language description.

        Parameters
        ----------
        description : str
            Natural language description of desired feature
        constraints : list, optional
            Constraints on the generated code

        Returns
        -------
        feature : dict
            Generated feature with name, code, and explanation
        """
        self._ensure_client()

        code = self._client.generate_feature_code(
            description=description,
            column_info=self._column_info,
            constraints=constraints,
        )

        # Derive a column name from the first few words of the description.
        name = "_".join(description.lower().split()[:4]).replace("-", "_")
        name = "".join(c if c.isalnum() or c == "_" else "" for c in name)
        # Guard against names that are empty or start with a digit — both
        # would be invalid Python identifiers and awkward column names.
        if not name:
            name = "custom_feature"
        elif name[0].isdigit():
            name = f"feature_{name}"

        return {
            "name": name,
            "code": code,
            "description": description,
            "explanation": f"Custom feature: {description}",
        }

    def get_feature_set(self) -> FeatureSet:
        """Get the feature set with metadata."""
        return self._feature_set

    def __del__(self):
        """Best-effort client shutdown on garbage collection."""
        # getattr guards against partially-constructed instances and
        # interpreter teardown; destructors must never raise.
        client = getattr(self, "_client", None)
        if client:
            try:
                client.stop()
            except Exception:
                pass
@@ -0,0 +1,13 @@
1
+ """Feature selection module."""
2
+
3
+ from featcopilot.selection.importance import ImportanceSelector
4
+ from featcopilot.selection.redundancy import RedundancyEliminator
5
+ from featcopilot.selection.statistical import StatisticalSelector
6
+ from featcopilot.selection.unified import FeatureSelector
7
+
8
+ __all__ = [
9
+ "StatisticalSelector",
10
+ "ImportanceSelector",
11
+ "RedundancyEliminator",
12
+ "FeatureSelector",
13
+ ]