featcopilot 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featcopilot/__init__.py +29 -0
- featcopilot/core/__init__.py +13 -0
- featcopilot/core/base.py +195 -0
- featcopilot/core/feature.py +224 -0
- featcopilot/core/registry.py +128 -0
- featcopilot/engines/__init__.py +13 -0
- featcopilot/engines/relational.py +256 -0
- featcopilot/engines/tabular.py +293 -0
- featcopilot/engines/text.py +211 -0
- featcopilot/engines/timeseries.py +402 -0
- featcopilot/llm/__init__.py +16 -0
- featcopilot/llm/code_generator.py +295 -0
- featcopilot/llm/copilot_client.py +521 -0
- featcopilot/llm/explainer.py +200 -0
- featcopilot/llm/semantic_engine.py +379 -0
- featcopilot/selection/__init__.py +13 -0
- featcopilot/selection/importance.py +161 -0
- featcopilot/selection/redundancy.py +156 -0
- featcopilot/selection/statistical.py +199 -0
- featcopilot/selection/unified.py +172 -0
- featcopilot/transformers/__init__.py +11 -0
- featcopilot/transformers/sklearn_compat.py +401 -0
- featcopilot/utils/__init__.py +9 -0
- featcopilot/utils/cache.py +221 -0
- featcopilot/utils/parallel.py +109 -0
- featcopilot-0.1.0.dist-info/METADATA +218 -0
- featcopilot-0.1.0.dist-info/RECORD +29 -0
- featcopilot-0.1.0.dist-info/WHEEL +5 -0
- featcopilot-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
"""LLM-powered feature code generator.
|
|
2
|
+
|
|
3
|
+
Generates Python code for custom features based on natural language descriptions.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
from featcopilot.core.feature import Feature, FeatureOrigin, FeatureType
|
|
12
|
+
from featcopilot.llm.copilot_client import SyncCopilotFeatureClient
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class FeatureCodeGenerator:
    """
    Generate Python code for features from natural language descriptions.

    Uses LLM to understand feature requirements and generate
    working pandas code.

    The generator can be used as a context manager, or ``close()`` can be
    called explicitly, to stop the underlying client deterministically.

    Parameters
    ----------
    model : str, default='gpt-5'
        LLM model to use
    validate : bool, default=True
        Whether to validate generated code
    verbose : bool, default=False
        Whether to print validation and generation failures

    Examples
    --------
    >>> generator = FeatureCodeGenerator()
    >>> feature = generator.generate(
    ...     description="Calculate BMI from height and weight",
    ...     columns={'height_m': 'float', 'weight_kg': 'float'}
    ... )
    """

    # Literal appended to divisors when repairing a "division by zero" failure.
    _EPSILON = "1e-8"

    # Words that carry no meaning in a feature name.
    _NAME_STOPWORDS = frozenset({"the", "and", "for", "from", "with", "calculate", "compute"})
    _MAX_NAME_WORDS = 4

    # A markdown fence at the start of a line, its info string, the fenced
    # body, and either the closing fence or the end of the text (unclosed).
    _FENCE_RE = re.compile(r"^[ \t]*```[^\n]*\n(.*?)(?:^[ \t]*```|\Z)", re.MULTILINE | re.DOTALL)

    # Typographic quotes (U+2018/U+2019/U+201C/U+201D) mapped to ASCII quotes.
    # Written as code points so the table survives editors and tools that
    # silently normalise quote characters.
    _QUOTE_TABLE = {0x2018: "'", 0x2019: "'", 0x201C: '"', 0x201D: '"'}

    # Canned feature descriptions per domain, keyed by lower-case domain name.
    _DOMAIN_PROMPTS = {
        "healthcare": [
            "Calculate BMI if height and weight columns exist",
            "Create age group categories (pediatric, adult, elderly)",
            "Calculate medication count normalized by age",
            "Create comorbidity score from diagnosis codes",
            "Calculate length of stay relative to average",
        ],
        "finance": [
            "Calculate debt-to-income ratio",
            "Create credit utilization percentage",
            "Calculate payment-to-income ratio",
            "Create account age in years",
            "Calculate average transaction amount",
        ],
        "retail": [
            "Calculate average order value",
            "Create recency score (days since last purchase)",
            "Calculate purchase frequency per month",
            "Create customer lifetime value estimate",
            "Calculate category diversity score",
        ],
        "telecom": [
            "Calculate average monthly charges",
            "Create contract length in months",
            "Calculate service usage intensity",
            "Create support ticket frequency",
            "Calculate revenue per service",
        ],
    }

    def __init__(self, model: str = "gpt-5", validate: bool = True, verbose: bool = False):
        self.model = model
        self.validate = validate
        self.verbose = verbose
        # Created lazily by _ensure_client() so construction does no I/O.
        self._client: Optional[SyncCopilotFeatureClient] = None

    def __enter__(self) -> "FeatureCodeGenerator":
        """Return the generator itself; the client is still created lazily."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Stop the client when leaving the ``with`` block."""
        self.close()

    def _ensure_client(self) -> None:
        """Ensure client is initialized and started."""
        if self._client is None:
            client = SyncCopilotFeatureClient(model=self.model)
            client.start()
            # Store the client only after start() succeeded; otherwise a
            # failed start would leave an unstarted client cached forever.
            self._client = client

    def close(self) -> None:
        """
        Stop the client if one was started.

        Safe to call repeatedly. Errors raised by the client's ``stop()``
        propagate to the caller.
        """
        # getattr: the attribute is missing if __init__ never completed.
        client = getattr(self, "_client", None)
        self._client = None
        if client is not None:
            client.stop()

    def generate(
        self,
        description: str,
        columns: dict[str, str],
        constraints: Optional[list[str]] = None,
        sample_data: Optional[pd.DataFrame] = None,
    ) -> "Feature":
        """
        Generate a feature from natural language description.

        Parameters
        ----------
        description : str
            Natural language description of the feature
        columns : dict
            Available columns and their types
        constraints : list, optional
            Code constraints (e.g., "avoid division by zero")
        sample_data : DataFrame, optional
            Sample data for validation

        Returns
        -------
        feature : Feature
            Generated feature with code. The feature is always created with
            ``FeatureType.NUMERIC`` and ``FeatureOrigin.LLM_GENERATED``.

        Notes
        -----
        When validation fails, a best-effort textual repair is applied to the
        code; the repaired code is not validated a second time.
        """
        self._ensure_client()

        raw_code = self._client.generate_feature_code(
            description=description,
            column_info=columns,
            constraints=constraints,
        )
        code = self._clean_code(raw_code)
        name = self._generate_name(description)

        # Validation only runs when enabled AND sample data was supplied.
        if self.validate and sample_data is not None:
            validation = self._client.validate_feature_code(
                code, {col: sample_data[col].tolist() for col in sample_data.columns}
            )
            if not validation["valid"]:
                if self.verbose:
                    print(f"Code validation failed: {validation['error']}")
                code = self._fix_common_issues(code, validation["error"])

        # Detect columns on the final code so the metadata matches what is stored.
        source_columns = self._detect_source_columns(code, list(columns))

        return Feature(
            name=name,
            dtype=FeatureType.NUMERIC,
            origin=FeatureOrigin.LLM_GENERATED,
            source_columns=source_columns,
            transformation="custom",
            explanation=description,
            code=code,
        )

    def generate_batch(
        self,
        descriptions: list[str],
        columns: dict[str, str],
        sample_data: Optional[pd.DataFrame] = None,
        constraints: Optional[list[str]] = None,
    ) -> "list[Feature]":
        """
        Generate multiple features from descriptions.

        Parameters
        ----------
        descriptions : list
            List of feature descriptions
        columns : dict
            Available columns and their types
        sample_data : DataFrame, optional
            Sample data for validation
        constraints : list, optional
            Code constraints forwarded to every ``generate`` call

        Returns
        -------
        features : list
            List of generated features. Descriptions whose generation raised
            are skipped, so the list may be shorter than ``descriptions``.
        """
        features = []
        for desc in descriptions:
            try:
                features.append(
                    self.generate(desc, columns, constraints=constraints, sample_data=sample_data)
                )
            except Exception as e:
                # Deliberate best effort: one failing description must not
                # abort the rest of the batch.
                if self.verbose:
                    print(f"Failed to generate feature for '{desc}': {e}")

        return features

    def _clean_code(self, code: str) -> str:
        """
        Clean and normalize generated code.

        Extracts the first markdown-fenced block if there is one, drops
        full-line comments and makes sure the snippet assigns ``result``.
        Empty input is returned as an empty string.
        """
        code = code.replace("\r\n", "\n").replace("\r", "\n").strip()

        # Take the fenced body wherever the fence appears, so prose before or
        # after the code block does not end up in the generated code.
        fenced = self._FENCE_RE.search(code)
        if fenced:
            code = fenced.group(1)

        # Remove full-line comments.
        kept = [line for line in code.split("\n") if not line.strip().startswith("#")]
        code = "\n".join(kept).strip()

        if not code:
            return code
        return self._ensure_result_assignment(code)

    def _ensure_result_assignment(self, code: str) -> str:
        """
        Return ``code`` adjusted so that it assigns a variable named ``result``.

        The code is parsed so that decisions are based on real assignments
        rather than on the characters ``result`` or ``=`` appearing somewhere.
        Unparseable code falls back to a textual heuristic.
        """
        # Local import: keeps this block self-contained without touching the
        # module's import section.
        import ast

        def parse(source):
            try:
                return ast.parse(source)
            except (SyntaxError, ValueError):
                return None

        tree = parse(code)
        if tree is None:
            # Typographic quotes are a common reason for unparseable LLM output.
            normalized = code.translate(self._QUOTE_TABLE)
            if normalized != code:
                tree = parse(normalized)
                if tree is not None:
                    code = normalized

        if tree is None:
            # Textual fallback; validation is expected to report the real problem.
            if re.search(r"^[ \t]*result[ \t]*=(?!=)", code, re.MULTILINE):
                return code
            renamed, count = re.subn(r"^(\w+)\s*=(?!=)", "result =", code, count=1)
            return renamed if count else f"result = {code}"

        for node in ast.walk(tree):
            if isinstance(node, ast.Name) and node.id == "result" and isinstance(node.ctx, ast.Store):
                return code  # already assigns result

        if not tree.body:
            return code
        last = tree.body[-1]

        if isinstance(last, ast.Expr):
            # Trailing bare expression: bind it to result. col_offset is a
            # UTF-8 byte offset, hence the encode/decode round trip.
            lines = code.split("\n")
            row = last.lineno - 1
            raw = lines[row].encode("utf-8")
            lines[row] = (raw[: last.col_offset] + b"result = " + raw[last.col_offset:]).decode("utf-8")
            return "\n".join(lines)

        target = None
        if isinstance(last, ast.Assign):
            target = last.targets[0]
        elif isinstance(last, ast.AugAssign):
            target = last.target
        elif isinstance(last, ast.AnnAssign) and last.value is not None:
            target = last.target

        if isinstance(target, (ast.Name, ast.Subscript, ast.Attribute)):
            single_plain_assignment = (
                len(tree.body) == 1
                and isinstance(last, ast.Assign)
                and len(last.targets) == 1
                and isinstance(target, ast.Name)
            )
            if single_plain_assignment:
                # A lone "name = expr" is simply renamed.
                renamed, count = re.subn(r"^(\w+)\s*=(?!=)", "result =", code, count=1)
                if count:
                    return renamed
            # Otherwise keep the code intact (earlier names may be referenced)
            # and bind result to whatever the last statement assigned.
            segment = ast.get_source_segment(code, target)
            if segment:
                return f"{code}\nresult = {segment}"

        # Nothing sensible to bind (e.g. the snippet ends with a loop).
        return code

    def _generate_name(self, description: str) -> str:
        """
        Generate a feature name from description.

        Each lower-cased word is stripped of characters outside
        ``[a-z0-9_]``; stop words and words shorter than three characters are
        dropped, and the first few remaining words are joined with
        underscores. Falls back to ``"custom_feature"`` when nothing remains.
        """
        significant = []
        for word in description.lower().split():
            # Clean before filtering so "calculate," is still a stop word and
            # punctuation-only tokens disappear.
            token = re.sub(r"[^a-z0-9_]", "", word)
            if len(token) > 2 and token not in self._NAME_STOPWORDS:
                significant.append(token)

        name = "_".join(significant[: self._MAX_NAME_WORDS])
        name = re.sub(r"_+", "_", name).strip("_")

        return name or "custom_feature"

    def _detect_source_columns(self, code: str, available_columns: list[str]) -> list[str]:
        """
        Detect which columns are used in the code.

        A column counts as used when the code contains ``df['col']``,
        ``df["col"]`` (whitespace inside the brackets is tolerated) or the
        attribute access ``df.col`` as a whole name. The order of
        ``available_columns`` is preserved.
        """
        sources = []
        for col in available_columns:
            name = re.escape(str(col))
            # (?!\w) stops "df.inc" from matching "df.income".
            pattern = rf"""\bdf\s*\[\s*(['"]){name}\1\s*\]|\bdf\.{name}(?!\w)"""
            if re.search(pattern, code):
                sources.append(col)

        return sources

    def _fix_common_issues(self, code: str, error: str) -> str:
        """
        Try to fix common code issues.

        Parameters
        ----------
        code : str
            Code that failed validation
        error : str
            Validation error message; ``None`` is treated as empty

        Returns
        -------
        code : str
            Possibly repaired code. Missing-column errors cannot be repaired
            and leave the code unchanged.
        """
        message = (error or "").lower()

        if "division by zero" in message:
            # Add small epsilon to divisors.
            code = self._guard_divisors(code)

        if "syntax" in message:
            # Replace typographic quotes with their ASCII equivalents.
            code = code.translate(self._QUOTE_TABLE)

        return code

    @classmethod
    def _guard_divisors(cls, code: str) -> str:
        """
        Add a small epsilon to parenthesised and ``df[...]`` divisors.

        ``a / (expr)`` becomes ``a / (expr + 1e-8)`` and ``a / df['x']``
        becomes ``a / (df['x'] + 1e-8)``. Brackets are matched by depth, so
        divisors containing calls such as ``(df['b'].sum() + 1)`` stay valid.
        Divisors that already end in the epsilon are left alone.
        """
        eps = cls._EPSILON
        pieces = []
        i, n = 0, len(code)
        while i < n:
            if code[i] != "/":
                pieces.append(code[i])
                i += 1
                continue

            j = i + 1
            while j < n and code[j] in " \t":
                j += 1

            if code.startswith("(", j):
                end = cls._matching_bracket(code, j)
                if end != -1:
                    inner = cls._guard_divisors(code[j + 1:end])
                    if not inner.rstrip().endswith(f"+ {eps}"):
                        inner = f"{inner} + {eps}"
                    pieces.append(f"/ ({inner})")
                    i = end + 1
                    continue
            elif code.startswith("df[", j):
                end = cls._matching_bracket(code, j + 2)
                if end != -1:
                    pieces.append(f"/ ({code[j:end + 1]} + {eps})")
                    i = end + 1
                    continue

            # Not a divisor we know how to guard (or unbalanced brackets).
            pieces.append("/")
            i += 1

        return "".join(pieces)

    @staticmethod
    def _matching_bracket(text: str, start: int) -> int:
        """
        Return the index of the bracket closing the one at ``text[start]``.

        Only ``(`` and ``[`` are supported. Brackets inside single- or
        double-quoted string literals are ignored. Returns ``-1`` when the
        bracket is never closed.
        """
        opener = text[start]
        closer = {"(": ")", "[": "]"}[opener]
        depth = 0
        quote = None
        idx = start
        while idx < len(text):
            ch = text[idx]
            if quote is not None:
                if ch == "\\":
                    idx += 1  # skip the escaped character
                elif ch == quote:
                    quote = None
            elif ch in "'\"":
                quote = ch
            elif ch == opener:
                depth += 1
            elif ch == closer:
                depth -= 1
                if depth == 0:
                    return idx
            idx += 1
        return -1

    def generate_domain_features(self, domain: str, columns: dict[str, str], n_features: int = 5) -> "list[Feature]":
        """
        Generate domain-specific features.

        Parameters
        ----------
        domain : str
            Domain name (e.g., 'healthcare', 'finance', 'retail')
        columns : dict
            Available columns and their types
        n_features : int, default=5
            Maximum number of features to generate. Capped by the number of
            prompts known for the domain; values below zero are treated as 0.

        Returns
        -------
        features : list
            Generated domain-specific features
        """
        prompts = self._DOMAIN_PROMPTS.get(domain.lower())
        if prompts is None:
            # Unknown domain: fall back to generic prompts naming the domain.
            prompts = [
                f"Create a useful feature for {domain} analytics",
                f"Calculate a key metric for {domain}",
                f"Create an interaction feature relevant to {domain}",
            ]

        # Take the first n prompts; a negative count would otherwise slice
        # from the end of the list.
        applicable_prompts = prompts[: max(n_features, 0)]

        return self.generate_batch(applicable_prompts, columns)

    def __del__(self):
        """Clean up client."""
        try:
            self.close()
        except Exception:
            # Finalizers may run during interpreter shutdown, when the client
            # can already be partly torn down; nothing useful can be done.
            pass
|