featcopilot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,295 @@
1
+ """LLM-powered feature code generator.
2
+
3
+ Generates Python code for custom features based on natural language descriptions.
4
+ """
5
+
6
+ import re
7
+ from typing import Optional
8
+
9
+ import pandas as pd
10
+
11
+ from featcopilot.core.feature import Feature, FeatureOrigin, FeatureType
12
+ from featcopilot.llm.copilot_client import SyncCopilotFeatureClient
13
+
14
+
15
class FeatureCodeGenerator:
    """
    Generate Python code for features from natural language descriptions.

    Uses LLM to understand feature requirements and generate
    working pandas code.

    Parameters
    ----------
    model : str, default='gpt-5'
        LLM model to use
    validate : bool, default=True
        Whether to validate generated code
    verbose : bool, default=False
        Whether to print diagnostics (validation and generation failures)

    Examples
    --------
    >>> generator = FeatureCodeGenerator()
    >>> feature = generator.generate(
    ...     description="Calculate BMI from height and weight",
    ...     columns={'height_m': 'float', 'weight_kg': 'float'}
    ... )
    """

    # Words that carry no meaning in a feature name.
    _NAME_STOP_WORDS = frozenset({"the", "and", "for", "from", "with", "calculate", "compute"})

    # Typographic quotes that LLM output sometimes contains, mapped to ASCII quotes.
    _SMART_QUOTES = str.maketrans({"\u2018": "'", "\u2019": "'", "\u201c": '"', "\u201d": '"'})

    # Canned feature descriptions per known domain (keys are lower-case).
    _DOMAIN_PROMPTS = {
        "healthcare": [
            "Calculate BMI if height and weight columns exist",
            "Create age group categories (pediatric, adult, elderly)",
            "Calculate medication count normalized by age",
            "Create comorbidity score from diagnosis codes",
            "Calculate length of stay relative to average",
        ],
        "finance": [
            "Calculate debt-to-income ratio",
            "Create credit utilization percentage",
            "Calculate payment-to-income ratio",
            "Create account age in years",
            "Calculate average transaction amount",
        ],
        "retail": [
            "Calculate average order value",
            "Create recency score (days since last purchase)",
            "Calculate purchase frequency per month",
            "Create customer lifetime value estimate",
            "Calculate category diversity score",
        ],
        "telecom": [
            "Calculate average monthly charges",
            "Create contract length in months",
            "Calculate service usage intensity",
            "Create support ticket frequency",
            "Calculate revenue per service",
        ],
    }

    def __init__(self, model: str = "gpt-5", validate: bool = True, verbose: bool = False):
        self.model = model
        self.validate = validate
        self.verbose = verbose
        self._client: Optional[SyncCopilotFeatureClient] = None

    def _ensure_client(self) -> None:
        """Ensure client is initialized."""
        if self._client is None:
            # Only cache the client once start() has succeeded; otherwise a
            # failed start would leave an unstarted client in place and every
            # later call would skip start() entirely.
            client = SyncCopilotFeatureClient(model=self.model)
            client.start()
            self._client = client

    def generate(
        self,
        description: str,
        columns: dict[str, str],
        constraints: Optional[list[str]] = None,
        sample_data: Optional[pd.DataFrame] = None,
    ) -> "Feature":
        """
        Generate a feature from natural language description.

        Parameters
        ----------
        description : str
            Natural language description of the feature
        columns : dict
            Available columns and their types
        constraints : list, optional
            Code constraints (e.g., "avoid division by zero")
        sample_data : DataFrame, optional
            Sample data for validation

        Returns
        -------
        feature : Feature
            Generated feature with code
        """
        self._ensure_client()

        # Generate code
        code = self._client.generate_feature_code(
            description=description,
            column_info=columns,
            constraints=constraints,
        )

        # Clean code
        code = self._clean_code(code)

        # Generate feature name
        name = self._generate_name(description)

        # Detect source columns
        source_columns = self._detect_source_columns(code, list(columns.keys()))

        # Validate if enabled; validation is skipped when no sample data is given
        if self.validate and sample_data is not None:
            validation = self._client.validate_feature_code(
                code, {col: sample_data[col].tolist() for col in sample_data.columns}
            )
            if not validation["valid"]:
                if self.verbose:
                    print(f"Code validation failed: {validation['error']}")
                # Try to fix common issues; the patched code is not re-validated
                code = self._fix_common_issues(code, validation["error"])

        feature = Feature(
            name=name,
            dtype=FeatureType.NUMERIC,
            origin=FeatureOrigin.LLM_GENERATED,
            source_columns=source_columns,
            transformation="custom",
            explanation=description,
            code=code,
        )

        return feature

    def generate_batch(
        self,
        descriptions: list[str],
        columns: dict[str, str],
        sample_data: Optional[pd.DataFrame] = None,
        constraints: Optional[list[str]] = None,
    ) -> "list[Feature]":
        """
        Generate multiple features from descriptions.

        Descriptions whose generation raises are skipped (reported only when
        ``verbose`` is set), so the result may be shorter than the input.

        Parameters
        ----------
        descriptions : list
            List of feature descriptions
        columns : dict
            Available columns and their types
        sample_data : DataFrame, optional
            Sample data for validation
        constraints : list, optional
            Code constraints applied to every description

        Returns
        -------
        features : list
            List of generated features
        """
        features = []
        for desc in descriptions:
            try:
                feature = self.generate(desc, columns, constraints=constraints, sample_data=sample_data)
            except Exception as e:
                # Best effort by design: one bad description must not abort the batch.
                if self.verbose:
                    print(f"Failed to generate feature for '{desc}': {e}")
            else:
                features.append(feature)

        return features

    def _clean_code(self, code: str) -> str:
        """
        Clean and normalize generated code.

        Strips markdown fences, drops full-line comments and makes sure the
        code assigns its value to a variable named ``result``.

        Parameters
        ----------
        code : str
            Raw code text returned by the LLM

        Returns
        -------
        code : str
            Normalized code
        """
        code = code.strip()

        # Keep only the body of the first fenced block; this also discards any
        # prose the model put before or after the fence.
        fenced = re.search(r"```[^\n]*\n(.*?)```", code, flags=re.DOTALL)
        if fenced:
            code = fenced.group(1)
        elif "```" in code:
            # Unterminated or single-line fence: drop the fence lines themselves.
            code = "\n".join(line for line in code.split("\n") if not line.lstrip().startswith("```"))

        # Remove full-line comments
        kept = [line for line in code.split("\n") if not line.strip().startswith("#")]
        code = "\n".join(kept).strip()

        return self._ensure_result_assignment(code)

    def _ensure_result_assignment(self, code: str) -> str:
        """
        Make sure ``code`` binds its final value to ``result``.

        The code is parsed so that only the *last* statement is touched:
        a trailing expression gets a ``result = `` prefix, a trailing
        ``name = ...`` has its target renamed, and a trailing assignment to a
        subscript/attribute gets ``result = <target>`` appended. Code that
        already stores to ``result`` is returned unchanged. Code that does not
        parse falls back to a simple text heuristic.
        """
        import ast  # local import: only this helper needs it

        try:
            tree = ast.parse(code)
        except (SyntaxError, ValueError):
            tree = None

        if tree is None or not tree.body:
            return self._assign_result_by_pattern(code)

        # A real assignment to `result` (not just the substring, which would
        # also match column names such as 'test_result').
        for node in ast.walk(tree):
            if isinstance(node, ast.Name) and node.id == "result" and isinstance(node.ctx, ast.Store):
                return code

        last = tree.body[-1]
        patched = None
        if isinstance(last, ast.Expr):
            patched = self._splice_line(code, last.lineno, last.col_offset, last.col_offset, "result = ")
        else:
            target = None
            single_target = False
            if isinstance(last, ast.Assign):
                target = last.targets[0]
                single_target = len(last.targets) == 1
            elif isinstance(last, ast.AugAssign):
                target = last.target

            if single_target and isinstance(target, ast.Name):
                # Nothing can reference the old name after the last statement,
                # so renaming the target is safe.
                patched = self._splice_line(code, target.lineno, target.col_offset, target.end_col_offset, "result")
            elif isinstance(target, (ast.Name, ast.Subscript, ast.Attribute)):
                segment = ast.get_source_segment(code, target)
                if segment:
                    patched = f"{code}\nresult = {segment}"

        if patched is None:
            # Parsed fine but ends in something we cannot safely rewrite.
            return code

        # Never hand back a rewrite that broke otherwise valid code.
        try:
            ast.parse(patched)
        except (SyntaxError, ValueError):
            return code
        return patched

    @staticmethod
    def _splice_line(code: str, lineno: int, start: int, end: int, text: str) -> Optional[str]:
        """
        Replace bytes ``start:end`` of line ``lineno`` (1-based) with ``text``.

        Offsets are UTF-8 byte offsets, as reported by ``ast`` nodes. Returns
        ``None`` when the position does not fit the text.
        """
        lines = code.split("\n")
        try:
            raw = lines[lineno - 1].encode("utf-8")
            lines[lineno - 1] = (raw[:start] + text.encode("utf-8") + raw[end:]).decode("utf-8")
        except (IndexError, UnicodeDecodeError):
            return None
        return "\n".join(lines)

    @staticmethod
    def _assign_result_by_pattern(code: str) -> str:
        """Text-only fallback for code that cannot be parsed."""
        if "result" in code:
            return code
        if "=" in code:
            # Rename a leading `name =` assignment to `result =`
            return re.sub(r"^(\w+)\s*=", "result =", code)
        # Raw expression
        return f"result = {code}"

    def _generate_name(self, description: str) -> str:
        """
        Generate a feature name from description.

        Takes the first four significant words (longer than two characters and
        not stop words), lower-cased and joined by underscores. Falls back to
        ``"custom_feature"`` when nothing usable remains.
        """
        # Strip punctuation per word *before* filtering so that tokens such as
        # "(from" are still recognised as stop words.
        tokens = (re.sub(r"[^a-z0-9_]", "", word) for word in description.lower().split())
        significant = [tok for tok in tokens if len(tok) > 2 and tok not in self._NAME_STOP_WORDS][:4]

        name = re.sub(r"_+", "_", "_".join(significant)).strip("_")

        return name or "custom_feature"

    def _detect_source_columns(self, code: str, available_columns: list[str]) -> list[str]:
        """
        Detect which columns are used in the code.

        A column counts as used when the code contains ``df['col']``,
        ``df["col"]`` or ``df.col``. Attribute access must end at an
        identifier boundary, so ``df.age_group`` does not report ``age``.

        Returns
        -------
        sources : list
            Used columns, in the order given by ``available_columns``
        """
        sources = []
        for col in available_columns:
            escaped = re.escape(col)
            subscript = rf"""df\[\s*(['"]){escaped}\1\s*\]"""
            attribute = rf"df\.{escaped}(?!\w)"
            if re.search(subscript, code) or re.search(attribute, code):
                sources.append(col)

        return sources

    def _fix_common_issues(self, code: str, error: str) -> str:
        """
        Try to fix common code issues.

        Parameters
        ----------
        code : str
            Code that failed validation
        error : str
            Error message reported by validation

        Returns
        -------
        code : str
            Patched code, or the input unchanged when no fix applies
        """
        lowered = str(error or "").lower()

        if "division by zero" in lowered:
            # Add small epsilon to divisors: parenthesised groups first, then
            # bare df['col'] / df["col"] lookups.
            # NOTE: the first pattern stops at the first ')' and therefore
            # mis-handles divisors containing nested parentheses.
            code = re.sub(r"/\s*\(([^)]+)\)", r"/ (\1 + 1e-8)", code)
            code = re.sub(r"""/\s*(df\[(['"])[^'"]+\2\])""", r"/ (\1 + 1e-8)", code)

        # Missing columns ("keyerror" / "not found") cannot be fixed here.

        if "syntax" in lowered or "invalid character" in lowered:
            # Typographic quotes are not valid Python string delimiters.
            code = code.translate(self._SMART_QUOTES)

        return code

    def generate_domain_features(self, domain: str, columns: dict[str, str], n_features: int = 5) -> "list[Feature]":
        """
        Generate domain-specific features.

        Parameters
        ----------
        domain : str
            Domain name (e.g., 'healthcare', 'finance', 'retail')
        columns : dict
            Available columns and their types
        n_features : int, default=5
            Maximum number of features to generate; known domains provide
            five prompts, unknown domains three generic ones

        Returns
        -------
        features : list
            Generated domain-specific features
        """
        prompts = self._DOMAIN_PROMPTS.get(
            domain.lower(),
            [
                f"Create a useful feature for {domain} analytics",
                f"Calculate a key metric for {domain}",
                f"Create an interaction feature relevant to {domain}",
            ],
        )

        # Take the first n prompts; they are not filtered by the available
        # columns. Clamp at zero so a negative count cannot slice from the end.
        applicable_prompts = prompts[: max(n_features, 0)]

        return self.generate_batch(applicable_prompts, columns)

    def __del__(self):
        """Clean up client."""
        # getattr: __init__ may not have run (or finished) for this instance.
        client = getattr(self, "_client", None)
        if client:
            try:
                client.stop()
            except Exception:
                # Best effort: never raise from a finalizer.
                pass