featcopilot 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,403 @@
+ """LLM-powered transform rule generator.
+ 
+ Generates reusable transform rules from natural language descriptions
+ using the GitHub Copilot SDK.
+ """
+ 
+ import json
+ import re
+ from typing import Optional
+ 
+ import pandas as pd
+ 
+ from featcopilot.core.transform_rule import TransformRule
+ from featcopilot.llm.copilot_client import SyncCopilotFeatureClient
+ from featcopilot.stores.rule_store import TransformRuleStore
+ from featcopilot.utils.logger import get_logger
+ 
+ logger = get_logger(__name__)
+ 
+ 
+ class TransformRuleGenerator:
+     """
+     Generate reusable transform rules from natural language descriptions.
+ 
+     Uses an LLM to understand transformation requirements and generate
+     reusable Python code that can be applied across different datasets.
+ 
+     Parameters
+     ----------
+     model : str, default='gpt-5.2'
+         LLM model to use
+     store : TransformRuleStore, optional
+         Rule store for saving and retrieving rules
+     validate : bool, default=True
+         Whether to validate generated code
+     verbose : bool, default=False
+         Whether to log progress information
+ 
+     Examples
+     --------
+     >>> generator = TransformRuleGenerator()
+     >>> rule = generator.generate_from_description(
+     ...     description="Calculate the ratio of price to quantity",
+     ...     columns={"price": "float", "quantity": "int"}
+     ... )
+     >>> generator.save_rule(rule)
+     """
+ 
+     def __init__(
+         self,
+         model: str = "gpt-5.2",
+         store: Optional[TransformRuleStore] = None,
+         validate: bool = True,
+         verbose: bool = False,
+     ):
+         self.model = model
+         self.store = store
+         self.validate = validate
+         self.verbose = verbose
+         self._client: Optional[SyncCopilotFeatureClient] = None
+ 
+     def _ensure_client(self) -> None:
+         """Ensure the LLM client is initialized."""
+         if self._client is None:
+             self._client = SyncCopilotFeatureClient(model=self.model)
+             self._client.start()
+ 
+     def generate_from_description(
+         self,
+         description: str,
+         columns: dict[str, str],
+         sample_data: Optional[pd.DataFrame] = None,
+         tags: Optional[list[str]] = None,
+         save: bool = False,
+     ) -> TransformRule:
+         """
+         Generate a transform rule from a natural language description.
+ 
+         Parameters
+         ----------
+         description : str
+             Natural language description of the transformation
+         columns : dict
+             Available columns and their types (e.g., {"price": "float"})
+         sample_data : DataFrame, optional
+             Sample data for validation
+         tags : list[str], optional
+             Tags to add to the rule
+         save : bool, default=False
+             Whether to save the rule to the store
+ 
+         Returns
+         -------
+         TransformRule
+             Generated transform rule
+ 
+         Examples
+         --------
+         >>> rule = generator.generate_from_description(
+         ...     description="Calculate BMI from height in meters and weight in kg",
+         ...     columns={"height_m": "float", "weight_kg": "float"},
+         ...     tags=["healthcare", "bmi"]
+         ... )
+         """
+         self._ensure_client()
+ 
+         # Build prompt for rule generation
+         prompt = self._build_generation_prompt(description, columns)
+ 
+         # Get LLM response
+         response = self._client.send_prompt(prompt)
+ 
+         # Parse response into rule
+         rule = self._parse_rule_response(response, description, columns, tags)
+ 
+         # Validate if enabled
+         if self.validate and sample_data is not None:
+             rule = self._validate_and_fix(rule, sample_data)
+ 
+         # Save if requested
+         if save and self.store is not None:
+             self.store.save_rule(rule)
+ 
+         return rule
+ 
+     def _build_generation_prompt(self, description: str, columns: dict[str, str]) -> str:
+         """Build the prompt for rule generation."""
+         column_list = "\n".join(f"- {col} ({dtype})" for col, dtype in columns.items())
+ 
+         return f"""You are an expert data scientist creating a REUSABLE feature transformation rule.
+ 
+ ## Task
+ Create a reusable transformation rule based on this description:
+ "{description}"
+ 
+ ## Available Columns
+ {column_list}
+ 
+ ## Requirements
+ 1. Generate Python code using pandas (assume `df` is the DataFrame)
+ 2. Make the code REUSABLE by using the actual column names, so they can be substituted later
+ 3. Assign the result to a variable called `result`
+ 4. Handle edge cases (division by zero, missing values)
+ 5. The rule should be generalizable to similar columns in other datasets
+ 
+ ## Output Format
+ Return a JSON object with these fields:
+ - "name": short snake_case name for the rule (e.g., "ratio_calculation")
+ - "code": Python code that computes the transformation (single expression or multiple lines)
+ - "input_columns": list of column names used as inputs
+ - "output_type": "numeric", "categorical", or "boolean"
+ - "column_patterns": list of regex patterns to match similar columns (e.g., [".*price.*", ".*amount.*"])
+ - "explanation": brief explanation of what the rule does
+ 
+ Example output:
+ {{
+     "name": "ratio_calculation",
+     "code": "result = df['price'] / (df['quantity'] + 1e-8)",
+     "input_columns": ["price", "quantity"],
+     "output_type": "numeric",
+     "column_patterns": [".*price.*", ".*quantity.*"],
+     "explanation": "Calculates the ratio of price to quantity, handling division by zero"
+ }}
+ 
+ Return ONLY the JSON object, no other text.
+ """
+ 
+     def _parse_rule_response(
+         self,
+         response: str,
+         description: str,
+         columns: dict[str, str],
+         tags: Optional[list[str]] = None,
+     ) -> TransformRule:
+         """Parse an LLM response into a TransformRule."""
+         try:
+             # Clean response
+             response = response.strip()
+             if response.startswith("```"):
+                 lines = response.split("\n")
+                 response = "\n".join(line for line in lines if not line.startswith("```"))
+ 
+             data = json.loads(response)
+ 
+             return TransformRule(
+                 name=data.get("name", "custom_rule"),
+                 description=description,
+                 code=self._clean_code(data.get("code", "")),
+                 input_columns=data.get("input_columns", list(columns.keys())),
+                 output_type=data.get("output_type", "numeric"),
+                 column_patterns=data.get("column_patterns", []),
+                 tags=tags or [],
+                 metadata={"original_columns": columns, "explanation": data.get("explanation", "")},
+             )
+ 
+         except json.JSONDecodeError:
+             # Try to extract an embedded JSON object from the response
+             json_match = re.search(r"\{.*\}", response, re.DOTALL)
+             if json_match:
+                 try:
+                     # Parse once as a guard: the recursive call below can only
+                     # succeed (and terminate) if the extracted text is valid JSON
+                     json.loads(json_match.group())
+                     return self._parse_rule_response(json_match.group(), description, columns, tags)
+                 except json.JSONDecodeError:
+                     pass
+ 
+             # Fallback: create basic rule from response
+             logger.warning("Could not parse JSON response, creating basic rule")
+             return TransformRule(
+                 name=self._generate_name(description),
+                 description=description,
+                 code=self._extract_code(response),
+                 input_columns=list(columns.keys()),
+                 tags=tags or [],
+             )
+ 
+     def _clean_code(self, code: str) -> str:
+         """Clean and normalize generated code."""
+         code = code.strip()
+ 
+         # Remove markdown code blocks
+         if code.startswith("```"):
+             lines = code.split("\n")
+             code = "\n".join(line for line in lines if not line.startswith("```"))
+ 
+         # Ensure result assignment
+         if "result" not in code and "=" in code:
+             code = re.sub(r"^(\w+)\s*=", "result =", code, count=1)
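+             # e.g. "ratio = df['a'] / df['b']" -> "result = df['a'] / df['b']"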
+         elif "result" not in code:
+             code = f"result = {code}"
+ 
+         return code.strip()
+ 
+     def _extract_code(self, response: str) -> str:
+         """Extract code from a response that isn't valid JSON."""
+         # Look for code patterns
+         code_patterns = [
+             r"result\s*=\s*[^\n]+",
+             r"df\[['\"][^\n]+",
+         ]
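+         # e.g. matches "result = df['price'] / df['qty']" or "df['price'] * 2"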
+ 
+         for pattern in code_patterns:
+             match = re.search(pattern, response)
+             if match:
+                 return match.group()
+ 
+         # Fallback: pass the first column through unchanged
+         return "result = df.iloc[:, 0]"
+ 
+     def _generate_name(self, description: str) -> str:
+         """Generate rule name from description."""
+         words = description.lower().split()
+         stopwords = {"the", "and", "for", "from", "with", "calculate"}
+         significant = [w for w in words if len(w) > 2 and w not in stopwords][:3]
+         name = "_".join(significant)
+         name = re.sub(r"[^a-z0-9_]", "", name)
+         return name or "custom_rule"
+ 
+     def _validate_and_fix(self, rule: TransformRule, sample_data: pd.DataFrame) -> TransformRule:
+         """Validate rule code and attempt to fix issues."""
+         validation = self._client.validate_feature_code(
+             rule.code, {col: sample_data[col].tolist() for col in sample_data.columns}
+         )
+ 
+         if not validation["valid"]:
+             if self.verbose:
+                 logger.warning(f"Rule validation failed: {validation['error']}")
+ 
+             # Try to fix common issues
+             fixed_code = self._fix_common_issues(rule.code, validation["error"])
+ 
+             # Re-validate
+             validation = self._client.validate_feature_code(
+                 fixed_code, {col: sample_data[col].tolist() for col in sample_data.columns}
+             )
+ 
+             if validation["valid"]:
+                 rule.code = fixed_code
+             else:
+                 logger.warning(f"Could not fix rule code: {validation['error']}")
+ 
+         return rule
+ 
+     def _fix_common_issues(self, code: str, error: str) -> str:
+         """Attempt to fix common code issues."""
+         if "division by zero" in error.lower():
+             code = re.sub(r"/\s*\(([^)]+)\)", r"/ (\1 + 1e-8)", code)
+             code = re.sub(r"/\s*df\['([^']+)'\]", r"/ (df['\1'] + 1e-8)", code)
+ 
+         if "syntax" in error.lower():
+             # Normalize curly quotes to straight quotes
+             code = code.replace("\u2018", "'").replace("\u2019", "'")
+             code = code.replace("\u201c", '"').replace("\u201d", '"')
+ 
+         return code
+ 
+     def suggest_rules(
+         self,
+         columns: dict[str, str],
+         task_description: Optional[str] = None,
+         limit: int = 5,
+     ) -> list[tuple[TransformRule, dict[str, str]]]:
+         """
+         Suggest applicable rules from the store for the given columns.
+ 
+         Parameters
+         ----------
+         columns : dict
+             Available columns and their types
+         task_description : str, optional
+             Description of the ML task for better matching
+         limit : int, default=5
+             Maximum number of suggestions
+ 
+         Returns
+         -------
+         list[tuple[TransformRule, dict]]
+             List of (rule, column_mapping) tuples
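+ 
+         Examples
+         --------
+         Illustrative sketch; assumes the configured store already holds rules:
+ 
+         >>> suggestions = generator.suggest_rules(
+         ...     columns={"price": "float", "quantity": "int"},
+         ...     task_description="predict revenue"
+         ... )
+         >>> for rule, mapping in suggestions:
+         ...     print(rule.name, mapping)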
+         """
+         if self.store is None:
+             logger.warning("No rule store configured")
+             return []
+ 
+         column_names = list(columns.keys())
+         matching = self.store.find_matching_rules(columns=column_names, description=task_description)
+ 
+         return matching[:limit]
+ 
+     def generate_and_suggest(
+         self,
+         description: str,
+         columns: dict[str, str],
+         sample_data: Optional[pd.DataFrame] = None,
+         tags: Optional[list[str]] = None,
+     ) -> tuple[Optional[TransformRule], list[tuple[TransformRule, dict[str, str]]]]:
+         """
+         Find existing matching rules or generate a new one.
+ 
+         First searches for existing rules that match the description and
+         columns; if no good matches are found, generates a new rule.
+ 
+         Parameters
+         ----------
+         description : str
+             Natural language description
+         columns : dict
+             Available columns and their types
+         sample_data : DataFrame, optional
+             Sample data for validation
+         tags : list[str], optional
+             Tags for the new rule
+ 
+         Returns
+         -------
+         new_rule : TransformRule or None
+             Newly generated rule (None if existing rules were found)
+         existing_rules : list
+             List of matching existing rules with column mappings
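+ 
+         Examples
+         --------
+         Illustrative sketch; reuses a stored match when one exists,
+         otherwise falls back to the freshly generated rule:
+ 
+         >>> new_rule, existing = generator.generate_and_suggest(
+         ...     description="Ratio of price to quantity",
+         ...     columns={"price": "float", "quantity": "int"}
+         ... )
+         >>> rule = existing[0][0] if existing else new_rule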
+         """
+         # Search for existing rules
+         existing = self.suggest_rules(columns, description, limit=3)
+ 
+         if existing:
+             if self.verbose:
+                 logger.info(f"Found {len(existing)} existing matching rules")
+             return None, existing
+ 
+         # Generate new rule
+         if self.verbose:
+             logger.info("No matching rules found, generating new rule")
+ 
+         new_rule = self.generate_from_description(
+             description=description,
+             columns=columns,
+             sample_data=sample_data,
+             tags=tags,
+             save=False,
+         )
+ 
+         return new_rule, []
+ 
+     def save_rule(self, rule: TransformRule) -> str:
+         """
+         Save a rule to the store.
+ 
+         Parameters
+         ----------
+         rule : TransformRule
+             Rule to save
+ 
+         Returns
+         -------
+         str
+             Rule ID
+         """
+         if self.store is None:
+             raise ValueError("No rule store configured")
+         return self.store.save_rule(rule)
+ 
+     def __del__(self):
+         """Clean up client."""
+         if self._client:
+             try:
+                 self._client.stop()
+             except Exception:
+                 pass
@@ -66,23 +66,51 @@ class ImportanceSelector(BaseSelector):
          -------
          self : ImportanceSelector
          """
+         from sklearn.preprocessing import LabelEncoder
+ 
          X = self._validate_input(X)
          y = np.array(y)
 
+         # Encode string labels if needed
+         y_encoded = y
+         if y.dtype == object or y.dtype.kind in ("U", "S"):
+             le = LabelEncoder()
+             y_encoded = le.fit_transform(y)
+ 
          # Determine task type
-         unique_y = len(np.unique(y))
-         is_classification = unique_y < 20 and not np.issubdtype(y.dtype, np.floating)
+         unique_y = len(np.unique(y_encoded))
+         is_classification = (
+             y.dtype == object
+             or y.dtype.kind in ("U", "S")
+             or (np.issubdtype(y_encoded.dtype, np.integer) and unique_y <= len(y_encoded) * 0.1)
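+             # Heuristic: integer targets whose unique values cover at most
+             # 10% of the samples are treated as class labels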
+         )
 
          # Create model
          self._model = self._create_model(is_classification)
 
-         # Fit model
-         X_array = X.fillna(0).values
-         self._model.fit(X_array, y)
+         # Filter to numeric columns only
+         numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+         self._numeric_cols = numeric_cols
+ 
+         if not numeric_cols:
+             # No numeric columns, return empty selection
+             self._feature_scores = {col: 0.0 for col in X.columns}
+             self._select_features()
+             self._is_fitted = True
+             return self
 
-         # Get importances
+         X_numeric = X[numeric_cols].fillna(0).values
+         self._model.fit(X_numeric, y_encoded)
+ 
+         # Get importances for numeric columns
          importances = self._model.feature_importances_
-         self._feature_scores = dict(zip(X.columns, importances))
+         self._feature_scores = {}
+         for col in X.columns:
+             if col in numeric_cols:
+                 idx = numeric_cols.index(col)
+                 self._feature_scores[col] = importances[idx]
+             else:
+                 self._feature_scores[col] = 0.0
 
          # Select features
          self._select_features()
@@ -24,6 +24,10 @@ class RedundancyEliminator(BaseSelector):
              Correlation threshold for redundancy
          method : str, default='pearson'
              Correlation method ('pearson', 'spearman', 'kendall')
+         original_features : set[str], optional
+             Set of original feature names to prefer over derived features
+         original_preference : float, default=0.1
+             Bonus added to the importance scores of original features so they are preferred
 
          Examples
          --------
@@ -36,6 +40,8 @@ class RedundancyEliminator(BaseSelector):
          correlation_threshold: float = 0.95,
          method: str = "pearson",
          importance_scores: Optional[dict[str, float]] = None,
+         original_features: Optional[set[str]] = None,
+         original_preference: float = 0.1,
          verbose: bool = False,
          **kwargs,
      ):
@@ -43,6 +49,8 @@ class RedundancyEliminator(BaseSelector):
          self.correlation_threshold = correlation_threshold
          self.method = method
          self.importance_scores = importance_scores or {}
+         self.original_features = original_features or set()
+         self.original_preference = original_preference
          self.verbose = verbose
          self._correlation_matrix: Optional[pd.DataFrame] = None
 
@@ -83,17 +91,19 @@ class RedundancyEliminator(BaseSelector):
          if importance_scores:
              self.importance_scores = importance_scores
 
-         # Compute correlation matrix
-         numeric_cols = X.select_dtypes(include=[np.number]).columns
+         # Compute correlation matrix (only for numeric columns)
+         numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+         non_numeric_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
+ 
          self._correlation_matrix = X[numeric_cols].corr(method=self.method)
 
-         # Find redundant features
-         self._find_redundant_features(numeric_cols)
+         # Find redundant features among numeric columns
+         self._find_redundant_features(numeric_cols, non_numeric_cols)
 
          self._is_fitted = True
          return self
 
-     def _find_redundant_features(self, columns: list[str]) -> None:
+     def _find_redundant_features(self, columns: list[str], non_numeric_cols: list[str]) -> None:
          """Identify and mark redundant features for removal."""
          to_remove: set[str] = set()
          checked_pairs: set[tuple] = set()
@@ -115,22 +125,38 @@ class RedundancyEliminator(BaseSelector):
                  corr = abs(self._correlation_matrix.loc[col1, col2])
 
                  if corr >= self.correlation_threshold:
-                     # Decide which to remove based on importance
+                     # Decide which to remove based on importance + original feature preference
                      imp1 = self.importance_scores.get(col1, 0)
                      imp2 = self.importance_scores.get(col2, 0)
 
+                     # Add preference bonus for original features
+                     # This ensures original features are preferred over derived ones
+                     is_orig1 = col1 in self.original_features
+                     is_orig2 = col2 in self.original_features
+ 
+                     if is_orig1 and not is_orig2:
+                         # col1 is original, col2 is derived - prefer col1
+                         imp1 += self.original_preference
+                     elif is_orig2 and not is_orig1:
+                         # col2 is original, col1 is derived - prefer col2
+                         imp2 += self.original_preference
+ 
                      if imp1 >= imp2:
                          to_remove.add(col2)
                          if self.verbose:
-                             logger.info(f"Removing {col2} (corr={corr:.3f} with {col1})")
+                             orig_tag = " (derived)" if not is_orig2 else ""
+                             logger.info(f"Removing {col2}{orig_tag} (corr={corr:.3f} with {col1})")
                      else:
                          to_remove.add(col1)
                          if self.verbose:
-                             logger.info(f"Removing {col1} (corr={corr:.3f} with {col2})")
+                             orig_tag = " (derived)" if not is_orig1 else ""
+                             logger.info(f"Removing {col1}{orig_tag} (corr={corr:.3f} with {col2})")
                          break  # col1 is removed, move to next
 
-         # Selected features are those not removed
+         # Selected features are those not removed (numeric) plus all non-numeric columns
+         # Non-numeric columns (categorical/text) are always preserved
          self._selected_features = [c for c in columns if c not in to_remove]
+         self._selected_features.extend(non_numeric_cols)  # Always include non-numeric
          self._removed_features = list(to_remove)
 
          if self.verbose: