featcopilot 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featcopilot/__init__.py +10 -1
- featcopilot/core/__init__.py +2 -0
- featcopilot/core/feature.py +5 -1
- featcopilot/core/transform_rule.py +276 -0
- featcopilot/engines/relational.py +5 -2
- featcopilot/engines/tabular.py +151 -5
- featcopilot/engines/text.py +352 -11
- featcopilot/engines/timeseries.py +235 -3
- featcopilot/llm/__init__.py +6 -1
- featcopilot/llm/code_generator.py +7 -4
- featcopilot/llm/copilot_client.py +97 -20
- featcopilot/llm/explainer.py +6 -3
- featcopilot/llm/litellm_client.py +595 -0
- featcopilot/llm/semantic_engine.py +717 -26
- featcopilot/llm/transform_rule_generator.py +403 -0
- featcopilot/selection/importance.py +40 -9
- featcopilot/selection/redundancy.py +39 -10
- featcopilot/selection/statistical.py +107 -34
- featcopilot/selection/unified.py +57 -3
- featcopilot/stores/__init__.py +17 -0
- featcopilot/stores/base.py +166 -0
- featcopilot/stores/feast_store.py +541 -0
- featcopilot/stores/rule_store.py +343 -0
- featcopilot/transformers/sklearn_compat.py +18 -6
- featcopilot/utils/__init__.py +14 -0
- featcopilot/utils/logger.py +47 -0
- featcopilot/utils/models.py +287 -0
- featcopilot/utils/parallel.py +5 -1
- {featcopilot-0.1.0.dist-info → featcopilot-0.3.0.dist-info}/METADATA +56 -25
- featcopilot-0.3.0.dist-info/RECORD +38 -0
- featcopilot-0.1.0.dist-info/RECORD +0 -29
- {featcopilot-0.1.0.dist-info → featcopilot-0.3.0.dist-info}/WHEEL +0 -0
- {featcopilot-0.1.0.dist-info → featcopilot-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,403 @@
|
|
|
1
|
+
"""LLM-powered transform rule generator.
|
|
2
|
+
|
|
3
|
+
Generates reusable transform rules from natural language descriptions
|
|
4
|
+
using GitHub Copilot SDK.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import re
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
import pandas as pd
|
|
12
|
+
|
|
13
|
+
from featcopilot.core.transform_rule import TransformRule
|
|
14
|
+
from featcopilot.llm.copilot_client import SyncCopilotFeatureClient
|
|
15
|
+
from featcopilot.stores.rule_store import TransformRuleStore
|
|
16
|
+
from featcopilot.utils.logger import get_logger
|
|
17
|
+
|
|
18
|
+
logger = get_logger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TransformRuleGenerator:
    """
    Generate reusable transform rules from natural language descriptions.

    Uses LLM to understand transformation requirements and generate
    reusable Python code that can be applied across different datasets.

    Parameters
    ----------
    model : str, default='gpt-5.2'
        LLM model to use
    store : TransformRuleStore, optional
        Rule store for saving and retrieving rules
    validate : bool, default=True
        Whether to validate generated code
    verbose : bool, default=False
        Whether to log progress information

    Examples
    --------
    >>> generator = TransformRuleGenerator()
    >>> rule = generator.generate_from_description(
    ...     description="Calculate the ratio of price to quantity",
    ...     columns={"price": "float", "quantity": "int"}
    ... )
    >>> generator.save_rule(rule)
    """

    def __init__(
        self,
        model: str = "gpt-5.2",
        store: Optional[TransformRuleStore] = None,
        validate: bool = True,
        verbose: bool = False,
    ):
        self.model = model
        self.store = store
        self.validate = validate
        self.verbose = verbose
        # Client is created lazily on first use so constructing the
        # generator never spawns an LLM session by itself.
        self._client: Optional[SyncCopilotFeatureClient] = None

    def _ensure_client(self) -> None:
        """Ensure LLM client is initialized."""
        if self._client is None:
            self._client = SyncCopilotFeatureClient(model=self.model)
            self._client.start()

    def generate_from_description(
        self,
        description: str,
        columns: dict[str, str],
        sample_data: Optional[pd.DataFrame] = None,
        tags: Optional[list[str]] = None,
        save: bool = False,
    ) -> TransformRule:
        """
        Generate a transform rule from natural language description.

        Parameters
        ----------
        description : str
            Natural language description of the transformation
        columns : dict
            Available columns and their types (e.g., {"price": "float"})
        sample_data : DataFrame, optional
            Sample data for validation
        tags : list[str], optional
            Tags to add to the rule
        save : bool, default=False
            Whether to save the rule to the store

        Returns
        -------
        TransformRule
            Generated transform rule

        Examples
        --------
        >>> rule = generator.generate_from_description(
        ...     description="Calculate BMI from height in meters and weight in kg",
        ...     columns={"height_m": "float", "weight_kg": "float"},
        ...     tags=["healthcare", "bmi"]
        ... )
        """
        self._ensure_client()

        # Build prompt for rule generation
        prompt = self._build_generation_prompt(description, columns)

        # Get LLM response
        response = self._client.send_prompt(prompt)

        # Parse response into rule
        rule = self._parse_rule_response(response, description, columns, tags)

        # Validate if enabled (requires sample data to execute against)
        if self.validate and sample_data is not None:
            rule = self._validate_and_fix(rule, sample_data)

        # Save if requested and a store is configured
        if save and self.store is not None:
            self.store.save_rule(rule)

        return rule

    def _build_generation_prompt(self, description: str, columns: dict[str, str]) -> str:
        """Build prompt for rule generation."""
        column_list = "\n".join(f"- {col} ({dtype})" for col, dtype in columns.items())

        return f"""You are an expert data scientist creating a REUSABLE feature transformation rule.

## Task
Create a reusable transformation rule based on this description:
"{description}"

## Available Columns
{column_list}

## Requirements
1. Generate Python code using pandas (assume `df` is the DataFrame)
2. Make the code REUSABLE by using the actual column names that can be substituted later
3. Assign the result to a variable called `result`
4. Handle edge cases (division by zero, missing values)
5. The rule should be generalizable to similar columns in other datasets

## Output Format
Return a JSON object with these fields:
- "name": short snake_case name for the rule (e.g., "ratio_calculation")
- "code": Python code that computes the transformation (single expression or multiple lines)
- "input_columns": list of column names used as inputs
- "output_type": "numeric", "categorical", or "boolean"
- "column_patterns": list of regex patterns to match similar columns (e.g., [".*price.*", ".*amount.*"])
- "explanation": brief explanation of what the rule does

Example output:
{{
    "name": "ratio_calculation",
    "code": "result = df['price'] / (df['quantity'] + 1e-8)",
    "input_columns": ["price", "quantity"],
    "output_type": "numeric",
    "column_patterns": [".*price.*", ".*quantity.*"],
    "explanation": "Calculates the ratio of price to quantity, handling division by zero"
}}

Return ONLY the JSON object, no other text.
"""

    def _parse_rule_response(
        self,
        response: str,
        description: str,
        columns: dict[str, str],
        tags: Optional[list[str]] = None,
    ) -> TransformRule:
        """Parse LLM response into a TransformRule.

        Falls back progressively: strict JSON parse -> embedded JSON object
        extracted via regex -> basic rule built from whatever code-looking
        text can be salvaged from the response.
        """
        try:
            # Clean response: strip whitespace and markdown code fences
            response = response.strip()
            if response.startswith("```"):
                lines = response.split("\n")
                response = "\n".join(line for line in lines if not line.startswith("```"))

            data = json.loads(response)
            return self._build_rule(data, description, columns, tags)

        except json.JSONDecodeError:
            # Try to extract an embedded JSON object from the response
            json_match = re.search(r"\{.*\}", response, re.DOTALL)
            if json_match:
                try:
                    data = json.loads(json_match.group())
                    return self._build_rule(data, description, columns, tags)
                except json.JSONDecodeError:
                    pass

            # Fallback: create basic rule from response
            logger.warning("Could not parse JSON response, creating basic rule")
            return TransformRule(
                name=self._generate_name(description),
                description=description,
                code=self._extract_code(response),
                input_columns=list(columns.keys()),
                tags=tags or [],
            )

    def _build_rule(
        self,
        data: dict,
        description: str,
        columns: dict[str, str],
        tags: Optional[list[str]],
    ) -> TransformRule:
        """Construct a TransformRule from a successfully parsed JSON payload."""
        return TransformRule(
            name=data.get("name", "custom_rule"),
            description=description,
            code=self._clean_code(data.get("code", "")),
            input_columns=data.get("input_columns", list(columns.keys())),
            output_type=data.get("output_type", "numeric"),
            column_patterns=data.get("column_patterns", []),
            tags=tags or [],
            metadata={"original_columns": columns, "explanation": data.get("explanation", "")},
        )

    def _clean_code(self, code: str) -> str:
        """Clean and normalize generated code."""
        code = code.strip()

        # Remove markdown code blocks
        if code.startswith("```"):
            lines = code.split("\n")
            code = "\n".join(line for line in lines if not line.startswith("```"))

        # Ensure result assignment: rewrite the first assignment target to
        # `result`, or wrap a bare expression in `result = ...`
        if "result" not in code and "=" in code:
            code = re.sub(r"^(\w+)\s*=", "result =", code, count=1)
        elif "result" not in code:
            code = f"result = {code}"

        return code.strip()

    def _extract_code(self, response: str) -> str:
        """Extract code from a response that isn't valid JSON."""
        # Look for code patterns
        code_patterns = [
            r"result\s*=\s*[^\n]+",
            r"df\[['\"][^\n]+",
        ]

        for pattern in code_patterns:
            match = re.search(pattern, response)
            if match:
                return match.group()

        # Fallback: a trivially valid transformation (first column passthrough)
        return "result = df.iloc[:, 0]"

    def _generate_name(self, description: str) -> str:
        """Generate rule name from description.

        Uses up to three significant words (length > 2, not a stop word),
        joined with underscores and stripped to [a-z0-9_].
        """
        words = description.lower().split()
        significant = [w for w in words if len(w) > 2 and w not in {"the", "and", "for", "from", "with", "calculate"}][
            :3
        ]
        name = "_".join(significant)
        name = re.sub(r"[^a-z0-9_]", "", name)
        return name or "custom_rule"

    def _validate_and_fix(self, rule: TransformRule, sample_data: pd.DataFrame) -> TransformRule:
        """Validate rule code against sample data and attempt to fix issues.

        Runs the client-side validator once, applies heuristic fixes for
        common failures, and re-validates. The rule is returned unchanged
        when validation passes or the fix attempt fails.
        """
        validation = self._client.validate_feature_code(
            rule.code, {col: sample_data[col].tolist() for col in sample_data.columns}
        )

        if not validation["valid"]:
            if self.verbose:
                logger.warning(f"Rule validation failed: {validation['error']}")

            # Try to fix common issues
            fixed_code = self._fix_common_issues(rule.code, validation["error"])

            # Re-validate
            validation = self._client.validate_feature_code(
                fixed_code, {col: sample_data[col].tolist() for col in sample_data.columns}
            )

            if validation["valid"]:
                rule.code = fixed_code
            else:
                logger.warning(f"Could not fix rule code: {validation['error']}")

        return rule

    def _fix_common_issues(self, code: str, error: str) -> str:
        """Attempt to fix common code issues reported by validation."""
        if "division by zero" in error.lower():
            # Add a small epsilon to denominators to avoid ZeroDivisionError
            code = re.sub(r"/\s*\(([^)]+)\)", r"/ (\1 + 1e-8)", code)
            code = re.sub(r"/\s*df\['([^']+)'\]", r"/ (df['\1'] + 1e-8)", code)

        if "syntax" in error.lower():
            # Normalize typographic ("smart") quotes the LLM sometimes emits
            # (U+2018/U+2019 and U+201C/U+201D) to plain ASCII quotes so the
            # generated code compiles.
            code = code.replace("\u2018", "'").replace("\u2019", "'")
            code = code.replace("\u201c", '"').replace("\u201d", '"')

        return code

    def suggest_rules(
        self,
        columns: dict[str, str],
        task_description: Optional[str] = None,
        limit: int = 5,
    ) -> list[tuple[TransformRule, dict[str, str]]]:
        """
        Suggest applicable rules from the store for given columns.

        Parameters
        ----------
        columns : dict
            Available columns and their types
        task_description : str, optional
            Description of the ML task for better matching
        limit : int, default=5
            Maximum number of suggestions

        Returns
        -------
        list[tuple[TransformRule, dict]]
            List of (rule, column_mapping) tuples
        """
        if self.store is None:
            logger.warning("No rule store configured")
            return []

        column_names = list(columns.keys())
        matching = self.store.find_matching_rules(columns=column_names, description=task_description)

        return matching[:limit]

    def generate_and_suggest(
        self,
        description: str,
        columns: dict[str, str],
        sample_data: Optional[pd.DataFrame] = None,
        tags: Optional[list[str]] = None,
    ) -> tuple[Optional[TransformRule], list[tuple[TransformRule, dict[str, str]]]]:
        """
        Find existing matching rules or generate a new one.

        First searches for existing rules that match the description and columns.
        If no good matches found, generates a new rule.

        Parameters
        ----------
        description : str
            Natural language description
        columns : dict
            Available columns and their types
        sample_data : DataFrame, optional
            Sample data for validation
        tags : list[str], optional
            Tags for the new rule

        Returns
        -------
        new_rule : TransformRule or None
            Newly generated rule (None if existing rules found)
        existing_rules : list
            List of matching existing rules with column mappings
        """
        # Search for existing rules
        existing = self.suggest_rules(columns, description, limit=3)

        if existing:
            if self.verbose:
                logger.info(f"Found {len(existing)} existing matching rules")
            return None, existing

        # Generate new rule
        if self.verbose:
            logger.info("No matching rules found, generating new rule")

        new_rule = self.generate_from_description(
            description=description,
            columns=columns,
            sample_data=sample_data,
            tags=tags,
            save=False,
        )

        return new_rule, []

    def save_rule(self, rule: TransformRule) -> str:
        """
        Save a rule to the store.

        Parameters
        ----------
        rule : TransformRule
            Rule to save

        Returns
        -------
        str
            Rule ID

        Raises
        ------
        ValueError
            If no rule store is configured.
        """
        if self.store is None:
            raise ValueError("No rule store configured")
        return self.store.save_rule(rule)

    def __del__(self):
        """Clean up client.

        Best-effort: exceptions during interpreter shutdown are swallowed.
        """
        if self._client:
            try:
                self._client.stop()
            except Exception:
                pass
|
@@ -6,6 +6,9 @@ import numpy as np
|
|
|
6
6
|
import pandas as pd
|
|
7
7
|
|
|
8
8
|
from featcopilot.core.base import BaseSelector
|
|
9
|
+
from featcopilot.utils.logger import get_logger
|
|
10
|
+
|
|
11
|
+
logger = get_logger(__name__)
|
|
9
12
|
|
|
10
13
|
|
|
11
14
|
class ImportanceSelector(BaseSelector):
|
|
@@ -63,23 +66,51 @@ class ImportanceSelector(BaseSelector):
|
|
|
63
66
|
-------
|
|
64
67
|
self : ImportanceSelector
|
|
65
68
|
"""
|
|
69
|
+
from sklearn.preprocessing import LabelEncoder
|
|
70
|
+
|
|
66
71
|
X = self._validate_input(X)
|
|
67
72
|
y = np.array(y)
|
|
68
73
|
|
|
74
|
+
# Encode string labels if needed
|
|
75
|
+
y_encoded = y
|
|
76
|
+
if y.dtype == object or y.dtype.kind in ("U", "S"):
|
|
77
|
+
le = LabelEncoder()
|
|
78
|
+
y_encoded = le.fit_transform(y)
|
|
79
|
+
|
|
69
80
|
# Determine task type
|
|
70
|
-
unique_y = len(np.unique(
|
|
71
|
-
is_classification =
|
|
81
|
+
unique_y = len(np.unique(y_encoded))
|
|
82
|
+
is_classification = (
|
|
83
|
+
y.dtype == object
|
|
84
|
+
or y.dtype.kind in ("U", "S")
|
|
85
|
+
or (np.issubdtype(y_encoded.dtype, np.integer) and unique_y <= len(y_encoded) * 0.1)
|
|
86
|
+
)
|
|
72
87
|
|
|
73
88
|
# Create model
|
|
74
89
|
self._model = self._create_model(is_classification)
|
|
75
90
|
|
|
76
|
-
#
|
|
77
|
-
|
|
78
|
-
self.
|
|
91
|
+
# Filter to numeric columns only
|
|
92
|
+
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
|
|
93
|
+
self._numeric_cols = numeric_cols
|
|
79
94
|
|
|
80
|
-
|
|
95
|
+
if not numeric_cols:
|
|
96
|
+
# No numeric columns, return empty selection
|
|
97
|
+
self._feature_scores = {col: 0.0 for col in X.columns}
|
|
98
|
+
self._select_features()
|
|
99
|
+
self._is_fitted = True
|
|
100
|
+
return self
|
|
101
|
+
|
|
102
|
+
X_numeric = X[numeric_cols].fillna(0).values
|
|
103
|
+
self._model.fit(X_numeric, y_encoded)
|
|
104
|
+
|
|
105
|
+
# Get importances for numeric columns
|
|
81
106
|
importances = self._model.feature_importances_
|
|
82
|
-
self._feature_scores =
|
|
107
|
+
self._feature_scores = {}
|
|
108
|
+
for col in X.columns:
|
|
109
|
+
if col in numeric_cols:
|
|
110
|
+
idx = numeric_cols.index(col)
|
|
111
|
+
self._feature_scores[col] = importances[idx]
|
|
112
|
+
else:
|
|
113
|
+
self._feature_scores[col] = 0.0
|
|
83
114
|
|
|
84
115
|
# Select features
|
|
85
116
|
self._select_features()
|
|
@@ -119,7 +150,7 @@ class ImportanceSelector(BaseSelector):
|
|
|
119
150
|
return xgb.XGBRegressor(n_estimators=self.n_estimators, random_state=42, n_jobs=-1)
|
|
120
151
|
except ImportError:
|
|
121
152
|
if self.verbose:
|
|
122
|
-
|
|
153
|
+
logger.warning("XGBoost not available, falling back to RandomForest")
|
|
123
154
|
return self._create_model_fallback(is_classification)
|
|
124
155
|
|
|
125
156
|
else:
|
|
@@ -149,7 +180,7 @@ class ImportanceSelector(BaseSelector):
|
|
|
149
180
|
self._selected_features = [name for name, _ in sorted_features]
|
|
150
181
|
|
|
151
182
|
if self.verbose:
|
|
152
|
-
|
|
183
|
+
logger.info(f"ImportanceSelector: Selected {len(self._selected_features)} features")
|
|
153
184
|
|
|
154
185
|
def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
|
|
155
186
|
"""Select features from data."""
|
|
@@ -6,6 +6,9 @@ import numpy as np
|
|
|
6
6
|
import pandas as pd
|
|
7
7
|
|
|
8
8
|
from featcopilot.core.base import BaseSelector
|
|
9
|
+
from featcopilot.utils.logger import get_logger
|
|
10
|
+
|
|
11
|
+
logger = get_logger(__name__)
|
|
9
12
|
|
|
10
13
|
|
|
11
14
|
class RedundancyEliminator(BaseSelector):
|
|
@@ -21,6 +24,10 @@ class RedundancyEliminator(BaseSelector):
|
|
|
21
24
|
Correlation threshold for redundancy
|
|
22
25
|
method : str, default='pearson'
|
|
23
26
|
Correlation method ('pearson', 'spearman', 'kendall')
|
|
27
|
+
original_features : set[str], optional
|
|
28
|
+
Set of original feature names to prefer over derived features
|
|
29
|
+
original_preference : float, default=0.1
|
|
30
|
+
Bonus added to importance scores of original features to prefer them
|
|
24
31
|
|
|
25
32
|
Examples
|
|
26
33
|
--------
|
|
@@ -33,6 +40,8 @@ class RedundancyEliminator(BaseSelector):
|
|
|
33
40
|
correlation_threshold: float = 0.95,
|
|
34
41
|
method: str = "pearson",
|
|
35
42
|
importance_scores: Optional[dict[str, float]] = None,
|
|
43
|
+
original_features: Optional[set[str]] = None,
|
|
44
|
+
original_preference: float = 0.1,
|
|
36
45
|
verbose: bool = False,
|
|
37
46
|
**kwargs,
|
|
38
47
|
):
|
|
@@ -40,6 +49,8 @@ class RedundancyEliminator(BaseSelector):
|
|
|
40
49
|
self.correlation_threshold = correlation_threshold
|
|
41
50
|
self.method = method
|
|
42
51
|
self.importance_scores = importance_scores or {}
|
|
52
|
+
self.original_features = original_features or set()
|
|
53
|
+
self.original_preference = original_preference
|
|
43
54
|
self.verbose = verbose
|
|
44
55
|
self._correlation_matrix: Optional[pd.DataFrame] = None
|
|
45
56
|
|
|
@@ -80,17 +91,19 @@ class RedundancyEliminator(BaseSelector):
|
|
|
80
91
|
if importance_scores:
|
|
81
92
|
self.importance_scores = importance_scores
|
|
82
93
|
|
|
83
|
-
# Compute correlation matrix
|
|
84
|
-
numeric_cols = X.select_dtypes(include=[np.number]).columns
|
|
94
|
+
# Compute correlation matrix (only for numeric columns)
|
|
95
|
+
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
|
|
96
|
+
non_numeric_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
|
|
97
|
+
|
|
85
98
|
self._correlation_matrix = X[numeric_cols].corr(method=self.method)
|
|
86
99
|
|
|
87
|
-
# Find redundant features
|
|
88
|
-
self._find_redundant_features(numeric_cols)
|
|
100
|
+
# Find redundant features among numeric columns
|
|
101
|
+
self._find_redundant_features(numeric_cols, non_numeric_cols)
|
|
89
102
|
|
|
90
103
|
self._is_fitted = True
|
|
91
104
|
return self
|
|
92
105
|
|
|
93
|
-
def _find_redundant_features(self, columns: list[str]) -> None:
|
|
106
|
+
def _find_redundant_features(self, columns: list[str], non_numeric_cols: list[str]) -> None:
|
|
94
107
|
"""Identify and mark redundant features for removal."""
|
|
95
108
|
to_remove: set[str] = set()
|
|
96
109
|
checked_pairs: set[tuple] = set()
|
|
@@ -112,26 +125,42 @@ class RedundancyEliminator(BaseSelector):
|
|
|
112
125
|
corr = abs(self._correlation_matrix.loc[col1, col2])
|
|
113
126
|
|
|
114
127
|
if corr >= self.correlation_threshold:
|
|
115
|
-
# Decide which to remove based on importance
|
|
128
|
+
# Decide which to remove based on importance + original feature preference
|
|
116
129
|
imp1 = self.importance_scores.get(col1, 0)
|
|
117
130
|
imp2 = self.importance_scores.get(col2, 0)
|
|
118
131
|
|
|
132
|
+
# Add preference bonus for original features
|
|
133
|
+
# This ensures original features are preferred over derived ones
|
|
134
|
+
is_orig1 = col1 in self.original_features
|
|
135
|
+
is_orig2 = col2 in self.original_features
|
|
136
|
+
|
|
137
|
+
if is_orig1 and not is_orig2:
|
|
138
|
+
# col1 is original, col2 is derived - prefer col1
|
|
139
|
+
imp1 += self.original_preference
|
|
140
|
+
elif is_orig2 and not is_orig1:
|
|
141
|
+
# col2 is original, col1 is derived - prefer col2
|
|
142
|
+
imp2 += self.original_preference
|
|
143
|
+
|
|
119
144
|
if imp1 >= imp2:
|
|
120
145
|
to_remove.add(col2)
|
|
121
146
|
if self.verbose:
|
|
122
|
-
|
|
147
|
+
orig_tag = " (derived)" if not is_orig2 else ""
|
|
148
|
+
logger.info(f"Removing {col2}{orig_tag} (corr={corr:.3f} with {col1})")
|
|
123
149
|
else:
|
|
124
150
|
to_remove.add(col1)
|
|
125
151
|
if self.verbose:
|
|
126
|
-
|
|
152
|
+
orig_tag = " (derived)" if not is_orig1 else ""
|
|
153
|
+
logger.info(f"Removing {col1}{orig_tag} (corr={corr:.3f} with {col2})")
|
|
127
154
|
break # col1 is removed, move to next
|
|
128
155
|
|
|
129
|
-
# Selected features are those not removed
|
|
156
|
+
# Selected features are those not removed (numeric) plus all non-numeric columns
|
|
157
|
+
# Non-numeric columns (categorical/text) are always preserved
|
|
130
158
|
self._selected_features = [c for c in columns if c not in to_remove]
|
|
159
|
+
self._selected_features.extend(non_numeric_cols) # Always include non-numeric
|
|
131
160
|
self._removed_features = list(to_remove)
|
|
132
161
|
|
|
133
162
|
if self.verbose:
|
|
134
|
-
|
|
163
|
+
logger.info(f"RedundancyEliminator: Removed {len(to_remove)} redundant features")
|
|
135
164
|
|
|
136
165
|
def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
|
|
137
166
|
"""Remove redundant features."""
|