featcopilot-0.2.0-py3-none-any.whl → featcopilot-0.3.0-py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions exactly as they appear in their public registries.
- featcopilot/__init__.py +7 -0
- featcopilot/core/__init__.py +2 -0
- featcopilot/core/transform_rule.py +276 -0
- featcopilot/engines/tabular.py +145 -2
- featcopilot/engines/text.py +346 -8
- featcopilot/engines/timeseries.py +230 -1
- featcopilot/llm/__init__.py +2 -0
- featcopilot/llm/copilot_client.py +50 -17
- featcopilot/llm/semantic_engine.py +652 -10
- featcopilot/llm/transform_rule_generator.py +403 -0
- featcopilot/selection/importance.py +35 -7
- featcopilot/selection/redundancy.py +35 -9
- featcopilot/selection/statistical.py +103 -33
- featcopilot/selection/unified.py +54 -3
- featcopilot/stores/__init__.py +2 -0
- featcopilot/stores/rule_store.py +343 -0
- featcopilot/transformers/sklearn_compat.py +10 -1
- {featcopilot-0.2.0.dist-info → featcopilot-0.3.0.dist-info}/METADATA +27 -19
- featcopilot-0.3.0.dist-info/RECORD +38 -0
- featcopilot-0.2.0.dist-info/RECORD +0 -35
- {featcopilot-0.2.0.dist-info → featcopilot-0.3.0.dist-info}/WHEEL +0 -0
- {featcopilot-0.2.0.dist-info → featcopilot-0.3.0.dist-info}/top_level.txt +0 -0
featcopilot/selection/statistical.py
CHANGED

@@ -98,67 +98,137 @@ class StatisticalSelector(BaseSelector):
    def _compute_mutual_info(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
        """Compute mutual information scores."""
        from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
+        from sklearn.preprocessing import LabelEncoder

-        # …
+        # Encode string labels if needed
+        y_encoded = y
+        if y.dtype == object or y.dtype.kind in ("U", "S"):
+            le = LabelEncoder()
+            y_encoded = le.fit_transform(y)

-        if …
+        # Determine if classification or regression
+        # Check if target is categorical (object type) or has discrete integer values
+        unique_y = len(np.unique(y_encoded))
+        is_classification = (
+            y.dtype == object
+            or y.dtype.kind in ("U", "S")
+            or (np.issubdtype(y_encoded.dtype, np.integer) and unique_y <= len(y_encoded) * 0.1)
+        )
+
+        # Filter to numeric columns only
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+        scores = np.zeros(len(X.columns))
+
+        if numeric_cols:
+            X_numeric = X[numeric_cols].fillna(0).values
+            numeric_indices = [X.columns.get_loc(c) for c in numeric_cols]
+
+            if is_classification:
+                numeric_scores = mutual_info_classif(X_numeric, y_encoded, random_state=42)
+            else:
+                numeric_scores = mutual_info_regression(X_numeric, y_encoded, random_state=42)
+
+            for i, idx in enumerate(numeric_indices):
+                scores[idx] = numeric_scores[i]

        return scores

    def _compute_f_test(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
        """Compute F-test scores."""
        from sklearn.feature_selection import f_classif, f_regression
+        from sklearn.preprocessing import LabelEncoder
+
+        # Encode string labels if needed
+        y_encoded = y
+        if y.dtype == object or y.dtype.kind in ("U", "S"):
+            le = LabelEncoder()
+            y_encoded = le.fit_transform(y)

-        …
+        # Determine if classification or regression
+        unique_y = len(np.unique(y_encoded))
+        is_classification = (
+            y.dtype == object
+            or y.dtype.kind in ("U", "S")
+            or (np.issubdtype(y_encoded.dtype, np.integer) and unique_y <= len(y_encoded) * 0.1)
+        )

-        …
+        # Filter to numeric columns only
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+        scores = np.zeros(len(X.columns))

-        if …
+        if numeric_cols:
+            X_numeric = X[numeric_cols].fillna(0).values
+            numeric_indices = [X.columns.get_loc(c) for c in numeric_cols]
+
+            if is_classification:
+                numeric_scores, _ = f_classif(X_numeric, y_encoded)
+            else:
+                numeric_scores, _ = f_regression(X_numeric, y_encoded)
+
+            # Handle NaN scores
+            numeric_scores = np.nan_to_num(numeric_scores, 0)
+
+            for i, idx in enumerate(numeric_indices):
+                scores[idx] = numeric_scores[i]

-        # Handle NaN scores
-        scores = np.nan_to_num(scores, 0)
        return scores

    def _compute_chi2(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
        """Compute chi-square scores (for non-negative features)."""
        from sklearn.feature_selection import chi2
+        from sklearn.preprocessing import LabelEncoder

-        …
+        # Encode string labels if needed
+        y_encoded = y
+        if y.dtype == object or y.dtype.kind in ("U", "S"):
+            le = LabelEncoder()
+            y_encoded = le.fit_transform(y)

-        # …
+        # Filter to numeric columns only
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+        scores = np.zeros(len(X.columns))

-        # …
+        if numeric_cols:
+            X_numeric = X[numeric_cols].fillna(0).values
+            numeric_indices = [X.columns.get_loc(c) for c in numeric_cols]
+
+            # Chi2 requires non-negative values
+            X_positive = X_numeric - X_numeric.min(axis=0) + 1e-8
+
+            try:
+                numeric_scores, _ = chi2(X_positive, y_encoded)
+                numeric_scores = np.nan_to_num(numeric_scores, 0)
+            except Exception:
+                # Fallback to mutual information
+                return self._compute_mutual_info(X, y)
+
+            for i, idx in enumerate(numeric_indices):
+                scores[idx] = numeric_scores[i]

        return scores

    def _compute_correlation(self, X: pd.DataFrame, y: np.ndarray) -> np.ndarray:
        """Compute absolute correlation with target."""
-        …
+        from sklearn.preprocessing import LabelEncoder
+
+        # Encode string labels if needed
+        y_encoded = y
+        if y.dtype == object or y.dtype.kind in ("U", "S"):
+            le = LabelEncoder()
+            y_encoded = le.fit_transform(y).astype(float)
+
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+        scores = np.zeros(len(X.columns))
+
+        for col in numeric_cols:
            try:
-                …
+                idx = X.columns.get_loc(col)
+                corr = np.abs(np.corrcoef(X[col].fillna(0).values, y_encoded)[0, 1])
+                scores[idx] = corr if not np.isnan(corr) else 0
            except Exception:
-                …
+                pass

-        return …
+        return scores

    def _select_features(self) -> None:
        """Select features based on scores."""
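Note: the `is_classification` check added to each scorer above treats the target as categorical when it is string-typed, or when it is integer-typed with at most 10% as many distinct values as samples. A minimal standalone sketch of that heuristic (the helper name and sample arrays are illustrative, not part of featcopilot):

import numpy as np

def looks_like_classification(y: np.ndarray) -> bool:
    # String/object targets always count as classification; integer targets
    # count only when distinct values are <= 10% of the sample count.
    if y.dtype == object or y.dtype.kind in ("U", "S"):
        return True
    unique_y = len(np.unique(y))
    return bool(np.issubdtype(y.dtype, np.integer) and unique_y <= len(y) * 0.1)

print(looks_like_classification(np.array(["spam", "ham"] * 50)))  # True: string labels
print(looks_like_classification(np.repeat([0, 1, 2], 40)))        # True: 3 classes in 120 rows
print(looks_like_classification(np.arange(120)))                  # False: 120 distinct ints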
featcopilot/selection/unified.py
CHANGED

@@ -46,6 +46,7 @@ class FeatureSelector(BaseSelector):
        max_features: Optional[int] = None,
        correlation_threshold: float = 0.95,
        combination: str = "union",
+        original_features: Optional[set[str]] = None,
        verbose: bool = False,
        **kwargs,
    ):

@@ -54,6 +55,7 @@ class FeatureSelector(BaseSelector):
        self.max_features = max_features
        self.correlation_threshold = correlation_threshold
        self.combination = combination  # 'union' or 'intersection'
+        self.original_features = original_features or set()
        self.verbose = verbose
        self._selectors: dict[str, BaseSelector] = {}
        self._method_scores: dict[str, dict[str, float]] = {}

@@ -76,6 +78,10 @@ class FeatureSelector(BaseSelector):
        X = self._validate_input(X)
        y = np.array(y)

+        # Identify categorical/text columns (can't be scored by numeric methods)
+        categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
+        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+
        # Initialize and fit each selector
        for method in self.methods:
            selector = self._create_selector(method)

@@ -86,11 +92,27 @@ class FeatureSelector(BaseSelector):
        # Combine scores from all methods
        self._combine_scores(X.columns.tolist())

+        # Give categorical columns a minimum score so they're not filtered out
+        # Original categorical columns are important for models that can handle them
+        if categorical_cols:
+            # Get the median score of numeric features to use as baseline for categorical
+            numeric_scores = [v for k, v in self._feature_scores.items() if k in numeric_cols and v > 0]
+            if numeric_scores:
+                baseline_score = np.median(numeric_scores)
+            else:
+                baseline_score = 0.5  # Default if no numeric scores
+
+            for col in categorical_cols:
+                if col in self.original_features:
+                    # Original categorical columns get a baseline score
+                    self._feature_scores[col] = max(self._feature_scores.get(col, 0), baseline_score)
+
        # Apply redundancy elimination
        if self.correlation_threshold < 1.0:
            eliminator = RedundancyEliminator(
                correlation_threshold=self.correlation_threshold,
                importance_scores=self._feature_scores,
+                original_features=self.original_features,
                verbose=self.verbose,
            )
            eliminator.fit(X)
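For illustration, the categorical-baseline step above can be traced with a toy score table (column names and scores are made up for this sketch; `_feature_scores` is flattened into a plain dict):

import numpy as np

feature_scores = {"age": 0.42, "income": 0.10, "clicks": 0.05, "city": 0.0}
numeric_cols = ["age", "income", "clicks"]
categorical_cols = ["city"]
original_features = {"age", "income", "clicks", "city"}

# Median positive numeric score becomes the floor for original categorical columns
numeric_scores = [v for k, v in feature_scores.items() if k in numeric_cols and v > 0]
baseline_score = np.median(numeric_scores) if numeric_scores else 0.5

for col in categorical_cols:
    if col in original_features:
        feature_scores[col] = max(feature_scores.get(col, 0), baseline_score)

print(feature_scores["city"])  # 0.1 -- 'city' now carries the median numeric score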
@@ -149,13 +171,42 @@ class FeatureSelector(BaseSelector):
        """Make final feature selection."""
        sorted_features = sorted(self._feature_scores.items(), key=lambda x: x[1], reverse=True)

+        # Always include original features first
+        original_selected = []
+        derived_selected = []
+
+        for name, score in sorted_features:
+            if name in self.original_features:
+                original_selected.append(name)
+            else:
+                # Only include derived features with meaningful importance (> 1% of max)
+                max_score = max(self._feature_scores.values()) if self._feature_scores else 1.0
+                importance_threshold = max_score * 0.01  # 1% threshold
+                if score >= importance_threshold:
+                    derived_selected.append(name)
+                elif self.verbose:
+                    logger.debug(f"Excluding low-importance feature {name} (score={score:.4f})")
+
+        # Apply max_features limit only to derived features
        if self.max_features is not None:
-            …
+            # Reserve slots for original features, then fill with top derived
+            n_derived = max(0, self.max_features - len(original_selected))
+            derived_selected = derived_selected[:n_derived]

-        self._selected_features = …
+        self._selected_features = original_selected + derived_selected
+
+        # Ensure we never have fewer features than original
+        if len(self._selected_features) < len(self.original_features):
+            # This should not happen, but add all original features as safety
+            for f in self.original_features:
+                if f not in self._selected_features:
+                    self._selected_features.append(f)

        if self.verbose:
-            logger.info(…
+            logger.info(
+                f"FeatureSelector: Selected {len(self._selected_features)} features "
+                f"({len(original_selected)} original + {len(derived_selected)} derived)"
+            )

    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """Select features from data."""
featcopilot/stores/__init__.py
CHANGED

@@ -7,9 +7,11 @@ and serving in production ML systems.

from featcopilot.stores.base import BaseFeatureStore, FeatureStoreConfig
from featcopilot.stores.feast_store import FeastFeatureStore
+from featcopilot.stores.rule_store import TransformRuleStore

__all__ = [
    "BaseFeatureStore",
    "FeatureStoreConfig",
    "FeastFeatureStore",
+    "TransformRuleStore",
]
featcopilot/stores/rule_store.py
ADDED

@@ -0,0 +1,343 @@
+"""Persistent storage for transform rules.
+
+Provides JSON-file based storage for saving, loading, and searching
+reusable transform rules.
+"""
+
+import json
+import os
+from pathlib import Path
+from typing import Optional
+
+from featcopilot.core.transform_rule import TransformRule
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class TransformRuleStore:
+    """
+    Persistent storage for transform rules.
+
+    Stores rules in a JSON file for reuse across sessions and datasets.
+    Supports searching by tags, description similarity, and column patterns.
+
+    Parameters
+    ----------
+    path : str, optional
+        Path to the JSON file for storage. Defaults to ~/.featcopilot/rules.json
+
+    Examples
+    --------
+    >>> store = TransformRuleStore()
+    >>> store.save_rule(rule)
+    >>> matching = store.find_matching_rules(columns=["price", "quantity"])
+    >>> all_rules = store.list_rules()
+    """
+
+    DEFAULT_PATH = "~/.featcopilot/rules.json"
+
+    def __init__(self, path: Optional[str] = None):
+        self.path = Path(os.path.expanduser(path or self.DEFAULT_PATH))
+        self._rules: dict[str, TransformRule] = {}
+        self._ensure_directory()
+        self._load()
+
+    def _ensure_directory(self) -> None:
+        """Ensure the storage directory exists."""
+        self.path.parent.mkdir(parents=True, exist_ok=True)
+
+    def _load(self) -> None:
+        """Load rules from storage file."""
+        if self.path.exists():
+            try:
+                with open(self.path, encoding="utf-8") as f:
+                    data = json.load(f)
+                self._rules = {rule_id: TransformRule.from_dict(rule_data) for rule_id, rule_data in data.items()}
+                logger.debug(f"Loaded {len(self._rules)} rules from {self.path}")
+            except (json.JSONDecodeError, KeyError) as e:
+                logger.warning(f"Failed to load rules from {self.path}: {e}")
+                self._rules = {}
+        else:
+            self._rules = {}
+
+    def _save(self) -> None:
+        """Save rules to storage file."""
+        try:
+            with open(self.path, "w", encoding="utf-8") as f:
+                data = {rule_id: rule.to_dict() for rule_id, rule in self._rules.items()}
+                json.dump(data, f, indent=2)
+            logger.debug(f"Saved {len(self._rules)} rules to {self.path}")
+        except OSError as e:
+            logger.error(f"Failed to save rules to {self.path}: {e}")
+            raise
+
+    def save_rule(self, rule: TransformRule) -> str:
+        """
+        Save a rule to the store.
+
+        Parameters
+        ----------
+        rule : TransformRule
+            The rule to save
+
+        Returns
+        -------
+        str
+            The rule's ID
+        """
+        self._rules[rule.id] = rule
+        self._save()
+        logger.info(f"Saved rule '{rule.name}' with ID {rule.id}")
+        return rule.id
+
+    def get_rule(self, rule_id: str) -> Optional[TransformRule]:
+        """
+        Get a rule by ID.
+
+        Parameters
+        ----------
+        rule_id : str
+            The rule's ID
+
+        Returns
+        -------
+        TransformRule or None
+            The rule if found, None otherwise
+        """
+        return self._rules.get(rule_id)
+
+    def get_rule_by_name(self, name: str) -> Optional[TransformRule]:
+        """
+        Get a rule by name.
+
+        Parameters
+        ----------
+        name : str
+            The rule's name
+
+        Returns
+        -------
+        TransformRule or None
+            The first rule matching the name, None if not found
+        """
+        for rule in self._rules.values():
+            if rule.name == name:
+                return rule
+        return None
+
+    def delete_rule(self, rule_id: str) -> bool:
+        """
+        Delete a rule by ID.
+
+        Parameters
+        ----------
+        rule_id : str
+            The rule's ID
+
+        Returns
+        -------
+        bool
+            True if deleted, False if not found
+        """
+        if rule_id in self._rules:
+            del self._rules[rule_id]
+            self._save()
+            logger.info(f"Deleted rule {rule_id}")
+            return True
+        return False
+
+    def list_rules(self, tags: Optional[list[str]] = None) -> list[TransformRule]:
+        """
+        List all rules, optionally filtered by tags.
+
+        Parameters
+        ----------
+        tags : list[str], optional
+            Filter rules that have all specified tags
+
+        Returns
+        -------
+        list[TransformRule]
+            List of matching rules
+        """
+        rules = list(self._rules.values())
+
+        if tags:
+            rules = [r for r in rules if all(t in r.tags for t in tags)]
+
+        return rules
+
+    def find_matching_rules(
+        self,
+        columns: Optional[list[str]] = None,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        min_usage: int = 0,
+    ) -> list[tuple[TransformRule, dict[str, str]]]:
+        """
+        Find rules that can be applied to the given context.
+
+        Parameters
+        ----------
+        columns : list[str], optional
+            Available column names to match against
+        description : str, optional
+            Description to search for (keyword matching)
+        tags : list[str], optional
+            Required tags
+        min_usage : int, default=0
+            Minimum usage count
+
+        Returns
+        -------
+        list[tuple[TransformRule, dict]]
+            List of (rule, column_mapping) tuples for applicable rules,
+            sorted by usage count (most used first)
+        """
+        results: list[tuple[TransformRule, dict[str, str]]] = []
+
+        for rule in self._rules.values():
+            # Filter by usage count
+            if rule.usage_count < min_usage:
+                continue
+
+            # Filter by tags
+            if tags and not all(t in rule.tags for t in tags):
+                continue
+
+            # Filter by description keywords
+            if description:
+                keywords = description.lower().split()
+                rule_text = f"{rule.name} {rule.description}".lower()
+                if not any(kw in rule_text for kw in keywords):
+                    continue
+
+            # Check column compatibility
+            mapping: dict[str, str] = {}
+            if columns:
+                matches, mapping = rule.matches_columns(columns)
+                if not matches:
+                    continue
+
+            results.append((rule, mapping))
+
+        # Sort by usage count (descending)
+        results.sort(key=lambda x: x[0].usage_count, reverse=True)
+
+        return results
+
+    def search_by_description(self, query: str, limit: int = 10) -> list[TransformRule]:
+        """
+        Search rules by description similarity.
+
+        Parameters
+        ----------
+        query : str
+            Search query
+        limit : int, default=10
+            Maximum number of results
+
+        Returns
+        -------
+        list[TransformRule]
+            Matching rules sorted by relevance
+        """
+        query_words = set(query.lower().split())
+        scored_rules: list[tuple[float, TransformRule]] = []
+
+        for rule in self._rules.values():
+            rule_words = set(f"{rule.name} {rule.description}".lower().split())
+
+            # Simple word overlap scoring
+            overlap = len(query_words & rule_words)
+            if overlap > 0:
+                score = overlap / len(query_words)
+                scored_rules.append((score, rule))
+
+        # Sort by score descending
+        scored_rules.sort(key=lambda x: x[0], reverse=True)
+
+        return [rule for _, rule in scored_rules[:limit]]
+
+    def import_rules(self, path: str, merge: bool = True) -> int:
+        """
+        Import rules from another JSON file.
+
+        Parameters
+        ----------
+        path : str
+            Path to import from
+        merge : bool, default=True
+            If True, merge with existing rules. If False, replace all.
+
+        Returns
+        -------
+        int
+            Number of rules imported
+        """
+        import_path = Path(os.path.expanduser(path))
+
+        if not import_path.exists():
+            raise FileNotFoundError(f"Import file not found: {path}")
+
+        with open(import_path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        if not merge:
+            self._rules = {}
+
+        count = 0
+        for _rule_id, rule_data in data.items():
+            rule = TransformRule.from_dict(rule_data)
+            self._rules[rule.id] = rule
+            count += 1
+
+        self._save()
+        logger.info(f"Imported {count} rules from {path}")
+
+        return count
+
+    def export_rules(self, path: str, tags: Optional[list[str]] = None) -> int:
+        """
+        Export rules to a JSON file.
+
+        Parameters
+        ----------
+        path : str
+            Path to export to
+        tags : list[str], optional
+            Only export rules with these tags
+
+        Returns
+        -------
+        int
+            Number of rules exported
+        """
+        export_path = Path(os.path.expanduser(path))
+        export_path.parent.mkdir(parents=True, exist_ok=True)
+
+        rules_to_export = self.list_rules(tags=tags)
+
+        with open(export_path, "w", encoding="utf-8") as f:
+            data = {r.id: r.to_dict() for r in rules_to_export}
+            json.dump(data, f, indent=2)
+
+        logger.info(f"Exported {len(rules_to_export)} rules to {path}")
+
+        return len(rules_to_export)
+
+    def clear(self) -> None:
+        """Remove all rules from the store."""
+        self._rules = {}
+        self._save()
+        logger.info("Cleared all rules")
+
+    def __len__(self) -> int:
+        return len(self._rules)
+
+    def __contains__(self, rule_id: str) -> bool:
+        return rule_id in self._rules
+
+    def __iter__(self):
+        return iter(self._rules.values())
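Taken together, the new store is a small JSON-backed CRUD-and-search API. A usage sketch against the methods above (paths, tags, and queries are illustrative; `rule` stands in for a TransformRule instance, whose definition lives in featcopilot/core/transform_rule.py and is not shown in this diff):

from featcopilot.stores import TransformRuleStore

# Point the store at a throwaway file instead of the ~/.featcopilot default
store = TransformRuleStore(path="/tmp/featcopilot_demo_rules.json")

# rule_id = store.save_rule(rule)  # rule: a TransformRule built elsewhere

# Rules applicable to a dataset's columns, each with a column mapping,
# sorted by usage_count (most used first)
for rule, mapping in store.find_matching_rules(columns=["price", "quantity"]):
    print(rule.name, mapping)

# Keyword search scores rules by word overlap with "name + description"
for rule in store.search_by_description("ratio of price to quantity", limit=5):
    print(rule.name)

# Rules round-trip through plain JSON files
store.export_rules("/tmp/rules_backup.json")
store.import_rules("/tmp/rules_backup.json", merge=True)
print(len(store))  # number of stored rules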
featcopilot/transformers/sklearn_compat.py
CHANGED

@@ -90,7 +90,7 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
    Parameters
    ----------
    engines : list, default=['tabular']
-        Engines to use ('tabular', 'timeseries', 'text', 'llm')
+        Engines to use ('tabular', 'timeseries', 'relational', 'text', 'llm')
    max_features : int, optional
        Maximum features to generate/select
    selection_methods : list, default=['mutual_info', 'importance']

@@ -199,6 +199,8 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
            return TimeSeriesEngine(max_features=self.max_features, verbose=self.verbose)
        elif engine_name == "text":
            return TextEngine(max_features=self.max_features, verbose=self.verbose)
+        elif engine_name == "relational":
+            return RelationalEngine(max_features=self.max_features, verbose=self.verbose)
        elif engine_name == "llm":
            from featcopilot.llm.semantic_engine import SemanticEngine

@@ -294,12 +296,19 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
        self.fit(X, y, column_descriptions, task_description, **fit_params)
        result = self.transform(X)

+        # Track original features (input columns) vs derived features
+        if isinstance(X, np.ndarray):
+            original_features = {f"feature_{i}" for i in range(X.shape[1])}
+        else:
+            original_features = set(X.columns)
+
        # Apply feature selection if enabled and y is provided
        if apply_selection and y is not None and self.max_features:
            self._selector = FeatureSelector(
                methods=self.selection_methods,
                max_features=self.max_features,
                correlation_threshold=self.correlation_threshold,
+                original_features=original_features,
                verbose=self.verbose,
            )
            result = self._selector.fit_transform(result, y)