featcopilot 0.1.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {featcopilot-0.1.0 → featcopilot-0.3.0}/PKG-INFO +56 -25
- {featcopilot-0.1.0 → featcopilot-0.3.0}/README.md +39 -24
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/__init__.py +10 -1
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/core/__init__.py +2 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/core/feature.py +5 -1
- featcopilot-0.3.0/featcopilot/core/transform_rule.py +276 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/engines/relational.py +5 -2
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/engines/tabular.py +151 -5
- featcopilot-0.3.0/featcopilot/engines/text.py +552 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/engines/timeseries.py +235 -3
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/llm/__init__.py +6 -1
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/llm/code_generator.py +7 -4
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/llm/copilot_client.py +97 -20
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/llm/explainer.py +6 -3
- featcopilot-0.3.0/featcopilot/llm/litellm_client.py +595 -0
- featcopilot-0.3.0/featcopilot/llm/semantic_engine.py +1070 -0
- featcopilot-0.3.0/featcopilot/llm/transform_rule_generator.py +403 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/selection/importance.py +40 -9
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/selection/redundancy.py +39 -10
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/selection/statistical.py +107 -34
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/selection/unified.py +57 -3
- featcopilot-0.3.0/featcopilot/stores/__init__.py +17 -0
- featcopilot-0.3.0/featcopilot/stores/base.py +166 -0
- featcopilot-0.3.0/featcopilot/stores/feast_store.py +541 -0
- featcopilot-0.3.0/featcopilot/stores/rule_store.py +343 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/transformers/sklearn_compat.py +18 -6
- featcopilot-0.3.0/featcopilot/utils/__init__.py +23 -0
- featcopilot-0.3.0/featcopilot/utils/logger.py +47 -0
- featcopilot-0.3.0/featcopilot/utils/models.py +287 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/utils/parallel.py +5 -1
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot.egg-info/PKG-INFO +56 -25
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot.egg-info/SOURCES.txt +13 -1
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot.egg-info/requires.txt +19 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/pyproject.toml +20 -1
- featcopilot-0.3.0/tests/test_litellm.py +249 -0
- featcopilot-0.3.0/tests/test_stores.py +261 -0
- featcopilot-0.3.0/tests/test_transform_rules.py +507 -0
- featcopilot-0.1.0/featcopilot/engines/text.py +0 -211
- featcopilot-0.1.0/featcopilot/llm/semantic_engine.py +0 -379
- featcopilot-0.1.0/featcopilot/utils/__init__.py +0 -9
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/core/base.py +0 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/core/registry.py +0 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/engines/__init__.py +0 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/selection/__init__.py +0 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/transformers/__init__.py +0 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/utils/cache.py +0 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot.egg-info/dependency_links.txt +0 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot.egg-info/top_level.txt +0 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/setup.cfg +0 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/tests/test_autofeat.py +0 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/tests/test_core.py +0 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/tests/test_engines.py +0 -0
- {featcopilot-0.1.0 → featcopilot-0.3.0}/tests/test_selection.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: featcopilot
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Next-generation LLM-powered auto feature engineering framework with GitHub Copilot SDK
|
|
5
5
|
Author: FeatCopilot Contributors
|
|
6
6
|
License: MIT
|
|
@@ -28,11 +28,27 @@ Requires-Dist: pydantic>=2.0.0
|
|
|
28
28
|
Requires-Dist: joblib>=1.1.0
|
|
29
29
|
Provides-Extra: llm
|
|
30
30
|
Requires-Dist: github-copilot-sdk>=0.1.0; extra == "llm"
|
|
31
|
+
Requires-Dist: nest_asyncio>=1.5.0; extra == "llm"
|
|
32
|
+
Provides-Extra: litellm
|
|
33
|
+
Requires-Dist: litellm>=1.0.0; extra == "litellm"
|
|
34
|
+
Requires-Dist: nest_asyncio>=1.5.0; extra == "litellm"
|
|
31
35
|
Provides-Extra: timeseries
|
|
32
36
|
Requires-Dist: statsmodels>=0.13.0; extra == "timeseries"
|
|
37
|
+
Provides-Extra: feast
|
|
38
|
+
Requires-Dist: feast>=0.30.0; extra == "feast"
|
|
33
39
|
Provides-Extra: full
|
|
34
40
|
Requires-Dist: github-copilot-sdk>=0.1.0; extra == "full"
|
|
41
|
+
Requires-Dist: litellm>=1.0.0; extra == "full"
|
|
35
42
|
Requires-Dist: statsmodels>=0.13.0; extra == "full"
|
|
43
|
+
Requires-Dist: feast>=0.30.0; extra == "full"
|
|
44
|
+
Requires-Dist: nest_asyncio>=1.5.0; extra == "full"
|
|
45
|
+
Provides-Extra: benchmark
|
|
46
|
+
Requires-Dist: github-copilot-sdk>=0.1.0; extra == "benchmark"
|
|
47
|
+
Requires-Dist: statsmodels>=0.13.0; extra == "benchmark"
|
|
48
|
+
Requires-Dist: flaml[automl,blendsearch]>=2.0.0; extra == "benchmark"
|
|
49
|
+
Requires-Dist: autogluon.tabular[fastai]>=1.5.0; extra == "benchmark"
|
|
50
|
+
Requires-Dist: h2o>=3.40.0; extra == "benchmark"
|
|
51
|
+
Requires-Dist: numpy<2; extra == "benchmark"
|
|
36
52
|
Provides-Extra: dev
|
|
37
53
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
38
54
|
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
@@ -44,32 +60,39 @@ Requires-Dist: pre-commit>=3.6.0; extra == "dev"
|
|
|
44
60
|
|
|
45
61
|
# FeatCopilot 🚀
|
|
46
62
|
|
|
47
|
-
**Next-Generation LLM-Powered Auto Feature Engineering
|
|
63
|
+
**Next-Generation LLM-Powered Auto Feature Engineering Framework**
|
|
48
64
|
|
|
49
|
-
FeatCopilot
|
|
65
|
+
FeatCopilot automatically generates, selects, and explains predictive features using semantic understanding. It analyzes column meanings, applies domain-aware transformations, and provides human-readable explanations—turning raw data into ML-ready features in seconds.
|
|
66
|
+
|
|
67
|
+
## 🎬 Introduction Video
|
|
68
|
+
|
|
69
|
+
[Watch the FeatCopilot introduction video](https://www.youtube.com/watch?v=H7m50TLGHFk)
|
|
50
70
|
|
|
51
71
|
## 📊 Benchmark Highlights
|
|
52
72
|
|
|
53
|
-
###
|
|
73
|
+
### Simple Models Benchmark (42 Datasets)
|
|
74
|
+
|
|
75
|
+
| Configuration | Improved | Avg Improvement | Best Improvement |
|
|
76
|
+
|---------------|----------|-----------------|------------------|
|
|
77
|
+
| **Tabular Engine** | 20 (48%) | +4.54% | +197% (delays_zurich) |
|
|
78
|
+
| **Tabular + LLM** | 23 (55%) | +6.12% | +420% (delays_zurich) |
|
|
79
|
+
|
|
80
|
+
Models: RandomForest (n_estimators=200, max_depth=20), LogisticRegression/Ridge
|
|
54
81
|
|
|
55
|
-
|
|
56
|
-
|-----------|--------------------:|----------:|
|
|
57
|
-
| **Text Classification** | **+12.44%** | +49.02% (News Headlines) |
|
|
58
|
-
| Time Series | +1.51% | +12.12% (Retail Demand) |
|
|
59
|
-
| Classification | +0.54% | +4.35% |
|
|
60
|
-
| Regression | +0.65% | +5.57% |
|
|
82
|
+
### AutoML Benchmark (FLAML, 120s budget)
|
|
61
83
|
|
|
62
|
-
|
|
84
|
+
| Metric | Value |
|
|
85
|
+
|--------|-------|
|
|
86
|
+
| **Datasets** | 41 |
|
|
87
|
+
| **Improved** | 19 (46%) |
|
|
88
|
+
| **Best Improvement** | +8.55% (abalone) |
|
|
63
89
|
|
|
64
|
-
|
|
65
|
-
|-----------|--------------------:|----------:|
|
|
66
|
-
| **Regression** | **+7.79%** | +19.66% (Retail Demand) |
|
|
67
|
-
| Classification | +2.38% | +2.87% |
|
|
90
|
+
### Key Results
|
|
68
91
|
|
|
69
|
-
- ✅
|
|
70
|
-
- 🧠 **+
|
|
71
|
-
-
|
|
72
|
-
-
|
|
92
|
+
- ✅ **+197% improvement** on delays_zurich (tabular only)
|
|
93
|
+
- 🧠 **+420% improvement** with LLM-enhanced features
|
|
94
|
+
- 📈 **+8.98%** on abalone regression task
|
|
95
|
+
- 🚀 **+5.68%** on complex_classification
|
|
73
96
|
|
|
74
97
|
[View Full Benchmark Results](https://thinkall.github.io/featcopilot/user-guide/benchmarks/)
|
|
75
98
|
|
|
@@ -87,7 +110,7 @@ FeatCopilot is a unified feature engineering framework that combines the best ap
|
|
|
87
110
|
# Basic installation
|
|
88
111
|
pip install featcopilot
|
|
89
112
|
|
|
90
|
-
# With LLM capabilities
|
|
113
|
+
# With LLM capabilities
|
|
91
114
|
pip install featcopilot[llm]
|
|
92
115
|
|
|
93
116
|
# Full installation
|
|
@@ -111,12 +134,12 @@ X_transformed = engineer.fit_transform(X, y) # <1 second
|
|
|
111
134
|
print(f"Features: {X.shape[1]} -> {X_transformed.shape[1]}")
|
|
112
135
|
```
|
|
113
136
|
|
|
114
|
-
### LLM Mode (With
|
|
137
|
+
### LLM Mode (With LiteLLM)
|
|
115
138
|
|
|
116
139
|
```python
|
|
117
140
|
from featcopilot import AutoFeatureEngineer
|
|
118
141
|
|
|
119
|
-
# LLM-powered semantic features (+
|
|
142
|
+
# LLM-powered semantic features (+420% max improvement)
|
|
120
143
|
engineer = AutoFeatureEngineer(
|
|
121
144
|
engines=['tabular', 'llm'],
|
|
122
145
|
max_features=50
|
|
@@ -164,16 +187,24 @@ engine = TimeSeriesEngine(
|
|
|
164
187
|
```
|
|
165
188
|
|
|
166
189
|
### LLM Engine
|
|
167
|
-
Uses GitHub Copilot SDK for intelligent feature generation.
|
|
190
|
+
Uses GitHub Copilot SDK (default) or LiteLLM (100+ providers) for intelligent feature generation.
|
|
168
191
|
|
|
169
192
|
```python
|
|
170
193
|
from featcopilot.llm import SemanticEngine
|
|
171
194
|
|
|
195
|
+
# Default: GitHub Copilot SDK
|
|
172
196
|
engine = SemanticEngine(
|
|
173
|
-
model='gpt-5',
|
|
197
|
+
model='gpt-5.2',
|
|
174
198
|
max_suggestions=20,
|
|
175
199
|
validate_features=True
|
|
176
200
|
)
|
|
201
|
+
|
|
202
|
+
# Alternative: LiteLLM backend
|
|
203
|
+
engine = SemanticEngine(
|
|
204
|
+
model='gpt-4o',
|
|
205
|
+
backend='litellm',
|
|
206
|
+
max_suggestions=20
|
|
207
|
+
)
|
|
177
208
|
```
|
|
178
209
|
|
|
179
210
|
## Feature Selection
|
|
@@ -211,7 +242,7 @@ X_selected = selector.fit_transform(X, y)
|
|
|
211
242
|
|
|
212
243
|
- Python 3.9+
|
|
213
244
|
- NumPy, Pandas, Scikit-learn
|
|
214
|
-
- GitHub Copilot
|
|
245
|
+
- GitHub Copilot SDK (default) or LiteLLM (for 100+ LLM providers)
|
|
215
246
|
|
|
216
247
|
## License
|
|
217
248
|
|
|
@@ -1,31 +1,38 @@
|
|
|
1
1
|
# FeatCopilot 🚀
|
|
2
2
|
|
|
3
|
-
**Next-Generation LLM-Powered Auto Feature Engineering
|
|
3
|
+
**Next-Generation LLM-Powered Auto Feature Engineering Framework**
|
|
4
4
|
|
|
5
|
-
FeatCopilot
|
|
5
|
+
FeatCopilot automatically generates, selects, and explains predictive features using semantic understanding. It analyzes column meanings, applies domain-aware transformations, and provides human-readable explanations—turning raw data into ML-ready features in seconds.
|
|
6
|
+
|
|
7
|
+
## 🎬 Introduction Video
|
|
8
|
+
|
|
9
|
+
[Watch the FeatCopilot introduction video](https://www.youtube.com/watch?v=H7m50TLGHFk)
|
|
6
10
|
|
|
7
11
|
## 📊 Benchmark Highlights
|
|
8
12
|
|
|
9
|
-
###
|
|
13
|
+
### Simple Models Benchmark (42 Datasets)
|
|
14
|
+
|
|
15
|
+
| Configuration | Improved | Avg Improvement | Best Improvement |
|
|
16
|
+
|---------------|----------|-----------------|------------------|
|
|
17
|
+
| **Tabular Engine** | 20 (48%) | +4.54% | +197% (delays_zurich) |
|
|
18
|
+
| **Tabular + LLM** | 23 (55%) | +6.12% | +420% (delays_zurich) |
|
|
19
|
+
|
|
20
|
+
Models: RandomForest (n_estimators=200, max_depth=20), LogisticRegression/Ridge
|
|
10
21
|
|
|
11
|
-
|
|
12
|
-
|-----------|--------------------:|----------:|
|
|
13
|
-
| **Text Classification** | **+12.44%** | +49.02% (News Headlines) |
|
|
14
|
-
| Time Series | +1.51% | +12.12% (Retail Demand) |
|
|
15
|
-
| Classification | +0.54% | +4.35% |
|
|
16
|
-
| Regression | +0.65% | +5.57% |
|
|
22
|
+
### AutoML Benchmark (FLAML, 120s budget)
|
|
17
23
|
|
|
18
|
-
|
|
24
|
+
| Metric | Value |
|
|
25
|
+
|--------|-------|
|
|
26
|
+
| **Datasets** | 41 |
|
|
27
|
+
| **Improved** | 19 (46%) |
|
|
28
|
+
| **Best Improvement** | +8.55% (abalone) |
|
|
19
29
|
|
|
20
|
-
|
|
21
|
-
|-----------|--------------------:|----------:|
|
|
22
|
-
| **Regression** | **+7.79%** | +19.66% (Retail Demand) |
|
|
23
|
-
| Classification | +2.38% | +2.87% |
|
|
30
|
+
### Key Results
|
|
24
31
|
|
|
25
|
-
- ✅
|
|
26
|
-
- 🧠 **+
|
|
27
|
-
-
|
|
28
|
-
-
|
|
32
|
+
- ✅ **+197% improvement** on delays_zurich (tabular only)
|
|
33
|
+
- 🧠 **+420% improvement** with LLM-enhanced features
|
|
34
|
+
- 📈 **+8.98%** on abalone regression task
|
|
35
|
+
- 🚀 **+5.68%** on complex_classification
|
|
29
36
|
|
|
30
37
|
[View Full Benchmark Results](https://thinkall.github.io/featcopilot/user-guide/benchmarks/)
|
|
31
38
|
|
|
@@ -43,7 +50,7 @@ FeatCopilot is a unified feature engineering framework that combines the best ap
|
|
|
43
50
|
# Basic installation
|
|
44
51
|
pip install featcopilot
|
|
45
52
|
|
|
46
|
-
# With LLM capabilities
|
|
53
|
+
# With LLM capabilities
|
|
47
54
|
pip install featcopilot[llm]
|
|
48
55
|
|
|
49
56
|
# Full installation
|
|
@@ -67,12 +74,12 @@ X_transformed = engineer.fit_transform(X, y) # <1 second
|
|
|
67
74
|
print(f"Features: {X.shape[1]} -> {X_transformed.shape[1]}")
|
|
68
75
|
```
|
|
69
76
|
|
|
70
|
-
### LLM Mode (With
|
|
77
|
+
### LLM Mode (With LiteLLM)
|
|
71
78
|
|
|
72
79
|
```python
|
|
73
80
|
from featcopilot import AutoFeatureEngineer
|
|
74
81
|
|
|
75
|
-
# LLM-powered semantic features (+
|
|
82
|
+
# LLM-powered semantic features (+420% max improvement)
|
|
76
83
|
engineer = AutoFeatureEngineer(
|
|
77
84
|
engines=['tabular', 'llm'],
|
|
78
85
|
max_features=50
|
|
@@ -120,16 +127,24 @@ engine = TimeSeriesEngine(
|
|
|
120
127
|
```
|
|
121
128
|
|
|
122
129
|
### LLM Engine
|
|
123
|
-
Uses GitHub Copilot SDK for intelligent feature generation.
|
|
130
|
+
Uses GitHub Copilot SDK (default) or LiteLLM (100+ providers) for intelligent feature generation.
|
|
124
131
|
|
|
125
132
|
```python
|
|
126
133
|
from featcopilot.llm import SemanticEngine
|
|
127
134
|
|
|
135
|
+
# Default: GitHub Copilot SDK
|
|
128
136
|
engine = SemanticEngine(
|
|
129
|
-
model='gpt-5',
|
|
137
|
+
model='gpt-5.2',
|
|
130
138
|
max_suggestions=20,
|
|
131
139
|
validate_features=True
|
|
132
140
|
)
|
|
141
|
+
|
|
142
|
+
# Alternative: LiteLLM backend
|
|
143
|
+
engine = SemanticEngine(
|
|
144
|
+
model='gpt-4o',
|
|
145
|
+
backend='litellm',
|
|
146
|
+
max_suggestions=20
|
|
147
|
+
)
|
|
133
148
|
```
|
|
134
149
|
|
|
135
150
|
## Feature Selection
|
|
@@ -167,7 +182,7 @@ X_selected = selector.fit_transform(X, y)
|
|
|
167
182
|
|
|
168
183
|
- Python 3.9+
|
|
169
184
|
- NumPy, Pandas, Scikit-learn
|
|
170
|
-
- GitHub Copilot
|
|
185
|
+
- GitHub Copilot SDK (default) or LiteLLM (for 100+ LLM providers)
|
|
171
186
|
|
|
172
187
|
## License
|
|
173
188
|
|
|
@@ -5,11 +5,16 @@ A unified feature engineering framework combining traditional approaches
|
|
|
5
5
|
with novel LLM-powered capabilities via GitHub Copilot SDK.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
from importlib.metadata import version
|
|
9
|
+
|
|
10
|
+
__version__ = version("featcopilot")
|
|
9
11
|
__author__ = "FeatCopilot Contributors"
|
|
10
12
|
|
|
11
13
|
from featcopilot.core.base import BaseEngine, BaseSelector
|
|
12
14
|
from featcopilot.core.feature import Feature, FeatureSet
|
|
15
|
+
from featcopilot.core.transform_rule import TransformRule
|
|
16
|
+
from featcopilot.llm.transform_rule_generator import TransformRuleGenerator
|
|
17
|
+
from featcopilot.stores.rule_store import TransformRuleStore
|
|
13
18
|
from featcopilot.transformers.sklearn_compat import (
|
|
14
19
|
AutoFeatureEngineer,
|
|
15
20
|
FeatureEngineerTransformer,
|
|
@@ -21,6 +26,10 @@ __all__ = [
|
|
|
21
26
|
"BaseSelector",
|
|
22
27
|
"Feature",
|
|
23
28
|
"FeatureSet",
|
|
29
|
+
# Transform Rules
|
|
30
|
+
"TransformRule",
|
|
31
|
+
"TransformRuleStore",
|
|
32
|
+
"TransformRuleGenerator",
|
|
24
33
|
# Main API
|
|
25
34
|
"AutoFeatureEngineer",
|
|
26
35
|
"FeatureEngineerTransformer",
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from featcopilot.core.base import BaseEngine, BaseSelector
|
|
4
4
|
from featcopilot.core.feature import Feature, FeatureSet
|
|
5
5
|
from featcopilot.core.registry import FeatureRegistry
|
|
6
|
+
from featcopilot.core.transform_rule import TransformRule
|
|
6
7
|
|
|
7
8
|
__all__ = [
|
|
8
9
|
"BaseEngine",
|
|
@@ -10,4 +11,5 @@ __all__ = [
|
|
|
10
11
|
"Feature",
|
|
11
12
|
"FeatureSet",
|
|
12
13
|
"FeatureRegistry",
|
|
14
|
+
"TransformRule",
|
|
13
15
|
]
|
|
@@ -7,6 +7,10 @@ from typing import Any, Optional
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
import pandas as pd
|
|
9
9
|
|
|
10
|
+
from featcopilot.utils.logger import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger(__name__)
|
|
13
|
+
|
|
10
14
|
|
|
11
15
|
class FeatureType(Enum):
|
|
12
16
|
"""Types of features."""
|
|
@@ -220,5 +224,5 @@ class FeatureSet:
|
|
|
220
224
|
result[feature.name] = feature.compute(df)
|
|
221
225
|
except Exception as e:
|
|
222
226
|
# Log warning but continue
|
|
223
|
-
|
|
227
|
+
logger.warning(f"Could not compute feature {feature.name}: {e}")
|
|
224
228
|
return result
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
"""Transform rule model for reusable feature transformations.
|
|
2
|
+
|
|
3
|
+
Defines TransformRule - a reusable transformation that can be created from
|
|
4
|
+
natural language descriptions and applied across different datasets.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
import uuid
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from typing import Any, Optional
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
from pydantic import BaseModel, Field
|
|
15
|
+
|
|
16
|
+
from featcopilot.utils.logger import get_logger
|
|
17
|
+
|
|
18
|
+
logger = get_logger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TransformRule(BaseModel):
    """
    A reusable feature transformation rule.

    Transform rules capture feature engineering logic that can be generated
    from natural language descriptions and reused across different datasets.

    Parameters
    ----------
    id : str, optional
        Unique identifier for the rule (default: first 8 characters of a
        random UUID4)
    name : str
        Human-readable name for the rule
    description : str
        Natural language description of what the rule does
    code : str
        Python code that implements the transformation. The code is executed
        with ``df``, ``np`` and ``pd`` in scope and must assign its output to
        a variable called ``result``.
    input_columns : list[str]
        Column names or placeholders this rule expects as input
    output_name : str, optional
        Name for the output feature (default: derived from rule name)
    output_type : str
        Expected output data type ('numeric', 'categorical', 'boolean')
    tags : list[str]
        Tags for categorization and search
    column_patterns : list[str]
        Regex patterns for matching columns (e.g., 'price.*', '.*_amount')
    usage_count : int
        Number of times this rule has been applied
    created_at : str
        ISO timestamp (UTC) of rule creation
    metadata : dict
        Additional metadata

    Examples
    --------
    >>> rule = TransformRule(
    ...     name="ratio_calculation",
    ...     description="Calculate ratio of two numeric columns",
    ...     code="result = df['{col1}'] / (df['{col2}'] + 1e-8)",
    ...     input_columns=["col1", "col2"],
    ...     tags=["ratio", "numeric"]
    ... )
    >>> result = rule.apply(df, column_mapping={"col1": "price", "col2": "quantity"})
    """

    id: str = Field(default_factory=lambda: str(uuid.uuid4())[:8], description="Unique rule identifier")
    name: str = Field(description="Human-readable rule name")
    description: str = Field(description="Natural language description of the transformation")
    code: str = Field(description="Python code implementing the transformation")
    input_columns: list[str] = Field(default_factory=list, description="Expected input column names or placeholders")
    output_name: Optional[str] = Field(default=None, description="Output feature name")
    output_type: str = Field(default="numeric", description="Output data type")
    tags: list[str] = Field(default_factory=list, description="Tags for categorization")
    column_patterns: list[str] = Field(default_factory=list, description="Regex patterns for column matching")
    usage_count: int = Field(default=0, description="Number of times applied")
    created_at: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat(), description="Creation timestamp"
    )
    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")

    def get_output_name(self, column_mapping: Optional[dict[str, str]] = None) -> str:
        """
        Get the output feature name.

        Parameters
        ----------
        column_mapping : dict, optional
            Mapping from placeholder columns to actual column names

        Returns
        -------
        str
            ``output_name`` when it is set; otherwise a name built from the
            first two (mapped) input columns and the rule name; otherwise
            ``rule_<name>``.
        """
        if self.output_name:
            return self.output_name

        # Generate name from input columns (only the first two, to keep names short)
        if column_mapping and self.input_columns:
            cols = [column_mapping.get(c, c) for c in self.input_columns[:2]]
            return f"{'_'.join(cols)}_{self.name}"

        return f"rule_{self.name}"

    def _compile_column_patterns(self) -> list["re.Pattern[str]"]:
        """Compile ``column_patterns`` once, skipping (and logging) invalid regexes."""
        compiled = []
        for pattern in self.column_patterns:
            try:
                compiled.append(re.compile(pattern, re.IGNORECASE))
            except re.error as e:
                # A single malformed stored pattern should not make matching crash.
                logger.warning(f"Ignoring invalid column pattern '{pattern}' in rule '{self.name}': {e}")
        return compiled

    def matches_columns(self, columns: list[str]) -> tuple[bool, dict[str, str]]:
        """
        Check if this rule can be applied to the given columns.

        Each input column is resolved, in order of preference, by:

        1. an exact name match,
        2. the first not-yet-used column matching one of ``column_patterns``
           (patterns are tried in their listed order, case-insensitively,
           anchored at the start of the column name),
        3. the first not-yet-used column whose name contains the input
           column name (case-insensitive substring).

        Exact matches for *all* inputs are reserved before any pattern or
        substring matching, so a loose match for one input can never take a
        column that another input names exactly.

        Parameters
        ----------
        columns : list[str]
            Available column names

        Returns
        -------
        matches : bool
            Whether the rule can be applied
        mapping : dict
            Suggested mapping from rule's input_columns to actual columns
            (empty when an input column could not be resolved)
        """
        if not self.input_columns:
            return True, {}

        # Pass 1: reserve exact matches up front.
        exact = {c for c in self.input_columns if c in columns}
        mapping: dict[str, str] = {c: c for c in exact}

        unresolved = [c for c in self.input_columns if c not in exact]
        # Compile lazily: only needed when something is left to resolve.
        regexes = self._compile_column_patterns() if unresolved else []

        # Pass 2: pattern match, then substring match, for the remaining inputs.
        for input_col in unresolved:
            chosen: Optional[str] = None

            for regex in regexes:
                chosen = next(
                    (col for col in columns if regex.match(col) and col not in mapping.values()),
                    None,
                )
                if chosen is not None:
                    break

            if chosen is None:
                needle = input_col.lower()
                chosen = next(
                    (col for col in columns if needle in col.lower() and col not in mapping.values()),
                    None,
                )

            if chosen is None:
                return False, {}

            mapping[input_col] = chosen

        # Present the mapping in input_columns order.
        ordered = {c: mapping[c] for c in self.input_columns}
        return len(ordered) == len(self.input_columns), ordered

    def apply(
        self,
        df: pd.DataFrame,
        column_mapping: Optional[dict[str, str]] = None,
        validate: bool = True,
    ) -> pd.Series:
        """
        Apply the transformation rule to a DataFrame.

        The rule code runs with ``df``, ``np`` and ``pd`` available and must
        assign its output to ``result``. On success ``usage_count`` is
        incremented.

        Parameters
        ----------
        df : DataFrame
            Input data
        column_mapping : dict, optional
            Mapping from rule's input_columns to actual column names
        validate : bool, default=True
            Whether to check that every (mapped) input column exists in
            ``df`` before execution

        Returns
        -------
        Series
            Transformed feature values (whatever the rule code bound to
            ``result``; the value is returned as-is)

        Raises
        ------
        ValueError
            If required columns are missing or code execution fails
        """
        column_mapping = column_mapping or {}

        # Prepare the code with actual column names
        code = self._prepare_code(column_mapping)

        if validate:
            # Check required columns exist
            for input_col in self.input_columns:
                actual_col = column_mapping.get(input_col, input_col)
                if actual_col not in df.columns:
                    raise ValueError(f"Required column '{actual_col}' not found in DataFrame")

        # Execute the code with a restricted set of builtins.
        #
        # A single namespace is used for both globals and locals: with two
        # separate dicts, exec() runs the code as if it were a class body, so
        # lambdas, nested functions and comprehensions in the rule code cannot
        # see df/np/pd and fail with NameError.
        #
        # NOTE(review): this is NOT a security sandbox. `getattr` is exposed
        # and df/np/pd give access to arbitrary functionality, so only run
        # rule code from trusted sources.
        namespace: dict[str, Any] = {
            "__builtins__": self._get_safe_builtins(),
            "df": df,
            "np": np,
            "pd": pd,
        }
        try:
            exec(self._get_safe_code(code), namespace)

            if "result" not in namespace:
                raise ValueError("Code did not produce a 'result' variable")

            result = namespace["result"]

            # Increment usage count
            self.usage_count += 1

            return result

        except Exception as e:
            logger.error(f"Failed to apply rule '{self.name}': {e}")
            raise ValueError(f"Rule execution failed: {e}") from e

    def _prepare_code(self, column_mapping: dict[str, str]) -> str:
        """
        Substitute column placeholders with actual column names.

        For every ``placeholder -> actual`` pair the following spellings are
        rewritten: ``{{ 'placeholder' }}``, ``{placeholder}``,
        ``df['placeholder']`` and ``df["placeholder"]``.

        All substitutions happen in a single pass over the original code, so
        the text produced for one placeholder is never rewritten again by
        another mapping entry (e.g. swapping two columns, or mapping onto a
        name that is itself a placeholder, works as expected).
        """
        if not column_mapping:
            return self.code

        replacements: dict[str, str] = {}
        for placeholder, actual in column_mapping.items():
            replacements[f"{{{{ '{placeholder}' }}}}"] = f"'{actual}'"
            replacements[f"{{{placeholder}}}"] = actual
            replacements[f"df['{placeholder}']"] = f"df['{actual}']"
            replacements[f'df["{placeholder}"]'] = f'df["{actual}"]'

        # Longest tokens first so the most specific spelling wins at any position.
        tokens = sorted(replacements, key=len, reverse=True)
        token_re = re.compile("|".join(re.escape(token) for token in tokens))
        return token_re.sub(lambda m: replacements[m.group(0)], self.code)

    def _get_safe_code(self, code: str) -> str:
        """Wrap code for safe execution (currently a pass-through hook)."""
        return code

    def _get_safe_builtins(self) -> dict[str, Any]:
        """
        Get restricted builtins for code execution.

        Only the listed builtins are visible to rule code (so ``open``,
        ``__import__``, ``eval`` etc. are not directly available).

        NOTE(review): ``getattr``/``hasattr`` are included, which allows
        attribute traversal on any reachable object; treat this allow-list as
        a guard against accidents, not as protection against hostile code.
        """
        return {
            "len": len,
            "sum": sum,
            "max": max,
            "min": min,
            "int": int,
            "float": float,
            "str": str,
            "bool": bool,
            "abs": abs,
            "round": round,
            "pow": pow,
            "range": range,
            "list": list,
            "dict": dict,
            "set": set,
            "tuple": tuple,
            "sorted": sorted,
            "reversed": reversed,
            "enumerate": enumerate,
            "zip": zip,
            "any": any,
            "all": all,
            "map": map,
            "filter": filter,
            "isinstance": isinstance,
            "hasattr": hasattr,
            "getattr": getattr,
        }

    def to_dict(self) -> dict[str, Any]:
        """Convert rule to dictionary for serialization."""
        return self.model_dump()

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "TransformRule":
        """Create rule from dictionary."""
        return cls(**data)

    def __repr__(self) -> str:
        # Description is truncated to 50 characters to keep the repr short.
        return f"TransformRule(name='{self.name}', description='{self.description[:50]}...')"
|
|
@@ -11,6 +11,9 @@ from pydantic import Field
|
|
|
11
11
|
|
|
12
12
|
from featcopilot.core.base import BaseEngine, EngineConfig
|
|
13
13
|
from featcopilot.core.feature import FeatureSet
|
|
14
|
+
from featcopilot.utils.logger import get_logger
|
|
15
|
+
|
|
16
|
+
logger = get_logger(__name__)
|
|
14
17
|
|
|
15
18
|
|
|
16
19
|
class RelationalEngineConfig(EngineConfig):
|
|
@@ -141,7 +144,7 @@ class RelationalEngine(BaseEngine):
|
|
|
141
144
|
self._primary_columns = X.columns.tolist()
|
|
142
145
|
|
|
143
146
|
if self.config.verbose:
|
|
144
|
-
|
|
147
|
+
logger.info(f"RelationalEngine: {len(self._relationships)} relationships defined")
|
|
145
148
|
|
|
146
149
|
self._is_fitted = True
|
|
147
150
|
return self
|
|
@@ -191,7 +194,7 @@ class RelationalEngine(BaseEngine):
|
|
|
191
194
|
self._feature_names = [c for c in result.columns if c not in X.columns]
|
|
192
195
|
|
|
193
196
|
if self.config.verbose:
|
|
194
|
-
|
|
197
|
+
logger.info(f"RelationalEngine: Generated {len(self._feature_names)} features")
|
|
195
198
|
|
|
196
199
|
return result
|
|
197
200
|
|