featcopilot 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featcopilot-0.1.0/PKG-INFO +218 -0
- featcopilot-0.1.0/README.md +174 -0
- featcopilot-0.1.0/featcopilot/__init__.py +29 -0
- featcopilot-0.1.0/featcopilot/core/__init__.py +13 -0
- featcopilot-0.1.0/featcopilot/core/base.py +195 -0
- featcopilot-0.1.0/featcopilot/core/feature.py +224 -0
- featcopilot-0.1.0/featcopilot/core/registry.py +128 -0
- featcopilot-0.1.0/featcopilot/engines/__init__.py +13 -0
- featcopilot-0.1.0/featcopilot/engines/relational.py +256 -0
- featcopilot-0.1.0/featcopilot/engines/tabular.py +293 -0
- featcopilot-0.1.0/featcopilot/engines/text.py +211 -0
- featcopilot-0.1.0/featcopilot/engines/timeseries.py +402 -0
- featcopilot-0.1.0/featcopilot/llm/__init__.py +16 -0
- featcopilot-0.1.0/featcopilot/llm/code_generator.py +295 -0
- featcopilot-0.1.0/featcopilot/llm/copilot_client.py +521 -0
- featcopilot-0.1.0/featcopilot/llm/explainer.py +200 -0
- featcopilot-0.1.0/featcopilot/llm/semantic_engine.py +379 -0
- featcopilot-0.1.0/featcopilot/selection/__init__.py +13 -0
- featcopilot-0.1.0/featcopilot/selection/importance.py +161 -0
- featcopilot-0.1.0/featcopilot/selection/redundancy.py +156 -0
- featcopilot-0.1.0/featcopilot/selection/statistical.py +199 -0
- featcopilot-0.1.0/featcopilot/selection/unified.py +172 -0
- featcopilot-0.1.0/featcopilot/transformers/__init__.py +11 -0
- featcopilot-0.1.0/featcopilot/transformers/sklearn_compat.py +401 -0
- featcopilot-0.1.0/featcopilot/utils/__init__.py +9 -0
- featcopilot-0.1.0/featcopilot/utils/cache.py +221 -0
- featcopilot-0.1.0/featcopilot/utils/parallel.py +109 -0
- featcopilot-0.1.0/featcopilot.egg-info/PKG-INFO +218 -0
- featcopilot-0.1.0/featcopilot.egg-info/SOURCES.txt +36 -0
- featcopilot-0.1.0/featcopilot.egg-info/dependency_links.txt +1 -0
- featcopilot-0.1.0/featcopilot.egg-info/requires.txt +25 -0
- featcopilot-0.1.0/featcopilot.egg-info/top_level.txt +1 -0
- featcopilot-0.1.0/pyproject.toml +94 -0
- featcopilot-0.1.0/setup.cfg +4 -0
- featcopilot-0.1.0/tests/test_autofeat.py +97 -0
- featcopilot-0.1.0/tests/test_core.py +127 -0
- featcopilot-0.1.0/tests/test_engines.py +120 -0
- featcopilot-0.1.0/tests/test_selection.py +162 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: featcopilot
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Next-generation LLM-powered auto feature engineering framework with GitHub Copilot SDK
|
|
5
|
+
Author: FeatCopilot Contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/thinkall/featcopilot
|
|
8
|
+
Project-URL: Documentation, https://github.com/thinkall/featcopilot#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/thinkall/featcopilot
|
|
10
|
+
Keywords: machine-learning,feature-engineering,automl,llm,copilot,data-science
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
Requires-Dist: numpy>=1.21.0
|
|
24
|
+
Requires-Dist: pandas>=1.3.0
|
|
25
|
+
Requires-Dist: scipy>=1.7.0
|
|
26
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
27
|
+
Requires-Dist: pydantic>=2.0.0
|
|
28
|
+
Requires-Dist: joblib>=1.1.0
|
|
29
|
+
Provides-Extra: llm
|
|
30
|
+
Requires-Dist: github-copilot-sdk>=0.1.0; extra == "llm"
|
|
31
|
+
Provides-Extra: timeseries
|
|
32
|
+
Requires-Dist: statsmodels>=0.13.0; extra == "timeseries"
|
|
33
|
+
Provides-Extra: full
|
|
34
|
+
Requires-Dist: github-copilot-sdk>=0.1.0; extra == "full"
|
|
35
|
+
Requires-Dist: statsmodels>=0.13.0; extra == "full"
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
39
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
41
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
42
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
43
|
+
Requires-Dist: pre-commit>=3.6.0; extra == "dev"
|
|
44
|
+
|
|
45
|
+
# FeatCopilot 🚀
|
|
46
|
+
|
|
47
|
+
**Next-Generation LLM-Powered Auto Feature Engineering with GitHub Copilot SDK**
|
|
48
|
+
|
|
49
|
+
FeatCopilot is a unified feature engineering framework that combines the best approaches from existing libraries (Featuretools, TSFresh, AutoFeat, OpenFE) with novel LLM-powered capabilities via GitHub Copilot SDK.
|
|
50
|
+
|
|
51
|
+
## 📊 Benchmark Highlights
|
|
52
|
+
|
|
53
|
+
### Tabular Engine (Fast Mode - <1s)
|
|
54
|
+
|
|
55
|
+
| Task Type | Average Improvement | Best Case |
|
|
56
|
+
|-----------|--------------------:|----------:|
|
|
57
|
+
| **Text Classification** | **+12.44%** | +49.02% (News Headlines) |
|
|
58
|
+
| Time Series | +1.51% | +12.12% (Retail Demand) |
|
|
59
|
+
| Classification | +0.54% | +4.35% |
|
|
60
|
+
| Regression | +0.65% | +5.57% |
|
|
61
|
+
|
|
62
|
+
### LLM Engine (With Copilot - 30-60s)
|
|
63
|
+
|
|
64
|
+
| Task Type | Average Improvement | Best Case |
|
|
65
|
+
|-----------|--------------------:|----------:|
|
|
66
|
+
| **Regression** | **+7.79%** | +19.66% (Retail Demand) |
|
|
67
|
+
| Classification | +2.38% | +2.87% |
|
|
68
|
+
|
|
69
|
+
- ✅ **12/12 wins** on text classification (tabular mode)
|
|
70
|
+
- 🧠 **+19.66% max improvement** with LLM-powered features
|
|
71
|
+
- ⚡ **<1 second** (tabular) or **30-60s** (with LLM) processing time
|
|
72
|
+
- 📈 Largest gains with simple models (LogisticRegression, Ridge)
|
|
73
|
+
|
|
74
|
+
[View Full Benchmark Results](https://thinkall.github.io/featcopilot/user-guide/benchmarks/)
|
|
75
|
+
|
|
76
|
+
## Key Features
|
|
77
|
+
|
|
78
|
+
- 🔧 **Multi-Engine Architecture**: Tabular, time series, relational, and text feature engines
|
|
79
|
+
- 🤖 **LLM-Powered Intelligence**: Semantic feature discovery, domain-aware generation, and code synthesis
|
|
80
|
+
- 📊 **Intelligent Selection**: Statistical testing, importance ranking, and redundancy elimination
|
|
81
|
+
- 🔌 **Scikit-learn Compatible**: Drop-in replacement for sklearn transformers
|
|
82
|
+
- 📝 **Interpretable**: Every feature comes with human-readable explanations
|
|
83
|
+
|
|
84
|
+
## Installation
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
# Basic installation
|
|
88
|
+
pip install featcopilot
|
|
89
|
+
|
|
90
|
+
# With LLM capabilities (requires GitHub Copilot)
|
|
91
|
+
pip install featcopilot[llm]
|
|
92
|
+
|
|
93
|
+
# Full installation
|
|
94
|
+
pip install featcopilot[full]
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Quick Start
|
|
98
|
+
|
|
99
|
+
### Fast Mode (Tabular Only)
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from featcopilot import AutoFeatureEngineer
|
|
103
|
+
|
|
104
|
+
# Sub-second feature engineering
|
|
105
|
+
engineer = AutoFeatureEngineer(
|
|
106
|
+
engines=['tabular'],
|
|
107
|
+
max_features=50
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
X_transformed = engineer.fit_transform(X, y) # <1 second
|
|
111
|
+
print(f"Features: {X.shape[1]} -> {X_transformed.shape[1]}")
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### LLM Mode (With Copilot)
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from featcopilot import AutoFeatureEngineer
|
|
118
|
+
|
|
119
|
+
# LLM-powered semantic features (+19.66% max improvement)
|
|
120
|
+
engineer = AutoFeatureEngineer(
|
|
121
|
+
engines=['tabular', 'llm'],
|
|
122
|
+
max_features=50
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
X_transformed = engineer.fit_transform(
|
|
126
|
+
X, y,
|
|
127
|
+
column_descriptions={
|
|
128
|
+
'age': 'Customer age in years',
|
|
129
|
+
'income': 'Annual household income in USD',
|
|
130
|
+
'tenure': 'Months as customer',
|
|
131
|
+
},
|
|
132
|
+
task_description="Predict customer churn"
|
|
133
|
+
) # 30-60 seconds
|
|
134
|
+
|
|
135
|
+
# Get LLM-generated explanations
|
|
136
|
+
for feature, explanation in engineer.explain_features().items():
|
|
137
|
+
print(f"{feature}: {explanation}")
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Engines
|
|
141
|
+
|
|
142
|
+
### Tabular Engine
|
|
143
|
+
Generates polynomial features, interaction terms, and mathematical transformations.
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from featcopilot.engines import TabularEngine
|
|
147
|
+
|
|
148
|
+
engine = TabularEngine(
|
|
149
|
+
polynomial_degree=2,
|
|
150
|
+
interaction_only=False,
|
|
151
|
+
include_transforms=['log', 'sqrt', 'square']
|
|
152
|
+
)
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Time Series Engine
|
|
156
|
+
Extracts statistical, frequency, and temporal features from time series data.
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
from featcopilot.engines import TimeSeriesEngine
|
|
160
|
+
|
|
161
|
+
engine = TimeSeriesEngine(
|
|
162
|
+
features=['mean', 'std', 'skew', 'autocorr', 'fft_coefficients']
|
|
163
|
+
)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### LLM Engine
|
|
167
|
+
Uses GitHub Copilot SDK for intelligent feature generation.
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
from featcopilot.llm import SemanticEngine
|
|
171
|
+
|
|
172
|
+
engine = SemanticEngine(
|
|
173
|
+
model='gpt-5',
|
|
174
|
+
max_suggestions=20,
|
|
175
|
+
validate_features=True
|
|
176
|
+
)
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## Feature Selection
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
from featcopilot.selection import FeatureSelector
|
|
183
|
+
|
|
184
|
+
selector = FeatureSelector(
|
|
185
|
+
methods=['mutual_info', 'importance', 'correlation'],
|
|
186
|
+
max_features=30,
|
|
187
|
+
correlation_threshold=0.95
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
X_selected = selector.fit_transform(X, y)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## Comparison with Existing Libraries
|
|
194
|
+
|
|
195
|
+
| Feature | FeatCopilot | Featuretools | TSFresh | AutoFeat | OpenFE | CAAFE |
|
|
196
|
+
|---------|-------------|--------------|---------|----------|--------|-------|
|
|
197
|
+
| Tabular Features | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
|
|
198
|
+
| Time Series | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
|
199
|
+
| Relational | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
|
200
|
+
| LLM-Powered | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ |
|
|
201
|
+
| Semantic Understanding | ✅ | ❌ | ❌ | ❌ | ❌ | ⚠️ |
|
|
202
|
+
| Code Generation | ✅ | ❌ | ❌ | ❌ | ❌ | ⚠️ |
|
|
203
|
+
| Sklearn Compatible | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
|
204
|
+
| Interpretable | ✅ | ⚠️ | ⚠️ | ⚠️ | ❌ | ✅ |
|
|
205
|
+
|
|
206
|
+
## Documentation
|
|
207
|
+
|
|
208
|
+
📖 **Full Documentation**: [https://thinkall.github.io/featcopilot/](https://thinkall.github.io/featcopilot/)
|
|
209
|
+
|
|
210
|
+
## Requirements
|
|
211
|
+
|
|
212
|
+
- Python 3.9+
|
|
213
|
+
- NumPy, Pandas, SciPy, Scikit-learn, pydantic, joblib
|
|
214
|
+
- GitHub Copilot CLI (for LLM features)
|
|
215
|
+
|
|
216
|
+
## License
|
|
217
|
+
|
|
218
|
+
MIT License
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# FeatCopilot 🚀
|
|
2
|
+
|
|
3
|
+
**Next-Generation LLM-Powered Auto Feature Engineering with GitHub Copilot SDK**
|
|
4
|
+
|
|
5
|
+
FeatCopilot is a unified feature engineering framework that combines the best approaches from existing libraries (Featuretools, TSFresh, AutoFeat, OpenFE) with novel LLM-powered capabilities via GitHub Copilot SDK.
|
|
6
|
+
|
|
7
|
+
## 📊 Benchmark Highlights
|
|
8
|
+
|
|
9
|
+
### Tabular Engine (Fast Mode - <1s)
|
|
10
|
+
|
|
11
|
+
| Task Type | Average Improvement | Best Case |
|
|
12
|
+
|-----------|--------------------:|----------:|
|
|
13
|
+
| **Text Classification** | **+12.44%** | +49.02% (News Headlines) |
|
|
14
|
+
| Time Series | +1.51% | +12.12% (Retail Demand) |
|
|
15
|
+
| Classification | +0.54% | +4.35% |
|
|
16
|
+
| Regression | +0.65% | +5.57% |
|
|
17
|
+
|
|
18
|
+
### LLM Engine (With Copilot - 30-60s)
|
|
19
|
+
|
|
20
|
+
| Task Type | Average Improvement | Best Case |
|
|
21
|
+
|-----------|--------------------:|----------:|
|
|
22
|
+
| **Regression** | **+7.79%** | +19.66% (Retail Demand) |
|
|
23
|
+
| Classification | +2.38% | +2.87% |
|
|
24
|
+
|
|
25
|
+
- ✅ **12/12 wins** on text classification (tabular mode)
|
|
26
|
+
- 🧠 **+19.66% max improvement** with LLM-powered features
|
|
27
|
+
- ⚡ **<1 second** (tabular) or **30-60s** (with LLM) processing time
|
|
28
|
+
- 📈 Largest gains with simple models (LogisticRegression, Ridge)
|
|
29
|
+
|
|
30
|
+
[View Full Benchmark Results](https://thinkall.github.io/featcopilot/user-guide/benchmarks/)
|
|
31
|
+
|
|
32
|
+
## Key Features
|
|
33
|
+
|
|
34
|
+
- 🔧 **Multi-Engine Architecture**: Tabular, time series, relational, and text feature engines
|
|
35
|
+
- 🤖 **LLM-Powered Intelligence**: Semantic feature discovery, domain-aware generation, and code synthesis
|
|
36
|
+
- 📊 **Intelligent Selection**: Statistical testing, importance ranking, and redundancy elimination
|
|
37
|
+
- 🔌 **Scikit-learn Compatible**: Drop-in replacement for sklearn transformers
|
|
38
|
+
- 📝 **Interpretable**: Every feature comes with human-readable explanations
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# Basic installation
|
|
44
|
+
pip install featcopilot
|
|
45
|
+
|
|
46
|
+
# With LLM capabilities (requires GitHub Copilot)
|
|
47
|
+
pip install featcopilot[llm]
|
|
48
|
+
|
|
49
|
+
# Full installation
|
|
50
|
+
pip install featcopilot[full]
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Quick Start
|
|
54
|
+
|
|
55
|
+
### Fast Mode (Tabular Only)
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from featcopilot import AutoFeatureEngineer
|
|
59
|
+
|
|
60
|
+
# Sub-second feature engineering
|
|
61
|
+
engineer = AutoFeatureEngineer(
|
|
62
|
+
engines=['tabular'],
|
|
63
|
+
max_features=50
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
X_transformed = engineer.fit_transform(X, y) # <1 second
|
|
67
|
+
print(f"Features: {X.shape[1]} -> {X_transformed.shape[1]}")
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### LLM Mode (With Copilot)
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from featcopilot import AutoFeatureEngineer
|
|
74
|
+
|
|
75
|
+
# LLM-powered semantic features (+19.66% max improvement)
|
|
76
|
+
engineer = AutoFeatureEngineer(
|
|
77
|
+
engines=['tabular', 'llm'],
|
|
78
|
+
max_features=50
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
X_transformed = engineer.fit_transform(
|
|
82
|
+
X, y,
|
|
83
|
+
column_descriptions={
|
|
84
|
+
'age': 'Customer age in years',
|
|
85
|
+
'income': 'Annual household income in USD',
|
|
86
|
+
'tenure': 'Months as customer',
|
|
87
|
+
},
|
|
88
|
+
task_description="Predict customer churn"
|
|
89
|
+
) # 30-60 seconds
|
|
90
|
+
|
|
91
|
+
# Get LLM-generated explanations
|
|
92
|
+
for feature, explanation in engineer.explain_features().items():
|
|
93
|
+
print(f"{feature}: {explanation}")
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Engines
|
|
97
|
+
|
|
98
|
+
### Tabular Engine
|
|
99
|
+
Generates polynomial features, interaction terms, and mathematical transformations.
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from featcopilot.engines import TabularEngine
|
|
103
|
+
|
|
104
|
+
engine = TabularEngine(
|
|
105
|
+
polynomial_degree=2,
|
|
106
|
+
interaction_only=False,
|
|
107
|
+
include_transforms=['log', 'sqrt', 'square']
|
|
108
|
+
)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Time Series Engine
|
|
112
|
+
Extracts statistical, frequency, and temporal features from time series data.
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
from featcopilot.engines import TimeSeriesEngine
|
|
116
|
+
|
|
117
|
+
engine = TimeSeriesEngine(
|
|
118
|
+
features=['mean', 'std', 'skew', 'autocorr', 'fft_coefficients']
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### LLM Engine
|
|
123
|
+
Uses GitHub Copilot SDK for intelligent feature generation.
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from featcopilot.llm import SemanticEngine
|
|
127
|
+
|
|
128
|
+
engine = SemanticEngine(
|
|
129
|
+
model='gpt-5',
|
|
130
|
+
max_suggestions=20,
|
|
131
|
+
validate_features=True
|
|
132
|
+
)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Feature Selection
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
from featcopilot.selection import FeatureSelector
|
|
139
|
+
|
|
140
|
+
selector = FeatureSelector(
|
|
141
|
+
methods=['mutual_info', 'importance', 'correlation'],
|
|
142
|
+
max_features=30,
|
|
143
|
+
correlation_threshold=0.95
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
X_selected = selector.fit_transform(X, y)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Comparison with Existing Libraries
|
|
150
|
+
|
|
151
|
+
| Feature | FeatCopilot | Featuretools | TSFresh | AutoFeat | OpenFE | CAAFE |
|
|
152
|
+
|---------|-------------|--------------|---------|----------|--------|-------|
|
|
153
|
+
| Tabular Features | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
|
|
154
|
+
| Time Series | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
|
155
|
+
| Relational | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
|
156
|
+
| LLM-Powered | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ |
|
|
157
|
+
| Semantic Understanding | ✅ | ❌ | ❌ | ❌ | ❌ | ⚠️ |
|
|
158
|
+
| Code Generation | ✅ | ❌ | ❌ | ❌ | ❌ | ⚠️ |
|
|
159
|
+
| Sklearn Compatible | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
|
160
|
+
| Interpretable | ✅ | ⚠️ | ⚠️ | ⚠️ | ❌ | ✅ |
|
|
161
|
+
|
|
162
|
+
## Documentation
|
|
163
|
+
|
|
164
|
+
📖 **Full Documentation**: [https://thinkall.github.io/featcopilot/](https://thinkall.github.io/featcopilot/)
|
|
165
|
+
|
|
166
|
+
## Requirements
|
|
167
|
+
|
|
168
|
+
- Python 3.9+
|
|
169
|
+
- NumPy, Pandas, SciPy, Scikit-learn, pydantic, joblib
|
|
170
|
+
- GitHub Copilot CLI (for LLM features)
|
|
171
|
+
|
|
172
|
+
## License
|
|
173
|
+
|
|
174
|
+
MIT License
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""
|
|
2
|
+
FeatCopilot - Next-Generation LLM-Powered Auto Feature Engineering
|
|
3
|
+
|
|
4
|
+
A unified feature engineering framework combining traditional approaches
|
|
5
|
+
with novel LLM-powered capabilities via GitHub Copilot SDK.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"
|
|
9
|
+
__author__ = "FeatCopilot Contributors"
|
|
10
|
+
|
|
11
|
+
from featcopilot.core.base import BaseEngine, BaseSelector
|
|
12
|
+
from featcopilot.core.feature import Feature, FeatureSet
|
|
13
|
+
from featcopilot.transformers.sklearn_compat import (
|
|
14
|
+
AutoFeatureEngineer,
|
|
15
|
+
FeatureEngineerTransformer,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
# Core
|
|
20
|
+
"BaseEngine",
|
|
21
|
+
"BaseSelector",
|
|
22
|
+
"Feature",
|
|
23
|
+
"FeatureSet",
|
|
24
|
+
# Main API
|
|
25
|
+
"AutoFeatureEngineer",
|
|
26
|
+
"FeatureEngineerTransformer",
|
|
27
|
+
# Version
|
|
28
|
+
"__version__",
|
|
29
|
+
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Core module containing base classes and interfaces."""
|
|
2
|
+
|
|
3
|
+
from featcopilot.core.base import BaseEngine, BaseSelector
|
|
4
|
+
from featcopilot.core.feature import Feature, FeatureSet
|
|
5
|
+
from featcopilot.core.registry import FeatureRegistry
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"BaseEngine",
|
|
9
|
+
"BaseSelector",
|
|
10
|
+
"Feature",
|
|
11
|
+
"FeatureSet",
|
|
12
|
+
"FeatureRegistry",
|
|
13
|
+
]
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""Base classes for feature engineering engines and selectors."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Any, Optional, Union
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class EngineConfig(BaseModel):
    """Pydantic settings model for a feature engineering engine.

    Only ``name`` is required; every other field has a default.
    """

    # No default is given, so a name must always be supplied.
    name: str = Field(description="Engine name")
    # Defaults to True.
    enabled: bool = Field(True, description="Whether engine is enabled")
    # Defaults to None -- presumably "no cap"; confirm against the engines that read it.
    max_features: Optional[int] = Field(None, description="Max features to generate")
    # Defaults to False.
    verbose: bool = Field(False, description="Verbose output")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class BaseEngine(ABC):
    """
    Abstract base class for feature engineering engines.

    Concrete engines implement :meth:`fit` and :meth:`transform`. This class
    supplies the shared pieces: the configuration object, the fitted-state
    flag, accessors for generated feature names and metadata, and input
    validation.

    Parameters
    ----------
    config : EngineConfig, optional
        Ready-made configuration. When supplied, ``**kwargs`` are not used.
    **kwargs : dict
        Forwarded to ``EngineConfig`` (with ``name`` set to the concrete
        class name) when ``config`` is not supplied.
    """

    def __init__(self, config: Optional["EngineConfig"] = None, **kwargs):
        if config is None:
            # Default the engine name to the concrete subclass name.
            config = EngineConfig(name=self.__class__.__name__, **kwargs)
        self.config = config
        # The base class only initialises this flag; NOTE(review): subclasses
        # presumably set it to True at the end of ``fit`` -- confirm.
        self._is_fitted = False
        self._feature_names: list[str] = []
        self._feature_metadata: dict[str, Any] = {}

    @property
    def is_fitted(self) -> bool:
        """Check if engine has been fitted."""
        return self._is_fitted

    @abstractmethod
    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        **kwargs,
    ) -> "BaseEngine":
        """
        Fit the engine to the data.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray, optional
            Target variable
        **kwargs : dict
            Additional parameters

        Returns
        -------
        self : BaseEngine
            Fitted engine
        """

    @abstractmethod
    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """
        Transform data to generate new features.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        **kwargs : dict
            Additional parameters

        Returns
        -------
        X_transformed : DataFrame
            Transformed features
        """

    def fit_transform(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Fit and transform in one step.

        The same ``**kwargs`` are passed to both :meth:`fit` and
        :meth:`transform`.
        """
        return self.fit(X, y, **kwargs).transform(X, **kwargs)

    def get_feature_names(self) -> list[str]:
        """Get names of generated features (returned as a copy of the internal list)."""
        return self._feature_names.copy()

    def get_feature_metadata(self) -> dict[str, Any]:
        """Get metadata for generated features (returned as a shallow copy)."""
        return self._feature_metadata.copy()

    def _validate_input(self, X: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
        """
        Convert input to DataFrame and validate.

        Parameters
        ----------
        X : DataFrame or ndarray
            A DataFrame is returned unchanged (same object, no copy). A 2-D
            ndarray is wrapped in a DataFrame whose columns are named
            ``feature_0 .. feature_{n-1}``.

        Returns
        -------
        X : DataFrame

        Raises
        ------
        TypeError
            If ``X`` is neither a DataFrame nor an ndarray.
        ValueError
            If ``X`` is an ndarray that is not 2-dimensional.
        """
        if isinstance(X, pd.DataFrame):
            return X
        if not isinstance(X, np.ndarray):
            raise TypeError(f"Expected DataFrame or ndarray, got {type(X)}")
        # Check the rank before touching shape[1]; otherwise a 1-D array fails
        # with an opaque "tuple index out of range".
        if X.ndim != 2:
            raise ValueError(f"Expected 2D array, got {X.ndim}D array with shape {X.shape}")
        return pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class SelectorConfig(BaseModel):
    """Pydantic settings model for a feature selector.

    Every field has a default, so the model can be built with no arguments.
    """

    # Defaults to None -- presumably "no cap"; confirm against the selectors that read it.
    max_features: Optional[int] = Field(None, description="Max features to select")
    # Defaults to 0.0.
    min_importance: float = Field(0.0, description="Minimum importance threshold")
    # Defaults to 0.95.
    correlation_threshold: float = Field(0.95, description="Threshold for correlation-based elimination")
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class BaseSelector(ABC):
    """
    Abstract base class for feature selection.

    Concrete selectors implement :meth:`fit` and :meth:`transform`. This
    class supplies the shared pieces: the configuration object, the
    fitted-state flag, accessors for the selected feature names and their
    scores, and input validation.

    Parameters
    ----------
    config : SelectorConfig, optional
        Ready-made configuration. When supplied, ``**kwargs`` are not used.
    **kwargs : dict
        Forwarded to ``SelectorConfig`` when ``config`` is not supplied.
    """

    def __init__(self, config: Optional["SelectorConfig"] = None, **kwargs):
        if config is None:
            config = SelectorConfig(**kwargs)
        self.config = config
        # The base class only initialises this flag; NOTE(review): subclasses
        # presumably set it to True at the end of ``fit`` -- confirm.
        self._is_fitted = False
        self._selected_features: list[str] = []
        self._feature_scores: dict[str, float] = {}

    @property
    def is_fitted(self) -> bool:
        """Check if selector has been fitted."""
        return self._is_fitted

    @abstractmethod
    def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray], **kwargs) -> "BaseSelector":
        """
        Fit the selector to determine feature importance.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray
            Target variable
        **kwargs : dict
            Additional parameters

        Returns
        -------
        self : BaseSelector
            Fitted selector
        """

    @abstractmethod
    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """
        Transform data to keep only selected features.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        **kwargs : dict
            Additional parameters

        Returns
        -------
        X_selected : DataFrame
            Data with only selected features
        """

    def fit_transform(
        self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray], **kwargs
    ) -> pd.DataFrame:
        """
        Fit and transform in one step.

        The same ``**kwargs`` are passed to both :meth:`fit` and
        :meth:`transform`.
        """
        return self.fit(X, y, **kwargs).transform(X, **kwargs)

    def get_selected_features(self) -> list[str]:
        """Get names of selected features (returned as a copy of the internal list)."""
        return self._selected_features.copy()

    def get_feature_scores(self) -> dict[str, float]:
        """Get importance scores for all features (returned as a shallow copy)."""
        return self._feature_scores.copy()

    def _validate_input(self, X: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
        """
        Convert input to DataFrame and validate.

        Parameters
        ----------
        X : DataFrame or ndarray
            A DataFrame is returned unchanged (same object, no copy). A 2-D
            ndarray is wrapped in a DataFrame whose columns are named
            ``feature_0 .. feature_{n-1}``.

        Returns
        -------
        X : DataFrame

        Raises
        ------
        TypeError
            If ``X`` is neither a DataFrame nor an ndarray.
        ValueError
            If ``X`` is an ndarray that is not 2-dimensional.
        """
        if isinstance(X, pd.DataFrame):
            return X
        if not isinstance(X, np.ndarray):
            raise TypeError(f"Expected DataFrame or ndarray, got {type(X)}")
        # Check the rank before touching shape[1]; otherwise a 1-D array fails
        # with an opaque "tuple index out of range".
        if X.ndim != 2:
            raise ValueError(f"Expected 2D array, got {X.ndim}D array with shape {X.shape}")
        return pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
|