featcopilot 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. featcopilot-0.1.0/PKG-INFO +218 -0
  2. featcopilot-0.1.0/README.md +174 -0
  3. featcopilot-0.1.0/featcopilot/__init__.py +29 -0
  4. featcopilot-0.1.0/featcopilot/core/__init__.py +13 -0
  5. featcopilot-0.1.0/featcopilot/core/base.py +195 -0
  6. featcopilot-0.1.0/featcopilot/core/feature.py +224 -0
  7. featcopilot-0.1.0/featcopilot/core/registry.py +128 -0
  8. featcopilot-0.1.0/featcopilot/engines/__init__.py +13 -0
  9. featcopilot-0.1.0/featcopilot/engines/relational.py +256 -0
  10. featcopilot-0.1.0/featcopilot/engines/tabular.py +293 -0
  11. featcopilot-0.1.0/featcopilot/engines/text.py +211 -0
  12. featcopilot-0.1.0/featcopilot/engines/timeseries.py +402 -0
  13. featcopilot-0.1.0/featcopilot/llm/__init__.py +16 -0
  14. featcopilot-0.1.0/featcopilot/llm/code_generator.py +295 -0
  15. featcopilot-0.1.0/featcopilot/llm/copilot_client.py +521 -0
  16. featcopilot-0.1.0/featcopilot/llm/explainer.py +200 -0
  17. featcopilot-0.1.0/featcopilot/llm/semantic_engine.py +379 -0
  18. featcopilot-0.1.0/featcopilot/selection/__init__.py +13 -0
  19. featcopilot-0.1.0/featcopilot/selection/importance.py +161 -0
  20. featcopilot-0.1.0/featcopilot/selection/redundancy.py +156 -0
  21. featcopilot-0.1.0/featcopilot/selection/statistical.py +199 -0
  22. featcopilot-0.1.0/featcopilot/selection/unified.py +172 -0
  23. featcopilot-0.1.0/featcopilot/transformers/__init__.py +11 -0
  24. featcopilot-0.1.0/featcopilot/transformers/sklearn_compat.py +401 -0
  25. featcopilot-0.1.0/featcopilot/utils/__init__.py +9 -0
  26. featcopilot-0.1.0/featcopilot/utils/cache.py +221 -0
  27. featcopilot-0.1.0/featcopilot/utils/parallel.py +109 -0
  28. featcopilot-0.1.0/featcopilot.egg-info/PKG-INFO +218 -0
  29. featcopilot-0.1.0/featcopilot.egg-info/SOURCES.txt +36 -0
  30. featcopilot-0.1.0/featcopilot.egg-info/dependency_links.txt +1 -0
  31. featcopilot-0.1.0/featcopilot.egg-info/requires.txt +25 -0
  32. featcopilot-0.1.0/featcopilot.egg-info/top_level.txt +1 -0
  33. featcopilot-0.1.0/pyproject.toml +94 -0
  34. featcopilot-0.1.0/setup.cfg +4 -0
  35. featcopilot-0.1.0/tests/test_autofeat.py +97 -0
  36. featcopilot-0.1.0/tests/test_core.py +127 -0
  37. featcopilot-0.1.0/tests/test_engines.py +120 -0
  38. featcopilot-0.1.0/tests/test_selection.py +162 -0
@@ -0,0 +1,218 @@
1
+ Metadata-Version: 2.4
2
+ Name: featcopilot
3
+ Version: 0.1.0
4
+ Summary: Next-generation LLM-powered auto feature engineering framework with GitHub Copilot SDK
5
+ Author: FeatCopilot Contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/thinkall/featcopilot
8
+ Project-URL: Documentation, https://github.com/thinkall/featcopilot#readme
9
+ Project-URL: Repository, https://github.com/thinkall/featcopilot
10
+ Keywords: machine-learning,feature-engineering,automl,llm,copilot,data-science
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ Requires-Dist: numpy>=1.21.0
24
+ Requires-Dist: pandas>=1.3.0
25
+ Requires-Dist: scipy>=1.7.0
26
+ Requires-Dist: scikit-learn>=1.0.0
27
+ Requires-Dist: pydantic>=2.0.0
28
+ Requires-Dist: joblib>=1.1.0
29
+ Provides-Extra: llm
30
+ Requires-Dist: github-copilot-sdk>=0.1.0; extra == "llm"
31
+ Provides-Extra: timeseries
32
+ Requires-Dist: statsmodels>=0.13.0; extra == "timeseries"
33
+ Provides-Extra: full
34
+ Requires-Dist: github-copilot-sdk>=0.1.0; extra == "full"
35
+ Requires-Dist: statsmodels>=0.13.0; extra == "full"
36
+ Provides-Extra: dev
37
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
38
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
39
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
40
+ Requires-Dist: black>=23.0.0; extra == "dev"
41
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
42
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
43
+ Requires-Dist: pre-commit>=3.6.0; extra == "dev"
44
+
45
+ # FeatCopilot 🚀
46
+
47
+ **Next-Generation LLM-Powered Auto Feature Engineering with GitHub Copilot SDK**
48
+
49
+ FeatCopilot is a unified feature engineering framework that combines the best approaches from existing libraries (Featuretools, TSFresh, AutoFeat, OpenFE) with novel LLM-powered capabilities via GitHub Copilot SDK.
50
+
51
+ ## 📊 Benchmark Highlights
52
+
53
+ ### Tabular Engine (Fast Mode - <1s)
54
+
55
+ | Task Type | Average Improvement | Best Case |
56
+ |-----------|--------------------:|----------:|
57
+ | **Text Classification** | **+12.44%** | +49.02% (News Headlines) |
58
+ | Time Series | +1.51% | +12.12% (Retail Demand) |
59
+ | Classification | +0.54% | +4.35% |
60
+ | Regression | +0.65% | +5.57% |
61
+
62
+ ### LLM Engine (With Copilot - 30-60s)
63
+
64
+ | Task Type | Average Improvement | Best Case |
65
+ |-----------|--------------------:|----------:|
66
+ | **Regression** | **+7.79%** | +19.66% (Retail Demand) |
67
+ | Classification | +2.38% | +2.87% |
68
+
69
+ - ✅ **12/12 wins** on text classification (tabular mode)
70
+ - 🧠 **+19.66% max improvement** with LLM-powered features
71
+ - ⚡ **<1 second** (tabular) or **30-60s** (with LLM) processing time
72
+ - 📈 Largest gains with simple models (LogisticRegression, Ridge)
73
+
74
+ [View Full Benchmark Results](https://thinkall.github.io/featcopilot/user-guide/benchmarks/)
75
+
76
+ ## Key Features
77
+
78
+ - 🔧 **Multi-Engine Architecture**: Tabular, time series, relational, and text feature engines
79
+ - 🤖 **LLM-Powered Intelligence**: Semantic feature discovery, domain-aware generation, and code synthesis
80
+ - 📊 **Intelligent Selection**: Statistical testing, importance ranking, and redundancy elimination
81
+ - 🔌 **Scikit-learn Compatible**: Drop-in replacement for sklearn transformers
82
+ - 📝 **Interpretable**: Every feature comes with human-readable explanations
83
+
84
+ ## Installation
85
+
86
+ ```bash
87
+ # Basic installation
88
+ pip install featcopilot
89
+
90
+ # With LLM capabilities (requires GitHub Copilot)
91
+ pip install featcopilot[llm]
92
+
93
+ # Full installation
94
+ pip install featcopilot[full]
95
+ ```
96
+
97
+ ## Quick Start
98
+
99
+ ### Fast Mode (Tabular Only)
100
+
101
+ ```python
102
+ from featcopilot import AutoFeatureEngineer
103
+
104
+ # Sub-second feature engineering
105
+ engineer = AutoFeatureEngineer(
106
+ engines=['tabular'],
107
+ max_features=50
108
+ )
109
+
110
+ X_transformed = engineer.fit_transform(X, y) # <1 second
111
+ print(f"Features: {X.shape[1]} -> {X_transformed.shape[1]}")
112
+ ```
113
+
114
+ ### LLM Mode (With Copilot)
115
+
116
+ ```python
117
+ from featcopilot import AutoFeatureEngineer
118
+
119
+ # LLM-powered semantic features (+19.66% max improvement)
120
+ engineer = AutoFeatureEngineer(
121
+ engines=['tabular', 'llm'],
122
+ max_features=50
123
+ )
124
+
125
+ X_transformed = engineer.fit_transform(
126
+ X, y,
127
+ column_descriptions={
128
+ 'age': 'Customer age in years',
129
+ 'income': 'Annual household income in USD',
130
+ 'tenure': 'Months as customer',
131
+ },
132
+ task_description="Predict customer churn"
133
+ ) # 30-60 seconds
134
+
135
+ # Get LLM-generated explanations
136
+ for feature, explanation in engineer.explain_features().items():
137
+ print(f"{feature}: {explanation}")
138
+ ```
139
+
140
+ ## Engines
141
+
142
+ ### Tabular Engine
143
+ Generates polynomial features, interaction terms, and mathematical transformations.
144
+
145
+ ```python
146
+ from featcopilot.engines import TabularEngine
147
+
148
+ engine = TabularEngine(
149
+ polynomial_degree=2,
150
+ interaction_only=False,
151
+ include_transforms=['log', 'sqrt', 'square']
152
+ )
153
+ ```
154
+
155
+ ### Time Series Engine
156
+ Extracts statistical, frequency, and temporal features from time series data.
157
+
158
+ ```python
159
+ from featcopilot.engines import TimeSeriesEngine
160
+
161
+ engine = TimeSeriesEngine(
162
+ features=['mean', 'std', 'skew', 'autocorr', 'fft_coefficients']
163
+ )
164
+ ```
165
+
166
+ ### LLM Engine
167
+ Uses GitHub Copilot SDK for intelligent feature generation.
168
+
169
+ ```python
170
+ from featcopilot.llm import SemanticEngine
171
+
172
+ engine = SemanticEngine(
173
+ model='gpt-5',
174
+ max_suggestions=20,
175
+ validate_features=True
176
+ )
177
+ ```
178
+
179
+ ## Feature Selection
180
+
181
+ ```python
182
+ from featcopilot.selection import FeatureSelector
183
+
184
+ selector = FeatureSelector(
185
+ methods=['mutual_info', 'importance', 'correlation'],
186
+ max_features=30,
187
+ correlation_threshold=0.95
188
+ )
189
+
190
+ X_selected = selector.fit_transform(X, y)
191
+ ```
192
+
193
+ ## Comparison with Existing Libraries
194
+
195
+ | Feature | FeatCopilot | Featuretools | TSFresh | AutoFeat | OpenFE | CAAFE |
196
+ |---------|-------------|--------------|---------|----------|--------|-------|
197
+ | Tabular Features | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
198
+ | Time Series | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
199
+ | Relational | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
200
+ | LLM-Powered | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ |
201
+ | Semantic Understanding | ✅ | ❌ | ❌ | ❌ | ❌ | ⚠️ |
202
+ | Code Generation | ✅ | ❌ | ❌ | ❌ | ❌ | ⚠️ |
203
+ | Sklearn Compatible | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
204
+ | Interpretable | ✅ | ⚠️ | ⚠️ | ⚠️ | ❌ | ✅ |
205
+
206
+ ## Documentation
207
+
208
+ 📖 **Full Documentation**: [https://thinkall.github.io/featcopilot/](https://thinkall.github.io/featcopilot/)
209
+
210
+ ## Requirements
211
+
212
+ - Python 3.9+
213
+ - NumPy, Pandas, SciPy, Scikit-learn, pydantic, joblib
214
+ - GitHub Copilot CLI (for LLM features)
215
+
216
+ ## License
217
+
218
+ MIT License
@@ -0,0 +1,174 @@
1
+ # FeatCopilot 🚀
2
+
3
+ **Next-Generation LLM-Powered Auto Feature Engineering with GitHub Copilot SDK**
4
+
5
+ FeatCopilot is a unified feature engineering framework that combines the best approaches from existing libraries (Featuretools, TSFresh, AutoFeat, OpenFE) with novel LLM-powered capabilities via GitHub Copilot SDK.
6
+
7
+ ## 📊 Benchmark Highlights
8
+
9
+ ### Tabular Engine (Fast Mode - <1s)
10
+
11
+ | Task Type | Average Improvement | Best Case |
12
+ |-----------|--------------------:|----------:|
13
+ | **Text Classification** | **+12.44%** | +49.02% (News Headlines) |
14
+ | Time Series | +1.51% | +12.12% (Retail Demand) |
15
+ | Classification | +0.54% | +4.35% |
16
+ | Regression | +0.65% | +5.57% |
17
+
18
+ ### LLM Engine (With Copilot - 30-60s)
19
+
20
+ | Task Type | Average Improvement | Best Case |
21
+ |-----------|--------------------:|----------:|
22
+ | **Regression** | **+7.79%** | +19.66% (Retail Demand) |
23
+ | Classification | +2.38% | +2.87% |
24
+
25
+ - ✅ **12/12 wins** on text classification (tabular mode)
26
+ - 🧠 **+19.66% max improvement** with LLM-powered features
27
+ - ⚡ **<1 second** (tabular) or **30-60s** (with LLM) processing time
28
+ - 📈 Largest gains with simple models (LogisticRegression, Ridge)
29
+
30
+ [View Full Benchmark Results](https://thinkall.github.io/featcopilot/user-guide/benchmarks/)
31
+
32
+ ## Key Features
33
+
34
+ - 🔧 **Multi-Engine Architecture**: Tabular, time series, relational, and text feature engines
35
+ - 🤖 **LLM-Powered Intelligence**: Semantic feature discovery, domain-aware generation, and code synthesis
36
+ - 📊 **Intelligent Selection**: Statistical testing, importance ranking, and redundancy elimination
37
+ - 🔌 **Scikit-learn Compatible**: Drop-in replacement for sklearn transformers
38
+ - 📝 **Interpretable**: Every feature comes with human-readable explanations
39
+
40
+ ## Installation
41
+
42
+ ```bash
43
+ # Basic installation
44
+ pip install featcopilot
45
+
46
+ # With LLM capabilities (requires GitHub Copilot)
47
+ pip install featcopilot[llm]
48
+
49
+ # Full installation
50
+ pip install featcopilot[full]
51
+ ```
52
+
53
+ ## Quick Start
54
+
55
+ ### Fast Mode (Tabular Only)
56
+
57
+ ```python
58
+ from featcopilot import AutoFeatureEngineer
59
+
60
+ # Sub-second feature engineering
61
+ engineer = AutoFeatureEngineer(
62
+ engines=['tabular'],
63
+ max_features=50
64
+ )
65
+
66
+ X_transformed = engineer.fit_transform(X, y) # <1 second
67
+ print(f"Features: {X.shape[1]} -> {X_transformed.shape[1]}")
68
+ ```
69
+
70
+ ### LLM Mode (With Copilot)
71
+
72
+ ```python
73
+ from featcopilot import AutoFeatureEngineer
74
+
75
+ # LLM-powered semantic features (+19.66% max improvement)
76
+ engineer = AutoFeatureEngineer(
77
+ engines=['tabular', 'llm'],
78
+ max_features=50
79
+ )
80
+
81
+ X_transformed = engineer.fit_transform(
82
+ X, y,
83
+ column_descriptions={
84
+ 'age': 'Customer age in years',
85
+ 'income': 'Annual household income in USD',
86
+ 'tenure': 'Months as customer',
87
+ },
88
+ task_description="Predict customer churn"
89
+ ) # 30-60 seconds
90
+
91
+ # Get LLM-generated explanations
92
+ for feature, explanation in engineer.explain_features().items():
93
+ print(f"{feature}: {explanation}")
94
+ ```
95
+
96
+ ## Engines
97
+
98
+ ### Tabular Engine
99
+ Generates polynomial features, interaction terms, and mathematical transformations.
100
+
101
+ ```python
102
+ from featcopilot.engines import TabularEngine
103
+
104
+ engine = TabularEngine(
105
+ polynomial_degree=2,
106
+ interaction_only=False,
107
+ include_transforms=['log', 'sqrt', 'square']
108
+ )
109
+ ```
110
+
111
+ ### Time Series Engine
112
+ Extracts statistical, frequency, and temporal features from time series data.
113
+
114
+ ```python
115
+ from featcopilot.engines import TimeSeriesEngine
116
+
117
+ engine = TimeSeriesEngine(
118
+ features=['mean', 'std', 'skew', 'autocorr', 'fft_coefficients']
119
+ )
120
+ ```
121
+
122
+ ### LLM Engine
123
+ Uses GitHub Copilot SDK for intelligent feature generation.
124
+
125
+ ```python
126
+ from featcopilot.llm import SemanticEngine
127
+
128
+ engine = SemanticEngine(
129
+ model='gpt-5',
130
+ max_suggestions=20,
131
+ validate_features=True
132
+ )
133
+ ```
134
+
135
+ ## Feature Selection
136
+
137
+ ```python
138
+ from featcopilot.selection import FeatureSelector
139
+
140
+ selector = FeatureSelector(
141
+ methods=['mutual_info', 'importance', 'correlation'],
142
+ max_features=30,
143
+ correlation_threshold=0.95
144
+ )
145
+
146
+ X_selected = selector.fit_transform(X, y)
147
+ ```
148
+
149
+ ## Comparison with Existing Libraries
150
+
151
+ | Feature | FeatCopilot | Featuretools | TSFresh | AutoFeat | OpenFE | CAAFE |
152
+ |---------|-------------|--------------|---------|----------|--------|-------|
153
+ | Tabular Features | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
154
+ | Time Series | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
155
+ | Relational | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
156
+ | LLM-Powered | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ |
157
+ | Semantic Understanding | ✅ | ❌ | ❌ | ❌ | ❌ | ⚠️ |
158
+ | Code Generation | ✅ | ❌ | ❌ | ❌ | ❌ | ⚠️ |
159
+ | Sklearn Compatible | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
160
+ | Interpretable | ✅ | ⚠️ | ⚠️ | ⚠️ | ❌ | ✅ |
161
+
162
+ ## Documentation
163
+
164
+ 📖 **Full Documentation**: [https://thinkall.github.io/featcopilot/](https://thinkall.github.io/featcopilot/)
165
+
166
+ ## Requirements
167
+
168
+ - Python 3.9+
169
+ - NumPy, Pandas, SciPy, Scikit-learn, pydantic, joblib
170
+ - GitHub Copilot CLI (for LLM features)
171
+
172
+ ## License
173
+
174
+ MIT License
@@ -0,0 +1,29 @@
1
"""
FeatCopilot - Next-Generation LLM-Powered Auto Feature Engineering

A unified feature engineering framework combining traditional approaches
with novel LLM-powered capabilities via GitHub Copilot SDK.
"""

# Package metadata exposed as module attributes.
__version__ = "0.1.0"
__author__ = "FeatCopilot Contributors"

# Re-export the core abstractions and the main API classes so they can be
# imported directly from the top-level ``featcopilot`` package.
from featcopilot.core.base import BaseEngine, BaseSelector
from featcopilot.core.feature import Feature, FeatureSet
from featcopilot.transformers.sklearn_compat import (
    AutoFeatureEngineer,
    FeatureEngineerTransformer,
)

# Names exported by ``from featcopilot import *``.
# Note: ``__author__`` is defined above but is not part of this list.
__all__ = [
    # Core
    "BaseEngine",
    "BaseSelector",
    "Feature",
    "FeatureSet",
    # Main API
    "AutoFeatureEngineer",
    "FeatureEngineerTransformer",
    # Version
    "__version__",
]
@@ -0,0 +1,13 @@
1
"""Core module containing base classes and interfaces."""

# Gather the public names of the ``core`` sub-modules in one place so callers
# can write ``from featcopilot.core import ...``.
from featcopilot.core.base import BaseEngine, BaseSelector
from featcopilot.core.feature import Feature, FeatureSet
from featcopilot.core.registry import FeatureRegistry

# Names exported by ``from featcopilot.core import *``.
__all__ = [
    "BaseEngine",
    "BaseSelector",
    "Feature",
    "FeatureSet",
    "FeatureRegistry",
]
@@ -0,0 +1,195 @@
1
+ """Base classes for feature engineering engines and selectors."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any, Optional, Union
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
class EngineConfig(BaseModel):
    """Configuration for feature engineering engines."""

    # Required field (no default). BaseEngine.__init__ passes the concrete
    # engine's class name here when it builds the config itself.
    name: str = Field(description="Engine name")
    # Engines are enabled unless explicitly switched off.
    enabled: bool = Field(default=True, description="Whether engine is enabled")
    # Defaults to None; presumably "no cap" - how None is interpreted is up to
    # the individual engine (TODO confirm against the engine implementations).
    max_features: Optional[int] = Field(default=None, description="Max features to generate")
    # Off by default.
    verbose: bool = Field(default=False, description="Verbose output")
18
+
19
+
20
class BaseEngine(ABC):
    """
    Abstract base class for feature engineering engines.

    All engines (tabular, timeseries, relational, llm) inherit from this class.
    Subclasses implement ``fit`` and ``transform``; the base class provides
    configuration handling, fitted-state tracking, ``fit_transform``, accessors
    for generated feature names/metadata, and input validation.

    Parameters
    ----------
    config : EngineConfig, optional
        Ready-made configuration. When supplied, ``**kwargs`` are ignored.
    **kwargs : dict
        Field values for a new ``EngineConfig``, used only when ``config`` is
        not supplied. ``name`` is always set to the concrete class name, so it
        must not be passed here.
    """

    def __init__(self, config: Optional["EngineConfig"] = None, **kwargs):
        # Build a default config named after the concrete engine class when
        # the caller did not provide one.
        self.config = config or EngineConfig(name=self.__class__.__name__, **kwargs)
        self._is_fitted = False
        self._feature_names: list[str] = []
        self._feature_metadata: dict[str, Any] = {}

    @property
    def is_fitted(self) -> bool:
        """Check if engine has been fitted."""
        return self._is_fitted

    @abstractmethod
    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        **kwargs,
    ) -> "BaseEngine":
        """
        Fit the engine to the data.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray, optional
            Target variable
        **kwargs : dict
            Additional parameters

        Returns
        -------
        self : BaseEngine
            Fitted engine
        """
        pass

    @abstractmethod
    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """
        Transform data to generate new features.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        **kwargs : dict
            Additional parameters

        Returns
        -------
        X_transformed : DataFrame
            Transformed features
        """
        pass

    def fit_transform(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Fit and transform in one step.

        The same ``**kwargs`` are forwarded to both ``fit`` and ``transform``.
        """
        return self.fit(X, y, **kwargs).transform(X, **kwargs)

    def get_feature_names(self) -> list[str]:
        """Get names of generated features (returned as a copy)."""
        return self._feature_names.copy()

    def get_feature_metadata(self) -> dict[str, Any]:
        """Get metadata for generated features (returned as a shallow copy)."""
        return self._feature_metadata.copy()

    def _validate_input(self, X: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
        """
        Convert input to DataFrame and validate.

        Parameters
        ----------
        X : DataFrame or ndarray
            A DataFrame is returned unchanged. A 2-D ndarray is wrapped in a
            DataFrame whose columns are named ``feature_0``, ``feature_1``, ...

        Returns
        -------
        X : DataFrame
            The validated input as a DataFrame

        Raises
        ------
        TypeError
            If ``X`` is neither a DataFrame nor an ndarray.
        ValueError
            If ``X`` is an ndarray that is not 2-dimensional.
        """
        if isinstance(X, pd.DataFrame):
            return X
        if not isinstance(X, np.ndarray):
            raise TypeError(f"Expected DataFrame or ndarray, got {type(X)}")
        # Column names are derived from shape[1], which only exists for arrays
        # with at least two dimensions; reject anything else with a clear
        # message instead of an opaque IndexError.
        if X.ndim != 2:
            raise ValueError(f"Expected 2-D ndarray, got {X.ndim}-D array with shape {X.shape}")
        return pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
107
+
108
+
109
class SelectorConfig(BaseModel):
    """Configuration for feature selectors."""

    # Defaults to None; presumably "keep everything" - how None is interpreted
    # is up to the individual selector (TODO confirm against the selectors).
    max_features: Optional[int] = Field(default=None, description="Max features to select")
    # Default threshold is 0.0.
    min_importance: float = Field(default=0.0, description="Minimum importance threshold")
    # Default threshold is 0.95.
    correlation_threshold: float = Field(default=0.95, description="Threshold for correlation-based elimination")
115
+
116
+
117
class BaseSelector(ABC):
    """
    Abstract base class for feature selection.

    Handles selection of most important/relevant features from generated set.
    Subclasses implement ``fit`` and ``transform``; the base class provides
    configuration handling, fitted-state tracking, ``fit_transform``, accessors
    for selected features/scores, and input validation.

    Parameters
    ----------
    config : SelectorConfig, optional
        Ready-made configuration. When supplied, ``**kwargs`` are ignored.
    **kwargs : dict
        Field values for a new ``SelectorConfig``, used only when ``config``
        is not supplied.
    """

    def __init__(self, config: Optional["SelectorConfig"] = None, **kwargs):
        # Fall back to a config built from the keyword arguments when the
        # caller did not provide one.
        self.config = config or SelectorConfig(**kwargs)
        self._is_fitted = False
        self._selected_features: list[str] = []
        self._feature_scores: dict[str, float] = {}

    @property
    def is_fitted(self) -> bool:
        """Check if selector has been fitted."""
        return self._is_fitted

    @abstractmethod
    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.Series, np.ndarray],
        **kwargs,
    ) -> "BaseSelector":
        """
        Fit the selector to determine feature importance.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray
            Target variable
        **kwargs : dict
            Additional parameters

        Returns
        -------
        self : BaseSelector
            Fitted selector
        """
        pass

    @abstractmethod
    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """
        Transform data to keep only selected features.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        **kwargs : dict
            Additional parameters

        Returns
        -------
        X_selected : DataFrame
            Data with only selected features
        """
        pass

    def fit_transform(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.Series, np.ndarray],
        **kwargs,
    ) -> pd.DataFrame:
        """
        Fit and transform in one step.

        The same ``**kwargs`` are forwarded to both ``fit`` and ``transform``.
        """
        return self.fit(X, y, **kwargs).transform(X, **kwargs)

    def get_selected_features(self) -> list[str]:
        """Get names of selected features (returned as a copy)."""
        return self._selected_features.copy()

    def get_feature_scores(self) -> dict[str, float]:
        """Get importance scores for all features (returned as a copy)."""
        return self._feature_scores.copy()

    def _validate_input(self, X: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
        """
        Convert input to DataFrame and validate.

        Parameters
        ----------
        X : DataFrame or ndarray
            A DataFrame is returned unchanged. A 2-D ndarray is wrapped in a
            DataFrame whose columns are named ``feature_0``, ``feature_1``, ...

        Returns
        -------
        X : DataFrame
            The validated input as a DataFrame

        Raises
        ------
        TypeError
            If ``X`` is neither a DataFrame nor an ndarray.
        ValueError
            If ``X`` is an ndarray that is not 2-dimensional.
        """
        if isinstance(X, pd.DataFrame):
            return X
        if not isinstance(X, np.ndarray):
            raise TypeError(f"Expected DataFrame or ndarray, got {type(X)}")
        # Column names are derived from shape[1], which only exists for arrays
        # with at least two dimensions; reject anything else with a clear
        # message instead of an opaque IndexError.
        if X.ndim != 2:
            raise ValueError(f"Expected 2-D ndarray, got {X.ndim}-D array with shape {X.shape}")
        return pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])