featcopilot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
+ """
2
+ FeatCopilot - Next-Generation LLM-Powered Auto Feature Engineering
3
+
4
+ A unified feature engineering framework combining traditional approaches
5
+ with novel LLM-powered capabilities via GitHub Copilot SDK.
6
+ """
7
+
8
+ __version__ = "0.1.0"
9
+ __author__ = "FeatCopilot Contributors"
10
+
11
+ from featcopilot.core.base import BaseEngine, BaseSelector
12
+ from featcopilot.core.feature import Feature, FeatureSet
13
+ from featcopilot.transformers.sklearn_compat import (
14
+ AutoFeatureEngineer,
15
+ FeatureEngineerTransformer,
16
+ )
17
+
18
+ __all__ = [
19
+ # Core
20
+ "BaseEngine",
21
+ "BaseSelector",
22
+ "Feature",
23
+ "FeatureSet",
24
+ # Main API
25
+ "AutoFeatureEngineer",
26
+ "FeatureEngineerTransformer",
27
+ # Version
28
+ "__version__",
29
+ ]
@@ -0,0 +1,13 @@
1
+ """Core module containing base classes and interfaces."""
2
+
3
+ from featcopilot.core.base import BaseEngine, BaseSelector
4
+ from featcopilot.core.feature import Feature, FeatureSet
5
+ from featcopilot.core.registry import FeatureRegistry
6
+
7
+ __all__ = [
8
+ "BaseEngine",
9
+ "BaseSelector",
10
+ "Feature",
11
+ "FeatureSet",
12
+ "FeatureRegistry",
13
+ ]
@@ -0,0 +1,195 @@
1
+ """Base classes for feature engineering engines and selectors."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any, Optional, Union
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
class EngineConfig(BaseModel):
    """Configuration for feature engineering engines.

    One config is shared by all engine kinds; when an engine is built
    without an explicit config, ``BaseEngine.__init__`` constructs one
    from its keyword arguments, using the subclass name as ``name``.
    """

    # Engine identifier; defaults to the subclass name (see BaseEngine.__init__).
    name: str = Field(description="Engine name")
    enabled: bool = Field(default=True, description="Whether engine is enabled")
    # None means "no cap" on the number of generated features.
    max_features: Optional[int] = Field(default=None, description="Max features to generate")
    verbose: bool = Field(default=False, description="Verbose output")
18
+
19
+
20
class BaseEngine(ABC):
    """Abstract base class for feature engineering engines.

    Concrete engines (tabular, timeseries, relational, llm) derive from
    this class and supply :meth:`fit` and :meth:`transform`.
    """

    def __init__(self, config: Optional[EngineConfig] = None, **kwargs):
        # Without an explicit config, build one from kwargs named after the subclass.
        if config is None:
            config = EngineConfig(name=self.__class__.__name__, **kwargs)
        self.config = config
        self._is_fitted = False
        self._feature_names: list[str] = []
        self._feature_metadata: dict[str, Any] = {}

    @property
    def is_fitted(self) -> bool:
        """True once the engine has been fitted."""
        return self._is_fitted

    @abstractmethod
    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        **kwargs,
    ) -> "BaseEngine":
        """Fit the engine to the data.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray, optional
            Target variable
        **kwargs : dict
            Additional parameters

        Returns
        -------
        self : BaseEngine
            Fitted engine
        """

    @abstractmethod
    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """Generate new features from the input data.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        **kwargs : dict
            Additional parameters

        Returns
        -------
        DataFrame
            Transformed features
        """

    def fit_transform(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        fitted = self.fit(X, y, **kwargs)
        return fitted.transform(X, **kwargs)

    def get_feature_names(self) -> list[str]:
        """Return a copy of the generated feature names."""
        return list(self._feature_names)

    def get_feature_metadata(self) -> dict[str, Any]:
        """Return a (shallow) copy of the per-feature metadata."""
        return dict(self._feature_metadata)

    def _validate_input(self, X: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
        """Coerce *X* into a DataFrame, raising for unsupported types."""
        if isinstance(X, pd.DataFrame):
            return X
        if isinstance(X, np.ndarray):
            # ndarrays get synthetic column names, one per column.
            names = [f"feature_{i}" for i in range(X.shape[1])]
            return pd.DataFrame(X, columns=names)
        raise TypeError(f"Expected DataFrame or ndarray, got {type(X)}")
107
+
108
+
109
class SelectorConfig(BaseModel):
    """Configuration for feature selectors.

    Built from keyword arguments by ``BaseSelector.__init__`` when no
    explicit config is passed.
    """

    # None means "no cap" on the number of selected features.
    max_features: Optional[int] = Field(default=None, description="Max features to select")
    min_importance: float = Field(default=0.0, description="Minimum importance threshold")
    # Features more correlated than this may be dropped by correlation-based selectors.
    correlation_threshold: float = Field(default=0.95, description="Threshold for correlation-based elimination")
115
+
116
+
117
class BaseSelector(ABC):
    """Abstract base class for feature selection.

    Narrows a generated feature set down to the most important or
    relevant features.
    """

    def __init__(self, config: Optional[SelectorConfig] = None, **kwargs):
        # Without an explicit config, build one from the keyword arguments.
        if config is None:
            config = SelectorConfig(**kwargs)
        self.config = config
        self._is_fitted = False
        self._selected_features: list[str] = []
        self._feature_scores: dict[str, float] = {}

    @property
    def is_fitted(self) -> bool:
        """True once the selector has been fitted."""
        return self._is_fitted

    @abstractmethod
    def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray], **kwargs) -> "BaseSelector":
        """Fit the selector to determine feature importance.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray
            Target variable
        **kwargs : dict
            Additional parameters

        Returns
        -------
        self : BaseSelector
            Fitted selector
        """

    @abstractmethod
    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """Keep only the selected features.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        **kwargs : dict
            Additional parameters

        Returns
        -------
        DataFrame
            Data restricted to the selected features
        """

    def fit_transform(
        self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray], **kwargs
    ) -> pd.DataFrame:
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        fitted = self.fit(X, y, **kwargs)
        return fitted.transform(X, **kwargs)

    def get_selected_features(self) -> list[str]:
        """Return a copy of the selected feature names."""
        return list(self._selected_features)

    def get_feature_scores(self) -> dict[str, float]:
        """Return a copy of the importance scores for all features."""
        return dict(self._feature_scores)

    def _validate_input(self, X: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
        """Coerce *X* into a DataFrame, raising for unsupported types."""
        if isinstance(X, pd.DataFrame):
            return X
        if isinstance(X, np.ndarray):
            # ndarrays get synthetic column names, one per column.
            names = [f"feature_{i}" for i in range(X.shape[1])]
            return pd.DataFrame(X, columns=names)
        raise TypeError(f"Expected DataFrame or ndarray, got {type(X)}")
@@ -0,0 +1,224 @@
1
+ """Feature representation and feature sets."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum
5
+ from typing import Any, Optional
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+
11
class FeatureType(Enum):
    """Enumeration of the data types a feature can carry."""

    NUMERIC = "numeric"          # continuous or integer values
    CATEGORICAL = "categorical"  # discrete, unordered labels
    DATETIME = "datetime"        # timestamps / dates
    TEXT = "text"                # free-form strings
    BOOLEAN = "boolean"          # two-valued flags
19
+
20
+
21
class FeatureOrigin(Enum):
    """Enumeration describing how a feature came to exist."""

    ORIGINAL = "original"            # present in the raw input
    POLYNOMIAL = "polynomial"        # polynomial transformation
    INTERACTION = "interaction"      # combination of two or more features
    AGGREGATION = "aggregation"      # aggregation operation
    TIMESERIES = "timeseries"        # time series extraction
    LLM_GENERATED = "llm_generated"  # produced directly by an LLM
    LLM_SUGGESTED = "llm_suggested"  # suggested by an LLM, implemented traditionally
    CUSTOM = "custom"                # user-defined
32
+
33
+
34
@dataclass
class Feature:
    """
    Represents a single feature with metadata.

    Attributes
    ----------
    name : str
        Feature name
    dtype : FeatureType
        Data type of feature
    origin : FeatureOrigin
        How the feature was created
    source_columns : list
        Original columns used to create this feature; defaults to
        ``[name]`` when left empty (see ``__post_init__``)
    transformation : str
        Description of transformation applied
    explanation : str, optional
        Human-readable explanation of the feature
    code : str, optional
        Python code that generates this feature; executed by
        :meth:`compute` and expected to assign to ``result``
    importance : float, optional
        Feature importance score
    metadata : dict
        Additional metadata
    """

    name: str
    dtype: FeatureType = FeatureType.NUMERIC
    origin: FeatureOrigin = FeatureOrigin.ORIGINAL
    source_columns: list[str] = field(default_factory=list)
    transformation: str = ""
    explanation: Optional[str] = None
    code: Optional[str] = None
    importance: Optional[float] = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        # A feature with no declared sources is assumed to derive from itself.
        if not self.source_columns:
            self.source_columns = [self.name]

    def to_dict(self) -> dict[str, Any]:
        """Serialize the feature to a plain dict (enums become their values)."""
        return {
            "name": self.name,
            "dtype": self.dtype.value,
            "origin": self.origin.value,
            "source_columns": self.source_columns,
            "transformation": self.transformation,
            "explanation": self.explanation,
            "code": self.code,
            "importance": self.importance,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "Feature":
        """Create a feature from a dict; missing keys fall back to defaults."""
        return cls(
            name=data["name"],
            dtype=FeatureType(data.get("dtype", "numeric")),
            origin=FeatureOrigin(data.get("origin", "original")),
            source_columns=data.get("source_columns", []),
            transformation=data.get("transformation", ""),
            explanation=data.get("explanation"),
            code=data.get("code"),
            importance=data.get("importance"),
            metadata=data.get("metadata", {}),
        )

    def compute(self, df: pd.DataFrame) -> pd.Series:
        """
        Compute feature values from DataFrame using stored code.

        The stored code is executed with ``df``, ``np`` and ``pd`` in
        scope and must assign the computed values to a variable named
        ``result``.

        Parameters
        ----------
        df : DataFrame
            Input data

        Returns
        -------
        Series
            Computed feature values

        Raises
        ------
        ValueError
            If no code is stored, or if the code ran but did not assign
            ``result``.
        """
        if not self.code:
            raise ValueError(f"No code defined for feature {self.name}")
        # SECURITY NOTE(review): exec of stored code — only run code from
        # trusted sources. Stripping __builtins__ is not a sandbox.
        local_vars = {"df": df, "np": np, "pd": pd}
        exec(self.code, {"__builtins__": {}}, local_vars)
        # Previously a code snippet that ran but never bound `result` fell
        # through to the misleading "No code defined" error; raise a
        # distinct, accurate error instead.
        if "result" not in local_vars:
            raise ValueError(
                f"Code for feature {self.name} did not assign a 'result' variable"
            )
        return local_vars["result"]
125
+
126
+
127
class FeatureSet:
    """
    Collection of :class:`Feature` objects keyed by name.

    Provides methods for adding, removing, filtering, merging and bulk
    computation. Adding a feature whose name already exists replaces
    the previous entry.
    """

    def __init__(self, features: Optional[list[Feature]] = None):
        self._features: dict[str, Feature] = {}
        if features:
            for feature in features:
                self.add(feature)

    def __len__(self) -> int:
        return len(self._features)

    def __iter__(self):
        return iter(self._features.values())

    def __contains__(self, name: str) -> bool:
        return name in self._features

    def __getitem__(self, name: str) -> Feature:
        return self._features[name]

    def add(self, feature: Feature) -> None:
        """Add *feature*, replacing any existing feature with the same name."""
        self._features[feature.name] = feature

    def remove(self, name: str) -> Optional[Feature]:
        """Remove and return the named feature, or None if absent."""
        return self._features.pop(name, None)

    def get(self, name: str) -> Optional[Feature]:
        """Return the named feature, or None if absent."""
        return self._features.get(name)

    def get_names(self) -> list[str]:
        """Return all feature names (insertion order)."""
        return list(self._features.keys())

    def filter_by_origin(self, origin: FeatureOrigin) -> "FeatureSet":
        """Return a new set containing only features with the given origin."""
        return FeatureSet([f for f in self._features.values() if f.origin == origin])

    def filter_by_type(self, dtype: FeatureType) -> "FeatureSet":
        """Return a new set containing only features with the given dtype."""
        return FeatureSet([f for f in self._features.values() if f.dtype == dtype])

    def filter_by_importance(self, min_importance: float) -> "FeatureSet":
        """Return a new set of features with a known importance >= the threshold."""
        return FeatureSet(
            [f for f in self._features.values() if f.importance is not None and f.importance >= min_importance]
        )

    def sort_by_importance(self, descending: bool = True) -> list[Feature]:
        """Return features with a known importance, sorted by importance."""
        # Features without a score are excluded rather than sorted to the end.
        scored = [f for f in self._features.values() if f.importance is not None]
        return sorted(scored, key=lambda f: f.importance, reverse=descending)

    def merge(self, other: "FeatureSet") -> "FeatureSet":
        """Return a new set with this set's features plus *other*'s.

        On a name collision the feature from *other* wins.
        """
        merged = FeatureSet(list(self._features.values()))
        for feature in other:
            merged.add(feature)
        return merged

    def to_dataframe(self) -> pd.DataFrame:
        """Return the feature metadata as a DataFrame, one row per feature."""
        return pd.DataFrame([f.to_dict() for f in self._features.values()])

    def get_explanations(self) -> dict[str, str]:
        """Map feature name -> explanation, for features that have one."""
        return {f.name: f.explanation for f in self._features.values() if f.explanation}

    def compute_all(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute all features that have code defined.

        Parameters
        ----------
        df : DataFrame
            Input data

        Returns
        -------
        DataFrame
            Copy of *df* extended with the computed feature columns.
            Features that fail to compute are skipped with a warning
            (best-effort semantics preserved).
        """
        # Local import: library code should warn via the warnings machinery,
        # not print to stdout; kept local so the module's imports are unchanged.
        import warnings

        result = df.copy()
        for feature in self._features.values():
            # Skip features without code; never clobber an existing column.
            if not feature.code or feature.name in result.columns:
                continue
            try:
                result[feature.name] = feature.compute(df)
            except Exception as e:  # one bad feature must not abort the rest
                warnings.warn(f"Could not compute feature {feature.name}: {e}")
        return result
@@ -0,0 +1,128 @@
1
+ """Feature registry for tracking and managing features."""
2
+
3
+ from typing import Callable, Optional
4
+
5
+ from featcopilot.core.feature import Feature, FeatureOrigin
6
+
7
+
8
class FeatureRegistry:
    """
    Global registry for feature definitions and generators.

    Provides registration and lookup of:
    - Feature transformation functions
    - Feature generator classes
    - Custom feature definitions
    """

    _instance: Optional["FeatureRegistry"] = None
    # Class-level declarations. The singleton instance shadows
    # _transformations with its own dict in _init_default_transformations;
    # _generators is shared (mutated in place via the singleton).
    _transformations: dict[str, Callable] = {}
    _generators: dict[str, type] = {}

    # Code templates for the built-in transformations, keyed by name.
    # Feature.compute executes stored code with only ``df``/``np``/``pd``
    # in scope, so generated code must be expressed in terms of numpy —
    # emitting e.g. "result = log(df['x'])" would raise NameError there.
    _CODE_TEMPLATES: dict[str, str] = {
        "log": "np.log1p(np.abs({col}))",
        "log10": "np.log10(np.abs({col}) + 1)",
        "sqrt": "np.sqrt(np.abs({col}))",
        "square": "{col}**2",
        "cube": "{col}**3",
        "reciprocal": "1 / ({col} + 1e-8)",
        "abs": "np.abs({col})",
        "sign": "np.sign({col})",
        "exp": "np.exp(np.clip({col}, -50, 50))",
        "sin": "np.sin({col})",
        "cos": "np.cos({col})",
        "tanh": "np.tanh({col})",
    }

    def __new__(cls) -> "FeatureRegistry":
        """Singleton pattern for global registry."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._init_default_transformations()
        return cls._instance

    def _init_default_transformations(self) -> None:
        """Initialize default transformation functions."""
        import numpy as np

        # abs/log use np.abs guards so negative inputs do not produce NaN.
        self._transformations = {
            "log": lambda x: np.log1p(np.abs(x)),
            "log10": lambda x: np.log10(np.abs(x) + 1),
            "sqrt": lambda x: np.sqrt(np.abs(x)),
            "square": lambda x: x**2,
            "cube": lambda x: x**3,
            "reciprocal": lambda x: 1 / (x + 1e-8),
            "abs": lambda x: np.abs(x),
            "sign": lambda x: np.sign(x),
            "exp": lambda x: np.exp(np.clip(x, -50, 50)),
            "sin": lambda x: np.sin(x),
            "cos": lambda x: np.cos(x),
            "tanh": lambda x: np.tanh(x),
        }

    def register_transformation(self, name: str, func: Callable) -> None:
        """
        Register a transformation function.

        Parameters
        ----------
        name : str
            Name of transformation
        func : callable
            Function that takes array and returns transformed array
        """
        self._transformations[name] = func

    def get_transformation(self, name: str) -> Optional[Callable]:
        """Get a registered transformation by name, or None if unknown."""
        return self._transformations.get(name)

    def list_transformations(self) -> list[str]:
        """List all registered transformation names."""
        return list(self._transformations.keys())

    def register_generator(self, name: str, generator_class: type) -> None:
        """
        Register a feature generator class.

        Parameters
        ----------
        name : str
            Name of generator
        generator_class : type
            Class that generates features
        """
        self._generators[name] = generator_class

    def get_generator(self, name: str) -> Optional[type]:
        """Get a registered generator by name, or None if unknown."""
        return self._generators.get(name)

    def list_generators(self) -> list[str]:
        """List all registered generator names."""
        return list(self._generators.keys())

    def create_feature(self, name: str, transformation: str, source_columns: list[str], **kwargs) -> Feature:
        """
        Create a feature using a registered transformation.

        Parameters
        ----------
        name : str
            Feature name
        transformation : str
            Name of registered transformation
        source_columns : list
            Columns used in transformation; the first column is the one
            the generated code operates on
        **kwargs : dict
            Additional feature attributes (may include ``origin`` to
            override the default)

        Returns
        -------
        Feature
            Created feature object

        Raises
        ------
        ValueError
            If the transformation is unknown or ``source_columns`` is empty.
        """
        func = self.get_transformation(transformation)
        if func is None:
            raise ValueError(f"Unknown transformation: {transformation}")
        if not source_columns:
            raise ValueError("source_columns must contain at least one column")

        col_expr = f"df['{source_columns[0]}']"
        template = self._CODE_TEMPLATES.get(transformation)
        if template is not None:
            # Built-in: emit numpy-based code runnable by Feature.compute.
            code = f"result = {template.format(col=col_expr)}"
        else:
            # Custom transformation: fall back to calling it by name; the
            # executing namespace must then provide that name.
            code = f"result = {transformation}({col_expr})"

        # Default origin kept for backward compatibility; previously a
        # caller passing origin= hit a duplicate-keyword TypeError because
        # origin was hard-coded in the Feature(...) call.
        kwargs.setdefault("origin", FeatureOrigin.POLYNOMIAL)

        return Feature(
            name=name,
            source_columns=source_columns,
            transformation=transformation,
            code=code,
            **kwargs,
        )


# Global registry instance
registry = FeatureRegistry()
@@ -0,0 +1,13 @@
1
+ """Feature engineering engines."""
2
+
3
+ from featcopilot.engines.relational import RelationalEngine
4
+ from featcopilot.engines.tabular import TabularEngine
5
+ from featcopilot.engines.text import TextEngine
6
+ from featcopilot.engines.timeseries import TimeSeriesEngine
7
+
8
+ __all__ = [
9
+ "TabularEngine",
10
+ "TimeSeriesEngine",
11
+ "RelationalEngine",
12
+ "TextEngine",
13
+ ]