featcopilot 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featcopilot/__init__.py +29 -0
- featcopilot/core/__init__.py +13 -0
- featcopilot/core/base.py +195 -0
- featcopilot/core/feature.py +224 -0
- featcopilot/core/registry.py +128 -0
- featcopilot/engines/__init__.py +13 -0
- featcopilot/engines/relational.py +256 -0
- featcopilot/engines/tabular.py +293 -0
- featcopilot/engines/text.py +211 -0
- featcopilot/engines/timeseries.py +402 -0
- featcopilot/llm/__init__.py +16 -0
- featcopilot/llm/code_generator.py +295 -0
- featcopilot/llm/copilot_client.py +521 -0
- featcopilot/llm/explainer.py +200 -0
- featcopilot/llm/semantic_engine.py +379 -0
- featcopilot/selection/__init__.py +13 -0
- featcopilot/selection/importance.py +161 -0
- featcopilot/selection/redundancy.py +156 -0
- featcopilot/selection/statistical.py +199 -0
- featcopilot/selection/unified.py +172 -0
- featcopilot/transformers/__init__.py +11 -0
- featcopilot/transformers/sklearn_compat.py +401 -0
- featcopilot/utils/__init__.py +9 -0
- featcopilot/utils/cache.py +221 -0
- featcopilot/utils/parallel.py +109 -0
- featcopilot-0.1.0.dist-info/METADATA +218 -0
- featcopilot-0.1.0.dist-info/RECORD +29 -0
- featcopilot-0.1.0.dist-info/WHEEL +5 -0
- featcopilot-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
"""Relational feature engineering engine.
|
|
2
|
+
|
|
3
|
+
Generates aggregation features from related tables (inspired by Featuretools).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import Optional, Union
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from pydantic import Field
|
|
11
|
+
|
|
12
|
+
from featcopilot.core.base import BaseEngine, EngineConfig
|
|
13
|
+
from featcopilot.core.feature import FeatureSet
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RelationalEngineConfig(EngineConfig):
    """Settings for the relational feature engine.

    Extends ``EngineConfig`` with the options that are specific to
    aggregating features across related tables.
    """

    # Identifier of the engine this configuration belongs to.
    name: str = "RelationalEngine"

    # Names of the aggregations to compute. A factory is used so that every
    # config instance gets its own list object.
    aggregation_functions: list[str] = Field(
        description="Aggregation functions to apply",
        default_factory=lambda: ["mean", "sum", "min", "max", "count", "std"],
    )

    # Synthesis depth, validated to lie in the closed range [1, 4].
    max_depth: int = Field(
        2,
        description="Max depth for feature synthesis",
        ge=1,
        le=4,
    )

    # Switch for time-based aggregations.
    include_time_based: bool = Field(
        True,
        description="Include time-based aggregations",
    )
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class RelationalEngine(BaseEngine):
    """
    Relational feature engineering engine.

    Generates aggregation features from related tables (in the spirit of
    Featuretools' Deep Feature Synthesis) plus group-by aggregations inside
    the table being transformed.

    For every relationship registered through ``add_relationship`` whose
    parent table is available, each numeric column of that table is
    aggregated per parent key and mapped onto the primary table through the
    child key column. Afterwards, numeric columns of the resulting table are
    aggregated per categorical column.

    Parameters
    ----------
    aggregation_functions : list of str, optional
        Aggregation functions to use (mean, sum, min, max, count, std, etc.).
        ``None`` selects ``["mean", "sum", "count", "max", "min"]``. An
        explicitly empty list disables the relationship aggregations.
    max_depth : int, default=2
        Maximum depth for feature synthesis. Stored on the config; the code
        in this class does not read it.
    max_features : int, optional
        Maximum number of generated columns returned by ``transform``.
        ``None`` (or 0) means no limit.
    verbose : bool, default=False
        Print progress messages.
    **kwargs
        Forwarded unchanged to ``RelationalEngineConfig``.

    Examples
    --------
    >>> engine = RelationalEngine()
    >>> engine.add_relationship('orders', 'customers', 'customer_id')
    >>> X_features = engine.fit_transform(orders_df, related_tables={'customers': customers_df})
    """

    # NOTE: only the *keys* of this mapping are used in this class, as a
    # whitelist of accepted aggregation names. The aggregation itself is done
    # by pandas, which receives the name as a string, so the callables below
    # are not invoked by the code in this class.
    AGGREGATION_FUNCTIONS = {
        "mean": np.mean,
        "sum": np.sum,
        "min": np.min,
        "max": np.max,
        "count": len,
        "std": np.std,
        "median": np.median,
        "first": lambda x: x.iloc[0] if len(x) > 0 else np.nan,
        "last": lambda x: x.iloc[-1] if len(x) > 0 else np.nan,
        "nunique": lambda x: len(set(x)),
    }

    def __init__(
        self,
        aggregation_functions: Optional[list[str]] = None,
        max_depth: int = 2,
        max_features: Optional[int] = None,
        verbose: bool = False,
        **kwargs,
    ):
        # Test against None rather than truthiness so that an explicit empty
        # list is honoured instead of being silently replaced by the defaults.
        if aggregation_functions is None:
            aggregation_functions = ["mean", "sum", "count", "max", "min"]

        config = RelationalEngineConfig(
            aggregation_functions=aggregation_functions,
            max_depth=max_depth,
            max_features=max_features,
            verbose=verbose,
            **kwargs,
        )
        super().__init__(config=config)
        self.config: RelationalEngineConfig = config
        self._relationships: list[dict[str, str]] = []
        self._feature_set = FeatureSet()
        # State that ``fit`` fills in; created here so the attributes always exist.
        self._related_tables: dict[str, pd.DataFrame] = {}
        self._primary_columns: list = []

    def add_relationship(
        self, child_table: str, parent_table: str, key_column: str, parent_key: Optional[str] = None
    ) -> "RelationalEngine":
        """
        Define a relationship between tables.

        Parameters
        ----------
        child_table : str
            Name of child table (many side)
        parent_table : str
            Name of parent table (one side)
        key_column : str
            Foreign key column in child table
        parent_key : str, optional
            Primary key column in parent table (defaults to key_column)

        Returns
        -------
        self : RelationalEngine
            The engine itself, so calls can be chained.
        """
        self._relationships.append(
            {
                "child": child_table,
                "parent": parent_table,
                "child_key": key_column,
                "parent_key": parent_key or key_column,
            }
        )
        return self

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        related_tables: Optional[dict[str, pd.DataFrame]] = None,
        **kwargs,
    ) -> "RelationalEngine":
        """
        Fit the engine to the data.

        Records the related tables and the primary table's column names and
        marks the engine as fitted. No statistics are computed here.

        Parameters
        ----------
        X : DataFrame
            Primary table
        y : Series, optional
            Target variable (not used by this method)
        related_tables : dict, optional
            Dictionary of related tables {name: DataFrame}

        Returns
        -------
        self : RelationalEngine
        """
        X = self._validate_input(X)
        self._related_tables = related_tables or {}
        self._primary_columns = X.columns.tolist()

        if self.config.verbose:
            print(f"RelationalEngine: {len(self._relationships)} relationships defined")

        self._is_fitted = True
        return self

    def transform(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        related_tables: Optional[dict[str, pd.DataFrame]] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Generate aggregation features.

        Parameters
        ----------
        X : DataFrame
            Primary table
        related_tables : dict, optional
            Dictionary of related tables. When omitted (or empty), the tables
            given to ``fit`` are used.

        Returns
        -------
        X_features : DataFrame
            The input columns followed by the generated feature columns. When
            ``max_features`` is set, only the first ``max_features``
            generated columns are kept.

        Raises
        ------
        RuntimeError
            If the engine has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("Engine must be fitted before transform")

        X = self._validate_input(X)
        related_tables = related_tables or self._related_tables
        result = X.copy()

        # Generate features from relationships
        for rel in self._relationships:
            if rel["parent"] not in related_tables:
                # Best-effort: a missing table is skipped, but no longer
                # without a trace when the user asked for verbose output.
                if self.config.verbose:
                    print(f"RelationalEngine: skipping relationship to '{rel['parent']}' (table not provided)")
                continue

            parent_df = related_tables[rel["parent"]]
            features = self._aggregate_from_relationship(
                X, parent_df, rel["child_key"], rel["parent_key"], rel["parent"]
            )
            result = pd.concat([result, features], axis=1)

        # Generate group-by aggregations within the primary table
        result = self._add_self_aggregations(result)

        # Enforce the feature budget. Columns are selected by position so that
        # duplicated column labels cannot drop more than intended.
        generated_mask = ~result.columns.isin(X.columns)
        limit = self.config.max_features
        if limit and int(generated_mask.sum()) > limit:
            keep = np.ones(len(result.columns), dtype=bool)
            keep[np.flatnonzero(generated_mask)[limit:]] = False
            result = result.loc[:, keep]

        self._feature_names = [c for c in result.columns if c not in X.columns]

        if self.config.verbose:
            print(f"RelationalEngine: Generated {len(self._feature_names)} features")

        return result

    def _aggregate_from_relationship(
        self,
        child_df: pd.DataFrame,
        parent_df: pd.DataFrame,
        child_key: str,
        parent_key: str,
        parent_name: str,
    ) -> pd.DataFrame:
        """
        Generate aggregation features from a parent table.

        Each numeric column of ``parent_df`` (other than ``parent_key``) is
        aggregated per ``parent_key`` value with every configured aggregation
        that is listed in ``AGGREGATION_FUNCTIONS``. The aggregate is then
        looked up for each row of ``child_df`` through ``child_key``.

        Returns
        -------
        DataFrame
            Indexed like ``child_df`` with one ``{parent_name}_{col}_{agg}``
            column per combination. Empty when ``child_key`` is not a column
            of ``child_df`` or when there is nothing to aggregate.
        """
        features = pd.DataFrame(index=child_df.index)

        # Without the key column nothing can be mapped back, so skip the
        # group-by work altogether.
        if child_key not in child_df.columns:
            return features

        # Get numeric columns from parent
        numeric_cols = [
            c for c in parent_df.select_dtypes(include=[np.number]).columns if c != parent_key
        ]
        agg_names = [
            name for name in self.config.aggregation_functions if name in self.AGGREGATION_FUNCTIONS
        ]
        if not numeric_cols or not agg_names:
            return features

        child_keys = child_df[child_key]
        # Build the grouping once instead of once per (column, aggregation).
        grouped = parent_df.groupby(parent_key)

        for col in numeric_cols:
            col_groups = grouped[col]
            for agg_name in agg_names:
                # Group by parent key and aggregate, then map to child table
                agg_values = col_groups.agg(agg_name).to_dict()
                features[f"{parent_name}_{col}_{agg_name}"] = child_keys.map(agg_values)

        return features

    def _add_self_aggregations(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add aggregations within the same table (e.g., by category columns).

        For at most the first three object/category columns and the first
        five numeric columns of ``df``, adds the per-group ``mean`` and
        ``count`` of the numeric column as ``{num}_by_{cat}_{agg}``.

        Returns
        -------
        DataFrame
            A copy of ``df`` with the extra columns appended.
        """
        result = df.copy()

        # Find categorical columns that could be used for grouping; the
        # slices limit the number of generated columns.
        cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()[:3]
        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()[:5]
        if not num_cols:
            return result

        for cat_col in cat_cols:
            grouped = df.groupby(cat_col)  # one grouping per categorical column
            for num_col in num_cols:
                for agg_name in ("mean", "count"):  # Limited aggregations for self
                    if agg_name not in self.AGGREGATION_FUNCTIONS:
                        continue
                    feature_name = f"{num_col}_by_{cat_col}_{agg_name}"
                    result[feature_name] = grouped[num_col].transform(agg_name)

        return result

    def get_feature_set(self) -> FeatureSet:
        """
        Get the feature set with metadata.

        NOTE(review): the code in this class creates the ``FeatureSet`` but
        never adds entries to it, so it appears to stay empty - confirm
        whether metadata is expected to be filled elsewhere.
        """
        return self._feature_set
|
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
"""Tabular feature engineering engine.
|
|
2
|
+
|
|
3
|
+
Generates polynomial features, interaction terms, and mathematical transformations.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from itertools import combinations
|
|
7
|
+
from typing import Optional, Union
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from pydantic import Field
|
|
12
|
+
|
|
13
|
+
from featcopilot.core.base import BaseEngine, EngineConfig
|
|
14
|
+
from featcopilot.core.feature import Feature, FeatureOrigin, FeatureSet, FeatureType
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TabularEngineConfig(EngineConfig):
    """Settings for the tabular feature engine.

    Extends ``EngineConfig`` with the options that control polynomial,
    interaction and mathematical-transformation features.
    """

    # Identifier of the engine this configuration belongs to.
    name: str = "TabularEngine"

    # Highest power generated, validated to lie in the closed range [1, 4].
    polynomial_degree: int = Field(
        2,
        description="Max polynomial degree",
        ge=1,
        le=4,
    )

    # When True, per-column powers are skipped.
    interaction_only: bool = Field(
        False,
        description="Only interaction terms, no powers",
    )

    include_bias: bool = Field(
        False,
        description="Include bias/intercept term",
    )

    # Names of the transformations to apply. A factory is used so that every
    # config instance gets its own list object.
    include_transforms: list[str] = Field(
        description="Mathematical transformations to apply",
        default_factory=lambda: ["log", "sqrt", "square"],
    )

    numeric_only: bool = Field(
        True,
        description="Only process numeric columns",
    )

    # Numeric columns with fewer distinct values than this are not used.
    min_unique_values: int = Field(
        5,
        description="Min unique values for continuous",
    )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class TabularEngine(BaseEngine):
    """
    Tabular feature engineering engine.

    Generates, in this order:
    - Polynomial features (x^2, x^3, etc.)
    - Interaction features (x1 * x2)
    - Mathematical transformations (log, sqrt, etc.)
    - Ratio features (x1 / x2)
    - Difference features (x1 - x2)

    Parameters
    ----------
    polynomial_degree : int, default=2
        Maximum degree for polynomial features
    interaction_only : bool, default=False
        If True, only generate interaction terms, not polynomial powers
    include_transforms : list, default=['log', 'sqrt', 'square']
        Mathematical transformations to apply. ``None`` selects the default;
        an explicitly empty list disables the transformations.
    max_features : int, optional
        Maximum number of features to generate. ``None`` (or 0) means no
        limit. The limit applies to the planned feature metadata as well as
        to the columns produced by ``transform``.
    verbose : bool, default=False
        Print progress messages.
    **kwargs
        Forwarded unchanged to ``TabularEngineConfig``.

    Examples
    --------
    >>> engine = TabularEngine(polynomial_degree=2, include_transforms=['log', 'sqrt'])
    >>> X_transformed = engine.fit_transform(X)
    """

    # Available transformations: name -> (column suffix, function).
    # NOTE: "square" computes x**2, the same values as the degree-2
    # polynomial feature; both columns are produced when both are enabled.
    TRANSFORMATIONS = {
        "log": ("log1p", lambda x: np.log1p(np.abs(x))),
        "log10": ("log10", lambda x: np.log10(np.abs(x) + 1)),
        "sqrt": ("sqrt", lambda x: np.sqrt(np.abs(x))),
        "square": ("sq", lambda x: x**2),
        "cube": ("cb", lambda x: x**3),
        "reciprocal": ("recip", lambda x: 1 / (x + 1e-8)),
        "exp": ("exp", lambda x: np.exp(np.clip(x, -50, 50))),
        "tanh": ("tanh", lambda x: np.tanh(x)),
        "sin": ("sin", lambda x: np.sin(x)),
        "cos": ("cos", lambda x: np.cos(x)),
    }

    def __init__(
        self,
        polynomial_degree: int = 2,
        interaction_only: bool = False,
        include_transforms: Optional[list[str]] = None,
        max_features: Optional[int] = None,
        verbose: bool = False,
        **kwargs,
    ):
        # Test against None rather than truthiness so that ``[]`` really
        # disables the transformations instead of re-enabling the defaults.
        if include_transforms is None:
            include_transforms = ["log", "sqrt", "square"]

        config = TabularEngineConfig(
            polynomial_degree=polynomial_degree,
            interaction_only=interaction_only,
            include_transforms=include_transforms,
            max_features=max_features,
            verbose=verbose,
            **kwargs,
        )
        super().__init__(config=config)
        self.config: TabularEngineConfig = config
        self._numeric_columns: list[str] = []
        self._feature_set = FeatureSet()

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        **kwargs,
    ) -> "TabularEngine":
        """
        Fit the engine to identify numeric columns and plan features.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray, optional
            Target variable (unused, for API compatibility)

        Returns
        -------
        self : TabularEngine
        """
        X = self._validate_input(X)

        # Identify numeric columns, keeping only those with enough distinct
        # values (``min_unique_values``).
        numeric = X.select_dtypes(include=[np.number]).columns.tolist()
        self._numeric_columns = [
            col for col in numeric if X[col].nunique() >= self.config.min_unique_values
        ]

        if self.config.verbose:
            print(f"TabularEngine: Found {len(self._numeric_columns)} numeric columns")

        # Plan features to generate
        self._plan_features(X)
        self._is_fitted = True

        return self

    def _iter_feature_specs(self):
        """
        Yield ``(name, operation, operands)`` for every candidate feature.

        This is the single definition of which features exist and in which
        order; both planning and generation consume it so they cannot drift
        apart. ``operation`` is one of ``"power"``, ``"multiply"``,
        ``"transform"``, ``"divide"`` or ``"subtract"``.
        """
        cols = self._numeric_columns

        # 1. Polynomial features (powers)
        if not self.config.interaction_only:
            for col in cols:
                for degree in range(2, self.config.polynomial_degree + 1):
                    yield f"{col}_pow{degree}", "power", (col, degree)

        # 2. Interaction features (pairwise products)
        for col1, col2 in combinations(cols, 2):
            yield f"{col1}_x_{col2}", "multiply", (col1, col2)

        # 3. Mathematical transformations (unknown names are ignored)
        for col in cols:
            for transform_name in self.config.include_transforms:
                if transform_name in self.TRANSFORMATIONS:
                    suffix, _ = self.TRANSFORMATIONS[transform_name]
                    yield f"{col}_{suffix}", "transform", (col, transform_name)

        # 4. Ratio features
        for col1, col2 in combinations(cols, 2):
            yield f"{col1}_div_{col2}", "divide", (col1, col2)

        # 5. Difference features
        for col1, col2 in combinations(cols, 2):
            yield f"{col1}_minus_{col2}", "subtract", (col1, col2)

    def _limited_feature_specs(self):
        """Yield feature specs, stopping once ``max_features`` have been produced."""
        limit = self.config.max_features
        for produced, spec in enumerate(self._iter_feature_specs()):
            # A falsy limit (None or 0) means "unlimited".
            if limit and produced >= limit:
                break
            yield spec

    def _describe_feature(self, name: str, operation: str, operands: tuple) -> Feature:
        """Build the ``Feature`` metadata record for one feature spec."""
        if operation == "power":
            col, degree = operands
            return Feature(
                name=name,
                dtype=FeatureType.NUMERIC,
                origin=FeatureOrigin.POLYNOMIAL,
                source_columns=[col],
                transformation=f"power_{degree}",
                explanation=f"{col} raised to power {degree}",
                code=f"result = df['{col}'] ** {degree}",
            )

        if operation == "transform":
            col, transform_name = operands
            return Feature(
                name=name,
                dtype=FeatureType.NUMERIC,
                origin=FeatureOrigin.POLYNOMIAL,
                source_columns=[col],
                transformation=transform_name,
                explanation=f"{transform_name} transformation of {col}",
            )

        # Remaining operations are the pairwise ones.
        col1, col2 = operands
        explanation, code = {
            "multiply": (
                f"Product of {col1} and {col2}",
                f"result = df['{col1}'] * df['{col2}']",
            ),
            "divide": (
                f"Ratio of {col1} to {col2}",
                f"result = df['{col1}'] / (df['{col2}'] + 1e-8)",
            ),
            "subtract": (
                f"Difference between {col1} and {col2}",
                f"result = df['{col1}'] - df['{col2}']",
            ),
        }[operation]
        return Feature(
            name=name,
            dtype=FeatureType.NUMERIC,
            origin=FeatureOrigin.INTERACTION,
            source_columns=[col1, col2],
            transformation=operation,
            explanation=explanation,
            code=code,
        )

    def _compute_feature(self, X: pd.DataFrame, operation: str, operands: tuple) -> pd.Series:
        """Compute the values of one feature spec from the columns of ``X``."""
        if operation == "power":
            col, degree = operands
            return X[col] ** degree

        if operation == "transform":
            col, transform_name = operands
            _, func = self.TRANSFORMATIONS[transform_name]
            return func(X[col])

        col1, col2 = operands
        if operation == "multiply":
            return X[col1] * X[col2]
        if operation == "divide":
            # The small constant keeps an exact zero denominator from dividing by zero.
            return X[col1] / (X[col2] + 1e-8)
        if operation == "subtract":
            return X[col1] - X[col2]
        raise ValueError(f"Unknown feature operation: {operation}")

    def _plan_features(self, X: pd.DataFrame) -> None:
        """
        Plan which features to generate.

        Rebuilds ``self._feature_set`` with one metadata record per feature
        that ``transform`` will produce, honouring ``max_features``. ``X`` is
        accepted for interface compatibility and is not read.
        """
        self._feature_set = FeatureSet()

        for name, operation, operands in self._limited_feature_specs():
            self._feature_set.add(self._describe_feature(name, operation, operands))

        if self.config.verbose:
            print(f"TabularEngine: Planned {len(self._feature_set)} features")

    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """
        Generate new features from input data.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features

        Returns
        -------
        X_transformed : DataFrame
            DataFrame with original and generated features. Infinite values
            anywhere in the frame are replaced by NaN.

        Raises
        ------
        RuntimeError
            If the engine has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("Engine must be fitted before transform")

        X = self._validate_input(X)
        result = X.copy()

        # Every feature is computed from the untouched input ``X``. A repeated
        # name overwrites the earlier value but keeps its position.
        generated: dict[str, pd.Series] = {}
        for name, operation, operands in self._limited_feature_specs():
            generated[name] = self._compute_feature(X, operation, operands)

        if generated:
            names = list(generated)
            # Attach all new columns in one concat instead of inserting them
            # one at a time, which fragments the frame and makes pandas emit
            # a PerformanceWarning for wide inputs.
            new_frame = pd.concat([generated[n] for n in names], axis=1, keys=names)

            # A generated name that already exists in the input replaces that
            # column in place.
            clashing = [n for n in names if n in result.columns]
            for n in clashing:
                result[n] = new_frame[n]
            result = pd.concat([result, new_frame.drop(columns=clashing)], axis=1)

        # Handle infinities and NaNs
        result = result.replace([np.inf, -np.inf], np.nan)

        self._feature_names = [c for c in result.columns if c not in X.columns]

        if self.config.verbose:
            print(f"TabularEngine: Generated {len(self._feature_names)} features")

        return result

    def get_feature_set(self) -> FeatureSet:
        """Get the feature set with metadata, as planned by the last ``fit``."""
        return self._feature_set
|