featcopilot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,256 @@
1
+ """Relational feature engineering engine.
2
+
3
+ Generates aggregation features from related tables (inspired by Featuretools).
4
+ """
5
+
6
+ from typing import Optional, Union
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ from pydantic import Field
11
+
12
+ from featcopilot.core.base import BaseEngine, EngineConfig
13
+ from featcopilot.core.feature import FeatureSet
14
+
15
+
16
class RelationalEngineConfig(EngineConfig):
    """Configuration for relational feature engine.

    Extends ``EngineConfig`` with the options specific to the relational
    engine. Each option is declared as a pydantic ``Field`` so that bounds
    and descriptions travel with the value.
    """

    # Identifier of the engine this configuration belongs to.
    name: str = "RelationalEngine"

    # Names of the aggregations to compute; a new list is created for every
    # config instance so instances never share the same list object.
    aggregation_functions: list[str] = Field(
        description="Aggregation functions to apply",
        default_factory=lambda: ["mean", "sum", "min", "max", "count", "std"],
    )

    # Synthesis depth, constrained to the inclusive range 1..4.
    max_depth: int = Field(
        2,
        ge=1,
        le=4,
        description="Max depth for feature synthesis",
    )

    # Switch for time-based aggregations.
    include_time_based: bool = Field(
        True,
        description="Include time-based aggregations",
    )
26
+
27
+
28
class RelationalEngine(BaseEngine):
    """
    Relational feature engineering engine.

    Generates aggregation features from related tables, in the spirit of
    Featuretools' Deep Feature Synthesis. For every registered relationship
    the numeric columns of the related table are aggregated per key and the
    aggregated values are mapped onto the rows of the primary table. In
    addition, group-by aggregations within the primary table itself are
    appended.

    Parameters
    ----------
    aggregation_functions : list, optional
        Aggregation functions to use (mean, sum, min, max, count, std, etc.).
        ``None`` selects ``["mean", "sum", "count", "max", "min"]``. An
        explicitly passed empty list is honoured and disables the
        relationship aggregations.
    max_depth : int, default=2
        Maximum depth for feature synthesis. Stored on the config; the code
        in this class performs a single level of aggregation and does not
        read it.
    max_features : int, optional
        Forwarded to the config; not consulted by the code in this class.
    verbose : bool, default=False
        If True, progress messages are printed from ``fit`` and ``transform``.
    **kwargs
        Forwarded unchanged to ``RelationalEngineConfig``.

    Examples
    --------
    >>> engine = RelationalEngine()
    >>> engine.add_relationship('orders', 'customers', 'customer_id')
    >>> X_features = engine.fit_transform(orders_df, related_tables={'customers': customers_df})
    """

    # Whitelist of supported aggregation names. Within this class only the
    # keys are consulted: a configured name is used when it appears here and
    # is then handed to pandas as a string. The callables are not invoked by
    # the code in this class.
    AGGREGATION_FUNCTIONS = {
        "mean": np.mean,
        "sum": np.sum,
        "min": np.min,
        "max": np.max,
        "count": len,
        "std": np.std,
        "median": np.median,
        "first": lambda x: x.iloc[0] if len(x) > 0 else np.nan,
        "last": lambda x: x.iloc[-1] if len(x) > 0 else np.nan,
        "nunique": lambda x: len(set(x)),
    }

    def __init__(
        self,
        aggregation_functions: Optional[list[str]] = None,
        max_depth: int = 2,
        max_features: Optional[int] = None,
        verbose: bool = False,
        **kwargs,
    ):
        # Only substitute the defaults when the caller passed nothing at all;
        # ``or`` would also discard a deliberately empty list.
        if aggregation_functions is None:
            aggregation_functions = ["mean", "sum", "count", "max", "min"]

        config = RelationalEngineConfig(
            aggregation_functions=aggregation_functions,
            max_depth=max_depth,
            max_features=max_features,
            verbose=verbose,
            **kwargs,
        )
        super().__init__(config=config)
        self.config: RelationalEngineConfig = config
        self._relationships: list[dict[str, str]] = []
        self._feature_set = FeatureSet()

    def add_relationship(
        self, child_table: str, parent_table: str, key_column: str, parent_key: Optional[str] = None
    ) -> "RelationalEngine":
        """
        Define a relationship between tables.

        Parameters
        ----------
        child_table : str
            Name of child table (many side)
        parent_table : str
            Name of parent table (one side)
        key_column : str
            Foreign key column in child table
        parent_key : str, optional
            Primary key column in parent table (defaults to key_column)

        Returns
        -------
        self : RelationalEngine
            The engine itself, so calls can be chained.
        """
        self._relationships.append(
            {
                "child": child_table,
                "parent": parent_table,
                "child_key": key_column,
                "parent_key": parent_key or key_column,
            }
        )
        return self

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        related_tables: Optional[dict[str, pd.DataFrame]] = None,
        **kwargs,
    ) -> "RelationalEngine":
        """
        Fit the engine to the data.

        Fitting only records state: the related tables (used as the fallback
        in ``transform``) and the primary table's column names.

        Parameters
        ----------
        X : DataFrame
            Primary table
        y : Series, optional
            Target variable (not used by this method)
        related_tables : dict, optional
            Dictionary of related tables {name: DataFrame}

        Returns
        -------
        self : RelationalEngine
        """
        # NOTE(review): _validate_input comes from BaseEngine; presumably it
        # normalises the input to a DataFrame -- confirm against the base class.
        X = self._validate_input(X)
        self._related_tables = related_tables or {}
        self._primary_columns = X.columns.tolist()

        if self.config.verbose:
            print(f"RelationalEngine: {len(self._relationships)} relationships defined")

        self._is_fitted = True
        return self

    def transform(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        related_tables: Optional[dict[str, pd.DataFrame]] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Generate aggregation features.

        Parameters
        ----------
        X : DataFrame
            Primary table
        related_tables : dict, optional
            Dictionary of related tables. When omitted (or empty) the tables
            given to ``fit`` are used.

        Returns
        -------
        X_features : DataFrame
            The input columns followed by the aggregated features.

        Raises
        ------
        RuntimeError
            If the engine has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("Engine must be fitted before transform")

        X = self._validate_input(X)
        related_tables = related_tables or self._related_tables
        result = X.copy()

        # Generate features from relationships; a relationship whose related
        # table was not supplied is skipped rather than treated as an error.
        for rel in self._relationships:
            if rel["parent"] not in related_tables:
                continue

            features = self._aggregate_from_relationship(
                X,
                related_tables[rel["parent"]],
                rel["child_key"],
                rel["parent_key"],
                rel["parent"],
            )
            result = pd.concat([result, features], axis=1)

        # Generate group-by aggregations within the primary table
        result = self._add_self_aggregations(result)

        self._feature_names = [c for c in result.columns if c not in X.columns]

        if self.config.verbose:
            print(f"RelationalEngine: Generated {len(self._feature_names)} features")

        return result

    def _aggregate_from_relationship(
        self,
        child_df: pd.DataFrame,
        parent_df: pd.DataFrame,
        child_key: str,
        parent_key: str,
        parent_name: str,
    ) -> pd.DataFrame:
        """
        Generate aggregation features from a related table.

        Each numeric column of ``parent_df`` (except the key itself) is
        aggregated per ``parent_key`` with every configured aggregation that
        is listed in ``AGGREGATION_FUNCTIONS``. The aggregated values are then
        looked up for each row of ``child_df`` through ``child_key``.

        Parameters
        ----------
        child_df : DataFrame
            Table that receives the features; its index is used for the result.
        parent_df : DataFrame
            Table whose numeric columns are aggregated.
        child_key : str
            Column of ``child_df`` holding the lookup keys.
        parent_key : str
            Column of ``parent_df`` to group by.
        parent_name : str
            Prefix for the generated feature names
            (``{parent_name}_{column}_{aggregation}``).

        Returns
        -------
        features : DataFrame
            One column per (numeric column, aggregation) pair, indexed like
            ``child_df``. Empty (index only) when ``child_key`` is not a
            column of ``child_df`` or when there is nothing to aggregate.
        """
        # Without the lookup column no feature can be mapped, so bail out
        # before doing any aggregation work.
        if child_key not in child_df.columns:
            return pd.DataFrame(index=child_df.index)

        # Get numeric columns from parent, excluding the grouping key
        numeric_cols = [
            c for c in parent_df.select_dtypes(include=[np.number]).columns if c != parent_key
        ]
        # Unknown aggregation names are ignored.
        agg_names = [
            name for name in self.config.aggregation_functions if name in self.AGGREGATION_FUNCTIONS
        ]
        if not numeric_cols or not agg_names:
            return pd.DataFrame(index=child_df.index)

        # Build the grouping once and reuse it for every column/aggregation.
        grouped = parent_df.groupby(parent_key)
        lookup_keys = child_df[child_key]

        # Collect the columns first and assemble the frame in one step
        # instead of inserting them one by one.
        columns: dict[str, pd.Series] = {}
        for col in numeric_cols:
            for agg_name in agg_names:
                agg_values = grouped[col].agg(agg_name).to_dict()
                # Keys absent from the aggregated table map to NaN.
                columns[f"{parent_name}_{col}_{agg_name}"] = lookup_keys.map(agg_values)

        return pd.DataFrame(columns, index=child_df.index)

    def _add_self_aggregations(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add aggregations within the same table (e.g., by category columns).

        For each of the first three object/category columns and each of the
        first five numeric columns, the per-group ``mean`` and ``count`` of
        the numeric column are broadcast back to every row as
        ``{numeric}_by_{categorical}_{aggregation}``.

        Parameters
        ----------
        df : DataFrame
            Table to extend; it is not modified.

        Returns
        -------
        result : DataFrame
            Copy of ``df`` with the group-level columns appended.
        """
        result = df.copy()

        # Find categorical columns that could be used for grouping
        cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

        # Limit to avoid explosion
        cat_cols = cat_cols[:3]
        num_cols = num_cols[:5]

        # Limited aggregations for self; still filtered through the whitelist
        # so a subclass that overrides AGGREGATION_FUNCTIONS is respected.
        agg_names = [name for name in ("mean", "count") if name in self.AGGREGATION_FUNCTIONS]

        for cat_col in cat_cols:
            for num_col in num_cols:
                for agg_name in agg_names:
                    feature_name = f"{num_col}_by_{cat_col}_{agg_name}"
                    result[feature_name] = df.groupby(cat_col)[num_col].transform(agg_name)

        return result

    def get_feature_set(self) -> FeatureSet:
        """Get the feature set with metadata."""
        # NOTE(review): nothing in this class adds entries to the feature
        # set, so it is returned as created in __init__.
        return self._feature_set
@@ -0,0 +1,293 @@
1
+ """Tabular feature engineering engine.
2
+
3
+ Generates polynomial features, interaction terms, and mathematical transformations.
4
+ """
5
+
6
+ from itertools import combinations
7
+ from typing import Optional, Union
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ from pydantic import Field
12
+
13
+ from featcopilot.core.base import BaseEngine, EngineConfig
14
+ from featcopilot.core.feature import Feature, FeatureOrigin, FeatureSet, FeatureType
15
+
16
+
17
class TabularEngineConfig(EngineConfig):
    """Configuration for tabular feature engine.

    Extends ``EngineConfig`` with the options specific to the tabular
    engine. Each option is declared as a pydantic ``Field`` carrying its
    default, bounds (where given) and description.
    """

    # Identifier of the engine this configuration belongs to.
    name: str = "TabularEngine"

    # Highest power to generate, constrained to the inclusive range 1..4.
    polynomial_degree: int = Field(
        2,
        ge=1,
        le=4,
        description="Max polynomial degree",
    )

    # When True, power features are skipped.
    interaction_only: bool = Field(
        False,
        description="Only interaction terms, no powers",
    )

    include_bias: bool = Field(
        False,
        description="Include bias/intercept term",
    )

    # Names of the transformations to apply; a new list is created for every
    # config instance so instances never share the same list object.
    include_transforms: list[str] = Field(
        description="Mathematical transformations to apply",
        default_factory=lambda: ["log", "sqrt", "square"],
    )

    numeric_only: bool = Field(
        True,
        description="Only process numeric columns",
    )

    # Columns with fewer distinct values than this are not used as inputs.
    min_unique_values: int = Field(
        5,
        description="Min unique values for continuous",
    )
30
+
31
+
32
class TabularEngine(BaseEngine):
    """
    Tabular feature engineering engine.

    Generates, in this order:
    - Polynomial features (x^2, x^3, etc.)
    - Interaction features (x1 * x2)
    - Mathematical transformations (log, sqrt, etc.)
    - Ratio features (x1 / x2)
    - Difference features (x1 - x2)

    Parameters
    ----------
    polynomial_degree : int, default=2
        Maximum degree for polynomial features
    interaction_only : bool, default=False
        If True, only generate interaction terms, not polynomial powers
    include_transforms : list, default=['log', 'sqrt', 'square']
        Mathematical transformations to apply. ``None`` selects the default;
        an explicitly passed empty list is honoured and disables the
        transformations.
    max_features : int, optional
        Maximum number of features to generate. ``None`` (or 0) means no
        limit; otherwise generation stops after that many features, counted
        in the order listed above.
    verbose : bool, default=False
        If True, progress messages are printed from ``fit`` and ``transform``.
    **kwargs
        Forwarded unchanged to ``TabularEngineConfig``.

    Examples
    --------
    >>> engine = TabularEngine(polynomial_degree=2, include_transforms=['log', 'sqrt'])
    >>> X_transformed = engine.fit_transform(X)
    """

    # Available transformations: name -> (column-name suffix, function).
    # The log/sqrt variants operate on the absolute value, ``reciprocal``
    # offsets the denominator by 1e-8 and ``exp`` clips its input to
    # [-50, 50] before exponentiating.
    TRANSFORMATIONS = {
        "log": ("log1p", lambda x: np.log1p(np.abs(x))),
        "log10": ("log10", lambda x: np.log10(np.abs(x) + 1)),
        "sqrt": ("sqrt", lambda x: np.sqrt(np.abs(x))),
        "square": ("sq", lambda x: x**2),
        "cube": ("cb", lambda x: x**3),
        "reciprocal": ("recip", lambda x: 1 / (x + 1e-8)),
        "exp": ("exp", lambda x: np.exp(np.clip(x, -50, 50))),
        "tanh": ("tanh", lambda x: np.tanh(x)),
        "sin": ("sin", lambda x: np.sin(x)),
        "cos": ("cos", lambda x: np.cos(x)),
    }

    def __init__(
        self,
        polynomial_degree: int = 2,
        interaction_only: bool = False,
        include_transforms: Optional[list[str]] = None,
        max_features: Optional[int] = None,
        verbose: bool = False,
        **kwargs,
    ):
        # Only substitute the defaults when the caller passed nothing at all;
        # ``or`` would make it impossible to switch transformations off
        # with an empty list.
        if include_transforms is None:
            include_transforms = ["log", "sqrt", "square"]

        config = TabularEngineConfig(
            polynomial_degree=polynomial_degree,
            interaction_only=interaction_only,
            include_transforms=include_transforms,
            max_features=max_features,
            verbose=verbose,
            **kwargs,
        )
        super().__init__(config=config)
        self.config: TabularEngineConfig = config
        self._numeric_columns: list[str] = []
        self._feature_set = FeatureSet()

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        **kwargs,
    ) -> "TabularEngine":
        """
        Fit the engine to identify numeric columns and plan features.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray, optional
            Target variable (unused, for API compatibility)

        Returns
        -------
        self : TabularEngine
        """
        # NOTE(review): _validate_input comes from BaseEngine; presumably it
        # normalises the input to a DataFrame -- confirm against the base class.
        X = self._validate_input(X)

        # Identify numeric columns and keep only those with enough distinct
        # values to be worth transforming.
        min_unique = self.config.min_unique_values
        self._numeric_columns = [
            col
            for col in X.select_dtypes(include=[np.number]).columns.tolist()
            if X[col].nunique() >= min_unique
        ]

        if self.config.verbose:
            print(f"TabularEngine: Found {len(self._numeric_columns)} numeric columns")

        # Plan features to generate
        self._plan_features(X)
        self._is_fitted = True

        return self

    def _plan_features(self, X: pd.DataFrame) -> None:
        """
        Plan which features to generate.

        Rebuilds ``self._feature_set`` with one ``Feature`` description per
        feature derivable from the selected numeric columns.

        NOTE(review): the plan is not truncated by ``max_features``, so it
        can describe more features than ``transform`` actually produces.

        Parameters
        ----------
        X : DataFrame
            Input features (not read here; the columns selected in ``fit``
            are used).
        """
        self._feature_set = FeatureSet()
        cols = self._numeric_columns

        # 1. Polynomial features (powers)
        if not self.config.interaction_only:
            for col in cols:
                for degree in range(2, self.config.polynomial_degree + 1):
                    feature = Feature(
                        name=f"{col}_pow{degree}",
                        dtype=FeatureType.NUMERIC,
                        origin=FeatureOrigin.POLYNOMIAL,
                        source_columns=[col],
                        transformation=f"power_{degree}",
                        explanation=f"{col} raised to power {degree}",
                        code=f"result = df['{col}'] ** {degree}",
                    )
                    self._feature_set.add(feature)

        # 2. Interaction features (pairwise products)
        for col1, col2 in combinations(cols, 2):
            feature = Feature(
                name=f"{col1}_x_{col2}",
                dtype=FeatureType.NUMERIC,
                origin=FeatureOrigin.INTERACTION,
                source_columns=[col1, col2],
                transformation="multiply",
                explanation=f"Product of {col1} and {col2}",
                code=f"result = df['{col1}'] * df['{col2}']",
            )
            self._feature_set.add(feature)

        # 3. Mathematical transformations (unknown names are ignored)
        for col in cols:
            for transform_name in self.config.include_transforms:
                if transform_name in self.TRANSFORMATIONS:
                    suffix, _ = self.TRANSFORMATIONS[transform_name]
                    feature = Feature(
                        name=f"{col}_{suffix}",
                        dtype=FeatureType.NUMERIC,
                        origin=FeatureOrigin.POLYNOMIAL,
                        source_columns=[col],
                        transformation=transform_name,
                        explanation=f"{transform_name} transformation of {col}",
                    )
                    self._feature_set.add(feature)

        # 4. Ratio features (every column pair; the denominator is offset by
        #    1e-8, no sign filtering is applied)
        for col1, col2 in combinations(cols, 2):
            feature = Feature(
                name=f"{col1}_div_{col2}",
                dtype=FeatureType.NUMERIC,
                origin=FeatureOrigin.INTERACTION,
                source_columns=[col1, col2],
                transformation="divide",
                explanation=f"Ratio of {col1} to {col2}",
                code=f"result = df['{col1}'] / (df['{col2}'] + 1e-8)",
            )
            self._feature_set.add(feature)

        # 5. Difference features
        for col1, col2 in combinations(cols, 2):
            feature = Feature(
                name=f"{col1}_minus_{col2}",
                dtype=FeatureType.NUMERIC,
                origin=FeatureOrigin.INTERACTION,
                source_columns=[col1, col2],
                transformation="subtract",
                explanation=f"Difference between {col1} and {col2}",
                code=f"result = df['{col1}'] - df['{col2}']",
            )
            self._feature_set.add(feature)

        if self.config.verbose:
            print(f"TabularEngine: Planned {len(self._feature_set)} features")

    def _iter_feature_recipes(self):
        """
        Yield ``(name, compute)`` pairs in generation order.

        ``compute`` takes the input DataFrame and returns the values of the
        feature called ``name``. Nothing is computed until ``compute`` is
        called, so a consumer can stop early without paying for features it
        does not keep. Loop variables are bound as lambda defaults so each
        recipe keeps its own columns.
        """
        cols = self._numeric_columns

        # Polynomial features
        if not self.config.interaction_only:
            for col in cols:
                for degree in range(2, self.config.polynomial_degree + 1):
                    yield f"{col}_pow{degree}", lambda df, c=col, d=degree: df[c] ** d

        # Interactions
        for col1, col2 in combinations(cols, 2):
            yield f"{col1}_x_{col2}", lambda df, a=col1, b=col2: df[a] * df[b]

        # Transformations (unknown names are ignored)
        for col in cols:
            for transform_name in self.config.include_transforms:
                if transform_name in self.TRANSFORMATIONS:
                    suffix, func = self.TRANSFORMATIONS[transform_name]
                    yield f"{col}_{suffix}", lambda df, c=col, f=func: f(df[c])

        # Ratios
        for col1, col2 in combinations(cols, 2):
            yield f"{col1}_div_{col2}", lambda df, a=col1, b=col2: df[a] / (df[b] + 1e-8)

        # Differences
        for col1, col2 in combinations(cols, 2):
            yield f"{col1}_minus_{col2}", lambda df, a=col1, b=col2: df[a] - df[b]

    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """
        Generate new features from input data.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features

        Returns
        -------
        X_transformed : DataFrame
            DataFrame with original and generated features. Infinite values
            anywhere in the frame are replaced by NaN.

        Raises
        ------
        RuntimeError
            If the engine has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("Engine must be fitted before transform")

        X = self._validate_input(X)
        max_features = self.config.max_features

        # Compute features in order until the optional limit is reached.
        # A falsy limit (None or 0) means "no limit".
        generated: dict[str, pd.Series] = {}
        feature_count = 0
        for name, compute in self._iter_feature_recipes():
            if max_features and feature_count >= max_features:
                break
            generated[name] = compute(X)
            feature_count += 1

        result = X.copy()

        # A generated name that already exists in the input replaces that
        # column in place.
        for name in [n for n in generated if n in result.columns]:
            result[name] = generated.pop(name)

        # Append all remaining features in a single concat; inserting them
        # one at a time fragments the frame when many columns are generated.
        if generated:
            result = pd.concat([result, pd.DataFrame(generated, index=X.index)], axis=1)

        # Handle infinities and NaNs
        result = result.replace([np.inf, -np.inf], np.nan)

        self._feature_names = [c for c in result.columns if c not in X.columns]

        if self.config.verbose:
            print(f"TabularEngine: Generated {len(self._feature_names)} features")

        return result

    def get_feature_set(self) -> FeatureSet:
        """Get the feature set with metadata."""
        return self._feature_set