featcopilot 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
featcopilot/__init__.py CHANGED
@@ -5,11 +5,16 @@ A unified feature engineering framework combining traditional approaches
 with novel LLM-powered capabilities via GitHub Copilot SDK.
 """
 
-__version__ = "0.1.0"
+from importlib.metadata import version
+
+__version__ = version("featcopilot")
 __author__ = "FeatCopilot Contributors"
 
 from featcopilot.core.base import BaseEngine, BaseSelector
 from featcopilot.core.feature import Feature, FeatureSet
+from featcopilot.core.transform_rule import TransformRule
+from featcopilot.llm.transform_rule_generator import TransformRuleGenerator
+from featcopilot.stores.rule_store import TransformRuleStore
 from featcopilot.transformers.sklearn_compat import (
     AutoFeatureEngineer,
     FeatureEngineerTransformer,
@@ -21,6 +26,10 @@ __all__ = [
     "BaseSelector",
     "Feature",
     "FeatureSet",
+    # Transform Rules
+    "TransformRule",
+    "TransformRuleStore",
+    "TransformRuleGenerator",
     # Main API
     "AutoFeatureEngineer",
     "FeatureEngineerTransformer",
featcopilot/core/__init__.py CHANGED
@@ -3,6 +3,7 @@
 from featcopilot.core.base import BaseEngine, BaseSelector
 from featcopilot.core.feature import Feature, FeatureSet
 from featcopilot.core.registry import FeatureRegistry
+from featcopilot.core.transform_rule import TransformRule
 
 __all__ = [
     "BaseEngine",
@@ -10,4 +11,5 @@ __all__ = [
     "Feature",
     "FeatureSet",
     "FeatureRegistry",
+    "TransformRule",
 ]
featcopilot/core/feature.py CHANGED
@@ -7,6 +7,10 @@ from typing import Any, Optional
 import numpy as np
 import pandas as pd
 
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
 
 class FeatureType(Enum):
     """Types of features."""
@@ -220,5 +224,5 @@ class FeatureSet:
                 result[feature.name] = feature.compute(df)
             except Exception as e:
                 # Log warning but continue
-                print(f"Warning: Could not compute feature {feature.name}: {e}")
+                logger.warning(f"Could not compute feature {feature.name}: {e}")
         return result
featcopilot/core/transform_rule.py ADDED
@@ -0,0 +1,276 @@
+"""Transform rule model for reusable feature transformations.
+
+Defines TransformRule - a reusable transformation that can be created from
+natural language descriptions and applied across different datasets.
+"""
+
+import re
+import uuid
+from datetime import datetime, timezone
+from typing import Any, Optional
+
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel, Field
+
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class TransformRule(BaseModel):
+    """
+    A reusable feature transformation rule.
+
+    Transform rules capture feature engineering logic that can be generated
+    from natural language descriptions and reused across different datasets.
+
+    Parameters
+    ----------
+    id : str, optional
+        Unique identifier for the rule
+    name : str
+        Human-readable name for the rule
+    description : str
+        Natural language description of what the rule does
+    code : str
+        Python code that implements the transformation
+    input_columns : list[str]
+        Column names or patterns this rule expects as input
+    output_name : str, optional
+        Name for the output feature (default: derived from rule name)
+    output_type : str
+        Expected output data type ('numeric', 'categorical', 'boolean')
+    tags : list[str]
+        Tags for categorization and search
+    column_patterns : list[str]
+        Regex patterns for matching columns (e.g., 'price.*', '.*_amount')
+    usage_count : int
+        Number of times this rule has been applied
+    created_at : str
+        ISO timestamp of rule creation
+    metadata : dict
+        Additional metadata
+
+    Examples
+    --------
+    >>> rule = TransformRule(
+    ...     name="ratio_calculation",
+    ...     description="Calculate ratio of two numeric columns",
+    ...     code="result = df['{col1}'] / (df['{col2}'] + 1e-8)",
+    ...     input_columns=["col1", "col2"],
+    ...     tags=["ratio", "numeric"]
+    ... )
+    >>> result = rule.apply(df, column_mapping={"col1": "price", "col2": "quantity"})
+    """
+
+    id: str = Field(default_factory=lambda: str(uuid.uuid4())[:8], description="Unique rule identifier")
+    name: str = Field(description="Human-readable rule name")
+    description: str = Field(description="Natural language description of the transformation")
+    code: str = Field(description="Python code implementing the transformation")
+    input_columns: list[str] = Field(default_factory=list, description="Expected input column names or placeholders")
+    output_name: Optional[str] = Field(default=None, description="Output feature name")
+    output_type: str = Field(default="numeric", description="Output data type")
+    tags: list[str] = Field(default_factory=list, description="Tags for categorization")
+    column_patterns: list[str] = Field(default_factory=list, description="Regex patterns for column matching")
+    usage_count: int = Field(default=0, description="Number of times applied")
+    created_at: str = Field(
+        default_factory=lambda: datetime.now(timezone.utc).isoformat(), description="Creation timestamp"
+    )
+    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
+
+    def get_output_name(self, column_mapping: Optional[dict[str, str]] = None) -> str:
+        """
+        Get the output feature name.
+
+        Parameters
+        ----------
+        column_mapping : dict, optional
+            Mapping from placeholder columns to actual column names
+
+        Returns
+        -------
+        str
+            Output feature name
+        """
+        if self.output_name:
+            return self.output_name
+
+        # Generate name from input columns
+        if column_mapping and self.input_columns:
+            cols = [column_mapping.get(c, c) for c in self.input_columns[:2]]
+            return f"{'_'.join(cols)}_{self.name}"
+
+        return f"rule_{self.name}"
+
+    def matches_columns(self, columns: list[str]) -> tuple[bool, dict[str, str]]:
+        """
+        Check if this rule can be applied to the given columns.
+
+        Parameters
+        ----------
+        columns : list[str]
+            Available column names
+
+        Returns
+        -------
+        matches : bool
+            Whether the rule can be applied
+        mapping : dict
+            Suggested mapping from rule's input_columns to actual columns
+        """
+        if not self.input_columns:
+            return True, {}
+
+        mapping = {}
+
+        for input_col in self.input_columns:
+            # Try exact match first
+            if input_col in columns:
+                mapping[input_col] = input_col
+                continue
+
+            # Try pattern matching
+            matched = False
+            for pattern in self.column_patterns:
+                regex = re.compile(pattern, re.IGNORECASE)
+                for col in columns:
+                    if regex.match(col) and col not in mapping.values():
+                        mapping[input_col] = col
+                        matched = True
+                        break
+                if matched:
+                    break
+
+            # Try fuzzy matching by checking if input_col is substring
+            if not matched:
+                for col in columns:
+                    if input_col.lower() in col.lower() and col not in mapping.values():
+                        mapping[input_col] = col
+                        matched = True
+                        break
+
+            if not matched:
+                return False, {}
+
+        return len(mapping) == len(self.input_columns), mapping
+
+    def apply(
+        self,
+        df: pd.DataFrame,
+        column_mapping: Optional[dict[str, str]] = None,
+        validate: bool = True,
+    ) -> pd.Series:
+        """
+        Apply the transformation rule to a DataFrame.
+
+        Parameters
+        ----------
+        df : DataFrame
+            Input data
+        column_mapping : dict, optional
+            Mapping from rule's input_columns to actual column names
+        validate : bool, default=True
+            Whether to validate before execution
+
+        Returns
+        -------
+        Series
+            Transformed feature values
+
+        Raises
+        ------
+        ValueError
+            If required columns are missing or code execution fails
+        """
+        column_mapping = column_mapping or {}
+
+        # Prepare the code with actual column names
+        code = self._prepare_code(column_mapping)
+
+        if validate:
+            # Check required columns exist
+            for input_col in self.input_columns:
+                actual_col = column_mapping.get(input_col, input_col)
+                if actual_col not in df.columns:
+                    raise ValueError(f"Required column '{actual_col}' not found in DataFrame")
+
+        # Execute the code in a restricted environment
+        local_vars: dict[str, Any] = {"df": df, "np": np, "pd": pd}
+        try:
+            exec(self._get_safe_code(code), {"__builtins__": self._get_safe_builtins()}, local_vars)
+
+            if "result" not in local_vars:
+                raise ValueError("Code did not produce a 'result' variable")
+
+            result = local_vars["result"]
+
+            # Increment usage count
+            self.usage_count += 1
+
+            return result
+
+        except Exception as e:
+            logger.error(f"Failed to apply rule '{self.name}': {e}")
+            raise ValueError(f"Rule execution failed: {e}") from e
+
+    def _prepare_code(self, column_mapping: dict[str, str]) -> str:
+        """Substitute column placeholders with actual column names."""
+        code = self.code
+
+        # Replace {col} style placeholders
+        for placeholder, actual in column_mapping.items():
+            code = code.replace(f"{{{{ '{placeholder}' }}}}", f"'{actual}'")
+            code = code.replace(f"{{{placeholder}}}", actual)
+            code = code.replace(f"df['{placeholder}']", f"df['{actual}']")
+            code = code.replace(f'df["{placeholder}"]', f'df["{actual}"]')
+
+        return code
+
+    def _get_safe_code(self, code: str) -> str:
+        """Wrap code for safe execution."""
+        return code
+
+    def _get_safe_builtins(self) -> dict[str, Any]:
+        """Get restricted builtins for safe code execution."""
+        return {
+            "len": len,
+            "sum": sum,
+            "max": max,
+            "min": min,
+            "int": int,
+            "float": float,
+            "str": str,
+            "bool": bool,
+            "abs": abs,
+            "round": round,
+            "pow": pow,
+            "range": range,
+            "list": list,
+            "dict": dict,
+            "set": set,
+            "tuple": tuple,
+            "sorted": sorted,
+            "reversed": reversed,
+            "enumerate": enumerate,
+            "zip": zip,
+            "any": any,
+            "all": all,
+            "map": map,
+            "filter": filter,
+            "isinstance": isinstance,
+            "hasattr": hasattr,
+            "getattr": getattr,
+        }
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert rule to dictionary for serialization."""
+        return self.model_dump()
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "TransformRule":
+        """Create rule from dictionary."""
+        return cls(**data)
+
+    def __repr__(self) -> str:
+        return f"TransformRule(name='{self.name}', description='{self.description[:50]}...')"
@@ -11,6 +11,9 @@ from pydantic import Field
 
 from featcopilot.core.base import BaseEngine, EngineConfig
 from featcopilot.core.feature import FeatureSet
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
 
 
 class RelationalEngineConfig(EngineConfig):
@@ -141,7 +144,7 @@ class RelationalEngine(BaseEngine):
         self._primary_columns = X.columns.tolist()
 
         if self.config.verbose:
-            print(f"RelationalEngine: {len(self._relationships)} relationships defined")
+            logger.info(f"RelationalEngine: {len(self._relationships)} relationships defined")
 
         self._is_fitted = True
         return self
@@ -191,7 +194,7 @@ class RelationalEngine(BaseEngine):
         self._feature_names = [c for c in result.columns if c not in X.columns]
 
         if self.config.verbose:
-            print(f"RelationalEngine: Generated {len(self._feature_names)} features")
+            logger.info(f"RelationalEngine: Generated {len(self._feature_names)} features")
 
         return result
 
@@ -12,6 +12,9 @@ from pydantic import Field
 
 from featcopilot.core.base import BaseEngine, EngineConfig
 from featcopilot.core.feature import Feature, FeatureOrigin, FeatureSet, FeatureType
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
 
 
 class TabularEngineConfig(EngineConfig):
@@ -27,6 +30,16 @@ class TabularEngineConfig(EngineConfig):
     )
     numeric_only: bool = Field(default=True, description="Only process numeric columns")
     min_unique_values: int = Field(default=5, description="Min unique values for continuous")
+    # Categorical encoding settings
+    encode_categorical: bool = Field(default=True, description="Auto-encode categorical columns")
+    keep_original_categorical: bool = Field(
+        default=True, description="Keep original categorical columns (for models that handle them natively)"
+    )
+    onehot_ratio_threshold: float = Field(default=0.05, description="Max n_unique/n_rows ratio for one-hot encoding")
+    target_encode_ratio_threshold: float = Field(
+        default=0.5, description="Max n_unique/n_rows ratio for target encoding"
+    )
+    min_samples_per_category: int = Field(default=3, description="Min samples per category to include")
 
 
 class TabularEngine(BaseEngine):
@@ -78,6 +91,10 @@
         include_transforms: Optional[list[str]] = None,
         max_features: Optional[int] = None,
         verbose: bool = False,
+        encode_categorical: bool = True,
+        onehot_ratio_threshold: float = 0.05,
+        target_encode_ratio_threshold: float = 0.5,
+        min_samples_per_category: int = 3,
         **kwargs,
     ):
         config = TabularEngineConfig(
@@ -86,12 +103,22 @@
             include_transforms=include_transforms or ["log", "sqrt", "square"],
             max_features=max_features,
             verbose=verbose,
+            encode_categorical=encode_categorical,
+            onehot_ratio_threshold=onehot_ratio_threshold,
+            target_encode_ratio_threshold=target_encode_ratio_threshold,
+            min_samples_per_category=min_samples_per_category,
             **kwargs,
         )
         super().__init__(config=config)
         self.config: TabularEngineConfig = config
         self._numeric_columns: list[str] = []
         self._feature_set = FeatureSet()
+        # Categorical encoding state
+        self._onehot_columns: list[str] = []
+        self._target_encode_columns: list[str] = []
+        self._onehot_categories: dict[str, list] = {}
+        self._target_encode_maps: dict[str, dict] = {}
+        self._target_encode_global_mean: float = 0.0
 
     def fit(
         self,
@@ -107,7 +134,7 @@
         X : DataFrame or ndarray
             Input features
         y : Series or ndarray, optional
-            Target variable (unused, for API compatibility)
+            Target variable (used for target encoding of categorical columns)
 
         Returns
         -------
@@ -124,7 +151,11 @@
         ]
 
         if self.config.verbose:
-            print(f"TabularEngine: Found {len(self._numeric_columns)} numeric columns")
+            logger.info(f"TabularEngine: Found {len(self._numeric_columns)} numeric columns")
+
+        # Handle categorical columns
+        if self.config.encode_categorical:
+            self._fit_categorical_encoding(X, y)
 
         # Plan features to generate
         self._plan_features(X)
@@ -132,6 +163,81 @@
 
         return self
 
+    def _fit_categorical_encoding(self, X: pd.DataFrame, y: Optional[Union[pd.Series, np.ndarray]] = None) -> None:
+        """Fit categorical encoding based on cardinality ratio."""
+        self._onehot_columns = []
+        self._target_encode_columns = []
+        self._onehot_categories = {}
+        self._target_encode_maps = {}
+        self._target_label_encoder = None  # For string targets
+
+        # Find categorical columns (object or category dtype)
+        cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
+
+        if not cat_cols:
+            return
+
+        n_rows = len(X)
+        y_encoded = None
+        if y is not None:
+            y_series = pd.Series(y) if not isinstance(y, pd.Series) else y
+
+            # Check if target is string/categorical - encode it for target encoding
+            if y_series.dtype == "object" or y_series.dtype.name == "category":
+                from sklearn.preprocessing import LabelEncoder
+
+                self._target_label_encoder = LabelEncoder()
+                y_encoded = pd.Series(self._target_label_encoder.fit_transform(y_series.astype(str)))
+                self._target_encode_global_mean = float(y_encoded.mean())
+            else:
+                y_encoded = y_series
+                self._target_encode_global_mean = float(y_series.mean())
+
+        for col in cat_cols:
+            n_unique = X[col].nunique()
+            ratio = n_unique / n_rows
+
+            # Count samples per category
+            value_counts = X[col].value_counts()
+            # Filter categories with enough samples
+            valid_categories = value_counts[value_counts >= self.config.min_samples_per_category].index.tolist()
+
+            if len(valid_categories) == 0:
+                if self.config.verbose:
+                    logger.info(f"TabularEngine: Skipping '{col}' - no categories with enough samples")
+                continue
+
+            if ratio <= self.config.onehot_ratio_threshold:
+                # One-hot encoding for low cardinality
+                self._onehot_columns.append(col)
+                self._onehot_categories[col] = valid_categories
+                if self.config.verbose:
+                    logger.info(
+                        f"TabularEngine: One-hot encoding '{col}' "
+                        f"({len(valid_categories)} categories, ratio={ratio:.4f})"
+                    )
+
+            elif ratio <= self.config.target_encode_ratio_threshold and y_encoded is not None:
+                # Target encoding for medium cardinality
+                self._target_encode_columns.append(col)
+                # Compute target mean per category (using encoded target for string labels)
+                df_temp = pd.DataFrame({"col": X[col], "y": y_encoded})
+                target_means = df_temp.groupby("col")["y"].mean().to_dict()
+                # Only keep valid categories
+                self._target_encode_maps[col] = {k: v for k, v in target_means.items() if k in valid_categories}
+                if self.config.verbose:
+                    logger.info(
+                        f"TabularEngine: Target encoding '{col}' "
+                        f"({len(self._target_encode_maps[col])} categories, ratio={ratio:.4f})"
+                    )
+
+            else:
+                # High cardinality - likely ID column, skip
+                if self.config.verbose:
+                    logger.info(
+                        f"TabularEngine: Skipping '{col}' - high cardinality " f"({n_unique} unique, ratio={ratio:.4f})"
+                    )
+
     def _plan_features(self, X: pd.DataFrame) -> None:
         """Plan which features to generate."""
         self._feature_set = FeatureSet()
@@ -207,7 +313,7 @@
             self._feature_set.add(feature)
 
         if self.config.verbose:
-            print(f"TabularEngine: Planned {len(self._feature_set)} features")
+            logger.info(f"TabularEngine: Planned {len(self._feature_set)} features")
 
     def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
         """
@@ -228,11 +334,19 @@
 
         X = self._validate_input(X)
         result = X.copy()
+        original_columns = set(X.columns)
+
+        # Apply categorical encoding first
+        if self.config.encode_categorical:
+            result = self._transform_categorical(result)
 
         cols = self._numeric_columns
-        feature_count = 0
        max_features = self.config.max_features
 
+        # Count categorical features generated so far against max_features
+        categorical_features = [c for c in result.columns if c not in original_columns]
+        feature_count = len(categorical_features)
+
         # Generate polynomial features
         if not self.config.interaction_only:
             for col in cols:
@@ -284,7 +398,39 @@
         self._feature_names = [c for c in result.columns if c not in X.columns]
 
         if self.config.verbose:
-            print(f"TabularEngine: Generated {len(self._feature_names)} features")
+            logger.info(f"TabularEngine: Generated {len(self._feature_names)} features")
+
+        return result
+
+    def _transform_categorical(self, X: pd.DataFrame) -> pd.DataFrame:
+        """Apply categorical encoding to DataFrame."""
+        result = X.copy()
+
+        # One-hot encoding
+        for col in self._onehot_columns:
+            if col not in result.columns:
+                continue
+            categories = self._onehot_categories.get(col, [])
+            for cat in categories:
+                col_name = f"{col}_{cat}"
+                result[col_name] = (result[col] == cat).astype(int)
+            # Add "other" column for rare categories
+            col_other = f"{col}_other"
+            result[col_other] = (~result[col].isin(categories)).astype(int)
+            # Drop original column only if not keeping original categorical
+            if not self.config.keep_original_categorical:
+                result = result.drop(columns=[col])
+
+        # Target encoding
+        for col in self._target_encode_columns:
+            if col not in result.columns:
+                continue
+            encode_map = self._target_encode_maps.get(col, {})
+            col_name = f"{col}_target_encoded"
+            result[col_name] = result[col].map(encode_map).fillna(self._target_encode_global_mean)
+            # Drop original column only if not keeping original categorical
+            if not self.config.keep_original_categorical:
+                result = result.drop(columns=[col])
 
         return result
 
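
The heart of the new categorical support is the cardinality-ratio ladder in `_fit_categorical_encoding`: one-hot below `onehot_ratio_threshold`, target encoding up to `target_encode_ratio_threshold` when a target is available, and skip above that. A standalone sketch of just that decision rule under the default thresholds (illustrative data; the real method also drops categories with fewer than `min_samples_per_category` samples and label-encodes string targets first):

    import pandas as pd

    ONEHOT_RATIO = 0.05        # onehot_ratio_threshold default
    TARGET_ENCODE_RATIO = 0.5  # target_encode_ratio_threshold default

    def choose_encoding(col: pd.Series, has_target: bool) -> str:
        ratio = col.nunique() / len(col)
        if ratio <= ONEHOT_RATIO:
            return "one-hot"                    # low cardinality
        if ratio <= TARGET_ENCODE_RATIO and has_target:
            return "target-encode"              # medium cardinality, needs y
        return "skip"                           # high cardinality, likely an ID

    df = pd.DataFrame({
        "city": ["NY", "LA", "NY", "SF"] * 250,           # 3/1000 -> one-hot
        "zipcode": [f"z{i % 300}" for i in range(1000)],  # 300/1000 -> target-encode
        "user_id": [f"u{i}" for i in range(1000)],        # 1000/1000 -> skip
    })
    for name in df.columns:
        print(name, choose_encoding(df[name], has_target=True))

Because `keep_original_categorical` defaults to True, `_transform_categorical` appends the encoded columns (including a `<col>_other` indicator for rare one-hot categories) alongside the originals rather than replacing them.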