featcopilot 0.2.0.tar.gz → 0.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {featcopilot-0.2.0 → featcopilot-0.3.0}/PKG-INFO +27 -19
  2. {featcopilot-0.2.0 → featcopilot-0.3.0}/README.md +24 -17
  3. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/__init__.py +7 -0
  4. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/core/__init__.py +2 -0
  5. featcopilot-0.3.0/featcopilot/core/transform_rule.py +276 -0
  6. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/engines/tabular.py +145 -2
  7. featcopilot-0.3.0/featcopilot/engines/text.py +552 -0
  8. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/engines/timeseries.py +230 -1
  9. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/llm/__init__.py +2 -0
  10. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/llm/copilot_client.py +50 -17
  11. featcopilot-0.3.0/featcopilot/llm/semantic_engine.py +1070 -0
  12. featcopilot-0.3.0/featcopilot/llm/transform_rule_generator.py +403 -0
  13. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/selection/importance.py +35 -7
  14. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/selection/redundancy.py +35 -9
  15. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/selection/statistical.py +103 -33
  16. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/selection/unified.py +54 -3
  17. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/stores/__init__.py +2 -0
  18. featcopilot-0.3.0/featcopilot/stores/rule_store.py +343 -0
  19. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/transformers/sklearn_compat.py +10 -1
  20. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot.egg-info/PKG-INFO +27 -19
  21. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot.egg-info/SOURCES.txt +5 -1
  22. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot.egg-info/requires.txt +2 -1
  23. {featcopilot-0.2.0 → featcopilot-0.3.0}/pyproject.toml +3 -2
  24. featcopilot-0.3.0/tests/test_transform_rules.py +507 -0
  25. featcopilot-0.2.0/featcopilot/engines/text.py +0 -214
  26. featcopilot-0.2.0/featcopilot/llm/semantic_engine.py +0 -428
  27. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/core/base.py +0 -0
  28. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/core/feature.py +0 -0
  29. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/core/registry.py +0 -0
  30. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/engines/__init__.py +0 -0
  31. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/engines/relational.py +0 -0
  32. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/llm/code_generator.py +0 -0
  33. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/llm/explainer.py +0 -0
  34. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/llm/litellm_client.py +0 -0
  35. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/selection/__init__.py +0 -0
  36. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/stores/base.py +0 -0
  37. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/stores/feast_store.py +0 -0
  38. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/transformers/__init__.py +0 -0
  39. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/utils/__init__.py +0 -0
  40. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/utils/cache.py +0 -0
  41. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/utils/logger.py +0 -0
  42. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/utils/models.py +0 -0
  43. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/utils/parallel.py +0 -0
  44. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot.egg-info/dependency_links.txt +0 -0
  45. {featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot.egg-info/top_level.txt +0 -0
  46. {featcopilot-0.2.0 → featcopilot-0.3.0}/setup.cfg +0 -0
  47. {featcopilot-0.2.0 → featcopilot-0.3.0}/tests/test_autofeat.py +0 -0
  48. {featcopilot-0.2.0 → featcopilot-0.3.0}/tests/test_core.py +0 -0
  49. {featcopilot-0.2.0 → featcopilot-0.3.0}/tests/test_engines.py +0 -0
  50. {featcopilot-0.2.0 → featcopilot-0.3.0}/tests/test_litellm.py +0 -0
  51. {featcopilot-0.2.0 → featcopilot-0.3.0}/tests/test_selection.py +0 -0
  52. {featcopilot-0.2.0 → featcopilot-0.3.0}/tests/test_stores.py +0 -0
{featcopilot-0.2.0 → featcopilot-0.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: featcopilot
-Version: 0.2.0
+Version: 0.3.0
 Summary: Next-generation LLM-powered auto feature engineering framework with GitHub Copilot SDK
 Author: FeatCopilot Contributors
 License: MIT
@@ -46,8 +46,9 @@ Provides-Extra: benchmark
 Requires-Dist: github-copilot-sdk>=0.1.0; extra == "benchmark"
 Requires-Dist: statsmodels>=0.13.0; extra == "benchmark"
 Requires-Dist: flaml[automl,blendsearch]>=2.0.0; extra == "benchmark"
-Requires-Dist: autogluon.tabular>=1.0.0; extra == "benchmark"
+Requires-Dist: autogluon.tabular[fastai]>=1.5.0; extra == "benchmark"
 Requires-Dist: h2o>=3.40.0; extra == "benchmark"
+Requires-Dist: numpy<2; extra == "benchmark"
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -63,28 +64,35 @@ Requires-Dist: pre-commit>=3.6.0; extra == "dev"
 
 FeatCopilot automatically generates, selects, and explains predictive features using semantic understanding. It analyzes column meanings, applies domain-aware transformations, and provides human-readable explanations—turning raw data into ML-ready features in seconds.
 
+## 🎬 Introduction Video
+
+[![FeatCopilot Introduction](https://img.youtube.com/vi/H7m50TLGHFk/0.jpg)](https://www.youtube.com/watch?v=H7m50TLGHFk)
+
 ## 📊 Benchmark Highlights
 
-### Tabular Engine (Fast Mode - <1s)
+### Simple Models Benchmark (42 Datasets)
+
+| Configuration | Improved | Avg Improvement | Best Improvement |
+|---------------|----------|-----------------|------------------|
+| **Tabular Engine** | 20 (48%) | +4.54% | +197% (delays_zurich) |
+| **Tabular + LLM** | 23 (55%) | +6.12% | +420% (delays_zurich) |
+
+Models: RandomForest (n_estimators=200, max_depth=20), LogisticRegression/Ridge
 
-| Task Type | Average Improvement | Best Case |
-|-----------|--------------------:|----------:|
-| **Text Classification** | **+12.44%** | +49.02% (News Headlines) |
-| Time Series | +1.51% | +12.12% (Retail Demand) |
-| Classification | +0.54% | +4.35% |
-| Regression | +0.65% | +5.57% |
+### AutoML Benchmark (FLAML, 120s budget)
 
-### LLM Engine (With LiteLLM - 30-60s)
+| Metric | Value |
+|--------|-------|
+| **Datasets** | 41 |
+| **Improved** | 19 (46%) |
+| **Best Improvement** | +8.55% (abalone) |
 
-| Task Type | Average Improvement | Best Case |
-|-----------|--------------------:|----------:|
-| **Regression** | **+7.79%** | +19.66% (Retail Demand) |
-| Classification | +2.38% | +2.87% |
+### Key Results
 
-- ✅ **12/12 wins** on text classification (tabular mode)
-- 🧠 **+19.66% max improvement** with LLM-powered features
-- **<1 second** (tabular) or **30-60s** (with LLM) processing time
-- 📈 Largest gains with simple models (LogisticRegression, Ridge)
+- ✅ **+197% improvement** on delays_zurich (tabular only)
+- 🧠 **+420% improvement** with LLM-enhanced features
+- 📈 **+8.98%** on abalone regression task
+- 🚀 **+5.68%** on complex_classification
 
 [View Full Benchmark Results](https://thinkall.github.io/featcopilot/user-guide/benchmarks/)
 
@@ -131,7 +139,7 @@ print(f"Features: {X.shape[1]} -> {X_transformed.shape[1]}")
 ```python
 from featcopilot import AutoFeatureEngineer
 
-# LLM-powered semantic features (+19.66% max improvement)
+# LLM-powered semantic features (+420% max improvement)
 engineer = AutoFeatureEngineer(
     engines=['tabular', 'llm'],
     max_features=50
{featcopilot-0.2.0 → featcopilot-0.3.0}/README.md

@@ -4,28 +4,35 @@
 
 FeatCopilot automatically generates, selects, and explains predictive features using semantic understanding. It analyzes column meanings, applies domain-aware transformations, and provides human-readable explanations—turning raw data into ML-ready features in seconds.
 
+## 🎬 Introduction Video
+
+[![FeatCopilot Introduction](https://img.youtube.com/vi/H7m50TLGHFk/0.jpg)](https://www.youtube.com/watch?v=H7m50TLGHFk)
+
 ## 📊 Benchmark Highlights
 
-### Tabular Engine (Fast Mode - <1s)
+### Simple Models Benchmark (42 Datasets)
+
+| Configuration | Improved | Avg Improvement | Best Improvement |
+|---------------|----------|-----------------|------------------|
+| **Tabular Engine** | 20 (48%) | +4.54% | +197% (delays_zurich) |
+| **Tabular + LLM** | 23 (55%) | +6.12% | +420% (delays_zurich) |
+
+Models: RandomForest (n_estimators=200, max_depth=20), LogisticRegression/Ridge
 
-| Task Type | Average Improvement | Best Case |
-|-----------|--------------------:|----------:|
-| **Text Classification** | **+12.44%** | +49.02% (News Headlines) |
-| Time Series | +1.51% | +12.12% (Retail Demand) |
-| Classification | +0.54% | +4.35% |
-| Regression | +0.65% | +5.57% |
+### AutoML Benchmark (FLAML, 120s budget)
 
-### LLM Engine (With LiteLLM - 30-60s)
+| Metric | Value |
+|--------|-------|
+| **Datasets** | 41 |
+| **Improved** | 19 (46%) |
+| **Best Improvement** | +8.55% (abalone) |
 
-| Task Type | Average Improvement | Best Case |
-|-----------|--------------------:|----------:|
-| **Regression** | **+7.79%** | +19.66% (Retail Demand) |
-| Classification | +2.38% | +2.87% |
+### Key Results
 
-- ✅ **12/12 wins** on text classification (tabular mode)
-- 🧠 **+19.66% max improvement** with LLM-powered features
-- **<1 second** (tabular) or **30-60s** (with LLM) processing time
-- 📈 Largest gains with simple models (LogisticRegression, Ridge)
+- ✅ **+197% improvement** on delays_zurich (tabular only)
+- 🧠 **+420% improvement** with LLM-enhanced features
+- 📈 **+8.98%** on abalone regression task
+- 🚀 **+5.68%** on complex_classification
 
 [View Full Benchmark Results](https://thinkall.github.io/featcopilot/user-guide/benchmarks/)
 
@@ -72,7 +79,7 @@ print(f"Features: {X.shape[1]} -> {X_transformed.shape[1]}")
 ```python
 from featcopilot import AutoFeatureEngineer
 
-# LLM-powered semantic features (+19.66% max improvement)
+# LLM-powered semantic features (+420% max improvement)
 engineer = AutoFeatureEngineer(
     engines=['tabular', 'llm'],
     max_features=50
{featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/__init__.py

@@ -12,6 +12,9 @@ __author__ = "FeatCopilot Contributors"
 
 from featcopilot.core.base import BaseEngine, BaseSelector
 from featcopilot.core.feature import Feature, FeatureSet
+from featcopilot.core.transform_rule import TransformRule
+from featcopilot.llm.transform_rule_generator import TransformRuleGenerator
+from featcopilot.stores.rule_store import TransformRuleStore
 from featcopilot.transformers.sklearn_compat import (
     AutoFeatureEngineer,
     FeatureEngineerTransformer,
@@ -23,6 +26,10 @@ __all__ = [
     "BaseSelector",
     "Feature",
     "FeatureSet",
+    # Transform Rules
+    "TransformRule",
+    "TransformRuleStore",
+    "TransformRuleGenerator",
     # Main API
     "AutoFeatureEngineer",
     "FeatureEngineerTransformer",
{featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/core/__init__.py

@@ -3,6 +3,7 @@
 from featcopilot.core.base import BaseEngine, BaseSelector
 from featcopilot.core.feature import Feature, FeatureSet
 from featcopilot.core.registry import FeatureRegistry
+from featcopilot.core.transform_rule import TransformRule
 
 __all__ = [
     "BaseEngine",
@@ -10,4 +11,5 @@ __all__ = [
     "Feature",
     "FeatureSet",
     "FeatureRegistry",
+    "TransformRule",
 ]
featcopilot-0.3.0/featcopilot/core/transform_rule.py

@@ -0,0 +1,276 @@
+"""Transform rule model for reusable feature transformations.
+
+Defines TransformRule - a reusable transformation that can be created from
+natural language descriptions and applied across different datasets.
+"""
+
+import re
+import uuid
+from datetime import datetime, timezone
+from typing import Any, Optional
+
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel, Field
+
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class TransformRule(BaseModel):
+    """
+    A reusable feature transformation rule.
+
+    Transform rules capture feature engineering logic that can be generated
+    from natural language descriptions and reused across different datasets.
+
+    Parameters
+    ----------
+    id : str, optional
+        Unique identifier for the rule
+    name : str
+        Human-readable name for the rule
+    description : str
+        Natural language description of what the rule does
+    code : str
+        Python code that implements the transformation
+    input_columns : list[str]
+        Column names or patterns this rule expects as input
+    output_name : str, optional
+        Name for the output feature (default: derived from rule name)
+    output_type : str
+        Expected output data type ('numeric', 'categorical', 'boolean')
+    tags : list[str]
+        Tags for categorization and search
+    column_patterns : list[str]
+        Regex patterns for matching columns (e.g., 'price.*', '.*_amount')
+    usage_count : int
+        Number of times this rule has been applied
+    created_at : str
+        ISO timestamp of rule creation
+    metadata : dict
+        Additional metadata
+
+    Examples
+    --------
+    >>> rule = TransformRule(
+    ...     name="ratio_calculation",
+    ...     description="Calculate ratio of two numeric columns",
+    ...     code="result = df['{col1}'] / (df['{col2}'] + 1e-8)",
+    ...     input_columns=["col1", "col2"],
+    ...     tags=["ratio", "numeric"]
+    ... )
+    >>> result = rule.apply(df, column_mapping={"col1": "price", "col2": "quantity"})
+    """
+
+    id: str = Field(default_factory=lambda: str(uuid.uuid4())[:8], description="Unique rule identifier")
+    name: str = Field(description="Human-readable rule name")
+    description: str = Field(description="Natural language description of the transformation")
+    code: str = Field(description="Python code implementing the transformation")
+    input_columns: list[str] = Field(default_factory=list, description="Expected input column names or placeholders")
+    output_name: Optional[str] = Field(default=None, description="Output feature name")
+    output_type: str = Field(default="numeric", description="Output data type")
+    tags: list[str] = Field(default_factory=list, description="Tags for categorization")
+    column_patterns: list[str] = Field(default_factory=list, description="Regex patterns for column matching")
+    usage_count: int = Field(default=0, description="Number of times applied")
+    created_at: str = Field(
+        default_factory=lambda: datetime.now(timezone.utc).isoformat(), description="Creation timestamp"
+    )
+    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
+
+    def get_output_name(self, column_mapping: Optional[dict[str, str]] = None) -> str:
+        """
+        Get the output feature name.
+
+        Parameters
+        ----------
+        column_mapping : dict, optional
+            Mapping from placeholder columns to actual column names
+
+        Returns
+        -------
+        str
+            Output feature name
+        """
+        if self.output_name:
+            return self.output_name
+
+        # Generate name from input columns
+        if column_mapping and self.input_columns:
+            cols = [column_mapping.get(c, c) for c in self.input_columns[:2]]
+            return f"{'_'.join(cols)}_{self.name}"
+
+        return f"rule_{self.name}"
+
+    def matches_columns(self, columns: list[str]) -> tuple[bool, dict[str, str]]:
+        """
+        Check if this rule can be applied to the given columns.
+
+        Parameters
+        ----------
+        columns : list[str]
+            Available column names
+
+        Returns
+        -------
+        matches : bool
+            Whether the rule can be applied
+        mapping : dict
+            Suggested mapping from rule's input_columns to actual columns
+        """
+        if not self.input_columns:
+            return True, {}
+
+        mapping = {}
+
+        for input_col in self.input_columns:
+            # Try exact match first
+            if input_col in columns:
+                mapping[input_col] = input_col
+                continue
+
+            # Try pattern matching
+            matched = False
+            for pattern in self.column_patterns:
+                regex = re.compile(pattern, re.IGNORECASE)
+                for col in columns:
+                    if regex.match(col) and col not in mapping.values():
+                        mapping[input_col] = col
+                        matched = True
+                        break
+                if matched:
+                    break
+
+            # Try fuzzy matching by checking if input_col is substring
+            if not matched:
+                for col in columns:
+                    if input_col.lower() in col.lower() and col not in mapping.values():
+                        mapping[input_col] = col
+                        matched = True
+                        break
+
+            if not matched:
+                return False, {}
+
+        return len(mapping) == len(self.input_columns), mapping
+
+    def apply(
+        self,
+        df: pd.DataFrame,
+        column_mapping: Optional[dict[str, str]] = None,
+        validate: bool = True,
+    ) -> pd.Series:
+        """
+        Apply the transformation rule to a DataFrame.
+
+        Parameters
+        ----------
+        df : DataFrame
+            Input data
+        column_mapping : dict, optional
+            Mapping from rule's input_columns to actual column names
+        validate : bool, default=True
+            Whether to validate before execution
+
+        Returns
+        -------
+        Series
+            Transformed feature values
+
+        Raises
+        ------
+        ValueError
+            If required columns are missing or code execution fails
+        """
+        column_mapping = column_mapping or {}
+
+        # Prepare the code with actual column names
+        code = self._prepare_code(column_mapping)
+
+        if validate:
+            # Check required columns exist
+            for input_col in self.input_columns:
+                actual_col = column_mapping.get(input_col, input_col)
+                if actual_col not in df.columns:
+                    raise ValueError(f"Required column '{actual_col}' not found in DataFrame")
+
+        # Execute the code in a restricted environment
+        local_vars: dict[str, Any] = {"df": df, "np": np, "pd": pd}
+        try:
+            exec(self._get_safe_code(code), {"__builtins__": self._get_safe_builtins()}, local_vars)
+
+            if "result" not in local_vars:
+                raise ValueError("Code did not produce a 'result' variable")
+
+            result = local_vars["result"]
+
+            # Increment usage count
+            self.usage_count += 1
+
+            return result
+
+        except Exception as e:
+            logger.error(f"Failed to apply rule '{self.name}': {e}")
+            raise ValueError(f"Rule execution failed: {e}") from e
+
+    def _prepare_code(self, column_mapping: dict[str, str]) -> str:
+        """Substitute column placeholders with actual column names."""
+        code = self.code
+
+        # Replace {col} style placeholders
+        for placeholder, actual in column_mapping.items():
+            code = code.replace(f"{{{{ '{placeholder}' }}}}", f"'{actual}'")
+            code = code.replace(f"{{{placeholder}}}", actual)
+            code = code.replace(f"df['{placeholder}']", f"df['{actual}']")
+            code = code.replace(f'df["{placeholder}"]', f'df["{actual}"]')
+
+        return code
+
+    def _get_safe_code(self, code: str) -> str:
+        """Wrap code for safe execution."""
+        return code
+
+    def _get_safe_builtins(self) -> dict[str, Any]:
+        """Get restricted builtins for safe code execution."""
+        return {
+            "len": len,
+            "sum": sum,
+            "max": max,
+            "min": min,
+            "int": int,
+            "float": float,
+            "str": str,
+            "bool": bool,
+            "abs": abs,
+            "round": round,
+            "pow": pow,
+            "range": range,
+            "list": list,
+            "dict": dict,
+            "set": set,
+            "tuple": tuple,
+            "sorted": sorted,
+            "reversed": reversed,
+            "enumerate": enumerate,
+            "zip": zip,
+            "any": any,
+            "all": all,
+            "map": map,
+            "filter": filter,
+            "isinstance": isinstance,
+            "hasattr": hasattr,
+            "getattr": getattr,
+        }
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert rule to dictionary for serialization."""
+        return self.model_dump()
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "TransformRule":
+        """Create rule from dictionary."""
+        return cls(**data)
+
+    def __repr__(self) -> str:
+        return f"TransformRule(name='{self.name}', description='{self.description[:50]}...')"
{featcopilot-0.2.0 → featcopilot-0.3.0}/featcopilot/engines/tabular.py

@@ -30,6 +30,16 @@ class TabularEngineConfig(EngineConfig):
     )
     numeric_only: bool = Field(default=True, description="Only process numeric columns")
     min_unique_values: int = Field(default=5, description="Min unique values for continuous")
+    # Categorical encoding settings
+    encode_categorical: bool = Field(default=True, description="Auto-encode categorical columns")
+    keep_original_categorical: bool = Field(
+        default=True, description="Keep original categorical columns (for models that handle them natively)"
+    )
+    onehot_ratio_threshold: float = Field(default=0.05, description="Max n_unique/n_rows ratio for one-hot encoding")
+    target_encode_ratio_threshold: float = Field(
+        default=0.5, description="Max n_unique/n_rows ratio for target encoding"
+    )
+    min_samples_per_category: int = Field(default=3, description="Min samples per category to include")
 
 
 class TabularEngine(BaseEngine):
@@ -81,6 +91,10 @@ class TabularEngine(BaseEngine):
         include_transforms: Optional[list[str]] = None,
         max_features: Optional[int] = None,
         verbose: bool = False,
+        encode_categorical: bool = True,
+        onehot_ratio_threshold: float = 0.05,
+        target_encode_ratio_threshold: float = 0.5,
+        min_samples_per_category: int = 3,
         **kwargs,
     ):
         config = TabularEngineConfig(
@@ -89,12 +103,22 @@ class TabularEngine(BaseEngine):
             include_transforms=include_transforms or ["log", "sqrt", "square"],
             max_features=max_features,
             verbose=verbose,
+            encode_categorical=encode_categorical,
+            onehot_ratio_threshold=onehot_ratio_threshold,
+            target_encode_ratio_threshold=target_encode_ratio_threshold,
+            min_samples_per_category=min_samples_per_category,
             **kwargs,
         )
         super().__init__(config=config)
         self.config: TabularEngineConfig = config
         self._numeric_columns: list[str] = []
         self._feature_set = FeatureSet()
+        # Categorical encoding state
+        self._onehot_columns: list[str] = []
+        self._target_encode_columns: list[str] = []
+        self._onehot_categories: dict[str, list] = {}
+        self._target_encode_maps: dict[str, dict] = {}
+        self._target_encode_global_mean: float = 0.0
 
     def fit(
         self,
@@ -110,7 +134,7 @@ class TabularEngine(BaseEngine):
         X : DataFrame or ndarray
             Input features
         y : Series or ndarray, optional
-            Target variable (unused, for API compatibility)
+            Target variable (used for target encoding of categorical columns)
 
         Returns
         -------
@@ -129,12 +153,91 @@ class TabularEngine(BaseEngine):
         if self.config.verbose:
             logger.info(f"TabularEngine: Found {len(self._numeric_columns)} numeric columns")
 
+        # Handle categorical columns
+        if self.config.encode_categorical:
+            self._fit_categorical_encoding(X, y)
+
         # Plan features to generate
         self._plan_features(X)
         self._is_fitted = True
 
         return self
 
+    def _fit_categorical_encoding(self, X: pd.DataFrame, y: Optional[Union[pd.Series, np.ndarray]] = None) -> None:
+        """Fit categorical encoding based on cardinality ratio."""
+        self._onehot_columns = []
+        self._target_encode_columns = []
+        self._onehot_categories = {}
+        self._target_encode_maps = {}
+        self._target_label_encoder = None  # For string targets
+
+        # Find categorical columns (object or category dtype)
+        cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
+
+        if not cat_cols:
+            return
+
+        n_rows = len(X)
+        y_encoded = None
+        if y is not None:
+            y_series = pd.Series(y) if not isinstance(y, pd.Series) else y
+
+            # Check if target is string/categorical - encode it for target encoding
+            if y_series.dtype == "object" or y_series.dtype.name == "category":
+                from sklearn.preprocessing import LabelEncoder
+
+                self._target_label_encoder = LabelEncoder()
+                y_encoded = pd.Series(self._target_label_encoder.fit_transform(y_series.astype(str)))
+                self._target_encode_global_mean = float(y_encoded.mean())
+            else:
+                y_encoded = y_series
+                self._target_encode_global_mean = float(y_series.mean())
+
+        for col in cat_cols:
+            n_unique = X[col].nunique()
+            ratio = n_unique / n_rows
+
+            # Count samples per category
+            value_counts = X[col].value_counts()
+            # Filter categories with enough samples
+            valid_categories = value_counts[value_counts >= self.config.min_samples_per_category].index.tolist()
+
+            if len(valid_categories) == 0:
+                if self.config.verbose:
+                    logger.info(f"TabularEngine: Skipping '{col}' - no categories with enough samples")
+                continue
+
+            if ratio <= self.config.onehot_ratio_threshold:
+                # One-hot encoding for low cardinality
+                self._onehot_columns.append(col)
+                self._onehot_categories[col] = valid_categories
+                if self.config.verbose:
+                    logger.info(
+                        f"TabularEngine: One-hot encoding '{col}' "
+                        f"({len(valid_categories)} categories, ratio={ratio:.4f})"
+                    )
+
+            elif ratio <= self.config.target_encode_ratio_threshold and y_encoded is not None:
+                # Target encoding for medium cardinality
+                self._target_encode_columns.append(col)
+                # Compute target mean per category (using encoded target for string labels)
+                df_temp = pd.DataFrame({"col": X[col], "y": y_encoded})
+                target_means = df_temp.groupby("col")["y"].mean().to_dict()
+                # Only keep valid categories
+                self._target_encode_maps[col] = {k: v for k, v in target_means.items() if k in valid_categories}
+                if self.config.verbose:
+                    logger.info(
+                        f"TabularEngine: Target encoding '{col}' "
+                        f"({len(self._target_encode_maps[col])} categories, ratio={ratio:.4f})"
+                    )
+
+            else:
+                # High cardinality - likely ID column, skip
+                if self.config.verbose:
+                    logger.info(
+                        f"TabularEngine: Skipping '{col}' - high cardinality " f"({n_unique} unique, ratio={ratio:.4f})"
+                    )
+
     def _plan_features(self, X: pd.DataFrame) -> None:
         """Plan which features to generate."""
@@ -231,11 +334,19 @@ class TabularEngine(BaseEngine):
 
         X = self._validate_input(X)
         result = X.copy()
+        original_columns = set(X.columns)
+
+        # Apply categorical encoding first
+        if self.config.encode_categorical:
+            result = self._transform_categorical(result)
 
         cols = self._numeric_columns
-        feature_count = 0
         max_features = self.config.max_features
 
+        # Count categorical features generated so far against max_features
+        categorical_features = [c for c in result.columns if c not in original_columns]
+        feature_count = len(categorical_features)
+
         # Generate polynomial features
         if not self.config.interaction_only:
             for col in cols:
@@ -291,6 +402,38 @@ class TabularEngine(BaseEngine):
 
         return result
 
+    def _transform_categorical(self, X: pd.DataFrame) -> pd.DataFrame:
+        """Apply categorical encoding to DataFrame."""
+        result = X.copy()
+
+        # One-hot encoding
+        for col in self._onehot_columns:
+            if col not in result.columns:
+                continue
+            categories = self._onehot_categories.get(col, [])
+            for cat in categories:
+                col_name = f"{col}_{cat}"
+                result[col_name] = (result[col] == cat).astype(int)
+            # Add "other" column for rare categories
+            col_other = f"{col}_other"
+            result[col_other] = (~result[col].isin(categories)).astype(int)
+            # Drop original column only if not keeping original categorical
+            if not self.config.keep_original_categorical:
+                result = result.drop(columns=[col])
+
+        # Target encoding
+        for col in self._target_encode_columns:
+            if col not in result.columns:
+                continue
+            encode_map = self._target_encode_maps.get(col, {})
+            col_name = f"{col}_target_encoded"
+            result[col_name] = result[col].map(encode_map).fillna(self._target_encode_global_mean)
+            # Drop original column only if not keeping original categorical
+            if not self.config.keep_original_categorical:
+                result = result.drop(columns=[col])
+
+        return result
+
     def get_feature_set(self) -> FeatureSet:
         """Get the feature set with metadata."""
         return self._feature_set
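To make the cardinality thresholds concrete, a rough usage sketch follows. The synthetic DataFrame is invented, and the assumption that TabularEngine keeps its usual fit()/transform() interface is mine; only the constructor arguments shown in the hunks above come from the diff.

```python
# Synthetic data chosen so each categorical column lands in a different
# cardinality band; fit()/transform() are assumed to follow the existing
# BaseEngine interface.
import pandas as pd

from featcopilot.engines.tabular import TabularEngine

n = 1000
X = pd.DataFrame(
    {
        "price": range(n),
        "segment": ["a", "b", "c", "d"] * (n // 4),     # ratio 0.004 -> one-hot
        "city": [f"city_{i % 100}" for i in range(n)],  # ratio 0.10  -> target encoding
        "user_id": [f"u{i}" for i in range(n)],         # ratio 1.0   -> skipped as ID-like
    }
)
y = pd.Series([i % 2 for i in range(n)])

engine = TabularEngine(
    encode_categorical=True,
    onehot_ratio_threshold=0.05,        # n_unique/n_rows <= 0.05 -> one-hot
    target_encode_ratio_threshold=0.5,  # <= 0.5 and y provided   -> target mean encoding
    min_samples_per_category=3,
)
engine.fit(X, y)
X_t = engine.transform(X)

# Among the new columns, expect segment_a..segment_d plus segment_other and
# city_target_encoded; numeric transforms of 'price' are generated as well,
# and the original categoricals stay (keep_original_categorical defaults to True).
print([c for c in X_t.columns if c not in X.columns])
```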