featcopilot 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {featcopilot-0.1.0 → featcopilot-0.3.0}/PKG-INFO +56 -25
  2. {featcopilot-0.1.0 → featcopilot-0.3.0}/README.md +39 -24
  3. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/__init__.py +10 -1
  4. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/core/__init__.py +2 -0
  5. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/core/feature.py +5 -1
  6. featcopilot-0.3.0/featcopilot/core/transform_rule.py +276 -0
  7. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/engines/relational.py +5 -2
  8. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/engines/tabular.py +151 -5
  9. featcopilot-0.3.0/featcopilot/engines/text.py +552 -0
  10. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/engines/timeseries.py +235 -3
  11. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/llm/__init__.py +6 -1
  12. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/llm/code_generator.py +7 -4
  13. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/llm/copilot_client.py +97 -20
  14. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/llm/explainer.py +6 -3
  15. featcopilot-0.3.0/featcopilot/llm/litellm_client.py +595 -0
  16. featcopilot-0.3.0/featcopilot/llm/semantic_engine.py +1070 -0
  17. featcopilot-0.3.0/featcopilot/llm/transform_rule_generator.py +403 -0
  18. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/selection/importance.py +40 -9
  19. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/selection/redundancy.py +39 -10
  20. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/selection/statistical.py +107 -34
  21. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/selection/unified.py +57 -3
  22. featcopilot-0.3.0/featcopilot/stores/__init__.py +17 -0
  23. featcopilot-0.3.0/featcopilot/stores/base.py +166 -0
  24. featcopilot-0.3.0/featcopilot/stores/feast_store.py +541 -0
  25. featcopilot-0.3.0/featcopilot/stores/rule_store.py +343 -0
  26. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/transformers/sklearn_compat.py +18 -6
  27. featcopilot-0.3.0/featcopilot/utils/__init__.py +23 -0
  28. featcopilot-0.3.0/featcopilot/utils/logger.py +47 -0
  29. featcopilot-0.3.0/featcopilot/utils/models.py +287 -0
  30. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/utils/parallel.py +5 -1
  31. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot.egg-info/PKG-INFO +56 -25
  32. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot.egg-info/SOURCES.txt +13 -1
  33. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot.egg-info/requires.txt +19 -0
  34. {featcopilot-0.1.0 → featcopilot-0.3.0}/pyproject.toml +20 -1
  35. featcopilot-0.3.0/tests/test_litellm.py +249 -0
  36. featcopilot-0.3.0/tests/test_stores.py +261 -0
  37. featcopilot-0.3.0/tests/test_transform_rules.py +507 -0
  38. featcopilot-0.1.0/featcopilot/engines/text.py +0 -211
  39. featcopilot-0.1.0/featcopilot/llm/semantic_engine.py +0 -379
  40. featcopilot-0.1.0/featcopilot/utils/__init__.py +0 -9
  41. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/core/base.py +0 -0
  42. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/core/registry.py +0 -0
  43. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/engines/__init__.py +0 -0
  44. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/selection/__init__.py +0 -0
  45. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/transformers/__init__.py +0 -0
  46. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/utils/cache.py +0 -0
  47. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot.egg-info/dependency_links.txt +0 -0
  48. {featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot.egg-info/top_level.txt +0 -0
  49. {featcopilot-0.1.0 → featcopilot-0.3.0}/setup.cfg +0 -0
  50. {featcopilot-0.1.0 → featcopilot-0.3.0}/tests/test_autofeat.py +0 -0
  51. {featcopilot-0.1.0 → featcopilot-0.3.0}/tests/test_core.py +0 -0
  52. {featcopilot-0.1.0 → featcopilot-0.3.0}/tests/test_engines.py +0 -0
  53. {featcopilot-0.1.0 → featcopilot-0.3.0}/tests/test_selection.py +0 -0

{featcopilot-0.1.0 → featcopilot-0.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: featcopilot
-Version: 0.1.0
+Version: 0.3.0
 Summary: Next-generation LLM-powered auto feature engineering framework with GitHub Copilot SDK
 Author: FeatCopilot Contributors
 License: MIT
@@ -28,11 +28,27 @@ Requires-Dist: pydantic>=2.0.0
 Requires-Dist: joblib>=1.1.0
 Provides-Extra: llm
 Requires-Dist: github-copilot-sdk>=0.1.0; extra == "llm"
+Requires-Dist: nest_asyncio>=1.5.0; extra == "llm"
+Provides-Extra: litellm
+Requires-Dist: litellm>=1.0.0; extra == "litellm"
+Requires-Dist: nest_asyncio>=1.5.0; extra == "litellm"
 Provides-Extra: timeseries
 Requires-Dist: statsmodels>=0.13.0; extra == "timeseries"
+Provides-Extra: feast
+Requires-Dist: feast>=0.30.0; extra == "feast"
 Provides-Extra: full
 Requires-Dist: github-copilot-sdk>=0.1.0; extra == "full"
+Requires-Dist: litellm>=1.0.0; extra == "full"
 Requires-Dist: statsmodels>=0.13.0; extra == "full"
+Requires-Dist: feast>=0.30.0; extra == "full"
+Requires-Dist: nest_asyncio>=1.5.0; extra == "full"
+Provides-Extra: benchmark
+Requires-Dist: github-copilot-sdk>=0.1.0; extra == "benchmark"
+Requires-Dist: statsmodels>=0.13.0; extra == "benchmark"
+Requires-Dist: flaml[automl,blendsearch]>=2.0.0; extra == "benchmark"
+Requires-Dist: autogluon.tabular[fastai]>=1.5.0; extra == "benchmark"
+Requires-Dist: h2o>=3.40.0; extra == "benchmark"
+Requires-Dist: numpy<2; extra == "benchmark"
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -44,32 +60,39 @@ Requires-Dist: pre-commit>=3.6.0; extra == "dev"
 
 # FeatCopilot 🚀
 
-**Next-Generation LLM-Powered Auto Feature Engineering with GitHub Copilot SDK**
+**Next-Generation LLM-Powered Auto Feature Engineering Framework**
 
-FeatCopilot is a unified feature engineering framework that combines the best approaches from existing libraries (Featuretools, TSFresh, AutoFeat, OpenFE) with novel LLM-powered capabilities via GitHub Copilot SDK.
+FeatCopilot automatically generates, selects, and explains predictive features using semantic understanding. It analyzes column meanings, applies domain-aware transformations, and provides human-readable explanations—turning raw data into ML-ready features in seconds.
+
+## 🎬 Introduction Video
+
+[![FeatCopilot Introduction](https://img.youtube.com/vi/H7m50TLGHFk/0.jpg)](https://www.youtube.com/watch?v=H7m50TLGHFk)
 
 ## 📊 Benchmark Highlights
 
-### Tabular Engine (Fast Mode - <1s)
+### Simple Models Benchmark (42 Datasets)
+
+| Configuration | Improved | Avg Improvement | Best Improvement |
+|---------------|----------|-----------------|------------------|
+| **Tabular Engine** | 20 (48%) | +4.54% | +197% (delays_zurich) |
+| **Tabular + LLM** | 23 (55%) | +6.12% | +420% (delays_zurich) |
+
+Models: RandomForest (n_estimators=200, max_depth=20), LogisticRegression/Ridge
 
-| Task Type | Average Improvement | Best Case |
-|-----------|--------------------:|----------:|
-| **Text Classification** | **+12.44%** | +49.02% (News Headlines) |
-| Time Series | +1.51% | +12.12% (Retail Demand) |
-| Classification | +0.54% | +4.35% |
-| Regression | +0.65% | +5.57% |
+### AutoML Benchmark (FLAML, 120s budget)
 
-### LLM Engine (With Copilot - 30-60s)
+| Metric | Value |
+|--------|-------|
+| **Datasets** | 41 |
+| **Improved** | 19 (46%) |
+| **Best Improvement** | +8.55% (abalone) |
 
-| Task Type | Average Improvement | Best Case |
-|-----------|--------------------:|----------:|
-| **Regression** | **+7.79%** | +19.66% (Retail Demand) |
-| Classification | +2.38% | +2.87% |
+### Key Results
 
-- ✅ **12/12 wins** on text classification (tabular mode)
-- 🧠 **+19.66% max improvement** with LLM-powered features
-- **<1 second** (tabular) or **30-60s** (with LLM) processing time
-- 📈 Largest gains with simple models (LogisticRegression, Ridge)
+- ✅ **+197% improvement** on delays_zurich (tabular only)
+- 🧠 **+420% improvement** with LLM-enhanced features
+- 📈 **+8.98%** on abalone regression task
+- 🚀 **+5.68%** on complex_classification
 
 [View Full Benchmark Results](https://thinkall.github.io/featcopilot/user-guide/benchmarks/)
 
@@ -87,7 +110,7 @@ FeatCopilot is a unified feature engineering framework that combines the best ap
 # Basic installation
 pip install featcopilot
 
-# With LLM capabilities (requires GitHub Copilot)
+# With LLM capabilities
 pip install featcopilot[llm]
 
 # Full installation
@@ -111,12 +134,12 @@ X_transformed = engineer.fit_transform(X, y) # <1 second
 print(f"Features: {X.shape[1]} -> {X_transformed.shape[1]}")
 ```
 
-### LLM Mode (With Copilot)
+### LLM Mode (With LiteLLM)
 
 ```python
 from featcopilot import AutoFeatureEngineer
 
-# LLM-powered semantic features (+19.66% max improvement)
+# LLM-powered semantic features (+420% max improvement)
 engineer = AutoFeatureEngineer(
     engines=['tabular', 'llm'],
     max_features=50
@@ -164,16 +187,24 @@ engine = TimeSeriesEngine(
 ```
 
 ### LLM Engine
-Uses GitHub Copilot SDK for intelligent feature generation.
+Uses GitHub Copilot SDK (default) or LiteLLM (100+ providers) for intelligent feature generation.
 
 ```python
 from featcopilot.llm import SemanticEngine
 
+# Default: GitHub Copilot SDK
 engine = SemanticEngine(
-    model='gpt-5',
+    model='gpt-5.2',
     max_suggestions=20,
     validate_features=True
 )
+
+# Alternative: LiteLLM backend
+engine = SemanticEngine(
+    model='gpt-4o',
+    backend='litellm',
+    max_suggestions=20
+)
 ```
 
 ## Feature Selection
@@ -211,7 +242,7 @@ X_selected = selector.fit_transform(X, y)
 
 - Python 3.9+
 - NumPy, Pandas, Scikit-learn
-- GitHub Copilot CLI (for LLM features)
+- GitHub Copilot SDK (default) or LiteLLM (for 100+ LLM providers)
 
 ## License
 

{featcopilot-0.1.0 → featcopilot-0.3.0}/README.md

@@ -1,31 +1,38 @@
 # FeatCopilot 🚀
 
-**Next-Generation LLM-Powered Auto Feature Engineering with GitHub Copilot SDK**
+**Next-Generation LLM-Powered Auto Feature Engineering Framework**
 
-FeatCopilot is a unified feature engineering framework that combines the best approaches from existing libraries (Featuretools, TSFresh, AutoFeat, OpenFE) with novel LLM-powered capabilities via GitHub Copilot SDK.
+FeatCopilot automatically generates, selects, and explains predictive features using semantic understanding. It analyzes column meanings, applies domain-aware transformations, and provides human-readable explanations—turning raw data into ML-ready features in seconds.
+
+## 🎬 Introduction Video
+
+[![FeatCopilot Introduction](https://img.youtube.com/vi/H7m50TLGHFk/0.jpg)](https://www.youtube.com/watch?v=H7m50TLGHFk)
 
 ## 📊 Benchmark Highlights
 
-### Tabular Engine (Fast Mode - <1s)
+### Simple Models Benchmark (42 Datasets)
+
+| Configuration | Improved | Avg Improvement | Best Improvement |
+|---------------|----------|-----------------|------------------|
+| **Tabular Engine** | 20 (48%) | +4.54% | +197% (delays_zurich) |
+| **Tabular + LLM** | 23 (55%) | +6.12% | +420% (delays_zurich) |
+
+Models: RandomForest (n_estimators=200, max_depth=20), LogisticRegression/Ridge
 
-| Task Type | Average Improvement | Best Case |
-|-----------|--------------------:|----------:|
-| **Text Classification** | **+12.44%** | +49.02% (News Headlines) |
-| Time Series | +1.51% | +12.12% (Retail Demand) |
-| Classification | +0.54% | +4.35% |
-| Regression | +0.65% | +5.57% |
+### AutoML Benchmark (FLAML, 120s budget)
 
-### LLM Engine (With Copilot - 30-60s)
+| Metric | Value |
+|--------|-------|
+| **Datasets** | 41 |
+| **Improved** | 19 (46%) |
+| **Best Improvement** | +8.55% (abalone) |
 
-| Task Type | Average Improvement | Best Case |
-|-----------|--------------------:|----------:|
-| **Regression** | **+7.79%** | +19.66% (Retail Demand) |
-| Classification | +2.38% | +2.87% |
+### Key Results
 
-- ✅ **12/12 wins** on text classification (tabular mode)
-- 🧠 **+19.66% max improvement** with LLM-powered features
-- **<1 second** (tabular) or **30-60s** (with LLM) processing time
-- 📈 Largest gains with simple models (LogisticRegression, Ridge)
+- ✅ **+197% improvement** on delays_zurich (tabular only)
+- 🧠 **+420% improvement** with LLM-enhanced features
+- 📈 **+8.98%** on abalone regression task
+- 🚀 **+5.68%** on complex_classification
 
 [View Full Benchmark Results](https://thinkall.github.io/featcopilot/user-guide/benchmarks/)
 
@@ -43,7 +50,7 @@ FeatCopilot is a unified feature engineering framework that combines the best ap
 # Basic installation
 pip install featcopilot
 
-# With LLM capabilities (requires GitHub Copilot)
+# With LLM capabilities
 pip install featcopilot[llm]
 
 # Full installation
@@ -67,12 +74,12 @@ X_transformed = engineer.fit_transform(X, y) # <1 second
 print(f"Features: {X.shape[1]} -> {X_transformed.shape[1]}")
 ```
 
-### LLM Mode (With Copilot)
+### LLM Mode (With LiteLLM)
 
 ```python
 from featcopilot import AutoFeatureEngineer
 
-# LLM-powered semantic features (+19.66% max improvement)
+# LLM-powered semantic features (+420% max improvement)
 engineer = AutoFeatureEngineer(
     engines=['tabular', 'llm'],
     max_features=50
@@ -120,16 +127,24 @@ engine = TimeSeriesEngine(
 ```
 
 ### LLM Engine
-Uses GitHub Copilot SDK for intelligent feature generation.
+Uses GitHub Copilot SDK (default) or LiteLLM (100+ providers) for intelligent feature generation.
 
 ```python
 from featcopilot.llm import SemanticEngine
 
+# Default: GitHub Copilot SDK
engine = SemanticEngine(
-    model='gpt-5',
+    model='gpt-5.2',
     max_suggestions=20,
     validate_features=True
 )
+
+# Alternative: LiteLLM backend
+engine = SemanticEngine(
+    model='gpt-4o',
+    backend='litellm',
+    max_suggestions=20
+)
 ```
 
 ## Feature Selection
 
@@ -167,7 +182,7 @@ X_selected = selector.fit_transform(X, y)
 
 - Python 3.9+
 - NumPy, Pandas, Scikit-learn
-- GitHub Copilot CLI (for LLM features)
+- GitHub Copilot SDK (default) or LiteLLM (for 100+ LLM providers)
 
 ## License
 

{featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/__init__.py

@@ -5,11 +5,16 @@ A unified feature engineering framework combining traditional approaches
 with novel LLM-powered capabilities via GitHub Copilot SDK.
 """
 
-__version__ = "0.1.0"
+from importlib.metadata import version
+
+__version__ = version("featcopilot")
 __author__ = "FeatCopilot Contributors"
 
 from featcopilot.core.base import BaseEngine, BaseSelector
 from featcopilot.core.feature import Feature, FeatureSet
+from featcopilot.core.transform_rule import TransformRule
+from featcopilot.llm.transform_rule_generator import TransformRuleGenerator
+from featcopilot.stores.rule_store import TransformRuleStore
 from featcopilot.transformers.sklearn_compat import (
     AutoFeatureEngineer,
     FeatureEngineerTransformer,
@@ -21,6 +26,10 @@ __all__ = [
     "BaseSelector",
     "Feature",
     "FeatureSet",
+    # Transform Rules
+    "TransformRule",
+    "TransformRuleStore",
+    "TransformRuleGenerator",
     # Main API
     "AutoFeatureEngineer",
     "FeatureEngineerTransformer",

{featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/core/__init__.py

@@ -3,6 +3,7 @@
 from featcopilot.core.base import BaseEngine, BaseSelector
 from featcopilot.core.feature import Feature, FeatureSet
 from featcopilot.core.registry import FeatureRegistry
+from featcopilot.core.transform_rule import TransformRule
 
 __all__ = [
     "BaseEngine",
@@ -10,4 +11,5 @@ __all__ = [
     "Feature",
     "FeatureSet",
     "FeatureRegistry",
+    "TransformRule",
 ]
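
The two __init__ hunks above switch __version__ to installed-package metadata and re-export the new transform-rule API at the package root. A minimal sketch of the resulting import surface (names taken from the __all__ entries above; the printed version string is illustrative):

```python
# Sketch: top-level surface after the __init__ changes above.
import featcopilot
from featcopilot import TransformRule, TransformRuleGenerator, TransformRuleStore

# __version__ is now resolved from installed metadata instead of a hard-coded string,
# so it tracks whatever build of the package is actually installed.
print(featcopilot.__version__)  # e.g. "0.3.0" for this release
```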

{featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/core/feature.py

@@ -7,6 +7,10 @@ from typing import Any, Optional
 import numpy as np
 import pandas as pd
 
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
 
 class FeatureType(Enum):
     """Types of features."""
@@ -220,5 +224,5 @@ class FeatureSet:
                 result[feature.name] = feature.compute(df)
             except Exception as e:
                 # Log warning but continue
-                print(f"Warning: Could not compute feature {feature.name}: {e}")
+                logger.warning(f"Could not compute feature {feature.name}: {e}")
         return result
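
The print-to-logger changes in this file (and the similar ones in engines/relational.py below) route messages through featcopilot.utils.logger.get_logger. That module is new in this release (featcopilot/utils/logger.py, +47 lines in the file list) but its body is not included in the hunks shown here, so the following is only a hypothetical sketch of what such a helper commonly looks like, not the actual implementation:

```python
# Hypothetical sketch of a get_logger helper; the real featcopilot/utils/logger.py
# added in 0.3.0 is not shown in this diff, so treat this as an assumption.
import logging


def get_logger(name: str) -> logging.Logger:
    """Return a namespaced logger with a default stream handler attached once."""
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s"))
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger
```

The practical effect of the change is that downstream users can silence or redirect these warnings with the standard logging module instead of having them written unconditionally to stdout.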

featcopilot-0.3.0/featcopilot/core/transform_rule.py (new file)

@@ -0,0 +1,276 @@
+"""Transform rule model for reusable feature transformations.
+
+Defines TransformRule - a reusable transformation that can be created from
+natural language descriptions and applied across different datasets.
+"""
+
+import re
+import uuid
+from datetime import datetime, timezone
+from typing import Any, Optional
+
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel, Field
+
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class TransformRule(BaseModel):
+    """
+    A reusable feature transformation rule.
+
+    Transform rules capture feature engineering logic that can be generated
+    from natural language descriptions and reused across different datasets.
+
+    Parameters
+    ----------
+    id : str, optional
+        Unique identifier for the rule
+    name : str
+        Human-readable name for the rule
+    description : str
+        Natural language description of what the rule does
+    code : str
+        Python code that implements the transformation
+    input_columns : list[str]
+        Column names or patterns this rule expects as input
+    output_name : str, optional
+        Name for the output feature (default: derived from rule name)
+    output_type : str
+        Expected output data type ('numeric', 'categorical', 'boolean')
+    tags : list[str]
+        Tags for categorization and search
+    column_patterns : list[str]
+        Regex patterns for matching columns (e.g., 'price.*', '.*_amount')
+    usage_count : int
+        Number of times this rule has been applied
+    created_at : str
+        ISO timestamp of rule creation
+    metadata : dict
+        Additional metadata
+
+    Examples
+    --------
+    >>> rule = TransformRule(
+    ...     name="ratio_calculation",
+    ...     description="Calculate ratio of two numeric columns",
+    ...     code="result = df['{col1}'] / (df['{col2}'] + 1e-8)",
+    ...     input_columns=["col1", "col2"],
+    ...     tags=["ratio", "numeric"]
+    ... )
+    >>> result = rule.apply(df, column_mapping={"col1": "price", "col2": "quantity"})
+    """
+
+    id: str = Field(default_factory=lambda: str(uuid.uuid4())[:8], description="Unique rule identifier")
+    name: str = Field(description="Human-readable rule name")
+    description: str = Field(description="Natural language description of the transformation")
+    code: str = Field(description="Python code implementing the transformation")
+    input_columns: list[str] = Field(default_factory=list, description="Expected input column names or placeholders")
+    output_name: Optional[str] = Field(default=None, description="Output feature name")
+    output_type: str = Field(default="numeric", description="Output data type")
+    tags: list[str] = Field(default_factory=list, description="Tags for categorization")
+    column_patterns: list[str] = Field(default_factory=list, description="Regex patterns for column matching")
+    usage_count: int = Field(default=0, description="Number of times applied")
+    created_at: str = Field(
+        default_factory=lambda: datetime.now(timezone.utc).isoformat(), description="Creation timestamp"
+    )
+    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
+
+    def get_output_name(self, column_mapping: Optional[dict[str, str]] = None) -> str:
+        """
+        Get the output feature name.
+
+        Parameters
+        ----------
+        column_mapping : dict, optional
+            Mapping from placeholder columns to actual column names
+
+        Returns
+        -------
+        str
+            Output feature name
+        """
+        if self.output_name:
+            return self.output_name
+
+        # Generate name from input columns
+        if column_mapping and self.input_columns:
+            cols = [column_mapping.get(c, c) for c in self.input_columns[:2]]
+            return f"{'_'.join(cols)}_{self.name}"
+
+        return f"rule_{self.name}"
+
+    def matches_columns(self, columns: list[str]) -> tuple[bool, dict[str, str]]:
+        """
+        Check if this rule can be applied to the given columns.
+
+        Parameters
+        ----------
+        columns : list[str]
+            Available column names
+
+        Returns
+        -------
+        matches : bool
+            Whether the rule can be applied
+        mapping : dict
+            Suggested mapping from rule's input_columns to actual columns
+        """
+        if not self.input_columns:
+            return True, {}
+
+        mapping = {}
+
+        for input_col in self.input_columns:
+            # Try exact match first
+            if input_col in columns:
+                mapping[input_col] = input_col
+                continue
+
+            # Try pattern matching
+            matched = False
+            for pattern in self.column_patterns:
+                regex = re.compile(pattern, re.IGNORECASE)
+                for col in columns:
+                    if regex.match(col) and col not in mapping.values():
+                        mapping[input_col] = col
+                        matched = True
+                        break
+                if matched:
+                    break
+
+            # Try fuzzy matching by checking if input_col is substring
+            if not matched:
+                for col in columns:
+                    if input_col.lower() in col.lower() and col not in mapping.values():
+                        mapping[input_col] = col
+                        matched = True
+                        break
+
+            if not matched:
+                return False, {}
+
+        return len(mapping) == len(self.input_columns), mapping
+
+    def apply(
+        self,
+        df: pd.DataFrame,
+        column_mapping: Optional[dict[str, str]] = None,
+        validate: bool = True,
+    ) -> pd.Series:
+        """
+        Apply the transformation rule to a DataFrame.
+
+        Parameters
+        ----------
+        df : DataFrame
+            Input data
+        column_mapping : dict, optional
+            Mapping from rule's input_columns to actual column names
+        validate : bool, default=True
+            Whether to validate before execution
+
+        Returns
+        -------
+        Series
+            Transformed feature values
+
+        Raises
+        ------
+        ValueError
+            If required columns are missing or code execution fails
+        """
+        column_mapping = column_mapping or {}
+
+        # Prepare the code with actual column names
+        code = self._prepare_code(column_mapping)
+
+        if validate:
+            # Check required columns exist
+            for input_col in self.input_columns:
+                actual_col = column_mapping.get(input_col, input_col)
+                if actual_col not in df.columns:
+                    raise ValueError(f"Required column '{actual_col}' not found in DataFrame")
+
+        # Execute the code in a restricted environment
+        local_vars: dict[str, Any] = {"df": df, "np": np, "pd": pd}
+        try:
+            exec(self._get_safe_code(code), {"__builtins__": self._get_safe_builtins()}, local_vars)
+
+            if "result" not in local_vars:
+                raise ValueError("Code did not produce a 'result' variable")
+
+            result = local_vars["result"]
+
+            # Increment usage count
+            self.usage_count += 1
+
+            return result
+
+        except Exception as e:
+            logger.error(f"Failed to apply rule '{self.name}': {e}")
+            raise ValueError(f"Rule execution failed: {e}") from e
+
+    def _prepare_code(self, column_mapping: dict[str, str]) -> str:
+        """Substitute column placeholders with actual column names."""
+        code = self.code
+
+        # Replace {col} style placeholders
+        for placeholder, actual in column_mapping.items():
+            code = code.replace(f"{{{{ '{placeholder}' }}}}", f"'{actual}'")
+            code = code.replace(f"{{{placeholder}}}", actual)
+            code = code.replace(f"df['{placeholder}']", f"df['{actual}']")
+            code = code.replace(f'df["{placeholder}"]', f'df["{actual}"]')
+
+        return code
+
+    def _get_safe_code(self, code: str) -> str:
+        """Wrap code for safe execution."""
+        return code
+
+    def _get_safe_builtins(self) -> dict[str, Any]:
+        """Get restricted builtins for safe code execution."""
+        return {
+            "len": len,
+            "sum": sum,
+            "max": max,
+            "min": min,
+            "int": int,
+            "float": float,
+            "str": str,
+            "bool": bool,
+            "abs": abs,
+            "round": round,
+            "pow": pow,
+            "range": range,
+            "list": list,
+            "dict": dict,
+            "set": set,
+            "tuple": tuple,
+            "sorted": sorted,
+            "reversed": reversed,
+            "enumerate": enumerate,
+            "zip": zip,
+            "any": any,
+            "all": all,
+            "map": map,
+            "filter": filter,
+            "isinstance": isinstance,
+            "hasattr": hasattr,
+            "getattr": getattr,
+        }
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert rule to dictionary for serialization."""
+        return self.model_dump()
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "TransformRule":
+        """Create rule from dictionary."""
+        return cls(**data)
+
+    def __repr__(self) -> str:
+        return f"TransformRule(name='{self.name}', description='{self.description[:50]}...')"
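
The new transform_rule.py module above is self-contained, so its behaviour can be read directly from the diff: _prepare_code substitutes {placeholder} tokens in the rule's code, matches_columns maps expected inputs to real columns (exact name, then the regex column_patterns, then substring matching), and apply executes the code with restricted builtins and returns the result variable it produces. A short usage sketch built only from that API; the DataFrame contents and column names are illustrative:

```python
# Usage sketch for the TransformRule shown above; the 'price'/'quantity' data is illustrative.
import pandas as pd

from featcopilot import TransformRule

rule = TransformRule(
    name="ratio_calculation",
    description="Calculate ratio of two numeric columns",
    code="result = df['{col1}'] / (df['{col2}'] + 1e-8)",
    input_columns=["col1", "col2"],
    column_patterns=["price.*", ".*quantity.*"],
    tags=["ratio", "numeric"],
)

df = pd.DataFrame({"price": [10.0, 24.0], "quantity": [2, 4]})

# matches_columns tries exact names, then the regex patterns, then substring matching.
ok, mapping = rule.matches_columns(list(df.columns))
# ok == True, mapping == {"col1": "price", "col2": "quantity"}

if ok:
    # apply() substitutes the placeholders, runs the code with restricted builtins,
    # and returns the 'result' variable produced by the rule's code.
    feature = rule.apply(df, column_mapping=mapping)
    df[rule.get_output_name(mapping)] = feature
```

Because the rule stores placeholders rather than dataset-specific names, the same instance can be re-applied to other DataFrames whose columns satisfy matches_columns.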

{featcopilot-0.1.0 → featcopilot-0.3.0}/featcopilot/engines/relational.py

@@ -11,6 +11,9 @@ from pydantic import Field
 
 from featcopilot.core.base import BaseEngine, EngineConfig
 from featcopilot.core.feature import FeatureSet
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
 
 
 class RelationalEngineConfig(EngineConfig):
@@ -141,7 +144,7 @@ class RelationalEngine(BaseEngine):
         self._primary_columns = X.columns.tolist()
 
         if self.config.verbose:
-            print(f"RelationalEngine: {len(self._relationships)} relationships defined")
+            logger.info(f"RelationalEngine: {len(self._relationships)} relationships defined")
 
         self._is_fitted = True
         return self
@@ -191,7 +194,7 @@ class RelationalEngine(BaseEngine):
         self._feature_names = [c for c in result.columns if c not in X.columns]
 
         if self.config.verbose:
-            print(f"RelationalEngine: Generated {len(self._feature_names)} features")
+            logger.info(f"RelationalEngine: Generated {len(self._feature_names)} features")
 
         return result