featcopilot 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featcopilot/__init__.py +3 -1
- featcopilot/core/feature.py +5 -1
- featcopilot/engines/relational.py +5 -2
- featcopilot/engines/tabular.py +6 -3
- featcopilot/engines/text.py +6 -3
- featcopilot/engines/timeseries.py +5 -2
- featcopilot/llm/__init__.py +4 -1
- featcopilot/llm/code_generator.py +7 -4
- featcopilot/llm/copilot_client.py +67 -23
- featcopilot/llm/explainer.py +6 -3
- featcopilot/llm/litellm_client.py +595 -0
- featcopilot/llm/semantic_engine.py +65 -16
- featcopilot/selection/importance.py +5 -2
- featcopilot/selection/redundancy.py +6 -3
- featcopilot/selection/statistical.py +4 -1
- featcopilot/selection/unified.py +4 -1
- featcopilot/stores/__init__.py +15 -0
- featcopilot/stores/base.py +166 -0
- featcopilot/stores/feast_store.py +541 -0
- featcopilot/transformers/sklearn_compat.py +8 -5
- featcopilot/utils/__init__.py +14 -0
- featcopilot/utils/logger.py +47 -0
- featcopilot/utils/models.py +287 -0
- featcopilot/utils/parallel.py +5 -1
- {featcopilot-0.1.0.dist-info → featcopilot-0.2.0.dist-info}/METADATA +32 -9
- featcopilot-0.2.0.dist-info/RECORD +35 -0
- featcopilot-0.1.0.dist-info/RECORD +0 -29
- {featcopilot-0.1.0.dist-info → featcopilot-0.2.0.dist-info}/WHEEL +0 -0
- {featcopilot-0.1.0.dist-info → featcopilot-0.2.0.dist-info}/top_level.txt +0 -0
featcopilot/__init__.py
CHANGED
|
@@ -5,7 +5,9 @@ A unified feature engineering framework combining traditional approaches
|
|
|
5
5
|
with novel LLM-powered capabilities via GitHub Copilot SDK.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
from importlib.metadata import version
|
|
9
|
+
|
|
10
|
+
__version__ = version("featcopilot")
|
|
9
11
|
__author__ = "FeatCopilot Contributors"
|
|
10
12
|
|
|
11
13
|
from featcopilot.core.base import BaseEngine, BaseSelector
|
featcopilot/core/feature.py
CHANGED
|
@@ -7,6 +7,10 @@ from typing import Any, Optional
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
import pandas as pd
|
|
9
9
|
|
|
10
|
+
from featcopilot.utils.logger import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger(__name__)
|
|
13
|
+
|
|
10
14
|
|
|
11
15
|
class FeatureType(Enum):
|
|
12
16
|
"""Types of features."""
|
|
@@ -220,5 +224,5 @@ class FeatureSet:
|
|
|
220
224
|
result[feature.name] = feature.compute(df)
|
|
221
225
|
except Exception as e:
|
|
222
226
|
# Log warning but continue
|
|
223
|
-
|
|
227
|
+
logger.warning(f"Could not compute feature {feature.name}: {e}")
|
|
224
228
|
return result
|
featcopilot/engines/relational.py
CHANGED
|
@@ -11,6 +11,9 @@ from pydantic import Field
|
|
|
11
11
|
|
|
12
12
|
from featcopilot.core.base import BaseEngine, EngineConfig
|
|
13
13
|
from featcopilot.core.feature import FeatureSet
|
|
14
|
+
from featcopilot.utils.logger import get_logger
|
|
15
|
+
|
|
16
|
+
logger = get_logger(__name__)
|
|
14
17
|
|
|
15
18
|
|
|
16
19
|
class RelationalEngineConfig(EngineConfig):
|
|
@@ -141,7 +144,7 @@ class RelationalEngine(BaseEngine):
|
|
|
141
144
|
self._primary_columns = X.columns.tolist()
|
|
142
145
|
|
|
143
146
|
if self.config.verbose:
|
|
144
|
-
|
|
147
|
+
logger.info(f"RelationalEngine: {len(self._relationships)} relationships defined")
|
|
145
148
|
|
|
146
149
|
self._is_fitted = True
|
|
147
150
|
return self
|
|
@@ -191,7 +194,7 @@ class RelationalEngine(BaseEngine):
|
|
|
191
194
|
self._feature_names = [c for c in result.columns if c not in X.columns]
|
|
192
195
|
|
|
193
196
|
if self.config.verbose:
|
|
194
|
-
|
|
197
|
+
logger.info(f"RelationalEngine: Generated {len(self._feature_names)} features")
|
|
195
198
|
|
|
196
199
|
return result
|
|
197
200
|
|
featcopilot/engines/tabular.py
CHANGED
|
@@ -12,6 +12,9 @@ from pydantic import Field
|
|
|
12
12
|
|
|
13
13
|
from featcopilot.core.base import BaseEngine, EngineConfig
|
|
14
14
|
from featcopilot.core.feature import Feature, FeatureOrigin, FeatureSet, FeatureType
|
|
15
|
+
from featcopilot.utils.logger import get_logger
|
|
16
|
+
|
|
17
|
+
logger = get_logger(__name__)
|
|
15
18
|
|
|
16
19
|
|
|
17
20
|
class TabularEngineConfig(EngineConfig):
|
|
@@ -124,7 +127,7 @@ class TabularEngine(BaseEngine):
|
|
|
124
127
|
]
|
|
125
128
|
|
|
126
129
|
if self.config.verbose:
|
|
127
|
-
|
|
130
|
+
logger.info(f"TabularEngine: Found {len(self._numeric_columns)} numeric columns")
|
|
128
131
|
|
|
129
132
|
# Plan features to generate
|
|
130
133
|
self._plan_features(X)
|
|
@@ -207,7 +210,7 @@ class TabularEngine(BaseEngine):
|
|
|
207
210
|
self._feature_set.add(feature)
|
|
208
211
|
|
|
209
212
|
if self.config.verbose:
|
|
210
|
-
|
|
213
|
+
logger.info(f"TabularEngine: Planned {len(self._feature_set)} features")
|
|
211
214
|
|
|
212
215
|
def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
|
|
213
216
|
"""
|
|
@@ -284,7 +287,7 @@ class TabularEngine(BaseEngine):
|
|
|
284
287
|
self._feature_names = [c for c in result.columns if c not in X.columns]
|
|
285
288
|
|
|
286
289
|
if self.config.verbose:
|
|
287
|
-
|
|
290
|
+
logger.info(f"TabularEngine: Generated {len(self._feature_names)} features")
|
|
288
291
|
|
|
289
292
|
return result
|
|
290
293
|
|
featcopilot/engines/text.py
CHANGED
|
@@ -11,6 +11,9 @@ from pydantic import Field
|
|
|
11
11
|
|
|
12
12
|
from featcopilot.core.base import BaseEngine, EngineConfig
|
|
13
13
|
from featcopilot.core.feature import FeatureSet
|
|
14
|
+
from featcopilot.utils.logger import get_logger
|
|
15
|
+
|
|
16
|
+
logger = get_logger(__name__)
|
|
14
17
|
|
|
15
18
|
|
|
16
19
|
class TextEngineConfig(EngineConfig):
|
|
@@ -106,7 +109,7 @@ class TextEngine(BaseEngine):
|
|
|
106
109
|
]
|
|
107
110
|
|
|
108
111
|
if self.config.verbose:
|
|
109
|
-
|
|
112
|
+
logger.info(f"TextEngine: Found {len(self._text_columns)} text columns")
|
|
110
113
|
|
|
111
114
|
# Fit TF-IDF vectorizers if needed
|
|
112
115
|
if "tfidf" in self.config.features:
|
|
@@ -135,7 +138,7 @@ class TextEngine(BaseEngine):
|
|
|
135
138
|
|
|
136
139
|
except ImportError:
|
|
137
140
|
if self.config.verbose:
|
|
138
|
-
|
|
141
|
+
logger.warning("TextEngine: sklearn not available for TF-IDF, skipping")
|
|
139
142
|
|
|
140
143
|
def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
|
|
141
144
|
"""
|
|
@@ -191,7 +194,7 @@ class TextEngine(BaseEngine):
|
|
|
191
194
|
self._feature_names = [c for c in result.columns if c not in X.columns]
|
|
192
195
|
|
|
193
196
|
if self.config.verbose:
|
|
194
|
-
|
|
197
|
+
logger.info(f"TextEngine: Extracted {len(self._feature_names)} features")
|
|
195
198
|
|
|
196
199
|
return result
|
|
197
200
|
|
featcopilot/engines/timeseries.py
CHANGED
|
@@ -12,6 +12,9 @@ from pydantic import Field
|
|
|
12
12
|
|
|
13
13
|
from featcopilot.core.base import BaseEngine, EngineConfig
|
|
14
14
|
from featcopilot.core.feature import FeatureSet
|
|
15
|
+
from featcopilot.utils.logger import get_logger
|
|
16
|
+
|
|
17
|
+
logger = get_logger(__name__)
|
|
15
18
|
|
|
16
19
|
|
|
17
20
|
class TimeSeriesEngineConfig(EngineConfig):
|
|
@@ -123,7 +126,7 @@ class TimeSeriesEngine(BaseEngine):
|
|
|
123
126
|
self._time_columns = X.select_dtypes(include=[np.number]).columns.tolist()
|
|
124
127
|
|
|
125
128
|
if self.config.verbose:
|
|
126
|
-
|
|
129
|
+
logger.info(f"TimeSeriesEngine: Found {len(self._time_columns)} numeric columns")
|
|
127
130
|
|
|
128
131
|
self._is_fitted = True
|
|
129
132
|
return self
|
|
@@ -177,7 +180,7 @@ class TimeSeriesEngine(BaseEngine):
|
|
|
177
180
|
self._feature_names = list(result.columns)
|
|
178
181
|
|
|
179
182
|
if self.config.verbose:
|
|
180
|
-
|
|
183
|
+
logger.info(f"TimeSeriesEngine: Extracted {len(self._feature_names)} features")
|
|
181
184
|
|
|
182
185
|
return result
|
|
183
186
|
|
featcopilot/llm/__init__.py
CHANGED
|
@@ -1,15 +1,18 @@
|
|
|
1
1
|
"""LLM-powered feature engineering module.
|
|
2
2
|
|
|
3
|
-
Uses GitHub Copilot SDK for intelligent feature generation.
|
|
3
|
+
Uses GitHub Copilot SDK or LiteLLM for intelligent feature generation.
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
6
|
from featcopilot.llm.code_generator import FeatureCodeGenerator
|
|
7
7
|
from featcopilot.llm.copilot_client import CopilotFeatureClient
|
|
8
8
|
from featcopilot.llm.explainer import FeatureExplainer
|
|
9
|
+
from featcopilot.llm.litellm_client import LiteLLMFeatureClient, SyncLiteLLMFeatureClient
|
|
9
10
|
from featcopilot.llm.semantic_engine import SemanticEngine
|
|
10
11
|
|
|
11
12
|
__all__ = [
|
|
12
13
|
"CopilotFeatureClient",
|
|
14
|
+
"LiteLLMFeatureClient",
|
|
15
|
+
"SyncLiteLLMFeatureClient",
|
|
13
16
|
"SemanticEngine",
|
|
14
17
|
"FeatureExplainer",
|
|
15
18
|
"FeatureCodeGenerator",
|
featcopilot/llm/code_generator.py
CHANGED
|
@@ -10,6 +10,9 @@ import pandas as pd
|
|
|
10
10
|
|
|
11
11
|
from featcopilot.core.feature import Feature, FeatureOrigin, FeatureType
|
|
12
12
|
from featcopilot.llm.copilot_client import SyncCopilotFeatureClient
|
|
13
|
+
from featcopilot.utils.logger import get_logger
|
|
14
|
+
|
|
15
|
+
logger = get_logger(__name__)
|
|
13
16
|
|
|
14
17
|
|
|
15
18
|
class FeatureCodeGenerator:
|
|
@@ -21,7 +24,7 @@ class FeatureCodeGenerator:
|
|
|
21
24
|
|
|
22
25
|
Parameters
|
|
23
26
|
----------
|
|
24
|
-
model : str, default='gpt-5'
|
|
27
|
+
model : str, default='gpt-5.2'
|
|
25
28
|
LLM model to use
|
|
26
29
|
validate : bool, default=True
|
|
27
30
|
Whether to validate generated code
|
|
@@ -35,7 +38,7 @@ class FeatureCodeGenerator:
|
|
|
35
38
|
... )
|
|
36
39
|
"""
|
|
37
40
|
|
|
38
|
-
def __init__(self, model: str = "gpt-5", validate: bool = True, verbose: bool = False):
|
|
41
|
+
def __init__(self, model: str = "gpt-5.2", validate: bool = True, verbose: bool = False):
|
|
39
42
|
self.model = model
|
|
40
43
|
self.validate = validate
|
|
41
44
|
self.verbose = verbose
|
|
@@ -98,7 +101,7 @@ class FeatureCodeGenerator:
|
|
|
98
101
|
)
|
|
99
102
|
if not validation["valid"]:
|
|
100
103
|
if self.verbose:
|
|
101
|
-
|
|
104
|
+
logger.warning(f"Code validation failed: {validation['error']}")
|
|
102
105
|
# Try to fix common issues
|
|
103
106
|
code = self._fix_common_issues(code, validation["error"])
|
|
104
107
|
|
|
@@ -144,7 +147,7 @@ class FeatureCodeGenerator:
|
|
|
144
147
|
features.append(feature)
|
|
145
148
|
except Exception as e:
|
|
146
149
|
if self.verbose:
|
|
147
|
-
|
|
150
|
+
logger.error(f"Failed to generate feature for '{desc}': {e}")
|
|
148
151
|
|
|
149
152
|
return features
|
|
150
153
|
|
featcopilot/llm/copilot_client.py
CHANGED
|
@@ -10,11 +10,15 @@ from typing import Any, Optional
|
|
|
10
10
|
|
|
11
11
|
from pydantic import BaseModel, Field
|
|
12
12
|
|
|
13
|
+
from featcopilot.utils.logger import get_logger
|
|
14
|
+
|
|
15
|
+
logger = get_logger(__name__)
|
|
16
|
+
|
|
13
17
|
|
|
14
18
|
class CopilotConfig(BaseModel):
|
|
15
19
|
"""Configuration for Copilot client."""
|
|
16
20
|
|
|
17
|
-
model: str = Field(default="gpt-5", description="Model to use")
|
|
21
|
+
model: str = Field(default="gpt-5.2", description="Model to use")
|
|
18
22
|
temperature: float = Field(default=0.3, ge=0, le=1, description="Temperature for generation")
|
|
19
23
|
max_tokens: int = Field(default=4096, description="Maximum tokens in response")
|
|
20
24
|
timeout: float = Field(default=60.0, description="Timeout in seconds")
|
|
@@ -35,12 +39,12 @@ class CopilotFeatureClient:
|
|
|
35
39
|
----------
|
|
36
40
|
config : CopilotConfig, optional
|
|
37
41
|
Configuration for the client
|
|
38
|
-
model : str, default='gpt-5'
|
|
42
|
+
model : str, default='gpt-5.2'
|
|
39
43
|
Model to use for generation
|
|
40
44
|
|
|
41
45
|
Examples
|
|
42
46
|
--------
|
|
43
|
-
>>> client = CopilotFeatureClient(model='gpt-5')
|
|
47
|
+
>>> client = CopilotFeatureClient(model='gpt-5.2')
|
|
44
48
|
>>> await client.start()
|
|
45
49
|
>>> suggestions = await client.suggest_features(
|
|
46
50
|
... column_info={'age': 'int', 'income': 'float'},
|
|
@@ -49,7 +53,7 @@ class CopilotFeatureClient:
|
|
|
49
53
|
>>> await client.stop()
|
|
50
54
|
"""
|
|
51
55
|
|
|
52
|
-
def __init__(self, config: Optional[CopilotConfig] = None, model: str = "gpt-5", **kwargs):
|
|
56
|
+
def __init__(self, config: Optional[CopilotConfig] = None, model: str = "gpt-5.2", **kwargs):
|
|
53
57
|
self.config = config or CopilotConfig(model=model, **kwargs)
|
|
54
58
|
self._client = None
|
|
55
59
|
self._session = None
|
|
@@ -82,13 +86,13 @@ class CopilotFeatureClient:
|
|
|
82
86
|
# Copilot SDK not installed - use mock mode
|
|
83
87
|
self._copilot_available = False
|
|
84
88
|
self._is_started = True
|
|
85
|
-
|
|
89
|
+
logger.warning("copilot-sdk not installed. Using mock LLM responses.")
|
|
86
90
|
|
|
87
91
|
except Exception as e:
|
|
88
92
|
# Copilot not available - use mock mode
|
|
89
93
|
self._copilot_available = False
|
|
90
94
|
self._is_started = True
|
|
91
|
-
|
|
95
|
+
logger.warning(f"Could not connect to Copilot: {e}. Using mock LLM responses.")
|
|
92
96
|
|
|
93
97
|
return self
|
|
94
98
|
|
|
@@ -469,7 +473,37 @@ result = df['col1'] / (df['col2'] + 1e-8)
|
|
|
469
473
|
local_vars = {"df": df, "np": np, "pd": pd}
|
|
470
474
|
exec(
|
|
471
475
|
code,
|
|
472
|
-
{
|
|
476
|
+
{
|
|
477
|
+
"__builtins__": {
|
|
478
|
+
"len": len,
|
|
479
|
+
"sum": sum,
|
|
480
|
+
"max": max,
|
|
481
|
+
"min": min,
|
|
482
|
+
"int": int,
|
|
483
|
+
"float": float,
|
|
484
|
+
"str": str,
|
|
485
|
+
"bool": bool,
|
|
486
|
+
"abs": abs,
|
|
487
|
+
"round": round,
|
|
488
|
+
"pow": pow,
|
|
489
|
+
"range": range,
|
|
490
|
+
"list": list,
|
|
491
|
+
"dict": dict,
|
|
492
|
+
"set": set,
|
|
493
|
+
"tuple": tuple,
|
|
494
|
+
"sorted": sorted,
|
|
495
|
+
"reversed": reversed,
|
|
496
|
+
"enumerate": enumerate,
|
|
497
|
+
"zip": zip,
|
|
498
|
+
"any": any,
|
|
499
|
+
"all": all,
|
|
500
|
+
"map": map,
|
|
501
|
+
"filter": filter,
|
|
502
|
+
"isinstance": isinstance,
|
|
503
|
+
"hasattr": hasattr,
|
|
504
|
+
"getattr": getattr,
|
|
505
|
+
}
|
|
506
|
+
},
|
|
473
507
|
local_vars,
|
|
474
508
|
)
|
|
475
509
|
|
|
@@ -489,33 +523,43 @@ class SyncCopilotFeatureClient:
|
|
|
489
523
|
|
|
490
524
|
def __init__(self, **kwargs):
|
|
491
525
|
self._async_client = CopilotFeatureClient(**kwargs)
|
|
492
|
-
self._loop = None
|
|
493
526
|
|
|
494
|
-
def
|
|
495
|
-
|
|
527
|
+
def _run_async(self, coro):
|
|
528
|
+
"""Run an async coroutine, handling nested event loops (e.g., Jupyter)."""
|
|
529
|
+
try:
|
|
530
|
+
# Check if we're in a running event loop (e.g., Jupyter)
|
|
531
|
+
loop = asyncio.get_running_loop()
|
|
532
|
+
# We're in a running loop - use nest_asyncio if available
|
|
496
533
|
try:
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
534
|
+
import nest_asyncio
|
|
535
|
+
|
|
536
|
+
nest_asyncio.apply()
|
|
537
|
+
return loop.run_until_complete(coro)
|
|
538
|
+
except ImportError:
|
|
539
|
+
# nest_asyncio not available, try alternative approach
|
|
540
|
+
import concurrent.futures
|
|
541
|
+
|
|
542
|
+
with concurrent.futures.ThreadPoolExecutor() as executor:
|
|
543
|
+
future = executor.submit(asyncio.run, coro)
|
|
544
|
+
return future.result()
|
|
545
|
+
except RuntimeError:
|
|
546
|
+
# No running event loop - safe to use asyncio.run
|
|
547
|
+
return asyncio.run(coro)
|
|
502
548
|
|
|
503
549
|
def start(self):
|
|
504
|
-
return self.
|
|
550
|
+
return self._run_async(self._async_client.start())
|
|
505
551
|
|
|
506
552
|
def stop(self):
|
|
507
|
-
return self.
|
|
553
|
+
return self._run_async(self._async_client.stop())
|
|
508
554
|
|
|
509
555
|
def suggest_features(self, **kwargs):
|
|
510
|
-
return self.
|
|
556
|
+
return self._run_async(self._async_client.suggest_features(**kwargs))
|
|
511
557
|
|
|
512
558
|
def explain_feature(self, **kwargs):
|
|
513
|
-
return self.
|
|
559
|
+
return self._run_async(self._async_client.explain_feature(**kwargs))
|
|
514
560
|
|
|
515
561
|
def generate_feature_code(self, **kwargs):
|
|
516
|
-
return self.
|
|
562
|
+
return self._run_async(self._async_client.generate_feature_code(**kwargs))
|
|
517
563
|
|
|
518
564
|
def validate_feature_code(self, code: str, sample_data=None):
|
|
519
|
-
return self.
|
|
520
|
-
self._async_client.validate_feature_code(code=code, sample_data=sample_data)
|
|
521
|
-
)
|
|
565
|
+
return self._run_async(self._async_client.validate_feature_code(code=code, sample_data=sample_data))
|
featcopilot/llm/explainer.py
CHANGED
|
@@ -9,6 +9,9 @@ import pandas as pd
|
|
|
9
9
|
|
|
10
10
|
from featcopilot.core.feature import Feature, FeatureSet
|
|
11
11
|
from featcopilot.llm.copilot_client import SyncCopilotFeatureClient
|
|
12
|
+
from featcopilot.utils.logger import get_logger
|
|
13
|
+
|
|
14
|
+
logger = get_logger(__name__)
|
|
12
15
|
|
|
13
16
|
|
|
14
17
|
class FeatureExplainer:
|
|
@@ -20,7 +23,7 @@ class FeatureExplainer:
|
|
|
20
23
|
|
|
21
24
|
Parameters
|
|
22
25
|
----------
|
|
23
|
-
model : str, default='gpt-5'
|
|
26
|
+
model : str, default='gpt-5.2'
|
|
24
27
|
LLM model to use
|
|
25
28
|
|
|
26
29
|
Examples
|
|
@@ -29,7 +32,7 @@ class FeatureExplainer:
|
|
|
29
32
|
>>> explanations = explainer.explain_features(feature_set, task='predict churn')
|
|
30
33
|
"""
|
|
31
34
|
|
|
32
|
-
def __init__(self, model: str = "gpt-5", verbose: bool = False):
|
|
35
|
+
def __init__(self, model: str = "gpt-5.2", verbose: bool = False):
|
|
33
36
|
self.model = model
|
|
34
37
|
self.verbose = verbose
|
|
35
38
|
self._client: Optional[SyncCopilotFeatureClient] = None
|
|
@@ -115,7 +118,7 @@ class FeatureExplainer:
|
|
|
115
118
|
|
|
116
119
|
except Exception as e:
|
|
117
120
|
if self.verbose:
|
|
118
|
-
|
|
121
|
+
logger.error(f"Could not explain {feature.name}: {e}")
|
|
119
122
|
explanations[feature.name] = f"Feature based on: {', '.join(feature.source_columns)}"
|
|
120
123
|
|
|
121
124
|
return explanations
|