featcopilot-0.1.0-py3-none-any.whl
- featcopilot/__init__.py +29 -0
- featcopilot/core/__init__.py +13 -0
- featcopilot/core/base.py +195 -0
- featcopilot/core/feature.py +224 -0
- featcopilot/core/registry.py +128 -0
- featcopilot/engines/__init__.py +13 -0
- featcopilot/engines/relational.py +256 -0
- featcopilot/engines/tabular.py +293 -0
- featcopilot/engines/text.py +211 -0
- featcopilot/engines/timeseries.py +402 -0
- featcopilot/llm/__init__.py +16 -0
- featcopilot/llm/code_generator.py +295 -0
- featcopilot/llm/copilot_client.py +521 -0
- featcopilot/llm/explainer.py +200 -0
- featcopilot/llm/semantic_engine.py +379 -0
- featcopilot/selection/__init__.py +13 -0
- featcopilot/selection/importance.py +161 -0
- featcopilot/selection/redundancy.py +156 -0
- featcopilot/selection/statistical.py +199 -0
- featcopilot/selection/unified.py +172 -0
- featcopilot/transformers/__init__.py +11 -0
- featcopilot/transformers/sklearn_compat.py +401 -0
- featcopilot/utils/__init__.py +9 -0
- featcopilot/utils/cache.py +221 -0
- featcopilot/utils/parallel.py +109 -0
- featcopilot-0.1.0.dist-info/METADATA +218 -0
- featcopilot-0.1.0.dist-info/RECORD +29 -0
- featcopilot-0.1.0.dist-info/WHEEL +5 -0
- featcopilot-0.1.0.dist-info/top_level.txt +1 -0

featcopilot/engines/text.py (@@ -0,0 +1,211 @@)

```python
"""Text feature engineering engine.

Generates features from text data using embeddings and NLP techniques.
"""

from typing import Any, Optional, Union

import numpy as np
import pandas as pd
from pydantic import Field

from featcopilot.core.base import BaseEngine, EngineConfig
from featcopilot.core.feature import FeatureSet


class TextEngineConfig(EngineConfig):
    """Configuration for text feature engine."""

    name: str = "TextEngine"
    features: list[str] = Field(
        default_factory=lambda: ["length", "word_count", "char_stats"],
        description="Feature types to extract",
    )
    max_vocab_size: int = Field(default=5000, description="Max vocabulary size for TF-IDF")
    n_components: int = Field(default=50, description="Components for dimensionality reduction")


class TextEngine(BaseEngine):
    """
    Text feature engineering engine.

    Extracts features from text columns including:
    - Length and character statistics
    - Word count features
    - TF-IDF features (optional)
    - Sentiment features (optional)
    - Embedding features (with LLM integration)

    Parameters
    ----------
    features : list
        Feature types to extract
    max_vocab_size : int, default=5000
        Maximum vocabulary size for TF-IDF

    Examples
    --------
    >>> engine = TextEngine(features=['length', 'word_count', 'tfidf'])
    >>> X_features = engine.fit_transform(text_df)
    """

    def __init__(
        self,
        features: Optional[list[str]] = None,
        max_vocab_size: int = 5000,
        max_features: Optional[int] = None,
        verbose: bool = False,
        **kwargs,
    ):
        config = TextEngineConfig(
            features=features or ["length", "word_count", "char_stats"],
            max_vocab_size=max_vocab_size,
            max_features=max_features,
            verbose=verbose,
            **kwargs,
        )
        super().__init__(config=config)
        self.config: TextEngineConfig = config
        self._text_columns: list[str] = []
        self._vectorizers: dict[str, Any] = {}
        self._feature_set = FeatureSet()

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        text_columns: Optional[list[str]] = None,
        **kwargs,
    ) -> "TextEngine":
        """
        Fit the engine to identify and process text columns.

        Parameters
        ----------
        X : DataFrame
            Input data
        y : Series, optional
            Target variable
        text_columns : list, optional
            Specific columns to treat as text

        Returns
        -------
        self : TextEngine
        """
        X = self._validate_input(X)

        # Identify text columns
        if text_columns:
            self._text_columns = text_columns
        else:
            self._text_columns = X.select_dtypes(include=["object"]).columns.tolist()
            # Filter to likely text columns (not IDs, not low cardinality)
            self._text_columns = [
                col for col in self._text_columns if X[col].str.len().mean() > 10 and X[col].nunique() > 10
            ]

        if self.config.verbose:
            print(f"TextEngine: Found {len(self._text_columns)} text columns")

        # Fit TF-IDF vectorizers if needed
        if "tfidf" in self.config.features:
            self._fit_tfidf(X)

        self._is_fitted = True
        return self

    def _fit_tfidf(self, X: pd.DataFrame) -> None:
        """Fit TF-IDF vectorizers for text columns."""
        try:
            from sklearn.decomposition import TruncatedSVD
            from sklearn.feature_extraction.text import TfidfVectorizer

            for col in self._text_columns:
                texts = X[col].fillna("").astype(str)
                vectorizer = TfidfVectorizer(max_features=self.config.max_vocab_size, stop_words="english")
                tfidf_matrix = vectorizer.fit_transform(texts)

                # Reduce dimensions with SVD
                n_components = min(self.config.n_components, tfidf_matrix.shape[1])
                if n_components > 0:
                    svd = TruncatedSVD(n_components=n_components)
                    svd.fit(tfidf_matrix)
                    self._vectorizers[col] = {"vectorizer": vectorizer, "svd": svd}

        except ImportError:
            if self.config.verbose:
                print("TextEngine: sklearn not available for TF-IDF, skipping")

    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """
        Extract text features.

        Parameters
        ----------
        X : DataFrame
            Input data

        Returns
        -------
        X_features : DataFrame
            Extracted features
        """
        if not self._is_fitted:
            raise RuntimeError("Engine must be fitted before transform")

        X = self._validate_input(X)
        result = X.copy()

        for col in self._text_columns:
            texts = X[col].fillna("").astype(str)

            # Length features
            if "length" in self.config.features:
                result[f"{col}_char_length"] = texts.str.len()
                result[f"{col}_word_count"] = texts.str.split().str.len()

            # Character statistics
            if "char_stats" in self.config.features:
                result[f"{col}_uppercase_ratio"] = texts.apply(
                    lambda x: sum(1 for c in x if c.isupper()) / max(len(x), 1)
                )
                result[f"{col}_digit_ratio"] = texts.apply(lambda x: sum(1 for c in x if c.isdigit()) / max(len(x), 1))
                result[f"{col}_space_ratio"] = texts.apply(lambda x: sum(1 for c in x if c.isspace()) / max(len(x), 1))
                result[f"{col}_special_char_count"] = texts.apply(
                    lambda x: sum(1 for c in x if not c.isalnum() and not c.isspace())
                )

            # Word count features
            if "word_count" in self.config.features:
                result[f"{col}_avg_word_length"] = texts.apply(lambda x: np.mean([len(w) for w in x.split()] or [0]))
                result[f"{col}_unique_word_ratio"] = texts.apply(
                    lambda x: len(set(x.lower().split())) / max(len(x.split()), 1)
                )

            # TF-IDF features
            if "tfidf" in self.config.features and col in self._vectorizers:
                tfidf_features = self._transform_tfidf(texts, col)
                result = pd.concat([result, tfidf_features], axis=1)

        self._feature_names = [c for c in result.columns if c not in X.columns]

        if self.config.verbose:
            print(f"TextEngine: Extracted {len(self._feature_names)} features")

        return result

    def _transform_tfidf(self, texts: pd.Series, col: str) -> pd.DataFrame:
        """Transform texts using fitted TF-IDF + SVD."""
        vectorizer = self._vectorizers[col]["vectorizer"]
        svd = self._vectorizers[col]["svd"]

        tfidf_matrix = vectorizer.transform(texts)
        reduced = svd.transform(tfidf_matrix)

        feature_names = [f"{col}_tfidf_{i}" for i in range(reduced.shape[1])]
        return pd.DataFrame(reduced, columns=feature_names, index=texts.index)

    def get_feature_set(self) -> FeatureSet:
        """Get the feature set with metadata."""
        return self._feature_set
```
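
A minimal usage sketch based only on the methods shown above; the `review_text` column and toy rows are invented for illustration. `text_columns` is passed explicitly because the auto-detection heuristic in `fit()` requires more than 10 unique values per column:

```python
import pandas as pd

from featcopilot.engines.text import TextEngine

# Hypothetical toy data: one free-text column plus a numeric column.
df = pd.DataFrame({
    "review_text": [
        "Great product, arrived quickly and works as described!",
        "Terrible experience. Broke after two days of light use.",
        "Average quality for the price; packaging could be better.",
    ] * 5,
    "rating": [5, 1, 3] * 5,
})

engine = TextEngine(features=["length", "char_stats", "word_count"], verbose=True)
features = engine.fit(df, text_columns=["review_text"]).transform(df)

# New columns are prefixed with the source column name.
print([c for c in features.columns if c.startswith("review_text_")])
```

Note that `transform` returns the original columns plus the derived ones, so downstream selection steps can still see the raw inputs.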
featcopilot/engines/timeseries.py (@@ -0,0 +1,402 @@)

```python
"""Time series feature engineering engine.

Extracts statistical, frequency, and temporal features from time series data.
Inspired by TSFresh but with better integration and LLM capabilities.
"""

from typing import Optional, Union

import numpy as np
import pandas as pd
from pydantic import Field

from featcopilot.core.base import BaseEngine, EngineConfig
from featcopilot.core.feature import FeatureSet


class TimeSeriesEngineConfig(EngineConfig):
    """Configuration for time series feature engine."""

    name: str = "TimeSeriesEngine"
    features: list[str] = Field(
        default_factory=lambda: [
            "basic_stats",
            "distribution",
            "autocorrelation",
            "peaks",
            "trends",
        ],
        description="Feature groups to extract",
    )
    window_sizes: list[int] = Field(
        default_factory=lambda: [5, 10, 20], description="Window sizes for rolling features"
    )
    n_fft_coefficients: int = Field(default=10, description="Number of FFT coefficients")
    n_autocorr_lags: int = Field(default=10, description="Number of autocorrelation lags")


class TimeSeriesEngine(BaseEngine):
    """
    Time series feature engineering engine.

    Extracts comprehensive features from time series data including:
    - Basic statistics (mean, std, min, max, etc.)
    - Distribution features (skewness, kurtosis, quantiles)
    - Autocorrelation features
    - Frequency domain features (FFT)
    - Peak and trough features
    - Trend features
    - Rolling window statistics

    Parameters
    ----------
    features : list, default=['basic_stats', 'distribution', 'autocorrelation']
        Feature groups to extract
    window_sizes : list, default=[5, 10, 20]
        Window sizes for rolling features
    max_features : int, optional
        Maximum number of features to generate

    Examples
    --------
    >>> engine = TimeSeriesEngine(features=['basic_stats', 'autocorrelation'])
    >>> X_features = engine.fit_transform(time_series_df)
    """

    # Maps feature-group names to extractor method names
    FEATURE_EXTRACTORS = {
        "basic_stats": "_extract_basic_stats",
        "distribution": "_extract_distribution",
        "autocorrelation": "_extract_autocorrelation",
        "peaks": "_extract_peaks",
        "trends": "_extract_trends",
        "rolling": "_extract_rolling",
        "fft": "_extract_fft",
    }

    def __init__(
        self,
        features: Optional[list[str]] = None,
        window_sizes: Optional[list[int]] = None,
        max_features: Optional[int] = None,
        verbose: bool = False,
        **kwargs,
    ):
        config = TimeSeriesEngineConfig(
            features=features or ["basic_stats", "distribution", "autocorrelation"],
            window_sizes=window_sizes or [5, 10, 20],
            max_features=max_features,
            verbose=verbose,
            **kwargs,
        )
        super().__init__(config=config)
        self.config: TimeSeriesEngineConfig = config
        self._time_columns: list[str] = []
        self._feature_set = FeatureSet()

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        time_column: Optional[str] = None,
        **kwargs,
    ) -> "TimeSeriesEngine":
        """
        Fit the engine to identify time series columns.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input data (each row is a time series or time-indexed data)
        y : Series or ndarray, optional
            Target variable
        time_column : str, optional
            Column containing timestamps

        Returns
        -------
        self : TimeSeriesEngine
        """
        X = self._validate_input(X)

        # Identify numeric columns for time series analysis
        self._time_columns = X.select_dtypes(include=[np.number]).columns.tolist()

        if self.config.verbose:
            print(f"TimeSeriesEngine: Found {len(self._time_columns)} numeric columns")

        self._is_fitted = True
        return self

    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """
        Extract time series features from input data.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input data

        Returns
        -------
        X_features : DataFrame
            Extracted features
        """
        if not self._is_fitted:
            raise RuntimeError("Engine must be fitted before transform")

        X = self._validate_input(X)

        # Each numeric column is treated as a single time series spanning the
        # whole frame; all extracted features are aggregated into one output row.
        features_dict = {}
        for col in self._time_columns:
            series = X[col].values

            for feature_group in self.config.features:
                if feature_group in self.FEATURE_EXTRACTORS:
                    method_name = self.FEATURE_EXTRACTORS[feature_group]
                    method = getattr(self, method_name)
                    extracted = method(series, col)
                    features_dict.update(extracted)

        result = pd.DataFrame([features_dict])

        self._feature_names = list(result.columns)

        if self.config.verbose:
            print(f"TimeSeriesEngine: Extracted {len(self._feature_names)} features")

        return result

    def _extract_per_row(self, X: pd.DataFrame) -> pd.DataFrame:
        """Extract features for each row (multiple time series)."""
        all_features = []

        for idx in range(len(X)):
            row_features = {}
            for col in self._time_columns:
                value = X[col].iloc[idx]
                if isinstance(value, (list, np.ndarray)):
                    series = np.array(value)
                else:
                    # Single value - create minimal features
                    row_features[f"{col}_value"] = value
                    continue

                for feature_group in self.config.features:
                    if feature_group in self.FEATURE_EXTRACTORS:
                        method_name = self.FEATURE_EXTRACTORS[feature_group]
                        method = getattr(self, method_name)
                        extracted = method(series, col)
                        row_features.update(extracted)

            all_features.append(row_features)

        return pd.DataFrame(all_features)

    def _extract_basic_stats(self, series: np.ndarray, col: str) -> dict[str, float]:
        """Extract basic statistical features."""
        features = {}
        prefix = col

        if len(series) == 0:
            return features

        features[f"{prefix}_mean"] = np.nanmean(series)
        features[f"{prefix}_std"] = np.nanstd(series)
        features[f"{prefix}_min"] = np.nanmin(series)
        features[f"{prefix}_max"] = np.nanmax(series)
        features[f"{prefix}_range"] = features[f"{prefix}_max"] - features[f"{prefix}_min"]
        features[f"{prefix}_median"] = np.nanmedian(series)
        features[f"{prefix}_sum"] = np.nansum(series)
        features[f"{prefix}_length"] = len(series)
        features[f"{prefix}_var"] = np.nanvar(series)

        # Coefficient of variation
        if features[f"{prefix}_mean"] != 0:
            features[f"{prefix}_cv"] = features[f"{prefix}_std"] / abs(features[f"{prefix}_mean"])
        else:
            features[f"{prefix}_cv"] = 0

        return features

    def _extract_distribution(self, series: np.ndarray, col: str) -> dict[str, float]:
        """Extract distribution-based features."""
        from scipy import stats

        features = {}
        prefix = col

        if len(series) < 4:
            return features

        # Remove NaN values
        series_clean = series[~np.isnan(series)]
        if len(series_clean) < 4:
            return features

        features[f"{prefix}_skewness"] = stats.skew(series_clean)
        features[f"{prefix}_kurtosis"] = stats.kurtosis(series_clean)

        # Quantiles
        for q in [0.1, 0.25, 0.75, 0.9]:
            features[f"{prefix}_q{int(q*100)}"] = np.quantile(series_clean, q)

        # IQR
        q75, q25 = np.quantile(series_clean, [0.75, 0.25])
        features[f"{prefix}_iqr"] = q75 - q25

        return features

    def _extract_autocorrelation(self, series: np.ndarray, col: str) -> dict[str, float]:
        """Extract autocorrelation features."""
        features = {}
        prefix = col

        if len(series) < self.config.n_autocorr_lags + 1:
            return features

        series_clean = series[~np.isnan(series)]
        if len(series_clean) < self.config.n_autocorr_lags + 1:
            return features

        # Compute autocorrelation for different lags
        var = np.var(series_clean)

        if var == 0:
            return features

        for lag in range(1, min(self.config.n_autocorr_lags + 1, len(series_clean))):
            autocorr = np.corrcoef(series_clean[:-lag], series_clean[lag:])[0, 1]
            if not np.isnan(autocorr):
                features[f"{prefix}_autocorr_lag{lag}"] = autocorr

        return features

    def _extract_peaks(self, series: np.ndarray, col: str) -> dict[str, float]:
        """Extract peak and trough related features."""
        from scipy.signal import find_peaks

        features = {}
        prefix = col

        if len(series) < 3:
            return features

        series_clean = series[~np.isnan(series)]
        if len(series_clean) < 3:
            return features

        # Find peaks
        peaks, _ = find_peaks(series_clean)
        troughs, _ = find_peaks(-series_clean)

        features[f"{prefix}_n_peaks"] = len(peaks)
        features[f"{prefix}_n_troughs"] = len(troughs)

        if len(peaks) > 0:
            features[f"{prefix}_peak_mean"] = np.mean(series_clean[peaks])
            features[f"{prefix}_peak_max"] = np.max(series_clean[peaks])

        if len(troughs) > 0:
            features[f"{prefix}_trough_mean"] = np.mean(series_clean[troughs])
            features[f"{prefix}_trough_min"] = np.min(series_clean[troughs])

        return features

    def _extract_trends(self, series: np.ndarray, col: str) -> dict[str, float]:
        """Extract trend-related features."""
        features = {}
        prefix = col

        if len(series) < 2:
            return features

        series_clean = series[~np.isnan(series)]
        if len(series_clean) < 2:
            return features

        # Linear trend (slope)
        x = np.arange(len(series_clean))
        slope, intercept = np.polyfit(x, series_clean, 1)
        features[f"{prefix}_trend_slope"] = slope
        features[f"{prefix}_trend_intercept"] = intercept

        # First and last differences
        features[f"{prefix}_first_value"] = series_clean[0]
        features[f"{prefix}_last_value"] = series_clean[-1]
        features[f"{prefix}_change"] = series_clean[-1] - series_clean[0]

        # Mean absolute change
        features[f"{prefix}_mean_abs_change"] = np.mean(np.abs(np.diff(series_clean)))

        # Mean change
        features[f"{prefix}_mean_change"] = np.mean(np.diff(series_clean))

        return features

    def _extract_rolling(self, series: np.ndarray, col: str) -> dict[str, float]:
        """Extract rolling window features."""
        features = {}
        prefix = col

        series_clean = series[~np.isnan(series)]

        for window in self.config.window_sizes:
            if len(series_clean) < window:
                continue

            # Convert to pandas for rolling operations
            s = pd.Series(series_clean)

            rolling = s.rolling(window=window)
            features[f"{prefix}_rolling{window}_mean_of_means"] = rolling.mean().mean()
            features[f"{prefix}_rolling{window}_max_of_means"] = rolling.mean().max()
            features[f"{prefix}_rolling{window}_std_of_stds"] = rolling.std().std()

        return features

    def _extract_fft(self, series: np.ndarray, col: str) -> dict[str, float]:
        """Extract FFT (frequency domain) features."""
        features = {}
        prefix = col

        series_clean = series[~np.isnan(series)]
        if len(series_clean) < 4:
            return features

        # Compute FFT
        fft_vals = np.fft.fft(series_clean)
        fft_abs = np.abs(fft_vals)

        # Get first N coefficients (excluding DC component)
        n_coeffs = min(self.config.n_fft_coefficients, len(fft_abs) // 2)

        for i in range(1, n_coeffs + 1):
            features[f"{prefix}_fft_coeff_{i}"] = fft_abs[i]

        # Spectral energy
        features[f"{prefix}_spectral_energy"] = np.sum(fft_abs**2)

        # Dominant frequency
        dominant_idx = np.argmax(fft_abs[1 : len(fft_abs) // 2]) + 1
        features[f"{prefix}_dominant_freq_idx"] = dominant_idx

        return features

    def get_feature_set(self) -> FeatureSet:
        """Get the feature set with metadata."""
        return self._feature_set
```

Note: the published source computed the same per-column feature loop twice (once unconditionally and again under `if len(X) > 1:` with an identical body); the two passes produce identical results, so the reconstruction above keeps a single pass. The `_extract_per_row` helper for frames whose cells hold whole arrays is defined but not called from `transform`.
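
A minimal usage sketch under the same caveats as before: the `signal` column and the noisy-sine data are invented, and it assumes `_validate_input` (from `BaseEngine`) accepts a plain DataFrame. Each numeric column yields one row of aggregate features:

```python
import numpy as np
import pandas as pd

from featcopilot.engines.timeseries import TimeSeriesEngine

# Hypothetical signal: 200 noisy sine samples in a single numeric column.
rng = np.random.default_rng(0)
t = np.arange(200)
df = pd.DataFrame({"signal": np.sin(t / 10) + 0.1 * rng.standard_normal(200)})

engine = TimeSeriesEngine(features=["basic_stats", "autocorrelation", "fft"], verbose=True)
features = engine.fit(df).transform(df)  # a one-row DataFrame of aggregates

# Lag-1..10 autocorrelations of the column, named f"{col}_autocorr_lag{lag}"
print(features.filter(like="signal_autocorr").iloc[0])
```

With the default `n_autocorr_lags=10` and `n_fft_coefficients=10`, this produces roughly thirty features for the single column.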
featcopilot/llm/__init__.py (@@ -0,0 +1,16 @@)

```python
"""LLM-powered feature engineering module.

Uses GitHub Copilot SDK for intelligent feature generation.
"""

from featcopilot.llm.code_generator import FeatureCodeGenerator
from featcopilot.llm.copilot_client import CopilotFeatureClient
from featcopilot.llm.explainer import FeatureExplainer
from featcopilot.llm.semantic_engine import SemanticEngine

__all__ = [
    "CopilotFeatureClient",
    "SemanticEngine",
    "FeatureExplainer",
    "FeatureCodeGenerator",
]
```
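
The package re-exports these four classes at `featcopilot.llm`, so consumers import from the package rather than the submodules. A trivial smoke test of that public surface (their constructors are defined in the submodules diffed elsewhere in this release and are not shown here):

```python
# Import the re-exported names exactly as declared in __all__ above.
from featcopilot.llm import (
    CopilotFeatureClient,
    FeatureCodeGenerator,
    FeatureExplainer,
    SemanticEngine,
)

print([cls.__name__ for cls in (CopilotFeatureClient, SemanticEngine,
                                FeatureExplainer, FeatureCodeGenerator)])
```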