featcopilot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,211 @@
+ """Text feature engineering engine.
+
+ Generates features from text data using embeddings and NLP techniques.
+ """
+
+ from typing import Any, Optional, Union
+
+ import numpy as np
+ import pandas as pd
+ from pydantic import Field
+
+ from featcopilot.core.base import BaseEngine, EngineConfig
+ from featcopilot.core.feature import FeatureSet
+
+
+ class TextEngineConfig(EngineConfig):
+     """Configuration for text feature engine."""
+
+     name: str = "TextEngine"
+     features: list[str] = Field(
+         default_factory=lambda: ["length", "word_count", "char_stats"],
+         description="Feature types to extract",
+     )
+     max_vocab_size: int = Field(default=5000, description="Max vocabulary size for TF-IDF")
+     n_components: int = Field(default=50, description="Components for dimensionality reduction")
+
+
+ class TextEngine(BaseEngine):
+     """
+     Text feature engineering engine.
+
+     Extracts features from text columns including:
+     - Length and character statistics
+     - Word count features
+     - TF-IDF features (optional)
+     - Sentiment features (optional)
+     - Embedding features (with LLM integration)
+
+     Parameters
+     ----------
+     features : list
+         Feature types to extract
+     max_vocab_size : int, default=5000
+         Maximum vocabulary size for TF-IDF
+
+     Examples
+     --------
+     >>> engine = TextEngine(features=['length', 'word_count', 'tfidf'])
+     >>> X_features = engine.fit_transform(text_df)
+     """
+
+     def __init__(
+         self,
+         features: Optional[list[str]] = None,
+         max_vocab_size: int = 5000,
+         max_features: Optional[int] = None,
+         verbose: bool = False,
+         **kwargs,
+     ):
+         config = TextEngineConfig(
+             features=features or ["length", "word_count", "char_stats"],
+             max_vocab_size=max_vocab_size,
+             max_features=max_features,
+             verbose=verbose,
+             **kwargs,
+         )
+         super().__init__(config=config)
+         self.config: TextEngineConfig = config
+         self._text_columns: list[str] = []
+         self._vectorizers: dict[str, Any] = {}
+         self._feature_set = FeatureSet()
+
+     def fit(
+         self,
+         X: Union[pd.DataFrame, np.ndarray],
+         y: Optional[Union[pd.Series, np.ndarray]] = None,
+         text_columns: Optional[list[str]] = None,
+         **kwargs,
+     ) -> "TextEngine":
+         """
+         Fit the engine to identify and process text columns.
+
+         Parameters
+         ----------
+         X : DataFrame
+             Input data
+         y : Series, optional
+             Target variable
+         text_columns : list, optional
+             Specific columns to treat as text
+
+         Returns
+         -------
+         self : TextEngine
+         """
+         X = self._validate_input(X)
+
+         # Identify text columns
+         if text_columns:
+             self._text_columns = text_columns
+         else:
+             self._text_columns = X.select_dtypes(include=["object"]).columns.tolist()
+             # Filter to likely text columns (not IDs, not low cardinality)
+             self._text_columns = [
+                 col
+                 for col in self._text_columns
+                 # astype(str) guards against non-string values in object-dtype columns
+                 if X[col].astype(str).str.len().mean() > 10 and X[col].nunique() > 10
+             ]
+
+         if self.config.verbose:
+             print(f"TextEngine: Found {len(self._text_columns)} text columns")
+
+         # Fit TF-IDF vectorizers if needed
+         if "tfidf" in self.config.features:
+             self._fit_tfidf(X)
+
+         self._is_fitted = True
+         return self
+
+     def _fit_tfidf(self, X: pd.DataFrame) -> None:
+         """Fit TF-IDF vectorizers for text columns."""
+         try:
+             from sklearn.decomposition import TruncatedSVD
+             from sklearn.feature_extraction.text import TfidfVectorizer
+
+             for col in self._text_columns:
+                 texts = X[col].fillna("").astype(str)
+                 vectorizer = TfidfVectorizer(max_features=self.config.max_vocab_size, stop_words="english")
+                 tfidf_matrix = vectorizer.fit_transform(texts)
+
+                 # Reduce dimensions with SVD
+                 n_components = min(self.config.n_components, tfidf_matrix.shape[1])
+                 if n_components > 0:
+                     svd = TruncatedSVD(n_components=n_components)
+                     svd.fit(tfidf_matrix)
+                     self._vectorizers[col] = {"vectorizer": vectorizer, "svd": svd}
+
+         except ImportError:
+             if self.config.verbose:
+                 print("TextEngine: sklearn not available for TF-IDF, skipping")
+
+     def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
+         """
+         Extract text features.
+
+         Parameters
+         ----------
+         X : DataFrame
+             Input data
+
+         Returns
+         -------
+         X_features : DataFrame
+             Extracted features
+         """
+         if not self._is_fitted:
+             raise RuntimeError("Engine must be fitted before transform")
+
+         X = self._validate_input(X)
+         result = X.copy()
+
+         for col in self._text_columns:
+             texts = X[col].fillna("").astype(str)
+
+             # Length features
+             if "length" in self.config.features:
+                 result[f"{col}_char_length"] = texts.str.len()
+                 result[f"{col}_word_count"] = texts.str.split().str.len()
+
+             # Character statistics
+             if "char_stats" in self.config.features:
+                 result[f"{col}_uppercase_ratio"] = texts.apply(
+                     lambda x: sum(1 for c in x if c.isupper()) / max(len(x), 1)
+                 )
+                 result[f"{col}_digit_ratio"] = texts.apply(lambda x: sum(1 for c in x if c.isdigit()) / max(len(x), 1))
+                 result[f"{col}_space_ratio"] = texts.apply(lambda x: sum(1 for c in x if c.isspace()) / max(len(x), 1))
+                 result[f"{col}_special_char_count"] = texts.apply(
+                     lambda x: sum(1 for c in x if not c.isalnum() and not c.isspace())
+                 )
+
+             # Word count features
+             if "word_count" in self.config.features:
+                 result[f"{col}_avg_word_length"] = texts.apply(lambda x: np.mean([len(w) for w in x.split()] or [0]))
+                 result[f"{col}_unique_word_ratio"] = texts.apply(
+                     lambda x: len(set(x.lower().split())) / max(len(x.split()), 1)
+                 )
+
+             # TF-IDF features
+             if "tfidf" in self.config.features and col in self._vectorizers:
+                 tfidf_features = self._transform_tfidf(texts, col)
+                 result = pd.concat([result, tfidf_features], axis=1)
+
+         self._feature_names = [c for c in result.columns if c not in X.columns]
+
+         if self.config.verbose:
+             print(f"TextEngine: Extracted {len(self._feature_names)} features")
+
+         return result
+
+     def _transform_tfidf(self, texts: pd.Series, col: str) -> pd.DataFrame:
+         """Transform texts using fitted TF-IDF + SVD."""
+         vectorizer = self._vectorizers[col]["vectorizer"]
+         svd = self._vectorizers[col]["svd"]
+
+         tfidf_matrix = vectorizer.transform(texts)
+         reduced = svd.transform(tfidf_matrix)
+
+         feature_names = [f"{col}_tfidf_{i}" for i in range(reduced.shape[1])]
+         return pd.DataFrame(reduced, columns=feature_names, index=texts.index)
+
+     def get_feature_set(self) -> FeatureSet:
+         """Get the feature set with metadata."""
+         return self._feature_set
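
For orientation, here is a minimal usage sketch of the engine above. The import path featcopilot.engines.text is an assumption (this diff does not show file paths), and the DataFrame contents are hypothetical. With "tfidf" in features, the engine would additionally fit sklearn's TfidfVectorizer plus TruncatedSVD per text column and emit {col}_tfidf_0 ... {col}_tfidf_{n-1} columns.

    import pandas as pd

    from featcopilot.engines.text import TextEngine  # import path assumed, not shown in this diff

    # Hypothetical data; the column name and texts are illustrative only
    df = pd.DataFrame(
        {
            "review": [
                "Great product, arrived quickly and works as described!",
                "Terrible experience. Broke after 2 days. 0/10.",
                "Decent value for the price, would order again.",
            ]
        }
    )

    engine = TextEngine(features=["length", "word_count", "char_stats"], verbose=True)
    # Passing text_columns explicitly bypasses the auto-detection heuristic
    # (mean character length > 10 and more than 10 unique values)
    engine.fit(df, text_columns=["review"])
    X_features = engine.transform(df)
    # Adds columns such as review_char_length, review_word_count,
    # review_uppercase_ratio, review_digit_ratio, review_avg_word_length, ...
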
@@ -0,0 +1,402 @@
+ """Time series feature engineering engine.
+
+ Extracts statistical, frequency, and temporal features from time series data.
+ Inspired by TSFresh but with better integration and LLM capabilities.
+ """
+
+ from typing import Optional, Union
+
+ import numpy as np
+ import pandas as pd
+ from pydantic import Field
+
+ from featcopilot.core.base import BaseEngine, EngineConfig
+ from featcopilot.core.feature import FeatureSet
+
+
+ class TimeSeriesEngineConfig(EngineConfig):
+     """Configuration for time series feature engine."""
+
+     name: str = "TimeSeriesEngine"
+     features: list[str] = Field(
+         default_factory=lambda: [
+             "basic_stats",
+             "distribution",
+             "autocorrelation",
+             "peaks",
+             "trends",
+         ],
+         description="Feature groups to extract",
+     )
+     window_sizes: list[int] = Field(
+         default_factory=lambda: [5, 10, 20], description="Window sizes for rolling features"
+     )
+     n_fft_coefficients: int = Field(default=10, description="Number of FFT coefficients")
+     n_autocorr_lags: int = Field(default=10, description="Number of autocorrelation lags")
+
+
+ class TimeSeriesEngine(BaseEngine):
+     """
+     Time series feature engineering engine.
+
+     Extracts comprehensive features from time series data including:
+     - Basic statistics (mean, std, min, max, etc.)
+     - Distribution features (skewness, kurtosis, quantiles)
+     - Autocorrelation features
+     - Frequency domain features (FFT)
+     - Peak and trough features
+     - Trend features
+     - Rolling window statistics
+
+     Parameters
+     ----------
+     features : list, default=['basic_stats', 'distribution', 'autocorrelation']
+         Feature groups to extract
+     window_sizes : list, default=[5, 10, 20]
+         Window sizes for rolling features
+     max_features : int, optional
+         Maximum number of features to generate
+
+     Examples
+     --------
+     >>> engine = TimeSeriesEngine(features=['basic_stats', 'autocorrelation'])
+     >>> X_features = engine.fit_transform(time_series_df)
+     """
+
+     # Feature extraction functions
+     FEATURE_EXTRACTORS = {
+         "basic_stats": "_extract_basic_stats",
+         "distribution": "_extract_distribution",
+         "autocorrelation": "_extract_autocorrelation",
+         "peaks": "_extract_peaks",
+         "trends": "_extract_trends",
+         "rolling": "_extract_rolling",
+         "fft": "_extract_fft",
+     }
+
+     def __init__(
+         self,
+         features: Optional[list[str]] = None,
+         window_sizes: Optional[list[int]] = None,
+         max_features: Optional[int] = None,
+         verbose: bool = False,
+         **kwargs,
+     ):
+         config = TimeSeriesEngineConfig(
+             features=features or ["basic_stats", "distribution", "autocorrelation"],
+             window_sizes=window_sizes or [5, 10, 20],
+             max_features=max_features,
+             verbose=verbose,
+             **kwargs,
+         )
+         super().__init__(config=config)
+         self.config: TimeSeriesEngineConfig = config
+         self._time_columns: list[str] = []
+         self._feature_set = FeatureSet()
+
+     def fit(
+         self,
+         X: Union[pd.DataFrame, np.ndarray],
+         y: Optional[Union[pd.Series, np.ndarray]] = None,
+         time_column: Optional[str] = None,
+         **kwargs,
+     ) -> "TimeSeriesEngine":
+         """
+         Fit the engine to identify time series columns.
+
+         Parameters
+         ----------
+         X : DataFrame or ndarray
+             Input data (each row is a time series or time-indexed data)
+         y : Series or ndarray, optional
+             Target variable
+         time_column : str, optional
+             Column containing timestamps
+
+         Returns
+         -------
+         self : TimeSeriesEngine
+         """
+         X = self._validate_input(X)
+
+         # Identify numeric columns for time series analysis
+         self._time_columns = X.select_dtypes(include=[np.number]).columns.tolist()
+
+         if self.config.verbose:
+             print(f"TimeSeriesEngine: Found {len(self._time_columns)} numeric columns")
+
+         self._is_fitted = True
+         return self
+
+     def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
+         """
+         Extract time series features from input data.
+
+         Parameters
+         ----------
+         X : DataFrame or ndarray
+             Input data
+
+         Returns
+         -------
+         X_features : DataFrame
+             Extracted features
+         """
+         if not self._is_fitted:
+             raise RuntimeError("Engine must be fitted before transform")
+
+         X = self._validate_input(X)
+         features_dict = {}
+
+         for col in self._time_columns:
+             series = X[col].values
+
+             for feature_group in self.config.features:
+                 if feature_group in self.FEATURE_EXTRACTORS:
+                     method_name = self.FEATURE_EXTRACTORS[feature_group]
+                     method = getattr(self, method_name)
+                     extracted = method(series, col)
+                     features_dict.update(extracted)
+
+         # Each column is treated as a single time series, so the
+         # extracted features form a single row
+         result = pd.DataFrame([features_dict])
+
+         self._feature_names = list(result.columns)
+
+         if self.config.verbose:
+             print(f"TimeSeriesEngine: Extracted {len(self._feature_names)} features")
+
+         return result
+
+     def _extract_per_row(self, X: pd.DataFrame) -> pd.DataFrame:
+         """Extract features for each row (multiple time series)."""
+         all_features = []
+
+         for idx in range(len(X)):
+             row_features = {}
+             for col in self._time_columns:
+                 value = X[col].iloc[idx]
+                 if isinstance(value, (list, np.ndarray)):
+                     series = np.array(value)
+                 else:
+                     # Single value - create minimal features
+                     row_features[f"{col}_value"] = value
+                     continue
+
+                 for feature_group in self.config.features:
+                     if feature_group in self.FEATURE_EXTRACTORS:
+                         method_name = self.FEATURE_EXTRACTORS[feature_group]
+                         method = getattr(self, method_name)
+                         extracted = method(series, col)
+                         row_features.update(extracted)
+
+             all_features.append(row_features)
+
+         return pd.DataFrame(all_features)
+
+     def _extract_basic_stats(self, series: np.ndarray, col: str) -> dict[str, float]:
+         """Extract basic statistical features."""
+         features = {}
+         prefix = col
+
+         if len(series) == 0:
+             return features
+
+         features[f"{prefix}_mean"] = np.nanmean(series)
+         features[f"{prefix}_std"] = np.nanstd(series)
+         features[f"{prefix}_min"] = np.nanmin(series)
+         features[f"{prefix}_max"] = np.nanmax(series)
+         features[f"{prefix}_range"] = features[f"{prefix}_max"] - features[f"{prefix}_min"]
+         features[f"{prefix}_median"] = np.nanmedian(series)
+         features[f"{prefix}_sum"] = np.nansum(series)
+         features[f"{prefix}_length"] = len(series)
+         features[f"{prefix}_var"] = np.nanvar(series)
+
+         # Coefficient of variation
+         if features[f"{prefix}_mean"] != 0:
+             features[f"{prefix}_cv"] = features[f"{prefix}_std"] / abs(features[f"{prefix}_mean"])
+         else:
+             features[f"{prefix}_cv"] = 0
+
+         return features
+
+     def _extract_distribution(self, series: np.ndarray, col: str) -> dict[str, float]:
+         """Extract distribution-based features."""
+         from scipy import stats
+
+         features = {}
+         prefix = col
+
+         if len(series) < 4:
+             return features
+
+         # Remove NaN values
+         series_clean = series[~np.isnan(series)]
+         if len(series_clean) < 4:
+             return features
+
+         features[f"{prefix}_skewness"] = stats.skew(series_clean)
+         features[f"{prefix}_kurtosis"] = stats.kurtosis(series_clean)
+
+         # Quantiles
+         for q in [0.1, 0.25, 0.75, 0.9]:
+             features[f"{prefix}_q{int(q*100)}"] = np.quantile(series_clean, q)
+
+         # IQR
+         q75, q25 = np.quantile(series_clean, [0.75, 0.25])
+         features[f"{prefix}_iqr"] = q75 - q25
+
+         return features
+
+     def _extract_autocorrelation(self, series: np.ndarray, col: str) -> dict[str, float]:
+         """Extract autocorrelation features."""
+         features = {}
+         prefix = col
+
+         if len(series) < self.config.n_autocorr_lags + 1:
+             return features
+
+         series_clean = series[~np.isnan(series)]
+         if len(series_clean) < self.config.n_autocorr_lags + 1:
+             return features
+
+         # Compute autocorrelation for different lags
+         var = np.var(series_clean)
+
+         if var == 0:
+             return features
+
+         for lag in range(1, min(self.config.n_autocorr_lags + 1, len(series_clean))):
+             autocorr = np.corrcoef(series_clean[:-lag], series_clean[lag:])[0, 1]
+             if not np.isnan(autocorr):
+                 features[f"{prefix}_autocorr_lag{lag}"] = autocorr
+
+         return features
+
+     def _extract_peaks(self, series: np.ndarray, col: str) -> dict[str, float]:
+         """Extract peak and trough related features."""
+         from scipy.signal import find_peaks
+
+         features = {}
+         prefix = col
+
+         if len(series) < 3:
+             return features
+
+         series_clean = series[~np.isnan(series)]
+         if len(series_clean) < 3:
+             return features
+
+         # Find peaks
+         peaks, _ = find_peaks(series_clean)
+         troughs, _ = find_peaks(-series_clean)
+
+         features[f"{prefix}_n_peaks"] = len(peaks)
+         features[f"{prefix}_n_troughs"] = len(troughs)
+
+         if len(peaks) > 0:
+             features[f"{prefix}_peak_mean"] = np.mean(series_clean[peaks])
+             features[f"{prefix}_peak_max"] = np.max(series_clean[peaks])
+
+         if len(troughs) > 0:
+             features[f"{prefix}_trough_mean"] = np.mean(series_clean[troughs])
+             features[f"{prefix}_trough_min"] = np.min(series_clean[troughs])
+
+         return features
+
+     def _extract_trends(self, series: np.ndarray, col: str) -> dict[str, float]:
+         """Extract trend-related features."""
+         features = {}
+         prefix = col
+
+         if len(series) < 2:
+             return features
+
+         series_clean = series[~np.isnan(series)]
+         if len(series_clean) < 2:
+             return features
+
+         # Linear trend (slope)
+         x = np.arange(len(series_clean))
+         slope, intercept = np.polyfit(x, series_clean, 1)
+         features[f"{prefix}_trend_slope"] = slope
+         features[f"{prefix}_trend_intercept"] = intercept
+
+         # First and last differences
+         features[f"{prefix}_first_value"] = series_clean[0]
+         features[f"{prefix}_last_value"] = series_clean[-1]
+         features[f"{prefix}_change"] = series_clean[-1] - series_clean[0]
+
+         # Mean absolute change
+         features[f"{prefix}_mean_abs_change"] = np.mean(np.abs(np.diff(series_clean)))
+
+         # Mean change
+         features[f"{prefix}_mean_change"] = np.mean(np.diff(series_clean))
+
+         return features
+
+     def _extract_rolling(self, series: np.ndarray, col: str) -> dict[str, float]:
+         """Extract rolling window features."""
+         features = {}
+         prefix = col
+
+         series_clean = series[~np.isnan(series)]
+
+         for window in self.config.window_sizes:
+             if len(series_clean) < window:
+                 continue
+
+             # Convert to pandas for rolling operations
+             s = pd.Series(series_clean)
+
+             rolling = s.rolling(window=window)
+             features[f"{prefix}_rolling{window}_mean_of_means"] = rolling.mean().mean()
+             features[f"{prefix}_rolling{window}_max_of_means"] = rolling.mean().max()
+             features[f"{prefix}_rolling{window}_std_of_stds"] = rolling.std().std()
+
+         return features
+
+     def _extract_fft(self, series: np.ndarray, col: str) -> dict[str, float]:
+         """Extract FFT (frequency domain) features."""
+         features = {}
+         prefix = col
+
+         series_clean = series[~np.isnan(series)]
+         if len(series_clean) < 4:
+             return features
+
+         # Compute FFT
+         fft_vals = np.fft.fft(series_clean)
+         fft_abs = np.abs(fft_vals)
+
+         # Get first N coefficients (excluding DC component)
+         n_coeffs = min(self.config.n_fft_coefficients, len(fft_abs) // 2)
+
+         for i in range(1, n_coeffs + 1):
+             features[f"{prefix}_fft_coeff_{i}"] = fft_abs[i]
+
+         # Spectral energy
+         features[f"{prefix}_spectral_energy"] = np.sum(fft_abs**2)
+
+         # Dominant frequency
+         dominant_idx = np.argmax(fft_abs[1 : len(fft_abs) // 2]) + 1
+         features[f"{prefix}_dominant_freq_idx"] = dominant_idx
+
+         return features
+
+     def get_feature_set(self) -> FeatureSet:
+         """Get the feature set with metadata."""
+         return self._feature_set
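
A matching sketch for the time series engine, under the same caveat that the import path is assumed and the signal is synthetic. Because transform treats each numeric column as a single series, the result is one feature row per call, regardless of how many samples the column holds.

    import numpy as np
    import pandas as pd

    from featcopilot.engines.timeseries import TimeSeriesEngine  # import path assumed, not shown in this diff

    # Synthetic signal: a noisy sine wave with 200 samples
    rng = np.random.default_rng(0)
    signal = pd.DataFrame({"temp": np.sin(np.linspace(0, 8 * np.pi, 200)) + rng.normal(0, 0.1, 200)})

    engine = TimeSeriesEngine(features=["basic_stats", "trends", "fft"])
    row = engine.fit_transform(signal)
    # One row of features: temp_mean, temp_std, temp_trend_slope,
    # temp_fft_coeff_1 ... temp_fft_coeff_10, temp_spectral_energy, temp_dominant_freq_idx, ...
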
@@ -0,0 +1,16 @@
+ """LLM-powered feature engineering module.
+
+ Uses GitHub Copilot SDK for intelligent feature generation.
+ """
+
+ from featcopilot.llm.code_generator import FeatureCodeGenerator
+ from featcopilot.llm.copilot_client import CopilotFeatureClient
+ from featcopilot.llm.explainer import FeatureExplainer
+ from featcopilot.llm.semantic_engine import SemanticEngine
+
+ __all__ = [
+     "CopilotFeatureClient",
+     "SemanticEngine",
+     "FeatureExplainer",
+     "FeatureCodeGenerator",
+ ]
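
This __init__ only fixes the module's public import surface; the constructors of the four re-exported classes are not part of this diff, so the one usage that can be grounded here is the flat import path the re-exports enable:

    from featcopilot.llm import (
        CopilotFeatureClient,
        FeatureCodeGenerator,
        FeatureExplainer,
        SemanticEngine,
    )
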