featcopilot 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featcopilot/__init__.py +10 -1
- featcopilot/core/__init__.py +2 -0
- featcopilot/core/feature.py +5 -1
- featcopilot/core/transform_rule.py +276 -0
- featcopilot/engines/relational.py +5 -2
- featcopilot/engines/tabular.py +151 -5
- featcopilot/engines/text.py +352 -11
- featcopilot/engines/timeseries.py +235 -3
- featcopilot/llm/__init__.py +6 -1
- featcopilot/llm/code_generator.py +7 -4
- featcopilot/llm/copilot_client.py +97 -20
- featcopilot/llm/explainer.py +6 -3
- featcopilot/llm/litellm_client.py +595 -0
- featcopilot/llm/semantic_engine.py +717 -26
- featcopilot/llm/transform_rule_generator.py +403 -0
- featcopilot/selection/importance.py +40 -9
- featcopilot/selection/redundancy.py +39 -10
- featcopilot/selection/statistical.py +107 -34
- featcopilot/selection/unified.py +57 -3
- featcopilot/stores/__init__.py +17 -0
- featcopilot/stores/base.py +166 -0
- featcopilot/stores/feast_store.py +541 -0
- featcopilot/stores/rule_store.py +343 -0
- featcopilot/transformers/sklearn_compat.py +18 -6
- featcopilot/utils/__init__.py +14 -0
- featcopilot/utils/logger.py +47 -0
- featcopilot/utils/models.py +287 -0
- featcopilot/utils/parallel.py +5 -1
- {featcopilot-0.1.0.dist-info → featcopilot-0.3.0.dist-info}/METADATA +56 -25
- featcopilot-0.3.0.dist-info/RECORD +38 -0
- featcopilot-0.1.0.dist-info/RECORD +0 -29
- {featcopilot-0.1.0.dist-info → featcopilot-0.3.0.dist-info}/WHEEL +0 -0
- {featcopilot-0.1.0.dist-info → featcopilot-0.3.0.dist-info}/top_level.txt +0 -0
featcopilot/engines/timeseries.py
CHANGED

@@ -12,6 +12,9 @@ from pydantic import Field
 
 from featcopilot.core.base import BaseEngine, EngineConfig
 from featcopilot.core.feature import FeatureSet
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
 
 
 class TimeSeriesEngineConfig(EngineConfig):
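A pattern that recurs through this release: `print` calls guarded by `config.verbose` are replaced with a module-level logger from the new `featcopilot/utils/logger.py` (+47 lines, contents not expanded in this view). A minimal sketch of a `get_logger` that would satisfy these call sites; only the `get_logger(name)` signature is visible in the diff, so the handler and format below are assumptions:

```python
# Hypothetical sketch of featcopilot/utils/logger.py; the real module is not
# shown in this diff, so only the get_logger(__name__) call sites constrain it.
import logging


def get_logger(name: str) -> logging.Logger:
    """Return a namespaced logger with a one-time stream handler."""
    logger = logging.getLogger(name)
    if not logger.handlers:  # guard against duplicate handlers on re-import
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(name)s %(levelname)s: %(message)s"))
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger
```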
@@ -25,6 +28,10 @@ class TimeSeriesEngineConfig(EngineConfig):
             "autocorrelation",
             "peaks",
             "trends",
+            "entropy",
+            "energy",
+            "complexity",
+            "counts",
         ],
         description="Feature groups to extract",
     )
@@ -33,6 +40,7 @@ class TimeSeriesEngineConfig(EngineConfig):
     )
     n_fft_coefficients: int = Field(default=10, description="Number of FFT coefficients")
     n_autocorr_lags: int = Field(default=10, description="Number of autocorrelation lags")
+    entropy_bins: int = Field(default=10, description="Number of bins for binned entropy")
 
 
 class TimeSeriesEngine(BaseEngine):
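Taken together, the two config hunks expose the four new extractor groups and a binning knob for the entropy features. A usage sketch: `TimeSeriesEngineConfig`, `entropy_bins`, and the group names come from the diff, while the `feature_groups` field name (inferred from its description), the `config=` keyword, and the DataFrame are assumptions:

```python
import numpy as np
import pandas as pd

from featcopilot.engines.timeseries import TimeSeriesEngine, TimeSeriesEngineConfig

# Illustrative signal: a noisy sine wave in one numeric column.
df = pd.DataFrame({"signal": np.sin(np.linspace(0, 20, 200)) + 0.1 * np.random.randn(200)})

config = TimeSeriesEngineConfig(
    feature_groups=["entropy", "energy", "complexity", "counts"],  # groups new in 0.3.0
    entropy_bins=16,  # overrides the default of 10 added in this release
)
engine = TimeSeriesEngine(config=config)  # assumed keyword; __init__ is truncated in the diff
X_features = engine.fit_transform(df)     # per the docstring example shown below
```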
@@ -63,7 +71,7 @@ class TimeSeriesEngine(BaseEngine):
     >>> X_features = engine.fit_transform(time_series_df)
     """
 
-    # Feature extraction functions
+    # Feature extraction functions (tsfresh-inspired)
     FEATURE_EXTRACTORS = {
         "basic_stats": "_extract_basic_stats",
         "distribution": "_extract_distribution",
@@ -72,6 +80,10 @@ class TimeSeriesEngine(BaseEngine):
         "trends": "_extract_trends",
         "rolling": "_extract_rolling",
         "fft": "_extract_fft",
+        "entropy": "_extract_entropy",
+        "energy": "_extract_energy",
+        "complexity": "_extract_complexity",
+        "counts": "_extract_counts",
     }
 
     def __init__(
@@ -123,7 +135,7 @@ class TimeSeriesEngine(BaseEngine):
         self._time_columns = X.select_dtypes(include=[np.number]).columns.tolist()
 
         if self.config.verbose:
-
+            logger.info(f"TimeSeriesEngine: Found {len(self._time_columns)} numeric columns")
 
         self._is_fitted = True
         return self
@@ -177,7 +189,7 @@ class TimeSeriesEngine(BaseEngine):
         self._feature_names = list(result.columns)
 
         if self.config.verbose:
-
+            logger.info(f"TimeSeriesEngine: Extracted {len(self._feature_names)} features")
 
         return result
 
@@ -397,6 +409,226 @@ class TimeSeriesEngine(BaseEngine):
 
         return features
 
+    def _extract_entropy(self, series: np.ndarray, col: str) -> dict[str, float]:
+        """Extract entropy-based features (tsfresh-inspired)."""
+        features = {}
+        prefix = col
+
+        series_clean = series[~np.isnan(series)]
+        if len(series_clean) < 4:
+            return features
+
+        # Binned entropy
+        try:
+            hist, _ = np.histogram(series_clean, bins=self.config.entropy_bins)
+            hist = hist[hist > 0]
+            probs = hist / hist.sum()
+            features[f"{prefix}_binned_entropy"] = -np.sum(probs * np.log(probs + 1e-10))
+        except Exception:
+            features[f"{prefix}_binned_entropy"] = 0
+
+        # Sample entropy (simplified implementation)
+        try:
+            features[f"{prefix}_sample_entropy"] = self._sample_entropy(series_clean, m=2, r=0.2)
+        except Exception:
+            features[f"{prefix}_sample_entropy"] = 0
+
+        # Approximate entropy
+        try:
+            features[f"{prefix}_approximate_entropy"] = self._approximate_entropy(series_clean, m=2, r=0.2)
+        except Exception:
+            features[f"{prefix}_approximate_entropy"] = 0
+
+        return features
+
+    def _sample_entropy(self, series: np.ndarray, m: int = 2, r: float = 0.2) -> float:
+        """Compute sample entropy of a time series."""
+        n = len(series)
+        if n < m + 2:
+            return 0
+
+        # Normalize r by std
+        r = r * np.std(series)
+        if r == 0:
+            return 0
+
+        def _count_matches(template_length):
+            count = 0
+            templates = np.array([series[i : i + template_length] for i in range(n - template_length)])
+            for i in range(len(templates)):
+                for j in range(i + 1, len(templates)):
+                    if np.max(np.abs(templates[i] - templates[j])) < r:
+                        count += 1
+            return count
+
+        a = _count_matches(m)
+        b = _count_matches(m + 1)
+
+        if a == 0 or b == 0:
+            return 0
+
+        return -np.log(b / a)
+
+    def _approximate_entropy(self, series: np.ndarray, m: int = 2, r: float = 0.2) -> float:
+        """Compute approximate entropy of a time series."""
+        n = len(series)
+        if n < m + 2:
+            return 0
+
+        r = r * np.std(series)
+        if r == 0:
+            return 0
+
+        def _phi(m_val):
+            patterns = np.array([series[i : i + m_val] for i in range(n - m_val + 1)])
+            counts = np.zeros(len(patterns))
+            for i, pattern in enumerate(patterns):
+                for other in patterns:
+                    if np.max(np.abs(pattern - other)) < r:
+                        counts[i] += 1
+            counts = counts / len(patterns)
+            return np.sum(np.log(counts + 1e-10)) / len(patterns)
+
+        return _phi(m) - _phi(m + 1)
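Both estimators above are brute-force O(n^2) template matches, which is fine for short windows. As a sanity check on the sample-entropy logic, the standalone function below reproduces the diff's algorithm (rather than importing featcopilot) and compares a regular signal with white noise:

```python
import numpy as np


def sample_entropy(series: np.ndarray, m: int = 2, r: float = 0.2) -> float:
    """Reproduces the diff's _sample_entropy algorithm, standalone."""
    n = len(series)
    if n < m + 2:
        return 0.0
    r = r * np.std(series)  # tolerance scaled by the series spread
    if r == 0:
        return 0.0

    def count_matches(length: int) -> int:
        templates = np.array([series[i : i + length] for i in range(n - length)])
        count = 0
        for i in range(len(templates)):
            for j in range(i + 1, len(templates)):
                if np.max(np.abs(templates[i] - templates[j])) < r:
                    count += 1
        return count

    a, b = count_matches(m), count_matches(m + 1)
    return 0.0 if a == 0 or b == 0 else -np.log(b / a)


rng = np.random.default_rng(0)
t = np.linspace(0, 10, 200)
print(f"sine:  {sample_entropy(np.sin(t)):.3f}")                 # regular: small value
print(f"noise: {sample_entropy(rng.standard_normal(200)):.3f}")  # irregular: larger value
```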
+
+    def _extract_energy(self, series: np.ndarray, col: str) -> dict[str, float]:
+        """Extract energy-based features (tsfresh-inspired)."""
+        features = {}
+        prefix = col
+
+        series_clean = series[~np.isnan(series)]
+        if len(series_clean) < 2:
+            return features
+
+        # Absolute energy: sum of squared values
+        features[f"{prefix}_abs_energy"] = np.sum(series_clean**2)
+
+        # Mean absolute change
+        features[f"{prefix}_mean_abs_change"] = np.mean(np.abs(np.diff(series_clean)))
+
+        # Mean second derivative central
+        if len(series_clean) >= 3:
+            second_deriv = series_clean[2:] - 2 * series_clean[1:-1] + series_clean[:-2]
+            features[f"{prefix}_mean_second_deriv_central"] = np.mean(second_deriv)
+
+        # Root mean square
+        features[f"{prefix}_rms"] = np.sqrt(np.mean(series_clean**2))
+
+        # Crest factor (peak/rms)
+        rms = features[f"{prefix}_rms"]
+        if rms > 0:
+            features[f"{prefix}_crest_factor"] = np.max(np.abs(series_clean)) / rms
+
+        return features
+
+    def _extract_complexity(self, series: np.ndarray, col: str) -> dict[str, float]:
+        """Extract complexity features (tsfresh-inspired)."""
+        features = {}
+        prefix = col
+
+        series_clean = series[~np.isnan(series)]
+        if len(series_clean) < 3:
+            return features
+
+        # CID_CE: Complexity-invariant distance
+        diff = np.diff(series_clean)
+        features[f"{prefix}_cid_ce"] = np.sqrt(np.sum(diff**2))
+
+        # C3: Time series complexity (lag 1)
+        if len(series_clean) >= 3:
+            n = len(series_clean)
+            c3 = np.sum(series_clean[2:n] * series_clean[1 : n - 1] * series_clean[0 : n - 2]) / (n - 2)
+            features[f"{prefix}_c3"] = c3
+
+        # Ratio of unique values to length
+        features[f"{prefix}_ratio_unique_values"] = len(np.unique(series_clean)) / len(series_clean)
+
+        # Has duplicate
+        features[f"{prefix}_has_duplicate"] = 1 if len(np.unique(series_clean)) < len(series_clean) else 0
+
+        # Has duplicate max
+        max_val = np.max(series_clean)
+        features[f"{prefix}_has_duplicate_max"] = 1 if np.sum(series_clean == max_val) > 1 else 0
+
+        # Has duplicate min
+        min_val = np.min(series_clean)
+        features[f"{prefix}_has_duplicate_min"] = 1 if np.sum(series_clean == min_val) > 1 else 0
+
+        # Sum of reoccurring values
+        unique, counts = np.unique(series_clean, return_counts=True)
+        reoccurring_mask = counts > 1
+        features[f"{prefix}_sum_reoccurring_values"] = np.sum(unique[reoccurring_mask] * counts[reoccurring_mask])
+
+        # Sum of reoccurring data points
+        features[f"{prefix}_sum_reoccurring_data_points"] = np.sum(counts[reoccurring_mask])
+
+        # Percentage of reoccurring data points
+        features[f"{prefix}_pct_reoccurring_data_points"] = np.sum(counts[reoccurring_mask]) / len(series_clean)
+
+        return features
+
+    def _extract_counts(self, series: np.ndarray, col: str) -> dict[str, float]:
+        """Extract count-based features (tsfresh-inspired)."""
+        features = {}
+        prefix = col
+
+        series_clean = series[~np.isnan(series)]
+        if len(series_clean) < 2:
+            return features
+
+        mean_val = np.mean(series_clean)
+
+        # Count above mean
+        features[f"{prefix}_count_above_mean"] = np.sum(series_clean > mean_val)
+
+        # Count below mean
+        features[f"{prefix}_count_below_mean"] = np.sum(series_clean < mean_val)
+
+        # First location of maximum
+        features[f"{prefix}_first_loc_max"] = np.argmax(series_clean) / len(series_clean)
+
+        # First location of minimum
+        features[f"{prefix}_first_loc_min"] = np.argmin(series_clean) / len(series_clean)
+
+        # Last location of maximum
+        features[f"{prefix}_last_loc_max"] = (len(series_clean) - 1 - np.argmax(series_clean[::-1])) / len(series_clean)
+
+        # Last location of minimum
+        features[f"{prefix}_last_loc_min"] = (len(series_clean) - 1 - np.argmin(series_clean[::-1])) / len(series_clean)
+
+        # Longest strike above mean
+        above_mean = series_clean > mean_val
+        features[f"{prefix}_longest_strike_above_mean"] = self._longest_consecutive(above_mean)
+
+        # Longest strike below mean
+        below_mean = series_clean < mean_val
+        features[f"{prefix}_longest_strike_below_mean"] = self._longest_consecutive(below_mean)
+
+        # Number of crossings (mean)
+        crossings = np.sum(np.diff(np.sign(series_clean - mean_val)) != 0)
+        features[f"{prefix}_number_crossings_mean"] = crossings
+
+        # Number of zero crossings
+        zero_crossings = np.sum(np.diff(np.sign(series_clean)) != 0)
+        features[f"{prefix}_number_zero_crossings"] = zero_crossings
+
+        # Absolute sum of changes
+        features[f"{prefix}_abs_sum_changes"] = np.sum(np.abs(np.diff(series_clean)))
+
+        return features
+
+    def _longest_consecutive(self, bool_array: np.ndarray) -> int:
+        """Find longest consecutive True values in boolean array."""
+        max_len = 0
+        current_len = 0
+        for val in bool_array:
+            if val:
+                current_len += 1
+                max_len = max(max_len, current_len)
+            else:
+                current_len = 0
+        return max_len
+
     def get_feature_set(self) -> FeatureSet:
         """Get the feature set with metadata."""
         return self._feature_set
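One design note on `_longest_consecutive` above: the pure-Python scan is O(n) and perfectly adequate at typical window lengths. If it ever shows up in profiles, a vectorized NumPy equivalent is straightforward; a sketch, not part of the package:

```python
import numpy as np


def longest_consecutive(mask: np.ndarray) -> int:
    """Length of the longest run of True values, vectorized."""
    # Pad with False on both sides so every run has a clear start and end edge.
    padded = np.concatenate(([False], mask.astype(bool), [False]))
    edges = np.flatnonzero(np.diff(padded.astype(np.int8)))  # run boundaries
    if edges.size == 0:
        return 0
    starts, ends = edges[::2], edges[1::2]
    return int(np.max(ends - starts))


print(longest_consecutive(np.array([True, True, False, True, True, True, False])))  # 3
```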
featcopilot/llm/__init__.py
CHANGED

@@ -1,16 +1,21 @@
 """LLM-powered feature engineering module.
 
-Uses GitHub Copilot SDK for intelligent feature generation.
+Uses GitHub Copilot SDK or LiteLLM for intelligent feature generation.
 """
 
 from featcopilot.llm.code_generator import FeatureCodeGenerator
 from featcopilot.llm.copilot_client import CopilotFeatureClient
 from featcopilot.llm.explainer import FeatureExplainer
+from featcopilot.llm.litellm_client import LiteLLMFeatureClient, SyncLiteLLMFeatureClient
 from featcopilot.llm.semantic_engine import SemanticEngine
+from featcopilot.llm.transform_rule_generator import TransformRuleGenerator
 
 __all__ = [
     "CopilotFeatureClient",
+    "LiteLLMFeatureClient",
+    "SyncLiteLLMFeatureClient",
     "SemanticEngine",
     "FeatureExplainer",
     "FeatureCodeGenerator",
+    "TransformRuleGenerator",
 ]
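With the expanded `__all__`, the new clients and the rule generator resolve directly from the subpackage namespace; their constructors live in the new `litellm_client.py` (+595) and `transform_rule_generator.py` (+403) files, which this view does not expand:

```python
# Grounded in the __all__ above; constructor arguments are not shown in this diff.
from featcopilot.llm import (
    LiteLLMFeatureClient,      # async client, presumably backed by LiteLLM
    SyncLiteLLMFeatureClient,  # presumably a synchronous wrapper, mirroring SyncCopilotFeatureClient
    TransformRuleGenerator,    # LLM-driven transform-rule generation
)
```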
featcopilot/llm/code_generator.py
CHANGED

@@ -10,6 +10,9 @@ import pandas as pd
 
 from featcopilot.core.feature import Feature, FeatureOrigin, FeatureType
 from featcopilot.llm.copilot_client import SyncCopilotFeatureClient
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
 
 
 class FeatureCodeGenerator:
@@ -21,7 +24,7 @@ class FeatureCodeGenerator:
 
     Parameters
     ----------
-    model : str, default='gpt-5'
+    model : str, default='gpt-5.2'
         LLM model to use
     validate : bool, default=True
         Whether to validate generated code
@@ -35,7 +38,7 @@ class FeatureCodeGenerator:
     ... )
     """
 
-    def __init__(self, model: str = "gpt-5", validate: bool = True, verbose: bool = False):
+    def __init__(self, model: str = "gpt-5.2", validate: bool = True, verbose: bool = False):
         self.model = model
         self.validate = validate
         self.verbose = verbose
@@ -98,7 +101,7 @@ class FeatureCodeGenerator:
         )
         if not validation["valid"]:
             if self.verbose:
-
+                logger.warning(f"Code validation failed: {validation['error']}")
             # Try to fix common issues
             code = self._fix_common_issues(code, validation["error"])
 
@@ -144,7 +147,7 @@ class FeatureCodeGenerator:
                 features.append(feature)
             except Exception as e:
                 if self.verbose:
-
+                    logger.error(f"Failed to generate feature for '{desc}': {e}")
 
         return features
 
featcopilot/llm/copilot_client.py
CHANGED

@@ -10,11 +10,15 @@ from typing import Any, Optional
 
 from pydantic import BaseModel, Field
 
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
 
 class CopilotConfig(BaseModel):
     """Configuration for Copilot client."""
 
-    model: str = Field(default="gpt-5", description="Model to use")
+    model: str = Field(default="gpt-5.2", description="Model to use")
     temperature: float = Field(default=0.3, ge=0, le=1, description="Temperature for generation")
     max_tokens: int = Field(default=4096, description="Maximum tokens in response")
     timeout: float = Field(default=60.0, description="Timeout in seconds")
@@ -35,12 +39,12 @@ class CopilotFeatureClient:
     ----------
     config : CopilotConfig, optional
         Configuration for the client
-    model : str, default='gpt-5'
+    model : str, default='gpt-5.2'
         Model to use for generation
 
     Examples
     --------
-    >>> client = CopilotFeatureClient(model='gpt-5')
+    >>> client = CopilotFeatureClient(model='gpt-5.2')
     >>> await client.start()
     >>> suggestions = await client.suggest_features(
     ...     column_info={'age': 'int', 'income': 'float'},
@@ -49,7 +53,7 @@ class CopilotFeatureClient:
     >>> await client.stop()
     """
 
-    def __init__(self, config: Optional[CopilotConfig] = None, model: str = "gpt-5", **kwargs):
+    def __init__(self, config: Optional[CopilotConfig] = None, model: str = "gpt-5.2", **kwargs):
         self.config = config or CopilotConfig(model=model, **kwargs)
         self._client = None
         self._session = None
@@ -82,13 +86,13 @@ class CopilotFeatureClient:
             # Copilot SDK not installed - use mock mode
             self._copilot_available = False
            self._is_started = True
-
+            logger.warning("copilot-sdk not installed. Using mock LLM responses.")
 
         except Exception as e:
             # Copilot not available - use mock mode
             self._copilot_available = False
             self._is_started = True
-
+            logger.warning(f"Could not connect to Copilot: {e}. Using mock LLM responses.")
 
         return self
 
@@ -469,7 +473,37 @@ result = df['col1'] / (df['col2'] + 1e-8)
         local_vars = {"df": df, "np": np, "pd": pd}
         exec(
             code,
-            {
+            {
+                "__builtins__": {
+                    "len": len,
+                    "sum": sum,
+                    "max": max,
+                    "min": min,
+                    "int": int,
+                    "float": float,
+                    "str": str,
+                    "bool": bool,
+                    "abs": abs,
+                    "round": round,
+                    "pow": pow,
+                    "range": range,
+                    "list": list,
+                    "dict": dict,
+                    "set": set,
+                    "tuple": tuple,
+                    "sorted": sorted,
+                    "reversed": reversed,
+                    "enumerate": enumerate,
+                    "zip": zip,
+                    "any": any,
+                    "all": all,
+                    "map": map,
+                    "filter": filter,
+                    "isinstance": isinstance,
+                    "hasattr": hasattr,
+                    "getattr": getattr,
+                }
+            },
             local_vars,
         )
 
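The hunk above hardens the `exec` sandbox: generated feature code now runs with an explicit `__builtins__` whitelist instead of the default globals, so names like `open`, `__import__`, and `eval` are simply absent. A self-contained illustration of the technique (not featcopilot code; the snippet mirrors the `result = df['col1'] / (df['col2'] + 1e-8)` line visible in the hunk header):

```python
import numpy as np
import pandas as pd

SAFE_BUILTINS = {"len": len, "abs": abs, "min": min, "max": max, "sum": sum}

df = pd.DataFrame({"col1": [1.0, 2.0], "col2": [3.0, 4.0]})
local_vars = {"df": df, "np": np, "pd": pd}

# Allowed: only whitelisted builtins plus the injected locals are visible.
exec("result = df['col1'] / (df['col2'] + 1e-8)", {"__builtins__": SAFE_BUILTINS}, local_vars)
print(local_vars["result"])

# Blocked: open() is not in the whitelist, so the lookup raises NameError.
try:
    exec("open('/etc/passwd')", {"__builtins__": SAFE_BUILTINS}, local_vars)
except NameError as e:
    print(f"rejected: {e}")
```

Worth noting: a builtins whitelist prevents accidents more than it stops adversaries, since restricted code can often reach dangerous objects through dunder attribute chains. It should not be mistaken for a security boundary.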
@@ -491,31 +525,74 @@ class SyncCopilotFeatureClient:
         self._async_client = CopilotFeatureClient(**kwargs)
         self._loop = None
 
-    def
+    def _get_or_create_loop(self):
+        """Get or create a persistent event loop for this client."""
         if self._loop is None or self._loop.is_closed():
+            self._loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._loop)
+        return self._loop
+
+    def _run_async(self, coro):
+        """Run an async coroutine, handling various event loop scenarios."""
+        try:
+            # First, try to get the running loop
             try:
-
+                loop = asyncio.get_running_loop()
+                # We're in a running loop - use nest_asyncio if available
+                try:
+                    import nest_asyncio
+
+                    nest_asyncio.apply()
+                    return loop.run_until_complete(coro)
+                except ImportError:
+                    # nest_asyncio not available, use thread pool
+                    import concurrent.futures
+
+                    with concurrent.futures.ThreadPoolExecutor() as executor:
+                        future = executor.submit(self._run_in_new_loop, coro)
+                        return future.result(timeout=120)
             except RuntimeError:
-
-
-
+                # No running event loop - use our persistent loop
+                loop = self._get_or_create_loop()
+                return loop.run_until_complete(coro)
+        except Exception as e:
+            # Last resort - create a completely fresh loop
+            try:
+                return self._run_in_new_loop(coro)
+            except Exception:
+                raise e from None
+
+    def _run_in_new_loop(self, coro):
+        """Run coroutine in a fresh event loop."""
+        loop = asyncio.new_event_loop()
+        try:
+            asyncio.set_event_loop(loop)
+            return loop.run_until_complete(coro)
+        finally:
+            loop.close()
 
     def start(self):
-        return self.
+        return self._run_async(self._async_client.start())
 
     def stop(self):
-
+        result = self._run_async(self._async_client.stop())
+        # Close our loop if it exists
+        if self._loop is not None and not self._loop.is_closed():
+            self._loop.close()
+        self._loop = None
+        return result
 
     def suggest_features(self, **kwargs):
-        return self.
+        return self._run_async(self._async_client.suggest_features(**kwargs))
+
+    def send_prompt(self, prompt: str):
+        return self._run_async(self._async_client.send_prompt(prompt))
 
     def explain_feature(self, **kwargs):
-        return self.
+        return self._run_async(self._async_client.explain_feature(**kwargs))
 
     def generate_feature_code(self, **kwargs):
-        return self.
+        return self._run_async(self._async_client.generate_feature_code(**kwargs))
 
     def validate_feature_code(self, code: str, sample_data=None):
-        return self.
-        self._async_client.validate_feature_code(code=code, sample_data=sample_data)
-        )
+        return self._run_async(self._async_client.validate_feature_code(code=code, sample_data=sample_data))
featcopilot/llm/explainer.py
CHANGED

@@ -9,6 +9,9 @@ import pandas as pd
 
 from featcopilot.core.feature import Feature, FeatureSet
 from featcopilot.llm.copilot_client import SyncCopilotFeatureClient
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
 
 
 class FeatureExplainer:
@@ -20,7 +23,7 @@ class FeatureExplainer:
 
     Parameters
     ----------
-    model : str, default='gpt-5'
+    model : str, default='gpt-5.2'
         LLM model to use
 
     Examples
@@ -29,7 +32,7 @@ class FeatureExplainer:
     >>> explanations = explainer.explain_features(feature_set, task='predict churn')
     """
 
-    def __init__(self, model: str = "gpt-5", verbose: bool = False):
+    def __init__(self, model: str = "gpt-5.2", verbose: bool = False):
         self.model = model
         self.verbose = verbose
         self._client: Optional[SyncCopilotFeatureClient] = None
@@ -115,7 +118,7 @@ class FeatureExplainer:
 
         except Exception as e:
             if self.verbose:
-
+                logger.error(f"Could not explain {feature.name}: {e}")
             explanations[feature.name] = f"Feature based on: {', '.join(feature.source_columns)}"
 
         return explanations