featcopilot-0.2.0-py3-none-any.whl → featcopilot-0.3.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry; it is provided for informational purposes only.
- featcopilot/__init__.py +7 -0
- featcopilot/core/__init__.py +2 -0
- featcopilot/core/transform_rule.py +276 -0
- featcopilot/engines/tabular.py +145 -2
- featcopilot/engines/text.py +346 -8
- featcopilot/engines/timeseries.py +230 -1
- featcopilot/llm/__init__.py +2 -0
- featcopilot/llm/copilot_client.py +50 -17
- featcopilot/llm/semantic_engine.py +652 -10
- featcopilot/llm/transform_rule_generator.py +403 -0
- featcopilot/selection/importance.py +35 -7
- featcopilot/selection/redundancy.py +35 -9
- featcopilot/selection/statistical.py +103 -33
- featcopilot/selection/unified.py +54 -3
- featcopilot/stores/__init__.py +2 -0
- featcopilot/stores/rule_store.py +343 -0
- featcopilot/transformers/sklearn_compat.py +10 -1
- {featcopilot-0.2.0.dist-info → featcopilot-0.3.0.dist-info}/METADATA +27 -19
- featcopilot-0.3.0.dist-info/RECORD +38 -0
- featcopilot-0.2.0.dist-info/RECORD +0 -35
- {featcopilot-0.2.0.dist-info → featcopilot-0.3.0.dist-info}/WHEEL +0 -0
- {featcopilot-0.2.0.dist-info → featcopilot-0.3.0.dist-info}/top_level.txt +0 -0
featcopilot/engines/timeseries.py
CHANGED

@@ -28,6 +28,10 @@ class TimeSeriesEngineConfig(EngineConfig):
             "autocorrelation",
             "peaks",
             "trends",
+            "entropy",
+            "energy",
+            "complexity",
+            "counts",
         ],
         description="Feature groups to extract",
     )
@@ -36,6 +40,7 @@ class TimeSeriesEngineConfig(EngineConfig):
     )
     n_fft_coefficients: int = Field(default=10, description="Number of FFT coefficients")
     n_autocorr_lags: int = Field(default=10, description="Number of autocorrelation lags")
+    entropy_bins: int = Field(default=10, description="Number of bins for binned entropy")
 
 
 class TimeSeriesEngine(BaseEngine):
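The four new feature groups are opt-in through the same list shown above (assumed here to be a field named `feature_groups`, which the hunk's context does not confirm), and the new `entropy_bins` field controls the histogram used for binned entropy. A minimal configuration sketch; the import path is inferred from the file list in this diff, not verified against the wheel:

```python
# Hypothetical usage sketch: the import path and the feature_groups field
# name are inferred from this diff, not confirmed by package documentation.
from featcopilot.engines.timeseries import TimeSeriesEngineConfig

config = TimeSeriesEngineConfig(
    feature_groups=[
        "basic_stats",
        "entropy",     # new in 0.3.0
        "energy",      # new in 0.3.0
        "complexity",  # new in 0.3.0
        "counts",      # new in 0.3.0
    ],
    entropy_bins=20,  # new field; the default stays at 10
)
```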
@@ -66,7 +71,7 @@ class TimeSeriesEngine(BaseEngine):
     >>> X_features = engine.fit_transform(time_series_df)
     """
 
-    # Feature extraction functions
+    # Feature extraction functions (tsfresh-inspired)
     FEATURE_EXTRACTORS = {
         "basic_stats": "_extract_basic_stats",
         "distribution": "_extract_distribution",
@@ -75,6 +80,10 @@ class TimeSeriesEngine(BaseEngine):
         "trends": "_extract_trends",
         "rolling": "_extract_rolling",
         "fft": "_extract_fft",
+        "entropy": "_extract_entropy",
+        "energy": "_extract_energy",
+        "complexity": "_extract_complexity",
+        "counts": "_extract_counts",
     }
 
     def __init__(
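`FEATURE_EXTRACTORS` maps each group name to the name of a bound method, which implies the engine dispatches by string lookup, presumably with `getattr`. A self-contained sketch of that pattern (the dispatch loop is an assumption; the diff only shows the mapping):

```python
# Illustration of the string-dispatch pattern implied by FEATURE_EXTRACTORS.
# MiniEngine and its extract() driver are invented for this sketch.
import numpy as np

class MiniEngine:
    FEATURE_EXTRACTORS = {"energy": "_extract_energy"}

    def _extract_energy(self, series: np.ndarray, col: str) -> dict[str, float]:
        return {f"{col}_abs_energy": float(np.sum(series**2))}

    def extract(self, series: np.ndarray, col: str, groups: list[str]) -> dict[str, float]:
        features: dict[str, float] = {}
        for group in groups:
            extractor = getattr(self, self.FEATURE_EXTRACTORS[group])
            features.update(extractor(series, col))
        return features

print(MiniEngine().extract(np.array([1.0, 2.0]), "x", ["energy"]))  # {'x_abs_energy': 5.0}
```

The bulk of the change is the hunk that adds the four extractor methods: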
@@ -400,6 +409,226 @@ class TimeSeriesEngine(BaseEngine):
 
         return features
 
+    def _extract_entropy(self, series: np.ndarray, col: str) -> dict[str, float]:
+        """Extract entropy-based features (tsfresh-inspired)."""
+        features = {}
+        prefix = col
+
+        series_clean = series[~np.isnan(series)]
+        if len(series_clean) < 4:
+            return features
+
+        # Binned entropy
+        try:
+            hist, _ = np.histogram(series_clean, bins=self.config.entropy_bins)
+            hist = hist[hist > 0]
+            probs = hist / hist.sum()
+            features[f"{prefix}_binned_entropy"] = -np.sum(probs * np.log(probs + 1e-10))
+        except Exception:
+            features[f"{prefix}_binned_entropy"] = 0
+
+        # Sample entropy (simplified implementation)
+        try:
+            features[f"{prefix}_sample_entropy"] = self._sample_entropy(series_clean, m=2, r=0.2)
+        except Exception:
+            features[f"{prefix}_sample_entropy"] = 0
+
+        # Approximate entropy
+        try:
+            features[f"{prefix}_approximate_entropy"] = self._approximate_entropy(series_clean, m=2, r=0.2)
+        except Exception:
+            features[f"{prefix}_approximate_entropy"] = 0
+
+        return features
+
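`_extract_entropy` wraps each statistic in `try/except` and falls back to `0`, so one failing computation never aborts the whole extraction. Binned entropy is the Shannon entropy of the value histogram; a standalone check of the same formula, outside the engine:

```python
import numpy as np

def binned_entropy(series: np.ndarray, bins: int = 10) -> float:
    """Shannon entropy of the value histogram (same formula as the diff)."""
    hist, _ = np.histogram(series, bins=bins)
    hist = hist[hist > 0]          # drop empty bins before taking logs
    probs = hist / hist.sum()
    return float(-np.sum(probs * np.log(probs + 1e-10)))

rng = np.random.default_rng(0)
print(binned_entropy(np.zeros(100)))          # ~0.0: all mass in one bin
print(binned_entropy(rng.uniform(size=100)))  # near log(10) ≈ 2.3: spread out
```

The hunk continues with the two helpers that `_extract_entropy` delegates to: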
+    def _sample_entropy(self, series: np.ndarray, m: int = 2, r: float = 0.2) -> float:
+        """Compute sample entropy of a time series."""
+        n = len(series)
+        if n < m + 2:
+            return 0
+
+        # Normalize r by std
+        r = r * np.std(series)
+        if r == 0:
+            return 0
+
+        def _count_matches(template_length):
+            count = 0
+            templates = np.array([series[i : i + template_length] for i in range(n - template_length)])
+            for i in range(len(templates)):
+                for j in range(i + 1, len(templates)):
+                    if np.max(np.abs(templates[i] - templates[j])) < r:
+                        count += 1
+            return count
+
+        a = _count_matches(m)
+        b = _count_matches(m + 1)
+
+        if a == 0 or b == 0:
+            return 0
+
+        return -np.log(b / a)
+
+    def _approximate_entropy(self, series: np.ndarray, m: int = 2, r: float = 0.2) -> float:
+        """Compute approximate entropy of a time series."""
+        n = len(series)
+        if n < m + 2:
+            return 0
+
+        r = r * np.std(series)
+        if r == 0:
+            return 0
+
+        def _phi(m_val):
+            patterns = np.array([series[i : i + m_val] for i in range(n - m_val + 1)])
+            counts = np.zeros(len(patterns))
+            for i, pattern in enumerate(patterns):
+                for other in patterns:
+                    if np.max(np.abs(pattern - other)) < r:
+                        counts[i] += 1
+            counts = counts / len(patterns)
+            return np.sum(np.log(counts + 1e-10)) / len(patterns)
+
+        return _phi(m) - _phi(m + 1)
+
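Both helpers use brute-force O(n²) template matching, scale the tolerance `r` by the series' standard deviation, and return `0` on degenerate input instead of raising. In `_sample_entropy`, `a` counts matching template pairs of length `m` and `b` pairs of length `m + 1`, so `-log(b / a)` is the standard SampEn estimate: near 0 for regular signals, larger for unpredictable ones. A standalone sanity check of that behavior, copying the diff's algorithm into a free function:

```python
import numpy as np

def sample_entropy(series: np.ndarray, m: int = 2, r: float = 0.2) -> float:
    """Free-function copy of the diff's _sample_entropy, for experimentation."""
    n = len(series)
    if n < m + 2:
        return 0.0
    r = r * np.std(series)  # tolerance scaled by signal spread
    if r == 0:
        return 0.0

    def count_matches(length: int) -> int:
        templates = np.array([series[i : i + length] for i in range(n - length)])
        count = 0
        for i in range(len(templates)):
            for j in range(i + 1, len(templates)):
                if np.max(np.abs(templates[i] - templates[j])) < r:
                    count += 1
        return count

    a, b = count_matches(m), count_matches(m + 1)
    return 0.0 if a == 0 or b == 0 else float(-np.log(b / a))

rng = np.random.default_rng(1)
t = np.linspace(0, 8 * np.pi, 200)
print(sample_entropy(np.sin(t)))             # low: highly regular signal
print(sample_entropy(rng.normal(size=200)))  # higher: unpredictable signal
```

The quadratic loops are fine for short windows but will dominate runtime on long series, so input length is worth watching when these groups are enabled. Next in the hunk, the energy group: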
+    def _extract_energy(self, series: np.ndarray, col: str) -> dict[str, float]:
+        """Extract energy-based features (tsfresh-inspired)."""
+        features = {}
+        prefix = col
+
+        series_clean = series[~np.isnan(series)]
+        if len(series_clean) < 2:
+            return features
+
+        # Absolute energy: sum of squared values
+        features[f"{prefix}_abs_energy"] = np.sum(series_clean**2)
+
+        # Mean absolute change
+        features[f"{prefix}_mean_abs_change"] = np.mean(np.abs(np.diff(series_clean)))
+
+        # Mean second derivative central
+        if len(series_clean) >= 3:
+            second_deriv = series_clean[2:] - 2 * series_clean[1:-1] + series_clean[:-2]
+            features[f"{prefix}_mean_second_deriv_central"] = np.mean(second_deriv)
+
+        # Root mean square
+        features[f"{prefix}_rms"] = np.sqrt(np.mean(series_clean**2))
+
+        # Crest factor (peak/rms)
+        rms = features[f"{prefix}_rms"]
+        if rms > 0:
+            features[f"{prefix}_crest_factor"] = np.max(np.abs(series_clean)) / rms
+
+        return features
+
|
+
def _extract_complexity(self, series: np.ndarray, col: str) -> dict[str, float]:
|
|
525
|
+
"""Extract complexity features (tsfresh-inspired)."""
|
|
526
|
+
features = {}
|
|
527
|
+
prefix = col
|
|
528
|
+
|
|
529
|
+
series_clean = series[~np.isnan(series)]
|
|
530
|
+
if len(series_clean) < 3:
|
|
531
|
+
return features
|
|
532
|
+
|
|
533
|
+
# CID_CE: Complexity-invariant distance
|
|
534
|
+
diff = np.diff(series_clean)
|
|
535
|
+
features[f"{prefix}_cid_ce"] = np.sqrt(np.sum(diff**2))
|
|
536
|
+
|
|
537
|
+
# C3: Time series complexity (lag 1)
|
|
538
|
+
if len(series_clean) >= 3:
|
|
539
|
+
n = len(series_clean)
|
|
540
|
+
c3 = np.sum(series_clean[2:n] * series_clean[1 : n - 1] * series_clean[0 : n - 2]) / (n - 2)
|
|
541
|
+
features[f"{prefix}_c3"] = c3
|
|
542
|
+
|
|
543
|
+
# Ratio of unique values to length
|
|
544
|
+
features[f"{prefix}_ratio_unique_values"] = len(np.unique(series_clean)) / len(series_clean)
|
|
545
|
+
|
|
546
|
+
# Has duplicate
|
|
547
|
+
features[f"{prefix}_has_duplicate"] = 1 if len(np.unique(series_clean)) < len(series_clean) else 0
|
|
548
|
+
|
|
549
|
+
# Has duplicate max
|
|
550
|
+
max_val = np.max(series_clean)
|
|
551
|
+
features[f"{prefix}_has_duplicate_max"] = 1 if np.sum(series_clean == max_val) > 1 else 0
|
|
552
|
+
|
|
553
|
+
# Has duplicate min
|
|
554
|
+
min_val = np.min(series_clean)
|
|
555
|
+
features[f"{prefix}_has_duplicate_min"] = 1 if np.sum(series_clean == min_val) > 1 else 0
|
|
556
|
+
|
|
557
|
+
# Sum of reoccurring values
|
|
558
|
+
unique, counts = np.unique(series_clean, return_counts=True)
|
|
559
|
+
reoccurring_mask = counts > 1
|
|
560
|
+
features[f"{prefix}_sum_reoccurring_values"] = np.sum(unique[reoccurring_mask] * counts[reoccurring_mask])
|
|
561
|
+
|
|
562
|
+
# Sum of reoccurring data points
|
|
563
|
+
features[f"{prefix}_sum_reoccurring_data_points"] = np.sum(counts[reoccurring_mask])
|
|
564
|
+
|
|
565
|
+
# Percentage of reoccurring data points
|
|
566
|
+
features[f"{prefix}_pct_reoccurring_data_points"] = np.sum(counts[reoccurring_mask]) / len(series_clean)
|
|
567
|
+
|
|
568
|
+
return features
|
|
569
|
+
|
|
570
|
+
def _extract_counts(self, series: np.ndarray, col: str) -> dict[str, float]:
|
|
571
|
+
"""Extract count-based features (tsfresh-inspired)."""
|
|
572
|
+
features = {}
|
|
573
|
+
prefix = col
|
|
574
|
+
|
|
575
|
+
series_clean = series[~np.isnan(series)]
|
|
576
|
+
if len(series_clean) < 2:
|
|
577
|
+
return features
|
|
578
|
+
|
|
579
|
+
mean_val = np.mean(series_clean)
|
|
580
|
+
|
|
581
|
+
# Count above mean
|
|
582
|
+
features[f"{prefix}_count_above_mean"] = np.sum(series_clean > mean_val)
|
|
583
|
+
|
|
584
|
+
# Count below mean
|
|
585
|
+
features[f"{prefix}_count_below_mean"] = np.sum(series_clean < mean_val)
|
|
586
|
+
|
|
587
|
+
# First location of maximum
|
|
588
|
+
features[f"{prefix}_first_loc_max"] = np.argmax(series_clean) / len(series_clean)
|
|
589
|
+
|
|
590
|
+
# First location of minimum
|
|
591
|
+
features[f"{prefix}_first_loc_min"] = np.argmin(series_clean) / len(series_clean)
|
|
592
|
+
|
|
593
|
+
# Last location of maximum
|
|
594
|
+
features[f"{prefix}_last_loc_max"] = (len(series_clean) - 1 - np.argmax(series_clean[::-1])) / len(series_clean)
|
|
595
|
+
|
|
596
|
+
# Last location of minimum
|
|
597
|
+
features[f"{prefix}_last_loc_min"] = (len(series_clean) - 1 - np.argmin(series_clean[::-1])) / len(series_clean)
|
|
598
|
+
|
|
599
|
+
# Longest strike above mean
|
|
600
|
+
above_mean = series_clean > mean_val
|
|
601
|
+
features[f"{prefix}_longest_strike_above_mean"] = self._longest_consecutive(above_mean)
|
|
602
|
+
|
|
603
|
+
# Longest strike below mean
|
|
604
|
+
below_mean = series_clean < mean_val
|
|
605
|
+
features[f"{prefix}_longest_strike_below_mean"] = self._longest_consecutive(below_mean)
|
|
606
|
+
|
|
607
|
+
# Number of crossings (mean)
|
|
608
|
+
crossings = np.sum(np.diff(np.sign(series_clean - mean_val)) != 0)
|
|
609
|
+
features[f"{prefix}_number_crossings_mean"] = crossings
|
|
610
|
+
|
|
611
|
+
# Number of zero crossings
|
|
612
|
+
zero_crossings = np.sum(np.diff(np.sign(series_clean)) != 0)
|
|
613
|
+
features[f"{prefix}_number_zero_crossings"] = zero_crossings
|
|
614
|
+
|
|
615
|
+
# Absolute sum of changes
|
|
616
|
+
features[f"{prefix}_abs_sum_changes"] = np.sum(np.abs(np.diff(series_clean)))
|
|
617
|
+
|
|
618
|
+
return features
|
|
619
|
+
|
|
620
|
+
def _longest_consecutive(self, bool_array: np.ndarray) -> int:
|
|
621
|
+
"""Find longest consecutive True values in boolean array."""
|
|
622
|
+
max_len = 0
|
|
623
|
+
current_len = 0
|
|
624
|
+
for val in bool_array:
|
|
625
|
+
if val:
|
|
626
|
+
current_len += 1
|
|
627
|
+
max_len = max(max_len, current_len)
|
|
628
|
+
else:
|
|
629
|
+
current_len = 0
|
|
630
|
+
return max_len
|
|
631
|
+
|
|
403
632
|
def get_feature_set(self) -> FeatureSet:
|
|
404
633
|
"""Get the feature set with metadata."""
|
|
405
634
|
return self._feature_set
|
featcopilot/llm/__init__.py
CHANGED
@@ -8,6 +8,7 @@ from featcopilot.llm.copilot_client import CopilotFeatureClient
 from featcopilot.llm.explainer import FeatureExplainer
 from featcopilot.llm.litellm_client import LiteLLMFeatureClient, SyncLiteLLMFeatureClient
 from featcopilot.llm.semantic_engine import SemanticEngine
+from featcopilot.llm.transform_rule_generator import TransformRuleGenerator
 
 __all__ = [
     "CopilotFeatureClient",
@@ -16,4 +17,5 @@ __all__ = [
     "SemanticEngine",
    "FeatureExplainer",
     "FeatureCodeGenerator",
+    "TransformRuleGenerator",
 ]
featcopilot/llm/copilot_client.py
CHANGED

@@ -523,38 +523,71 @@ class SyncCopilotFeatureClient:
 
     def __init__(self, **kwargs):
         self._async_client = CopilotFeatureClient(**kwargs)
+        self._loop = None
+
+    def _get_or_create_loop(self):
+        """Get or create a persistent event loop for this client."""
+        if self._loop is None or self._loop.is_closed():
+            self._loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._loop)
+        return self._loop
 
     def _run_async(self, coro):
-        """Run an async coroutine, handling
+        """Run an async coroutine, handling various event loop scenarios."""
         try:
-            #
-            loop = asyncio.get_running_loop()
-            # We're in a running loop - use nest_asyncio if available
+            # First, try to get the running loop
             try:
-
-
-
+                loop = asyncio.get_running_loop()
+                # We're in a running loop - use nest_asyncio if available
+                try:
+                    import nest_asyncio
+
+                    nest_asyncio.apply()
+                    return loop.run_until_complete(coro)
+                except ImportError:
+                    # nest_asyncio not available, use thread pool
+                    import concurrent.futures
+
+                    with concurrent.futures.ThreadPoolExecutor() as executor:
+                        future = executor.submit(self._run_in_new_loop, coro)
+                        return future.result(timeout=120)
+            except RuntimeError:
+                # No running event loop - use our persistent loop
+                loop = self._get_or_create_loop()
                 return loop.run_until_complete(coro)
-
-
-
+        except Exception as e:
+            # Last resort - create a completely fresh loop
+            try:
+                return self._run_in_new_loop(coro)
+            except Exception:
+                raise e from None
 
-
-
-
-
-
-            return
+    def _run_in_new_loop(self, coro):
+        """Run coroutine in a fresh event loop."""
+        loop = asyncio.new_event_loop()
+        try:
+            asyncio.set_event_loop(loop)
+            return loop.run_until_complete(coro)
+        finally:
+            loop.close()
 
     def start(self):
         return self._run_async(self._async_client.start())
 
     def stop(self):
-
+        result = self._run_async(self._async_client.stop())
+        # Close our loop if it exists
+        if self._loop is not None and not self._loop.is_closed():
+            self._loop.close()
+            self._loop = None
+        return result
 
     def suggest_features(self, **kwargs):
         return self._run_async(self._async_client.suggest_features(**kwargs))
 
+    def send_prompt(self, prompt: str):
+        return self._run_async(self._async_client.send_prompt(prompt))
+
     def explain_feature(self, **kwargs):
         return self._run_async(self._async_client.explain_feature(**kwargs))
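The rewritten `_run_async` is a three-tier strategy: inside a running event loop it prefers `nest_asyncio`, falls back to driving the coroutine on a fresh loop in a worker thread (capped at 120 s), and with no running loop it reuses a persistent loop that `stop()` now closes. The persistent loop presumably matters because async resources such as HTTP sessions are bound to the loop they were created on, so recreating a loop per call would invalidate them. The shape of the pattern, reduced to a self-contained sketch that omits the `nest_asyncio` branch and the wrapped client (illustrative only, not the package's API):

```python
import asyncio
import concurrent.futures

def _run_in_new_loop(coro):
    """Drive a coroutine on a brand-new event loop, then close the loop."""
    loop = asyncio.new_event_loop()
    try:
        asyncio.set_event_loop(loop)
        return loop.run_until_complete(coro)
    finally:
        loop.close()

def run_async(coro, timeout: float = 120.0):
    """Sync-over-async bridge in the style of SyncCopilotFeatureClient._run_async."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread: safe to drive one directly.
        return _run_in_new_loop(coro)
    # A loop is already running (e.g. in Jupyter): blocking on it would
    # deadlock, so hand the coroutine to a worker thread with its own loop.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        return executor.submit(_run_in_new_loop, coro).result(timeout=timeout)

async def hello() -> str:
    await asyncio.sleep(0)
    return "ok"

print(run_async(hello()))  # ok
```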