felits 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- felits/__init__.py +161 -0
- felits/_compat.py +173 -0
- felits/data.py +89 -0
- felits/feature_extraction/__init__.py +21 -0
- felits/feature_extraction/automated.py +142 -0
- felits/feature_extraction/spectral.py +133 -0
- felits/feature_extraction/temporal.py +177 -0
- felits/feature_selection/__init__.py +59 -0
- felits/feature_selection/causal.py +167 -0
- felits/feature_selection/ensemble.py +105 -0
- felits/feature_selection/information.py +195 -0
- felits/feature_selection/pipeline.py +154 -0
- felits/feature_selection/regularization.py +171 -0
- felits/feature_selection/xai.py +225 -0
- felits/models/__init__.py +28 -0
- felits/models/base.py +97 -0
- felits/models/dl.py +127 -0
- felits/models/sklearn.py +76 -0
- felits/optimization.py +92 -0
- felits/preprocessing/__init__.py +35 -0
- felits/preprocessing/decomposition.py +119 -0
- felits/preprocessing/imputation.py +67 -0
- felits/preprocessing/metrics.py +119 -0
- felits/preprocessing/outliers.py +109 -0
- felits/preprocessing/scaling.py +153 -0
- felits/xai.py +138 -0
- felits-0.1.0.dist-info/METADATA +161 -0
- felits-0.1.0.dist-info/RECORD +30 -0
- felits-0.1.0.dist-info/WHEEL +4 -0
- felits-0.1.0.dist-info/licenses/LICENSE +21 -0
felits/__init__.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""FELITS: Feature Engineering and Large-scale Integration for Time Series.
|
|
2
|
+
|
|
3
|
+
Top-level package exposing the most commonly used classes and functions
|
|
4
|
+
through a flat, ergonomic API::
|
|
5
|
+
|
|
6
|
+
from felits import (
|
|
7
|
+
HampelFilter, TimeSeriesScaler, SlidingWindowSplitter, Metrics,
|
|
8
|
+
cyclical_encode, fft_features, tsfresh_extract,
|
|
9
|
+
FeatureSelector, granger_feature_selection, shap_feature_selection,
|
|
10
|
+
XGBoostForecaster, RandomForestForecaster, RNNBasedModel,
|
|
11
|
+
OptunaOptimizer, deep_shap_selector,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
Submodules are still importable for advanced use::
|
|
15
|
+
|
|
16
|
+
from felits.preprocessing import iqr_outlier_detection
|
|
17
|
+
from felits.feature_selection import mrmr_selection
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from . import data, models, optimization
|
|
23
|
+
from .feature_extraction import (
|
|
24
|
+
cyclical_encode,
|
|
25
|
+
extract_all_features,
|
|
26
|
+
fats_extract,
|
|
27
|
+
fft_features,
|
|
28
|
+
lag_features,
|
|
29
|
+
rolling_statistics,
|
|
30
|
+
shift_features,
|
|
31
|
+
spectral_entropy,
|
|
32
|
+
tsfresh_extract,
|
|
33
|
+
wavelet_features,
|
|
34
|
+
)
|
|
35
|
+
from .feature_selection import (
|
|
36
|
+
FeatureSelector,
|
|
37
|
+
adaptive_lasso_selection,
|
|
38
|
+
elastic_net_selection,
|
|
39
|
+
granger_feature_selection,
|
|
40
|
+
lasso_selection,
|
|
41
|
+
lime_explain_instance,
|
|
42
|
+
mrmr_selection,
|
|
43
|
+
mutual_information_ksg,
|
|
44
|
+
permutation_importance_selection,
|
|
45
|
+
rf_importance_selection,
|
|
46
|
+
select_features,
|
|
47
|
+
shap_feature_selection,
|
|
48
|
+
shap_interaction_selection,
|
|
49
|
+
xgboost_importance_selection,
|
|
50
|
+
)
|
|
51
|
+
from .optimization import OptunaOptimizer
|
|
52
|
+
from .preprocessing import (
|
|
53
|
+
DecompositionResult,
|
|
54
|
+
HampelFilter,
|
|
55
|
+
Metrics,
|
|
56
|
+
SlidingWindowSplitter,
|
|
57
|
+
TimeSeriesScaler,
|
|
58
|
+
forward_fill,
|
|
59
|
+
hampel_filter,
|
|
60
|
+
iqr_outlier_detection,
|
|
61
|
+
linear_interpolate,
|
|
62
|
+
mae,
|
|
63
|
+
mape,
|
|
64
|
+
max_error,
|
|
65
|
+
mse,
|
|
66
|
+
r2,
|
|
67
|
+
rmse,
|
|
68
|
+
seasonal_adjust,
|
|
69
|
+
smape,
|
|
70
|
+
stl_decompose,
|
|
71
|
+
three_sigma_filter,
|
|
72
|
+
time_aware_interpolate,
|
|
73
|
+
)
|
|
74
|
+
from .xai import deep_shap_selector, explain_forecast
|
|
75
|
+
|
|
76
|
+
__version__ = "0.1.0"
|
|
77
|
+
__author__ = "Félix Morales Mareco"
|
|
78
|
+
__license__ = "MIT"
|
|
79
|
+
|
|
80
|
+
__all__ = [
|
|
81
|
+
"__version__",
|
|
82
|
+
"__author__",
|
|
83
|
+
"__license__",
|
|
84
|
+
# submodules
|
|
85
|
+
"data",
|
|
86
|
+
"models",
|
|
87
|
+
"optimization",
|
|
88
|
+
# preprocessing
|
|
89
|
+
"DecompositionResult",
|
|
90
|
+
"HampelFilter",
|
|
91
|
+
"Metrics",
|
|
92
|
+
"SlidingWindowSplitter",
|
|
93
|
+
"TimeSeriesScaler",
|
|
94
|
+
"forward_fill",
|
|
95
|
+
"hampel_filter",
|
|
96
|
+
"iqr_outlier_detection",
|
|
97
|
+
"linear_interpolate",
|
|
98
|
+
"mae",
|
|
99
|
+
"mape",
|
|
100
|
+
"max_error",
|
|
101
|
+
"mse",
|
|
102
|
+
"r2",
|
|
103
|
+
"rmse",
|
|
104
|
+
"seasonal_adjust",
|
|
105
|
+
"smape",
|
|
106
|
+
"stl_decompose",
|
|
107
|
+
"three_sigma_filter",
|
|
108
|
+
"time_aware_interpolate",
|
|
109
|
+
# feature_extraction
|
|
110
|
+
"cyclical_encode",
|
|
111
|
+
"extract_all_features",
|
|
112
|
+
"fats_extract",
|
|
113
|
+
"fft_features",
|
|
114
|
+
"lag_features",
|
|
115
|
+
"rolling_statistics",
|
|
116
|
+
"shift_features",
|
|
117
|
+
"spectral_entropy",
|
|
118
|
+
"tsfresh_extract",
|
|
119
|
+
"wavelet_features",
|
|
120
|
+
# feature_selection
|
|
121
|
+
"FeatureSelector",
|
|
122
|
+
"adaptive_lasso_selection",
|
|
123
|
+
"elastic_net_selection",
|
|
124
|
+
"granger_feature_selection",
|
|
125
|
+
"lasso_selection",
|
|
126
|
+
"lime_explain_instance",
|
|
127
|
+
"mrmr_selection",
|
|
128
|
+
"mutual_information_ksg",
|
|
129
|
+
"permutation_importance_selection",
|
|
130
|
+
"rf_importance_selection",
|
|
131
|
+
"select_features",
|
|
132
|
+
"shap_feature_selection",
|
|
133
|
+
"shap_interaction_selection",
|
|
134
|
+
"xgboost_importance_selection",
|
|
135
|
+
# optimization
|
|
136
|
+
"OptunaOptimizer",
|
|
137
|
+
# xai
|
|
138
|
+
"deep_shap_selector",
|
|
139
|
+
"explain_forecast",
|
|
140
|
+
]
|
|
141
|
+
|
|
142
|
+
# Re-export the DL models and sklearn forecasters at the top level
|
|
143
|
+
from .models import (
|
|
144
|
+
BahdanauAttention,
|
|
145
|
+
LinearForecaster,
|
|
146
|
+
RandomForestForecaster,
|
|
147
|
+
RNNAttentionModel,
|
|
148
|
+
RNNBasedModel,
|
|
149
|
+
XGBoostForecaster,
|
|
150
|
+
is_dl_available,
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
__all__ += [
|
|
154
|
+
"BahdanauAttention",
|
|
155
|
+
"LinearForecaster",
|
|
156
|
+
"RNNAttentionModel",
|
|
157
|
+
"RNNBasedModel",
|
|
158
|
+
"RandomForestForecaster",
|
|
159
|
+
"XGBoostForecaster",
|
|
160
|
+
"is_dl_available",
|
|
161
|
+
]
|
felits/_compat.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""Compatibility layer for dual pandas / polars support.
|
|
2
|
+
|
|
3
|
+
FELITS accepts both ``pd.DataFrame`` and ``pl.DataFrame`` (or a mix of both)
|
|
4
|
+
at all public APIs. Internally the library works with :class:`polars.DataFrame`
|
|
5
|
+
for performance. Conversion to pandas happens only at library boundaries
|
|
6
|
+
(e.g. ``statsmodels``, ``tsfresh``) that require it.
|
|
7
|
+
|
|
8
|
+
Usage
|
|
9
|
+
-----
|
|
10
|
+
>>> import polars as pl
>>> from felits._compat import to_polars, to_pandas, DataFrameLike
|
|
11
|
+
>>>
|
|
12
|
+
>>> def my_public_api(df: DataFrameLike, target: str):
|
|
13
|
+
... pdf = to_polars(df) # guarantee polars
|
|
14
|
+
... result = pdf.select(pl.col(target))
|
|
15
|
+
... return to_pandas(result) # back to pandas for the caller
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from typing import TYPE_CHECKING, Union
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
import pandas as pd
|
|
26
|
+
import polars as pl
|
|
27
|
+
|
|
28
|
+
DataFrameLike = Union["pd.DataFrame", "pl.DataFrame"]
|
|
29
|
+
SeriesLike = Union["pd.Series", "pl.Series"]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def to_polars(data: object, *columns: str, include_index: bool = False) -> "pl.DataFrame":
    """Normalise ``data`` into a :class:`polars.DataFrame`.

    Polars frames are returned as-is and polars Series are wrapped with
    ``to_frame()``.  pandas objects go through ``pl.from_pandas``, NumPy
    arrays are split column-wise and dicts are handed to ``pl.DataFrame``.

    Parameters
    ----------
    data:
        Object to convert.
    *columns:
        Column names, used for NumPy input only.  Names are paired with the
        array columns by position, so only the first ``len(columns)`` array
        columns are kept.  Without names the columns are called ``col_0``,
        ``col_1``, ...
    include_index:
        When ``True`` and ``data`` is a pandas DataFrame with a DatetimeIndex,
        the index is turned into a column via ``reset_index()``.  If the
        first resulting column is called ``"index"`` it is renamed to the
        index name, or to ``"_time"`` when the index has no name.

    Raises
    ------
    TypeError
        If ``data`` is a ``str`` or any other unsupported type.
    """
    import polars as pl

    if isinstance(data, pl.DataFrame):
        return data
    if isinstance(data, pl.Series):
        return data.to_frame()
    if isinstance(data, str):
        raise TypeError(f"Cannot convert str to polars DataFrame: {data!r}")

    # pandas is only imported once the polars fast paths have been ruled out.
    import pandas as pd

    if isinstance(data, pd.DataFrame):
        keep_index = include_index and isinstance(data.index, pd.DatetimeIndex)
        if not keep_index:
            return pl.from_pandas(data)
        frame = pl.from_pandas(data.reset_index())
        if frame.columns[0] == "index":
            frame = frame.rename({"index": data.index.name or "_time"})
        return frame

    if isinstance(data, pd.Series):
        return pl.from_pandas(data.to_frame())

    if isinstance(data, np.ndarray):
        matrix = data.reshape(-1, 1) if data.ndim == 1 else data
        if columns:
            labelled = enumerate(columns)
        else:
            labelled = ((pos, f"col_{pos}") for pos in range(matrix.shape[1]))
        return pl.DataFrame({label: matrix[:, pos] for pos, label in labelled})

    if isinstance(data, dict):
        return pl.DataFrame(data)

    raise TypeError(f"Cannot convert {type(data).__name__} to polars DataFrame.")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def to_pandas(data: object) -> "pd.DataFrame":
    """Convert ``data`` to a :class:`pandas.DataFrame`.

    Accepts:
    - :class:`pandas.DataFrame` → returned unchanged
    - :class:`pandas.Series` → wrapped with ``Series.to_frame()``
    - :class:`numpy.ndarray` → one ``"value"`` column for 1-D input,
      ``col_0``, ``col_1``, ... for arrays with more dimensions
    - :class:`polars.DataFrame` → converted via ``DataFrame.to_pandas()``
    - :class:`polars.Series` → converted via ``Series.to_pandas().to_frame()``

    Raises
    ------
    TypeError
        If ``data`` is none of the supported types.
    """
    import pandas as pd

    if isinstance(data, pd.DataFrame):
        return data
    if isinstance(data, pd.Series):
        return data.to_frame()

    # NumPy input needs nothing from polars, so handle it before the optional
    # polars import below.
    if isinstance(data, np.ndarray):
        cols = [f"col_{i}" for i in range(data.shape[1])] if data.ndim > 1 else ["value"]
        return pd.DataFrame(data, columns=cols)

    # Mirror ``is_polars``: a missing polars install simply means ``data``
    # cannot be a polars object, so fall through to the TypeError.
    try:
        import polars as pl
    except ImportError:
        pl = None

    if pl is not None:
        if isinstance(data, pl.DataFrame):
            return data.to_pandas()
        if isinstance(data, pl.Series):
            return data.to_pandas().to_frame()

    raise TypeError(f"Cannot convert {type(data).__name__} to pandas DataFrame.")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def is_polars(data: object) -> bool:
    """Return ``True`` when ``data`` is a polars DataFrame or Series.

    If polars cannot be imported the answer is ``False``.
    """
    try:
        import polars as pl
    except ImportError:
        return False
    polars_types = (pl.DataFrame, pl.Series)
    return isinstance(data, polars_types)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def is_pandas(data: object) -> bool:
    """Return ``True`` when ``data`` is a pandas DataFrame or Series.

    If pandas cannot be imported the answer is ``False``.
    """
    try:
        import pandas as pd
    except ImportError:
        return False
    pandas_types = (pd.DataFrame, pd.Series)
    return isinstance(data, pandas_types)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def to_numpy(data: object) -> np.ndarray:
    """Return ``data`` as a NumPy array, whatever backend produced it.

    NumPy arrays are returned unchanged.  Objects exposing a ``to_numpy``
    attribute are converted by calling it.  Anything else is coerced with
    ``np.asarray(..., dtype=float)``.
    """
    if isinstance(data, np.ndarray):
        return data
    try:
        converter = data.to_numpy
    except AttributeError:
        # No backend-specific converter: fall back to a float array.
        return np.asarray(data, dtype=float)
    return converter()
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def with_columns(data: "pl.DataFrame", **kwargs: object) -> "pl.DataFrame":
    """Add or replace columns in a polars DataFrame, one keyword at a time.

    Intended as the counterpart of the ``pandas`` ``df[col] = values``
    pattern.  Each keyword names a column; its value may be

    - a ``list`` or ``numpy.ndarray`` (wrapped in ``pl.Series``),
    - a ``pl.Expr`` (aliased to the keyword name), or
    - an ``int`` / ``float`` / ``str`` scalar (broadcast through ``pl.lit``).

    Columns are applied sequentially in keyword order.

    Raises
    ------
    TypeError
        If a value is of any other type.
    """
    import polars as pl

    frame = data
    for name, value in kwargs.items():
        if isinstance(value, (list, np.ndarray)):
            column = pl.Series(name, value).alias(name)
        elif isinstance(value, pl.Expr):
            column = value.alias(name)
        elif isinstance(value, (int, float, str)):
            column = pl.lit(value).alias(name)
        else:
            raise TypeError(f"Unsupported column type: {type(value).__name__}")
        frame = frame.with_columns(column)
    return frame
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def has_datetime_column(df: "pl.DataFrame") -> bool:
    """Tell whether any column of a polars DataFrame has a datetime/date dtype.

    A column counts when its schema dtype is found in the tuple
    ``(pl.Datetime, pl.Date, pl.Datetime("ms", "UTC"), pl.Datetime("us", "UTC"))``.
    """
    import polars as pl

    temporal_dtypes = (
        pl.Datetime,
        pl.Date,
        pl.Datetime("ms", "UTC"),
        pl.Datetime("us", "UTC"),
    )
    for dtype in df.schema.values():
        if dtype in temporal_dtypes:
            return True
    return False
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def datetime_columns(df: "pl.DataFrame") -> list[str]:
    """List the columns of a polars DataFrame that have a datetime/date dtype.

    Names are returned in schema order.  A column is included when its dtype
    is found in the tuple
    ``(pl.Datetime, pl.Date, pl.Datetime("ms", "UTC"), pl.Datetime("us", "UTC"))``.
    """
    import polars as pl

    temporal_dtypes = (
        pl.Datetime,
        pl.Date,
        pl.Datetime("ms", "UTC"),
        pl.Datetime("us", "UTC"),
    )
    matches = []
    for column, dtype in df.schema.items():
        if dtype in temporal_dtypes:
            matches.append(column)
    return matches
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def is_pandas_datetime_index(df) -> bool:
    """Return ``True`` when ``df`` is a pandas DataFrame indexed by a DatetimeIndex.

    If pandas cannot be imported the answer is ``False``.
    """
    try:
        import pandas as pd
    except ImportError:
        return False
    if not isinstance(df, pd.DataFrame):
        return False
    return isinstance(df.index, pd.DatetimeIndex)
|
felits/data.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Data loaders and synthetic time-series generators.
|
|
2
|
+
|
|
3
|
+
The module exposes:
|
|
4
|
+
|
|
5
|
+
- :func:`load_example_dataset` — small example datasets (currently generated on the fly)
|
|
6
|
+
for demos and unit tests.
|
|
7
|
+
- :func:`make_synthetic_ts` — reproducible synthetic univariate time
|
|
8
|
+
series with optional seasonality / noise.
|
|
9
|
+
- :func:`load_sin_data` — convenience loader for the Paraguay SIN
|
|
10
|
+
dataset, used in the original research article. The function
|
|
11
|
+
gracefully returns ``None`` when the file is not present so the test
|
|
12
|
+
suite is not bound to a particular machine.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pandas as pd
|
|
21
|
+
|
|
22
|
+
__all__ = ["load_example_dataset", "load_sin_data", "make_synthetic_ts"]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def make_synthetic_ts(
    n_samples: int = 1000,
    n_features: int = 1,
    seasonality: bool = True,
    period: int = 24,
    noise_std: float = 0.1,
    seed: int = 0,
) -> pd.DataFrame:
    """Generate a reproducible synthetic time series on an hourly index.

    The target column ``"y"`` is Gaussian noise, optionally on top of a
    seasonal signal ``10 * sin(2*pi*t / period) + 5 * cos(2*pi*t / (7 * period))``.
    The frame is indexed by an hourly ``date_range`` starting at
    ``2024-01-01``.

    Parameters
    ----------
    n_samples:
        Number of rows.
    n_features:
        Number of exogenous feature columns (named ``"x0"``, ``"x1"``…),
        each filled with standard-normal draws.
    seasonality:
        Add the sinusoidal seasonal component to the target.
    period:
        Period, in samples, of the fast seasonal component; the slow one
        uses ``period * 7``.
    noise_std:
        Standard deviation of the additive Gaussian noise.
    seed:
        Random seed for reproducibility.

    Returns
    -------
    pandas.DataFrame
        Columns ``"y"`` followed by ``"x0"`` … ``"x{n_features - 1}"``.
    """
    rng = np.random.default_rng(seed)
    steps = np.arange(n_samples)

    signal = np.zeros(n_samples)
    if seasonality:
        phase = 2 * np.pi * steps
        signal = 10 * np.sin(phase / period) + 5 * np.cos(phase / (period * 7))

    # Draw order matters for reproducibility: target noise first, then each
    # feature column in turn.
    columns = {"y": signal + noise_std * rng.standard_normal(n_samples)}
    for feature_idx in range(n_features):
        columns[f"x{feature_idx}"] = rng.standard_normal(n_samples)

    timestamps = pd.date_range("2024-01-01", periods=n_samples, freq="h")
    return pd.DataFrame(columns, index=timestamps)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def load_example_dataset(name: str = "synthetic_demand") -> pd.DataFrame:
    """Return one of the small example datasets shipped with FELITS.

    The only name handled here is the default ``"synthetic_demand"``.  It is
    produced on the fly by :func:`make_synthetic_ts` with ``24 * 30`` rows,
    so the quickstart runs without any external files.

    Raises
    ------
    ValueError
        If ``name`` is not a known dataset.
    """
    if name != "synthetic_demand":
        raise ValueError(f"Unknown example dataset {name!r}.")
    return make_synthetic_ts(n_samples=24 * 30)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def load_sin_data(path: str | None = None) -> pd.DataFrame | None:
    """Load the Paraguay SIN electricity-demand dataset from a CSV file.

    Parameters
    ----------
    path:
        Location of the CSV file.  When ``None`` the relative path
        ``dataset/processed_dataset.csv`` is used, i.e. it is resolved
        against the current working directory.

    Returns
    -------
    pandas.DataFrame or None
        The CSV contents indexed by its ``"Date"`` column parsed with
        ``pd.to_datetime``, or ``None`` when no file exists at ``path``.
        Returning ``None`` instead of raising keeps the test suite
        environment-agnostic.
    """
    csv_path = os.path.join("dataset", "processed_dataset.csv") if path is None else path
    if os.path.isfile(csv_path):
        frame = pd.read_csv(csv_path, index_col="Date")
        frame.index = pd.to_datetime(frame.index)
        return frame
    return None
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Feature engineering for time series."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .automated import extract_all_features, fats_extract, tsfresh_extract
|
|
6
|
+
from .spectral import fft_features, spectral_entropy, wavelet_features, welch_psd
|
|
7
|
+
from .temporal import cyclical_encode, lag_features, rolling_statistics, shift_features
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"cyclical_encode",
|
|
11
|
+
"extract_all_features",
|
|
12
|
+
"fats_extract",
|
|
13
|
+
"fft_features",
|
|
14
|
+
"lag_features",
|
|
15
|
+
"rolling_statistics",
|
|
16
|
+
"shift_features",
|
|
17
|
+
"spectral_entropy",
|
|
18
|
+
"tsfresh_extract",
|
|
19
|
+
"wavelet_features",
|
|
20
|
+
"welch_psd",
|
|
21
|
+
]
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Automated feature-extraction backends (FATS, tsfresh)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from scipy import stats as _stats
|
|
9
|
+
|
|
10
|
+
from .._compat import datetime_columns, is_pandas_datetime_index, to_polars
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"tsfresh_extract",
|
|
14
|
+
"fats_extract",
|
|
15
|
+
"extract_all_features",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def tsfresh_extract(
    df,
    column_id: str,
    column_sort: str,
    target: Optional[np.ndarray] = None,
    default_fc_parameters: Optional[dict | object] = None,
    n_jobs: int = 0,
    disable_progressbar: bool = True,
):
    """Run tsfresh feature extraction and return the result as polars.

    The input is normalised with :func:`to_polars` and then converted to
    pandas, because tsfresh is called on a pandas DataFrame.

    Parameters
    ----------
    df:
        Input data; anything :func:`to_polars` accepts.
    column_id:
        Forwarded to tsfresh ``extract_features`` as ``column_id``.
    column_sort:
        Forwarded to tsfresh ``extract_features`` as ``column_sort``.
    target:
        Optional target used for tsfresh ``select_features``.  Objects with a
        ``.loc`` accessor are aligned by the unique values of ``column_id``;
        anything else is truncated to the number of unique ids.
    default_fc_parameters:
        Feature-calculator settings; defaults to ``MinimalFCParameters()``.
    n_jobs:
        Forwarded to tsfresh ``extract_features``.
    disable_progressbar:
        Forwarded to tsfresh ``extract_features``.

    Raises
    ------
    ImportError
        If tsfresh is not installed.
    """
    try:
        from tsfresh import extract_features as _tsf_extract
        from tsfresh import select_features as _tsf_select
        from tsfresh.feature_extraction import MinimalFCParameters
    except ImportError as exc:
        raise ImportError("tsfresh_extract requires tsfresh.") from exc

    polars_frame = to_polars(df)
    long_frame = polars_frame.to_pandas()

    fc_settings = default_fc_parameters
    if fc_settings is None:
        fc_settings = MinimalFCParameters()

    extracted = _tsf_extract(
        long_frame,
        column_id=column_id,
        column_sort=column_sort,
        default_fc_parameters=fc_settings,
        n_jobs=n_jobs,
        disable_progressbar=disable_progressbar,
    )

    if target is not None:
        series_ids = long_frame[column_id].unique()
        if hasattr(target, "loc"):
            aligned_target = target.loc[series_ids]
        else:
            aligned_target = target[: len(series_ids)]
        extracted = _tsf_select(extracted, aligned_target)

    # NOTE(review): to_polars is called without include_index, so any id
    # index on the tsfresh result is presumably dropped — confirm intended.
    if isinstance(extracted, type(polars_frame)):
        return extracted
    return to_polars(extracted)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def fats_extract(series, n_periods: int = 1) -> dict[str, float]:
    """Compute FATS-style scalar features on a 1-D series.

    The input is flattened to a float array and non-finite values (NaN,
    ±inf) are dropped before any statistic is computed.

    Parameters
    ----------
    series:
        Array-like of numbers.
    n_periods:
        Accepted for interface compatibility; not used by the computation.

    Returns
    -------
    dict[str, float]
        Keys ``amplitude`` (max − min), ``beyond_1_std`` / ``beyond_2_std``
        (fraction of points farther than 1 / 2 standard deviations from the
        mean), ``car_sigma`` (std divided by ``|mean|``, floored at 1e-12),
        ``car_mean``, ``car_std``, ``skew``, ``kurtosis`` (scipy defaults)
        and ``median_abs_dev``.  Every value is NaN when fewer than four
        finite samples remain.
    """
    feature_names = (
        "amplitude",
        "beyond_1_std",
        "beyond_2_std",
        "car_sigma",
        "car_mean",
        "car_std",
        "skew",
        "kurtosis",
        "median_abs_dev",
    )

    arr = np.asarray(series, dtype=float).ravel()
    arr = arr[np.isfinite(arr)]
    if arr.size < 4:
        return dict.fromkeys(feature_names, float("nan"))

    # Compute the shared reductions once instead of once per feature.
    mean = arr.mean()
    std = arr.std()
    abs_dev_from_mean = np.abs(arr - mean)
    med = float(np.median(arr))
    mad = float(np.median(np.abs(arr - med)))

    return {
        "amplitude": float(arr.max() - arr.min()),
        "beyond_1_std": float(np.mean(abs_dev_from_mean > 1.0 * std)),
        "beyond_2_std": float(np.mean(abs_dev_from_mean > 2.0 * std)),
        # The floor on |mean| avoids division by zero for zero-mean series.
        "car_sigma": float(std / max(abs(mean), 1e-12)),
        "car_mean": float(mean),
        "car_std": float(std),
        "skew": float(_stats.skew(arr)),
        "kurtosis": float(_stats.kurtosis(arr)),
        "median_abs_dev": float(mad),
    }
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def extract_all_features(
    df,
    target: str,
    *,
    add_cyclic: bool = True,
    cyclic_period: int = 24,
    add_rolling: bool = True,
    rolling_windows: tuple[int, ...] = (24, 168),
    add_lags: bool = True,
    lags: tuple[int, ...] = (1, 24, 168),
    add_spectral: bool = True,
):
    """Convenience pipeline: cyclic + rolling + lag + (optional) spectral features.

    The input is first converted with :func:`to_polars`; each enabled stage
    then replaces the working frame with the output of the matching helper.

    Parameters
    ----------
    df:
        Input data; anything :func:`to_polars` accepts.
    target:
        Name of the column that rolling, lag and spectral features are
        computed from.
    add_cyclic:
        Apply ``cyclical_encode``.  A pandas frame with a DatetimeIndex is
        passed in its original form; otherwise the first datetime/date
        column, if any, is passed as ``datetime_col``.
    cyclic_period:
        Not read by this function.
    add_rolling, rolling_windows:
        Apply ``rolling_statistics`` to ``target`` with these windows.
    add_lags, lags:
        Apply ``lag_features`` to ``target`` with these lags
        (``drop_na=False``).
    add_spectral:
        Append the top-5 ``fft_features`` of ``target`` as constant
        ``fft_<key>`` columns plus a constant ``spectral_entropy`` column.

    Returns
    -------
    The frame produced by the last enabled stage; the stages after the
    initial conversion are expected to keep it a :class:`polars.DataFrame`.
    """
    import polars as pl

    from .spectral import fft_features, spectral_entropy
    from .temporal import cyclical_encode, lag_features, rolling_statistics

    features = to_polars(df)

    if add_cyclic:
        if is_pandas_datetime_index(df):
            # The timestamps live in the pandas index, so hand over the
            # original frame rather than the converted one.
            features = cyclical_encode(df)
        elif isinstance(features, pl.DataFrame):
            temporal_cols = datetime_columns(features)
            encode_kwargs = {"datetime_col": temporal_cols[0]} if temporal_cols else {}
            features = cyclical_encode(features, **encode_kwargs)

    if add_rolling:
        features = rolling_statistics(features, columns=[target], windows=rolling_windows)

    if add_lags:
        features = lag_features(features, columns=[target], lags=lags, drop_na=False)

    if add_spectral:
        signal = features[target].to_numpy()
        spectrum = fft_features(signal, top_k=5)
        entropy = spectral_entropy(signal)
        # Whole-series scalars are broadcast to every row as literals.
        constants = [pl.lit(float(val)).alias(f"fft_{key}") for key, val in spectrum.items()]
        constants.append(pl.lit(float(entropy)).alias("spectral_entropy"))
        features = features.with_columns(constants)

    return features
|