felits 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
felits/__init__.py ADDED
@@ -0,0 +1,161 @@
1
+ """FELITS: Feature Engineering and Large-scale Integration for Time Series.
2
+
3
+ Top-level package exposing the most commonly used classes and functions
4
+ through a flat, ergonomic API::
5
+
6
+ from felits import (
7
+ HampelFilter, TimeSeriesScaler, SlidingWindowSplitter, Metrics,
8
+ cyclical_encode, fft_features, tsfresh_extract,
9
+ FeatureSelector, granger_feature_selection, shap_feature_selection,
10
+ XGBoostForecaster, RandomForestForecaster, RNNBasedModel,
11
+ OptunaOptimizer, deep_shap_selector,
12
+ )
13
+
14
+ Submodules are still importable for advanced use::
15
+
16
+ from felits.preprocessing import iqr_outlier_detection
17
+ from felits.feature_selection import mrmr_selection
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from . import data, models, optimization
23
+ from .feature_extraction import (
24
+ cyclical_encode,
25
+ extract_all_features,
26
+ fats_extract,
27
+ fft_features,
28
+ lag_features,
29
+ rolling_statistics,
30
+ shift_features,
31
+ spectral_entropy,
32
+ tsfresh_extract,
33
+ wavelet_features,
34
+ )
35
+ from .feature_selection import (
36
+ FeatureSelector,
37
+ adaptive_lasso_selection,
38
+ elastic_net_selection,
39
+ granger_feature_selection,
40
+ lasso_selection,
41
+ lime_explain_instance,
42
+ mrmr_selection,
43
+ mutual_information_ksg,
44
+ permutation_importance_selection,
45
+ rf_importance_selection,
46
+ select_features,
47
+ shap_feature_selection,
48
+ shap_interaction_selection,
49
+ xgboost_importance_selection,
50
+ )
51
+ from .optimization import OptunaOptimizer
52
+ from .preprocessing import (
53
+ DecompositionResult,
54
+ HampelFilter,
55
+ Metrics,
56
+ SlidingWindowSplitter,
57
+ TimeSeriesScaler,
58
+ forward_fill,
59
+ hampel_filter,
60
+ iqr_outlier_detection,
61
+ linear_interpolate,
62
+ mae,
63
+ mape,
64
+ max_error,
65
+ mse,
66
+ r2,
67
+ rmse,
68
+ seasonal_adjust,
69
+ smape,
70
+ stl_decompose,
71
+ three_sigma_filter,
72
+ time_aware_interpolate,
73
+ )
74
+ from .xai import deep_shap_selector, explain_forecast
75
+
76
+ __version__ = "0.1.0"
77
+ __author__ = "Félix Morales Mareco"
78
+ __license__ = "MIT"
79
+
80
+ __all__ = [
81
+ "__version__",
82
+ "__author__",
83
+ "__license__",
84
+ # submodules
85
+ "data",
86
+ "models",
87
+ "optimization",
88
+ # preprocessing
89
+ "DecompositionResult",
90
+ "HampelFilter",
91
+ "Metrics",
92
+ "SlidingWindowSplitter",
93
+ "TimeSeriesScaler",
94
+ "forward_fill",
95
+ "hampel_filter",
96
+ "iqr_outlier_detection",
97
+ "linear_interpolate",
98
+ "mae",
99
+ "mape",
100
+ "max_error",
101
+ "mse",
102
+ "r2",
103
+ "rmse",
104
+ "seasonal_adjust",
105
+ "smape",
106
+ "stl_decompose",
107
+ "three_sigma_filter",
108
+ "time_aware_interpolate",
109
+ # feature_extraction
110
+ "cyclical_encode",
111
+ "extract_all_features",
112
+ "fats_extract",
113
+ "fft_features",
114
+ "lag_features",
115
+ "rolling_statistics",
116
+ "shift_features",
117
+ "spectral_entropy",
118
+ "tsfresh_extract",
119
+ "wavelet_features",
120
+ # feature_selection
121
+ "FeatureSelector",
122
+ "adaptive_lasso_selection",
123
+ "elastic_net_selection",
124
+ "granger_feature_selection",
125
+ "lasso_selection",
126
+ "lime_explain_instance",
127
+ "mrmr_selection",
128
+ "mutual_information_ksg",
129
+ "permutation_importance_selection",
130
+ "rf_importance_selection",
131
+ "select_features",
132
+ "shap_feature_selection",
133
+ "shap_interaction_selection",
134
+ "xgboost_importance_selection",
135
+ # optimization
136
+ "OptunaOptimizer",
137
+ # xai
138
+ "deep_shap_selector",
139
+ "explain_forecast",
140
+ ]
141
+
142
+ # Re-export the DL models and sklearn forecasters at the top level
143
+ from .models import (
144
+ BahdanauAttention,
145
+ LinearForecaster,
146
+ RandomForestForecaster,
147
+ RNNAttentionModel,
148
+ RNNBasedModel,
149
+ XGBoostForecaster,
150
+ is_dl_available,
151
+ )
152
+
153
+ __all__ += [
154
+ "BahdanauAttention",
155
+ "LinearForecaster",
156
+ "RNNAttentionModel",
157
+ "RNNBasedModel",
158
+ "RandomForestForecaster",
159
+ "XGBoostForecaster",
160
+ "is_dl_available",
161
+ ]
felits/_compat.py ADDED
@@ -0,0 +1,173 @@
1
+ """Compatibility layer for dual pandas / polars support.
2
+
3
+ FELITS accepts both ``pd.DataFrame`` and ``pl.DataFrame`` (or a mix of both)
4
+ at all public APIs. Internally the library works with :class:`polars.DataFrame`
5
+ for performance. Conversion to pandas happens only at library boundaries
6
+ (e.g. ``statsmodels``, ``tsfresh``) that require it.
7
+
8
+ Usage
9
+ -----
10
+ >>> from felits._compat import to_polars, to_pandas, DataFrameLike
11
+ >>>
12
+ >>> def my_public_api(df: DataFrameLike, target: str):
13
+ ... pdf = to_polars(df) # guarantee polars
14
+ ... result = pdf.select(pl.col(target))
15
+ ... return to_pandas(result) # back to pandas for the caller
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from typing import TYPE_CHECKING, Union
21
+
22
+ import numpy as np
23
+
24
+ if TYPE_CHECKING:
25
+ import pandas as pd
26
+ import polars as pl
27
+
28
+ DataFrameLike = Union["pd.DataFrame", "pl.DataFrame"]
29
+ SeriesLike = Union["pd.Series", "pl.Series"]
30
+
31
+
32
def to_polars(data: object, *columns: str, include_index: bool = False) -> "pl.DataFrame":
    """Convert ``data`` to a :class:`polars.DataFrame`.

    Supported inputs are polars DataFrames (returned unchanged), polars
    Series, pandas DataFrames / Series, NumPy arrays and dictionaries.
    Strings are rejected explicitly with a ``TypeError``.

    Parameters
    ----------
    data:
        The object to convert.
    *columns:
        Column names used when ``data`` is a NumPy array. When omitted the
        columns are named ``"col_0"``, ``"col_1"``, ...
    include_index:
        When ``True`` and ``data`` is a pandas DataFrame with a
        DatetimeIndex, the index is emitted as the first column. The column
        keeps the index name, or is called ``"_time"`` when the index has no
        (truthy) name.

    Raises
    ------
    TypeError
        If ``data`` is a string or any other unsupported type.
    """
    import polars as pl

    if isinstance(data, pl.DataFrame):
        return data
    if isinstance(data, pl.Series):
        return data.to_frame()
    if isinstance(data, str):
        raise TypeError(f"Cannot convert str to polars DataFrame: {data!r}")

    import pandas as pd

    if isinstance(data, pd.DataFrame):
        if include_index and isinstance(data.index, pd.DatetimeIndex):
            # Name the index *before* resetting it. Relying on the default
            # "index" label breaks when the frame already has an "index"
            # column (pandas then emits "level_0" instead).
            idx_name = data.index.name or "_time"
            return pl.from_pandas(data.rename_axis(idx_name).reset_index())
        return pl.from_pandas(data)
    if isinstance(data, pd.Series):
        return pl.from_pandas(data.to_frame())
    if isinstance(data, np.ndarray):
        if data.ndim == 1:
            # Treat a flat vector as a single column.
            data = data.reshape(-1, 1)
        if columns:
            return pl.DataFrame({name: data[:, i] for i, name in enumerate(columns)})
        return pl.DataFrame({f"col_{i}": data[:, i] for i in range(data.shape[1])})
    if isinstance(data, dict):
        return pl.DataFrame(data)
    raise TypeError(f"Cannot convert {type(data).__name__} to polars DataFrame.")
71
+
72
+
73
def to_pandas(data: object) -> "pd.DataFrame":
    """Convert ``data`` to a :class:`pandas.DataFrame`.

    Accepts:
    - :class:`pandas.DataFrame` -> returned unchanged
    - :class:`pandas.Series` -> ``Series.to_frame()``
    - :class:`numpy.ndarray` -> columns ``"col_<i>"`` for multi-dimensional
      input, a single ``"value"`` column otherwise
    - :class:`polars.DataFrame` -> ``DataFrame.to_pandas()``
    - :class:`polars.Series` -> ``Series.to_pandas().to_frame()``

    Raises
    ------
    TypeError
        If ``data`` is none of the supported types.
    """
    import pandas as pd

    if isinstance(data, pd.DataFrame):
        return data
    if isinstance(data, pd.Series):
        return data.to_frame()
    # NumPy input never needs polars, so handle it before the polars import.
    if isinstance(data, np.ndarray):
        cols = [f"col_{i}" for i in range(data.shape[1])] if data.ndim > 1 else ["value"]
        return pd.DataFrame(data, columns=cols)

    try:
        import polars as pl
    except ImportError:
        # Without polars installed the input cannot be a polars object, so
        # fall through to the generic "unsupported type" error.
        pl = None

    if pl is not None:
        if isinstance(data, pl.DataFrame):
            return data.to_pandas()
        if isinstance(data, pl.Series):
            return data.to_pandas().to_frame()
    raise TypeError(f"Cannot convert {type(data).__name__} to pandas DataFrame.")
98
+
99
+
100
def is_polars(data: object) -> bool:
    """Return ``True`` when ``data`` is a polars DataFrame or Series.

    Returns ``False`` when polars cannot be imported.
    """
    try:
        import polars as pl
    except ImportError:
        return False
    polars_types = (pl.DataFrame, pl.Series)
    return isinstance(data, polars_types)
108
+
109
+
110
def is_pandas(data: object) -> bool:
    """Return ``True`` when ``data`` is a pandas DataFrame or Series.

    Returns ``False`` when pandas cannot be imported.
    """
    try:
        import pandas as pd
    except ImportError:
        return False
    pandas_types = (pd.DataFrame, pd.Series)
    return isinstance(data, pandas_types)
118
+
119
+
120
def to_numpy(data: object) -> np.ndarray:
    """Return ``data`` as a NumPy array, whatever the backend.

    NumPy arrays are passed through untouched. Objects exposing a
    ``to_numpy`` method are converted through it. Anything else is coerced
    with ``np.asarray(..., dtype=float)``.
    """
    if isinstance(data, np.ndarray):
        return data
    if not hasattr(data, "to_numpy"):
        return np.asarray(data, dtype=float)
    return data.to_numpy()
127
+
128
+
129
def with_columns(data: "pl.DataFrame", **kwargs: object) -> "pl.DataFrame":
    """Add or replace columns in a polars DataFrame.

    Equivalent to ``pandas``' ``df[col] = values`` pattern. Columns are
    applied one at a time in keyword order, so an expression may refer to a
    column added by an earlier keyword.

    Parameters
    ----------
    data:
        The frame to extend.
    **kwargs:
        Mapping of column name to value. Supported values are lists and
        NumPy arrays (wrapped in a ``pl.Series``), ``pl.Series`` and
        ``pl.Expr`` objects (re-aliased to the keyword name), and
        ``int`` / ``float`` / ``str`` scalars (broadcast with ``pl.lit``).

    Raises
    ------
    TypeError
        If a value is of any other type.
    """
    import polars as pl

    for name, value in kwargs.items():
        if isinstance(value, (pl.Series, pl.Expr)):
            # Both support ``alias``; the keyword name always wins.
            column = value.alias(name)
        elif isinstance(value, (list, np.ndarray)):
            column = pl.Series(name, value)
        elif isinstance(value, (int, float, str)):
            column = pl.lit(value).alias(name)
        else:
            raise TypeError(f"Unsupported column type: {type(value).__name__}")
        data = data.with_columns(column)
    return data
146
+
147
+
148
def has_datetime_column(df: "pl.DataFrame") -> bool:
    """Return ``True`` if any column of ``df`` has a datetime or date dtype."""
    import polars as pl

    # Dtypes that count as "datetime-like" for this check.
    temporal = (pl.Datetime, pl.Date, pl.Datetime("ms", "UTC"), pl.Datetime("us", "UTC"))
    for dtype in df.schema.values():
        if dtype in temporal:
            return True
    return False
156
+
157
+
158
def datetime_columns(df: "pl.DataFrame") -> list[str]:
    """List the columns of ``df`` whose dtype is datetime or date.

    Names are returned in schema order.
    """
    import polars as pl

    temporal = (pl.Datetime, pl.Date, pl.Datetime("ms", "UTC"), pl.Datetime("us", "UTC"))
    names = []
    for column, dtype in df.schema.items():
        if dtype in temporal:
            names.append(column)
    return names
164
+
165
+
166
def is_pandas_datetime_index(df) -> bool:
    """Return ``True`` if ``df`` is a pandas DataFrame indexed by a DatetimeIndex.

    Returns ``False`` when pandas cannot be imported.
    """
    try:
        import pandas as pd
    except ImportError:
        return False
    if not isinstance(df, pd.DataFrame):
        return False
    return isinstance(df.index, pd.DatetimeIndex)
felits/data.py ADDED
@@ -0,0 +1,89 @@
1
+ """Data loaders and synthetic time-series generators.
2
+
3
+ The module exposes:
4
+
5
+ - :func:`load_example_dataset` — small CSV datasets bundled with FELITS
6
+ for demos and unit tests.
7
+ - :func:`make_synthetic_ts` — reproducible synthetic univariate time
8
+ series with optional seasonality / noise.
9
+ - :func:`load_sin_data` — convenience loader for the Paraguay SIN
10
+ dataset, used in the original research article. The function
11
+ gracefully returns ``None`` when the file is not present so the test
12
+ suite is not bound to a particular machine.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import os
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+
22
+ __all__ = ["load_example_dataset", "load_sin_data", "make_synthetic_ts"]
23
+
24
+
25
def make_synthetic_ts(
    n_samples: int = 1000,
    n_features: int = 1,
    seasonality: bool = True,
    period: int = 24,
    noise_std: float = 0.1,
    seed: int = 0,
) -> pd.DataFrame:
    """Build a reproducible synthetic hourly time series.

    The target column ``"y"`` is Gaussian noise, optionally layered on top
    of two sinusoids (one of length ``period`` and one of length
    ``7 * period``). The frame is indexed by hourly timestamps starting at
    2024-01-01.

    Parameters
    ----------
    n_samples:
        Number of rows.
    n_features:
        Number of standard-normal feature columns (``"x0"``, ``"x1"``, ...).
    seasonality:
        Whether to add the sinusoidal components to the target.
    period:
        Period, in samples, of the faster sinusoid.
    noise_std:
        Standard deviation of the additive Gaussian noise.
    seed:
        Seed for the random generator.
    """
    rng = np.random.default_rng(seed)
    steps = np.arange(n_samples)

    if seasonality:
        fast_wave = 10 * np.sin(2 * np.pi * steps / period)
        slow_wave = 5 * np.cos(2 * np.pi * steps / (period * 7))
        signal = fast_wave + slow_wave
    else:
        signal = np.zeros(n_samples)

    # Noise is drawn before the features so the random stream stays in order.
    signal = signal + noise_std * rng.standard_normal(n_samples)

    timestamps = pd.date_range("2024-01-01", periods=n_samples, freq="h")
    frame = pd.DataFrame({"y": signal}, index=timestamps)
    for feature_idx in range(n_features):
        frame[f"x{feature_idx}"] = rng.standard_normal(n_samples)
    return frame
60
+
61
+
62
def load_example_dataset(name: str = "synthetic_demand") -> pd.DataFrame:
    """Return one of the example datasets shipped with FELITS.

    Only ``"synthetic_demand"`` is available. It is produced on demand by
    :func:`make_synthetic_ts` with ``24 * 30`` samples, so no external file
    is needed.

    Raises
    ------
    ValueError
        If ``name`` is not a known dataset.
    """
    if name != "synthetic_demand":
        raise ValueError(f"Unknown example dataset {name!r}.")
    return make_synthetic_ts(n_samples=24 * 30)
72
+
73
+
74
def load_sin_data(path: str | None = None) -> pd.DataFrame | None:
    """Load the Paraguay SIN electricity-demand dataset from CSV.

    Parameters
    ----------
    path:
        Location of the CSV file. When ``None``, the file is looked up at
        ``dataset/processed_dataset.csv`` relative to the current working
        directory.

    Returns
    -------
    The data indexed by its ``Date`` column parsed as datetimes, or ``None``
    when the file does not exist.
    """
    csv_path = os.path.join("dataset", "processed_dataset.csv") if path is None else path
    if not os.path.isfile(csv_path):
        # A missing dataset is not an error: report it as None.
        return None
    frame = pd.read_csv(csv_path, index_col="Date")
    frame.index = pd.to_datetime(frame.index)
    return frame
@@ -0,0 +1,21 @@
1
+ """Feature engineering for time series."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .automated import extract_all_features, fats_extract, tsfresh_extract
6
+ from .spectral import fft_features, spectral_entropy, wavelet_features, welch_psd
7
+ from .temporal import cyclical_encode, lag_features, rolling_statistics, shift_features
8
+
9
+ __all__ = [
10
+ "cyclical_encode",
11
+ "extract_all_features",
12
+ "fats_extract",
13
+ "fft_features",
14
+ "lag_features",
15
+ "rolling_statistics",
16
+ "shift_features",
17
+ "spectral_entropy",
18
+ "tsfresh_extract",
19
+ "wavelet_features",
20
+ "welch_psd",
21
+ ]
@@ -0,0 +1,142 @@
1
+ """Automated feature-extraction backends (FATS, tsfresh)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import numpy as np
8
+ from scipy import stats as _stats
9
+
10
+ from .._compat import datetime_columns, is_pandas_datetime_index, to_polars
11
+
12
+ __all__ = [
13
+ "tsfresh_extract",
14
+ "fats_extract",
15
+ "extract_all_features",
16
+ ]
17
+
18
+
19
def tsfresh_extract(
    df,
    column_id: str,
    column_sort: str,
    target: Optional[np.ndarray] = None,
    default_fc_parameters: Optional[dict | object] = None,
    n_jobs: int = 0,
    disable_progressbar: bool = True,
):
    """Extract tsfresh features and optionally filter them against a target.

    Parameters
    ----------
    df:
        Input data; it is normalised through ``to_polars`` and then handed
        to tsfresh as a pandas DataFrame.
    column_id, column_sort:
        Forwarded to ``tsfresh.extract_features``.
    target:
        Optional target used for ``tsfresh.select_features``. Objects with a
        ``.loc`` accessor are indexed by the unique ids of ``column_id``;
        anything else is truncated to the number of unique ids.
    default_fc_parameters:
        tsfresh feature-calculator settings; ``MinimalFCParameters()`` is
        used when ``None``.
    n_jobs, disable_progressbar:
        Forwarded to ``tsfresh.extract_features``.

    Raises
    ------
    ImportError
        If tsfresh is not installed.
    """
    try:
        from tsfresh import extract_features as _tsf_extract
        from tsfresh import select_features as _tsf_select
        from tsfresh.feature_extraction import MinimalFCParameters
    except ImportError as exc:
        raise ImportError("tsfresh_extract requires tsfresh.") from exc

    polars_frame = to_polars(df)
    long_frame = polars_frame.to_pandas()

    fc_settings = MinimalFCParameters() if default_fc_parameters is None else default_fc_parameters
    extracted = _tsf_extract(
        long_frame,
        column_id=column_id,
        column_sort=column_sort,
        default_fc_parameters=fc_settings,
        n_jobs=n_jobs,
        disable_progressbar=disable_progressbar,
    )

    if target is not None:
        ids = long_frame[column_id].unique()
        if hasattr(target, "loc"):
            aligned_target = target.loc[ids]
        else:
            # NOTE(review): positional truncation presumably assumes the
            # target is ordered like the extracted rows - confirm.
            aligned_target = target[: len(ids)]
        extracted = _tsf_select(extracted, aligned_target)

    # NOTE(review): the conversion below appears to drop the id index of the
    # tsfresh result - confirm that callers do not need it.
    if isinstance(extracted, type(polars_frame)):
        return extracted
    return to_polars(extracted)
56
+
57
+
58
def fats_extract(series, n_periods: int = 1) -> dict[str, float]:
    """Compute FATS-style scalar features on a 1-D series.

    The input is flattened to a float array and non-finite values are
    dropped before any statistic is computed.

    Parameters
    ----------
    series:
        Any array-like convertible to a float NumPy array.
    n_periods:
        Accepted for interface compatibility; the current implementation
        does not use it.

    Returns
    -------
    dict
        Keys ``amplitude`` (max minus min), ``beyond_1_std`` and
        ``beyond_2_std`` (fraction of points further than 1 or 2 standard
        deviations from the mean), ``car_sigma`` (std divided by the
        absolute mean, floored at ``1e-12``), ``car_mean``, ``car_std``,
        ``skew`` and ``kurtosis`` (``scipy.stats`` defaults) and
        ``median_abs_dev``. Every value is ``nan`` when fewer than four
        finite samples remain.
    """
    feature_names = (
        "amplitude",
        "beyond_1_std",
        "beyond_2_std",
        "car_sigma",
        "car_mean",
        "car_std",
        "skew",
        "kurtosis",
        "median_abs_dev",
    )
    values = np.asarray(series, dtype=float).ravel()
    values = values[np.isfinite(values)]
    if values.size < 4:
        return dict.fromkeys(feature_names, float("nan"))

    # Mean and standard deviation feed several features; compute them once.
    mean = float(values.mean())
    std = float(values.std())
    abs_dev_from_mean = np.abs(values - mean)
    median = float(np.median(values))
    mad = float(np.median(np.abs(values - median)))

    return {
        "amplitude": float(values.max() - values.min()),
        "beyond_1_std": float(np.mean(abs_dev_from_mean > 1.0 * std)),
        "beyond_2_std": float(np.mean(abs_dev_from_mean > 2.0 * std)),
        "car_sigma": float(std / max(abs(mean), 1e-12)),
        "car_mean": mean,
        "car_std": std,
        "skew": float(_stats.skew(values)),
        "kurtosis": float(_stats.kurtosis(values)),
        "median_abs_dev": mad,
    }
93
+
94
+
95
def extract_all_features(
    df,
    target: str,
    *,
    add_cyclic: bool = True,
    cyclic_period: int = 24,
    add_rolling: bool = True,
    rolling_windows: tuple[int, ...] = (24, 168),
    add_lags: bool = True,
    lags: tuple[int, ...] = (1, 24, 168),
    add_spectral: bool = True,
):
    """Run the cyclic, rolling, lag and spectral feature steps in sequence.

    Each step is applied to the output of the previous one and can be
    switched off with its ``add_*`` flag.

    Parameters
    ----------
    df:
        Input data, converted with ``to_polars`` before processing.
    target:
        Column the rolling, lag and spectral features are computed on.
    add_cyclic:
        Apply ``cyclical_encode``. A pandas input with a DatetimeIndex is
        encoded directly; otherwise the first datetime/date column of the
        polars frame is used when one exists.
    cyclic_period:
        Not used by the current implementation.
    add_rolling, rolling_windows:
        Apply ``rolling_statistics`` on ``target`` with these windows.
    add_lags, lags:
        Apply ``lag_features`` on ``target`` with ``drop_na=False``.
    add_spectral:
        Append each ``fft_features`` entry (``top_k=5``) as a constant
        ``fft_<key>`` column, plus a constant ``spectral_entropy`` column.
    """
    import polars as pl

    from .spectral import fft_features, spectral_entropy
    from .temporal import cyclical_encode, lag_features, rolling_statistics

    out = to_polars(df)

    if add_cyclic:
        if is_pandas_datetime_index(df):
            # The original pandas frame is passed so its index can be used.
            out = cyclical_encode(df)
        elif isinstance(out, pl.DataFrame):
            temporal_cols = datetime_columns(out)
            if temporal_cols:
                out = cyclical_encode(out, datetime_col=temporal_cols[0])
            else:
                out = cyclical_encode(out)

    if add_rolling:
        out = rolling_statistics(out, columns=[target], windows=rolling_windows)

    if add_lags:
        out = lag_features(out, columns=[target], lags=lags, drop_na=False)

    if not add_spectral:
        return out

    signal = out[target].to_numpy()
    spectrum = fft_features(signal, top_k=5)
    entropy = spectral_entropy(signal)
    # Spectral summaries are scalars, so each one becomes a constant column.
    for key, value in spectrum.items():
        out = out.with_columns(pl.lit(float(value)).alias(f"fft_{key}"))
    return out.with_columns(pl.lit(float(entropy)).alias("spectral_entropy"))