cryptodatapy 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cryptodatapy/conf/fx_tickers.csv +31 -0
- cryptodatapy/transform/clean.py +171 -172
- cryptodatapy/transform/clean_perp_futures_ohlcv.ipynb +1025 -0
- cryptodatapy/transform/filter.py +83 -142
- cryptodatapy/transform/impute.py +36 -83
- cryptodatapy/transform/od.py +221 -450
- {cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/METADATA +4 -1
- {cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/RECORD +10 -8
- {cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/LICENSE +0 -0
- {cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/WHEEL +0 -0
cryptodatapy/transform/od.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from typing import Dict, Optional
|
1
|
+
from typing import Dict, Optional, Union
|
2
2
|
|
3
3
|
import numpy as np
|
4
4
|
import pandas as pd
|
@@ -9,10 +9,17 @@ from statsmodels.tsa.seasonal import STL, seasonal_decompose
|
|
9
9
|
class OutlierDetection:
|
10
10
|
"""
|
11
11
|
Detects outliers.
|
12
|
-
|
13
12
|
"""
|
14
|
-
|
15
|
-
|
13
|
+
def __init__(self,
|
14
|
+
raw_df: pd.DataFrame,
|
15
|
+
excl_cols: Optional[Union[str, list]] = None,
|
16
|
+
log: bool = False,
|
17
|
+
window_size: int = 7,
|
18
|
+
model_type: str = 'estimation',
|
19
|
+
thresh_val: int = 5,
|
20
|
+
plot: bool = False,
|
21
|
+
plot_series: tuple = ('BTC', 'close')
|
22
|
+
):
|
16
23
|
"""
|
17
24
|
Constructor
|
18
25
|
|
@@ -20,26 +27,10 @@ class OutlierDetection:
|
|
20
27
|
----------
|
21
28
|
raw_df: pd.DataFrame - MultiIndex
|
22
29
|
DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and raw data/values (cols).
|
23
|
-
|
24
|
-
|
25
|
-
self.raw_df = raw_df
|
26
|
-
|
27
|
-
def atr(
|
28
|
-
self,
|
29
|
-
log: bool = False,
|
30
|
-
window_size: int = 7,
|
31
|
-
model_type: str = "estimation",
|
32
|
-
thresh_val: int = 2,
|
33
|
-
plot: bool = False,
|
34
|
-
plot_series: tuple = ("BTC", "close"),
|
35
|
-
) -> Dict[str, pd.DataFrame]:
|
36
|
-
"""
|
37
|
-
Detects outliers using OHLC values and H-L range.
|
38
|
-
|
39
|
-
Parameters
|
40
|
-
----------
|
30
|
+
excl_cols: str or list, optional, default None
|
31
|
+
Columns to exclude from outlier detection.
|
41
32
|
log: bool, default False
|
42
|
-
|
33
|
+
Log transform the series.
|
43
34
|
window_size: int, default 7
|
44
35
|
Number of observations in the rolling window.
|
45
36
|
model_type: str, {'estimation', 'prediction'}, default 'estimation'
|
@@ -47,292 +38,224 @@ class OutlierDetection:
|
|
47
38
|
e.g. expected x_t of series x at time t uses values from [x_t-s, x_t+s].
|
48
39
|
Prediction models use only past and current values to estimate the expected value of a series,
|
49
40
|
e.g. expected x_t of series x at time t uses values from [x_t-s, x_t].
|
50
|
-
thresh_val: int, default
|
41
|
+
thresh_val: int, default 5
|
51
42
|
Value for upper and lower thresholds used in outlier detection.
|
52
43
|
plot: bool, default False
|
53
|
-
Plots series with outliers highlighted
|
44
|
+
Plots series with outliers highlighted with red dots.
|
54
45
|
plot_series: tuple, default ('BTC', 'close')
|
55
|
-
|
46
|
+
Plots the time series of a specific (ticker, field/column) tuple.
|
47
|
+
"""
|
48
|
+
self.raw_df = raw_df
|
49
|
+
self.excl_cols = excl_cols
|
50
|
+
self.log = log
|
51
|
+
self.window_size = window_size
|
52
|
+
self.model_type = model_type
|
53
|
+
self.thresh_val = thresh_val
|
54
|
+
self.plot = plot
|
55
|
+
self.plot_series = plot_series
|
56
|
+
self.df = raw_df.copy() if excl_cols is None else raw_df.drop(columns=excl_cols).copy()
|
57
|
+
self.yhat = None
|
58
|
+
self.outliers = None
|
59
|
+
self.filtered_df = None
|
60
|
+
self.log_transform()
|
61
|
+
|
62
|
+
def log_transform(self) -> None:
|
63
|
+
"""
|
64
|
+
Log transform the dataframe.
|
65
|
+
"""
|
66
|
+
if self.log:
|
67
|
+
# remove negative values
|
68
|
+
self.df[self.df <= 0] = np.nan
|
69
|
+
# log and replace inf
|
70
|
+
self.df = np.log(self.df).replace([np.inf, -np.inf], np.nan)
|
71
|
+
|
72
|
+
def atr(self) -> pd.DataFrame:
|
73
|
+
"""
|
74
|
+
Detects outliers using OHLC values and H-L range.
|
56
75
|
|
57
76
|
Returns
|
58
77
|
-------
|
59
|
-
|
60
|
-
|
61
|
-
with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
|
62
|
-
values.
|
63
|
-
|
78
|
+
filtered_df: pd.DataFrame - MultiIndex
|
79
|
+
Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
|
64
80
|
"""
|
65
|
-
# sort index and create df copy
|
66
|
-
df = self.raw_df.sort_index(level=1)
|
67
|
-
|
68
|
-
# log
|
69
|
-
if log:
|
70
|
-
df0 = np.log(df).copy()
|
71
|
-
else:
|
72
|
-
df0 = df.copy()
|
73
|
-
|
74
81
|
# ohlc
|
75
|
-
if not all(col in df.columns for col in ["open", "high", "low", "close"]):
|
82
|
+
if not all(col in self.df.columns for col in ["open", "high", "low", "close"]):
|
76
83
|
raise Exception("Dataframe must have OHLC prices to compute ATR.")
|
77
84
|
|
85
|
+
# df copy
|
86
|
+
df0 = self.df.copy()
|
87
|
+
|
78
88
|
# compute true range
|
79
89
|
df0["hl"], df0["hc"], df0["lc"] = (
|
80
|
-
(df0.high -
|
81
|
-
(df0.high -
|
82
|
-
(df0.low -
|
90
|
+
(df0.high - df0.low).abs(),
|
91
|
+
(df0.high - df0.close.groupby(level=1).shift(1)).abs(),
|
92
|
+
(df0.low - df0.close.groupby(level=1).shift(1)).abs(),
|
83
93
|
)
|
84
94
|
df0["tr"] = df0.loc[:, "hl":"lc"].max(axis=1)
|
85
95
|
|
86
96
|
# compute ATR for estimation and prediction models
|
87
|
-
if model_type == "estimation":
|
97
|
+
if self.model_type == "estimation":
|
88
98
|
df0["atr"] = (
|
89
99
|
df0.tr.groupby(level=1)
|
90
|
-
.shift(-1 * int((window_size + 1) / 2))
|
100
|
+
.shift(-1 * int((self.window_size + 1) / 2))
|
91
101
|
.sort_index(level=1)
|
92
|
-
.rolling(window_size, min_periods=1)
|
102
|
+
.rolling(self.window_size, min_periods=1)
|
93
103
|
.mean()
|
94
104
|
.sort_index()
|
95
105
|
)
|
96
106
|
med = (
|
97
|
-
df0.
|
98
|
-
.
|
99
|
-
.shift(-1 * int((window_size + 1) / 2))
|
107
|
+
df0.groupby(level=1)
|
108
|
+
.shift(-1 * int((self.window_size + 1) / 2))
|
100
109
|
.sort_index(level=1)
|
101
|
-
.rolling(window_size, min_periods=1)
|
110
|
+
.rolling(self.window_size, min_periods=1)
|
102
111
|
.median()
|
103
112
|
.sort_index()
|
104
113
|
)
|
105
114
|
else:
|
106
115
|
df0["atr"] = (
|
107
|
-
df0.tr.groupby(level=1).ewm(span=window_size).mean().droplevel(0)
|
116
|
+
df0.tr.groupby(level=1).ewm(span=self.window_size).mean().droplevel(0)
|
108
117
|
)
|
109
118
|
med = (
|
110
|
-
df0.
|
111
|
-
.
|
112
|
-
.rolling(window_size)
|
119
|
+
df0.groupby(level=1)
|
120
|
+
.rolling(self.window_size)
|
113
121
|
.median()
|
114
122
|
.droplevel(0)
|
115
123
|
)
|
116
124
|
|
117
|
-
# compute dev
|
118
|
-
dev = df0
|
125
|
+
# compute dev and score for outliers
|
126
|
+
dev = df0 - med
|
119
127
|
score = dev.divide(df0.atr, axis=0)
|
120
|
-
upper, lower = thresh_val, thresh_val * -1
|
121
128
|
|
122
129
|
# outliers
|
123
|
-
|
124
|
-
|
130
|
+
self.outliers = self.df[score.abs() > self.thresh_val].sort_index()
|
131
|
+
self.filtered_df = self.df[score.abs() < self.thresh_val].sort_index()
|
125
132
|
|
126
|
-
# log
|
127
|
-
if log:
|
128
|
-
|
133
|
+
# log to original scale
|
134
|
+
if self.log:
|
135
|
+
self.yhat = np.exp(med).sort_index()
|
129
136
|
|
130
137
|
# plot
|
131
|
-
if plot:
|
132
|
-
if not isinstance(plot_series, tuple):
|
138
|
+
if self.plot:
|
139
|
+
if not isinstance(self.plot_series, tuple):
|
133
140
|
raise TypeError(
|
134
141
|
"Plot_series must be a tuple specifying the ticker and column/field to "
|
135
142
|
"plot (ticker, column)."
|
136
143
|
)
|
137
144
|
else:
|
138
|
-
self.plot_outliers(
|
145
|
+
self.plot_outliers()
|
139
146
|
|
140
|
-
|
141
|
-
"yhat": med.sort_index(),
|
142
|
-
"outliers": out_df.sort_index(),
|
143
|
-
"filt_vals": filt_df.sort_index(),
|
144
|
-
}
|
147
|
+
return self.filtered_df
|
145
148
|
|
146
|
-
|
147
|
-
|
148
|
-
def iqr(
|
149
|
-
self,
|
150
|
-
log: bool = True,
|
151
|
-
window_size: int = 7,
|
152
|
-
model_type: str = "estimation",
|
153
|
-
thresh_val: int = 1.5,
|
154
|
-
plot: bool = False,
|
155
|
-
plot_series: tuple = ("BTC", "close"),
|
156
|
-
) -> Dict[str, pd.DataFrame]:
|
149
|
+
def iqr(self) -> pd.DataFrame:
|
157
150
|
"""
|
158
151
|
Detects outliers using interquartile range (IQR) method.
|
159
152
|
|
160
|
-
Parameters
|
161
|
-
----------
|
162
|
-
log: bool, default True
|
163
|
-
Converts series into log of series.
|
164
|
-
window_size: int, default 7
|
165
|
-
Number of observations in the rolling window.
|
166
|
-
model_type: str, {'estimation', 'prediction'}, default 'estimation'
|
167
|
-
Estimation models use past, current and future values to estimate the expected value of a series,
|
168
|
-
e.g. expected x_t of series x at time t uses values from [x_t-s, x_t+s].
|
169
|
-
Prediction models use only past and current values to estimate the expected value of a series,
|
170
|
-
e.g. expected x_t of series x at time t uses values from [x_t-s, x_t].
|
171
|
-
thresh_val: int, default 1.5
|
172
|
-
Value for upper and lower thresholds used in outlier detection.
|
173
|
-
Computed as: IQR x thresh_val +/- 75th/25th percentiles (upper/lower bands), respectively.
|
174
|
-
plot: bool, default False
|
175
|
-
Plots series with outliers highlighted (red dots).
|
176
|
-
plot_series: tuple, default ('BTC', 'close')
|
177
|
-
The specific time series to plot given by (ticker, field/column) tuple.
|
178
|
-
|
179
153
|
Returns
|
180
154
|
-------
|
181
|
-
|
182
|
-
|
183
|
-
with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
|
184
|
-
values.
|
185
|
-
|
155
|
+
filtered_df: pd.DataFrame - MultiIndex
|
156
|
+
Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
|
186
157
|
"""
|
187
|
-
# sort index
|
188
|
-
|
189
|
-
|
190
|
-
# log
|
191
|
-
if log:
|
192
|
-
df0 = np.log(df).copy()
|
193
|
-
else:
|
194
|
-
df0 = df.copy()
|
158
|
+
# sort index
|
159
|
+
df0 = self.df.sort_index(level=1)
|
195
160
|
|
196
161
|
# compute 75th, 50th and 25th percentiles for estimation and prediction models
|
197
|
-
if model_type == "estimation":
|
162
|
+
if self.model_type == "estimation":
|
198
163
|
perc_75th = (
|
199
164
|
df0.groupby(level=1)
|
200
|
-
.shift(-1 * int((window_size + 1) / 2))
|
165
|
+
.shift(-1 * int((self.window_size + 1) / 2))
|
201
166
|
.sort_index(level=1)
|
202
|
-
.rolling(window_size, min_periods=1)
|
167
|
+
.rolling(self.window_size, min_periods=1)
|
203
168
|
.quantile(0.75)
|
204
169
|
)
|
205
170
|
perc_25th = (
|
206
171
|
df0.groupby(level=1)
|
207
|
-
.shift(-1 * int((window_size + 1) / 2))
|
172
|
+
.shift(-1 * int((self.window_size + 1) / 2))
|
208
173
|
.sort_index(level=1)
|
209
|
-
.rolling(window_size, min_periods=1)
|
174
|
+
.rolling(self.window_size, min_periods=1)
|
210
175
|
.quantile(0.25)
|
211
176
|
)
|
212
177
|
med = (
|
213
178
|
df0.groupby(level=1)
|
214
|
-
.shift(-1 * int((window_size + 1) / 2))
|
179
|
+
.shift(-1 * int((self.window_size + 1) / 2))
|
215
180
|
.sort_index(level=1)
|
216
|
-
.rolling(window_size, min_periods=1)
|
181
|
+
.rolling(self.window_size, min_periods=1)
|
217
182
|
.median()
|
218
183
|
)
|
219
184
|
else:
|
220
185
|
perc_75th = (
|
221
|
-
df0.groupby(level=1).rolling(window_size).quantile(0.75).droplevel(0)
|
186
|
+
df0.groupby(level=1).rolling(self.window_size).quantile(0.75).droplevel(0)
|
222
187
|
)
|
223
188
|
perc_25th = (
|
224
|
-
df0.groupby(level=1).rolling(window_size).quantile(0.25).droplevel(0)
|
189
|
+
df0.groupby(level=1).rolling(self.window_size).quantile(0.25).droplevel(0)
|
225
190
|
)
|
226
|
-
med = df0.groupby(level=1).rolling(window_size).median().droplevel(0)
|
191
|
+
med = df0.groupby(level=1).rolling(self.window_size).median().droplevel(0)
|
227
192
|
|
228
193
|
# compute iqr and upper/lower thresholds
|
229
194
|
iqr = perc_75th - perc_25th
|
230
|
-
upper = perc_75th.add(thresh_val * iqr, axis=1)
|
231
|
-
lower = perc_25th.subtract(thresh_val * iqr, axis=1)
|
195
|
+
upper = perc_75th.add(self.thresh_val * iqr, axis=1)
|
196
|
+
lower = perc_25th.subtract(self.thresh_val * iqr, axis=1)
|
232
197
|
|
233
198
|
# detect outliers
|
234
|
-
out_df = df[(df0 > upper) | (df0 < lower)]
|
235
|
-
filt_df = df[(df0 < upper) & (df0 > lower)]
|
199
|
+
out_df = self.df[(df0 > upper) | (df0 < lower)]
|
200
|
+
filt_df = self.df[(df0 < upper) & (df0 > lower)]
|
236
201
|
|
237
|
-
# log
|
238
|
-
if log:
|
202
|
+
# log to original scale
|
203
|
+
if self.log:
|
239
204
|
med = np.exp(med)
|
240
205
|
|
241
206
|
# type conversion
|
242
|
-
|
243
|
-
|
244
|
-
|
207
|
+
self.yhat = med.apply(pd.to_numeric, errors='coerce').convert_dtypes().sort_index()
|
208
|
+
self.outliers = out_df.apply(pd.to_numeric, errors='coerce').convert_dtypes().sort_index()
|
209
|
+
self.filtered_df = filt_df.apply(pd.to_numeric, errors='coerce').convert_dtypes().sort_index()
|
245
210
|
|
246
211
|
# plot
|
247
|
-
if plot:
|
248
|
-
if not isinstance(plot_series, tuple):
|
212
|
+
if self.plot:
|
213
|
+
if not isinstance(self.plot_series, tuple):
|
249
214
|
raise TypeError(
|
250
215
|
"Plot_series must be a tuple specifying the ticker and column/field to "
|
251
216
|
"plot (ticker, column)."
|
252
217
|
)
|
253
218
|
else:
|
254
|
-
self.plot_outliers(
|
219
|
+
self.plot_outliers()
|
255
220
|
|
256
|
-
|
257
|
-
"yhat": med.sort_index(),
|
258
|
-
"outliers": out_df.sort_index(),
|
259
|
-
"filt_vals": filt_df.sort_index(),
|
260
|
-
}
|
221
|
+
return self.filtered_df
|
261
222
|
|
262
|
-
|
263
|
-
|
264
|
-
def mad(
|
265
|
-
self,
|
266
|
-
log: bool = True,
|
267
|
-
window_size: int = 7,
|
268
|
-
model_type: str = "estimation",
|
269
|
-
thresh_val: int = 10,
|
270
|
-
plot: bool = False,
|
271
|
-
plot_series: tuple = ("BTC", "close"),
|
272
|
-
) -> Dict[str, pd.DataFrame]:
|
223
|
+
def mad(self) -> pd.DataFrame:
|
273
224
|
"""
|
274
225
|
Detects outliers using a median absolute deviation method, aka Hampler filter.
|
275
226
|
|
276
|
-
Parameters
|
277
|
-
----------
|
278
|
-
log: bool, default True
|
279
|
-
Converts series into log of series.
|
280
|
-
window_size: int, default 7
|
281
|
-
Number of observations in the rolling window.
|
282
|
-
model_type: str, {'estimation', 'prediction'}, default 'estimation'
|
283
|
-
Estimation models use past, current and future values to estimate the expected value of a series,
|
284
|
-
e.g. expected x_t of series x at time t uses values from [x_t-s, x_t+s].
|
285
|
-
Prediction models use only past and current values to estimate the expected value of a series,
|
286
|
-
e.g. expected x_t of series x at time t uses values from [x_t-s, x_t].
|
287
|
-
thresh_val: int, default 10
|
288
|
-
Value for upper and lower thresholds used in outlier detection.
|
289
|
-
Computed as: [median - thresh_val * mad, median + thresh_val * mad] for lower/upper thresholds.
|
290
|
-
plot: bool, default False
|
291
|
-
Plots series with outliers highlighted (red dots).
|
292
|
-
plot_series: tuple, default ('BTC', 'close')
|
293
|
-
The specific time series to plot given by (ticker, field/column) tuple.
|
294
|
-
|
295
227
|
Returns
|
296
228
|
-------
|
297
|
-
|
298
|
-
|
299
|
-
with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
|
300
|
-
values.
|
301
|
-
|
229
|
+
filtered_df: pd.DataFrame - MultiIndex
|
230
|
+
Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
|
302
231
|
"""
|
303
232
|
# sort index and create df copy
|
304
|
-
|
305
|
-
|
306
|
-
# log
|
307
|
-
if log:
|
308
|
-
df0 = np.log(df)
|
309
|
-
else:
|
310
|
-
df0 = df
|
233
|
+
df0 = self.df.sort_index(level=1).copy()
|
311
234
|
|
312
235
|
# compute median for estimation and prediction models
|
313
|
-
if model_type == "estimation":
|
236
|
+
if self.model_type == "estimation":
|
314
237
|
med = (
|
315
238
|
df0.groupby(level=1)
|
316
|
-
.shift(-1 * int((window_size + 1) / 2))
|
239
|
+
.shift(-1 * int((self.window_size + 1) / 2))
|
317
240
|
.sort_index(level=1)
|
318
|
-
.rolling(window_size, min_periods=1)
|
241
|
+
.rolling(self.window_size, min_periods=1)
|
319
242
|
.median()
|
320
243
|
)
|
321
244
|
else:
|
322
|
-
med = df0.groupby(level=1).rolling(window_size).median().droplevel(0)
|
245
|
+
med = df0.groupby(level=1).rolling(self.window_size).median().droplevel(0)
|
323
246
|
|
324
247
|
# compute dev, mad, upper/lower thresholds
|
325
248
|
dev = df0 - med
|
326
|
-
mad = dev.abs().groupby(level=1).rolling(window_size).median().droplevel(0)
|
327
|
-
upper = med.add(thresh_val * mad, axis=1)
|
328
|
-
lower = med.subtract(thresh_val * mad, axis=1)
|
249
|
+
mad = dev.abs().groupby(level=1).rolling(self.window_size).median().droplevel(0)
|
250
|
+
upper = med.add(self.thresh_val * mad, axis=1)
|
251
|
+
lower = med.subtract(self.thresh_val * mad, axis=1)
|
329
252
|
|
330
253
|
# outliers
|
331
|
-
out_df = df[(df0 > upper) | (df0 < lower)]
|
332
|
-
filt_df = df[(df0 < upper) & (df0 > lower)]
|
254
|
+
out_df = self.df[(df0 > upper) | (df0 < lower)]
|
255
|
+
filt_df = self.df[(df0 < upper) & (df0 > lower)]
|
333
256
|
|
334
|
-
# log
|
335
|
-
if log:
|
257
|
+
# log to original scale
|
258
|
+
if self.log:
|
336
259
|
med = np.exp(med)
|
337
260
|
|
338
261
|
# type conversion
|
@@ -340,112 +263,73 @@ class OutlierDetection:
|
|
340
263
|
out_df = out_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
|
341
264
|
filt_df = filt_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
|
342
265
|
|
266
|
+
self.yhat = med.sort_index()
|
267
|
+
self.outliers = out_df.sort_index()
|
268
|
+
self.filtered_df = filt_df.sort_index()
|
269
|
+
|
343
270
|
# plot
|
344
|
-
if plot:
|
345
|
-
if not isinstance(plot_series, tuple):
|
271
|
+
if self.plot:
|
272
|
+
if not isinstance(self.plot_series, tuple):
|
346
273
|
raise TypeError(
|
347
274
|
"Plot_series must be a tuple specifying the ticker and column/field to "
|
348
275
|
"plot (ticker, column)."
|
349
276
|
)
|
350
277
|
else:
|
351
|
-
self.plot_outliers(
|
352
|
-
|
353
|
-
outliers_dict = {
|
354
|
-
"yhat": med.sort_index(),
|
355
|
-
"outliers": out_df.sort_index(),
|
356
|
-
"filt_vals": filt_df.sort_index(),
|
357
|
-
}
|
278
|
+
self.plot_outliers()
|
358
279
|
|
359
|
-
return
|
280
|
+
return self.filtered_df
|
360
281
|
|
361
|
-
def z_score(
|
362
|
-
self,
|
363
|
-
log: bool = True,
|
364
|
-
window_size: int = 7,
|
365
|
-
model_type: str = "estimation",
|
366
|
-
thresh_val: int = 2,
|
367
|
-
plot: bool = False,
|
368
|
-
plot_series: tuple = ("BTC", "close"),
|
369
|
-
) -> Dict[str, pd.DataFrame]:
|
282
|
+
def z_score(self) -> pd.DataFrame:
|
370
283
|
"""
|
371
284
|
Detects outliers using a z-score method, aka simple moving average.
|
372
285
|
|
373
|
-
Parameters
|
374
|
-
----------
|
375
|
-
log: bool, default True
|
376
|
-
Converts series into log of series.
|
377
|
-
window_size: int, default 7
|
378
|
-
Number of observations in the rolling window.
|
379
|
-
model_type: str, {'estimation', 'prediction'}, default 'estimation'
|
380
|
-
Estimation models use past, current and future values to estimate the expected value of a series,
|
381
|
-
e.g. expected x_t of series x at time t uses values from [x_t-s, x_t+s].
|
382
|
-
Prediction models use only past and current values to estimate the expected value of a series,
|
383
|
-
e.g. expected x_t of series x at time t uses values from [x_t-s, x_t].
|
384
|
-
thresh_val: int, default 2
|
385
|
-
Value for upper and lower thresholds used in outlier detection.
|
386
|
-
plot: bool, default False
|
387
|
-
Plots series with outliers highlighted with red dots.
|
388
|
-
plot_series: tuple, default ('BTC', 'close')
|
389
|
-
Plots the time series of a specific ticker/field combination (tuple).
|
390
|
-
|
391
286
|
Returns
|
392
287
|
-------
|
393
|
-
|
394
|
-
|
395
|
-
with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
|
396
|
-
values.
|
397
|
-
|
288
|
+
filtered_df: pd.DataFrame - MultiIndex
|
289
|
+
Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
|
398
290
|
"""
|
399
291
|
# sort index and create copy
|
400
|
-
|
401
|
-
|
402
|
-
# log
|
403
|
-
if log:
|
404
|
-
df0 = np.log(df)
|
405
|
-
else:
|
406
|
-
df0 = df
|
292
|
+
df0 = self.df.sort_index(level=1).copy()
|
407
293
|
|
408
294
|
# compute rolling mean and std for estimation and prediction models
|
409
|
-
if model_type == "estimation":
|
295
|
+
if self.model_type == "estimation":
|
410
296
|
roll_mean = (
|
411
297
|
df0.groupby(level=1)
|
412
|
-
.shift(-1 * int((window_size + 1) / 2))
|
298
|
+
.shift(-1 * int((self.window_size + 1) / 2))
|
413
299
|
.sort_index(level=1)
|
414
|
-
.rolling(window_size, min_periods=1)
|
300
|
+
.rolling(self.window_size, min_periods=1)
|
415
301
|
.mean()
|
416
302
|
)
|
417
303
|
roll_std = (
|
418
304
|
df0.groupby(level=1)
|
419
|
-
.shift(-1 * int((window_size + 1) / 2))
|
305
|
+
.shift(-1 * int((self.window_size + 1) / 2))
|
420
306
|
.sort_index(level=1)
|
421
|
-
.rolling(window_size, min_periods=1)
|
307
|
+
.rolling(self.window_size, min_periods=1)
|
422
308
|
.std()
|
423
309
|
)
|
424
310
|
else:
|
425
311
|
roll_mean = (
|
426
312
|
df0.groupby(level=1)
|
427
|
-
.rolling(window_size, min_periods=1)
|
313
|
+
.rolling(self.window_size, min_periods=1)
|
428
314
|
.mean()
|
429
315
|
.droplevel(0)
|
430
316
|
)
|
431
317
|
roll_std = (
|
432
318
|
df0.groupby(level=1)
|
433
|
-
.rolling(window_size, min_periods=1)
|
319
|
+
.rolling(self.window_size, min_periods=1)
|
434
320
|
.std()
|
435
321
|
.droplevel(0)
|
436
322
|
)
|
437
323
|
|
438
324
|
# compute z-score and upper/lower thresh
|
439
325
|
z = (df0 - roll_mean) / roll_std
|
440
|
-
upper = thresh_val
|
441
|
-
lower = thresh_val * -1
|
442
326
|
|
443
327
|
# outliers
|
444
|
-
out_df = df[(
|
445
|
-
filt_df = df[(
|
328
|
+
out_df = self.df[z.abs() > self.thresh_val]
|
329
|
+
filt_df = self.df[z.abs() < self.thresh_val]
|
446
330
|
|
447
|
-
# log
|
448
|
-
if log:
|
331
|
+
# log to original scale
|
332
|
+
if self.log:
|
449
333
|
roll_mean = np.exp(roll_mean)
|
450
334
|
|
451
335
|
# type conversion
|
@@ -453,80 +337,47 @@ class OutlierDetection:
|
|
453
337
|
out_df = out_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
|
454
338
|
filt_df = filt_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
|
455
339
|
|
340
|
+
self.yhat = roll_mean.sort_index()
|
341
|
+
self.outliers = out_df.sort_index()
|
342
|
+
self.filtered_df = filt_df.sort_index()
|
343
|
+
|
456
344
|
# plot
|
457
|
-
if plot:
|
458
|
-
if not isinstance(plot_series, tuple):
|
345
|
+
if self.plot:
|
346
|
+
if not isinstance(self.plot_series, tuple):
|
459
347
|
raise TypeError(
|
460
348
|
"Plot_series must be a tuple specifying the ticker and column/field to "
|
461
349
|
"plot (ticker, column)."
|
462
350
|
)
|
463
351
|
else:
|
464
|
-
self.plot_outliers(
|
465
|
-
|
466
|
-
outliers_dict = {
|
467
|
-
"yhat": roll_mean.sort_index(),
|
468
|
-
"outliers": out_df.sort_index(),
|
469
|
-
"filt_vals": filt_df.sort_index(),
|
470
|
-
}
|
352
|
+
self.plot_outliers()
|
471
353
|
|
472
|
-
return
|
354
|
+
return self.filtered_df
|
473
355
|
|
474
|
-
def ewma(
|
475
|
-
self,
|
476
|
-
log: bool = True,
|
477
|
-
window_size: int = 7,
|
478
|
-
thresh_val: int = 1.5,
|
479
|
-
plot: bool = False,
|
480
|
-
plot_series: tuple = ("BTC", "close"),
|
481
|
-
) -> Dict[str, pd.DataFrame]:
|
356
|
+
def ewma(self) -> pd.DataFrame:
|
482
357
|
"""
|
483
358
|
Detects outliers using an exponential moving average method.
|
484
359
|
|
485
|
-
Parameters
|
486
|
-
----------
|
487
|
-
log: bool, default True
|
488
|
-
Converts series into log of series.
|
489
|
-
window_size: int, default 7
|
490
|
-
Number of observations in the rolling window.
|
491
|
-
thresh_val: int, default 1.5
|
492
|
-
Value for upper and lower thresholds used in outlier detection.
|
493
|
-
plot: bool, default False
|
494
|
-
Plots series with outliers highlighted with red dots.
|
495
|
-
plot_series: tuple, default ('BTC', 'close')
|
496
|
-
Plots the time series of a specific ticker/field combination (tuple).
|
497
|
-
|
498
360
|
Returns
|
499
361
|
-------
|
500
|
-
|
501
|
-
|
502
|
-
with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
|
503
|
-
values.
|
504
|
-
|
362
|
+
filtered_df: pd.DataFrame - MultiIndex
|
363
|
+
Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
|
505
364
|
"""
|
506
365
|
# sort index and create copy
|
507
|
-
|
508
|
-
|
509
|
-
# log
|
510
|
-
if log:
|
511
|
-
df0 = np.log(df)
|
512
|
-
else:
|
513
|
-
df0 = df
|
366
|
+
df0 = self.df.sort_index(level=1).copy()
|
514
367
|
|
515
368
|
# compute ew ma and std for estimation and prediction models
|
516
|
-
ewma = df0.groupby(level=1).ewm(span=window_size).mean().droplevel(0)
|
517
|
-
ewstd = df0.groupby(level=1).ewm(span=window_size).std().droplevel(0)
|
369
|
+
ewma = df0.groupby(level=1).ewm(span=self.window_size).mean().droplevel(0)
|
370
|
+
ewstd = df0.groupby(level=1).ewm(span=self.window_size).std().droplevel(0)
|
518
371
|
|
519
372
|
# compute z-score and upper/lower thresh
|
520
373
|
z = (df0 - ewma) / ewstd
|
521
|
-
upper = thresh_val
|
522
|
-
lower = thresh_val * -1
|
523
374
|
|
524
375
|
# outliers
|
525
|
-
out_df = df[(
|
526
|
-
filt_df = df[(
|
376
|
+
out_df = self.df[z.abs() > self.thresh_val]
|
377
|
+
filt_df = self.df[z.abs() < self.thresh_val]
|
527
378
|
|
528
|
-
# log
|
529
|
-
if log:
|
379
|
+
# log to original scale
|
380
|
+
if self.log:
|
530
381
|
ewma = np.exp(ewma)
|
531
382
|
|
532
383
|
# type conversion
|
@@ -534,35 +385,29 @@ class OutlierDetection:
|
|
534
385
|
out_df = out_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
|
535
386
|
filt_df = filt_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
|
536
387
|
|
388
|
+
self.yhat = ewma.sort_index()
|
389
|
+
self.outliers = out_df.sort_index()
|
390
|
+
self.filtered_df = filt_df.sort_index()
|
391
|
+
|
537
392
|
# plot
|
538
|
-
if plot:
|
539
|
-
if not isinstance(plot_series, tuple):
|
393
|
+
if self.plot:
|
394
|
+
if not isinstance(self.plot_series, tuple):
|
540
395
|
raise TypeError(
|
541
396
|
"Plot_series must be a tuple specifying the ticker and column/field to "
|
542
397
|
"plot (ticker, column)."
|
543
398
|
)
|
544
399
|
else:
|
545
|
-
self.plot_outliers(
|
400
|
+
self.plot_outliers()
|
546
401
|
|
547
|
-
|
548
|
-
"yhat": ewma.sort_index(),
|
549
|
-
"outliers": out_df.sort_index(),
|
550
|
-
"filt_vals": filt_df.sort_index(),
|
551
|
-
}
|
552
|
-
|
553
|
-
return outliers_dict
|
402
|
+
return self.filtered_df
|
554
403
|
|
555
404
|
def seasonal_decomp(
|
556
405
|
self,
|
557
|
-
log: bool = True,
|
558
|
-
thresh_val: int = 5,
|
559
406
|
period: int = 7,
|
560
407
|
model: str = "additive",
|
561
408
|
filt: Optional[np.array] = None,
|
562
409
|
two_sided: Optional[bool] = True,
|
563
410
|
extrapolate_trend: Optional[int] = 0,
|
564
|
-
plot: bool = False,
|
565
|
-
plot_series: tuple = ("BTC", "close"),
|
566
411
|
) -> Dict[str, pd.DataFrame]:
|
567
412
|
"""
|
568
413
|
Detects outliers with seasonal decomposition moving averages from statsmodels.
|
@@ -571,10 +416,6 @@ class OutlierDetection:
|
|
571
416
|
|
572
417
|
Parameters
|
573
418
|
----------
|
574
|
-
log: bool, default True
|
575
|
-
Converts series into log of series.
|
576
|
-
thresh_val: int, default 5
|
577
|
-
Value for upper and lower thresholds used in outlier detection.
|
578
419
|
period: int, optional, default 7
|
579
420
|
periodicity of the sequence.
|
580
421
|
model: str, {'additive', 'multiplicative'}, default 'additive'
|
@@ -590,29 +431,16 @@ class OutlierDetection:
|
|
590
431
|
on both ends (or the single one if two_sided is False) considering this many (+1) closest points.
|
591
432
|
If set to ‘freq’, use freq closest points. Setting this parameter results in no NaN values in trend
|
592
433
|
or resid components.
|
593
|
-
plot: bool, default False
|
594
|
-
Plots series with outliers highlighted with red dots.
|
595
|
-
plot_series: tuple, default ('BTC', 'close')
|
596
|
-
Plots the time series of a specific (ticker, field/column) tuple.
|
597
434
|
|
598
435
|
Returns
|
599
436
|
-------
|
600
|
-
|
601
|
-
|
602
|
-
with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
|
603
|
-
values.
|
604
|
-
|
437
|
+
filtered_df: pd.DataFrame - MultiIndex
|
438
|
+
Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
|
605
439
|
"""
|
606
440
|
# unstack
|
607
|
-
|
441
|
+
df0 = self.df.unstack().copy()
|
608
442
|
# original idx, unstacked idx
|
609
|
-
mult_idx, idx = self.raw_df.index, df.index
|
610
|
-
|
611
|
-
# log
|
612
|
-
if log:
|
613
|
-
df0 = np.log(df)
|
614
|
-
else:
|
615
|
-
df0 = df
|
443
|
+
mult_idx, idx = self.raw_df.index, self.df.unstack().index
|
616
444
|
|
617
445
|
# store resid dfs in dict
|
618
446
|
resid_dict, yhat_dict = {}, {}
|
@@ -653,13 +481,13 @@ class OutlierDetection:
|
|
653
481
|
# convert dict to multiindex
|
654
482
|
resid_df, yhat_df = pd.concat(resid_dict, axis=1), pd.concat(yhat_dict, axis=1)
|
655
483
|
|
656
|
-
# log
|
657
|
-
if log:
|
484
|
+
# log to original scale
|
485
|
+
if self.log:
|
658
486
|
yhat_df = np.exp(yhat_df)
|
659
487
|
|
660
488
|
# filter outliers
|
661
|
-
out_df = df[resid_df.abs() > thresh_val]
|
662
|
-
filt_df = df[resid_df.abs() < thresh_val]
|
489
|
+
out_df = self.df.unstack()[resid_df.abs() > self.thresh_val]
|
490
|
+
filt_df = self.df.unstack()[resid_df.abs() < self.thresh_val]
|
663
491
|
|
664
492
|
# stack and reindex
|
665
493
|
out_df = out_df.stack().reindex(mult_idx)
|
@@ -671,28 +499,24 @@ class OutlierDetection:
|
|
671
499
|
out_df = out_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
|
672
500
|
filt_df = filt_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
|
673
501
|
|
502
|
+
self.yhat = yhat_df.sort_index()
|
503
|
+
self.outliers = out_df.sort_index()
|
504
|
+
self.filtered_df = filt_df.sort_index()
|
505
|
+
|
674
506
|
# plot
|
675
|
-
if plot:
|
676
|
-
if not isinstance(plot_series, tuple):
|
507
|
+
if self.plot:
|
508
|
+
if not isinstance(self.plot_series, tuple):
|
677
509
|
raise TypeError(
|
678
510
|
"Plot_series must be a tuple specifying the ticker and column/field to "
|
679
511
|
"plot (ticker, column)."
|
680
512
|
)
|
681
513
|
else:
|
682
|
-
self.plot_outliers(
|
683
|
-
|
684
|
-
outliers_dict = {
|
685
|
-
"yhat": yhat_df.sort_index(),
|
686
|
-
"outliers": out_df.sort_index(),
|
687
|
-
"filt_vals": filt_df.sort_index(),
|
688
|
-
}
|
514
|
+
self.plot_outliers()
|
689
515
|
|
690
|
-
return
|
516
|
+
return self.filtered_df
|
691
517
|
|
692
518
|
def stl(
|
693
519
|
self,
|
694
|
-
log: bool = True,
|
695
|
-
thresh_val: int = 5,
|
696
520
|
period: Optional[int] = 7,
|
697
521
|
seasonal: Optional[int] = 7,
|
698
522
|
trend: Optional[int] = None,
|
@@ -704,9 +528,7 @@ class OutlierDetection:
|
|
704
528
|
seasonal_jump: Optional[int] = 1,
|
705
529
|
trend_jump: Optional[int] = 1,
|
706
530
|
low_pass_jump: Optional[int] = 1,
|
707
|
-
|
708
|
-
plot_series: tuple = ("BTC", "close"),
|
709
|
-
) -> Dict[str, pd.DataFrame]:
|
531
|
+
) -> pd.DataFrame:
|
710
532
|
"""
|
711
533
|
Detects outliers with seasonal decomposition moving averages from statsmodels.
|
712
534
|
|
@@ -714,10 +536,6 @@ class OutlierDetection:
|
|
714
536
|
|
715
537
|
Parameters
|
716
538
|
----------
|
717
|
-
log: bool, default True
|
718
|
-
Converts series into log of series.
|
719
|
-
thresh_val: int, default 5
|
720
|
-
Value for upper and lower thresholds used in outlier detection.
|
721
539
|
period: int, optional, default 7
|
722
540
|
Periodicity of the sequence.
|
723
541
|
seasonal: int, optional, default 7
|
@@ -749,35 +567,23 @@ class OutlierDetection:
|
|
749
567
|
Positive integer determining the linear interpolation step. If larger than 1,
|
750
568
|
the LOESS is used every low_pass_jump points and values between the two are linearly interpolated.
|
751
569
|
Higher values reduce estimation time.
|
752
|
-
plot: bool, default False
|
753
|
-
Plots series with outliers highlighted with red dots.
|
754
|
-
plot_series: tuple, default ('BTC', 'close')
|
755
|
-
Plots the time series of a specific (ticker, field/column) tuple.
|
756
570
|
|
757
571
|
Returns
|
758
572
|
-------
|
759
|
-
|
760
|
-
|
761
|
-
with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
|
762
|
-
values.
|
763
|
-
|
573
|
+
filtered_df: pd.DataFrame - MultiIndex
|
574
|
+
Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
|
764
575
|
"""
|
765
576
|
# unstack
|
766
|
-
|
577
|
+
df0 = self.df.unstack().copy()
|
767
578
|
# original idx, unstacked idx
|
768
|
-
mult_idx, idx = self.
|
769
|
-
|
770
|
-
# log
|
771
|
-
if log:
|
772
|
-
df0 = np.log(df)
|
773
|
-
else:
|
774
|
-
df0 = df
|
579
|
+
mult_idx, idx = self.df.index, self.df.unstack().index
|
775
580
|
|
776
581
|
# store resid dfs in dict
|
777
582
|
resid_dict, yhat_dict = {}, {}
|
778
583
|
for field in df0.columns.get_level_values(0).unique():
|
779
584
|
resid_df, yhat_df = pd.DataFrame(index=idx), pd.DataFrame(index=idx)
|
780
585
|
for ticker in df0[field].columns:
|
586
|
+
|
781
587
|
# decompose
|
782
588
|
res = STL(
|
783
589
|
df0[field][ticker].dropna(),
|
@@ -818,13 +624,13 @@ class OutlierDetection:
|
|
818
624
|
# convert dict to multiindex
|
819
625
|
resid_df, yhat_df = pd.concat(resid_dict, axis=1), pd.concat(yhat_dict, axis=1)
|
820
626
|
|
821
|
-
# log
|
822
|
-
if log:
|
627
|
+
# log to original scale
|
628
|
+
if self.log:
|
823
629
|
yhat_df = np.exp(yhat_df)
|
824
630
|
|
825
631
|
# filter outliers
|
826
|
-
out_df = df[resid_df.abs() > thresh_val]
|
827
|
-
filt_df = df[resid_df.abs() < thresh_val]
|
632
|
+
out_df = self.df.unstack()[resid_df.abs() > self.thresh_val]
|
633
|
+
filt_df = self.df.unstack()[resid_df.abs() < self.thresh_val]
|
828
634
|
|
829
635
|
# stack and reindex
|
830
636
|
out_df = out_df.stack().reindex(mult_idx)
|
@@ -836,64 +642,41 @@ class OutlierDetection:
|
|
836
642
|
out_df = out_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
|
837
643
|
filt_df = filt_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
|
838
644
|
|
645
|
+
self.yhat = yhat_df.sort_index()
|
646
|
+
self.outliers = out_df.sort_index()
|
647
|
+
self.filtered_df = filt_df.sort_index()
|
648
|
+
|
839
649
|
# plot
|
840
|
-
if plot:
|
841
|
-
if not isinstance(plot_series, tuple):
|
650
|
+
if self.plot:
|
651
|
+
if not isinstance(self.plot_series, tuple):
|
842
652
|
raise TypeError(
|
843
653
|
"Plot_series must be a tuple specifying the ticker and column/field to "
|
844
654
|
"plot (ticker, column)."
|
845
655
|
)
|
846
656
|
else:
|
847
|
-
self.plot_outliers(
|
657
|
+
self.plot_outliers()
|
848
658
|
|
849
|
-
|
850
|
-
"yhat": yhat_df.sort_index(),
|
851
|
-
"outliers": out_df.sort_index(),
|
852
|
-
"filt_vals": filt_df.sort_index(),
|
853
|
-
}
|
659
|
+
return self.filtered_df
|
854
660
|
|
855
|
-
|
856
|
-
|
857
|
-
def prophet(
|
858
|
-
self,
|
859
|
-
log: bool = True,
|
860
|
-
interval_width: Optional[float] = 0.99,
|
861
|
-
plot: bool = False,
|
862
|
-
plot_series: tuple = ("BTC", "close"),
|
863
|
-
) -> Dict[str, pd.DataFrame]:
|
661
|
+
def prophet(self, interval_width: Optional[float] = 0.999) -> pd.DataFrame:
|
864
662
|
"""
|
865
663
|
Detects outliers using Prophet, a time series forecasting algorithm published by Facebook.
|
866
664
|
|
867
665
|
Parameters
|
868
666
|
----------
|
869
|
-
log: bool, default True
|
870
|
-
Converts series into log of series.
|
871
667
|
interval_width: float, optional, default 0.99
|
872
668
|
Uncertainty interval estimated by Monte Carlo simulation. The larger the value,
|
873
669
|
the larger the upper/lower thresholds interval for outlier detection.
|
874
|
-
plot: bool, default False
|
875
|
-
Plots series with outliers highlighted with red dots.
|
876
|
-
plot_series: tuple, default ('BTC', 'close')
|
877
|
-
Plots the time series of a specific (ticker, field/column) tuple.
|
878
670
|
|
879
671
|
Returns
|
880
672
|
-------
|
881
|
-
|
882
|
-
|
883
|
-
with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
|
884
|
-
values.
|
885
|
-
|
673
|
+
filtered_df: pd.DataFrame - MultiIndex
|
674
|
+
Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
|
886
675
|
"""
|
887
676
|
# unstack
|
888
|
-
|
677
|
+
df0 = self.raw_df.unstack().copy()
|
889
678
|
# original idx, unstacked idx
|
890
|
-
mult_idx, idx = self.raw_df.index,
|
891
|
-
|
892
|
-
# log
|
893
|
-
if log:
|
894
|
-
df0 = np.log(df)
|
895
|
-
else:
|
896
|
-
df0 = df
|
679
|
+
mult_idx, idx = self.raw_df.index, df0.index
|
897
680
|
|
898
681
|
# store predictions for fields dfs in dict
|
899
682
|
upper_dict, lower_dict, yhat_dict = {}, {}, {}
|
@@ -949,15 +732,15 @@ class OutlierDetection:
|
|
949
732
|
yhat.columns.names = [None, "ticker"]
|
950
733
|
|
951
734
|
# transform log
|
952
|
-
if log:
|
735
|
+
if self.log:
|
953
736
|
yhat_upper = np.exp(yhat_upper)
|
954
737
|
yhat_lower = np.exp(yhat_lower)
|
955
738
|
yhat = np.exp(yhat)
|
956
739
|
|
957
740
|
# filter outliers
|
958
741
|
yhat_df = yhat
|
959
|
-
out_df = df[df.gt(yhat_upper) | df.lt(yhat_lower)]
|
960
|
-
filt_df = df[df.lt(yhat_upper) & df.gt(yhat_lower)]
|
742
|
+
out_df = self.df.unstack()[self.df.unstack().gt(yhat_upper) | self.df.unstack().lt(yhat_lower)]
|
743
|
+
filt_df = self.df.unstack()[self.df.unstack().lt(yhat_upper) & self.df.unstack().gt(yhat_lower)]
|
961
744
|
|
962
745
|
# stack and reindex
|
963
746
|
yhat_df = yhat_df.stack().reindex(mult_idx)
|
@@ -969,47 +752,35 @@ class OutlierDetection:
|
|
969
752
|
out_df = out_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
|
970
753
|
filt_df = filt_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
|
971
754
|
|
755
|
+
self.yhat = yhat_df.sort_index()
|
756
|
+
self.outliers = out_df.sort_index()
|
757
|
+
self.filtered_df = filt_df.sort_index()
|
758
|
+
|
972
759
|
# plot
|
973
|
-
if plot:
|
974
|
-
if not isinstance(plot_series, tuple):
|
760
|
+
if self.plot:
|
761
|
+
if not isinstance(self.plot_series, tuple):
|
975
762
|
raise TypeError(
|
976
763
|
"Plot_series must be a tuple specifying the ticker and column/field to "
|
977
764
|
"plot (ticker, column)."
|
978
765
|
)
|
979
766
|
else:
|
980
|
-
self.plot_outliers(
|
981
|
-
|
982
|
-
outliers_dict = {
|
983
|
-
"yhat": yhat_df.sort_index(),
|
984
|
-
"outliers": out_df.sort_index(),
|
985
|
-
"filt_vals": filt_df.sort_index(),
|
986
|
-
}
|
767
|
+
self.plot_outliers()
|
987
768
|
|
988
|
-
return
|
769
|
+
return self.filtered_df
|
989
770
|
|
990
|
-
def plot_outliers(
|
991
|
-
self, outliers_df: pd.DataFrame, plot_series: Optional[tuple] = None
|
992
|
-
) -> None:
|
771
|
+
def plot_outliers(self) -> None:
|
993
772
|
"""
|
994
773
|
Plots time series with outliers highlighted (red dots).
|
995
|
-
|
996
|
-
Parameters
|
997
|
-
----------
|
998
|
-
outliers_df: pd.DataFrame - MultiIndex
|
999
|
-
Dataframe MultiIndex with DatetimeIndex (level 0), tickers (level 1) and fields (cols) outlier values.
|
1000
|
-
plot_series: tuple, optional, default None
|
1001
|
-
Plots the time series of a specific (ticker, field) tuple.
|
1002
|
-
|
1003
774
|
"""
|
1004
775
|
ax = (
|
1005
|
-
self.
|
776
|
+
self.df.loc[pd.IndexSlice[:, self.plot_series[0]], self.plot_series[1]]
|
1006
777
|
.droplevel(1)
|
1007
778
|
.plot(linewidth=1, figsize=(15, 7), color="#1f77b4", zorder=0)
|
1008
779
|
)
|
1009
|
-
|
780
|
+
self.outliers.unstack()[self.plot_series[1]].reset_index().plot(
|
1010
781
|
kind="scatter",
|
1011
782
|
x="date",
|
1012
|
-
y=plot_series[0],
|
783
|
+
y=self.plot_series[0],
|
1013
784
|
color="#E64E53",
|
1014
785
|
ax=ax,
|
1015
786
|
label="outliers",
|
@@ -1019,5 +790,5 @@ class OutlierDetection:
|
|
1019
790
|
ax.ticklabel_format(style="plain", axis="y")
|
1020
791
|
ax.set_facecolor("whitesmoke")
|
1021
792
|
ax.legend(
|
1022
|
-
[plot_series[1] + "_raw", plot_series[1] + "_outliers"], loc="upper left"
|
793
|
+
[self.plot_series[1] + "_raw", self.plot_series[1] + "_outliers"], loc="upper left"
|
1023
794
|
)
|