cryptodatapy 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- from typing import Dict, Optional
1
+ from typing import Dict, Optional, Union
2
2
 
3
3
  import numpy as np
4
4
  import pandas as pd
@@ -9,10 +9,17 @@ from statsmodels.tsa.seasonal import STL, seasonal_decompose
9
9
  class OutlierDetection:
10
10
  """
11
11
  Detects outliers.
12
-
13
12
  """
14
-
15
- def __init__(self, raw_df: pd.DataFrame):
13
+ def __init__(self,
14
+ raw_df: pd.DataFrame,
15
+ excl_cols: Optional[Union[str, list]] = None,
16
+ log: bool = False,
17
+ window_size: int = 7,
18
+ model_type: str = 'estimation',
19
+ thresh_val: int = 5,
20
+ plot: bool = False,
21
+ plot_series: tuple = ('BTC', 'close')
22
+ ):
16
23
  """
17
24
  Constructor
18
25
 
@@ -20,26 +27,10 @@ class OutlierDetection:
20
27
  ----------
21
28
  raw_df: pd.DataFrame - MultiIndex
22
29
  DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and raw data/values (cols).
23
-
24
- """
25
- self.raw_df = raw_df
26
-
27
- def atr(
28
- self,
29
- log: bool = False,
30
- window_size: int = 7,
31
- model_type: str = "estimation",
32
- thresh_val: int = 2,
33
- plot: bool = False,
34
- plot_series: tuple = ("BTC", "close"),
35
- ) -> Dict[str, pd.DataFrame]:
36
- """
37
- Detects outliers using OHLC values and H-L range.
38
-
39
- Parameters
40
- ----------
30
+ excl_cols: str or list, optional, default None
31
+ Columns to exclude from outlier detection.
41
32
  log: bool, default False
42
- Converts series into log of series.
33
+ Log transform the series.
43
34
  window_size: int, default 7
44
35
  Number of observations in the rolling window.
45
36
  model_type: str, {'estimation', 'prediction'}, default 'estimation'
@@ -47,292 +38,224 @@ class OutlierDetection:
47
38
  e.g. expected x_t of series x at time t uses values from [x_t-s, x_t+s].
48
39
  Prediction models use only past and current values to estimate the expected value of a series,
49
40
  e.g. expected x_t of series x at time t uses values from [x_t-s, x_t].
50
- thresh_val: int, default 2
41
+ thresh_val: int, default 5
51
42
  Value for upper and lower thresholds used in outlier detection.
52
43
  plot: bool, default False
53
- Plots series with outliers highlighted (red dots).
44
+ Plots series with outliers highlighted with red dots.
54
45
  plot_series: tuple, default ('BTC', 'close')
55
- The specific time series to plot given by (ticker, field/column) tuple.
46
+ Plots the time series of a specific (ticker, field/column) tuple.
47
+ """
48
+ self.raw_df = raw_df
49
+ self.excl_cols = excl_cols
50
+ self.log = log
51
+ self.window_size = window_size
52
+ self.model_type = model_type
53
+ self.thresh_val = thresh_val
54
+ self.plot = plot
55
+ self.plot_series = plot_series
56
+ self.df = raw_df.copy() if excl_cols is None else raw_df.drop(columns=excl_cols).copy()
57
+ self.yhat = None
58
+ self.outliers = None
59
+ self.filtered_df = None
60
+ self.log_transform()
61
+
62
+ def log_transform(self) -> None:
63
+ """
64
+ Log transform the dataframe.
65
+ """
66
+ if self.log:
67
+ # remove negative values
68
+ self.df[self.df <= 0] = np.nan
69
+ # log and replace inf
70
+ self.df = np.log(self.df).replace([np.inf, -np.inf], np.nan)
71
+
72
+ def atr(self) -> pd.DataFrame:
73
+ """
74
+ Detects outliers using OHLC values and H-L range.
56
75
 
57
76
  Returns
58
77
  -------
59
- outliers_dict: Dictionary of pd.DataFrame - MultiIndex
60
- Dictionary of forecasts (yhat), outliers (outliers) and filtered values (filt_vals) multiindex dataframes
61
- with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
62
- values.
63
-
78
+ filtered_df: pd.DataFrame - MultiIndex
79
+ Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
64
80
  """
65
- # sort index and create df copy
66
- df = self.raw_df.sort_index(level=1)
67
-
68
- # log
69
- if log:
70
- df0 = np.log(df).copy()
71
- else:
72
- df0 = df.copy()
73
-
74
81
  # ohlc
75
- if not all(col in df.columns for col in ["open", "high", "low", "close"]):
82
+ if not all(col in self.df.columns for col in ["open", "high", "low", "close"]):
76
83
  raise Exception("Dataframe must have OHLC prices to compute ATR.")
77
84
 
85
+ # df copy
86
+ df0 = self.df.copy()
87
+
78
88
  # compute true range
79
89
  df0["hl"], df0["hc"], df0["lc"] = (
80
- (df0.high - df.low).abs(),
81
- (df0.high - df.close.shift(1)).abs(),
82
- (df0.low - df.close.shift(1)).abs(),
90
+ (df0.high - df0.low).abs(),
91
+ (df0.high - df0.close.groupby(level=1).shift(1)).abs(),
92
+ (df0.low - df0.close.groupby(level=1).shift(1)).abs(),
83
93
  )
84
94
  df0["tr"] = df0.loc[:, "hl":"lc"].max(axis=1)
85
95
 
86
96
  # compute ATR for estimation and prediction models
87
- if model_type == "estimation":
97
+ if self.model_type == "estimation":
88
98
  df0["atr"] = (
89
99
  df0.tr.groupby(level=1)
90
- .shift(-1 * int((window_size + 1) / 2))
100
+ .shift(-1 * int((self.window_size + 1) / 2))
91
101
  .sort_index(level=1)
92
- .rolling(window_size, min_periods=1)
102
+ .rolling(self.window_size, min_periods=1)
93
103
  .mean()
94
104
  .sort_index()
95
105
  )
96
106
  med = (
97
- df0.loc[:, :"volume"]
98
- .groupby(level=1)
99
- .shift(-1 * int((window_size + 1) / 2))
107
+ df0.groupby(level=1)
108
+ .shift(-1 * int((self.window_size + 1) / 2))
100
109
  .sort_index(level=1)
101
- .rolling(window_size, min_periods=1)
110
+ .rolling(self.window_size, min_periods=1)
102
111
  .median()
103
112
  .sort_index()
104
113
  )
105
114
  else:
106
115
  df0["atr"] = (
107
- df0.tr.groupby(level=1).ewm(span=window_size).mean().droplevel(0)
116
+ df0.tr.groupby(level=1).ewm(span=self.window_size).mean().droplevel(0)
108
117
  )
109
118
  med = (
110
- df0.loc[:, :"volume"]
111
- .groupby(level=1)
112
- .rolling(window_size)
119
+ df0.groupby(level=1)
120
+ .rolling(self.window_size)
113
121
  .median()
114
122
  .droplevel(0)
115
123
  )
116
124
 
117
- # compute dev, score and upper/lower
118
- dev = df0.loc[:, :"volume"] - med
125
+ # compute dev and score for outliers
126
+ dev = df0 - med
119
127
  score = dev.divide(df0.atr, axis=0)
120
- upper, lower = thresh_val, thresh_val * -1
121
128
 
122
129
  # outliers
123
- out_df = df[(score > upper) | (score < lower)]
124
- filt_df = df[(score < upper) & (score > lower)]
130
+ self.outliers = self.df[score.abs() > self.thresh_val].sort_index()
131
+ self.filtered_df = self.df[score.abs() < self.thresh_val].sort_index()
125
132
 
126
- # log
127
- if log:
128
- med = np.exp(med)
133
+ # log to original scale
134
+ if self.log:
135
+ self.yhat = np.exp(med).sort_index()
129
136
 
130
137
  # plot
131
- if plot:
132
- if not isinstance(plot_series, tuple):
138
+ if self.plot:
139
+ if not isinstance(self.plot_series, tuple):
133
140
  raise TypeError(
134
141
  "Plot_series must be a tuple specifying the ticker and column/field to "
135
142
  "plot (ticker, column)."
136
143
  )
137
144
  else:
138
- self.plot_outliers(out_df, plot_series=plot_series)
145
+ self.plot_outliers()
139
146
 
140
- outliers_dict = {
141
- "yhat": med.sort_index(),
142
- "outliers": out_df.sort_index(),
143
- "filt_vals": filt_df.sort_index(),
144
- }
147
+ return self.filtered_df
145
148
 
146
- return outliers_dict
147
-
148
- def iqr(
149
- self,
150
- log: bool = True,
151
- window_size: int = 7,
152
- model_type: str = "estimation",
153
- thresh_val: int = 1.5,
154
- plot: bool = False,
155
- plot_series: tuple = ("BTC", "close"),
156
- ) -> Dict[str, pd.DataFrame]:
149
+ def iqr(self) -> pd.DataFrame:
157
150
  """
158
151
  Detects outliers using interquartile range (IQR) method.
159
152
 
160
- Parameters
161
- ----------
162
- log: bool, default True
163
- Converts series into log of series.
164
- window_size: int, default 7
165
- Number of observations in the rolling window.
166
- model_type: str, {'estimation', 'prediction'}, default 'estimation'
167
- Estimation models use past, current and future values to estimate the expected value of a series,
168
- e.g. expected x_t of series x at time t uses values from [x_t-s, x_t+s].
169
- Prediction models use only past and current values to estimate the expected value of a series,
170
- e.g. expected x_t of series x at time t uses values from [x_t-s, x_t].
171
- thresh_val: int, default 1.5
172
- Value for upper and lower thresholds used in outlier detection.
173
- Computed as: IQR x thresh_val +/- 75th/25th percentiles (upper/lower bands), respectively.
174
- plot: bool, default False
175
- Plots series with outliers highlighted (red dots).
176
- plot_series: tuple, default ('BTC', 'close')
177
- The specific time series to plot given by (ticker, field/column) tuple.
178
-
179
153
  Returns
180
154
  -------
181
- outliers_dict: Dictionary of pd.DataFrame - MultiIndex
182
- Dictionary of forecasts (yhat), outliers (outliers) and filtered values (filt_vals) multiindex dataframes
183
- with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
184
- values.
185
-
155
+ filtered_df: pd.DataFrame - MultiIndex
156
+ Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
186
157
  """
187
- # sort index and create df copy
188
- df = self.raw_df.sort_index(level=1)
189
-
190
- # log
191
- if log:
192
- df0 = np.log(df).copy()
193
- else:
194
- df0 = df.copy()
158
+ # sort index
159
+ df0 = self.df.sort_index(level=1)
195
160
 
196
161
  # compute 75th, 50th and 25th percentiles for estimation and prediction models
197
- if model_type == "estimation":
162
+ if self.model_type == "estimation":
198
163
  perc_75th = (
199
164
  df0.groupby(level=1)
200
- .shift(-1 * int((window_size + 1) / 2))
165
+ .shift(-1 * int((self.window_size + 1) / 2))
201
166
  .sort_index(level=1)
202
- .rolling(window_size, min_periods=1)
167
+ .rolling(self.window_size, min_periods=1)
203
168
  .quantile(0.75)
204
169
  )
205
170
  perc_25th = (
206
171
  df0.groupby(level=1)
207
- .shift(-1 * int((window_size + 1) / 2))
172
+ .shift(-1 * int((self.window_size + 1) / 2))
208
173
  .sort_index(level=1)
209
- .rolling(window_size, min_periods=1)
174
+ .rolling(self.window_size, min_periods=1)
210
175
  .quantile(0.25)
211
176
  )
212
177
  med = (
213
178
  df0.groupby(level=1)
214
- .shift(-1 * int((window_size + 1) / 2))
179
+ .shift(-1 * int((self.window_size + 1) / 2))
215
180
  .sort_index(level=1)
216
- .rolling(window_size, min_periods=1)
181
+ .rolling(self.window_size, min_periods=1)
217
182
  .median()
218
183
  )
219
184
  else:
220
185
  perc_75th = (
221
- df0.groupby(level=1).rolling(window_size).quantile(0.75).droplevel(0)
186
+ df0.groupby(level=1).rolling(self.window_size).quantile(0.75).droplevel(0)
222
187
  )
223
188
  perc_25th = (
224
- df0.groupby(level=1).rolling(window_size).quantile(0.25).droplevel(0)
189
+ df0.groupby(level=1).rolling(self.window_size).quantile(0.25).droplevel(0)
225
190
  )
226
- med = df0.groupby(level=1).rolling(window_size).median().droplevel(0)
191
+ med = df0.groupby(level=1).rolling(self.window_size).median().droplevel(0)
227
192
 
228
193
  # compute iqr and upper/lower thresholds
229
194
  iqr = perc_75th - perc_25th
230
- upper = perc_75th.add(thresh_val * iqr, axis=1)
231
- lower = perc_25th.subtract(thresh_val * iqr, axis=1)
195
+ upper = perc_75th.add(self.thresh_val * iqr, axis=1)
196
+ lower = perc_25th.subtract(self.thresh_val * iqr, axis=1)
232
197
 
233
198
  # detect outliers
234
- out_df = df[(df0 > upper) | (df0 < lower)]
235
- filt_df = df[(df0 < upper) & (df0 > lower)]
199
+ out_df = self.df[(df0 > upper) | (df0 < lower)]
200
+ filt_df = self.df[(df0 < upper) & (df0 > lower)]
236
201
 
237
- # log
238
- if log:
202
+ # log to original scale
203
+ if self.log:
239
204
  med = np.exp(med)
240
205
 
241
206
  # type conversion
242
- med = med.apply(pd.to_numeric, errors='coerce').convert_dtypes()
243
- out_df = out_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
244
- filt_df = filt_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
207
+ self.yhat = med.apply(pd.to_numeric, errors='coerce').convert_dtypes().sort_index()
208
+ self.outliers = out_df.apply(pd.to_numeric, errors='coerce').convert_dtypes().sort_index()
209
+ self.filtered_df = filt_df.apply(pd.to_numeric, errors='coerce').convert_dtypes().sort_index()
245
210
 
246
211
  # plot
247
- if plot:
248
- if not isinstance(plot_series, tuple):
212
+ if self.plot:
213
+ if not isinstance(self.plot_series, tuple):
249
214
  raise TypeError(
250
215
  "Plot_series must be a tuple specifying the ticker and column/field to "
251
216
  "plot (ticker, column)."
252
217
  )
253
218
  else:
254
- self.plot_outliers(out_df, plot_series=plot_series)
219
+ self.plot_outliers()
255
220
 
256
- outliers_dict = {
257
- "yhat": med.sort_index(),
258
- "outliers": out_df.sort_index(),
259
- "filt_vals": filt_df.sort_index(),
260
- }
221
+ return self.filtered_df
261
222
 
262
- return outliers_dict
263
-
264
- def mad(
265
- self,
266
- log: bool = True,
267
- window_size: int = 7,
268
- model_type: str = "estimation",
269
- thresh_val: int = 10,
270
- plot: bool = False,
271
- plot_series: tuple = ("BTC", "close"),
272
- ) -> Dict[str, pd.DataFrame]:
223
+ def mad(self) -> pd.DataFrame:
273
224
  """
274
225
  Detects outliers using a median absolute deviation method, aka Hampler filter.
275
226
 
276
- Parameters
277
- ----------
278
- log: bool, default True
279
- Converts series into log of series.
280
- window_size: int, default 7
281
- Number of observations in the rolling window.
282
- model_type: str, {'estimation', 'prediction'}, default 'estimation'
283
- Estimation models use past, current and future values to estimate the expected value of a series,
284
- e.g. expected x_t of series x at time t uses values from [x_t-s, x_t+s].
285
- Prediction models use only past and current values to estimate the expected value of a series,
286
- e.g. expected x_t of series x at time t uses values from [x_t-s, x_t].
287
- thresh_val: int, default 10
288
- Value for upper and lower thresholds used in outlier detection.
289
- Computed as: [median - thresh_val * mad, median + thresh_val * mad] for lower/upper thresholds.
290
- plot: bool, default False
291
- Plots series with outliers highlighted (red dots).
292
- plot_series: tuple, default ('BTC', 'close')
293
- The specific time series to plot given by (ticker, field/column) tuple.
294
-
295
227
  Returns
296
228
  -------
297
- outliers_dict: Dictionary of pd.DataFrame - MultiIndex
298
- Dictionary of forecasts (yhat), outliers (outliers) and filtered values (filt_vals) multiindex dataframes
299
- with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
300
- values.
301
-
229
+ filtered_df: pd.DataFrame - MultiIndex
230
+ Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
302
231
  """
303
232
  # sort index and create df copy
304
- df = self.raw_df.sort_index(level=1).copy()
305
-
306
- # log
307
- if log:
308
- df0 = np.log(df)
309
- else:
310
- df0 = df
233
+ df0 = self.df.sort_index(level=1).copy()
311
234
 
312
235
  # compute median for estimation and prediction models
313
- if model_type == "estimation":
236
+ if self.model_type == "estimation":
314
237
  med = (
315
238
  df0.groupby(level=1)
316
- .shift(-1 * int((window_size + 1) / 2))
239
+ .shift(-1 * int((self.window_size + 1) / 2))
317
240
  .sort_index(level=1)
318
- .rolling(window_size, min_periods=1)
241
+ .rolling(self.window_size, min_periods=1)
319
242
  .median()
320
243
  )
321
244
  else:
322
- med = df0.groupby(level=1).rolling(window_size).median().droplevel(0)
245
+ med = df0.groupby(level=1).rolling(self.window_size).median().droplevel(0)
323
246
 
324
247
  # compute dev, mad, upper/lower thresholds
325
248
  dev = df0 - med
326
- mad = dev.abs().groupby(level=1).rolling(window_size).median().droplevel(0)
327
- upper = med.add(thresh_val * mad, axis=1)
328
- lower = med.subtract(thresh_val * mad, axis=1)
249
+ mad = dev.abs().groupby(level=1).rolling(self.window_size).median().droplevel(0)
250
+ upper = med.add(self.thresh_val * mad, axis=1)
251
+ lower = med.subtract(self.thresh_val * mad, axis=1)
329
252
 
330
253
  # outliers
331
- out_df = df[(df0 > upper) | (df0 < lower)]
332
- filt_df = df[(df0 < upper) & (df0 > lower)]
254
+ out_df = self.df[(df0 > upper) | (df0 < lower)]
255
+ filt_df = self.df[(df0 < upper) & (df0 > lower)]
333
256
 
334
- # log
335
- if log:
257
+ # log to original scale
258
+ if self.log:
336
259
  med = np.exp(med)
337
260
 
338
261
  # type conversion
@@ -340,112 +263,73 @@ class OutlierDetection:
340
263
  out_df = out_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
341
264
  filt_df = filt_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
342
265
 
266
+ self.yhat = med.sort_index()
267
+ self.outliers = out_df.sort_index()
268
+ self.filtered_df = filt_df.sort_index()
269
+
343
270
  # plot
344
- if plot:
345
- if not isinstance(plot_series, tuple):
271
+ if self.plot:
272
+ if not isinstance(self.plot_series, tuple):
346
273
  raise TypeError(
347
274
  "Plot_series must be a tuple specifying the ticker and column/field to "
348
275
  "plot (ticker, column)."
349
276
  )
350
277
  else:
351
- self.plot_outliers(out_df, plot_series=plot_series)
352
-
353
- outliers_dict = {
354
- "yhat": med.sort_index(),
355
- "outliers": out_df.sort_index(),
356
- "filt_vals": filt_df.sort_index(),
357
- }
278
+ self.plot_outliers()
358
279
 
359
- return outliers_dict
280
+ return self.filtered_df
360
281
 
361
- def z_score(
362
- self,
363
- log: bool = True,
364
- window_size: int = 7,
365
- model_type: str = "estimation",
366
- thresh_val: int = 2,
367
- plot: bool = False,
368
- plot_series: tuple = ("BTC", "close"),
369
- ) -> Dict[str, pd.DataFrame]:
282
+ def z_score(self) -> pd.DataFrame:
370
283
  """
371
284
  Detects outliers using a z-score method, aka simple moving average.
372
285
 
373
- Parameters
374
- ----------
375
- log: bool, default True
376
- Converts series into log of series.
377
- window_size: int, default 7
378
- Number of observations in the rolling window.
379
- model_type: str, {'estimation', 'prediction'}, default 'estimation'
380
- Estimation models use past, current and future values to estimate the expected value of a series,
381
- e.g. expected x_t of series x at time t uses values from [x_t-s, x_t+s].
382
- Prediction models use only past and current values to estimate the expected value of a series,
383
- e.g. expected x_t of series x at time t uses values from [x_t-s, x_t].
384
- thresh_val: int, default 2
385
- Value for upper and lower thresholds used in outlier detection.
386
- plot: bool, default False
387
- Plots series with outliers highlighted with red dots.
388
- plot_series: tuple, default ('BTC', 'close')
389
- Plots the time series of a specific ticker/field combination (tuple).
390
-
391
286
  Returns
392
287
  -------
393
- outliers_dict: Dictionary of pd.DataFrame - MultiIndex
394
- Dictionary of forecasts (yhat), outliers (outliers) and filtered values (filt_vals) multiindex dataframes
395
- with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
396
- values.
397
-
288
+ filtered_df: pd.DataFrame - MultiIndex
289
+ Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
398
290
  """
399
291
  # sort index and create copy
400
- df = self.raw_df.sort_index(level=1).copy()
401
-
402
- # log
403
- if log:
404
- df0 = np.log(df)
405
- else:
406
- df0 = df
292
+ df0 = self.df.sort_index(level=1).copy()
407
293
 
408
294
  # compute rolling mean and std for estimation and prediction models
409
- if model_type == "estimation":
295
+ if self.model_type == "estimation":
410
296
  roll_mean = (
411
297
  df0.groupby(level=1)
412
- .shift(-1 * int((window_size + 1) / 2))
298
+ .shift(-1 * int((self.window_size + 1) / 2))
413
299
  .sort_index(level=1)
414
- .rolling(window_size, min_periods=1)
300
+ .rolling(self.window_size, min_periods=1)
415
301
  .mean()
416
302
  )
417
303
  roll_std = (
418
304
  df0.groupby(level=1)
419
- .shift(-1 * int((window_size + 1) / 2))
305
+ .shift(-1 * int((self.window_size + 1) / 2))
420
306
  .sort_index(level=1)
421
- .rolling(window_size, min_periods=1)
307
+ .rolling(self.window_size, min_periods=1)
422
308
  .std()
423
309
  )
424
310
  else:
425
311
  roll_mean = (
426
312
  df0.groupby(level=1)
427
- .rolling(window_size, min_periods=1)
313
+ .rolling(self.window_size, min_periods=1)
428
314
  .mean()
429
315
  .droplevel(0)
430
316
  )
431
317
  roll_std = (
432
318
  df0.groupby(level=1)
433
- .rolling(window_size, min_periods=1)
319
+ .rolling(self.window_size, min_periods=1)
434
320
  .std()
435
321
  .droplevel(0)
436
322
  )
437
323
 
438
324
  # compute z-score and upper/lower thresh
439
325
  z = (df0 - roll_mean) / roll_std
440
- upper = thresh_val
441
- lower = thresh_val * -1
442
326
 
443
327
  # outliers
444
- out_df = df[(z > upper) | (z < lower)]
445
- filt_df = df[(z < upper) & (z > lower)]
328
+ out_df = self.df[z.abs() > self.thresh_val]
329
+ filt_df = self.df[z.abs() < self.thresh_val]
446
330
 
447
- # log
448
- if log:
331
+ # log to original scale
332
+ if self.log:
449
333
  roll_mean = np.exp(roll_mean)
450
334
 
451
335
  # type conversion
@@ -453,80 +337,47 @@ class OutlierDetection:
453
337
  out_df = out_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
454
338
  filt_df = filt_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
455
339
 
340
+ self.yhat = roll_mean.sort_index()
341
+ self.outliers = out_df.sort_index()
342
+ self.filtered_df = filt_df.sort_index()
343
+
456
344
  # plot
457
- if plot:
458
- if not isinstance(plot_series, tuple):
345
+ if self.plot:
346
+ if not isinstance(self.plot_series, tuple):
459
347
  raise TypeError(
460
348
  "Plot_series must be a tuple specifying the ticker and column/field to "
461
349
  "plot (ticker, column)."
462
350
  )
463
351
  else:
464
- self.plot_outliers(out_df, plot_series=plot_series)
465
-
466
- outliers_dict = {
467
- "yhat": roll_mean.sort_index(),
468
- "outliers": out_df.sort_index(),
469
- "filt_vals": filt_df.sort_index(),
470
- }
352
+ self.plot_outliers()
471
353
 
472
- return outliers_dict
354
+ return self.filtered_df
473
355
 
474
- def ewma(
475
- self,
476
- log: bool = True,
477
- window_size: int = 7,
478
- thresh_val: int = 1.5,
479
- plot: bool = False,
480
- plot_series: tuple = ("BTC", "close"),
481
- ) -> Dict[str, pd.DataFrame]:
356
+ def ewma(self) -> pd.DataFrame:
482
357
  """
483
358
  Detects outliers using an exponential moving average method.
484
359
 
485
- Parameters
486
- ----------
487
- log: bool, default True
488
- Converts series into log of series.
489
- window_size: int, default 7
490
- Number of observations in the rolling window.
491
- thresh_val: int, default 1.5
492
- Value for upper and lower thresholds used in outlier detection.
493
- plot: bool, default False
494
- Plots series with outliers highlighted with red dots.
495
- plot_series: tuple, default ('BTC', 'close')
496
- Plots the time series of a specific ticker/field combination (tuple).
497
-
498
360
  Returns
499
361
  -------
500
- outliers_dict: Dictionary of pd.DataFrame - MultiIndex
501
- Dictionary of forecasts (yhat), outliers (outliers) and filtered values (filt_vals) multiindex dataframes
502
- with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
503
- values.
504
-
362
+ filtered_df: pd.DataFrame - MultiIndex
363
+ Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
505
364
  """
506
365
  # sort index and create copy
507
- df = self.raw_df.sort_index(level=1).copy()
508
-
509
- # log
510
- if log:
511
- df0 = np.log(df)
512
- else:
513
- df0 = df
366
+ df0 = self.df.sort_index(level=1).copy()
514
367
 
515
368
  # compute ew ma and std for estimation and prediction models
516
- ewma = df0.groupby(level=1).ewm(span=window_size).mean().droplevel(0)
517
- ewstd = df0.groupby(level=1).ewm(span=window_size).std().droplevel(0)
369
+ ewma = df0.groupby(level=1).ewm(span=self.window_size).mean().droplevel(0)
370
+ ewstd = df0.groupby(level=1).ewm(span=self.window_size).std().droplevel(0)
518
371
 
519
372
  # compute z-score and upper/lower thresh
520
373
  z = (df0 - ewma) / ewstd
521
- upper = thresh_val
522
- lower = thresh_val * -1
523
374
 
524
375
  # outliers
525
- out_df = df[(z > upper) | (z < lower)]
526
- filt_df = df[(z < upper) & (z > lower)]
376
+ out_df = self.df[z.abs() > self.thresh_val]
377
+ filt_df = self.df[z.abs() < self.thresh_val]
527
378
 
528
- # log
529
- if log:
379
+ # log to original scale
380
+ if self.log:
530
381
  ewma = np.exp(ewma)
531
382
 
532
383
  # type conversion
@@ -534,35 +385,29 @@ class OutlierDetection:
534
385
  out_df = out_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
535
386
  filt_df = filt_df.apply(pd.to_numeric, errors='coerce').convert_dtypes()
536
387
 
388
+ self.yhat = ewma.sort_index()
389
+ self.outliers = out_df.sort_index()
390
+ self.filtered_df = filt_df.sort_index()
391
+
537
392
  # plot
538
- if plot:
539
- if not isinstance(plot_series, tuple):
393
+ if self.plot:
394
+ if not isinstance(self.plot_series, tuple):
540
395
  raise TypeError(
541
396
  "Plot_series must be a tuple specifying the ticker and column/field to "
542
397
  "plot (ticker, column)."
543
398
  )
544
399
  else:
545
- self.plot_outliers(out_df, plot_series=plot_series)
400
+ self.plot_outliers()
546
401
 
547
- outliers_dict = {
548
- "yhat": ewma.sort_index(),
549
- "outliers": out_df.sort_index(),
550
- "filt_vals": filt_df.sort_index(),
551
- }
552
-
553
- return outliers_dict
402
+ return self.filtered_df
554
403
 
555
404
  def seasonal_decomp(
556
405
  self,
557
- log: bool = True,
558
- thresh_val: int = 5,
559
406
  period: int = 7,
560
407
  model: str = "additive",
561
408
  filt: Optional[np.array] = None,
562
409
  two_sided: Optional[bool] = True,
563
410
  extrapolate_trend: Optional[int] = 0,
564
- plot: bool = False,
565
- plot_series: tuple = ("BTC", "close"),
566
411
  ) -> Dict[str, pd.DataFrame]:
567
412
  """
568
413
  Detects outliers with seasonal decomposition moving averages from statsmodels.
@@ -571,10 +416,6 @@ class OutlierDetection:
571
416
 
572
417
  Parameters
573
418
  ----------
574
- log: bool, default True
575
- Converts series into log of series.
576
- thresh_val: int, default 5
577
- Value for upper and lower thresholds used in outlier detection.
578
419
  period: int, optional, default 7
579
420
  periodicity of the sequence.
580
421
  model: str, {'additive', 'multiplicative'}, default 'additive'
@@ -590,29 +431,16 @@ class OutlierDetection:
590
431
  on both ends (or the single one if two_sided is False) considering this many (+1) closest points.
591
432
  If set to ‘freq’, use freq closest points. Setting this parameter results in no NaN values in trend
592
433
  or resid components.
593
- plot: bool, default False
594
- Plots series with outliers highlighted with red dots.
595
- plot_series: tuple, default ('BTC', 'close')
596
- Plots the time series of a specific (ticker, field/column) tuple.
597
434
 
598
435
  Returns
599
436
  -------
600
- outliers_dict: Dictionary of pd.DataFrame - MultiIndex
601
- Dictionary of forecasts (yhat), outliers (outliers) and filtered values (filt_vals) multiindex dataframes
602
- with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
603
- values.
604
-
437
+ filtered_df: pd.DataFrame - MultiIndex
438
+ Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
605
439
  """
606
440
  # unstack
607
- df = self.raw_df.unstack().copy()
441
+ df0 = self.df.unstack().copy()
608
442
  # original idx, unstacked idx
609
- mult_idx, idx = self.raw_df.index, df.index
610
-
611
- # log
612
- if log:
613
- df0 = np.log(df)
614
- else:
615
- df0 = df
443
+ mult_idx, idx = self.raw_df.index, self.df.unstack().index
616
444
 
617
445
  # store resid dfs in dict
618
446
  resid_dict, yhat_dict = {}, {}
@@ -653,13 +481,13 @@ class OutlierDetection:
653
481
  # convert dict to multiindex
654
482
  resid_df, yhat_df = pd.concat(resid_dict, axis=1), pd.concat(yhat_dict, axis=1)
655
483
 
656
- # log
657
- if log:
484
+ # log to original scale
485
+ if self.log:
658
486
  yhat_df = np.exp(yhat_df)
659
487
 
660
488
  # filter outliers
661
- out_df = df[resid_df.abs() > thresh_val]
662
- filt_df = df[resid_df.abs() < thresh_val]
489
+ out_df = self.df.unstack()[resid_df.abs() > self.thresh_val]
490
+ filt_df = self.df.unstack()[resid_df.abs() < self.thresh_val]
663
491
 
664
492
  # stack and reindex
665
493
  out_df = out_df.stack().reindex(mult_idx)
@@ -671,28 +499,24 @@ class OutlierDetection:
671
499
  out_df = out_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
672
500
  filt_df = filt_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
673
501
 
502
+ self.yhat = yhat_df.sort_index()
503
+ self.outliers = out_df.sort_index()
504
+ self.filtered_df = filt_df.sort_index()
505
+
674
506
  # plot
675
- if plot:
676
- if not isinstance(plot_series, tuple):
507
+ if self.plot:
508
+ if not isinstance(self.plot_series, tuple):
677
509
  raise TypeError(
678
510
  "Plot_series must be a tuple specifying the ticker and column/field to "
679
511
  "plot (ticker, column)."
680
512
  )
681
513
  else:
682
- self.plot_outliers(out_df, plot_series=plot_series)
683
-
684
- outliers_dict = {
685
- "yhat": yhat_df.sort_index(),
686
- "outliers": out_df.sort_index(),
687
- "filt_vals": filt_df.sort_index(),
688
- }
514
+ self.plot_outliers()
689
515
 
690
- return outliers_dict
516
+ return self.filtered_df
691
517
 
692
518
  def stl(
693
519
  self,
694
- log: bool = True,
695
- thresh_val: int = 5,
696
520
  period: Optional[int] = 7,
697
521
  seasonal: Optional[int] = 7,
698
522
  trend: Optional[int] = None,
@@ -704,9 +528,7 @@ class OutlierDetection:
704
528
  seasonal_jump: Optional[int] = 1,
705
529
  trend_jump: Optional[int] = 1,
706
530
  low_pass_jump: Optional[int] = 1,
707
- plot: bool = False,
708
- plot_series: tuple = ("BTC", "close"),
709
- ) -> Dict[str, pd.DataFrame]:
531
+ ) -> pd.DataFrame:
710
532
  """
711
533
  Detects outliers with seasonal decomposition moving averages from statsmodels.
712
534
 
@@ -714,10 +536,6 @@ class OutlierDetection:
714
536
 
715
537
  Parameters
716
538
  ----------
717
- log: bool, default True
718
- Converts series into log of series.
719
- thresh_val: int, default 5
720
- Value for upper and lower thresholds used in outlier detection.
721
539
  period: int, optional, default 7
722
540
  Periodicity of the sequence.
723
541
  seasonal: int, optional, default 7
@@ -749,35 +567,23 @@ class OutlierDetection:
749
567
  Positive integer determining the linear interpolation step. If larger than 1,
750
568
  the LOESS is used every low_pass_jump points and values between the two are linearly interpolated.
751
569
  Higher values reduce estimation time.
752
- plot: bool, default False
753
- Plots series with outliers highlighted with red dots.
754
- plot_series: tuple, default ('BTC', 'close')
755
- Plots the time series of a specific (ticker, field/column) tuple.
756
570
 
757
571
  Returns
758
572
  -------
759
- outliers_dict: Dictionary of pd.DataFrame - MultiIndex
760
- Dictionary of forecasts (yhat), outliers (outliers) and filtered values (filt_vals) multiindex dataframes
761
- with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
762
- values.
763
-
573
+ filtered_df: pd.DataFrame - MultiIndex
574
+ Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
764
575
  """
765
576
  # unstack
766
- df = self.raw_df.unstack().copy()
577
+ df0 = self.df.unstack().copy()
767
578
  # original idx, unstacked idx
768
- mult_idx, idx = self.raw_df.index, df.index
769
-
770
- # log
771
- if log:
772
- df0 = np.log(df)
773
- else:
774
- df0 = df
579
+ mult_idx, idx = self.df.index, self.df.unstack().index
775
580
 
776
581
  # store resid dfs in dict
777
582
  resid_dict, yhat_dict = {}, {}
778
583
  for field in df0.columns.get_level_values(0).unique():
779
584
  resid_df, yhat_df = pd.DataFrame(index=idx), pd.DataFrame(index=idx)
780
585
  for ticker in df0[field].columns:
586
+
781
587
  # decompose
782
588
  res = STL(
783
589
  df0[field][ticker].dropna(),
@@ -818,13 +624,13 @@ class OutlierDetection:
818
624
  # convert dict to multiindex
819
625
  resid_df, yhat_df = pd.concat(resid_dict, axis=1), pd.concat(yhat_dict, axis=1)
820
626
 
821
- # log
822
- if log:
627
+ # log to original scale
628
+ if self.log:
823
629
  yhat_df = np.exp(yhat_df)
824
630
 
825
631
  # filter outliers
826
- out_df = df[resid_df.abs() > thresh_val]
827
- filt_df = df[resid_df.abs() < thresh_val]
632
+ out_df = self.df.unstack()[resid_df.abs() > self.thresh_val]
633
+ filt_df = self.df.unstack()[resid_df.abs() < self.thresh_val]
828
634
 
829
635
  # stack and reindex
830
636
  out_df = out_df.stack().reindex(mult_idx)
@@ -836,64 +642,41 @@ class OutlierDetection:
836
642
  out_df = out_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
837
643
  filt_df = filt_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
838
644
 
645
+ self.yhat = yhat_df.sort_index()
646
+ self.outliers = out_df.sort_index()
647
+ self.filtered_df = filt_df.sort_index()
648
+
839
649
  # plot
840
- if plot:
841
- if not isinstance(plot_series, tuple):
650
+ if self.plot:
651
+ if not isinstance(self.plot_series, tuple):
842
652
  raise TypeError(
843
653
  "Plot_series must be a tuple specifying the ticker and column/field to "
844
654
  "plot (ticker, column)."
845
655
  )
846
656
  else:
847
- self.plot_outliers(out_df, plot_series=plot_series)
657
+ self.plot_outliers()
848
658
 
849
- outliers_dict = {
850
- "yhat": yhat_df.sort_index(),
851
- "outliers": out_df.sort_index(),
852
- "filt_vals": filt_df.sort_index(),
853
- }
659
+ return self.filtered_df
854
660
 
855
- return outliers_dict
856
-
857
- def prophet(
858
- self,
859
- log: bool = True,
860
- interval_width: Optional[float] = 0.99,
861
- plot: bool = False,
862
- plot_series: tuple = ("BTC", "close"),
863
- ) -> Dict[str, pd.DataFrame]:
661
+ def prophet(self, interval_width: Optional[float] = 0.999) -> pd.DataFrame:
864
662
  """
865
663
  Detects outliers using Prophet, a time series forecasting algorithm published by Facebook.
866
664
 
867
665
  Parameters
868
666
  ----------
869
- log: bool, default True
870
- Converts series into log of series.
871
667
  interval_width: float, optional, default 0.99
872
668
  Uncertainty interval estimated by Monte Carlo simulation. The larger the value,
873
669
  the larger the upper/lower thresholds interval for outlier detection.
874
- plot: bool, default False
875
- Plots series with outliers highlighted with red dots.
876
- plot_series: tuple, default ('BTC', 'close')
877
- Plots the time series of a specific (ticker, field/column) tuple.
878
670
 
879
671
  Returns
880
672
  -------
881
- outliers_dict: Dictionary of pd.DataFrame - MultiIndex
882
- Dictionary of forecasts (yhat), outliers (outliers) and filtered values (filt_vals) multiindex dataframes
883
- with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
884
- values.
885
-
673
+ filtered_df: pd.DataFrame - MultiIndex
674
+ Filtered dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
886
675
  """
887
676
  # unstack
888
- df = self.raw_df.unstack().copy()
677
+ df0 = self.raw_df.unstack().copy()
889
678
  # original idx, unstacked idx
890
- mult_idx, idx = self.raw_df.index, df.index
891
-
892
- # log
893
- if log:
894
- df0 = np.log(df)
895
- else:
896
- df0 = df
679
+ mult_idx, idx = self.raw_df.index, df0.index
897
680
 
898
681
  # store predictions for fields dfs in dict
899
682
  upper_dict, lower_dict, yhat_dict = {}, {}, {}
@@ -949,15 +732,15 @@ class OutlierDetection:
949
732
  yhat.columns.names = [None, "ticker"]
950
733
 
951
734
  # transform log
952
- if log:
735
+ if self.log:
953
736
  yhat_upper = np.exp(yhat_upper)
954
737
  yhat_lower = np.exp(yhat_lower)
955
738
  yhat = np.exp(yhat)
956
739
 
957
740
  # filter outliers
958
741
  yhat_df = yhat
959
- out_df = df[df.gt(yhat_upper) | df.lt(yhat_lower)]
960
- filt_df = df[df.lt(yhat_upper) & df.gt(yhat_lower)]
742
+ out_df = self.df.unstack()[self.df.unstack().gt(yhat_upper) | self.df.unstack().lt(yhat_lower)]
743
+ filt_df = self.df.unstack()[self.df.unstack().lt(yhat_upper) & self.df.unstack().gt(yhat_lower)]
961
744
 
962
745
  # stack and reindex
963
746
  yhat_df = yhat_df.stack().reindex(mult_idx)
@@ -969,47 +752,35 @@ class OutlierDetection:
969
752
  out_df = out_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
970
753
  filt_df = filt_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
971
754
 
755
+ self.yhat = yhat_df.sort_index()
756
+ self.outliers = out_df.sort_index()
757
+ self.filtered_df = filt_df.sort_index()
758
+
972
759
  # plot
973
- if plot:
974
- if not isinstance(plot_series, tuple):
760
+ if self.plot:
761
+ if not isinstance(self.plot_series, tuple):
975
762
  raise TypeError(
976
763
  "Plot_series must be a tuple specifying the ticker and column/field to "
977
764
  "plot (ticker, column)."
978
765
  )
979
766
  else:
980
- self.plot_outliers(out_df, plot_series=plot_series)
981
-
982
- outliers_dict = {
983
- "yhat": yhat_df.sort_index(),
984
- "outliers": out_df.sort_index(),
985
- "filt_vals": filt_df.sort_index(),
986
- }
767
+ self.plot_outliers()
987
768
 
988
- return outliers_dict
769
+ return self.filtered_df
989
770
 
990
- def plot_outliers(
991
- self, outliers_df: pd.DataFrame, plot_series: Optional[tuple] = None
992
- ) -> None:
771
+ def plot_outliers(self) -> None:
993
772
  """
994
773
  Plots time series with outliers highlighted (red dots).
995
-
996
- Parameters
997
- ----------
998
- outliers_df: pd.DataFrame - MultiIndex
999
- Dataframe MultiIndex with DatetimeIndex (level 0), tickers (level 1) and fields (cols) outlier values.
1000
- plot_series: tuple, optional, default None
1001
- Plots the time series of a specific (ticker, field) tuple.
1002
-
1003
774
  """
1004
775
  ax = (
1005
- self.raw_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
776
+ self.df.loc[pd.IndexSlice[:, self.plot_series[0]], self.plot_series[1]]
1006
777
  .droplevel(1)
1007
778
  .plot(linewidth=1, figsize=(15, 7), color="#1f77b4", zorder=0)
1008
779
  )
1009
- outliers_df.unstack()[plot_series[1]].reset_index().plot(
780
+ self.outliers.unstack()[self.plot_series[1]].reset_index().plot(
1010
781
  kind="scatter",
1011
782
  x="date",
1012
- y=plot_series[0],
783
+ y=self.plot_series[0],
1013
784
  color="#E64E53",
1014
785
  ax=ax,
1015
786
  label="outliers",
@@ -1019,5 +790,5 @@ class OutlierDetection:
1019
790
  ax.ticklabel_format(style="plain", axis="y")
1020
791
  ax.set_facecolor("whitesmoke")
1021
792
  ax.legend(
1022
- [plot_series[1] + "_raw", plot_series[1] + "_outliers"], loc="upper left"
793
+ [self.plot_series[1] + "_raw", self.plot_series[1] + "_outliers"], loc="upper left"
1023
794
  )