cryptodatapy 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cryptodatapy/conf/fx_tickers.csv +31 -0
- cryptodatapy/transform/clean.py +171 -172
- cryptodatapy/transform/clean_perp_futures_ohlcv.ipynb +1025 -0
- cryptodatapy/transform/filter.py +83 -142
- cryptodatapy/transform/impute.py +36 -83
- cryptodatapy/transform/od.py +221 -450
- {cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/METADATA +4 -1
- {cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/RECORD +10 -8
- {cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/LICENSE +0 -0
- {cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/WHEEL +0 -0
cryptodatapy/transform/filter.py
CHANGED
@@ -7,12 +7,13 @@ import pandas as pd
|
|
7
7
|
class Filter:
|
8
8
|
"""
|
9
9
|
Filters dataframe in tidy format.
|
10
|
-
|
11
10
|
"""
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
11
|
+
def __init__(self,
|
12
|
+
raw_df: pd.DataFrame,
|
13
|
+
excl_cols: Optional[Union[str, list]] = None,
|
14
|
+
plot: bool = False,
|
15
|
+
plot_series: tuple = ("BTC", "close")
|
16
|
+
):
|
16
17
|
"""
|
17
18
|
Constructor
|
18
19
|
|
@@ -22,64 +23,18 @@ class Filter:
|
|
22
23
|
Dataframe with raw data. DatetimeIndex (level 0), ticker (level 1) and raw data (cols), in tidy format.
|
23
24
|
excl_cols: str or list, default None
|
24
25
|
Name of columns to exclude from filtering
|
25
|
-
|
26
26
|
"""
|
27
|
-
|
28
27
|
self.raw_df = raw_df
|
29
28
|
self.excl_cols = excl_cols
|
30
|
-
|
31
|
-
|
32
|
-
self
|
33
|
-
|
34
|
-
plot: bool = False,
|
35
|
-
plot_series: tuple = ("BTC", "close"),
|
36
|
-
) -> pd.DataFrame:
|
37
|
-
"""
|
38
|
-
Filters outliers, replacing them with NaNs.
|
39
|
-
|
40
|
-
Parameters
|
41
|
-
----------
|
42
|
-
outliers_dict: Dictionary of pd.DataFrame - MultiIndex
|
43
|
-
Dictionary of forecasts (yhat), outliers (outliers) and filtered values (filt_vals) multiindex dataframes
|
44
|
-
with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
|
45
|
-
values.
|
46
|
-
plot: bool, default False
|
47
|
-
Plots series with outliers highlighted with red dots.
|
48
|
-
plot_series: tuple, default ('BTC', 'close')
|
49
|
-
Plots the time series of a specific (ticker, field/column) tuple.
|
50
|
-
|
51
|
-
Returns
|
52
|
-
-------
|
53
|
-
filt_df: DataFrame - MultiIndex
|
54
|
-
Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
|
55
|
-
|
56
|
-
"""
|
57
|
-
# filter outliers
|
58
|
-
filt_df = outliers_dict["filt_vals"]
|
59
|
-
|
60
|
-
# add excl cols
|
61
|
-
if self.excl_cols is not None:
|
62
|
-
filt_df = pd.concat(
|
63
|
-
[filt_df, self.raw_df[self.excl_cols]], join="outer", axis=1
|
64
|
-
)
|
65
|
-
|
66
|
-
# plot
|
67
|
-
if plot:
|
68
|
-
if not isinstance(plot_series, tuple):
|
69
|
-
raise TypeError(
|
70
|
-
"Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
|
71
|
-
)
|
72
|
-
else:
|
73
|
-
self.plot_filtered(filt_df, plot_series=plot_series)
|
74
|
-
|
75
|
-
return filt_df
|
29
|
+
self.plot = plot
|
30
|
+
self.plot_series = plot_series
|
31
|
+
self.df = raw_df.copy() if excl_cols is None else raw_df.drop(columns=excl_cols).copy()
|
32
|
+
self.filtered_df = None
|
76
33
|
|
77
34
|
def avg_trading_val(
|
78
35
|
self,
|
79
36
|
thresh_val: int = 10000000,
|
80
37
|
window_size: int = 30,
|
81
|
-
plot: bool = False,
|
82
|
-
plot_series: tuple = ("BTC", "close"),
|
83
38
|
) -> pd.DataFrame:
|
84
39
|
"""
|
85
40
|
Filters values below a threshold of average trading value (price * volume/size in quote currency) over some
|
@@ -91,35 +46,24 @@ class Filter:
|
|
91
46
|
Threshold/cut-off for avg trading value.
|
92
47
|
window_size: int, default 30
|
93
48
|
Size of rolling window.
|
94
|
-
plot: bool, default False
|
95
|
-
Plots series with outliers highlighted with red dots.
|
96
|
-
plot_series: tuple, default ('BTC', 'close')
|
97
|
-
Plots the time series of a specific (ticker, field/column) tuple.
|
98
49
|
|
99
50
|
Returns
|
100
51
|
-------
|
101
|
-
|
52
|
+
filtered_df: DataFrame - MultiIndex
|
102
53
|
Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with values below the
|
103
54
|
threshold removed.
|
104
|
-
|
105
55
|
"""
|
106
|
-
# convert string to list
|
107
|
-
if self.excl_cols is not None:
|
108
|
-
df = self.raw_df.drop(columns=self.excl_cols).copy()
|
109
|
-
else:
|
110
|
-
df = self.raw_df.copy()
|
111
|
-
|
112
56
|
# compute traded val
|
113
|
-
if "close" in df.columns and "volume" in df.columns:
|
114
|
-
df["trading_val"] = df.close * df.volume
|
115
|
-
elif ("bid" in df.columns and "ask" in df.columns) and (
|
116
|
-
"bid_size" in df.columns and "ask_size" in df.columns
|
57
|
+
if "close" in self.df.columns and "volume" in self.df.columns:
|
58
|
+
self.df["trading_val"] = self.df.close * self.df.volume
|
59
|
+
elif ("bid" in self.df.columns and "ask" in self.df.columns) and (
|
60
|
+
"bid_size" in self.df.columns and "ask_size" in self.df.columns
|
117
61
|
):
|
118
|
-
df["trading_val"] = ((df.bid + df.ask) / 2) * (
|
119
|
-
(df.bid_size + df.ask_size) / 2
|
62
|
+
self.df["trading_val"] = ((self.df.bid + self.df.ask) / 2) * (
|
63
|
+
(self.df.bid_size + self.df.ask_size) / 2
|
120
64
|
)
|
121
|
-
elif "trade_size" in df.columns and "trade_price" in df.columns:
|
122
|
-
df["trading_val"] = df.trade_price * df.trade_size
|
65
|
+
elif "trade_size" in self.df.columns and "trade_price" in self.df.columns:
|
66
|
+
self.df["trading_val"] = self.df.trade_price * self.df.trade_size
|
123
67
|
else:
|
124
68
|
raise Exception(
|
125
69
|
"Dataframe must include at least one price series (e.g. close price, trade price, "
|
@@ -127,36 +71,29 @@ class Filter:
|
|
127
71
|
)
|
128
72
|
|
129
73
|
# compute rolling mean/avg
|
130
|
-
df1 = df.groupby(level=1).rolling(window_size).mean().droplevel(0)
|
74
|
+
df1 = self.df.groupby(level=1).rolling(window_size).mean().droplevel(0)
|
131
75
|
# divide by thresh
|
132
76
|
df1 = df1 / thresh_val
|
133
77
|
# filter df1
|
134
|
-
|
135
|
-
df.loc[df1.trading_val > 1].reindex(df.index).drop(columns="trading_val")
|
136
|
-
)
|
137
|
-
# add excl cols
|
138
|
-
if self.excl_cols is not None:
|
139
|
-
filt_df = pd.concat(
|
140
|
-
[filt_df, self.raw_df[self.excl_cols]], join="outer", axis=1
|
141
|
-
)
|
78
|
+
self.filtered_df = self.df.loc[df1.trading_val > 1].reindex(self.df.index).drop(columns="trading_val")
|
142
79
|
|
143
80
|
# plot
|
144
|
-
if plot:
|
145
|
-
if not isinstance(plot_series, tuple):
|
81
|
+
if self.plot:
|
82
|
+
if not isinstance(self.plot_series, tuple):
|
146
83
|
raise TypeError(
|
147
84
|
"Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
|
148
85
|
)
|
149
86
|
else:
|
150
|
-
self.plot_filtered(
|
87
|
+
self.plot_filtered(plot_series=self.plot_series)
|
151
88
|
|
152
|
-
|
89
|
+
# add excl cols
|
90
|
+
if self.excl_cols is not None:
|
91
|
+
self.filtered_df = pd.concat([self.filtered_df,
|
92
|
+
self.raw_df[self.excl_cols].reindex(self.filtered_df.index)], axis=1)
|
153
93
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
plot: bool = False,
|
158
|
-
plot_series: tuple = ("BTC", "close"),
|
159
|
-
) -> pd.DataFrame:
|
94
|
+
return self.filtered_df
|
95
|
+
|
96
|
+
def missing_vals_gaps(self, gap_window: int = 30) -> pd.DataFrame:
|
160
97
|
"""
|
161
98
|
Filters values before a large gap of missing values, replacing them with NaNs.
|
162
99
|
|
@@ -164,27 +101,16 @@ class Filter:
|
|
164
101
|
----------
|
165
102
|
gap_window: int, default 30
|
166
103
|
Size of window where all values are missing (NaNs).
|
167
|
-
plot: bool, default False
|
168
|
-
Plots series with outliers highlighted with red dots.
|
169
|
-
plot_series: tuple, default ('BTC', 'close')
|
170
|
-
Plots the time series of a specific (ticker, field/column) tuple.
|
171
104
|
|
172
105
|
Returns
|
173
106
|
-------
|
174
|
-
|
107
|
+
filtered_df: DataFrame - MultiIndex
|
175
108
|
Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with values before
|
176
109
|
missing values gaps removed.
|
177
|
-
|
178
110
|
"""
|
179
|
-
# convert string to list
|
180
|
-
if self.excl_cols is not None:
|
181
|
-
df = self.raw_df.drop(columns=self.excl_cols).copy()
|
182
|
-
else:
|
183
|
-
df = self.raw_df.copy()
|
184
|
-
|
185
111
|
# window obs count
|
186
112
|
window_count = (
|
187
|
-
df.groupby(level=1)
|
113
|
+
self.df.groupby(level=1)
|
188
114
|
.rolling(window=gap_window, min_periods=gap_window)
|
189
115
|
.count()
|
190
116
|
.droplevel(0)
|
@@ -194,24 +120,25 @@ class Filter:
|
|
194
120
|
for col in gap.unstack().columns:
|
195
121
|
start_idx = gap.unstack()[col].last_valid_index()
|
196
122
|
if start_idx is not None:
|
197
|
-
df.loc[pd.IndexSlice[:start_idx, col[1]], col[0]] = np.nan
|
198
|
-
|
199
|
-
# add excl cols
|
200
|
-
if self.excl_cols is not None:
|
201
|
-
filt_df = pd.concat([df, self.raw_df[self.excl_cols]], join="outer", axis=1)
|
202
|
-
else:
|
203
|
-
filt_df = df
|
123
|
+
self.df.loc[pd.IndexSlice[:start_idx, col[1]], col[0]] = np.nan
|
204
124
|
|
205
125
|
# plot
|
206
|
-
if plot:
|
207
|
-
if not isinstance(plot_series, tuple):
|
126
|
+
if self.plot:
|
127
|
+
if not isinstance(self.plot_series, tuple):
|
208
128
|
raise TypeError(
|
209
129
|
"Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
|
210
130
|
)
|
211
131
|
else:
|
212
|
-
self.plot_filtered(
|
132
|
+
self.plot_filtered(plot_series=self.plot_series)
|
133
|
+
|
134
|
+
# add excl cols
|
135
|
+
if self.excl_cols is not None:
|
136
|
+
self.filtered_df = pd.concat([self.df,
|
137
|
+
self.raw_df[self.excl_cols].reindex(self.df)], axis=1)
|
138
|
+
else:
|
139
|
+
self.filtered_df = self.df
|
213
140
|
|
214
|
-
return
|
141
|
+
return self.filtered_df
|
215
142
|
|
216
143
|
def min_nobs(self, ts_obs=100, cs_obs=1) -> pd.DataFrame:
|
217
144
|
"""
|
@@ -227,25 +154,47 @@ class Filter:
|
|
227
154
|
|
228
155
|
Returns
|
229
156
|
-------
|
230
|
-
|
157
|
+
filtered_df: DataFrame - MultiIndex
|
231
158
|
Filtered dataFrame with DatetimeIndex (level 0), tickers with minimum number of observations (level 1)
|
232
159
|
and fields (cols).
|
233
|
-
|
234
160
|
"""
|
235
|
-
# create copy
|
236
|
-
df = self.raw_df.copy()
|
237
|
-
|
238
161
|
# drop tickers with nobs < ts_obs
|
239
|
-
obs = df.groupby(level=1).count().min(axis=1)
|
162
|
+
obs = self.df.groupby(level=1).count().min(axis=1)
|
240
163
|
drop_tickers_list = obs[obs < ts_obs].index.to_list()
|
241
|
-
|
164
|
+
self.filtered_df = self.df.drop(drop_tickers_list, level=1, axis=0)
|
242
165
|
|
243
166
|
# drop tickers with nobs < cs_obs
|
244
|
-
obs =
|
167
|
+
obs = self.filtered_df.groupby(level=0).count().min(axis=1)
|
245
168
|
idx_start = obs[obs > cs_obs].index[0]
|
246
|
-
|
169
|
+
self.filtered_df = self.filtered_df.loc[idx_start:]
|
247
170
|
|
248
|
-
return
|
171
|
+
return self.filtered_df
|
172
|
+
|
173
|
+
def remove_delisted(self, field: str = 'close', n_unch_vals: int = 30) -> pd.DataFrame:
|
174
|
+
"""
|
175
|
+
Removes delisted tickers from dataframe.
|
176
|
+
|
177
|
+
Parameters
|
178
|
+
----------
|
179
|
+
field: str, default 'close'
|
180
|
+
Field/column to use for detecting delisted tickers.
|
181
|
+
n_unch_vals: int, default 30
|
182
|
+
Number of consecutive unchanged values to consider a ticker as delisted.
|
183
|
+
|
184
|
+
Returns
|
185
|
+
-------
|
186
|
+
filtered_df: pd.DataFrame - MultiIndex
|
187
|
+
Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols).
|
188
|
+
"""
|
189
|
+
# delisted tickers
|
190
|
+
delisted_tickers = self.df[field].unstack()[self.df[field].unstack().pct_change().iloc[-n_unch_vals:] == 0].\
|
191
|
+
dropna(how='all', axis=0).dropna(thresh=n_unch_vals, axis=1).columns
|
192
|
+
print(delisted_tickers)
|
193
|
+
|
194
|
+
# drop delisted tickers
|
195
|
+
self.filtered_df = self.df.drop(delisted_tickers, level=1)
|
196
|
+
|
197
|
+
return self.filtered_df
|
249
198
|
|
250
199
|
def tickers(self, tickers_list) -> pd.DataFrame:
|
251
200
|
"""
|
@@ -259,37 +208,29 @@ class Filter:
|
|
259
208
|
|
260
209
|
Returns
|
261
210
|
-------
|
262
|
-
|
211
|
+
filtered_df: pd.DataFrame - MultiIndex
|
263
212
|
Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols).
|
264
|
-
|
265
213
|
"""
|
266
|
-
# create copy
|
267
|
-
df = self.raw_df.copy()
|
268
214
|
# tickers list
|
269
215
|
if isinstance(tickers_list, str):
|
270
216
|
tickers_list = [tickers_list]
|
217
|
+
|
271
218
|
# drop tickers
|
272
|
-
|
219
|
+
self.filtered_df = self.df.drop(tickers_list, level=1)
|
273
220
|
|
274
|
-
return
|
221
|
+
return self.filtered_df
|
275
222
|
|
276
|
-
|
277
|
-
def plot_filtered(
|
278
|
-
filt_df: pd.DataFrame, plot_series: Optional[tuple] = None
|
279
|
-
) -> None:
|
223
|
+
def plot_filtered(self, plot_series: Optional[tuple] = None) -> None:
|
280
224
|
"""
|
281
225
|
Plots filtered time series.
|
282
226
|
|
283
227
|
Parameters
|
284
228
|
----------
|
285
|
-
filt_df: pd.DataFrame - MultiIndex
|
286
|
-
Dataframe MultiIndex with DatetimeIndex (level 0), tickers (level 1) and filtered values (cols).
|
287
229
|
plot_series: tuple, optional, default None
|
288
230
|
Plots the time series of a specific (ticker, field) tuple.
|
289
|
-
|
290
231
|
"""
|
291
232
|
ax = (
|
292
|
-
|
233
|
+
self.filtered_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
|
293
234
|
.droplevel(1)
|
294
235
|
.plot(linewidth=1, figsize=(15, 7), color="#1f77b4", zorder=0)
|
295
236
|
)
|
cryptodatapy/transform/impute.py
CHANGED
@@ -7,67 +7,51 @@ import pandas as pd
|
|
7
7
|
class Impute:
|
8
8
|
"""
|
9
9
|
Handles missing values.
|
10
|
-
|
11
10
|
"""
|
12
|
-
|
13
|
-
def __init__(self, filt_df: pd.DataFrame):
|
14
|
-
|
11
|
+
def __init__(self, filtered_df: pd.DataFrame, plot: bool = False, plot_series: tuple = ("BTC", "close")):
|
15
12
|
"""
|
16
13
|
Constructor
|
17
14
|
|
18
15
|
Parameters
|
19
16
|
----------
|
20
|
-
|
17
|
+
filtered_df: pd.DataFrame - MultiIndex
|
21
18
|
DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with filtered values.
|
22
|
-
|
23
19
|
"""
|
24
|
-
self.
|
20
|
+
self.filtered_df = filtered_df.astype(float)
|
21
|
+
self.plot = plot
|
22
|
+
self.plot_series = plot_series
|
23
|
+
self.imputed_df = None
|
25
24
|
|
26
|
-
def fwd_fill(
|
27
|
-
self, plot: bool = False, plot_series: tuple = ("BTC", "close")
|
28
|
-
) -> pd.DataFrame:
|
25
|
+
def fwd_fill(self) -> pd.DataFrame:
|
29
26
|
"""
|
30
27
|
Imputes missing values by imputing missing values with latest non-missing values.
|
31
28
|
|
32
|
-
Parameters
|
33
|
-
----------
|
34
|
-
plot: bool, default False
|
35
|
-
Plots series with outliers highlighted with red dots.
|
36
|
-
plot_series: tuple, default ('BTC', 'close')
|
37
|
-
Plots the time series of a specific (ticker, field/column) tuple.
|
38
|
-
|
39
29
|
Returns
|
40
30
|
-------
|
41
|
-
|
31
|
+
imputed_df: pd.DataFrame - MultiIndex
|
42
32
|
DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values
|
43
33
|
using forward fill method.
|
44
|
-
|
45
34
|
"""
|
46
|
-
# copy df
|
47
|
-
filt_df = self.filt_df.copy()
|
48
|
-
|
49
35
|
# ffill
|
50
|
-
|
36
|
+
self.imputed_df = self.filtered_df.groupby(level=1).ffill()
|
51
37
|
|
52
38
|
# plot
|
53
|
-
if plot:
|
54
|
-
if not isinstance(plot_series, tuple):
|
39
|
+
if self.plot:
|
40
|
+
if not isinstance(self.plot_series, tuple):
|
55
41
|
raise TypeError(
|
56
42
|
"Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
|
57
43
|
)
|
58
44
|
else:
|
59
|
-
self.plot_imputed(
|
45
|
+
self.plot_imputed()
|
60
46
|
|
61
|
-
return
|
47
|
+
return self.imputed_df
|
62
48
|
|
63
49
|
def interpolate(
|
64
50
|
self,
|
65
51
|
method: str = "linear",
|
66
52
|
order: Optional[int] = None,
|
67
|
-
axis=0,
|
53
|
+
axis: int = 0,
|
68
54
|
limit: Optional[int] = None,
|
69
|
-
plot: bool = False,
|
70
|
-
plot_series: tuple = ("BTC", "close"),
|
71
55
|
) -> pd.DataFrame:
|
72
56
|
"""
|
73
57
|
Imputes missing values by interpolating using various methods.
|
@@ -83,116 +67,85 @@ class Impute:
|
|
83
67
|
Axis to interpolate along.
|
84
68
|
limit: int, optional, default None
|
85
69
|
Maximum number of consecutive NaNs to fill. Must be greater than 0.
|
86
|
-
plot: bool, default False
|
87
|
-
Plots series with outliers highlighted with red dots.
|
88
|
-
plot_series: tuple, default ('BTC', 'close')
|
89
|
-
Plots the time series of a specific (ticker, field/column) tuple.
|
90
70
|
|
91
71
|
Returns
|
92
72
|
-------
|
93
|
-
|
73
|
+
imputed_df: pd.DataFrame - MultiIndex
|
94
74
|
DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values
|
95
75
|
using interpolation method.
|
96
|
-
|
97
76
|
"""
|
98
|
-
# copy df and convert to float for interpolation (code will break if type int64)
|
99
|
-
filt_df = self.filt_df.astype(float).copy()
|
100
|
-
|
101
77
|
# add order if spline or polynomial
|
102
78
|
if (method == "spline" or method == "polynomial") and order is None:
|
103
79
|
order = 3
|
104
80
|
|
105
81
|
# interpolate
|
106
|
-
|
107
|
-
|
108
|
-
.interpolate(method=method, order=order, axis=axis, limit=limit)
|
109
|
-
.stack()
|
110
|
-
.reindex(filt_df.index)
|
111
|
-
)
|
82
|
+
self.imputed_df = self.filtered_df.unstack().interpolate(method=method, order=order, axis=axis,
|
83
|
+
limit=limit).stack().reindex(self.filtered_df.index)
|
112
84
|
|
113
85
|
# type conversion
|
114
|
-
|
86
|
+
self.imputed_df = self.imputed_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
|
115
87
|
|
116
88
|
# plot
|
117
|
-
if plot:
|
118
|
-
if not isinstance(plot_series, tuple):
|
89
|
+
if self.plot:
|
90
|
+
if not isinstance(self.plot_series, tuple):
|
119
91
|
raise TypeError(
|
120
92
|
"Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
|
121
93
|
)
|
122
94
|
else:
|
123
|
-
self.plot_imputed(
|
95
|
+
self.plot_imputed()
|
124
96
|
|
125
|
-
return
|
97
|
+
return self.imputed_df
|
126
98
|
|
127
99
|
def fcst(
|
128
100
|
self,
|
129
|
-
|
130
|
-
plot: bool = False,
|
131
|
-
plot_series: tuple = ("BTC", "close"),
|
101
|
+
yhat_df: pd.DataFrame,
|
132
102
|
) -> pd.DataFrame:
|
133
103
|
"""
|
134
104
|
Imputes missing values with forecasts from outlier detection algorithm.
|
135
105
|
|
136
106
|
Parameters
|
137
107
|
----------
|
138
|
-
|
108
|
+
yhat_df: pd.DataFrame - MultiIndex
|
139
109
|
Multiindex dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols)
|
140
110
|
with forecasted values.
|
141
|
-
plot: bool, default False
|
142
|
-
Plots series with outliers highlighted with red dots.
|
143
|
-
plot_series: tuple, default ('BTC', 'close')
|
144
|
-
Plots the time series of a specific (ticker, field/column) tuple.
|
145
111
|
|
146
112
|
Returns
|
147
113
|
-------
|
148
|
-
|
114
|
+
imputed_df: pd.DataFrame - MultiIndex
|
149
115
|
DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values
|
150
116
|
using forecasts from outlier detection method.
|
151
|
-
|
152
117
|
"""
|
153
|
-
# copy filtered and forecast dfs
|
154
|
-
filt_df, yhat_df = self.filt_df.copy(), fcst_df.copy()
|
155
|
-
|
156
118
|
# impute missing vals in filtered df with fcst vals
|
157
|
-
imp_yhat = np.where(
|
119
|
+
imp_yhat = np.where(self.filtered_df.isna(), yhat_df, self.filtered_df)
|
158
120
|
# create df
|
159
|
-
|
121
|
+
self.imputed_df = pd.DataFrame(imp_yhat, index=self.filtered_df.index, columns=self.filtered_df.columns)
|
160
122
|
|
161
123
|
# type conversion
|
162
|
-
|
124
|
+
self.imputed_df = self.imputed_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
|
163
125
|
|
164
126
|
# plot
|
165
|
-
if plot:
|
166
|
-
if not isinstance(plot_series, tuple):
|
127
|
+
if self.plot:
|
128
|
+
if not isinstance(self.plot_series, tuple):
|
167
129
|
raise TypeError(
|
168
130
|
"Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
|
169
131
|
)
|
170
132
|
else:
|
171
|
-
self.plot_imputed(
|
133
|
+
self.plot_imputed()
|
172
134
|
|
173
|
-
return
|
135
|
+
return self.imputed_df
|
174
136
|
|
175
|
-
|
176
|
-
def plot_imputed(imp_df: pd.DataFrame, plot_series: Optional[tuple] = None) -> None:
|
137
|
+
def plot_imputed(self) -> None:
|
177
138
|
"""
|
178
139
|
Plots filtered time series.
|
179
|
-
|
180
|
-
Parameters
|
181
|
-
----------
|
182
|
-
imp_df: pd.DataFrame - MultiIndex
|
183
|
-
DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values.
|
184
|
-
plot_series: tuple, optional, default None
|
185
|
-
Plots the time series of a specific (ticker, field) tuple.
|
186
|
-
|
187
140
|
"""
|
188
141
|
ax = (
|
189
|
-
|
142
|
+
self.imputed_df.loc[pd.IndexSlice[:, self.plot_series[0]], self.plot_series[1]]
|
190
143
|
.droplevel(1)
|
191
144
|
.plot(linewidth=1, figsize=(15, 7), color="#1f77b4", zorder=0)
|
192
145
|
)
|
193
146
|
ax.grid(color="black", linewidth=0.05)
|
194
147
|
ax.xaxis.grid(False)
|
195
|
-
ax.set_ylabel(plot_series[0])
|
148
|
+
ax.set_ylabel(self.plot_series[0])
|
196
149
|
ax.ticklabel_format(style="plain", axis="y")
|
197
150
|
ax.set_facecolor("whitesmoke")
|
198
|
-
ax.legend([plot_series[1] + "_repaired"], loc="upper left")
|
151
|
+
ax.legend([self.plot_series[1] + "_repaired"], loc="upper left")
|