cryptodatapy 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cryptodatapy/conf/fx_tickers.csv +31 -0
- cryptodatapy/transform/clean.py +171 -172
- cryptodatapy/transform/clean_perp_futures_ohlcv.ipynb +1025 -0
- cryptodatapy/transform/filter.py +83 -142
- cryptodatapy/transform/impute.py +36 -83
- cryptodatapy/transform/od.py +221 -450
- {cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/METADATA +4 -1
- {cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/RECORD +10 -8
- {cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/LICENSE +0 -0
- {cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/WHEEL +0 -0
@@ -0,0 +1,31 @@
|
|
1
|
+
id,name,tiingo_id
|
2
|
+
eurusd,,
|
3
|
+
gbpusd,,
|
4
|
+
usdjpy,,
|
5
|
+
usdchf,,
|
6
|
+
usdcad,,
|
7
|
+
usdsek,,
|
8
|
+
usdnok,,
|
9
|
+
audusd,,
|
10
|
+
nzdusd,,
|
11
|
+
usdars,,
|
12
|
+
usdmxn,,
|
13
|
+
usdbrl,,
|
14
|
+
usdcop,,
|
15
|
+
usdclp,,
|
16
|
+
usdpen,,
|
17
|
+
usdils,,
|
18
|
+
usdrub,,
|
19
|
+
usdczk,,
|
20
|
+
usdpln,,
|
21
|
+
usdhuf,,
|
22
|
+
usdzar,,
|
23
|
+
usdtry,,
|
24
|
+
usdcny,,
|
25
|
+
usdhkd,,
|
26
|
+
usdsgd,,
|
27
|
+
usdtwd,,
|
28
|
+
usdkrw,,
|
29
|
+
usdphp,,
|
30
|
+
usdinr,,
|
31
|
+
usdidr,,
|
cryptodatapy/transform/clean.py
CHANGED
@@ -1,19 +1,57 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
from typing import Optional, Union
|
3
|
-
|
4
3
|
import pandas as pd
|
5
4
|
|
6
|
-
from cryptodatapy.transform.filter import Filter
|
7
|
-
from cryptodatapy.transform.impute import Impute
|
8
5
|
from cryptodatapy.transform.od import OutlierDetection
|
6
|
+
from cryptodatapy.transform.impute import Impute
|
7
|
+
from cryptodatapy.transform.filter import Filter
|
9
8
|
|
10
9
|
|
11
|
-
|
10
|
+
def stitch_dataframes(dfs):
|
12
11
|
"""
|
13
|
-
|
12
|
+
Stitches together dataframes with different start dates.
|
13
|
+
|
14
|
+
Parameters
|
15
|
+
----------
|
16
|
+
dfs: list
|
17
|
+
List of dataframes to be stitched together.
|
14
18
|
|
19
|
+
Returns
|
20
|
+
-------
|
21
|
+
combined_df: pd.DataFrame
|
22
|
+
Combined dataframe with extended start date.
|
15
23
|
"""
|
24
|
+
# check if dfs is a list
|
25
|
+
if not isinstance(dfs, list):
|
26
|
+
raise TypeError("Dataframes must be a list.")
|
27
|
+
|
28
|
+
# check index types
|
29
|
+
if all([isinstance(df.index, pd.MultiIndex) for df in dfs]):
|
30
|
+
dfs.sort(key=lambda df: df.index.levels[0][0], reverse=True)
|
31
|
+
elif all([isinstance(df.index, pd.DatetimeIndex) for df in dfs]):
|
32
|
+
dfs.sort(key=lambda df: df.index[0], reverse=True)
|
33
|
+
else:
|
34
|
+
raise TypeError("Dataframes must be pd.MultiIndex or have DatetimeIndex.")
|
35
|
+
|
36
|
+
# most recent start date
|
37
|
+
combined_df = dfs[0]
|
38
|
+
|
39
|
+
# combine dfs
|
40
|
+
for df in dfs[1:]:
|
41
|
+
combined_df = combined_df.combine_first(df)
|
42
|
+
|
43
|
+
# reorder cols
|
44
|
+
max_columns = max(len(df.columns) for df in dfs)
|
45
|
+
cols = next(df.columns.tolist() for df in dfs if len(df.columns) == max_columns)
|
46
|
+
combined_df = combined_df[cols]
|
16
47
|
|
48
|
+
return combined_df
|
49
|
+
|
50
|
+
|
51
|
+
class CleanData:
|
52
|
+
"""
|
53
|
+
Cleans data to improve data quality.
|
54
|
+
"""
|
17
55
|
def __init__(self, df: pd.DataFrame):
|
18
56
|
"""
|
19
57
|
Constructor
|
@@ -22,26 +60,44 @@ class CleanData:
|
|
22
60
|
----------
|
23
61
|
df: pd.DataFrame
|
24
62
|
DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and field (cols) values.
|
25
|
-
|
26
63
|
"""
|
27
|
-
self.
|
64
|
+
self.raw_df = df.copy() # keep a copy of the raw dataframe
|
28
65
|
self.df = df
|
29
|
-
self.
|
30
|
-
self.
|
31
|
-
self.
|
32
|
-
self.
|
66
|
+
self.excluded_cols = None
|
67
|
+
self.outliers = None
|
68
|
+
self.yhat = None
|
69
|
+
self.filtered_df = None
|
70
|
+
self.filtered_tickers = None
|
71
|
+
self.repaired_df = None
|
72
|
+
self.summary = pd.DataFrame()
|
73
|
+
self.initialize_summary()
|
74
|
+
self.check_types()
|
75
|
+
|
76
|
+
def initialize_summary(self) -> None:
|
77
|
+
"""
|
78
|
+
Initializes summary dataframe with data quality metrics.
|
79
|
+
"""
|
33
80
|
# add obs and missing vals
|
34
|
-
self.summary.loc["n_obs", self.df.unstack().columns] = (
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
81
|
+
self.summary.loc["n_obs", self.df.unstack().columns] = self.df.unstack().notna().sum().values
|
82
|
+
self.summary.loc["%_NaN_start", self.df.unstack().columns] = \
|
83
|
+
(self.df.unstack().isnull().sum() / self.df.unstack().shape[0]).values * 100
|
84
|
+
|
85
|
+
def check_types(self) -> None:
|
86
|
+
"""
|
87
|
+
Checks data types of columns and converts them to the appropriate data types.
|
88
|
+
|
89
|
+
Returns
|
90
|
+
-------
|
91
|
+
CleanData
|
92
|
+
CleanData object
|
93
|
+
"""
|
94
|
+
if not isinstance(self.df, pd.DataFrame):
|
95
|
+
raise TypeError("Data must be a pandas DataFrame.")
|
40
96
|
|
41
97
|
def filter_outliers(
|
42
98
|
self,
|
99
|
+
od_method: str = "mad",
|
43
100
|
excl_cols: Optional[Union[str, list]] = None,
|
44
|
-
od_method: str = "z_score",
|
45
101
|
**kwargs
|
46
102
|
) -> CleanData:
|
47
103
|
"""
|
@@ -49,92 +105,37 @@ class CleanData:
|
|
49
105
|
|
50
106
|
Parameters
|
51
107
|
----------
|
52
|
-
excl_cols: str or list
|
53
|
-
Name of columns to exclude from outlier filtering.
|
54
108
|
od_method: str, {'atr', 'iqr', 'mad', 'z_score', 'ewma', 'stl', 'seasonal_decomp', 'prophet'}, default 'mad'
|
55
109
|
Outlier detection method to use for filtering.
|
56
|
-
|
57
|
-
|
58
|
-
----------------
|
59
|
-
log: bool, default False
|
60
|
-
Converts series into log of series.
|
61
|
-
window_size: int, default 7
|
62
|
-
Number of observations in the rolling window.
|
63
|
-
model_type: str, {'estimation', 'prediction'}, default 'estimation'
|
64
|
-
Estimation models use past, current and future values to estimate the expected value of a series,
|
65
|
-
e.g. expected x_t of series x at time t uses values from [x_t-s, x_t+s].
|
66
|
-
Prediction models use only past and current values to estimate the expected value of a series,
|
67
|
-
e.g. expected x_t of series x at time t uses values from [x_t-s, x_t].
|
68
|
-
thresh_val: int, default 2
|
69
|
-
Value for upper and lower thresholds used in outlier detection.
|
70
|
-
period: int, optional, default 7
|
71
|
-
periodicity of the sequence.
|
72
|
-
model: str, {'additive', 'multiplicative'}, default 'additive'
|
73
|
-
Type of seasonal component.
|
74
|
-
filt: array-like, optional, default None
|
75
|
-
The filter coefficients for filtering out the seasonal component.
|
76
|
-
The concrete moving average method used in filtering is determined by two_sided.
|
77
|
-
two_sided: bool, optional, default True
|
78
|
-
The moving average method used in filtering. If True (default), a centered moving average is
|
79
|
-
computed using the filt. If False, the filter coefficients are for past values only.
|
80
|
-
extrapolate_trend: int, optional, default 0
|
81
|
-
If set to > 0, the trend resulting from the convolution is linear least-squares extrapolated
|
82
|
-
on both ends (or the single one if two_sided is False) considering this many (+1) closest points.
|
83
|
-
If set to ‘freq’, use freq closest points. Setting this parameter results in no NaN values in trend
|
84
|
-
or resid components.
|
85
|
-
seasonal_deg: int, optional, default 1
|
86
|
-
Degree of seasonal LOESS. 0 (constant) or 1 (constant and trend).
|
87
|
-
trend_deg: int, optional, default 1
|
88
|
-
Degree of trend LOESS. 0 (constant) or 1 (constant and trend).
|
89
|
-
low_pass_deg: int, optional, default 1
|
90
|
-
Degree of low pass LOESS. 0 (constant) or 1 (constant and trend).
|
91
|
-
robust: bool, optional, default False
|
92
|
-
Flag indicating whether to use a weighted version that is robust to some forms of outliers.
|
93
|
-
seasonal_jump: int, optional, default 1
|
94
|
-
Positive integer determining the linear interpolation step. If larger than 1,
|
95
|
-
the LOESS is used every seasonal_jump points and linear interpolation is between fitted points.
|
96
|
-
Higher values reduce estimation time.
|
97
|
-
trend_jump: int, optional, default 1
|
98
|
-
Positive integer determining the linear interpolation step. If larger than 1,
|
99
|
-
the LOESS is used every trend_jump points and values between the two are linearly interpolated.
|
100
|
-
Higher values reduce estimation time.
|
101
|
-
low_pass_jump: int, optional, default 1
|
102
|
-
Positive integer determining the linear interpolation step. If larger than 1,
|
103
|
-
the LOESS is used every low_pass_jump points and values between the two are linearly interpolated.
|
104
|
-
Higher values reduce estimation time.
|
105
|
-
interval_width: float, optional, default 0.99
|
106
|
-
Uncertainty interval estimated by Monte Carlo simulation. The larger the value,
|
107
|
-
the larger the upper/lower thresholds interval for outlier detection.
|
108
|
-
plot: bool, default False
|
109
|
-
Plots series with outliers highlighted (red dots).
|
110
|
-
plot_series: tuple, default ('BTC', 'close')
|
111
|
-
The specific time series to plot given by (ticker, field/column) tuple.
|
110
|
+
excl_cols: str or list
|
111
|
+
Name of columns to exclude from outlier filtering.
|
112
112
|
|
113
113
|
Returns
|
114
114
|
-------
|
115
115
|
CleanData
|
116
116
|
CleanData object
|
117
|
-
|
118
117
|
"""
|
119
118
|
# outlier detection
|
120
|
-
od =
|
121
|
-
|
122
|
-
|
123
|
-
self.fcsts = od["yhat"]
|
119
|
+
od = OutlierDetection(self.df, excl_cols=excl_cols, **kwargs)
|
120
|
+
self.excluded_cols = excl_cols
|
121
|
+
|
124
122
|
# filter outliers
|
125
|
-
|
123
|
+
getattr(od, od_method)()
|
124
|
+
self.filtered_df = od.filtered_df
|
125
|
+
self.outliers = od.outliers
|
126
|
+
self.yhat = od.yhat
|
127
|
+
|
126
128
|
# add to summary
|
127
|
-
self.summary.loc["%_outliers", self.
|
128
|
-
|
129
|
+
self.summary.loc["%_outliers", self.outliers.unstack().columns] = (
|
130
|
+
self.outliers.unstack().notna().sum() / self.df.unstack().shape[0]
|
129
131
|
).values * 100
|
132
|
+
|
130
133
|
# filtered df
|
131
|
-
self.df =
|
134
|
+
self.df = self.filtered_df.sort_index()
|
132
135
|
|
133
136
|
return self
|
134
137
|
|
135
|
-
def repair_outliers(
|
136
|
-
self, imp_method: str = "interpolate", **kwargs
|
137
|
-
) -> CleanData:
|
138
|
+
def repair_outliers(self, imp_method: str = "interpolate", **kwargs) -> CleanData:
|
138
139
|
"""
|
139
140
|
Repairs outliers using an imputation method.
|
140
141
|
|
@@ -143,46 +144,33 @@ class CleanData:
|
|
143
144
|
imp_method: str, {'fwd_fill', 'interpolate', 'fcst'}, default 'interpolate'
|
144
145
|
Imputation method used to replace filtered outliers.
|
145
146
|
|
146
|
-
Other Parameters
|
147
|
-
----------------
|
148
|
-
method: str, {'linear', ‘nearest’, ‘zero’, ‘slinear’, ‘quadratic’, ‘cubic’, ‘spline’, ‘barycentric’,
|
149
|
-
‘polynomial’, ‘krogh’, ‘piecewise_polynomial’, ‘pchip’, ‘akima’, ‘cubicspline’}, default spline
|
150
|
-
Interpolation method to use.
|
151
|
-
order: int, optional, default None
|
152
|
-
Order of polynomial or spline.
|
153
|
-
axis: {{0 or ‘index’, 1 or ‘columns’, None}}, default None
|
154
|
-
Axis to interpolate along.
|
155
|
-
limit: int, optional, default None
|
156
|
-
Maximum number of consecutive NaNs to fill. Must be greater than 0.
|
157
|
-
plot: bool, default False
|
158
|
-
Plots series with outliers highlighted with red dots.
|
159
|
-
plot_series: tuple, default ('BTC', 'close')
|
160
|
-
Plots the time series of a specific (ticker, field/column) tuple.
|
161
|
-
|
162
147
|
Returns
|
163
148
|
-------
|
164
149
|
CleanData
|
165
150
|
CleanData object
|
166
|
-
|
167
151
|
"""
|
168
152
|
# impute missing vals
|
169
153
|
if imp_method == "fcst":
|
170
|
-
|
154
|
+
self.repaired_df = getattr(Impute(self.df), imp_method)(self.yhat, **kwargs)
|
171
155
|
else:
|
172
|
-
|
156
|
+
self.repaired_df = getattr(Impute(self.df), imp_method)(**kwargs)
|
157
|
+
|
173
158
|
# add repaired % to summary
|
174
|
-
rep_vals =
|
175
|
-
self.summary.loc["%_imputed", self.df.unstack().columns] = (
|
176
|
-
|
177
|
-
) * 100
|
159
|
+
rep_vals = self.repaired_df.unstack().notna().sum() - self.df.unstack().notna().sum()
|
160
|
+
self.summary.loc["%_imputed", self.df.unstack().columns] = rep_vals / self.df.unstack().shape[0] * 100
|
161
|
+
|
178
162
|
# repaired df
|
179
|
-
self.
|
163
|
+
if self.excluded_cols is not None:
|
164
|
+
self.df = pd.concat([self.repaired_df, self.raw_df[self.excluded_cols]], join="inner", axis=1)
|
165
|
+
else:
|
166
|
+
self.df = self.repaired_df
|
167
|
+
|
168
|
+
# reorder cols
|
169
|
+
self.df = self.df[self.raw_df.columns].sort_index()
|
180
170
|
|
181
171
|
return self
|
182
172
|
|
183
|
-
def filter_avg_trading_val(
|
184
|
-
self, thresh_val: int = 10000000, window_size: int = 30, **kwargs
|
185
|
-
) -> CleanData:
|
173
|
+
def filter_avg_trading_val(self, thresh_val: int = 10000000, window_size: int = 30) -> CleanData:
|
186
174
|
"""
|
187
175
|
Filters values below a threshold of average trading value (price * volume/size in quote currency) over some
|
188
176
|
lookback window, replacing them with NaNs.
|
@@ -194,34 +182,26 @@ class CleanData:
|
|
194
182
|
window_size: int, default 30
|
195
183
|
Size of rolling window.
|
196
184
|
|
197
|
-
Other Parameters
|
198
|
-
----------------
|
199
|
-
plot: bool, default False
|
200
|
-
Plots series with outliers highlighted with red dots.
|
201
|
-
plot_series: tuple, default ('BTC', 'close')
|
202
|
-
Plots the time series of a specific (ticker, field/column) tuple.
|
203
|
-
|
204
185
|
Returns
|
205
186
|
-------
|
206
187
|
CleanData
|
207
188
|
CleanData object
|
208
|
-
|
209
189
|
"""
|
210
190
|
# filter values below the average trading value threshold
|
211
|
-
|
212
|
-
|
213
|
-
)
|
191
|
+
self.filtered_df = Filter(self.df).avg_trading_val(thresh_val=thresh_val, window_size=window_size)
|
192
|
+
|
214
193
|
# add to summary
|
215
|
-
|
194
|
+
filtered_vals = self.df.unstack().notna().sum() - self.filtered_df.unstack().notna().sum()
|
216
195
|
self.summary.loc["%_below_avg_trading_val", self.df.unstack().columns] = (
|
217
|
-
|
196
|
+
filtered_vals / self.df.unstack().shape[0]
|
218
197
|
).values * 100
|
198
|
+
|
219
199
|
# filtered df
|
220
|
-
self.df =
|
200
|
+
self.df = self.filtered_df.sort_index()
|
221
201
|
|
222
202
|
return self
|
223
203
|
|
224
|
-
def filter_missing_vals_gaps(self, gap_window: int = 30
|
204
|
+
def filter_missing_vals_gaps(self, gap_window: int = 30) -> CleanData:
|
225
205
|
"""
|
226
206
|
Filters values before a large gap of missing values, replacing them with NaNs.
|
227
207
|
|
@@ -230,37 +210,28 @@ class CleanData:
|
|
230
210
|
gap_window: int, default 30
|
231
211
|
Size of window where all values are missing (NaNs).
|
232
212
|
|
233
|
-
Other Parameters
|
234
|
-
----------------
|
235
|
-
plot: bool, default False
|
236
|
-
Plots series with outliers highlighted with red dots.
|
237
|
-
plot_series: tuple, default ('BTC', 'close')
|
238
|
-
Plots the time series of a specific (ticker, field/column) tuple.
|
239
|
-
|
240
213
|
Returns
|
241
214
|
-------
|
242
215
|
CleanData
|
243
216
|
CleanData object
|
244
|
-
|
245
217
|
"""
|
246
218
|
# filter values before large gaps of missing values
|
247
|
-
|
219
|
+
self.filtered_df = Filter(self.df).missing_vals_gaps(gap_window=gap_window)
|
220
|
+
|
248
221
|
# add to summary
|
249
222
|
missing_vals_gap = (
|
250
|
-
self.df.unstack().notna().sum() -
|
223
|
+
self.df.unstack().notna().sum() - self.filtered_df.unstack().notna().sum()
|
251
224
|
)
|
252
225
|
self.summary.loc["%_missing_vals_gaps", self.df.unstack().columns] = (
|
253
226
|
missing_vals_gap / self.df.unstack().shape[0]
|
254
227
|
).values * 100
|
228
|
+
|
255
229
|
# filtered df
|
256
|
-
self.df =
|
230
|
+
self.df = self.filtered_df.sort_index()
|
257
231
|
|
258
232
|
return self
|
259
233
|
|
260
|
-
def filter_min_nobs(self,
|
261
|
-
ts_obs: int = 100,
|
262
|
-
cs_obs: int = 2
|
263
|
-
) -> CleanData:
|
234
|
+
def filter_min_nobs(self, ts_obs: int = 100, cs_obs: int = 2) -> CleanData:
|
264
235
|
"""
|
265
236
|
Removes tickers from dataframe if the ticker has less than a minimum number of observations.
|
266
237
|
|
@@ -277,21 +248,54 @@ class CleanData:
|
|
277
248
|
CleanData object
|
278
249
|
"""
|
279
250
|
# filter tickers with fewer than the minimum number of observations
|
280
|
-
|
251
|
+
self.filtered_df = Filter(self.df).min_nobs(ts_obs=ts_obs, cs_obs=cs_obs)
|
252
|
+
|
281
253
|
# tickers < min obs
|
282
|
-
|
283
|
-
set(
|
254
|
+
self.filtered_tickers = list(
|
255
|
+
set(self.filtered_df.index.droplevel(0).unique()).symmetric_difference(
|
284
256
|
set(self.df.index.droplevel(0).unique())
|
285
257
|
)
|
286
258
|
)
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
259
|
+
|
260
|
+
# add to summary
|
261
|
+
self.summary.loc["n_tickers_below_min_obs", self.df.unstack().columns] = len(self.filtered_tickers)
|
262
|
+
|
263
|
+
# filtered df
|
264
|
+
self.df = self.filtered_df.sort_index()
|
265
|
+
|
266
|
+
return self
|
267
|
+
|
268
|
+
def filter_delisted_tickers(self, field: str = 'close', n_unch_vals: int = 30) -> CleanData:
|
269
|
+
"""
|
270
|
+
Removes delisted tickers from dataframe.
|
271
|
+
|
272
|
+
Parameters
|
273
|
+
----------
|
274
|
+
field: str, default 'close'
|
275
|
+
Field/column to use for detecting delisted tickers.
|
276
|
+
n_unch_vals: int, default 30
|
277
|
+
Number of consecutive unchanged values to consider a ticker as delisted.
|
278
|
+
|
279
|
+
Returns
|
280
|
+
-------
|
281
|
+
CleanData
|
282
|
+
CleanData object
|
283
|
+
"""
|
284
|
+
# filter tickers
|
285
|
+
self.filtered_df = Filter(self.df).remove_delisted(field=field, n_unch_vals=n_unch_vals)
|
286
|
+
|
287
|
+
# tickers removed as delisted
|
288
|
+
self.filtered_tickers = list(
|
289
|
+
set(self.filtered_df.index.droplevel(0).unique()).symmetric_difference(
|
290
|
+
set(self.df.index.droplevel(0).unique())
|
291
|
+
)
|
292
292
|
)
|
293
|
+
|
294
|
+
# add to summary
|
295
|
+
self.summary.loc["n_filtered_tickers", self.df.unstack().columns] = len(self.filtered_tickers)
|
296
|
+
|
293
297
|
# filtered df
|
294
|
-
self.df =
|
298
|
+
self.df = self.filtered_df.sort_index()
|
295
299
|
|
296
300
|
return self
|
297
301
|
|
@@ -309,30 +313,27 @@ class CleanData:
|
|
309
313
|
-------
|
310
314
|
CleanData
|
311
315
|
CleanData object
|
312
|
-
|
313
316
|
"""
|
314
317
|
# filter tickers
|
315
|
-
|
318
|
+
self.filtered_df = Filter(self.df).tickers(tickers_list)
|
319
|
+
|
316
320
|
# tickers removed by the tickers filter
|
317
|
-
|
318
|
-
|
321
|
+
|
322
|
+
self.filtered_tickers = list(
|
323
|
+
set(self.filtered_df.index.droplevel(0).unique()).symmetric_difference(
|
319
324
|
set(self.df.index.droplevel(0).unique())
|
320
325
|
)
|
321
326
|
)
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
filt_tickers
|
327
|
-
)
|
327
|
+
|
328
|
+
# add to summary
|
329
|
+
self.summary.loc["n_filtered_tickers", self.df.unstack().columns] = len(self.filtered_tickers)
|
330
|
+
|
328
331
|
# filtered df
|
329
|
-
self.df =
|
332
|
+
self.df = self.filtered_df.sort_index()
|
330
333
|
|
331
334
|
return self
|
332
335
|
|
333
|
-
def show_plot(
|
334
|
-
self, plot_series: tuple = ("BTC", "close"), compare_series: bool = True
|
335
|
-
) -> None:
|
336
|
+
def show_plot(self, plot_series: tuple = ("BTC", "close"), compare_series: bool = True) -> None:
|
336
337
|
"""
|
337
338
|
Plots clean time series and compares it to the raw series.
|
338
339
|
|
@@ -342,7 +343,6 @@ class CleanData:
|
|
342
343
|
Plots the time series of a specific (ticker, field) tuple.
|
343
344
|
compare_series: bool, default True
|
344
345
|
Compares clean time series with raw series
|
345
|
-
|
346
346
|
"""
|
347
347
|
ax = (
|
348
348
|
self.df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
|
@@ -357,7 +357,7 @@ class CleanData:
|
|
357
357
|
)
|
358
358
|
if compare_series:
|
359
359
|
ax = (
|
360
|
-
self.
|
360
|
+
self.raw_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
|
361
361
|
.droplevel(1)
|
362
362
|
.plot(
|
363
363
|
linewidth=1,
|
@@ -382,14 +382,13 @@ class CleanData:
|
|
382
382
|
|
383
383
|
Parameters
|
384
384
|
----------
|
385
|
-
attr: str, {'df', 'outliers', '
|
385
|
+
attr: str, {'df', 'outliers', 'yhat', 'filtered_tickers', 'summary'}, default 'df'
|
386
386
|
CleanData object attribute to return
|
387
387
|
|
388
388
|
Returns
|
389
389
|
-------
|
390
390
|
CleanData
|
391
391
|
CleanData object
|
392
|
-
|
393
392
|
"""
|
394
393
|
self.summary.loc["%_NaN_end", self.df.unstack().columns] = (
|
395
394
|
self.df.unstack().isnull().sum() / self.df.unstack().shape[0]
|