cryptodatapy 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,12 +7,13 @@ import pandas as pd
7
7
  class Filter:
8
8
  """
9
9
  Filters dataframe in tidy format.
10
-
11
10
  """
12
-
13
- def __init__(
14
- self, raw_df: pd.DataFrame, excl_cols: Optional[Union[str, list]] = None
15
- ):
11
+ def __init__(self,
12
+ raw_df: pd.DataFrame,
13
+ excl_cols: Optional[Union[str, list]] = None,
14
+ plot: bool = False,
15
+ plot_series: tuple = ("BTC", "close")
16
+ ):
16
17
  """
17
18
  Constructor
18
19
 
@@ -22,64 +23,18 @@ class Filter:
22
23
  Dataframe with raw data. DatetimeIndex (level 0), ticker (level 1) and raw data (cols), in tidy format.
23
24
  excl_cols: str or list, default None
24
25
  Name of columns to exclude from filtering
25
-
26
26
  """
27
-
28
27
  self.raw_df = raw_df
29
28
  self.excl_cols = excl_cols
30
-
31
- def outliers(
32
- self,
33
- outliers_dict: dict,
34
- plot: bool = False,
35
- plot_series: tuple = ("BTC", "close"),
36
- ) -> pd.DataFrame:
37
- """
38
- Filters outliers, replacing them with NaNs.
39
-
40
- Parameters
41
- ----------
42
- outliers_dict: Dictionary of pd.DataFrame - MultiIndex
43
- Dictionary of forecasts (yhat), outliers (outliers) and filtered values (filt_vals) multiindex dataframes
44
- with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
45
- values.
46
- plot: bool, default False
47
- Plots series with outliers highlighted with red dots.
48
- plot_series: tuple, default ('BTC', 'close')
49
- Plots the time series of a specific (ticker, field/column) tuple.
50
-
51
- Returns
52
- -------
53
- filt_df: DataFrame - MultiIndex
54
- Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
55
-
56
- """
57
- # filter outliers
58
- filt_df = outliers_dict["filt_vals"]
59
-
60
- # add excl cols
61
- if self.excl_cols is not None:
62
- filt_df = pd.concat(
63
- [filt_df, self.raw_df[self.excl_cols]], join="outer", axis=1
64
- )
65
-
66
- # plot
67
- if plot:
68
- if not isinstance(plot_series, tuple):
69
- raise TypeError(
70
- "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
71
- )
72
- else:
73
- self.plot_filtered(filt_df, plot_series=plot_series)
74
-
75
- return filt_df
29
+ self.plot = plot
30
+ self.plot_series = plot_series
31
+ self.df = raw_df.copy() if excl_cols is None else raw_df.drop(columns=excl_cols).copy()
32
+ self.filtered_df = None
76
33
 
77
34
  def avg_trading_val(
78
35
  self,
79
36
  thresh_val: int = 10000000,
80
37
  window_size: int = 30,
81
- plot: bool = False,
82
- plot_series: tuple = ("BTC", "close"),
83
38
  ) -> pd.DataFrame:
84
39
  """
85
40
  Filters values below a threshold of average trading value (price * volume/size in quote currency) over some
@@ -91,35 +46,24 @@ class Filter:
91
46
  Threshold/cut-off for avg trading value.
92
47
  window_size: int, default 30
93
48
  Size of rolling window.
94
- plot: bool, default False
95
- Plots series with outliers highlighted with red dots.
96
- plot_series: tuple, default ('BTC', 'close')
97
- Plots the time series of a specific (ticker, field/column) tuple.
98
49
 
99
50
  Returns
100
51
  -------
101
- filt_df: DataFrame - MultiIndex
52
+ filtered_df: DataFrame - MultiIndex
102
53
  Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with values below the
103
54
  threshold removed.
104
-
105
55
  """
106
- # convert string to list
107
- if self.excl_cols is not None:
108
- df = self.raw_df.drop(columns=self.excl_cols).copy()
109
- else:
110
- df = self.raw_df.copy()
111
-
112
56
  # compute traded val
113
- if "close" in df.columns and "volume" in df.columns:
114
- df["trading_val"] = df.close * df.volume
115
- elif ("bid" in df.columns and "ask" in df.columns) and (
116
- "bid_size" in df.columns and "ask_size" in df.columns
57
+ if "close" in self.df.columns and "volume" in self.df.columns:
58
+ self.df["trading_val"] = self.df.close * self.df.volume
59
+ elif ("bid" in self.df.columns and "ask" in self.df.columns) and (
60
+ "bid_size" in self.df.columns and "ask_size" in self.df.columns
117
61
  ):
118
- df["trading_val"] = ((df.bid + df.ask) / 2) * (
119
- (df.bid_size + df.ask_size) / 2
62
+ self.df["trading_val"] = ((self.df.bid + self.df.ask) / 2) * (
63
+ (self.df.bid_size + self.df.ask_size) / 2
120
64
  )
121
- elif "trade_size" in df.columns and "trade_price" in df.columns:
122
- df["trading_val"] = df.trade_price * df.trade_size
65
+ elif "trade_size" in self.df.columns and "trade_price" in self.df.columns:
66
+ self.df["trading_val"] = self.df.trade_price * self.df.trade_size
123
67
  else:
124
68
  raise Exception(
125
69
  "Dataframe must include at least one price series (e.g. close price, trade price, "
@@ -127,36 +71,29 @@ class Filter:
127
71
  )
128
72
 
129
73
  # compute rolling mean/avg
130
- df1 = df.groupby(level=1).rolling(window_size).mean().droplevel(0)
74
+ df1 = self.df.groupby(level=1).rolling(window_size).mean().droplevel(0)
131
75
  # divide by thresh
132
76
  df1 = df1 / thresh_val
133
77
  # filter df1
134
- filt_df = (
135
- df.loc[df1.trading_val > 1].reindex(df.index).drop(columns="trading_val")
136
- )
137
- # add excl cols
138
- if self.excl_cols is not None:
139
- filt_df = pd.concat(
140
- [filt_df, self.raw_df[self.excl_cols]], join="outer", axis=1
141
- )
78
+ self.filtered_df = self.df.loc[df1.trading_val > 1].reindex(self.df.index).drop(columns="trading_val")
142
79
 
143
80
  # plot
144
- if plot:
145
- if not isinstance(plot_series, tuple):
81
+ if self.plot:
82
+ if not isinstance(self.plot_series, tuple):
146
83
  raise TypeError(
147
84
  "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
148
85
  )
149
86
  else:
150
- self.plot_filtered(filt_df, plot_series=plot_series)
87
+ self.plot_filtered(plot_series=self.plot_series)
151
88
 
152
- return filt_df
89
+ # add excl cols
90
+ if self.excl_cols is not None:
91
+ self.filtered_df = pd.concat([self.filtered_df,
92
+ self.raw_df[self.excl_cols].reindex(self.filtered_df.index)], axis=1)
153
93
 
154
- def missing_vals_gaps(
155
- self,
156
- gap_window: int = 30,
157
- plot: bool = False,
158
- plot_series: tuple = ("BTC", "close"),
159
- ) -> pd.DataFrame:
94
+ return self.filtered_df
95
+
96
+ def missing_vals_gaps(self, gap_window: int = 30) -> pd.DataFrame:
160
97
  """
161
98
  Filters values before a large gap of missing values, replacing them with NaNs.
162
99
 
@@ -164,27 +101,16 @@ class Filter:
164
101
  ----------
165
102
  gap_window: int, default 30
166
103
  Size of window where all values are missing (NaNs).
167
- plot: bool, default False
168
- Plots series with outliers highlighted with red dots.
169
- plot_series: tuple, default ('BTC', 'close')
170
- Plots the time series of a specific (ticker, field/column) tuple.
171
104
 
172
105
  Returns
173
106
  -------
174
- filt_df: DataFrame - MultiIndex
107
+ filtered_df: DataFrame - MultiIndex
175
108
  Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with values before
176
109
  missing values gaps removed.
177
-
178
110
  """
179
- # convert string to list
180
- if self.excl_cols is not None:
181
- df = self.raw_df.drop(columns=self.excl_cols).copy()
182
- else:
183
- df = self.raw_df.copy()
184
-
185
111
  # window obs count
186
112
  window_count = (
187
- df.groupby(level=1)
113
+ self.df.groupby(level=1)
188
114
  .rolling(window=gap_window, min_periods=gap_window)
189
115
  .count()
190
116
  .droplevel(0)
@@ -194,24 +120,25 @@ class Filter:
194
120
  for col in gap.unstack().columns:
195
121
  start_idx = gap.unstack()[col].last_valid_index()
196
122
  if start_idx is not None:
197
- df.loc[pd.IndexSlice[:start_idx, col[1]], col[0]] = np.nan
198
-
199
- # add excl cols
200
- if self.excl_cols is not None:
201
- filt_df = pd.concat([df, self.raw_df[self.excl_cols]], join="outer", axis=1)
202
- else:
203
- filt_df = df
123
+ self.df.loc[pd.IndexSlice[:start_idx, col[1]], col[0]] = np.nan
204
124
 
205
125
  # plot
206
- if plot:
207
- if not isinstance(plot_series, tuple):
126
+ if self.plot:
127
+ if not isinstance(self.plot_series, tuple):
208
128
  raise TypeError(
209
129
  "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
210
130
  )
211
131
  else:
212
- self.plot_filtered(filt_df, plot_series=plot_series)
132
+ self.plot_filtered(plot_series=self.plot_series)
133
+
134
+ # add excl cols
135
+ if self.excl_cols is not None:
136
+ self.filtered_df = pd.concat([self.df,
137
+ self.raw_df[self.excl_cols].reindex(self.df)], axis=1)
138
+ else:
139
+ self.filtered_df = self.df
213
140
 
214
- return filt_df
141
+ return self.filtered_df
215
142
 
216
143
  def min_nobs(self, ts_obs=100, cs_obs=1) -> pd.DataFrame:
217
144
  """
@@ -227,25 +154,47 @@ class Filter:
227
154
 
228
155
  Returns
229
156
  -------
230
- filt_df: DataFrame - MultiIndex
157
+ filtered_df: DataFrame - MultiIndex
231
158
  Filtered dataFrame with DatetimeIndex (level 0), tickers with minimum number of observations (level 1)
232
159
  and fields (cols).
233
-
234
160
  """
235
- # create copy
236
- df = self.raw_df.copy()
237
-
238
161
  # drop tickers with nobs < ts_obs
239
- obs = df.groupby(level=1).count().min(axis=1)
162
+ obs = self.df.groupby(level=1).count().min(axis=1)
240
163
  drop_tickers_list = obs[obs < ts_obs].index.to_list()
241
- filt_df = df.drop(drop_tickers_list, level=1, axis=0)
164
+ self.filtered_df = self.df.drop(drop_tickers_list, level=1, axis=0)
242
165
 
243
166
  # drop tickers with nobs < cs_obs
244
- obs = filt_df.groupby(level=0).count().min(axis=1)
167
+ obs = self.filtered_df.groupby(level=0).count().min(axis=1)
245
168
  idx_start = obs[obs > cs_obs].index[0]
246
- filt_df = filt_df.unstack()[filt_df.unstack().index > idx_start].stack()
169
+ self.filtered_df = self.filtered_df.loc[idx_start:]
247
170
 
248
- return filt_df
171
+ return self.filtered_df
172
+
173
+ def remove_delisted(self, field: str = 'close', n_unch_vals: int = 30) -> pd.DataFrame:
174
+ """
175
+ Removes delisted tickers from dataframe.
176
+
177
+ Parameters
178
+ ----------
179
+ field: str, default 'close'
180
+ Field/column to use for detecting delisted tickers.
181
+ n_unch_vals: int, default 30
182
+ Number of consecutive unchanged values to consider a ticker as delisted.
183
+
184
+ Returns
185
+ -------
186
+ filtered_df: pd.DataFrame - MultiIndex
187
+ Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols).
188
+ """
189
+ # delisted tickers
190
+ delisted_tickers = self.df[field].unstack()[self.df[field].unstack().pct_change().iloc[-n_unch_vals:] == 0].\
191
+ dropna(how='all', axis=0).dropna(thresh=n_unch_vals, axis=1).columns
192
+ print(delisted_tickers)
193
+
194
+ # drop delisted tickers
195
+ self.filtered_df = self.df.drop(delisted_tickers, level=1)
196
+
197
+ return self.filtered_df
249
198
 
250
199
  def tickers(self, tickers_list) -> pd.DataFrame:
251
200
  """
@@ -259,37 +208,29 @@ class Filter:
259
208
 
260
209
  Returns
261
210
  -------
262
- filt_df: pd.DataFrame - MultiIndex
211
+ filtered_df: pd.DataFrame - MultiIndex
263
212
  Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols).
264
-
265
213
  """
266
- # create copy
267
- df = self.raw_df.copy()
268
214
  # tickers list
269
215
  if isinstance(tickers_list, str):
270
216
  tickers_list = [tickers_list]
217
+
271
218
  # drop tickers
272
- filt_df = df.drop(tickers_list, level=1, axis=0)
219
+ self.filtered_df = self.df.drop(tickers_list, level=1)
273
220
 
274
- return filt_df
221
+ return self.filtered_df
275
222
 
276
- @staticmethod
277
- def plot_filtered(
278
- filt_df: pd.DataFrame, plot_series: Optional[tuple] = None
279
- ) -> None:
223
+ def plot_filtered(self, plot_series: Optional[tuple] = None) -> None:
280
224
  """
281
225
  Plots filtered time series.
282
226
 
283
227
  Parameters
284
228
  ----------
285
- filt_df: pd.DataFrame - MultiIndex
286
- Dataframe MultiIndex with DatetimeIndex (level 0), tickers (level 1) and filtered values (cols).
287
229
  plot_series: tuple, optional, default None
288
230
  Plots the time series of a specific (ticker, field) tuple.
289
-
290
231
  """
291
232
  ax = (
292
- filt_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
233
+ self.filtered_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
293
234
  .droplevel(1)
294
235
  .plot(linewidth=1, figsize=(15, 7), color="#1f77b4", zorder=0)
295
236
  )
@@ -7,67 +7,51 @@ import pandas as pd
7
7
  class Impute:
8
8
  """
9
9
  Handles missing values.
10
-
11
10
  """
12
-
13
- def __init__(self, filt_df: pd.DataFrame):
14
-
11
+ def __init__(self, filtered_df: pd.DataFrame, plot: bool = False, plot_series: tuple = ("BTC", "close")):
15
12
  """
16
13
  Constructor
17
14
 
18
15
  Parameters
19
16
  ----------
20
- filt_df: pd.DataFrame - MultiIndex
17
+ filtered_df: pd.DataFrame - MultiIndex
21
18
  DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with filtered values.
22
-
23
19
  """
24
- self.filt_df = filt_df
20
+ self.filtered_df = filtered_df.astype(float)
21
+ self.plot = plot
22
+ self.plot_series = plot_series
23
+ self.imputed_df = None
25
24
 
26
- def fwd_fill(
27
- self, plot: bool = False, plot_series: tuple = ("BTC", "close")
28
- ) -> pd.DataFrame:
25
+ def fwd_fill(self) -> pd.DataFrame:
29
26
  """
30
27
  Imputes missing values by imputing missing values with latest non-missing values.
31
28
 
32
- Parameters
33
- ----------
34
- plot: bool, default False
35
- Plots series with outliers highlighted with red dots.
36
- plot_series: tuple, default ('BTC', 'close')
37
- Plots the time series of a specific (ticker, field/column) tuple.
38
-
39
29
  Returns
40
30
  -------
41
- imp_df: pd.DataFrame - MultiIndex
31
+ imputed_df: pd.DataFrame - MultiIndex
42
32
  DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values
43
33
  using forward fill method.
44
-
45
34
  """
46
- # copy df
47
- filt_df = self.filt_df.copy()
48
-
49
35
  # ffill
50
- imp_df = filt_df.groupby(level=1).ffill()
36
+ self.imputed_df = self.filtered_df.groupby(level=1).ffill()
51
37
 
52
38
  # plot
53
- if plot:
54
- if not isinstance(plot_series, tuple):
39
+ if self.plot:
40
+ if not isinstance(self.plot_series, tuple):
55
41
  raise TypeError(
56
42
  "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
57
43
  )
58
44
  else:
59
- self.plot_imputed(imp_df, plot_series=plot_series)
45
+ self.plot_imputed()
60
46
 
61
- return imp_df
47
+ return self.imputed_df
62
48
 
63
49
  def interpolate(
64
50
  self,
65
51
  method: str = "linear",
66
52
  order: Optional[int] = None,
67
- axis=0,
53
+ axis: int = 0,
68
54
  limit: Optional[int] = None,
69
- plot: bool = False,
70
- plot_series: tuple = ("BTC", "close"),
71
55
  ) -> pd.DataFrame:
72
56
  """
73
57
  Imputes missing values by interpolating using various methods.
@@ -83,116 +67,85 @@ class Impute:
83
67
  Axis to interpolate along.
84
68
  limit: int, optional, default None
85
69
  Maximum number of consecutive NaNs to fill. Must be greater than 0.
86
- plot: bool, default False
87
- Plots series with outliers highlighted with red dots.
88
- plot_series: tuple, default ('BTC', 'close')
89
- Plots the time series of a specific (ticker, field/column) tuple.
90
70
 
91
71
  Returns
92
72
  -------
93
- imp_df: pd.DataFrame - MultiIndex
73
+ imputed_df: pd.DataFrame - MultiIndex
94
74
  DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values
95
75
  using interpolation method.
96
-
97
76
  """
98
- # copy df and convert to float for interpolation (code will break if type int64)
99
- filt_df = self.filt_df.astype(float).copy()
100
-
101
77
  # add order if spline or polynomial
102
78
  if (method == "spline" or method == "polynomial") and order is None:
103
79
  order = 3
104
80
 
105
81
  # interpolate
106
- imp_df = (
107
- filt_df.unstack()
108
- .interpolate(method=method, order=order, axis=axis, limit=limit)
109
- .stack()
110
- .reindex(filt_df.index)
111
- )
82
+ self.imputed_df = self.filtered_df.unstack().interpolate(method=method, order=order, axis=axis,
83
+ limit=limit).stack().reindex(self.filtered_df.index)
112
84
 
113
85
  # type conversion
114
- imp_df = imp_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
86
+ self.imputed_df = self.imputed_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
115
87
 
116
88
  # plot
117
- if plot:
118
- if not isinstance(plot_series, tuple):
89
+ if self.plot:
90
+ if not isinstance(self.plot_series, tuple):
119
91
  raise TypeError(
120
92
  "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
121
93
  )
122
94
  else:
123
- self.plot_imputed(imp_df, plot_series=plot_series)
95
+ self.plot_imputed()
124
96
 
125
- return imp_df
97
+ return self.imputed_df
126
98
 
127
99
  def fcst(
128
100
  self,
129
- fcst_df: pd.DataFrame,
130
- plot: bool = False,
131
- plot_series: tuple = ("BTC", "close"),
101
+ yhat_df: pd.DataFrame,
132
102
  ) -> pd.DataFrame:
133
103
  """
134
104
  Imputes missing values with forecasts from outlier detection algorithm.
135
105
 
136
106
  Parameters
137
107
  ----------
138
- fcst_df: pd.DataFrame - MultiIndex
108
+ yhat_df: pd.DataFrame - MultiIndex
139
109
  Multiindex dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols)
140
110
  with forecasted values.
141
- plot: bool, default False
142
- Plots series with outliers highlighted with red dots.
143
- plot_series: tuple, default ('BTC', 'close')
144
- Plots the time series of a specific (ticker, field/column) tuple.
145
111
 
146
112
  Returns
147
113
  -------
148
- imp_df: pd.DataFrame - MultiIndex
114
+ imputed_df: pd.DataFrame - MultiIndex
149
115
  DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values
150
116
  using forecasts from outlier detection method.
151
-
152
117
  """
153
- # copy filtered and forecast dfs
154
- filt_df, yhat_df = self.filt_df.copy(), fcst_df.copy()
155
-
156
118
  # impute missing vals in filtered df with fcst vals
157
- imp_yhat = np.where(filt_df.isna(), yhat_df, filt_df)
119
+ imp_yhat = np.where(self.filtered_df.isna(), yhat_df, self.filtered_df)
158
120
  # create df
159
- imp_df = pd.DataFrame(imp_yhat, index=filt_df.index, columns=filt_df.columns)
121
+ self.imputed_df = pd.DataFrame(imp_yhat, index=self.filtered_df.index, columns=self.filtered_df.columns)
160
122
 
161
123
  # type conversion
162
- imp_df = imp_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
124
+ self.imputed_df = self.imputed_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
163
125
 
164
126
  # plot
165
- if plot:
166
- if not isinstance(plot_series, tuple):
127
+ if self.plot:
128
+ if not isinstance(self.plot_series, tuple):
167
129
  raise TypeError(
168
130
  "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
169
131
  )
170
132
  else:
171
- self.plot_imputed(imp_df, plot_series=plot_series)
133
+ self.plot_imputed()
172
134
 
173
- return imp_df
135
+ return self.imputed_df
174
136
 
175
- @staticmethod
176
- def plot_imputed(imp_df: pd.DataFrame, plot_series: Optional[tuple] = None) -> None:
137
+ def plot_imputed(self) -> None:
177
138
  """
178
139
  Plots filtered time series.
179
-
180
- Parameters
181
- ----------
182
- imp_df: pd.DataFrame - MultiIndex
183
- DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values.
184
- plot_series: tuple, optional, default None
185
- Plots the time series of a specific (ticker, field) tuple.
186
-
187
140
  """
188
141
  ax = (
189
- imp_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
142
+ self.imputed_df.loc[pd.IndexSlice[:, self.plot_series[0]], self.plot_series[1]]
190
143
  .droplevel(1)
191
144
  .plot(linewidth=1, figsize=(15, 7), color="#1f77b4", zorder=0)
192
145
  )
193
146
  ax.grid(color="black", linewidth=0.05)
194
147
  ax.xaxis.grid(False)
195
- ax.set_ylabel(plot_series[0])
148
+ ax.set_ylabel(self.plot_series[0])
196
149
  ax.ticklabel_format(style="plain", axis="y")
197
150
  ax.set_facecolor("whitesmoke")
198
- ax.legend([plot_series[1] + "_repaired"], loc="upper left")
151
+ ax.legend([self.plot_series[1] + "_repaired"], loc="upper left")