cryptodatapy 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -7,12 +7,13 @@ import pandas as pd
7
7
  class Filter:
8
8
  """
9
9
  Filters dataframe in tidy format.
10
-
11
10
  """
12
-
13
- def __init__(
14
- self, raw_df: pd.DataFrame, excl_cols: Optional[Union[str, list]] = None
15
- ):
11
+ def __init__(self,
12
+ raw_df: pd.DataFrame,
13
+ excl_cols: Optional[Union[str, list]] = None,
14
+ plot: bool = False,
15
+ plot_series: tuple = ("BTC", "close")
16
+ ):
16
17
  """
17
18
  Constructor
18
19
 
@@ -22,64 +23,18 @@ class Filter:
22
23
  Dataframe with raw data. DatetimeIndex (level 0), ticker (level 1) and raw data (cols), in tidy format.
23
24
  excl_cols: str or list, default None
24
25
  Name of columns to exclude from filtering
25
-
26
26
  """
27
-
28
27
  self.raw_df = raw_df
29
28
  self.excl_cols = excl_cols
30
-
31
- def outliers(
32
- self,
33
- outliers_dict: dict,
34
- plot: bool = False,
35
- plot_series: tuple = ("BTC", "close"),
36
- ) -> pd.DataFrame:
37
- """
38
- Filters outliers, replacing them with NaNs.
39
-
40
- Parameters
41
- ----------
42
- outliers_dict: Dictionary of pd.DataFrame - MultiIndex
43
- Dictionary of forecasts (yhat), outliers (outliers) and filtered values (filt_vals) multiindex dataframes
44
- with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
45
- values.
46
- plot: bool, default False
47
- Plots series with outliers highlighted with red dots.
48
- plot_series: tuple, default ('BTC', 'close')
49
- Plots the time series of a specific (ticker, field/column) tuple.
50
-
51
- Returns
52
- -------
53
- filt_df: DataFrame - MultiIndex
54
- Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
55
-
56
- """
57
- # filter outliers
58
- filt_df = outliers_dict["filt_vals"]
59
-
60
- # add excl cols
61
- if self.excl_cols is not None:
62
- filt_df = pd.concat(
63
- [filt_df, self.raw_df[self.excl_cols]], join="outer", axis=1
64
- )
65
-
66
- # plot
67
- if plot:
68
- if not isinstance(plot_series, tuple):
69
- raise TypeError(
70
- "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
71
- )
72
- else:
73
- self.plot_filtered(filt_df, plot_series=plot_series)
74
-
75
- return filt_df
29
+ self.plot = plot
30
+ self.plot_series = plot_series
31
+ self.df = raw_df.copy() if excl_cols is None else raw_df.drop(columns=excl_cols).copy()
32
+ self.filtered_df = None
76
33
 
77
34
  def avg_trading_val(
78
35
  self,
79
36
  thresh_val: int = 10000000,
80
37
  window_size: int = 30,
81
- plot: bool = False,
82
- plot_series: tuple = ("BTC", "close"),
83
38
  ) -> pd.DataFrame:
84
39
  """
85
40
  Filters values below a threshold of average trading value (price * volume/size in quote currency) over some
@@ -91,35 +46,24 @@ class Filter:
91
46
  Threshold/cut-off for avg trading value.
92
47
  window_size: int, default 30
93
48
  Size of rolling window.
94
- plot: bool, default False
95
- Plots series with outliers highlighted with red dots.
96
- plot_series: tuple, default ('BTC', 'close')
97
- Plots the time series of a specific (ticker, field/column) tuple.
98
49
 
99
50
  Returns
100
51
  -------
101
- filt_df: DataFrame - MultiIndex
52
+ filtered_df: DataFrame - MultiIndex
102
53
  Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with values below the
103
54
  threshold removed.
104
-
105
55
  """
106
- # convert string to list
107
- if self.excl_cols is not None:
108
- df = self.raw_df.drop(columns=self.excl_cols).copy()
109
- else:
110
- df = self.raw_df.copy()
111
-
112
56
  # compute traded val
113
- if "close" in df.columns and "volume" in df.columns:
114
- df["trading_val"] = df.close * df.volume
115
- elif ("bid" in df.columns and "ask" in df.columns) and (
116
- "bid_size" in df.columns and "ask_size" in df.columns
57
+ if "close" in self.df.columns and "volume" in self.df.columns:
58
+ self.df["trading_val"] = self.df.close * self.df.volume
59
+ elif ("bid" in self.df.columns and "ask" in self.df.columns) and (
60
+ "bid_size" in self.df.columns and "ask_size" in self.df.columns
117
61
  ):
118
- df["trading_val"] = ((df.bid + df.ask) / 2) * (
119
- (df.bid_size + df.ask_size) / 2
62
+ self.df["trading_val"] = ((self.df.bid + self.df.ask) / 2) * (
63
+ (self.df.bid_size + self.df.ask_size) / 2
120
64
  )
121
- elif "trade_size" in df.columns and "trade_price" in df.columns:
122
- df["trading_val"] = df.trade_price * df.trade_size
65
+ elif "trade_size" in self.df.columns and "trade_price" in self.df.columns:
66
+ self.df["trading_val"] = self.df.trade_price * self.df.trade_size
123
67
  else:
124
68
  raise Exception(
125
69
  "Dataframe must include at least one price series (e.g. close price, trade price, "
@@ -127,36 +71,28 @@ class Filter:
127
71
  )
128
72
 
129
73
  # compute rolling mean/avg
130
- df1 = df.groupby(level=1).rolling(window_size).mean().droplevel(0)
74
+ df1 = self.df.groupby(level=1).rolling(window_size).mean().droplevel(0)
131
75
  # divide by thresh
132
76
  df1 = df1 / thresh_val
133
77
  # filter df1
134
- filt_df = (
135
- df.loc[df1.trading_val > 1].reindex(df.index).drop(columns="trading_val")
136
- )
137
- # add excl cols
138
- if self.excl_cols is not None:
139
- filt_df = pd.concat(
140
- [filt_df, self.raw_df[self.excl_cols]], join="outer", axis=1
141
- )
78
+ self.filtered_df = self.df.loc[df1.trading_val > 1].reindex(self.df.index).drop(columns="trading_val")
142
79
 
143
80
  # plot
144
- if plot:
145
- if not isinstance(plot_series, tuple):
81
+ if self.plot:
82
+ if not isinstance(self.plot_series, tuple):
146
83
  raise TypeError(
147
84
  "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
148
85
  )
149
86
  else:
150
- self.plot_filtered(filt_df, plot_series=plot_series)
87
+ self.plot_filtered(plot_series=self.plot_series)
151
88
 
152
- return filt_df
89
+ # add excl cols
90
+ if self.excl_cols is not None:
91
+ self.filtered_df = pd.concat([self.filtered_df, self.raw_df[self.excl_cols]], join="outer", axis=1)
153
92
 
154
- def missing_vals_gaps(
155
- self,
156
- gap_window: int = 30,
157
- plot: bool = False,
158
- plot_series: tuple = ("BTC", "close"),
159
- ) -> pd.DataFrame:
93
+ return self.filtered_df
94
+
95
+ def missing_vals_gaps(self, gap_window: int = 30) -> pd.DataFrame:
160
96
  """
161
97
  Filters values before a large gap of missing values, replacing them with NaNs.
162
98
 
@@ -164,27 +100,16 @@ class Filter:
164
100
  ----------
165
101
  gap_window: int, default 30
166
102
  Size of window where all values are missing (NaNs).
167
- plot: bool, default False
168
- Plots series with outliers highlighted with red dots.
169
- plot_series: tuple, default ('BTC', 'close')
170
- Plots the time series of a specific (ticker, field/column) tuple.
171
103
 
172
104
  Returns
173
105
  -------
174
- filt_df: DataFrame - MultiIndex
106
+ filtered_df: DataFrame - MultiIndex
175
107
  Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with values before
176
108
  missing values gaps removed.
177
-
178
109
  """
179
- # convert string to list
180
- if self.excl_cols is not None:
181
- df = self.raw_df.drop(columns=self.excl_cols).copy()
182
- else:
183
- df = self.raw_df.copy()
184
-
185
110
  # window obs count
186
111
  window_count = (
187
- df.groupby(level=1)
112
+ self.df.groupby(level=1)
188
113
  .rolling(window=gap_window, min_periods=gap_window)
189
114
  .count()
190
115
  .droplevel(0)
@@ -194,24 +119,24 @@ class Filter:
194
119
  for col in gap.unstack().columns:
195
120
  start_idx = gap.unstack()[col].last_valid_index()
196
121
  if start_idx is not None:
197
- df.loc[pd.IndexSlice[:start_idx, col[1]], col[0]] = np.nan
198
-
199
- # add excl cols
200
- if self.excl_cols is not None:
201
- filt_df = pd.concat([df, self.raw_df[self.excl_cols]], join="outer", axis=1)
202
- else:
203
- filt_df = df
122
+ self.df.loc[pd.IndexSlice[:start_idx, col[1]], col[0]] = np.nan
204
123
 
205
124
  # plot
206
- if plot:
207
- if not isinstance(plot_series, tuple):
125
+ if self.plot:
126
+ if not isinstance(self.plot_series, tuple):
208
127
  raise TypeError(
209
128
  "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
210
129
  )
211
130
  else:
212
- self.plot_filtered(filt_df, plot_series=plot_series)
131
+ self.plot_filtered(plot_series=self.plot_series)
132
+
133
+ # add excl cols
134
+ if self.excl_cols is not None:
135
+ self.filtered_df = pd.concat([self.df, self.raw_df[self.excl_cols]], join="outer", axis=1)
136
+ else:
137
+ self.filtered_df = self.df
213
138
 
214
- return filt_df
139
+ return self.filtered_df
215
140
 
216
141
  def min_nobs(self, ts_obs=100, cs_obs=1) -> pd.DataFrame:
217
142
  """
@@ -227,25 +152,21 @@ class Filter:
227
152
 
228
153
  Returns
229
154
  -------
230
- filt_df: DataFrame - MultiIndex
155
+ filtered_df: DataFrame - MultiIndex
231
156
  Filtered dataFrame with DatetimeIndex (level 0), tickers with minimum number of observations (level 1)
232
157
  and fields (cols).
233
-
234
158
  """
235
- # create copy
236
- df = self.raw_df.copy()
237
-
238
159
  # drop tickers with nobs < ts_obs
239
- obs = df.groupby(level=1).count().min(axis=1)
160
+ obs = self.df.groupby(level=1).count().min(axis=1)
240
161
  drop_tickers_list = obs[obs < ts_obs].index.to_list()
241
- filt_df = df.drop(drop_tickers_list, level=1, axis=0)
162
+ self.filtered_df = self.df.drop(drop_tickers_list, level=1, axis=0)
242
163
 
243
164
  # drop tickers with nobs < cs_obs
244
- obs = filt_df.groupby(level=0).count().min(axis=1)
165
+ obs = self.filtered_df.groupby(level=0).count().min(axis=1)
245
166
  idx_start = obs[obs > cs_obs].index[0]
246
- filt_df = filt_df.unstack()[filt_df.unstack().index > idx_start].stack()
247
-
248
- return filt_df
167
+ # self.filtered_df = self.filtered_df.unstack()[self.filtered_df.unstack().index > idx_start].stack()
168
+ self.filtered_df = self.filtered_df.loc[idx_start:]
169
+ return self.filtered_df
249
170
 
250
171
  def tickers(self, tickers_list) -> pd.DataFrame:
251
172
  """
@@ -259,37 +180,29 @@ class Filter:
259
180
 
260
181
  Returns
261
182
  -------
262
- filt_df: pd.DataFrame - MultiIndex
183
+ filtered_df: pd.DataFrame - MultiIndex
263
184
  Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols).
264
-
265
185
  """
266
- # create copy
267
- df = self.raw_df.copy()
268
186
  # tickers list
269
187
  if isinstance(tickers_list, str):
270
188
  tickers_list = [tickers_list]
189
+
271
190
  # drop tickers
272
- filt_df = df.drop(tickers_list, level=1, axis=0)
191
+ self.filtered_df = self.df.drop(tickers_list, level=1, axis=0)
273
192
 
274
- return filt_df
193
+ return self.filtered_df
275
194
 
276
- @staticmethod
277
- def plot_filtered(
278
- filt_df: pd.DataFrame, plot_series: Optional[tuple] = None
279
- ) -> None:
195
+ def plot_filtered(self, plot_series: Optional[tuple] = None) -> None:
280
196
  """
281
197
  Plots filtered time series.
282
198
 
283
199
  Parameters
284
200
  ----------
285
- filt_df: pd.DataFrame - MultiIndex
286
- Dataframe MultiIndex with DatetimeIndex (level 0), tickers (level 1) and filtered values (cols).
287
201
  plot_series: tuple, optional, default None
288
202
  Plots the time series of a specific (ticker, field) tuple.
289
-
290
203
  """
291
204
  ax = (
292
- filt_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
205
+ self.filtered_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
293
206
  .droplevel(1)
294
207
  .plot(linewidth=1, figsize=(15, 7), color="#1f77b4", zorder=0)
295
208
  )
@@ -7,67 +7,51 @@ import pandas as pd
7
7
  class Impute:
8
8
  """
9
9
  Handles missing values.
10
-
11
10
  """
12
-
13
- def __init__(self, filt_df: pd.DataFrame):
14
-
11
+ def __init__(self, filtered_df: pd.DataFrame, plot: bool = False, plot_series: tuple = ("BTC", "close")):
15
12
  """
16
13
  Constructor
17
14
 
18
15
  Parameters
19
16
  ----------
20
- filt_df: pd.DataFrame - MultiIndex
17
+ filtered_df: pd.DataFrame - MultiIndex
21
18
  DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with filtered values.
22
-
23
19
  """
24
- self.filt_df = filt_df
20
+ self.filtered_df = filtered_df.astype(float)
21
+ self.plot = plot
22
+ self.plot_series = plot_series
23
+ self.imputed_df = None
25
24
 
26
- def fwd_fill(
27
- self, plot: bool = False, plot_series: tuple = ("BTC", "close")
28
- ) -> pd.DataFrame:
25
+ def fwd_fill(self) -> pd.DataFrame:
29
26
  """
30
27
  Imputes missing values by imputing missing values with latest non-missing values.
31
28
 
32
- Parameters
33
- ----------
34
- plot: bool, default False
35
- Plots series with outliers highlighted with red dots.
36
- plot_series: tuple, default ('BTC', 'close')
37
- Plots the time series of a specific (ticker, field/column) tuple.
38
-
39
29
  Returns
40
30
  -------
41
- imp_df: pd.DataFrame - MultiIndex
31
+ imputed_df: pd.DataFrame - MultiIndex
42
32
  DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values
43
33
  using forward fill method.
44
-
45
34
  """
46
- # copy df
47
- filt_df = self.filt_df.copy()
48
-
49
35
  # ffill
50
- imp_df = filt_df.groupby(level=1).ffill()
36
+ self.imputed_df = self.filtered_df.groupby(level=1).ffill()
51
37
 
52
38
  # plot
53
- if plot:
54
- if not isinstance(plot_series, tuple):
39
+ if self.plot:
40
+ if not isinstance(self.plot_series, tuple):
55
41
  raise TypeError(
56
42
  "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
57
43
  )
58
44
  else:
59
- self.plot_imputed(imp_df, plot_series=plot_series)
45
+ self.plot_imputed()
60
46
 
61
- return imp_df
47
+ return self.imputed_df
62
48
 
63
49
  def interpolate(
64
50
  self,
65
51
  method: str = "linear",
66
52
  order: Optional[int] = None,
67
- axis=0,
53
+ axis: int = 0,
68
54
  limit: Optional[int] = None,
69
- plot: bool = False,
70
- plot_series: tuple = ("BTC", "close"),
71
55
  ) -> pd.DataFrame:
72
56
  """
73
57
  Imputes missing values by interpolating using various methods.
@@ -83,116 +67,85 @@ class Impute:
83
67
  Axis to interpolate along.
84
68
  limit: int, optional, default None
85
69
  Maximum number of consecutive NaNs to fill. Must be greater than 0.
86
- plot: bool, default False
87
- Plots series with outliers highlighted with red dots.
88
- plot_series: tuple, default ('BTC', 'close')
89
- Plots the time series of a specific (ticker, field/column) tuple.
90
70
 
91
71
  Returns
92
72
  -------
93
- imp_df: pd.DataFrame - MultiIndex
73
+ imputed_df: pd.DataFrame - MultiIndex
94
74
  DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values
95
75
  using interpolation method.
96
-
97
76
  """
98
- # copy df and convert to float for interpolation (code will break if type int64)
99
- filt_df = self.filt_df.astype(float).copy()
100
-
101
77
  # add order if spline or polynomial
102
78
  if (method == "spline" or method == "polynomial") and order is None:
103
79
  order = 3
104
80
 
105
81
  # interpolate
106
- imp_df = (
107
- filt_df.unstack()
108
- .interpolate(method=method, order=order, axis=axis, limit=limit)
109
- .stack()
110
- .reindex(filt_df.index)
111
- )
82
+ self.imputed_df = self.filtered_df.unstack().interpolate(method=method, order=order, axis=axis,
83
+ limit=limit).stack().reindex(self.filtered_df.index)
112
84
 
113
85
  # type conversion
114
- imp_df = imp_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
86
+ self.imputed_df = self.imputed_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
115
87
 
116
88
  # plot
117
- if plot:
118
- if not isinstance(plot_series, tuple):
89
+ if self.plot:
90
+ if not isinstance(self.plot_series, tuple):
119
91
  raise TypeError(
120
92
  "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
121
93
  )
122
94
  else:
123
- self.plot_imputed(imp_df, plot_series=plot_series)
95
+ self.plot_imputed()
124
96
 
125
- return imp_df
97
+ return self.imputed_df
126
98
 
127
99
  def fcst(
128
100
  self,
129
- fcst_df: pd.DataFrame,
130
- plot: bool = False,
131
- plot_series: tuple = ("BTC", "close"),
101
+ yhat_df: pd.DataFrame,
132
102
  ) -> pd.DataFrame:
133
103
  """
134
104
  Imputes missing values with forecasts from outlier detection algorithm.
135
105
 
136
106
  Parameters
137
107
  ----------
138
- fcst_df: pd.DataFrame - MultiIndex
108
+ yhat_df: pd.DataFrame - MultiIndex
139
109
  Multiindex dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols)
140
110
  with forecasted values.
141
- plot: bool, default False
142
- Plots series with outliers highlighted with red dots.
143
- plot_series: tuple, default ('BTC', 'close')
144
- Plots the time series of a specific (ticker, field/column) tuple.
145
111
 
146
112
  Returns
147
113
  -------
148
- imp_df: pd.DataFrame - MultiIndex
114
+ imputed_df: pd.DataFrame - MultiIndex
149
115
  DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values
150
116
  using forecasts from outlier detection method.
151
-
152
117
  """
153
- # copy filtered and forecast dfs
154
- filt_df, yhat_df = self.filt_df.copy(), fcst_df.copy()
155
-
156
118
  # impute missing vals in filtered df with fcst vals
157
- imp_yhat = np.where(filt_df.isna(), yhat_df, filt_df)
119
+ imp_yhat = np.where(self.filtered_df.isna(), yhat_df, self.filtered_df)
158
120
  # create df
159
- imp_df = pd.DataFrame(imp_yhat, index=filt_df.index, columns=filt_df.columns)
121
+ self.imputed_df = pd.DataFrame(imp_yhat, index=self.filtered_df.index, columns=self.filtered_df.columns)
160
122
 
161
123
  # type conversion
162
- imp_df = imp_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
124
+ self.imputed_df = self.imputed_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
163
125
 
164
126
  # plot
165
- if plot:
166
- if not isinstance(plot_series, tuple):
127
+ if self.plot:
128
+ if not isinstance(self.plot_series, tuple):
167
129
  raise TypeError(
168
130
  "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
169
131
  )
170
132
  else:
171
- self.plot_imputed(imp_df, plot_series=plot_series)
133
+ self.plot_imputed()
172
134
 
173
- return imp_df
135
+ return self.imputed_df
174
136
 
175
- @staticmethod
176
- def plot_imputed(imp_df: pd.DataFrame, plot_series: Optional[tuple] = None) -> None:
137
+ def plot_imputed(self) -> None:
177
138
  """
178
139
  Plots filtered time series.
179
-
180
- Parameters
181
- ----------
182
- imp_df: pd.DataFrame - MultiIndex
183
- DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values.
184
- plot_series: tuple, optional, default None
185
- Plots the time series of a specific (ticker, field) tuple.
186
-
187
140
  """
188
141
  ax = (
189
- imp_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
142
+ self.imputed_df.loc[pd.IndexSlice[:, self.plot_series[0]], self.plot_series[1]]
190
143
  .droplevel(1)
191
144
  .plot(linewidth=1, figsize=(15, 7), color="#1f77b4", zorder=0)
192
145
  )
193
146
  ax.grid(color="black", linewidth=0.05)
194
147
  ax.xaxis.grid(False)
195
- ax.set_ylabel(plot_series[0])
148
+ ax.set_ylabel(self.plot_series[0])
196
149
  ax.ticklabel_format(style="plain", axis="y")
197
150
  ax.set_facecolor("whitesmoke")
198
- ax.legend([plot_series[1] + "_repaired"], loc="upper left")
151
+ ax.legend([self.plot_series[1] + "_repaired"], loc="upper left")