cryptodatapy 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
+ id,name,tiingo_id
+ eurusd,,
+ gbpusd,,
+ usdjpy,,
+ usdchf,,
+ usdcad,,
+ usdsek,,
+ usdnok,,
+ audusd,,
+ nzdusd,,
+ usdars,,
+ usdmxn,,
+ usdbrl,,
+ usdcop,,
+ usdclp,,
+ usdpen,,
+ usdils,,
+ usdrub,,
+ usdczk,,
+ usdpln,,
+ usdhuf,,
+ usdzar,,
+ usdtry,,
+ usdcny,,
+ usdhkd,,
+ usdsgd,,
+ usdtwd,,
+ usdkrw,,
+ usdphp,,
+ usdinr,,
+ usdidr,,
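
The new file is an FX ticker metadata CSV: 31 currency pairs keyed by id, with the name and tiingo_id columns left empty for now. A minimal sketch of loading it with pandas (the file's location inside the package is an assumption; the diff does not show the path):

import pandas as pd

# Hypothetical location; the diff does not name the CSV file.
fx_tickers = pd.read_csv("cryptodatapy/datasets/fx_tickers.csv", index_col="id")
print(fx_tickers.index[:3].tolist())  # ['eurusd', 'gbpusd', 'usdjpy']
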
@@ -1,19 +1,57 @@
  from __future__ import annotations
  from typing import Optional, Union
-
  import pandas as pd

- from cryptodatapy.transform.filter import Filter
- from cryptodatapy.transform.impute import Impute
  from cryptodatapy.transform.od import OutlierDetection
+ from cryptodatapy.transform.impute import Impute
+ from cryptodatapy.transform.filter import Filter


- class CleanData:
+ def stitch_dataframes(dfs):
      """
-     Cleans data to improve data quality.
+     Stitches together dataframes with different start dates.
+
+     Parameters
+     ----------
+     dfs: list
+         List of dataframes to be stitched together.

+     Returns
+     -------
+     combined_df: pd.DataFrame
+         Combined dataframe with extended start date.
      """
+     # check if dfs is a list
+     if not isinstance(dfs, list):
+         raise TypeError("Dataframes must be a list.")
+
+     # check index types
+     if all([isinstance(df.index, pd.MultiIndex) for df in dfs]):
+         dfs.sort(key=lambda df: df.index.levels[0][0], reverse=True)
+     elif all([isinstance(df.index, pd.DatetimeIndex) for df in dfs]):
+         dfs.sort(key=lambda df: df.index[0], reverse=True)
+     else:
+         raise TypeError("Dataframes must have a MultiIndex or DatetimeIndex.")
+
+     # most recent start date
+     combined_df = dfs[0]
+
+     # combine dfs
+     for df in dfs[1:]:
+         combined_df = combined_df.combine_first(df)
+
+     # reorder cols
+     max_columns = max(len(df.columns) for df in dfs)
+     cols = next(df.columns.tolist() for df in dfs if len(df.columns) == max_columns)
+     combined_df = combined_df[cols]

+     return combined_df
+
+
+ class CleanData:
+     """
+     Cleans data to improve data quality.
+     """
      def __init__(self, df: pd.DataFrame):
          """
          Constructor
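
The headline addition in this hunk is the module-level stitch_dataframes helper, which sorts frames by start date (most recent first) and extends history backwards via combine_first. A minimal sketch with illustrative data, assuming the module path cryptodatapy.transform.clean (not shown in the diff):

import pandas as pd

from cryptodatapy.transform.clean import stitch_dataframes  # module path assumed

# Long history with one column, short recent history with an extra column.
long_df = pd.DataFrame(
    {"close": [1.0, 2.0, 3.0, 4.0]},
    index=pd.date_range("2020-01-01", periods=4),
)
short_df = pd.DataFrame(
    {"close": [3.5, 4.5], "volume": [10.0, 12.0]},
    index=pd.date_range("2020-01-03", periods=2),
)

# The most recent frame wins on overlaps; older frames extend the start date.
# Columns are reordered to match the widest frame.
combined = stitch_dataframes([short_df, long_df])
print(combined)  # 4 rows; close filled back to 2020-01-01, volume NaN there
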
@@ -22,26 +60,44 @@ class CleanData:
          ----------
          df: pd.DataFrame
              DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and field (cols) values.
-
          """
-         self.start_df = df.copy()  # keep a copy of the raw dataframe
+         self.raw_df = df.copy()  # keep a copy of the raw dataframe
          self.df = df
-         self.outliers = None  # outliers
-         self.fcsts = None  # forecasts
-         self.filtered_tickers = []  # filtered tickers
-         self.summary = pd.DataFrame()  # summary of cleaning ops
+         self.excluded_cols = None
+         self.outliers = None
+         self.yhat = None
+         self.filtered_df = None
+         self.filtered_tickers = None
+         self.repaired_df = None
+         self.summary = pd.DataFrame()
+         self.initialize_summary()
+         self.check_types()
+
+     def initialize_summary(self) -> None:
+         """
+         Initializes summary dataframe with data quality metrics.
+         """
          # add obs and missing vals
-         self.summary.loc["n_obs", self.df.unstack().columns] = (
-             self.df.unstack().notna().sum().values
-         )
-         self.summary.loc["%_NaN_start", self.df.unstack().columns] = (
-             self.df.unstack().isnull().sum() / self.df.unstack().shape[0]
-         ).values * 100
+         self.summary.loc["n_obs", self.df.unstack().columns] = self.df.unstack().notna().sum().values
+         self.summary.loc["%_NaN_start", self.df.unstack().columns] = \
+             (self.df.unstack().isnull().sum() / self.df.unstack().shape[0]).values * 100
+
+     def check_types(self) -> None:
+         """
+         Checks that the input data is a pandas DataFrame.
+         """
+         if not isinstance(self.df, pd.DataFrame):
+             raise TypeError("Data must be a pandas DataFrame.")

      def filter_outliers(
          self,
+         od_method: str = "mad",
          excl_cols: Optional[Union[str, list]] = None,
-         od_method: str = "z_score",
          **kwargs
      ) -> CleanData:
          """
@@ -49,92 +105,37 @@ class CleanData:

          Parameters
          ----------
-         excl_cols: str or list
-             Name of columns to exclude from outlier filtering.
          od_method: str, {'atr', 'iqr', 'mad', 'z_score', 'ewma', 'stl', 'seasonal_decomp', 'prophet'}, default z_score
              Outlier detection method to use for filtering.
-
-         Other Parameters
-         ----------------
-         log: bool, default False
-             Converts series into log of series.
-         window_size: int, default 7
-             Number of observations in the rolling window.
-         model_type: str, {'estimation', 'prediction'}, default 'estimation'
-             Estimation models use past, current and future values to estimate the expected value of a series,
-             e.g. expected x_t of series x at time t uses values from [x_t-s, x_t+s].
-             Prediction models use only past and current values to estimate the expected value of a series,
-             e.g. expected x_t of series x at time t uses values from [x_t-s, x_t].
-         thresh_val: int, default 2
-             Value for upper and lower thresholds used in outlier detection.
-         period: int, optional, default 7
-             periodicity of the sequence.
-         model: str, {'additive', 'multiplicative'}, default 'additive'
-             Type of seasonal component.
-         filt: array-like, optional, default None
-             The filter coefficients for filtering out the seasonal component.
-             The concrete moving average method used in filtering is determined by two_sided.
-         two_sided: bool, optional, default True
-             The moving average method used in filtering. If True (default), a centered moving average is
-             computed using the filt. If False, the filter coefficients are for past values only.
-         extrapolate_trend: int, optional, default 0
-             If set to > 0, the trend resulting from the convolution is linear least-squares extrapolated
-             on both ends (or the single one if two_sided is False) considering this many (+1) closest points.
-             If set to ‘freq’, use freq closest points. Setting this parameter results in no NaN values in trend
-             or resid components.
-         seasonal_deg: int, optional, default 1
-             Degree of seasonal LOESS. 0 (constant) or 1 (constant and trend).
-         trend_deg: int, optional, default 1
-             Degree of trend LOESS. 0 (constant) or 1 (constant and trend).
-         low_pass_deg: int, optional, default 1
-             Degree of low pass LOESS. 0 (constant) or 1 (constant and trend).
-         robust: bool, optional, default False
-             Flag indicating whether to use a weighted version that is robust to some forms of outliers.
-         seasonal_jump: int, optional, default 1
-             Positive integer determining the linear interpolation step. If larger than 1,
-             the LOESS is used every seasonal_jump points and linear interpolation is between fitted points.
-             Higher values reduce estimation time.
-         trend_jump: int, optional, default 1
-             Positive integer determining the linear interpolation step. If larger than 1,
-             the LOESS is used every trend_jump points and values between the two are linearly interpolated.
-             Higher values reduce estimation time.
-         low_pass_jump: int, optional, default 1
-             Positive integer determining the linear interpolation step. If larger than 1,
-             the LOESS is used every low_pass_jump points and values between the two are linearly interpolated.
-             Higher values reduce estimation time.
-         interval_width: float, optional, default 0.99
-             Uncertainty interval estimated by Monte Carlo simulation. The larger the value,
-             the larger the upper/lower thresholds interval for outlier detection.
-         plot: bool, default False
-             Plots series with outliers highlighted (red dots).
-         plot_series: tuple, default ('BTC', 'close')
-             The specific time series to plot given by (ticker, field/column) tuple.
+         excl_cols: str or list
+             Name of columns to exclude from outlier filtering.

          Returns
          -------
          CleanData
              CleanData object
-
          """
          # outlier detection
-         od = getattr(OutlierDetection(self.df), od_method)(**kwargs)
-         # add outliers and fcst to obj
-         self.outliers = od["outliers"]
-         self.fcsts = od["yhat"]
+         od = OutlierDetection(self.df, excl_cols=excl_cols, **kwargs)
+         self.excluded_cols = excl_cols
+
          # filter outliers
-         filt_df = Filter(self.df, excl_cols=excl_cols).outliers(od)
+         getattr(od, od_method)()
+         self.filtered_df = od.filtered_df
+         self.outliers = od.outliers
+         self.yhat = od.yhat
+
          # add to summary
-         self.summary.loc["%_outliers", self.df.unstack().columns] = (
-             od["outliers"].unstack().notna().sum() / self.df.unstack().shape[0]
+         self.summary.loc["%_outliers", self.outliers.unstack().columns] = (
+             self.outliers.unstack().notna().sum() / self.df.unstack().shape[0]
          ).values * 100
+
          # filtered df
-         self.df = filt_df
+         self.df = self.filtered_df.sort_index()

          return self

-     def repair_outliers(
-         self, imp_method: str = "interpolate", **kwargs
-     ) -> CleanData:
+     def repair_outliers(self, imp_method: str = "interpolate", **kwargs) -> CleanData:
          """
          Repairs outliers using an imputation method.

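
Outlier detection and filtering are now both handled by OutlierDetection: CleanData instantiates the detector, dispatches the chosen method by name, and copies filtered_df, outliers and yhat back onto itself. Continuing the sketch above (behavior as implied by the diff, not verified against the od module):

# MAD-based outlier filtering on close prices, leaving volume untouched.
cd = cd.filter_outliers(od_method="mad", excl_cols="volume")

print(cd.outliers.head())            # detected outliers, NaN elsewhere
print(cd.summary.loc["%_outliers"])  # outlier share per column
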
@@ -143,46 +144,33 @@ class CleanData:
          imp_method: str, {'fwd_fill', 'interpolate', 'fcst'}, default 'interpolate'
              Imputation method used to replace filtered outliers.

-         Other Parameters
-         ----------------
-         method: str, {'linear', ‘nearest’, ‘zero’, ‘slinear’, ‘quadratic’, ‘cubic’, ‘spline’, ‘barycentric’,
-             ‘polynomial’, ‘krogh’, ‘piecewise_polynomial’, ‘pchip’, ‘akima’, ‘cubicspline’}, default spline
-             Interpolation method to use.
-         order: int, optional, default None
-             Order of polynomial or spline.
-         axis: {{0 or ‘index’, 1 or ‘columns’, None}}, default None
-             Axis to interpolate along.
-         limit: int, optional, default None
-             Maximum number of consecutive NaNs to fill. Must be greater than 0.
-         plot: bool, default False
-             Plots series with outliers highlighted with red dots.
-         plot_series: tuple, default ('BTC', 'close')
-             Plots the time series of a specific (ticker, field/column) tuple.
-
          Returns
          -------
          CleanData
              CleanData object
-
          """
          # impute missing vals
          if imp_method == "fcst":
-             rep_df = getattr(Impute(self.df), imp_method)(self.fcsts, **kwargs)
+             self.repaired_df = getattr(Impute(self.df), imp_method)(self.yhat, **kwargs)
          else:
-             rep_df = getattr(Impute(self.df), imp_method)(**kwargs)
+             self.repaired_df = getattr(Impute(self.df), imp_method)(**kwargs)
+
          # add repaired % to summary
-         rep_vals = rep_df.unstack().notna().sum() - self.df.unstack().notna().sum()
-         self.summary.loc["%_imputed", self.df.unstack().columns] = (
-             rep_vals / self.df.unstack().shape[0]
-         ) * 100
+         rep_vals = self.repaired_df.unstack().notna().sum() - self.df.unstack().notna().sum()
+         self.summary.loc["%_imputed", self.df.unstack().columns] = rep_vals / self.df.unstack().shape[0] * 100
+
          # repaired df
-         self.df = rep_df
+         if self.excluded_cols is not None:
+             self.df = pd.concat([self.repaired_df, self.raw_df[self.excluded_cols]], join="inner", axis=1)
+         else:
+             self.df = self.repaired_df
+
+         # reorder cols
+         self.df = self.df[self.raw_df.columns].sort_index()

          return self

-     def filter_avg_trading_val(
-         self, thresh_val: int = 10000000, window_size: int = 30, **kwargs
-     ) -> CleanData:
+     def filter_avg_trading_val(self, thresh_val: int = 10000000, window_size: int = 30) -> CleanData:
          """
          Filters values below a threshold of average trading value (price * volume/size in quote currency) over some
          lookback window, replacing them with NaNs.
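
repair_outliers now keeps the repaired frame on the instance, re-attaches any columns that were excluded from outlier filtering, and restores the raw column order before sorting the index. Continuing the sketch:

# Interpolate over the NaNs left by the outlier filter; the excluded
# volume column is re-joined from raw_df afterwards.
cd = cd.repair_outliers(imp_method="interpolate")
print(cd.summary.loc["%_imputed"])  # share of values repaired per column
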
@@ -194,34 +182,26 @@ class CleanData:
          window_size: int, default 30
              Size of rolling window.

-         Other Parameters
-         ----------------
-         plot: bool, default False
-             Plots series with outliers highlighted with red dots.
-         plot_series: tuple, default ('BTC', 'close')
-             Plots the time series of a specific (ticker, field/column) tuple.
-
          Returns
          -------
          CleanData
              CleanData object
-
          """
          # filter outliers
-         filt_df = Filter(self.df).avg_trading_val(
-             thresh_val=thresh_val, window_size=window_size, **kwargs
-         )
+         self.filtered_df = Filter(self.df).avg_trading_val(thresh_val=thresh_val, window_size=window_size)
+
          # add to summary
-         filt_vals = self.df.unstack().notna().sum() - filt_df.unstack().notna().sum()
+         filtered_vals = self.df.unstack().notna().sum() - self.filtered_df.unstack().notna().sum()
          self.summary.loc["%_below_avg_trading_val", self.df.unstack().columns] = (
-             filt_vals / self.df.unstack().shape[0]
+             filtered_vals / self.df.unstack().shape[0]
          ).values * 100
+
          # filtered df
-         self.df = filt_df
+         self.df = self.filtered_df.sort_index()

          return self

-     def filter_missing_vals_gaps(self, gap_window: int = 30, **kwargs) -> CleanData:
+     def filter_missing_vals_gaps(self, gap_window: int = 30) -> CleanData:
          """
          Filters values before a large gap of missing values, replacing them with NaNs.

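
The liquidity filter loses its **kwargs (and with it the old plotting pass-through) and stores its result in filtered_df. A short usage sketch; with the toy data above everything would fall below the threshold, so treat the numbers as illustrative:

# NaN-out observations whose 30-day average trading value is below 10M.
cd = cd.filter_avg_trading_val(thresh_val=10_000_000, window_size=30)
print(cd.summary.loc["%_below_avg_trading_val"])
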
@@ -230,37 +210,28 @@ class CleanData:
          gap_window: int, default 30
              Size of window where all values are missing (NaNs).

-         Other Parameters
-         ----------------
-         plot: bool, default False
-             Plots series with outliers highlighted with red dots.
-         plot_series: tuple, default ('BTC', 'close')
-             Plots the time series of a specific (ticker, field/column) tuple.
-
          Returns
          -------
          CleanData
              CleanData object
-
          """
          # filter outliers
-         filt_df = Filter(self.df).missing_vals_gaps(gap_window=gap_window, **kwargs)
+         self.filtered_df = Filter(self.df).missing_vals_gaps(gap_window=gap_window)
+
          # add to summary
          missing_vals_gap = (
-             self.df.unstack().notna().sum() - filt_df.unstack().notna().sum()
+             self.df.unstack().notna().sum() - self.filtered_df.unstack().notna().sum()
          )
          self.summary.loc["%_missing_vals_gaps", self.df.unstack().columns] = (
              missing_vals_gap / self.df.unstack().shape[0]
          ).values * 100
+
          # filtered df
-         self.df = filt_df
+         self.df = self.filtered_df.sort_index()

          return self

-     def filter_min_nobs(self,
-                         ts_obs: int = 100,
-                         cs_obs: int = 2
-                         ) -> CleanData:
+     def filter_min_nobs(self, ts_obs: int = 100, cs_obs: int = 2) -> CleanData:
          """
          Removes tickers from dataframe if the ticker has less than a minimum number of observations.

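
The gap filter follows the same refactor pattern, and filter_min_nobs collapses its multi-line signature onto one line; both chain naturally. Continuing the sketch:

# Drop history preceding any 30-observation run of NaNs, then remove
# tickers with too few time-series (ts_obs) or cross-sectional (cs_obs) obs.
cd = cd.filter_missing_vals_gaps(gap_window=30).filter_min_nobs(ts_obs=100, cs_obs=2)
print(cd.filtered_tickers)  # tickers removed by the last filter, possibly empty
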
@@ -277,21 +248,54 @@ class CleanData:
              CleanData object
          """
          # filter outliers
-         filt_df = Filter(self.df).min_nobs(ts_obs=ts_obs, cs_obs=cs_obs)
+         self.filtered_df = Filter(self.df).min_nobs(ts_obs=ts_obs, cs_obs=cs_obs)
+
          # tickers < min obs
-         filt_tickers = list(
-             set(filt_df.index.droplevel(0).unique()).symmetric_difference(
+         self.filtered_tickers = list(
+             set(self.filtered_df.index.droplevel(0).unique()).symmetric_difference(
                  set(self.df.index.droplevel(0).unique())
              )
          )
-         # add to obj
-         if len(filt_tickers) != 0:
-             self.filtered_tickers.extend(filt_tickers)
-         self.summary.loc["n_tickers_below_min_obs", self.df.unstack().columns] = len(
-             filt_tickers
+
+         # add to summary
+         self.summary.loc["n_tickers_below_min_obs", self.df.unstack().columns] = len(self.filtered_tickers)
+
+         # filtered df
+         self.df = self.filtered_df.sort_index()
+
+         return self
+
+     def filter_delisted_tickers(self, field: str = 'close', n_unch_vals: int = 30) -> CleanData:
+         """
+         Removes delisted tickers from dataframe.
+
+         Parameters
+         ----------
+         field: str, default 'close'
+             Field/column to use for detecting delisted tickers.
+         n_unch_vals: int, default 30
+             Number of consecutive unchanged values to consider a ticker as delisted.
+
+         Returns
+         -------
+         CleanData
+             CleanData object
+         """
+         # filter tickers
+         self.filtered_df = Filter(self.df).remove_delisted(field=field, n_unch_vals=n_unch_vals)
+
+         # delisted tickers
+         self.filtered_tickers = list(
+             set(self.filtered_df.index.droplevel(0).unique()).symmetric_difference(
+                 set(self.df.index.droplevel(0).unique())
+             )
          )
+
+         # add to summary
+         self.summary.loc["n_filtered_tickers", self.df.unstack().columns] = len(self.filtered_tickers)
+
          # filtered df
-         self.df = filt_df
+         self.df = self.filtered_df.sort_index()

          return self

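
filter_delisted_tickers is new in 0.2.4: it delegates to a Filter.remove_delisted helper that flags tickers whose chosen field repeats unchanged for n_unch_vals consecutive rows. A usage sketch:

# Treat 30 consecutive unchanged closes as a delisting and drop the ticker.
cd = cd.filter_delisted_tickers(field="close", n_unch_vals=30)
print(cd.summary.loc["n_filtered_tickers"])
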
@@ -309,30 +313,27 @@ class CleanData:
          -------
          CleanData
              CleanData object
-
          """
          # filter tickers
-         filt_df = Filter(self.df).tickers(tickers_list)
+         self.filtered_df = Filter(self.df).tickers(tickers_list)
+
          # tickers < min obs
-         filt_tickers = list(
-             set(filt_df.index.droplevel(0).unique()).symmetric_difference(
+
+         self.filtered_tickers = list(
+             set(self.filtered_df.index.droplevel(0).unique()).symmetric_difference(
                  set(self.df.index.droplevel(0).unique())
              )
          )
-         # add to obj properties
-         if len(filt_tickers) != 0:
-             self.filtered_tickers.extend(filt_tickers)
-         self.summary.loc["n_filtered_tickers", self.df.unstack().columns] = len(
-             filt_tickers
-         )
+
+         # add to summary
+         self.summary.loc["n_filtered_tickers", self.df.unstack().columns] = len(self.filtered_tickers)
+
          # filtered df
-         self.df = filt_df
+         self.df = self.filtered_df.sort_index()

          return self

-     def show_plot(
-         self, plot_series: tuple = ("BTC", "close"), compare_series: bool = True
-     ) -> None:
+     def show_plot(self, plot_series: tuple = ("BTC", "close"), compare_series: bool = True) -> None:
          """
          Plots clean time series and compares it to the raw series.

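
The tickers-list filter in this hunk (its def line falls outside the hunk; the name filter_tickers is assumed from the class's naming pattern) now records removed tickers on the instance instead of extending a list. Sketch:

# Drop ETH explicitly; removed names land in cd.filtered_tickers.
cd = cd.filter_tickers(["ETH"])  # method name assumed, not shown in the hunk
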
@@ -342,7 +343,6 @@ class CleanData:
              Plots the time series of a specific (ticker, field) tuple.
          compare_series: bool, default True
              Compares clean time series with raw series.
-
          """
          ax = (
              self.df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
@@ -357,7 +357,7 @@ class CleanData:
          )
          if compare_series:
              ax = (
-                 self.start_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
+                 self.raw_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
                  .droplevel(1)
                  .plot(
                      linewidth=1,
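
show_plot reads the comparison series from the renamed raw_df attribute. Usage sketch:

# Overlay the cleaned BTC close on the raw series kept at construction time.
cd.show_plot(plot_series=("BTC", "close"), compare_series=True)
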
@@ -382,14 +382,13 @@ class CleanData:

          Parameters
          ----------
-         attr: str, {'df', 'outliers', 'fcst', 'filtered_tickers', 'summary'}, default 'df'
+         attr: str, {'df', 'outliers', 'yhat', 'filtered_tickers', 'summary'}, default 'df'
              CleanData object attribute to return.

          Returns
          -------
          CleanData
              CleanData object
-
          """
          self.summary.loc["%_NaN_end", self.df.unstack().columns] = (
              self.df.unstack().isnull().sum() / self.df.unstack().shape[0]
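
The closing hunk (truncated here) records %_NaN_end in the summary before returning the requested attribute. An end-to-end sketch of the cleaning pipeline as it reads after 0.2.4; the accessor's name is assumed to be get, since its def line is not shown:

# Full cleaning chain on the toy frame from the construction sketch above.
clean_df = (
    CleanData(df)
    .filter_outliers(od_method="mad")
    .repair_outliers(imp_method="interpolate")
    .filter_min_nobs(ts_obs=100, cs_obs=2)
    .get(attr="df")  # accessor name assumed; the docstring shows only 'attr'
)
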