cryptodatapy 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ id,name,tiingo_id
2
+ eurusd,,
3
+ gbpusd,,
4
+ usdjpy,,
5
+ usdchf,,
6
+ usdcad,,
7
+ usdsek,,
8
+ usdnok,,
9
+ audusd,,
10
+ nzdusd,,
11
+ usdars,,
12
+ usdmxn,,
13
+ usdbrl,,
14
+ usdcop,,
15
+ usdclp,,
16
+ usdpen,,
17
+ usdils,,
18
+ usdrub,,
19
+ usdczk,,
20
+ usdpln,,
21
+ usdhuf,,
22
+ usdzar,,
23
+ usdtry,,
24
+ usdcny,,
25
+ usdhkd,,
26
+ usdsgd,,
27
+ usdtwd,,
28
+ usdkrw,,
29
+ usdphp,,
30
+ usdinr,,
31
+ usdidr,,
@@ -1,19 +1,57 @@
1
1
  from __future__ import annotations
2
2
  from typing import Optional, Union
3
-
4
3
  import pandas as pd
5
4
 
6
- from cryptodatapy.transform.filter import Filter
7
- from cryptodatapy.transform.impute import Impute
8
5
  from cryptodatapy.transform.od import OutlierDetection
6
+ from cryptodatapy.transform.impute import Impute
7
+ from cryptodatapy.transform.filter import Filter
9
8
 
10
9
 
11
- class CleanData:
10
def stitch_dataframes(dfs: list) -> pd.DataFrame:
    """
    Stitches together dataframes with different start dates.

    Dataframes are ordered from the most recent to the earliest start date and merged with
    `combine_first`, so where dataframes overlap, values from the dataframe with the more recent
    start date take priority and the remaining dataframes only fill in missing values and rows.

    Parameters
    ----------
    dfs: list
        List of dataframes to be stitched together. All dataframes must be indexed either by a
        pd.MultiIndex (dates in level 0) or by a pd.DatetimeIndex. The list itself is not modified.

    Returns
    -------
    combined_df: pd.DataFrame
        Combined dataframe with extended start date. Columns are restricted to, and ordered like,
        those of the first dataframe (in start date order) that has the largest number of columns.

    Raises
    ------
    TypeError
        If dfs is not a list, or if the dataframes do not all share one of the supported index types.
    ValueError
        If dfs is an empty list.
    """
    # check if dfs is a list
    if not isinstance(dfs, list):
        raise TypeError("Dataframes must be a list.")
    if not dfs:
        raise ValueError("Dataframes list must not be empty.")

    # check index types and choose how the start date is read
    if all(isinstance(df.index, pd.MultiIndex) for df in dfs):
        # read the dates actually present in level 0: index.levels can still hold dates
        # that were sliced away, which would misreport the start date
        def start_date(df):
            return df.index.get_level_values(0).min()
    elif all(isinstance(df.index, pd.DatetimeIndex) for df in dfs):
        # min() rather than the first label, so an unsorted index still gives the true start date
        def start_date(df):
            return df.index.min()
    else:
        raise TypeError("Dataframes must be pd.MultiIndex or have DatetimeIndex.")

    # most recent start date first; sorted() leaves the caller's list untouched
    ordered_dfs = sorted(dfs, key=start_date, reverse=True)

    # combine dfs, earlier-starting dataframes only fill the gaps
    combined_df = ordered_dfs[0]
    for df in ordered_dfs[1:]:
        combined_df = combined_df.combine_first(df)

    # reorder cols
    max_columns = max(len(df.columns) for df in ordered_dfs)
    cols = next(df.columns.tolist() for df in ordered_dfs if len(df.columns) == max_columns)

    return combined_df[cols]
49
+
50
+
51
+ class CleanData:
52
+ """
53
+ Cleans data to improve data quality.
54
+ """
17
55
  def __init__(self, df: pd.DataFrame):
18
56
  """
19
57
  Constructor
@@ -22,26 +60,44 @@ class CleanData:
22
60
  ----------
23
61
  df: pd.DataFrame
24
62
  DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and field (cols) values.
25
-
26
63
  """
27
- self.start_df = df.copy() # keepy copy of raw dataframe
64
+ self.raw_df = df.copy() # keepy copy of raw dataframe
28
65
  self.df = df
29
- self.outliers = None # outliers
30
- self.fcsts = None # forecasts
31
- self.filtered_tickers = [] # filtered tickers
32
- self.summary = pd.DataFrame() # summary of cleaning ops
66
+ self.excluded_cols = None
67
+ self.outliers = None
68
+ self.yhat = None
69
+ self.filtered_df = None
70
+ self.filtered_tickers = None
71
+ self.repaired_df = None
72
+ self.summary = pd.DataFrame()
73
+ self.initialize_summary()
74
+ self.check_types()
75
+
76
def initialize_summary(self) -> None:
    """
    Initializes summary dataframe with data quality metrics.

    Writes two rows into self.summary, with one column per column of the unstacked self.df:
    "n_obs", the number of non-missing values, and "%_NaN_start", the percentage of missing values.
    """
    # unstack once and reuse it: every call to unstack() builds a new wide dataframe
    wide_df = self.df.unstack()
    n_rows = wide_df.shape[0]

    # add obs and missing vals
    self.summary.loc["n_obs", wide_df.columns] = wide_df.notna().sum().values
    self.summary.loc["%_NaN_start", wide_df.columns] = (wide_df.isnull().sum() / n_rows).values * 100
84
+
85
def check_types(self) -> None:
    """
    Checks that the data held by the object is a pandas DataFrame.

    Only validates the type of self.df, no conversion is performed.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If self.df is not a pd.DataFrame.
    """
    if not isinstance(self.df, pd.DataFrame):
        raise TypeError("Data must be a pandas DataFrame.")
40
96
 
41
97
  def filter_outliers(
42
98
  self,
99
+ od_method: str = "mad",
43
100
  excl_cols: Optional[Union[str, list]] = None,
44
- od_method: str = "z_score",
45
101
  **kwargs
46
102
  ) -> CleanData:
47
103
  """
@@ -49,92 +105,37 @@ class CleanData:
49
105
 
50
106
  Parameters
51
107
  ----------
52
- excl_cols: str or list
53
- Name of columns to exclude from outlier filtering.
54
108
  od_method: str, {'atr', 'iqr', 'mad', 'z_score', 'ewma', 'stl', 'seasonal_decomp', 'prophet'}, default z_score
55
109
  Outlier detection method to use for filtering.
56
-
57
- Other Parameters
58
- ----------------
59
- log: bool, default False
60
- Converts series into log of series.
61
- window_size: int, default 7
62
- Number of observations in the rolling window.
63
- model_type: str, {'estimation', 'prediction'}, default 'estimation'
64
- Estimation models use past, current and future values to estimate the expected value of a series,
65
- e.g. expected x_t of series x at time t uses values from [x_t-s, x_t+s].
66
- Prediction models use only past and current values to estimate the expected value of a series,
67
- e.g. expected x_t of series x at time t uses values from [x_t-s, x_t].
68
- thresh_val: int, default 2
69
- Value for upper and lower thresholds used in outlier detection.
70
- period: int, optional, default 7
71
- periodicity of the sequence.
72
- model: str, {'additive', 'multiplicative'}, default 'additive'
73
- Type of seasonal component.
74
- filt: array-like, optional, default None
75
- The filter coefficients for filtering out the seasonal component.
76
- The concrete moving average method used in filtering is determined by two_sided.
77
- two_sided: bool, optional, default True
78
- The moving average method used in filtering. If True (default), a centered moving average is
79
- computed using the filt. If False, the filter coefficients are for past values only.
80
- extrapolate_trend: int, optional, default 0
81
- If set to > 0, the trend resulting from the convolution is linear least-squares extrapolated
82
- on both ends (or the single one if two_sided is False) considering this many (+1) closest points.
83
- If set to ‘freq’, use freq closest points. Setting this parameter results in no NaN values in trend
84
- or resid components.
85
- seasonal_deg: int, optional, default 1
86
- Degree of seasonal LOESS. 0 (constant) or 1 (constant and trend).
87
- trend_deg: int, optional, default 1
88
- Degree of trend LOESS. 0 (constant) or 1 (constant and trend).
89
- low_pass_deg: int, optional, default 1
90
- Degree of low pass LOESS. 0 (constant) or 1 (constant and trend).
91
- robust: bool, optional, default False
92
- Flag indicating whether to use a weighted version that is robust to some forms of outliers.
93
- seasonal_jump: int, optional, default 1
94
- Positive integer determining the linear interpolation step. If larger than 1,
95
- the LOESS is used every seasonal_jump points and linear interpolation is between fitted points.
96
- Higher values reduce estimation time.
97
- trend_jump: int, optional, default 1
98
- Positive integer determining the linear interpolation step. If larger than 1,
99
- the LOESS is used every trend_jump points and values between the two are linearly interpolated.
100
- Higher values reduce estimation time.
101
- low_pass_jump: int, optional, default 1
102
- Positive integer determining the linear interpolation step. If larger than 1,
103
- the LOESS is used every low_pass_jump points and values between the two are linearly interpolated.
104
- Higher values reduce estimation time.
105
- interval_width: float, optional, default 0.99
106
- Uncertainty interval estimated by Monte Carlo simulation. The larger the value,
107
- the larger the upper/lower thresholds interval for outlier detection.
108
- plot: bool, default False
109
- Plots series with outliers highlighted (red dots).
110
- plot_series: tuple, default ('BTC', 'close')
111
- The specific time series to plot given by (ticker, field/column) tuple.
110
+ excl_cols: str or list
111
+ Name of columns to exclude from outlier filtering.
112
112
 
113
113
  Returns
114
114
  -------
115
115
  CleanData
116
116
  CleanData object
117
-
118
117
  """
119
118
  # outlier detection
120
- od = getattr(OutlierDetection(self.df), od_method)(**kwargs)
121
- # add outliers and fcst to obj
122
- self.outliers = od["outliers"]
123
- self.fcsts = od["yhat"]
119
+ od = OutlierDetection(self.df, excl_cols=excl_cols, **kwargs)
120
+ self.excluded_cols = excl_cols
121
+
124
122
  # filter outliers
125
- filt_df = Filter(self.df, excl_cols=excl_cols).outliers(od)
123
+ getattr(od, od_method)()
124
+ self.filtered_df = od.filtered_df
125
+ self.outliers = od.outliers
126
+ self.yhat = od.yhat
127
+
126
128
  # add to summary
127
- self.summary.loc["%_outliers", self.df.unstack().columns] = (
128
- od["outliers"].unstack().notna().sum() / self.df.unstack().shape[0]
129
+ self.summary.loc["%_outliers", self.outliers.unstack().columns] = (
130
+ self.outliers.unstack().notna().sum() / self.df.unstack().shape[0]
129
131
  ).values * 100
132
+
130
133
  # filtered df
131
- self.df = filt_df
134
+ self.df = self.filtered_df
132
135
 
133
136
  return self
134
137
 
135
- def repair_outliers(
136
- self, imp_method: str = "interpolate", **kwargs
137
- ) -> CleanData:
138
+ def repair_outliers(self, imp_method: str = "interpolate", **kwargs) -> CleanData:
138
139
  """
139
140
  Repairs outliers using an imputation method.
140
141
 
@@ -143,46 +144,32 @@ class CleanData:
143
144
  imp_method: str, {"fwd_fill', 'interpolate', 'fcst'}, default 'fwd_fill'
144
145
  Imputation method used to replace filtered outliers.
145
146
 
146
- Other Parameters
147
- ----------------
148
- method: str, {'linear', ‘nearest’, ‘zero’, ‘slinear’, ‘quadratic’, ‘cubic’, ‘spline’, ‘barycentric’,
149
- ‘polynomial’, ‘krogh’, ‘piecewise_polynomial’, ‘pchip’, ‘akima’, ‘cubicspline’}, default spline
150
- Interpolation method to use.
151
- order: int, optional, default None
152
- Order of polynomial or spline.
153
- axis: {{0 or ‘index’, 1 or ‘columns’, None}}, default None
154
- Axis to interpolate along.
155
- limit: int, optional, default None
156
- Maximum number of consecutive NaNs to fill. Must be greater than 0.
157
- plot: bool, default False
158
- Plots series with outliers highlighted with red dots.
159
- plot_series: tuple, default ('BTC', 'close')
160
- Plots the time series of a specific (ticker, field/column) tuple.
161
-
162
147
  Returns
163
148
  -------
164
149
  CleanData
165
150
  CleanData object
166
-
167
151
  """
168
152
  # impute missing vals
169
153
  if imp_method == "fcst":
170
- rep_df = getattr(Impute(self.df), imp_method)(self.fcsts, **kwargs)
154
+ self.repaired_df = getattr(Impute(self.df), imp_method)(self.yhat, **kwargs)
171
155
  else:
172
- rep_df = getattr(Impute(self.df), imp_method)(**kwargs)
156
+ self.repaired_df = getattr(Impute(self.df), imp_method)(**kwargs)
157
+
173
158
  # add repaired % to summary
174
- rep_vals = rep_df.unstack().notna().sum() - self.df.unstack().notna().sum()
175
- self.summary.loc["%_imputed", self.df.unstack().columns] = (
176
- rep_vals / self.df.unstack().shape[0]
177
- ) * 100
159
+ rep_vals = self.repaired_df.unstack().notna().sum() - self.df.unstack().notna().sum()
160
+ self.summary.loc["%_imputed", self.df.unstack().columns] = rep_vals / self.df.unstack().shape[0] * 100
161
+
178
162
  # repaired df
179
- self.df = rep_df
163
+ if self.excluded_cols is not None:
164
+ self.df = pd.concat([self.repaired_df, self.raw_df[self.excluded_cols]], join="outer", axis=1)
165
+ else:
166
+ self.df = self.repaired_df
167
+ # reorder cols
168
+ self.df = self.df[self.raw_df.columns]
180
169
 
181
170
  return self
182
171
 
183
- def filter_avg_trading_val(
184
- self, thresh_val: int = 10000000, window_size: int = 30, **kwargs
185
- ) -> CleanData:
172
+ def filter_avg_trading_val(self, thresh_val: int = 10000000, window_size: int = 30) -> CleanData:
186
173
  """
187
174
  Filters values below a threshold of average trading value (price * volume/size in quote currency) over some
188
175
  lookback window, replacing them with NaNs.
@@ -194,34 +181,26 @@ class CleanData:
194
181
  window_size: int, default 30
195
182
  Size of rolling window.
196
183
 
197
- Other Parameters
198
- ----------------
199
- plot: bool, default False
200
- Plots series with outliers highlighted with red dots.
201
- plot_series: tuple, default ('BTC', 'close')
202
- Plots the time series of a specific (ticker, field/column) tuple.
203
-
204
184
  Returns
205
185
  -------
206
186
  CleanData
207
187
  CleanData object
208
-
209
188
  """
210
189
  # filter outliers
211
- filt_df = Filter(self.df).avg_trading_val(
212
- thresh_val=thresh_val, window_size=window_size, **kwargs
213
- )
190
+ self.filtered_df = Filter(self.df).avg_trading_val(thresh_val=thresh_val, window_size=window_size)
191
+
214
192
  # add to summary
215
- filt_vals = self.df.unstack().notna().sum() - filt_df.unstack().notna().sum()
193
+ filtered_vals = self.df.unstack().notna().sum() - self.filtered_df.unstack().notna().sum()
216
194
  self.summary.loc["%_below_avg_trading_val", self.df.unstack().columns] = (
217
- filt_vals / self.df.unstack().shape[0]
195
+ filtered_vals / self.df.unstack().shape[0]
218
196
  ).values * 100
197
+
219
198
  # filtered df
220
- self.df = filt_df
199
+ self.df = self.filtered_df
221
200
 
222
201
  return self
223
202
 
224
- def filter_missing_vals_gaps(self, gap_window: int = 30, **kwargs) -> CleanData:
203
+ def filter_missing_vals_gaps(self, gap_window: int = 30) -> CleanData:
225
204
  """
226
205
  Filters values before a large gap of missing values, replacing them with NaNs.
227
206
 
@@ -230,37 +209,28 @@ class CleanData:
230
209
  gap_window: int, default 30
231
210
  Size of window where all values are missing (NaNs).
232
211
 
233
- Other Parameters
234
- ----------------
235
- plot: bool, default False
236
- Plots series with outliers highlighted with red dots.
237
- plot_series: tuple, default ('BTC', 'close')
238
- Plots the time series of a specific (ticker, field/column) tuple.
239
-
240
212
  Returns
241
213
  -------
242
214
  CleanData
243
215
  CleanData object
244
-
245
216
  """
246
217
  # filter outliers
247
- filt_df = Filter(self.df).missing_vals_gaps(gap_window=gap_window, **kwargs)
218
+ self.filtered_df = Filter(self.df).missing_vals_gaps(gap_window=gap_window)
219
+
248
220
  # add to summary
249
221
  missing_vals_gap = (
250
- self.df.unstack().notna().sum() - filt_df.unstack().notna().sum()
222
+ self.df.unstack().notna().sum() - self.filtered_df.unstack().notna().sum()
251
223
  )
252
224
  self.summary.loc["%_missing_vals_gaps", self.df.unstack().columns] = (
253
225
  missing_vals_gap / self.df.unstack().shape[0]
254
226
  ).values * 100
227
+
255
228
  # filtered df
256
- self.df = filt_df
229
+ self.df = self.filtered_df
257
230
 
258
231
  return self
259
232
 
260
- def filter_min_nobs(self,
261
- ts_obs: int = 100,
262
- cs_obs: int = 2
263
- ) -> CleanData:
233
+ def filter_min_nobs(self, ts_obs: int = 100, cs_obs: int = 2) -> CleanData:
264
234
  """
265
235
  Removes tickers from dataframe if the ticker has less than a minimum number of observations.
266
236
 
@@ -277,21 +247,20 @@ class CleanData:
277
247
  CleanData object
278
248
  """
279
249
  # filter outliers
280
- filt_df = Filter(self.df).min_nobs(ts_obs=ts_obs, cs_obs=cs_obs)
250
+ self.filtered_df = Filter(self.df).min_nobs(ts_obs=ts_obs, cs_obs=cs_obs)
251
+
281
252
  # tickers < min obs
282
- filt_tickers = list(
283
- set(filt_df.index.droplevel(0).unique()).symmetric_difference(
253
+ self.filtered_tickers = list(
254
+ set(self.filtered_df.index.droplevel(0).unique()).symmetric_difference(
284
255
  set(self.df.index.droplevel(0).unique())
285
256
  )
286
257
  )
287
- # add to obj
288
- if len(filt_tickers) != 0:
289
- self.filtered_tickers.extend(filt_tickers)
290
- self.summary.loc["n_tickers_below_min_obs", self.df.unstack().columns] = len(
291
- filt_tickers
292
- )
258
+
259
+ # add to summary
260
+ self.summary.loc["n_tickers_below_min_obs", self.df.unstack().columns] = len(self.filtered_tickers)
261
+
293
262
  # filtered df
294
- self.df = filt_df
263
+ self.df = self.filtered_df
295
264
 
296
265
  return self
297
266
 
@@ -309,30 +278,26 @@ class CleanData:
309
278
  -------
310
279
  CleanData
311
280
  CleanData object
312
-
313
281
  """
314
282
  # filter tickers
315
- filt_df = Filter(self.df).tickers(tickers_list)
283
+ self.filtered_df = Filter(self.df).tickers(tickers_list)
284
+
316
285
  # tickers < min obs
317
- filt_tickers = list(
318
- set(filt_df.index.droplevel(0).unique()).symmetric_difference(
286
+ self.filtered_tickers = list(
287
+ set(self.filtered_df.index.droplevel(0).unique()).symmetric_difference(
319
288
  set(self.df.index.droplevel(0).unique())
320
289
  )
321
290
  )
322
- # add to obj properties
323
- if len(filt_tickers) != 0:
324
- self.filtered_tickers.extend(filt_tickers)
325
- self.summary.loc["n_filtered_tickers", self.df.unstack().columns] = len(
326
- filt_tickers
327
- )
291
+
292
+ # add to summary
293
+ self.summary.loc["n_filtered_tickers", self.df.unstack().columns] = len(self.filtered_tickers)
294
+
328
295
  # filtered df
329
- self.df = filt_df
296
+ self.df = self.filtered_df
330
297
 
331
298
  return self
332
299
 
333
- def show_plot(
334
- self, plot_series: tuple = ("BTC", "close"), compare_series: bool = True
335
- ) -> None:
300
+ def show_plot(self, plot_series: tuple = ("BTC", "close"), compare_series: bool = True) -> None:
336
301
  """
337
302
  Plots clean time series and compares it to the raw series.
338
303
 
@@ -342,7 +307,6 @@ class CleanData:
342
307
  Plots the time series of a specific (ticker, field) tuple.
343
308
  compare_series: bool, default True
344
309
  Compares clean time series with raw series
345
-
346
310
  """
347
311
  ax = (
348
312
  self.df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
@@ -357,7 +321,7 @@ class CleanData:
357
321
  )
358
322
  if compare_series:
359
323
  ax = (
360
- self.start_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
324
+ self.raw_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
361
325
  .droplevel(1)
362
326
  .plot(
363
327
  linewidth=1,
@@ -382,14 +346,13 @@ class CleanData:
382
346
 
383
347
  Parameters
384
348
  ----------
385
- attr: str, {'df', 'outliers', 'fcst', 'filtered_tickers', 'summary'}, default 'df'
349
+ attr: str, {'df', 'outliers', 'yhat', 'filtered_tickers', 'summary'}, default 'df'
386
350
  GetData object attribute to return
387
351
 
388
352
  Returns
389
353
  -------
390
354
  CleanData
391
355
  CleanData object
392
-
393
356
  """
394
357
  self.summary.loc["%_NaN_end", self.df.unstack().columns] = (
395
358
  self.df.unstack().isnull().sum() / self.df.unstack().shape[0]