cryptodatapy 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/PKG-INFO +1 -1
  2. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/pyproject.toml +1 -1
  3. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/setup.py +1 -1
  4. cryptodatapy-0.2.3/src/cryptodatapy/conf/fx_tickers.csv +31 -0
  5. cryptodatapy-0.2.3/src/cryptodatapy/transform/clean.py +362 -0
  6. cryptodatapy-0.2.3/src/cryptodatapy/transform/clean_perp_futures_ohlcv.ipynb +1639 -0
  7. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/transform/filter.py +56 -143
  8. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/transform/impute.py +36 -83
  9. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/transform/od.py +221 -450
  10. cryptodatapy-0.2.2/src/cryptodatapy/transform/clean.py +0 -399
  11. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/LICENSE +0 -0
  12. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/README.md +0 -0
  13. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/.DS_Store +0 -0
  14. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/.idea/.gitignore +0 -0
  15. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/.idea/cryptodatapy.iml +0 -0
  16. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/.idea/csv-plugin.xml +0 -0
  17. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/.idea/inspectionProfiles/Project_Default.xml +0 -0
  18. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/.idea/inspectionProfiles/profiles_settings.xml +0 -0
  19. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/.idea/misc.xml +0 -0
  20. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/.idea/modules.xml +0 -0
  21. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/.idea/vcs.xml +0 -0
  22. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/__init__.py +0 -0
  23. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/conf/__init__.py +0 -0
  24. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/conf/fields.csv +0 -0
  25. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/conf/tickers.csv +0 -0
  26. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/__init__.py +0 -0
  27. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/br_econ_calendar.csv +0 -0
  28. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/ca_econ_calendar.csv +0 -0
  29. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/cn_econ_calendar.csv +0 -0
  30. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/de_econ_calendar.csv +0 -0
  31. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/ez_econ_calendar.csv +0 -0
  32. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/fr_econ_calendar.csv +0 -0
  33. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/gb_econ_calendar.csv +0 -0
  34. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/get_econ_calendars.py +0 -0
  35. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/id_econ_calendar.csv +0 -0
  36. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/in_econ_calendar.csv +0 -0
  37. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/it_econ_calendar.csv +0 -0
  38. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/jp_econ_calendar.csv +0 -0
  39. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/kr_econ_calendar.csv +0 -0
  40. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/mx_econ_calendar.csv +0 -0
  41. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/ru_econ_calendar.csv +0 -0
  42. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/tr_econ_calendar.csv +0 -0
  43. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/datasets/us_econ_calendar.csv +0 -0
  44. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/__init__.py +0 -0
  45. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/CCXT-checkpoint.ipynb +0 -0
  46. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/DBNomics-checkpoint.ipynb +0 -0
  47. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/InvestPy-checkpoint.ipynb +0 -0
  48. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/NasdaqDataLink-checkpoint.ipynb +0 -0
  49. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/PandasDataReader-checkpoint.ipynb +0 -0
  50. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/data_vendors/__init__.py +0 -0
  51. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/data_vendors/coinmetrics_api.py +0 -0
  52. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/data_vendors/cryptocompare_api.py +0 -0
  53. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/data_vendors/datavendor.py +0 -0
  54. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/data_vendors/glassnode_api.py +0 -0
  55. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/data_vendors/tiingo_api.py +0 -0
  56. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/datarequest.py +0 -0
  57. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/getdata.py +0 -0
  58. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/libraries/__init__.py +0 -0
  59. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/libraries/ccxt_api.py +0 -0
  60. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/libraries/dbnomics_api.py +0 -0
  61. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/libraries/investpy_api.py +0 -0
  62. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/libraries/library.py +0 -0
  63. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/libraries/pandasdr_api.py +0 -0
  64. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/web/__init__.py +0 -0
  65. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/web/aqr.py +0 -0
  66. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/extract/web/web.py +0 -0
  67. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/transform/__init__.py +0 -0
  68. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/transform/convertparams.py +0 -0
  69. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/transform/wrangle.py +0 -0
  70. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/util/__init__.py +0 -0
  71. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/util/datacatalog.py +0 -0
  72. {cryptodatapy-0.2.2 → cryptodatapy-0.2.3}/src/cryptodatapy/util/datacredentials.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cryptodatapy
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Cryptoasset data library
5
5
  License: Apache-2.0
6
6
  Author: Systamental
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "cryptodatapy"
3
- version = "0.2.2"
3
+ version = "0.2.3"
4
4
  description = "Cryptoasset data library"
5
5
  authors = ["Systamental"]
6
6
  license = "Apache License 2.0"
@@ -103,7 +103,7 @@ extras_require = \
103
103
 
104
104
  setup_kwargs = {
105
105
  'name': 'cryptodatapy',
106
- 'version': '0.2.2',
106
+ 'version': '0.2.3',
107
107
  'description': 'Cryptoasset data library',
108
108
  'long_description': "![](cryptodatapy_logo.jpeg)\n\n# CryptoDataPy\n### _Better data beats fancier algorithms_\n<br/>\n\n**CryptoDataPy** is a python library which makes it easy to build high quality data pipelines \nfor the analysis of digital assets. By providing easy access to over 100,000 time series for thousands of assets, \nit facilitates the pre-processing of a wide range of data from different sources.\n\nCryptoassets generate a huge amount of market, on-chain and off-chain data. \nBut unlike legacy financial markets, this data is often fragmented, \nunstructured and dirty. By extracting data from various sources, \npre-processing it into a user-friendly (tidy) format, detecting and repairing 'bad' data,\nand allowing for easy storage and retrieval, CryptoDataPy allows you to spend less time gathering \nand cleaning data, and more time analyzing it.\n\nOur data includes:\n\n- **Market:** market prices of varying granularity (e.g. tick, trade and bar data, aka OHLC),\nfor spot, futures and options markets, as well as funding rates for the analysis of \ncryptoasset returns.\n- **On-chain:** network health and usage data, circulating supply, asset holder positions and \ncost-basis, for the analysis of underlying crypto network fundamentals.\n- **Off-chain:** news, social media, developer activity, web traffic and search for project interest and \nsentiment, as well as traditional financial market and macroeconomic data for broader financial and \neconomic conditions.\n\nThe library's intuitive interface facilitates each step of the ETL/ETL (extract-transform-load) process:\n\n- **Extract**: Extracting data from a wide range of data sources and file formats.\n- **Transform**: \n - Wrangling data into a pandas DataFrame in a structured and user-friendly format, \n a.k.a [tidy data](https://www.jstatsoft.org/article/view/v059i10). \n - Detecting, scrubbing and repairing 'bad' data (e.g. outliers, missing values, 0s, etc.) 
to improve the accuracy and reliability\nof machine learning/predictive models.\n- **Load**: Storing clean and ready-for-analysis data and metadata for easy access.\n\n## Installation\n\n```bash\n$ pip install cryptodatapy\n```\n\n## Usage\n\n**CryptoDataPy** allows you to pull ready-to-analyze data from a variety of sources \nwith only a few lines of code.\n\nFirst specify which data you want with a `DataRequest`:\n\n```python\n# import DataRequest\nfrom cryptodatapy.extract.datarequest import DataRequest\n# specify parameters for data request: tickers, fields, start date, end_date, etc.\ndata_req = DataRequest(\n source='glassnode', # name of data source\n tickers=['btc', 'eth'], # list of asset tickers, in CryptoDataPy format, defaults to 'btc'\n fields=['close', 'add_act', 'hashrate'], # list of fields, in CryptoDataPy, defaults to 'close'\n freq=None, # data frequency, defaults to daily \n quote_ccy=None, # defaults to USD/USDT\n exch=None, # defaults to exchange weighted average or Binance\n mkt_type= 'spot', # defaults to spot\n start_date=None, # defaults to start date for longest series\n end_date=None, # defaults to most recent \n tz=None, # defaults to UTC time\n cat=None, # optional, should be specified when asset class is not crypto, eg. 
'fx', 'rates', 'macro', etc.\n)\n```\nThen get the data :\n\n```python\n# import GetData\nfrom cryptodatapy.extract.getdata import GetData\n# get data\nGetData(data_req).get_series()\n```\n\nWith the same data request parameters, you can retrieve the same data from a different source:\n\n```python\n# modify data source parameter\ndata_req = DataRequest(\n source='coinmetrics', \n tickers=['btc', 'eth'], \n fields=['close', 'add_act', 'hashrate'], \n req='d',\n start_date='2016-01-01')\n# get data\nGetData(data_req).get_series()\n```\n\nFor more detailed code examples and interactive tutorials \nsee [here](https://github.com/systamental/cryptodatapy/blob/main/docs/example.ipynb).\n## Supported Data Sources\n\n- [CryptoCompare](https://min-api.cryptocompare.com/documentation)\n- [CCXT](https://docs.ccxt.com/en/latest/)\n- [Glassnode](https://docs.glassnode.com/)\n- [Coin Metrics](https://docs.coinmetrics.io/api/v4/)\n- [Tiingo](https://api.tiingo.com/documentation/general/overview)\n- [Yahoo Finance](https://github.com/ranaroussi/yfinance)\n- [Fama-French Data](http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html)\n- [AQR](https://www.aqr.com/Insights/Datasets)\n- [Federal Reserve Economic Data (FRED)](https://fred.stlouisfed.org/docs/api/fred/)\n- [DBnomics](https://db.nomics.world/docs/)\n- [WorldBank](https://documents.worldbank.org/en/publication/documents-reports/api)\n- [Pandas-datareader](https://pandas-datareader.readthedocs.io/en/latest/)\n\n## Contributing\n\nInterested in contributing? Check out the contributing guidelines and \ncontact us at info@systamental.com. Please note that this project is s\nreleased with a Code of Conduct. By contributing to this project, you agree \nto abide by its terms.\n\n## License\n\n`cryptodatapy` was created by Systamental. \nIt is licensed under the terms of the Apache License 2.0 license.\n\n",
109
109
  'author': 'Systamental',
@@ -0,0 +1,31 @@
1
+ id,name,tiingo_id
2
+ eurusd,,
3
+ gbpusd,,
4
+ usdjpy,,
5
+ usdchf,,
6
+ usdcad,,
7
+ usdsek,,
8
+ usdnok,,
9
+ audusd,,
10
+ nzdusd,,
11
+ usdars,,
12
+ usdmxn,,
13
+ usdbrl,,
14
+ usdcop,,
15
+ usdclp,,
16
+ usdpen,,
17
+ usdils,,
18
+ usdrub,,
19
+ usdczk,,
20
+ usdpln,,
21
+ usdhuf,,
22
+ usdzar,,
23
+ usdtry,,
24
+ usdcny,,
25
+ usdhkd,,
26
+ usdsgd,,
27
+ usdtwd,,
28
+ usdkrw,,
29
+ usdphp,,
30
+ usdinr,,
31
+ usdidr,,
@@ -0,0 +1,362 @@
1
+ from __future__ import annotations
2
+ from typing import Optional, Union
3
+ import pandas as pd
4
+
5
+ from cryptodatapy.transform.od import OutlierDetection
6
+ from cryptodatapy.transform.impute import Impute
7
+ from cryptodatapy.transform.filter import Filter
8
+
9
+
10
def stitch_dataframes(dfs):
    """
    Stitches together dataframes with different start dates.

    Starting from the dataframe with the most recent start date, earlier
    dataframes are used to back-fill history via ``combine_first``, so more
    recent sources take precedence where dates overlap.

    Parameters
    ----------
    dfs: list
        List of dataframes to be stitched together. All dataframes must have
        either a DatetimeIndex or a MultiIndex (datetime level 0). The input
        list is not modified.

    Returns
    -------
    combined_df: pd.DataFrame
        Combined dataframe with extended start date, columns ordered to match
        the widest input dataframe.

    Raises
    ------
    TypeError
        If dfs is not a list, or if index types are mixed/unsupported.
    """
    # check if dfs is a list
    if not isinstance(dfs, list):
        raise TypeError("Dataframes must be a list.")

    # sort a copy, most recent start date first, so the caller's list is
    # not mutated (the original sorted in place)
    if all(isinstance(df.index, pd.MultiIndex) for df in dfs):
        # use the actual first timestamp via get_level_values: MultiIndex.levels
        # holds sorted level values and may retain stale entries after filtering,
        # so levels[0][0] is not guaranteed to be the first row's date
        sorted_dfs = sorted(
            dfs, key=lambda df: df.index.get_level_values(0)[0], reverse=True
        )
    elif all(isinstance(df.index, pd.DatetimeIndex) for df in dfs):
        sorted_dfs = sorted(dfs, key=lambda df: df.index[0], reverse=True)
    else:
        raise TypeError("Dataframes must be pd.MultiIndex or have DatetimeIndex.")

    # most recent start date, back-filled with progressively older dfs
    combined_df = sorted_dfs[0]
    for df in sorted_dfs[1:]:
        combined_df = combined_df.combine_first(df)

    # reorder cols to match the first input df with the widest column set
    widest_df = max(dfs, key=lambda df: len(df.columns))
    combined_df = combined_df[widest_df.columns.tolist()]

    return combined_df
49
+
50
+
51
class CleanData:
    """
    Cleans data to improve data quality.

    Wraps the OutlierDetection, Impute and Filter transforms behind a fluent
    interface: each filter_*/repair_* method updates ``self.df`` and returns
    ``self``, while ``self.summary`` accumulates data quality metrics.
    """
    def __init__(self, df: pd.DataFrame):
        """
        Constructor

        Parameters
        ----------
        df: pd.DataFrame
            DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and field (cols) values.
        """
        self.raw_df = df.copy()  # keep a copy of the raw dataframe for comparison and repair
        self.df = df
        self.excluded_cols = None  # columns excluded from outlier filtering
        self.outliers = None  # detected outlier values
        self.yhat = None  # model estimates from outlier detection (used by 'fcst' imputation)
        self.filtered_df = None  # result of the most recent filter step
        self.filtered_tickers = None  # tickers removed by filter_min_nobs/filter_tickers
        self.repaired_df = None  # result of the most recent repair step
        self.summary = pd.DataFrame()  # data quality metrics, one row per metric
        self.initialize_summary()
        self.check_types()

    def initialize_summary(self) -> None:
        """
        Initializes summary dataframe with data quality metrics.
        """
        # add number of observations and % of missing values at the start
        self.summary.loc["n_obs", self.df.unstack().columns] = self.df.unstack().notna().sum().values
        self.summary.loc["%_NaN_start", self.df.unstack().columns] = \
            (self.df.unstack().isnull().sum() / self.df.unstack().shape[0]).values * 100

    def check_types(self) -> None:
        """
        Checks that the data is a pandas DataFrame.

        Raises
        ------
        TypeError
            If the data is not a pandas DataFrame.
        """
        if not isinstance(self.df, pd.DataFrame):
            raise TypeError("Data must be a pandas DataFrame.")

    def filter_outliers(
        self,
        od_method: str = "mad",
        excl_cols: Optional[Union[str, list]] = None,
        **kwargs
    ) -> CleanData:
        """
        Filters outliers.

        Parameters
        ----------
        od_method: str, {'atr', 'iqr', 'mad', 'z_score', 'ewma', 'stl', 'seasonal_decomp', 'prophet'}, default 'mad'
            Outlier detection method to use for filtering.
        excl_cols: str or list, optional
            Name of columns to exclude from outlier filtering.
        **kwargs
            Keyword arguments passed to OutlierDetection.

        Returns
        -------
        CleanData
            CleanData object
        """
        # outlier detection
        od = OutlierDetection(self.df, excl_cols=excl_cols, **kwargs)
        self.excluded_cols = excl_cols

        # filter outliers by invoking the selected detection method by name
        getattr(od, od_method)()
        self.filtered_df = od.filtered_df
        self.outliers = od.outliers
        self.yhat = od.yhat

        # add % of outliers to summary
        self.summary.loc["%_outliers", self.outliers.unstack().columns] = (
            self.outliers.unstack().notna().sum() / self.df.unstack().shape[0]
        ).values * 100

        # filtered df
        self.df = self.filtered_df

        return self

    def repair_outliers(self, imp_method: str = "interpolate", **kwargs) -> CleanData:
        """
        Repairs outliers using an imputation method.

        Parameters
        ----------
        imp_method: str, {'fwd_fill', 'interpolate', 'fcst'}, default 'interpolate'
            Imputation method used to replace filtered outliers.
        **kwargs
            Keyword arguments passed to the imputation method.

        Returns
        -------
        CleanData
            CleanData object
        """
        # impute missing vals; 'fcst' requires the model estimates (yhat)
        # produced by a prior filter_outliers call
        if imp_method == "fcst":
            self.repaired_df = getattr(Impute(self.df), imp_method)(self.yhat, **kwargs)
        else:
            self.repaired_df = getattr(Impute(self.df), imp_method)(**kwargs)

        # add repaired % to summary
        rep_vals = self.repaired_df.unstack().notna().sum() - self.df.unstack().notna().sum()
        self.summary.loc["%_imputed", self.df.unstack().columns] = rep_vals / self.df.unstack().shape[0] * 100

        # repaired df; re-attach any columns that were excluded from outlier filtering
        if self.excluded_cols is not None:
            self.df = pd.concat([self.repaired_df, self.raw_df[self.excluded_cols]], join="outer", axis=1)
        else:
            self.df = self.repaired_df
        # reorder cols to match the raw dataframe
        self.df = self.df[self.raw_df.columns]

        return self

    def filter_avg_trading_val(self, thresh_val: int = 10000000, window_size: int = 30) -> CleanData:
        """
        Filters values below a threshold of average trading value (price * volume/size in quote currency) over some
        lookback window, replacing them with NaNs.

        Parameters
        ----------
        thresh_val: int, default 10,000,000
            Threshold/cut-off for avg trading value.
        window_size: int, default 30
            Size of rolling window.

        Returns
        -------
        CleanData
            CleanData object
        """
        # filter values below the average trading value threshold
        self.filtered_df = Filter(self.df).avg_trading_val(thresh_val=thresh_val, window_size=window_size)

        # add % of filtered values to summary
        filtered_vals = self.df.unstack().notna().sum() - self.filtered_df.unstack().notna().sum()
        self.summary.loc["%_below_avg_trading_val", self.df.unstack().columns] = (
            filtered_vals / self.df.unstack().shape[0]
        ).values * 100

        # filtered df
        self.df = self.filtered_df

        return self

    def filter_missing_vals_gaps(self, gap_window: int = 30) -> CleanData:
        """
        Filters values before a large gap of missing values, replacing them with NaNs.

        Parameters
        ----------
        gap_window: int, default 30
            Size of window where all values are missing (NaNs).

        Returns
        -------
        CleanData
            CleanData object
        """
        # filter values preceding a gap of missing values
        self.filtered_df = Filter(self.df).missing_vals_gaps(gap_window=gap_window)

        # add % of filtered values to summary
        missing_vals_gap = (
            self.df.unstack().notna().sum() - self.filtered_df.unstack().notna().sum()
        )
        self.summary.loc["%_missing_vals_gaps", self.df.unstack().columns] = (
            missing_vals_gap / self.df.unstack().shape[0]
        ).values * 100

        # filtered df
        self.df = self.filtered_df

        return self

    def filter_min_nobs(self, ts_obs: int = 100, cs_obs: int = 2) -> CleanData:
        """
        Removes tickers from dataframe if the ticker has less than a minimum number of observations.

        Parameters
        ----------
        ts_obs: int, default 100
            Minimum number of observations for field/column over time series.
        cs_obs: int, default 2
            Minimum number of observations for tickers over the cross-section.

        Returns
        -------
        CleanData
            CleanData object
        """
        # filter tickers with less than the minimum number of observations
        self.filtered_df = Filter(self.df).min_nobs(ts_obs=ts_obs, cs_obs=cs_obs)

        # tickers < min obs (symmetric difference of ticker sets before/after filtering)
        self.filtered_tickers = list(
            set(self.filtered_df.index.droplevel(0).unique()).symmetric_difference(
                set(self.df.index.droplevel(0).unique())
            )
        )

        # add number of removed tickers to summary
        self.summary.loc["n_tickers_below_min_obs", self.df.unstack().columns] = len(self.filtered_tickers)

        # filtered df
        self.df = self.filtered_df

        return self

    def filter_tickers(self, tickers_list: Union[str, list]) -> CleanData:
        """
        Removes specified tickers from dataframe.

        Parameters
        ----------
        tickers_list: str or list
            List of tickers to be removed. Can be used to remove tickers to be excluded from data analysis,
            e.g. stablecoins or indexes.

        Returns
        -------
        CleanData
            CleanData object
        """
        # filter tickers
        self.filtered_df = Filter(self.df).tickers(tickers_list)

        # removed tickers (symmetric difference of ticker sets before/after filtering)
        self.filtered_tickers = list(
            set(self.filtered_df.index.droplevel(0).unique()).symmetric_difference(
                set(self.df.index.droplevel(0).unique())
            )
        )

        # add number of removed tickers to summary
        self.summary.loc["n_filtered_tickers", self.df.unstack().columns] = len(self.filtered_tickers)

        # filtered df
        self.df = self.filtered_df

        return self

    def show_plot(self, plot_series: tuple = ("BTC", "close"), compare_series: bool = True) -> None:
        """
        Plots clean time series and compares it to the raw series.

        Parameters
        ----------
        plot_series: tuple, optional, default('BTC', 'close')
            Plots the time series of a specific (ticker, field) tuple.
        compare_series: bool, default True
            Compares clean time series with raw series
        """
        # plot the cleaned series (solid blue line)
        ax = (
            self.df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
            .droplevel(1)
            .plot(
                linewidth=1,
                figsize=(15, 7),
                color="#1f77b4",
                zorder=0,
                title="Filtered vs. Raw Data",
            )
        )
        # overlay the raw series (dotted red line) for comparison
        if compare_series:
            ax = (
                self.raw_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
                .droplevel(1)
                .plot(
                    linewidth=1,
                    figsize=(15, 7),
                    linestyle=":",
                    color="#FF8785",
                    zorder=0,
                )
            )
        ax.grid(color="black", linewidth=0.05)
        ax.xaxis.grid(False)
        ax.set_ylabel(plot_series[0])
        ax.ticklabel_format(style="plain", axis="y")
        ax.set_facecolor("whitesmoke")
        ax.legend(
            [plot_series[1] + "_filtered", plot_series[1] + "_raw"], loc="upper left"
        )

    def get(self, attr="df") -> pd.DataFrame:
        """
        Returns a CleanData object attribute.

        Parameters
        ----------
        attr: str, {'df', 'outliers', 'yhat', 'filtered_tickers', 'summary'}, default 'df'
            CleanData object attribute to return

        Returns
        -------
        pd.DataFrame or list
            Requested CleanData attribute.
        """
        # finalize summary with % of NaNs remaining after cleaning
        self.summary.loc["%_NaN_end", self.df.unstack().columns] = (
            self.df.unstack().isnull().sum() / self.df.unstack().shape[0]
        ).values * 100
        self.summary = self.summary.astype(float).round(2)

        return getattr(self, attr)