cryptodatapy 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/PKG-INFO +4 -1
  2. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/pyproject.toml +4 -1
  3. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/setup.py +4 -1
  4. cryptodatapy-0.2.4/src/cryptodatapy/conf/fx_tickers.csv +31 -0
  5. cryptodatapy-0.2.4/src/cryptodatapy/transform/clean.py +398 -0
  6. cryptodatapy-0.2.4/src/cryptodatapy/transform/clean_perp_futures_ohlcv.ipynb +1025 -0
  7. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/transform/filter.py +83 -142
  8. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/transform/impute.py +36 -83
  9. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/transform/od.py +221 -450
  10. cryptodatapy-0.2.2/src/cryptodatapy/transform/clean.py +0 -399
  11. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/LICENSE +0 -0
  12. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/README.md +0 -0
  13. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.DS_Store +0 -0
  14. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/.gitignore +0 -0
  15. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/cryptodatapy.iml +0 -0
  16. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/csv-plugin.xml +0 -0
  17. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/inspectionProfiles/Project_Default.xml +0 -0
  18. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/inspectionProfiles/profiles_settings.xml +0 -0
  19. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/misc.xml +0 -0
  20. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/modules.xml +0 -0
  21. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/vcs.xml +0 -0
  22. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/__init__.py +0 -0
  23. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/conf/__init__.py +0 -0
  24. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/conf/fields.csv +0 -0
  25. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/conf/tickers.csv +0 -0
  26. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/__init__.py +0 -0
  27. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/br_econ_calendar.csv +0 -0
  28. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/ca_econ_calendar.csv +0 -0
  29. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/cn_econ_calendar.csv +0 -0
  30. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/de_econ_calendar.csv +0 -0
  31. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/ez_econ_calendar.csv +0 -0
  32. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/fr_econ_calendar.csv +0 -0
  33. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/gb_econ_calendar.csv +0 -0
  34. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/get_econ_calendars.py +0 -0
  35. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/id_econ_calendar.csv +0 -0
  36. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/in_econ_calendar.csv +0 -0
  37. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/it_econ_calendar.csv +0 -0
  38. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/jp_econ_calendar.csv +0 -0
  39. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/kr_econ_calendar.csv +0 -0
  40. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/mx_econ_calendar.csv +0 -0
  41. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/ru_econ_calendar.csv +0 -0
  42. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/tr_econ_calendar.csv +0 -0
  43. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/us_econ_calendar.csv +0 -0
  44. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/__init__.py +0 -0
  45. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/CCXT-checkpoint.ipynb +0 -0
  46. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/DBNomics-checkpoint.ipynb +0 -0
  47. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/InvestPy-checkpoint.ipynb +0 -0
  48. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/NasdaqDataLink-checkpoint.ipynb +0 -0
  49. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/PandasDataReader-checkpoint.ipynb +0 -0
  50. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/__init__.py +0 -0
  51. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/coinmetrics_api.py +0 -0
  52. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/cryptocompare_api.py +0 -0
  53. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/datavendor.py +0 -0
  54. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/glassnode_api.py +0 -0
  55. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/tiingo_api.py +0 -0
  56. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/datarequest.py +0 -0
  57. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/getdata.py +0 -0
  58. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/libraries/__init__.py +0 -0
  59. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/libraries/ccxt_api.py +0 -0
  60. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/libraries/dbnomics_api.py +0 -0
  61. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/libraries/investpy_api.py +0 -0
  62. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/libraries/library.py +0 -0
  63. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/libraries/pandasdr_api.py +0 -0
  64. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/web/__init__.py +0 -0
  65. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/web/aqr.py +0 -0
  66. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/web/web.py +0 -0
  67. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/transform/__init__.py +0 -0
  68. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/transform/convertparams.py +0 -0
  69. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/transform/wrangle.py +0 -0
  70. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/util/__init__.py +0 -0
  71. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/util/datacatalog.py +0 -0
  72. {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/util/datacredentials.py +0 -0

{cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cryptodatapy
- Version: 0.2.2
+ Version: 0.2.4
  Summary: Cryptoasset data library
  License: Apache-2.0
  Author: Systamental
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.9
  Requires-Dist: DBnomics (>=1.2.3)
  Requires-Dist: ccxt (>=1.91.52)
  Requires-Dist: coinmetrics-api-client (>=2022.6.17); python_version >= "3.7"
+ Requires-Dist: fsspec (>=2024.6.1)
  Requires-Dist: investpy (>=1.0.8)
  Requires-Dist: matplotlib (>=3.5.2)
  Requires-Dist: numpy (>=1.23.2)
@@ -20,8 +21,10 @@ Requires-Dist: openpyxl (>=3.1.2)
  Requires-Dist: pandas (>=1.4.4)
  Requires-Dist: pandas-datareader (>=0.10.0)
  Requires-Dist: prophet (>=1.1); python_version >= "3.7"
+ Requires-Dist: pyarrow (>=17.0.0)
  Requires-Dist: requests (>=2.28.0); python_version >= "3.7"
  Requires-Dist: responses (>=0.21.0)
+ Requires-Dist: s3fs (>=2024.6.1,<2025.0.0)
  Requires-Dist: selenium (>=4.4.3)
  Requires-Dist: statsmodels (>=0.13.2)
  Requires-Dist: webdriver-manager (>=3.8.3)

{cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "cryptodatapy"
- version = "0.2.2"
+ version = "0.2.4"
  description = "Cryptoasset data library"
  authors = ["Systamental"]
  license = "Apache License 2.0"
@@ -25,6 +25,9 @@ responses = ">=0.21.0"
  yfinance = ">=0.2.14"
  openpyxl = ">=3.1.2"
  xlrd = ">=2.0.1"
+ fsspec = ">=2024.6.1"
+ pyarrow = ">=17.0.0"
+ s3fs = "^2024.6.1"

  [tool.poetry.dev-dependencies]
  pytest = ">=7.1.2"
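
The three new runtime dependencies (fsspec, pyarrow, s3fs) are the stack pandas uses for parquet I/O against object storage. As a rough illustration of what they enable (a hedged sketch, not code from this package; the s3:// bucket path is a placeholder):

import pandas as pd

# toy frame; the s3:// path below is hypothetical
df = pd.DataFrame({"close": [42000.0, 42500.0]},
                  index=pd.to_datetime(["2024-01-01", "2024-01-02"]))

# pandas hands the s3:// scheme to fsspec/s3fs and the parquet encoding to pyarrow
df.to_parquet("s3://my-bucket/btc_daily.parquet", engine="pyarrow")
df = pd.read_parquet("s3://my-bucket/btc_daily.parquet", engine="pyarrow")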

{cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/setup.py
@@ -83,13 +83,16 @@ package_data = \
  install_requires = \
  ['DBnomics>=1.2.3',
   'ccxt>=1.91.52',
+  'fsspec>=2024.6.1',
   'investpy>=1.0.8',
   'matplotlib>=3.5.2',
   'numpy>=1.23.2',
   'openpyxl>=3.1.2',
   'pandas-datareader>=0.10.0',
   'pandas>=1.4.4',
+  'pyarrow>=17.0.0',
   'responses>=0.21.0',
+  's3fs>=2024.6.1,<2025.0.0',
   'selenium>=4.4.3',
   'statsmodels>=0.13.2',
   'webdriver-manager>=3.8.3',
@@ -103,7 +106,7 @@ extras_require = \

  setup_kwargs = {
  'name': 'cryptodatapy',
- 'version': '0.2.2',
+ 'version': '0.2.4',
  'description': 'Cryptoasset data library',
  'long_description': "![](cryptodatapy_logo.jpeg)\n\n# CryptoDataPy\n### _Better data beats fancier algorithms_\n<br/>\n\n**CryptoDataPy** is a python library which makes it easy to build high quality data pipelines \nfor the analysis of digital assets. By providing easy access to over 100,000 time series for thousands of assets, \nit facilitates the pre-processing of a wide range of data from different sources.\n\nCryptoassets generate a huge amount of market, on-chain and off-chain data. \nBut unlike legacy financial markets, this data is often fragmented, \nunstructured and dirty. By extracting data from various sources, \npre-processing it into a user-friendly (tidy) format, detecting and repairing 'bad' data,\nand allowing for easy storage and retrieval, CryptoDataPy allows you to spend less time gathering \nand cleaning data, and more time analyzing it.\n\nOur data includes:\n\n- **Market:** market prices of varying granularity (e.g. tick, trade and bar data, aka OHLC),\nfor spot, futures and options markets, as well as funding rates for the analysis of \ncryptoasset returns.\n- **On-chain:** network health and usage data, circulating supply, asset holder positions and \ncost-basis, for the analysis of underlying crypto network fundamentals.\n- **Off-chain:** news, social media, developer activity, web traffic and search for project interest and \nsentiment, as well as traditional financial market and macroeconomic data for broader financial and \neconomic conditions.\n\nThe library's intuitive interface facilitates each step of the ETL/ETL (extract-transform-load) process:\n\n- **Extract**: Extracting data from a wide range of data sources and file formats.\n- **Transform**: \n - Wrangling data into a pandas DataFrame in a structured and user-friendly format, \n a.k.a [tidy data](https://www.jstatsoft.org/article/view/v059i10). \n - Detecting, scrubbing and repairing 'bad' data (e.g. outliers, missing values, 0s, etc.) to improve the accuracy and reliability\nof machine learning/predictive models.\n- **Load**: Storing clean and ready-for-analysis data and metadata for easy access.\n\n## Installation\n\n```bash\n$ pip install cryptodatapy\n```\n\n## Usage\n\n**CryptoDataPy** allows you to pull ready-to-analyze data from a variety of sources \nwith only a few lines of code.\n\nFirst specify which data you want with a `DataRequest`:\n\n```python\n# import DataRequest\nfrom cryptodatapy.extract.datarequest import DataRequest\n# specify parameters for data request: tickers, fields, start date, end_date, etc.\ndata_req = DataRequest(\n source='glassnode', # name of data source\n tickers=['btc', 'eth'], # list of asset tickers, in CryptoDataPy format, defaults to 'btc'\n fields=['close', 'add_act', 'hashrate'], # list of fields, in CryptoDataPy, defaults to 'close'\n freq=None, # data frequency, defaults to daily \n quote_ccy=None, # defaults to USD/USDT\n exch=None, # defaults to exchange weighted average or Binance\n mkt_type= 'spot', # defaults to spot\n start_date=None, # defaults to start date for longest series\n end_date=None, # defaults to most recent \n tz=None, # defaults to UTC time\n cat=None, # optional, should be specified when asset class is not crypto, eg. 
'fx', 'rates', 'macro', etc.\n)\n```\nThen get the data :\n\n```python\n# import GetData\nfrom cryptodatapy.extract.getdata import GetData\n# get data\nGetData(data_req).get_series()\n```\n\nWith the same data request parameters, you can retrieve the same data from a different source:\n\n```python\n# modify data source parameter\ndata_req = DataRequest(\n source='coinmetrics', \n tickers=['btc', 'eth'], \n fields=['close', 'add_act', 'hashrate'], \n req='d',\n start_date='2016-01-01')\n# get data\nGetData(data_req).get_series()\n```\n\nFor more detailed code examples and interactive tutorials \nsee [here](https://github.com/systamental/cryptodatapy/blob/main/docs/example.ipynb).\n## Supported Data Sources\n\n- [CryptoCompare](https://min-api.cryptocompare.com/documentation)\n- [CCXT](https://docs.ccxt.com/en/latest/)\n- [Glassnode](https://docs.glassnode.com/)\n- [Coin Metrics](https://docs.coinmetrics.io/api/v4/)\n- [Tiingo](https://api.tiingo.com/documentation/general/overview)\n- [Yahoo Finance](https://github.com/ranaroussi/yfinance)\n- [Fama-French Data](http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html)\n- [AQR](https://www.aqr.com/Insights/Datasets)\n- [Federal Reserve Economic Data (FRED)](https://fred.stlouisfed.org/docs/api/fred/)\n- [DBnomics](https://db.nomics.world/docs/)\n- [WorldBank](https://documents.worldbank.org/en/publication/documents-reports/api)\n- [Pandas-datareader](https://pandas-datareader.readthedocs.io/en/latest/)\n\n## Contributing\n\nInterested in contributing? Check out the contributing guidelines and \ncontact us at info@systamental.com. Please note that this project is s\nreleased with a Code of Conduct. By contributing to this project, you agree \nto abide by its terms.\n\n## License\n\n`cryptodatapy` was created by Systamental. \nIt is licensed under the terms of the Apache License 2.0 license.\n\n",
  'author': 'Systamental',

cryptodatapy-0.2.4/src/cryptodatapy/conf/fx_tickers.csv (new file)
@@ -0,0 +1,31 @@
+ id,name,tiingo_id
+ eurusd,,
+ gbpusd,,
+ usdjpy,,
+ usdchf,,
+ usdcad,,
+ usdsek,,
+ usdnok,,
+ audusd,,
+ nzdusd,,
+ usdars,,
+ usdmxn,,
+ usdbrl,,
+ usdcop,,
+ usdclp,,
+ usdpen,,
+ usdils,,
+ usdrub,,
+ usdczk,,
+ usdpln,,
+ usdhuf,,
+ usdzar,,
+ usdtry,,
+ usdcny,,
+ usdhkd,,
+ usdsgd,,
+ usdtwd,,
+ usdkrw,,
+ usdphp,,
+ usdinr,,
+ usdidr,,
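
The new fx_tickers.csv is a small lookup table mapping CryptoDataPy FX ids to a name and a Tiingo id, with only the id column populated so far. A minimal sketch of how such a bundled config file could be read (the loading code is not part of this diff; the importlib.resources usage here is an assumption):

from importlib import resources
import pandas as pd

# hypothetical snippet: read the bundled FX ticker list from the installed package
with resources.files("cryptodatapy.conf").joinpath("fx_tickers.csv").open() as f:
    fx_tickers = pd.read_csv(f, index_col="id")

print(fx_tickers.index[:3].tolist())  # ['eurusd', 'gbpusd', 'usdjpy']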

cryptodatapy-0.2.4/src/cryptodatapy/transform/clean.py (new file)
@@ -0,0 +1,398 @@
+ from __future__ import annotations
+ from typing import Optional, Union
+ import pandas as pd
+
+ from cryptodatapy.transform.od import OutlierDetection
+ from cryptodatapy.transform.impute import Impute
+ from cryptodatapy.transform.filter import Filter
+
+
+ def stitch_dataframes(dfs):
+     """
+     Stitches together dataframes with different start dates.
+
+     Parameters
+     ----------
+     dfs: list
+         List of dataframes to be stitched together.
+
+     Returns
+     -------
+     combined_df: pd.DataFrame
+         Combined dataframe with extended start date.
+     """
+     # check if dfs is a list
+     if not isinstance(dfs, list):
+         raise TypeError("Dataframes must be a list.")
+
+     # check index types and sort dfs from most recent to earliest start date
+     if all([isinstance(df.index, pd.MultiIndex) for df in dfs]):
+         dfs.sort(key=lambda df: df.index.levels[0][0], reverse=True)
+     elif all([isinstance(df.index, pd.DatetimeIndex) for df in dfs]):
+         dfs.sort(key=lambda df: df.index[0], reverse=True)
+     else:
+         raise TypeError("Dataframes must have a MultiIndex or DatetimeIndex.")
+
+     # start with the df with the most recent start date
+     combined_df = dfs[0]
+
+     # combine dfs, filling gaps with values from earlier dfs
+     for df in dfs[1:]:
+         combined_df = combined_df.combine_first(df)
+
+     # reorder cols to match the df with the most columns
+     max_columns = max(len(df.columns) for df in dfs)
+     cols = next(df.columns.tolist() for df in dfs if len(df.columns) == max_columns)
+     combined_df = combined_df[cols]
+
+     return combined_df
+
+
+ class CleanData:
+     """
+     Cleans data to improve data quality.
+     """
+     def __init__(self, df: pd.DataFrame):
+         """
+         Constructor
+
+         Parameters
+         ----------
+         df: pd.DataFrame
+             DataFrame with MultiIndex (DatetimeIndex at level 0, ticker at level 1) and fields as columns.
+         """
+         self.raw_df = df.copy()  # keep copy of raw dataframe
+         self.df = df
+         self.excluded_cols = None
+         self.outliers = None
+         self.yhat = None
+         self.filtered_df = None
+         self.filtered_tickers = None
+         self.repaired_df = None
+         self.summary = pd.DataFrame()
+         self.initialize_summary()
+         self.check_types()
+
+     def initialize_summary(self) -> None:
+         """
+         Initializes summary dataframe with data quality metrics.
+         """
+         # add number of obs and % of missing values at the start
+         self.summary.loc["n_obs", self.df.unstack().columns] = self.df.unstack().notna().sum().values
+         self.summary.loc["%_NaN_start", self.df.unstack().columns] = \
+             (self.df.unstack().isnull().sum() / self.df.unstack().shape[0]).values * 100
+
+     def check_types(self) -> None:
+         """
+         Checks that the data is a pandas DataFrame.
+
+         Raises
+         ------
+         TypeError
+             If the data is not a pandas DataFrame.
+         """
+         if not isinstance(self.df, pd.DataFrame):
+             raise TypeError("Data must be a pandas DataFrame.")
+
+     def filter_outliers(
+         self,
+         od_method: str = "mad",
+         excl_cols: Optional[Union[str, list]] = None,
+         **kwargs
+     ) -> CleanData:
+         """
+         Filters outliers.
+
+         Parameters
+         ----------
+         od_method: str, {'atr', 'iqr', 'mad', 'z_score', 'ewma', 'stl', 'seasonal_decomp', 'prophet'}, default 'mad'
+             Outlier detection method to use for filtering.
+         excl_cols: str or list
+             Name of columns to exclude from outlier filtering.
+
+         Returns
+         -------
+         CleanData
+             CleanData object
+         """
+         # outlier detection
+         od = OutlierDetection(self.df, excl_cols=excl_cols, **kwargs)
+         self.excluded_cols = excl_cols
+
+         # filter outliers
+         getattr(od, od_method)()
+         self.filtered_df = od.filtered_df
+         self.outliers = od.outliers
+         self.yhat = od.yhat
+
+         # add % of outliers to summary
+         self.summary.loc["%_outliers", self.outliers.unstack().columns] = (
+             self.outliers.unstack().notna().sum() / self.df.unstack().shape[0]
+         ).values * 100
+
+         # filtered df
+         self.df = self.filtered_df.sort_index()
+
+         return self
+
+     def repair_outliers(self, imp_method: str = "interpolate", **kwargs) -> CleanData:
+         """
+         Repairs outliers using an imputation method.
+
+         Parameters
+         ----------
+         imp_method: str, {'fwd_fill', 'interpolate', 'fcst'}, default 'interpolate'
+             Imputation method used to replace filtered outliers.
+
+         Returns
+         -------
+         CleanData
+             CleanData object
+         """
+         # impute missing vals
+         if imp_method == "fcst":
+             self.repaired_df = getattr(Impute(self.df), imp_method)(self.yhat, **kwargs)
+         else:
+             self.repaired_df = getattr(Impute(self.df), imp_method)(**kwargs)
+
+         # add % of repaired values to summary
+         rep_vals = self.repaired_df.unstack().notna().sum() - self.df.unstack().notna().sum()
+         self.summary.loc["%_imputed", self.df.unstack().columns] = rep_vals / self.df.unstack().shape[0] * 100
+
+         # repaired df, re-adding any excluded columns
+         if self.excluded_cols is not None:
+             self.df = pd.concat([self.repaired_df, self.raw_df[self.excluded_cols]], join="inner", axis=1)
+         else:
+             self.df = self.repaired_df
+
+         # reorder cols
+         self.df = self.df[self.raw_df.columns].sort_index()
+
+         return self
+
+     def filter_avg_trading_val(self, thresh_val: int = 10000000, window_size: int = 30) -> CleanData:
+         """
+         Filters values below a threshold of average trading value (price * volume/size in quote currency) over a
+         lookback window, replacing them with NaNs.
+
+         Parameters
+         ----------
+         thresh_val: int, default 10,000,000
+             Threshold/cut-off for avg trading value.
+         window_size: int, default 30
+             Size of rolling window.
+
+         Returns
+         -------
+         CleanData
+             CleanData object
+         """
+         # filter on avg trading value
+         self.filtered_df = Filter(self.df).avg_trading_val(thresh_val=thresh_val, window_size=window_size)
+
+         # add % of filtered values to summary
+         filtered_vals = self.df.unstack().notna().sum() - self.filtered_df.unstack().notna().sum()
+         self.summary.loc["%_below_avg_trading_val", self.df.unstack().columns] = (
+             filtered_vals / self.df.unstack().shape[0]
+         ).values * 100
+
+         # filtered df
+         self.df = self.filtered_df.sort_index()
+
+         return self
+
+     def filter_missing_vals_gaps(self, gap_window: int = 30) -> CleanData:
+         """
+         Filters values before a large gap of missing values, replacing them with NaNs.
+
+         Parameters
+         ----------
+         gap_window: int, default 30
+             Size of window where all values are missing (NaNs).
+
+         Returns
+         -------
+         CleanData
+             CleanData object
+         """
+         # filter values before missing value gaps
+         self.filtered_df = Filter(self.df).missing_vals_gaps(gap_window=gap_window)
+
+         # add % of filtered values to summary
+         missing_vals_gap = (
+             self.df.unstack().notna().sum() - self.filtered_df.unstack().notna().sum()
+         )
+         self.summary.loc["%_missing_vals_gaps", self.df.unstack().columns] = (
+             missing_vals_gap / self.df.unstack().shape[0]
+         ).values * 100
+
+         # filtered df
+         self.df = self.filtered_df.sort_index()
+
+         return self
+
+     def filter_min_nobs(self, ts_obs: int = 100, cs_obs: int = 2) -> CleanData:
+         """
+         Removes tickers from the dataframe if they have fewer than a minimum number of observations.
+
+         Parameters
+         ----------
+         ts_obs: int, default 100
+             Minimum number of observations for a field/column over the time series.
+         cs_obs: int, default 2
+             Minimum number of observations for tickers over the cross-section.
+
+         Returns
+         -------
+         CleanData
+             CleanData object
+         """
+         # filter tickers below min obs
+         self.filtered_df = Filter(self.df).min_nobs(ts_obs=ts_obs, cs_obs=cs_obs)
+
+         # removed tickers
+         self.filtered_tickers = list(
+             set(self.filtered_df.index.droplevel(0).unique()).symmetric_difference(
+                 set(self.df.index.droplevel(0).unique())
+             )
+         )
+
+         # add number of removed tickers to summary
+         self.summary.loc["n_tickers_below_min_obs", self.df.unstack().columns] = len(self.filtered_tickers)
+
+         # filtered df
+         self.df = self.filtered_df.sort_index()
+
+         return self
+
+     def filter_delisted_tickers(self, field: str = 'close', n_unch_vals: int = 30) -> CleanData:
+         """
+         Removes delisted tickers from the dataframe.
+
+         Parameters
+         ----------
+         field: str, default 'close'
+             Field/column to use for detecting delisted tickers.
+         n_unch_vals: int, default 30
+             Number of consecutive unchanged values to consider a ticker as delisted.
+
+         Returns
+         -------
+         CleanData
+             CleanData object
+         """
+         # filter delisted tickers
+         self.filtered_df = Filter(self.df).remove_delisted(field=field, n_unch_vals=n_unch_vals)
+
+         # removed tickers
+         self.filtered_tickers = list(
+             set(self.filtered_df.index.droplevel(0).unique()).symmetric_difference(
+                 set(self.df.index.droplevel(0).unique())
+             )
+         )
+
+         # add number of removed tickers to summary
+         self.summary.loc["n_filtered_tickers", self.df.unstack().columns] = len(self.filtered_tickers)
+
+         # filtered df
+         self.df = self.filtered_df.sort_index()
+
+         return self
+
+     def filter_tickers(self, tickers_list) -> CleanData:
+         """
+         Removes specified tickers from the dataframe.
+
+         Parameters
+         ----------
+         tickers_list: str or list
+             List of tickers to be removed. Can be used to exclude tickers from the data analysis,
+             e.g. stablecoins or indexes.
+
+         Returns
+         -------
+         CleanData
+             CleanData object
+         """
+         # filter tickers
+         self.filtered_df = Filter(self.df).tickers(tickers_list)
+
+         # removed tickers
+         self.filtered_tickers = list(
+             set(self.filtered_df.index.droplevel(0).unique()).symmetric_difference(
+                 set(self.df.index.droplevel(0).unique())
+             )
+         )
+
+         # add number of removed tickers to summary
+         self.summary.loc["n_filtered_tickers", self.df.unstack().columns] = len(self.filtered_tickers)
+
+         # filtered df
+         self.df = self.filtered_df.sort_index()
+
+         return self
+
+     def show_plot(self, plot_series: tuple = ("BTC", "close"), compare_series: bool = True) -> None:
+         """
+         Plots the clean time series and compares it to the raw series.
+
+         Parameters
+         ----------
+         plot_series: tuple, optional, default ('BTC', 'close')
+             Plots the time series of a specific (ticker, field) tuple.
+         compare_series: bool, default True
+             Compares the clean time series with the raw series.
+         """
+         # plot clean series
+         ax = (
+             self.df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
+             .droplevel(1)
+             .plot(
+                 linewidth=1,
+                 figsize=(15, 7),
+                 color="#1f77b4",
+                 zorder=0,
+                 title="Filtered vs. Raw Data",
+             )
+         )
+         # overlay raw series for comparison
+         if compare_series:
+             ax = (
+                 self.raw_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
+                 .droplevel(1)
+                 .plot(
+                     linewidth=1,
+                     figsize=(15, 7),
+                     linestyle=":",
+                     color="#FF8785",
+                     zorder=0,
+                 )
+             )
+         ax.grid(color="black", linewidth=0.05)
+         ax.xaxis.grid(False)
+         ax.set_ylabel(plot_series[0])
+         ax.ticklabel_format(style="plain", axis="y")
+         ax.set_facecolor("whitesmoke")
+         ax.legend(
+             [plot_series[1] + "_filtered", plot_series[1] + "_raw"], loc="upper left"
+         )
+
+     def get(self, attr="df") -> pd.DataFrame:
+         """
+         Returns a CleanData object attribute.
+
+         Parameters
+         ----------
+         attr: str, {'df', 'outliers', 'yhat', 'filtered_tickers', 'summary'}, default 'df'
+             CleanData object attribute to return.
+
+         Returns
+         -------
+         pd.DataFrame
+             Requested attribute, e.g. the cleaned dataframe or the summary of data quality metrics.
+         """
+         # add % of missing values at the end to summary
+         self.summary.loc["%_NaN_end", self.df.unstack().columns] = (
+             self.df.unstack().isnull().sum() / self.df.unstack().shape[0]
+         ).values * 100
+         self.summary = self.summary.astype(float).round(2)
+
+         return getattr(self, attr)
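
Taken together, the new clean.py exposes a chainable cleaning pipeline built on OutlierDetection, Impute and Filter. A minimal usage sketch (method names and defaults come from the module above; the input df, assumed to be a MultiIndex (date, ticker) OHLCV DataFrame such as one returned by GetData, and the chosen steps are illustrative):

from cryptodatapy.transform.clean import CleanData, stitch_dataframes

# df: DataFrame with MultiIndex (DatetimeIndex, ticker) and OHLCV fields as columns
clean_df = (
    CleanData(df)
    .filter_outliers(od_method="mad")              # replace detected outliers with NaNs
    .repair_outliers(imp_method="interpolate")     # impute the filtered values
    .filter_avg_trading_val(thresh_val=10_000_000, window_size=30)
    .filter_missing_vals_gaps(gap_window=30)
    .filter_min_nobs(ts_obs=100, cs_obs=2)
    .filter_delisted_tickers(field="close", n_unch_vals=30)
    .get("df")                                     # or "summary", "outliers", "filtered_tickers"
)

# stitch_dataframes extends history by combining series with different start dates,
# e.g. combined_df = stitch_dataframes([recent_df, older_df])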