cryptodatapy 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/PKG-INFO +4 -1
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/pyproject.toml +4 -1
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/setup.py +4 -1
- cryptodatapy-0.2.4/src/cryptodatapy/conf/fx_tickers.csv +31 -0
- cryptodatapy-0.2.4/src/cryptodatapy/transform/clean.py +398 -0
- cryptodatapy-0.2.4/src/cryptodatapy/transform/clean_perp_futures_ohlcv.ipynb +1025 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/transform/filter.py +83 -142
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/transform/impute.py +36 -83
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/transform/od.py +221 -450
- cryptodatapy-0.2.2/src/cryptodatapy/transform/clean.py +0 -399
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/LICENSE +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/README.md +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.DS_Store +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/.gitignore +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/cryptodatapy.iml +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/csv-plugin.xml +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/inspectionProfiles/Project_Default.xml +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/misc.xml +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/modules.xml +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/.idea/vcs.xml +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/__init__.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/conf/__init__.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/conf/fields.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/conf/tickers.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/__init__.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/br_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/ca_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/cn_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/de_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/ez_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/fr_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/gb_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/get_econ_calendars.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/id_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/in_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/it_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/jp_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/kr_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/mx_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/ru_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/tr_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/datasets/us_econ_calendar.csv +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/__init__.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/CCXT-checkpoint.ipynb +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/DBNomics-checkpoint.ipynb +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/InvestPy-checkpoint.ipynb +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/NasdaqDataLink-checkpoint.ipynb +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/.ipynb_checkpoints/PandasDataReader-checkpoint.ipynb +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/__init__.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/coinmetrics_api.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/cryptocompare_api.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/datavendor.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/glassnode_api.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/data_vendors/tiingo_api.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/datarequest.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/getdata.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/libraries/__init__.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/libraries/ccxt_api.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/libraries/dbnomics_api.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/libraries/investpy_api.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/libraries/library.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/libraries/pandasdr_api.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/web/__init__.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/web/aqr.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/extract/web/web.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/transform/__init__.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/transform/convertparams.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/transform/wrangle.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/util/__init__.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/util/datacatalog.py +0 -0
- {cryptodatapy-0.2.2 → cryptodatapy-0.2.4}/src/cryptodatapy/util/datacredentials.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cryptodatapy
|
3
|
-
Version: 0.2.2
|
3
|
+
Version: 0.2.4
|
4
4
|
Summary: Cryptoasset data library
|
5
5
|
License: Apache-2.0
|
6
6
|
Author: Systamental
|
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
13
13
|
Requires-Dist: DBnomics (>=1.2.3)
|
14
14
|
Requires-Dist: ccxt (>=1.91.52)
|
15
15
|
Requires-Dist: coinmetrics-api-client (>=2022.6.17); python_version >= "3.7"
|
16
|
+
Requires-Dist: fsspec (>=2024.6.1)
|
16
17
|
Requires-Dist: investpy (>=1.0.8)
|
17
18
|
Requires-Dist: matplotlib (>=3.5.2)
|
18
19
|
Requires-Dist: numpy (>=1.23.2)
|
@@ -20,8 +21,10 @@ Requires-Dist: openpyxl (>=3.1.2)
|
|
20
21
|
Requires-Dist: pandas (>=1.4.4)
|
21
22
|
Requires-Dist: pandas-datareader (>=0.10.0)
|
22
23
|
Requires-Dist: prophet (>=1.1); python_version >= "3.7"
|
24
|
+
Requires-Dist: pyarrow (>=17.0.0)
|
23
25
|
Requires-Dist: requests (>=2.28.0); python_version >= "3.7"
|
24
26
|
Requires-Dist: responses (>=0.21.0)
|
27
|
+
Requires-Dist: s3fs (>=2024.6.1,<2025.0.0)
|
25
28
|
Requires-Dist: selenium (>=4.4.3)
|
26
29
|
Requires-Dist: statsmodels (>=0.13.2)
|
27
30
|
Requires-Dist: webdriver-manager (>=3.8.3)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "cryptodatapy"
|
3
|
-
version = "0.2.2"
|
3
|
+
version = "0.2.4"
|
4
4
|
description = "Cryptoasset data library"
|
5
5
|
authors = ["Systamental"]
|
6
6
|
license = "Apache License 2.0"
|
@@ -25,6 +25,9 @@ responses = ">=0.21.0"
|
|
25
25
|
yfinance = ">=0.2.14"
|
26
26
|
openpyxl = ">=3.1.2"
|
27
27
|
xlrd = ">=2.0.1"
|
28
|
+
fsspec = ">=2024.6.1"
|
29
|
+
pyarrow = ">=17.0.0"
|
30
|
+
s3fs = "^2024.6.1"
|
28
31
|
|
29
32
|
[tool.poetry.dev-dependencies]
|
30
33
|
pytest = ">=7.1.2"
|
@@ -83,13 +83,16 @@ package_data = \
|
|
83
83
|
install_requires = \
|
84
84
|
['DBnomics>=1.2.3',
|
85
85
|
'ccxt>=1.91.52',
|
86
|
+
'fsspec>=2024.6.1',
|
86
87
|
'investpy>=1.0.8',
|
87
88
|
'matplotlib>=3.5.2',
|
88
89
|
'numpy>=1.23.2',
|
89
90
|
'openpyxl>=3.1.2',
|
90
91
|
'pandas-datareader>=0.10.0',
|
91
92
|
'pandas>=1.4.4',
|
93
|
+
'pyarrow>=17.0.0',
|
92
94
|
'responses>=0.21.0',
|
95
|
+
's3fs>=2024.6.1,<2025.0.0',
|
93
96
|
'selenium>=4.4.3',
|
94
97
|
'statsmodels>=0.13.2',
|
95
98
|
'webdriver-manager>=3.8.3',
|
@@ -103,7 +106,7 @@ extras_require = \
|
|
103
106
|
|
104
107
|
setup_kwargs = {
|
105
108
|
'name': 'cryptodatapy',
|
106
|
-
'version': '0.2.2',
|
109
|
+
'version': '0.2.4',
|
107
110
|
'description': 'Cryptoasset data library',
|
108
111
|
'long_description': "\n\n# CryptoDataPy\n### _Better data beats fancier algorithms_\n<br/>\n\n**CryptoDataPy** is a python library which makes it easy to build high quality data pipelines \nfor the analysis of digital assets. By providing easy access to over 100,000 time series for thousands of assets, \nit facilitates the pre-processing of a wide range of data from different sources.\n\nCryptoassets generate a huge amount of market, on-chain and off-chain data. \nBut unlike legacy financial markets, this data is often fragmented, \nunstructured and dirty. By extracting data from various sources, \npre-processing it into a user-friendly (tidy) format, detecting and repairing 'bad' data,\nand allowing for easy storage and retrieval, CryptoDataPy allows you to spend less time gathering \nand cleaning data, and more time analyzing it.\n\nOur data includes:\n\n- **Market:** market prices of varying granularity (e.g. tick, trade and bar data, aka OHLC),\nfor spot, futures and options markets, as well as funding rates for the analysis of \ncryptoasset returns.\n- **On-chain:** network health and usage data, circulating supply, asset holder positions and \ncost-basis, for the analysis of underlying crypto network fundamentals.\n- **Off-chain:** news, social media, developer activity, web traffic and search for project interest and \nsentiment, as well as traditional financial market and macroeconomic data for broader financial and \neconomic conditions.\n\nThe library's intuitive interface facilitates each step of the ETL/ETL (extract-transform-load) process:\n\n- **Extract**: Extracting data from a wide range of data sources and file formats.\n- **Transform**: \n - Wrangling data into a pandas DataFrame in a structured and user-friendly format, \n a.k.a [tidy data](https://www.jstatsoft.org/article/view/v059i10). \n - Detecting, scrubbing and repairing 'bad' data (e.g. outliers, missing values, 0s, etc.) 
to improve the accuracy and reliability\nof machine learning/predictive models.\n- **Load**: Storing clean and ready-for-analysis data and metadata for easy access.\n\n## Installation\n\n```bash\n$ pip install cryptodatapy\n```\n\n## Usage\n\n**CryptoDataPy** allows you to pull ready-to-analyze data from a variety of sources \nwith only a few lines of code.\n\nFirst specify which data you want with a `DataRequest`:\n\n```python\n# import DataRequest\nfrom cryptodatapy.extract.datarequest import DataRequest\n# specify parameters for data request: tickers, fields, start date, end_date, etc.\ndata_req = DataRequest(\n source='glassnode', # name of data source\n tickers=['btc', 'eth'], # list of asset tickers, in CryptoDataPy format, defaults to 'btc'\n fields=['close', 'add_act', 'hashrate'], # list of fields, in CryptoDataPy, defaults to 'close'\n freq=None, # data frequency, defaults to daily \n quote_ccy=None, # defaults to USD/USDT\n exch=None, # defaults to exchange weighted average or Binance\n mkt_type= 'spot', # defaults to spot\n start_date=None, # defaults to start date for longest series\n end_date=None, # defaults to most recent \n tz=None, # defaults to UTC time\n cat=None, # optional, should be specified when asset class is not crypto, eg. 
'fx', 'rates', 'macro', etc.\n)\n```\nThen get the data :\n\n```python\n# import GetData\nfrom cryptodatapy.extract.getdata import GetData\n# get data\nGetData(data_req).get_series()\n```\n\nWith the same data request parameters, you can retrieve the same data from a different source:\n\n```python\n# modify data source parameter\ndata_req = DataRequest(\n source='coinmetrics', \n tickers=['btc', 'eth'], \n fields=['close', 'add_act', 'hashrate'], \n req='d',\n start_date='2016-01-01')\n# get data\nGetData(data_req).get_series()\n```\n\nFor more detailed code examples and interactive tutorials \nsee [here](https://github.com/systamental/cryptodatapy/blob/main/docs/example.ipynb).\n## Supported Data Sources\n\n- [CryptoCompare](https://min-api.cryptocompare.com/documentation)\n- [CCXT](https://docs.ccxt.com/en/latest/)\n- [Glassnode](https://docs.glassnode.com/)\n- [Coin Metrics](https://docs.coinmetrics.io/api/v4/)\n- [Tiingo](https://api.tiingo.com/documentation/general/overview)\n- [Yahoo Finance](https://github.com/ranaroussi/yfinance)\n- [Fama-French Data](http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html)\n- [AQR](https://www.aqr.com/Insights/Datasets)\n- [Federal Reserve Economic Data (FRED)](https://fred.stlouisfed.org/docs/api/fred/)\n- [DBnomics](https://db.nomics.world/docs/)\n- [WorldBank](https://documents.worldbank.org/en/publication/documents-reports/api)\n- [Pandas-datareader](https://pandas-datareader.readthedocs.io/en/latest/)\n\n## Contributing\n\nInterested in contributing? Check out the contributing guidelines and \ncontact us at info@systamental.com. Please note that this project is s\nreleased with a Code of Conduct. By contributing to this project, you agree \nto abide by its terms.\n\n## License\n\n`cryptodatapy` was created by Systamental. \nIt is licensed under the terms of the Apache License 2.0 license.\n\n",
|
109
112
|
'author': 'Systamental',
|
@@ -0,0 +1,31 @@
|
|
1
|
+
id,name,tiingo_id
|
2
|
+
eurusd,,
|
3
|
+
gbpusd,,
|
4
|
+
usdjpy,,
|
5
|
+
usdchf,,
|
6
|
+
usdcad,,
|
7
|
+
usdsek,,
|
8
|
+
usdnok,,
|
9
|
+
audusd,,
|
10
|
+
nzdusd,,
|
11
|
+
usdars,,
|
12
|
+
usdmxn,,
|
13
|
+
usdbrl,,
|
14
|
+
usdcop,,
|
15
|
+
usdclp,,
|
16
|
+
usdpen,,
|
17
|
+
usdils,,
|
18
|
+
usdrub,,
|
19
|
+
usdczk,,
|
20
|
+
usdpln,,
|
21
|
+
usdhuf,,
|
22
|
+
usdzar,,
|
23
|
+
usdtry,,
|
24
|
+
usdcny,,
|
25
|
+
usdhkd,,
|
26
|
+
usdsgd,,
|
27
|
+
usdtwd,,
|
28
|
+
usdkrw,,
|
29
|
+
usdphp,,
|
30
|
+
usdinr,,
|
31
|
+
usdidr,,
|
@@ -0,0 +1,398 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import Optional, Union
|
3
|
+
import pandas as pd
|
4
|
+
|
5
|
+
from cryptodatapy.transform.od import OutlierDetection
|
6
|
+
from cryptodatapy.transform.impute import Impute
|
7
|
+
from cryptodatapy.transform.filter import Filter
|
8
|
+
|
9
|
+
|
10
|
+
def stitch_dataframes(dfs):
    """
    Stitches together dataframes with different start dates.

    The dataframes are ordered from the most recent start to the oldest start and merged with
    `combine_first`, so where dataframes overlap, values from the dataframe with the more recent
    start take precedence and the older dataframes only fill in what is missing.

    Parameters
    ----------
    dfs: list
        List of dataframes to be stitched together. Either every dataframe has a pd.MultiIndex
        or every dataframe has a pd.DatetimeIndex. The list itself is left unmodified.

    Returns
    -------
    combined_df: pd.DataFrame
        Combined dataframe with extended start date. Its columns are those of the first dataframe
        (in the most-recent-start-first ordering) that has the largest number of columns.

    Raises
    ------
    TypeError
        If dfs is not a list, or if the dataframes do not all share one of the supported index types.
    ValueError
        If dfs is an empty list.
    """
    # check if dfs is a list
    if not isinstance(dfs, list):
        raise TypeError("Dataframes must be a list.")

    # an empty list has nothing to stitch; fail with a clear message instead of an IndexError below
    if not dfs:
        raise ValueError("Dataframes list must not be empty.")

    # check index types and pick the matching sort key
    if all(isinstance(df.index, pd.MultiIndex) for df in dfs):
        # first entry of the level 0 values stored on the MultiIndex
        def start_key(df):
            return df.index.levels[0][0]
    elif all(isinstance(df.index, pd.DatetimeIndex) for df in dfs):
        def start_key(df):
            return df.index[0]
    else:
        raise TypeError("Dataframes must be pd.MultiIndex or have DatetimeIndex.")

    # sort a copy (most recent start first) so the caller's list keeps its order
    ordered = sorted(dfs, key=start_key, reverse=True)

    # most recent start date
    combined_df = ordered[0]

    # combine dfs, older dataframes only fill gaps left by the more recent ones
    for df in ordered[1:]:
        combined_df = combined_df.combine_first(df)

    # reorder cols to match the widest dataframe
    max_columns = max(len(df.columns) for df in ordered)
    cols = next(df.columns.tolist() for df in ordered if len(df.columns) == max_columns)
    combined_df = combined_df[cols]

    return combined_df
|
49
|
+
|
50
|
+
|
51
|
+
class CleanData:
    """
    Cleans data to improve data quality.

    Every cleaning method replaces `df` on the object with its result, records a data quality metric
    in `summary` and returns the object itself so that calls can be chained. Use `get` to retrieve
    the cleaned dataframe or any other attribute.
    """
    def __init__(self, df: pd.DataFrame):
        """
        Constructor

        Parameters
        ----------
        df: pd.DataFrame
            DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and field (cols) values.

        Raises
        ------
        TypeError
            If df is not a pandas DataFrame.
        """
        self.df = df
        # validate before anything touches the data, otherwise invalid input fails with an
        # unrelated AttributeError and the TypeError below is never reached
        self.check_types()
        self.raw_df = df.copy()  # keep copy of raw dataframe
        self.excluded_cols = None
        self.outliers = None
        self.yhat = None
        self.filtered_df = None
        self.filtered_tickers = None
        self.repaired_df = None
        self.summary = pd.DataFrame()
        self.initialize_summary()

    def initialize_summary(self) -> None:
        """
        Initializes summary dataframe with data quality metrics.

        Adds the number of non-missing observations ('n_obs') and the percentage of missing values
        ('%_NaN_start') for every (field, ticker) column of the unstacked dataframe.
        """
        wide_df = self.df.unstack()

        # add obs and missing vals
        self.summary.loc["n_obs", wide_df.columns] = wide_df.notna().sum().values
        self.summary.loc["%_NaN_start", wide_df.columns] = \
            (wide_df.isnull().sum() / wide_df.shape[0]).values * 100

    def check_types(self) -> None:
        """
        Checks that the data is a pandas DataFrame.

        Raises
        ------
        TypeError
            If the data is not a pandas DataFrame.
        """
        if not isinstance(self.df, pd.DataFrame):
            raise TypeError("Data must be a pandas DataFrame.")

    def _removed_tickers(self) -> list:
        """
        Returns tickers (index level 1) found in only one of the current and the filtered dataframe.
        """
        remaining = set(self.filtered_df.index.droplevel(0).unique())
        current = set(self.df.index.droplevel(0).unique())

        return list(remaining.symmetric_difference(current))

    def filter_outliers(
            self,
            od_method: str = "mad",
            excl_cols: Optional[Union[str, list]] = None,
            **kwargs
    ) -> "CleanData":
        """
        Filters outliers.

        Parameters
        ----------
        od_method: str, {'atr', 'iqr', 'mad', 'z_score', 'ewma', 'stl', 'seasonal_decomp', 'prophet'}, default 'mad'
            Outlier detection method to use for filtering. Looked up by name on the OutlierDetection
            object and called without arguments.
        excl_cols: str or list
            Name of columns to exclude from outlier filtering.
        **kwargs
            Additional keyword arguments passed to the OutlierDetection constructor.

        Returns
        -------
        CleanData
            CleanData object
        """
        # outlier detection
        od = OutlierDetection(self.df, excl_cols=excl_cols, **kwargs)
        self.excluded_cols = excl_cols  # remembered so repair_outliers can add these cols back

        # filter outliers
        getattr(od, od_method)()
        self.filtered_df = od.filtered_df
        self.outliers = od.outliers
        self.yhat = od.yhat

        # add to summary
        wide_outliers = self.outliers.unstack()
        self.summary.loc["%_outliers", wide_outliers.columns] = (
            wide_outliers.notna().sum() / self.df.unstack().shape[0]
        ).values * 100

        # filtered df
        self.df = self.filtered_df.sort_index()

        return self

    def repair_outliers(self, imp_method: str = "interpolate", **kwargs) -> "CleanData":
        """
        Repairs outliers using an imputation method.

        Parameters
        ----------
        imp_method: str, {'fwd_fill', 'interpolate', 'fcst'}, default 'interpolate'
            Imputation method used to replace filtered outliers. With 'fcst', the forecasts (yhat)
            stored by filter_outliers are passed to the imputation method.
        **kwargs
            Additional keyword arguments passed to the imputation method.

        Returns
        -------
        CleanData
            CleanData object
        """
        # impute missing vals
        if imp_method == "fcst":
            self.repaired_df = getattr(Impute(self.df), imp_method)(self.yhat, **kwargs)
        else:
            self.repaired_df = getattr(Impute(self.df), imp_method)(**kwargs)

        # add repaired % to summary
        wide_df = self.df.unstack()
        rep_vals = self.repaired_df.unstack().notna().sum() - wide_df.notna().sum()
        self.summary.loc["%_imputed", wide_df.columns] = rep_vals / wide_df.shape[0] * 100

        # repaired df, with the cols excluded from outlier filtering taken from the raw data
        if self.excluded_cols is not None:
            self.df = pd.concat([self.repaired_df, self.raw_df[self.excluded_cols]], join="inner", axis=1)
        else:
            self.df = self.repaired_df

        # reorder cols
        self.df = self.df[self.raw_df.columns].sort_index()

        return self

    def filter_avg_trading_val(self, thresh_val: int = 10000000, window_size: int = 30) -> "CleanData":
        """
        Filters values below a threshold of average trading value (price * volume/size in quote currency) over some
        lookback window, replacing them with NaNs.

        Parameters
        ----------
        thresh_val: int, default 10,000,000
            Threshold/cut-off for avg trading value.
        window_size: int, default 30
            Size of rolling window.

        Returns
        -------
        CleanData
            CleanData object
        """
        # filter values below avg trading value threshold
        self.filtered_df = Filter(self.df).avg_trading_val(thresh_val=thresh_val, window_size=window_size)

        # add to summary
        wide_df = self.df.unstack()
        filtered_vals = wide_df.notna().sum() - self.filtered_df.unstack().notna().sum()
        self.summary.loc["%_below_avg_trading_val", wide_df.columns] = (
            filtered_vals / wide_df.shape[0]
        ).values * 100

        # filtered df
        self.df = self.filtered_df.sort_index()

        return self

    def filter_missing_vals_gaps(self, gap_window: int = 30) -> "CleanData":
        """
        Filters values before a large gap of missing values, replacing them with NaNs.

        Parameters
        ----------
        gap_window: int, default 30
            Size of window where all values are missing (NaNs).

        Returns
        -------
        CleanData
            CleanData object
        """
        # filter values before missing values gaps
        self.filtered_df = Filter(self.df).missing_vals_gaps(gap_window=gap_window)

        # add to summary
        wide_df = self.df.unstack()
        missing_vals_gap = wide_df.notna().sum() - self.filtered_df.unstack().notna().sum()
        self.summary.loc["%_missing_vals_gaps", wide_df.columns] = (
            missing_vals_gap / wide_df.shape[0]
        ).values * 100

        # filtered df
        self.df = self.filtered_df.sort_index()

        return self

    def filter_min_nobs(self, ts_obs: int = 100, cs_obs: int = 2) -> "CleanData":
        """
        Removes tickers from dataframe if the ticker has less than a minimum number of observations.

        Parameters
        ----------
        ts_obs: int, default 100
            Minimum number of observations for field/column over time series.
        cs_obs: int, default 2
            Minimum number of observations for tickers over the cross-section.

        Returns
        -------
        CleanData
            CleanData object
        """
        # filter tickers below min obs
        self.filtered_df = Filter(self.df).min_nobs(ts_obs=ts_obs, cs_obs=cs_obs)

        # tickers < min obs
        self.filtered_tickers = self._removed_tickers()

        # add to summary
        self.summary.loc["n_tickers_below_min_obs", self.df.unstack().columns] = len(self.filtered_tickers)

        # filtered df
        self.df = self.filtered_df.sort_index()

        return self

    def filter_delisted_tickers(self, field: str = 'close', n_unch_vals: int = 30) -> "CleanData":
        """
        Removes delisted tickers from dataframe.

        Parameters
        ----------
        field: str, default 'close'
            Field/column to use for detecting delisted tickers.
        n_unch_vals: int, default 30
            Number of consecutive unchanged values to consider a ticker as delisted.

        Returns
        -------
        CleanData
            CleanData object
        """
        # filter tickers
        self.filtered_df = Filter(self.df).remove_delisted(field=field, n_unch_vals=n_unch_vals)

        # delisted tickers
        self.filtered_tickers = self._removed_tickers()

        # add to summary
        self.summary.loc["n_filtered_tickers", self.df.unstack().columns] = len(self.filtered_tickers)

        # filtered df
        self.df = self.filtered_df.sort_index()

        return self

    def filter_tickers(self, tickers_list) -> "CleanData":
        """
        Removes specified tickers from dataframe.

        Parameters
        ----------
        tickers_list: str or list
            List of tickers to be removed. Can be used to remove tickers to be excluded from data analysis,
            e.g. stablecoins or indexes.

        Returns
        -------
        CleanData
            CleanData object
        """
        # filter tickers
        self.filtered_df = Filter(self.df).tickers(tickers_list)

        # removed tickers
        self.filtered_tickers = self._removed_tickers()

        # add to summary
        self.summary.loc["n_filtered_tickers", self.df.unstack().columns] = len(self.filtered_tickers)

        # filtered df
        self.df = self.filtered_df.sort_index()

        return self

    def show_plot(self, plot_series: tuple = ("BTC", "close"), compare_series: bool = True) -> None:
        """
        Plots clean time series and compares it to the raw series.

        Parameters
        ----------
        plot_series: tuple, optional, default('BTC', 'close')
            Plots the time series of a specific (ticker, field) tuple.
        compare_series: bool, default True
            Compares clean time series with raw series
        """
        ticker, field = plot_series[0], plot_series[1]

        ax = (
            self.df.loc[pd.IndexSlice[:, ticker], field]
            .droplevel(1)
            .plot(
                linewidth=1,
                figsize=(15, 7),
                color="#1f77b4",
                zorder=0,
                title="Filtered vs. Raw Data",
            )
        )
        # one legend label per plotted line
        labels = [field + "_filtered"]

        if compare_series:
            ax = (
                self.raw_df.loc[pd.IndexSlice[:, ticker], field]
                .droplevel(1)
                .plot(
                    linewidth=1,
                    figsize=(15, 7),
                    linestyle=":",
                    color="#FF8785",
                    zorder=0,
                )
            )
            labels.append(field + "_raw")

        ax.grid(color="black", linewidth=0.05)
        ax.xaxis.grid(False)
        ax.set_ylabel(ticker)
        ax.ticklabel_format(style="plain", axis="y")
        ax.set_facecolor("whitesmoke")
        ax.legend(labels, loc="upper left")

    def get(self, attr="df") -> pd.DataFrame:
        """
        Returns CleanData object attribute.

        Every call first writes the percentage of missing values of the current dataframe to the
        '%_NaN_end' row of the summary and rounds the summary to 2 decimals.

        Parameters
        ----------
        attr: str, {'df', 'outliers', 'yhat', 'filtered_tickers', 'summary'}, default 'df'
            CleanData object attribute to return

        Returns
        -------
        pd.DataFrame or list
            Value of the requested attribute, e.g. the cleaned dataframe for 'df' or the list of
            removed tickers for 'filtered_tickers'.
        """
        wide_df = self.df.unstack()
        self.summary.loc["%_NaN_end", wide_df.columns] = (
            wide_df.isnull().sum() / wide_df.shape[0]
        ).values * 100
        self.summary = self.summary.astype(float).round(2)

        return getattr(self, attr)
|