pfeed 0.0.1.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pfeed/.DS_Store +0 -0
- pfeed/__init__.py +20 -0
- pfeed/config/logging.yml +47 -0
- pfeed/const/commons.py +2 -0
- pfeed/const/paths.py +9 -0
- pfeed/datastore.py +117 -0
- pfeed/feeds/__init__.py +2 -0
- pfeed/feeds/base_feed.py +3 -0
- pfeed/feeds/bybit_feed.py +93 -0
- pfeed/feeds/custom_csv_feed.py +13 -0
- pfeed/feeds/yahoo_finance_feed.py +96 -0
- pfeed/filepath.py +36 -0
- pfeed/main.py +72 -0
- pfeed/sources/.DS_Store +0 -0
- pfeed/sources/__init__.py +1 -0
- pfeed/sources/bybit/__init__.py +2 -0
- pfeed/sources/bybit/api.py +63 -0
- pfeed/sources/bybit/config.yml +4 -0
- pfeed/sources/bybit/const.py +42 -0
- pfeed/sources/bybit/eda.ipynb +29050 -0
- pfeed/sources/bybit/etl.py +161 -0
- pfeed/sources/bybit/historical.py +151 -0
- pfeed/utils/utils.py +77 -0
- pfeed-0.0.1.dev1.dist-info/LICENSE +201 -0
- pfeed-0.0.1.dev1.dist-info/METADATA +42 -0
- pfeed-0.0.1.dev1.dist-info/RECORD +27 -0
- pfeed-0.0.1.dev1.dist-info/WHEEL +4 -0
pfeed/.DS_Store
ADDED
|
Binary file
|
pfeed/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import multiprocessing
|
|
2
|
+
|
|
3
|
+
from dotenv import load_dotenv, find_dotenv
|
|
4
|
+
from rich.console import Console
|
|
5
|
+
|
|
6
|
+
from pfeed.sources import bybit
|
|
7
|
+
from pfeed.feeds import YahooFinanceFeed, BybitFeed
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
cprint = Console().print
|
|
11
|
+
load_dotenv(find_dotenv())
|
|
12
|
+
# 'fork' does not exist on every platform (e.g. Windows). Forcing it there
# raises ValueError, which would make `import pfeed` itself fail, so only
# select it where the interpreter actually offers it.
if 'fork' in multiprocessing.get_all_start_methods():
    multiprocessing.set_start_method('fork', force=True)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
__all__ = (
|
|
16
|
+
'bybit',
|
|
17
|
+
'YahooFinanceFeed',
|
|
18
|
+
'BybitFeed',
|
|
19
|
+
'cprint',
|
|
20
|
+
)
|
pfeed/config/logging.yml
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
version: 1
|
|
2
|
+
filename_format: '%Y-%m-%d_UTC%z'
|
|
3
|
+
loggers:
|
|
4
|
+
root:
|
|
5
|
+
level: 'DEBUG'
|
|
6
|
+
handlers: ['compressed_timed_rotating_file_handler', 'stream_handler']
|
|
7
|
+
propagate: False
|
|
8
|
+
pfeed:
|
|
9
|
+
level: 'DEBUG'
|
|
10
|
+
handlers: ['compressed_timed_rotating_file_handler', 'stream_handler']
|
|
11
|
+
propagate: False
|
|
12
|
+
minio:
|
|
13
|
+
level: 'DEBUG'
|
|
14
|
+
handlers: ['compressed_timed_rotating_file_handler', 'stream_handler']
|
|
15
|
+
propagate: False
|
|
16
|
+
yahoo_finance:
|
|
17
|
+
level: 'DEBUG'
|
|
18
|
+
handlers: ['compressed_timed_rotating_file_handler', 'stream_handler']
|
|
19
|
+
propagate: False
|
|
20
|
+
bybit:
|
|
21
|
+
level: 'DEBUG'
|
|
22
|
+
handlers: ['compressed_timed_rotating_file_handler', 'stream_handler']
|
|
23
|
+
propagate: False
|
|
24
|
+
handlers:
|
|
25
|
+
file_handler:
|
|
26
|
+
class: 'logging.FileHandler'
|
|
27
|
+
level: 'DEBUG'
|
|
28
|
+
formatter: 'file'
|
|
29
|
+
compressed_timed_rotating_file_handler:
|
|
30
|
+
class: 'pfund.logging.handlers.CompressedTimedRotatingFileHandler'
|
|
31
|
+
level: 'DEBUG'
|
|
32
|
+
formatter: 'file'
|
|
33
|
+
kwargs: {'when': 'midnight', 'backupCount': 7, 'utc': True, 'encoding': 'utf-8'}
|
|
34
|
+
stream_handler:
|
|
35
|
+
class: 'logging.StreamHandler'
|
|
36
|
+
level: 'DEBUG'
|
|
37
|
+
formatter: 'console'
|
|
38
|
+
formatters:
|
|
39
|
+
path:
|
|
40
|
+
format: '%(asctime)s.%(msecs)03d | %(levelname)s | %(name)s | %(message)s | %(shortpath)s fn:%(funcName)s ln:%(lineno)d'
|
|
41
|
+
datefmt: '%H:%M:%S%z'
|
|
42
|
+
file:
|
|
43
|
+
format: '%(asctime)s.%(msecs)03d | %(levelname)s | %(message)s | %(filename)s fn:%(funcName)s ln:%(lineno)d'
|
|
44
|
+
datefmt: '%H:%M:%S%z'
|
|
45
|
+
console:
|
|
46
|
+
format: '%(asctime)s.%(msecs)03d | %(levelname)s | %(name)s | %(message)s | fn:%(funcName)s ln:%(lineno)d'
|
|
47
|
+
datefmt: '%Y-%m-%d %H:%M:%S%z'
|
pfeed/const/commons.py
ADDED
pfeed/const/paths.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# All locations are derived from where this file lives
# (<root>/<package>/const/paths.py) rather than from directory *names*.
# The previous version took the project name from the directory that contains
# the package, which is only right when that directory is named like the
# package; for an installed wheel it is "site-packages".
_PACKAGE_PATH = Path(__file__).resolve().parent.parent
# directory that contains the package (the repository root in a source checkout)
_ROOT_PATH = _PACKAGE_PATH.parent

PROJ_NAME = _PACKAGE_PATH.name
MAIN_PATH = _ROOT_PATH.parent
PROJ_PATH = _PACKAGE_PATH
CONFIG_PATH = PROJ_PATH / 'config'
# logs/ and data/ stay next to the package directory, as before
LOG_PATH = _ROOT_PATH / 'logs'
DATA_PATH = _ROOT_PATH / 'data'
|
pfeed/datastore.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import io
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from typing import Generator
|
|
6
|
+
|
|
7
|
+
from minio import Minio, S3Error
|
|
8
|
+
from minio.api import ObjectWriteResult
|
|
9
|
+
|
|
10
|
+
from pfeed.const.paths import PROJ_NAME
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger('minio')
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# EXTEND, currently only consider using MinIO
|
|
17
|
+
class Datastore:
    '''Thin wrapper around a MinIO client that always works on one bucket.

    Attributes that are not defined on this class are forwarded to the
    underlying ``Minio`` client through ``__getattr__``.
    '''
    DATA_PART_SIZE = 5 * (1024 ** 2)  # part size for S3, 5 MB
    # bucket name = project name + '-' + lower-cased PFEED_ENV (default 'dev')
    BUCKET_NAME = PROJ_NAME + '-' + os.getenv('PFEED_ENV', 'DEV').lower()

    def __init__(self, **kwargs):
        '''
        Args:
            kwargs: extra keyword arguments passed on to the Minio constructor
        '''
        self.minio = Minio(
            endpoint=os.getenv('MINIO_ENDPOINT'),
            access_key=os.getenv('MINIO_ACCESS_KEY'),
            secret_key=os.getenv('MINIO_SECRET_KEY'),
            # TLS (HTTPS) is only switched on when PFEED_ENV is PRD
            secure=os.getenv('PFEED_ENV', 'DEV').upper() == 'PRD',
            **kwargs,
        )

    def __getattr__(self, attr):
        '''gets triggered only when the attribute is not found'''
        # If `minio` itself is missing (e.g. __init__ raised before assigning
        # it), `self.minio` below would re-enter __getattr__ without end.
        # Fail with a regular AttributeError instead.
        if attr == 'minio':
            raise AttributeError(attr)
        return getattr(self.minio, attr)

    def get_object(self, object_name: str) -> bytes | None:
        '''Read one object from the bucket.

        Args:
            object_name: name (key) of the object inside BUCKET_NAME
        Returns:
            the object's content, or None if the response status is not 200
            or MinIO raised an S3Error
        '''
        res = None
        try:
            res = self.minio.get_object(self.BUCKET_NAME, object_name)
            if res.status == 200:
                return res.data
            logger.error(f'Unhandled MinIO response status {res.status}')
        except S3Error as err:
            # Best effort, as before: an S3Error yields None instead of
            # propagating. It is now recorded at debug level so that the
            # failure is not completely invisible.
            logger.debug(f'MinIO S3Error {object_name=} {err=}')
        finally:
            # hand the HTTP connection back to the pool, otherwise every
            # call keeps one connection checked out
            if res is not None:
                res.close()
                res.release_conn()
        return None

    def list_objects(self, prefix) -> list | None:
        '''
        Args:
            prefix: e.g. live/bybit/historical/raw/BTC_USDT_PERP/
        Returns:
            the objects yielded by MinIO for that prefix, collected in a list
        '''
        bucket_name = self.BUCKET_NAME
        objects: Generator = self.minio.list_objects(bucket_name, prefix=prefix)
        return list(objects)

    def put_object(self, object_name: str, data: bytes, **kwargs) -> ObjectWriteResult:
        '''Upload bytes as an object, creating the bucket first if needed.

        Args:
            object_name: name (key) of the object inside BUCKET_NAME
            data: content to upload
            kwargs: extra keyword arguments passed on to Minio.put_object
        '''
        bucket_name = self.BUCKET_NAME
        if not self.minio.bucket_exists(bucket_name):
            self.minio.make_bucket(bucket_name)
        return self.minio.put_object(
            bucket_name,
            object_name,
            data=io.BytesIO(data),
            part_size=self.DATA_PART_SIZE,
            # length=-1: size is not given up front, upload is done in parts of part_size
            length=-1,
            **kwargs
        )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
if __name__ == '__main__':
    # Manual smoke test / usage examples for Datastore.
    datastore = Datastore()
    # list buckets
    # buckets = datastore.list_buckets()
    # for bucket in buckets:
    #     print(bucket.name, bucket.creation_date)

    # list objects
    # list_objects() requires a prefix; calling it without one raises TypeError
    objects = datastore.list_objects(prefix='live/bybit/historical/raw/BTC_USDT_PERP/')
    for obj in objects:
        print(obj.object_name)

    # get object
    # data = datastore.get_object(
    #     object_name="live/bybit/historical/raw/BTC_USDT_PERP/BTC_USDT_PERP_2023-11-01.csv.gz"
    # )

    # put object
    # datastore.put_object(
    #     bucket_name='test',
    #     object_name='test_prefix/test',
    #     data=b'test',
    #     part_size=1024**2 * 5,
    # )

    # upload a file
    # datastore.fput_object(
    #     bucket_name="dev",
    #     object_name="test_prefix/test",
    #     file_path=f"{PROJ_PATH}/test_data/test.txt"
    # )

    # get object info
    # res = datastore.stat_object(
    #     bucket_name="dev",
    #     object_name="test_prefix/test"
    # )


    # copy an object from one prefix to another
    # res = datastore.copy_object(
    #     bucket_name="dev",
    #     object_name="new_prefix/test",
    #     source=CopySource(
    #         bucket_name='dev',
    #         object_name='test_prefix/test'
    #     )
    # )
|
pfeed/feeds/__init__.py
ADDED
pfeed/feeds/base_feed.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import datetime
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from pfeed.feeds.base_feed import BaseFeed
|
|
7
|
+
from pfeed.sources.bybit import api
|
|
8
|
+
from pfeed.sources.bybit import etl
|
|
9
|
+
from pfeed.sources.bybit.const import DATA_SOURCE, create_efilename
|
|
10
|
+
from pfeed.const.paths import DATA_PATH
|
|
11
|
+
from pfeed.utils.utils import get_dates_in_between, rollback_date_range
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
__all__ = ['BybitFeed']
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BybitFeed(BaseFeed):
    '''Feed that returns Bybit historical data as pandas DataFrames.'''

    def __init__(self):
        super().__init__('bybit')

    @staticmethod
    def _derive_dtype_from_resolution(resolution):
        '''Map a resolution string (e.g. '1d') to a data type name.

        Returns:
            one of 'tick', 'second', 'minute', 'hour', 'daily'
        Raises:
            Exception: if the resolution is none of the above
        '''
        from pfund.datas.resolution import Resolution
        resolution = Resolution(resolution)
        if resolution.is_tick():
            return 'tick'
        elif resolution.is_second():
            return 'second'
        elif resolution.is_minute():
            return 'minute'
        elif resolution.is_hour():
            return 'hour'
        elif resolution.is_day():
            return 'daily'
        else:
            raise Exception(f'{resolution=} is not supported')

    def get_historical_data(
        self,
        pdt: str,
        rollback_period: str='1w',
        resolution: str='1d',
        start_date: str=None,
        end_date: str=None,
        data_path: str=str(DATA_PATH),
    ) -> pd.DataFrame:
        '''Load historical data for one product, one DataFrame row set per day.

        For every date, locally stored data is used when etl.extract_data()
        finds it; otherwise the raw file is downloaded, cleaned and resampled
        on the fly.

        Args:
            pdt: product, e.g. 'BCH_USDT_PERP'; the part after the last '_' is the product type
            rollback_period: only used when start_date is not given
            resolution: target resolution of the returned data, e.g. '1d'
            start_date: first date, 'YYYY-MM-DD'
            end_date: last date, 'YYYY-MM-DD'; defaults to yesterday (UTC) when start_date is given
            data_path: where local data is looked up
        Returns:
            the per-date frames concatenated; an empty DataFrame if no
            requested date is available at the data source
        Raises:
            Exception: if the file listing or a day's data cannot be downloaded
        '''
        from pfund.exchanges.bybit.exchange import Exchange

        env = 'LIVE'  # historical data is from LIVE env
        exchange = Exchange(env)
        adapter = exchange.adapter
        dtype = self._derive_dtype_from_resolution(resolution)
        ptype = pdt.split('_')[-1]
        is_spot = (ptype.upper() == 'SPOT')
        category = exchange.categorize_product(ptype)
        epdt = adapter(pdt, ref_key=category)
        efilenames = api.get_efilenames(category, epdt)
        # api.get_efilenames() returns None when the listing request fails;
        # report that instead of failing on the membership test below
        if efilenames is None:
            raise Exception(f'failed to get the list of available files for {DATA_SOURCE} {pdt}')
        # set: one membership test per date
        efilenames = set(efilenames)

        if start_date:
            # default for end_date is yesterday
            end_date: str = end_date or (datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        else:
            start_date, end_date = rollback_date_range(rollback_period)
        dates: list[str] = get_dates_in_between(start_date, end_date)

        dfs = []
        for date in dates:
            data_str = f'{DATA_SOURCE} {pdt} {date}'
            efilename = create_efilename(epdt, date, is_spot=is_spot)
            if efilename not in efilenames:
                print(f'{efilename} does not exist in {DATA_SOURCE}')
                continue
            if local_data := etl.extract_data(pdt, date, dtype, env=env, mode='historical', data_path=data_path):
                # e.g. local_data could be 1m data (period always = 1), but resampled_data could be 3m data
                resampled_data: bytes = etl.resample_data(local_data, resolution, is_tick=(dtype == 'tick'), category=category)
                print(f'loaded {data_str} local {dtype} data')
            else:
                print(f'Downloading {data_str} data on the fly, please consider using {DATA_SOURCE.lower()}.run_historical(...) to pre-download data to your local computer first')
                if raw_data := api.get_data(category, epdt, date):
                    tick_data: bytes = etl.clean_data(category, raw_data)
                    resampled_data: bytes = etl.resample_data(tick_data, resolution, is_tick=True, category=category)
                    print(f'resampled {data_str} data to {resolution=}')
                else:
                    raise Exception(f'failed to download {data_str} historical data')
            df = pd.read_parquet(io.BytesIO(resampled_data))
            dfs.append(df)
        # pd.concat() raises ValueError on an empty list (every date skipped)
        if not dfs:
            return pd.DataFrame()
        return pd.concat(dfs)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
if __name__ == '__main__':
    # manual check: print two days of daily BCH_USDT_PERP data
    bybit_feed = BybitFeed()
    print(bybit_feed.get_historical_data('BCH_USDT_PERP', rollback_period='2d', resolution='1d'))
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from pfeed.feeds.base_feed import BaseFeed
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
__all__ = ['CustomCsvFeed']
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# TODO
|
|
8
|
+
class CustomCsvFeed(BaseFeed):
    '''Placeholder for a feed backed by user-provided CSV files (not implemented yet).'''

    def __init__(self, name='custom_csv'):
        '''
        Args:
            name: feed name handed to BaseFeed
        '''
        super().__init__(name)

    def get_historical_data(self, *args, **kwargs):
        '''Not available yet; always raises NotImplementedError.'''
        raise NotImplementedError
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import yfinance as yf
|
|
5
|
+
|
|
6
|
+
from pfeed.feeds.base_feed import BaseFeed
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
__all__ = ['YahooFinanceFeed']
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class YahooFinanceFeed(BaseFeed):
    '''Feed that returns Yahoo Finance data through yfinance.'''
    _ADAPTER = {
        'timeframe': {
            # pfund's : yfinance's
            'M': 'mo',
            'w': 'wk',
        }
    }
    # yfinance's valid intervals: [1m, 2m, 5m, 15m, 30m, 60m, 90m, 1h, 1d, 5d, 1wk, 1mo, 3mo]
    SUPPORTED_TIMEFRAMES_AND_PERIODS = {
        'm': [1, 2, 5, 15, 30, 60, 90],
        'h': [1],
        'd': [1, 5],
        'w': [1],
        'M': [1, 3],
    }

    def __init__(self):
        super().__init__('yahoo_finance')

    def get_ticker(self, symbol):
        '''Return the yfinance Ticker for the upper-cased symbol.'''
        return yf.Ticker(symbol.upper())

    def _to_yfinance_format(self, resolution: str) -> str:
        '''Convert a pfund resolution string (e.g. '1M') to yfinance's spelling (e.g. '1mo').'''
        from pfund.datas.resolution import Resolution

        resolution = Resolution(resolution)
        timeframe = repr(resolution.timeframe)
        etimeframe = self._ADAPTER['timeframe'].get(timeframe, timeframe)
        return str(resolution.period) + etimeframe

    def get_historical_data(
        self,
        symbol: str,
        rollback_period: str='1M',
        resolution: str='1d',
        start_date: str=None,
        end_date: str=None,
        **kwargs
    ) -> pd.DataFrame:
        """Simple Wrapper of yfinance history().
        For the details of args and kwargs, please refer to https://github.com/ranaroussi/yfinance

        Args:
            symbol: ticker symbol, e.g. 'TSLA'
            rollback_period: how far to look back, in pfund format; used when
                neither start_date nor the yfinance kwarg `period` is given
            resolution: bar size in pfund format; overridden by the yfinance kwarg `interval`
            start_date: 'YYYY-MM-DD'
            end_date: 'YYYY-MM-DD'; defaults to today (UTC) when start_date is
                given, ignored when start_date is not given
            kwargs: passed on to yfinance's history()
        Returns:
            DataFrame indexed by 'ts', with lower-cased column names whose
            spaces are replaced by underscores
        """
        # convert pfund's rollback_period format to yfinance's period
        erollback_period = self._to_yfinance_format(rollback_period)
        # if user is directly using yfinance variable `period`, use it
        period_given = 'period' in kwargs
        period = kwargs.pop('period') if period_given else erollback_period

        # convert pfund's resolution format to yfinance's interval
        eresolution = self._to_yfinance_format(resolution)
        # if user is directly using yfinance variable `interval`, use it
        interval = kwargs.pop('interval') if 'interval' in kwargs else eresolution

        if start_date:
            # default for end_date is today
            end_date = end_date or datetime.datetime.now(tz=datetime.timezone.utc).strftime('%Y-%m-%d')
            # An explicit date range replaces the rollback period, so do not
            # send the derived period along with start and end.
            # NOTE(review): newer yfinance releases appear to reject
            # period+start+end together - confirm against the pinned version.
            if not period_given:
                period = None
        else:
            start_date = end_date = None

        ticker = self.get_ticker(symbol)
        df = ticker.history(period=period, interval=interval, start=start_date, end=end_date, **kwargs)
        df.rename_axis('ts', inplace=True)  # rename index 'Date' to 'ts'
        # if there are spaces in column names, they will be turned into some weird names like "_10"
        # during "for row in df.itertuples()", so replace them in every column
        # (e.g. 'stock splits' -> 'stock_splits', 'capital gains' -> 'capital_gains')
        df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
        # convert to UTC; tz_convert() raises on a tz-naive index, so such an
        # index (or a non-datetime one) is left untouched
        if isinstance(df.index, pd.DatetimeIndex) and df.index.tz is not None:
            df.index = df.index.tz_convert('UTC')
        # convert to UTC and remove +hh:mm from YYYY-MM-DD hh:mm:ss+hh:mm
        # df.index = df.index.tz_convert('UTC').tz_localize(None)
        return df
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
if __name__ == '__main__':
    # manual check: print TSLA history with the default arguments
    yahoo_feed = YahooFinanceFeed()
    print(yahoo_feed.get_historical_data('TSLA'))
|
pfeed/filepath.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from typing import Literal
|
|
4
|
+
|
|
5
|
+
from pfeed.utils.utils import create_filename
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class FilePath:
    '''Simple wrapper for file path to extract info faster

    Every location is exposed twice: as a str (lower-case name, e.g.
    `file_path` / `fpath`) and as a pathlib.Path (capital "P", e.g.
    `file_Path` / `fPath`).
    '''
    def __init__(
        self,
        data_path: str,
        env: Literal['PAPER', 'LIVE'],
        data_source: str,
        data_type: Literal['raw', 'tick', 'second', 'minute', 'hour', 'daily'],
        mode: Literal['historical', 'streaming'],
        pdt: str,
        date: str,
        file_extension: str,
    ):
        # root directory
        self.data_path = data_path
        self.dpath = data_path
        root = Path(data_path)
        self.data_Path = root
        self.dPath = root

        # path components; env, data source, data type and pdt are stored lower-cased
        self.env = env.lower()
        self.data_source = data_source.lower()
        normalized_dtype = data_type.lower()
        self.data_type = normalized_dtype
        self.dtype = normalized_dtype
        self.mode = mode
        self.pdt = pdt.lower()
        self.date = date
        self.file_extension = file_extension

        # the directory and file name use the upper-cased product
        product_dir = pdt.upper()
        self.filename = create_filename(product_dir, date, file_extension)

        # location relative to the root: env/source/mode/dtype/PDT/filename
        relative = Path(self.env, self.data_source, self.mode, self.dtype, product_dir, self.filename)
        self.storage_Path = relative
        self.sPath = relative
        relative_str = str(relative)
        self.storage_path = relative_str
        self.spath = relative_str

        # full location: root + relative location
        full = root / relative
        self.file_Path = full
        self.fPath = full
        full_str = str(full)
        self.file_path = full_str
        self.fpath = full_str
|
|
35
|
+
|
|
36
|
+
|
pfeed/main.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import logging
|
|
4
|
+
import importlib
|
|
5
|
+
from types import TracebackType
|
|
6
|
+
from argparse import ArgumentParser
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
from pfeed.const.paths import LOG_PATH, DATA_PATH
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
ALIASES = {
|
|
13
|
+
'yf': 'yahoo_finance',
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _custom_excepthook(exception_class: type[BaseException], exception: BaseException, traceback: TracebackType):
    '''Catches any uncaught exceptions and logs them

    Has the signature of sys.excepthook and writes the exception, with its
    traceback, to the 'pfeed' logger at ERROR level.

    Args:
        exception_class: type of the uncaught exception
        exception: the uncaught exception instance
        traceback: traceback of the uncaught exception
    '''
    # sys.__excepthook__(exception_class, exception, traceback)
    # Hand the original triple to the logger instead of re-raising the
    # exception inside a bare `except:`: re-raising ignores the traceback
    # argument and adds this hook's own frame to the logged traceback.
    logging.getLogger('pfeed').error(
        'Uncaught exception:',
        exc_info=(exception_class, exception, traceback),
    )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
if __name__ == '__main__':
    # command-line entry point: parse the options and run the pipeline of the chosen source/mode
    sys.excepthook = _custom_excepthook

    parser = ArgumentParser()
    parser.add_argument('-e', '--env', choices=['PAPER', 'LIVE'], default='LIVE', required=False)
    parser.add_argument('--data-path', dest='data_path', default=str(DATA_PATH), required=False)
    parser.add_argument('--log-path', dest='log_path', default=str(LOG_PATH), required=False)
    parser.add_argument('-m', '--mode', default='historical', help='historical=historical data processing; streaming=live data streaming', choices=['historical', 'streaming'], required=False)
    parser.add_argument('-b', '--start-date', dest='start_date', help='Start date in YYYY-MM-DD format', required=False)
    parser.add_argument('-n', '--end-date', dest='end_date', help='End date in YYYY-MM-DD format', required=False)
    parser.add_argument('--ptypes', nargs='+', help='List of product types, e.g. PERP = get all perpetuals', required=False)
    parser.add_argument('-p', '--pdts', nargs='+', help='List of trading products', required=False)
    parser.add_argument('--dtypes', nargs='+', help='List of data types, e.g. raw, tick, second, minute, hour, daily', required=False)
    parser.add_argument(
        '-s', '--source', required=True,
        choices=[
            'bybit',
        ],
    )
    parser.add_argument('-z', '--batch-size', dest='batch_size', default=8, help='batch size for Ray tasks', type=int, required=False)
    parser.add_argument('--no-ray', dest='no_ray', action='store_true', required=False)
    parser.add_argument('--no-minio', dest='no_minio', action='store_true', required=False)

    # handle arguments
    args = parser.parse_args()
    use_ray = not args.no_ray
    # MinIO cannot be used without an endpoint. Decide once, so that the
    # message printed here and the flag given to the pipeline agree
    # (previously "MinIO is disabled" could be printed while use_minio=True was passed on).
    use_minio = not args.no_minio and os.getenv('MINIO_ENDPOINT') is not None
    print("Ray is enabled" if use_ray else "Ray is disabled")
    print("MinIO is enabled" if use_minio else "MinIO is disabled")
    # NOTE(review): env is parsed here but not forwarded to pipeline.run() - confirm that this is intended
    env, source, mode = args.env.upper(), args.source.lower(), args.mode.lower()

    source = ALIASES.get(source, source)
    pipeline = importlib.import_module(f'pfeed.sources.{source}.{mode}')
    if mode == 'historical':
        pipeline.run(
            dtypes=args.dtypes,
            ptypes=args.ptypes,
            pdts=args.pdts,
            start_date=args.start_date,
            end_date=args.end_date,
            log_path=args.log_path,
            data_path=args.data_path,
            batch_size=args.batch_size,
            use_ray=use_ray,
            use_minio=use_minio,
        )
    else:
        # TODO: implement streaming mode
        raise NotImplementedError(f'{mode} is not implemented yet')
|
pfeed/sources/.DS_Store
ADDED
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from pfeed.sources import bybit
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import time
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
from pfeed.sources.bybit.const import DATA_SOURCE, DATA_SOURCE_URLS, DATA_NAMING_REGEX_PATTERNS, create_efilename
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(DATA_SOURCE.lower())
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get(url, handle_func, frequency=1, num_retry=3):
    '''
    Handles general requests.get with control on frequency and number of retry
    Args:
        url: address to call
        handle_func: specific logic for handling response
        frequency: seconds to wait after a failed attempt
        num_retry: maximum number of attempts
    Returns:
        whatever handle_func returns for the first response with status 200;
        None if the status is 404 or every attempt failed
    '''
    logger.debug(f'calling {url}')
    # A bounded loop: the previous `while num_retry:` never decreased the
    # counter, so any status other than 200/404 was retried forever and the
    # "failed to call" branch could not be reached.
    for _ in range(num_retry):
        res = requests.get(url)
        if res.status_code == 200:
            return handle_func(res)
        elif res.status_code == 404:
            # retrying does not help for a missing file
            logger.error(f'File not found {url=} {res.status_code=} {res.text=}')
            break
        else:
            logger.warning(f'{res.status_code=} {res.text=}')
            time.sleep(frequency)
    else:
        # only reached when the loop was not left through `break`, i.e. all attempts are used up
        logger.error(f'failed to call {url}')
    return None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_efilenames(category: str, epdt: str):
    '''
    Get efilenames (e.g. BTCUSDT2022-10-04.csv.gz)

    Collects the href of every <a> element on the product's listing page;
    the result is whatever get() returns for that page.
    '''
    def _collect_hrefs(response):
        anchors = BeautifulSoup(response.text, 'html.parser').find_all('a')
        return [anchor.get('href') for anchor in anchors]

    listing_url = '/'.join((DATA_SOURCE_URLS[category], epdt))
    return get(listing_url, _collect_hrefs, frequency=1, num_retry=3)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_epdts(category: str, ptype: str):
    '''
    Get the product names listed for a category, restricted to one product type.

    Args:
        category: key of DATA_SOURCE_URLS
        ptype: key of DATA_NAMING_REGEX_PATTERNS, e.g. 'PERP'
    Returns:
        the matching hrefs with every '/' removed; whatever get() returns if
        the page could not be fetched
    '''
    pattern = re.compile(DATA_NAMING_REGEX_PATTERNS[ptype])

    def _handle_response(res):
        soup = BeautifulSoup(res.text, 'html.parser')
        # href=True: skip <a> elements without an href, for which
        # pattern.search(None) would raise TypeError
        hrefs = (node.get('href') for node in soup.find_all('a', href=True))
        return [href.replace('/', '') for href in hrefs if pattern.search(href)]

    url = DATA_SOURCE_URLS[category]
    return get(url, _handle_response, frequency=1, num_retry=3)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_data(category: str, epdt: str, date: str):
    '''
    Download the file of one product and date.

    Returns the response body as bytes, or whatever get() returns when the
    request does not succeed.
    '''
    spot = category.upper() == 'SPOT'
    efilename = create_efilename(epdt, date, is_spot=spot)
    file_url = f"{DATA_SOURCE_URLS[category]}/{epdt}/{efilename}"
    return get(file_url, lambda response: response.content, frequency=1, num_retry=3)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
DATA_SOURCE = 'BYBIT'
|
|
2
|
+
# EXTEND: support FUT and IFUT
|
|
3
|
+
# SUPPORTED_CRYPTO_PRODUCT_TYPES = ['SPOT', 'PERP', 'IPERP', 'FUT', 'IFUT']
|
|
4
|
+
SUPPORTED_CRYPTO_PRODUCT_TYPES = ['SPOT', 'PERP', 'IPERP']
|
|
5
|
+
RAW_DATA_TYPE = 'tick'
|
|
6
|
+
# do not need to be precise
|
|
7
|
+
DATA_START_DATE = '2020-01-01'
|
|
8
|
+
DATA_SOURCE_URLS = {
|
|
9
|
+
'linear': 'https://public.bybit.com/trading',
|
|
10
|
+
'inverse': 'https://public.bybit.com/trading',
|
|
11
|
+
'spot': 'https://public.bybit.com/spot',
|
|
12
|
+
}
|
|
13
|
+
# Regular expressions, one per product type.
# Raw strings: sequences such as \d and \/ are regex syntax, not Python string
# escapes; in a normal string literal they are invalid escape sequences that
# Python warns about. The pattern text itself is unchanged.
DATA_NAMING_REGEX_PATTERNS = {
    'PERP': r'(USDT\/|PERP\/)$',  # USDT perp or USDC perp;
    'FUT': r'-\d{2}[A-Z]{3}\d{2}\/$',  # USDC futures e.g. BTC-10NOV23/
    'IPERP': r'USD\/$',  # inverse perps;
    'IFUT': r'USD[A-Z]\d{2}\/$',  # inverse futures e.g. BTCUSDH24/
    # match everything since everything from https://public.bybit.com/spot is spot
    'SPOT': '.*',
}
|
|
21
|
+
SELECTED_RAW_COLS = {
|
|
22
|
+
'linear': ['timestamp', 'side', 'size', 'price'],
|
|
23
|
+
'inverse': ['timestamp', 'side', 'size', 'price'],
|
|
24
|
+
'spot': ['timestamp', 'side', 'volume', 'price'],
|
|
25
|
+
}
|
|
26
|
+
RENAMING_COLS = {
|
|
27
|
+
'linear': {'timestamp': 'ts', 'size': 'volume'},
|
|
28
|
+
'inverse': {'timestamp': 'ts', 'size': 'volume'},
|
|
29
|
+
'spot': {'timestamp': 'ts'},
|
|
30
|
+
}
|
|
31
|
+
RAW_DATA_TIMESTAMP_UNITS = {
|
|
32
|
+
'linear': 's',
|
|
33
|
+
'inverse': 's',
|
|
34
|
+
'spot': 'ms'
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def create_efilename(epdt, date, is_spot=False):
    '''Build the file name of one product and date, ending in '.csv.gz'.

    Spot file names have an underscore between product and date,
    all others have nothing in between.
    '''
    separator = '_' if is_spot else ''
    return f'{epdt}{separator}{date}.csv.gz'
|