pfeed 0.0.1.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pfeed/.DS_Store ADDED
Binary file
pfeed/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ import multiprocessing
2
+
3
+ from dotenv import load_dotenv, find_dotenv
4
+ from rich.console import Console
5
+
6
+ from pfeed.sources import bybit
7
+ from pfeed.feeds import YahooFinanceFeed, BybitFeed
8
+
9
+
10
+ cprint = Console().print
11
+ load_dotenv(find_dotenv())
12
# 'fork' is only offered on POSIX platforms; forcing it unconditionally makes
# `import pfeed` raise ValueError where it is unavailable (e.g. Windows).
if 'fork' in multiprocessing.get_all_start_methods():
    multiprocessing.set_start_method('fork', force=True)
13
+
14
+
15
+ __all__ = (
16
+ 'bybit',
17
+ 'YahooFinanceFeed',
18
+ 'BybitFeed',
19
+ 'cprint',
20
+ )
@@ -0,0 +1,47 @@
1
+ version: 1
2
+ filename_format: '%Y-%m-%d_UTC%z'
3
+ loggers:
4
+ root:
5
+ level: 'DEBUG'
6
+ handlers: ['compressed_timed_rotating_file_handler', 'stream_handler']
7
+ propagate: False
8
+ pfeed:
9
+ level: 'DEBUG'
10
+ handlers: ['compressed_timed_rotating_file_handler', 'stream_handler']
11
+ propagate: False
12
+ minio:
13
+ level: 'DEBUG'
14
+ handlers: ['compressed_timed_rotating_file_handler', 'stream_handler']
15
+ propagate: False
16
+ yahoo_finance:
17
+ level: 'DEBUG'
18
+ handlers: ['compressed_timed_rotating_file_handler', 'stream_handler']
19
+ propagate: False
20
+ bybit:
21
+ level: 'DEBUG'
22
+ handlers: ['compressed_timed_rotating_file_handler', 'stream_handler']
23
+ propagate: False
24
+ handlers:
25
+ file_handler:
26
+ class: 'logging.FileHandler'
27
+ level: 'DEBUG'
28
+ formatter: 'file'
29
+ compressed_timed_rotating_file_handler:
30
+ class: 'pfund.logging.handlers.CompressedTimedRotatingFileHandler'
31
+ level: 'DEBUG'
32
+ formatter: 'file'
33
+ kwargs: {'when': 'midnight', 'backupCount': 7, 'utc': True, 'encoding': 'utf-8'}
34
+ stream_handler:
35
+ class: 'logging.StreamHandler'
36
+ level: 'DEBUG'
37
+ formatter: 'console'
38
+ formatters:
39
+ path:
40
+ format: '%(asctime)s.%(msecs)03d | %(levelname)s | %(name)s | %(message)s | %(shortpath)s fn:%(funcName)s ln:%(lineno)d'
41
+ datefmt: '%H:%M:%S%z'
42
+ file:
43
+ format: '%(asctime)s.%(msecs)03d | %(levelname)s | %(message)s | %(filename)s fn:%(funcName)s ln:%(lineno)d'
44
+ datefmt: '%H:%M:%S%z'
45
+ console:
46
+ format: '%(asctime)s.%(msecs)03d | %(levelname)s | %(name)s | %(message)s | fn:%(funcName)s ln:%(lineno)d'
47
+ datefmt: '%Y-%m-%d %H:%M:%S%z'
pfeed/const/commons.py ADDED
@@ -0,0 +1,2 @@
1
+ SUPPORTED_DATA_TYPES = ['raw', 'tick', 'second', 'minute', 'hour', 'daily']
2
+ SUPPORTED_DATA_FEEDS = ['YAHOO_FINANCE', 'BYBIT']
pfeed/const/paths.py ADDED
@@ -0,0 +1,9 @@
1
+ from pathlib import Path
2
+
3
+
4
+ PROJ_NAME = Path(__file__).resolve().parents[2].name
5
+ MAIN_PATH = Path(__file__).resolve().parents[3]
6
+ PROJ_PATH = MAIN_PATH / PROJ_NAME / PROJ_NAME
7
+ CONFIG_PATH = PROJ_PATH / 'config'
8
+ LOG_PATH = MAIN_PATH / PROJ_NAME / 'logs'
9
+ DATA_PATH = MAIN_PATH / PROJ_NAME / 'data'
pfeed/datastore.py ADDED
@@ -0,0 +1,117 @@
1
+ import os
2
+ import io
3
+ import logging
4
+
5
+ from typing import Generator
6
+
7
+ from minio import Minio, S3Error
8
+ from minio.api import ObjectWriteResult
9
+
10
+ from pfeed.const.paths import PROJ_NAME
11
+
12
+
13
+ logger = logging.getLogger('minio')
14
+
15
+
16
+ # EXTEND, currently only consider using MinIO
17
class Datastore:
    """Thin wrapper around a MinIO client that always works on one bucket.

    Attributes that are not defined on this class are looked up on the
    underlying ``Minio`` client (see ``__getattr__``).
    """
    DATA_PART_SIZE = 5 * (1024 ** 2)  # part size for S3, 5 MB
    # evaluated once, when the class is defined: "<project name>-<PFEED_ENV lower-cased>"
    BUCKET_NAME = PROJ_NAME + '-' + os.getenv('PFEED_ENV', 'DEV').lower()

    def __init__(self, **kwargs):
        """Build the MinIO client from the MINIO_* environment variables.

        Args:
            **kwargs: extra keyword arguments forwarded to ``Minio(...)``.
        """
        self.minio = Minio(
            endpoint=os.getenv('MINIO_ENDPOINT'),
            access_key=os.getenv('MINIO_ACCESS_KEY'),
            secret_key=os.getenv('MINIO_SECRET_KEY'),
            # TLS (HTTPS) is used only when PFEED_ENV is 'PRD'; any other value means plain HTTP
            secure=os.getenv('PFEED_ENV', 'DEV').upper() == 'PRD',
            **kwargs,
        )

    def __getattr__(self, attr):
        '''gets triggered only when the attribute is not found'''
        # If `minio` itself is missing (e.g. __init__ failed or was bypassed),
        # delegating to self.minio would re-enter __getattr__ forever.
        if attr == 'minio':
            raise AttributeError(attr)
        return getattr(self.minio, attr)

    def get_object(self, object_name: str) -> bytes | None:
        """Return the content of ``object_name`` from the project bucket.

        Returns:
            The object's bytes on HTTP 200; None when MinIO raises S3Error or
            answers with any other status (the latter is logged as an error).
        """
        try:
            res = self.minio.get_object(self.BUCKET_NAME, object_name)
        except S3Error:
            # best-effort lookup: S3 errors are deliberately swallowed and reported as None
            return None
        try:
            if res.status == 200:
                return res.data
            logger.error(f'Unhandled MinIO response status {res.status}')
            return None
        finally:
            # the response holds a pooled HTTP connection until it is closed and released
            res.close()
            res.release_conn()

    def list_objects(self, prefix) -> list:
        '''
        Args:
            prefix: e.g. live/bybit/historical/raw/BTC_USDT_PERP/
        Returns:
            the objects found under `prefix` in the project bucket, as a list
        '''
        bucket_name = self.BUCKET_NAME
        objects: Generator = self.minio.list_objects(bucket_name, prefix=prefix)
        return list(objects)

    def put_object(self, object_name: str, data: bytes, **kwargs) -> ObjectWriteResult:
        """Upload ``data`` as ``object_name``, creating the project bucket if it is missing.

        Args:
            object_name: name of the object inside the project bucket.
            data: raw bytes to upload.
            **kwargs: extra keyword arguments forwarded to ``Minio.put_object``.
        """
        bucket_name = self.BUCKET_NAME
        if not self.minio.bucket_exists(bucket_name):
            self.minio.make_bucket(bucket_name)
        return self.minio.put_object(
            bucket_name,
            object_name,
            data=io.BytesIO(data),
            part_size=self.DATA_PART_SIZE,
            # -1 = length not declared up front, which is why part_size is supplied
            length=-1,
            **kwargs
        )
68
+
69
+
70
if __name__ == '__main__':
    # manual smoke test against the MinIO instance configured via environment variables
    datastore = Datastore()
    # list buckets
    # buckets = datastore.list_buckets()
    # for bucket in buckets:
    #     print(bucket.name, bucket.creation_date)

    # list objects
    # Datastore.list_objects requires a prefix; calling it without one raises TypeError.
    objects = datastore.list_objects(prefix='')
    for obj in objects:
        print(obj.object_name)

    # get object
    # data = datastore.get_object(
    #     object_name="live/bybit/historical/raw/BTC_USDT_PERP/BTC_USDT_PERP_2023-11-01.csv.gz"
    # )

    # put object
    # datastore.put_object(
    #     bucket_name='test',
    #     object_name='test_prefix/test',
    #     data=b'test',
    #     part_size=1024**2 * 5,
    # )

    # upload a file
    # datastore.fput_object(
    #     bucket_name="dev",
    #     object_name="test_prefix/test",
    #     file_path=f"{PROJ_PATH}/test_data/test.txt"
    # )

    # get object info
    # res = datastore.stat_object(
    #     bucket_name="dev",
    #     object_name="test_prefix/test"
    # )


    # copy an object from one prefix to another
    # res = datastore.copy_object(
    #     bucket_name="dev",
    #     object_name="new_prefix/test",
    #     source=CopySource(
    #         bucket_name='dev',
    #         object_name='test_prefix/test'
    #     )
    # )
@@ -0,0 +1,2 @@
1
+ from pfeed.feeds.yahoo_finance_feed import YahooFinanceFeed
2
+ from pfeed.feeds.bybit_feed import BybitFeed
@@ -0,0 +1,3 @@
1
class BaseFeed:
    """Common base of all feeds; keeps the feed name in upper case."""

    def __init__(self, name):
        """Store the upper-cased ``name`` on the instance."""
        feed_name = name.upper()
        self.name = feed_name
@@ -0,0 +1,93 @@
1
+ import io
2
+ import datetime
3
+
4
+ import pandas as pd
5
+
6
+ from pfeed.feeds.base_feed import BaseFeed
7
+ from pfeed.sources.bybit import api
8
+ from pfeed.sources.bybit import etl
9
+ from pfeed.sources.bybit.const import DATA_SOURCE, create_efilename
10
+ from pfeed.const.paths import DATA_PATH
11
+ from pfeed.utils.utils import get_dates_in_between, rollback_date_range
12
+
13
+
14
+ __all__ = ['BybitFeed']
15
+
16
+
17
class BybitFeed(BaseFeed):
    """Feed that returns Bybit historical data as a pandas DataFrame."""

    def __init__(self):
        super().__init__('bybit')

    @staticmethod
    def _derive_dtype_from_resolution(resolution):
        """Return the data type name ('tick', 'second', ...) matching ``resolution``.

        Raises:
            Exception: if the resolution is none of tick/second/minute/hour/day.
        """
        from pfund.datas.resolution import Resolution
        resolution = Resolution(resolution)
        if resolution.is_tick():
            return 'tick'
        elif resolution.is_second():
            return 'second'
        elif resolution.is_minute():
            return 'minute'
        elif resolution.is_hour():
            return 'hour'
        elif resolution.is_day():
            return 'daily'
        else:
            raise Exception(f'{resolution=} is not supported')

    def get_historical_data(
        self,
        pdt: str,
        rollback_period: str='1w',
        resolution: str='1d',
        start_date: str | None=None,
        end_date: str | None=None,
        data_path: str=str(DATA_PATH),
    ) -> pd.DataFrame:
        """Load historical data for ``pdt``, one file per date, resampled to ``resolution``.

        Local data is used when ``etl.extract_data`` finds it; otherwise the raw
        file is downloaded, cleaned and resampled on the fly.

        Args:
            pdt: product name, e.g. 'BCH_USDT_PERP'; its last '_'-separated token is the product type.
            rollback_period: look-back window, used only when ``start_date`` is not given.
            resolution: target resolution of the returned data, e.g. '1d'.
            start_date: first date to load; when given, ``end_date`` defaults to yesterday (UTC).
            end_date: last date to load; ignored when ``start_date`` is not given.
            data_path: passed to ``etl.extract_data`` to locate local data.

        Returns:
            The per-date DataFrames concatenated; an empty DataFrame when no
            requested date has a file at the data source.

        Raises:
            Exception: if the list of available files cannot be fetched, or a download fails.
        """
        from pfund.exchanges.bybit.exchange import Exchange

        env = 'LIVE'  # historical data is from LIVE env
        exchange = Exchange(env)
        adapter = exchange.adapter
        dtype = self._derive_dtype_from_resolution(resolution)
        ptype = pdt.split('_')[-1]
        is_spot = (ptype.upper() == 'SPOT')
        category = exchange.categorize_product(ptype)
        epdt = adapter(pdt, ref_key=category)
        efilenames = api.get_efilenames(category, epdt)
        if efilenames is None:
            # the listing request failed; without this check the `in` test below raises TypeError
            raise Exception(f'failed to get the list of available files for {DATA_SOURCE} {pdt}')
        # a set makes the per-date membership test O(1)
        efilenames = set(efilenames)

        if start_date:
            # default for end_date is yesterday
            end_date = end_date or (datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        else:
            start_date, end_date = rollback_date_range(rollback_period)
        dates: list[str] = get_dates_in_between(start_date, end_date)

        dfs = []
        for date in dates:
            data_str = f'{DATA_SOURCE} {pdt} {date}'
            efilename = create_efilename(epdt, date, is_spot=is_spot)
            if efilename not in efilenames:
                print(f'{efilename} does not exist in {DATA_SOURCE}')
                continue
            if local_data := etl.extract_data(pdt, date, dtype, env=env, mode='historical', data_path=data_path):
                # e.g. local_data could be 1m data (period always = 1), but resampled_data could be 3m data
                resampled_data: bytes = etl.resample_data(local_data, resolution, is_tick=(dtype == 'tick'), category=category)
                print(f'loaded {data_str} local {dtype} data')
            else:
                print(f'Downloading {data_str} data on the fly, please consider using {DATA_SOURCE.lower()}.run_historical(...) to pre-download data to your local computer first')
                if raw_data := api.get_data(category, epdt, date):
                    tick_data: bytes = etl.clean_data(category, raw_data)
                    resampled_data: bytes = etl.resample_data(tick_data, resolution, is_tick=True, category=category)
                    print(f'resampled {data_str} data to {resolution=}')
                else:
                    raise Exception(f'failed to download {data_str} historical data')
            dfs.append(pd.read_parquet(io.BytesIO(resampled_data)))
        if not dfs:
            # pd.concat([]) raises ValueError('No objects to concatenate')
            return pd.DataFrame()
        return pd.concat(dfs)
88
+
89
+
90
if __name__ == '__main__':
    # manual check: print two days of daily BCH_USDT_PERP data
    bybit_feed = BybitFeed()
    print(bybit_feed.get_historical_data('BCH_USDT_PERP', resolution='1d', rollback_period='2d'))
@@ -0,0 +1,13 @@
1
+ from pfeed.feeds.base_feed import BaseFeed
2
+
3
+
4
+ __all__ = ['CustomCsvFeed']
5
+
6
+
7
+ # TODO
8
class CustomCsvFeed(BaseFeed):
    """Placeholder feed for custom CSV data; loading is not implemented yet."""

    def __init__(self, name='custom_csv'):
        """Register the feed under ``name`` (default 'custom_csv')."""
        super(CustomCsvFeed, self).__init__(name)

    def get_historical_data(self, *args, **kwargs):
        """Not implemented; always raises NotImplementedError."""
        raise NotImplementedError()
@@ -0,0 +1,96 @@
1
+ import datetime
2
+
3
+ import pandas as pd
4
+ import yfinance as yf
5
+
6
+ from pfeed.feeds.base_feed import BaseFeed
7
+
8
+
9
+ __all__ = ['YahooFinanceFeed']
10
+
11
+
12
class YahooFinanceFeed(BaseFeed):
    """Feed that wraps yfinance and accepts pfund-style periods and resolutions."""
    _ADAPTER = {
        'timeframe': {
            # pfund's : yfinance's
            'M': 'mo',
            'w': 'wk',
        }
    }
    # yfinance's valid intervals: [1m, 2m, 5m, 15m, 30m, 60m, 90m, 1h, 1d, 5d, 1wk, 1mo, 3mo]
    SUPPORTED_TIMEFRAMES_AND_PERIODS = {
        'm': [1, 2, 5, 15, 30, 60, 90],
        'h': [1],
        'd': [1, 5],
        'w': [1],
        'M': [1, 3],
    }

    def __init__(self):
        super().__init__('yahoo_finance')

    def get_ticker(self, symbol):
        """Return the yfinance Ticker for the upper-cased ``symbol``."""
        return yf.Ticker(symbol.upper())

    @classmethod
    def _to_yfinance_notation(cls, resolution: str) -> str:
        """Convert a pfund period/resolution string to yfinance's notation.

        The timeframe is mapped through ``_ADAPTER['timeframe']`` ('M' -> 'mo',
        'w' -> 'wk'); timeframes without a mapping are passed through unchanged.
        """
        from pfund.datas.resolution import Resolution

        parsed = Resolution(resolution)
        timeframe = repr(parsed.timeframe)
        etimeframe = cls._ADAPTER['timeframe'].get(timeframe, timeframe)
        return str(parsed.period) + etimeframe

    def get_historical_data(
        self,
        symbol: str,
        rollback_period: str='1M',
        resolution: str='1d',
        start_date: str | None=None,
        end_date: str | None=None,
        **kwargs
    ) -> pd.DataFrame:
        """Simple Wrapper of yfinance history().
        For the details of args and kwargs, please refer to https://github.com/ranaroussi/yfinance

        Args:
            symbol: ticker symbol, upper-cased before use.
            rollback_period: pfund-style look-back period, converted to yfinance's `period`.
            resolution: pfund-style resolution, converted to yfinance's `interval`.
            start_date: start date; when given, `end_date` defaults to today (UTC).
            end_date: end date; ignored when `start_date` is not given.
            **kwargs: forwarded to yfinance history(); `period` and `interval`
                given here take precedence over the converted values.

        Returns:
            DataFrame with lower-cased column names and the index renamed to 'ts';
            a timezone-aware index is converted to UTC.
        """
        # convert pfund's rollback_period format to yfinance's period;
        # if user is directly using yfinance variable `period`, use it
        period = kwargs.pop('period', self._to_yfinance_notation(rollback_period))

        # convert pfund's resolution format to yfinance's interval;
        # if user is directly using yfinance variable `interval`, use it
        interval = kwargs.pop('interval', self._to_yfinance_notation(resolution))

        if start_date:
            # default for end_date is today
            end_date = end_date or datetime.datetime.now(tz=datetime.timezone.utc).strftime('%Y-%m-%d')
        else:
            start_date = end_date = None

        ticker = self.get_ticker(symbol)
        df = ticker.history(period=period, interval=interval, start=start_date, end=end_date, **kwargs)
        df = df.rename_axis('ts')  # rename index 'Date' to 'ts'
        df.columns = df.columns.str.lower()
        # if there are spaces in column names, they will be turned into some weird names like "_10"
        # during "for row in df.itertuples()"
        df = df.rename(columns={'stock splits': 'stock_splits'})
        # convert to UTC; tz_convert raises on an index that is not timezone-aware
        # (e.g. when no rows came back), so such an index is left untouched
        if isinstance(df.index, pd.DatetimeIndex) and df.index.tz is not None:
            df.index = df.index.tz_convert('UTC')
        # convert to UTC and remove +hh:mm from YYYY-MM-DD hh:mm:ss+hh:mm
        # df.index = df.index.tz_convert('UTC').tz_localize(None)
        return df
91
+
92
+
93
if __name__ == '__main__':
    # manual check: print TSLA history with the default period and resolution
    yahoo_feed = YahooFinanceFeed()
    print(yahoo_feed.get_historical_data('TSLA'))
pfeed/filepath.py ADDED
@@ -0,0 +1,36 @@
1
+ from pathlib import Path
2
+
3
+ from typing import Literal
4
+
5
+ from pfeed.utils.utils import create_filename
6
+
7
+
8
class FilePath:
    '''Bundles the pieces of a data file's location, each exposed as both str and Path'''
    def __init__(
        self,
        data_path: str,
        env: Literal['PAPER', 'LIVE'],
        data_source: str,
        data_type: Literal['raw', 'tick', 'second', 'minute', 'hour', 'daily'],
        mode: Literal['historical', 'streaming'],
        pdt: str,
        date: str,
        file_extension: str,
    ):
        # root directory, as given and as a Path
        self.data_path = data_path
        self.dpath = data_path
        root = Path(data_path)
        self.data_Path = root
        self.dPath = root

        # path components; env, data source, data type and pdt are stored lower-cased
        self.env = env.lower()
        self.data_source = data_source.lower()
        dtype = data_type.lower()
        self.data_type = dtype
        self.dtype = dtype
        self.mode = mode
        self.pdt = pdt.lower()
        self.date = date
        self.file_extension = file_extension

        # the file name and the product directory use the upper-cased pdt
        pdt_upper = pdt.upper()
        self.filename = create_filename(pdt_upper, date, file_extension)

        # location relative to the root: env/data_source/mode/dtype/PDT/filename
        product_dir = Path(self.env) / self.data_source / self.mode / self.dtype / pdt_upper
        relative = product_dir / self.filename
        self.storage_Path = relative
        self.sPath = relative
        relative_str = str(relative)
        self.storage_path = relative_str
        self.spath = relative_str

        # full location: root joined with the relative location
        full = root / relative
        self.file_Path = full
        self.fPath = full
        full_str = str(full)
        self.file_path = full_str
        self.fpath = full_str
35
+
36
+
pfeed/main.py ADDED
@@ -0,0 +1,72 @@
1
+ import os
2
+ import sys
3
+ import logging
4
+ import importlib
5
+ from types import TracebackType
6
+ from argparse import ArgumentParser
7
+
8
+
9
+ from pfeed.const.paths import LOG_PATH, DATA_PATH
10
+
11
+
12
+ ALIASES = {
13
+ 'yf': 'yahoo_finance',
14
+ }
15
+
16
+
17
def _custom_excepthook(exception_class: type[BaseException], exception: BaseException, traceback: TracebackType):
    '''Catches any uncaught exceptions and logs them

    Meant to be installed as `sys.excepthook`. The exception is handed to the
    'pfeed' logger through `exc_info`, so it is logged with its traceback
    without being re-raised (re-raising would add this function's frame to
    the exception's traceback and needs a catch-all `except`).
    '''
    # sys.__excepthook__(exception_class, exception, traceback)
    logging.getLogger('pfeed').error(
        'Uncaught exception:',
        exc_info=(exception_class, exception, traceback),
    )
24
+
25
+
26
if __name__ == '__main__':
    # log anything that escapes instead of letting the default hook print it
    sys.excepthook = _custom_excepthook

    # command-line interface
    cli = ArgumentParser()
    cli.add_argument('-e', '--env', choices=['PAPER', 'LIVE'], default='LIVE', required=False)
    cli.add_argument('--data-path', dest='data_path', default=str(DATA_PATH), required=False)
    cli.add_argument('--log-path', dest='log_path', default=str(LOG_PATH), required=False)
    cli.add_argument('-m', '--mode', default='historical', help='historical=historical data processing; streaming=live data streaming', choices=['historical', 'streaming'], required=False)
    cli.add_argument('-b', '--start-date', dest='start_date', help='Start date in YYYY-MM-DD format', required=False)
    cli.add_argument('-n', '--end-date', dest='end_date', help='End date in YYYY-MM-DD format', required=False)
    cli.add_argument('--ptypes', nargs='+', help='List of product types, e.g. PERP = get all perpetuals', required=False)
    cli.add_argument('-p', '--pdts', nargs='+', help='List of trading products', required=False)
    cli.add_argument('--dtypes', nargs='+', help='List of data types, e.g. raw, tick, second, minute, hour, daily', required=False)
    cli.add_argument(
        '-s', '--source', required=True,
        choices=[
            'bybit',
        ],
    )
    cli.add_argument('-z', '--batch-size', dest='batch_size', default=8, help='batch size for Ray tasks', type=int, required=False)
    cli.add_argument('--no-ray', dest='no_ray', action='store_true', required=False)
    cli.add_argument('--no-minio', dest='no_minio', action='store_true', required=False)

    # handle arguments
    opts = cli.parse_args()
    if opts.no_ray:
        print("Ray is disabled")
    else:
        print("Ray is enabled")
    # MinIO counts as disabled when switched off explicitly or when no endpoint is configured
    minio_disabled = opts.no_minio or os.getenv('MINIO_ENDPOINT') is None
    if minio_disabled:
        print("MinIO is disabled")
    else:
        print("MinIO is enabled")
    env = opts.env.upper()
    source = opts.source.lower()
    mode = opts.mode.lower()

    # resolve short aliases, then load pfeed.sources.<source>.<mode>
    source = ALIASES.get(source, source)
    pipeline = importlib.import_module(f'pfeed.sources.{source}.{mode}')
    if mode != 'historical':
        # TODO: implement streaming mode
        raise NotImplementedError(f'{mode} is not implemented yet')
    run_kwargs = {
        'dtypes': opts.dtypes,
        'ptypes': opts.ptypes,
        'pdts': opts.pdts,
        'start_date': opts.start_date,
        'end_date': opts.end_date,
        'log_path': opts.log_path,
        'data_path': opts.data_path,
        'batch_size': opts.batch_size,
        'use_ray': not opts.no_ray,
        'use_minio': not opts.no_minio,
    }
    pipeline.run(**run_kwargs)
Binary file
@@ -0,0 +1 @@
1
+ from pfeed.sources import bybit
@@ -0,0 +1,2 @@
1
+ from pfeed.sources.bybit.historical import run as run_historical
2
+ # from pfeed.sources.bybit.streaming import *
@@ -0,0 +1,63 @@
1
+ import re
2
+ import time
3
+ import logging
4
+
5
+ import requests
6
+
7
+ from bs4 import BeautifulSoup
8
+ from pfeed.sources.bybit.const import DATA_SOURCE, DATA_SOURCE_URLS, DATA_NAMING_REGEX_PATTERNS, create_efilename
9
+
10
+
11
+ logger = logging.getLogger(DATA_SOURCE.lower())
12
+
13
+
14
def get(url, handle_func, frequency=1, num_retry=3):
    '''
    Handles general requests.get with control on frequency and number of retry
    Args:
        url: address to request
        handle_func: specific logic for handling response
        frequency: seconds to wait between two attempts
        num_retry: maximum number of attempts
    Returns:
        whatever handle_func returns for the first 200 response;
        None on a 404 or when every attempt fails
    '''
    logger.debug(f'calling {url}')
    # a bounded loop: the retry counter used to never decrease, so a persistent
    # non-200/404 status retried forever and the final error log was unreachable
    for attempt in range(1, num_retry + 1):
        res = requests.get(url)
        if res.status_code == 200:
            return handle_func(res)
        if res.status_code == 404:
            # the file does not exist, retrying cannot help
            logger.error(f'File not found {url=} {res.status_code=} {res.text=}')
            return None
        logger.warning(f'{res.status_code=} {res.text=}')
        if attempt < num_retry:
            time.sleep(frequency)
    logger.error(f'failed to call {url}')
    return None
33
+
34
+
35
def get_efilenames(category: str, epdt: str):
    '''
    Get efilenames (e.g. BTCUSDT2022-10-04.csv.gz), i.e. the href of every
    link found on the listing page of `epdt`
    '''
    def _collect_hrefs(res):
        anchors = BeautifulSoup(res.text, 'html.parser').find_all('a')
        return [anchor.get('href') for anchor in anchors]

    listing_url = '/'.join((DATA_SOURCE_URLS[category], epdt))
    return get(listing_url, _collect_hrefs, frequency=1, num_retry=3)
45
+
46
+
47
def get_epdts(category: str, ptype: str):
    '''
    Get epdts: the links on the category's listing page whose href matches
    the naming pattern of `ptype`, with every '/' removed
    '''
    naming_pattern = re.compile(DATA_NAMING_REGEX_PATTERNS[ptype])

    def _extract_epdts(res):
        matched = []
        for anchor in BeautifulSoup(res.text, 'html.parser').find_all('a'):
            if naming_pattern.search(anchor.get('href')):
                matched.append(anchor.get('href').replace('/', ''))
        return matched

    listing_url = DATA_SOURCE_URLS[category]
    return get(listing_url, _extract_epdts, frequency=1, num_retry=3)
55
+
56
+
57
def get_data(category: str, epdt: str, date: str):
    '''
    Download the file of `epdt` for `date` and return the response body
    '''
    spot = category.upper() == 'SPOT'
    efilename = create_efilename(epdt, date, is_spot=spot)
    file_url = f"{DATA_SOURCE_URLS[category]}/{epdt}/{efilename}"
    return get(file_url, lambda res: res.content, frequency=1, num_retry=3)
@@ -0,0 +1,4 @@
1
+ ptypes: []
2
+ pdts: []
3
+ # start_date:
4
+ # end_date:
@@ -0,0 +1,42 @@
1
DATA_SOURCE = 'BYBIT'
# EXTEND: support FUT and IFUT
# SUPPORTED_CRYPTO_PRODUCT_TYPES = ['SPOT', 'PERP', 'IPERP', 'FUT', 'IFUT']
SUPPORTED_CRYPTO_PRODUCT_TYPES = ['SPOT', 'PERP', 'IPERP']
RAW_DATA_TYPE = 'tick'
# do not need to be precise
DATA_START_DATE = '2020-01-01'
# listing page per product category
DATA_SOURCE_URLS = {
    'linear': 'https://public.bybit.com/trading',
    'inverse': 'https://public.bybit.com/trading',
    'spot': 'https://public.bybit.com/spot',
}
# Regex per product type, applied to the links of a listing page.
# Raw strings: '\/' and '\d' are invalid escape sequences in a normal string
# literal (a warning today, an error in future Python); the values are unchanged.
DATA_NAMING_REGEX_PATTERNS = {
    'PERP': r'(USDT\/|PERP\/)$',  # USDT perp or USDC perp;
    'FUT': r'-\d{2}[A-Z]{3}\d{2}\/$',  # USDC futures e.g. BTC-10NOV23/
    'IPERP': r'USD\/$',  # inverse perps;
    'IFUT': r'USD[A-Z]\d{2}\/$',  # inverse futures e.g. BTCUSDH24/
    # match everything since everything from https://public.bybit.com/spot is spot
    'SPOT': r'.*',
}
# raw columns kept per category
SELECTED_RAW_COLS = {
    'linear': ['timestamp', 'side', 'size', 'price'],
    'inverse': ['timestamp', 'side', 'size', 'price'],
    'spot': ['timestamp', 'side', 'volume', 'price'],
}
# raw column name -> output column name, per category
RENAMING_COLS = {
    'linear': {'timestamp': 'ts', 'size': 'volume'},
    'inverse': {'timestamp': 'ts', 'size': 'volume'},
    'spot': {'timestamp': 'ts'},
}
# unit of the raw timestamp column, per category
RAW_DATA_TIMESTAMP_UNITS = {
    'linear': 's',
    'inverse': 's',
    'spot': 'ms',
}
36
+
37
+
38
def create_efilename(epdt, date, is_spot=False):
    '''Build the .csv.gz file name for `epdt` on `date`; spot names put an underscore between the two'''
    separator = '_' if is_spot else ''
    return f'{epdt}{separator}{date}.csv.gz'