pfeed 0.0.1.dev14__tar.gz → 0.0.2.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/PKG-INFO +2 -2
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/__init__.py +4 -4
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/commands/download.py +1 -1
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/config_handler.py +9 -4
- pfeed-0.0.2.dev1/pfeed/data_tools/data_tool_pandas.py +61 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/data_tools/data_tool_polars.py +21 -18
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/datastore.py +22 -11
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/etl.py +85 -113
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/feeds/base_feed.py +66 -124
- pfeed-0.0.2.dev1/pfeed/feeds/bybit_feed.py +51 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/filepath.py +1 -1
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/download.py +18 -13
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/utils.py +4 -2
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/utils/utils.py +16 -1
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pyproject.toml +2 -2
- pfeed-0.0.1.dev14/pfeed/data_tools/data_tool_pandas.py +0 -62
- pfeed-0.0.1.dev14/pfeed/feeds/bybit_feed.py +0 -53
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/LICENSE +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/README.md +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/__init__.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/commands/__init__.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/commands/config.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/commands/docker_compose.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/commands/open.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/commands/stream.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/main.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/const/common.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/const/paths.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/feeds/__init__.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/feeds/binance_feed.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/feeds/custom_csv_feed.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/feeds/yahoo_finance_feed.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/main.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/resolution.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/binance/__init__.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/binance/api.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/binance/const.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/binance/download.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/binance/stream.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/__init__.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/api.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/const.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/stream.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/types.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/types/common_literals.py +0 -0
- /pfeed-0.0.1.dev14/pfeed/utils/file_format.py → /pfeed-0.0.2.dev1/pfeed/utils/file_formats.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/utils/monitor.py +0 -0
- {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/utils/validate.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pfeed
|
|
3
|
-
Version: 0.0.1.dev14
|
|
3
|
+
Version: 0.0.2.dev1
|
|
4
4
|
Summary: Data pipeline for algo-trading, getting and storing both real-time and historical data made easy.
|
|
5
5
|
Home-page: https://pfund.ai
|
|
6
6
|
License: Apache-2.0
|
|
@@ -23,7 +23,7 @@ Requires-Dist: fastparquet (>=2024.5.0,<2025.0.0)
|
|
|
23
23
|
Requires-Dist: minio (>=7.2.8,<8.0.0) ; extra == "data" or extra == "all"
|
|
24
24
|
Requires-Dist: pandas (>=2.2.2,<3.0.0) ; extra == "df" or extra == "all"
|
|
25
25
|
Requires-Dist: pfund (>=0.0.1.dev13,<0.0.2)
|
|
26
|
-
Requires-Dist: polars (>=1.
|
|
26
|
+
Requires-Dist: polars (>=1.7.1,<2.0.0) ; extra == "df" or extra == "all"
|
|
27
27
|
Requires-Dist: psutil (>=6.0.0,<7.0.0) ; extra == "data" or extra == "all"
|
|
28
28
|
Requires-Dist: pyarrow (>=15.0.0,<16.0.0) ; extra == "df" or extra == "all"
|
|
29
29
|
Requires-Dist: ray (>=2.35.0,<3.0.0) ; extra == "boost" or extra == "all"
|
|
@@ -20,9 +20,9 @@ def download_historical_data(
|
|
|
20
20
|
ptypes: str | list[str] | None = None,
|
|
21
21
|
start_date: str | None = None,
|
|
22
22
|
end_date: str | None = None,
|
|
23
|
-
num_cpus: int = 8,
|
|
24
|
-
use_ray: bool = True,
|
|
25
23
|
use_minio: bool = False,
|
|
24
|
+
use_ray: bool = True,
|
|
25
|
+
ray_num_cpus: int = 8,
|
|
26
26
|
):
|
|
27
27
|
data_source = importlib.import_module(f"pfeed.sources.{data_source.lower()}")
|
|
28
28
|
return data_source.download_historical_data(
|
|
@@ -31,9 +31,9 @@ def download_historical_data(
|
|
|
31
31
|
ptypes=ptypes,
|
|
32
32
|
start_date=start_date,
|
|
33
33
|
end_date=end_date,
|
|
34
|
-
num_cpus=num_cpus,
|
|
35
|
-
use_ray=use_ray,
|
|
36
34
|
use_minio=use_minio,
|
|
35
|
+
use_ray=use_ray,
|
|
36
|
+
ray_num_cpus=ray_num_cpus,
|
|
37
37
|
)
|
|
38
38
|
|
|
39
39
|
|
|
@@ -41,7 +41,7 @@ def download(data_source, pdts, dtypes, ptypes, start_date, end_date, num_cpus,
|
|
|
41
41
|
ptypes=ptypes,
|
|
42
42
|
start_date=start_date.date().strftime('%Y-%m-%d') if start_date else start_date,
|
|
43
43
|
end_date=end_date.date().strftime('%Y-%m-%d') if end_date else end_date,
|
|
44
|
-
num_cpus=num_cpus,
|
|
45
44
|
use_ray=not no_ray,
|
|
45
|
+
ray_num_cpus=num_cpus,
|
|
46
46
|
use_minio=use_minio,
|
|
47
47
|
)
|
|
@@ -5,9 +5,6 @@ import logging
|
|
|
5
5
|
from types import TracebackType
|
|
6
6
|
from dataclasses import dataclass
|
|
7
7
|
|
|
8
|
-
import yaml
|
|
9
|
-
from dotenv import find_dotenv, load_dotenv
|
|
10
|
-
|
|
11
8
|
from pfeed.const.paths import PROJ_NAME, MAIN_PATH, LOG_PATH, DATA_PATH, USER_CONFIG_FILE_PATH
|
|
12
9
|
|
|
13
10
|
|
|
@@ -48,6 +45,8 @@ class ConfigHandler:
|
|
|
48
45
|
|
|
49
46
|
@classmethod
|
|
50
47
|
def load_config(cls):
|
|
48
|
+
import yaml
|
|
49
|
+
|
|
51
50
|
'''Loads user's config file and returns a ConfigHandler object'''
|
|
52
51
|
config_file_path = USER_CONFIG_FILE_PATH
|
|
53
52
|
if config_file_path.is_file():
|
|
@@ -77,9 +76,15 @@ class ConfigHandler:
|
|
|
77
76
|
self.load_env_file(self.env_file_path)
|
|
78
77
|
|
|
79
78
|
if self.debug:
|
|
80
|
-
|
|
79
|
+
is_loggers_set_up = bool(logging.getLogger('pfeed').handlers)
|
|
80
|
+
if is_loggers_set_up:
|
|
81
|
+
print('loggers are already set up, ignoring enabling debug mode')
|
|
82
|
+
else:
|
|
83
|
+
self.enable_debug_mode()
|
|
81
84
|
|
|
82
85
|
def load_env_file(self, env_file_path: str | None):
|
|
86
|
+
from dotenv import find_dotenv, load_dotenv
|
|
87
|
+
|
|
83
88
|
if not env_file_path:
|
|
84
89
|
found_env_file_path = find_dotenv(usecwd=True, raise_error_if_not_found=False)
|
|
85
90
|
if found_env_file_path:
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
if TYPE_CHECKING:
|
|
4
|
+
from pfeed.resolution import ExtendedResolution
|
|
5
|
+
from pfeed.types.common_literals import tSUPPORTED_STORAGES
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import io
|
|
9
|
+
|
|
10
|
+
import s3fs
|
|
11
|
+
import pandas as pd
|
|
12
|
+
|
|
13
|
+
from pfeed.const.common import SUPPORTED_STORAGES
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
name = 'pandas'
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def read_parquet(paths_or_obj: list[str] | str | bytes, *args, storage: tSUPPORTED_STORAGES='local', **kwargs) -> pd.DataFrame:
|
|
20
|
+
assert storage in SUPPORTED_STORAGES, f'{storage=} not in {SUPPORTED_STORAGES}'
|
|
21
|
+
if isinstance(paths_or_obj, bytes):
|
|
22
|
+
obj = io.BytesIO(paths_or_obj)
|
|
23
|
+
return pd.read_parquet(obj, *args, **kwargs)
|
|
24
|
+
else:
|
|
25
|
+
if storage == 'minio':
|
|
26
|
+
if 'filesystem' not in kwargs:
|
|
27
|
+
fs = s3fs.S3FileSystem(
|
|
28
|
+
endpoint_url="http://"+os.getenv('MINIO_HOST', 'localhost')+':'+os.getenv('MINIO_PORT', '9000'),
|
|
29
|
+
key=os.getenv('MINIO_ROOT_USER', 'pfunder'),
|
|
30
|
+
secret=os.getenv('MINIO_ROOT_PASSWORD', 'password'),
|
|
31
|
+
)
|
|
32
|
+
kwargs['filesystem'] = fs
|
|
33
|
+
paths = paths_or_obj if isinstance(paths_or_obj, list) else [paths_or_obj]
|
|
34
|
+
return pd.read_parquet(paths, *args, **kwargs)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def estimate_memory_usage(df: pd.DataFrame) -> float:
|
|
38
|
+
"""Estimate the memory usage of a pandas DataFrame in GB."""
|
|
39
|
+
return df.memory_usage(deep=True).sum() / (1024 ** 3)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def organize_time_series_columns(
|
|
43
|
+
pdt: str,
|
|
44
|
+
resolution: str | ExtendedResolution,
|
|
45
|
+
df: pd.DataFrame,
|
|
46
|
+
override_resolution: bool=False,
|
|
47
|
+
) -> pd.DataFrame:
|
|
48
|
+
"""Standardize the columns of a pandas DataFrame.
|
|
49
|
+
Moving 'ts', 'product', 'resolution' to the leftmost side.
|
|
50
|
+
"""
|
|
51
|
+
from pfeed.resolution import ExtendedResolution
|
|
52
|
+
assert 'ts' in df.columns, "'ts' column not found"
|
|
53
|
+
if isinstance(resolution, str):
|
|
54
|
+
resolution = ExtendedResolution(resolution)
|
|
55
|
+
if 'product' not in df.columns:
|
|
56
|
+
df['product'] = pdt
|
|
57
|
+
if 'resolution' not in df.columns or override_resolution:
|
|
58
|
+
df['resolution'] = repr(resolution)
|
|
59
|
+
left_cols = ['ts', 'product', 'resolution']
|
|
60
|
+
df = df.reindex(left_cols + [col for col in df.columns if col not in left_cols], axis=1)
|
|
61
|
+
return df
|
|
@@ -14,30 +14,26 @@ from pfeed.const.common import SUPPORTED_STORAGES
|
|
|
14
14
|
name = 'polars'
|
|
15
15
|
|
|
16
16
|
|
|
17
|
-
def read_parquet(
|
|
17
|
+
def read_parquet(paths_or_obj: list[str] | str | bytes, *args, storage: tSUPPORTED_STORAGES='local', **kwargs) -> pl.DataFrame | pl.LazyFrame:
|
|
18
18
|
assert storage in SUPPORTED_STORAGES, f'{storage=} not in {SUPPORTED_STORAGES}'
|
|
19
|
-
if isinstance(
|
|
20
|
-
obj =
|
|
19
|
+
if isinstance(paths_or_obj, bytes):
|
|
20
|
+
obj = paths_or_obj
|
|
21
21
|
return pl.read_parquet(obj, *args, **kwargs)
|
|
22
22
|
else:
|
|
23
|
-
|
|
23
|
+
paths = paths_or_obj if isinstance(paths_or_obj, list) else [paths_or_obj]
|
|
24
24
|
if storage == 'local':
|
|
25
|
-
return pl.scan_parquet(
|
|
25
|
+
return pl.scan_parquet(paths, *args, **kwargs)
|
|
26
26
|
elif storage == 'minio':
|
|
27
27
|
storage_options = {
|
|
28
28
|
"endpoint_url": "http://"+os.getenv('MINIO_HOST', 'localhost')+':'+os.getenv('MINIO_PORT', '9000'),
|
|
29
29
|
"access_key_id": os.getenv('MINIO_ROOT_USER', 'pfunder'),
|
|
30
30
|
"secret_access_key": os.getenv('MINIO_ROOT_PASSWORD', 'password'),
|
|
31
31
|
}
|
|
32
|
-
return pl.scan_parquet(
|
|
32
|
+
return pl.scan_parquet(paths, *args, storage_options=storage_options, **kwargs)
|
|
33
33
|
else:
|
|
34
34
|
raise NotImplementedError(f'{storage=}')
|
|
35
35
|
|
|
36
36
|
|
|
37
|
-
def concat(dfs: list[pl.DataFrame | pl.LazyFrame], *args, **kwargs) -> pl.DataFrame | pl.LazyFrame:
|
|
38
|
-
return pl.concat(dfs, *args, **kwargs)
|
|
39
|
-
|
|
40
|
-
|
|
41
37
|
def estimate_memory_usage(df: pl.DataFrame | pl.LazyFrame) -> float:
|
|
42
38
|
"""Estimate the memory usage of a polars DataFrame in GB."""
|
|
43
39
|
if isinstance(df, pl.LazyFrame):
|
|
@@ -45,21 +41,28 @@ def estimate_memory_usage(df: pl.DataFrame | pl.LazyFrame) -> float:
|
|
|
45
41
|
return df.estimated_size(unit='gb')
|
|
46
42
|
|
|
47
43
|
|
|
48
|
-
def organize_time_series_columns(
|
|
44
|
+
def organize_time_series_columns(
|
|
45
|
+
pdt: str,
|
|
46
|
+
resolution: str | ExtendedResolution,
|
|
47
|
+
df: pl.DataFrame | pl.LazyFramem,
|
|
48
|
+
override_resolution: bool=False,
|
|
49
|
+
) -> pl.DataFrame | pl.LazyFrame:
|
|
49
50
|
from pfeed.resolution import ExtendedResolution
|
|
50
51
|
if isinstance(df, pl.LazyFrame):
|
|
51
52
|
cols = df.collect_schema().names()
|
|
52
53
|
else:
|
|
53
54
|
cols = df.columns
|
|
54
55
|
assert 'ts' in cols, "'ts' column not found"
|
|
55
|
-
assert 'product' not in cols, "'product' column already exists"
|
|
56
|
-
assert 'resolution' not in cols, "'resolution' column already exists"
|
|
57
56
|
if isinstance(resolution, str):
|
|
58
57
|
resolution = ExtendedResolution(resolution)
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
58
|
+
if 'product' not in cols:
|
|
59
|
+
df = df.with_columns(
|
|
60
|
+
pl.lit(pdt).alias('product'),
|
|
61
|
+
)
|
|
62
|
+
if 'resolution' not in cols or override_resolution:
|
|
63
|
+
df = df.with_columns(
|
|
64
|
+
pl.lit(repr(resolution)).alias('resolution')
|
|
65
|
+
)
|
|
63
66
|
left_cols = ['ts', 'product', 'resolution']
|
|
64
|
-
df = df.select(left_cols + [col for col in
|
|
67
|
+
df = df.select(left_cols + [col for col in cols if col not in left_cols])
|
|
65
68
|
return df
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
from typing import TYPE_CHECKING
|
|
3
3
|
if TYPE_CHECKING:
|
|
4
4
|
try:
|
|
5
|
-
from minio.api import ObjectWriteResult
|
|
5
|
+
from minio.api import ObjectWriteResult, Tags
|
|
6
6
|
except ImportError:
|
|
7
7
|
pass
|
|
8
8
|
from typing import Generator
|
|
@@ -12,10 +12,9 @@ import io
|
|
|
12
12
|
import logging
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
def assert_if_minio_running():
|
|
15
|
+
def check_if_minio_running():
|
|
16
16
|
import requests
|
|
17
17
|
from requests.exceptions import RequestException, ReadTimeout
|
|
18
|
-
from minio.error import MinioException
|
|
19
18
|
|
|
20
19
|
endpoint = os.getenv('MINIO_HOST', 'localhost')+':'+os.getenv('MINIO_PORT', '9000')
|
|
21
20
|
if not endpoint.startswith('http'):
|
|
@@ -26,13 +25,15 @@ def assert_if_minio_running():
|
|
|
26
25
|
try:
|
|
27
26
|
response = requests.get(f'{endpoint}/minio/health/live', timeout=3)
|
|
28
27
|
if response.status_code != 200:
|
|
29
|
-
|
|
28
|
+
print(f"Unhandled response from MinIO: {response.status_code=} {response.content} {response}")
|
|
29
|
+
return False
|
|
30
30
|
except (ReadTimeout, RequestException) as e:
|
|
31
|
-
|
|
31
|
+
return False
|
|
32
|
+
return True
|
|
32
33
|
|
|
33
34
|
|
|
34
35
|
class Datastore:
|
|
35
|
-
DATA_PART_SIZE = 5 * (1024 ** 2) # part size for S3, 5 MB
|
|
36
|
+
# DATA_PART_SIZE = 5 * (1024 ** 2) # part size for S3, 5 MB
|
|
36
37
|
BUCKET_NAME = 'pfeed'
|
|
37
38
|
|
|
38
39
|
# EXTEND, currently only consider using MinIO
|
|
@@ -40,9 +41,9 @@ class Datastore:
|
|
|
40
41
|
def initialize_store(cls, name: str, **kwargs):
|
|
41
42
|
if name == 'minio':
|
|
42
43
|
from minio import Minio
|
|
43
|
-
|
|
44
|
+
endpoint = os.getenv('MINIO_HOST', 'localhost')+':'+os.getenv('MINIO_PORT', '9000')
|
|
44
45
|
cls.minio = Minio(
|
|
45
|
-
endpoint=
|
|
46
|
+
endpoint=endpoint,
|
|
46
47
|
access_key=os.getenv('MINIO_ROOT_USER', 'pfunder'),
|
|
47
48
|
secret_key=os.getenv('MINIO_ROOT_PASSWORD', 'password'),
|
|
48
49
|
# turn off TLS, i.e. not using HTTPS
|
|
@@ -74,7 +75,16 @@ class Datastore:
|
|
|
74
75
|
self.logger.error(f'Unhandled MinIO response status {res.status}')
|
|
75
76
|
except S3Error as err:
|
|
76
77
|
# logger.warning(f'MinIO S3Error {object_name=} {err=}')
|
|
77
|
-
|
|
78
|
+
return None
|
|
79
|
+
|
|
80
|
+
def exist_object(self, object_name: str) -> bool:
|
|
81
|
+
from minio import S3Error
|
|
82
|
+
try:
|
|
83
|
+
res: Tags | None = self.minio.get_object_tags(self.BUCKET_NAME, object_name)
|
|
84
|
+
return True
|
|
85
|
+
except S3Error as err:
|
|
86
|
+
# self.logger.warning(f'MinIO S3Error {object_name=} {err=}')
|
|
87
|
+
return False
|
|
78
88
|
|
|
79
89
|
def list_objects(self, prefix) -> list | None:
|
|
80
90
|
'''
|
|
@@ -89,8 +99,9 @@ class Datastore:
|
|
|
89
99
|
self.BUCKET_NAME,
|
|
90
100
|
object_name,
|
|
91
101
|
data=io.BytesIO(data),
|
|
92
|
-
part_size=self.DATA_PART_SIZE,
|
|
93
|
-
length
|
|
102
|
+
# part_size=self.DATA_PART_SIZE,
|
|
103
|
+
length=len(data),
|
|
104
|
+
content_type='application/parquet',
|
|
94
105
|
**kwargs
|
|
95
106
|
)
|
|
96
107
|
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
Except extracting and loading data, this module uses "pandas" for data transformation.
|
|
3
3
|
'''
|
|
4
4
|
from __future__ import annotations
|
|
5
|
-
from typing import TYPE_CHECKING
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
6
|
if TYPE_CHECKING:
|
|
7
7
|
from pfeed.types.common_literals import (
|
|
8
8
|
tSUPPORTED_ENVIRONMENTS,
|
|
@@ -10,8 +10,6 @@ if TYPE_CHECKING:
|
|
|
10
10
|
tSUPPORTED_STORAGES,
|
|
11
11
|
tSUPPORTED_DATA_TOOLS,
|
|
12
12
|
)
|
|
13
|
-
from pfeed.resolution import ExtendedResolution
|
|
14
|
-
tOUTPUT_FORMATS = Literal['bytes'] | tSUPPORTED_DATA_TOOLS
|
|
15
13
|
|
|
16
14
|
import logging
|
|
17
15
|
import importlib
|
|
@@ -22,7 +20,8 @@ try:
|
|
|
22
20
|
except ImportError:
|
|
23
21
|
pass
|
|
24
22
|
|
|
25
|
-
from pfeed.
|
|
23
|
+
from pfeed.resolution import ExtendedResolution
|
|
24
|
+
from pfeed.datastore import Datastore, check_if_minio_running
|
|
26
25
|
from pfeed.filepath import FilePath
|
|
27
26
|
from pfeed.config_handler import get_config
|
|
28
27
|
from pfeed.const.common import (
|
|
@@ -31,17 +30,14 @@ from pfeed.const.common import (
|
|
|
31
30
|
SUPPORTED_DOWNLOAD_DATA_SOURCES,
|
|
32
31
|
SUPPORTED_DATA_TOOLS,
|
|
33
32
|
)
|
|
34
|
-
from pfeed.types.common_literals import tSUPPORTED_DATA_TOOLS
|
|
35
33
|
from pfeed.utils.utils import derive_trading_venue
|
|
36
|
-
from pfeed.utils.
|
|
37
|
-
|
|
34
|
+
from pfeed.utils.file_formats import read_raw_data
|
|
38
35
|
try:
|
|
39
36
|
from pfeed.utils.monitor import print_disk_usage
|
|
40
37
|
except ImportError:
|
|
41
38
|
print_disk_usage = None
|
|
42
39
|
|
|
43
40
|
|
|
44
|
-
OUTPUT_FORMATS = ['bytes'] + SUPPORTED_DATA_TOOLS
|
|
45
41
|
DataFrame = pd.DataFrame | pl.DataFrame | pl.LazyFrame
|
|
46
42
|
|
|
47
43
|
|
|
@@ -51,7 +47,6 @@ __all__ = [
|
|
|
51
47
|
'transform_data',
|
|
52
48
|
'load_data',
|
|
53
49
|
'clean_raw_data',
|
|
54
|
-
'standardize_raw_data',
|
|
55
50
|
'resample_data',
|
|
56
51
|
]
|
|
57
52
|
|
|
@@ -59,39 +54,36 @@ __all__ = [
|
|
|
59
54
|
def get_data(
|
|
60
55
|
env: tSUPPORTED_ENVIRONMENTS,
|
|
61
56
|
data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
|
|
62
|
-
resolution: str | ExtendedResolution,
|
|
63
57
|
pdt: str,
|
|
64
|
-
|
|
58
|
+
resolution: str | ExtendedResolution,
|
|
59
|
+
dates: list[str],
|
|
60
|
+
storages: list[tSUPPORTED_STORAGES] | None = None,
|
|
65
61
|
trading_venue: str='',
|
|
66
|
-
output_format:
|
|
67
|
-
) ->
|
|
62
|
+
output_format: tSUPPORTED_DATA_TOOLS='pandas',
|
|
63
|
+
) -> DataFrame | None:
|
|
68
64
|
"""Extract data without specifying the data origin.
|
|
69
65
|
This function will try to extract data from all supported data origins.
|
|
70
66
|
|
|
71
67
|
Args:
|
|
72
68
|
env: trading environment, e.g. 'PAPER' | 'LIVE'.
|
|
73
69
|
data_source (Literal['BYBIT']): The data source to extract data from.
|
|
70
|
+
pdt (str): product, e.g. BTC_USDT_PERP.
|
|
74
71
|
resolution: Data resolution. e.g. '1m' = 1 minute as the unit of each data bar/candle.
|
|
75
72
|
Also supports raw resolution such as 'r1m', where 'r' stands for raw.
|
|
76
73
|
Default is '1d' = 1 day.
|
|
77
|
-
|
|
78
|
-
|
|
74
|
+
dates (list[str]): The dates of the data to extract.
|
|
75
|
+
storages: origins of data to search from, default is all supported storages
|
|
79
76
|
trading_venue (str): trading venue's name, e.g. exchange's name or dapp's name
|
|
80
77
|
output_format: The format of the output data. Default is 'pandas'.
|
|
81
|
-
Returns:
|
|
82
|
-
bytes | DataFrame | None: The extracted data as bytes, or None if the data is not found.
|
|
83
78
|
"""
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
data: bytes | pd.DataFrame | None = extract_data(env, storage, data_source, trading_venue, resolution, pdt, date, output_format=output_format)
|
|
93
|
-
except MinioException:
|
|
94
|
-
data = None
|
|
79
|
+
logger = logging.getLogger(data_source.lower() + '_data')
|
|
80
|
+
storages = storages or SUPPORTED_STORAGES
|
|
81
|
+
for storage in storages:
|
|
82
|
+
if storage == 'minio':
|
|
83
|
+
if not check_if_minio_running():
|
|
84
|
+
continue
|
|
85
|
+
logger.debug(f'searching {storage=} for {data_source} {pdt} {resolution} data from {dates[0]} to {dates[-1]}')
|
|
86
|
+
data: DataFrame | None = extract_data(env, storage, data_source, pdt, resolution, dates, trading_venue=trading_venue, output_format=output_format)
|
|
95
87
|
if data is not None:
|
|
96
88
|
return data
|
|
97
89
|
|
|
@@ -100,84 +92,72 @@ def extract_data(
|
|
|
100
92
|
env: tSUPPORTED_ENVIRONMENTS,
|
|
101
93
|
storage: tSUPPORTED_STORAGES,
|
|
102
94
|
data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
|
|
103
|
-
trading_venue: str,
|
|
104
|
-
resolution: str | ExtendedResolution,
|
|
105
95
|
pdt: str,
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
96
|
+
resolution: str | ExtendedResolution,
|
|
97
|
+
dates: list[str],
|
|
98
|
+
trading_venue: str='',
|
|
99
|
+
output_format: tSUPPORTED_DATA_TOOLS='pandas',
|
|
100
|
+
) -> DataFrame | None:
|
|
109
101
|
"""
|
|
110
102
|
Extracts data from a specified data source and returns it as bytes.
|
|
111
103
|
|
|
112
104
|
Args:
|
|
113
105
|
env: trading environment, e.g. 'PAPER' | 'LIVE'.
|
|
114
|
-
storage: The origin of the data (local or minio).
|
|
106
|
+
storage: The origin of the data (e.g. local or minio).
|
|
115
107
|
data_source: The source of the data.
|
|
116
|
-
|
|
108
|
+
pdt (str): product, e.g. BTC_USDT_PERP.
|
|
117
109
|
resolution: Data resolution. e.g. '1m' = 1 minute as the unit of each data bar/candle.
|
|
118
110
|
Also supports raw resolution such as 'r1m', where 'r' stands for raw.
|
|
119
111
|
Default is '1d' = 1 day.
|
|
120
|
-
|
|
121
|
-
|
|
112
|
+
dates (list[str]): The dates of the data.
|
|
113
|
+
trading_venue: trading venue's name, e.g. exchange's name or dapp's name
|
|
122
114
|
output_format: The format of the output data. Default is 'pandas'.
|
|
123
|
-
Returns:
|
|
124
|
-
bytes | DataFrame | None: The extracted data as bytes, or None if extraction fails.
|
|
125
|
-
|
|
126
|
-
Raises:
|
|
127
|
-
AssertionError: If any of the input parameters are invalid.
|
|
128
|
-
NotImplementedError: If the data origin is not supported.
|
|
129
|
-
MinioException: If MinIO is not running / set up correctly.
|
|
130
115
|
"""
|
|
131
|
-
from pfeed.resolution import ExtendedResolution
|
|
132
|
-
|
|
133
116
|
logger = logging.getLogger(data_source.lower() + '_data')
|
|
134
|
-
|
|
135
117
|
env, storage, data_source, pdt, output_format = env.upper(), storage.lower(), data_source.upper(), pdt.upper(), output_format.lower()
|
|
118
|
+
trading_venue = trading_venue or derive_trading_venue(data_source)
|
|
119
|
+
trading_venue = trading_venue.upper()
|
|
136
120
|
assert env in SUPPORTED_ENVIRONMENTS, f'Invalid {env=}, {SUPPORTED_ENVIRONMENTS=}'
|
|
137
121
|
assert storage in SUPPORTED_STORAGES, f'Invalid {storage=}, {SUPPORTED_STORAGES=}'
|
|
138
122
|
assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'
|
|
139
|
-
assert output_format in
|
|
123
|
+
assert output_format in SUPPORTED_DATA_TOOLS, f'Invalid {output_format=}, valid options: {SUPPORTED_DATA_TOOLS}'
|
|
140
124
|
if isinstance(resolution, str):
|
|
141
125
|
resolution = ExtendedResolution(resolution)
|
|
142
126
|
if output_format != 'bytes':
|
|
143
127
|
data_tool = importlib.import_module(f'pfeed.data_tools.data_tool_{output_format.lower()}')
|
|
144
128
|
config = get_config()
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
129
|
+
filepaths = [FilePath(env, data_source, trading_venue, pdt, resolution, date, file_extension='.parquet', data_path=config.data_path) for date in dates]
|
|
130
|
+
try:
|
|
131
|
+
df = None
|
|
132
|
+
if storage == 'local':
|
|
133
|
+
if all(fp.exists() for fp in filepaths):
|
|
134
|
+
df: DataFrame = data_tool.read_parquet([fp.file_path for fp in filepaths])
|
|
135
|
+
elif storage == 'minio':
|
|
136
|
+
datastore = Datastore(storage)
|
|
137
|
+
object_names = [fp.storage_path for fp in filepaths]
|
|
138
|
+
if all(datastore.exist_object(object_name) for object_name in object_names):
|
|
139
|
+
paths = ["s3://" + datastore.BUCKET_NAME + "/" + object_name for object_name in object_names]
|
|
140
|
+
df: DataFrame = data_tool.read_parquet(paths, storage='minio')
|
|
155
141
|
else:
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
data: bytes | None = datastore.get_object(object_name)
|
|
161
|
-
if data:
|
|
162
|
-
if output_format != 'bytes':
|
|
163
|
-
file_path = "s3://" + datastore.BUCKET_NAME + "/" + object_name
|
|
164
|
-
data: DataFrame = data_tool.read_parquet(file_path, storage='minio')
|
|
165
|
-
logger.debug(f'extracted {data_source} {pdt} {date} {resolution} data from MinIO object {object_name}')
|
|
142
|
+
raise NotImplementedError(f'{storage=}')
|
|
143
|
+
|
|
144
|
+
if df is not None:
|
|
145
|
+
logger.debug(f'extracted {data_source} {pdt} {resolution} data from {dates[0]} to {dates[-1]} from {storage}')
|
|
166
146
|
else:
|
|
167
|
-
logger.debug(f'failed to extract {data_source} {pdt} {
|
|
168
|
-
return
|
|
169
|
-
|
|
170
|
-
|
|
147
|
+
logger.debug(f'failed to extract {data_source} {pdt} {resolution} data from {dates[0]} to {dates[-1]} from {storage}')
|
|
148
|
+
return df
|
|
149
|
+
except Exception as err:
|
|
150
|
+
logger.exception(f'failed to extract {data_source} {pdt} {resolution} data from {dates[0]} to {dates[-1]} from {storage}, {err=}')
|
|
171
151
|
|
|
172
152
|
|
|
173
153
|
def transform_data(
|
|
174
154
|
data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
|
|
155
|
+
pdt: str,
|
|
175
156
|
data: bytes | pd.DataFrame | pl.LazyFrame,
|
|
176
157
|
data_resolution: str | ExtendedResolution,
|
|
177
158
|
target_resolution: str | ExtendedResolution,
|
|
178
159
|
) -> bytes | pd.DataFrame | pl.LazyFrame:
|
|
179
160
|
"""Transforms data to a target resolution"""
|
|
180
|
-
from pfeed.resolution import ExtendedResolution
|
|
181
161
|
if isinstance(data_resolution, str):
|
|
182
162
|
data_resolution = ExtendedResolution(data_resolution)
|
|
183
163
|
if isinstance(target_resolution, str):
|
|
@@ -192,22 +172,23 @@ def transform_data(
|
|
|
192
172
|
elif data_resolution.is_raw() and target_resolution.is_raw(): # e.g. 'r1t' -> 'r1m
|
|
193
173
|
raise Exception(f'{data_resolution=} and {target_resolution=} are both raw resolutions')
|
|
194
174
|
else:
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
175
|
+
df: pd.DataFrame = _convert_data_to_pandas_df(data)
|
|
176
|
+
df = _standardize_columns(df, data_resolution.is_tick())
|
|
177
|
+
if not target_resolution.is_tick():
|
|
178
|
+
df = resample_data(df, target_resolution)
|
|
179
|
+
df = _organize_columns(df, pdt, target_resolution)
|
|
180
|
+
return _handle_result(data, df)
|
|
200
181
|
|
|
201
182
|
|
|
202
183
|
def load_data(
|
|
203
184
|
env: tSUPPORTED_ENVIRONMENTS,
|
|
204
185
|
storage: tSUPPORTED_STORAGES,
|
|
205
186
|
data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
|
|
206
|
-
trading_venue: str,
|
|
207
187
|
data: bytes,
|
|
208
|
-
resolution: str | ExtendedResolution,
|
|
209
188
|
pdt: str,
|
|
189
|
+
resolution: str | ExtendedResolution,
|
|
210
190
|
date: str,
|
|
191
|
+
trading_venue: str='',
|
|
211
192
|
**kwargs
|
|
212
193
|
) -> None:
|
|
213
194
|
"""
|
|
@@ -218,28 +199,23 @@ def load_data(
|
|
|
218
199
|
storage: The destination where the data will be loaded.
|
|
219
200
|
It can be either 'local' or 'minio'.
|
|
220
201
|
data_source: The source of the data.
|
|
221
|
-
trading_venue: trading venue's name, e.g. exchange's name or dapp's name
|
|
222
202
|
data (bytes): The data to be loaded.
|
|
203
|
+
pdt (str): product, e.g. BTC_USDT_PERP.
|
|
223
204
|
resolution: Data resolution. e.g. '1m' = 1 minute as the unit of each data bar/candle.
|
|
224
205
|
Also supports raw resolution such as 'r1m', where 'r' stands for raw.
|
|
225
206
|
Default is '1d' = 1 day.
|
|
226
|
-
pdt (str): product, e.g. BTC_USDT_PERP.
|
|
227
207
|
date (str): The date of the data.
|
|
208
|
+
trading_venue: trading venue's name, e.g. exchange's name or dapp's name
|
|
228
209
|
**kwargs: Additional keyword arguments for MinIO.
|
|
229
210
|
|
|
230
211
|
Returns:
|
|
231
212
|
None
|
|
232
|
-
|
|
233
|
-
Raises:
|
|
234
|
-
AssertionError: If any of the input parameters are invalid.
|
|
235
|
-
NotImplementedError: If the specified data destination is not implemented.
|
|
236
|
-
MinioException: If MinIO is not running / set up correctly.
|
|
237
213
|
"""
|
|
238
|
-
from pfeed.resolution import ExtendedResolution
|
|
239
|
-
|
|
240
214
|
logger = logging.getLogger(data_source.lower() + '_data')
|
|
241
215
|
|
|
242
216
|
env, storage, data_source, pdt = env.upper(), storage.lower(), data_source.upper(), pdt.upper()
|
|
217
|
+
trading_venue = trading_venue or derive_trading_venue(data_source)
|
|
218
|
+
trading_venue = trading_venue.upper()
|
|
243
219
|
assert env in SUPPORTED_ENVIRONMENTS, f'Invalid {env=}, {SUPPORTED_ENVIRONMENTS=}'
|
|
244
220
|
assert storage in SUPPORTED_STORAGES, f'Invalid {storage=}, {SUPPORTED_STORAGES=}'
|
|
245
221
|
assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'
|
|
@@ -280,7 +256,6 @@ def clean_raw_data(
|
|
|
280
256
|
bytes: The cleaned raw data.
|
|
281
257
|
'''
|
|
282
258
|
assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'
|
|
283
|
-
|
|
284
259
|
const = importlib.import_module(f'pfeed.sources.{data_source.lower()}.const')
|
|
285
260
|
utils = importlib.import_module(f'pfeed.sources.{data_source.lower()}.utils')
|
|
286
261
|
|
|
@@ -293,27 +268,6 @@ def clean_raw_data(
|
|
|
293
268
|
return _handle_result(data, df)
|
|
294
269
|
|
|
295
270
|
|
|
296
|
-
def standardize_raw_data(
|
|
297
|
-
data: bytes | pd.DataFrame | pl.LazyFrame,
|
|
298
|
-
is_tick: bool
|
|
299
|
-
) -> bytes | pd.DataFrame | pl.LazyFrame:
|
|
300
|
-
"""Filter out unnecessary columns from raw data.
|
|
301
|
-
|
|
302
|
-
Args:
|
|
303
|
-
data (bytes): The raw data in bytes format.
|
|
304
|
-
|
|
305
|
-
Returns:
|
|
306
|
-
bytes | pd.DataFrame | pl.LazyFrame: The standardized data.
|
|
307
|
-
"""
|
|
308
|
-
df: pd.DataFrame = _convert_data_to_pandas_df(data)
|
|
309
|
-
assert 'ts' in df.columns, 'ts column not found, please check if the raw data has been cleaned'
|
|
310
|
-
if is_tick:
|
|
311
|
-
df = df.loc[:, ['ts', 'side', 'volume', 'price']]
|
|
312
|
-
else:
|
|
313
|
-
df = df.loc[:, ['ts', 'open', 'high', 'low', 'close', 'volume']]
|
|
314
|
-
return _handle_result(data, df)
|
|
315
|
-
|
|
316
|
-
|
|
317
271
|
def resample_data(
|
|
318
272
|
data: bytes | pd.DataFrame | pl.LazyFrame,
|
|
319
273
|
resolution: str | ExtendedResolution,
|
|
@@ -326,8 +280,6 @@ def resample_data(
|
|
|
326
280
|
resolution (str | Resolution): The resolution at which the data should be resampled.
|
|
327
281
|
if string, it should be in the format of "# + unit (s/m/h/d)", e.g. "1s".
|
|
328
282
|
'''
|
|
329
|
-
from pfeed.resolution import ExtendedResolution
|
|
330
|
-
|
|
331
283
|
# standardize resolution by following pfund's standard, e.g. '1minute' -> '1m'
|
|
332
284
|
if isinstance(resolution, str):
|
|
333
285
|
resolution = ExtendedResolution(resolution)
|
|
@@ -380,6 +332,26 @@ def resample_data(
|
|
|
380
332
|
return _handle_result(data, resampled_df)
|
|
381
333
|
|
|
382
334
|
|
|
335
|
+
def _standardize_columns(df: pd.DataFrame, is_tick: bool) -> pd.DataFrame:
|
|
336
|
+
"""Filter out unnecessary columns from raw data."""
|
|
337
|
+
assert 'ts' in df.columns, '"ts" column not found'
|
|
338
|
+
if is_tick:
|
|
339
|
+
df = df.loc[:, ['ts', 'side', 'volume', 'price']]
|
|
340
|
+
else:
|
|
341
|
+
df = df.loc[:, ['ts', 'open', 'high', 'low', 'close', 'volume']]
|
|
342
|
+
return df
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _organize_columns(df: pd.DataFrame, pdt: str, resolution: ExtendedResolution) -> pd.DataFrame:
|
|
346
|
+
"""Organizes the columns of a DataFrame.
|
|
347
|
+
Moving 'ts', 'product', 'resolution' to the leftmost side.
|
|
348
|
+
"""
|
|
349
|
+
df['product'] = pdt
|
|
350
|
+
df['resolution'] = repr(resolution)
|
|
351
|
+
left_cols = ['ts', 'product', 'resolution']
|
|
352
|
+
return df.reindex(left_cols + [col for col in df.columns if col not in left_cols], axis=1)
|
|
353
|
+
|
|
354
|
+
|
|
383
355
|
def _convert_data_to_pandas_df(data: bytes | pd.DataFrame | pl.LazyFrame) -> pd.DataFrame:
|
|
384
356
|
"""Converts data to pandas DataFrame."""
|
|
385
357
|
if isinstance(data, bytes):
|