pfeed 0.0.1.dev14__tar.gz → 0.0.2.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/PKG-INFO +2 -2
  2. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/__init__.py +4 -4
  3. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/commands/download.py +1 -1
  4. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/config_handler.py +9 -4
  5. pfeed-0.0.2.dev1/pfeed/data_tools/data_tool_pandas.py +61 -0
  6. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/data_tools/data_tool_polars.py +21 -18
  7. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/datastore.py +22 -11
  8. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/etl.py +85 -113
  9. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/feeds/base_feed.py +66 -124
  10. pfeed-0.0.2.dev1/pfeed/feeds/bybit_feed.py +51 -0
  11. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/filepath.py +1 -1
  12. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/download.py +18 -13
  13. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/utils.py +4 -2
  14. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/utils/utils.py +16 -1
  15. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pyproject.toml +2 -2
  16. pfeed-0.0.1.dev14/pfeed/data_tools/data_tool_pandas.py +0 -62
  17. pfeed-0.0.1.dev14/pfeed/feeds/bybit_feed.py +0 -53
  18. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/LICENSE +0 -0
  19. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/README.md +0 -0
  20. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/__init__.py +0 -0
  21. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/commands/__init__.py +0 -0
  22. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/commands/config.py +0 -0
  23. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/commands/docker_compose.py +0 -0
  24. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/commands/open.py +0 -0
  25. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/commands/stream.py +0 -0
  26. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/cli/main.py +0 -0
  27. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/const/common.py +0 -0
  28. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/const/paths.py +0 -0
  29. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/feeds/__init__.py +0 -0
  30. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/feeds/binance_feed.py +0 -0
  31. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/feeds/custom_csv_feed.py +0 -0
  32. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/feeds/yahoo_finance_feed.py +0 -0
  33. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/main.py +0 -0
  34. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/resolution.py +0 -0
  35. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/binance/__init__.py +0 -0
  36. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/binance/api.py +0 -0
  37. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/binance/const.py +0 -0
  38. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/binance/download.py +0 -0
  39. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/binance/stream.py +0 -0
  40. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/__init__.py +0 -0
  41. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/api.py +0 -0
  42. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/const.py +0 -0
  43. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/stream.py +0 -0
  44. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/sources/bybit/types.py +0 -0
  45. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/types/common_literals.py +0 -0
  46. /pfeed-0.0.1.dev14/pfeed/utils/file_format.py → /pfeed-0.0.2.dev1/pfeed/utils/file_formats.py +0 -0
  47. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/utils/monitor.py +0 -0
  48. {pfeed-0.0.1.dev14 → pfeed-0.0.2.dev1}/pfeed/utils/validate.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pfeed
3
- Version: 0.0.1.dev14
3
+ Version: 0.0.2.dev1
4
4
  Summary: Data pipeline for algo-trading, getting and storing both real-time and historical data made easy.
5
5
  Home-page: https://pfund.ai
6
6
  License: Apache-2.0
@@ -23,7 +23,7 @@ Requires-Dist: fastparquet (>=2024.5.0,<2025.0.0)
23
23
  Requires-Dist: minio (>=7.2.8,<8.0.0) ; extra == "data" or extra == "all"
24
24
  Requires-Dist: pandas (>=2.2.2,<3.0.0) ; extra == "df" or extra == "all"
25
25
  Requires-Dist: pfund (>=0.0.1.dev13,<0.0.2)
26
- Requires-Dist: polars (>=1.6.0,<2.0.0) ; extra == "df" or extra == "all"
26
+ Requires-Dist: polars (>=1.7.1,<2.0.0) ; extra == "df" or extra == "all"
27
27
  Requires-Dist: psutil (>=6.0.0,<7.0.0) ; extra == "data" or extra == "all"
28
28
  Requires-Dist: pyarrow (>=15.0.0,<16.0.0) ; extra == "df" or extra == "all"
29
29
  Requires-Dist: ray (>=2.35.0,<3.0.0) ; extra == "boost" or extra == "all"
@@ -20,9 +20,9 @@ def download_historical_data(
20
20
  ptypes: str | list[str] | None = None,
21
21
  start_date: str | None = None,
22
22
  end_date: str | None = None,
23
- num_cpus: int = 8,
24
- use_ray: bool = True,
25
23
  use_minio: bool = False,
24
+ use_ray: bool = True,
25
+ ray_num_cpus: int = 8,
26
26
  ):
27
27
  data_source = importlib.import_module(f"pfeed.sources.{data_source.lower()}")
28
28
  return data_source.download_historical_data(
@@ -31,9 +31,9 @@ def download_historical_data(
31
31
  ptypes=ptypes,
32
32
  start_date=start_date,
33
33
  end_date=end_date,
34
- num_cpus=num_cpus,
35
- use_ray=use_ray,
36
34
  use_minio=use_minio,
35
+ use_ray=use_ray,
36
+ ray_num_cpus=ray_num_cpus,
37
37
  )
38
38
 
39
39
 
@@ -41,7 +41,7 @@ def download(data_source, pdts, dtypes, ptypes, start_date, end_date, num_cpus,
41
41
  ptypes=ptypes,
42
42
  start_date=start_date.date().strftime('%Y-%m-%d') if start_date else start_date,
43
43
  end_date=end_date.date().strftime('%Y-%m-%d') if end_date else end_date,
44
- num_cpus=num_cpus,
45
44
  use_ray=not no_ray,
45
+ ray_num_cpus=num_cpus,
46
46
  use_minio=use_minio,
47
47
  )
@@ -5,9 +5,6 @@ import logging
5
5
  from types import TracebackType
6
6
  from dataclasses import dataclass
7
7
 
8
- import yaml
9
- from dotenv import find_dotenv, load_dotenv
10
-
11
8
  from pfeed.const.paths import PROJ_NAME, MAIN_PATH, LOG_PATH, DATA_PATH, USER_CONFIG_FILE_PATH
12
9
 
13
10
 
@@ -48,6 +45,8 @@ class ConfigHandler:
48
45
 
49
46
  @classmethod
50
47
  def load_config(cls):
48
+ import yaml
49
+
51
50
  '''Loads user's config file and returns a ConfigHandler object'''
52
51
  config_file_path = USER_CONFIG_FILE_PATH
53
52
  if config_file_path.is_file():
@@ -77,9 +76,15 @@ class ConfigHandler:
77
76
  self.load_env_file(self.env_file_path)
78
77
 
79
78
  if self.debug:
80
- self.enable_debug_mode()
79
+ is_loggers_set_up = bool(logging.getLogger('pfeed').handlers)
80
+ if is_loggers_set_up:
81
+ print('loggers are already set up, ignoring enabling debug mode')
82
+ else:
83
+ self.enable_debug_mode()
81
84
 
82
85
  def load_env_file(self, env_file_path: str | None):
86
+ from dotenv import find_dotenv, load_dotenv
87
+
83
88
  if not env_file_path:
84
89
  found_env_file_path = find_dotenv(usecwd=True, raise_error_if_not_found=False)
85
90
  if found_env_file_path:
@@ -0,0 +1,61 @@
1
+ from __future__ import annotations
2
+ from typing import TYPE_CHECKING
3
+ if TYPE_CHECKING:
4
+ from pfeed.resolution import ExtendedResolution
5
+ from pfeed.types.common_literals import tSUPPORTED_STORAGES
6
+
7
+ import os
8
+ import io
9
+
10
+ import s3fs
11
+ import pandas as pd
12
+
13
+ from pfeed.const.common import SUPPORTED_STORAGES
14
+
15
+
16
+ name = 'pandas'
17
+
18
+
19
+ def read_parquet(paths_or_obj: list[str] | str | bytes, *args, storage: tSUPPORTED_STORAGES='local', **kwargs) -> pd.DataFrame:
20
+ assert storage in SUPPORTED_STORAGES, f'{storage=} not in {SUPPORTED_STORAGES}'
21
+ if isinstance(paths_or_obj, bytes):
22
+ obj = io.BytesIO(paths_or_obj)
23
+ return pd.read_parquet(obj, *args, **kwargs)
24
+ else:
25
+ if storage == 'minio':
26
+ if 'filesystem' not in kwargs:
27
+ fs = s3fs.S3FileSystem(
28
+ endpoint_url="http://"+os.getenv('MINIO_HOST', 'localhost')+':'+os.getenv('MINIO_PORT', '9000'),
29
+ key=os.getenv('MINIO_ROOT_USER', 'pfunder'),
30
+ secret=os.getenv('MINIO_ROOT_PASSWORD', 'password'),
31
+ )
32
+ kwargs['filesystem'] = fs
33
+ paths = paths_or_obj if isinstance(paths_or_obj, list) else [paths_or_obj]
34
+ return pd.read_parquet(paths, *args, **kwargs)
35
+
36
+
37
+ def estimate_memory_usage(df: pd.DataFrame) -> float:
38
+ """Estimate the memory usage of a pandas DataFrame in GB."""
39
+ return df.memory_usage(deep=True).sum() / (1024 ** 3)
40
+
41
+
42
+ def organize_time_series_columns(
43
+ pdt: str,
44
+ resolution: str | ExtendedResolution,
45
+ df: pd.DataFrame,
46
+ override_resolution: bool=False,
47
+ ) -> pd.DataFrame:
48
+ """Standardize the columns of a pandas DataFrame.
49
+ Moving 'ts', 'product', 'resolution' to the leftmost side.
50
+ """
51
+ from pfeed.resolution import ExtendedResolution
52
+ assert 'ts' in df.columns, "'ts' column not found"
53
+ if isinstance(resolution, str):
54
+ resolution = ExtendedResolution(resolution)
55
+ if 'product' not in df.columns:
56
+ df['product'] = pdt
57
+ if 'resolution' not in df.columns or override_resolution:
58
+ df['resolution'] = repr(resolution)
59
+ left_cols = ['ts', 'product', 'resolution']
60
+ df = df.reindex(left_cols + [col for col in df.columns if col not in left_cols], axis=1)
61
+ return df
@@ -14,30 +14,26 @@ from pfeed.const.common import SUPPORTED_STORAGES
14
14
  name = 'polars'
15
15
 
16
16
 
17
- def read_parquet(path_or_obj: str | bytes, *args, storage: tSUPPORTED_STORAGES='local', **kwargs) -> pl.DataFrame | pl.LazyFrame:
17
+ def read_parquet(paths_or_obj: list[str] | str | bytes, *args, storage: tSUPPORTED_STORAGES='local', **kwargs) -> pl.DataFrame | pl.LazyFrame:
18
18
  assert storage in SUPPORTED_STORAGES, f'{storage=} not in {SUPPORTED_STORAGES}'
19
- if isinstance(path_or_obj, bytes):
20
- obj = path_or_obj
19
+ if isinstance(paths_or_obj, bytes):
20
+ obj = paths_or_obj
21
21
  return pl.read_parquet(obj, *args, **kwargs)
22
22
  else:
23
- path = path_or_obj
23
+ paths = paths_or_obj if isinstance(paths_or_obj, list) else [paths_or_obj]
24
24
  if storage == 'local':
25
- return pl.scan_parquet(path, *args, **kwargs)
25
+ return pl.scan_parquet(paths, *args, **kwargs)
26
26
  elif storage == 'minio':
27
27
  storage_options = {
28
28
  "endpoint_url": "http://"+os.getenv('MINIO_HOST', 'localhost')+':'+os.getenv('MINIO_PORT', '9000'),
29
29
  "access_key_id": os.getenv('MINIO_ROOT_USER', 'pfunder'),
30
30
  "secret_access_key": os.getenv('MINIO_ROOT_PASSWORD', 'password'),
31
31
  }
32
- return pl.scan_parquet(path, *args, storage_options=storage_options, **kwargs)
32
+ return pl.scan_parquet(paths, *args, storage_options=storage_options, **kwargs)
33
33
  else:
34
34
  raise NotImplementedError(f'{storage=}')
35
35
 
36
36
 
37
- def concat(dfs: list[pl.DataFrame | pl.LazyFrame], *args, **kwargs) -> pl.DataFrame | pl.LazyFrame:
38
- return pl.concat(dfs, *args, **kwargs)
39
-
40
-
41
37
  def estimate_memory_usage(df: pl.DataFrame | pl.LazyFrame) -> float:
42
38
  """Estimate the memory usage of a polars DataFrame in GB."""
43
39
  if isinstance(df, pl.LazyFrame):
@@ -45,21 +41,28 @@ def estimate_memory_usage(df: pl.DataFrame | pl.LazyFrame) -> float:
45
41
  return df.estimated_size(unit='gb')
46
42
 
47
43
 
48
- def organize_time_series_columns(pdt: str, resolution: str | ExtendedResolution, df: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame | pl.LazyFrame:
44
+ def organize_time_series_columns(
45
+ pdt: str,
46
+ resolution: str | ExtendedResolution,
47
+ df: pl.DataFrame | pl.LazyFramem,
48
+ override_resolution: bool=False,
49
+ ) -> pl.DataFrame | pl.LazyFrame:
49
50
  from pfeed.resolution import ExtendedResolution
50
51
  if isinstance(df, pl.LazyFrame):
51
52
  cols = df.collect_schema().names()
52
53
  else:
53
54
  cols = df.columns
54
55
  assert 'ts' in cols, "'ts' column not found"
55
- assert 'product' not in cols, "'product' column already exists"
56
- assert 'resolution' not in cols, "'resolution' column already exists"
57
56
  if isinstance(resolution, str):
58
57
  resolution = ExtendedResolution(resolution)
59
- df = df.with_columns(
60
- pl.lit(pdt).alias('product'),
61
- pl.lit(repr(resolution)).alias('resolution')
62
- )
58
+ if 'product' not in cols:
59
+ df = df.with_columns(
60
+ pl.lit(pdt).alias('product'),
61
+ )
62
+ if 'resolution' not in cols or override_resolution:
63
+ df = df.with_columns(
64
+ pl.lit(repr(resolution)).alias('resolution')
65
+ )
63
66
  left_cols = ['ts', 'product', 'resolution']
64
- df = df.select(left_cols + [col for col in df.collect_schema().names() if col not in left_cols])
67
+ df = df.select(left_cols + [col for col in cols if col not in left_cols])
65
68
  return df
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
  from typing import TYPE_CHECKING
3
3
  if TYPE_CHECKING:
4
4
  try:
5
- from minio.api import ObjectWriteResult
5
+ from minio.api import ObjectWriteResult, Tags
6
6
  except ImportError:
7
7
  pass
8
8
  from typing import Generator
@@ -12,10 +12,9 @@ import io
12
12
  import logging
13
13
 
14
14
 
15
- def assert_if_minio_running():
15
+ def check_if_minio_running():
16
16
  import requests
17
17
  from requests.exceptions import RequestException, ReadTimeout
18
- from minio.error import MinioException
19
18
 
20
19
  endpoint = os.getenv('MINIO_HOST', 'localhost')+':'+os.getenv('MINIO_PORT', '9000')
21
20
  if not endpoint.startswith('http'):
@@ -26,13 +25,15 @@ def assert_if_minio_running():
26
25
  try:
27
26
  response = requests.get(f'{endpoint}/minio/health/live', timeout=3)
28
27
  if response.status_code != 200:
29
- raise MinioException(f"Unhandled response: {response.status_code=} {response.content} {response}")
28
+ print(f"Unhandled response from MinIO: {response.status_code=} {response.content} {response}")
29
+ return False
30
30
  except (ReadTimeout, RequestException) as e:
31
- raise MinioException(f"MinIO is not running or not detected on {endpoint}: {e}, please use 'pfeed docker-compose up -d' to start MinIO")
31
+ return False
32
+ return True
32
33
 
33
34
 
34
35
  class Datastore:
35
- DATA_PART_SIZE = 5 * (1024 ** 2) # part size for S3, 5 MB
36
+ # DATA_PART_SIZE = 5 * (1024 ** 2) # part size for S3, 5 MB
36
37
  BUCKET_NAME = 'pfeed'
37
38
 
38
39
  # EXTEND, currently only consider using MinIO
@@ -40,9 +41,9 @@ class Datastore:
40
41
  def initialize_store(cls, name: str, **kwargs):
41
42
  if name == 'minio':
42
43
  from minio import Minio
43
- assert_if_minio_running()
44
+ endpoint = os.getenv('MINIO_HOST', 'localhost')+':'+os.getenv('MINIO_PORT', '9000')
44
45
  cls.minio = Minio(
45
- endpoint=os.getenv('MINIO_HOST', 'localhost')+':'+os.getenv('MINIO_PORT', '9000'),
46
+ endpoint=endpoint,
46
47
  access_key=os.getenv('MINIO_ROOT_USER', 'pfunder'),
47
48
  secret_key=os.getenv('MINIO_ROOT_PASSWORD', 'password'),
48
49
  # turn off TLS, i.e. not using HTTPS
@@ -74,7 +75,16 @@ class Datastore:
74
75
  self.logger.error(f'Unhandled MinIO response status {res.status}')
75
76
  except S3Error as err:
76
77
  # logger.warning(f'MinIO S3Error {object_name=} {err=}')
77
- pass
78
+ return None
79
+
80
+ def exist_object(self, object_name: str) -> bool:
81
+ from minio import S3Error
82
+ try:
83
+ res: Tags | None = self.minio.get_object_tags(self.BUCKET_NAME, object_name)
84
+ return True
85
+ except S3Error as err:
86
+ # self.logger.warning(f'MinIO S3Error {object_name=} {err=}')
87
+ return False
78
88
 
79
89
  def list_objects(self, prefix) -> list | None:
80
90
  '''
@@ -89,8 +99,9 @@ class Datastore:
89
99
  self.BUCKET_NAME,
90
100
  object_name,
91
101
  data=io.BytesIO(data),
92
- part_size=self.DATA_PART_SIZE,
93
- length=-1,
102
+ # part_size=self.DATA_PART_SIZE,
103
+ length=len(data),
104
+ content_type='application/parquet',
94
105
  **kwargs
95
106
  )
96
107
 
@@ -2,7 +2,7 @@
2
2
  Except extracting and loading data, this module uses "pandas" for data transformation.
3
3
  '''
4
4
  from __future__ import annotations
5
- from typing import TYPE_CHECKING, Literal
5
+ from typing import TYPE_CHECKING
6
6
  if TYPE_CHECKING:
7
7
  from pfeed.types.common_literals import (
8
8
  tSUPPORTED_ENVIRONMENTS,
@@ -10,8 +10,6 @@ if TYPE_CHECKING:
10
10
  tSUPPORTED_STORAGES,
11
11
  tSUPPORTED_DATA_TOOLS,
12
12
  )
13
- from pfeed.resolution import ExtendedResolution
14
- tOUTPUT_FORMATS = Literal['bytes'] | tSUPPORTED_DATA_TOOLS
15
13
 
16
14
  import logging
17
15
  import importlib
@@ -22,7 +20,8 @@ try:
22
20
  except ImportError:
23
21
  pass
24
22
 
25
- from pfeed.datastore import Datastore
23
+ from pfeed.resolution import ExtendedResolution
24
+ from pfeed.datastore import Datastore, check_if_minio_running
26
25
  from pfeed.filepath import FilePath
27
26
  from pfeed.config_handler import get_config
28
27
  from pfeed.const.common import (
@@ -31,17 +30,14 @@ from pfeed.const.common import (
31
30
  SUPPORTED_DOWNLOAD_DATA_SOURCES,
32
31
  SUPPORTED_DATA_TOOLS,
33
32
  )
34
- from pfeed.types.common_literals import tSUPPORTED_DATA_TOOLS
35
33
  from pfeed.utils.utils import derive_trading_venue
36
- from pfeed.utils.file_format import read_raw_data
37
-
34
+ from pfeed.utils.file_formats import read_raw_data
38
35
  try:
39
36
  from pfeed.utils.monitor import print_disk_usage
40
37
  except ImportError:
41
38
  print_disk_usage = None
42
39
 
43
40
 
44
- OUTPUT_FORMATS = ['bytes'] + SUPPORTED_DATA_TOOLS
45
41
  DataFrame = pd.DataFrame | pl.DataFrame | pl.LazyFrame
46
42
 
47
43
 
@@ -51,7 +47,6 @@ __all__ = [
51
47
  'transform_data',
52
48
  'load_data',
53
49
  'clean_raw_data',
54
- 'standardize_raw_data',
55
50
  'resample_data',
56
51
  ]
57
52
 
@@ -59,39 +54,36 @@ __all__ = [
59
54
  def get_data(
60
55
  env: tSUPPORTED_ENVIRONMENTS,
61
56
  data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
62
- resolution: str | ExtendedResolution,
63
57
  pdt: str,
64
- date: str,
58
+ resolution: str | ExtendedResolution,
59
+ dates: list[str],
60
+ storages: list[tSUPPORTED_STORAGES] | None = None,
65
61
  trading_venue: str='',
66
- output_format: tOUTPUT_FORMATS='pandas',
67
- ) -> bytes | DataFrame | None:
62
+ output_format: tSUPPORTED_DATA_TOOLS='pandas',
63
+ ) -> DataFrame | None:
68
64
  """Extract data without specifying the data origin.
69
65
  This function will try to extract data from all supported data origins.
70
66
 
71
67
  Args:
72
68
  env: trading environment, e.g. 'PAPER' | 'LIVE'.
73
69
  data_source (Literal['BYBIT']): The data source to extract data from.
70
+ pdt (str): product, e.g. BTC_USDT_PERP.
74
71
  resolution: Data resolution. e.g. '1m' = 1 minute as the unit of each data bar/candle.
75
72
  Also supports raw resolution such as 'r1m', where 'r' stands for raw.
76
73
  Default is '1d' = 1 day.
77
- pdt (str): product, e.g. BTC_USDT_PERP.
78
- date (str): The date of the data to extract.
74
+ dates (list[str]): The dates of the data to extract.
75
+ storages: origins of data to search from, default is all supported storages
79
76
  trading_venue (str): trading venue's name, e.g. exchange's name or dapp's name
80
77
  output_format: The format of the output data. Default is 'pandas'.
81
- Returns:
82
- bytes | DataFrame | None: The extracted data as bytes, or None if the data is not found.
83
78
  """
84
- try:
85
- from minio.error import MinioException
86
- except ImportError:
87
- MinioException = Exception
88
-
89
- trading_venue = trading_venue or derive_trading_venue(data_source)
90
- for storage in SUPPORTED_STORAGES:
91
- try:
92
- data: bytes | pd.DataFrame | None = extract_data(env, storage, data_source, trading_venue, resolution, pdt, date, output_format=output_format)
93
- except MinioException:
94
- data = None
79
+ logger = logging.getLogger(data_source.lower() + '_data')
80
+ storages = storages or SUPPORTED_STORAGES
81
+ for storage in storages:
82
+ if storage == 'minio':
83
+ if not check_if_minio_running():
84
+ continue
85
+ logger.debug(f'searching {storage=} for {data_source} {pdt} {resolution} data from {dates[0]} to {dates[-1]}')
86
+ data: DataFrame | None = extract_data(env, storage, data_source, pdt, resolution, dates, trading_venue=trading_venue, output_format=output_format)
95
87
  if data is not None:
96
88
  return data
97
89
 
@@ -100,84 +92,72 @@ def extract_data(
100
92
  env: tSUPPORTED_ENVIRONMENTS,
101
93
  storage: tSUPPORTED_STORAGES,
102
94
  data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
103
- trading_venue: str,
104
- resolution: str | ExtendedResolution,
105
95
  pdt: str,
106
- date: str,
107
- output_format: tOUTPUT_FORMATS='pandas',
108
- ) -> bytes | DataFrame | None:
96
+ resolution: str | ExtendedResolution,
97
+ dates: list[str],
98
+ trading_venue: str='',
99
+ output_format: tSUPPORTED_DATA_TOOLS='pandas',
100
+ ) -> DataFrame | None:
109
101
  """
110
102
  Extracts data from a specified data source and returns it as bytes.
111
103
 
112
104
  Args:
113
105
  env: trading environment, e.g. 'PAPER' | 'LIVE'.
114
- storage: The origin of the data (local or minio).
106
+ storage: The origin of the data (e.g. local or minio).
115
107
  data_source: The source of the data.
116
- trading_venue: trading venue's name, e.g. exchange's name or dapp's name
108
+ pdt (str): product, e.g. BTC_USDT_PERP.
117
109
  resolution: Data resolution. e.g. '1m' = 1 minute as the unit of each data bar/candle.
118
110
  Also supports raw resolution such as 'r1m', where 'r' stands for raw.
119
111
  Default is '1d' = 1 day.
120
- pdt (str): product, e.g. BTC_USDT_PERP.
121
- date (str): The date of the data.
112
+ dates (list[str]): The dates of the data.
113
+ trading_venue: trading venue's name, e.g. exchange's name or dapp's name
122
114
  output_format: The format of the output data. Default is 'pandas'.
123
- Returns:
124
- bytes | DataFrame | None: The extracted data as bytes, or None if extraction fails.
125
-
126
- Raises:
127
- AssertionError: If any of the input parameters are invalid.
128
- NotImplementedError: If the data origin is not supported.
129
- MinioException: If MinIO is not running / set up correctly.
130
115
  """
131
- from pfeed.resolution import ExtendedResolution
132
-
133
116
  logger = logging.getLogger(data_source.lower() + '_data')
134
-
135
117
  env, storage, data_source, pdt, output_format = env.upper(), storage.lower(), data_source.upper(), pdt.upper(), output_format.lower()
118
+ trading_venue = trading_venue or derive_trading_venue(data_source)
119
+ trading_venue = trading_venue.upper()
136
120
  assert env in SUPPORTED_ENVIRONMENTS, f'Invalid {env=}, {SUPPORTED_ENVIRONMENTS=}'
137
121
  assert storage in SUPPORTED_STORAGES, f'Invalid {storage=}, {SUPPORTED_STORAGES=}'
138
122
  assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'
139
- assert output_format in OUTPUT_FORMATS, f'Invalid {output_format=}, {OUTPUT_FORMATS=}'
123
+ assert output_format in SUPPORTED_DATA_TOOLS, f'Invalid {output_format=}, valid options: {SUPPORTED_DATA_TOOLS}'
140
124
  if isinstance(resolution, str):
141
125
  resolution = ExtendedResolution(resolution)
142
126
  if output_format != 'bytes':
143
127
  data_tool = importlib.import_module(f'pfeed.data_tools.data_tool_{output_format.lower()}')
144
128
  config = get_config()
145
- fp = FilePath(env, data_source, trading_venue, pdt, resolution, date, file_extension='.parquet', data_path=config.data_path)
146
- if storage == 'local':
147
- if fp.exists():
148
- if output_format == 'bytes':
149
- with open(fp.file_path, 'rb') as f:
150
- data: bytes = f.read()
151
- else:
152
- data: DataFrame = data_tool.read_parquet(fp.file_path)
153
- logger.debug(f'extracted {data_source} {pdt} {date} {resolution} data from local path {fp.file_path}')
154
- return data
129
+ filepaths = [FilePath(env, data_source, trading_venue, pdt, resolution, date, file_extension='.parquet', data_path=config.data_path) for date in dates]
130
+ try:
131
+ df = None
132
+ if storage == 'local':
133
+ if all(fp.exists() for fp in filepaths):
134
+ df: DataFrame = data_tool.read_parquet([fp.file_path for fp in filepaths])
135
+ elif storage == 'minio':
136
+ datastore = Datastore(storage)
137
+ object_names = [fp.storage_path for fp in filepaths]
138
+ if all(datastore.exist_object(object_name) for object_name in object_names):
139
+ paths = ["s3://" + datastore.BUCKET_NAME + "/" + object_name for object_name in object_names]
140
+ df: DataFrame = data_tool.read_parquet(paths, storage='minio')
155
141
  else:
156
- logger.debug(f'failed to extract {data_source} {pdt} {date} {resolution} data from local path {fp.file_path}')
157
- elif storage == 'minio':
158
- datastore = Datastore(storage)
159
- object_name = fp.storage_path
160
- data: bytes | None = datastore.get_object(object_name)
161
- if data:
162
- if output_format != 'bytes':
163
- file_path = "s3://" + datastore.BUCKET_NAME + "/" + object_name
164
- data: DataFrame = data_tool.read_parquet(file_path, storage='minio')
165
- logger.debug(f'extracted {data_source} {pdt} {date} {resolution} data from MinIO object {object_name}')
142
+ raise NotImplementedError(f'{storage=}')
143
+
144
+ if df is not None:
145
+ logger.debug(f'extracted {data_source} {pdt} {resolution} data from {dates[0]} to {dates[-1]} from {storage}')
166
146
  else:
167
- logger.debug(f'failed to extract {data_source} {pdt} {date} {resolution} data from MinIO object {object_name}')
168
- return data
169
- else:
170
- raise NotImplementedError(f'{storage=}')
147
+ logger.debug(f'failed to extract {data_source} {pdt} {resolution} data from {dates[0]} to {dates[-1]} from {storage}')
148
+ return df
149
+ except Exception as err:
150
+ logger.exception(f'failed to extract {data_source} {pdt} {resolution} data from {dates[0]} to {dates[-1]} from {storage}, {err=}')
171
151
 
172
152
 
173
153
  def transform_data(
174
154
  data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
155
+ pdt: str,
175
156
  data: bytes | pd.DataFrame | pl.LazyFrame,
176
157
  data_resolution: str | ExtendedResolution,
177
158
  target_resolution: str | ExtendedResolution,
178
159
  ) -> bytes | pd.DataFrame | pl.LazyFrame:
179
160
  """Transforms data to a target resolution"""
180
- from pfeed.resolution import ExtendedResolution
181
161
  if isinstance(data_resolution, str):
182
162
  data_resolution = ExtendedResolution(data_resolution)
183
163
  if isinstance(target_resolution, str):
@@ -192,22 +172,23 @@ def transform_data(
192
172
  elif data_resolution.is_raw() and target_resolution.is_raw(): # e.g. 'r1t' -> 'r1m
193
173
  raise Exception(f'{data_resolution=} and {target_resolution=} are both raw resolutions')
194
174
  else:
195
- data: bytes | pd.DataFrame | pl.LazyFrame = standardize_raw_data(data, data_resolution.is_tick())
196
- if target_resolution.is_tick():
197
- return data
198
- else:
199
- return resample_data(data, target_resolution)
175
+ df: pd.DataFrame = _convert_data_to_pandas_df(data)
176
+ df = _standardize_columns(df, data_resolution.is_tick())
177
+ if not target_resolution.is_tick():
178
+ df = resample_data(df, target_resolution)
179
+ df = _organize_columns(df, pdt, target_resolution)
180
+ return _handle_result(data, df)
200
181
 
201
182
 
202
183
  def load_data(
203
184
  env: tSUPPORTED_ENVIRONMENTS,
204
185
  storage: tSUPPORTED_STORAGES,
205
186
  data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
206
- trading_venue: str,
207
187
  data: bytes,
208
- resolution: str | ExtendedResolution,
209
188
  pdt: str,
189
+ resolution: str | ExtendedResolution,
210
190
  date: str,
191
+ trading_venue: str='',
211
192
  **kwargs
212
193
  ) -> None:
213
194
  """
@@ -218,28 +199,23 @@ def load_data(
218
199
  storage: The destination where the data will be loaded.
219
200
  It can be either 'local' or 'minio'.
220
201
  data_source: The source of the data.
221
- trading_venue: trading venue's name, e.g. exchange's name or dapp's name
222
202
  data (bytes): The data to be loaded.
203
+ pdt (str): product, e.g. BTC_USDT_PERP.
223
204
  resolution: Data resolution. e.g. '1m' = 1 minute as the unit of each data bar/candle.
224
205
  Also supports raw resolution such as 'r1m', where 'r' stands for raw.
225
206
  Default is '1d' = 1 day.
226
- pdt (str): product, e.g. BTC_USDT_PERP.
227
207
  date (str): The date of the data.
208
+ trading_venue: trading venue's name, e.g. exchange's name or dapp's name
228
209
  **kwargs: Additional keyword arguments for MinIO.
229
210
 
230
211
  Returns:
231
212
  None
232
-
233
- Raises:
234
- AssertionError: If any of the input parameters are invalid.
235
- NotImplementedError: If the specified data destination is not implemented.
236
- MinioException: If MinIO is not running / set up correctly.
237
213
  """
238
- from pfeed.resolution import ExtendedResolution
239
-
240
214
  logger = logging.getLogger(data_source.lower() + '_data')
241
215
 
242
216
  env, storage, data_source, pdt = env.upper(), storage.lower(), data_source.upper(), pdt.upper()
217
+ trading_venue = trading_venue or derive_trading_venue(data_source)
218
+ trading_venue = trading_venue.upper()
243
219
  assert env in SUPPORTED_ENVIRONMENTS, f'Invalid {env=}, {SUPPORTED_ENVIRONMENTS=}'
244
220
  assert storage in SUPPORTED_STORAGES, f'Invalid {storage=}, {SUPPORTED_STORAGES=}'
245
221
  assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'
@@ -280,7 +256,6 @@ def clean_raw_data(
280
256
  bytes: The cleaned raw data.
281
257
  '''
282
258
  assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'
283
-
284
259
  const = importlib.import_module(f'pfeed.sources.{data_source.lower()}.const')
285
260
  utils = importlib.import_module(f'pfeed.sources.{data_source.lower()}.utils')
286
261
 
@@ -293,27 +268,6 @@ def clean_raw_data(
293
268
  return _handle_result(data, df)
294
269
 
295
270
 
296
- def standardize_raw_data(
297
- data: bytes | pd.DataFrame | pl.LazyFrame,
298
- is_tick: bool
299
- ) -> bytes | pd.DataFrame | pl.LazyFrame:
300
- """Filter out unnecessary columns from raw data.
301
-
302
- Args:
303
- data (bytes): The raw data in bytes format.
304
-
305
- Returns:
306
- bytes | pd.DataFrame | pl.LazyFrame: The standardized data.
307
- """
308
- df: pd.DataFrame = _convert_data_to_pandas_df(data)
309
- assert 'ts' in df.columns, 'ts column not found, please check if the raw data has been cleaned'
310
- if is_tick:
311
- df = df.loc[:, ['ts', 'side', 'volume', 'price']]
312
- else:
313
- df = df.loc[:, ['ts', 'open', 'high', 'low', 'close', 'volume']]
314
- return _handle_result(data, df)
315
-
316
-
317
271
  def resample_data(
318
272
  data: bytes | pd.DataFrame | pl.LazyFrame,
319
273
  resolution: str | ExtendedResolution,
@@ -326,8 +280,6 @@ def resample_data(
326
280
  resolution (str | Resolution): The resolution at which the data should be resampled.
327
281
  if string, it should be in the format of "# + unit (s/m/h/d)", e.g. "1s".
328
282
  '''
329
- from pfeed.resolution import ExtendedResolution
330
-
331
283
  # standardize resolution by following pfund's standard, e.g. '1minute' -> '1m'
332
284
  if isinstance(resolution, str):
333
285
  resolution = ExtendedResolution(resolution)
@@ -380,6 +332,26 @@ def resample_data(
380
332
  return _handle_result(data, resampled_df)
381
333
 
382
334
 
335
+ def _standardize_columns(df: pd.DataFrame, is_tick: bool) -> pd.DataFrame:
336
+ """Filter out unnecessary columns from raw data."""
337
+ assert 'ts' in df.columns, '"ts" column not found'
338
+ if is_tick:
339
+ df = df.loc[:, ['ts', 'side', 'volume', 'price']]
340
+ else:
341
+ df = df.loc[:, ['ts', 'open', 'high', 'low', 'close', 'volume']]
342
+ return df
343
+
344
+
345
+ def _organize_columns(df: pd.DataFrame, pdt: str, resolution: ExtendedResolution) -> pd.DataFrame:
346
+ """Organizes the columns of a DataFrame.
347
+ Moving 'ts', 'product', 'resolution' to the leftmost side.
348
+ """
349
+ df['product'] = pdt
350
+ df['resolution'] = repr(resolution)
351
+ left_cols = ['ts', 'product', 'resolution']
352
+ return df.reindex(left_cols + [col for col in df.columns if col not in left_cols], axis=1)
353
+
354
+
383
355
  def _convert_data_to_pandas_df(data: bytes | pd.DataFrame | pl.LazyFrame) -> pd.DataFrame:
384
356
  """Converts data to pandas DataFrame."""
385
357
  if isinstance(data, bytes):