pfeed 0.0.1.dev13__tar.gz → 0.0.1.dev14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/PKG-INFO +6 -6
  2. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/README.md +1 -1
  3. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/__init__.py +0 -2
  4. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/cli/commands/download.py +9 -5
  5. pfeed-0.0.1.dev14/pfeed/const/common.py +15 -0
  6. pfeed-0.0.1.dev14/pfeed/data_tools/data_tool_pandas.py +62 -0
  7. pfeed-0.0.1.dev14/pfeed/data_tools/data_tool_polars.py +65 -0
  8. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/datastore.py +1 -1
  9. pfeed-0.0.1.dev14/pfeed/etl.py +405 -0
  10. pfeed-0.0.1.dev14/pfeed/feeds/base_feed.py +296 -0
  11. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/feeds/bybit_feed.py +22 -24
  12. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/feeds/yahoo_finance_feed.py +1 -1
  13. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/filepath.py +29 -9
  14. pfeed-0.0.1.dev14/pfeed/resolution.py +62 -0
  15. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/sources/binance/download.py +2 -2
  16. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/sources/bybit/const.py +4 -1
  17. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/sources/bybit/download.py +31 -32
  18. pfeed-0.0.1.dev14/pfeed/sources/bybit/types.py +4 -0
  19. pfeed-0.0.1.dev14/pfeed/sources/bybit/utils.py +44 -0
  20. pfeed-0.0.1.dev14/pfeed/types/common_literals.py +13 -0
  21. pfeed-0.0.1.dev14/pfeed/utils/file_format.py +76 -0
  22. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/utils/utils.py +17 -0
  23. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pyproject.toml +6 -6
  24. pfeed-0.0.1.dev13/pfeed/const/common.py +0 -11
  25. pfeed-0.0.1.dev13/pfeed/etl.py +0 -319
  26. pfeed-0.0.1.dev13/pfeed/feeds/base_feed.py +0 -300
  27. pfeed-0.0.1.dev13/pfeed/sources/bybit/types.py +0 -5
  28. pfeed-0.0.1.dev13/pfeed/sources/bybit/utils.py +0 -19
  29. pfeed-0.0.1.dev13/pfeed/types/common_literals.py +0 -10
  30. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/LICENSE +0 -0
  31. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/cli/__init__.py +0 -0
  32. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/cli/commands/__init__.py +0 -0
  33. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/cli/commands/config.py +0 -0
  34. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/cli/commands/docker_compose.py +0 -0
  35. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/cli/commands/open.py +0 -0
  36. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/cli/commands/stream.py +0 -0
  37. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/cli/main.py +0 -0
  38. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/config_handler.py +0 -0
  39. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/const/paths.py +0 -0
  40. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/feeds/__init__.py +0 -0
  41. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/feeds/binance_feed.py +0 -0
  42. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/feeds/custom_csv_feed.py +0 -0
  43. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/main.py +0 -0
  44. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/sources/binance/__init__.py +0 -0
  45. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/sources/binance/api.py +0 -0
  46. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/sources/binance/const.py +0 -0
  47. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/sources/binance/stream.py +0 -0
  48. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/sources/bybit/__init__.py +0 -0
  49. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/sources/bybit/api.py +0 -0
  50. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/sources/bybit/stream.py +0 -0
  51. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/utils/monitor.py +0 -0
  52. {pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/utils/validate.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pfeed
3
- Version: 0.0.1.dev13
3
+ Version: 0.0.1.dev14
4
4
  Summary: Data pipeline for algo-trading, getting and storing both real-time and historical data made easy.
5
5
  Home-page: https://pfund.ai
6
6
  License: Apache-2.0
@@ -23,11 +23,11 @@ Requires-Dist: fastparquet (>=2024.5.0,<2025.0.0)
23
23
  Requires-Dist: minio (>=7.2.8,<8.0.0) ; extra == "data" or extra == "all"
24
24
  Requires-Dist: pandas (>=2.2.2,<3.0.0) ; extra == "df" or extra == "all"
25
25
  Requires-Dist: pfund (>=0.0.1.dev13,<0.0.2)
26
- Requires-Dist: polars (>=1.5.0,<2.0.0) ; extra == "df" or extra == "all"
26
+ Requires-Dist: polars (>=1.6.0,<2.0.0) ; extra == "df" or extra == "all"
27
27
  Requires-Dist: psutil (>=6.0.0,<7.0.0) ; extra == "data" or extra == "all"
28
- Requires-Dist: pyarrow (>=15.0.0,<16.0.0) ; extra == "boost" or extra == "all"
29
- Requires-Dist: ray (>=2.34.0,<3.0.0) ; extra == "boost" or extra == "all"
30
- Requires-Dist: s3fs (>=2024.6.1,<2025.0.0) ; extra == "data" or extra == "all"
28
+ Requires-Dist: pyarrow (>=15.0.0,<16.0.0) ; extra == "df" or extra == "all"
29
+ Requires-Dist: ray (>=2.35.0,<3.0.0) ; extra == "boost" or extra == "all"
30
+ Requires-Dist: s3fs (>=2024.9.0,<2025.0.0) ; extra == "data" or extra == "all"
31
31
  Requires-Dist: yfinance (>=0.2.43,<0.3.0)
32
32
  Project-URL: Documentation, https://pfeed-docs.pfund.ai
33
33
  Project-URL: Repository, https://github.com/PFund-Software-Ltd/pfeed
@@ -68,7 +68,7 @@ By leveraging modern data engineering tools, `pfeed` handles the tedious data wo
68
68
  PFeed (/piː fiːd/) is a data pipeline for algorithmic trading, serving as a bridge between raw data sources and traders by automating the process of data collection, cleaning, transformation, and storage, loading clean data into a **local data lake for quantitative analysis**.
69
69
 
70
70
  ## Core Features
71
- - [x] Unified approach for interacting with various data sources and obtaining historical and live data
71
+ - [x] Unified approach for interacting with various [data sources](#supported-data-sources) and obtaining historical and live data
72
72
  - [x] ETL data pipeline for transforming raw data to clean data and storing it in [MinIO] (optional)
73
73
  - [x] Fast data downloading, utilizing [Ray] for parallelization
74
74
  - [x] Supports multiple data tools (e.g. Pandas, [Polars], [Dask], [Spark], [DuckDB], [Daft])
@@ -33,7 +33,7 @@ By leveraging modern data engineering tools, `pfeed` handles the tedious data wo
33
33
  PFeed (/piː fiːd/) is a data pipeline for algorithmic trading, serving as a bridge between raw data sources and traders by automating the process of data collection, cleaning, transformation, and storage, loading clean data into a **local data lake for quantitative analysis**.
34
34
 
35
35
  ## Core Features
36
- - [x] Unified approach for interacting with various data sources and obtaining historical and live data
36
+ - [x] Unified approach for interacting with various [data sources](#supported-data-sources) and obtaining historical and live data
37
37
  - [x] ETL data pipeline for transforming raw data to clean data and storing it in [MinIO] (optional)
38
38
  - [x] Fast data downloading, utilizing [Ray] for parallelization
39
39
  - [x] Supports multiple data tools (e.g. Pandas, [Polars], [Dask], [Spark], [DuckDB], [Daft])
@@ -7,7 +7,6 @@ if TYPE_CHECKING:
7
7
  import importlib
8
8
  from importlib.metadata import version
9
9
 
10
- from pfeed import etl
11
10
  from pfeed.config_handler import configure, get_config
12
11
  from pfeed.const.common import ALIASES
13
12
  from pfeed.sources import bybit
@@ -55,7 +54,6 @@ __all__ = (
55
54
  "configure",
56
55
  "get_config",
57
56
  "ALIASES",
58
- "etl",
59
57
  "bybit",
60
58
  "binance",
61
59
  "YahooFinanceFeed",
@@ -3,7 +3,12 @@ import importlib
3
3
  import click
4
4
 
5
5
  import pfeed as pe
6
- from pfeed.const.common import ALIASES, SUPPORTED_DOWNLOAD_DATA_SOURCES, SUPPORTED_DATA_TYPES
6
+ from pfeed.const.common import (
7
+ ALIASES,
8
+ SUPPORTED_DOWNLOAD_DATA_SOURCES,
9
+ SUPPORTED_DATA_TYPES,
10
+ SUPPORTED_PRODUCT_TYPES,
11
+ )
7
12
 
8
13
 
9
14
  # add aliases to supported download data sources
@@ -15,11 +20,10 @@ SUPPORTED_DATA_TYPES_IMPLICIT_RAW_ALLOWED = SUPPORTED_DATA_TYPES + ['raw']
15
20
 
16
21
 
17
22
  @click.command()
18
- @click.pass_context
19
23
  @click.option('--data-source', '-d', required=True, type=click.Choice(SUPPORTED_DOWNLOAD_DATA_SOURCES_ALIASES_INCLUDED, case_sensitive=False), help='Data source')
20
- @click.option('--dtypes', '--dt', 'dtypes', multiple=True, default=['raw'], type=click.Choice(SUPPORTED_DATA_TYPES_IMPLICIT_RAW_ALLOWED, case_sensitive=False), help=f'{SUPPORTED_DATA_TYPES=}. How to pass in multiple values: --dt raw --dt tick')
21
24
  @click.option('--pdts', '-p', 'pdts', multiple=True, default=[], help='List of trading products')
22
- @click.option('--ptypes', '--pt', 'ptypes', multiple=True, default=[], help='List of product types, e.g. PERP = get all perpetuals')
25
+ @click.option('--dtypes', '--dt', 'dtypes', multiple=True, default=['raw'], type=click.Choice(SUPPORTED_DATA_TYPES_IMPLICIT_RAW_ALLOWED, case_sensitive=False), help=f'{SUPPORTED_DATA_TYPES=}. How to pass in multiple values: --dt raw --dt tick')
26
+ @click.option('--ptypes', '--pt', 'ptypes', multiple=True, default=[], type=click.Choice(SUPPORTED_PRODUCT_TYPES, case_sensitive=False), help='List of product types, e.g. PERP = get all perpetuals')
23
27
  @click.option('--start-date', '-s', type=click.DateTime(formats=["%Y-%m-%d"]), help='Start date in YYYY-MM-DD format')
24
28
  @click.option('--end-date', '-e', type=click.DateTime(formats=["%Y-%m-%d"]), help='End date in YYYY-MM-DD format')
25
29
  @click.option('--num-cpus', '-n', default=8, type=int, help="number of logical CPUs used for Ray's tasks")
@@ -27,7 +31,7 @@ SUPPORTED_DATA_TYPES_IMPLICIT_RAW_ALLOWED = SUPPORTED_DATA_TYPES + ['raw']
27
31
  @click.option('--no-ray', is_flag=True, help='if enabled, Ray will not be used')
28
32
  @click.option('--env-file', 'env_file_path', type=click.Path(exists=True), help='Path to the .env file')
29
33
  @click.option('--debug', is_flag=True, help='if enabled, debug mode will be enabled where logs at DEBUG level will be printed')
30
- def download(data_source, dtypes, pdts, ptypes, start_date, end_date, num_cpus, no_ray, use_minio, env_file_path, debug):
34
+ def download(data_source, pdts, dtypes, ptypes, start_date, end_date, num_cpus, no_ray, use_minio, env_file_path, debug):
31
35
  pe.configure(env_file_path=env_file_path, debug=debug)
32
36
  data_source = ALIASES.get(data_source, data_source)
33
37
  pipeline = importlib.import_module(f'pfeed.sources.{data_source.lower()}.download')
@@ -0,0 +1,15 @@
1
# Closed sets of names that the rest of the package validates its inputs against.
SUPPORTED_ENVIRONMENTS = [
    'BACKTEST',
    'SANDBOX',
    'PAPER',
    'LIVE',
]
SUPPORTED_DATA_FEEDS = [
    'YAHOO_FINANCE',
    'BYBIT',
    'BINANCE',
]
SUPPORTED_STORAGES = [
    'local',
    'minio',
]
SUPPORTED_DOWNLOAD_DATA_SOURCES = [
    'BYBIT',
    'BINANCE',
]
SUPPORTED_CRYPTO_EXCHANGES = [
    'BYBIT',
    'BINANCE',
]
SUPPORTED_DATA_TOOLS = [
    'pandas',
    'polars',
]
SUPPORTED_PRODUCT_TYPES = [
    'SPOT',
    'PERP',
    'IPERP',
    'FUT',
    'IFUT',
]
# every data type exists in a 'raw_' flavour and a plain flavour
SUPPORTED_DATA_TYPES = [
    'raw_tick',
    'raw_second',
    'raw_minute',
    'raw_hour',
    'raw_daily',
    'tick',
    'second',
    'minute',
    'hour',
    'daily',
]

# short name -> full data source name
ALIASES = {'YF': 'YAHOO_FINANCE'}
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+ from typing import TYPE_CHECKING
3
+ if TYPE_CHECKING:
4
+ from pfeed.resolution import ExtendedResolution
5
+ from pfeed.types.common_literals import tSUPPORTED_STORAGES
6
+
7
+ import os
8
+ import io
9
+
10
+ import s3fs
11
+ import pandas as pd
12
+
13
+ from pfeed.const.common import SUPPORTED_STORAGES
14
+
15
+
16
+ name = 'pandas'
17
+
18
+
19
def read_parquet(path_or_obj: str | bytes, *args, storage: tSUPPORTED_STORAGES='local', **kwargs) -> pd.DataFrame:
    """Read parquet data into a pandas DataFrame.

    Args:
        path_or_obj: Raw parquet content as bytes, or a path to a parquet file.
        *args: Forwarded to ``pd.read_parquet``.
        storage: Where a path is read from ('local' or 'minio'). For bytes
            input it is only validated.
        **kwargs: Forwarded to ``pd.read_parquet``.

    Returns:
        The data loaded by ``pd.read_parquet``.

    Raises:
        AssertionError: If ``storage`` is not in SUPPORTED_STORAGES.
        NotImplementedError: If ``storage`` passes validation but is not handled here.
    """
    assert storage in SUPPORTED_STORAGES, f'{storage=} not in {SUPPORTED_STORAGES}'
    if isinstance(path_or_obj, bytes):
        # in-memory content: hand pandas a file-like object
        return pd.read_parquet(io.BytesIO(path_or_obj), *args, **kwargs)
    if storage == 'local':
        return pd.read_parquet(path_or_obj, *args, **kwargs)
    if storage == 'minio':
        # connection settings come from the environment, with fallbacks
        minio_host = os.getenv('MINIO_HOST', 'localhost')
        minio_port = os.getenv('MINIO_PORT', '9000')
        minio_fs = s3fs.S3FileSystem(
            endpoint_url="http://" + minio_host + ':' + minio_port,
            key=os.getenv('MINIO_ROOT_USER', 'pfunder'),
            secret=os.getenv('MINIO_ROOT_PASSWORD', 'password'),
        )
        return pd.read_parquet(path_or_obj, *args, filesystem=minio_fs, **kwargs)
    raise NotImplementedError(f'{storage=}')
37
+
38
+
39
def concat(dfs: list[pd.DataFrame], *args, **kwargs) -> pd.DataFrame:
    """Concatenate pandas DataFrames; extra arguments are forwarded to ``pd.concat``."""
    combined = pd.concat(dfs, *args, **kwargs)
    return combined
41
+
42
+
43
def estimate_memory_usage(df: pd.DataFrame) -> float:
    """Estimate the memory usage of a pandas DataFrame in GB.

    Uses ``DataFrame.memory_usage(deep=True)`` summed over all entries.
    """
    total_bytes = df.memory_usage(deep=True).sum()
    bytes_per_gb = 1024 ** 3
    return total_bytes / bytes_per_gb
46
+
47
+
48
def organize_time_series_columns(pdt: str, resolution: str | ExtendedResolution, df: pd.DataFrame) -> pd.DataFrame:
    """Add 'product' and 'resolution' columns and move them, with 'ts', to the left.

    The input DataFrame is left untouched; a new DataFrame is returned.

    Args:
        pdt: Value written to every row of the new 'product' column.
        resolution: Resolution whose ``repr`` is written to the new
            'resolution' column. A string is first converted to an
            ExtendedResolution.
        df: DataFrame that has a 'ts' column and neither a 'product' nor a
            'resolution' column.

    Returns:
        A DataFrame whose columns are 'ts', 'product', 'resolution', followed
        by the remaining columns in their original order.

    Raises:
        AssertionError: If 'ts' is missing, or 'product'/'resolution' already exist.
    """
    assert 'ts' in df.columns, "'ts' column not found"
    assert 'product' not in df.columns, "'product' column already exists"
    assert 'resolution' not in df.columns, "'resolution' column already exists"
    if isinstance(resolution, str):
        # imported lazily: only needed when a string has to be parsed
        from pfeed.resolution import ExtendedResolution
        resolution = ExtendedResolution(resolution)
    left_cols = ['ts', 'product', 'resolution']
    # assign() returns a new frame, so the caller's DataFrame is not modified
    df = df.assign(product=pdt, resolution=repr(resolution))
    return df.reindex(left_cols + [col for col in df.columns if col not in left_cols], axis=1)
@@ -0,0 +1,65 @@
1
+ from __future__ import annotations
2
+ from typing import TYPE_CHECKING
3
+ if TYPE_CHECKING:
4
+ from pfeed.resolution import ExtendedResolution
5
+ from pfeed.types.common_literals import tSUPPORTED_STORAGES
6
+
7
+ import os
8
+
9
+ import polars as pl
10
+
11
+ from pfeed.const.common import SUPPORTED_STORAGES
12
+
13
+
14
+ name = 'polars'
15
+
16
+
17
def read_parquet(path_or_obj: str | bytes, *args, storage: tSUPPORTED_STORAGES='local', **kwargs) -> pl.DataFrame | pl.LazyFrame:
    """Read parquet data with polars.

    Bytes are read with ``pl.read_parquet``; paths are opened with
    ``pl.scan_parquet``.

    Args:
        path_or_obj: Raw parquet content as bytes, or a path to a parquet file.
        *args: Forwarded to the polars reader.
        storage: Where a path is read from ('local' or 'minio'). For bytes
            input it is only validated.
        **kwargs: Forwarded to the polars reader.

    Raises:
        AssertionError: If ``storage`` is not in SUPPORTED_STORAGES.
        NotImplementedError: If ``storage`` passes validation but is not handled here.
    """
    assert storage in SUPPORTED_STORAGES, f'{storage=} not in {SUPPORTED_STORAGES}'
    if isinstance(path_or_obj, bytes):
        return pl.read_parquet(path_or_obj, *args, **kwargs)
    if storage == 'local':
        return pl.scan_parquet(path_or_obj, *args, **kwargs)
    if storage == 'minio':
        # connection settings come from the environment, with fallbacks
        minio_host = os.getenv('MINIO_HOST', 'localhost')
        minio_port = os.getenv('MINIO_PORT', '9000')
        minio_options = {
            "endpoint_url": "http://" + minio_host + ':' + minio_port,
            "access_key_id": os.getenv('MINIO_ROOT_USER', 'pfunder'),
            "secret_access_key": os.getenv('MINIO_ROOT_PASSWORD', 'password'),
        }
        return pl.scan_parquet(path_or_obj, *args, storage_options=minio_options, **kwargs)
    raise NotImplementedError(f'{storage=}')
35
+
36
+
37
def concat(dfs: list[pl.DataFrame | pl.LazyFrame], *args, **kwargs) -> pl.DataFrame | pl.LazyFrame:
    """Concatenate polars frames; extra arguments are forwarded to ``pl.concat``."""
    combined = pl.concat(dfs, *args, **kwargs)
    return combined
39
+
40
+
41
def estimate_memory_usage(df: pl.DataFrame | pl.LazyFrame) -> float:
    """Estimate the memory usage of a polars DataFrame in GB.

    A LazyFrame is collected first so that its size can be estimated.
    """
    eager = df.collect() if isinstance(df, pl.LazyFrame) else df
    return eager.estimated_size(unit='gb')
46
+
47
+
48
def organize_time_series_columns(pdt: str, resolution: str | ExtendedResolution, df: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame | pl.LazyFrame:
    """Add 'product' and 'resolution' columns and move them, with 'ts', to the left.

    Args:
        pdt: Literal value for the new 'product' column.
        resolution: Resolution whose ``repr`` becomes the literal value of the
            new 'resolution' column. A string is first converted to an
            ExtendedResolution.
        df: Frame that has a 'ts' column and neither a 'product' nor a
            'resolution' column.

    Returns:
        The frame with columns 'ts', 'product', 'resolution', then the rest.

    Raises:
        AssertionError: If 'ts' is missing, or 'product'/'resolution' already exist.
    """
    from pfeed.resolution import ExtendedResolution
    existing_cols = df.collect_schema().names() if isinstance(df, pl.LazyFrame) else df.columns
    assert 'ts' in existing_cols, "'ts' column not found"
    assert 'product' not in existing_cols, "'product' column already exists"
    assert 'resolution' not in existing_cols, "'resolution' column already exists"
    if isinstance(resolution, str):
        resolution = ExtendedResolution(resolution)
    leading = ['ts', 'product', 'resolution']
    tagged = df.with_columns(
        pl.lit(pdt).alias('product'),
        pl.lit(repr(resolution)).alias('resolution'),
    )
    trailing = [name for name in tagged.collect_schema().names() if name not in leading]
    return tagged.select(leading + trailing)
@@ -28,7 +28,7 @@ def assert_if_minio_running():
28
28
  if response.status_code != 200:
29
29
  raise MinioException(f"Unhandled response: {response.status_code=} {response.content} {response}")
30
30
  except (ReadTimeout, RequestException) as e:
31
- raise MinioException(f"MinIO is not running or not detected on {endpoint}: {e}")
31
+ raise MinioException(f"MinIO is not running or not detected on {endpoint}: {e}, please use 'pfeed docker-compose up -d' to start MinIO")
32
32
 
33
33
 
34
34
  class Datastore:
@@ -0,0 +1,405 @@
1
+ '''ETL = Extract, Transform, Load data.
2
+ Except extracting and loading data, this module uses "pandas" for data transformation.
3
+ '''
4
+ from __future__ import annotations
5
+ from typing import TYPE_CHECKING, Literal
6
+ if TYPE_CHECKING:
7
+ from pfeed.types.common_literals import (
8
+ tSUPPORTED_ENVIRONMENTS,
9
+ tSUPPORTED_DOWNLOAD_DATA_SOURCES,
10
+ tSUPPORTED_STORAGES,
11
+ tSUPPORTED_DATA_TOOLS,
12
+ )
13
+ from pfeed.resolution import ExtendedResolution
14
+ tOUTPUT_FORMATS = Literal['bytes'] | tSUPPORTED_DATA_TOOLS
15
+
16
+ import logging
17
+ import importlib
18
+
19
+ try:
20
+ import pandas as pd
21
+ import polars as pl
22
+ except ImportError:
23
+ pass
24
+
25
+ from pfeed.datastore import Datastore
26
+ from pfeed.filepath import FilePath
27
+ from pfeed.config_handler import get_config
28
+ from pfeed.const.common import (
29
+ SUPPORTED_ENVIRONMENTS,
30
+ SUPPORTED_STORAGES,
31
+ SUPPORTED_DOWNLOAD_DATA_SOURCES,
32
+ SUPPORTED_DATA_TOOLS,
33
+ )
34
+ from pfeed.types.common_literals import tSUPPORTED_DATA_TOOLS
35
+ from pfeed.utils.utils import derive_trading_venue
36
+ from pfeed.utils.file_format import read_raw_data
37
+
38
+ try:
39
+ from pfeed.utils.monitor import print_disk_usage
40
+ except ImportError:
41
+ print_disk_usage = None
42
+
43
+
44
+ OUTPUT_FORMATS = ['bytes'] + SUPPORTED_DATA_TOOLS
45
+ DataFrame = pd.DataFrame | pl.DataFrame | pl.LazyFrame
46
+
47
+
48
+ __all__ = [
49
+ 'get_data',
50
+ 'extract_data',
51
+ 'transform_data',
52
+ 'load_data',
53
+ 'clean_raw_data',
54
+ 'standardize_raw_data',
55
+ 'resample_data',
56
+ ]
57
+
58
+
59
def get_data(
    env: tSUPPORTED_ENVIRONMENTS,
    data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
    resolution: str | ExtendedResolution,
    pdt: str,
    date: str,
    trading_venue: str='',
    output_format: tOUTPUT_FORMATS='pandas',
) -> bytes | DataFrame | None:
    """Extract data without specifying where it is stored.

    Every storage in SUPPORTED_STORAGES is tried in order via extract_data();
    the first result that is not None is returned.

    Args:
        env: trading environment, e.g. 'PAPER' | 'LIVE'.
        data_source: The data source to extract data from.
        resolution: Data resolution. e.g. '1m' = 1 minute as the unit of each data bar/candle.
            Also supports raw resolution such as 'r1m', where 'r' stands for raw.
        pdt (str): product, e.g. BTC_USDT_PERP.
        date (str): The date of the data to extract.
        trading_venue (str): trading venue's name, e.g. exchange's name or dapp's name.
            When empty, it is derived from data_source.
        output_format: The format of the output data. Default is 'pandas'.
    Returns:
        bytes | DataFrame | None: The extracted data, or None if no storage has it.
    """
    try:
        from minio.error import MinioException
    except ImportError:
        # minio is not installed: fall back to the base exception class
        MinioException = Exception

    venue = trading_venue or derive_trading_venue(data_source)
    for storage in SUPPORTED_STORAGES:
        try:
            found = extract_data(env, storage, data_source, venue, resolution, pdt, date, output_format=output_format)
        except MinioException:
            # this storage could not be read; try the next one
            continue
        if found is not None:
            return found
    return None
97
+
98
+
99
def extract_data(
    env: tSUPPORTED_ENVIRONMENTS,
    storage: tSUPPORTED_STORAGES,
    data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
    trading_venue: str,
    resolution: str | ExtendedResolution,
    pdt: str,
    date: str,
    output_format: tOUTPUT_FORMATS='pandas',
) -> bytes | DataFrame | None:
    """
    Extracts the stored parquet data of one product and date from one storage.

    Args:
        env: trading environment, e.g. 'PAPER' | 'LIVE'.
        storage: The origin of the data (local or minio).
        data_source: The source of the data.
        trading_venue: trading venue's name, e.g. exchange's name or dapp's name
        resolution: Data resolution. e.g. '1m' = 1 minute as the unit of each data bar/candle.
            Also supports raw resolution such as 'r1m', where 'r' stands for raw.
        pdt (str): product, e.g. BTC_USDT_PERP.
        date (str): The date of the data.
        output_format: 'bytes' for the raw file content, otherwise the name of
            the data tool used to read the parquet data. Default is 'pandas'.
    Returns:
        bytes | DataFrame | None: The extracted data in the requested format.
            None when the local file does not exist; for MinIO, whatever empty
            value the datastore returned when the object has no content.

    Raises:
        AssertionError: If any of the input parameters are invalid.
        NotImplementedError: If the data origin is not supported.
        MinioException: If MinIO is not running / set up correctly.
    """
    from pfeed.resolution import ExtendedResolution

    # the logger name uses data_source as passed in, lower-cased
    logger = logging.getLogger(data_source.lower() + '_data')

    env = env.upper()
    storage = storage.lower()
    data_source = data_source.upper()
    pdt = pdt.upper()
    output_format = output_format.lower()
    assert env in SUPPORTED_ENVIRONMENTS, f'Invalid {env=}, {SUPPORTED_ENVIRONMENTS=}'
    assert storage in SUPPORTED_STORAGES, f'Invalid {storage=}, {SUPPORTED_STORAGES=}'
    assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'
    assert output_format in OUTPUT_FORMATS, f'Invalid {output_format=}, {OUTPUT_FORMATS=}'
    if isinstance(resolution, str):
        resolution = ExtendedResolution(resolution)

    wants_bytes = output_format == 'bytes'
    if not wants_bytes:
        # a data tool module is only needed when a DataFrame is requested
        data_tool = importlib.import_module(f'pfeed.data_tools.data_tool_{output_format.lower()}')
    config = get_config()
    fp = FilePath(env, data_source, trading_venue, pdt, resolution, date, file_extension='.parquet', data_path=config.data_path)

    if storage == 'local':
        if not fp.exists():
            logger.debug(f'failed to extract {data_source} {pdt} {date} {resolution} data from local path {fp.file_path}')
            return None
        if wants_bytes:
            with open(fp.file_path, 'rb') as parquet_file:
                local_data = parquet_file.read()
        else:
            local_data = data_tool.read_parquet(fp.file_path)
        logger.debug(f'extracted {data_source} {pdt} {date} {resolution} data from local path {fp.file_path}')
        return local_data

    if storage == 'minio':
        store = Datastore(storage)
        object_name = fp.storage_path
        payload = store.get_object(object_name)
        if not payload:
            logger.debug(f'failed to extract {data_source} {pdt} {date} {resolution} data from MinIO object {object_name}')
            return payload
        if not wants_bytes:
            # the object exists: let the data tool read it from its s3 path
            s3_path = "s3://" + store.BUCKET_NAME + "/" + object_name
            payload = data_tool.read_parquet(s3_path, storage='minio')
        logger.debug(f'extracted {data_source} {pdt} {date} {resolution} data from MinIO object {object_name}')
        return payload

    raise NotImplementedError(f'{storage=}')
171
+
172
+
173
def transform_data(
    data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
    data: bytes | pd.DataFrame | pl.LazyFrame,
    data_resolution: str | ExtendedResolution,
    target_resolution: str | ExtendedResolution,
) -> bytes | pd.DataFrame | pl.LazyFrame:
    """Transforms data to a target resolution.

    Args:
        data_source: The source of the data.
        data: The data to transform.
        data_resolution: Resolution of ``data``.
        target_resolution: Resolution to transform ``data`` into.

    Returns:
        ``data`` unchanged when both resolutions are equal; the standardized
        data when the target is a tick resolution; otherwise the standardized
        data resampled to the target resolution.

    Raises:
        AssertionError: If data_source is unsupported or the
            ``data_resolution.is_ge(target_resolution)`` check fails.
        Exception: If the two resolutions differ and are both raw.
    """
    from pfeed.resolution import ExtendedResolution
    if isinstance(data_resolution, str):
        data_resolution = ExtendedResolution(data_resolution)
    if isinstance(target_resolution, str):
        target_resolution = ExtendedResolution(target_resolution)

    data_source = data_source.upper()
    assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'
    assert data_resolution.is_ge(target_resolution), f'{data_resolution=} is less than {target_resolution=}'

    if data_resolution == target_resolution:
        return data
    if data_resolution.is_raw() and target_resolution.is_raw():
        # e.g. 'r1t' -> 'r1m'
        raise Exception(f'{data_resolution=} and {target_resolution=} are both raw resolutions')

    standardized = standardize_raw_data(data, data_resolution.is_tick())
    if target_resolution.is_tick():
        return standardized
    return resample_data(standardized, target_resolution)
200
+
201
+
202
def load_data(
    env: tSUPPORTED_ENVIRONMENTS,
    storage: tSUPPORTED_STORAGES,
    data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
    trading_venue: str,
    data: bytes,
    resolution: str | ExtendedResolution,
    pdt: str,
    date: str,
    **kwargs
) -> None:
    """
    Writes data to the specified storage as a '.parquet' file/object.

    Args:
        env: trading environment, e.g. 'PAPER' | 'LIVE'.
        storage: The destination where the data will be loaded.
            It can be either 'local' or 'minio'.
        data_source: The source of the data.
        trading_venue: trading venue's name, e.g. exchange's name or dapp's name
        data (bytes): The data to be loaded.
        resolution: Data resolution. e.g. '1m' = 1 minute as the unit of each data bar/candle.
            Also supports raw resolution such as 'r1m', where 'r' stands for raw.
        pdt (str): product, e.g. BTC_USDT_PERP.
        date (str): The date of the data.
        **kwargs: Additional keyword arguments for MinIO.

    Returns:
        None

    Raises:
        AssertionError: If any of the input parameters are invalid.
        NotImplementedError: If the specified data destination is not implemented.
        MinioException: If MinIO is not running / set up correctly.
    """
    from pfeed.resolution import ExtendedResolution

    # the logger name uses data_source as passed in, lower-cased
    logger = logging.getLogger(data_source.lower() + '_data')

    env = env.upper()
    storage = storage.lower()
    data_source = data_source.upper()
    pdt = pdt.upper()
    assert env in SUPPORTED_ENVIRONMENTS, f'Invalid {env=}, {SUPPORTED_ENVIRONMENTS=}'
    assert storage in SUPPORTED_STORAGES, f'Invalid {storage=}, {SUPPORTED_STORAGES=}'
    assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'
    if isinstance(resolution, str):
        resolution = ExtendedResolution(resolution)

    config = get_config()
    fp = FilePath(env, data_source, trading_venue, pdt, resolution, date, file_extension='.parquet', data_path=config.data_path)
    if storage == 'local':
        # make sure the target directory exists before writing
        fp.parent.mkdir(parents=True, exist_ok=True)
        with open(fp.file_path, 'wb') as out_file:
            out_file.write(data)
        logger.info(f'loaded {data_source} data to {fp.file_path}')
    elif storage == 'minio':
        object_name = fp.storage_path
        Datastore(storage).put_object(object_name, data, **kwargs)
        logger.info(f'loaded {data_source} data to MinIO object {object_name} {kwargs=}')
    else:
        raise NotImplementedError(f'{storage=}')

    # print_disk_usage is None when pfeed.utils.monitor could not be imported
    if print_disk_usage:
        print_disk_usage(config.data_path)
265
+
266
+
267
def clean_raw_data(
    data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
    data: bytes,
) -> bytes:
    '''
    Cleans raw data by renaming columns, mapping columns, and converting timestamp.
    bytes (any format, e.g. csv.gzip) in, bytes (parquet file) out.

    Args:
        data_source: The source of the data (case-insensitive).
        data (bytes): The raw data to be cleaned.

    Returns:
        bytes: The cleaned raw data.

    Raises:
        AssertionError: If data_source is not a supported download data source.
    '''
    # normalize the case first, as the other ETL functions do, so that e.g. 'bybit' is accepted
    data_source = data_source.upper()
    assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'

    # each data source ships its own constants and helpers
    const = importlib.import_module(f'pfeed.sources.{data_source.lower()}.const')
    utils = importlib.import_module(f'pfeed.sources.{data_source.lower()}.utils')

    df: pd.DataFrame = _convert_data_to_pandas_df(data)
    # both constants are optional; a missing or empty one skips its step
    if RENAMING_COLS := getattr(const, 'RENAMING_COLS', {}):
        df = df.rename(columns=RENAMING_COLS)
    if MAPPING_COLS := getattr(const, 'MAPPING_COLS', {}):
        # values of the 'side' column are translated through MAPPING_COLS
        df['side'] = df['side'].map(MAPPING_COLS)
    df = utils.standardize_ts_column(df)
    return _handle_result(data, df)
294
+
295
+
296
def standardize_raw_data(
    data: bytes | pd.DataFrame | pl.LazyFrame,
    is_tick: bool
) -> bytes | pd.DataFrame | pl.LazyFrame:
    """Filter out unnecessary columns from raw data.

    Args:
        data: The cleaned raw data.
        is_tick: If True, keep 'ts', 'side', 'volume' and 'price';
            otherwise keep 'ts', 'open', 'high', 'low', 'close' and 'volume'.

    Returns:
        bytes | pd.DataFrame | pl.LazyFrame: The standardized data.

    Raises:
        AssertionError: If the data has no 'ts' column.
    """
    frame = _convert_data_to_pandas_df(data)
    assert 'ts' in frame.columns, 'ts column not found, please check if the raw data has been cleaned'
    kept_cols = (
        ['ts', 'side', 'volume', 'price']
        if is_tick
        else ['ts', 'open', 'high', 'low', 'close', 'volume']
    )
    return _handle_result(data, frame.loc[:, kept_cols])
315
+
316
+
317
def resample_data(
    data: bytes | pd.DataFrame | pl.LazyFrame,
    resolution: str | ExtendedResolution,
) -> bytes | pd.DataFrame | pl.LazyFrame:
    '''
    Resamples the input data to the specified resolution.
    The result is returned via _handle_result(), i.e. in the same format as the input.
    A pandas DataFrame passed as input is not modified.

    Args:
        data: The input data to be resampled. It must have a 'ts' column;
            data with a 'price' column is treated as tick data,
            anything else as open/high/low/close/volume bars.
        resolution (str | Resolution): The resolution at which the data should be resampled.
            if string, it should be in the format of "# + unit (s/m/h/d)", e.g. "1s".

    Raises:
        AssertionError: If the data is empty.
    '''
    from pfeed.resolution import ExtendedResolution

    # standardize resolution by following pfund's standard, e.g. '1minute' -> '1m'
    if isinstance(resolution, str):
        resolution = ExtendedResolution(resolution)

    # converts to pandas's resolution format
    eresolution = repr(resolution)

    # 'min' means minute in pandas, please refer to https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
    eresolution = eresolution.replace('m', 'min')
    eresolution = eresolution.replace('d', 'D')

    df: pd.DataFrame = _convert_data_to_pandas_df(data)

    is_tick_data = 'price' in df.columns
    assert not df.empty, 'data is empty'
    # A pandas input is returned as-is by _convert_data_to_pandas_df, so an
    # in-place set_index would alter the caller's DataFrame; build a new one instead.
    df = df.set_index('ts')

    if is_tick_data:
        resample_logic = {
            'price': 'ohlc',
            'volume': 'sum',
        }
    else:
        resample_logic = {
            'open': 'first',
            'high': 'max',
            'low': 'min',
            'close': 'last',
            'volume': 'sum',
        }

    # optional columns are aggregated only when present
    if 'dividends' in df.columns:
        resample_logic['dividends'] = 'sum'
    if 'splits' in df.columns:
        resample_logic['splits'] = 'prod'

    resampled_df = (
        df
        .resample(eresolution)
        .apply(resample_logic)
    )

    if is_tick_data:
        # drop an unnecessary level created by 'ohlc' in the resample_logic
        resampled_df = resampled_df.droplevel(0, axis=1)

    resampled_df.dropna(inplace=True)
    resampled_df.reset_index(inplace=True)

    return _handle_result(data, resampled_df)
381
+
382
+
383
def _convert_data_to_pandas_df(data: bytes | pd.DataFrame | pl.LazyFrame) -> pd.DataFrame:
    """Converts data to pandas DataFrame.

    bytes are parsed with read_raw_data(), a pandas DataFrame is returned
    as-is (same object), and a polars LazyFrame is collected and converted.

    Raises:
        TypeError: If data is none of the supported types.
    """
    if isinstance(data, bytes):
        return read_raw_data(data)
    if isinstance(data, pd.DataFrame):
        return data
    if isinstance(data, pl.LazyFrame):
        return data.collect().to_pandas()
    raise TypeError(f'Invalid data type {type(data)}, expected bytes or pd.DataFrame or pl.LazyFrame')
394
+
395
+
396
def _handle_result(input_data: bytes | pd.DataFrame | pl.LazyFrame, output_df: pd.DataFrame) -> bytes | pd.DataFrame | pl.LazyFrame:
    """Outputs the data in the same format as the input data.

    Args:
        input_data: The original input; only its type is inspected.
        output_df: The pandas DataFrame to return in that format.

    Returns:
        zstd-compressed parquet bytes for bytes input, ``output_df`` itself
        for pandas input, or a polars LazyFrame for LazyFrame input.

    Raises:
        TypeError: If input_data is none of the supported types.
    """
    if isinstance(input_data, bytes):
        return output_df.to_parquet(compression='zstd')
    if isinstance(input_data, pd.DataFrame):
        return output_df
    if isinstance(input_data, pl.LazyFrame):
        return pl.from_pandas(output_df).lazy()
    raise TypeError(f'Invalid data type {type(input_data)}, expected bytes or pd.DataFrame or pl.LazyFrame')