PyPI - pfeed - Versions diffs - 0.0.1.dev13__tar.gz → 0.0.1.dev14__tar.gz - Mend

pfeed 0.0.1.dev13__tar.gz → 0.0.1.dev14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

{pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pfeed
-Version: 0.0.1.dev13
+Version: 0.0.1.dev14
 Summary: Data pipeline for algo-trading, getting and storing both real-time and historical data made easy.
 Home-page: https://pfund.ai
 License: Apache-2.0
@@ -23,11 +23,11 @@ Requires-Dist: fastparquet (>=2024.5.0,<2025.0.0)
 Requires-Dist: minio (>=7.2.8,<8.0.0) ; extra == "data" or extra == "all"
 Requires-Dist: pandas (>=2.2.2,<3.0.0) ; extra == "df" or extra == "all"
 Requires-Dist: pfund (>=0.0.1.dev13,<0.0.2)
-Requires-Dist: polars (>=1.5.0,<2.0.0) ; extra == "df" or extra == "all"
+Requires-Dist: polars (>=1.6.0,<2.0.0) ; extra == "df" or extra == "all"
 Requires-Dist: psutil (>=6.0.0,<7.0.0) ; extra == "data" or extra == "all"
-Requires-Dist: pyarrow (>=15.0.0,<16.0.0) ; extra == "boost" or extra == "all"
-Requires-Dist: ray (>=2.34.0,<3.0.0) ; extra == "boost" or extra == "all"
-Requires-Dist: s3fs (>=2024.6.1,<2025.0.0) ; extra == "data" or extra == "all"
+Requires-Dist: pyarrow (>=15.0.0,<16.0.0) ; extra == "df" or extra == "all"
+Requires-Dist: ray (>=2.35.0,<3.0.0) ; extra == "boost" or extra == "all"
+Requires-Dist: s3fs (>=2024.9.0,<2025.0.0) ; extra == "data" or extra == "all"
 Requires-Dist: yfinance (>=0.2.43,<0.3.0)
 Project-URL: Documentation, https://pfeed-docs.pfund.ai
 Project-URL: Repository, https://github.com/PFund-Software-Ltd/pfeed
@@ -68,7 +68,7 @@ By leveraging modern data engineering tools, `pfeed` handles the tedious data wo
 PFeed (/piː fiːd/) is a data pipeline for algorithmic trading, serving as a bridge between raw data sources and traders by automating the process of data collection, cleaning, transformation, and storage, loading clean data into a **local data lake for quantitative analysis**.
 ## Core Features
-- [x] Unified approach for interacting with various data sources and obtaining historical and live data
+- [x] Unified approach for interacting with various [data sources](#supported-data-sources) and obtaining historical and live data
 - [x] ETL data pipeline for transforming raw data to clean data and storing it in [MinIO] (optional)
 - [x] Fast data downloading, utilizing [Ray] for parallelization
 - [x] Supports multiple data tools (e.g. Pandas, [Polars], [Dask], [Spark], [DuckDB], [Daft])

{pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/README.md RENAMED Viewed

@@ -33,7 +33,7 @@ By leveraging modern data engineering tools, `pfeed` handles the tedious data wo
 PFeed (/piː fiːd/) is a data pipeline for algorithmic trading, serving as a bridge between raw data sources and traders by automating the process of data collection, cleaning, transformation, and storage, loading clean data into a **local data lake for quantitative analysis**.
 ## Core Features
-- [x] Unified approach for interacting with various data sources and obtaining historical and live data
+- [x] Unified approach for interacting with various [data sources](#supported-data-sources) and obtaining historical and live data
 - [x] ETL data pipeline for transforming raw data to clean data and storing it in [MinIO] (optional)
 - [x] Fast data downloading, utilizing [Ray] for parallelization
 - [x] Supports multiple data tools (e.g. Pandas, [Polars], [Dask], [Spark], [DuckDB], [Daft])

{pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/__init__.py RENAMED Viewed

@@ -7,7 +7,6 @@ if TYPE_CHECKING:
 import importlib
 from importlib.metadata import version
-from pfeed import etl
 from pfeed.config_handler import configure, get_config
 from pfeed.const.common import ALIASES
 from pfeed.sources import bybit
@@ -55,7 +54,6 @@ __all__ = (
     "configure",
     "get_config",
     "ALIASES",
-    "etl",
     "bybit",
     "binance",
     "YahooFinanceFeed",

{pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/cli/commands/download.py RENAMED Viewed

@@ -3,7 +3,12 @@ import importlib
 import click
 import pfeed as pe
-from pfeed.const.common import ALIASES, SUPPORTED_DOWNLOAD_DATA_SOURCES, SUPPORTED_DATA_TYPES
+from pfeed.const.common import (
+    ALIASES,
+    SUPPORTED_DOWNLOAD_DATA_SOURCES,
+    SUPPORTED_DATA_TYPES,
+    SUPPORTED_PRODUCT_TYPES,
+)
 # add aliases to supported download data sources
@@ -15,11 +20,10 @@ SUPPORTED_DATA_TYPES_IMPLICIT_RAW_ALLOWED = SUPPORTED_DATA_TYPES + ['raw']
 @click.command()
-@click.pass_context
 @click.option('--data-source', '-d', required=True, type=click.Choice(SUPPORTED_DOWNLOAD_DATA_SOURCES_ALIASES_INCLUDED, case_sensitive=False), help='Data source')
-@click.option('--dtypes', '--dt', 'dtypes', multiple=True, default=['raw'], type=click.Choice(SUPPORTED_DATA_TYPES_IMPLICIT_RAW_ALLOWED, case_sensitive=False), help=f'{SUPPORTED_DATA_TYPES=}. How to pass in multiple values: --dt raw --dt tick')
 @click.option('--pdts', '-p', 'pdts', multiple=True, default=[], help='List of trading products')
-@click.option('--ptypes', '--pt', 'ptypes', multiple=True, default=[], help='List of product types, e.g. PERP = get all perpetuals')
+@click.option('--dtypes', '--dt', 'dtypes', multiple=True, default=['raw'], type=click.Choice(SUPPORTED_DATA_TYPES_IMPLICIT_RAW_ALLOWED, case_sensitive=False), help=f'{SUPPORTED_DATA_TYPES=}. How to pass in multiple values: --dt raw --dt tick')
+@click.option('--ptypes', '--pt', 'ptypes', multiple=True, default=[], type=click.Choice(SUPPORTED_PRODUCT_TYPES, case_sensitive=False), help='List of product types, e.g. PERP = get all perpetuals')
 @click.option('--start-date', '-s', type=click.DateTime(formats=["%Y-%m-%d"]), help='Start date in YYYY-MM-DD format')
 @click.option('--end-date', '-e', type=click.DateTime(formats=["%Y-%m-%d"]), help='End date in YYYY-MM-DD format')
 @click.option('--num-cpus', '-n', default=8, type=int, help="number of logical CPUs used for Ray's tasks")
@@ -27,7 +31,7 @@ SUPPORTED_DATA_TYPES_IMPLICIT_RAW_ALLOWED = SUPPORTED_DATA_TYPES + ['raw']
 @click.option('--no-ray', is_flag=True, help='if enabled, Ray will not be used')
 @click.option('--env-file', 'env_file_path', type=click.Path(exists=True), help='Path to the .env file')
 @click.option('--debug', is_flag=True, help='if enabled, debug mode will be enabled where logs at DEBUG level will be printed')
-def download(data_source, dtypes, pdts, ptypes, start_date, end_date, num_cpus, no_ray, use_minio, env_file_path, debug):
+def download(data_source, pdts, dtypes, ptypes, start_date, end_date, num_cpus, no_ray, use_minio, env_file_path, debug):
     pe.configure(env_file_path=env_file_path, debug=debug)
     data_source = ALIASES.get(data_source, data_source)
     pipeline = importlib.import_module(f'pfeed.sources.{data_source.lower()}.download')

pfeed-0.0.1.dev14/pfeed/const/common.py ADDED Viewed

@@ -0,0 +1,15 @@
+# Trading environments; the ETL helpers upper-case ``env`` and validate it against this list.
+SUPPORTED_ENVIRONMENTS = ['BACKTEST', 'SANDBOX', 'PAPER', 'LIVE']
+# Names of the data feeds known to the package.
+SUPPORTED_DATA_FEEDS = ['YAHOO_FINANCE', 'BYBIT', 'BINANCE']
+# Storage backends the ETL helpers can read from / write to.
+SUPPORTED_STORAGES = ['local', 'minio']
+# Data sources accepted by the download CLI and by the ETL functions.
+SUPPORTED_DOWNLOAD_DATA_SOURCES = ['BYBIT', 'BINANCE']
+SUPPORTED_CRYPTO_EXCHANGES = ['BYBIT', 'BINANCE']
+# Each data tool has a matching ``pfeed.data_tools.data_tool_<name>`` module.
+SUPPORTED_DATA_TOOLS = ['pandas', 'polars']
+# Product types offered as choices for the CLI ``--ptypes`` option.
+SUPPORTED_PRODUCT_TYPES = ['SPOT', 'PERP', 'IPERP', 'FUT', 'IFUT']
+# Data types offered by the CLI ``--dtypes`` option: 'raw_*' entries first, then their non-raw counterparts.
+SUPPORTED_DATA_TYPES = [
+    'raw_tick', 'raw_second', 'raw_minute', 'raw_hour', 'raw_daily',
+    'tick', 'second', 'minute', 'hour', 'daily',
+]
+# Short aliases mapped to their full data source names.
+ALIASES = {
+    'YF': 'YAHOO_FINANCE',
+}

pfeed-0.0.1.dev14/pfeed/data_tools/data_tool_pandas.py ADDED Viewed

@@ -0,0 +1,62 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from pfeed.resolution import ExtendedResolution
+    from pfeed.types.common_literals import tSUPPORTED_STORAGES
+import os
+import io
+import s3fs
+import pandas as pd
+from pfeed.const.common import SUPPORTED_STORAGES
+name = 'pandas'
+def read_parquet(path_or_obj: str | bytes, *args, storage: tSUPPORTED_STORAGES='local', **kwargs) -> pd.DataFrame:
+    """Read a parquet file into a pandas DataFrame.
+    Args:
+        path_or_obj: in-memory parquet content (bytes) or a path to a parquet file.
+        *args: forwarded to ``pd.read_parquet``.
+        storage: 'local' or 'minio'; only consulted when a path is given.
+        **kwargs: forwarded to ``pd.read_parquet``.
+    Returns:
+        The loaded DataFrame.
+    Raises:
+        AssertionError: if ``storage`` is not in SUPPORTED_STORAGES.
+        NotImplementedError: if ``storage`` is supported but has no reader here.
+    """
+    assert storage in SUPPORTED_STORAGES, f'{storage=} not in {SUPPORTED_STORAGES}'
+    # raw bytes are wrapped in a file-like buffer, whatever the storage is
+    if isinstance(path_or_obj, bytes):
+        return pd.read_parquet(io.BytesIO(path_or_obj), *args, **kwargs)
+    if storage == 'local':
+        return pd.read_parquet(path_or_obj, *args, **kwargs)
+    if storage != 'minio':
+        raise NotImplementedError(f'{storage=}')
+    # MinIO is reached through an S3 filesystem configured from environment variables
+    minio_host = os.getenv('MINIO_HOST', 'localhost')
+    minio_port = os.getenv('MINIO_PORT', '9000')
+    minio_fs = s3fs.S3FileSystem(
+        endpoint_url="http://"+minio_host+':'+minio_port,
+        key=os.getenv('MINIO_ROOT_USER', 'pfunder'),
+        secret=os.getenv('MINIO_ROOT_PASSWORD', 'password'),
+    )
+    return pd.read_parquet(path_or_obj, *args, filesystem=minio_fs, **kwargs)
+def concat(dfs: list[pd.DataFrame], *args, **kwargs) -> pd.DataFrame:
+    """Concatenate pandas DataFrames; extra arguments are forwarded to ``pd.concat``."""
+    combined = pd.concat(dfs, *args, **kwargs)
+    return combined
+def estimate_memory_usage(df: pd.DataFrame) -> float:
+    """Return the deep memory footprint of a pandas DataFrame in GB (bytes / 1024**3)."""
+    bytes_per_gb = 1024 ** 3
+    total_bytes = df.memory_usage(deep=True).sum()
+    return total_bytes / bytes_per_gb
+def organize_time_series_columns(pdt: str, resolution: str | ExtendedResolution, df: pd.DataFrame) -> pd.DataFrame:
+    """Add 'product' and 'resolution' columns and move them, with 'ts', to the leftmost side.
+    The input DataFrame is left untouched; a new DataFrame is returned.
+    Args:
+        pdt: product name written into the 'product' column.
+        resolution: resolution whose ``repr`` is written into the 'resolution' column;
+            a string is first converted to ExtendedResolution.
+        df: DataFrame that has a 'ts' column and no 'product' / 'resolution' columns.
+    Returns:
+        A DataFrame whose first columns are 'ts', 'product', 'resolution',
+        followed by the remaining columns in their original order.
+    Raises:
+        AssertionError: if 'ts' is missing, or 'product' / 'resolution' already exist.
+    """
+    assert 'ts' in df.columns, "'ts' column not found"
+    assert 'product' not in df.columns, "'product' column already exists"
+    assert 'resolution' not in df.columns, "'resolution' column already exists"
+    if isinstance(resolution, str):
+        # imported lazily: only needed when a string has to be parsed
+        from pfeed.resolution import ExtendedResolution
+        resolution = ExtendedResolution(resolution)
+    left_cols = ['ts', 'product', 'resolution']
+    # assign() builds a new frame, so the caller's DataFrame is not mutated
+    # (the polars counterpart also returns a new frame)
+    df = df.assign(product=pdt, resolution=repr(resolution))
+    df = df.reindex(left_cols + [col for col in df.columns if col not in left_cols], axis=1)
+    return df

pfeed-0.0.1.dev14/pfeed/data_tools/data_tool_polars.py ADDED Viewed

@@ -0,0 +1,65 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from pfeed.resolution import ExtendedResolution
+    from pfeed.types.common_literals import tSUPPORTED_STORAGES
+import os
+import polars as pl
+from pfeed.const.common import SUPPORTED_STORAGES
+name = 'polars'
+def read_parquet(path_or_obj: str | bytes, *args, storage: tSUPPORTED_STORAGES='local', **kwargs) -> pl.DataFrame | pl.LazyFrame:
+    """Read parquet data with polars.
+    Bytes are read eagerly with ``pl.read_parquet``; paths are scanned lazily with ``pl.scan_parquet``.
+    Args:
+        path_or_obj: in-memory parquet content (bytes) or a path to a parquet file.
+        *args: forwarded to the polars reader.
+        storage: 'local' or 'minio'; only consulted when a path is given.
+        **kwargs: forwarded to the polars reader.
+    Returns:
+        A DataFrame for bytes input, a LazyFrame for path input.
+    Raises:
+        AssertionError: if ``storage`` is not in SUPPORTED_STORAGES.
+        NotImplementedError: if ``storage`` is supported but has no reader here.
+    """
+    assert storage in SUPPORTED_STORAGES, f'{storage=} not in {SUPPORTED_STORAGES}'
+    if isinstance(path_or_obj, bytes):
+        return pl.read_parquet(path_or_obj, *args, **kwargs)
+    if storage == 'local':
+        return pl.scan_parquet(path_or_obj, *args, **kwargs)
+    if storage != 'minio':
+        raise NotImplementedError(f'{storage=}')
+    # MinIO connection settings come from environment variables
+    minio_host = os.getenv('MINIO_HOST', 'localhost')
+    minio_port = os.getenv('MINIO_PORT', '9000')
+    minio_options = {
+        "endpoint_url": "http://"+minio_host+':'+minio_port,
+        "access_key_id": os.getenv('MINIO_ROOT_USER', 'pfunder'),
+        "secret_access_key": os.getenv('MINIO_ROOT_PASSWORD', 'password'),
+    }
+    return pl.scan_parquet(path_or_obj, *args, storage_options=minio_options, **kwargs)
+def concat(dfs: list[pl.DataFrame | pl.LazyFrame], *args, **kwargs) -> pl.DataFrame | pl.LazyFrame:
+    """Concatenate polars frames; extra arguments are forwarded to ``pl.concat``."""
+    combined = pl.concat(dfs, *args, **kwargs)
+    return combined
+def estimate_memory_usage(df: pl.DataFrame | pl.LazyFrame) -> float:
+    """Estimate the memory usage of a polars frame in GB.
+    A LazyFrame is collected first, so the whole query is materialized.
+    """
+    materialized = df.collect() if isinstance(df, pl.LazyFrame) else df
+    return materialized.estimated_size(unit='gb')
+def organize_time_series_columns(pdt: str, resolution: str | ExtendedResolution, df: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame | pl.LazyFrame:
+    """Add 'product' and 'resolution' columns and move them, with 'ts', to the leftmost side.
+    Args:
+        pdt: product name written into the 'product' column.
+        resolution: resolution whose ``repr`` is written into the 'resolution' column;
+            a string is first converted to ExtendedResolution.
+        df: frame that has a 'ts' column and no 'product' / 'resolution' columns.
+    Returns:
+        A frame of the same kind as the input with reordered columns.
+    Raises:
+        AssertionError: if 'ts' is missing, or 'product' / 'resolution' already exist.
+    """
+    from pfeed.resolution import ExtendedResolution
+    # a LazyFrame exposes its column names through its schema
+    cols = df.collect_schema().names() if isinstance(df, pl.LazyFrame) else df.columns
+    assert 'ts' in cols, "'ts' column not found"
+    assert 'product' not in cols, "'product' column already exists"
+    assert 'resolution' not in cols, "'resolution' column already exists"
+    if isinstance(resolution, str):
+        resolution = ExtendedResolution(resolution)
+    left_cols = ['ts', 'product', 'resolution']
+    tagged = df.with_columns(
+        pl.lit(pdt).alias('product'),
+        pl.lit(repr(resolution)).alias('resolution')
+    )
+    remaining_cols = [col for col in tagged.collect_schema().names() if col not in left_cols]
+    return tagged.select(left_cols + remaining_cols)

{pfeed-0.0.1.dev13 → pfeed-0.0.1.dev14}/pfeed/datastore.py RENAMED Viewed

@@ -28,7 +28,7 @@ def assert_if_minio_running():
         if response.status_code != 200:
             raise MinioException(f"Unhandled response: {response.status_code=} {response.content} {response}")
     except (ReadTimeout, RequestException) as e:
-        raise MinioException(f"MinIO is not running or not detected on {endpoint}: {e}")
+        raise MinioException(f"MinIO is not running or not detected on {endpoint}: {e}, please use 'pfeed docker-compose up -d' to start MinIO")
 class Datastore:

pfeed-0.0.1.dev14/pfeed/etl.py ADDED Viewed

@@ -0,0 +1,405 @@
+'''ETL = Extract, Transform, Load data.
+Apart from extracting and loading data, this module uses "pandas" for data transformation.
+'''
+from __future__ import annotations
+from typing import TYPE_CHECKING, Literal
+if TYPE_CHECKING:
+    from pfeed.types.common_literals import (
+        tSUPPORTED_ENVIRONMENTS,
+        tSUPPORTED_DOWNLOAD_DATA_SOURCES,
+        tSUPPORTED_STORAGES,
+        tSUPPORTED_DATA_TOOLS,
+    )
+    from pfeed.resolution import ExtendedResolution
+    tOUTPUT_FORMATS = Literal['bytes'] | tSUPPORTED_DATA_TOOLS
+import logging
+import importlib
+try:
+    import pandas as pd
+    import polars as pl
+except ImportError:
+    pass
+from pfeed.datastore import Datastore
+from pfeed.filepath import FilePath
+from pfeed.config_handler import get_config
+from pfeed.const.common import (
+    SUPPORTED_ENVIRONMENTS,
+    SUPPORTED_STORAGES,
+    SUPPORTED_DOWNLOAD_DATA_SOURCES,
+    SUPPORTED_DATA_TOOLS,
+)
+from pfeed.types.common_literals import tSUPPORTED_DATA_TOOLS
+from pfeed.utils.utils import derive_trading_venue
+from pfeed.utils.file_format import read_raw_data
+try:
+    from pfeed.utils.monitor import print_disk_usage
+except ImportError:
+    print_disk_usage = None
+# Accepted ``output_format`` values: raw parquet bytes, or one of the supported data tools.
+OUTPUT_FORMATS = ['bytes'] + SUPPORTED_DATA_TOOLS
+# Union alias used in the annotations of this module.
+# NOTE(review): pandas/polars are imported under a try/except ImportError that passes silently,
+# so this line would raise NameError when either package is missing - confirm this is intended.
+DataFrame = pd.DataFrame | pl.DataFrame | pl.LazyFrame
+# Public API of this module.
+__all__ = [
+    'get_data',
+    'extract_data',
+    'transform_data',
+    'load_data',
+    'clean_raw_data',
+    'standardize_raw_data',
+    'resample_data',
+]
+def get_data(
+    env: tSUPPORTED_ENVIRONMENTS,
+    data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
+    resolution: str | ExtendedResolution,
+    pdt: str,
+    date: str,
+    trading_venue: str='',
+    output_format: tOUTPUT_FORMATS='pandas',
+) -> bytes | DataFrame | None:
+    """Extract data without specifying where it is stored.
+    Every storage in SUPPORTED_STORAGES is tried in order and the first hit is returned.
+    Args:
+        env: trading environment, e.g. 'PAPER' | 'LIVE'.
+        data_source: The data source to extract data from, e.g. 'BYBIT'.
+        resolution: Data resolution. e.g. '1m' = 1 minute as the unit of each data bar/candle.
+            Also supports raw resolution such as 'r1m', where 'r' stands for raw.
+        pdt (str): product, e.g. BTC_USDT_PERP.
+        date (str): The date of the data to extract.
+        trading_venue (str): trading venue's name, e.g. exchange's name or dapp's name.
+            When empty, it is derived from ``data_source``.
+        output_format: The format of the output data. Default is 'pandas'.
+    Returns:
+        bytes | DataFrame | None: The extracted data, or None if no storage has it.
+    """
+    try:
+        from minio.error import MinioException
+    except ImportError:
+        # without the minio package, fall back to the base class for the except clause below
+        MinioException = Exception
+    venue = trading_venue or derive_trading_venue(data_source)
+    for storage in SUPPORTED_STORAGES:
+        try:
+            found = extract_data(env, storage, data_source, venue, resolution, pdt, date, output_format=output_format)
+        except MinioException:
+            # this storage is unavailable, move on to the next one
+            continue
+        if found is not None:
+            return found
+    return None
+def extract_data(
+    env: tSUPPORTED_ENVIRONMENTS,
+    storage: tSUPPORTED_STORAGES,
+    data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
+    trading_venue: str,
+    resolution: str | ExtendedResolution,
+    pdt: str,
+    date: str,
+    output_format: tOUTPUT_FORMATS='pandas',
+) -> bytes | DataFrame | None:
+    """
+    Extracts one parquet file's worth of data from the given storage.
+    Args:
+        env: trading environment, e.g. 'PAPER' | 'LIVE'.
+        storage: The origin of the data (local or minio).
+        data_source: The source of the data.
+        trading_venue: trading venue's name, e.g. exchange's name or dapp's name
+        resolution: Data resolution. e.g. '1m' = 1 minute as the unit of each data bar/candle.
+            Also supports raw resolution such as 'r1m', where 'r' stands for raw.
+        pdt (str): product, e.g. BTC_USDT_PERP.
+        date (str): The date of the data.
+        output_format: 'bytes' or the name of a data tool. Default is 'pandas'.
+    Returns:
+        bytes | DataFrame | None: The data in the requested format; None when the local file is missing.
+            For MinIO, a missing object yields whatever falsy value ``Datastore.get_object`` returned.
+    Raises:
+        AssertionError: If any of the input parameters are invalid.
+        NotImplementedError: If the data origin is not supported.
+        MinioException: If MinIO is not running / set up correctly.
+    """
+    from pfeed.resolution import ExtendedResolution
+    logger = logging.getLogger(data_source.lower() + '_data')
+    # normalize the case of the string arguments before validating them
+    env = env.upper()
+    storage = storage.lower()
+    data_source = data_source.upper()
+    pdt = pdt.upper()
+    output_format = output_format.lower()
+    assert env in SUPPORTED_ENVIRONMENTS, f'Invalid {env=}, {SUPPORTED_ENVIRONMENTS=}'
+    assert storage in SUPPORTED_STORAGES, f'Invalid {storage=}, {SUPPORTED_STORAGES=}'
+    assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'
+    assert output_format in OUTPUT_FORMATS, f'Invalid {output_format=}, {OUTPUT_FORMATS=}'
+    if isinstance(resolution, str):
+        resolution = ExtendedResolution(resolution)
+    wants_bytes = output_format == 'bytes'
+    if not wants_bytes:
+        # the data tool module is only needed when a DataFrame is requested
+        data_tool = importlib.import_module(f'pfeed.data_tools.data_tool_{output_format.lower()}')
+    config = get_config()
+    fp = FilePath(env, data_source, trading_venue, pdt, resolution, date, file_extension='.parquet', data_path=config.data_path)
+    if storage == 'local':
+        if not fp.exists():
+            logger.debug(f'failed to extract {data_source} {pdt} {date} {resolution} data from local path {fp.file_path}')
+            return None
+        if wants_bytes:
+            with open(fp.file_path, 'rb') as f:
+                data = f.read()
+        else:
+            data = data_tool.read_parquet(fp.file_path)
+        logger.debug(f'extracted {data_source} {pdt} {date} {resolution} data from local path {fp.file_path}')
+        return data
+    if storage == 'minio':
+        datastore = Datastore(storage)
+        object_name = fp.storage_path
+        data = datastore.get_object(object_name)
+        if not data:
+            logger.debug(f'failed to extract {data_source} {pdt} {date} {resolution} data from MinIO object {object_name}')
+            return data
+        if not wants_bytes:
+            # the object exists, so let the data tool read it through its s3 path
+            file_path = "s3://" + datastore.BUCKET_NAME + "/" + object_name
+            data = data_tool.read_parquet(file_path, storage='minio')
+        logger.debug(f'extracted {data_source} {pdt} {date} {resolution} data from MinIO object {object_name}')
+        return data
+    raise NotImplementedError(f'{storage=}')
+def transform_data(
+    data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
+    data: bytes | pd.DataFrame | pl.LazyFrame,
+    data_resolution: str | ExtendedResolution,
+    target_resolution: str | ExtendedResolution,
+) -> bytes | pd.DataFrame | pl.LazyFrame:
+    """Transforms data from its current resolution to a target resolution.
+    Args:
+        data_source: The source of the data.
+        data: The data to transform.
+        data_resolution: The resolution ``data`` currently has.
+        target_resolution: The resolution to produce.
+    Returns:
+        ``data`` unchanged when both resolutions are equal; otherwise the standardized
+        data, resampled unless the target resolution is tick.
+    Raises:
+        AssertionError: If the data source is invalid or ``data_resolution.is_ge(target_resolution)`` is false.
+        Exception: If both resolutions are raw but different.
+    """
+    from pfeed.resolution import ExtendedResolution
+    if isinstance(data_resolution, str):
+        data_resolution = ExtendedResolution(data_resolution)
+    if isinstance(target_resolution, str):
+        target_resolution = ExtendedResolution(target_resolution)
+    data_source = data_source.upper()
+    assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'
+    assert data_resolution.is_ge(target_resolution), f'{data_resolution=} is less than {target_resolution=}'
+    if data_resolution == target_resolution:
+        return data
+    # raw-to-raw conversion is rejected, e.g. 'r1t' -> 'r1m'
+    if data_resolution.is_raw() and target_resolution.is_raw():
+        raise Exception(f'{data_resolution=} and {target_resolution=} are both raw resolutions')
+    standardized = standardize_raw_data(data, data_resolution.is_tick())
+    if target_resolution.is_tick():
+        return standardized
+    return resample_data(standardized, target_resolution)
+def load_data(
+    env: tSUPPORTED_ENVIRONMENTS,
+    storage: tSUPPORTED_STORAGES,
+    data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
+    trading_venue: str,
+    data: bytes,
+    resolution: str | ExtendedResolution,
+    pdt: str,
+    date: str,
+    **kwargs
+) -> None:
+    """
+    Writes data to the specified storage, under the path derived from its metadata.
+    Args:
+        env: trading environment, e.g. 'PAPER' | 'LIVE'.
+        storage: The destination where the data will be loaded.
+            It can be either 'local' or 'minio'.
+        data_source: The source of the data.
+        trading_venue: trading venue's name, e.g. exchange's name or dapp's name
+        data (bytes): The data to be loaded.
+        resolution: Data resolution. e.g. '1m' = 1 minute as the unit of each data bar/candle.
+            Also supports raw resolution such as 'r1m', where 'r' stands for raw.
+        pdt (str): product, e.g. BTC_USDT_PERP.
+        date (str): The date of the data.
+        **kwargs: Additional keyword arguments passed to ``Datastore.put_object`` (MinIO only).
+    Returns:
+        None
+    Raises:
+        AssertionError: If any of the input parameters are invalid.
+        NotImplementedError: If the specified data destination is not implemented.
+        MinioException: If MinIO is not running / set up correctly.
+    """
+    from pfeed.resolution import ExtendedResolution
+    logger = logging.getLogger(data_source.lower() + '_data')
+    # normalize the case of the string arguments before validating them
+    env = env.upper()
+    storage = storage.lower()
+    data_source = data_source.upper()
+    pdt = pdt.upper()
+    assert env in SUPPORTED_ENVIRONMENTS, f'Invalid {env=}, {SUPPORTED_ENVIRONMENTS=}'
+    assert storage in SUPPORTED_STORAGES, f'Invalid {storage=}, {SUPPORTED_STORAGES=}'
+    assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'
+    if isinstance(resolution, str):
+        resolution = ExtendedResolution(resolution)
+    config = get_config()
+    fp = FilePath(env, data_source, trading_venue, pdt, resolution, date, file_extension='.parquet', data_path=config.data_path)
+    if storage == 'minio':
+        store = Datastore(storage)
+        object_name = fp.storage_path
+        store.put_object(object_name, data, **kwargs)
+        logger.info(f'loaded {data_source} data to MinIO object {object_name} {kwargs=}')
+    elif storage == 'local':
+        # create the directory tree on first write
+        fp.parent.mkdir(parents=True, exist_ok=True)
+        with open(fp.file_path, 'wb') as f:
+            f.write(data)
+            logger.info(f'loaded {data_source} data to {fp.file_path}')
+    else:
+        raise NotImplementedError(f'{storage=}')
+    # print_disk_usage is None when the optional monitor module could not be imported
+    if print_disk_usage:
+        print_disk_usage(config.data_path)
+def clean_raw_data(
+    data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
+    data: bytes,
+) -> bytes:
+    '''
+    Cleans raw data by renaming columns, mapping columns, and converting timestamp.
+    bytes (any format, e.g. csv.gzip) in, bytes (parquet file) out.
+    Args:
+        data_source: The source of the data (case-insensitive).
+        data (bytes): The raw data to be cleaned.
+    Returns:
+        bytes: The cleaned raw data.
+    Raises:
+        AssertionError: If the data source is not supported.
+    '''
+    # normalize the case first, as extract_data / transform_data / load_data do,
+    # so that e.g. 'bybit' is accepted here as well
+    data_source = data_source.upper()
+    assert data_source in SUPPORTED_DOWNLOAD_DATA_SOURCES, f'Invalid {data_source=}, SUPPORTED DATA SOURCES={SUPPORTED_DOWNLOAD_DATA_SOURCES}'
+    # each data source ships its own constants and helpers
+    const = importlib.import_module(f'pfeed.sources.{data_source.lower()}.const')
+    utils = importlib.import_module(f'pfeed.sources.{data_source.lower()}.utils')
+    df: pd.DataFrame = _convert_data_to_pandas_df(data)
+    # both mappings are optional per data source
+    if RENAMING_COLS := getattr(const, 'RENAMING_COLS', {}):
+        df = df.rename(columns=RENAMING_COLS)
+    if MAPPING_COLS := getattr(const, 'MAPPING_COLS', {}):
+        # MAPPING_COLS translates the values of the 'side' column
+        df['side'] = df['side'].map(MAPPING_COLS)
+    df = utils.standardize_ts_column(df)
+    return _handle_result(data, df)
+def standardize_raw_data(
+    data: bytes | pd.DataFrame | pl.LazyFrame,
+    is_tick: bool
+) -> bytes | pd.DataFrame | pl.LazyFrame:
+    """Keep only the standard columns of cleaned raw data.
+    Args:
+        data: The cleaned raw data.
+        is_tick: True to keep the tick columns (ts, side, volume, price),
+            False to keep the bar columns (ts, open, high, low, close, volume).
+    Returns:
+        bytes | pd.DataFrame | pl.LazyFrame: The standardized data, in the same format as the input.
+    """
+    df = _convert_data_to_pandas_df(data)
+    assert 'ts' in df.columns, 'ts column not found, please check if the raw data has been cleaned'
+    tick_cols = ['ts', 'side', 'volume', 'price']
+    bar_cols = ['ts', 'open', 'high', 'low', 'close', 'volume']
+    kept_cols = tick_cols if is_tick else bar_cols
+    return _handle_result(data, df.loc[:, kept_cols])
+def resample_data(
+    data: bytes | pd.DataFrame | pl.LazyFrame,
+    resolution: str | ExtendedResolution,
+) -> bytes | pd.DataFrame | pl.LazyFrame:
+    '''
+    Resamples the input data to the specified resolution and returns it in the same format as the input.
+    The input is never modified, even when it is a pandas DataFrame.
+    Args:
+        data: The input data to be resampled; must have a 'ts' column.
+            Data with a 'price' column is treated as tick data, otherwise as OHLCV bars.
+        resolution (str | ExtendedResolution): The resolution at which the data should be resampled.
+            if string, it should be in the format of "# + unit (s/m/h/d)", e.g. "1s".
+    Returns:
+        bytes | pd.DataFrame | pl.LazyFrame: The resampled data with 'ts' as a regular column.
+    Raises:
+        AssertionError: If the data is empty.
+    '''
+    from pfeed.resolution import ExtendedResolution
+    # standardize resolution by following pfund's standard, e.g. '1minute' -> '1m'
+    if isinstance(resolution, str):
+        resolution = ExtendedResolution(resolution)
+    # converts to pandas's resolution format
+    eresolution = repr(resolution)
+    # 'min' means minute in pandas, please refer to https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
+    eresolution = eresolution.replace('m', 'min')
+    eresolution = eresolution.replace('d', 'D')
+    df: pd.DataFrame = _convert_data_to_pandas_df(data)
+    is_tick_data = 'price' in df.columns
+    assert not df.empty, 'data is empty'
+    # a DataFrame input is passed through as the same object, so set the index on a
+    # new frame instead of in place to leave the caller's 'ts' column intact
+    df = df.set_index('ts')
+    if is_tick_data:
+        resample_logic = {
+            'price': 'ohlc',
+            'volume': 'sum',
+        }
+    else:
+        resample_logic = {
+            'open': 'first',
+            'high': 'max',
+            'low': 'min',
+            'close': 'last',
+            'volume': 'sum',
+        }
+    # optional columns are aggregated only when present
+    if 'dividends' in df.columns:
+        resample_logic['dividends'] = 'sum'
+    if 'splits' in df.columns:
+        resample_logic['splits'] = 'prod'
+    resampled_df = (
+        df
+        .resample(eresolution)
+        .apply(resample_logic)
+    )
+    if is_tick_data:
+        # drop an unnecessary level created by 'ohlc' in the resample_logic
+        resampled_df = resampled_df.droplevel(0, axis=1)
+    resampled_df.dropna(inplace=True)
+    resampled_df.reset_index(inplace=True)
+    return _handle_result(data, resampled_df)
+def _convert_data_to_pandas_df(data: bytes | pd.DataFrame | pl.LazyFrame) -> pd.DataFrame:
+    """Return ``data`` as a pandas DataFrame.
+    Bytes are parsed with ``read_raw_data``, a pandas DataFrame is returned as is (same object),
+    and a polars LazyFrame is collected and converted.
+    Raises:
+        TypeError: if ``data`` is none of the supported types.
+    """
+    if isinstance(data, bytes):
+        return read_raw_data(data)
+    if isinstance(data, pd.DataFrame):
+        return data
+    if isinstance(data, pl.LazyFrame):
+        return data.collect().to_pandas()
+    raise TypeError(f'Invalid data type {type(data)}, expected bytes or pd.DataFrame or pl.LazyFrame')
+def _handle_result(input_data: bytes | pd.DataFrame | pl.LazyFrame, output_df: pd.DataFrame) -> bytes | pd.DataFrame | pl.LazyFrame:
+    """Return ``output_df`` in the same format as ``input_data``.
+    bytes -> zstd-compressed parquet bytes, pandas DataFrame -> ``output_df`` itself,
+    polars LazyFrame -> a LazyFrame built from ``output_df``.
+    Raises:
+        TypeError: if ``input_data`` is none of the supported types.
+    """
+    if isinstance(input_data, bytes):
+        return output_df.to_parquet(compression='zstd')
+    if isinstance(input_data, pd.DataFrame):
+        return output_df
+    if isinstance(input_data, pl.LazyFrame):
+        return pl.from_pandas(output_df).lazy()
+    raise TypeError(f'Invalid data type {type(input_data)}, expected bytes or pd.DataFrame or pl.LazyFrame')

pfeed 0.0.1.dev13__tar.gz → 0.0.1.dev14__tar.gz

pfeed 0.0.1.dev13__tar.gz → 0.0.1.dev14__tar.gz