pfeed 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. pfeed/__init__.py +62 -0
  2. pfeed/cli/__init__.py +4 -0
  3. pfeed/cli/commands/__init__.py +0 -0
  4. pfeed/cli/commands/config.py +69 -0
  5. pfeed/cli/commands/docker_compose.py +33 -0
  6. pfeed/cli/commands/download.py +47 -0
  7. pfeed/cli/commands/open.py +47 -0
  8. pfeed/cli/commands/stream.py +0 -0
  9. pfeed/cli/main.py +24 -0
  10. pfeed/config_handler.py +148 -0
  11. pfeed/const/common.py +15 -0
  12. pfeed/const/paths.py +15 -0
  13. pfeed/data_tools/data_tool_pandas.py +62 -0
  14. pfeed/data_tools/data_tool_polars.py +65 -0
  15. pfeed/datastore.py +145 -0
  16. pfeed/etl.py +405 -0
  17. pfeed/feeds/__init__.py +3 -0
  18. pfeed/feeds/base_feed.py +296 -0
  19. pfeed/feeds/binance_feed.py +21 -0
  20. pfeed/feeds/bybit_feed.py +53 -0
  21. pfeed/feeds/custom_csv_feed.py +13 -0
  22. pfeed/feeds/yahoo_finance_feed.py +178 -0
  23. pfeed/filepath.py +103 -0
  24. pfeed/main.py +18 -0
  25. pfeed/resolution.py +62 -0
  26. pfeed/sources/binance/__init__.py +11 -0
  27. pfeed/sources/binance/api.py +105 -0
  28. pfeed/sources/binance/const.py +47 -0
  29. pfeed/sources/binance/download.py +181 -0
  30. pfeed/sources/binance/stream.py +3 -0
  31. pfeed/sources/bybit/__init__.py +4 -0
  32. pfeed/sources/bybit/api.py +76 -0
  33. pfeed/sources/bybit/const.py +25 -0
  34. pfeed/sources/bybit/download.py +196 -0
  35. pfeed/sources/bybit/stream.py +3 -0
  36. pfeed/sources/bybit/types.py +4 -0
  37. pfeed/sources/bybit/utils.py +44 -0
  38. pfeed/types/common_literals.py +13 -0
  39. pfeed/utils/file_format.py +76 -0
  40. pfeed/utils/monitor.py +21 -0
  41. pfeed/utils/utils.py +122 -0
  42. pfeed/utils/validate.py +39 -0
  43. pfeed-0.0.1.dist-info/LICENSE +201 -0
  44. pfeed-0.0.1.dist-info/METADATA +267 -0
  45. pfeed-0.0.1.dist-info/RECORD +47 -0
  46. pfeed-0.0.1.dist-info/WHEEL +4 -0
  47. pfeed-0.0.1.dist-info/entry_points.txt +3 -0
pfeed/__init__.py ADDED
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+ from typing import TYPE_CHECKING
3
+
4
+ if TYPE_CHECKING:
5
+ from pfeed.types.common_literals import tSUPPORTED_DOWNLOAD_DATA_SOURCES, tSUPPORTED_DATA_TYPES
6
+
7
+ import importlib
8
+ from importlib.metadata import version
9
+
10
+ from pfeed.config_handler import configure, get_config
11
+ from pfeed.const.common import ALIASES
12
+ from pfeed.sources import bybit
13
+ from pfeed.feeds import BybitFeed, YahooFinanceFeed
14
+
15
+
16
def download_historical_data(
    data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES,
    pdts: str | list[str] | None = None,
    dtypes: tSUPPORTED_DATA_TYPES | list[tSUPPORTED_DATA_TYPES] | None = None,
    ptypes: str | list[str] | None = None,
    start_date: str | None = None,
    end_date: str | None = None,
    num_cpus: int = 8,
    use_ray: bool = True,
    use_minio: bool = False,
):
    """Download historical data by delegating to the chosen source package.

    The lower-cased ``data_source`` selects ``pfeed.sources.<data_source>``;
    that package's own ``download_historical_data`` receives every remaining
    argument unchanged (as keyword arguments) and its result is returned as-is.
    """
    source_package = importlib.import_module(f"pfeed.sources.{data_source.lower()}")
    forwarded = dict(
        pdts=pdts,
        dtypes=dtypes,
        ptypes=ptypes,
        start_date=start_date,
        end_date=end_date,
        num_cpus=num_cpus,
        use_ray=use_ray,
        use_minio=use_minio,
    )
    return source_package.download_historical_data(**forwarded)
38
+
39
+
40
+ # TODO
41
def stream_realtime_data(data_source: tSUPPORTED_DOWNLOAD_DATA_SOURCES):
    """Call ``stream_realtime_data()`` of ``pfeed.sources.<data_source>`` and return its result.

    ``data_source`` is lower-cased to form the sub-package name.
    """
    module_name = f"pfeed.sources.{data_source.lower()}"
    source_package = importlib.import_module(module_name)
    return source_package.stream_realtime_data()
44
+
45
+
46
+
47
# Short public aliases for the two module-level entry points.
download = download_historical_data
stream = stream_realtime_data


__version__ = version("pfeed")
# Every entry must be a name bound in this module, otherwise
# ``from pfeed import *`` raises AttributeError. ``binance`` and
# ``BinanceFeed`` are not imported here, so they are not exported.
__all__ = (
    "__version__",
    "configure",
    "get_config",
    "ALIASES",
    "bybit",
    "YahooFinanceFeed",
    "BybitFeed",
)
pfeed/cli/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from pfeed.cli.main import pfeed_group
2
+
3
+
4
+ __all__ = ["pfeed_group"]
File without changes
@@ -0,0 +1,69 @@
1
+ import os
2
+ import yaml
3
+ from pathlib import Path
4
+ from pprint import pformat
5
+
6
+ import click
7
+
8
+ from pfeed.const.paths import USER_CONFIG_FILE_PATH
9
+ from pfeed.config_handler import ConfigHandler
10
+
11
+
12
def save_config(config: ConfigHandler, config_file_path: str | Path):
    """Dump the attributes of *config* to *config_file_path* as block-style YAML.

    Parent directories of the target file are created when missing.
    """
    target = Path(config_file_path) if isinstance(config_file_path, str) else config_file_path
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w') as stream:
        yaml.dump(config.__dict__, stream, default_flow_style=False)
18
+
19
+
20
def remove_config(config_file_path: str | Path):
    """Delete the file at *config_file_path*; do nothing when it is not a regular file."""
    target = Path(config_file_path)
    if not target.is_file():
        return
    os.remove(target)
24
+
25
+
26
@click.command()
@click.pass_context
@click.option('--data-path', type=click.Path(resolve_path=True), help='Set the data path')
@click.option('--log-path', type=click.Path(resolve_path=True), help='Set the log path')
@click.option('--logging-file', 'logging_config_file_path', type=click.Path(resolve_path=True, exists=True), help='Set the logging config file path')
# NOTE(review): type=dict makes click call dict(<str>) on the raw value -- confirm this option is usable from the command line
@click.option('--logging-config', type=dict, help='Set the logging config')
@click.option('--use-fork-process', type=bool, help='If True, multiprocessing.set_start_method("fork")')
@click.option('--use-custom-excepthook', type=bool, help='If True, log uncaught exceptions to file')
@click.option('--env-file', 'env_file_path', type=click.Path(resolve_path=True, exists=True), help='Path to the .env file')
@click.option('--debug', is_flag=True, help='if enabled, debug mode will be enabled where logs at DEBUG level will be printed')
@click.option('--list', '-l', is_flag=True, is_eager=True, help='List all available options')
@click.option('--reset', is_flag=True, is_eager=True, help='Reset the configuration to defaults')
def config(ctx, **kwargs):
    """Configures pfeed settings."""
    config: ConfigHandler = ctx.obj['config']

    # Keep only what the user actually passed. Flags are False when absent;
    # every other option is None when absent. An explicit False for a
    # type=bool option (e.g. `--use-fork-process False`) is a real value and
    # must survive this filter, otherwise the setting could never be disabled.
    flag_options = {'debug', 'list', 'reset'}
    provided_options = {
        k: v for k, v in kwargs.items()
        if v is not None and not (k in flag_options and v is False)
    }

    if provided_options.pop('list', False):  # --list was used
        if provided_options:
            # raise instead of assert: asserts are stripped under `python -O`
            raise click.UsageError("No options should be provided with --list")
        # Build a copy so the display-only key is not stored on the config object.
        config_dict = {**config.__dict__, 'config_file_path': USER_CONFIG_FILE_PATH}
        click.echo(f"PFeed's config:\n{pformat(config_dict)}")
        return

    if provided_options.pop('reset', False):  # --reset was used
        if provided_options:
            raise click.UsageError("No options should be provided with --reset")
        remove_config(USER_CONFIG_FILE_PATH)
        click.echo("PFeed's config successfully reset.")
        return

    # nothing to change: point the user at --list instead of saving an unchanged config
    if not provided_options:
        raise click.UsageError("No options provided. Use --list to see all available options.")

    for option, value in provided_options.items():
        setattr(config, option, value)
        click.echo(f"{option} set to: {value}")

    save_config(config, USER_CONFIG_FILE_PATH)
    click.echo(f"config saved to {USER_CONFIG_FILE_PATH}.")
@@ -0,0 +1,33 @@
1
+ import os
2
+ from pathlib import Path
3
+ import importlib.resources
4
+ import subprocess
5
+
6
+ import click
7
+
8
+ from pfeed.const.paths import PROJ_NAME
9
+
10
+
11
@click.command(context_settings=dict(
    ignore_unknown_options=True,
    allow_extra_args=True,
))
@click.pass_context
@click.option('--env-file', 'env_file_path', type=click.Path(exists=True), help='Path to the .env file')
@click.option('--docker-file', 'docker_file_path', type=click.Path(exists=True), help='Path to the docker-compose.yml file')
def docker_compose(ctx, env_file_path, docker_file_path):
    """Forwards commands to docker-compose with the package's docker-compose.yml file if not specified."""
    config = ctx.obj['config']
    config.load_env_file(env_file_path)
    # os.environ only accepts str values
    os.environ['PFEED_DATA_PATH'] = str(config.data_path)

    if not docker_file_path:
        # default: docker-compose.yml located one level above the package directory
        package_dir = Path(importlib.resources.files(PROJ_NAME)).resolve().parents[0]
        docker_file_path = package_dir / 'docker-compose.yml'
    else:
        click.echo(f'loaded custom docker-compose.yml file from "{docker_file_path}"')

    # ctx.args holds the unknown/extra arguments, which are passed through verbatim
    command = ['docker-compose', '-f', str(docker_file_path), *ctx.args]
    completed = subprocess.run(command)
    # Propagate docker-compose's exit status so a failure is visible to the caller's shell.
    ctx.exit(completed.returncode)
31
+
32
+
33
+
@@ -0,0 +1,47 @@
1
+ import importlib
2
+
3
+ import click
4
+
5
+ import pfeed as pe
6
+ from pfeed.const.common import (
7
+ ALIASES,
8
+ SUPPORTED_DOWNLOAD_DATA_SOURCES,
9
+ SUPPORTED_DATA_TYPES,
10
+ SUPPORTED_PRODUCT_TYPES,
11
+ )
12
+
13
+
14
# download data sources plus every alias that resolves to one of them
SUPPORTED_DOWNLOAD_DATA_SOURCES_ALIASES_INCLUDED = [
    *SUPPORTED_DOWNLOAD_DATA_SOURCES,
    *(alias for alias, target in ALIASES.items() if target in SUPPORTED_DOWNLOAD_DATA_SOURCES),
]

# 'raw' carries no timeframe, so it is only an implicit data type; it is still accepted for convenience,
# e.g. a data source like bybit has a single raw data type, 'raw_tick', so 'raw' will be converted to 'raw_tick'
SUPPORTED_DATA_TYPES_IMPLICIT_RAW_ALLOWED = [*SUPPORTED_DATA_TYPES, 'raw']
20
+
21
+
22
@click.command()
@click.option('--data-source', '-d', required=True, type=click.Choice(SUPPORTED_DOWNLOAD_DATA_SOURCES_ALIASES_INCLUDED, case_sensitive=False), help='Data source')
@click.option('--pdts', '-p', 'pdts', multiple=True, default=[], help='List of trading products')
@click.option('--dtypes', '--dt', 'dtypes', multiple=True, default=['raw'], type=click.Choice(SUPPORTED_DATA_TYPES_IMPLICIT_RAW_ALLOWED, case_sensitive=False), help=f'{SUPPORTED_DATA_TYPES=}. How to pass in multiple values: --dt raw --dt tick')
@click.option('--ptypes', '--pt', 'ptypes', multiple=True, default=[], type=click.Choice(SUPPORTED_PRODUCT_TYPES, case_sensitive=False), help='List of product types, e.g. PERP = get all perpetuals')
@click.option('--start-date', '-s', type=click.DateTime(formats=["%Y-%m-%d"]), help='Start date in YYYY-MM-DD format')
@click.option('--end-date', '-e', type=click.DateTime(formats=["%Y-%m-%d"]), help='End date in YYYY-MM-DD format')
@click.option('--num-cpus', '-n', default=8, type=int, help="number of logical CPUs used for Ray's tasks")
@click.option('--use-minio', '-m', is_flag=True, help='if enabled, data will be loaded into Minio')
@click.option('--no-ray', is_flag=True, help='if enabled, Ray will not be used')
@click.option('--env-file', 'env_file_path', type=click.Path(exists=True), help='Path to the .env file')
@click.option('--debug', is_flag=True, help='if enabled, debug mode will be enabled where logs at DEBUG level will be printed')
def download(data_source, pdts, dtypes, ptypes, start_date, end_date, num_cpus, no_ray, use_minio, env_file_path, debug):
    # CLI front end for a source's download pipeline: apply the config, resolve
    # a possible alias, then hand every option to
    # pfeed.sources.<source>.download.download_historical_data.
    def _as_date_string(moment):
        # a falsy value (option not given) is forwarded untouched
        return moment.date().strftime('%Y-%m-%d') if moment else moment

    pe.configure(env_file_path=env_file_path, debug=debug)
    resolved_source = ALIASES.get(data_source, data_source)
    module_path = f'pfeed.sources.{resolved_source.lower()}.download'
    importlib.import_module(module_path).download_historical_data(
        pdts=pdts,
        dtypes=dtypes,
        ptypes=ptypes,
        start_date=_as_date_string(start_date),
        end_date=_as_date_string(end_date),
        num_cpus=num_cpus,
        use_ray=not no_ray,
        use_minio=use_minio,
    )
@@ -0,0 +1,47 @@
1
+ from pathlib import Path
2
+ import importlib.resources
3
+ import subprocess
4
+
5
+ import click
6
+
7
+ from pfeed.const.paths import PROJ_NAME, USER_CONFIG_FILE_PATH
8
+
9
+
10
def open_with_vscode(file_path):
    """Open *file_path* with the ``code`` command, falling back to click's editor.

    The fallback is used when ``code`` exits with a non-zero status or when the
    command cannot be found.
    """
    fallback_notice = None
    try:
        subprocess.run(["code", str(file_path)], check=True)
        click.echo(f"Opened {file_path} with VS Code")
    except subprocess.CalledProcessError:
        fallback_notice = "Failed to open with VS Code. Falling back to default editor."
    except FileNotFoundError:
        fallback_notice = "VS Code command 'code' not found. Falling back to default editor."

    if fallback_notice is not None:
        click.echo(fallback_notice)
        click.edit(filename=file_path)
20
+
21
+
22
@click.command()
@click.option('--config-file', '-c', is_flag=True, help='Open the config file')
@click.option('--log-file', '-l', is_flag=True, help='Open the logging.yaml file for logging config')
@click.option('--docker-file', '-d', is_flag=True, help='Open the docker-compose.yml file')
@click.option('--default-editor', '-e', is_flag=True, help='Use default editor')
def open(config_file, log_file, docker_file, default_editor):
    """Opens the config file, the logging config file or the docker-compose.yml file."""
    # Reject any combination of two or more file flags; checking with all()
    # would only catch the case where all three are given.
    if sum([config_file, log_file, docker_file]) > 1:
        click.echo('Please specify only one file to open')
        return

    # logging.yml and docker-compose.yml are looked up one level above the package directory
    package_dir = Path(importlib.resources.files(PROJ_NAME)).resolve().parents[0]
    if config_file:
        file_path = USER_CONFIG_FILE_PATH
    elif log_file:
        file_path = package_dir / 'logging.yml'
    elif docker_file:
        file_path = package_dir / 'docker-compose.yml'
    else:
        click.echo('Please specify a file to open')
        return

    if default_editor:
        click.edit(filename=file_path)
    else:
        open_with_vscode(file_path)
File without changes
pfeed/cli/main.py ADDED
@@ -0,0 +1,24 @@
1
+ import click
2
+
3
+ from pfeed.config_handler import get_config
4
+ from pfeed.cli.commands.docker_compose import docker_compose
5
+ from pfeed.cli.commands.config import config
6
+ from pfeed.cli.commands.download import download
7
+ # from pfeed.cli.commands.stream import stream
8
+ from pfeed.cli.commands.open import open
9
+
10
+
11
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
@click.pass_context
@click.version_option()
def pfeed_group(ctx):
    """PFeed's CLI"""
    # make sure the context object is a dict, then share the config with sub-commands
    shared = ctx.ensure_object(dict)
    shared['config'] = get_config()
18
+
19
+
20
# register the sub-commands on the group; `stream` is not registered yet
for _command in (docker_compose, config, download, open):
    pfeed_group.add_command(_command)
# pfeed_group.add_command(stream)
@@ -0,0 +1,148 @@
1
+ import os
2
+ import sys
3
+ import multiprocessing
4
+ import logging
5
+ from types import TracebackType
6
+ from dataclasses import dataclass
7
+
8
+ import yaml
9
+ from dotenv import find_dotenv, load_dotenv
10
+
11
+ from pfeed.const.paths import PROJ_NAME, MAIN_PATH, LOG_PATH, DATA_PATH, USER_CONFIG_FILE_PATH
12
+
13
+
14
# Module-wide configuration object; None until ConfigHandler.get_instance() fills it
_global_config = None
__all__ = ['get_config', 'configure']
20
+
21
+
22
def _custom_excepthook(exception_class: type[BaseException], exception: BaseException, traceback: TracebackType):
    '''Logs an uncaught exception at ERROR level on the PROJ_NAME logger.

    The signature matches sys.excepthook. The (type, value, traceback) triple is
    handed to the logger directly, so the logged traceback is exactly the one of
    the uncaught exception; nothing is re-raised and no bare except is needed.
    '''
    logging.getLogger(PROJ_NAME).error(
        'Uncaught exception:',
        exc_info=(exception_class, exception, traceback),
    )
29
+
30
+
31
@dataclass
class ConfigHandler:
    '''Holds pfeed's settings and applies their side effects on initialization.

    get_instance() builds one instance from the user's config file on first use
    and keeps it in the module-level _global_config.
    '''
    # directory that initialize() creates when it does not exist yet
    data_path: str = str(DATA_PATH)
    log_path: str = str(LOG_PATH)
    logging_config_file_path: str = f'{MAIN_PATH}/logging.yml'
    # replaced by an empty dict in initialize() when left as None
    logging_config: dict | None = None
    # on platforms other than win32, forces the 'fork' multiprocessing start method
    use_fork_process: bool = True
    # installs _custom_excepthook, but only while sys.excepthook is still the default
    use_custom_excepthook: bool = False
    env_file_path: str | None=None
    debug: bool = False

    @classmethod
    def get_instance(cls):
        '''Returns the shared instance, loading it from the config file on first call'''
        global _global_config
        if _global_config is None:
            _global_config = cls.load_config()
        return _global_config

    @classmethod
    def load_config(cls):
        '''Loads user's config file and returns a ConfigHandler object'''
        config_file_path = USER_CONFIG_FILE_PATH
        if config_file_path.is_file():
            with open(config_file_path, 'r') as f:
                # an empty file yields None, which is treated as "no overrides"
                config = yaml.safe_load(f) or {}
        else:
            config = {}
        return cls(**config)

    def __post_init__(self):
        self.initialize()

    def initialize(self):
        '''Applies the current settings: data dir, start method, excepthook, env file, debug mode'''
        self.logging_config = self.logging_config or {}

        for path in [self.data_path]:
            if not os.path.exists(path):
                # exist_ok guards against another process creating it in between
                os.makedirs(path, exist_ok=True)
                print(f'created {path}')

        if self.use_fork_process and sys.platform != 'win32':
            multiprocessing.set_start_method('fork', force=True)

        if self.use_custom_excepthook and sys.excepthook is sys.__excepthook__:
            sys.excepthook = _custom_excepthook

        self.load_env_file(self.env_file_path)

        if self.debug:
            self.enable_debug_mode()

    def load_env_file(self, env_file_path: str | None):
        '''Loads environment variables from a .env file, overriding existing ones.

        When env_file_path is empty, a .env file is searched for starting from the
        current working directory; if none is found, nothing is loaded.
        '''
        if not env_file_path:
            # Load exactly the file that is announced below: passing None on to
            # load_dotenv() would make it run its own, different search.
            env_file_path = find_dotenv(usecwd=True, raise_error_if_not_found=False)
            if not env_file_path:
                # print('.env file is not found')
                return
            print(f'.env file path is not specified, using env file in "{env_file_path}"')
        load_dotenv(env_file_path, override=True)

    def enable_debug_mode(self):
        '''Enables debug mode by setting the log level to DEBUG for all stream handlers'''
        if 'handlers' not in self.logging_config:
            self.logging_config['handlers'] = {}
        for handler in ['stream_handler', 'stream_path_handler']:
            if handler not in self.logging_config['handlers']:
                self.logging_config['handlers'][handler] = {}
            self.logging_config['handlers'][handler]['level'] = 'DEBUG'
100
+
101
+
102
def configure(
    data_path: str | None = None,
    log_path: str | None = None,
    logging_config_file_path: str | None = None,
    logging_config: dict | None = None,
    use_fork_process: bool | None = None,
    use_custom_excepthook: bool | None = None,
    env_file_path: str | None = None,
    debug: bool | None = None,
    **kwargs,
):
    '''Updates the global config object and re-initializes it.

    Every named argument that is not None replaces the value coming from the
    config file or the defaults. Extra keyword arguments are applied as well,
    but only when they name an existing attribute; otherwise AttributeError is
    raised. Returns the updated global config object.
    '''
    global _global_config
    _global_config = get_config()

    # named overrides, in the order in which they are applied
    named_overrides = {
        'data_path': data_path,
        'log_path': log_path,
        'logging_config_file_path': logging_config_file_path,
        'logging_config': logging_config,
        'use_fork_process': use_fork_process,
        'use_custom_excepthook': use_custom_excepthook,
        'env_file_path': env_file_path,
        'debug': debug,
    }
    for attr_name, new_value in named_overrides.items():
        if new_value is not None:
            setattr(_global_config, attr_name, new_value)

    for extra_name, extra_value in kwargs.items():
        if not hasattr(_global_config, extra_name):
            raise AttributeError(f'{extra_name} is not an attribute of ConfigHandler')
        setattr(_global_config, extra_name, extra_value)

    _global_config.initialize()
    return _global_config
145
+
146
+
147
def get_config() -> ConfigHandler:
    '''Returns the shared ConfigHandler obtained from ConfigHandler.get_instance()'''
    shared_config = ConfigHandler.get_instance()
    return shared_config
pfeed/const/common.py ADDED
@@ -0,0 +1,15 @@
1
# Lists of the values pfeed accepts for each kind of setting.
SUPPORTED_ENVIRONMENTS = [
    'BACKTEST',
    'SANDBOX',
    'PAPER',
    'LIVE',
]
SUPPORTED_DATA_FEEDS = [
    'YAHOO_FINANCE',
    'BYBIT',
    'BINANCE',
]
SUPPORTED_STORAGES = [
    'local',
    'minio',
]
SUPPORTED_DOWNLOAD_DATA_SOURCES = [
    'BYBIT',
    'BINANCE',
]
SUPPORTED_CRYPTO_EXCHANGES = [
    'BYBIT',
    'BINANCE',
]
SUPPORTED_DATA_TOOLS = [
    'pandas',
    'polars',
]
SUPPORTED_PRODUCT_TYPES = [
    'SPOT',
    'PERP',
    'IPERP',
    'FUT',
    'IFUT',
]
# every resolution, first with the 'raw_' prefix and then without it
SUPPORTED_DATA_TYPES = [
    f'{prefix}{resolution}'
    for prefix in ('raw_', '')
    for resolution in ('tick', 'second', 'minute', 'hour', 'daily')
]

# short name -> full name
ALIASES = dict(YF='YAHOO_FINANCE')
pfeed/const/paths.py ADDED
@@ -0,0 +1,15 @@
1
+ from pathlib import Path
2
+ from platformdirs import user_log_dir, user_data_dir, user_config_dir
3
+
4
+
5
# project paths: the package directory's name, the directory above the package, and the package itself
PROJ_NAME = Path(__file__).resolve().parent.parent.name
MAIN_PATH = Path(__file__).resolve().parent.parent.parent
PROJ_PATH = MAIN_PATH.joinpath(PROJ_NAME)


# user paths: per-user log, data and config locations, each with a PROJ_NAME sub-directory
LOG_PATH = Path(user_log_dir(), PROJ_NAME)
DATA_PATH = Path(user_data_dir(), PROJ_NAME)
USER_CONFIG_PATH = Path(user_config_dir(), PROJ_NAME)
USER_CONFIG_FILE_PATH = USER_CONFIG_PATH.joinpath(f'{PROJ_NAME}_config.yml')
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+ from typing import TYPE_CHECKING
3
+ if TYPE_CHECKING:
4
+ from pfeed.resolution import ExtendedResolution
5
+ from pfeed.types.common_literals import tSUPPORTED_STORAGES
6
+
7
+ import os
8
+ import io
9
+
10
+ import s3fs
11
+ import pandas as pd
12
+
13
+ from pfeed.const.common import SUPPORTED_STORAGES
14
+
15
+
16
# identifier of the data tool implemented by this module
name = "pandas"
17
+
18
+
19
def read_parquet(path_or_obj: str | bytes, *args, storage: tSUPPORTED_STORAGES='local', **kwargs) -> pd.DataFrame:
    """Read parquet data into a pandas DataFrame.

    Raw ``bytes`` are read from memory. Anything else is treated as a path and
    read either directly (``storage='local'``) or through an s3fs filesystem
    built from the ``MINIO_*`` environment variables (``storage='minio'``).
    Extra positional and keyword arguments are forwarded to ``pd.read_parquet``.
    """
    assert storage in SUPPORTED_STORAGES, f'{storage=} not in {SUPPORTED_STORAGES}'
    if isinstance(path_or_obj, bytes):
        return pd.read_parquet(io.BytesIO(path_or_obj), *args, **kwargs)

    if storage == 'local':
        return pd.read_parquet(path_or_obj, *args, **kwargs)

    if storage == 'minio':
        minio_host = os.getenv('MINIO_HOST', 'localhost')
        minio_port = os.getenv('MINIO_PORT', '9000')
        minio_fs = s3fs.S3FileSystem(
            endpoint_url=f"http://{minio_host}:{minio_port}",
            key=os.getenv('MINIO_ROOT_USER', 'pfunder'),
            secret=os.getenv('MINIO_ROOT_PASSWORD', 'password'),
        )
        return pd.read_parquet(path_or_obj, *args, filesystem=minio_fs, **kwargs)

    raise NotImplementedError(f'{storage=}')
37
+
38
+
39
def concat(dfs: list[pd.DataFrame], *args, **kwargs) -> pd.DataFrame:
    """Concatenate *dfs* with ``pd.concat``, forwarding all extra arguments."""
    combined = pd.concat(dfs, *args, **kwargs)
    return combined
41
+
42
+
43
def estimate_memory_usage(df: pd.DataFrame) -> float:
    """Return the deep memory usage of *df*, summed over all entries, in GB (1024 ** 3 bytes)."""
    total_bytes = df.memory_usage(deep=True).sum()
    return total_bytes / (1024 ** 3)
46
+
47
+
48
def organize_time_series_columns(pdt: str, resolution: str | ExtendedResolution, df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with 'ts', 'product', 'resolution' as the leftmost columns.

    'product' is filled with *pdt* and 'resolution' with ``repr(resolution)``;
    a string resolution is first converted to an ``ExtendedResolution``. The
    remaining columns keep their original order. The frame passed in is not
    modified, so the same frame can safely be passed in again.
    """
    from pfeed.resolution import ExtendedResolution
    assert 'ts' in df.columns, "'ts' column not found"
    assert 'product' not in df.columns, "'product' column already exists"
    assert 'resolution' not in df.columns, "'resolution' column already exists"
    if isinstance(resolution, str):
        resolution = ExtendedResolution(resolution)
    left_cols = ['ts', 'product', 'resolution']
    other_cols = [col for col in df.columns if col not in left_cols]
    # assign() returns a new frame; writing the columns with df[...] = ...
    # would leak 'product' and 'resolution' into the caller's DataFrame
    df = df.assign(product=pdt, resolution=repr(resolution))
    return df.reindex(left_cols + other_cols, axis=1)
@@ -0,0 +1,65 @@
1
+ from __future__ import annotations
2
+ from typing import TYPE_CHECKING
3
+ if TYPE_CHECKING:
4
+ from pfeed.resolution import ExtendedResolution
5
+ from pfeed.types.common_literals import tSUPPORTED_STORAGES
6
+
7
+ import os
8
+
9
+ import polars as pl
10
+
11
+ from pfeed.const.common import SUPPORTED_STORAGES
12
+
13
+
14
# identifier of the data tool implemented by this module
name = "polars"
15
+
16
+
17
def read_parquet(path_or_obj: str | bytes, *args, storage: tSUPPORTED_STORAGES='local', **kwargs) -> pl.DataFrame | pl.LazyFrame:
    """Read parquet data with polars.

    Raw ``bytes`` go through ``pl.read_parquet``. Anything else is treated as a
    path and goes through ``pl.scan_parquet``, either directly
    (``storage='local'``) or with storage options built from the ``MINIO_*``
    environment variables (``storage='minio'``). Extra positional and keyword
    arguments are forwarded to the polars call.
    """
    assert storage in SUPPORTED_STORAGES, f'{storage=} not in {SUPPORTED_STORAGES}'
    if isinstance(path_or_obj, bytes):
        return pl.read_parquet(path_or_obj, *args, **kwargs)

    if storage == 'local':
        return pl.scan_parquet(path_or_obj, *args, **kwargs)

    if storage == 'minio':
        minio_host = os.getenv('MINIO_HOST', 'localhost')
        minio_port = os.getenv('MINIO_PORT', '9000')
        minio_options = dict(
            endpoint_url=f"http://{minio_host}:{minio_port}",
            access_key_id=os.getenv('MINIO_ROOT_USER', 'pfunder'),
            secret_access_key=os.getenv('MINIO_ROOT_PASSWORD', 'password'),
        )
        return pl.scan_parquet(path_or_obj, *args, storage_options=minio_options, **kwargs)

    raise NotImplementedError(f'{storage=}')
35
+
36
+
37
def concat(dfs: list[pl.DataFrame | pl.LazyFrame], *args, **kwargs) -> pl.DataFrame | pl.LazyFrame:
    """Concatenate *dfs* with ``pl.concat``, forwarding all extra arguments."""
    combined = pl.concat(dfs, *args, **kwargs)
    return combined
39
+
40
+
41
def estimate_memory_usage(df: pl.DataFrame | pl.LazyFrame) -> float:
    """Return ``estimated_size(unit='gb')`` of *df*; a LazyFrame is collected first."""
    frame = df.collect() if isinstance(df, pl.LazyFrame) else df
    return frame.estimated_size(unit='gb')
46
+
47
+
48
def organize_time_series_columns(pdt: str, resolution: str | ExtendedResolution, df: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame | pl.LazyFrame:
    """Add 'product' and 'resolution' columns and move them, with 'ts', to the left.

    'product' is the literal *pdt* and 'resolution' the literal
    ``repr(resolution)``; a string resolution is first converted to an
    ``ExtendedResolution``. All other columns keep their order.
    """
    from pfeed.resolution import ExtendedResolution
    cols = df.collect_schema().names() if isinstance(df, pl.LazyFrame) else df.columns
    assert 'ts' in cols, "'ts' column not found"
    assert 'product' not in cols, "'product' column already exists"
    assert 'resolution' not in cols, "'resolution' column already exists"

    resolution_obj = ExtendedResolution(resolution) if isinstance(resolution, str) else resolution
    left_cols = ['ts', 'product', 'resolution']
    df = df.with_columns(
        pl.lit(pdt).alias('product'),
        pl.lit(repr(resolution_obj)).alias('resolution'),
    )
    ordered_cols = left_cols + [col for col in df.collect_schema().names() if col not in left_cols]
    return df.select(ordered_cols)
+ return df