vastdb 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- vastdb/_internal.py +41 -10
- vastdb/bench/perf_bench/__init__.py +0 -0
- vastdb/bench/perf_bench/bench_repo/__init__.py +0 -0
- vastdb/bench/perf_bench/bench_repo/mega_combo.py +87 -0
- vastdb/bench/perf_bench/cli.py +225 -0
- vastdb/bench/perf_bench/common/__init__.py +0 -0
- vastdb/bench/perf_bench/common/constants.py +96 -0
- vastdb/bench/perf_bench/common/log_utils.py +67 -0
- vastdb/bench/perf_bench/common/types.py +34 -0
- vastdb/bench/perf_bench/common/utils.py +219 -0
- vastdb/bench/perf_bench/dataset/__init__.py +0 -0
- vastdb/bench/perf_bench/dataset/generate_secmaster.py +105 -0
- vastdb/bench/perf_bench/dataset/generate_stocks_dataset.py +242 -0
- vastdb/bench/perf_bench/dataset/schemas.py +101 -0
- vastdb/bench/perf_bench/dataset/secmaster.py +33 -0
- vastdb/bench/perf_bench/orchestrate/__init__.py +0 -0
- vastdb/bench/perf_bench/orchestrate/bench_spec.py +91 -0
- vastdb/bench/perf_bench/orchestrate/results_helpers.py +126 -0
- vastdb/bench/perf_bench/orchestrate/scenario.py +109 -0
- vastdb/bench/perf_bench/orchestrate/scenario_generator.py +144 -0
- vastdb/bench/perf_bench/query/__init__.py +0 -0
- vastdb/bench/perf_bench/query/arrow_common.py +59 -0
- vastdb/bench/perf_bench/query/query.py +42 -0
- vastdb/bench/perf_bench/query/query_pyarrow.py +70 -0
- vastdb/bench/perf_bench/query/query_vastdb.py +78 -0
- vastdb/bench/perf_bench/run.py +79 -0
- vastdb/bench/test_sample.py +4 -2
- vastdb/conftest.py +1 -1
- vastdb/session.py +0 -6
- vastdb/table.py +35 -35
- vastdb/tests/test_nested.py +58 -0
- vastdb/tests/test_tables.py +13 -0
- vastdb/transaction.py +4 -8
- vastdb/util.py +5 -0
- {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/METADATA +3 -4
- {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/RECORD +39 -14
- {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/WHEEL +1 -1
- {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/LICENSE +0 -0
- {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/top_level.txt +0 -0
vastdb/bench/perf_bench/common/utils.py
@@ -0,0 +1,219 @@
import importlib.util
import logging
import os
import sys
import time
from contextlib import contextmanager
from pathlib import Path
from types import MappingProxyType, ModuleType
from typing import (
    Any,
    Dict,
    Generator,
    Optional,
    Sequence,
    Tuple,
    TypeVar,
    Union,
)

import exchange_calendars as xcal
import fsspec
import pandas as pd
from fsspec import AbstractFileSystem

# noinspection PyProtectedMember
from pandas._typing import IntervalClosedType
from pyarrow import dataset as ds

from vastdb.bench.perf_bench.common.constants import (
    DEFAULT_ACCESS_KEY,
    DEFAULT_END_T,
    DEFAULT_ROW_GROUP_SIZE,
    DEFAULT_S3_HOST,
    DEFAULT_SECRET_KEY,
    DEFAULT_START_T,
    DFAULT_PARQUET_COMPRESSION,
    VASTDB_ENDPOINT,
    ParquetCompression,
)
from vastdb.bench.perf_bench.common.log_utils import get_logger
from vastdb.bench.perf_bench.common.types import DateLikeT, PathLikeT
from vastdb.session import Session

_FS_MAP = MappingProxyType({"/mnt": "nfs", "/": "fs", "s3://": "s3", "": "vastdb"})


def config_ipython():
    try:
        # noinspection PyUnresolvedReferences
        _ = get_ipython
        pd.set_option("display.float_format", lambda x: "%.6f" % x)
        pd.set_option("display.max_rows", 50)
        pd.set_option("display.max_columns", 50)
        pd.set_option("display.width", 1000)
    except NameError:
        ...


# If in interactive mode, make it pretty
config_ipython()


def getenv_flag(name: str, default: bool = False) -> bool:
    return os.getenv(name, str(default)).lower() in ("1", "true", "yes")


def to_ts(d: Optional[DateLikeT], normalize: bool = True) -> Optional[pd.Timestamp]:
    if not d:
        return None
    ret_d: pd.Timestamp = d if isinstance(d, pd.Timestamp) else pd.Timestamp(d)
    return ret_d.normalize() if normalize else ret_d


def _default_s3_kwargs(
    aws_access_key_id: str = DEFAULT_ACCESS_KEY,
    aws_secret_access_key: str = DEFAULT_SECRET_KEY,
    host: str = DEFAULT_S3_HOST,
    config: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> Dict[str, Any]:
    return {
        "endpoint_url": f"http://{host}",
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
        "use_ssl": False,
        "verify": False,
        "config": {
            "signature_version": "s3v4",
            "s3": {"addressing_style": "path"},
            **(config or {}),
        },
        "region_name": "us-east-1",
        **kwargs,
    }


def get_vastdb_session(
    access: str = DEFAULT_ACCESS_KEY,
    secret: str = DEFAULT_SECRET_KEY,
    vastdb_endpoint: str = VASTDB_ENDPOINT,
    ssl_verify: bool = True,
) -> Session:
    return Session(
        endpoint=vastdb_endpoint, access=access, secret=secret, ssl_verify=ssl_verify
    )


def get_filesystem(
    path: PathLikeT,
    botocore_client_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> Tuple[str, AbstractFileSystem]:
    path = str(path)
    if path.startswith("s3://"):
        botocore_client_kwargs = _default_s3_kwargs(
            **(botocore_client_kwargs or {}),
            config=kwargs.pop("config_kwargs", None),
        )
        fs = fsspec.filesystem(
            protocol="s3",
            client_kwargs=botocore_client_kwargs,
            **kwargs,
        )
        path = path[5:]  # remove the s3:// prefix
    else:
        fs = fsspec.filesystem(protocol="file", **kwargs)
    return path, fs


# noinspection PyShadowingBuiltins
def get_parquet_dataset(
    path: PathLikeT,
    filesystem: Optional[AbstractFileSystem] = None,
    fs_kwargs: Optional[Dict[str, Any]] = None,
    format: Optional[str] = "parquet",  # noqa: A002
    partitioning: Optional[str] = "hive",
    **kwargs,
) -> ds.Dataset:
    path, fs = filesystem or get_filesystem(path=path, **(fs_kwargs or {}))
    return ds.dataset(
        source=path,
        format=format,
        partitioning=partitioning,
        filesystem=fs,
        **kwargs,
    )


def get_parquet_dataset_root(
    base_dir: PathLikeT,
    row_group_size: int = DEFAULT_ROW_GROUP_SIZE,
    compression: Union[str, ParquetCompression] = DFAULT_PARQUET_COMPRESSION,
) -> Path:
    return Path(str(base_dir)) / f"rg_{row_group_size!s}_c_{ParquetCompression[compression]}"


@contextmanager
def time_me(logger: Optional[logging.Logger] = None):
    logger = logger or get_logger(__name__)
    start_t = time.time_ns()
    yield  # Yield control back to the context block
    total_t_sec = (time.time_ns() - start_t) / 1e9
    logger.debug(f"Execution time: {total_t_sec:.2f} s")


S = TypeVar("S", bound=Sequence)


def chunk_sequence(input_seq: S, chunk_size: int) -> Generator[Sequence, None, None]:
    for i in range(0, len(input_seq), chunk_size):
        yield input_seq[i: i + chunk_size]


def get_dates_range(
    from_t: Optional[DateLikeT] = DEFAULT_START_T,
    to_t: Optional[DateLikeT] = DEFAULT_END_T,
    only_bdays: bool = True,
    inclusive: IntervalClosedType = "both",
) -> pd.DatetimeIndex:
    fun = pd.bdate_range if only_bdays else pd.date_range
    return fun(  # type: ignore[operator]
        start=to_ts(from_t),
        end=to_ts(to_t),
        inclusive=inclusive,
    )


def infer_fs_type(path: str) -> str:
    return next(k for m, k in _FS_MAP.items() if str(path).lower().startswith(m))


def get_session_minutes(
    date: DateLikeT,
    exchange: str = "XNYS",
) -> pd.DatetimeIndex:
    ecal = xcal.get_calendar(exchange)
    date = pd.Timestamp(date).normalize()
    sess = ecal.date_to_session(date=date, direction="next")
    sess_minutes = ecal.session_minutes(sess)
    return sess_minutes


def load_module_from_path(module_file_path: PathLikeT) -> ModuleType:
    file_path = Path(str(module_file_path)).resolve()
    module_name = file_path.stem

    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None:
        raise ImportError(f"Cannot find module {module_name} at {file_path}")

    module = importlib.util.module_from_spec(spec)
    loader = spec.loader
    if loader is None or not hasattr(loader, "exec_module"):
        raise ImportError(f"Cannot load module {module_name} from {file_path}")

    loader.exec_module(module)
    sys.modules[module_name] = module

    return module
vastdb/bench/perf_bench/dataset/generate_secmaster.py
@@ -0,0 +1,105 @@
from pathlib import Path
from typing import NamedTuple, TypedDict

import requests

from vastdb.bench.perf_bench.common.log_utils import get_logger

_MY_DIR = Path(__file__).parent
SM_PATH = _MY_DIR / "secmaster.py"

LOG = get_logger(__name__)


class NasdaqRawRecord(TypedDict):
    symbol: str
    name: str
    lastsale: str
    netchange: str
    pctchange: str
    marketCap: str
    url: str


class NasdaqRecord(NamedTuple):
    symbol: str
    id: int
    last_sale: float

    @staticmethod
    def id_from_ticker(ticker: str) -> int:
        base = 100
        max_width = 5
        offset = ord(" ")
        if len(ticker := ticker.strip()) > max_width:
            raise ValueError(f"Ticker too long: {ticker}")
        ticker = ticker.rjust(max_width, " ")
        return sum(
            (ord(c) - offset) * base ** (len(ticker) - i)
            for i, c in enumerate(ticker.upper())
        )

    @classmethod
    def from_raw_dict(cls, raw_dict: NasdaqRawRecord) -> "NasdaqRecord":
        return cls(
            symbol=(sym := raw_dict["symbol"].strip().upper()),
            id=cls.id_from_ticker(sym),
            last_sale=float(raw_dict["lastsale"].replace("$", "").replace(",", "")),
        )


def generate_secmaster():
    resp = requests.get(
        "http://api.nasdaq.com/api/screener/stocks?limit=3000",
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
    )
    resp.raise_for_status()
    resp_j = resp.json()
    records = [
        NasdaqRecord.from_raw_dict(r)
        for r in resp_j["data"]["table"]["rows"]
        if r and r["symbol"].strip()
    ]
    _ticker_to_sid = {r.symbol: r.id for r in records}
    _sid_to_ticker = {r.id: r.symbol for r in records}
    _indicative_px = {r.id: r.last_sale for r in records}

    uni = sorted(_ticker_to_sid)
    secm_file_contents = f"""
_ticker_to_sid = {_ticker_to_sid}


_sid_to_ticker = {_sid_to_ticker}


_indicative_px = {_indicative_px}


def to_sid(ticker: str) -> int:
    return _ticker_to_sid[ticker]


def to_ticker(sid: int) -> str:
    return _sid_to_ticker[sid]


def get_indicative_px(sid: int) -> float:
    return _indicative_px[sid]


UNI_SPEC = {{
    "Large": (large_uni := {uni}),
    "Single": large_uni[:1],
    "Tiny": (tiny_uni := large_uni[1::50]),
    "Small": (small_uni := large_uni[1::10]),
    "SmallSeq": large_uni[: len(small_uni)],
    "Medium": (med_uni := large_uni[1::6]),
    "MediumSeq": large_uni[: (len(med_uni))],
    "Medium2": (med2_uni := large_uni[1::8]),
    "Medium2Seq": large_uni[: (len(med2_uni))],
}}
"""
    with open(SM_PATH, "w") as f:
        f.write(secm_file_contents)

    LOG.info("Secmaster generated (total stocks: %d): %s", len(uni), SM_PATH)
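Note that id_from_ticker reads the right-padded, upper-cased ticker (at most 5 characters) as a base-100 number whose digits are ord(c) - ord(" "), so each valid ticker maps to a unique sid. A hand-checked sketch of the arithmetic:

# "A".rjust(5) == "    A": the spaces contribute 0, and 'A' contributes
# (ord("A") - ord(" ")) * 100 ** 1 == 33 * 100.
assert NasdaqRecord.id_from_ticker("A") == 3300
# "AA".rjust(5) == "   AA": 33 * 100**2 + 33 * 100**1.
assert NasdaqRecord.id_from_ticker("AA") == 333300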
vastdb/bench/perf_bench/dataset/generate_stocks_dataset.py
@@ -0,0 +1,242 @@
import concurrent.futures
import datetime as dt
import os
import time
from pathlib import Path
from typing import Optional, Sequence, Union

import numpy as np
import pandas as pd

from vastdb.bench.perf_bench.common.constants import (
    DEFAULT_END_T,
    DEFAULT_ROW_GROUP_SIZE,
    DEFAULT_START_T,
    DFAULT_PARQUET_COMPRESSION,
    LOCAL_FS_DS_PATH,
    ParquetCompression,
)
from vastdb.bench.perf_bench.common.log_utils import get_logger
from vastdb.bench.perf_bench.common.types import DateLikeT, PathLikeT
from vastdb.bench.perf_bench.common.utils import (
    get_parquet_dataset_root,
    get_session_minutes,
)
from vastdb.bench.perf_bench.dataset import secmaster as sm
from vastdb.bench.perf_bench.dataset.schemas import (
    BF,
    StockBarsPandasSchema,
)

LOG = get_logger(__name__)

DEFAULT_NUM_WORKERS = max(1, int((os.cpu_count() or 1) * 0.4))
DEFAULT_DS_ROOT_PATH = LOCAL_FS_DS_PATH

ref_bars: Optional[pd.DataFrame] = None


def generate_synthetic_stock_1m_bars_day(
    date: Optional[DateLikeT] = None,
    exchange: str = "XNYS",
) -> pd.DataFrame:
    sess_minutes = get_session_minutes(date=date or DEFAULT_START_T, exchange=exchange)
    uni_tickers = sm.large_uni
    sids = [sm.to_sid(t) for t in sm.large_uni]
    ref_px = pd.Series(
        [sm.get_indicative_px(sid) for sid in sids],
        dtype=BF.trade_close.pd_type,
    )
    uni_sz = len(uni_tickers)

    def _build_minute(m: pd.Timestamp) -> pd.DataFrame:
        return pd.DataFrame(
            {
                BF.sid.name: sids,
                BF.ts.name: [m.astimezone("UTC").tz_localize(None)] * uni_sz,  # type: ignore[arg-type]
                BF.ticker.name: uni_tickers,
                # Ask
                BF.ask_open.name: (
                    ao := ref_px * np.random.uniform(0.999, 0.001, uni_sz)
                ),
                BF.ask_high.name: ao * np.random.uniform(1.0, 1.001, uni_sz),
                BF.ask_low.name: ao * np.random.uniform(0.999, 1.0, uni_sz),
                BF.ask_close.name: ao * np.random.uniform(0.999, 1.001, uni_sz),
                BF.ask_qty.name: np.random.randint(1, 10000, uni_sz),
                # Bid
                BF.bid_open.name: (
                    bo := ref_px * np.random.uniform(0.999, 0.001, uni_sz)
                ),
                BF.bid_high.name: bo * np.random.uniform(1.0, 1.001, uni_sz),
                BF.bid_low.name: bo * np.random.uniform(0.999, 1.0, uni_sz),
                BF.bid_close.name: bo * np.random.uniform(0.999, 1.001, uni_sz),
                BF.bid_qty.name: np.random.randint(1, 10000, uni_sz),
                # Trades
                BF.trade_open.name: (
                    to := ref_px * np.random.uniform(0.999, 0.001, uni_sz)
                ),
                BF.trade_high.name: to * np.random.uniform(1.0, 1.001, uni_sz),
                BF.trade_low.name: to * np.random.uniform(0.999, 1.0, uni_sz),
                BF.trade_close.name: to * np.random.uniform(0.999, 1.001, uni_sz),
                BF.trade_volume.name: np.random.randint(1, 10000, uni_sz),
                # Vwap
                BF.vwap.name: ref_px * np.random.uniform(0.999, 1.001, uni_sz),
            }
        )

    # noinspection PyUnreachableCode
    return (
        pd.concat([_build_minute(m) for m in sess_minutes])
        .astype(StockBarsPandasSchema)
        .sort_values([BF.sid.name, BF.ts.name])
        .reset_index(drop=True)
    )


def worker_init():
    global ref_bars  # noqa: PLW0603
    ref_bars = generate_synthetic_stock_1m_bars_day()


# noinspection DuplicatedCode
def build_bars(
    dates: Union[Sequence[dt.date], pd.DatetimeIndex],
    output_dir: PathLikeT = DEFAULT_DS_ROOT_PATH,
    row_group_size: int = DEFAULT_ROW_GROUP_SIZE,
    compression: Union[str, ParquetCompression] = DFAULT_PARQUET_COMPRESSION,
):
    global ref_bars  # noqa: PLW0602
    if ref_bars is None:
        raise ValueError("Reference bars not initialized")

    output_dir = Path(str(output_dir))
    compression = ParquetCompression[compression]
    output_dir = get_parquet_dataset_root(output_dir, row_group_size, compression)
    output_dir.mkdir(parents=True, exist_ok=True)

    LOG.info(
        f"Building bars from {dates[0]:%Y-%m-%d} to {dates[-1]:%Y-%m-%d}, saving to {output_dir}"
    )

    # noinspection DuplicatedCode
    def build_new_bar(new_date: dt.date) -> pd.DataFrame:
        new_bars = ref_bars
        schema = new_bars.dtypes
        excluded_cols = {BF.ticker.value, BF.sid.value}
        for c in new_bars:
            if c in excluded_cols:
                continue
            kind: str = schema[c].kind  # type: ignore[call-overload]
            if kind == "f":
                new_bars[c] = new_bars[c] * np.random.uniform(0.8, 1.2)
            elif kind == "i":
                new_bars[c] = (new_bars[c] * np.random.uniform(0.2, 5)).astype(
                    schema[c]  # type: ignore[call-overload]
                )
            elif kind == "M":
                new_bars[c] = new_date + (new_bars[c] - new_bars[c].dt.normalize())  # type: ignore[operator]
        return new_bars

    for d in dates:
        try:
            start_t = time.time()

            bars_df = build_new_bar(new_date=d)
            generated_time = time.time() - start_t

            fpth = (
                output_dir
                / f"year={d:%Y}"
                / f"date={d:%Y%m%d}"
                / f"{d:%Y%m%d}_equity_etf_1m_bars.pq"
            )
            fpth.parent.mkdir(parents=True, exist_ok=True)

            # noinspection PyTypeChecker
            bars_df.to_parquet(
                path=fpth,
                engine="pyarrow",
                compression=compression,
                index=False,
                row_group_size=row_group_size,
            )  # type: ignore[call-overload]
            LOG.info(
                f"Written [{row_group_size=} {compression=}]"
                f"[total={(time.time() - start_t):.2f}, generate={generated_time:.2f}) seconds]: "
                f"{fpth}"
            )
        except Exception as e:
            LOG.exception(f"FAILURE with date {d}: {e}")


def common(output_dir: PathLikeT = DEFAULT_DS_ROOT_PATH):
    output_dir = Path(str(output_dir))
    output_dir.mkdir(parents=True, exist_ok=True)

    global ref_bars  # noqa: PLW0602
    if ref_bars is None:
        worker_init()


def generate_synthetic_stock_1m_bars(
    from_t: DateLikeT = DEFAULT_START_T,
    to_t: DateLikeT = DEFAULT_END_T,
    output_dir: PathLikeT = DEFAULT_DS_ROOT_PATH,
    row_group_size: int = DEFAULT_ROW_GROUP_SIZE,
    compression: Union[str, ParquetCompression] = DFAULT_PARQUET_COMPRESSION,
):
    common(output_dir=output_dir)
    dates = pd.bdate_range(
        pd.Timestamp(from_t).normalize(), pd.Timestamp(to_t).normalize()
    )
    build_bars(
        dates,
        output_dir=output_dir,
        row_group_size=row_group_size,
        compression=compression,
    )


def generate_concurrent_synthetic_stock_1m_bars(
    from_t: DateLikeT = DEFAULT_START_T,
    to_t: DateLikeT = DEFAULT_END_T,
    output_dir: PathLikeT = DEFAULT_DS_ROOT_PATH,
    num_workers: int = DEFAULT_NUM_WORKERS,
    row_group_size: int = DEFAULT_ROW_GROUP_SIZE,
    compression: Union[str, ParquetCompression] = DFAULT_PARQUET_COMPRESSION,
):
    if (num_workers := max(1, int(num_workers))) == 1:
        return generate_synthetic_stock_1m_bars(
            from_t=from_t,
            to_t=to_t,
            output_dir=output_dir,
            row_group_size=row_group_size,
            compression=compression,
        )

    common(output_dir=output_dir)
    dates = pd.bdate_range(
        pd.Timestamp(from_t).normalize(), pd.Timestamp(to_t).normalize()
    )
    batch_size = int(len(dates) / num_workers) + 1
    batches = [dates[i: i + batch_size] for i in range(0, len(dates), batch_size)]

    with concurrent.futures.ProcessPoolExecutor(
        max_workers=num_workers,
        initializer=worker_init,
    ) as executor:
        # Prepare a list of futures
        futures = [
            executor.submit(
                build_bars,
                dates=batch_dates,
                output_dir=output_dir,
                row_group_size=row_group_size,
                compression=compression,
            )
            for batch_dates in batches
        ]

        # Iterate over futures to get their results
        for future in concurrent.futures.as_completed(futures):
            LOG.info(future.result())
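A sketch of a typical invocation, with assumed dates and output root (both placeholders). Each worker process builds its own reference day once via the worker_init initializer, then writes one randomly perturbed parquet file per business day under year=YYYY/date=YYYYMMDD/:

from vastdb.bench.perf_bench.dataset.generate_stocks_dataset import (
    generate_concurrent_synthetic_stock_1m_bars,
)

# Hypothetical range and path; row group size and compression keep their defaults.
generate_concurrent_synthetic_stock_1m_bars(
    from_t="2023-01-02",
    to_t="2023-03-31",
    output_dir="/tmp/bars_ds",
    num_workers=4,
)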
vastdb/bench/perf_bench/dataset/schemas.py
@@ -0,0 +1,101 @@
from functools import lru_cache
from types import MappingProxyType
from typing import Dict, Mapping, Union

import numpy as np
import pyarrow as pa
from pandas.core.dtypes.base import ExtensionDtype

from vastdb.bench.perf_bench.common.types import StrEnum

PandasDTypeT = Union[ExtensionDtype, np.dtype]


class StockBarField(StrEnum):
    sid = "sid"
    ts = "ts"
    ticker = "ticker"

    # Ask
    ask_open = "ask_open"
    ask_high = "ask_high"
    ask_low = "ask_low"
    ask_close = "ask_close"
    ask_qty = "ask_qty"

    # Bid
    bid_open = "bid_open"
    bid_high = "bid_high"
    bid_low = "bid_low"
    bid_close = "bid_close"
    bid_qty = "bid_qty"

    # Trades
    trade_open = "trade_open"
    trade_high = "trade_high"
    trade_low = "trade_low"
    trade_close = "trade_close"
    trade_volume = "trade_volume"

    # VWAP
    vwap = "vwap"

    @property
    def pa_type(self) -> pa.DataType:
        return _get_field_pyarrow_types()[self]

    @property
    def pd_type(self) -> PandasDTypeT:
        return self.pa_type.to_pandas_dtype()


BF = StockBarField


@lru_cache
def _get_field_pyarrow_types() -> Mapping[str, pa.DataType]:
    return MappingProxyType(
        {
            BF.sid.value: pa.int64(),
            BF.ts.value: pa.timestamp(unit="ns"),
            BF.ticker.value: pa.string(),
            BF.ask_open.value: pa.float64(),
            BF.ask_high.value: pa.float64(),
            BF.ask_low.value: pa.float64(),
            BF.ask_close.value: pa.float64(),
            BF.ask_qty.value: pa.int64(),
            BF.bid_open.value: pa.float64(),
            BF.bid_high.value: pa.float64(),
            BF.bid_low.value: pa.float64(),
            BF.bid_close.value: pa.float64(),
            BF.bid_qty.value: pa.int64(),
            BF.trade_open.value: pa.float64(),
            BF.trade_high.value: pa.float64(),
            BF.trade_low.value: pa.float64(),
            BF.trade_close.value: pa.float64(),
            BF.trade_volume.value: pa.int64(),
            BF.vwap.value: pa.float64(),
        }
    )


DEFAULT_BARS_COLUMNS = (
    BF.sid.value,
    BF.ts.value,
    BF.ask_open.value,
    BF.ask_close.value,
    BF.bid_open.value,
    BF.bid_close.value,
    BF.bid_qty.value,
)

# noinspection PyUnresolvedReferences
StockBarsArrowSchema: pa.Schema = pa.schema(
    (fld.value, fld.pa_type) for fld in StockBarField
)
# noinspection PyUnresolvedReferences
StockBarsPandasSchema: Dict[str, PandasDTypeT] = {
    fld.value: fld.pd_type for fld in StockBarField
}

BarsSortFields = (BF.sid.value, BF.ts.value)
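Because both schema objects are derived from the same StockBarField-to-pyarrow-type mapping, an Arrow table and a pandas frame built from them agree column-for-column; a minimal sketch:

# Zero-row Arrow table typed per the schema, round-tripped into a typed frame.
tbl = StockBarsArrowSchema.empty_table()
df = tbl.to_pandas().astype(StockBarsPandasSchema)
assert list(df.columns) == list(StockBarsArrowSchema.names)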