vastdb 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. vastdb/_internal.py +41 -10
  2. vastdb/bench/perf_bench/__init__.py +0 -0
  3. vastdb/bench/perf_bench/bench_repo/__init__.py +0 -0
  4. vastdb/bench/perf_bench/bench_repo/mega_combo.py +87 -0
  5. vastdb/bench/perf_bench/cli.py +225 -0
  6. vastdb/bench/perf_bench/common/__init__.py +0 -0
  7. vastdb/bench/perf_bench/common/constants.py +96 -0
  8. vastdb/bench/perf_bench/common/log_utils.py +67 -0
  9. vastdb/bench/perf_bench/common/types.py +34 -0
  10. vastdb/bench/perf_bench/common/utils.py +219 -0
  11. vastdb/bench/perf_bench/dataset/__init__.py +0 -0
  12. vastdb/bench/perf_bench/dataset/generate_secmaster.py +105 -0
  13. vastdb/bench/perf_bench/dataset/generate_stocks_dataset.py +242 -0
  14. vastdb/bench/perf_bench/dataset/schemas.py +101 -0
  15. vastdb/bench/perf_bench/dataset/secmaster.py +33 -0
  16. vastdb/bench/perf_bench/orchestrate/__init__.py +0 -0
  17. vastdb/bench/perf_bench/orchestrate/bench_spec.py +91 -0
  18. vastdb/bench/perf_bench/orchestrate/results_helpers.py +126 -0
  19. vastdb/bench/perf_bench/orchestrate/scenario.py +109 -0
  20. vastdb/bench/perf_bench/orchestrate/scenario_generator.py +144 -0
  21. vastdb/bench/perf_bench/query/__init__.py +0 -0
  22. vastdb/bench/perf_bench/query/arrow_common.py +59 -0
  23. vastdb/bench/perf_bench/query/query.py +42 -0
  24. vastdb/bench/perf_bench/query/query_pyarrow.py +70 -0
  25. vastdb/bench/perf_bench/query/query_vastdb.py +78 -0
  26. vastdb/bench/perf_bench/run.py +79 -0
  27. vastdb/bench/test_sample.py +4 -2
  28. vastdb/conftest.py +1 -1
  29. vastdb/session.py +0 -6
  30. vastdb/table.py +35 -35
  31. vastdb/tests/test_nested.py +58 -0
  32. vastdb/tests/test_tables.py +13 -0
  33. vastdb/transaction.py +4 -8
  34. vastdb/util.py +5 -0
  35. {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/METADATA +3 -4
  36. {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/RECORD +39 -14
  37. {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/WHEEL +1 -1
  38. {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/LICENSE +0 -0
  39. {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/top_level.txt +0 -0
vastdb/bench/perf_bench/common/utils.py
@@ -0,0 +1,219 @@
+ import importlib.util
+ import logging
+ import os
+ import sys
+ import time
+ from contextlib import contextmanager
+ from pathlib import Path
+ from types import MappingProxyType, ModuleType
+ from typing import (
+     Any,
+     Dict,
+     Generator,
+     Optional,
+     Sequence,
+     Tuple,
+     TypeVar,
+     Union,
+ )
+
+ import exchange_calendars as xcal
+ import fsspec
+ import pandas as pd
+ from fsspec import AbstractFileSystem
+
+ # noinspection PyProtectedMember
+ from pandas._typing import IntervalClosedType
+ from pyarrow import dataset as ds
+
+ from vastdb.bench.perf_bench.common.constants import (
+     DEFAULT_ACCESS_KEY,
+     DEFAULT_END_T,
+     DEFAULT_ROW_GROUP_SIZE,
+     DEFAULT_S3_HOST,
+     DEFAULT_SECRET_KEY,
+     DEFAULT_START_T,
+     DFAULT_PARQUET_COMPRESSION,
+     VASTDB_ENDPOINT,
+     ParquetCompression,
+ )
+ from vastdb.bench.perf_bench.common.log_utils import get_logger
+ from vastdb.bench.perf_bench.common.types import DateLikeT, PathLikeT
+ from vastdb.session import Session
+
+ _FS_MAP = MappingProxyType({"/mnt": "nfs", "/": "fs", "s3://": "s3", "": "vastdb"})
+
+
+ def config_ipython():
+     try:
+         # noinspection PyUnresolvedReferences
+         _ = get_ipython
+         pd.set_option("display.float_format", lambda x: "%.6f" % x)
+         pd.set_option("display.max_rows", 50)
+         pd.set_option("display.max_columns", 50)
+         pd.set_option("display.width", 1000)
+     except NameError:
+         ...
+
+
+ # If in interactive mode, make it pretty
+ config_ipython()
+
+
+ def getenv_flag(name: str, default: bool = False) -> bool:
+     return os.getenv(name, str(default)).lower() in ("1", "true", "yes")
+
+
+ def to_ts(d: Optional[DateLikeT], normalize: bool = True) -> Optional[pd.Timestamp]:
+     if not d:
+         return None
+     ret_d: pd.Timestamp = d if isinstance(d, pd.Timestamp) else pd.Timestamp(d)
+     return ret_d.normalize() if normalize else ret_d
+
+
+ def _default_s3_kwargs(
+     aws_access_key_id: str = DEFAULT_ACCESS_KEY,
+     aws_secret_access_key: str = DEFAULT_SECRET_KEY,
+     host: str = DEFAULT_S3_HOST,
+     config: Optional[Dict[str, Any]] = None,
+     **kwargs,
+ ) -> Dict[str, Any]:
+     return {
+         "endpoint_url": f"http://{host}",
+         "aws_access_key_id": aws_access_key_id,
+         "aws_secret_access_key": aws_secret_access_key,
+         "use_ssl": False,
+         "verify": False,
+         "config": {
+             "signature_version": "s3v4",
+             "s3": {"addressing_style": "path"},
+             **(config or {}),
+         },
+         "region_name": "us-east-1",
+         **kwargs,
+     }
+
+
+ def get_vastdb_session(
+     access: str = DEFAULT_ACCESS_KEY,
+     secret: str = DEFAULT_SECRET_KEY,
+     vastdb_endpoint: str = VASTDB_ENDPOINT,
+     ssl_verify: bool = True,
+ ) -> Session:
+     return Session(
+         endpoint=vastdb_endpoint, access=access, secret=secret, ssl_verify=ssl_verify
+     )
+
+
+ def get_filesystem(
+     path: PathLikeT,
+     botocore_client_kwargs: Optional[Dict[str, Any]] = None,
+     **kwargs,
+ ) -> Tuple[str, AbstractFileSystem]:
+     path = str(path)
+     if path.startswith("s3://"):
+         botocore_client_kwargs = _default_s3_kwargs(
+             **(botocore_client_kwargs or {}),
+             config=kwargs.pop("config_kwargs", None),
+         )
+         fs = fsspec.filesystem(
+             protocol="s3",
+             client_kwargs=botocore_client_kwargs,
+             **kwargs,
+         )
+         path = path[5:]  # remove the s3:// prefix
+     else:
+         fs = fsspec.filesystem(protocol="file", **kwargs)
+     return path, fs
+
+
+ # noinspection PyShadowingBuiltins
+ def get_parquet_dataset(
+     path: PathLikeT,
+     filesystem: Optional[AbstractFileSystem] = None,
+     fs_kwargs: Optional[Dict[str, Any]] = None,
+     format: Optional[str] = "parquet",  # noqa: A002
+     partitioning: Optional[str] = "hive",
+     **kwargs,
+ ) -> ds.Dataset:
+     path, fs = (str(path), filesystem) if filesystem else get_filesystem(path=path, **(fs_kwargs or {}))
+     return ds.dataset(
+         source=path,
+         format=format,
+         partitioning=partitioning,
+         filesystem=fs,
+         **kwargs,
+     )
+
+
+ def get_parquet_dataset_root(
+     base_dir: PathLikeT,
+     row_group_size: int = DEFAULT_ROW_GROUP_SIZE,
+     compression: Union[str, ParquetCompression] = DFAULT_PARQUET_COMPRESSION,
+ ) -> Path:
+     return Path(str(base_dir)) / f"rg_{row_group_size!s}_c_{ParquetCompression[compression]}"
+
+
+ @contextmanager
+ def time_me(logger: Optional[logging.Logger] = None):
+     logger = logger or get_logger(__name__)
+     start_t = time.time_ns()
+     yield  # Yield control back to the context block
+     total_t_sec = (time.time_ns() - start_t) / 1e9
+     logger.debug(f"Execution time: {total_t_sec:.2f} s")
+
+
+ S = TypeVar("S", bound=Sequence)
+
+
+ def chunk_sequence(input_seq: S, chunk_size: int) -> Generator[Sequence, None, None]:
+     for i in range(0, len(input_seq), chunk_size):
+         yield input_seq[i: i + chunk_size]
+
+
+ def get_dates_range(
+     from_t: Optional[DateLikeT] = DEFAULT_START_T,
+     to_t: Optional[DateLikeT] = DEFAULT_END_T,
+     only_bdays: bool = True,
+     inclusive: IntervalClosedType = "both",
+ ) -> pd.DatetimeIndex:
+     fun = pd.bdate_range if only_bdays else pd.date_range
+     return fun(  # type: ignore[operator]
+         start=to_ts(from_t),
+         end=to_ts(to_t),
+         inclusive=inclusive,
+     )
+
+
+ def infer_fs_type(path: str) -> str:
+     return next(k for m, k in _FS_MAP.items() if str(path).lower().startswith(m))
+
+
+ def get_session_minutes(
+     date: DateLikeT,
+     exchange: str = "XNYS",
+ ) -> pd.DatetimeIndex:
+     ecal = xcal.get_calendar(exchange)
+     date = pd.Timestamp(date).normalize()
+     sess = ecal.date_to_session(date=date, direction="next")
+     sess_minutes = ecal.session_minutes(sess)
+     return sess_minutes
+
+
+ def load_module_from_path(module_file_path: PathLikeT) -> ModuleType:
+     file_path = Path(str(module_file_path)).resolve()
+     module_name = file_path.stem
+
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot find module {module_name} at {file_path}")
+
+     module = importlib.util.module_from_spec(spec)
+     loader = spec.loader
+     if loader is None or not hasattr(loader, "exec_module"):
+         raise ImportError(f"Cannot load module {module_name} from {file_path}")
+
+     loader.exec_module(module)
+     sys.modules[module_name] = module
+
+     return module
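A quick orientation sketch for the helpers above (illustrative and standalone; it assumes only this wheel plus pandas, and that the root logger is set to DEBUG, since time_me logs at that level; get_logger may attach its own handlers):

    import logging

    from vastdb.bench.perf_bench.common.utils import (
        chunk_sequence,
        get_dates_range,
        infer_fs_type,
        time_me,
    )

    logging.basicConfig(level=logging.DEBUG)  # time_me logs at DEBUG

    # 22 business days in Jan 2023, chunked into batches of at most 5.
    dates = get_dates_range(from_t="2023-01-02", to_t="2023-01-31")
    with time_me():  # logs "Execution time: ... s" when the block exits
        batches = list(chunk_sequence(dates, chunk_size=5))

    print(len(batches))                      # 5 (four full chunks + one of 2)
    print(infer_fs_type("s3://bucket/key"))  # "s3" (first matching _FS_MAP prefix)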
vastdb/bench/perf_bench/dataset/__init__.py (file without changes)
vastdb/bench/perf_bench/dataset/generate_secmaster.py
@@ -0,0 +1,105 @@
+ from pathlib import Path
+ from typing import NamedTuple, TypedDict
+
+ import requests
+
+ from vastdb.bench.perf_bench.common.log_utils import get_logger
+
+ _MY_DIR = Path(__file__).parent
+ SM_PATH = _MY_DIR / "secmaster.py"
+
+ LOG = get_logger(__name__)
+
+
+ class NasdaqRawRecord(TypedDict):
+     symbol: str
+     name: str
+     lastsale: str
+     netchange: str
+     pctchange: str
+     marketCap: str
+     url: str
+
+
+ class NasdaqRecord(NamedTuple):
+     symbol: str
+     id: int
+     last_sale: float
+
+     @staticmethod
+     def id_from_ticker(ticker: str) -> int:
+         base = 100
+         max_width = 5
+         offset = ord(" ")
+         if len(ticker := ticker.strip()) > max_width:
+             raise ValueError(f"Ticker too long: {ticker}")
+         ticker = ticker.rjust(max_width, " ")
+         return sum(
+             (ord(c) - offset) * base ** (len(ticker) - i)
+             for i, c in enumerate(ticker.upper())
+         )
+
+     @classmethod
+     def from_raw_dict(cls, raw_dict: NasdaqRawRecord) -> "NasdaqRecord":
+         return cls(
+             symbol=(sym := raw_dict["symbol"].strip().upper()),
+             id=cls.id_from_ticker(sym),
+             last_sale=float(raw_dict["lastsale"].replace("$", "").replace(",", "")),
+         )
+
+
+ def generate_secmaster():
+     resp = requests.get(
+         "http://api.nasdaq.com/api/screener/stocks?limit=3000",
+         headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
+     )
+     resp.raise_for_status()
+     resp_j = resp.json()
+     records = [
+         NasdaqRecord.from_raw_dict(r)
+         for r in resp_j["data"]["table"]["rows"]
+         if r and r["symbol"].strip()
+     ]
+     _ticker_to_sid = {r.symbol: r.id for r in records}
+     _sid_to_ticker = {r.id: r.symbol for r in records}
+     _indicative_px = {r.id: r.last_sale for r in records}
+
+     uni = sorted(_ticker_to_sid)
+     secm_file_contents = f"""
+ _ticker_to_sid = {_ticker_to_sid}
+
+
+ _sid_to_ticker = {_sid_to_ticker}
+
+
+ _indicative_px = {_indicative_px}
+
+
+ def to_sid(ticker: str) -> int:
+     return _ticker_to_sid[ticker]
+
+
+ def to_ticker(sid: int) -> str:
+     return _sid_to_ticker[sid]
+
+
+ def get_indicative_px(sid: int) -> float:
+     return _indicative_px[sid]
+
+
+ UNI_SPEC = {{
+     "Large": (large_uni := {uni}),
+     "Single": large_uni[:1],
+     "Tiny": (tiny_uni := large_uni[1::50]),
+     "Small": (small_uni := large_uni[1::10]),
+     "SmallSeq": large_uni[: len(small_uni)],
+     "Medium": (med_uni := large_uni[1::6]),
+     "MediumSeq": large_uni[: (len(med_uni))],
+     "Medium2": (med2_uni := large_uni[1::8]),
+     "Medium2Seq": large_uni[: (len(med2_uni))],
+ }}
+ """
+     with open(SM_PATH, "w") as f:
+         f.write(secm_file_contents)
+
+     LOG.info("Secmaster generated (total stocks: %d): %s", len(uni), SM_PATH)
vastdb/bench/perf_bench/dataset/generate_stocks_dataset.py
@@ -0,0 +1,242 @@
+ import concurrent.futures
+ import datetime as dt
+ import os
+ import time
+ from pathlib import Path
+ from typing import Optional, Sequence, Union
+
+ import numpy as np
+ import pandas as pd
+
+ from vastdb.bench.perf_bench.common.constants import (
+     DEFAULT_END_T,
+     DEFAULT_ROW_GROUP_SIZE,
+     DEFAULT_START_T,
+     DFAULT_PARQUET_COMPRESSION,
+     LOCAL_FS_DS_PATH,
+     ParquetCompression,
+ )
+ from vastdb.bench.perf_bench.common.log_utils import get_logger
+ from vastdb.bench.perf_bench.common.types import DateLikeT, PathLikeT
+ from vastdb.bench.perf_bench.common.utils import (
+     get_parquet_dataset_root,
+     get_session_minutes,
+ )
+ from vastdb.bench.perf_bench.dataset import secmaster as sm
+ from vastdb.bench.perf_bench.dataset.schemas import (
+     BF,
+     StockBarsPandasSchema,
+ )
+
+ LOG = get_logger(__name__)
+
+ DEFAULT_NUM_WORKERS = max(1, int((os.cpu_count() or 1) * 0.4))
+ DEFAULT_DS_ROOT_PATH = LOCAL_FS_DS_PATH
+
+ ref_bars: Optional[pd.DataFrame] = None
+
+
+ def generate_synthetic_stock_1m_bars_day(
+     date: Optional[DateLikeT] = None,
+     exchange: str = "XNYS",
+ ) -> pd.DataFrame:
+     sess_minutes = get_session_minutes(date=date or DEFAULT_START_T, exchange=exchange)
+     uni_tickers = sm.large_uni
+     sids = [sm.to_sid(t) for t in sm.large_uni]
+     ref_px = pd.Series(
+         [sm.get_indicative_px(sid) for sid in sids],
+         dtype=BF.trade_close.pd_type,
+     )
+     uni_sz = len(uni_tickers)
+
+     def _build_minute(m: pd.Timestamp) -> pd.DataFrame:
+         return pd.DataFrame(
+             {
+                 BF.sid.name: sids,
+                 BF.ts.name: [m.astimezone("UTC").tz_localize(None)] * uni_sz,  # type: ignore[arg-type]
+                 BF.ticker.name: uni_tickers,
+                 # Ask
+                 BF.ask_open.name: (
+                     ao := ref_px * np.random.uniform(0.999, 1.001, uni_sz)
+                 ),
+                 BF.ask_high.name: ao * np.random.uniform(1.0, 1.001, uni_sz),
+                 BF.ask_low.name: ao * np.random.uniform(0.999, 1.0, uni_sz),
+                 BF.ask_close.name: ao * np.random.uniform(0.999, 1.001, uni_sz),
+                 BF.ask_qty.name: np.random.randint(1, 10000, uni_sz),
+                 # Bid
+                 BF.bid_open.name: (
+                     bo := ref_px * np.random.uniform(0.999, 1.001, uni_sz)
+                 ),
+                 BF.bid_high.name: bo * np.random.uniform(1.0, 1.001, uni_sz),
+                 BF.bid_low.name: bo * np.random.uniform(0.999, 1.0, uni_sz),
+                 BF.bid_close.name: bo * np.random.uniform(0.999, 1.001, uni_sz),
+                 BF.bid_qty.name: np.random.randint(1, 10000, uni_sz),
+                 # Trades
+                 BF.trade_open.name: (
+                     to := ref_px * np.random.uniform(0.999, 1.001, uni_sz)
+                 ),
+                 BF.trade_high.name: to * np.random.uniform(1.0, 1.001, uni_sz),
+                 BF.trade_low.name: to * np.random.uniform(0.999, 1.0, uni_sz),
+                 BF.trade_close.name: to * np.random.uniform(0.999, 1.001, uni_sz),
+                 BF.trade_volume.name: np.random.randint(1, 10000, uni_sz),
+                 # Vwap
+                 BF.vwap.name: ref_px * np.random.uniform(0.999, 1.001, uni_sz),
+             }
+         )
+
+     # noinspection PyUnreachableCode
+     return (
+         pd.concat([_build_minute(m) for m in sess_minutes])
+         .astype(StockBarsPandasSchema)
+         .sort_values([BF.sid.name, BF.ts.name])
+         .reset_index(drop=True)
+     )
+
+
+ def worker_init():
+     global ref_bars  # noqa: PLW0603
+     ref_bars = generate_synthetic_stock_1m_bars_day()
+
+
+ # noinspection DuplicatedCode
+ def build_bars(
+     dates: Union[Sequence[dt.date], pd.DatetimeIndex],
+     output_dir: PathLikeT = DEFAULT_DS_ROOT_PATH,
+     row_group_size: int = DEFAULT_ROW_GROUP_SIZE,
+     compression: Union[str, ParquetCompression] = DFAULT_PARQUET_COMPRESSION,
+ ):
+     global ref_bars  # noqa: PLW0602
+     if ref_bars is None:
+         raise ValueError("Reference bars not initialized")
+
+     output_dir = Path(str(output_dir))
+     compression = ParquetCompression[compression]
+     output_dir = get_parquet_dataset_root(output_dir, row_group_size, compression)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     LOG.info(
+         f"Building bars from {dates[0]:%Y-%m-%d} to {dates[-1]:%Y-%m-%d}, saving to {output_dir}"
+     )
+
+     # noinspection DuplicatedCode
+     def build_new_bar(new_date: dt.date) -> pd.DataFrame:
+         new_bars = ref_bars  # mutated in place below, so values drift from day to day
+         schema = new_bars.dtypes
+         excluded_cols = {BF.ticker.value, BF.sid.value}
+         for c in new_bars:
+             if c in excluded_cols:
+                 continue
+             kind: str = schema[c].kind  # type: ignore[call-overload]
+             if kind == "f":
+                 new_bars[c] = new_bars[c] * np.random.uniform(0.8, 1.2)
+             elif kind == "i":
+                 new_bars[c] = (new_bars[c] * np.random.uniform(0.2, 5)).astype(
+                     schema[c]  # type: ignore[call-overload]
+                 )
+             elif kind == "M":
+                 new_bars[c] = new_date + (new_bars[c] - new_bars[c].dt.normalize())  # type: ignore[operator]
+         return new_bars
+
+     for d in dates:
+         try:
+             start_t = time.time()
+
+             bars_df = build_new_bar(new_date=d)
+             generated_time = time.time() - start_t
+
+             fpth = (
+                 output_dir
+                 / f"year={d:%Y}"
+                 / f"date={d:%Y%m%d}"
+                 / f"{d:%Y%m%d}_equity_etf_1m_bars.pq"
+             )
+             fpth.parent.mkdir(parents=True, exist_ok=True)
+
+             # noinspection PyTypeChecker
+             bars_df.to_parquet(
+                 path=fpth,
+                 engine="pyarrow",
+                 compression=compression,
+                 index=False,
+                 row_group_size=row_group_size,
+             )  # type: ignore[call-overload]
+             LOG.info(
+                 f"Written [{row_group_size=} {compression=}]"
+                 f"[total={(time.time() - start_t):.2f}, generate={generated_time:.2f} seconds]: "
+                 f"{fpth}"
+             )
+         except Exception as e:
+             LOG.exception(f"FAILURE with date {d}: {e}")
+
+
+ def common(output_dir: PathLikeT = DEFAULT_DS_ROOT_PATH):
+     output_dir = Path(str(output_dir))
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     global ref_bars  # noqa: PLW0602
+     if ref_bars is None:
+         worker_init()
+
+
+ def generate_synthetic_stock_1m_bars(
+     from_t: DateLikeT = DEFAULT_START_T,
+     to_t: DateLikeT = DEFAULT_END_T,
+     output_dir: PathLikeT = DEFAULT_DS_ROOT_PATH,
+     row_group_size: int = DEFAULT_ROW_GROUP_SIZE,
+     compression: Union[str, ParquetCompression] = DFAULT_PARQUET_COMPRESSION,
+ ):
+     common(output_dir=output_dir)
+     dates = pd.bdate_range(
+         pd.Timestamp(from_t).normalize(), pd.Timestamp(to_t).normalize()
+     )
+     build_bars(
+         dates,
+         output_dir=output_dir,
+         row_group_size=row_group_size,
+         compression=compression,
+     )
+
+
+ def generate_concurrent_synthetic_stock_1m_bars(
+     from_t: DateLikeT = DEFAULT_START_T,
+     to_t: DateLikeT = DEFAULT_END_T,
+     output_dir: PathLikeT = DEFAULT_DS_ROOT_PATH,
+     num_workers: int = DEFAULT_NUM_WORKERS,
+     row_group_size: int = DEFAULT_ROW_GROUP_SIZE,
+     compression: Union[str, ParquetCompression] = DFAULT_PARQUET_COMPRESSION,
+ ):
+     if (num_workers := max(1, int(num_workers))) == 1:
+         return generate_synthetic_stock_1m_bars(
+             from_t=from_t,
+             to_t=to_t,
+             output_dir=output_dir,
+             row_group_size=row_group_size,
+             compression=compression,
+         )
+
+     common(output_dir=output_dir)
+     dates = pd.bdate_range(
+         pd.Timestamp(from_t).normalize(), pd.Timestamp(to_t).normalize()
+     )
+     batch_size = int(len(dates) / num_workers) + 1
+     batches = [dates[i: i + batch_size] for i in range(0, len(dates), batch_size)]
+
+     with concurrent.futures.ProcessPoolExecutor(
+         max_workers=num_workers,
+         initializer=worker_init,
+     ) as executor:
+         # Prepare a list of futures
+         futures = [
+             executor.submit(
+                 build_bars,
+                 dates=batch_dates,
+                 output_dir=output_dir,
+                 row_group_size=row_group_size,
+                 compression=compression,
+             )
+             for batch_dates in batches
+         ]
+
+         # Iterate over futures to get their results
+         for future in concurrent.futures.as_completed(futures):
+             LOG.info(future.result())
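End to end, the generator writes one parquet file per business day under a hive-partitioned tree, rg_<row_group_size>_c_<compression>/year=YYYY/date=YYYYMMDD/YYYYMMDD_equity_etf_1m_bars.pq (see get_parquet_dataset_root and build_bars above). A minimal invocation sketch; the dates and output path are illustrative, and generate_secmaster() must have been run first so that secmaster.py exists:

    from vastdb.bench.perf_bench.dataset.generate_stocks_dataset import (
        generate_concurrent_synthetic_stock_1m_bars,
    )

    if __name__ == "__main__":  # guard needed: workers come from ProcessPoolExecutor
        generate_concurrent_synthetic_stock_1m_bars(
            from_t="2023-01-02",           # illustrative; defaults live in constants.py
            to_t="2023-01-31",
            output_dir="/tmp/bench_bars",  # illustrative path
            num_workers=4,                 # num_workers=1 falls back to the serial path
        )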
vastdb/bench/perf_bench/dataset/schemas.py
@@ -0,0 +1,101 @@
+ from functools import lru_cache
+ from types import MappingProxyType
+ from typing import Dict, Mapping, Union
+
+ import numpy as np
+ import pyarrow as pa
+ from pandas.core.dtypes.base import ExtensionDtype
+
+ from vastdb.bench.perf_bench.common.types import StrEnum
+
+ PandasDTypeT = Union[ExtensionDtype, np.dtype]
+
+
+ class StockBarField(StrEnum):
+     sid = "sid"
+     ts = "ts"
+     ticker = "ticker"
+
+     # Ask
+     ask_open = "ask_open"
+     ask_high = "ask_high"
+     ask_low = "ask_low"
+     ask_close = "ask_close"
+     ask_qty = "ask_qty"
+
+     # Bid
+     bid_open = "bid_open"
+     bid_high = "bid_high"
+     bid_low = "bid_low"
+     bid_close = "bid_close"
+     bid_qty = "bid_qty"
+
+     # Trades
+     trade_open = "trade_open"
+     trade_high = "trade_high"
+     trade_low = "trade_low"
+     trade_close = "trade_close"
+     trade_volume = "trade_volume"
+
+     # VWAP
+     vwap = "vwap"
+
+     @property
+     def pa_type(self) -> pa.DataType:
+         return _get_field_pyarrow_types()[self]
+
+     @property
+     def pd_type(self) -> PandasDTypeT:
+         return self.pa_type.to_pandas_dtype()
+
+
+ BF = StockBarField
+
+
+ @lru_cache
+ def _get_field_pyarrow_types() -> Mapping[str, pa.DataType]:
+     return MappingProxyType(
+         {
+             BF.sid.value: pa.int64(),
+             BF.ts.value: pa.timestamp(unit="ns"),
+             BF.ticker.value: pa.string(),
+             BF.ask_open.value: pa.float64(),
+             BF.ask_high.value: pa.float64(),
+             BF.ask_low.value: pa.float64(),
+             BF.ask_close.value: pa.float64(),
+             BF.ask_qty.value: pa.int64(),
+             BF.bid_open.value: pa.float64(),
+             BF.bid_high.value: pa.float64(),
+             BF.bid_low.value: pa.float64(),
+             BF.bid_close.value: pa.float64(),
+             BF.bid_qty.value: pa.int64(),
+             BF.trade_open.value: pa.float64(),
+             BF.trade_high.value: pa.float64(),
+             BF.trade_low.value: pa.float64(),
+             BF.trade_close.value: pa.float64(),
+             BF.trade_volume.value: pa.int64(),
+             BF.vwap.value: pa.float64(),
+         }
+     )
+
+
+ DEFAULT_BARS_COLUMNS = (
+     BF.sid.value,
+     BF.ts.value,
+     BF.ask_open.value,
+     BF.ask_close.value,
+     BF.bid_open.value,
+     BF.bid_close.value,
+     BF.bid_qty.value,
+ )
+
+ # noinspection PyUnresolvedReferences
+ StockBarsArrowSchema: pa.Schema = pa.schema(
+     (fld.value, fld.pa_type) for fld in StockBarField
+ )
+ # noinspection PyUnresolvedReferences
+ StockBarsPandasSchema: Dict[str, PandasDTypeT] = {
+     fld.value: fld.pd_type for fld in StockBarField
+ }
+
+ BarsSortFields = (BF.sid.value, BF.ts.value)
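For reference, the two module-level schema views describe the same 19 columns, one for Arrow and one for pandas astype. A small sketch (assumes only pyarrow and this wheel; the empty-table construction is illustrative):

    import pyarrow as pa

    from vastdb.bench.perf_bench.dataset.schemas import (
        BF,
        DEFAULT_BARS_COLUMNS,
        StockBarsArrowSchema,
        StockBarsPandasSchema,
    )

    # Same field set in both views.
    assert set(StockBarsArrowSchema.names) == set(StockBarsPandasSchema)

    # Empty table with the full schema, narrowed to the 7 default benchmark
    # columns (sid, ts, ask/bid open and close, bid_qty).
    empty = pa.Table.from_pylist([], schema=StockBarsArrowSchema)
    subset = empty.select(list(DEFAULT_BARS_COLUMNS))
    assert subset.column_names[0] == BF.sid.value  # "sid"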