vastdb 1.0.0-py3-none-any.whl → 1.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vastdb/_internal.py +41 -10
- vastdb/bench/perf_bench/__init__.py +0 -0
- vastdb/bench/perf_bench/bench_repo/__init__.py +0 -0
- vastdb/bench/perf_bench/bench_repo/mega_combo.py +87 -0
- vastdb/bench/perf_bench/cli.py +225 -0
- vastdb/bench/perf_bench/common/__init__.py +0 -0
- vastdb/bench/perf_bench/common/constants.py +96 -0
- vastdb/bench/perf_bench/common/log_utils.py +67 -0
- vastdb/bench/perf_bench/common/types.py +34 -0
- vastdb/bench/perf_bench/common/utils.py +219 -0
- vastdb/bench/perf_bench/dataset/__init__.py +0 -0
- vastdb/bench/perf_bench/dataset/generate_secmaster.py +105 -0
- vastdb/bench/perf_bench/dataset/generate_stocks_dataset.py +242 -0
- vastdb/bench/perf_bench/dataset/schemas.py +101 -0
- vastdb/bench/perf_bench/dataset/secmaster.py +33 -0
- vastdb/bench/perf_bench/orchestrate/__init__.py +0 -0
- vastdb/bench/perf_bench/orchestrate/bench_spec.py +91 -0
- vastdb/bench/perf_bench/orchestrate/results_helpers.py +126 -0
- vastdb/bench/perf_bench/orchestrate/scenario.py +109 -0
- vastdb/bench/perf_bench/orchestrate/scenario_generator.py +144 -0
- vastdb/bench/perf_bench/query/__init__.py +0 -0
- vastdb/bench/perf_bench/query/arrow_common.py +59 -0
- vastdb/bench/perf_bench/query/query.py +42 -0
- vastdb/bench/perf_bench/query/query_pyarrow.py +70 -0
- vastdb/bench/perf_bench/query/query_vastdb.py +78 -0
- vastdb/bench/perf_bench/run.py +79 -0
- vastdb/bench/test_sample.py +4 -2
- vastdb/conftest.py +1 -1
- vastdb/session.py +0 -6
- vastdb/table.py +35 -35
- vastdb/tests/test_nested.py +58 -0
- vastdb/tests/test_tables.py +13 -0
- vastdb/transaction.py +4 -8
- vastdb/util.py +5 -0
- {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/METADATA +3 -4
- {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/RECORD +39 -14
- {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/WHEEL +1 -1
- {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/LICENSE +0 -0
- {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/top_level.txt +0 -0
vastdb/bench/perf_bench/orchestrate/bench_spec.py (new file)
@@ -0,0 +1,91 @@
+import datetime as dt
+import os
+from typing import Dict, List, NamedTuple, Optional, Union
+
+import pandas as pd
+
+DEFAULT_CSV_SEPARATOR = ","
+
+
+class BenchResult(NamedTuple):
+    # The identifier for this result
+    key: str
+    round: int
+
+    # Host details and time
+    host: str
+    pid: int
+
+    # Results details
+    start_ts: pd.Timestamp
+    end_ts: pd.Timestamp
+    duration_sec: float
+    n_rows: int
+    n_cols: int
+    n_bytes: int
+
+    params: Optional[Dict[str, Union[str, int, float]]] = None
+
+    def to_dict(
+        self,
+        include_params: bool = True,
+        flatten_params: bool = True,
+    ) -> Dict[str, Union[str, int, float]]:
+        dc = {f: getattr(self, f) for f in self._fields if f != "params"}
+        if include_params:
+            params = self.params or {}
+            if flatten_params:
+                dc = {**dc, **{pk: pv for pk, pv in params.items() if pk not in dc}}
+            else:
+                dc["params"] = params
+        return dc
+
+    def fields(
+        self,
+        include_params: bool = True,
+        flatten_params: bool = True,
+    ) -> List[str]:
+        return list(
+            self.to_dict(include_params=include_params, flatten_params=flatten_params)
+        )
+
+    def csv_header(
+        self,
+        separator: str = DEFAULT_CSV_SEPARATOR,
+        include_params: bool = True,
+    ) -> str:
+        return separator.join(
+            self.to_dict(include_params=include_params, flatten_params=True)
+        )
+
+    def to_csv(
+        self,
+        separator: str = DEFAULT_CSV_SEPARATOR,
+        include_params: bool = True,
+        include_header: bool = False,
+    ) -> str:
+        def _to_str(v) -> str:
+            if isinstance(v, dt.date):
+                v = v.isoformat()
+            v_str = str(v) if not isinstance(v, str) else v
+            if separator in v_str:
+                if '"' in v_str:
+                    raise ValueError(f"Can't handle double quotes in value: {v_str}")
+                v_str = f'"{v_str}"'
+            return v_str
+
+        data = [
+            _to_str(d)
+            for d in self.to_dict(include_params=include_params, flatten_params=True)
+        ]
+        data_csv = separator.join(data)
+
+        if include_header:
+            data_csv = os.linesep.join(
+                [
+                    self.csv_header(separator=separator, include_params=include_params),
+                    data_csv,
+                ]
+            )
+
+        return data_csv
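A minimal sketch (not taken from the package docs) of how the new `BenchResult` record could be built and serialized, assuming the module path shown in the file list above; all field values here are illustrative:

```python
import pandas as pd

from vastdb.bench.perf_bench.orchestrate.bench_spec import BenchResult

start = pd.Timestamp.now(tz="UTC")
end = start + pd.Timedelta(seconds=1.5)

# Hypothetical benchmark measurement for a single round.
result = BenchResult(
    key="demo_query",
    round=1,
    host="bench-host",
    pid=12345,
    start_ts=start,
    end_ts=end,
    duration_sec=1.5,
    n_rows=1_000_000,
    n_cols=8,
    n_bytes=64_000_000,
    params={"query_backend": "pyarrow", "uni_sz": 16},
)

# Flattened dict view: the fixed fields plus the params merged in.
print(result.to_dict())
# CSV serialization with a header row prepended.
print(result.to_csv(include_header=True))
```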
vastdb/bench/perf_bench/orchestrate/results_helpers.py (new file)
@@ -0,0 +1,126 @@
+from pathlib import Path
+from typing import List, Optional, Union
+
+import numpy as np
+import pandas as pd
+
+from vastdb.bench.perf_bench.common.log_utils import get_logger
+from vastdb.bench.perf_bench.orchestrate.bench_spec import BenchResult
+
+LOG = get_logger(__name__)
+
+PARAM_NAMES_SEP = "|"
+
+
+def results_to_df(
+    results: List[BenchResult],
+) -> pd.DataFrame:
+    results_df = pd.DataFrame([r.to_dict() for r in results])
+    results_df["param_names"] = PARAM_NAMES_SEP.join(
+        [c for c in (results[0].params or ()) if c in results_df]
+    )
+    return results_df
+
+
+def save_results(
+    results: List[BenchResult],
+    results_path: Union[str, Path],
+    append: bool = False,
+):
+    if not results:
+        return
+
+    results_path = Path(results_path)
+    results_path.parent.mkdir(parents=True, exist_ok=True)
+    results_df = results_to_df(results)
+
+    if append and results_path.is_file():
+        prev_results = pd.read_csv(results_path).astype(results_df.dtypes)
+        results_df = pd.concat([prev_results, results_df], ignore_index=True)
+
+    results_df.to_csv(results_path, index=False)
+    LOG.info(f"Results written to {results_path}")
+
+
+def calc_total_time_coverage_seconds(results_df: pd.DataFrame) -> float:
+    # Step 1: Separate start and end times, tagging them differently
+    starts = results_df["start_ts"].rename("time").to_frame().assign(event=1)
+    ends = results_df["end_ts"].rename("time").to_frame().assign(event=-1)
+
+    # Combine and sort chronologically
+    timeline = pd.concat([starts, ends]).sort_values("time")
+
+    # Step 2 & 3: Calculate cumulative sum to identify active intervals
+    timeline["active_intervals"] = timeline["event"].cumsum()
+
+    # Identify transitions (changes in active_intervals)
+    timeline["change"] = timeline["active_intervals"].diff().fillna(1).astype(bool)
+
+    # Calculate durations between changes
+    timeline["duration"] = timeline["time"].diff().shift(-1).fillna(pd.Timedelta(0))
+
+    # Only consider periods where at least one interval is active
+    total_coverage: pd.Timedelta = timeline.query("active_intervals > 0")[
+        "duration"
+    ].sum()
+    return total_coverage.total_seconds()
+
+
+def calculate_aggregate_stats(
+    results: Optional[List[BenchResult]] = None,
+    results_df: Optional[pd.DataFrame] = None,
+    results_path: Union[None, str, Path] = None,
+) -> pd.DataFrame:
+    if sum(bool(x) for x in (results, results_df, results_path)) != 1:
+        raise ValueError(
+            "Exactly one of results, results_df, or results_path must be provided"
+        )
+
+    if results:
+        r_df = results_to_df(results)
+        group_flds = list(results[0].params or ())
+    else:
+        r_df: pd.DataFrame = (  # type: ignore[assignment,no-redef]
+            pd.read_csv(results_path) if results_path else results_df
+        )
+        p_names: pd.Series = results_df["param_names"]  # type: ignore[index]
+        group_flds = list(p_names.values[0].split(PARAM_NAMES_SEP))
+    group_flds = ["key", "n_cols", *group_flds]
+
+    if not group_flds:
+        raise ValueError("No group fields found")
+
+    def _get_agg(col: str) -> str:
+        mapper = {
+            "start_ts": "min",
+            "end_ts": "max",
+            "round": "max",
+            "duration_sec": "mean",
+        }
+        return mapper.get(
+            col, "sum" if np.issubdtype(r_df.dtypes[col], np.number) else "last"
+        )
+
+    non_group_flds = [c for c in r_df.columns if c not in group_flds]
+    agg_df = (
+        r_df.groupby(group_flds)
+        .aggregate(
+            {c: _get_agg(c) for c in non_group_flds},
+            total=("duration_sec", "count"),
+        )
+        .drop(columns=["pid", "host", "param_names", "round", "duration_sec"])
+        .sort_index()
+    )
+    agg_df["duration_sec"] = (
+        r_df.groupby(group_flds)
+        .apply(calc_total_time_coverage_seconds, include_groups=False)
+        .sort_index()
+    )
+    agg_df["M_rows_per_sec"] = (agg_df["n_rows"] / agg_df["duration_sec"] / 1e6).astype(
+        "float64"
+    )
+    agg_df["MB_per_sec"] = (
+        agg_df["n_bytes"] / 1024 ** 2 / agg_df["duration_sec"]
+    ).astype("float64")
+
+    return agg_df
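A minimal usage sketch under stated assumptions: `results` is a list of `BenchResult` records like the one built above, and the output path is hypothetical. Exactly one of `results`, `results_df`, or `results_path` may be passed to `calculate_aggregate_stats`:

```python
from vastdb.bench.perf_bench.orchestrate.results_helpers import (
    calculate_aggregate_stats,
    save_results,
)

# Append this batch of results to a CSV file (path is illustrative).
save_results(results, "/tmp/perf_bench/results.csv", append=True)

# Aggregate across rounds; throughput columns are derived from wall-clock coverage.
agg_df = calculate_aggregate_stats(results=results)
print(agg_df[["n_rows", "M_rows_per_sec", "MB_per_sec"]])
```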
vastdb/bench/perf_bench/orchestrate/scenario.py (new file)
@@ -0,0 +1,109 @@
+import concurrent
+import os
+import socket
+import time
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import pandas as pd
+
+from vastdb.bench.perf_bench.orchestrate.bench_spec import BenchResult
+from vastdb.bench.perf_bench.orchestrate.results_helpers import LOG
+
+
+@dataclass
+class BenchScenario:
+    key: str
+    func: Callable
+    kwargs: Dict[str, Any]
+    results: Optional[Tuple[BenchResult, ...]] = tuple()
+    default_result_params: Optional[Dict[str, Union[str, int, float]]] = None
+
+    # noinspection PyUnresolvedReferences
+    def run(
+        self,
+        n_runs: int = 1,
+        discard_first_run: bool = True,
+        parallelism: int = 1,
+        workers_init: Optional[Callable] = None,
+    ):
+        if parallelism == 1:
+            self.results = tuple(
+                self._do_run(n_runs=n_runs, discard_first_run=discard_first_run)
+            )
+        elif parallelism > 1:
+            with concurrent.futures.ProcessPoolExecutor(
+                max_workers=parallelism,
+                initializer=workers_init,
+            ) as executor:
+                futures = [
+                    executor.submit(
+                        self._do_run, n_runs=n_runs, discard_first_run=discard_first_run
+                    )
+                    for _ in range(parallelism)
+                ]
+
+                # Iterate over futures to get their results
+                results = []
+                for future in concurrent.futures.as_completed(futures):
+                    LOG.info("Future completed")
+                    results.extend(future.result())
+                self.results = tuple(results)
+        else:
+            raise ValueError("Can't use a negative number of parallel benches")
+
+    def _do_run(
+        self,
+        n_runs: int = 1,
+        discard_first_run: bool = True,
+    ) -> List[BenchResult]:
+        hostname = socket.gethostname()
+        pid = os.getpid()
+        results = []
+        kwargs = self.kwargs or {}
+
+        for i in range(n_runs + int(discard_first_run)):
+            key = self.key
+
+            start_ts = time.time_ns()
+            ret_v = self.func(**kwargs)
+            end_ts = time.time_ns()
+
+            if discard_first_run and i == 0:
+                LOG.info(f"Discarding first run ({key})")
+                continue
+
+            n_bytes, n_rows, n_cols = 0, 0, 0
+
+            if hasattr(ret_v, "shape"):
+                shape = ret_v.shape
+                if len(shape):
+                    n_rows = shape[0]
+                if len(shape) > 1:
+                    n_cols = shape[1]
+
+            if hasattr(ret_v, "nbytes"):
+                n_bytes = ret_v.nbytes
+            elif hasattr(ret_v, "memory_usage"):
+                n_bytes = ret_v.memory_usage().sum()
+            elif hasattr(ret_v, "estimated_size"):
+                n_bytes = ret_v.estimated_size()
+
+            result = BenchResult(
+                key=key,
+                round=i,
+                host=hostname,
+                pid=pid,
+                start_ts=pd.Timestamp(start_ts, unit="ns"),
+                end_ts=pd.Timestamp(end_ts, unit="ns"),
+                duration_sec=(end_ts - start_ts) / 1e9,
+                n_rows=n_rows,
+                n_cols=n_cols,
+                n_bytes=n_bytes,
+                params=self.default_result_params or {},
+            )
+            results.append(result)
+
+            LOG.info(f"{result.duration_sec} sec")
+
+        return results
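A minimal sketch of timing an arbitrary callable with `BenchScenario`. The NumPy function below is a stand-in; in the package the scenarios wrap the query backends. `parallelism=1` is used so the lambda does not need to be picklable:

```python
import numpy as np

from vastdb.bench.perf_bench.orchestrate.scenario import BenchScenario

scenario = BenchScenario(
    key="numpy_random",
    func=lambda n: np.random.default_rng(0).random((n, 4)),
    kwargs={"n": 1_000_000},
    default_result_params={"n": 1_000_000},
)

# The first run is treated as a warm-up and discarded; three measured rounds follow.
scenario.run(n_runs=3, discard_first_run=True, parallelism=1)
for r in scenario.results:
    print(r.key, r.round, f"{r.duration_sec:.3f}s", r.n_rows, r.n_bytes)
```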
vastdb/bench/perf_bench/orchestrate/scenario_generator.py (new file)
@@ -0,0 +1,144 @@
+import datetime as dt
+from itertools import product
+from typing import Dict, List, Optional, Sequence, Union
+
+import pandas as pd
+
+from vastdb.bench.perf_bench.common.constants import (
+    DEFAULT_ARROW_KWARGS,
+    DEFAULT_ROW_GROUP_SIZE,
+    DEFAULT_START_T,
+    DFAULT_PARQUET_COMPRESSION,
+    ParquetCompression,
+    VastConnDetails,
+)
+from vastdb.bench.perf_bench.common.types import PathLikeT
+from vastdb.bench.perf_bench.common.utils import get_parquet_dataset_root, infer_fs_type
+from vastdb.bench.perf_bench.dataset import secmaster as sm
+from vastdb.bench.perf_bench.dataset.schemas import DEFAULT_BARS_COLUMNS
+from vastdb.bench.perf_bench.orchestrate.scenario import BenchScenario
+from vastdb.bench.perf_bench.query.query import QueryBackend
+
+
+def generate_perf_bench_scenarios(
+    base_key: str,
+    conn_details: VastConnDetails,
+    # Common parameters
+    query_backends: Sequence[Union[str, QueryBackend]] = (QueryBackend.pyarrow,),
+    universe_choices: Sequence[Union[str, Sequence[str]]] = ("Single",),
+    columns_choices: Sequence[Sequence[str]] = (DEFAULT_BARS_COLUMNS,),
+    dt_start_t: Union[dt.date, str] = DEFAULT_START_T,
+    num_bdays: Sequence[int] = (1,),
+    # Arrow-specific options
+    fs_path_choices: Optional[Sequence[PathLikeT]] = None,
+    rowgroup_size_choices: Optional[Sequence[int]] = None,
+    compression_choices: Optional[Sequence[Union[str, ParquetCompression]]] = None,
+    arrow_batching_spec_choices: Optional[Sequence[Dict[str, int]]] = None,
+    # VastDB-specific options
+    vdb_num_sub_splits_choices: Optional[Sequence[int]] = None,
+    vdb_num_row_groups_per_sub_split_choices: Optional[Sequence[int]] = None,
+) -> List[BenchScenario]:
+    if not base_key:
+        raise ValueError("base_key must be provided")
+
+    uni_choices: List[List[str]] = []
+    for k in universe_choices:
+        if isinstance(k, str):
+            uni_choices.append(sm.UNI_SPEC.get(k, [k]))
+        else:
+            uni_choices.append(list(k))
+
+    columns_choices = [[c for c in cs if c] for cs in columns_choices if cs]
+    dt_start_t = pd.Timestamp(dt_start_t)
+    dt_range_choices = [
+        (dt_start_t, dt_start_t + pd.tseries.offsets.BDay(d))
+        for d in sorted(set(num_bdays))
+        if d > 0
+    ]
+
+    scenarios = []
+    for i, (  # type: ignore[misc]
+        uni,
+        columns,
+        (from_t, to_t),
+        path,
+        rowgroup_size,
+        compression,
+        arrow_batch_spec,
+        vdb_num_sub_splits,
+        vdb_num_row_groups_per_sub_split,
+        query_backend,
+    ) in enumerate(
+        product(
+            uni_choices,
+            columns_choices,
+            dt_range_choices,
+            # Arrow specific options
+            (*(fs_path_choices or []), None),
+            (*(rowgroup_size_choices or [DEFAULT_ROW_GROUP_SIZE]), None),
+            (*(compression_choices or [DFAULT_PARQUET_COMPRESSION]), None),
+            (*(arrow_batching_spec_choices or [DEFAULT_ARROW_KWARGS]), None),
+            # VastDB specific options
+            (*(vdb_num_sub_splits_choices or [1]), None),
+            (*(vdb_num_row_groups_per_sub_split_choices or [8]), None),
+            [QueryBackend[qb] for qb in query_backends],
+        )
+    ):
+        if query_backend is QueryBackend.vastdb:
+            if not all((vdb_num_sub_splits, vdb_num_row_groups_per_sub_split)) or any(
+                (path, rowgroup_size, compression, arrow_batch_spec)
+            ):
+                # ignore path and rg size for vastdb runs
+                continue
+            backend_kwargs = {
+                "num_sub_splits": vdb_num_sub_splits,
+                "num_row_groups_per_sub_split": vdb_num_row_groups_per_sub_split,
+                "conn_details": conn_details,
+            }
+            path = None
+        elif query_backend is QueryBackend.pyarrow:
+            if not all((path, rowgroup_size, compression, arrow_batch_spec)) or any(
+                (vdb_num_sub_splits, vdb_num_row_groups_per_sub_split)
+            ):
+                # ignore non-vastdb runs without a path or rg size
+                continue
+            backend_kwargs: Dict[str, Union[str, int, VastConnDetails]] = arrow_batch_spec  # type: ignore[no-redef]
+            backend_kwargs["conn_details"] = conn_details
+            path = get_parquet_dataset_root(path, rowgroup_size, compression)  # type: ignore[arg-type]
+        else:
+            raise NotImplementedError(f"Unsupported query backend: {query_backend=!r}")
+
+        scen = BenchScenario(
+            key=f"{base_key}_{query_backend.name}",
+            func=query_backend.run_query,
+            kwargs=dict(
+                universe=uni,
+                columns=columns,
+                from_t=from_t,
+                to_t=to_t,
+                path=path,
+                backend_kwargs=backend_kwargs,
+            ),
+            default_result_params={
+                "uni_sz": len(uni),  # type: ignore
+                "num_days": (to_t - from_t).days,  # type: ignore
+                "query_backend": query_backend.name,  # type: ignore
+                "fs_type": infer_fs_type(path or ""),  # type: ignore
+                "rowgroup_size": rowgroup_size or -1,  # type: ignore
+                "arrow_fragment_readahead": (  # type: ignore
+                    arrow_batch_spec["fragment_readahead"] if arrow_batch_spec else -1  # type: ignore
+                ),
+                "arrow_batch_readahead": (  # type: ignore
+                    arrow_batch_spec["batch_readahead"] if arrow_batch_spec else -1  # type: ignore
+                ),
+                "arrow_batch_size": (  # type: ignore
+                    arrow_batch_spec["batch_size"] if arrow_batch_spec else -1  # type: ignore
+                ),
+                "vdb_num_sub_splits": vdb_num_sub_splits or -1,  # type: ignore
+                "vdb_num_row_groups_per_sub_split": vdb_num_row_groups_per_sub_split or -1,  # type: ignore
+            },
+        )
+
+        scenarios.append(scen)
+
+    return scenarios
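A minimal sketch of generating the scenario grid for both backends. Assumptions: `VastConnDetails()` has usable defaults (as in `query_pyarrow` below), and the S3 bucket path is hypothetical; all other values fall back to the defaults shown in the signature above:

```python
from vastdb.bench.perf_bench.common.constants import VastConnDetails
from vastdb.bench.perf_bench.orchestrate.scenario_generator import (
    generate_perf_bench_scenarios,
)

scenarios = generate_perf_bench_scenarios(
    base_key="bars_read",
    conn_details=VastConnDetails(),
    query_backends=("pyarrow", "vastdb"),
    num_bdays=(1, 5),
    fs_path_choices=("s3://bench-bucket/bars",),  # hypothetical dataset root
    vdb_num_sub_splits_choices=(1, 4),
)

# Each scenario wraps one backend/parameter combination from the cartesian product.
for s in scenarios:
    print(s.key, s.default_result_params)
```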
vastdb/bench/perf_bench/query/arrow_common.py (new file)
@@ -0,0 +1,59 @@
+# ruff: noqa: F841
+
+import datetime as dt
+from functools import reduce
+from typing import Optional, Sequence
+
+import pyarrow as pa
+from pyarrow import dataset as ds
+
+from vastdb.bench.perf_bench.common import utils as bu
+from vastdb.bench.perf_bench.dataset import secmaster as sm
+from vastdb.bench.perf_bench.dataset.schemas import BF
+
+
+def build_arrow_filter(
+    universe: Sequence[str],
+    from_t: Optional[dt.datetime] = None,
+    to_t: Optional[dt.datetime] = None,
+    use_sid: bool = True,
+) -> ds.Expression:
+    # Build the filters
+    y_m_filter = from_t_filter = to_t_filter = None
+    from_t, to_t = bu.to_ts(from_t, normalize=False), bu.to_ts(to_t, normalize=False)
+    if from_t or to_t:
+        all_dates = bu.get_dates_range(from_t, to_t, inclusive="left")
+        years = sorted({d.year for d in all_dates})
+
+        y_m_filter = ds.field("year").isin(years) & ds.field("date").isin(
+            tuple(d.strftime("%Y%m%d") for d in all_dates)
+        )
+
+        if from_t and from_t.time() != dt.time.min:
+            from_t_filter = ds.field("ts") >= pa.scalar(from_t)
+        if to_t and to_t.time() != dt.time.min:
+            to_t_filter = ds.field("ts") < pa.scalar(to_t)
+
+    if use_sid:
+        universe = [sm.to_sid(s) for s in universe]  # type: ignore[misc]
+        fld = BF.sid.value
+    else:
+        fld = BF.ticker.value
+    universe = sorted(universe)
+    if len(universe) <= 16:
+        tickers_filter = reduce(
+            lambda x, y: x | y, [ds.field(fld) == s for s in universe]
+        )
+    else:
+        tickers_filter = ds.field(fld).isin(universe)
+
+    ds_filter = reduce(
+        lambda x, y: x & y,
+        [
+            flt
+            for flt in (y_m_filter, tickers_filter, from_t_filter, to_t_filter)
+            if flt is not None
+        ],
+    )
+
+    return ds_filter
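A minimal sketch of composing the dataset filter on its own. The tickers and date range are illustrative; `use_sid=False` is chosen here so the predicate is built against the raw ticker column rather than resolving symbols through the bundled secmaster:

```python
import datetime as dt

from vastdb.bench.perf_bench.query.arrow_common import build_arrow_filter

flt = build_arrow_filter(
    universe=["AAPL", "MSFT"],
    from_t=dt.datetime(2024, 1, 2),
    to_t=dt.datetime(2024, 1, 3),
    use_sid=False,
)
# The result is a pyarrow.dataset.Expression combining the year/date partition
# filter with the per-ticker predicate; it can be passed as Dataset.to_table(filter=...).
print(flt)
```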
vastdb/bench/perf_bench/query/query.py (new file)
@@ -0,0 +1,42 @@
+import datetime as dt
+import logging
+from typing import Any, Dict, Optional, Protocol, Sequence
+
+import pyarrow as pa
+
+from vastdb.bench.perf_bench.common.types import StrEnum
+
+
+class QueryRunner(Protocol):
+    def __call__(
+        self,
+        universe: Sequence[str],
+        columns: Optional[Sequence[str]],
+        from_t: Optional[dt.datetime],
+        to_t: Optional[dt.datetime],
+        path: Optional[str],
+        use_sid: bool,
+        logger: Optional[logging.Logger],
+        backend_kwargs: Optional[Dict[str, Any]],
+    ) -> pa.Table:
+        ...
+
+
+class QueryBackend(StrEnum):
+    vastdb = "vastdb"
+    pyarrow = "pyarrow"
+
+    @property
+    def _query_fun(self) -> QueryRunner:
+        if self is QueryBackend.vastdb:
+            from vastdb.bench.perf_bench.query.query_vastdb import query_vastdb
+
+            return query_vastdb
+        elif self is QueryBackend.pyarrow:
+            from vastdb.bench.perf_bench.query.query_pyarrow import query_pyarrow
+
+            return query_pyarrow
+        raise NotImplementedError(f"Unsupported query backend: {self}")
+
+    def run_query(self, *args, **kwargs) -> pa.Table:
+        return self._query_fun(*args, **kwargs)
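A minimal sketch of the dispatch: the enum member resolves to a backend-specific query function and forwards the call through `run_query`. The path and date range are hypothetical and would need reachable storage to actually return data:

```python
import datetime as dt

from vastdb.bench.perf_bench.query.query import QueryBackend

backend = QueryBackend["pyarrow"]  # lookup by name, e.g. from a CLI argument
table = backend.run_query(
    universe=["AAPL"],
    columns=None,
    from_t=dt.datetime(2024, 1, 2),
    to_t=dt.datetime(2024, 1, 3),
    path="s3://bench-bucket/bars",  # hypothetical dataset root
    use_sid=False,
    backend_kwargs=None,
)
print(table.num_rows)
```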
vastdb/bench/perf_bench/query/query_pyarrow.py (new file)
@@ -0,0 +1,70 @@
+# ruff: noqa: F841
+
+import datetime as dt
+import logging
+from typing import Any, Dict, Optional, Sequence
+
+import pyarrow as pa
+
+import vastdb.bench.perf_bench.common.log_utils
+from vastdb.bench.perf_bench.common import utils as bu
+from vastdb.bench.perf_bench.common.constants import (
+    DEFAULT_ARROW_KWARGS,
+    VastConnDetails,
+)
+from vastdb.bench.perf_bench.query.arrow_common import build_arrow_filter
+
+LOG = vastdb.bench.perf_bench.common.log_utils.get_logger(__name__)
+
+
+# noinspection PyUnusedLocal
+def query_pyarrow(
+    universe: Sequence[str],
+    columns: Optional[Sequence[str]] = None,
+    from_t: Optional[dt.datetime] = None,
+    to_t: Optional[dt.datetime] = None,
+    path: Optional[str] = None,
+    use_sid: bool = True,
+    logger: Optional[logging.Logger] = None,
+    backend_kwargs: Optional[Dict[str, Any]] = None,
+) -> pa.Table:
+    # ------------------------------------------------------------
+    # Query the PyArrow's parquet dataset
+    # ------------------------------------------------------------
+    backend_kwargs = backend_kwargs or {}
+    conn_details: Optional[VastConnDetails] = backend_kwargs.pop("conn_details", None)
+    conn_details = conn_details or VastConnDetails()
+    fs_kwargs = {
+        "botocore_client_kwargs": {
+            "aws_access_key_id": conn_details.access,
+            "aws_secret_access_key": conn_details.secret,
+            "host": conn_details.s3_host,
+        }
+    }
+
+    # Note that one can optimize the below by passing the filesystem object and/or partitioning
+    # scheme to avoid the slow penalty of discovering the partitioning upon init.
+    dset = bu.get_parquet_dataset(path=path, filesystem=None, fs_kwargs=fs_kwargs)  # type: ignore[arg-type]
+
+    ds_filter = build_arrow_filter(
+        universe=universe,
+        from_t=from_t,
+        to_t=to_t,
+        use_sid=use_sid,
+    )
+
+    # Cleanup the kwargs to be passed to the arrow scanner
+    kwargs = backend_kwargs or DEFAULT_ARROW_KWARGS
+    kwargs.pop("arrow_kwargs", None)
+    if flt := kwargs.pop("filter", None):
+        ds_filter = ds_filter & flt
+
+    # Read the data
+    # noinspection PyArgumentList
+    table = dset.to_table(
+        columns=columns,
+        filter=ds_filter,
+        **{**DEFAULT_ARROW_KWARGS, **(kwargs or {})},
+    )
+
+    return table
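A minimal sketch of calling the pyarrow backend directly and overriding the Arrow scanner knobs through `backend_kwargs`. The bucket path and values are hypothetical; per the code above, `conn_details` is popped out and the remaining keys are forwarded to `Dataset.to_table()` on top of `DEFAULT_ARROW_KWARGS`:

```python
import datetime as dt

from vastdb.bench.perf_bench.common.constants import VastConnDetails
from vastdb.bench.perf_bench.query.query_pyarrow import query_pyarrow

table = query_pyarrow(
    universe=["AAPL"],
    from_t=dt.datetime(2024, 1, 2),
    to_t=dt.datetime(2024, 1, 3),
    path="s3://bench-bucket/bars",  # hypothetical dataset root
    use_sid=False,
    backend_kwargs={
        "conn_details": VastConnDetails(),
        # Standard pyarrow scanner options, tuned per scenario in the benchmark.
        "batch_size": 128 * 1024,
        "fragment_readahead": 4,
        "batch_readahead": 16,
    },
)
print(table.num_rows, table.nbytes)
```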