vastdb 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (39)
  1. vastdb/_internal.py +41 -10
  2. vastdb/bench/perf_bench/__init__.py +0 -0
  3. vastdb/bench/perf_bench/bench_repo/__init__.py +0 -0
  4. vastdb/bench/perf_bench/bench_repo/mega_combo.py +87 -0
  5. vastdb/bench/perf_bench/cli.py +225 -0
  6. vastdb/bench/perf_bench/common/__init__.py +0 -0
  7. vastdb/bench/perf_bench/common/constants.py +96 -0
  8. vastdb/bench/perf_bench/common/log_utils.py +67 -0
  9. vastdb/bench/perf_bench/common/types.py +34 -0
  10. vastdb/bench/perf_bench/common/utils.py +219 -0
  11. vastdb/bench/perf_bench/dataset/__init__.py +0 -0
  12. vastdb/bench/perf_bench/dataset/generate_secmaster.py +105 -0
  13. vastdb/bench/perf_bench/dataset/generate_stocks_dataset.py +242 -0
  14. vastdb/bench/perf_bench/dataset/schemas.py +101 -0
  15. vastdb/bench/perf_bench/dataset/secmaster.py +33 -0
  16. vastdb/bench/perf_bench/orchestrate/__init__.py +0 -0
  17. vastdb/bench/perf_bench/orchestrate/bench_spec.py +91 -0
  18. vastdb/bench/perf_bench/orchestrate/results_helpers.py +126 -0
  19. vastdb/bench/perf_bench/orchestrate/scenario.py +109 -0
  20. vastdb/bench/perf_bench/orchestrate/scenario_generator.py +144 -0
  21. vastdb/bench/perf_bench/query/__init__.py +0 -0
  22. vastdb/bench/perf_bench/query/arrow_common.py +59 -0
  23. vastdb/bench/perf_bench/query/query.py +42 -0
  24. vastdb/bench/perf_bench/query/query_pyarrow.py +70 -0
  25. vastdb/bench/perf_bench/query/query_vastdb.py +78 -0
  26. vastdb/bench/perf_bench/run.py +79 -0
  27. vastdb/bench/test_sample.py +4 -2
  28. vastdb/conftest.py +1 -1
  29. vastdb/session.py +0 -6
  30. vastdb/table.py +35 -35
  31. vastdb/tests/test_nested.py +58 -0
  32. vastdb/tests/test_tables.py +13 -0
  33. vastdb/transaction.py +4 -8
  34. vastdb/util.py +5 -0
  35. {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/METADATA +3 -4
  36. {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/RECORD +39 -14
  37. {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/WHEEL +1 -1
  38. {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/LICENSE +0 -0
  39. {vastdb-1.0.0.dist-info → vastdb-1.1.1.dist-info}/top_level.txt +0 -0
vastdb/bench/perf_bench/orchestrate/bench_spec.py
@@ -0,0 +1,91 @@
+ import datetime as dt
+ import os
+ from typing import Dict, List, NamedTuple, Optional, Union
+
+ import pandas as pd
+
+ DEFAULT_CSV_SEPARATOR = ","
+
+
+ class BenchResult(NamedTuple):
+     # The identifier for this result
+     key: str
+     round: int
+
+     # Host details and time
+     host: str
+     pid: int
+
+     # Results details
+     start_ts: pd.Timestamp
+     end_ts: pd.Timestamp
+     duration_sec: float
+     n_rows: int
+     n_cols: int
+     n_bytes: int
+
+     params: Optional[Dict[str, Union[str, int, float]]] = None
+
+     def to_dict(
+         self,
+         include_params: bool = True,
+         flatten_params: bool = True,
+     ) -> Dict[str, Union[str, int, float]]:
+         dc = {f: getattr(self, f) for f in self._fields if f != "params"}
+         if include_params:
+             params = self.params or {}
+             if flatten_params:
+                 dc = {**dc, **{pk: pv for pk, pv in params.items() if pk not in dc}}
+             else:
+                 dc["params"] = params
+         return dc
+
+     def fields(
+         self,
+         include_params: bool = True,
+         flatten_params: bool = True,
+     ) -> List[str]:
+         return list(
+             self.to_dict(include_params=include_params, flatten_params=flatten_params)
+         )
+
+     def csv_header(
+         self,
+         separator: str = DEFAULT_CSV_SEPARATOR,
+         include_params: bool = True,
+     ) -> str:
+         return separator.join(
+             self.to_dict(include_params=include_params, flatten_params=True)
+         )
+
+     def to_csv(
+         self,
+         separator: str = DEFAULT_CSV_SEPARATOR,
+         include_params: bool = True,
+         include_header: bool = False,
+     ) -> str:
+         def _to_str(v) -> str:
+             if isinstance(v, dt.date):
+                 v = v.isoformat()
+             v_str = str(v) if not isinstance(v, str) else v
+             if separator in v_str:
+                 if '"' in v_str:
+                     raise ValueError(f"Can't handle double quotes in value: {v_str}")
+                 v_str = f'"{v_str}"'
+             return v_str
+
+         # Iterate over the values (not the keys) so the row carries the data
+         data = [
+             _to_str(d)
+             for d in self.to_dict(
+                 include_params=include_params, flatten_params=True
+             ).values()
+         ]
+         data_csv = separator.join(data)
+
+         if include_header:
+             data_csv = os.linesep.join(
+                 [
+                     self.csv_header(separator=separator, include_params=include_params),
+                     data_csv,
+                 ]
+             )
+
+         return data_csv
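For orientation, a minimal usage sketch of the new BenchResult record. The values below are made up; only the BenchResult API comes from this diff:

import pandas as pd

from vastdb.bench.perf_bench.orchestrate.bench_spec import BenchResult

# Hypothetical result values for illustration only.
result = BenchResult(
    key="scan_bars",
    round=1,
    host="bench-host-1",
    pid=4242,
    start_ts=pd.Timestamp("2024-01-02 09:30:00"),
    end_ts=pd.Timestamp("2024-01-02 09:30:02.5"),
    duration_sec=2.5,
    n_rows=1_000_000,
    n_cols=8,
    n_bytes=64_000_000,
    params={"query_backend": "pyarrow", "uni_sz": 1},
)
# Header row plus one data row; params are flattened into extra columns.
print(result.to_csv(include_header=True))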
vastdb/bench/perf_bench/orchestrate/results_helpers.py
@@ -0,0 +1,126 @@
+ from pathlib import Path
+ from typing import List, Optional, Union
+
+ import numpy as np
+ import pandas as pd
+
+ from vastdb.bench.perf_bench.common.log_utils import get_logger
+ from vastdb.bench.perf_bench.orchestrate.bench_spec import BenchResult
+
+ LOG = get_logger(__name__)
+
+ PARAM_NAMES_SEP = "|"
+
+
+ def results_to_df(
+     results: List[BenchResult],
+ ) -> pd.DataFrame:
+     results_df = pd.DataFrame([r.to_dict() for r in results])
+     results_df["param_names"] = PARAM_NAMES_SEP.join(
+         [c for c in (results[0].params or ()) if c in results_df]
+     )
+     return results_df
+
+
+ def save_results(
+     results: List[BenchResult],
+     results_path: Union[str, Path],
+     append: bool = False,
+ ):
+     if not results:
+         return
+
+     results_path = Path(results_path)
+     results_path.parent.mkdir(parents=True, exist_ok=True)
+     results_df = results_to_df(results)
+
+     if append and results_path.is_file():
+         prev_results = pd.read_csv(results_path).astype(results_df.dtypes)
+         results_df = pd.concat([prev_results, results_df], ignore_index=True)
+
+     results_df.to_csv(results_path, index=False)
+     LOG.info(f"Results written to {results_path}")
+
+
+ def calc_total_time_coverage_seconds(results_df: pd.DataFrame) -> float:
+     # Step 1: Separate start and end times, tagging starts +1 and ends -1
+     starts = results_df["start_ts"].rename("time").to_frame().assign(event=1)
+     ends = results_df["end_ts"].rename("time").to_frame().assign(event=-1)
+
+     # Combine and sort chronologically
+     timeline = pd.concat([starts, ends]).sort_values("time")
+
+     # Steps 2 & 3: the cumulative sum of events counts the active intervals
+     timeline["active_intervals"] = timeline["event"].cumsum()
+
+     # Identify transitions (changes in active_intervals)
+     timeline["change"] = timeline["active_intervals"].diff().fillna(1).astype(bool)
+
+     # Calculate durations between consecutive events
+     timeline["duration"] = timeline["time"].diff().shift(-1).fillna(pd.Timedelta(0))
+
+     # Only count periods where at least one interval is active
+     total_coverage: pd.Timedelta = timeline.query("active_intervals > 0")[
+         "duration"
+     ].sum()
+     return total_coverage.total_seconds()
+
+
+ def calculate_aggregate_stats(
+     results: Optional[List[BenchResult]] = None,
+     results_df: Optional[pd.DataFrame] = None,
+     results_path: Union[None, str, Path] = None,
+ ) -> pd.DataFrame:
+     # `is not None` rather than truthiness: bool() on a DataFrame raises
+     if sum(x is not None for x in (results, results_df, results_path)) != 1:
+         raise ValueError(
+             "Exactly one of results, results_df, or results_path must be provided"
+         )
+
+     if results:
+         r_df = results_to_df(results)
+         group_flds = list(results[0].params or ())
+     else:
+         r_df: pd.DataFrame = (  # type: ignore[assignment,no-redef]
+             pd.read_csv(results_path) if results_path else results_df
+         )
+         p_names: pd.Series = r_df["param_names"]  # type: ignore[index]
+         group_flds = list(p_names.values[0].split(PARAM_NAMES_SEP))
+     group_flds = ["key", "n_cols", *group_flds]
+
+     if not group_flds:
+         raise ValueError("No group fields found")
+
+     def _get_agg(col: str) -> str:
+         mapper = {
+             "start_ts": "min",
+             "end_ts": "max",
+             "round": "max",
+             "duration_sec": "mean",
+         }
+         return mapper.get(
+             col, "sum" if np.issubdtype(r_df.dtypes[col], np.number) else "last"
+         )
+
+     non_group_flds = [c for c in r_df.columns if c not in group_flds]
+     agg_df = (
+         r_df.groupby(group_flds)
+         .aggregate(
+             {c: _get_agg(c) for c in non_group_flds},
+             total=("duration_sec", "count"),
+         )
+         .drop(columns=["pid", "host", "param_names", "round", "duration_sec"])
+         .sort_index()
+     )
+     agg_df["duration_sec"] = (
+         r_df.groupby(group_flds)
+         .apply(calc_total_time_coverage_seconds, include_groups=False)
+         .sort_index()
+     )
+     agg_df["M_rows_per_sec"] = (agg_df["n_rows"] / agg_df["duration_sec"] / 1e6).astype(
+         "float64"
+     )
+     agg_df["MB_per_sec"] = (
+         agg_df["n_bytes"] / 1024 ** 2 / agg_df["duration_sec"]
+     ).astype("float64")
+
+     return agg_df
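The calc_total_time_coverage_seconds helper is a classic sweep-line union of intervals: +1 events at starts, -1 at ends, and only stretches where the running sum is positive count toward the total. A small self-check with made-up timestamps:

import pandas as pd

from vastdb.bench.perf_bench.orchestrate.results_helpers import (
    calc_total_time_coverage_seconds,
)

# Two runs overlapping by one second: the union is 3s, not the 2s + 2s sum.
df = pd.DataFrame(
    {
        "start_ts": pd.to_datetime(["2024-01-01 00:00:00", "2024-01-01 00:00:01"]),
        "end_ts": pd.to_datetime(["2024-01-01 00:00:02", "2024-01-01 00:00:03"]),
    }
)
assert calc_total_time_coverage_seconds(df) == 3.0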
vastdb/bench/perf_bench/orchestrate/scenario.py
@@ -0,0 +1,109 @@
+ import concurrent.futures
+ import os
+ import socket
+ import time
+ from dataclasses import dataclass
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+ import pandas as pd
+
+ from vastdb.bench.perf_bench.orchestrate.bench_spec import BenchResult
+ from vastdb.bench.perf_bench.orchestrate.results_helpers import LOG
+
+
+ @dataclass
+ class BenchScenario:
+     key: str
+     func: Callable
+     kwargs: Dict[str, Any]
+     results: Optional[Tuple[BenchResult, ...]] = tuple()
+     default_result_params: Optional[Dict[str, Union[str, int, float]]] = None
+
+     def run(
+         self,
+         n_runs: int = 1,
+         discard_first_run: bool = True,
+         parallelism: int = 1,
+         workers_init: Optional[Callable] = None,
+     ):
+         if parallelism == 1:
+             self.results = tuple(
+                 self._do_run(n_runs=n_runs, discard_first_run=discard_first_run)
+             )
+         elif parallelism > 1:
+             with concurrent.futures.ProcessPoolExecutor(
+                 max_workers=parallelism,
+                 initializer=workers_init,
+             ) as executor:
+                 futures = [
+                     executor.submit(
+                         self._do_run, n_runs=n_runs, discard_first_run=discard_first_run
+                     )
+                     for _ in range(parallelism)
+                 ]
+
+                 # Iterate over futures to collect their results as they complete
+                 results = []
+                 for future in concurrent.futures.as_completed(futures):
+                     LOG.info("Future completed")
+                     results.extend(future.result())
+                 self.results = tuple(results)
+         else:
+             raise ValueError("Can't use a non-positive number of parallel benches")
+
+     def _do_run(
+         self,
+         n_runs: int = 1,
+         discard_first_run: bool = True,
+     ) -> List[BenchResult]:
+         hostname = socket.gethostname()
+         pid = os.getpid()
+         results = []
+         kwargs = self.kwargs or {}
+
+         for i in range(n_runs + int(discard_first_run)):
+             key = self.key
+
+             start_ts = time.time_ns()
+             ret_v = self.func(**kwargs)
+             end_ts = time.time_ns()
+
+             if discard_first_run and i == 0:
+                 LOG.info(f"Discarding first run ({key})")
+                 continue
+
+             n_bytes, n_rows, n_cols = 0, 0, 0
+
+             if hasattr(ret_v, "shape"):
+                 shape = ret_v.shape
+                 if len(shape):
+                     n_rows = shape[0]
+                 if len(shape) > 1:
+                     n_cols = shape[1]
+
+             if hasattr(ret_v, "nbytes"):
+                 n_bytes = ret_v.nbytes
+             elif hasattr(ret_v, "memory_usage"):
+                 n_bytes = ret_v.memory_usage().sum()
+             elif hasattr(ret_v, "estimated_size"):
+                 n_bytes = ret_v.estimated_size()
+
+             result = BenchResult(
+                 key=key,
+                 round=i,
+                 host=hostname,
+                 pid=pid,
+                 start_ts=pd.Timestamp(start_ts, unit="ns"),
+                 end_ts=pd.Timestamp(end_ts, unit="ns"),
+                 duration_sec=(end_ts - start_ts) / 1e9,
+                 n_rows=n_rows,
+                 n_cols=n_cols,
+                 n_bytes=n_bytes,
+                 params=self.default_result_params or {},
+             )
+             results.append(result)
+
+             LOG.info(f"{result.duration_sec} sec")
+
+         return results
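A hedged sketch of driving the new BenchScenario against a trivial function; the function and parameters below are illustrative, only the BenchScenario API is from this diff:

import pyarrow as pa

from vastdb.bench.perf_bench.orchestrate.scenario import BenchScenario

def make_table(n: int) -> pa.Table:
    # Any callable whose return value exposes .shape/.nbytes gets sized automatically
    return pa.table({"x": list(range(n))})

scenario = BenchScenario(
    key="toy",
    func=make_table,
    kwargs={"n": 1_000},
    default_result_params={"n": 1_000},
)
scenario.run(n_runs=3, discard_first_run=True)  # 4 calls; the first timing is dropped
for r in scenario.results:
    print(r.key, r.round, f"{r.duration_sec:.6f} sec", r.n_rows)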
vastdb/bench/perf_bench/orchestrate/scenario_generator.py
@@ -0,0 +1,144 @@
+ import datetime as dt
+ from itertools import product
+ from typing import Dict, List, Optional, Sequence, Union
+
+ import pandas as pd
+
+ from vastdb.bench.perf_bench.common.constants import (
+     DEFAULT_ARROW_KWARGS,
+     DEFAULT_ROW_GROUP_SIZE,
+     DEFAULT_START_T,
+     DFAULT_PARQUET_COMPRESSION,
+     ParquetCompression,
+     VastConnDetails,
+ )
+ from vastdb.bench.perf_bench.common.types import PathLikeT
+ from vastdb.bench.perf_bench.common.utils import get_parquet_dataset_root, infer_fs_type
+ from vastdb.bench.perf_bench.dataset import secmaster as sm
+ from vastdb.bench.perf_bench.dataset.schemas import DEFAULT_BARS_COLUMNS
+ from vastdb.bench.perf_bench.orchestrate.scenario import BenchScenario
+ from vastdb.bench.perf_bench.query.query import QueryBackend
+
+
+ def generate_perf_bench_scenarios(
+     base_key: str,
+     conn_details: VastConnDetails,
+     # Common parameters
+     query_backends: Sequence[Union[str, QueryBackend]] = (QueryBackend.pyarrow,),
+     universe_choices: Sequence[Union[str, Sequence[str]]] = ("Single",),
+     columns_choices: Sequence[Sequence[str]] = (DEFAULT_BARS_COLUMNS,),
+     dt_start_t: Union[dt.date, str] = DEFAULT_START_T,
+     num_bdays: Sequence[int] = (1,),
+     # Arrow-specific options
+     fs_path_choices: Optional[Sequence[PathLikeT]] = None,
+     rowgroup_size_choices: Optional[Sequence[int]] = None,
+     compression_choices: Optional[Sequence[Union[str, ParquetCompression]]] = None,
+     arrow_batching_spec_choices: Optional[Sequence[Dict[str, int]]] = None,
+     # VastDB-specific options
+     vdb_num_sub_splits_choices: Optional[Sequence[int]] = None,
+     vdb_num_row_groups_per_sub_split_choices: Optional[Sequence[int]] = None,
+ ) -> List[BenchScenario]:
+     if not base_key:
+         raise ValueError("base_key must be provided")
+
+     uni_choices: List[List[str]] = []
+     for k in universe_choices:
+         if isinstance(k, str):
+             uni_choices.append(sm.UNI_SPEC.get(k, [k]))
+         else:
+             uni_choices.append(list(k))
+
+     columns_choices = [[c for c in cs if c] for cs in columns_choices if cs]
+     dt_start_t = pd.Timestamp(dt_start_t)
+     dt_range_choices = [
+         (dt_start_t, dt_start_t + pd.tseries.offsets.BDay(d))
+         for d in sorted(set(num_bdays))
+         if d > 0
+     ]
+
+     scenarios = []
+     for i, (  # type: ignore[misc]
+         uni,
+         columns,
+         (from_t, to_t),
+         path,
+         rowgroup_size,
+         compression,
+         arrow_batch_spec,
+         vdb_num_sub_splits,
+         vdb_num_row_groups_per_sub_split,
+         query_backend,
+     ) in enumerate(
+         product(
+             uni_choices,
+             columns_choices,
+             dt_range_choices,
+             # Arrow-specific options
+             (*(fs_path_choices or []), None),
+             (*(rowgroup_size_choices or [DEFAULT_ROW_GROUP_SIZE]), None),
+             (*(compression_choices or [DFAULT_PARQUET_COMPRESSION]), None),
+             (*(arrow_batching_spec_choices or [DEFAULT_ARROW_KWARGS]), None),
+             # VastDB-specific options
+             (*(vdb_num_sub_splits_choices or [1]), None),
+             (*(vdb_num_row_groups_per_sub_split_choices or [8]), None),
+             [QueryBackend[qb] for qb in query_backends],
+         )
+     ):
+         if query_backend is QueryBackend.vastdb:
+             if not all((vdb_num_sub_splits, vdb_num_row_groups_per_sub_split)) or any(
+                 (path, rowgroup_size, compression, arrow_batch_spec)
+             ):
+                 # Skip combinations that set Arrow-only options for vastdb runs
+                 continue
+             backend_kwargs = {
+                 "num_sub_splits": vdb_num_sub_splits,
+                 "num_row_groups_per_sub_split": vdb_num_row_groups_per_sub_split,
+                 "conn_details": conn_details,
+             }
+             path = None
+         elif query_backend is QueryBackend.pyarrow:
+             if not all((path, rowgroup_size, compression, arrow_batch_spec)) or any(
+                 (vdb_num_sub_splits, vdb_num_row_groups_per_sub_split)
+             ):
+                 # Skip pyarrow combinations lacking Arrow options or setting VastDB-only ones
+                 continue
+             # Copy the batching spec so the shared choice dict isn't mutated below
+             backend_kwargs: Dict[str, Union[str, int, VastConnDetails]] = dict(arrow_batch_spec)  # type: ignore[no-redef]
+             backend_kwargs["conn_details"] = conn_details
+             path = get_parquet_dataset_root(path, rowgroup_size, compression)  # type: ignore[arg-type]
+         else:
+             raise NotImplementedError(f"Unsupported query backend: {query_backend=!r}")
+
+         scen = BenchScenario(
+             key=f"{base_key}_{query_backend.name}",
+             func=query_backend.run_query,
+             kwargs=dict(
+                 universe=uni,
+                 columns=columns,
+                 from_t=from_t,
+                 to_t=to_t,
+                 path=path,
+                 backend_kwargs=backend_kwargs,
+             ),
+             default_result_params={
+                 "uni_sz": len(uni),  # type: ignore
+                 "num_days": (to_t - from_t).days,  # type: ignore
+                 "query_backend": query_backend.name,  # type: ignore
+                 "fs_type": infer_fs_type(path or ""),  # type: ignore
+                 "rowgroup_size": rowgroup_size or -1,  # type: ignore
+                 "arrow_fragment_readahead": (  # type: ignore
+                     arrow_batch_spec["fragment_readahead"] if arrow_batch_spec else -1  # type: ignore
+                 ),
+                 "arrow_batch_readahead": (  # type: ignore
+                     arrow_batch_spec["batch_readahead"] if arrow_batch_spec else -1  # type: ignore
+                 ),
+                 "arrow_batch_size": (  # type: ignore
+                     arrow_batch_spec["batch_size"] if arrow_batch_spec else -1  # type: ignore
+                 ),
+                 "vdb_num_sub_splits": vdb_num_sub_splits or -1,  # type: ignore
+                 "vdb_num_row_groups_per_sub_split": vdb_num_row_groups_per_sub_split or -1,  # type: ignore
+             },
+         )
+
+         scenarios.append(scen)
+
+     return scenarios
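The generator enumerates the full Cartesian product of the per-backend option axes (each padded with a trailing None) and then discards mixed combinations, so each surviving scenario configures exactly one backend. A stripped-down illustration of that pattern, not the vastdb API:

from itertools import product

# Hypothetical axes: one Arrow-only option and one VastDB-only option, padded with None.
arrow_paths = ("s3://bench/parquet", None)
vdb_sub_splits = (4, None)

kept = [
    (path, splits)
    for path, splits in product(arrow_paths, vdb_sub_splits)
    if (path is None) != (splits is None)  # keep combos configuring exactly one backend
]
print(kept)  # [('s3://bench/parquet', None), (None, 4)]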
vastdb/bench/perf_bench/query/arrow_common.py
@@ -0,0 +1,59 @@
+ # ruff: noqa: F841
+
+ import datetime as dt
+ from functools import reduce
+ from typing import Optional, Sequence
+
+ import pyarrow as pa
+ from pyarrow import dataset as ds
+
+ from vastdb.bench.perf_bench.common import utils as bu
+ from vastdb.bench.perf_bench.dataset import secmaster as sm
+ from vastdb.bench.perf_bench.dataset.schemas import BF
+
+
+ def build_arrow_filter(
+     universe: Sequence[str],
+     from_t: Optional[dt.datetime] = None,
+     to_t: Optional[dt.datetime] = None,
+     use_sid: bool = True,
+ ) -> ds.Expression:
+     # Build the filters
+     y_m_filter = from_t_filter = to_t_filter = None
+     from_t, to_t = bu.to_ts(from_t, normalize=False), bu.to_ts(to_t, normalize=False)
+     if from_t or to_t:
+         all_dates = bu.get_dates_range(from_t, to_t, inclusive="left")
+         years = sorted({d.year for d in all_dates})
+
+         y_m_filter = ds.field("year").isin(years) & ds.field("date").isin(
+             tuple(d.strftime("%Y%m%d") for d in all_dates)
+         )
+
+     if from_t and from_t.time() != dt.time.min:
+         from_t_filter = ds.field("ts") >= pa.scalar(from_t)
+     if to_t and to_t.time() != dt.time.min:
+         to_t_filter = ds.field("ts") < pa.scalar(to_t)
+
+     if use_sid:
+         universe = [sm.to_sid(s) for s in universe]  # type: ignore[misc]
+         fld = BF.sid.value
+     else:
+         fld = BF.ticker.value
+     universe = sorted(universe)
+     if len(universe) <= 16:
+         tickers_filter = reduce(
+             lambda x, y: x | y, [ds.field(fld) == s for s in universe]
+         )
+     else:
+         tickers_filter = ds.field(fld).isin(universe)
+
+     ds_filter = reduce(
+         lambda x, y: x & y,
+         [
+             flt
+             for flt in (y_m_filter, tickers_filter, from_t_filter, to_t_filter)
+             if flt is not None
+         ],
+     )
+
+     return ds_filter
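For reference, the composed result is an ordinary pyarrow.dataset expression; an equivalent hand-built filter (partition field names taken from the code above, values made up) would look like:

import datetime as dt

import pyarrow as pa
from pyarrow import dataset as ds

flt = (
    ds.field("year").isin([2024])
    & ds.field("date").isin(["20240102"])
    & (ds.field("ts") >= pa.scalar(dt.datetime(2024, 1, 2, 9, 30)))
    & ds.field("ticker").isin(["AAPL", "MSFT"])
)
print(flt)  # a ds.Expression, usable as dataset.to_table(filter=flt)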
vastdb/bench/perf_bench/query/query.py
@@ -0,0 +1,42 @@
+ import datetime as dt
+ import logging
+ from typing import Any, Dict, Optional, Protocol, Sequence
+
+ import pyarrow as pa
+
+ from vastdb.bench.perf_bench.common.types import StrEnum
+
+
+ class QueryRunner(Protocol):
+     def __call__(
+         self,
+         universe: Sequence[str],
+         columns: Optional[Sequence[str]],
+         from_t: Optional[dt.datetime],
+         to_t: Optional[dt.datetime],
+         path: Optional[str],
+         use_sid: bool,
+         logger: Optional[logging.Logger],
+         backend_kwargs: Optional[Dict[str, Any]],
+     ) -> pa.Table:
+         ...
+
+
+ class QueryBackend(StrEnum):
+     vastdb = "vastdb"
+     pyarrow = "pyarrow"
+
+     @property
+     def _query_fun(self) -> QueryRunner:
+         if self is QueryBackend.vastdb:
+             from vastdb.bench.perf_bench.query.query_vastdb import query_vastdb
+
+             return query_vastdb
+         elif self is QueryBackend.pyarrow:
+             from vastdb.bench.perf_bench.query.query_pyarrow import query_pyarrow
+
+             return query_pyarrow
+         raise NotImplementedError(f"Unsupported query backend: {self}")
+
+     def run_query(self, *args, **kwargs) -> pa.Table:
+         return self._query_fun(*args, **kwargs)
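The enum doubles as a dispatcher, so callers can select a backend by name and invoke it uniformly. A brief sketch:

from vastdb.bench.perf_bench.query.query import QueryBackend

backend = QueryBackend["pyarrow"]  # lookup by name, as scenario_generator does
assert backend is QueryBackend.pyarrow
run = backend.run_query  # lazily imports and forwards to query_pyarrow when called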
vastdb/bench/perf_bench/query/query_pyarrow.py
@@ -0,0 +1,70 @@
+ # ruff: noqa: F841
+
+ import datetime as dt
+ import logging
+ from typing import Any, Dict, Optional, Sequence
+
+ import pyarrow as pa
+
+ import vastdb.bench.perf_bench.common.log_utils
+ from vastdb.bench.perf_bench.common import utils as bu
+ from vastdb.bench.perf_bench.common.constants import (
+     DEFAULT_ARROW_KWARGS,
+     VastConnDetails,
+ )
+ from vastdb.bench.perf_bench.query.arrow_common import build_arrow_filter
+
+ LOG = vastdb.bench.perf_bench.common.log_utils.get_logger(__name__)
+
+
+ # noinspection PyUnusedLocal
+ def query_pyarrow(
+     universe: Sequence[str],
+     columns: Optional[Sequence[str]] = None,
+     from_t: Optional[dt.datetime] = None,
+     to_t: Optional[dt.datetime] = None,
+     path: Optional[str] = None,
+     use_sid: bool = True,
+     logger: Optional[logging.Logger] = None,
+     backend_kwargs: Optional[Dict[str, Any]] = None,
+ ) -> pa.Table:
+     # ------------------------------------------------------------
+     # Query the parquet dataset through PyArrow
+     # ------------------------------------------------------------
+     backend_kwargs = backend_kwargs or {}
+     conn_details: Optional[VastConnDetails] = backend_kwargs.pop("conn_details", None)
+     conn_details = conn_details or VastConnDetails()
+     fs_kwargs = {
+         "botocore_client_kwargs": {
+             "aws_access_key_id": conn_details.access,
+             "aws_secret_access_key": conn_details.secret,
+             "host": conn_details.s3_host,
+         }
+     }
+
+     # Note: this can be sped up by passing an explicit filesystem object and/or
+     # partitioning scheme, avoiding the cost of discovering the partitioning on init.
+     dset = bu.get_parquet_dataset(path=path, filesystem=None, fs_kwargs=fs_kwargs)  # type: ignore[arg-type]
+
+     ds_filter = build_arrow_filter(
+         universe=universe,
+         from_t=from_t,
+         to_t=to_t,
+         use_sid=use_sid,
+     )
+
+     # Clean up the kwargs before passing them to the Arrow scanner; copy so that
+     # pop() can't mutate the module-level defaults
+     kwargs = dict(backend_kwargs or DEFAULT_ARROW_KWARGS)
+     kwargs.pop("arrow_kwargs", None)
+     if flt := kwargs.pop("filter", None):
+         ds_filter = ds_filter & flt
+
+     # Read the data
+     # noinspection PyArgumentList
+     table = dset.to_table(
+         columns=columns,
+         filter=ds_filter,
+         **{**DEFAULT_ARROW_KWARGS, **(kwargs or {})},
+     )
+
+     return table
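And a hedged example call; the dataset path and column names below are illustrative, not shipped defaults, and a reachable parquet dataset is assumed:

import datetime as dt

from vastdb.bench.perf_bench.query.query_pyarrow import query_pyarrow

table = query_pyarrow(
    universe=["AAPL", "MSFT"],
    columns=["ts", "close"],  # illustrative column names
    from_t=dt.datetime(2024, 1, 2),
    to_t=dt.datetime(2024, 1, 3),
    path="s3://bench/parquet",  # illustrative dataset root
    use_sid=False,
)
print(table.num_rows)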