vastdb 0.1.11__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vastdb/_internal.py +46 -17
- vastdb/bench/perf_bench/__init__.py +0 -0
- vastdb/bench/perf_bench/bench_repo/__init__.py +0 -0
- vastdb/bench/perf_bench/bench_repo/mega_combo.py +87 -0
- vastdb/bench/perf_bench/cli.py +225 -0
- vastdb/bench/perf_bench/common/__init__.py +0 -0
- vastdb/bench/perf_bench/common/constants.py +96 -0
- vastdb/bench/perf_bench/common/log_utils.py +67 -0
- vastdb/bench/perf_bench/common/types.py +34 -0
- vastdb/bench/perf_bench/common/utils.py +219 -0
- vastdb/bench/perf_bench/dataset/__init__.py +0 -0
- vastdb/bench/perf_bench/dataset/generate_secmaster.py +105 -0
- vastdb/bench/perf_bench/dataset/generate_stocks_dataset.py +242 -0
- vastdb/bench/perf_bench/dataset/schemas.py +101 -0
- vastdb/bench/perf_bench/dataset/secmaster.py +33 -0
- vastdb/bench/perf_bench/orchestrate/__init__.py +0 -0
- vastdb/bench/perf_bench/orchestrate/bench_spec.py +91 -0
- vastdb/bench/perf_bench/orchestrate/results_helpers.py +126 -0
- vastdb/bench/perf_bench/orchestrate/scenario.py +109 -0
- vastdb/bench/perf_bench/orchestrate/scenario_generator.py +144 -0
- vastdb/bench/perf_bench/query/__init__.py +0 -0
- vastdb/bench/perf_bench/query/arrow_common.py +59 -0
- vastdb/bench/perf_bench/query/query.py +42 -0
- vastdb/bench/perf_bench/query/query_pyarrow.py +70 -0
- vastdb/bench/perf_bench/query/query_vastdb.py +78 -0
- vastdb/bench/perf_bench/run.py +79 -0
- vastdb/table.py +34 -34
- vastdb/tests/test_nested.py +58 -0
- {vastdb-0.1.11.dist-info → vastdb-1.1.0.dist-info}/METADATA +2 -2
- {vastdb-0.1.11.dist-info → vastdb-1.1.0.dist-info}/RECORD +33 -8
- {vastdb-0.1.11.dist-info → vastdb-1.1.0.dist-info}/WHEEL +1 -1
- {vastdb-0.1.11.dist-info → vastdb-1.1.0.dist-info}/LICENSE +0 -0
- {vastdb-0.1.11.dist-info → vastdb-1.1.0.dist-info}/top_level.txt +0 -0
vastdb/_internal.py
CHANGED
|
@@ -35,6 +35,7 @@ from ibis.expr.operations.logical import (
|
|
|
35
35
|
)
|
|
36
36
|
from ibis.expr.operations.relations import Field
|
|
37
37
|
from ibis.expr.operations.strings import StringContains
|
|
38
|
+
from ibis.expr.operations.structs import StructField
|
|
38
39
|
|
|
39
40
|
import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BinaryLiteral as fb_binary_lit
|
|
40
41
|
import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BooleanLiteral as fb_bool_lit
|
|
@@ -182,7 +183,7 @@ class Predicate:
|
|
|
182
183
|
_logger.debug('OR args: %s op %s', or_args, op)
|
|
183
184
|
inner_offsets = []
|
|
184
185
|
|
|
185
|
-
|
|
186
|
+
prev_field_path = None
|
|
186
187
|
for inner_op in or_args:
|
|
187
188
|
_logger.debug('inner_op %s', inner_op)
|
|
188
189
|
op_type = type(inner_op)
|
|
@@ -216,28 +217,38 @@ class Predicate:
|
|
|
216
217
|
if not isinstance(literal, Literal):
|
|
217
218
|
raise NotImplementedError(self.expr)
|
|
218
219
|
|
|
220
|
+
field_path = []
|
|
221
|
+
while isinstance(column, StructField):
|
|
222
|
+
column, subfield_name = column.args
|
|
223
|
+
field_path.append(subfield_name)
|
|
224
|
+
|
|
219
225
|
if not isinstance(column, Field):
|
|
220
226
|
raise NotImplementedError(self.expr)
|
|
221
227
|
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
228
|
+
field_path.append(column.name)
|
|
229
|
+
field_path.reverse() # first entry should be the top-level column name
|
|
230
|
+
|
|
231
|
+
if prev_field_path is None:
|
|
232
|
+
prev_field_path = field_path
|
|
233
|
+
elif prev_field_path != field_path:
|
|
226
234
|
raise NotImplementedError(self.expr)
|
|
227
235
|
|
|
228
|
-
|
|
236
|
+
nodes_map = self.nodes_map
|
|
237
|
+
for name in field_path:
|
|
238
|
+
node = nodes_map[name]
|
|
239
|
+
nodes_map = node.children_map
|
|
240
|
+
|
|
229
241
|
# TODO: support predicate pushdown for leaf nodes (ORION-160338)
|
|
230
242
|
if node.children:
|
|
231
243
|
raise NotImplementedError(node.field) # no predicate pushdown for nested columns
|
|
232
244
|
column_offset = self.build_column(position=node.index)
|
|
233
|
-
field = self.schema.field(field_name)
|
|
234
245
|
for literal in literals:
|
|
235
246
|
args_offsets = [column_offset]
|
|
236
247
|
if literal is not None:
|
|
237
|
-
args_offsets.append(self.build_literal(field=field, value=literal.value))
|
|
248
|
+
args_offsets.append(self.build_literal(field=node.field, value=literal.value))
|
|
238
249
|
if builder_func == self.build_between:
|
|
239
|
-
args_offsets.append(self.build_literal(field=field, value=lower.value))
|
|
240
|
-
args_offsets.append(self.build_literal(field=field, value=upper.value))
|
|
250
|
+
args_offsets.append(self.build_literal(field=node.field, value=lower.value))
|
|
251
|
+
args_offsets.append(self.build_literal(field=node.field, value=upper.value))
|
|
241
252
|
|
|
242
253
|
inner_offsets.append(builder_func(*args_offsets))
|
|
243
254
|
|
|
@@ -572,6 +583,8 @@ class FieldNode:
|
|
|
572
583
|
else:
|
|
573
584
|
self.children = [] # for non-nested types
|
|
574
585
|
|
|
586
|
+
self.children_map = {c.field.name: c for c in self.children}
|
|
587
|
+
|
|
575
588
|
def _iter_to_root(self) -> Iterator['FieldNode']:
|
|
576
589
|
yield self
|
|
577
590
|
if self.parent is not None:
|
|
@@ -748,7 +761,6 @@ class VastdbApi:
|
|
|
748
761
|
|
|
749
762
|
def __init__(self, endpoint, access_key, secret_key,
|
|
750
763
|
*,
|
|
751
|
-
auth_type=AuthType.SIGV4,
|
|
752
764
|
ssl_verify=True,
|
|
753
765
|
timeout=None,
|
|
754
766
|
backoff_config: Optional[BackoffConfig] = None):
|
|
@@ -766,15 +778,15 @@ class VastdbApi:
|
|
|
766
778
|
self._session.verify = ssl_verify
|
|
767
779
|
self._session.headers['user-agent'] = self.client_sdk_version
|
|
768
780
|
|
|
769
|
-
backoff_config = backoff_config or BackoffConfig()
|
|
781
|
+
self.backoff_config = backoff_config or BackoffConfig()
|
|
770
782
|
self._backoff_decorator = backoff.on_exception(
|
|
771
|
-
wait_gen=backoff_config.wait_gen,
|
|
783
|
+
wait_gen=self.backoff_config.wait_gen,
|
|
772
784
|
exception=_RETRIABLE_EXCEPTIONS,
|
|
773
785
|
giveup=_backoff_giveup,
|
|
774
|
-
max_tries=backoff_config.max_tries,
|
|
775
|
-
max_time=backoff_config.max_time,
|
|
776
|
-
max_value=backoff_config.max_value, # passed to `backoff_config.wait_gen`
|
|
777
|
-
backoff_log_level=backoff_config.backoff_log_level)
|
|
786
|
+
max_tries=self.backoff_config.max_tries,
|
|
787
|
+
max_time=self.backoff_config.max_time,
|
|
788
|
+
max_value=self.backoff_config.max_value, # passed to `self.backoff_config.wait_gen`
|
|
789
|
+
backoff_log_level=self.backoff_config.backoff_log_level)
|
|
778
790
|
self._request = self._backoff_decorator(self._single_request)
|
|
779
791
|
|
|
780
792
|
if url.port in {80, 443, None}:
|
|
@@ -812,6 +824,23 @@ class VastdbApi:
|
|
|
812
824
|
_logger.critical(msg)
|
|
813
825
|
raise NotImplementedError(msg)
|
|
814
826
|
|
|
827
|
+
def __enter__(self):
    """Enter the ``with`` block and hand back this API object itself.

    Nothing is opened here; the method only returns ``self`` so the instance
    can be bound by ``with ... as api:``.
    """
    return self
|
|
830
|
+
|
|
831
|
+
def __exit__(self, *args):
    """Close the underlying session when the ``with`` block is left.

    The exception details received in ``*args`` are ignored and ``None`` is
    returned, so an exception raised inside the block keeps propagating.
    """
    self._session.close()
|
|
834
|
+
|
|
835
|
+
def with_endpoint(self, endpoint):
    """Return a new ``VastdbApi`` that talks to ``endpoint``.

    The credentials, SSL-verification setting, timeout and backoff
    configuration are copied from this instance; only the endpoint differs.
    """
    inherited_settings = {
        "access_key": self.access_key,
        "secret_key": self.secret_key,
        "ssl_verify": self._session.verify,
        "timeout": self.timeout,
        "backoff_config": self.backoff_config,
    }
    return VastdbApi(endpoint=endpoint, **inherited_settings)
|
|
843
|
+
|
|
815
844
|
def _single_request(self, *, method, url, skip_status_check=False, **kwargs):
|
|
816
845
|
_logger.debug("Sending request: %s %s %s timeout=%s", method, url, kwargs, self.timeout)
|
|
817
846
|
try:
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
|
|
3
|
+
from vastdb.bench.perf_bench.common.constants import (
|
|
4
|
+
LOCAL_FS_DS_PATH,
|
|
5
|
+
NFS_DS_PATH, # noqa: F401
|
|
6
|
+
S3_DS_PATH, # noqa: F401
|
|
7
|
+
ParquetCompression,
|
|
8
|
+
VastConnDetails,
|
|
9
|
+
)
|
|
10
|
+
from vastdb.bench.perf_bench.dataset.schemas import DEFAULT_BARS_COLUMNS
|
|
11
|
+
from vastdb.bench.perf_bench.orchestrate.scenario import BenchScenario
|
|
12
|
+
from vastdb.bench.perf_bench.orchestrate.scenario_generator import (
|
|
13
|
+
generate_perf_bench_scenarios,
|
|
14
|
+
)
|
|
15
|
+
from vastdb.bench.perf_bench.query.query import QueryBackend
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def build_scenarios(
    base_key: str,
    conn_details: Optional[VastConnDetails] = None,
) -> List[BenchScenario]:
    """Build the scenario list for the "mega combo" benchmark.

    All parameter choices below are forwarded to
    ``generate_perf_bench_scenarios``. Commented-out entries are alternative
    values that are currently disabled.

    Args:
        base_key: Passed through unchanged as ``base_key``.
        conn_details: Connection settings; a default ``VastConnDetails()`` is
            created when this is ``None`` (or otherwise falsy).

    Returns:
        Whatever ``generate_perf_bench_scenarios`` returns for these choices.
    """
    return generate_perf_bench_scenarios(
        base_key=base_key,
        conn_details=conn_details or VastConnDetails(),
        # Only the pyarrow backend is enabled here.
        query_backends=[
            QueryBackend.pyarrow,
            # QueryBackend.vastdb,
        ],
        columns_choices=(DEFAULT_BARS_COLUMNS,),
        universe_choices=(
            "Single",
            "Tiny",
            "SmallSeq",
            "Medium",
            "Medium2",
            "Large",
        ),
        # Number of business days covered by each query window.
        num_bdays=[
            1,  # 1d
            5,  # 1w
            # 22,  # 1m
            65,  # 3m
            # 130,  # 6m
            252,  # 1y
        ],

        # Arrow-specific options
        fs_path_choices=[
            # NFS_DS_PATH,
            LOCAL_FS_DS_PATH,
            # S3_DS_PATH,
        ],
        rowgroup_size_choices=[  # make sure you have previously generated the respective datasets
            # 64 * 1024,
            # 128 * 1024,
            256 * 1024,
            # 512 * 1024,
            # DEFAULT_ROW_GROUP_SIZE,
            # int(1.5 * 1024 * 1024),
        ],
        compression_choices=[
            ParquetCompression.LZ4,
        ],
        arrow_batching_spec_choices=[
            # {"batch_size": 2*2**16, "batch_readahead": 16, "fragment_readahead": 4},
            # {"batch_size": 6*2**16, "batch_readahead": 12, "fragment_readahead": 4},
            # DEFAULT_ARROW_KWARGS,
            {"batch_size": 16 * 2 ** 16, "batch_readahead": 16, "fragment_readahead": 4},
            # {"batch_size": 24 * 2 ** 16, "batch_readahead": 12, "fragment_readahead": 4},
            # {"batch_size": 32*2**16, "batch_readahead": 12, "fragment_readahead": 4},
            # {"batch_size": 64*2**16, "batch_readahead": 12, "fragment_readahead": 4},
            # {"batch_size": 128*2**16, "batch_readahead": 12, "fragment_readahead": 4},
        ],

        # VastDB-specific options
        vdb_num_sub_splits_choices=(
            # 1,  # Default
            # 4,
            8,
            # 16,
        ),
        vdb_num_row_groups_per_sub_split_choices=(
            # 1,
            # 4,
            8,  # Default
        ),
    )
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Annotated, List, Optional
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import typer
|
|
6
|
+
|
|
7
|
+
from vastdb.bench.perf_bench.common.constants import (
|
|
8
|
+
DEFAULT_END_T,
|
|
9
|
+
DEFAULT_RESULTS_DIR,
|
|
10
|
+
DEFAULT_START_T,
|
|
11
|
+
DFAULT_PARQUET_COMPRESSION,
|
|
12
|
+
LOCAL_FS_DS_PATH,
|
|
13
|
+
LogLevel,
|
|
14
|
+
ParquetCompression,
|
|
15
|
+
)
|
|
16
|
+
from vastdb.bench.perf_bench.common.log_utils import (
|
|
17
|
+
get_logger,
|
|
18
|
+
set_log_file,
|
|
19
|
+
set_log_level,
|
|
20
|
+
)
|
|
21
|
+
from vastdb.bench.perf_bench.common.utils import getenv_flag, load_module_from_path
|
|
22
|
+
from vastdb.bench.perf_bench.dataset.generate_secmaster import (
|
|
23
|
+
SM_PATH,
|
|
24
|
+
generate_secmaster,
|
|
25
|
+
)
|
|
26
|
+
from vastdb.bench.perf_bench.dataset.generate_stocks_dataset import (
|
|
27
|
+
generate_concurrent_synthetic_stock_1m_bars,
|
|
28
|
+
)
|
|
29
|
+
from vastdb.bench.perf_bench.orchestrate.scenario import BenchScenario
|
|
30
|
+
from vastdb.bench.perf_bench.run import run_scenarios
|
|
31
|
+
|
|
32
|
+
# Typer application that the commands below register themselves on. Pretty
# exception rendering is controlled by the TYPER_PRETTY_EXCEPTIONS flag,
# read through getenv_flag.
app = typer.Typer(pretty_exceptions_enable=getenv_flag("TYPER_PRETTY_EXCEPTIONS"))

# Directory containing this module; used to locate the default bench generator.
_MY_DIR = Path(__file__).parent
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# noinspection PyUnusedLocal
|
|
38
|
+
@app.callback()
def cli_common(
    ctx: typer.Context,
    verbose: Annotated[
        bool,
        typer.Option(
            "--verbose",
            is_flag=True,
        ),
    ] = False,
    log_level: Annotated[
        Optional[LogLevel],
        typer.Option(
            "--log-level",
            case_sensitive=False,
        ),
    ] = None,
    log_file: Annotated[
        Optional[Path],
        typer.Option(
            "--log-file",
            writable=True,
            file_okay=True,
            dir_okay=False,
            resolve_path=True,
        ),
    ] = None,
):
    # Shared set-up that runs before any sub-command: choose the log level,
    # optionally register a log file, then emit a first log record.
    # (Documented with comments rather than a docstring because Typer shows a
    # callback's docstring as CLI help text.)

    # --verbose takes precedence over an explicit --log-level.
    effective_level = LogLevel.DEBUG if verbose else log_level
    if effective_level:
        set_log_level(effective_level)
    if log_file:
        set_log_file(log_file)
    get_logger(__name__).info("CLI common setup done.")
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _positive_int(value: str) -> int:
    """Convert ``value`` with ``int()`` and require the result to be > 0.

    Returns:
        The converted integer.

    Raises:
        typer.BadParameter: If the converted value is zero or negative.
    """
    parsed = int(value)
    if parsed > 0:
        return parsed
    raise typer.BadParameter(f"Must be a positive integer: {value}.")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# noinspection PyUnusedLocal
|
|
83
|
+
@app.command()
def run_bench(
    ctx: typer.Context,
    bench_name: Annotated[
        str,
        typer.Option(
            "--bench-name",
        ),
    ],
    parallelism: Annotated[
        List[int],
        typer.Option(
            "--parallelism",
            callback=lambda par: [_positive_int(p) for p in par],
        ),
    ],
    runs_per_bench: Annotated[
        int,
        typer.Option(
            "--runs-per-bench",
            callback=_positive_int,
        ),
    ] = 3,
    bench_generator_path: Annotated[
        Path,
        typer.Option(
            "--bench-generator-path",
            readable=True,
            file_okay=True,
            dir_okay=False,
            resolve_path=True,
        ),
    ] = _MY_DIR / "bench_repo" / "mega_combo.py",
    results_base_dir: Annotated[
        Path,
        typer.Option(
            # This option used to be exposed only as "--log-file", a name that
            # describes the callback's log file rather than a results
            # directory. The descriptive name comes first; the old spelling is
            # kept as an alias so existing invocations keep working.
            "--results-base-dir",
            "--log-file",
            writable=True,
            file_okay=False,
            dir_okay=True,
            resolve_path=True,
        ),
    ] = DEFAULT_RESULTS_DIR,
):
    # Load a scenario-generator module from ``bench_generator_path``, build its
    # scenarios for ``bench_name`` and run them once per requested parallelism.
    # (Comments instead of a docstring: Typer would show a docstring as the
    # command's help text.)
    #
    # Raises typer.BadParameter when ``bench_name`` is empty after stripping.
    if not (bench_name := bench_name.strip()):
        raise typer.BadParameter("Bench name must be non-empty.")

    mod = load_module_from_path(bench_generator_path)
    # The generator module must expose build_scenarios(base_key=...).
    scenarios: List[BenchScenario] = mod.build_scenarios(base_key=bench_name)
    for para in parallelism:
        run_scenarios(
            scenarios=scenarios,
            runs_per_bench=runs_per_bench,
            parallelism=para,
            results_base_dir=str(results_base_dir),
        )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# noinspection PyUnusedLocal
|
|
142
|
+
@app.command()
def build_secmaster(
    ctx: typer.Context,
):
    # CLI command that simply delegates to generate_secmaster(); ``ctx`` is
    # accepted but not used.
    generate_secmaster()
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# noinspection PyUnusedLocal
|
|
150
|
+
@app.command()
def build_dataset(
    ctx: typer.Context,
    start_date: Annotated[
        str,
        typer.Option(
            "--start-date",
            help="Start date for the dataset.",
            # The callback replaces the raw string with a normalized Timestamp.
            callback=lambda d: pd.Timestamp(d).normalize(),
        ),
    ] = DEFAULT_START_T.strftime("%Y%m%d"),
    end_date: Annotated[
        str,
        typer.Option(
            "--end-date",
            # Help text previously read "Start date ..." (copy/paste slip).
            help="End date for the dataset.",
            callback=lambda d: pd.Timestamp(d).normalize(),
        ),
    ] = DEFAULT_END_T.strftime("%Y%m%d"),
    output_dir: Annotated[
        Path,
        typer.Option(
            "--output-dir",
            writable=True,
            file_okay=False,
            dir_okay=True,
            resolve_path=True,
        ),
    ] = LOCAL_FS_DS_PATH,
    parallelism: Annotated[
        int,
        typer.Option(
            "--parallelism",
            callback=_positive_int,
        ),
    ] = 6,
    row_group_size: Annotated[
        int,
        typer.Option(
            "--row-group-size",
            callback=_positive_int,
            help=(
                "Row group size for the dataset, some common values are: 64 * 1024, 128 * 1024, 256"
                " * 1024, 512 * 1024,1024 * 1024, 1.5 * 1024 * 1024."
            ),
        ),
    ] = 256 * 1024,
    compression: Annotated[
        ParquetCompression,
        typer.Option(
            "--compression",
            help="Parquet compression algorithm.",
        ),
    ] = DFAULT_PARQUET_COMPRESSION,
):
    # Validate the options, make sure the security master file exists, then
    # generate the synthetic 1-minute bars dataset.
    # (Comments instead of a docstring: Typer would show a docstring as the
    # command's help text.)
    #
    # Raises typer.BadParameter for a row group size below 1024, a parallelism
    # below 1, or a start date later than the end date.
    if row_group_size < 1024:
        raise typer.BadParameter("Row group size must be at least 1024.")
    if parallelism < 1:
        raise typer.BadParameter("Parallelism must be at least 1.")
    if start_date > end_date:
        raise typer.BadParameter("Start date must be before the end date.")
    # Generate the security master on demand when its file is missing.
    if not SM_PATH.is_file():
        generate_secmaster()
    generate_concurrent_synthetic_stock_1m_bars(
        from_t=start_date,
        to_t=end_date,
        output_dir=output_dir,
        num_workers=parallelism,
        row_group_size=row_group_size,
        compression=compression,
    )
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
if __name__ == "__main__":
    # Run the Typer application only when this file is executed as a script,
    # not when the module is merely imported.
    app()
|
|
File without changes
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from urllib3.util import parse_url
|
|
8
|
+
|
|
9
|
+
from vastdb.bench.perf_bench.common.types import StrEnum
|
|
10
|
+
|
|
11
|
+
# Directory that contains this module.
_MY_DIR = Path(__file__).parent

# VastDB details -- each is read from the environment, "" when unset.
VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT", "")
VASTDB_BUCKET_NAME = os.getenv("VASTDB_BUCKET_NAME", "")
VASTDB_TEST_SCHEMA_NAME = os.getenv("VASTDB_TEST_SCHEMA_NAME", "")
VASTDB_TEST_TABLE_NAME = os.getenv("VASTDB_TEST_TABLE_NAME", "")

# Regular S3 details
S3_BUCKET_NAME = "my-s3-bucket"
DEFAULT_S3_SSL_PORT = int(os.getenv("DEFAULT_S3_SSL_PORT", 443))
DEFAULT_S3_ENDPOINT_URL = os.getenv("AWS_S3_ENDPOINT_URL", "")
if DEFAULT_S3_ENDPOINT_URL:
    # An explicit endpoint URL wins: host and port are taken from it
    # (port 80 when the URL carries none).
    _parsed = parse_url(DEFAULT_S3_ENDPOINT_URL)
    DEFAULT_S3_HOST = str(_parsed.host)
    DEFAULT_S3_PORT = int(_parsed.port or 80)
else:
    # Otherwise host/port come from their own variables and the URL is
    # assembled from them.
    DEFAULT_S3_HOST = os.getenv("DEFAULT_S3_HOST", "1.1.1.1")
    DEFAULT_S3_PORT = int(os.getenv("DEFAULT_S3_PORT", 80))
    DEFAULT_S3_ENDPOINT_URL = f"http://{DEFAULT_S3_HOST}:{DEFAULT_S3_PORT}"

# Paths
DEFAULT_RESULTS_DIR = _MY_DIR.parent / "benchmark_results"
NFS_DS_PATH = f"/mnt/data/{VASTDB_BUCKET_NAME}/data"
LOCAL_FS_DS_PATH = _MY_DIR.parent / "dataset" / "test_dataset"
S3_DS_PATH = f"s3://{VASTDB_BUCKET_NAME}/data"

# Access keys
DEFAULT_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID", "some_access_key")
DEFAULT_SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", "some_secret_key")

# Dataset start/end dates
DEFAULT_START_T = pd.Timestamp("20180101")
DEFAULT_END_T = pd.Timestamp("20200201")

# Arrow related constants
DEFAULT_ROW_GROUP_SIZE = 1024 * 1024
DEFAULT_ARROW_BATCH_SIZE = 786432
DEFAULT_ARROW_BATCH_READAHEAD = 12
DEFAULT_ARROW_BATCH_FRAGMENT_READAHEAD = 4
DEFAULT_ARROW_KWARGS = {
    "batch_size": DEFAULT_ARROW_BATCH_SIZE,
    "batch_readahead": DEFAULT_ARROW_BATCH_READAHEAD,
    "fragment_readahead": DEFAULT_ARROW_BATCH_FRAGMENT_READAHEAD,
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ParquetCompression(StrEnum):
    """Parquet compression codecs that the benchmark tooling can select."""

    NONE = "NONE"
    SNAPPY = "SNAPPY"
    GZIP = "GZIP"
    # LZO = "LZO"
    BROTLI = "BROTLI"
    LZ4 = "LZ4"
    ZSTD = "ZSTD"


# Default codec, spelled consistently with the other DEFAULT_* constants.
DEFAULT_PARQUET_COMPRESSION = ParquetCompression.LZ4
# Original (misspelled) name, kept as an alias because the CLI module imports
# the constant under this spelling.
DFAULT_PARQUET_COMPRESSION = DEFAULT_PARQUET_COMPRESSION
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class LogLevel(StrEnum):
    """Log level names, each convertible to its numeric ``logging`` level."""

    CRITICAL = "CRITICAL"
    FATAL = "FATAL"
    ERROR = "ERROR"
    # WARN and WARNING share one value, so WARNING is an alias of WARN.
    WARN = "WARNING"
    WARNING = "WARNING"
    INFO = "INFO"
    DEBUG = "DEBUG"
    NOTSET = "NOTSET"

    def to_int(self) -> int:
        """Return the numeric ``logging`` level that carries this member's value as its name."""
        # Every value above is also a public constant of the logging module
        # (logging.DEBUG, logging.WARNING, ...), so read it from there.
        numeric_level: int = getattr(logging, self.value)
        return numeric_level
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass(frozen=True)
class VastConnDetails:
    """Immutable bundle of VAST / S3 connection settings.

    Every field defaults to the corresponding module-level constant, so
    ``VastConnDetails()`` reflects the values computed when this module was
    imported.
    """

    # Credentials (defaults: DEFAULT_ACCESS_KEY / DEFAULT_SECRET_KEY).
    access: str = DEFAULT_ACCESS_KEY
    secret: str = DEFAULT_SECRET_KEY
    # VastDB bucket, endpoint, SSL verification switch, schema and table.
    vastdb_bucket: str = VASTDB_BUCKET_NAME
    vastdb_endpoint: str = VASTDB_ENDPOINT
    vastdb_ssl_verify: bool = True
    vastdb_schema: str = VASTDB_TEST_SCHEMA_NAME
    vastdb_table: str = VASTDB_TEST_TABLE_NAME
    # Plain S3 access.
    s3_host: str = DEFAULT_S3_HOST
    # NOTE(review): defaults to VASTDB_BUCKET_NAME rather than S3_BUCKET_NAME --
    # presumably intentional (S3_DS_PATH also uses the VastDB bucket); confirm.
    s3_bucket: str = VASTDB_BUCKET_NAME
    s3_port: int = DEFAULT_S3_PORT
    s3_ssl_port: int = DEFAULT_S3_SSL_PORT
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from threading import RLock
|
|
4
|
+
from typing import Optional, Union
|
|
5
|
+
|
|
6
|
+
from vastdb.bench.perf_bench.common.constants import LogLevel
|
|
7
|
+
from vastdb.bench.perf_bench.common.types import PathLikeT
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class LogConfigError(Exception):
    """Raised when a logging setting is changed after logging was configured."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Module-level logging state shared by the functions below.
# Set to True by get_logger() once logging has been configured.
_logging_configured: bool = False
# Numeric level handed to logging.basicConfig().
_log_level: int = logging.INFO
# Optional file that receives log records in addition to the root handler.
_log_file: Optional[Path] = None
# Guards the state above.
_lock: RLock = RLock()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def set_log_file(log_file: PathLikeT) -> None:
    """Register ``log_file`` as an additional log destination.

    The parent directory and the file itself are created if missing. Must be
    called before the first ``get_logger()`` call.

    Args:
        log_file: Target path as ``str``, ``bytes`` or ``os.PathLike``.

    Raises:
        LogConfigError: If logging has already been configured.
    """
    import os  # local import: this module does not import os at file level

    global _log_file  # noqa: PLW0603
    with _lock:
        if _logging_configured:
            raise LogConfigError(
                "Cannot change log file after logging has been configured."
            )

        # os.fsdecode accepts str, bytes and PathLike alike; str() on a bytes
        # path would yield the literal "b'...'" instead of the real path.
        target = Path(os.fsdecode(log_file))
        target.parent.mkdir(parents=True, exist_ok=True)
        target.touch(exist_ok=True)
        _log_file = target
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_log_level() -> int:
    """Return the currently stored numeric log level."""
    return _log_level
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def set_log_level(level: Union[str, LogLevel, int]) -> None:
    """Store the level that ``get_logger()`` will configure logging with.

    Args:
        level: A numeric level, a ``LogLevel`` member, or a level name.
            Names are matched case-insensitively ("debug" == "DEBUG").

    Raises:
        KeyError: If a string does not name a ``LogLevel`` member.
        LogConfigError: If logging is already configured and ``level``
            differs from the stored level.
    """
    global _log_level  # noqa: PLW0603
    if isinstance(level, str):
        # LogLevel members are strings too, so this branch covers both plain
        # names and enum members. Upper-casing matches the case-insensitive
        # --log-level CLI option.
        level = LogLevel[level.upper()].to_int()
    with _lock:
        if level != _log_level and _logging_configured:
            raise LogConfigError(
                "Cannot change log level after logging has been configured."
            )
        _log_level = level
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def get_logger(name: Optional[str]) -> logging.Logger:
    """Return the logger called ``name``, configuring logging on first use.

    The first call runs ``logging.basicConfig`` with the stored level and, if
    a log file was registered, attaches a file handler to the root logger.
    Later calls only look the logger up.

    Args:
        name: Logger name; ``None`` yields the root logger.
    """
    global _logging_configured  # noqa: PLW0603
    with _lock:
        if not _logging_configured:
            log_format = (
                "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s"
            )
            date_format = "%Y-%m-%d %H:%M:%S"
            logging.basicConfig(
                level=_log_level,
                format=log_format,
                datefmt=date_format,
            )
            if _log_file:
                file_handler = logging.FileHandler(_log_file, mode="a")
                # Build the formatter explicitly rather than borrowing the one
                # on root.handlers[0]: basicConfig does nothing when the root
                # logger already has handlers, so handlers[0] may belong to
                # someone else and carry a different (or no) formatter.
                file_handler.setFormatter(
                    logging.Formatter(fmt=log_format, datefmt=date_format)
                )
                logging.getLogger().addHandler(file_handler)
            _logging_configured = True
    return logging.getLogger(name)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import datetime as dt
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
from typing import Union
|
|
5
|
+
|
|
6
|
+
# Anything accepted as a filesystem path: text, bytes or an os.PathLike object.
PathLikeT = Union[str, bytes, os.PathLike]
# A date given either as a string or as a datetime.date.
DateLikeT = Union[str, dt.date]
|
|
8
|
+
|
|
9
|
+
# Use the standard-library StrEnum where it exists (Python 3.11+); otherwise
# define a local stand-in with the same name.
if sys.version_info >= (3, 11):
    # noinspection PyUnresolvedReferences
    from enum import StrEnum
else:
    from enum import Enum, auto

    class StrEnum(str, Enum):
        """Fallback string-valued enum for interpreters older than Python 3.11.

        Members are ``str`` instances whose ``str()`` is their value.
        """

        # NOTE(review): the standard-library StrEnum lower-cases the member
        # name for auto(), whereas _generate_next_value_ below returns the
        # name unchanged -- confirm nothing relies on auto() across versions.

        # noinspection PyTypeChecker
        def __new__(cls, value: Union[auto, str], *args, **kwargs):
            # Reject non-string values up front with a descriptive error.
            if not isinstance(value, (str, auto)):
                raise TypeError(
                    f"Not a string/auto type: {value=!r} [type={type(value)}]"
                )
            return super().__new__(cls, value, *args, **kwargs)

        def __repr__(self):
            """Return a string representation of the enumeration member."""
            return f"<{self.__class__.__name__}.{self.name}: '{self.value}'>"

        def __str__(self):
            """Return the value of the enumeration member."""
            return str(self.value)

        @staticmethod
        def _generate_next_value_(name: str, *args, **kwargs):
            # Value produced for auto(): the member name itself.
            return name
|