vastdb 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vastdb/__init__.py +6 -3
- vastdb/_internal.py +9 -16
- vastdb/bench/test_perf.py +2 -2
- vastdb/bench/test_sample.py +217 -0
- vastdb/config.py +65 -0
- vastdb/conftest.py +28 -6
- vastdb/errors.py +0 -6
- vastdb/features.py +42 -0
- vastdb/schema.py +1 -2
- vastdb/session.py +12 -45
- vastdb/table.py +8 -52
- vastdb/tests/metrics.py +43 -0
- vastdb/tests/test_sanity.py +1 -1
- vastdb/tests/test_tables.py +29 -40
- vastdb/util.py +1 -8
- vastdb/vast_tests/test_scale.py +68 -0
- {vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/METADATA +1 -1
- {vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/RECORD +21 -16
- {vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/WHEEL +1 -1
- {vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/LICENSE +0 -0
- {vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/top_level.txt +0 -0
vastdb/__init__.py
CHANGED
@@ -1,9 +1,6 @@
 """VAST Database Python SDK."""
 
 import functools
-import importlib.metadata
-
-__version__ = importlib.metadata.distribution(__package__).version
 
 from . import session
 
@@ -12,3 +9,9 @@ from . import session
 @functools.wraps(session.Session)
 def connect(*args, **kwargs):  # noqa: D103
     return session.Session(*args, **kwargs)
+
+
+def version():
+    """Return VAST DB SDK version."""
+    import importlib
+    return importlib.metadata.distribution(__package__).version
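The module-level `__version__` attribute is replaced by a lazy `version()` helper. A minimal usage sketch (assuming the wheel is installed in the current environment):

    import vastdb

    # Reads the installed distribution's metadata at call time,
    # so the result reflects the wheel that is actually installed.
    print(vastdb.version())  # e.g. '0.1.11'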
vastdb/_internal.py
CHANGED
@@ -5,9 +5,8 @@ import re
 import struct
 import urllib.parse
 from collections import defaultdict, namedtuple
-from dataclasses import dataclass, field
 from enum import Enum
-from typing import Any,
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 
 import backoff
 import flatbuffers
@@ -104,6 +103,7 @@ from vast_flatbuf.tabular.ListSchemasResponse import ListSchemasResponse as list
 from vast_flatbuf.tabular.ListTablesResponse import ListTablesResponse as list_tables
 
 from . import errors
+from .config import BackoffConfig
 
 UINT64_MAX = 18446744073709551615
 
@@ -742,15 +742,6 @@ def _backoff_giveup(exc: Exception) -> bool:
     return True  # give up in case of other exceptions
 
 
-@dataclass
-class BackoffConfig:
-    wait_gen: Callable = field(default=backoff.expo)
-    max_value: Optional[float] = None  # max duration for a single wait period
-    max_tries: int = 10
-    max_time: float = 60.0  # in seconds
-    backoff_log_level: int = logging.DEBUG
-
-
 class VastdbApi:
     # we expect the vast version to be <major>.<minor>.<patch>.<protocol>
     VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')
@@ -759,15 +750,17 @@ class VastdbApi:
                  *,
                  auth_type=AuthType.SIGV4,
                  ssl_verify=True,
+                 timeout=None,
                  backoff_config: Optional[BackoffConfig] = None):
 
-        from . import
-        self.client_sdk_version = f"VAST Database Python SDK {
+        from . import version  # import lazily here (to avoid circular dependencies)
+        self.client_sdk_version = f"VAST Database Python SDK {version()} - 2024 (c)"
 
         url = urllib3.util.parse_url(endpoint)
         self.access_key = access_key
        self.secret_key = secret_key
 
+        self.timeout = timeout
         self.default_max_list_columns_page_size = 1000
         self._session = requests.Session()
         self._session.verify = ssl_verify
@@ -820,9 +813,9 @@ class VastdbApi:
            raise NotImplementedError(msg)
 
     def _single_request(self, *, method, url, skip_status_check=False, **kwargs):
-        _logger.debug("Sending request: %s %s %s", method, url, kwargs)
+        _logger.debug("Sending request: %s %s %s timeout=%s", method, url, kwargs, self.timeout)
         try:
-            res = self._session.request(method=method, url=url, **kwargs)
+            res = self._session.request(method=method, url=url, timeout=self.timeout, **kwargs)
         except requests.exceptions.ConnectionError as err:
             # low-level connection issue, it is safe to retry only read-only requests
             may_retry = (method == "GET")
@@ -1807,7 +1800,7 @@ def _iter_query_data_response_columns(fileobj, stream_ids=None):
                 batches.append(batch)
             except StopIteration:  # we got an end-of-stream IPC message for a given stream ID
                 reader, batches = readers.pop(stream_id)  # end of column
-                table = pa.Table.from_batches(batches)  # concatenate all column chunks (as a single)
+                table = pa.Table.from_batches(batches=batches, schema=reader.schema)  # concatenate all column chunks (as a single)
                 _logger.debug("stream_id=%d rows=%d column=%s", stream_id, len(table), table)
                 yield (stream_id, next_row_id, table)
 
vastdb/bench/test_perf.py
CHANGED
@@ -10,11 +10,11 @@ log = logging.getLogger(__name__)
 
 
 @pytest.mark.benchmark
-def test_bench(session,
+def test_bench(session, bucket_name, parquets_path, crater_path):
     files = [str(parquets_path / f) for f in (parquets_path.glob('**/*.pq'))]
 
     with session.transaction() as tx:
-        b = tx.bucket(
+        b = tx.bucket(bucket_name)
         s = b.create_schema('s1')
         t = util.create_table_from_files(s, 't1', files, config=ImportConfig(import_concurrency=8))
         config = QueryConfig(num_splits=8, num_sub_splits=4)
vastdb/bench/test_sample.py
ADDED
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+
+import functools
+import itertools
+import logging
+import os
+import random
+import threading
+import time
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+import numpy as np
+import pyarrow as pa
+
+import vastdb.errors
+from vastdb.table import INTERNAL_ROW_ID
+from vastdb.tests import metrics
+
+logging.basicConfig(
+    level="INFO",
+    format="%(asctime)s %(levelname)-10s %(process)d/%(thread)d %(filename)s:%(lineno)d %(message)s")
+
+log = logging.getLogger()
+
+sdk_version = vastdb.version()
+log.info("Python SDK version: %s", sdk_version)
+
+NUM_COLUMNS = 10_000
+COLUMNS_BATCH = 10
+
+NUM_ROW_GROUPS = 100
+ROW_GROUP_SIZE = 100_000
+
+
+INTERNAL_ROWID_FIELD = pa.field(INTERNAL_ROW_ID, pa.uint64())  # used for UPDATE
+EXTERNAL_ROWID_FIELD = pa.field("vastdb_rowid", pa.int64())  # used for INSERT & SELECT
+
+SCHEMA = "perf"
+TABLE = "sample"
+
+SCHEMA_ARROW = pa.schema(
+    [pa.field(f'c{i}', pa.float32()) for i in range(NUM_COLUMNS)]
+)
+
+
+def load_batch(bucket, session_kwargs, offset, limit):
+    log.info('loading into [%d..%d)', offset, limit)
+
+    # Iterate over all row-groups in this file
+    rowids_range = range(offset, limit)
+    rowids = pa.array(rowids_range, INTERNAL_ROWID_FIELD.type)
+
+    session = vastdb.connect(**session_kwargs)
+    metrics_rows = []
+
+    with session.transaction() as tx:
+        table = tx.bucket(bucket).schema(SCHEMA).table(TABLE)
+
+        col = table[EXTERNAL_ROWID_FIELD.name]
+        pred = (col >= rowids_range[0]) & (col <= rowids_range[-1])
+        count = sum(len(rb) for rb in table.select(columns=[], predicate=pred))
+        log.info("%d rows exist at %s", count, rowids_range)
+        if count == len(rowids_range):
+            # skip already loaded rows
+            log.info('skipping [%d..%d)', offset, limit)
+
+        pid = os.getpid()
+        tid = threading.get_native_id()
+        total_nbytes = 0
+        calls = 0
+        t0 = time.time()
+        # Insert/update every chunk of columns in this rowgroup
+        for j in range(0, len(SCHEMA_ARROW), COLUMNS_BATCH):
+            cols_batch = list(SCHEMA_ARROW)[j:j + COLUMNS_BATCH]
+            arrays = [
+                pa.array(np.float32(np.random.uniform(size=[ROW_GROUP_SIZE])))
+                for _ in cols_batch
+            ]
+            chunk = pa.table(data=arrays, schema=pa.schema(cols_batch))
+            nbytes = chunk.get_total_buffer_size()
+            start = time.perf_counter()
+            if j == 0:
+                chunk = chunk.add_column(0, EXTERNAL_ROWID_FIELD, rowids.cast(EXTERNAL_ROWID_FIELD.type))
+                op = 'insert'
+                table.insert(chunk)
+            else:
+                chunk = chunk.add_column(0, INTERNAL_ROWID_FIELD, rowids)
+                op = 'update'
+                table.update(chunk)
+            finish = time.perf_counter()
+
+            metrics_rows.append(metrics.Row(
+                start=start, finish=finish, table_path=table.path, op=op,
+                nbytes=nbytes, rows=len(chunk), cols=len(cols_batch),
+                pid=pid, tid=tid, sdk_version=sdk_version))
+
+            total_nbytes += nbytes
+            calls += 1
+            log.debug("%s into %s: %d rows x %d cols, %.3f MB",
+                      op, rowids_range, len(chunk), len(chunk.schema),
+                      chunk.get_total_buffer_size() / 1e6)
+
+        dt = time.time() - t0
+
+        log.info('loaded into [%d..%d): %d rows x %d cols, %.3f MB, %d RPCs, %.3f seconds',
+                 offset, limit, limit - offset, NUM_COLUMNS, total_nbytes / 1e6, calls, dt)
+    return metrics_rows
+
+
+def test_ingest(test_bucket_name, session_kwargs, tabular_endpoint_urls, num_workers, perf_metrics_db):
+    session = vastdb.connect(**session_kwargs)
+    metrics_table = metrics.Table(perf_metrics_db, "ingest")
+
+    with session.transaction() as tx:
+        b = tx.bucket(test_bucket_name)
+        try:
+            s = b.schema(SCHEMA)
+        except vastdb.errors.MissingSchema:
+            s = b.create_schema(SCHEMA)
+
+        try:
+            s.table(TABLE)
+        except vastdb.errors.MissingTable:
+            s.create_table(TABLE, pa.schema([EXTERNAL_ROWID_FIELD] + list(SCHEMA_ARROW)))
+
+    ranges = [
+        (i * ROW_GROUP_SIZE, (i + 1) * ROW_GROUP_SIZE)
+        for i in range(NUM_ROW_GROUPS)
+    ]
+
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        futures = [
+            executor.submit(load_batch, test_bucket_name, session_kwargs | {'endpoint': url}, offset, limit)
+            for (offset, limit), url in zip(ranges, itertools.cycle(tabular_endpoint_urls))
+        ]
+        log.info("spawned %d futures", len(futures))
+        for future in as_completed(futures):
+            metrics_table.insert(future.result())
+
+    with session.transaction() as tx:
+        t = tx.bucket(test_bucket_name).schema(SCHEMA).table(TABLE)
+        count = sum(len(rb) for rb in t.select([]))
+        log.info("%s has %d rows: %s", t, count, t.stats)
+
+
+def run_query(session_kwargs, i, bucket_name, endpoint_url):
+    num_columns = 2000
+    row_groups_per_query = 10
+
+    config = vastdb.table.QueryConfig(
+        num_sub_splits=1,
+        num_splits=1,
+        limit_rows_per_sub_split=ROW_GROUP_SIZE,
+        num_row_groups_per_sub_split=1)
+
+    row_group_indices = list(range(NUM_ROW_GROUPS))
+    r = random.Random(i)
+    r.shuffle(row_group_indices)
+
+    pid = os.getpid()
+    tid = threading.get_native_id()
+    metrics_rows = []
+
+    session = vastdb.connect(**(session_kwargs | {"endpoint": endpoint_url}))
+    with session.transaction() as tx:
+        t = tx.bucket(bucket_name).schema(SCHEMA).table(TABLE)
+
+        fields = list(t.arrow_schema)[1:]
+        r.shuffle(fields)
+        cols = [f.name for f in fields[:num_columns]]
+
+        vastdb_rowid = t['vastdb_rowid']
+        preds = []
+        for offset in range(0, len(row_group_indices), row_groups_per_query):
+            rowid_ranges = (
+                vastdb_rowid.between(j * ROW_GROUP_SIZE, (j + 1) * ROW_GROUP_SIZE - 1)
+                for j in row_group_indices[offset:offset + row_groups_per_query]
+            )
+            pred = functools.reduce((lambda x, y: x | y), rowid_ranges)
+            preds.append(pred)
+
+        for j, pred in enumerate(preds):
+            log.info("%d) starting query #%d on %s", i, j, endpoint_url)
+
+            start = time.perf_counter()
+            res = t.select(columns=cols, predicate=pred, config=config)
+            rows = 0
+            data = 0
+            for rb in res:
+                rows += len(rb)
+                data += rb.nbytes
+                dt = time.perf_counter() - start
+                log.info("%d) got query #%d batch %.3f[s], %.3f[GB] %.3f[MB/s], %.3f[Mrows]", i, j, dt, data / 1e9, data / 1e6 / dt, rows / 1e6)
+
+            finish = time.perf_counter()
+            dt = finish - start
+            log.info("%d) finished query #%d %.3f[s], %.3f[GB], %.3f[MB/s], %.3f[Mrows]", i, j, dt, data / 1e9, data / 1e6 / dt, rows / 1e6)
+
+            metrics_rows.append(metrics.Row(
+                start=start, finish=finish, table_path=t.path, op="select",
+                nbytes=data, rows=rows, cols=len(cols),
+                pid=pid, tid=tid, sdk_version=sdk_version))
+
+
+def test_scan(test_bucket_name, session, num_workers, session_kwargs, tabular_endpoint_urls, perf_metrics_db):
+    metrics_table = metrics.Table(perf_metrics_db, "query")
+
+    log.info("starting %d workers, endpoints=%s", num_workers, tabular_endpoint_urls)
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        futures = [
+            executor.submit(run_query, session_kwargs, i, test_bucket_name, url)
+            for i, url in zip(range(num_workers), itertools.cycle(tabular_endpoint_urls))
+        ]
+        for future in as_completed(futures):
+            metrics_table.insert(future.result())
+
+    log.info("finished %d workers", num_workers)
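The loader above checks for already-ingested rows by selecting an empty column list under a row-ID predicate and summing batch lengths. A hedged sketch of that pattern outside the benchmark (credentials, endpoint and the bucket/schema/table names are placeholders; a reachable VAST endpoint is assumed):

    import vastdb

    session = vastdb.connect(access="...", secret="...", endpoint="http://vip-pool.example:80")
    with session.transaction() as tx:
        table = tx.bucket("bucket").schema("perf").table("sample")
        col = table["vastdb_rowid"]
        pred = (col >= 0) & (col <= 99_999)
        # No column data is fetched; the batch lengths alone give the matching row count.
        count = sum(len(rb) for rb in table.select(columns=[], predicate=pred))
        print(count)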
vastdb/config.py
ADDED
@@ -0,0 +1,65 @@
+"""Configuration-related dataclasses."""
+
+import logging
+from dataclasses import dataclass, field
+from typing import Callable, List, Optional
+
+import backoff
+
+
+@dataclass
+class BackoffConfig:
+    """Retry configuration."""
+
+    wait_gen: Callable = field(default=backoff.expo)
+    max_value: Optional[float] = None  # max duration for a single wait period
+    max_tries: int = 10
+    max_time: float = 60.0  # in seconds
+    backoff_log_level: int = logging.DEBUG
+
+
+@dataclass
+class QueryConfig:
+    """Query execution configiration."""
+
+    # allows server-side parallel processing by issuing multiple reads concurrently for a single RPC
+    num_sub_splits: int = 4
+
+    # used to split the table into disjoint subsets of rows, to be processed concurrently using multiple RPCs
+    # will be estimated from the table's row count, if not explicitly set
+    num_splits: Optional[int] = None
+
+    # each endpoint will be handled by a separate worker thread
+    # a single endpoint can be specified more than once to benefit from multithreaded execution
+    data_endpoints: Optional[List[str]] = None
+
+    # a subsplit fiber will finish after sending this number of rows back to the client
+    limit_rows_per_sub_split: int = 128 * 1024
+
+    # each fiber will read the following number of rowgroups coninuously before skipping
+    # in order to use semi-sorted projections this value must be 8 (this is the hard coded size of a row groups per row block).
+    num_row_groups_per_sub_split: int = 8
+
+    # can be disabled for benchmarking purposes
+    use_semi_sorted_projections: bool = True
+
+    # enforce using a specific semi-sorted projection (if enabled above)
+    semi_sorted_projection_name: Optional[str] = None
+
+    # used to estimate the number of splits, given the table rows' count
+    rows_per_split: int = 4000000
+
+    # used for worker threads' naming
+    query_id: str = ""
+
+    # non-negative integer, used for server-side prioritization of queued requests:
+    # - requests with lower values will be served before requests with higher values.
+    # - if unset, the request will be added to the queue's end.
+    queue_priority: Optional[int] = None
+
+
+@dataclass
+class ImportConfig:
+    """Import execution configiration."""
+
+    import_concurrency: int = 2
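Since these are plain dataclasses, they can be constructed directly and handed to `vastdb.connect()` and `Table.select()`. A hedged sketch (credentials, endpoint and table names are placeholders; a reachable endpoint is assumed):

    import vastdb
    from vastdb.config import BackoffConfig, QueryConfig

    # Cap retries at 5 attempts and 30 seconds overall.
    session = vastdb.connect(access="...", secret="...",
                             endpoint="http://vip-pool.example:80",
                             backoff_config=BackoffConfig(max_tries=5, max_time=30.0))

    # Fan the scan out over 8 splits with 4 sub-splits each.
    config = QueryConfig(num_splits=8, num_sub_splits=4)
    with session.transaction() as tx:
        table = tx.bucket("bucket").schema("schema").table("table")
        print(table.select(config=config).read_all().num_rows)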
vastdb/conftest.py
CHANGED
@@ -1,4 +1,5 @@
 import os
+import sqlite3
 from pathlib import Path
 
 import boto3
@@ -13,27 +14,43 @@ def pytest_addoption(parser):
                      default=os.environ.get("AWS_ACCESS_KEY_ID", None))
     parser.addoption("--tabular-secret-key", help="Secret key with Tabular permissions (AWS_SECRET_ACCESS_KEY)",
                      default=os.environ.get("AWS_SECRET_ACCESS_KEY", None))
-    parser.addoption("--tabular-endpoint-url", help="Tabular server endpoint", default="
+    parser.addoption("--tabular-endpoint-url", help="Tabular server endpoint", default=[], action="append")
     parser.addoption("--data-path", help="Data files location", default=None)
     parser.addoption("--crater-path", help="Save benchmark results in a dedicated location", default=None)
     parser.addoption("--schema-name", help="Name of schema for the test to operate on", default=None)
     parser.addoption("--table-name", help="Name of table for the test to operate on", default=None)
+    parser.addoption("--num-workers", help="Number of concurrent workers", default=1)
 
 
 @pytest.fixture(scope="session")
-def
-    return
+def session_kwargs(request, tabular_endpoint_urls):
+    return dict(
         access=request.config.getoption("--tabular-access-key"),
         secret=request.config.getoption("--tabular-secret-key"),
-        endpoint=
+        endpoint=tabular_endpoint_urls[0],
     )
 
 
+@pytest.fixture(scope="session")
+def session(session_kwargs):
+    return vastdb.connect(**session_kwargs)
+
+
+@pytest.fixture(scope="session")
+def num_workers(request):
+    return int(request.config.getoption("--num-workers"))
+
+
 @pytest.fixture(scope="session")
 def test_bucket_name(request):
     return request.config.getoption("--tabular-bucket-name")
 
 
+@pytest.fixture(scope="session")
+def tabular_endpoint_urls(request):
+    return request.config.getoption("--tabular-endpoint-url") or ["http://localhost:9090"]
+
+
 def iter_schemas(s):
     """Recusively scan all schemas."""
     children = s.schemas()
@@ -55,12 +72,12 @@ def clean_bucket_name(request, test_bucket_name, session):
 
 
 @pytest.fixture(scope="session")
-def s3(request):
+def s3(request, tabular_endpoint_urls):
     return boto3.client(
         's3',
         aws_access_key_id=request.config.getoption("--tabular-access-key"),
         aws_secret_access_key=request.config.getoption("--tabular-secret-key"),
-        endpoint_url=
+        endpoint_url=tabular_endpoint_urls[0])
 
 
 @pytest.fixture(scope="function")
@@ -81,3 +98,8 @@ def schema_name(request):
 @pytest.fixture(scope="function")
 def table_name(request):
     return request.config.getoption("--table-name")
+
+
+@pytest.fixture(scope="function")
+def perf_metrics_db(crater_path):
+    return sqlite3.connect(f"{crater_path}/metrics.sqlite")
vastdb/errors.py
CHANGED
@@ -3,7 +3,6 @@ import xml.etree.ElementTree
 from dataclasses import dataclass
 from enum import Enum
 
-import pyarrow as pa
 import requests
 
 
@@ -170,11 +169,6 @@ class NotSupportedCommand(NotSupported):
     table: str
 
 
-@dataclass
-class NotSupportedType(NotSupported):
-    field: pa.Field
-
-
 @dataclass
 class NotSupportedVersion(NotSupported):
     err_msg: str
vastdb/features.py
ADDED
@@ -0,0 +1,42 @@
+"""Version-dependent features."""
+
+import logging
+
+from .errors import NotSupportedVersion
+
+log = logging.getLogger()
+
+
+class Features:
+    """VAST database features - check if server is already support a feature."""
+
+    def __init__(self, vast_version):
+        """Save the server version."""
+        self.vast_version = vast_version
+
+        self.check_imports_table = self._check(
+            "Imported objects' table feature requires 5.2+ VAST release",
+            vast_version >= (5, 2))
+
+        self.check_return_row_ids = self._check(
+            "Returning row IDs requires 5.1+ VAST release",
+            vast_version >= (5, 1))
+
+        self.check_enforce_semisorted_projection = self._check(
+            "Semi-sorted projection enforcement requires 5.1+ VAST release",
+            vast_version >= (5, 1))
+
+        self.check_external_row_ids_allocation = self._check(
+            "External row IDs allocation requires 5.1+ VAST release",
+            vast_version >= (5, 1))
+
+    def _check(self, msg, supported):
+        log.debug("%s (current version is %s): supported=%s", msg, self.vast_version, supported)
+        if not supported:
+            def fail():
+                raise NotSupportedVersion(msg, self.vast_version)
+            return fail
+
+        def noop():
+            pass
+        return noop
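Each `check_*` attribute is bound at construction time to either a no-op or a closure that raises `NotSupportedVersion`, so call sites simply invoke it before using a gated feature. An illustrative sketch that needs no server (the version tuples mimic what the API client parses):

    from vastdb.errors import NotSupportedVersion
    from vastdb.features import Features

    old = Features(vast_version=(5, 0, 0, 1))
    try:
        old.check_return_row_ids()  # raises: requires a 5.1+ VAST release
    except NotSupportedVersion as exc:
        print(exc)

    new = Features(vast_version=(5, 2, 0, 1))
    new.check_imports_table()  # no-op on a 5.2+ release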
vastdb/schema.py
CHANGED
@@ -10,7 +10,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional
 
 import pyarrow as pa
 
-from . import bucket, errors, schema, table
+from . import bucket, errors, schema, table
 
 if TYPE_CHECKING:
     from .table import Table
@@ -86,7 +86,6 @@ class Schema:
         if use_external_row_ids_allocation:
             self.tx._rpc.features.check_external_row_ids_allocation()
 
-        util.check_supported_types(columns)
         self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid,
                                       use_external_row_ids_allocation=use_external_row_ids_allocation)
         log.info("Created table: %s", table_name)
vastdb/session.py
CHANGED
@@ -7,51 +7,11 @@ For more details see:
  - [Tabular identity policy with the proper permissions](https://support.vastdata.com/s/article/UUID-14322b60-d6a2-89ac-3df0-3dfbb6974182)
 """
 
-import logging
 import os
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
-
-
-from . import _internal, errors, transaction
-from ._internal import BackoffConfig
-
-log = logging.getLogger()
-
-
-class Features:
-    """VAST database features - check if server is already support a feature."""
-
-    def __init__(self, vast_version):
-        """Save the server version."""
-        self.vast_version = vast_version
-
-        self.check_imports_table = self._check(
-            "Imported objects' table feature requires 5.2+ VAST release",
-            vast_version >= (5, 2))
-
-        self.check_return_row_ids = self._check(
-            "Returning row IDs requires 5.1+ VAST release",
-            vast_version >= (5, 1))
-
-        self.check_enforce_semisorted_projection = self._check(
-            "Semi-sorted projection enforcement requires 5.1+ VAST release",
-            vast_version >= (5, 1))
-
-        self.check_external_row_ids_allocation = self._check(
-            "External row IDs allocation requires 5.1+ VAST release",
-            vast_version >= (5, 1))
-
-    def _check(self, msg, supported):
-        log.debug("%s (current version is %s): supported=%s", msg, self.vast_version, supported)
-        if not supported:
-            def fail():
-                raise errors.NotSupportedVersion(msg, self.vast_version)
-            return fail
-
-        def noop():
-            pass
-        return noop
+if TYPE_CHECKING:
+    from .config import BackoffConfig
 
 
 class Session:
@@ -60,8 +20,13 @@ class Session:
     def __init__(self, access=None, secret=None, endpoint=None,
                  *,
                  ssl_verify=True,
-
+                 timeout=None,
+                 backoff_config: Optional["BackoffConfig"] = None):
         """Connect to a VAST Database endpoint, using specified credentials."""
+        import boto3
+
+        from . import _internal, features
+
         if access is None:
             access = os.environ['AWS_ACCESS_KEY_ID']
         if secret is None:
@@ -74,8 +39,9 @@ class Session:
             access_key=access,
             secret_key=secret,
             ssl_verify=ssl_verify,
+            timeout=timeout,
             backoff_config=backoff_config)
-        self.features = Features(self.api.vast_version)
+        self.features = features.Features(self.api.vast_version)
         self.s3 = boto3.client('s3',
                                aws_access_key_id=access,
                                aws_secret_access_key=secret,
@@ -93,4 +59,5 @@ class Session:
            with session.transaction() as tx:
                tx.bucket("bucket").create_schema("schema")
        """
+        from . import transaction
        return transaction.Transaction(self)
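`Session` now forwards a `timeout` down to `VastdbApi`, which applies it to every underlying `requests` call (see the _internal.py hunk above). A hedged sketch (placeholder credentials and endpoint):

    import vastdb
    from vastdb.config import BackoffConfig

    # Abort any single RPC that hangs for more than 30 seconds,
    # and stop retrying after 3 attempts.
    session = vastdb.connect(
        access="...",
        secret="...",
        endpoint="http://vip-pool.example:80",
        timeout=30,
        backoff_config=BackoffConfig(max_tries=3),
    )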
vastdb/table.py
CHANGED
@@ -14,6 +14,7 @@ import pyarrow as pa
 import urllib3
 
 from . import _internal, errors, schema, util
+from .config import ImportConfig, QueryConfig
 
 log = logging.getLogger(__name__)
 
@@ -39,53 +40,6 @@ class TableStats:
     endpoints: Tuple[str, ...] = ()
 
 
-@dataclass
-class QueryConfig:
-    """Query execution configiration."""
-
-    # allows server-side parallel processing by issuing multiple reads concurrently for a single RPC
-    num_sub_splits: int = 4
-
-    # used to split the table into disjoint subsets of rows, to be processed concurrently using multiple RPCs
-    # will be estimated from the table's row count, if not explicitly set
-    num_splits: Optional[int] = None
-
-    # each endpoint will be handled by a separate worker thread
-    # a single endpoint can be specified more than once to benefit from multithreaded execution
-    data_endpoints: Optional[List[str]] = None
-
-    # a subsplit fiber will finish after sending this number of rows back to the client
-    limit_rows_per_sub_split: int = 128 * 1024
-
-    # each fiber will read the following number of rowgroups coninuously before skipping
-    # in order to use semi-sorted projections this value must be 8 (this is the hard coded size of a row groups per row block).
-    num_row_groups_per_sub_split: int = 8
-
-    # can be disabled for benchmarking purposes
-    use_semi_sorted_projections: bool = True
-
-    # enforce using a specific semi-sorted projection (if enabled above)
-    semi_sorted_projection_name: Optional[str] = None
-
-    # used to estimate the number of splits, given the table rows' count
-    rows_per_split: int = 4000000
-
-    # used for worker threads' naming
-    query_id: str = ""
-
-    # non-negative integer, used for server-side prioritization of queued requests:
-    # - requests with lower values will be served before requests with higher values.
-    # - if unset, the request will be added to the queue's end.
-    queue_priority: Optional[int] = None
-
-
-@dataclass
-class ImportConfig:
-    """Import execution configiration."""
-
-    import_concurrency: int = 2
-
-
 class SelectSplitState:
     """State of a specific query split execution."""
 
@@ -167,8 +121,13 @@ class Table:
         """Also, load columns' metadata."""
         self.arrow_schema = self.columns()
 
-
-        self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(self.arrow_schema),
+        self._table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
+        self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(self.arrow_schema), self._table_path)
+
+    @property
+    def path(self):
+        """Return table's path."""
+        return self._table_path
 
     @property
     def tx(self):
@@ -486,7 +445,6 @@ class Table:
             raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         try:
             row_ids = []
-            util.check_supported_types(rows.schema)
             serialized_slices = util.iter_serialized_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
             for slice in serialized_slices:
                 res = self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
@@ -529,7 +487,6 @@ class Table:
 
         update_rows_rb = util.sort_record_batch_if_needed(update_rows_rb, INTERNAL_ROW_ID)
 
-        util.check_supported_types(update_rows_rb.schema)
         serialized_slices = util.iter_serialized_slices(update_rows_rb, MAX_ROWS_PER_BATCH)
         for slice in serialized_slices:
             self.tx._rpc.api.update_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
@@ -574,7 +531,6 @@ class Table:
         """Add a new column."""
         if self._imports_table:
             raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
-        util.check_supported_types(new_column)
        self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
        log.info("Added column(s): %s", new_column)
        self.arrow_schema = self.columns()
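Because table.py now re-imports the dataclasses from vastdb/config.py, existing references such as `vastdb.table.QueryConfig` (used by test_sample.py above) keep resolving to the same class, and the new `Table.path` property exposes the `bucket/schema/table` string that the benchmark records in its metrics rows. A small sketch of the import equivalence (assuming the package and its dependencies are installed):

    import vastdb.config
    import vastdb.table

    # The re-import makes both names point at the same dataclass objects.
    assert vastdb.table.QueryConfig is vastdb.config.QueryConfig
    assert vastdb.table.ImportConfig is vastdb.config.ImportConfig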
vastdb/tests/metrics.py
ADDED
@@ -0,0 +1,43 @@
+import dataclasses
+import sqlite3
+from typing import List
+
+_MAP_SQLITE_TYPES = {
+    str: "TEXT",
+    float: "REAL",
+    int: "INTEGER",
+}
+
+
+@dataclasses.dataclass
+class Row:
+    start: float
+    finish: float
+    table_path: str
+    op: str
+    nbytes: int
+    rows: int
+    cols: int
+    pid: int
+    tid: int
+    sdk_version: str
+
+
+class Table:
+    def __init__(self, conn: sqlite3.Connection, name: str):
+        self.fields = dataclasses.fields(Row)
+        self.conn = conn
+        self.name = name
+        columns = ", ".join(
+            f"{f.name} {_MAP_SQLITE_TYPES[f.type]}"
+            for f in self.fields
+        )
+        cmd = f"CREATE TABLE {self.name} ({columns})"
+        self.conn.execute(cmd).fetchall()
+
+    def insert(self, rows: List[Row]):
+        args = ", ".join(["?"] * len(self.fields))
+        cmd = f"INSERT INTO {self.name} VALUES ({args})"
+        data = [dataclasses.astuple(row) for row in rows]
+        self.conn.executemany(cmd, data).fetchall()
+        self.conn.commit()
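The helper maps the `Row` dataclass onto a SQLite table, one column per field, using the field annotations to pick SQLite types. A self-contained sketch against an in-memory database (the values are made up):

    import sqlite3

    from vastdb.tests import metrics

    conn = sqlite3.connect(":memory:")
    table = metrics.Table(conn, "ingest")  # CREATE TABLE ingest (start REAL, finish REAL, ...)
    table.insert([
        metrics.Row(start=0.0, finish=1.5, table_path="bucket/perf/sample", op="insert",
                    nbytes=1_000_000, rows=100_000, cols=10, pid=1234, tid=1234,
                    sdk_version="0.1.11"),
    ])
    print(conn.execute("SELECT op, rows, nbytes FROM ingest").fetchall())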
vastdb/tests/test_sanity.py
CHANGED
@@ -24,7 +24,7 @@ def test_bad_credentials(session):
 
 
 def test_bad_endpoint(session):
-    backoff_config = vastdb.
+    backoff_config = vastdb.config.BackoffConfig(max_tries=3)
     with pytest.raises(vastdb.errors.ConnectionError):
         vastdb.connect(access='BAD', secret='BAD', endpoint='http://invalid-host-name-for-tests:12345', backoff_config=backoff_config)
 
vastdb/tests/test_tables.py
CHANGED
@@ -227,6 +227,35 @@ def test_select_with_priority(session, clean_bucket_name):
         t.select(config=config).read_all()
 
 
+def test_timezones(session, clean_bucket_name):
+    columns_with_tz = pa.schema([
+        ('ts0', pa.timestamp('s', tz='+00:00')),
+        ('ts3', pa.timestamp('ms', tz='UTC')),
+        ('ts6', pa.timestamp('us', tz='GMT')),
+        ('ts9', pa.timestamp('ns', tz='Universal')),
+    ])
+
+    # currently timezone information is not stored
+    columns_without_tz = pa.schema([
+        ('ts0', pa.timestamp('s')),
+        ('ts3', pa.timestamp('ms')),
+        ('ts6', pa.timestamp('us')),
+        ('ts9', pa.timestamp('ns')),
+    ])
+
+    data = [
+        [dt.datetime(2024, 4, 10, 12, 34, 56), dt.datetime(2025, 4, 10, 12, 34, 56), dt.datetime(2026, 4, 10, 12, 34, 56)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789000), dt.datetime(2025, 4, 10, 12, 34, 56, 789000), dt.datetime(2026, 4, 10, 12, 34, 56, 789000)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789789), dt.datetime(2025, 4, 10, 12, 34, 56, 789789), dt.datetime(2026, 4, 10, 12, 34, 56, 789789)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789789), dt.datetime(2025, 4, 10, 12, 34, 56, 789789), dt.datetime(2026, 4, 10, 12, 34, 56, 789789)],
+    ]
+
+    inserted = pa.table(schema=columns_with_tz, data=data)
+    with prepare_data(session, clean_bucket_name, 's', 't', inserted) as table:
+        assert table.arrow_schema == columns_without_tz
+        assert table.select().read_all() == pa.table(schema=columns_without_tz, data=data)
+
+
 def test_types(session, clean_bucket_name):
     columns = pa.schema([
         ('tb', pa.bool_()),
@@ -311,46 +340,6 @@ def test_types(session, clean_bucket_name):
     assert select(t['ts9'] == ts_literal) == expected.filter(pc.field('ts9') == ts_literal)
 
 
-TIMESTAMP_UNITS = ['s', 'ms', 'us', 'ns']
-
-
-def test_unsupported_timezone(session, clean_bucket_name):
-    with session.transaction() as tx:
-        s = tx.bucket(clean_bucket_name).create_schema('s1')
-        for unit in TIMESTAMP_UNITS:
-            col_type = pa.timestamp(unit, 'UTC')
-            with pytest.raises(errors.NotSupportedType):
-                s.create_table('t1', pa.schema([('ts', col_type)]))
-            assert s.tables() == []
-
-        cols = [('c', pa.int64())]
-        t1 = s.create_table('t1', pa.schema(cols))
-        for unit in TIMESTAMP_UNITS:
-            col_type = pa.timestamp(unit, 'UTC')
-            with pytest.raises(errors.NotSupportedType):
-                t1.add_column(pa.schema([('ts', col_type)]))
-
-        cols = [(f'c_{unit}', pa.timestamp(unit)) for unit in TIMESTAMP_UNITS]
-        t2 = s.create_table('t2', pa.schema(cols))
-
-        for unit in TIMESTAMP_UNITS:
-            col_type = pa.timestamp(unit, 'UTC')
-
-            rb = pa.record_batch(
-                data=[[None]],
-                schema=pa.schema([(f'c_{unit}', col_type)]))
-            with pytest.raises(errors.NotSupportedType):
-                t2.insert(rb)
-
-            rb = pa.record_batch(
-                data=[[0], [None]],
-                schema=pa.schema([
-                    (INTERNAL_ROW_ID, pa.uint64()),
-                    (f'c_{unit}', col_type)]))
-            with pytest.raises(errors.NotSupportedType):
-                t2.update(rb)
-
-
 def test_filters(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int32()),
vastdb/util.py
CHANGED
@@ -6,7 +6,7 @@ import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.parquet as pq
 
-from .errors import InvalidArgument,
+from .errors import InvalidArgument, TooWideRow
 
 log = logging.getLogger(__name__)
 
@@ -152,10 +152,3 @@ def sort_record_batch_if_needed(record_batch, sort_column):
         return record_batch.sort_by(sort_column)
     else:
         return record_batch
-
-
-def check_supported_types(fields: pa.Schema):
-    for f in fields:
-        if isinstance(f.type, pa.TimestampType):
-            if f.type.tz:
-                raise NotSupportedType(f)
vastdb/vast_tests/test_scale.py
ADDED
@@ -0,0 +1,68 @@
+import logging
+import random
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+import pyarrow as pa
+
+from vastdb.table import QueryConfig
+
+logger = logging.getLogger(__name__)
+
+
+def test_concurrent_query(session, test_bucket_name, schema_name, table_name):
+    """
+    This test runs several selective queries in parallel. It is used to check various internal VAST scenarios.
+    """
+    amount_of_queries_in_parallel = 10  # due to limit on requests connection-pool
+    config = QueryConfig(num_splits=1, num_sub_splits=1)
+
+    def _execute_single_query():
+        with session.transaction() as tx:
+            t = tx.bucket(test_bucket_name).schema(schema_name).table(table_name)
+            pred = (t["a"] == 0)  # 0 is in the min-max range
+            s = time.time()
+            t.select(config=config, predicate=pred).read_all()
+            e = time.time()
+            logger.info(f"Query took {e - s}")
+
+    logger.info(f"about to submit {amount_of_queries_in_parallel} queries in parallel")
+    with ThreadPoolExecutor() as executor:
+        futures = [executor.submit(_execute_single_query) for _ in range(amount_of_queries_in_parallel)]
+        for future in futures:
+            future.result()
+    logger.info(f"finished running {amount_of_queries_in_parallel} queries")
+
+
+def test_table_stats(session, test_bucket_name, schema_name, table_name):
+    """
+    Testing stats integrity while altering table
+    """
+    NUM_TIMES_TO_INSERT = 1000
+    seed = random.randint(0, 10)
+    logger.info(f"random seed is {seed}")
+    r = random.Random(seed)
+
+    with session.transaction() as tx:
+        t = tx.bucket(test_bucket_name).schema(schema_name).table(table_name)
+        initial_stat = t.get_stats()
+        table_fields = t.columns()
+
+    rand_values = {}  # create a dict with a random value from each column
+    with session.transaction() as tx:
+        t = tx.bucket(test_bucket_name).schema(schema_name).table(table_name)
+        for col in table_fields:
+            res = t.select(columns=[col.name]).read_all().column(col.name)
+            rand_values[col.name] = res[int(r.uniform(0, len(res)))].as_py()
+
+    logger.info(f"rand row to insert to the table - {rand_values}, {NUM_TIMES_TO_INSERT} times")
+    rb = pa.RecordBatch.from_pylist([rand_values] * NUM_TIMES_TO_INSERT)
+    with session.transaction() as tx:
+        t = tx.bucket(test_bucket_name).schema(schema_name).table(table_name)
+        t.insert(rb)
+        time.sleep(2)  # waiting for stats to get updated
+        new_stat = t.get_stats()
+
+    logger.info("inserted to table")
+    assert new_stat.size_in_bytes != initial_stat.size_in_bytes
+    assert new_stat.num_rows - NUM_TIMES_TO_INSERT == initial_stat.num_rows
{vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/RECORD
CHANGED
@@ -148,32 +148,37 @@ vast_flatbuf/tabular/ObjectDetails.py,sha256=qW0WtbkCYYE_L-Kw6VNRDCLYaRm5lKvTbLN
 vast_flatbuf/tabular/S3File.py,sha256=KC9c2oS5-JXwTTriUVFdjOvRG0B54Cq9kviSDZY3NI0,4450
 vast_flatbuf/tabular/VipRange.py,sha256=_BJd1RRZAcK76T9vlsHzXKYVsPVaz6WTEAqStMQCAUQ,2069
 vast_flatbuf/tabular/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vastdb/__init__.py,sha256=
-vastdb/_internal.py,sha256=
+vastdb/__init__.py,sha256=J1JjKiFkKC95BHowfh9kJfQFTjRce-QMsc6zF_FfxC0,432
+vastdb/_internal.py,sha256=6Z0pkMCZNInJPFmWl9UvcLxBEX8CJZjV0hIsi_9jib0,89808
 vastdb/bucket.py,sha256=5KuKhPjZOevznZqWHDVVocejvAy7dcwobPuV6BJCfPc,2544
-vastdb/
-vastdb/
-vastdb/
-vastdb/
-vastdb/
+vastdb/config.py,sha256=1tMYtzKXerGcIUjH4tIGEvZNWvO4fviCEdcNCnELJZo,2269
+vastdb/conftest.py,sha256=ePzQiEQmlNGcM2T4GZevE4XuvcnFWfnTSzr8IVZpVKk,3438
+vastdb/errors.py,sha256=2XR1ko7J5nkfiHSAgwuVAADw0SsyqxOwSeFaGgKZEXM,4186
+vastdb/features.py,sha256=DxV746LSkORwVSD6MP2hdXRfnyoLkJwtOwGmp1dnquo,1322
+vastdb/schema.py,sha256=X7IRrogXH7Z0kes-DsDh1bRqIhvjH6owlFigGBXy7XQ,5913
+vastdb/session.py,sha256=ZrQf8cecVIHIBUOPNg4ed8ZCnEEu0QW1OBxQgz_ia80,2241
+vastdb/table.py,sha256=2z5zpnBc5iM5ZqELCVg6wEIdYcPVm6UW_4Xm55S8ZXg,31078
 vastdb/transaction.py,sha256=qu2rOlR7AS1ojMOzgWapQMpcorrutelZZLH1mLmTHxk,3186
-vastdb/util.py,sha256=
+vastdb/util.py,sha256=4LTYBBR13na376AmDm5lQILJzLcfelIKdkNPy0IqI0o,5684
 vastdb/bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vastdb/bench/test_perf.py,sha256=
+vastdb/bench/test_perf.py,sha256=gZIqfHva6lNFpD-9bHAe7M8COBjUyrPkHu3E7F8J2L0,1072
+vastdb/bench/test_sample.py,sha256=bFmw7BOCr5FoGn4TY9pQGd6_cVNK4uBeSRi33tTubyk,7847
 vastdb/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vastdb/tests/metrics.py,sha256=N6ELJUmZubhAMmUtDbisXr6TFhSDgVCTTU05gBVxHRA,1010
 vastdb/tests/test_duckdb.py,sha256=STw_1PwTQR8Naz6s0p6lQTV1ZTKKhe3LPBUbhqzTCu0,1880
 vastdb/tests/test_imports.py,sha256=xKub3-bisFjH0BsZM8COfiUWuMrtoOoQKprF6VQT9RI,5669
 vastdb/tests/test_nested.py,sha256=22NAxBTm7Aq-Vn6AIYbi5Cb1ET8W0XeLK3pp4D8BYWI,3448
 vastdb/tests/test_projections.py,sha256=3y1kubwVrzO-xoR0hyps7zrjOJI8niCYspaFTN16Q9w,4540
-vastdb/tests/test_sanity.py,sha256=
+vastdb/tests/test_sanity.py,sha256=oiV2gb05aPyG5RMNUQZlyjNlg3T7Fig1_8OJzpAgcsk,3038
 vastdb/tests/test_schemas.py,sha256=l70YQMlx2UL1KRQhApriiG2ZM7GJF-IzWU31H3Yqn1U,3312
-vastdb/tests/test_tables.py,sha256=
+vastdb/tests/test_tables.py,sha256=qWicD0BYuhrh1kRVqkHMJNsxcHxDcCprbEXuZJm1wm4,31529
 vastdb/tests/test_util.py,sha256=Ok_sAEBJsRGF5Voa_v5eu3eAd52GWu8jMjjQbadwW-s,1260
 vastdb/tests/util.py,sha256=dpRJYbboDnlqL4qIdvScpp8--5fxRUBIcIYitrfcj9o,555
 vastdb/vast_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/vast_tests/test_ha.py,sha256=744P4G6VJ09RIkHhMQL4wlipCBJWQVMhyvUrSc4k1HQ,975
-vastdb
-vastdb-0.1.
-vastdb-0.1.
-vastdb-0.1.
-vastdb-0.1.
+vastdb/vast_tests/test_scale.py,sha256=EpjCJmVAQrNBxVnHGJ-KHCoxevhqOcyqYFPMIIY9s60,2714
+vastdb-0.1.11.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
+vastdb-0.1.11.dist-info/METADATA,sha256=11xuX_TRPnPWsTe6bDgBx-EM--9zLolqog9Z3NhDpno,1351
+vastdb-0.1.11.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
+vastdb-0.1.11.dist-info/top_level.txt,sha256=Vsj2MKtlhPg0J4so64slQtnwjhgoPmJgcG-6YcVAwVc,20
+vastdb-0.1.11.dist-info/RECORD,,
{vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/LICENSE: file without changes
{vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/top_level.txt: file without changes