vastdb 0.0.5.3__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff shows the changes between publicly available package versions as they were released to a supported public registry. It is provided for informational purposes only.
- vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
- vast_flatbuf/tabular/VipRange.py +56 -0
- vastdb/__init__.py +7 -0
- vastdb/bench/test_perf.py +29 -0
- vastdb/bucket.py +85 -0
- vastdb/{tests/conftest.py → conftest.py} +29 -14
- vastdb/errors.py +175 -0
- vastdb/{api.py → internal_commands.py} +373 -875
- vastdb/schema.py +85 -0
- vastdb/session.py +47 -0
- vastdb/table.py +483 -0
- vastdb/tests/test_imports.py +123 -0
- vastdb/tests/test_nested.py +28 -0
- vastdb/tests/test_projections.py +42 -0
- vastdb/tests/test_sanity.py +34 -15
- vastdb/tests/test_schemas.py +30 -6
- vastdb/tests/test_tables.py +628 -13
- vastdb/tests/util.py +18 -0
- vastdb/transaction.py +54 -0
- vastdb/util.py +11 -10
- vastdb-0.1.1.dist-info/METADATA +38 -0
- {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/RECORD +26 -31
- vast_protobuf/substrait/__init__.py +0 -0
- vast_protobuf/substrait/algebra_pb2.py +0 -1344
- vast_protobuf/substrait/capabilities_pb2.py +0 -46
- vast_protobuf/substrait/ddl_pb2.py +0 -57
- vast_protobuf/substrait/extended_expression_pb2.py +0 -49
- vast_protobuf/substrait/extensions/__init__.py +0 -0
- vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
- vast_protobuf/substrait/function_pb2.py +0 -168
- vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
- vast_protobuf/substrait/plan_pb2.py +0 -67
- vast_protobuf/substrait/type_expressions_pb2.py +0 -198
- vast_protobuf/substrait/type_pb2.py +0 -350
- vast_protobuf/tabular/__init__.py +0 -0
- vast_protobuf/tabular/rpc_pb2.py +0 -344
- vastdb/bench_scan.py +0 -45
- vastdb/tests/test_create_table_from_parquets.py +0 -50
- vastdb/v2.py +0 -360
- vastdb-0.0.5.3.dist-info/METADATA +0 -47
- {vast_protobuf → vastdb/bench}/__init__.py +0 -0
- {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/LICENSE +0 -0
- {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/WHEEL +0 -0
- {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/top_level.txt +0 -0
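Taken together, this release replaces the old `vastdb/api.py` and `vastdb/v2.py` entry points with a layered object model split across `session.py`, `transaction.py`, `bucket.py`, `schema.py` and `table.py`, with `internal_commands.py` carrying the low-level RPC calls. A minimal usage sketch of that new surface, based only on the code shown below — bucket, schema and table names and credentials are placeholders, and `Bucket.create_schema()` returning a `Schema` object is an assumption:

import pyarrow as pa

from vastdb.session import Session

# Explicit credentials; each omitted argument falls back to the matching AWS_* environment variable.
session = Session(access="my-access-key", secret="my-secret-key", endpoint="http://my-vast-vip")

with session.transaction() as tx:
    schema = tx.bucket("my-bucket").create_schema("my-schema")  # assumed to return a Schema
    table = schema.create_table("events", pa.schema([("id", pa.int64()), ("msg", pa.utf8())]))
    table.insert(pa.record_batch([pa.array([1, 2]), pa.array(["hello", "world"])],
                                 schema=table.arrow_schema))
    print(table.select(columns=["id", "msg"]).read_all())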
vastdb/schema.py
ADDED
@@ -0,0 +1,85 @@
"""VAST Database schema (a container of tables).

VAST S3 buckets can be used to create Database schemas and tables.
It is possible to list and access VAST snapshots generated over a bucket.
"""

import logging
from dataclasses import dataclass

import pyarrow as pa

from . import bucket, errors, schema, table

log = logging.getLogger(__name__)


@dataclass
class Schema:
    """VAST Schema."""

    name: str
    bucket: "bucket.Bucket"

    @property
    def tx(self):
        """VAST transaction used for this schema."""
        return self.bucket.tx

    def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "table.Table":
        """Create a new table under this schema."""
        if current := self.table(table_name, fail_if_missing=False):
            if fail_if_exists:
                raise errors.TableExists(self.bucket.name, self.name, table_name)
            else:
                return current
        self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid)
        log.info("Created table: %s", table_name)
        return self.table(table_name)

    def table(self, name: str, fail_if_missing=True) -> "table.Table":
        """Get a specific table under this schema."""
        t = self.tables(table_name=name)
        if not t:
            if fail_if_missing:
                raise errors.MissingTable(self.bucket.name, self.name, name)
            else:
                return None
        assert len(t) == 1, f"Expected to receive only a single table, but got: {len(t)}. tables: {t}"
        log.debug("Found table: %s", t[0])
        return t[0]

    def tables(self, table_name=None) -> ["table.Table"]:
        """List all tables under this schema."""
        tables = []
        next_key = 0
        name_prefix = table_name if table_name else ""
        exact_match = bool(table_name)
        while True:
            bucket_name, schema_name, curr_tables, next_key, is_truncated, _ = \
                self.tx._rpc.api.list_tables(
                    bucket=self.bucket.name, schema=self.name, next_key=next_key, txid=self.tx.txid,
                    exact_match=exact_match, name_prefix=name_prefix, include_list_stats=exact_match)
            if not curr_tables:
                break
            tables.extend(curr_tables)
            if not is_truncated:
                break

        return [_parse_table_info(table, self) for table in tables]

    def drop(self) -> None:
        """Delete this schema."""
        self.tx._rpc.api.drop_schema(self.bucket.name, self.name, txid=self.tx.txid)
        log.info("Dropped schema: %s", self.name)

    def rename(self, new_name) -> None:
        """Rename this schema."""
        self.tx._rpc.api.alter_schema(self.bucket.name, self.name, txid=self.tx.txid, new_name=new_name)
        log.info("Renamed schema: %s to %s", self.name, new_name)
        self.name = new_name


def _parse_table_info(table_info, schema: "schema.Schema"):
    stats = table.TableStats(num_rows=table_info.num_rows, size_in_bytes=table_info.size_in_bytes)
    return table.Table(name=table_info.name, schema=schema, handle=int(table_info.handle), stats=stats)
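A short sketch of how this `Schema` class is meant to be used inside a transaction, continuing the `session` from the sketch above (the `Bucket.schema()` accessor is assumed to exist in the new `bucket.py`; names are placeholders):

import pyarrow as pa

with session.transaction() as tx:
    schema = tx.bucket("my-bucket").schema("my-schema")  # assumed Bucket accessor

    # Idempotent creation: with fail_if_exists=False an existing table is returned
    # instead of raising errors.TableExists.
    table = schema.create_table("events", pa.schema([("id", pa.int64())]), fail_if_exists=False)

    print([t.name for t in schema.tables()])  # list every table under the schema
    schema.rename("my-schema-v2")             # renames on the server and updates schema.name in place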
vastdb/session.py
ADDED
@@ -0,0 +1,47 @@
"""VAST database session.

It should be used to interact with a specific VAST cluster.
For more details see:
- [Virtual IP pool configured with DNS service](https://support.vastdata.com/s/topic/0TOV40000000FThOAM/configuring-network-access-v50)
- [S3 access & secret keys on VAST cluster](https://support.vastdata.com/s/article/UUID-4d2e7e23-b2fb-7900-d98f-96c31a499626)
- [Tabular identity policy with the proper permissions](https://support.vastdata.com/s/article/UUID-14322b60-d6a2-89ac-3df0-3dfbb6974182)
"""

import os

import boto3

from . import internal_commands, transaction


class Session:
    """VAST database session."""

    def __init__(self, access=None, secret=None, endpoint=None):
        """Connect to a VAST Database endpoint, using specified credentials."""
        if access is None:
            access = os.environ['AWS_ACCESS_KEY_ID']
        if secret is None:
            secret = os.environ['AWS_SECRET_ACCESS_KEY']
        if endpoint is None:
            endpoint = os.environ['AWS_S3_ENDPOINT_URL']

        self.api = internal_commands.VastdbApi(endpoint, access, secret)
        self.s3 = boto3.client('s3',
                               aws_access_key_id=access,
                               aws_secret_access_key=secret,
                               endpoint_url=endpoint)

    def __repr__(self):
        """Don't show the secret key."""
        return f'{self.__class__.__name__}(endpoint={self.api.url}, access={self.api.access_key})'

    def transaction(self):
        """Create a non-initialized transaction object.

        It should be used as a context manager:

            with session.transaction() as tx:
                tx.bucket("bucket").create_schema("schema")
        """
        return transaction.Transaction(self)
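Because each constructor argument falls back to the corresponding environment variable, a session can also be created without passing credentials explicitly. A small sketch (endpoint and keys are placeholders):

import os

from vastdb.session import Session

os.environ["AWS_ACCESS_KEY_ID"] = "my-access-key"
os.environ["AWS_SECRET_ACCESS_KEY"] = "my-secret-key"
os.environ["AWS_S3_ENDPOINT_URL"] = "http://my-vast-vip"

session = Session()  # picks up the three AWS_* variables above
print(session)       # __repr__ shows the endpoint and access key, never the secret

with session.transaction() as tx:
    tx.bucket("my-bucket").create_schema("my-schema")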
vastdb/table.py
ADDED
@@ -0,0 +1,483 @@
import concurrent.futures
import logging
import os
import queue
from dataclasses import dataclass, field
from math import ceil
from threading import Event
from typing import List, Union

import ibis
import pyarrow as pa

from . import errors, schema
from .internal_commands import (
    TABULAR_INVALID_ROW_ID,
    VastdbApi,
    build_query_data_request,
    parse_query_data_response,
)

log = logging.getLogger(__name__)


INTERNAL_ROW_ID = "$row_id"
MAX_ROWS_PER_BATCH = 512 * 1024
# for insert we need a smaller limit due to response amplification
# for example, an insert of 512k uint8 values results in a 512k*8 bytes response, since row_ids are uint64
MAX_INSERT_ROWS_PER_PATCH = 512 * 1024

@dataclass
class TableStats:
    num_rows: int
    size_in_bytes: int
    is_external_rowid_alloc: bool = False
    endpoints: List[str] = None

@dataclass
class QueryConfig:
    num_sub_splits: int = 4
    num_splits: int = 1
    data_endpoints: [str] = None
    limit_rows_per_sub_split: int = 128 * 1024
    num_row_groups_per_sub_split: int = 8
    use_semi_sorted_projections: bool = True
    rows_per_split: int = 4000000
    query_id: str = ""


@dataclass
class ImportConfig:
    import_concurrency: int = 2

class SelectSplitState():
    def __init__(self, query_data_request, table : "Table", split_id : int, config: QueryConfig) -> None:
        self.split_id = split_id
        self.subsplits_state = {i: 0 for i in range(config.num_sub_splits)}
        self.config = config
        self.query_data_request = query_data_request
        self.table = table

    def batches(self, api : VastdbApi):
        while not self.done:
            response = api.query_data(
                bucket=self.table.bucket.name,
                schema=self.table.schema.name,
                table=self.table.name,
                params=self.query_data_request.serialized,
                split=(self.split_id, self.config.num_splits, self.config.num_row_groups_per_sub_split),
                num_sub_splits=self.config.num_sub_splits,
                response_row_id=False,
                txid=self.table.tx.txid,
                limit_rows=self.config.limit_rows_per_sub_split,
                sub_split_start_row_ids=self.subsplits_state.items(),
                enable_sorted_projections=self.config.use_semi_sorted_projections)
            pages_iter = parse_query_data_response(
                conn=response.raw,
                schema=self.query_data_request.response_schema,
                start_row_ids=self.subsplits_state)

            for page in pages_iter:
                for batch in page.to_batches():
                    if len(batch) > 0:
                        yield batch


    @property
    def done(self):
        return all(row_id == TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())

@dataclass
class Table:
    name: str
    schema: "schema.Schema"
    handle: int
    stats: TableStats
    properties: dict = None
    arrow_schema: pa.Schema = field(init=False, compare=False)
    _ibis_table: ibis.Schema = field(init=False, compare=False)

    def __post_init__(self):
        self.properties = self.properties or {}
        self.arrow_schema = self.columns()

        table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
        self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(self.arrow_schema), table_path)

    @property
    def tx(self):
        return self.schema.tx

    @property
    def bucket(self):
        return self.schema.bucket

    def __repr__(self):
        return f"{type(self).__name__}(name={self.name})"

    def columns(self) -> pa.Schema:
        fields = []
        next_key = 0
        while True:
            cur_columns, next_key, is_truncated, _count = self.tx._rpc.api.list_columns(
                bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid)
            fields.extend(cur_columns)
            if not is_truncated:
                break

        self.arrow_schema = pa.schema(fields)
        return self.arrow_schema

    def projection(self, name: str) -> "Projection":
        projs = self.projections(projection_name=name)
        if not projs:
            raise errors.MissingProjection(self.bucket.name, self.schema.name, self.name, name)
        assert len(projs) == 1, f"Expected to receive only a single projection, but got: {len(projs)}. projections: {projs}"
        log.debug("Found projection: %s", projs[0])
        return projs[0]

    def projections(self, projection_name=None) -> ["Projection"]:
        projections = []
        next_key = 0
        name_prefix = projection_name if projection_name else ""
        exact_match = bool(projection_name)
        while True:
            bucket_name, schema_name, table_name, curr_projections, next_key, is_truncated, _ = \
                self.tx._rpc.api.list_projections(
                    bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid,
                    exact_match=exact_match, name_prefix=name_prefix)
            if not curr_projections:
                break
            projections.extend(curr_projections)
            if not is_truncated:
                break
        return [_parse_projection_info(projection, self) for projection in projections]

    def import_files(self, files_to_import: [str], config: ImportConfig = None) -> None:
        source_files = {}
        for f in files_to_import:
            bucket_name, object_path = _parse_bucket_and_object_names(f)
            source_files[(bucket_name, object_path)] = b''

        self._execute_import(source_files, config=config)

    def import_partitioned_files(self, files_and_partitions: {str: pa.RecordBatch}, config: ImportConfig = None) -> None:
        source_files = {}
        for f, record_batch in files_and_partitions.items():
            bucket_name, object_path = _parse_bucket_and_object_names(f)
            serialized_batch = _serialize_record_batch(record_batch)
            source_files = {(bucket_name, object_path): serialized_batch.to_pybytes()}

        self._execute_import(source_files, config=config)

    def _execute_import(self, source_files, config):
        config = config or ImportConfig()
        assert config.import_concurrency > 0  # TODO: Do we want to validate concurrency isn't too high?
        max_batch_size = 10  # Enforced in server side.
        endpoints = [self.tx._rpc.api.url for _ in range(config.import_concurrency)]  # TODO: use valid endpoints...
        files_queue = queue.Queue()

        for source_file in source_files.items():
            files_queue.put(source_file)

        stop_event = Event()
        num_files_in_batch = min(ceil(len(source_files) / len(endpoints)), max_batch_size)

        def import_worker(q, session):
            try:
                while not q.empty():
                    if stop_event.is_set():
                        log.debug("stop_event is set, exiting")
                        break
                    files_batch = {}
                    try:
                        for _ in range(num_files_in_batch):
                            files_batch.update({q.get(block=False)})
                    except queue.Empty:
                        pass
                    if files_batch:
                        log.debug("Starting import batch of %s files", len(files_batch))
                        session.import_data(
                            self.bucket.name, self.schema.name, self.name, files_batch, txid=self.tx.txid)
            except (Exception, KeyboardInterrupt) as e:
                stop_event.set()
                log.error("Got exception inside import_worker. exception: %s", e)
                raise

        futures = []
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=config.import_concurrency, thread_name_prefix='import_thread') as pool:
            try:
                for endpoint in endpoints:
                    session = VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
                    futures.append(pool.submit(import_worker, files_queue, session))

                log.debug("Waiting for import workers to finish")
                for future in concurrent.futures.as_completed(futures):
                    future.result()
            finally:
                stop_event.set()
            # ThreadPoolExecutor will be joined at the end of the context
    def refresh_stats(self):
        stats_tuple = self.tx._rpc.api.get_table_stats(
            bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid)
        self.stats = TableStats(**stats_tuple._asdict())

    def select(self, columns: [str] = None,
               predicate: ibis.expr.types.BooleanColumn = None,
               config: QueryConfig = None,
               *,
               internal_row_id: bool = False) -> pa.RecordBatchReader:
        if config is None:
            config = QueryConfig()

        self.refresh_stats()

        if self.stats.num_rows > config.rows_per_split and config.num_splits is None:
            config.num_splits = self.stats.num_rows // config.rows_per_split
        log.debug(f"num_rows={self.stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")

        query_schema = self.arrow_schema
        if internal_row_id:
            queried_fields = [pa.field(INTERNAL_ROW_ID, pa.uint64())]
            queried_fields.extend(column for column in self.arrow_schema)
            query_schema = pa.schema(queried_fields)
            columns.append(INTERNAL_ROW_ID)

        query_data_request = build_query_data_request(
            schema=query_schema,
            predicate=predicate,
            field_names=columns)

        splits_queue = queue.Queue()

        for split in range(config.num_splits):
            splits_queue.put(split)

        # this queue shouldn't be large; it is merely a pipe through which the results
        # are sent to the main thread. Most of the pages are actually held in the
        # threads that fetch the pages.
        record_batches_queue = queue.Queue(maxsize=2)
        stop_event = Event()
        class StoppedException(Exception):
            pass

        def check_stop():
            if stop_event.is_set():
                raise StoppedException

        def single_endpoint_worker(endpoint : str):
            try:
                host_api = VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
                while True:
                    check_stop()
                    try:
                        split = splits_queue.get_nowait()
                    except queue.Empty:
                        log.debug("splits queue is empty")
                        break

                    split_state = SelectSplitState(query_data_request=query_data_request,
                                                   table=self,
                                                   split_id=split,
                                                   config=config)

                    for batch in split_state.batches(host_api):
                        check_stop()
                        record_batches_queue.put(batch)
            except StoppedException:
                log.debug("stop signal.", exc_info=True)
                return
            finally:
                # signal that this thread has ended
                log.debug("exiting")
                record_batches_queue.put(None)

        # Take a snapshot of endpoints
        endpoints = list(self.stats.endpoints) if config.data_endpoints is None else list(config.data_endpoints)

        def batches_iterator():
            def propagate_first_exception(futures : List[concurrent.futures.Future], block = False):
                done, not_done = concurrent.futures.wait(futures, None if block else 0, concurrent.futures.FIRST_EXCEPTION)
                for future in done:
                    future.result()
                return not_done

            threads_prefix = "query-data"
            # This is mainly for testing, it helps to identify running threads in runtime.
            if config.query_id:
                threads_prefix = threads_prefix + "-" + config.query_id

            with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp:  # TODO: concurrency == endpoints is just a heuristic
                futures = [tp.submit(single_endpoint_worker, endpoint) for endpoint in endpoints]
                tasks_running = len(futures)
                try:
                    while tasks_running > 0:
                        futures = propagate_first_exception(futures, block=False)

                        batch = record_batches_queue.get()
                        if batch is not None:
                            yield batch
                        else:
                            tasks_running -= 1
                            log.debug("one worker thread finished, remaining: %d", tasks_running)

                    # all host threads ended - wait for all futures to complete
                    propagate_first_exception(futures, block=True)
                finally:
                    stop_event.set()
                    while tasks_running > 0:
                        if record_batches_queue.get() is None:
                            tasks_running -= 1

        return pa.RecordBatchReader.from_batches(query_data_request.response_schema, batches_iterator())

    def _combine_chunks(self, col):
        if hasattr(col, "combine_chunks"):
            return col.combine_chunks()
        else:
            return col

    def insert(self, rows: pa.RecordBatch) -> pa.RecordBatch:
        serialized_slices = self.tx._rpc.api._record_batch_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
        row_ids = []
        for slice in serialized_slices:
            res = self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
                                               txid=self.tx.txid)
            (batch,) = pa.RecordBatchStreamReader(res.raw)
            row_ids.append(batch[INTERNAL_ROW_ID])

        return pa.chunked_array(row_ids)

    def update(self, rows: Union[pa.RecordBatch, pa.Table], columns: list = None) -> None:
        if columns is not None:
            update_fields = [(INTERNAL_ROW_ID, pa.uint64())]
            update_values = [self._combine_chunks(rows[INTERNAL_ROW_ID])]
            for col in columns:
                update_fields.append(rows.field(col))
                update_values.append(self._combine_chunks(rows[col]))

            update_rows_rb = pa.record_batch(schema=pa.schema(update_fields), data=update_values)
        else:
            update_rows_rb = rows

        serialized_slices = self.tx._rpc.api._record_batch_slices(update_rows_rb, MAX_ROWS_PER_BATCH)
        for slice in serialized_slices:
            self.tx._rpc.api.update_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
                                         txid=self.tx.txid)

    def delete(self, rows: Union[pa.RecordBatch, pa.Table]) -> None:
        delete_rows_rb = pa.record_batch(schema=pa.schema([(INTERNAL_ROW_ID, pa.uint64())]),
                                         data=[self._combine_chunks(rows[INTERNAL_ROW_ID])])

        serialized_slices = self.tx._rpc.api._record_batch_slices(delete_rows_rb, MAX_ROWS_PER_BATCH)
        for slice in serialized_slices:
            self.tx._rpc.api.delete_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
                                         txid=self.tx.txid)

    def drop(self) -> None:
        self.tx._rpc.api.drop_table(self.bucket.name, self.schema.name, self.name, txid=self.tx.txid)
        log.info("Dropped table: %s", self.name)

    def rename(self, new_name) -> None:
        self.tx._rpc.api.alter_table(
            self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, new_name=new_name)
        log.info("Renamed table from %s to %s ", self.name, new_name)
        self.name = new_name

    def add_column(self, new_column: pa.Schema) -> None:
        self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
        log.info("Added column(s): %s", new_column)
        self.arrow_schema = self.columns()

    def drop_column(self, column_to_drop: pa.Schema) -> None:
        self.tx._rpc.api.drop_columns(self.bucket.name, self.schema.name, self.name, column_to_drop, txid=self.tx.txid)
        log.info("Dropped column(s): %s", column_to_drop)
        self.arrow_schema = self.columns()

    def rename_column(self, current_column_name: str, new_column_name: str) -> None:
        self.tx._rpc.api.alter_column(self.bucket.name, self.schema.name, self.name, name=current_column_name,
                                      new_name=new_column_name, txid=self.tx.txid)
        log.info("Renamed column: %s to %s", current_column_name, new_column_name)
        self.arrow_schema = self.columns()

    def create_projection(self, projection_name: str, sorted_columns: List[str], unsorted_columns: List[str]) -> "Projection":
        columns = [(sorted_column, "Sorted") for sorted_column in sorted_columns] + [(unsorted_column, "Unorted") for unsorted_column in unsorted_columns]
        self.tx._rpc.api.create_projection(self.bucket.name, self.schema.name, self.name, projection_name, columns=columns, txid=self.tx.txid)
        log.info("Created projection: %s", projection_name)
        return self.projection(projection_name)

    def __getitem__(self, col_name):
        return self._ibis_table[col_name]


@dataclass
class Projection:
    name: str
    table: Table
    handle: int
    stats: TableStats
    properties: dict = None

    @property
    def bucket(self):
        return self.table.schema.bucket

    @property
    def schema(self):
        return self.table.schema

    @property
    def tx(self):
        return self.table.schema.tx

    def __repr__(self):
        return f"{type(self).__name__}(name={self.name})"

    def columns(self) -> pa.Schema:
        columns = []
        next_key = 0
        while True:
            curr_columns, next_key, is_truncated, count, _ = \
                self.tx._rpc.api.list_projection_columns(
                    self.bucket.name, self.schema.name, self.table.name, self.name, txid=self.table.tx.txid, next_key=next_key)
            if not curr_columns:
                break
            columns.extend(curr_columns)
            if not is_truncated:
                break
        self.arrow_schema = pa.schema([(col[0], col[1]) for col in columns])
        return self.arrow_schema

    def rename(self, new_name) -> None:
        self.tx._rpc.api.alter_projection(self.bucket.name, self.schema.name,
                                          self.table.name, self.name, txid=self.tx.txid, new_name=new_name)
        log.info("Renamed projection from %s to %s ", self.name, new_name)
        self.name = new_name

    def drop(self) -> None:
        self.tx._rpc.api.drop_projection(self.bucket.name, self.schema.name, self.table.name,
                                         self.name, txid=self.tx.txid)
        log.info("Dropped projection: %s", self.name)


def _parse_projection_info(projection_info, table: "Table"):
    log.info("Projection info %s", str(projection_info))
    stats = TableStats(num_rows=projection_info.num_rows, size_in_bytes=projection_info.size_in_bytes)
    return Projection(name=projection_info.name, table=table, stats=stats, handle=int(projection_info.handle))


def _parse_bucket_and_object_names(path: str) -> (str, str):
    if not path.startswith('/'):
        raise errors.InvalidArgumentError(f"Path {path} must start with a '/'")
    components = path.split(os.path.sep)
    bucket_name = components[1]
    object_path = os.path.sep.join(components[2:])
    return bucket_name, object_path


def _serialize_record_batch(record_batch: pa.RecordBatch) -> pa.lib.Buffer:
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, record_batch.schema) as writer:
        writer.write(record_batch)
    return sink.getvalue()
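A hedged sketch of the query and DML paths added here: selecting with an ibis predicate built via `Table.__getitem__`, then using the `$row_id` column (requested with `internal_row_id=True`) to update and delete specific rows. It continues the `session` and `events` table from the earlier sketches; the `Bucket.schema()` accessor and the column values are illustrative assumptions:

import pyarrow as pa

from vastdb.table import QueryConfig

with session.transaction() as tx:
    table = tx.bucket("my-bucket").schema("my-schema").table("events")  # Bucket.schema() is assumed

    # Predicates are ibis boolean expressions over the table's columns.
    config = QueryConfig(num_splits=4, num_sub_splits=4)
    reader = table.select(columns=["id", "msg"], predicate=table["id"] > 100, config=config)
    print(reader.read_all())

    # Fetch $row_id for the rows to modify (a columns list is required here, since
    # select() appends "$row_id" to it), then update and delete those rows.
    rows = table.select(columns=["msg"], predicate=table["id"] == 1, internal_row_id=True).read_all()
    updated = pa.table({"$row_id": rows["$row_id"],
                        "msg": pa.array(["patched"] * len(rows))})
    table.update(updated)
    table.delete(rows)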