vastdb 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vastdb/__init__.py +6 -2
- vastdb/bench/test_perf.py +3 -3
- vastdb/bucket.py +29 -15
- vastdb/errors.py +40 -7
- vastdb/internal_commands.py +194 -233
- vastdb/schema.py +11 -6
- vastdb/session.py +16 -1
- vastdb/table.py +181 -77
- vastdb/tests/test_duckdb.py +61 -0
- vastdb/tests/test_imports.py +13 -1
- vastdb/tests/test_projections.py +1 -0
- vastdb/tests/test_sanity.py +2 -2
- vastdb/tests/test_schemas.py +3 -3
- vastdb/tests/test_tables.py +60 -50
- vastdb/tests/test_util.py +39 -0
- vastdb/tests/util.py +1 -4
- vastdb/transaction.py +32 -6
- vastdb/util.py +42 -6
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/METADATA +2 -5
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/RECORD +23 -21
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/WHEEL +1 -1
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/LICENSE +0 -0
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/top_level.txt +0 -0
vastdb/schema.py
CHANGED
@@ -6,11 +6,16 @@ It is possible to list and access VAST snapshots generated over a bucket.
 
 import logging
 from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
 
 import pyarrow as pa
 
 from . import bucket, errors, schema, table
 
+if TYPE_CHECKING:
+    from .table import Table
+
+
 log = logging.getLogger(__name__)
 
 
@@ -26,7 +31,7 @@ class Schema:
         """VAST transaction used for this schema."""
         return self.bucket.tx
 
-    def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "
+    def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "Table":
         """Create a new table under this schema."""
         if current := self.table(table_name, fail_if_missing=False):
             if fail_if_exists:
@@ -35,9 +40,9 @@ class Schema:
             return current
         self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid)
         log.info("Created table: %s", table_name)
-        return self.table(table_name)
+        return self.table(table_name)  # type: ignore[return-value]
 
-    def table(self, name: str, fail_if_missing=True) -> "table.Table":
+    def table(self, name: str, fail_if_missing=True) -> Optional["table.Table"]:
         """Get a specific table under this schema."""
         t = self.tables(table_name=name)
         if not t:
@@ -49,14 +54,14 @@ class Schema:
         log.debug("Found table: %s", t[0])
         return t[0]
 
-    def tables(self, table_name=None) -> ["
+    def tables(self, table_name=None) -> List["Table"]:
         """List all tables under this schema."""
         tables = []
         next_key = 0
         name_prefix = table_name if table_name else ""
         exact_match = bool(table_name)
         while True:
-
+            _bucket_name, _schema_name, curr_tables, next_key, is_truncated, _ = \
                 self.tx._rpc.api.list_tables(
                     bucket=self.bucket.name, schema=self.name, next_key=next_key, txid=self.tx.txid,
                     exact_match=exact_match, name_prefix=name_prefix, include_list_stats=exact_match)
@@ -82,4 +87,4 @@ class Schema:
 
 def _parse_table_info(table_info, schema: "schema.Schema"):
     stats = table.TableStats(num_rows=table_info.num_rows, size_in_bytes=table_info.size_in_bytes)
-    return table.Table(name=table_info.name, schema=schema, handle=int(table_info.handle), stats=stats)
+    return table.Table(name=table_info.name, schema=schema, handle=int(table_info.handle), stats=stats, _imports_table=False)
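A minimal usage sketch of the reworked Schema API. The `session` object, bucket and schema names below are illustrative, and `session.transaction()`, `tx.bucket()` and `bucket.schema()` are assumed from the rest of the SDK rather than shown in this diff:

    import pyarrow as pa

    with session.transaction() as tx:  # assumed entry point; not part of this diff
        schema = tx.bucket("my-bucket").schema("my-schema")
        columns = pa.schema([("id", pa.int64()), ("name", pa.utf8())])
        events = schema.create_table("events", columns, fail_if_exists=False)
        # table() is now typed Optional: with fail_if_missing=False it returns None
        assert schema.table("no-such-table", fail_if_missing=False) is None
        print([t.name for t in schema.tables()])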
vastdb/session.py
CHANGED
@@ -11,7 +11,20 @@ import os
 
 import boto3
 
-from . import internal_commands, transaction
+from . import errors, internal_commands, transaction
+
+
+class Features:
+    """VAST database features - check if server is already support a feature."""
+
+    def __init__(self, vast_version):
+        """Save the server version."""
+        self.vast_version = vast_version
+
+    def check_imports_table(self):
+        """Check if the feature that support imports table is supported."""
+        if self.vast_version < (5, 2):
+            raise errors.NotSupportedVersion("import_table requires 5.2+", self.vast_version)
 
 
 class Session:
@@ -27,6 +40,8 @@ class Session:
         endpoint = os.environ['AWS_S3_ENDPOINT_URL']
 
         self.api = internal_commands.VastdbApi(endpoint, access, secret)
+        version_tuple = tuple(int(part) for part in self.api.vast_version.split('.'))
+        self.features = Features(version_tuple)
         self.s3 = boto3.client('s3',
                                aws_access_key_id=access,
                                aws_secret_access_key=secret,
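The new `Features` helper gates newer server functionality on the version reported by `VastdbApi`; a small sketch of the check added above (the version tuple is made up for illustration):

    from vastdb import errors
    from vastdb.session import Features

    features = Features((5, 1))          # pretend the server reported version 5.1
    try:
        features.check_imports_table()   # imports-table support requires VAST 5.2+
    except errors.NotSupportedVersion:
        print("server does not support imports tables yet")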
vastdb/table.py
CHANGED
@@ -1,3 +1,5 @@
+"""VAST Database table."""
+
 import concurrent.futures
 import logging
 import os
@@ -5,18 +7,12 @@ import queue
 from dataclasses import dataclass, field
 from math import ceil
 from threading import Event
-from typing import List, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import ibis
 import pyarrow as pa
 
-from . import errors, schema
-from .internal_commands import (
-    TABULAR_INVALID_ROW_ID,
-    VastdbApi,
-    build_query_data_request,
-    parse_query_data_response,
-)
+from . import errors, internal_commands, schema, util
 
 log = logging.getLogger(__name__)
 
@@ -27,18 +23,24 @@ MAX_ROWS_PER_BATCH = 512 * 1024
 # for example insert of 512k uint8 result in 512k*8bytes response since row_ids are uint64
 MAX_INSERT_ROWS_PER_PATCH = 512 * 1024
 
+
 @dataclass
 class TableStats:
+    """Table-related information."""
+
     num_rows: int
     size_in_bytes: int
     is_external_rowid_alloc: bool = False
-    endpoints:
+    endpoints: Tuple[str, ...] = ()
+
 
 @dataclass
 class QueryConfig:
+    """Query execution configiration."""
+
     num_sub_splits: int = 4
     num_splits: int = 1
-    data_endpoints: [str] = None
+    data_endpoints: Optional[List[str]] = None
     limit_rows_per_sub_split: int = 128 * 1024
     num_row_groups_per_sub_split: int = 8
     use_semi_sorted_projections: bool = True
@@ -48,17 +50,27 @@ class QueryConfig:
 
 @dataclass
 class ImportConfig:
+    """Import execution configiration."""
+
     import_concurrency: int = 2
 
-
-
+
+class SelectSplitState:
+    """State of a specific query split execution."""
+
+    def __init__(self, query_data_request, table: "Table", split_id: int, config: QueryConfig) -> None:
+        """Initialize query split state."""
         self.split_id = split_id
         self.subsplits_state = {i: 0 for i in range(config.num_sub_splits)}
         self.config = config
         self.query_data_request = query_data_request
         self.table = table
 
-    def batches(self, api
+    def batches(self, api: internal_commands.VastdbApi):
+        """Execute QueryData request, and yield parsed RecordBatch objects.
+
+        Can be called repeatedly, to allow pagination.
+        """
         while not self.done:
             response = api.query_data(
                 bucket=self.table.bucket.name,
@@ -71,34 +83,39 @@ class SelectSplitState():
                 txid=self.table.tx.txid,
                 limit_rows=self.config.limit_rows_per_sub_split,
                 sub_split_start_row_ids=self.subsplits_state.items(),
-                enable_sorted_projections=self.config.use_semi_sorted_projections
-
+                enable_sorted_projections=self.config.use_semi_sorted_projections,
+                query_imports_table=self.table._imports_table)
+            pages_iter = internal_commands.parse_query_data_response(
                 conn=response.raw,
                 schema=self.query_data_request.response_schema,
-                start_row_ids=self.subsplits_state
+                start_row_ids=self.subsplits_state,
+                parser=self.query_data_request.response_parser)
 
             for page in pages_iter:
                 for batch in page.to_batches():
                     if len(batch) > 0:
                         yield batch
 
-
     @property
     def done(self):
-
+        """Returns true iff the pagination over."""
+        return all(row_id == internal_commands.TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
+
 
 @dataclass
 class Table:
+    """VAST Table."""
+
     name: str
     schema: "schema.Schema"
     handle: int
     stats: TableStats
-
-
-
+    arrow_schema: pa.Schema = field(init=False, compare=False, repr=False)
+    _ibis_table: ibis.Schema = field(init=False, compare=False, repr=False)
+    _imports_table: bool
 
     def __post_init__(self):
-
+        """Also, load columns' metadata."""
         self.arrow_schema = self.columns()
 
         table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
@@ -106,21 +123,21 @@ class Table:
 
     @property
     def tx(self):
+        """Return transaction."""
         return self.schema.tx
 
     @property
     def bucket(self):
+        """Return bucket."""
         return self.schema.bucket
 
-    def __repr__(self):
-        return f"{type(self).__name__}(name={self.name})"
-
     def columns(self) -> pa.Schema:
+        """Return columns' metadata."""
        fields = []
         next_key = 0
         while True:
             cur_columns, next_key, is_truncated, _count = self.tx._rpc.api.list_columns(
-                bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid)
+                bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid, list_imports_table=self._imports_table)
             fields.extend(cur_columns)
             if not is_truncated:
                 break
@@ -129,6 +146,9 @@ class Table:
         return self.arrow_schema
 
     def projection(self, name: str) -> "Projection":
+        """Get a specific semi-sorted projection of this table."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         projs = self.projections(projection_name=name)
         if not projs:
             raise errors.MissingProjection(self.bucket.name, self.schema.name, self.name, name)
@@ -136,13 +156,16 @@ class Table:
         log.debug("Found projection: %s", projs[0])
         return projs[0]
 
-    def projections(self, projection_name=None) -> ["Projection"]:
+    def projections(self, projection_name=None) -> List["Projection"]:
+        """List all semi-sorted projections of this table."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         projections = []
         next_key = 0
         name_prefix = projection_name if projection_name else ""
         exact_match = bool(projection_name)
         while True:
-
+            _bucket_name, _schema_name, _table_name, curr_projections, next_key, is_truncated, _ = \
                 self.tx._rpc.api.list_projections(
                     bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid,
                     exact_match=exact_match, name_prefix=name_prefix)
@@ -153,7 +176,13 @@ class Table:
                 break
         return [_parse_projection_info(projection, self) for projection in projections]
 
-    def import_files(self, files_to_import: [str], config: ImportConfig = None) -> None:
+    def import_files(self, files_to_import: List[str], config: Optional[ImportConfig] = None) -> None:
+        """Import a list of Parquet files into this table.
+
+        The files must be on VAST S3 server and be accessible using current credentials.
+        """
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         source_files = {}
         for f in files_to_import:
             bucket_name, object_path = _parse_bucket_and_object_names(f)
@@ -161,7 +190,14 @@ class Table:
 
         self._execute_import(source_files, config=config)
 
-    def import_partitioned_files(self, files_and_partitions:
+    def import_partitioned_files(self, files_and_partitions: Dict[str, pa.RecordBatch], config: Optional[ImportConfig] = None) -> None:
+        """Import a list of Parquet files into this table.
+
+        The files must be on VAST S3 server and be accessible using current credentials.
+        Each file must have its own partition values defined as an Arrow RecordBatch.
+        """
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         source_files = {}
         for f, record_batch in files_and_partitions.items():
             bucket_name, object_path = _parse_bucket_and_object_names(f)
@@ -209,7 +245,7 @@ class Table:
                 max_workers=config.import_concurrency, thread_name_prefix='import_thread') as pool:
             try:
                 for endpoint in endpoints:
-                    session = VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
+                    session = internal_commands.VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
                     futures.append(pool.submit(import_worker, files_queue, session))
 
                 log.debug("Waiting for import workers to finish")
@@ -218,24 +254,40 @@ class Table:
             finally:
                 stop_event.set()
                 # ThreadPoolExecutor will be joined at the end of the context
-
+
+    def get_stats(self) -> TableStats:
+        """Get the statistics of this table."""
         stats_tuple = self.tx._rpc.api.get_table_stats(
-            bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid
-
+            bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid,
+            imports_table_stats=self._imports_table)
+        return TableStats(**stats_tuple._asdict())
 
-    def select(self, columns: [str] = None,
+    def select(self, columns: Optional[List[str]] = None,
                predicate: ibis.expr.types.BooleanColumn = None,
-               config: QueryConfig = None,
+               config: Optional[QueryConfig] = None,
                *,
                internal_row_id: bool = False) -> pa.RecordBatchReader:
+        """Execute a query over this table.
+
+        To read a subset of the columns, specify their names via `columns` argument. Otherwise, all columns will be read.
+
+        In order to apply a filter, a predicate can be specified. See https://github.com/vast-data/vastdb_sdk/blob/main/README.md#filters-and-projections for more details.
+
+        Query-execution configuration options can be specified via the optional `config` argument.
+        """
         if config is None:
             config = QueryConfig()
 
-
+        # Take a snapshot of enpoints
+        stats = self.get_stats()
+        endpoints = stats.endpoints if config.data_endpoints is None else config.data_endpoints
+
+        if stats.num_rows > config.rows_per_split and config.num_splits is None:
+            config.num_splits = stats.num_rows // config.rows_per_split
+        log.debug(f"num_rows={stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")
 
-        if
-
-        log.debug(f"num_rows={self.stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")
+        if columns is None:
+            columns = [f.name for f in self.arrow_schema]
 
         query_schema = self.arrow_schema
         if internal_row_id:
@@ -244,12 +296,12 @@ class Table:
             query_schema = pa.schema(queried_fields)
             columns.append(INTERNAL_ROW_ID)
 
-        query_data_request = build_query_data_request(
+        query_data_request = internal_commands.build_query_data_request(
            schema=query_schema,
            predicate=predicate,
            field_names=columns)
 
-        splits_queue = queue.Queue()
+        splits_queue: queue.Queue[int] = queue.Queue()
 
         for split in range(config.num_splits):
             splits_queue.put(split)
@@ -257,8 +309,10 @@ class Table:
         # this queue shouldn't be large it is marely a pipe through which the results
         # are sent to the main thread. Most of the pages actually held in the
         # threads that fetch the pages.
-        record_batches_queue = queue.Queue(maxsize=2)
+        record_batches_queue: queue.Queue[pa.RecordBatch] = queue.Queue(maxsize=2)
+
         stop_event = Event()
+
         class StoppedException(Exception):
             pass
 
@@ -266,9 +320,9 @@ class Table:
             if stop_event.is_set():
                 raise StoppedException
 
-        def single_endpoint_worker(endpoint
+        def single_endpoint_worker(endpoint: str):
            try:
-                host_api = VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
+                host_api = internal_commands.VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
                 while True:
                     check_stop()
                     try:
@@ -293,12 +347,11 @@ class Table:
             log.debug("exiting")
             record_batches_queue.put(None)
 
-        # Take a snapshot of enpoints
-        endpoints = list(self.stats.endpoints) if config.data_endpoints is None else list(config.data_endpoints)
-
         def batches_iterator():
-            def propagate_first_exception(futures
+            def propagate_first_exception(futures: List[concurrent.futures.Future], block=False):
                 done, not_done = concurrent.futures.wait(futures, None if block else 0, concurrent.futures.FIRST_EXCEPTION)
+                if self.tx.txid is None:
+                    raise errors.MissingTransaction()
                 for future in done:
                     future.result()
                 return not_done
@@ -308,7 +361,7 @@ class Table:
         if config.query_id:
             threads_prefix = threads_prefix + "-" + config.query_id
 
-        with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp:  # TODO: concurrency == enpoints is just a heuristic
            futures = [tp.submit(single_endpoint_worker, endpoint) for endpoint in endpoints]
            tasks_running = len(futures)
            try:
@@ -332,113 +385,155 @@ class Table:
 
         return pa.RecordBatchReader.from_batches(query_data_request.response_schema, batches_iterator())
 
-    def _combine_chunks(self, col):
-        if hasattr(col, "combine_chunks"):
-            return col.combine_chunks()
-        else:
-            return col
-
     def insert(self, rows: pa.RecordBatch) -> pa.RecordBatch:
-
-
+        """Insert a RecordBatch into this table."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
+        serialized_slices = util.iter_serialized_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
         for slice in serialized_slices:
-
+            self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
                                          txid=self.tx.txid)
-            (batch,) = pa.RecordBatchStreamReader(res.raw)
-            row_ids.append(batch[INTERNAL_ROW_ID])
 
-
+    def update(self, rows: Union[pa.RecordBatch, pa.Table], columns: Optional[List[str]] = None) -> None:
+        """Update a subset of cells in this table.
+
+        Row IDs are specified using a special field (named "$row_id" of uint64 type).
 
-
+        A subset of columns to be updated can be specified via the `columns` argument.
+        """
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         if columns is not None:
             update_fields = [(INTERNAL_ROW_ID, pa.uint64())]
-            update_values = [
+            update_values = [_combine_chunks(rows[INTERNAL_ROW_ID])]
             for col in columns:
                 update_fields.append(rows.field(col))
-                update_values.append(
+                update_values.append(_combine_chunks(rows[col]))
 
             update_rows_rb = pa.record_batch(schema=pa.schema(update_fields), data=update_values)
         else:
             update_rows_rb = rows
 
-        serialized_slices =
+        serialized_slices = util.iter_serialized_slices(update_rows_rb, MAX_ROWS_PER_BATCH)
         for slice in serialized_slices:
             self.tx._rpc.api.update_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
                                          txid=self.tx.txid)
 
     def delete(self, rows: Union[pa.RecordBatch, pa.Table]) -> None:
+        """Delete a subset of rows in this table.
+
+        Row IDs are specified using a special field (named "$row_id" of uint64 type).
+        """
         delete_rows_rb = pa.record_batch(schema=pa.schema([(INTERNAL_ROW_ID, pa.uint64())]),
-                                         data=[
+                                         data=[_combine_chunks(rows[INTERNAL_ROW_ID])])
 
-        serialized_slices =
+        serialized_slices = util.iter_serialized_slices(delete_rows_rb, MAX_ROWS_PER_BATCH)
         for slice in serialized_slices:
             self.tx._rpc.api.delete_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
-                                         txid=self.tx.txid)
+                                         txid=self.tx.txid, delete_from_imports_table=self._imports_table)
 
     def drop(self) -> None:
-
+        """Drop this table."""
+        self.tx._rpc.api.drop_table(self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, remove_imports_table=self._imports_table)
         log.info("Dropped table: %s", self.name)
 
     def rename(self, new_name) -> None:
+        """Rename this table."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         self.tx._rpc.api.alter_table(
             self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, new_name=new_name)
         log.info("Renamed table from %s to %s ", self.name, new_name)
         self.name = new_name
 
     def add_column(self, new_column: pa.Schema) -> None:
+        """Add a new column."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
         log.info("Added column(s): %s", new_column)
         self.arrow_schema = self.columns()
 
     def drop_column(self, column_to_drop: pa.Schema) -> None:
+        """Drop an existing column."""
+        if self._imports_table:
+            raise errors.NotSupported(self.bucket.name, self.schema.name, self.name)
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         self.tx._rpc.api.drop_columns(self.bucket.name, self.schema.name, self.name, column_to_drop, txid=self.tx.txid)
         log.info("Dropped column(s): %s", column_to_drop)
         self.arrow_schema = self.columns()
 
     def rename_column(self, current_column_name: str, new_column_name: str) -> None:
+        """Rename an existing column."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         self.tx._rpc.api.alter_column(self.bucket.name, self.schema.name, self.name, name=current_column_name,
                                       new_name=new_column_name, txid=self.tx.txid)
         log.info("Renamed column: %s to %s", current_column_name, new_column_name)
         self.arrow_schema = self.columns()
 
     def create_projection(self, projection_name: str, sorted_columns: List[str], unsorted_columns: List[str]) -> "Projection":
+        """Create a new semi-sorted projection."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         columns = [(sorted_column, "Sorted") for sorted_column in sorted_columns] + [(unsorted_column, "Unorted") for unsorted_column in unsorted_columns]
         self.tx._rpc.api.create_projection(self.bucket.name, self.schema.name, self.name, projection_name, columns=columns, txid=self.tx.txid)
         log.info("Created projection: %s", projection_name)
         return self.projection(projection_name)
 
+    def create_imports_table(self, fail_if_exists=True) -> "Table":
+        """Create imports table."""
+        self.tx._rpc.features.check_imports_table()
+        empty_schema = pa.schema([])
+        self.tx._rpc.api.create_table(self.bucket.name, self.schema.name, self.name, empty_schema, txid=self.tx.txid,
+                                      create_imports_table=True)
+        log.info("Created imports table for table: %s", self.name)
+        return self.imports_table()  # type: ignore[return-value]
+
+    def imports_table(self) -> Optional["Table"]:
+        """Get the imports table under of this table."""
+        self.tx._rpc.features.check_imports_table()
+        return Table(name=self.name, schema=self.schema, handle=int(self.handle), stats=self.stats, _imports_table=True)
+
     def __getitem__(self, col_name):
+        """Allow constructing ibis-like column expressions from this table.
+
+        It is useful for constructing expressions for predicate pushdown in `Table.select()` method.
+        """
         return self._ibis_table[col_name]
 
 
 @dataclass
 class Projection:
+    """VAST semi-sorted projection."""
+
     name: str
     table: Table
     handle: int
     stats: TableStats
-    properties: dict = None
 
     @property
     def bucket(self):
+        """Return bucket."""
         return self.table.schema.bucket
 
     @property
     def schema(self):
+        """Return schema."""
         return self.table.schema
 
     @property
     def tx(self):
+        """Return transaction."""
         return self.table.schema.tx
 
-    def __repr__(self):
-        return f"{type(self).__name__}(name={self.name})"
-
     def columns(self) -> pa.Schema:
+        """Return this projections' columns as an Arrow schema."""
         columns = []
         next_key = 0
         while True:
-            curr_columns, next_key, is_truncated,
+            curr_columns, next_key, is_truncated, _count, _ = \
                 self.tx._rpc.api.list_projection_columns(
                     self.bucket.name, self.schema.name, self.table.name, self.name, txid=self.table.tx.txid, next_key=next_key)
             if not curr_columns:
@@ -450,12 +545,14 @@ class Projection:
         return self.arrow_schema
 
     def rename(self, new_name) -> None:
+        """Rename this projection."""
         self.tx._rpc.api.alter_projection(self.bucket.name, self.schema.name,
                                           self.table.name, self.name, txid=self.tx.txid, new_name=new_name)
         log.info("Renamed projection from %s to %s ", self.name, new_name)
         self.name = new_name
 
     def drop(self) -> None:
+        """Drop this projection."""
         self.tx._rpc.api.drop_projection(self.bucket.name, self.schema.name, self.table.name,
                                          self.name, txid=self.tx.txid)
         log.info("Dropped projection: %s", self.name)
@@ -467,9 +564,9 @@ def _parse_projection_info(projection_info, table: "Table"):
     return Projection(name=projection_info.name, table=table, stats=stats, handle=int(projection_info.handle))
 
 
-def _parse_bucket_and_object_names(path: str) ->
+def _parse_bucket_and_object_names(path: str) -> Tuple[str, str]:
     if not path.startswith('/'):
-        raise errors.
+        raise errors.InvalidArgument(f"Path {path} must start with a '/'")
     components = path.split(os.path.sep)
     bucket_name = components[1]
     object_path = os.path.sep.join(components[2:])
@@ -481,3 +578,10 @@ def _serialize_record_batch(record_batch: pa.RecordBatch) -> pa.lib.Buffer:
     with pa.ipc.new_stream(sink, record_batch.schema) as writer:
         writer.write(record_batch)
     return sink.getvalue()
+
+
+def _combine_chunks(col):
+    if hasattr(col, "combine_chunks"):
+        return col.combine_chunks()
+    else:
+        return col
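Taken together, the new Table surface can be exercised roughly as below; a hedged sketch that assumes `table` was obtained inside a transaction (for example `tx.bucket("b").schema("s").table("t")`) and uses illustrative column names:

    from vastdb.table import QueryConfig

    # Predicates are ibis-style expressions built via Table.__getitem__.
    reader = table.select(
        columns=["id", "name"],
        predicate=table["id"] > 100,
        config=QueryConfig(num_sub_splits=4),
    )
    result = reader.read_all()           # pyarrow.Table with the matching rows

    # Imports table introduced in this release (requires a VAST 5.2+ server).
    imports = table.imports_table()
    print(imports.get_stats())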