vastdb 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as published to its public registry. It is provided for informational purposes only.
- vastdb/bucket.py +20 -10
- vastdb/errors.py +28 -1
- vastdb/internal_commands.py +56 -89
- vastdb/schema.py +1 -1
- vastdb/session.py +16 -1
- vastdb/table.py +134 -27
- vastdb/tests/test_imports.py +13 -1
- vastdb/tests/test_schemas.py +1 -2
- vastdb/tests/test_tables.py +26 -1
- vastdb/tests/test_util.py +39 -0
- vastdb/transaction.py +19 -3
- vastdb/util.py +41 -6
- {vastdb-0.1.2.dist-info → vastdb-0.1.3.dist-info}/METADATA +2 -2
- {vastdb-0.1.2.dist-info → vastdb-0.1.3.dist-info}/RECORD +17 -16
- {vastdb-0.1.2.dist-info → vastdb-0.1.3.dist-info}/LICENSE +0 -0
- {vastdb-0.1.2.dist-info → vastdb-0.1.3.dist-info}/WHEEL +0 -0
- {vastdb-0.1.2.dist-info → vastdb-0.1.3.dist-info}/top_level.txt +0 -0
vastdb/bucket.py
CHANGED

@@ -16,14 +16,6 @@ if TYPE_CHECKING:
 log = logging.getLogger(__name__)
 
 
-@dataclass
-class Snapshot:
-    """VAST bucket-level snapshot."""
-
-    name: str
-    bucket: "Bucket"
-
-
 @dataclass
 class Bucket:
     """VAST bucket."""
@@ -73,7 +65,22 @@ class Bucket:
 
         return [schema.Schema(name=name, bucket=self) for name, *_ in schemas]
 
-    def
+    def snapshot(self, name, fail_if_missing=True) -> Optional["Bucket"]:
+        """Get snapshot by name (if exists)."""
+        snapshots, _is_truncated, _next_key = \
+            self.tx._rpc.api.list_snapshots(bucket=self.name, name_prefix=name, max_keys=1)
+
+        expected_name = f".snapshot/{name}"
+        exists = snapshots and snapshots[0] == expected_name + "/"
+        if not exists:
+            if fail_if_missing:
+                raise errors.MissingSnapshot(self.name, expected_name)
+            else:
+                return None
+
+        return Bucket(name=f'{self.name}/{expected_name}', tx=self.tx)
+
+    def snapshots(self) -> List["Bucket"]:
         """List bucket's snapshots."""
         snapshots = []
         next_key = 0
@@ -86,4 +93,7 @@ class Bucket:
             if not is_truncated:
                 break
 
-        return [
+        return [
+            Bucket(name=f'{self.name}/{snapshot.strip("/")}', tx=self.tx)
+            for snapshot in snapshots
+        ]
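The bucket.py changes above drop the Snapshot dataclass: snapshots are now plain Bucket objects rooted under `.snapshot/<name>`, with a point lookup via `Bucket.snapshot()`. A minimal usage sketch, assuming a configured `session` and a hypothetical bucket and snapshot name:

    with session.transaction() as tx:
        bucket = tx.bucket("mybucket")                       # hypothetical bucket name
        snap = bucket.snapshot("daily", fail_if_missing=False)
        if snap is None:
            print("no such snapshot")                        # would raise MissingSnapshot with fail_if_missing=True
        for s in bucket.snapshots():                         # each snapshot is returned as a Bucket
            print(s.name)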
vastdb/errors.py
CHANGED

@@ -85,6 +85,10 @@ class InvalidArgument(Exception):
     pass
 
 
+class TooWideRow(InvalidArgument):
+    pass
+
+
 class Missing(Exception):
     pass
 
@@ -93,11 +97,21 @@ class MissingTransaction(Missing):
     pass
 
 
+class NotSupported(Exception):
+    pass
+
+
 @dataclass
 class MissingBucket(Missing):
     bucket: str
 
 
+@dataclass
+class MissingSnapshot(Missing):
+    bucket: str
+    snapshot: str
+
+
 @dataclass
 class MissingSchema(Missing):
     bucket: str
@@ -136,6 +150,19 @@ class TableExists(Exists):
     table: str
 
 
+@dataclass
+class NotSupportedCommand(NotSupported):
+    bucket: str
+    schema: str
+    table: str
+
+
+@dataclass
+class NotSupportedVersion(NotSupported):
+    err_msg: str
+    version: str
+
+
 ERROR_TYPES_MAP = {
     HttpStatus.BAD_REQUEST: BadRequest,
     HttpStatus.FOBIDDEN: Forbidden,
@@ -178,4 +205,4 @@ def from_response(res: requests.Response):
     log.warning("RPC failed: %s", kwargs)
     status = HttpStatus(res.status_code)
     error_type = ERROR_TYPES_MAP.get(status, UnexpectedError)
-
+    return error_type(**kwargs)
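The new error classes are dataclasses, so callers can inspect their fields. A hedged handling sketch, assuming an open transaction `tx`, a configured `session`, and hypothetical bucket/snapshot names:

    from vastdb import errors

    try:
        snap = tx.bucket("mybucket").snapshot("daily")
    except errors.MissingSnapshot as e:
        print(f"no snapshot {e.snapshot} in bucket {e.bucket}")

    try:
        session.features.check_imports_table()
    except errors.NotSupportedVersion as e:
        print(f"server {e.version} does not support imports tables: {e.err_msg}")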
vastdb/internal_commands.py
CHANGED

@@ -1,7 +1,6 @@
 import itertools
 import json
 import logging
-import math
 import re
 import struct
 import urllib.parse
@@ -182,6 +181,7 @@ class Predicate:
     Equals,
     Greater,
     GreaterEqual,
+    InValues,
     Less,
     LessEqual,
     Not,
@@ -219,40 +219,54 @@ class Predicate:
             prev_field_name = None
             for inner_op in or_args:
                 _logger.debug('inner_op %s', inner_op)
-
+                op_type = type(inner_op)
+                builder_func: Any = builder_map.get(op_type)
                 if not builder_func:
-
+                    if op_type == InValues:
+                        builder_func = self.build_equal
+                    else:
+                        raise NotImplementedError(self.expr)
 
                 if builder_func == self.build_is_null:
                     column, = inner_op.args
-
+                    literals = (None,)
                 elif builder_func == self.build_is_not_null:
                     not_arg, = inner_op.args
                     # currently we only support not is_null, checking we really got is_null under the not:
                     if not builder_map.get(type(not_arg)) == self.build_is_null:
-                        raise NotImplementedError(
+                        raise NotImplementedError(self.expr)
                     column, = not_arg.args
-
+                    literals = (None,)
                 else:
-                    column,
-                    if
-
+                    column, arg = inner_op.args
+                    if isinstance(arg, tuple):
+                        literals = arg
+                    else:
+                        literals = (arg,)
+                    for literal in literals:
+                        if not isinstance(literal, Literal):
+                            raise NotImplementedError(self.expr)
 
                 if not isinstance(column, TableColumn):
-                    raise NotImplementedError(
+                    raise NotImplementedError(self.expr)
 
                 field_name = column.name
                 if prev_field_name is None:
                     prev_field_name = field_name
                 elif prev_field_name != field_name:
-                    raise NotImplementedError(
+                    raise NotImplementedError(self.expr)
 
-
-
-
-                args_offsets
+                column_offset = self.build_column(position=positions_map[field_name])
+                field = self.schema.field(field_name)
+                for literal in literals:
+                    args_offsets = [column_offset]
+                    if literal is not None:
+                        args_offsets.append(self.build_literal(field=field, value=literal.value))
 
-
+                    inner_offsets.append(builder_func(*args_offsets))
+
+            if not inner_offsets:
+                raise NotImplementedError(self.expr)  # an empty OR is equivalent to a 'FALSE' literal
 
             domain_offset = self.build_or(inner_offsets)
             offsets.append(domain_offset)
@@ -719,20 +733,6 @@ def _parse_table_info(obj):
     return TableInfo(name, properties, handle, num_rows, used_bytes)
 
 
-def build_record_batch(column_info, column_values):
-    fields = [pa.field(column_name, column_type) for column_type, column_name in column_info]
-    schema = pa.schema(fields)
-    arrays = [pa.array(column_values[column_type], type=column_type) for column_type, _ in column_info]
-    batch = pa.record_batch(arrays, schema)
-    return serialize_record_batch(batch)
-
-
-def serialize_record_batch(batch):
-    sink = pa.BufferOutputStream()
-    with pa.ipc.new_stream(sink, batch.schema) as writer:
-        writer.write(batch)
-    return sink.getvalue()
-
 # Results that returns from tablestats
 
 
@@ -952,26 +952,27 @@ class VastdbApi:
 
         return bucket_name, schemas, next_key, is_truncated, count
 
-    def list_snapshots(self, bucket, max_keys=1000, next_token=None,
+    def list_snapshots(self, bucket, max_keys=1000, next_token=None, name_prefix=''):
         next_token = next_token or ''
-
-        url_params = {'list_type': '2', 'prefix': '.snapshot/', 'delimiter': '/', 'max_keys': str(max_keys)}
+        url_params = {'list_type': '2', 'prefix': '.snapshot/' + name_prefix, 'delimiter': '/', 'max_keys': str(max_keys)}
         if next_token:
             url_params['continuation-token'] = next_token
 
         res = self.session.get(self._api_prefix(bucket=bucket, command="list", url_params=url_params), headers={}, stream=True)
-        self._check_res(res, "list_snapshots"
-
-
-
-
-
-
-
-
-
+        self._check_res(res, "list_snapshots")
+
+        out = b''.join(res.iter_content(chunk_size=128))
+        xml_str = out.decode()
+        xml_dict = xmltodict.parse(xml_str)
+        list_res = xml_dict['ListBucketResult']
+        is_truncated = list_res['IsTruncated'] == 'true'
+        marker = list_res['Marker']
+        common_prefixes = list_res.get('CommonPrefixes', [])
+        if isinstance(common_prefixes, dict):  # in case there is a single snapshot
+            common_prefixes = [common_prefixes]
+        snapshots = [v['Prefix'] for v in common_prefixes]
 
-
+        return snapshots, is_truncated, marker
 
     def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[],
                      topic_partitions=0, create_imports_table=False, use_external_row_ids_allocation=False):
@@ -1030,7 +1031,7 @@ class VastdbApi:
         # create the table
         return self.create_table(bucket, schema, name, arrow_schema, txid, client_tags, expected_retvals)
 
-    def get_table_stats(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[]):
+    def get_table_stats(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[], imports_table_stats=False):
         """
         GET /mybucket/myschema/mytable?stats HTTP/1.1
         tabular-txid: TransactionId
@@ -1039,7 +1040,8 @@ class VastdbApi:
         The Command will return the statistics in flatbuf format
         """
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
-
+        url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if imports_table_stats else {}
+        res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=name, command="stats", url_params=url_params), headers=headers)
         self._check_res(res, "get_table_stats", expected_retvals)
 
         flatbuf = b''.join(res.iter_content(chunk_size=128))
@@ -1527,11 +1529,18 @@ class VastdbApi:
         if response.status_code != 200:
             return response
 
+        ALLOWED_IMPORT_STATES = {
+            'Success',
+            'TabularInProgress',
+            'TabularAlreadyImported',
+            'TabularImportNotStarted',
+        }
+
         chunk_size = 1024
         for chunk in response.iter_content(chunk_size=chunk_size):
             chunk_dict = json.loads(chunk)
             _logger.debug("import data chunk=%s, result: %s", chunk_dict, chunk_dict['res'])
-            if chunk_dict['res']
+            if chunk_dict['res'] not in ALLOWED_IMPORT_STATES:
                 raise errors.ImportFilesError(
                     f"Encountered an error during import_data. status: {chunk_dict['res']}, "
                     f"error message: {chunk_dict['err_msg'] or 'Unexpected error'} during import of "
@@ -1555,48 +1564,6 @@ class VastdbApi:
 
         return self._check_res(res, "import_data", expected_retvals)
 
-    def _record_batch_slices(self, batch, rows_per_slice=None):
-        max_slice_size_in_bytes = int(0.9 * 5 * 1024 * 1024)  # 0.9 * 5MB
-        batch_len = len(batch)
-        serialized_batch = serialize_record_batch(batch)
-        batch_size_in_bytes = len(serialized_batch)
-        _logger.debug('max_slice_size_in_bytes=%d batch_len=%d batch_size_in_bytes=%d',
-                      max_slice_size_in_bytes, batch_len, batch_size_in_bytes)
-
-        if not rows_per_slice:
-            if batch_size_in_bytes < max_slice_size_in_bytes:
-                rows_per_slice = batch_len
-            else:
-                rows_per_slice = int(0.9 * batch_len * max_slice_size_in_bytes / batch_size_in_bytes)
-
-        done_slicing = False
-        while not done_slicing:
-            # Attempt slicing according to the current rows_per_slice
-            offset = 0
-            serialized_slices = []
-            for i in range(math.ceil(batch_len / rows_per_slice)):
-                offset = rows_per_slice * i
-                if offset >= batch_len:
-                    done_slicing = True
-                    break
-                slice_batch = batch.slice(offset, rows_per_slice)
-                serialized_slice_batch = serialize_record_batch(slice_batch)
-                sizeof_serialized_slice_batch = len(serialized_slice_batch)
-
-                if sizeof_serialized_slice_batch <= max_slice_size_in_bytes:
-                    serialized_slices.append(serialized_slice_batch)
-                else:
-                    _logger.info(f'Using rows_per_slice {rows_per_slice} slice {i} size {sizeof_serialized_slice_batch} exceeds {max_slice_size_in_bytes} bytes, trying smaller rows_per_slice')
-                    # We have a slice that is too large
-                    rows_per_slice = int(rows_per_slice / 2)
-                    if rows_per_slice < 1:
-                        raise ValueError('cannot decrease batch size below 1 row')
-                    break
-            else:
-                done_slicing = True
-
-        return serialized_slices
-
     def insert_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[]):
         """
         POST /mybucket/myschema/mytable?rows HTTP/1.1
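The Predicate changes above make ibis `isin` expressions (InValues) usable as pushed-down filters: each value in the tuple is serialized as an equality comparison against the same column and the comparisons are OR-ed together, while an empty value list raises NotImplementedError. A rough equivalence sketch, assuming a Table `t` with an integer column 'a' (hypothetical names and values):

    # the two selects below push down the same OR-of-equals filter
    t.select(predicate=t['a'].isin([111, 222]))
    t.select(predicate=(t['a'] == 111) | (t['a'] == 222))

    # an empty IN-list has no representation (it would be a constant FALSE) and is rejected:
    # t.select(predicate=t['a'].isin([]))  # raises NotImplementedError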
vastdb/schema.py
CHANGED

@@ -87,4 +87,4 @@ class Schema:
 
 def _parse_table_info(table_info, schema: "schema.Schema"):
     stats = table.TableStats(num_rows=table_info.num_rows, size_in_bytes=table_info.size_in_bytes)
-    return table.Table(name=table_info.name, schema=schema, handle=int(table_info.handle), stats=stats)
+    return table.Table(name=table_info.name, schema=schema, handle=int(table_info.handle), stats=stats, _imports_table=False)
vastdb/session.py
CHANGED

@@ -11,7 +11,20 @@ import os
 
 import boto3
 
-from . import internal_commands, transaction
+from . import errors, internal_commands, transaction
+
+
+class Features:
+    """VAST database features - check if server is already support a feature."""
+
+    def __init__(self, vast_version):
+        """Save the server version."""
+        self.vast_version = vast_version
+
+    def check_imports_table(self):
+        """Check if the feature that support imports table is supported."""
+        if self.vast_version < (5, 2):
+            raise errors.NotSupportedVersion("import_table requires 5.2+", self.vast_version)
 
 
 class Session:
@@ -27,6 +40,8 @@ class Session:
             endpoint = os.environ['AWS_S3_ENDPOINT_URL']
 
         self.api = internal_commands.VastdbApi(endpoint, access, secret)
+        version_tuple = tuple(int(part) for part in self.api.vast_version.split('.'))
+        self.features = Features(version_tuple)
        self.s3 = boto3.client('s3',
                               aws_access_key_id=access,
                               aws_secret_access_key=secret,
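Session now parses the server version reported by the API into a tuple and exposes it through a Features object, which the new imports-table operations consult before issuing requests. A short sketch, assuming a `session` constructed as in the SDK tests:

    version = session.api.vast_version          # version string reported by the server
    try:
        session.features.check_imports_table()  # raises on clusters older than 5.2
    except errors.NotSupportedVersion as e:
        print("imports-table API not supported:", e)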
vastdb/table.py
CHANGED

@@ -1,3 +1,5 @@
+"""VAST Database table."""
+
 import concurrent.futures
 import logging
 import os
@@ -10,7 +12,7 @@ from typing import Dict, List, Optional, Tuple, Union
 import ibis
 import pyarrow as pa
 
-from . import errors, internal_commands, schema
+from . import errors, internal_commands, schema, util
 
 log = logging.getLogger(__name__)
 
@@ -24,6 +26,8 @@ MAX_INSERT_ROWS_PER_PATCH = 512 * 1024
 
 @dataclass
 class TableStats:
+    """Table-related information."""
+
     num_rows: int
     size_in_bytes: int
     is_external_rowid_alloc: bool = False
@@ -32,6 +36,8 @@ class TableStats:
 
 @dataclass
 class QueryConfig:
+    """Query execution configiration."""
+
     num_sub_splits: int = 4
     num_splits: int = 1
     data_endpoints: Optional[List[str]] = None
@@ -44,11 +50,16 @@ class QueryConfig:
 
 @dataclass
 class ImportConfig:
+    """Import execution configiration."""
+
     import_concurrency: int = 2
 
 
-class SelectSplitState
+class SelectSplitState:
+    """State of a specific query split execution."""
+
     def __init__(self, query_data_request, table: "Table", split_id: int, config: QueryConfig) -> None:
+        """Initialize query split state."""
         self.split_id = split_id
         self.subsplits_state = {i: 0 for i in range(config.num_sub_splits)}
         self.config = config
@@ -56,6 +67,10 @@ class SelectSplitState():
         self.table = table
 
     def batches(self, api: internal_commands.VastdbApi):
+        """Execute QueryData request, and yield parsed RecordBatch objects.
+
+        Can be called repeatedly, to allow pagination.
+        """
         while not self.done:
             response = api.query_data(
                 bucket=self.table.bucket.name,
@@ -68,7 +83,8 @@ class SelectSplitState():
                 txid=self.table.tx.txid,
                 limit_rows=self.config.limit_rows_per_sub_split,
                 sub_split_start_row_ids=self.subsplits_state.items(),
-                enable_sorted_projections=self.config.use_semi_sorted_projections
+                enable_sorted_projections=self.config.use_semi_sorted_projections,
+                query_imports_table=self.table._imports_table)
             pages_iter = internal_commands.parse_query_data_response(
                 conn=response.raw,
                 schema=self.query_data_request.response_schema,
@@ -82,19 +98,24 @@ class SelectSplitState():
 
     @property
     def done(self):
+        """Returns true iff the pagination over."""
         return all(row_id == internal_commands.TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
 
 
 @dataclass
 class Table:
+    """VAST Table."""
+
     name: str
     schema: "schema.Schema"
     handle: int
     stats: TableStats
-    arrow_schema: pa.Schema = field(init=False, compare=False)
-    _ibis_table: ibis.Schema = field(init=False, compare=False)
+    arrow_schema: pa.Schema = field(init=False, compare=False, repr=False)
+    _ibis_table: ibis.Schema = field(init=False, compare=False, repr=False)
+    _imports_table: bool
 
     def __post_init__(self):
+        """Also, load columns' metadata."""
         self.arrow_schema = self.columns()
 
         table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
@@ -102,21 +123,21 @@ class Table:
 
     @property
     def tx(self):
+        """Return transaction."""
         return self.schema.tx
 
     @property
     def bucket(self):
+        """Return bucket."""
         return self.schema.bucket
 
-    def __repr__(self):
-        return f"{type(self).__name__}(name={self.name})"
-
     def columns(self) -> pa.Schema:
+        """Return columns' metadata."""
         fields = []
         next_key = 0
         while True:
             cur_columns, next_key, is_truncated, _count = self.tx._rpc.api.list_columns(
-                bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid)
+                bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid, list_imports_table=self._imports_table)
             fields.extend(cur_columns)
             if not is_truncated:
                 break
@@ -125,6 +146,9 @@ class Table:
         return self.arrow_schema
 
     def projection(self, name: str) -> "Projection":
+        """Get a specific semi-sorted projection of this table."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         projs = self.projections(projection_name=name)
         if not projs:
             raise errors.MissingProjection(self.bucket.name, self.schema.name, self.name, name)
@@ -133,6 +157,9 @@ class Table:
         return projs[0]
 
     def projections(self, projection_name=None) -> List["Projection"]:
+        """List all semi-sorted projections of this table."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         projections = []
         next_key = 0
         name_prefix = projection_name if projection_name else ""
@@ -150,6 +177,12 @@ class Table:
         return [_parse_projection_info(projection, self) for projection in projections]
 
     def import_files(self, files_to_import: List[str], config: Optional[ImportConfig] = None) -> None:
+        """Import a list of Parquet files into this table.
+
+        The files must be on VAST S3 server and be accessible using current credentials.
+        """
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         source_files = {}
         for f in files_to_import:
             bucket_name, object_path = _parse_bucket_and_object_names(f)
@@ -158,6 +191,13 @@ class Table:
         self._execute_import(source_files, config=config)
 
     def import_partitioned_files(self, files_and_partitions: Dict[str, pa.RecordBatch], config: Optional[ImportConfig] = None) -> None:
+        """Import a list of Parquet files into this table.
+
+        The files must be on VAST S3 server and be accessible using current credentials.
+        Each file must have its own partition values defined as an Arrow RecordBatch.
+        """
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         source_files = {}
         for f, record_batch in files_and_partitions.items():
             bucket_name, object_path = _parse_bucket_and_object_names(f)
@@ -216,8 +256,10 @@ class Table:
             # ThreadPoolExecutor will be joined at the end of the context
 
     def get_stats(self) -> TableStats:
+        """Get the statistics of this table."""
         stats_tuple = self.tx._rpc.api.get_table_stats(
-            bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid
+            bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid,
+            imports_table_stats=self._imports_table)
         return TableStats(**stats_tuple._asdict())
 
     def select(self, columns: Optional[List[str]] = None,
@@ -225,6 +267,14 @@ class Table:
                config: Optional[QueryConfig] = None,
                *,
                internal_row_id: bool = False) -> pa.RecordBatchReader:
+        """Execute a query over this table.
+
+        To read a subset of the columns, specify their names via `columns` argument. Otherwise, all columns will be read.
+
+        In order to apply a filter, a predicate can be specified. See https://github.com/vast-data/vastdb_sdk/blob/main/README.md#filters-and-projections for more details.
+
+        Query-execution configuration options can be specified via the optional `config` argument.
+        """
         if config is None:
             config = QueryConfig()
 
@@ -335,82 +385,129 @@ class Table:
 
         return pa.RecordBatchReader.from_batches(query_data_request.response_schema, batches_iterator())
 
-    def _combine_chunks(self, col):
-        if hasattr(col, "combine_chunks"):
-            return col.combine_chunks()
-        else:
-            return col
-
     def insert(self, rows: pa.RecordBatch) -> pa.RecordBatch:
-
+        """Insert a RecordBatch into this table."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
+        serialized_slices = util.iter_serialized_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
         for slice in serialized_slices:
             self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
                                          txid=self.tx.txid)
 
     def update(self, rows: Union[pa.RecordBatch, pa.Table], columns: Optional[List[str]] = None) -> None:
+        """Update a subset of cells in this table.
+
+        Row IDs are specified using a special field (named "$row_id" of uint64 type).
+
+        A subset of columns to be updated can be specified via the `columns` argument.
+        """
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         if columns is not None:
             update_fields = [(INTERNAL_ROW_ID, pa.uint64())]
-            update_values = [
+            update_values = [_combine_chunks(rows[INTERNAL_ROW_ID])]
             for col in columns:
                 update_fields.append(rows.field(col))
-                update_values.append(
+                update_values.append(_combine_chunks(rows[col]))
 
             update_rows_rb = pa.record_batch(schema=pa.schema(update_fields), data=update_values)
         else:
             update_rows_rb = rows
 
-        serialized_slices =
+        serialized_slices = util.iter_serialized_slices(update_rows_rb, MAX_ROWS_PER_BATCH)
         for slice in serialized_slices:
             self.tx._rpc.api.update_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
                                          txid=self.tx.txid)
 
     def delete(self, rows: Union[pa.RecordBatch, pa.Table]) -> None:
+        """Delete a subset of rows in this table.
+
+        Row IDs are specified using a special field (named "$row_id" of uint64 type).
+        """
         delete_rows_rb = pa.record_batch(schema=pa.schema([(INTERNAL_ROW_ID, pa.uint64())]),
-                                         data=[
+                                         data=[_combine_chunks(rows[INTERNAL_ROW_ID])])
 
-        serialized_slices =
+        serialized_slices = util.iter_serialized_slices(delete_rows_rb, MAX_ROWS_PER_BATCH)
         for slice in serialized_slices:
             self.tx._rpc.api.delete_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
-                                         txid=self.tx.txid)
+                                         txid=self.tx.txid, delete_from_imports_table=self._imports_table)
 
     def drop(self) -> None:
-
+        """Drop this table."""
+        self.tx._rpc.api.drop_table(self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, remove_imports_table=self._imports_table)
         log.info("Dropped table: %s", self.name)
 
     def rename(self, new_name) -> None:
+        """Rename this table."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         self.tx._rpc.api.alter_table(
             self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, new_name=new_name)
         log.info("Renamed table from %s to %s ", self.name, new_name)
         self.name = new_name
 
     def add_column(self, new_column: pa.Schema) -> None:
+        """Add a new column."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
         log.info("Added column(s): %s", new_column)
         self.arrow_schema = self.columns()
 
     def drop_column(self, column_to_drop: pa.Schema) -> None:
+        """Drop an existing column."""
+        if self._imports_table:
+            raise errors.NotSupported(self.bucket.name, self.schema.name, self.name)
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         self.tx._rpc.api.drop_columns(self.bucket.name, self.schema.name, self.name, column_to_drop, txid=self.tx.txid)
         log.info("Dropped column(s): %s", column_to_drop)
         self.arrow_schema = self.columns()
 
     def rename_column(self, current_column_name: str, new_column_name: str) -> None:
+        """Rename an existing column."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         self.tx._rpc.api.alter_column(self.bucket.name, self.schema.name, self.name, name=current_column_name,
                                       new_name=new_column_name, txid=self.tx.txid)
         log.info("Renamed column: %s to %s", current_column_name, new_column_name)
         self.arrow_schema = self.columns()
 
     def create_projection(self, projection_name: str, sorted_columns: List[str], unsorted_columns: List[str]) -> "Projection":
+        """Create a new semi-sorted projection."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         columns = [(sorted_column, "Sorted") for sorted_column in sorted_columns] + [(unsorted_column, "Unorted") for unsorted_column in unsorted_columns]
         self.tx._rpc.api.create_projection(self.bucket.name, self.schema.name, self.name, projection_name, columns=columns, txid=self.tx.txid)
         log.info("Created projection: %s", projection_name)
         return self.projection(projection_name)
 
+    def create_imports_table(self, fail_if_exists=True) -> "Table":
+        """Create imports table."""
+        self.tx._rpc.features.check_imports_table()
+        empty_schema = pa.schema([])
+        self.tx._rpc.api.create_table(self.bucket.name, self.schema.name, self.name, empty_schema, txid=self.tx.txid,
+                                      create_imports_table=True)
+        log.info("Created imports table for table: %s", self.name)
+        return self.imports_table()  # type: ignore[return-value]
+
+    def imports_table(self) -> Optional["Table"]:
+        """Get the imports table under of this table."""
+        self.tx._rpc.features.check_imports_table()
+        return Table(name=self.name, schema=self.schema, handle=int(self.handle), stats=self.stats, _imports_table=True)
+
     def __getitem__(self, col_name):
+        """Allow constructing ibis-like column expressions from this table.
+
+        It is useful for constructing expressions for predicate pushdown in `Table.select()` method.
+        """
         return self._ibis_table[col_name]
 
 
 @dataclass
 class Projection:
+    """VAST semi-sorted projection."""
+
     name: str
     table: Table
     handle: int
@@ -418,20 +515,21 @@ class Projection:
 
     @property
     def bucket(self):
+        """Return bucket."""
         return self.table.schema.bucket
 
     @property
     def schema(self):
+        """Return schema."""
         return self.table.schema
 
     @property
     def tx(self):
+        """Return transaction."""
         return self.table.schema.tx
 
-    def __repr__(self):
-        return f"{type(self).__name__}(name={self.name})"
-
     def columns(self) -> pa.Schema:
+        """Return this projections' columns as an Arrow schema."""
         columns = []
         next_key = 0
         while True:
@@ -447,12 +545,14 @@ class Projection:
         return self.arrow_schema
 
     def rename(self, new_name) -> None:
+        """Rename this projection."""
         self.tx._rpc.api.alter_projection(self.bucket.name, self.schema.name,
                                           self.table.name, self.name, txid=self.tx.txid, new_name=new_name)
         log.info("Renamed projection from %s to %s ", self.name, new_name)
         self.name = new_name
 
     def drop(self) -> None:
+        """Drop this projection."""
         self.tx._rpc.api.drop_projection(self.bucket.name, self.schema.name, self.table.name,
                                          self.name, txid=self.tx.txid)
         log.info("Dropped projection: %s", self.name)
@@ -478,3 +578,10 @@ def _serialize_record_batch(record_batch: pa.RecordBatch) -> pa.lib.Buffer:
     with pa.ipc.new_stream(sink, record_batch.schema) as writer:
         writer.write(record_batch)
     return sink.getvalue()
+
+
+def _combine_chunks(col):
+    if hasattr(col, "combine_chunks"):
+        return col.combine_chunks()
+    else:
+        return col
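The table.py changes wire the new imports-table support through most operations. A hedged usage sketch (bucket, schema, table and object names are hypothetical; the access pattern follows the SDK README, and a 5.2+ cluster is required or Features.check_imports_table raises NotSupportedVersion):

    import pyarrow as pa

    with session.transaction() as tx:
        t = tx.bucket("mybucket").schema("s1").table("t1")
        t.import_files(["/mybucket/prq0", "/mybucket/prq1"])   # Parquet objects on the VAST S3 server
        imports = t.imports_table()                            # Table view of the per-object import records
        names = pa.Table.from_batches(imports.select(columns=["ObjectName"]))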
vastdb/tests/test_imports.py
CHANGED

@@ -6,7 +6,7 @@ import pyarrow.parquet as pq
 import pytest
 
 from vastdb import util
-from vastdb.errors import ImportFilesError, InvalidArgument
+from vastdb.errors import ImportFilesError, InternalServerError, InvalidArgument
 
 log = logging.getLogger(__name__)
 
@@ -34,12 +34,24 @@ def test_parallel_imports(session, clean_bucket_name, s3):
         b = tx.bucket(clean_bucket_name)
         s = b.create_schema('s1')
         t = s.create_table('t1', pa.schema([('num', pa.int64())]))
+        with pytest.raises(InternalServerError):
+            t.create_imports_table()
         log.info("Starting import of %d files", num_files)
         t.import_files(files)
         arrow_table = pa.Table.from_batches(t.select(columns=['num']))
         assert arrow_table.num_rows == num_rows * num_files
         arrow_table = pa.Table.from_batches(t.select(columns=['num'], predicate=t['num'] == 100))
         assert arrow_table.num_rows == num_files
+        import_table = t.imports_table()
+        # checking all imports are on the imports table:
+        objects_name = pa.Table.from_batches(import_table.select(columns=["ObjectName"]))
+        objects_name = objects_name.to_pydict()
+        object_names = set(objects_name['ObjectName'])
+        prefix = 'prq'
+        numbers = set(range(53))
+        assert all(name.startswith(prefix) for name in object_names)
+        numbers.issubset(int(name.replace(prefix, '')) for name in object_names)
+        assert len(object_names) == len(objects_name['ObjectName'])
 
 
 def test_create_table_from_files(session, clean_bucket_name, s3):
vastdb/tests/test_schemas.py
CHANGED

@@ -60,5 +60,4 @@ def test_commits_and_rollbacks(session, clean_bucket_name):
 def test_list_snapshots(session, clean_bucket_name):
     with session.transaction() as tx:
         b = tx.bucket(clean_bucket_name)
-
-        assert s == []
+        b.snapshots()  # VAST Catalog may create some snapshots
vastdb/tests/test_tables.py
CHANGED

@@ -3,6 +3,7 @@ import decimal
 import logging
 import random
 import threading
+import time
 from contextlib import closing
 from tempfile import NamedTemporaryFile
 
@@ -261,7 +262,7 @@ def test_filters(session, clean_bucket_name):
 
     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
         def select(predicate):
-            return pa.Table.from_batches(t.select(predicate=predicate))
+            return pa.Table.from_batches(t.select(predicate=predicate), t.arrow_schema)
 
         assert select(None) == expected
 
@@ -304,6 +305,13 @@ def test_filters(session, clean_bucket_name):
         assert select(t['s'].contains('b')) == expected.filter(pc.field('s') == 'bb')
         assert select(t['s'].contains('y')) == expected.filter(pc.field('s') == 'xyz')
 
+        assert select(t['a'].isin([555])) == expected.filter(pc.field('a').isin([555]))
+        assert select(t['a'].isin([111, 222, 999])) == expected.filter(pc.field('a').isin([111, 222, 999]))
+        assert select((t['a'] == 111) | t['a'].isin([333, 444]) | (t['a'] > 600)) == expected.filter((pc.field('a') == 111) | pc.field('a').isin([333, 444]) | (pc.field('a') > 600))
+
+        with pytest.raises(NotImplementedError):
+            select(t['a'].isin([]))
+
 
 def test_parquet_export(session, clean_bucket_name):
     with session.transaction() as tx:
@@ -638,3 +646,20 @@ def test_select_stop(session, clean_bucket_name):
 
     # validate that all query threads were killed.
     assert active_threads() == 0
+
+
+def test_big_catalog_select(session, clean_bucket_name):
+    with session.transaction() as tx:
+        bc = tx.catalog()
+        actual = pa.Table.from_batches(bc.select(['name']))
+        assert actual
+        log.info("actual=%s", actual)
+
+
+def test_audit_log_select(session, clean_bucket_name):
+    with session.transaction() as tx:
+        a = tx.audit_log()
+        a.columns()
+        time.sleep(1)
+        actual = pa.Table.from_batches(a.select(), a.arrow_schema)
+        log.info("actual=%s", actual)
vastdb/tests/test_util.py
ADDED

@@ -0,0 +1,39 @@
+import pyarrow as pa
+import pytest
+
+from .. import errors, util
+
+
+def test_slices():
+    ROWS = 1 << 20
+    t = pa.table({"x": range(ROWS), "y": [i / 1000 for i in range(ROWS)]})
+
+    chunks = list(util.iter_serialized_slices(t))
+    assert len(chunks) > 1
+    sizes = [len(c) for c in chunks]
+
+    assert max(sizes) < util.MAX_RECORD_BATCH_SLICE_SIZE
+    assert t == pa.Table.from_batches(_parse(chunks))
+
+    chunks = list(util.iter_serialized_slices(t, 1000))
+    assert len(chunks) > 1
+    sizes = [len(c) for c in chunks]
+
+    assert max(sizes) < util.MAX_RECORD_BATCH_SLICE_SIZE
+    assert t == pa.Table.from_batches(_parse(chunks))
+
+
+def test_wide_row():
+    cols = [pa.field(f"x{i}", pa.utf8()) for i in range(1000)]
+    values = [['a' * 10000]] * len(cols)
+    t = pa.table(values, schema=pa.schema(cols))
+    assert len(t) == 1
+
+    with pytest.raises(errors.TooWideRow):
+        list(util.iter_serialized_slices(t))
+
+
+def _parse(bufs):
+    for buf in bufs:
+        with pa.ipc.open_stream(buf) as reader:
+            yield from reader
vastdb/transaction.py
CHANGED

@@ -16,6 +16,14 @@ from . import bucket, errors, schema, session, table
 
 log = logging.getLogger(__name__)
 
+TABULAR_BC_BUCKET = "vast-big-catalog-bucket"
+VAST_CATALOG_SCHEMA_NAME = 'vast_big_catalog_schema'
+VAST_CATALOG_TABLE_NAME = 'vast_big_catalog_table'
+
+TABULAR_AUDERY_BUCKET = "vast-audit-log-bucket"
+AUDERY_SCHEMA_NAME = 'vast_audit_log_schema'
+AUDERY_TABLE_NAME = 'vast_audit_log_table'
+
 
 @dataclass
 class Transaction:
@@ -44,6 +52,8 @@ class Transaction:
 
     def __repr__(self):
         """Don't show the session details."""
+        if self.txid is None:
+            return 'InvalidTransaction'
         return f'Transaction(id=0x{self.txid:016x})'
 
     def bucket(self, name: str) -> "bucket.Bucket":
@@ -59,6 +69,12 @@ class Transaction:
 
     def catalog(self, fail_if_missing=True) -> Optional["table.Table"]:
         """Return VAST Catalog table."""
-        b = bucket.Bucket(
-        s = schema.Schema(
-        return s.table(name=
+        b = bucket.Bucket(TABULAR_BC_BUCKET, self)
+        s = schema.Schema(VAST_CATALOG_SCHEMA_NAME, b)
+        return s.table(name=VAST_CATALOG_TABLE_NAME, fail_if_missing=fail_if_missing)
+
+    def audit_log(self, fail_if_missing=True) -> Optional["table.Table"]:
+        """Return VAST AuditLog table."""
+        b = bucket.Bucket(TABULAR_AUDERY_BUCKET, self)
+        s = schema.Schema(AUDERY_SCHEMA_NAME, b)
+        return s.table(name=AUDERY_TABLE_NAME, fail_if_missing=fail_if_missing)
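Both helpers return ordinary Table objects backed by reserved buckets on the cluster (vast-big-catalog-bucket and vast-audit-log-bucket). A short sketch, assuming a configured `session`; the audit-log table may not exist, so it is fetched with fail_if_missing=False:

    import pyarrow as pa

    with session.transaction() as tx:
        catalog = tx.catalog()
        names = pa.Table.from_batches(catalog.select(['name']))

        audit = tx.audit_log(fail_if_missing=False)   # None when the audit-log table is absent
        if audit is not None:
            print(audit.columns())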
vastdb/util.py
CHANGED

@@ -1,20 +1,22 @@
 import logging
-from typing import Callable, List, Optional
+from typing import TYPE_CHECKING, Callable, List, Optional, Union
 
 import pyarrow as pa
 import pyarrow.parquet as pq
 
-from .errors import InvalidArgument
-from .schema import Schema
-from .table import ImportConfig, Table
+from .errors import InvalidArgument, TooWideRow
 
 log = logging.getLogger(__name__)
 
+if TYPE_CHECKING:
+    from .schema import Schema
+    from .table import ImportConfig, Table
+
 
 def create_table_from_files(
-        schema: Schema, table_name: str, parquet_files: List[str],
+        schema: "Schema", table_name: str, parquet_files: List[str],
         schema_merge_func: Optional[Callable] = None,
-        config: Optional[ImportConfig] = None) -> Table:
+        config: Optional["ImportConfig"] = None) -> "Table":
     if not schema_merge_func:
         schema_merge_func = default_schema_merge
     else:
@@ -77,3 +79,36 @@ def union_schema_merge(current_schema: pa.Schema, new_schema: pa.Schema) -> pa.S
     This function returns a unified schema from potentially two different schemas.
     """
     return pa.unify_schemas([current_schema, new_schema])
+
+
+MAX_TABULAR_REQUEST_SIZE = 5 << 20  # in bytes
+MAX_RECORD_BATCH_SLICE_SIZE = int(0.9 * MAX_TABULAR_REQUEST_SIZE)
+
+
+def iter_serialized_slices(batch: Union[pa.RecordBatch, pa.Table], max_rows_per_slice=None):
+    """Iterate over a list of record batch slices."""
+
+    rows_per_slice = int(0.9 * len(batch) * MAX_RECORD_BATCH_SLICE_SIZE / batch.nbytes)
+    if max_rows_per_slice is not None:
+        rows_per_slice = min(rows_per_slice, max_rows_per_slice)
+
+    offset = 0
+    while offset < len(batch):
+        if rows_per_slice < 1:
+            raise TooWideRow(batch)
+
+        batch_slice = batch.slice(offset, rows_per_slice)
+        serialized_slice_batch = serialize_record_batch(batch_slice)
+        if len(serialized_slice_batch) <= MAX_RECORD_BATCH_SLICE_SIZE:
+            yield serialized_slice_batch
+            offset += rows_per_slice
+        else:
+            rows_per_slice = rows_per_slice // 2
+
+
+def serialize_record_batch(batch: Union[pa.RecordBatch, pa.Table]):
+    """Serialize a RecordBatch using Arrow IPC format."""
+    sink = pa.BufferOutputStream()
+    with pa.ipc.new_stream(sink, batch.schema) as writer:
+        writer.write(batch)
+    return sink.getvalue()
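iter_serialized_slices takes over from the removed VastdbApi._record_batch_slices: it lazily yields Arrow IPC buffers that each stay under MAX_RECORD_BATCH_SLICE_SIZE (90% of the 5 MiB request limit), halving the row count whenever a slice comes out too large and raising TooWideRow once a single row cannot fit. A small sketch:

    import pyarrow as pa
    from vastdb import errors, util

    t = pa.table({"x": range(1_000_000)})
    try:
        for chunk in util.iter_serialized_slices(t):   # optional 2nd argument caps rows per slice
            assert len(chunk) < util.MAX_RECORD_BATCH_SLICE_SIZE
    except errors.TooWideRow:
        pass  # a single row larger than the limit cannot be sliced any further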
{vastdb-0.1.2.dist-info → vastdb-0.1.3.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vastdb
-Version: 0.1.2
+Version: 0.1.3
 Summary: VAST Data SDK
 Home-page: https://github.com/vast-data/vastdb_sdk
 Author: VAST DATA
@@ -21,7 +21,7 @@ License-File: LICENSE
 Requires-Dist: aws-requests-auth
 Requires-Dist: boto3
 Requires-Dist: flatbuffers
-Requires-Dist: ibis-framework
+Requires-Dist: ibis-framework ==8.0.0
 Requires-Dist: pyarrow
 Requires-Dist: requests
 Requires-Dist: xmltodict
{vastdb-0.1.2.dist-info → vastdb-0.1.3.dist-info}/RECORD
CHANGED

@@ -149,28 +149,29 @@ vast_flatbuf/tabular/S3File.py,sha256=KC9c2oS5-JXwTTriUVFdjOvRG0B54Cq9kviSDZY3NI
 vast_flatbuf/tabular/VipRange.py,sha256=_BJd1RRZAcK76T9vlsHzXKYVsPVaz6WTEAqStMQCAUQ,2069
 vast_flatbuf/tabular/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/__init__.py,sha256=cMJtZuJ0IL9aKyM3DUWqTCzuP1H1MXXVivKKE1-q0DY,292
-vastdb/bucket.py,sha256=
+vastdb/bucket.py,sha256=4rPEm9qlPTg7ccWO6VGmd4LKb8w-BDhJYwzXGjn03sc,3566
 vastdb/conftest.py,sha256=pKpo_46Vq4QHzTDQAFxasrVhnZ2V2L-y6IMLxojxaFM,2132
-vastdb/errors.py,sha256=
-vastdb/internal_commands.py,sha256=
-vastdb/schema.py,sha256=
-vastdb/session.py,sha256=
-vastdb/table.py,sha256=
-vastdb/transaction.py,sha256=
-vastdb/util.py,sha256=
+vastdb/errors.py,sha256=vKWoq1yXrHyafMWwJgW_sQkSxQYxlI1JbTVCLz5Xi9Y,3793
+vastdb/internal_commands.py,sha256=ZD2YXYvZ3lJWYzZU0oHtv8G3lNtDQUF0e8yg8813Xt4,99575
+vastdb/schema.py,sha256=ql4TPB1W_FQ_BHov3CKHI8JX3krXMlcKWz7dTrjpQ1w,3346
+vastdb/session.py,sha256=ciYS8Je2cRpuaAEE6Wjk79VsW0KAPdnRB2cqfxFCjis,2323
+vastdb/table.py,sha256=xnSTWUUa0QHzXC5MUQWsGT1fsG8yAgMLy3nrgSH4j5Q,25661
+vastdb/transaction.py,sha256=g8YTcYnsNPIhB2udbHyT5RIFB5kHnBLJcvV2CWRICwI,2845
+vastdb/util.py,sha256=rs7nLL2Qz-OVEZDSVIqAvS-uETMq-zxQs5jBksB5-JA,4276
 vastdb/bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/bench/test_perf.py,sha256=iHE3E60fvyU5SBDHPi4h03Dj6QcY6VI9l9mMhgNMtPc,1117
 vastdb/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/tests/test_duckdb.py,sha256=KDuv4PrjGEwChCGHG36xNT2JiFlBOt6K3DQ3L06Kq-A,1913
-vastdb/tests/test_imports.py,sha256=
+vastdb/tests/test_imports.py,sha256=48kbJKsa_MrEXcBYQUbUDr1e9wzjG4FHQ7C3wUEQfXA,5705
 vastdb/tests/test_nested.py,sha256=3kejEvtSqV0LrUgb1QglRjrlxnKI4_AXTFw2nE7Q520,951
 vastdb/tests/test_projections.py,sha256=_cDNfD5zTwbCXLk6uGpPUWGN0P-4HElu5OjubWu-Jg0,1255
 vastdb/tests/test_sanity.py,sha256=ixx0QPo73hLHjAa7bByFXjS1XST0WvmSwLEpgnHh_JY,2960
-vastdb/tests/test_schemas.py,sha256=
-vastdb/tests/test_tables.py,sha256=
+vastdb/tests/test_schemas.py,sha256=qoHTLX51D-0S4bMxdCpRh9gaYQd-BkZdT_agGOwFwTM,1739
+vastdb/tests/test_tables.py,sha256=joeEQ30TwKBQc-2N_qGIdviZVnQr4rs6thlNsy5s_og,26672
+vastdb/tests/test_util.py,sha256=owRAU3TCKMq-kz54NRdA5wX2O_bZIHqG5ucUR77jm5k,1046
 vastdb/tests/util.py,sha256=NaCzKymEGy1xuiyMxyt2_0frKVfVk9iGrFwLf3GHjTI,435
-vastdb-0.1.
-vastdb-0.1.
-vastdb-0.1.
-vastdb-0.1.
-vastdb-0.1.
+vastdb-0.1.3.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
+vastdb-0.1.3.dist-info/METADATA,sha256=3h3JttUxw9oMMsxV_CVG_LMYwhgegsS9-b4gZkihrM0,1319
+vastdb-0.1.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+vastdb-0.1.3.dist-info/top_level.txt,sha256=Vsj2MKtlhPg0J4so64slQtnwjhgoPmJgcG-6YcVAwVc,20
+vastdb-0.1.3.dist-info/RECORD,,
{vastdb-0.1.2.dist-info → vastdb-0.1.3.dist-info}/LICENSE
File without changes

{vastdb-0.1.2.dist-info → vastdb-0.1.3.dist-info}/WHEEL
File without changes

{vastdb-0.1.2.dist-info → vastdb-0.1.3.dist-info}/top_level.txt
File without changes