vastdb 0.0.5.2__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
- vast_flatbuf/tabular/VipRange.py +56 -0
- vastdb/__init__.py +7 -0
- vastdb/bucket.py +77 -0
- vastdb/errors.py +158 -0
- vastdb/{api.py → internal_commands.py} +283 -747
- vastdb/schema.py +77 -0
- vastdb/session.py +48 -0
- vastdb/table.py +480 -0
- vastdb/tests/conftest.py +46 -0
- vastdb/tests/test_imports.py +125 -0
- vastdb/tests/test_projections.py +41 -0
- vastdb/tests/test_sanity.py +83 -0
- vastdb/tests/test_schemas.py +45 -0
- vastdb/tests/test_tables.py +608 -0
- vastdb/transaction.py +55 -0
- vastdb/util.py +77 -0
- vastdb-0.1.0.dist-info/METADATA +38 -0
- {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/RECORD +23 -24
- vast_protobuf/substrait/__init__.py +0 -0
- vast_protobuf/substrait/algebra_pb2.py +0 -1344
- vast_protobuf/substrait/capabilities_pb2.py +0 -46
- vast_protobuf/substrait/ddl_pb2.py +0 -57
- vast_protobuf/substrait/extended_expression_pb2.py +0 -49
- vast_protobuf/substrait/extensions/__init__.py +0 -0
- vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
- vast_protobuf/substrait/function_pb2.py +0 -168
- vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
- vast_protobuf/substrait/plan_pb2.py +0 -67
- vast_protobuf/substrait/type_expressions_pb2.py +0 -198
- vast_protobuf/substrait/type_pb2.py +0 -350
- vast_protobuf/tabular/__init__.py +0 -0
- vast_protobuf/tabular/rpc_pb2.py +0 -344
- vastdb/v2.py +0 -108
- vastdb-0.0.5.2.dist-info/METADATA +0 -47
- {vast_protobuf → vastdb/tests}/__init__.py +0 -0
- {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/LICENSE +0 -0
- {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/WHEEL +0 -0
- {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/top_level.txt +0 -0
vastdb/{api.py → internal_commands.py}

@@ -1,29 +1,26 @@
-import array
 import logging
 import struct
 import urllib.parse
 from collections import defaultdict, namedtuple
 from datetime import datetime
 from enum import Enum
-from typing import
+from typing import Union, Optional, Iterator
+import ibis
 import xmltodict
-import concurrent.futures
-import threading
-import queue
 import math
-import socket
 from functools import cmp_to_key
 import pyarrow.parquet as pq
 import flatbuffers
 import pyarrow as pa
 import requests
-import datetime
-import hashlib
-import hmac
 import json
 import itertools
 from aws_requests_auth.aws_auth import AWSRequestsAuth
-
+import urllib3
+import re
+
+from . import errors
+from ipaddress import IPv4Address, IPv6Address
 
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BinaryLiteral as fb_binary_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BooleanLiteral as fb_bool_lit
@@ -91,30 +88,22 @@ TABULAR_QUERY_DATA_COMPLETED_STREAM_ID = 0xFFFFFFFF - 1
 TABULAR_QUERY_DATA_FAILED_STREAM_ID = 0xFFFFFFFF - 2
 TABULAR_INVALID_ROW_ID = 0xFFFFFFFFFFFF # (1<<48)-1
 ESTORE_INVALID_EHANDLE = UINT64_MAX
+IMPORTED_OBJECTS_TABLE_NAME = "vastdb-imported-objects"
 
 """
 S3 Tabular API
 """
 
 
-
-    log = logging.getLogger(name)
-    log.setLevel(logging.ERROR)
-    ch = logging.StreamHandler()
-    ch.setLevel(logging.INFO)
-    ch.set_name('tabular_stream_handler')
-    formatter = logging.Formatter("%(asctime)s:%(levelname)s:%(message)s")
-    ch.setFormatter(formatter)
-    log.addHandler(ch)
-    log.propagate = False
-    return log
-
-
-_logger = get_logger(__name__)
+_logger = logging.getLogger(__name__)
 
 
-def
-
+def _flatten_args(op, op_type):
+    if isinstance(op, op_type):
+        for arg in op.args:
+            yield from _flatten_args(arg, op_type)
+    else:
+        yield op
 
 
 class AuthType(Enum):
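The new `_flatten_args` generator is what lets `Predicate.serialize` (further down in this diff) walk an ibis boolean expression as a flat list of AND operands and, one level down, OR operands. A minimal sketch of its behaviour; the table `t` and its columns are invented for illustration:

```python
import ibis
from ibis.expr.operations.logical import And

def _flatten_args(op, op_type):
    # copied from the hunk above: recursively unnest nodes of one boolean op type
    if isinstance(op, op_type):
        for arg in op.args:
            yield from _flatten_args(arg, op_type)
    else:
        yield op

t = ibis.table({"a": "int64", "b": "string"}, name="t")
expr = (t.a > 1) & (t.a < 10) & t.b.contains("x")

print([type(op).__name__ for op in _flatten_args(expr.op(), And)])
# ['Greater', 'Less', 'StringContains']
```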
@@ -123,10 +112,6 @@ class AuthType(Enum):
     BASIC = "basic"
 
 
-class TabularException(Exception):
-    pass
-
-
 def get_unit_to_flatbuff_time_unit(type):
     unit_to_flatbuff_time_unit = {
         'ns': TimeUnit.NANOSECOND,
@@ -144,11 +129,10 @@ class Predicate:
         's': 0.001
     }
 
-    def __init__(self, schema: 'pa.Schema',
+    def __init__(self, schema: 'pa.Schema', expr: ibis.expr.types.BooleanColumn):
         self.schema = schema
-        self.
+        self.expr = expr
         self.builder = None
-        self._field_name_per_index = None
 
     def get_field_indexes(self, field: 'pa.Field', field_name_per_index: list) -> None:
         field_name_per_index.append(field.name)
@@ -172,7 +156,6 @@ class Predicate:
         for field in self.schema:
             self.get_field_indexes(field, _field_name_per_index)
         self._field_name_per_index = {field: index for index, field in enumerate(_field_name_per_index)}
-        _logger.debug(f'field_name_per_index: {self._field_name_per_index}')
         return self._field_name_per_index
 
     def get_projections(self, builder: 'flatbuffers.builder.Builder', field_names: list = None):
@@ -190,10 +173,77 @@ class Predicate:
         return builder.EndVector()
 
     def serialize(self, builder: 'flatbuffers.builder.Builder'):
+        from ibis.expr.operations.generic import TableColumn, Literal, IsNull
+        from ibis.expr.operations.logical import Greater, GreaterEqual, Less, LessEqual, Equals, NotEquals, And, Or, Not
+        from ibis.expr.operations.strings import StringContains
+
+        builder_map = {
+            Greater: self.build_greater,
+            GreaterEqual: self.build_greater_equal,
+            Less: self.build_less,
+            LessEqual: self.build_less_equal,
+            Equals: self.build_equal,
+            NotEquals: self.build_not_equal,
+            IsNull: self.build_is_null,
+            Not: self.build_is_not_null,
+            StringContains: self.build_match_substring,
+        }
+
+        positions_map = dict((f.name, index) for index, f in enumerate(self.schema)) # TODO: BFS
+
         self.builder = builder
+
         offsets = []
-
-
+
+        if self.expr is not None:
+            and_args = list(_flatten_args(self.expr.op(), And))
+            _logger.debug('AND args: %s ops %s', and_args, self.expr.op())
+            for op in and_args:
+                or_args = list(_flatten_args(op, Or))
+                _logger.debug('OR args: %s op %s', or_args, op)
+                inner_offsets = []
+
+                prev_field_name = None
+                for inner_op in or_args:
+                    _logger.debug('inner_op %s', inner_op)
+                    builder_func = builder_map.get(type(inner_op))
+                    if not builder_func:
+                        raise NotImplementedError(inner_op.name)
+
+                    if builder_func == self.build_is_null:
+                        column, = inner_op.args
+                        literal = None
+                    elif builder_func == self.build_is_not_null:
+                        not_arg, = inner_op.args
+                        # currently we only support not is_null, checking we really got is_null under the not:
+                        if not builder_map.get(type(not_arg)) == self.build_is_null:
+                            raise NotImplementedError(not_arg.args[0].name)
+                        column, = not_arg.args
+                        literal = None
+                    else:
+                        column, literal = inner_op.args
+                        if not isinstance(literal, Literal):
+                            raise NotImplementedError(inner_op.name)
+
+                    if not isinstance(column, TableColumn):
+                        raise NotImplementedError(inner_op.name)
+
+                    field_name = column.name
+                    if prev_field_name is None:
+                        prev_field_name = field_name
+                    elif prev_field_name != field_name:
+                        raise NotImplementedError(op.name)
+
+                    args_offsets = [self.build_column(position=positions_map[field_name])]
+                    if literal:
+                        field = self.schema.field(field_name)
+                        args_offsets.append(self.build_literal(field=field, value=literal.value))
+
+                    inner_offsets.append(builder_func(*args_offsets))
+
+                domain_offset = self.build_or(inner_offsets)
+                offsets.append(domain_offset)
+
         return self.build_and(offsets)
 
     def build_column(self, position: int):
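As the dispatch table above shows, only a narrow expression shape is accepted: an AND of OR-groups, where every OR-group touches a single column and compares it against literals (or checks for NULL). A hedged illustration of what passes and what raises `NotImplementedError`; the table and column names are invented:

```python
import ibis
import pyarrow as pa

schema = pa.schema([("a", pa.int64()), ("b", pa.utf8())])
t = ibis.table({"a": "int64", "b": "string"}, name="t")

supported = ((t.a > 3) | (t.a == 0)) & t.b.contains("foo")
# Greater/Equals on column 'a' form one OR-group; StringContains on 'b' forms another,
# so Predicate(schema, supported).serialize(builder) dispatches through builder_map.

unsupported = (t.a > 3) | (t.b == "x")
# Two different columns inside one OR-group: serialize() hits the
# `prev_field_name != field_name` check and raises NotImplementedError.
```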
@@ -221,7 +271,6 @@ class Predicate:
         field = self.schema.field(field_name)
         for attr in field_attrs:
             field = field.type[attr]
-        _logger.info(f'trying to append field: {field} with domains: {filters}')
         for filter_by_name in filters:
             offsets.append(self.build_range(column=column, field=field, filter_by_name=filter_by_name))
         return self.build_or(offsets)
@@ -263,11 +312,9 @@ class Predicate:
         return self.build_and(rules)
 
     def build_function(self, name: str, *offsets):
-        _logger.info(f'name: {name}, offsets: {offsets}')
         offset_name = self.builder.CreateString(name)
         fb_call.StartArgumentsVector(self.builder, len(offsets))
         for offset in reversed(offsets):
-            _logger.info(f'offset: {offset}')
             self.builder.PrependUOffsetTRelative(offset)
         offset_arguments = self.builder.EndVector()
 
@@ -282,7 +329,7 @@ class Predicate:
         fb_expression.AddImpl(self.builder, offset_call)
         return fb_expression.End(self.builder)
 
-    def build_literal(self, field: pa.Field, value
+    def build_literal(self, field: pa.Field, value):
         if field.type.equals(pa.int64()):
             literal_type = fb_int64_lit
             literal_impl = LiteralImpl.Int64Literal
@@ -366,8 +413,7 @@ class Predicate:
             field_type = fb_date.End(self.builder)
 
             start_date = datetime.fromtimestamp(0).date()
-
-            date_delta = date_value - start_date
+            date_delta = value - start_date
             value = date_delta.days
         elif isinstance(field.type, pa.TimestampType):
             literal_type = fb_timestamp_lit
@@ -426,7 +472,7 @@ class Predicate:
             fb_binary.Start(self.builder)
             field_type = fb_binary.End(self.builder)
 
-            value = self.builder.CreateByteVector(value
+            value = self.builder.CreateByteVector(value)
         else:
             raise ValueError(f'unsupported predicate for type={field.type}, value={value}')
 
@@ -459,6 +505,9 @@ class Predicate:
     def build_equal(self, column: int, literal: int):
         return self.build_function('equal', column, literal)
 
+    def build_not_equal(self, column: int, literal: int):
+        return self.build_function('not_equal', column, literal)
+
     def build_greater(self, column: int, literal: int):
         return self.build_function('greater', column, literal)
 
@@ -477,6 +526,9 @@ class Predicate:
     def build_is_not_null(self, column: int):
         return self.build_function('is_valid', column)
 
+    def build_match_substring(self, column: int, literal: int):
+        return self.build_function('match_substring', column, literal)
+
 
 class FieldNode:
     """Helper class for representing nested Arrow fields and handling QueryData requests"""
@@ -574,9 +626,8 @@ class FieldNode:
     def build(self) -> pa.Array:
         """Construct an Arrow array from the collected buffers (recursively)."""
         children = self.children and [node.build() for node in self.children if node.is_projected]
-        _logger.debug(
-
-            f'self.buffers={self.buffers} children={children}')
+        _logger.debug('build: self.field.name=%s, self.projected_field.type=%s, self.length=%s, self.buffers=%s children=%s',
+                      self.field.name, self.projected_field.type, self.length, self.buffers, children)
         result = pa.Array.from_buffers(self.projected_field.type, self.length, buffers=self.buffers, children=children)
         if self.debug:
             _logger.debug('%s result=%s', self.field, result)
@@ -602,11 +653,9 @@ class QueryDataParser:
         for node in self.nodes:
             node.debug_log()
         self.leaves = [leaf for node in self.nodes for leaf in node._iter_leaves()]
-        _logger.debug(f'QueryDataParser: self.leaves = {[(leaf.field.name, leaf.index) for leaf in self.leaves]}')
         self.mark_projected_nodes()
         [node.build_projected_field() for node in self.nodes]
         self.projected_leaves = [leaf for node in self.nodes for leaf in node._iter_projected_leaves()]
-        _logger.debug(f'QueryDataParser: self.projected_leaves = {[(leaf.field.name, leaf.index) for leaf in self.projected_leaves]}')
 
         self.leaf_offset = 0
 
@@ -615,7 +664,6 @@ class QueryDataParser:
             if self.projection_positions is None or leaf.index in self.projection_positions:
                 for node in leaf._iter_to_root():
                     node.is_projected = True
-                    _logger.debug(f'mark_projected_nodes node.field.name={node.field.name}')
 
     def parse(self, column: pa.Array):
         """Parse a single column response from VAST (see FieldNode.set for details)"""
@@ -693,7 +741,6 @@ def _parse_table_info(obj):
     return TableInfo(name, properties, handle, num_rows, used_bytes)
 
 def build_record_batch(column_info, column_values):
-    _logger.info(f"column_info={column_info}")
     fields = [pa.field(column_name, column_type) for column_type, column_name in column_info]
     schema = pa.schema(fields)
     arrays = [pa.array(column_values[column_type], type=column_type) for column_type, _ in column_info]
@@ -706,67 +753,42 @@ def serialize_record_batch(batch):
     writer.write(batch)
     return sink.getvalue()
 
-
-
-    start_parts = start.split('.')
-    start_last_part = int(start_parts[-1])
-    end_parts = end.split('.')
-    end_last_part = int(end_parts[-1])
-    if start_last_part>=end_last_part or True in [start_parts[i] != end_parts[i] for i in range(3)]:
-        raise ValueError(f'illegal ip range {ip_range_str}')
-    num_ips = 1 + end_last_part - start_last_part
-    ips = ['.'.join(start_parts[:-1] + [str(start_last_part + i)]) for i in range(num_ips)]
-    return ips
-
-def parse_executor_hosts(host):
-    executor_hosts_parsed = host.split(',')
-    executor_hosts_parsed = [host.strip() for host in executor_hosts_parsed]
-    executor_hosts = []
-    for executor_host in executor_hosts_parsed:
-        is_ip_range=False
-        if ':' in executor_host:
-            try:
-                socket.inet_aton(executor_host.split(':')[0])
-                socket.inet_aton(executor_host.split(':')[1])
-                is_ip_range = True
-            except:
-                pass
-        if is_ip_range:
-            executor_hosts.extend(generate_ip_range(executor_host))
-        else:
-            executor_hosts.append(executor_host)
-    return executor_hosts
+# Results that returns from tablestats
+TableStatsResult = namedtuple("TableStatsResult",["num_rows", "size_in_bytes", "is_external_rowid_alloc", "endpoints"])
 
 class VastdbApi:
-
+    # we expect the vast version to be <major>.<minor>.<patch>.<protocol>
+    VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')
+
+    def __init__(self, endpoint, access_key, secret_key, username=None, password=None,
                  secure=False, auth_type=AuthType.SIGV4):
-
-        host = executor_hosts[0]
-        self.host = host
+        url_dict = urllib3.util.parse_url(endpoint)._asdict()
         self.access_key = access_key
         self.secret_key = secret_key
         self.username = username
         self.password = password
-        self.port = port
         self.secure = secure
         self.auth_type = auth_type
-        self.executor_hosts =
+        self.executor_hosts = [endpoint] # TODO: remove
 
         username = username or ''
         password = password or ''
-        if not port:
-            port = 443 if secure else 80
+        if not url_dict['port']:
+            url_dict['port'] = 443 if secure else 80
+
+        self.port = url_dict['port']
 
+        self.default_max_list_columns_page_size = 1000
         self.session = requests.Session()
         self.session.verify = False
         self.session.headers['user-agent'] = "VastData Tabular API 1.0 - 2022 (c)"
         if auth_type == AuthType.BASIC:
             self.session.auth = requests.auth.HTTPBasicAuth(username, password)
         else:
-            if port != 80 and port != 443:
-                self.aws_host =
+            if url_dict['port'] != 80 and url_dict['port'] != 443:
+                self.aws_host = '{host}:{port}'.format(**url_dict)
             else:
-                self.aws_host =
+                self.aws_host = '{host}'.format(**url_dict)
 
             self.session.auth = AWSRequestsAuth(aws_access_key=access_key,
                                                 aws_secret_access_key=secret_key,
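The constructor now takes a single `endpoint` URL instead of the old `host`/`port` pair and leans on `urllib3.util.parse_url` for the pieces. A small sketch of the parsing and defaulting logic above; the endpoint value is an example:

```python
import urllib3

secure = False
url_dict = urllib3.util.parse_url("http://vip-pool.example.com:9090")._asdict()
# {'scheme': 'http', 'auth': None, 'host': 'vip-pool.example.com', 'port': 9090, ...}

if not url_dict['port']:
    url_dict['port'] = 443 if secure else 80

if url_dict['port'] != 80 and url_dict['port'] != 443:
    aws_host = '{host}:{port}'.format(**url_dict)  # SigV4 host keeps a non-default port
else:
    aws_host = '{host}'.format(**url_dict)

print(str(urllib3.util.Url(**url_dict)))           # http://vip-pool.example.com:9090
```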
@@ -774,8 +796,34 @@ class VastdbApi:
                                                 aws_region='us-east-1',
                                                 aws_service='s3')
 
-
-
+        if not url_dict['scheme']:
+            url_dict['scheme'] = "https" if secure else "http"
+
+        url = urllib3.util.Url(**url_dict)
+        self.url = str(url)
+        _logger.debug('url=%s aws_host=%s', self.url, self.aws_host)
+
+        # probe the cluster for its version
+        self.vast_version = None
+        res = self.session.options(self.url)
+        server_header = res.headers.get("Server")
+        if server_header is None:
+            _logger.error("OPTIONS response doesn't contain 'Server' header")
+        else:
+            _logger.debug("Server header is '%s'", server_header)
+            if m := self.VAST_VERSION_REGEX.match(server_header):
+                self.vast_version, = m.groups()
+                return
+            else:
+                _logger.error("'Server' header '%s' doesn't match the expected pattern", server_header)
+
+        msg = (
+            f'Please use `vastdb` <= 0.0.5.x with current VAST cluster version ("{server_header or "N/A"}"). '
+            'To use the latest SDK, please upgrade your cluster to the latest service pack. '
+            'Please contact customer.support@vastdata.com for more details.'
+        )
+        _logger.critical(msg)
+        raise NotImplementedError(msg)
 
     def update_mgmt_session(self, access_key: str, secret_key: str, auth_type=AuthType.SIGV4):
         if auth_type != AuthType.BASIC:
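The version probe issues an `OPTIONS` request and expects a `Server: vast <major>.<minor>.<patch>.<protocol>` header; anything else now fails fast with `NotImplementedError`. The regex behaviour, with example header values:

```python
import re

VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')

m = VAST_VERSION_REGEX.match("vast 5.1.0.50")    # example header value
print(m.groups())                                # ('5.1.0.50',)

print(VAST_VERSION_REGEX.match("nginx/1.18.0"))  # None -> the SDK refuses to start
```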
@@ -820,21 +868,9 @@ class VastdbApi:
         return common_headers
 
     def _check_res(self, res, cmd="", expected_retvals=[]):
-
-
-
-            if not res.status_code in expected_retvals:
-                raise ValueError(f"Expected status code mismatch. status_code={res.status_code}")
-            else:
-                if not len(expected_retvals) == 0:
-                    raise ValueError(f"Expected {expected_retvals} but status_code={res.status_code}")
-            return res
-        except requests.HTTPError as e:
-            if res.status_code in expected_retvals:
-                _logger.info(f"{cmd} has failed as expected res={res}")
-                return res
-            else:
-                raise e
+        if exc := errors.from_response(res):
+            raise exc
+        return res
 
     def create_schema(self, bucket, name, txid=0, client_tags=[], schema_properties="", expected_retvals=[]):
         """
@@ -974,7 +1010,8 @@ class VastdbApi:
         return snapshots, is_truncated, marker
 
 
-    def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[],
+    def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[],
+                     topic_partitions=0, create_imports_table=False):
         """
         Create a table, use the following request
         POST /bucket/schema/table?table HTTP/1.1
@@ -983,18 +1020,21 @@ class VastdbApi:
         tabular-txid: <integer> TransactionId
         tabular-client-tag: <string> ClientTag
 
-        The body of the POST request contains table column properties as
-
-
-
-
-
+        The body of the POST request contains table column properties as arrow schema
+        which include field_name, field_type and properties
+
+        In order to create vastdb-imported-objects table that tracks all imported files and avoid duplicate imports,
+        just set create_imports_table=True
+        The request will look like:
+        POST /bucket/schema/table?table&sub-table=vastdb-imported-objects HTTP/1.1
         """
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
 
         serialized_schema = arrow_schema.serialize()
         headers['Content-Length'] = str(len(serialized_schema))
         url_params = {'topic_partitions': str(topic_partitions)} if topic_partitions else {}
+        if create_imports_table:
+            url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
 
         res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
                                 data=serialized_schema, headers=headers)
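A hedged usage sketch of the new `create_imports_table` flag (the endpoint, credentials, bucket and schema names are placeholders): passing it adds `sub-table=vastdb-imported-objects` to the create-table request so the cluster also creates the hidden table that tracks imported objects.

```python
import pyarrow as pa

api = VastdbApi(endpoint="http://vast.example.com",
                access_key="ACCESS", secret_key="SECRET")

api.create_table("mybucket", "myschema", "mytable",
                 arrow_schema=pa.schema([("id", pa.int64()), ("name", pa.utf8())]),
                 create_imports_table=True)
```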
@@ -1014,7 +1054,6 @@ class VastdbApi:
             raise RuntimeError(f'invalid params parquet_path={parquet_path} parquet_bucket_name={parquet_bucket_name} parquet_object_name={parquet_object_name}')
 
         # Get the schema of the Parquet file
-        _logger.info(f'type(parquet_ds.schema) = {type(parquet_ds.schema)}')
         if isinstance(parquet_ds.schema, pq.ParquetSchema):
             arrow_schema = parquet_ds.schema.to_arrow_schema()
         elif isinstance(parquet_ds.schema, pa.Schema):
@@ -1037,13 +1076,27 @@ class VastdbApi:
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
         res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=name, command="stats"), headers=headers)
         if res.status_code == 200:
-            res_headers = res.headers
             flatbuf = b''.join(res.iter_content(chunk_size=128))
             stats = get_table_stats.GetRootAs(flatbuf)
             num_rows = stats.NumRows()
             size_in_bytes = stats.SizeInBytes()
             is_external_rowid_alloc = stats.IsExternalRowidAlloc()
-
+            endpoints = []
+            if stats.VipsLength() == 0:
+                endpoints.append(self.url)
+            else:
+                ip_cls = IPv6Address if (stats.AddressType() == "ipv6") else IPv4Address
+                vips = [stats.Vips(i) for i in range(stats.VipsLength())]
+                ips = []
+                # extract the vips into list of IPs
+                for vip in vips:
+                    start_ip = int(ip_cls(vip.StartAddress().decode()))
+                    ips.extend(ip_cls(start_ip + i) for i in range(vip.AddressCount()))
+                for ip in ips:
+                    prefix = "http" if not self.secure else "https"
+                    endpoints.append(f"{prefix}://{str(ip)}:{self.port}")
+            return TableStatsResult(num_rows, size_in_bytes, is_external_rowid_alloc, endpoints)
+
         return self._check_res(res, "get_table_stats", expected_retvals)
 
     def alter_table(self, bucket, schema, name, txid=0, client_tags=[], table_properties="",
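The endpoint list above is derived from VIP blocks of the form `(start_address, address_count)`; consecutive addresses are produced by plain integer arithmetic on the parsed IP. A worked example with invented values:

```python
from ipaddress import IPv4Address

start_address, address_count = "172.16.0.10", 4   # one VIP block (example values)
start_ip = int(IPv4Address(start_address))
ips = [IPv4Address(start_ip + i) for i in range(address_count)]
endpoints = [f"http://{ip}:80" for ip in ips]
# ['http://172.16.0.10:80', 'http://172.16.0.11:80',
#  'http://172.16.0.12:80', 'http://172.16.0.13:80']
```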
@@ -1070,22 +1123,26 @@ class VastdbApi:
 
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
         headers['Content-Length'] = str(len(alter_table_req))
-        url_params = {'tabular-new-table-name': new_name} if len(new_name) else {}
+        url_params = {'tabular-new-table-name': schema + "/" + new_name} if len(new_name) else {}
 
         res = self.session.put(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
                                data=alter_table_req, headers=headers)
 
         return self._check_res(res, "alter_table", expected_retvals)
 
-    def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[]):
+    def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[], remove_imports_table=False):
         """
         DELETE /mybucket/schema_path/mytable?table HTTP/1.1
         tabular-txid: TransactionId
         tabular-client-tag: ClientTag
+
+        To remove the internal vastdb-imported-objects table just set remove_imports_table=True
         """
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
+        url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if remove_imports_table else {}
 
-        res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table"
+        res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
+                                  headers=headers)
         return self._check_res(res, "drop_table", expected_retvals)
 
     def list_tables(self, bucket, schema, txid=0, client_tags=[], max_keys=1000, next_key=0, name_prefix="",
@@ -1207,9 +1264,9 @@ class VastdbApi:
                                 data=serialized_schema, headers=headers)
         return self._check_res(res, "drop_columns", expected_retvals)
 
-    def list_columns(self, bucket, schema, table, *, txid=0, client_tags=None, max_keys=
+    def list_columns(self, bucket, schema, table, *, txid=0, client_tags=None, max_keys=None, next_key=0,
                      count_only=False, name_prefix="", exact_match=False,
-                     expected_retvals=None, bc_list_internals=False):
+                     expected_retvals=None, bc_list_internals=False, list_imports_table=False):
         """
         GET /mybucket/myschema/mytable?columns HTTP/1.1
         tabular-txid: TransactionId
@@ -1217,7 +1274,10 @@ class VastdbApi:
         x-tabluar-name-prefix: TableNamePrefix
         tabular-max-keys: 1000
         tabular-next-key: NextColumnId
+
+        To list the columns of the internal vastdb-imported-objects table, set list_import_table=True
         """
+        max_keys = max_keys or self.default_max_list_columns_page_size
         client_tags = client_tags or []
         expected_retvals = expected_retvals or []
 
@@ -1233,7 +1293,9 @@ class VastdbApi:
         else:
             headers['tabular-name-prefix'] = name_prefix
 
-
+        url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if list_imports_table else {}
+        res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="column",
+                                                url_params=url_params),
                                headers=headers, stream=True)
         self._check_res(res, "list_columns", expected_retvals)
         if res.status_code == 200:
@@ -1245,9 +1307,7 @@ class VastdbApi:
             if not count_only:
                 schema_buf = b''.join(res.iter_content(chunk_size=128))
                 schema_out = pa.ipc.open_stream(schema_buf).schema
-
-                for f in schema_out:
-                    columns.append([f.name, f.type, f.metadata, f])
+                columns = schema_out
 
             return columns, next_key, is_truncated, count
 
@@ -1294,7 +1354,7 @@ class VastdbApi:
         return self._check_res(res, "get_transaction", expected_retvals)
 
     def select_row_ids(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[],
-                       retry_count=0, enable_sorted_projections=
+                       retry_count=0, enable_sorted_projections=True):
         """
         POST /mybucket/myschema/mytable?query-data=SelectRowIds HTTP/1.1
         """
@@ -1311,7 +1371,7 @@ class VastdbApi:
         return self._check_res(res, "query_data", expected_retvals)
 
     def read_columns_data(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
-                          retry_count=0, enable_sorted_projections=
+                          retry_count=0, enable_sorted_projections=True):
         """
         POST /mybucket/myschema/mytable?query-data=ReadColumns HTTP/1.1
         """
@@ -1327,7 +1387,7 @@ class VastdbApi:
         return self._check_res(res, "query_data", expected_retvals)
 
     def count_rows(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
-                   retry_count=0, enable_sorted_projections=
+                   retry_count=0, enable_sorted_projections=True):
         """
         POST /mybucket/myschema/mytable?query-data=CountRows HTTP/1.1
         """
@@ -1341,27 +1401,9 @@ class VastdbApi:
                                 data=params, headers=headers, stream=True)
         return self._check_res(res, "query_data", expected_retvals)
 
-    def
-
-
-                   request_format='string', response_format='string'):
-        """
-        GET /mybucket/myschema/mytable?data HTTP/1.1
-        Content-Length: ContentLength
-        tabular-txid: TransactionId
-        tabular-client-tag: ClientTag
-        tabular-split: "split_id,total_splits,num_row_groups_per_split"
-        tabular-num-of-subsplits: "total"
-        tabular-request-format: "string"
-        tabular-response-format: "string" #arrow/trino
-        tabular-schedule-id: "schedule-id"
-
-        Request Body (flatbuf)
-        projections_chunk [expressions]
-        predicate_chunk "formatted_data", (required)
-
-        """
-        # add query option select-only and read-only
+    def _build_query_data_headers(self, txid, client_tags, params, split, num_sub_splits, request_format, response_format,
+                                  enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
+                                  sub_split_start_row_ids):
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
         headers['Content-Length'] = str(len(params))
         headers['tabular-split'] = ','.join(map(str, split))
@@ -1386,439 +1428,80 @@ class VastdbApi:
         for sub_split_id, start_row_id in sub_split_start_row_ids:
             headers[f'tabular-start-row-id-{sub_split_id}'] = f"{sub_split_id},{start_row_id}"
 
-
+        return headers
 
-
-
-
+    def _build_query_data_url_params(self, projection, query_imports_table):
+        if query_imports_table and projection:
+            raise ValueError("Can't query both imports and projection table")
 
-
-
-
-
-
-
-        while True:
-            cur_columns, next_key, is_truncated, count = self.list_columns(
-                bucket=bucket, schema=schema, table=table, next_key=next_key, txid=txid)
-            if not cur_columns:
-                break
-            all_listed_columns.extend(cur_columns)
-            if not is_truncated:
-                break
-
-        # build a list of the queried columns
-        queried_column_names = set()
-        if filters:
-            filtered_column_names = ([column_name.split('.')[0] for column_name in filters.keys()]) # use top level of the filter column names
-            queried_column_names.update(filtered_column_names)
-            _logger.debug(f"_list_table_columns: filtered_column_names={filtered_column_names}")
-
-        if field_names:
-            field_column_names = ([column_name.split('.')[0] for column_name in field_names]) # use top level of the field column names
-        else:
-            field_column_names = [column[0] for column in all_listed_columns]
-        _logger.debug(f"_list_table_columns: field_column_names={field_column_names}")
-        queried_column_names.update(field_column_names)
-
-        all_listed_column_and_leaves_names = set()
-        for column in all_listed_columns:
-            # Collect the column and leaves names for verification below that all the filters and field names are in the table
-            column_and_leaves_names = [column[0]] + [f.name for f in column[3].flatten()]
-            all_listed_column_and_leaves_names.update(column_and_leaves_names)
-
-            # check if this column is needed for the query
-            if column[0] in queried_column_names:
-                queried_columns.append(column)
-
-        # verify that all the filters and field names are in the table
-        if filters:
-            for filter_column_name in filters.keys():
-                if filter_column_name not in all_listed_column_and_leaves_names:
-                    raise KeyError((f'filter column name: {filter_column_name} does not appear in the table'))
-        if field_names:
-            for field_name in field_names:
-                if field_name not in all_listed_column_and_leaves_names:
-                    raise ValueError((f'field name: {field_name} does not appear in the table'))
-        return list(queried_columns)
-
-    def _begin_tx_if_necessary(self, txid):
-        if not txid:
-            created_txid = True
-            res = self.begin_transaction()
-            txid = res.headers.get('tabular-txid')
-        else:
-            created_txid = False
+        url_params = {}
+        if query_imports_table:
+            url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
+        elif projection:
+            url_params['name'] = projection
+        return url_params
 
-
+    def legacy_query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
+                          txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
+                          search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
+                          request_format='string', response_format='string', query_imports_table=False):
+        """
+        POST /mybucket/myschema/mytable?query-data=LegacyQueryData HTTP/1.1
+        Content-Length: ContentLength
+        tabular-txid: TransactionId
+        tabular-client-tag: ClientTag
+        tabular-split: "split_id,total_splits,num_row_groups_per_split"
+        tabular-num-of-subsplits: "total"
+        tabular-request-format: "string"
+        tabular-response-format: "string" #arrow/trino
+        tabular-schedule-id: "schedule-id"
 
-
-
-
-        if response_row_id:
-            queried_fields.append(pa.field('$row_id', pa.uint64()))
+        Request Body (flatbuf)
+        projections_chunk [expressions]
+        predicate_chunk "formatted_data", (required)
 
-
-
+        """
+        headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
+                                                 enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
+                                                 sub_split_start_row_ids)
+        url_params = self._build_query_data_url_params(projection, query_imports_table)
 
-
-
+        res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="query-data=LegacyQueryData",
+                                                 url_params=url_params), data=params, headers=headers, stream=True)
+        return self._check_res(res, "legacy_query_data", expected_retvals)
 
-
+    def query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
+                   txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
+                   search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
+                   request_format='string', response_format='string', query_imports_table=False):
+        """
+        GET /mybucket/myschema/mytable?data HTTP/1.1
+        Content-Length: ContentLength
+        tabular-txid: TransactionId
+        tabular-client-tag: ClientTag
+        tabular-split: "split_id,total_splits,num_row_groups_per_split"
+        tabular-num-of-subsplits: "total"
+        tabular-request-format: "string"
+        tabular-response-format: "string" #arrow/trino
+        tabular-schedule-id: "schedule-id"
 
-
-
-
-        else:
-            executor_hosts = [self.host]
-        executor_sessions = [VastdbApi(executor_hosts[i], self.access_key, self.secret_key, self.username,
-                                       self.password, self.port, self.secure, self.auth_type) for i in range(len(executor_hosts))]
-
-        return queried_columns, arrow_schema, query_data_request, executor_sessions
-
-    def _more_pages_exist(self, start_row_ids):
-        for row_id in start_row_ids.values():
-            if row_id != TABULAR_INVALID_ROW_ID:
-                return True
-        return False
-
-    def _query_page(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
-                    txid=0, limit_rows=0, sub_split_start_row_ids=[], filters=None, field_names=None):
-        res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
-                              num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
-                              limit_rows=limit_rows, sub_split_start_row_ids=sub_split_start_row_ids)
-        start_row_ids = {}
-        sub_split_tables = parse_query_data_response(res.raw, query_data_request.response_schema,
-                                                     start_row_ids=start_row_ids)
-        table_page = pa.concat_tables(sub_split_tables)
-        _logger.info("query_page: table_page num_rows=%s start_row_ids len=%s",
-                     len(table_page), len(start_row_ids))
-
-        return table_page, start_row_ids
-
-    def _query_page_iterator(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
-                             txid=0, limit_rows=0, start_row_ids={}, filters=None, field_names=None):
-        res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
-                              num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
-                              limit_rows=limit_rows, sub_split_start_row_ids=start_row_ids.items())
-        for sub_split_table in parse_query_data_response(res.raw, query_data_request.response_schema,
-                                                         start_row_ids=start_row_ids):
-            for record_batch in sub_split_table.to_batches():
-                yield record_batch
-        _logger.info(f"query_page_iterator: start_row_ids={start_row_ids}")
-
-    def query_iterator(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
-                       response_row_id=False, txid=0, limit_per_sub_split=128*1024, filters=None, field_names=None):
-        """
-        query rows into a table.
-
-        Parameters
-        ----------
-        bucket : string
-            The bucket of the table.
-        schema : string
-            The schema of the table.
-        table : string
-            The table name.
-        num_sub_splits : integer
-            The number of sub_splits per split - determines the parallelism inside a VastDB compute node
-            default: 1
-        num_row_groups_per_sub_split : integer
-            The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
-            default: 8
-        response_row_id : boolean
-            Return a column with the internal row ids of the table
-            default: False
-        txid : integer
-            A transaction id. The transaction may be initiated before the query, and if not, the query will initiate it
-            default: 0 (will be created by the api)
-        limit_per_sub_split : integer
-            Limit the number of rows from a single sub_split for a single rpc
-            default:131072
-        filters : dict
-            A dictionary whose keys are column names, and values are lists of string expressions that represent
-            filter conditions on the column. AND is applied on the conditions. The condition formats are:
-            'column_name eq some_value'
-            default: None
-        field_names : list
-            A list of column names to be returned in the output table
-            default: None
-
-        Returns
-        -------
-        Query iterator generator
-
-        Yields
-        ------
-        pyarrow.RecordBatch
-
-        Examples
-        --------
-        for record_batch in query_iterator('some_bucket', 'some_schema', 'some_table',
-                                           filters={'name': ['eq Alice', 'eq Bob']}
-                                           field_names=['name','age']):
-            ...
-
-        """
-
-        # create a transaction if necessary
-        txid, created_txid = self._begin_tx_if_necessary(txid)
-        executor_sessions = []
+        Request Body (flatbuf)
+        projections_chunk [expressions]
+        predicate_chunk "formatted_data", (required)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    if killall:
-                        raise RuntimeError(f'query_iterator_split_id: split_id {split_id} received killall')
-
-                    while self._more_pages_exist(start_row_ids):
-                        for record_batch in session._query_page_iterator(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
-                                                                         split=(split_id, num_splits, num_row_groups_per_sub_split),
-                                                                         num_sub_splits=num_sub_splits, response_row_id=response_row_id,
-                                                                         txid=txid, limit_rows=limit_per_sub_split,
-                                                                         start_row_ids=start_row_ids):
-                            output_queue.put((split_id, record_batch))
-                            while not next_sems[split_id].acquire(timeout=1): # wait for the main thread to request the next record batch
-                                if killall:
-                                    raise RuntimeError(f'split_id {split_id} received killall')
-                    # end of split
-                    output_queue.put((split_id,None))
-
-                except Exception as e:
-                    _logger.exception('query_iterator_split_id: exception occurred')
-                    try:
-                        self.rollback_transaction(txid)
-                    except:
-                        _logger.exception(f'failed to rollback txid {txid}')
-                    error_queue.put(None)
-                    raise e
-
-            # kickoff executors
-            num_splits = len(executor_sessions)
-            output_queue = queue.Queue()
-            error_queue = queue.Queue()
-            next_sems = [threading.Semaphore(value=1) for i in range(num_splits)]
-            killall = False
-            with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
-                # start executors
-                futures = []
-                for i in range(num_splits):
-                    futures.append(executor.submit(query_iterator_split_id, self, i))
-
-                # receive outputs and yield them
-                done_count = 0
-                while done_count < num_splits:
-                    # check for errors
-                    try:
-                        error_queue.get(block=False)
-                        _logger.error('received error from a thread')
-                        killall = True
-                        # wait for all executors to complete
-                        for future in concurrent.futures.as_completed(futures):
-                            try:
-                                future.result() # trigger an exception if occurred in any thread
-                            except Exception:
-                                _logger.exception('exception occurred')
-                        raise RuntimeError('received error from a thread')
-                    except queue.Empty:
-                        pass
-
-                    # try to get a value from the output queue
-                    try:
-                        (split_id, record_batch) = output_queue.get(timeout=1)
-                    except queue.Empty:
-                        continue
-
-                    if record_batch:
-                        # signal to the thread to read the next record batch and yield the current
-                        next_sems[split_id].release()
-                        try:
-                            yield record_batch
-                        except GeneratorExit:
-                            killall = True
-                            _logger.debug("cancelling query_iterator")
-                            raise
-                    else:
-                        done_count += 1
-
-                # wait for all executors to complete
-                for future in concurrent.futures.as_completed(futures):
-                    try:
-                        future.result() # trigger an exception if occurred in any thread
-                    except Exception:
-                        _logger.exception('exception occurred')
-
-            # commit if needed
-            if created_txid:
-                self.commit_transaction(txid)
-
-        except Exception as e:
-            _logger.exception('exception occurred')
-            try:
-                self.rollback_transaction(txid)
-            except:
-                _logger.exception(f'failed to rollback txid {txid}')
-            raise e
-
-        finally:
-            killall = True
-            for session in executor_sessions:
-                try:
-                    session.session.close()
-                except Exception:
-                    _logger.exception(f'failed to close session {session}')
-
-    def query(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
-              response_row_id=False, txid=0, limit=0, limit_per_sub_split=131072, filters=None, field_names=None,
-              queried_columns=None):
-        """
-        query rows into a table.
-
-        Parameters
-        ----------
-        bucket : string
-            The bucket of the table.
-        schema : string
-            The schema of the table.
-        table : string
-            The table name.
-        num_sub_splits : integer
-            The number of sub_splits per split - determines the parallelism inside a VastDB compute node
-            default: 1
-        num_row_groups_per_sub_split : integer
-            The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
-            default: 8
-        response_row_id : boolean
-            Return a column with the internal row ids of the table
-            default: False
-        txid : integer
-            A transaction id. The transaction may be initiated before the query, and be used to provide
-            multiple ACID operations
-            default: 0 (will be created by the api)
-        limit : integer
-            Limit the number of rows in the response
-            default: 0 (no limit)
-        limit_per_sub_split : integer
-            Limit the number of rows from a single sub_split for a single rpc
-            default:131072
-        filters : dict
-            A dictionary whose keys are column names, and values are lists of string expressions that represent
-            filter conditions on the column. AND is applied on the conditions. The condition formats are:
-            'column_name eq some_value'
-            default: None
-        field_names : list
-            A list of column names to be returned to the output table
-            default: None
-        queried_columns: list of pyArrow.column
-            A list of the columns to be queried
-            default: None
-
-        Returns
-        -------
-        pyarrow.Table
-
-
-        Examples
-        --------
-        table = query('some_bucket', 'some_schema', 'some_table',
-                      filters={'name': ['eq Alice', 'eq Bob']}
-                      field_names=['name','age'])
-
-        """
-
-        # create a transaction
-        txid, created_txid = self._begin_tx_if_necessary(txid)
-        executor_sessions = []
-        try:
-            # prepare query
-            queried_columns, arrow_schema, query_data_request, executor_sessions = \
-                self._prepare_query(bucket, schema, table, num_sub_splits, filters, field_names, response_row_id=response_row_id, txid=txid)
-
-            # define the per split threaded query func
-            def query_split_id(self, split_id):
-                try:
-                    start_row_ids = {i:0 for i in range(num_sub_splits)}
-                    session = executor_sessions[split_id]
-                    row_count = 0
-                    while (self._more_pages_exist(start_row_ids) and
-                           (not limit or row_count < limit)):
-                        # check if killed externally
-                        if killall:
-                            raise RuntimeError(f'query_split_id: split_id {split_id} received killall')
-
-                        # determine the limit rows
-                        if limit:
-                            limit_rows = min(limit_per_sub_split, limit-row_count)
-                        else:
-                            limit_rows = limit_per_sub_split
-
-                        # query one page
-                        table_page, start_row_ids = session._query_page(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
-                                                                        split=(split_id, num_splits, num_row_groups_per_sub_split),
-                                                                        num_sub_splits=num_sub_splits, response_row_id=response_row_id,
-                                                                        txid=txid, limit_rows=limit_rows,
-                                                                        sub_split_start_row_ids=start_row_ids.items())
-                        with lock:
-                            table_pages.append(table_page)
-                            row_counts[split_id] += len(table_page)
-                            row_count = sum(row_counts)
-                        _logger.info(f"query_split_id: table_pages split_id={split_id} row_count={row_count}")
-                except Exception as e:
-                    _logger.exception('query_split_id: exception occurred')
-                    try:
-                        self.rollback_transaction(txid)
-                    except:
-                        _logger.exception(f'failed to rollback txid {txid}')
-                    raise e
-
-            table_pages = []
-            num_splits = len(executor_sessions)
-            killall = False
-            with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
-                futures = []
-                row_counts = [0] * num_splits
-                lock = threading.Lock()
-                for i in range(num_splits):
-                    futures.append(executor.submit(query_split_id, self, i))
-                for future in concurrent.futures.as_completed(futures):
-                    future.result() # trigger an exception if occurred in any thread
-
-            # commit if needed
-            if created_txid:
-                self.commit_transaction(txid)
-
-            # concatenate all table pages and return result
-            out_table = pa.concat_tables(table_pages)
-            out_table = out_table.slice(length=limit) if limit else out_table
-            _logger.info("query: out_table len=%s row_count=%s",
-                         len(out_table), len(out_table))
-            return out_table
-
-        except Exception as e:
-            _logger.exception('exception occurred')
-            try:
-                self.rollback_transaction(txid)
-            except:
-                _logger.exception(f'failed to rollback txid {txid}')
-            raise e
-
-        finally:
-            killall = True
-            for session in executor_sessions:
-                try:
-                    session.session.close()
-                except Exception:
-                    _logger.exception(f'failed to close session {session}')
+        To query the internal vastdb-imported-objects table, set query_imports_table=True
+        """
+        # add query option select-only and read-only
+
+        headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
+                                                 enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
+                                                 sub_split_start_row_ids)
+
+        url_params = self._build_query_data_url_params(projection, query_imports_table)
+
+        res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data", url_params=url_params),
+                               data=params, headers=headers, stream=True)
+        return self._check_res(res, "query_data", expected_retvals)
 
         """
         source_files: list of (bucket_name, file_name)
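With the threaded `query()`/`query_iterator()` helpers removed from this module (the high-level query flow presumably moves to the new `vastdb/table.py` listed at the top of this diff), `query_data()` is now a single-page RPC and the URL parameters are resolved by the helper above. A short illustration of `_build_query_data_url_params`, written as a free function; the projection name is invented:

```python
IMPORTED_OBJECTS_TABLE_NAME = "vastdb-imported-objects"

def _build_query_data_url_params(projection, query_imports_table):
    # same logic as the method in the hunk above
    if query_imports_table and projection:
        raise ValueError("Can't query both imports and projection table")
    url_params = {}
    if query_imports_table:
        url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
    elif projection:
        url_params['name'] = projection
    return url_params

print(_build_query_data_url_params('', True))               # {'sub-table': 'vastdb-imported-objects'}
print(_build_query_data_url_params('my_sorted_proj', False))  # {'name': 'my_sorted_proj'}
print(_build_query_data_url_params('', False))               # {}
```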
@@ -1872,21 +1555,22 @@ class VastdbApi:
         builder.Finish(params)
         import_req = builder.Output()

-        def iterate_over_import_data_response(response
+        def iterate_over_import_data_response(response):
             if response.status_code != 200:
                 return response

             chunk_size = 1024
-            for chunk in
+            for chunk in response.iter_content(chunk_size=chunk_size):
                 chunk_dict = json.loads(chunk)
-                _logger.
-                if chunk_dict['res']
-
-
-
-
-
-
+                _logger.debug("import data chunk=%s, result: %s", chunk_dict, chunk_dict['res'])
+                if chunk_dict['res'] != 'Success' and chunk_dict['res'] != 'TabularInProgress' and chunk_dict['res'] != 'TabularAlreadyImported':
+                    raise errors.ImportFilesError(
+                        f"Encountered an error during import_data. status: {chunk_dict['res']}, "
+                        f"error message: {chunk_dict['err_msg'] or 'Unexpected error'} during import of "
+                        f"object name: {chunk_dict['object_name']}", chunk_dict)
+                else:
+                    _logger.debug("import_data of object name '%s' is in progress. "
+                                  "status: %s", chunk_dict['object_name'], chunk_dict['res'])
             return response

         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
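Each chunk consumed by the new iterate_over_import_data_response() above is a small JSON status document carrying res, err_msg and object_name keys. A minimal illustration of the accept/reject rule it applies; the sample payloads and the failure status string below are made up:

    # Illustration of the per-chunk status check; the sample payloads are made up.
    import json

    OK_STATES = {'Success', 'TabularInProgress', 'TabularAlreadyImported'}

    chunks = (b'{"res": "TabularInProgress", "err_msg": null, "object_name": "part-0.parquet"}',
              b'{"res": "TabularImportFailed", "err_msg": "bad footer", "object_name": "part-1.parquet"}')
    for raw in chunks:
        chunk = json.loads(raw)
        if chunk['res'] not in OK_STATES:
            print('import failed:', chunk['object_name'], chunk['err_msg'] or 'Unexpected error')
        else:
            print('still importing:', chunk['object_name'], chunk['res'])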
@@ -1899,34 +1583,17 @@ class VastdbApi:
         res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data"),
                                 data=import_req, headers=headers, stream=True)
         if blocking:
-            res = iterate_over_import_data_response(res
+            res = iterate_over_import_data_response(res)

         return self._check_res(res, "import_data", expected_retvals)

-    def merge_data(self):
-        """
-        TODO
-
-        POST /mybucket/myschema/mytable?data HTTP/1.1
-        Content-Length: ContentLength
-        tabular-txid: TransactionId
-        tabular-client-tag: ClientTag
-
-        Request Body
-        {
-            "format": "string",
-            "select_source": "formatted data"
-            "predicate": "formatted_data"
-        }
-        """
-        pass
-
     def _record_batch_slices(self, batch, rows_per_slice=None):
         max_slice_size_in_bytes = int(0.9*5*1024*1024)  # 0.9 * 5MB
         batch_len = len(batch)
         serialized_batch = serialize_record_batch(batch)
         batch_size_in_bytes = len(serialized_batch)
-        _logger.
+        _logger.debug('max_slice_size_in_bytes=%d batch_len=%d batch_size_in_bytes=%d',
+                      max_slice_size_in_bytes, batch_len, batch_size_in_bytes)

         if not rows_per_slice:
             if batch_size_in_bytes < max_slice_size_in_bytes:
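The budget used by _record_batch_slices() above is int(0.9*5*1024*1024) = 4,718,592 bytes, i.e. 90% of 5 MiB. A rough sketch of how a caller might derive rows_per_slice from that budget; the per-row estimate and the extra 2x safety margin are assumptions for illustration, not the library's actual sizing logic:

    # Worked example of the 0.9 * 5 MiB budget; the sizing heuristic here is an assumption.
    import pyarrow as pa

    max_slice_size_in_bytes = int(0.9 * 5 * 1024 * 1024)   # 4718592 bytes
    batch = pa.record_batch([pa.array(range(1_000_000), pa.int64())], names=['x'])
    approx_row_size = batch.nbytes / len(batch)             # ~8 bytes per row here
    rows_per_slice = int(max_slice_size_in_bytes / (2 * approx_row_size))
    print(max_slice_size_in_bytes, int(approx_row_size), rows_per_slice)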
@@ -1948,7 +1615,7 @@ class VastdbApi:
             serialized_slice_batch = serialize_record_batch(slice_batch)
             sizeof_serialized_slice_batch = len(serialized_slice_batch)

-            if sizeof_serialized_slice_batch <= max_slice_size_in_bytes
+            if sizeof_serialized_slice_batch <= max_slice_size_in_bytes:
                 serialized_slices.append(serialized_slice_batch)
             else:
                 _logger.info(f'Using rows_per_slice {rows_per_slice} slice {i} size {sizeof_serialized_slice_batch} exceeds {max_slice_size_in_bytes} bytes, trying smaller rows_per_slice')
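The check restored above compares each IPC-serialized slice against max_slice_size_in_bytes and falls back to a smaller rows_per_slice when the limit is exceeded. A generic way to measure that serialized size with stock pyarrow (the module's own serialize_record_batch() may differ in detail):

    # Generic IPC size measurement for a record-batch slice; serialize_record_batch() in the
    # module may differ in detail.
    import pyarrow as pa

    def serialized_size(batch: pa.RecordBatch) -> int:
        sink = pa.BufferOutputStream()
        with pa.ipc.new_stream(sink, batch.schema) as writer:
            writer.write_batch(batch)
        return len(sink.getvalue())

    batch = pa.record_batch([pa.array(['a' * 100] * 10_000)], names=['payload'])
    print(serialized_size(batch), serialized_size(batch.slice(0, 1_000)))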
@@ -1962,125 +1629,6 @@ class VastdbApi:

         return serialized_slices

-    def insert(self, bucket, schema, table, rows=None, record_batch=None, rows_per_insert=None, txid=0):
-        """
-        Insert rows into a table. The operation may be split into multiple commands, such that by default no more than 512KB will be inserted per command.
-
-        Parameters
-        ----------
-        bucket : string
-            The bucket of the table.
-        schema : string
-            The schema of the table.
-        table : string
-            The table name.
-        rows : dict
-            The rows to insert.
-            dictionary key: column name
-            dictionary value: array of cell values to insert
-            default: None (if None, record_batch must be provided)
-        record_batch : pyarrow.RecordBatch
-            A pyarrow RecordBatch
-            default: None (if None, rows dictionary must be provided)
-        rows_per_insert : integer
-            Split the operation so that each insert command will be limited to this value
-            default: None (will be selected automatically)
-        txid : integer
-            A transaction id. The transaction may be initiated before the insert, and be used to provide
-            multiple ACID operations
-            default: 0 (will be created by the api)
-
-        Returns
-        -------
-        None
-
-
-        Examples
-        --------
-        insert('some_bucket', 'some_schema', 'some_table', {'name': ['Alice','Bob'], 'age': [25,24]})
-
-        """
-        if (not rows and not record_batch) or (rows and record_batch):
-            raise ValueError(f'insert: missing argument - either rows or record_batch must be provided')
-
-        # create a transaction
-        txid, created_txid = self._begin_tx_if_necessary(txid)
-
-        if rows:
-            columns = self._list_table_columns(bucket, schema, table, field_names=rows.keys())
-            columns_dict = dict([(column[0], column[1]) for column in columns])
-            arrow_schema = pa.schema([])
-            arrays = []
-            for column_name, column_values in rows.items():
-                column_type = columns_dict[column_name]
-                field = pa.field(column_name, column_type)
-                arrow_schema = arrow_schema.append(field)
-                arrays.append(pa.array(column_values, column_type))
-            record_batch = pa.record_batch(arrays, arrow_schema)
-
-        # split the record batch into multiple slices
-        serialized_slices = self._record_batch_slices(record_batch, rows_per_insert)
-        _logger.info(f'inserting record batch using {len(serialized_slices)} slices')
-
-        insert_queue = queue.Queue()
-
-        [insert_queue.put(insert_rows_req) for insert_rows_req in serialized_slices]
-
-        try:
-            executor_sessions = [VastdbApi(self.executor_hosts[i], self.access_key, self.secret_key, self.username,
-                                           self.password, self.port, self.secure, self.auth_type) for i in range(len(self.executor_hosts))]
-
-            def insert_executor(self, split_id):
-
-                try:
-                    _logger.info(f'insert_executor split_id={split_id} starting')
-                    session = executor_sessions[split_id]
-                    num_inserts = 0
-                    while not killall:
-                        try:
-                            insert_rows_req = insert_queue.get(block=False)
-                        except queue.Empty:
-                            break
-                        session.insert_rows(bucket=bucket, schema=schema,
-                                            table=table, record_batch=insert_rows_req, txid=txid)
-                        num_inserts += 1
-                    _logger.info(f'insert_executor split_id={split_id} num_inserts={num_inserts}')
-                    if killall:
-                        _logger.info('insert_executor killall=True')
-
-                except Exception as e:
-                    _logger.exception('insert_executor hit exception')
-                    raise e
-
-            num_splits = len(executor_sessions)
-            killall = False
-            with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
-                futures = []
-                for i in range(num_splits):
-                    futures.append(executor.submit(insert_executor, self, i))
-                for future in concurrent.futures.as_completed(futures):
-                    future.result()  # trigger an exception if occurred in any thread
-
-            # commit if needed
-            if created_txid:
-                self.commit_transaction(txid)
-
-        except Exception as e:
-            _logger.exception('exception occurred')
-            try:
-                self.rollback_transaction(txid)
-            except:
-                _logger.exception(f'failed to rollback txid {txid}')
-            raise e
-
-        finally:
-            killall = True
-            for session in executor_sessions:
-                try:
-                    session.session.close()
-                except Exception:
-                    _logger.exception(f'failed to close session {session}')
-
     def insert_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[]):
         """
         POST /mybucket/myschema/mytable?rows HTTP/1.1
@@ -2350,41 +1898,40 @@ def _iter_query_data_response_columns(fileobj, stream_ids=None):
         if stream_ids is not None:
             stream_ids.update([stream_id])  # count stream IDs using a collections.Counter
         if stream_id == TABULAR_KEEP_ALIVE_STREAM_ID:
-            # _logger.info(f"stream_id={stream_id} (skipping)")
             continue

         if stream_id == TABULAR_QUERY_DATA_COMPLETED_STREAM_ID:
             # read the terminating end chunk from socket
             res = fileobj.read()
-            _logger.
+            _logger.debug("stream_id=%d res=%s (finish)", stream_id, res)
             return

         if stream_id == TABULAR_QUERY_DATA_FAILED_STREAM_ID:
             # read the terminating end chunk from socket
             res = fileobj.read()
-            _logger.
+            _logger.warning("stream_id=%d res=%s (failed)", stream_id, res)
             raise IOError(f"Query data stream failed res={res}")

         next_row_id_bytes = fileobj.read(8)
         next_row_id, = struct.unpack('<Q', next_row_id_bytes)
-        _logger.
+        _logger.debug("stream_id=%d next_row_id=%d", stream_id, next_row_id)

         if stream_id not in readers:
             # we implicitly read 1st message (Arrow schema) when constructing RecordBatchStreamReader
             reader = pa.ipc.RecordBatchStreamReader(fileobj)
-            _logger.
+            _logger.debug("stream_id=%d schema=%s", stream_id, reader.schema)
             readers[stream_id] = (reader, [])
             continue

         (reader, batches) = readers[stream_id]
         try:
             batch = reader.read_next_batch()  # read single-column chunk data
-            _logger.
+            _logger.debug("stream_id=%d rows=%d chunk=%s", stream_id, len(batch), batch)
             batches.append(batch)
         except StopIteration:  # we got an end-of-stream IPC message for a given stream ID
             reader, batches = readers.pop(stream_id)  # end of column
             table = pa.Table.from_batches(batches)  # concatenate all column chunks (as a single)
-            _logger.
+            _logger.debug("stream_id=%d rows=%d column=%s", stream_id, len(table), table)
             yield (stream_id, next_row_id, table)


@@ -2413,7 +1960,8 @@ def parse_query_data_response(conn, schema, stream_ids=None, start_row_ids=None,
         if is_empty_projection:  # VAST returns an empty RecordBatch, with the correct rows' count
             parsed_table = table

-        _logger.
+        _logger.debug("stream_id=%d rows=%d next_row_id=%d table=%s",
+                      stream_id, len(parsed_table), next_row_id, parsed_table)
         start_row_ids[stream_id] = next_row_id
         yield parsed_table  # the result of a single "select_rows()" cycle

@@ -2562,7 +2110,6 @@ def get_field_type(builder: flatbuffers.Builder, field: pa.Field):
     return field_type, field_type_type

 def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
-    _logger.info(f"name={f.name}")
     children = None
     if isinstance(f.type, pa.StructType):
         children = [build_field(builder, child, child.name) for child in list(f.type)]
@@ -2589,7 +2136,6 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
         fb_field.AddName(builder, child_col_name)
         fb_field.AddChildren(builder, children)

-        _logger.info(f"added key and map to entries")
         children = [fb_field.End(builder)]

     if children is not None:
@@ -2600,13 +2146,11 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):

     col_name = builder.CreateString(name)
     field_type, field_type_type = get_field_type(builder, f)
-    _logger.info(f"add col_name={name} type_type={field_type_type} to fb")
     fb_field.Start(builder)
     fb_field.AddName(builder, col_name)
     fb_field.AddTypeType(builder, field_type_type)
     fb_field.AddType(builder, field_type)
     if children is not None:
-        _logger.info(f"add col_name={name} childern")
         fb_field.AddChildren(builder, children)
     return fb_field.End(builder)

@@ -2623,9 +2167,7 @@ class QueryDataRequest:
         self.response_schema = response_schema


-def build_query_data_request(schema: 'pa.Schema' = pa.schema([]),
-    filters = filters or {}
-
+def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), predicate: ibis.expr.types.BooleanColumn = None, field_names: list = None):
     builder = flatbuffers.Builder(1024)

     source_name = builder.CreateString('')  # required
@@ -2641,7 +2183,7 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
     fb_schema.AddFields(builder, fields)
     schema_obj = fb_schema.End(builder)

-    predicate = Predicate(schema,
+    predicate = Predicate(schema=schema, expr=predicate)
     filter_obj = predicate.serialize(builder)

     parser = QueryDataParser(schema)
@@ -2652,10 +2194,8 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
             continue
         iter_from_root = reversed(list(descendent._iter_to_root()))
         descendent_full_name = '.'.join([n.field.name for n in iter_from_root])
-        _logger.debug(f'build_query_data_request: descendent_full_name={descendent_full_name}')
         descendent_leaves = [leaf.index for leaf in descendent._iter_leaves()]
         leaves_map[descendent_full_name] = descendent_leaves
-        _logger.debug(f'build_query_data_request: leaves_map={leaves_map}')

     output_field_names = None
     if field_names is None:
@@ -2666,13 +2206,11 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
     def compare_field_names_by_pos(field_name1, field_name2):
         return leaves_map[field_name1][0]-leaves_map[field_name2][0]
     field_names = sorted(field_names, key=cmp_to_key(compare_field_names_by_pos))
-    _logger.debug(f'build_query_data_request: sorted field_names={field_names} schema={schema}')

     projection_fields = []
     projection_positions = []
     for field_name in field_names:
         positions = leaves_map[field_name]
-        _logger.info("projecting field=%s positions=%s", field_name, positions)
         projection_positions.extend(positions)
         for leaf_position in positions:
             fb_field_index.Start(builder)
@@ -2729,11 +2267,9 @@ def convert_column_types(table: 'pa.Table') -> 'pa.Table':
         indexes_of_fields_to_change[field.name] = index
     for changing_index in ts_indexes:
         field_name = table.schema[changing_index].name
-        _logger.info(f'changing resolution for {field_name} to us')
         new_column = table[field_name].cast(pa.timestamp('us'), safe=False)
         table = table.set_column(changing_index, field_name, new_column)
     for field_name, changing_index in indexes_of_fields_to_change.items():
-        _logger.info(f'applying custom rules to {field_name}')
         new_column = table[field_name].to_pylist()
         new_column = list(map(column_matcher[field_name], new_column))
         new_column = pa.array(new_column, table[field_name].type)
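convert_column_types() above down-casts timestamp columns to microsecond resolution with safe=False before applying the per-column rules. A minimal, self-contained illustration of that cast on made-up data:

    # Illustrative only: down-cast a nanosecond timestamp column to microseconds (sample data is made up).
    import datetime
    import pyarrow as pa

    table = pa.table({'ts': pa.array([datetime.datetime(2024, 1, 1, 12, 0, 0, 123456)], pa.timestamp('ns'))})
    idx = table.schema.get_field_index('ts')
    table = table.set_column(idx, 'ts', table['ts'].cast(pa.timestamp('us'), safe=False))
    print(table.schema.field('ts').type)  # timestamp[us]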