vastdb 0.0.5.3__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
- vast_flatbuf/tabular/VipRange.py +56 -0
- vastdb/__init__.py +7 -0
- vastdb/bucket.py +77 -0
- vastdb/errors.py +158 -0
- vastdb/{api.py → internal_commands.py} +280 -746
- vastdb/schema.py +77 -0
- vastdb/session.py +48 -0
- vastdb/table.py +480 -0
- vastdb/tests/conftest.py +15 -14
- vastdb/tests/test_imports.py +125 -0
- vastdb/tests/test_projections.py +41 -0
- vastdb/tests/test_sanity.py +36 -16
- vastdb/tests/test_schemas.py +12 -6
- vastdb/tests/test_tables.py +581 -13
- vastdb/transaction.py +55 -0
- vastdb/util.py +8 -8
- vastdb-0.1.0.dist-info/METADATA +38 -0
- {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/RECORD +22 -31
- vast_protobuf/__init__.py +0 -0
- vast_protobuf/substrait/__init__.py +0 -0
- vast_protobuf/substrait/algebra_pb2.py +0 -1344
- vast_protobuf/substrait/capabilities_pb2.py +0 -46
- vast_protobuf/substrait/ddl_pb2.py +0 -57
- vast_protobuf/substrait/extended_expression_pb2.py +0 -49
- vast_protobuf/substrait/extensions/__init__.py +0 -0
- vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
- vast_protobuf/substrait/function_pb2.py +0 -168
- vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
- vast_protobuf/substrait/plan_pb2.py +0 -67
- vast_protobuf/substrait/type_expressions_pb2.py +0 -198
- vast_protobuf/substrait/type_pb2.py +0 -350
- vast_protobuf/tabular/__init__.py +0 -0
- vast_protobuf/tabular/rpc_pb2.py +0 -344
- vastdb/bench_scan.py +0 -45
- vastdb/tests/test_create_table_from_parquets.py +0 -50
- vastdb/v2.py +0 -360
- vastdb-0.0.5.3.dist-info/METADATA +0 -47
- {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/LICENSE +0 -0
- {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/WHEEL +0 -0
- {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/top_level.txt +0 -0
vastdb/{api.py → internal_commands.py}

@@ -1,29 +1,26 @@
-import array
 import logging
 import struct
 import urllib.parse
 from collections import defaultdict, namedtuple
 from datetime import datetime
 from enum import Enum
-from typing import
+from typing import Union, Optional, Iterator
+import ibis
 import xmltodict
-import concurrent.futures
-import threading
-import queue
 import math
-import socket
 from functools import cmp_to_key
 import pyarrow.parquet as pq
 import flatbuffers
 import pyarrow as pa
 import requests
-import datetime
-import hashlib
-import hmac
 import json
 import itertools
 from aws_requests_auth.aws_auth import AWSRequestsAuth
-
+import urllib3
+import re
+
+from . import errors
+from ipaddress import IPv4Address, IPv6Address
 
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BinaryLiteral as fb_binary_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BooleanLiteral as fb_bool_lit
@@ -91,30 +88,22 @@ TABULAR_QUERY_DATA_COMPLETED_STREAM_ID = 0xFFFFFFFF - 1
 TABULAR_QUERY_DATA_FAILED_STREAM_ID = 0xFFFFFFFF - 2
 TABULAR_INVALID_ROW_ID = 0xFFFFFFFFFFFF # (1<<48)-1
 ESTORE_INVALID_EHANDLE = UINT64_MAX
+IMPORTED_OBJECTS_TABLE_NAME = "vastdb-imported-objects"
 
 """
 S3 Tabular API
 """
 
 
-
-    log = logging.getLogger(name)
-    log.setLevel(logging.ERROR)
-    ch = logging.StreamHandler()
-    ch.setLevel(logging.INFO)
-    ch.set_name('tabular_stream_handler')
-    formatter = logging.Formatter("%(asctime)s:%(levelname)s:%(message)s")
-    ch.setFormatter(formatter)
-    log.addHandler(ch)
-    log.propagate = False
-    return log
-
-
-_logger = get_logger(__name__)
+_logger = logging.getLogger(__name__)
 
 
-def
-
+def _flatten_args(op, op_type):
+    if isinstance(op, op_type):
+        for arg in op.args:
+            yield from _flatten_args(arg, op_type)
+    else:
+        yield op
 
 
 class AuthType(Enum):
@@ -123,10 +112,6 @@ class AuthType(Enum):
     BASIC = "basic"
 
 
-class TabularException(Exception):
-    pass
-
-
 def get_unit_to_flatbuff_time_unit(type):
     unit_to_flatbuff_time_unit = {
         'ns': TimeUnit.NANOSECOND,
@@ -144,11 +129,10 @@ class Predicate:
         's': 0.001
     }
 
-    def __init__(self, schema: 'pa.Schema',
+    def __init__(self, schema: 'pa.Schema', expr: ibis.expr.types.BooleanColumn):
         self.schema = schema
-        self.
+        self.expr = expr
         self.builder = None
-        self._field_name_per_index = None
 
     def get_field_indexes(self, field: 'pa.Field', field_name_per_index: list) -> None:
         field_name_per_index.append(field.name)
@@ -172,7 +156,6 @@ class Predicate:
         for field in self.schema:
             self.get_field_indexes(field, _field_name_per_index)
         self._field_name_per_index = {field: index for index, field in enumerate(_field_name_per_index)}
-        _logger.debug(f'field_name_per_index: {self._field_name_per_index}')
         return self._field_name_per_index
 
     def get_projections(self, builder: 'flatbuffers.builder.Builder', field_names: list = None):
@@ -190,10 +173,77 @@ class Predicate:
         return builder.EndVector()
 
     def serialize(self, builder: 'flatbuffers.builder.Builder'):
+        from ibis.expr.operations.generic import TableColumn, Literal, IsNull
+        from ibis.expr.operations.logical import Greater, GreaterEqual, Less, LessEqual, Equals, NotEquals, And, Or, Not
+        from ibis.expr.operations.strings import StringContains
+
+        builder_map = {
+            Greater: self.build_greater,
+            GreaterEqual: self.build_greater_equal,
+            Less: self.build_less,
+            LessEqual: self.build_less_equal,
+            Equals: self.build_equal,
+            NotEquals: self.build_not_equal,
+            IsNull: self.build_is_null,
+            Not: self.build_is_not_null,
+            StringContains: self.build_match_substring,
+        }
+
+        positions_map = dict((f.name, index) for index, f in enumerate(self.schema)) # TODO: BFS
+
         self.builder = builder
+
         offsets = []
-
-
+
+        if self.expr is not None:
+            and_args = list(_flatten_args(self.expr.op(), And))
+            _logger.debug('AND args: %s ops %s', and_args, self.expr.op())
+            for op in and_args:
+                or_args = list(_flatten_args(op, Or))
+                _logger.debug('OR args: %s op %s', or_args, op)
+                inner_offsets = []
+
+                prev_field_name = None
+                for inner_op in or_args:
+                    _logger.debug('inner_op %s', inner_op)
+                    builder_func = builder_map.get(type(inner_op))
+                    if not builder_func:
+                        raise NotImplementedError(inner_op.name)
+
+                    if builder_func == self.build_is_null:
+                        column, = inner_op.args
+                        literal = None
+                    elif builder_func == self.build_is_not_null:
+                        not_arg, = inner_op.args
+                        # currently we only support not is_null, checking we really got is_null under the not:
+                        if not builder_map.get(type(not_arg)) == self.build_is_null:
+                            raise NotImplementedError(not_arg.args[0].name)
+                        column, = not_arg.args
+                        literal = None
+                    else:
+                        column, literal = inner_op.args
+                        if not isinstance(literal, Literal):
+                            raise NotImplementedError(inner_op.name)
+
+                    if not isinstance(column, TableColumn):
+                        raise NotImplementedError(inner_op.name)
+
+                    field_name = column.name
+                    if prev_field_name is None:
+                        prev_field_name = field_name
+                    elif prev_field_name != field_name:
+                        raise NotImplementedError(op.name)
+
+                    args_offsets = [self.build_column(position=positions_map[field_name])]
+                    if literal:
+                        field = self.schema.field(field_name)
+                        args_offsets.append(self.build_literal(field=field, value=literal.value))
+
+                    inner_offsets.append(builder_func(*args_offsets))
+
+                domain_offset = self.build_or(inner_offsets)
+                offsets.append(domain_offset)
+
         return self.build_and(offsets)
 
     def build_column(self, position: int):
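
The rewritten Predicate.serialize above walks an ibis boolean expression and flattens it into a conjunction of per-column disjunctions (an AND of ORs, each OR restricted to a single field) before emitting the flatbuffer call nodes. A minimal, self-contained sketch of that flattening step, using a stand-in Op class rather than real ibis operation types (all names below are illustrative, not part of the SDK):

    from dataclasses import dataclass

    @dataclass
    class Op:
        """Stand-in for an ibis operation node: an operation name plus child args."""
        name: str
        args: tuple = ()

    def flatten_args(op, op_name):
        # mirrors the _flatten_args helper added in this diff: recurse through
        # nested ops of one kind and yield the leaves that are not of that kind
        if op.name == op_name:
            for arg in op.args:
                yield from flatten_args(arg, op_name)
        else:
            yield op

    # (a > 1) AND ((b == 2) OR (b == 3)) -- a hypothetical pushed-down predicate
    expr = Op('And', (Op('Greater', ('a', 1)),
                      Op('Or', (Op('Equals', ('b', 2)), Op('Equals', ('b', 3))))))

    for conjunct in flatten_args(expr, 'And'):
        print([d.name for d in flatten_args(conjunct, 'Or')])
    # ['Greater']
    # ['Equals', 'Equals']   -- each OR group must reference a single column
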
@@ -221,7 +271,6 @@ class Predicate:
         field = self.schema.field(field_name)
         for attr in field_attrs:
             field = field.type[attr]
-        _logger.info(f'trying to append field: {field} with domains: {filters}')
         for filter_by_name in filters:
             offsets.append(self.build_range(column=column, field=field, filter_by_name=filter_by_name))
         return self.build_or(offsets)
@@ -263,11 +312,9 @@ class Predicate:
         return self.build_and(rules)
 
     def build_function(self, name: str, *offsets):
-        _logger.info(f'name: {name}, offsets: {offsets}')
         offset_name = self.builder.CreateString(name)
         fb_call.StartArgumentsVector(self.builder, len(offsets))
         for offset in reversed(offsets):
-            _logger.info(f'offset: {offset}')
             self.builder.PrependUOffsetTRelative(offset)
         offset_arguments = self.builder.EndVector()
 
@@ -282,7 +329,7 @@ class Predicate:
         fb_expression.AddImpl(self.builder, offset_call)
         return fb_expression.End(self.builder)
 
-    def build_literal(self, field: pa.Field, value
+    def build_literal(self, field: pa.Field, value):
         if field.type.equals(pa.int64()):
             literal_type = fb_int64_lit
             literal_impl = LiteralImpl.Int64Literal
@@ -366,8 +413,7 @@ class Predicate:
             field_type = fb_date.End(self.builder)
 
             start_date = datetime.fromtimestamp(0).date()
-
-            date_delta = date_value - start_date
+            date_delta = value - start_date
             value = date_delta.days
         elif isinstance(field.type, pa.TimestampType):
             literal_type = fb_timestamp_lit
@@ -426,7 +472,7 @@ class Predicate:
             fb_binary.Start(self.builder)
             field_type = fb_binary.End(self.builder)
 
-            value = self.builder.CreateByteVector(value
+            value = self.builder.CreateByteVector(value)
         else:
             raise ValueError(f'unsupported predicate for type={field.type}, value={value}')
 
@@ -459,6 +505,9 @@ class Predicate:
     def build_equal(self, column: int, literal: int):
         return self.build_function('equal', column, literal)
 
+    def build_not_equal(self, column: int, literal: int):
+        return self.build_function('not_equal', column, literal)
+
     def build_greater(self, column: int, literal: int):
         return self.build_function('greater', column, literal)
 
@@ -477,6 +526,9 @@ class Predicate:
     def build_is_not_null(self, column: int):
         return self.build_function('is_valid', column)
 
+    def build_match_substring(self, column: int, literal: int):
+        return self.build_function('match_substring', column, literal)
+
 
 class FieldNode:
     """Helper class for representing nested Arrow fields and handling QueryData requests"""
@@ -574,9 +626,8 @@ class FieldNode:
     def build(self) -> pa.Array:
         """Construct an Arrow array from the collected buffers (recursively)."""
         children = self.children and [node.build() for node in self.children if node.is_projected]
-        _logger.debug(
-
-            f'self.buffers={self.buffers} children={children}')
+        _logger.debug('build: self.field.name=%s, self.projected_field.type=%s, self.length=%s, self.buffers=%s children=%s',
+                      self.field.name, self.projected_field.type, self.length, self.buffers, children)
         result = pa.Array.from_buffers(self.projected_field.type, self.length, buffers=self.buffers, children=children)
         if self.debug:
             _logger.debug('%s result=%s', self.field, result)
@@ -602,11 +653,9 @@ class QueryDataParser:
         for node in self.nodes:
             node.debug_log()
         self.leaves = [leaf for node in self.nodes for leaf in node._iter_leaves()]
-        _logger.debug(f'QueryDataParser: self.leaves = {[(leaf.field.name, leaf.index) for leaf in self.leaves]}')
         self.mark_projected_nodes()
         [node.build_projected_field() for node in self.nodes]
         self.projected_leaves = [leaf for node in self.nodes for leaf in node._iter_projected_leaves()]
-        _logger.debug(f'QueryDataParser: self.projected_leaves = {[(leaf.field.name, leaf.index) for leaf in self.projected_leaves]}')
 
         self.leaf_offset = 0
 
@@ -615,7 +664,6 @@ class QueryDataParser:
             if self.projection_positions is None or leaf.index in self.projection_positions:
                 for node in leaf._iter_to_root():
                     node.is_projected = True
-                    _logger.debug(f'mark_projected_nodes node.field.name={node.field.name}')
 
     def parse(self, column: pa.Array):
         """Parse a single column response from VAST (see FieldNode.set for details)"""
@@ -693,7 +741,6 @@ def _parse_table_info(obj):
     return TableInfo(name, properties, handle, num_rows, used_bytes)
 
 def build_record_batch(column_info, column_values):
-    _logger.info(f"column_info={column_info}")
    fields = [pa.field(column_name, column_type) for column_type, column_name in column_info]
     schema = pa.schema(fields)
     arrays = [pa.array(column_values[column_type], type=column_type) for column_type, _ in column_info]
@@ -706,56 +753,30 @@ def serialize_record_batch(batch):
     writer.write(batch)
     return sink.getvalue()
 
-
-
-    start_parts = start.split('.')
-    start_last_part = int(start_parts[-1])
-    end_parts = end.split('.')
-    end_last_part = int(end_parts[-1])
-    if start_last_part>=end_last_part or True in [start_parts[i] != end_parts[i] for i in range(3)]:
-        raise ValueError(f'illegal ip range {ip_range_str}')
-    num_ips = 1 + end_last_part - start_last_part
-    ips = ['.'.join(start_parts[:-1] + [str(start_last_part + i)]) for i in range(num_ips)]
-    return ips
-
-def parse_executor_hosts(host):
-    executor_hosts_parsed = host.split(',')
-    executor_hosts_parsed = [host.strip() for host in executor_hosts_parsed]
-    executor_hosts = []
-    for executor_host in executor_hosts_parsed:
-        is_ip_range=False
-        if ':' in executor_host:
-            try:
-                socket.inet_aton(executor_host.split(':')[0])
-                socket.inet_aton(executor_host.split(':')[1])
-                is_ip_range = True
-            except:
-                pass
-        if is_ip_range:
-            executor_hosts.extend(generate_ip_range(executor_host))
-        else:
-            executor_hosts.append(executor_host)
-    return executor_hosts
+# Results that returns from tablestats
+TableStatsResult = namedtuple("TableStatsResult",["num_rows", "size_in_bytes", "is_external_rowid_alloc", "endpoints"])
 
 class VastdbApi:
-
+    # we expect the vast version to be <major>.<minor>.<patch>.<protocol>
+    VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')
+
+    def __init__(self, endpoint, access_key, secret_key, username=None, password=None,
                  secure=False, auth_type=AuthType.SIGV4):
-
-        host = executor_hosts[0]
-        self.host = host
+        url_dict = urllib3.util.parse_url(endpoint)._asdict()
         self.access_key = access_key
         self.secret_key = secret_key
         self.username = username
         self.password = password
-        self.port = port
         self.secure = secure
         self.auth_type = auth_type
-        self.executor_hosts =
+        self.executor_hosts = [endpoint] # TODO: remove
 
         username = username or ''
         password = password or ''
-        if not port:
-            port = 443 if secure else 80
+        if not url_dict['port']:
+            url_dict['port'] = 443 if secure else 80
+
+        self.port = url_dict['port']
 
         self.default_max_list_columns_page_size = 1000
         self.session = requests.Session()
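
The constructor now accepts a single endpoint URL and derives host, port and scheme from it with urllib3.util.parse_url, replacing the removed comma-separated host and IP-range parsing. A small sketch of that parsing step (the endpoint value below is just an example):

    import urllib3

    secure = False  # stands in for the constructor's `secure` flag
    url_dict = urllib3.util.parse_url("http://vip-pool.example.com:8080")._asdict()
    if not url_dict['port']:
        url_dict['port'] = 443 if secure else 80
    if not url_dict['scheme']:
        url_dict['scheme'] = "https" if secure else "http"
    print(urllib3.util.Url(**url_dict))   # http://vip-pool.example.com:8080
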
@@ -764,10 +785,10 @@ class VastdbApi:
         if auth_type == AuthType.BASIC:
             self.session.auth = requests.auth.HTTPBasicAuth(username, password)
         else:
-            if port != 80 and port != 443:
-                self.aws_host =
+            if url_dict['port'] != 80 and url_dict['port'] != 443:
+                self.aws_host = '{host}:{port}'.format(**url_dict)
             else:
-                self.aws_host =
+                self.aws_host = '{host}'.format(**url_dict)
 
             self.session.auth = AWSRequestsAuth(aws_access_key=access_key,
                                                 aws_secret_access_key=secret_key,
@@ -775,8 +796,34 @@ class VastdbApi:
                                                 aws_region='us-east-1',
                                                 aws_service='s3')
 
-
-
+        if not url_dict['scheme']:
+            url_dict['scheme'] = "https" if secure else "http"
+
+        url = urllib3.util.Url(**url_dict)
+        self.url = str(url)
+        _logger.debug('url=%s aws_host=%s', self.url, self.aws_host)
+
+        # probe the cluster for its version
+        self.vast_version = None
+        res = self.session.options(self.url)
+        server_header = res.headers.get("Server")
+        if server_header is None:
+            _logger.error("OPTIONS response doesn't contain 'Server' header")
+        else:
+            _logger.debug("Server header is '%s'", server_header)
+            if m := self.VAST_VERSION_REGEX.match(server_header):
+                self.vast_version, = m.groups()
+                return
+            else:
+                _logger.error("'Server' header '%s' doesn't match the expected pattern", server_header)
+
+        msg = (
+            f'Please use `vastdb` <= 0.0.5.x with current VAST cluster version ("{server_header or "N/A"}"). '
+            'To use the latest SDK, please upgrade your cluster to the latest service pack. '
+            'Please contact customer.support@vastdata.com for more details.'
+        )
+        _logger.critical(msg)
+        raise NotImplementedError(msg)
 
     def update_mgmt_session(self, access_key: str, secret_key: str, auth_type=AuthType.SIGV4):
         if auth_type != AuthType.BASIC:
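
The new constructor finishes by probing the cluster with an OPTIONS request and refusing to continue unless the HTTP Server header advertises a four-component VAST version. The check itself is just the regular expression shown above; a standalone illustration (the header value is made up):

    import re

    VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')

    server_header = "vast 5.1.0.130"   # hypothetical value of the HTTP Server header
    if m := VAST_VERSION_REGEX.match(server_header):
        vast_version, = m.groups()
        print(vast_version)            # 5.1.0.130
    else:
        raise NotImplementedError("cluster does not support this SDK version")
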
@@ -821,21 +868,9 @@ class VastdbApi:
         return common_headers
 
     def _check_res(self, res, cmd="", expected_retvals=[]):
-
-
-
-            if not res.status_code in expected_retvals:
-                raise ValueError(f"Expected status code mismatch. status_code={res.status_code}")
-            else:
-                if not len(expected_retvals) == 0:
-                    raise ValueError(f"Expected {expected_retvals} but status_code={res.status_code}")
-            return res
-        except requests.HTTPError as e:
-            if res.status_code in expected_retvals:
-                _logger.info(f"{cmd} has failed as expected res={res}")
-                return res
-            else:
-                raise e
+        if exc := errors.from_response(res):
+            raise exc
+        return res
 
     def create_schema(self, bucket, name, txid=0, client_tags=[], schema_properties="", expected_retvals=[]):
         """
@@ -975,7 +1010,8 @@ class VastdbApi:
         return snapshots, is_truncated, marker
 
 
-    def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[],
+    def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[],
+                     topic_partitions=0, create_imports_table=False):
         """
         Create a table, use the following request
         POST /bucket/schema/table?table HTTP/1.1
@@ -984,18 +1020,21 @@ class VastdbApi:
         tabular-txid: <integer> TransactionId
         tabular-client-tag: <string> ClientTag
 
-        The body of the POST request contains table column properties as
-
-
-
-
-
+        The body of the POST request contains table column properties as arrow schema
+        which include field_name, field_type and properties
+
+        In order to create vastdb-imported-objects table that tracks all imported files and avoid duplicate imports,
+        just set create_imports_table=True
+        The request will look like:
+        POST /bucket/schema/table?table&sub-table=vastdb-imported-objects HTTP/1.1
         """
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
 
         serialized_schema = arrow_schema.serialize()
         headers['Content-Length'] = str(len(serialized_schema))
         url_params = {'topic_partitions': str(topic_partitions)} if topic_partitions else {}
+        if create_imports_table:
+            url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
 
         res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
                                 data=serialized_schema, headers=headers)
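
Per the updated docstring, create_imports_table=True adds sub-table=vastdb-imported-objects to the query string so the request targets the internal imports-tracking table. A hedged usage sketch (endpoint, credentials, bucket/schema names and the Arrow schema are placeholders):

    import pyarrow as pa
    from vastdb.internal_commands import VastdbApi

    api = VastdbApi("http://vip.example.com", "ACCESS_KEY", "SECRET_KEY")  # placeholder endpoint and credentials
    arrow_schema = pa.schema([('object_name', pa.utf8())])                 # placeholder columns
    # the POST below also carries sub-table=vastdb-imported-objects in its URL parameters
    api.create_table("mybucket", "myschema", "mytable", arrow_schema, create_imports_table=True)
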
@@ -1015,7 +1054,6 @@ class VastdbApi:
             raise RuntimeError(f'invalid params parquet_path={parquet_path} parquet_bucket_name={parquet_bucket_name} parquet_object_name={parquet_object_name}')
 
         # Get the schema of the Parquet file
-        _logger.info(f'type(parquet_ds.schema) = {type(parquet_ds.schema)}')
         if isinstance(parquet_ds.schema, pq.ParquetSchema):
             arrow_schema = parquet_ds.schema.to_arrow_schema()
         elif isinstance(parquet_ds.schema, pa.Schema):
@@ -1038,13 +1076,27 @@ class VastdbApi:
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
         res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=name, command="stats"), headers=headers)
         if res.status_code == 200:
-            res_headers = res.headers
             flatbuf = b''.join(res.iter_content(chunk_size=128))
             stats = get_table_stats.GetRootAs(flatbuf)
             num_rows = stats.NumRows()
             size_in_bytes = stats.SizeInBytes()
             is_external_rowid_alloc = stats.IsExternalRowidAlloc()
-
+            endpoints = []
+            if stats.VipsLength() == 0:
+                endpoints.append(self.url)
+            else:
+                ip_cls = IPv6Address if (stats.AddressType() == "ipv6") else IPv4Address
+                vips = [stats.Vips(i) for i in range(stats.VipsLength())]
+                ips = []
+                # extract the vips into list of IPs
+                for vip in vips:
+                    start_ip = int(ip_cls(vip.StartAddress().decode()))
+                    ips.extend(ip_cls(start_ip + i) for i in range(vip.AddressCount()))
+                for ip in ips:
+                    prefix = "http" if not self.secure else "https"
+                    endpoints.append(f"{prefix}://{str(ip)}:{self.port}")
+            return TableStatsResult(num_rows, size_in_bytes, is_external_rowid_alloc, endpoints)
+
         return self._check_res(res, "get_table_stats", expected_retvals)
 
     def alter_table(self, bucket, schema, name, txid=0, client_tags=[], table_properties="",
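
get_table_stats now converts the VIP ranges reported by the server into concrete endpoints by treating each start address as an integer and counting forward AddressCount() times. The address arithmetic reduces to the following (the start address and count are invented for the example):

    from ipaddress import IPv4Address

    start_address, address_count = "172.25.1.1", 4   # hypothetical VIP range from a stats response
    start_ip = int(IPv4Address(start_address))
    ips = [IPv4Address(start_ip + i) for i in range(address_count)]
    endpoints = [f"http://{ip}:80" for ip in ips]
    print(endpoints)
    # ['http://172.25.1.1:80', 'http://172.25.1.2:80', 'http://172.25.1.3:80', 'http://172.25.1.4:80']
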
@@ -1071,22 +1123,26 @@ class VastdbApi:
 
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
         headers['Content-Length'] = str(len(alter_table_req))
-        url_params = {'tabular-new-table-name': new_name} if len(new_name) else {}
+        url_params = {'tabular-new-table-name': schema + "/" + new_name} if len(new_name) else {}
 
         res = self.session.put(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
                                data=alter_table_req, headers=headers)
 
         return self._check_res(res, "alter_table", expected_retvals)
 
-    def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[]):
+    def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[], remove_imports_table=False):
         """
         DELETE /mybucket/schema_path/mytable?table HTTP/1.1
         tabular-txid: TransactionId
         tabular-client-tag: ClientTag
+
+        To remove the internal vastdb-imported-objects table just set remove_imports_table=True
         """
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
+        url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if remove_imports_table else {}
 
-        res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table"
+        res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
+                                  headers=headers)
         return self._check_res(res, "drop_table", expected_retvals)
 
     def list_tables(self, bucket, schema, txid=0, client_tags=[], max_keys=1000, next_key=0, name_prefix="",
@@ -1210,7 +1266,7 @@ class VastdbApi:
 
     def list_columns(self, bucket, schema, table, *, txid=0, client_tags=None, max_keys=None, next_key=0,
                      count_only=False, name_prefix="", exact_match=False,
-                     expected_retvals=None, bc_list_internals=False):
+                     expected_retvals=None, bc_list_internals=False, list_imports_table=False):
         """
         GET /mybucket/myschema/mytable?columns HTTP/1.1
         tabular-txid: TransactionId
@@ -1218,6 +1274,8 @@ class VastdbApi:
         x-tabluar-name-prefix: TableNamePrefix
         tabular-max-keys: 1000
         tabular-next-key: NextColumnId
+
+        To list the columns of the internal vastdb-imported-objects table, set list_import_table=True
         """
         max_keys = max_keys or self.default_max_list_columns_page_size
         client_tags = client_tags or []
@@ -1235,7 +1293,9 @@ class VastdbApi:
         else:
             headers['tabular-name-prefix'] = name_prefix
 
-
+        url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if list_imports_table else {}
+        res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="column",
+                                                url_params=url_params),
                               headers=headers, stream=True)
         self._check_res(res, "list_columns", expected_retvals)
         if res.status_code == 200:
@@ -1247,9 +1307,7 @@ class VastdbApi:
             if not count_only:
                 schema_buf = b''.join(res.iter_content(chunk_size=128))
                 schema_out = pa.ipc.open_stream(schema_buf).schema
-
-                for f in schema_out:
-                    columns.append([f.name, f.type, f.metadata, f])
+                columns = schema_out
 
         return columns, next_key, is_truncated, count
 
@@ -1296,7 +1354,7 @@ class VastdbApi:
         return self._check_res(res, "get_transaction", expected_retvals)
 
     def select_row_ids(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[],
-                       retry_count=0, enable_sorted_projections=
+                       retry_count=0, enable_sorted_projections=True):
         """
         POST /mybucket/myschema/mytable?query-data=SelectRowIds HTTP/1.1
         """
@@ -1313,7 +1371,7 @@ class VastdbApi:
         return self._check_res(res, "query_data", expected_retvals)
 
     def read_columns_data(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
-                          retry_count=0, enable_sorted_projections=
+                          retry_count=0, enable_sorted_projections=True):
         """
         POST /mybucket/myschema/mytable?query-data=ReadColumns HTTP/1.1
         """
@@ -1329,7 +1387,7 @@ class VastdbApi:
         return self._check_res(res, "query_data", expected_retvals)
 
     def count_rows(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
-                   retry_count=0, enable_sorted_projections=
+                   retry_count=0, enable_sorted_projections=True):
         """
         POST /mybucket/myschema/mytable?query-data=CountRows HTTP/1.1
         """
@@ -1343,27 +1401,9 @@ class VastdbApi:
                                 data=params, headers=headers, stream=True)
         return self._check_res(res, "query_data", expected_retvals)
 
-    def
-
-
-                   request_format='string', response_format='string'):
-        """
-        GET /mybucket/myschema/mytable?data HTTP/1.1
-        Content-Length: ContentLength
-        tabular-txid: TransactionId
-        tabular-client-tag: ClientTag
-        tabular-split: "split_id,total_splits,num_row_groups_per_split"
-        tabular-num-of-subsplits: "total"
-        tabular-request-format: "string"
-        tabular-response-format: "string" #arrow/trino
-        tabular-schedule-id: "schedule-id"
-
-        Request Body (flatbuf)
-        projections_chunk [expressions]
-        predicate_chunk "formatted_data", (required)
-
-        """
-        # add query option select-only and read-only
+    def _build_query_data_headers(self, txid, client_tags, params, split, num_sub_splits, request_format, response_format,
+                                  enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
+                                  sub_split_start_row_ids):
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
         headers['Content-Length'] = str(len(params))
         headers['tabular-split'] = ','.join(map(str, split))
@@ -1388,439 +1428,80 @@ class VastdbApi:
         for sub_split_id, start_row_id in sub_split_start_row_ids:
             headers[f'tabular-start-row-id-{sub_split_id}'] = f"{sub_split_id},{start_row_id}"
 
-
+        return headers
 
-
-
-
+    def _build_query_data_url_params(self, projection, query_imports_table):
+        if query_imports_table and projection:
+            raise ValueError("Can't query both imports and projection table")
 
-
-
-
-
-
-
-        while True:
-            cur_columns, next_key, is_truncated, count = self.list_columns(
-                bucket=bucket, schema=schema, table=table, next_key=next_key, txid=txid)
-            if not cur_columns:
-                break
-            all_listed_columns.extend(cur_columns)
-            if not is_truncated:
-                break
-
-        # build a list of the queried columns
-        queried_column_names = set()
-        if filters:
-            filtered_column_names = ([column_name.split('.')[0] for column_name in filters.keys()]) # use top level of the filter column names
-            queried_column_names.update(filtered_column_names)
-            _logger.debug(f"_list_table_columns: filtered_column_names={filtered_column_names}")
-
-        if field_names:
-            field_column_names = ([column_name.split('.')[0] for column_name in field_names]) # use top level of the field column names
-        else:
-            field_column_names = [column[0] for column in all_listed_columns]
-        _logger.debug(f"_list_table_columns: field_column_names={field_column_names}")
-        queried_column_names.update(field_column_names)
-
-        all_listed_column_and_leaves_names = set()
-        for column in all_listed_columns:
-            # Collect the column and leaves names for verification below that all the filters and field names are in the table
-            column_and_leaves_names = [column[0]] + [f.name for f in column[3].flatten()]
-            all_listed_column_and_leaves_names.update(column_and_leaves_names)
-
-            # check if this column is needed for the query
-            if column[0] in queried_column_names:
-                queried_columns.append(column)
-
-        # verify that all the filters and field names are in the table
-        if filters:
-            for filter_column_name in filters.keys():
-                if filter_column_name not in all_listed_column_and_leaves_names:
-                    raise KeyError((f'filter column name: {filter_column_name} does not appear in the table'))
-        if field_names:
-            for field_name in field_names:
-                if field_name not in all_listed_column_and_leaves_names:
-                    raise ValueError((f'field name: {field_name} does not appear in the table'))
-        return list(queried_columns)
-
-    def _begin_tx_if_necessary(self, txid):
-        if not txid:
-            created_txid = True
-            res = self.begin_transaction()
-            txid = res.headers.get('tabular-txid')
-        else:
-            created_txid = False
+        url_params = {}
+        if query_imports_table:
+            url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
+        elif projection:
+            url_params['name'] = projection
+        return url_params
 
-
+    def legacy_query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
+                          txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
+                          search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
+                          request_format='string', response_format='string', query_imports_table=False):
+        """
+        POST /mybucket/myschema/mytable?query-data=LegacyQueryData HTTP/1.1
+        Content-Length: ContentLength
+        tabular-txid: TransactionId
+        tabular-client-tag: ClientTag
+        tabular-split: "split_id,total_splits,num_row_groups_per_split"
+        tabular-num-of-subsplits: "total"
+        tabular-request-format: "string"
+        tabular-response-format: "string" #arrow/trino
+        tabular-schedule-id: "schedule-id"
 
-
-
-
-        if response_row_id:
-            queried_fields.append(pa.field('$row_id', pa.uint64()))
+        Request Body (flatbuf)
+        projections_chunk [expressions]
+        predicate_chunk "formatted_data", (required)
 
-
-
+        """
+        headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
+                                                 enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
+                                                 sub_split_start_row_ids)
+        url_params = self._build_query_data_url_params(projection, query_imports_table)
 
-
-
+        res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="query-data=LegacyQueryData",
+                                                 url_params=url_params), data=params, headers=headers, stream=True)
+        return self._check_res(res, "legacy_query_data", expected_retvals)
 
-
+    def query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
+                   txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
+                   search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
+                   request_format='string', response_format='string', query_imports_table=False):
+        """
+        GET /mybucket/myschema/mytable?data HTTP/1.1
+        Content-Length: ContentLength
+        tabular-txid: TransactionId
+        tabular-client-tag: ClientTag
+        tabular-split: "split_id,total_splits,num_row_groups_per_split"
+        tabular-num-of-subsplits: "total"
+        tabular-request-format: "string"
+        tabular-response-format: "string" #arrow/trino
+        tabular-schedule-id: "schedule-id"
 
-
-
-
-        else:
-            executor_hosts = [self.host]
-        executor_sessions = [VastdbApi(executor_hosts[i], self.access_key, self.secret_key, self.username,
-                                       self.password, self.port, self.secure, self.auth_type) for i in range(len(executor_hosts))]
-
-        return queried_columns, arrow_schema, query_data_request, executor_sessions
-
-    def _more_pages_exist(self, start_row_ids):
-        for row_id in start_row_ids.values():
-            if row_id != TABULAR_INVALID_ROW_ID:
-                return True
-        return False
-
-    def _query_page(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
-                    txid=0, limit_rows=0, sub_split_start_row_ids=[], filters=None, field_names=None):
-        res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
-                              num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
-                              limit_rows=limit_rows, sub_split_start_row_ids=sub_split_start_row_ids)
-        start_row_ids = {}
-        sub_split_tables = parse_query_data_response(res.raw, query_data_request.response_schema,
-                                                     start_row_ids=start_row_ids)
-        table_page = pa.concat_tables(sub_split_tables)
-        _logger.info("query_page: table_page num_rows=%s start_row_ids len=%s",
-                     len(table_page), len(start_row_ids))
-
-        return table_page, start_row_ids
-
-    def _query_page_iterator(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
-                             txid=0, limit_rows=0, start_row_ids={}, filters=None, field_names=None):
-        res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
-                              num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
-                              limit_rows=limit_rows, sub_split_start_row_ids=start_row_ids.items())
-        for sub_split_table in parse_query_data_response(res.raw, query_data_request.response_schema,
-                                                         start_row_ids=start_row_ids):
-            for record_batch in sub_split_table.to_batches():
-                yield record_batch
-        _logger.info(f"query_page_iterator: start_row_ids={start_row_ids}")
-
-    def query_iterator(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
-                       response_row_id=False, txid=0, limit_per_sub_split=128*1024, filters=None, field_names=None):
-        """
-        query rows into a table.
-
-        Parameters
-        ----------
-        bucket : string
-            The bucket of the table.
-        schema : string
-            The schema of the table.
-        table : string
-            The table name.
-        num_sub_splits : integer
-            The number of sub_splits per split - determines the parallelism inside a VastDB compute node
-            default: 1
-        num_row_groups_per_sub_split : integer
-            The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
-            default: 8
-        response_row_id : boolean
-            Return a column with the internal row ids of the table
-            default: False
-        txid : integer
-            A transaction id. The transaction may be initiated before the query, and if not, the query will initiate it
-            default: 0 (will be created by the api)
-        limit_per_sub_split : integer
-            Limit the number of rows from a single sub_split for a single rpc
-            default:131072
-        filters : dict
-            A dictionary whose keys are column names, and values are lists of string expressions that represent
-            filter conditions on the column. AND is applied on the conditions. The condition formats are:
-            'column_name eq some_value'
-            default: None
-        field_names : list
-            A list of column names to be returned in the output table
-            default: None
-
-        Returns
-        -------
-        Query iterator generator
-
-        Yields
-        ------
-        pyarrow.RecordBatch
-
-        Examples
-        --------
-        for record_batch in query_iterator('some_bucket', 'some_schema', 'some_table',
-                                           filters={'name': ['eq Alice', 'eq Bob']}
-                                           field_names=['name','age']):
-            ...
-
-        """
-
-        # create a transaction if necessary
-        txid, created_txid = self._begin_tx_if_necessary(txid)
-        executor_sessions = []
+        Request Body (flatbuf)
+        projections_chunk [expressions]
+        predicate_chunk "formatted_data", (required)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    if killall:
-                        raise RuntimeError(f'query_iterator_split_id: split_id {split_id} received killall')
-
-                    while self._more_pages_exist(start_row_ids):
-                        for record_batch in session._query_page_iterator(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
-                                                                         split=(split_id, num_splits, num_row_groups_per_sub_split),
-                                                                         num_sub_splits=num_sub_splits, response_row_id=response_row_id,
-                                                                         txid=txid, limit_rows=limit_per_sub_split,
-                                                                         start_row_ids=start_row_ids):
-                            output_queue.put((split_id, record_batch))
-                            while not next_sems[split_id].acquire(timeout=1): # wait for the main thread to request the next record batch
-                                if killall:
-                                    raise RuntimeError(f'split_id {split_id} received killall')
-                    # end of split
-                    output_queue.put((split_id,None))
-
-                except Exception as e:
-                    _logger.exception('query_iterator_split_id: exception occurred')
-                    try:
-                        self.rollback_transaction(txid)
-                    except:
-                        _logger.exception(f'failed to rollback txid {txid}')
-                    error_queue.put(None)
-                    raise e
-
-            # kickoff executors
-            num_splits = len(executor_sessions)
-            output_queue = queue.Queue()
-            error_queue = queue.Queue()
-            next_sems = [threading.Semaphore(value=1) for i in range(num_splits)]
-            killall = False
-            with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
-                # start executors
-                futures = []
-                for i in range(num_splits):
-                    futures.append(executor.submit(query_iterator_split_id, self, i))
-
-                # receive outputs and yield them
-                done_count = 0
-                while done_count < num_splits:
-                    # check for errors
-                    try:
-                        error_queue.get(block=False)
-                        _logger.error('received error from a thread')
-                        killall = True
-                        # wait for all executors to complete
-                        for future in concurrent.futures.as_completed(futures):
-                            try:
-                                future.result() # trigger an exception if occurred in any thread
-                            except Exception:
-                                _logger.exception('exception occurred')
-                        raise RuntimeError('received error from a thread')
-                    except queue.Empty:
-                        pass
-
-                    # try to get a value from the output queue
-                    try:
-                        (split_id, record_batch) = output_queue.get(timeout=1)
-                    except queue.Empty:
-                        continue
-
-                    if record_batch:
-                        # signal to the thread to read the next record batch and yield the current
-                        next_sems[split_id].release()
-                        try:
-                            yield record_batch
-                        except GeneratorExit:
-                            killall = True
-                            _logger.debug("cancelling query_iterator")
-                            raise
-                    else:
-                        done_count += 1
-
-            # wait for all executors to complete
-            for future in concurrent.futures.as_completed(futures):
-                try:
-                    future.result() # trigger an exception if occurred in any thread
-                except Exception:
-                    _logger.exception('exception occurred')
-
-            # commit if needed
-            if created_txid:
-                self.commit_transaction(txid)
-
-        except Exception as e:
-            _logger.exception('exception occurred')
-            try:
-                self.rollback_transaction(txid)
-            except:
-                _logger.exception(f'failed to rollback txid {txid}')
-            raise e
-
-        finally:
-            killall = True
-            for session in executor_sessions:
-                try:
-                    session.session.close()
-                except Exception:
-                    _logger.exception(f'failed to close session {session}')
-
-    def query(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
-              response_row_id=False, txid=0, limit=0, limit_per_sub_split=131072, filters=None, field_names=None,
-              queried_columns=None):
-        """
-        query rows into a table.
-
-        Parameters
-        ----------
-        bucket : string
-            The bucket of the table.
-        schema : string
-            The schema of the table.
-        table : string
-            The table name.
-        num_sub_splits : integer
-            The number of sub_splits per split - determines the parallelism inside a VastDB compute node
-            default: 1
-        num_row_groups_per_sub_split : integer
-            The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
-            default: 8
-        response_row_id : boolean
-            Return a column with the internal row ids of the table
-            default: False
-        txid : integer
-            A transaction id. The transaction may be initiated before the query, and be used to provide
-            multiple ACID operations
-            default: 0 (will be created by the api)
-        limit : integer
-            Limit the number of rows in the response
-            default: 0 (no limit)
-        limit_per_sub_split : integer
-            Limit the number of rows from a single sub_split for a single rpc
-            default:131072
-        filters : dict
-            A dictionary whose keys are column names, and values are lists of string expressions that represent
-            filter conditions on the column. AND is applied on the conditions. The condition formats are:
-            'column_name eq some_value'
-            default: None
-        field_names : list
-            A list of column names to be returned to the output table
-            default: None
-        queried_columns: list of pyArrow.column
-            A list of the columns to be queried
-            default: None
-
-        Returns
-        -------
-        pyarrow.Table
-
-
-        Examples
-        --------
-        table = query('some_bucket', 'some_schema', 'some_table',
-                      filters={'name': ['eq Alice', 'eq Bob']}
-                      field_names=['name','age'])
-
-        """
-
-        # create a transaction
-        txid, created_txid = self._begin_tx_if_necessary(txid)
-        executor_sessions = []
-        try:
-            # prepare query
-            queried_columns, arrow_schema, query_data_request, executor_sessions = \
-                self._prepare_query(bucket, schema, table, num_sub_splits, filters, field_names, response_row_id=response_row_id, txid=txid)
-
-            # define the per split threaded query func
-            def query_split_id(self, split_id):
-                try:
-                    start_row_ids = {i:0 for i in range(num_sub_splits)}
-                    session = executor_sessions[split_id]
-                    row_count = 0
-                    while (self._more_pages_exist(start_row_ids) and
-                           (not limit or row_count < limit)):
-                        # check if killed externally
-                        if killall:
-                            raise RuntimeError(f'query_split_id: split_id {split_id} received killall')
-
-                        # determine the limit rows
-                        if limit:
-                            limit_rows = min(limit_per_sub_split, limit-row_count)
-                        else:
-                            limit_rows = limit_per_sub_split
-
-                        # query one page
-                        table_page, start_row_ids = session._query_page(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
-                                                                        split=(split_id, num_splits, num_row_groups_per_sub_split),
-                                                                        num_sub_splits=num_sub_splits, response_row_id=response_row_id,
-                                                                        txid=txid, limit_rows=limit_rows,
-                                                                        sub_split_start_row_ids=start_row_ids.items())
-                        with lock:
-                            table_pages.append(table_page)
-                            row_counts[split_id] += len(table_page)
-                            row_count = sum(row_counts)
-                        _logger.info(f"query_split_id: table_pages split_id={split_id} row_count={row_count}")
-                except Exception as e:
-                    _logger.exception('query_split_id: exception occurred')
-                    try:
-                        self.rollback_transaction(txid)
-                    except:
-                        _logger.exception(f'failed to rollback txid {txid}')
-                    raise e
-
-            table_pages = []
-            num_splits = len(executor_sessions)
-            killall = False
-            with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
-                futures = []
-                row_counts = [0] * num_splits
-                lock = threading.Lock()
-                for i in range(num_splits):
-                    futures.append(executor.submit(query_split_id, self, i))
-                for future in concurrent.futures.as_completed(futures):
-                    future.result() # trigger an exception if occurred in any thread
-
-            # commit if needed
-            if created_txid:
-                self.commit_transaction(txid)
-
-            # concatenate all table pages and return result
-            out_table = pa.concat_tables(table_pages)
-            out_table = out_table.slice(length=limit) if limit else out_table
-            _logger.info("query: out_table len=%s row_count=%s",
-                         len(out_table), len(out_table))
-            return out_table
-
-        except Exception as e:
-            _logger.exception('exception occurred')
-            try:
-                self.rollback_transaction(txid)
-            except:
-                _logger.exception(f'failed to rollback txid {txid}')
-            raise e
-
-        finally:
-            killall = True
-            for session in executor_sessions:
-                try:
-                    session.session.close()
-                except Exception:
-                    _logger.exception(f'failed to close session {session}')
+        To query the internal vastdb-imported-objects table, set query_imports_table=True
+        """
+        # add query option select-only and read-only
+
+        headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
+                                                 enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
+                                                 sub_split_start_row_ids)
+
+        url_params = self._build_query_data_url_params(projection, query_imports_table)
+
+        res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data", url_params=url_params),
+                               data=params, headers=headers, stream=True)
+        return self._check_res(res, "query_data", expected_retvals)
 
         """
         source_files: list of (bucket_name, file_name)
@@ -1874,21 +1555,22 @@ class VastdbApi:
         builder.Finish(params)
         import_req = builder.Output()
 
-        def iterate_over_import_data_response(response
+        def iterate_over_import_data_response(response):
             if response.status_code != 200:
                 return response
 
             chunk_size = 1024
-            for chunk in
+            for chunk in response.iter_content(chunk_size=chunk_size):
                 chunk_dict = json.loads(chunk)
-                _logger.
-                if chunk_dict['res']
-
-
-
-
-
-
+                _logger.debug("import data chunk=%s, result: %s", chunk_dict, chunk_dict['res'])
+                if chunk_dict['res'] != 'Success' and chunk_dict['res'] != 'TabularInProgress' and chunk_dict['res'] != 'TabularAlreadyImported':
+                    raise errors.ImportFilesError(
+                        f"Encountered an error during import_data. status: {chunk_dict['res']}, "
+                        f"error message: {chunk_dict['err_msg'] or 'Unexpected error'} during import of "
+                        f"object name: {chunk_dict['object_name']}", chunk_dict)
+                else:
+                    _logger.debug("import_data of object name '%s' is in progress. "
+                                  "status: %s", chunk_dict['object_name'], chunk_dict['res'])
             return response
 
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
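When `blocking=True`, `import_data` consumes the response as a stream of small JSON documents, one per imported object, and fails fast on any status other than Success, TabularInProgress or TabularAlreadyImported. A standalone sketch of that loop over simulated chunks (the failure status string is invented for illustration; the real client raises `errors.ImportFilesError`):

```python
import json

# Simulated response chunks: each chunk is one JSON document describing one object.
chunks = [
    b'{"res": "TabularInProgress", "err_msg": "", "object_name": "a.parquet"}',
    b'{"res": "Success", "err_msg": "", "object_name": "a.parquet"}',
    # Hypothetical failure status, for illustration only.
    b'{"res": "TabularImportError", "err_msg": "bad footer", "object_name": "b.parquet"}',
]

OK_STATUSES = {"Success", "TabularInProgress", "TabularAlreadyImported"}

for chunk in chunks:
    chunk_dict = json.loads(chunk)
    if chunk_dict["res"] not in OK_STATUSES:
        # vastdb maps this case to errors.ImportFilesError; RuntimeError keeps the sketch standalone.
        raise RuntimeError(
            f"import of {chunk_dict['object_name']} failed: "
            f"{chunk_dict['res']} ({chunk_dict['err_msg'] or 'Unexpected error'})")
    print(chunk_dict["object_name"], chunk_dict["res"])
```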
@@ -1901,34 +1583,17 @@ class VastdbApi:
         res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data"),
                                 data=import_req, headers=headers, stream=True)
         if blocking:
-            res = iterate_over_import_data_response(res
+            res = iterate_over_import_data_response(res)
 
         return self._check_res(res, "import_data", expected_retvals)
 
-    def merge_data(self):
-        """
-        TODO
-
-        POST /mybucket/myschema/mytable?data HTTP/1.1
-        Content-Length: ContentLength
-        tabular-txid: TransactionId
-        tabular-client-tag: ClientTag
-
-        Request Body
-        {
-            "format": "string",
-            "select_source": "formatted data"
-            "predicate": "formatted_data"
-        }
-        """
-        pass
-
     def _record_batch_slices(self, batch, rows_per_slice=None):
         max_slice_size_in_bytes = int(0.9*5*1024*1024) # 0.9 * 5MB
         batch_len = len(batch)
         serialized_batch = serialize_record_batch(batch)
         batch_size_in_bytes = len(serialized_batch)
-        _logger.
+        _logger.debug('max_slice_size_in_bytes=%d batch_len=%d batch_size_in_bytes=%d',
+                      max_slice_size_in_bytes, batch_len, batch_size_in_bytes)
 
         if not rows_per_slice:
             if batch_size_in_bytes < max_slice_size_in_bytes:
@@ -1950,7 +1615,7 @@ class VastdbApi:
             serialized_slice_batch = serialize_record_batch(slice_batch)
             sizeof_serialized_slice_batch = len(serialized_slice_batch)
 
-            if sizeof_serialized_slice_batch <= max_slice_size_in_bytes
+            if sizeof_serialized_slice_batch <= max_slice_size_in_bytes:
                 serialized_slices.append(serialized_slice_batch)
             else:
                 _logger.info(f'Using rows_per_slice {rows_per_slice} slice {i} size {sizeof_serialized_slice_batch} exceeds {max_slice_size_in_bytes} bytes, trying smaller rows_per_slice')
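`_record_batch_slices` keeps each serialized slice under roughly 0.9 x 5 MB by picking a rows-per-slice count from the measured IPC size, and retries with fewer rows if a slice still overflows. A sketch of that sizing rule using plain pyarrow IPC serialization (`slice_batch` is an illustrative helper under those assumptions, not the library's implementation):

```python
import pyarrow as pa


def serialize_record_batch(batch):
    # Measure the on-the-wire size by writing the batch as an Arrow IPC stream.
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, batch.schema) as writer:
        writer.write_batch(batch)
    return sink.getvalue().to_pybytes()


def slice_batch(batch, max_bytes=int(0.9 * 5 * 1024 * 1024)):
    total_bytes = len(serialize_record_batch(batch))
    if total_bytes <= max_bytes or len(batch) == 0:
        return [batch]
    # Assume roughly uniform row size; the real code shrinks rows_per_slice further
    # if a serialized slice still exceeds the cap.
    rows_per_slice = max(1, int(len(batch) * max_bytes / total_bytes))
    return [batch.slice(offset, rows_per_slice)
            for offset in range(0, len(batch), rows_per_slice)]


batch = pa.RecordBatch.from_pydict({"x": list(range(100_000)),
                                    "y": ["some text"] * 100_000})
print([len(s) for s in slice_batch(batch, max_bytes=256 * 1024)])
```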
@@ -1964,125 +1629,6 @@ class VastdbApi:
 
         return serialized_slices
 
-    def insert(self, bucket, schema, table, rows=None, record_batch=None, rows_per_insert=None, txid=0):
-        """
-        Insert rows into a table. The operation may be split into multiple commands, such that by default no more than 512KB will be inserted per command.
-
-        Parameters
-        ----------
-        bucket : string
-            The bucket of the table.
-        schema : string
-            The schema of the table.
-        table : string
-            The table name.
-        rows : dict
-            The rows to insert.
-            dictionary key: column name
-            dictionary value: array of cell values to insert
-            default: None (if None, record_batch must be provided)
-        record_batch : pyarrow.RecordBatch
-            A pyarrow RecordBatch
-            default: None (if None, rows dictionary must be provided)
-        rows_per_insert : integer
-            Split the operation so that each insert command will be limited to this value
-            default: None (will be selected automatically)
-        txid : integer
-            A transaction id. The transaction may be initiated before the insert, and be used to provide
-            multiple ACID operations
-            default: 0 (will be created by the api)
-
-        Returns
-        -------
-        None
-
-
-        Examples
-        --------
-        insert('some_bucket', 'some_schema', 'some_table', {'name': ['Alice','Bob'], 'age': [25,24]})
-
-        """
-        if (not rows and not record_batch) or (rows and record_batch):
-            raise ValueError(f'insert: missing argument - either rows or record_batch must be provided')
-
-        # create a transaction
-        txid, created_txid = self._begin_tx_if_necessary(txid)
-
-        if rows:
-            columns = self._list_table_columns(bucket, schema, table, field_names=rows.keys(), txid=txid)
-            columns_dict = dict([(column[0], column[1]) for column in columns])
-            arrow_schema = pa.schema([])
-            arrays = []
-            for column_name, column_values in rows.items():
-                column_type = columns_dict[column_name]
-                field = pa.field(column_name, column_type)
-                arrow_schema = arrow_schema.append(field)
-                arrays.append(pa.array(column_values, column_type))
-            record_batch = pa.record_batch(arrays, arrow_schema)
-
-        # split the record batch into multiple slices
-        serialized_slices = self._record_batch_slices(record_batch, rows_per_insert)
-        _logger.info(f'inserting record batch using {len(serialized_slices)} slices')
-
-        insert_queue = queue.Queue()
-
-        [insert_queue.put(insert_rows_req) for insert_rows_req in serialized_slices]
-
-        try:
-            executor_sessions = [VastdbApi(self.executor_hosts[i], self.access_key, self.secret_key, self.username,
-                                           self.password, self.port, self.secure, self.auth_type) for i in range(len(self.executor_hosts))]
-
-            def insert_executor(self, split_id):
-
-                try:
-                    _logger.info(f'insert_executor split_id={split_id} starting')
-                    session = executor_sessions[split_id]
-                    num_inserts = 0
-                    while not killall:
-                        try:
-                            insert_rows_req = insert_queue.get(block=False)
-                        except queue.Empty:
-                            break
-                        session.insert_rows(bucket=bucket, schema=schema,
-                                            table=table, record_batch=insert_rows_req, txid=txid)
-                        num_inserts += 1
-                    _logger.info(f'insert_executor split_id={split_id} num_inserts={num_inserts}')
-                    if killall:
-                        _logger.info('insert_executor killall=True')
-
-                except Exception as e:
-                    _logger.exception('insert_executor hit exception')
-                    raise e
-
-            num_splits = len(executor_sessions)
-            killall = False
-            with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
-                futures = []
-                for i in range(num_splits):
-                    futures.append(executor.submit(insert_executor, self, i))
-                for future in concurrent.futures.as_completed(futures):
-                    future.result() # trigger an exception if occurred in any thread
-
-            # commit if needed
-            if created_txid:
-                self.commit_transaction(txid)
-
-        except Exception as e:
-            _logger.exception('exception occurred')
-            try:
-                self.rollback_transaction(txid)
-            except:
-                _logger.exception(f'failed to rollback txid {txid}')
-            raise e
-
-        finally:
-            killall = True
-            for session in executor_sessions:
-                try:
-                    session.session.close()
-                except Exception:
-                    _logger.exception(f'failed to close session {session}')
-
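The removed `insert()` helper accepted a plain dict of column values, looked up each column's Arrow type and packed everything into a RecordBatch before slicing and fanning out the inserts. A pyarrow-only sketch of that conversion step, with the column types hard-coded here as an assumption (the real code resolved them via `_list_table_columns()`):

```python
import pyarrow as pa

# Column values keyed by column name, as the removed insert() accepted them.
rows = {'name': ['Alice', 'Bob'], 'age': [25, 24]}

# Hypothetical Arrow types standing in for the table's real column types.
column_types = {'name': pa.utf8(), 'age': pa.int64()}

arrays, fields = [], []
for column_name, column_values in rows.items():
    column_type = column_types[column_name]
    fields.append(pa.field(column_name, column_type))
    arrays.append(pa.array(column_values, column_type))

record_batch = pa.record_batch(arrays, schema=pa.schema(fields))
print(record_batch.num_rows, record_batch.schema.names)
```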
     def insert_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[]):
         """
         POST /mybucket/myschema/mytable?rows HTTP/1.1
@@ -2352,41 +1898,40 @@ def _iter_query_data_response_columns(fileobj, stream_ids=None):
         if stream_ids is not None:
             stream_ids.update([stream_id]) # count stream IDs using a collections.Counter
         if stream_id == TABULAR_KEEP_ALIVE_STREAM_ID:
-            # _logger.info(f"stream_id={stream_id} (skipping)")
             continue
 
         if stream_id == TABULAR_QUERY_DATA_COMPLETED_STREAM_ID:
             # read the terminating end chunk from socket
             res = fileobj.read()
-            _logger.
+            _logger.debug("stream_id=%d res=%s (finish)", stream_id, res)
             return
 
         if stream_id == TABULAR_QUERY_DATA_FAILED_STREAM_ID:
             # read the terminating end chunk from socket
             res = fileobj.read()
-            _logger.
+            _logger.warning("stream_id=%d res=%s (failed)", stream_id, res)
             raise IOError(f"Query data stream failed res={res}")
 
         next_row_id_bytes = fileobj.read(8)
         next_row_id, = struct.unpack('<Q', next_row_id_bytes)
-        _logger.
+        _logger.debug("stream_id=%d next_row_id=%d", stream_id, next_row_id)
 
         if stream_id not in readers:
            # we implicitly read 1st message (Arrow schema) when constructing RecordBatchStreamReader
             reader = pa.ipc.RecordBatchStreamReader(fileobj)
-            _logger.
+            _logger.debug("stream_id=%d schema=%s", stream_id, reader.schema)
             readers[stream_id] = (reader, [])
             continue
 
         (reader, batches) = readers[stream_id]
         try:
             batch = reader.read_next_batch() # read single-column chunk data
-            _logger.
+            _logger.debug("stream_id=%d rows=%d chunk=%s", stream_id, len(batch), batch)
             batches.append(batch)
         except StopIteration: # we got an end-of-stream IPC message for a given stream ID
             reader, batches = readers.pop(stream_id) # end of column
             table = pa.Table.from_batches(batches) # concatenate all column chunks (as a single)
-            _logger.
+            _logger.debug("stream_id=%d rows=%d column=%s", stream_id, len(table), table)
             yield (stream_id, next_row_id, table)
 
 
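Each per-stream payload in the query-data response is a regular Arrow IPC stream: a schema message followed by single-column record batches, terminated by an end-of-stream marker that surfaces as StopIteration. A self-contained sketch of that reassembly step, leaving out the stream-ID and 8-byte row-id framing the real parser handles:

```python
import io

import pyarrow as pa

# Build a fake single-column Arrow IPC stream, shaped like one stream's payload.
column = pa.table({"num": pa.array(range(10), pa.int64())})
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, column.schema) as writer:
    for batch in column.to_batches(max_chunksize=4):  # several small chunks
        writer.write_batch(batch)
ipc_bytes = sink.getvalue().to_pybytes()

# Reassemble it the way the generator above does: the reader consumes the schema
# message up front, then chunks are collected until StopIteration ends the stream.
fileobj = io.BytesIO(ipc_bytes)
reader = pa.ipc.RecordBatchStreamReader(fileobj)
batches = []
while True:
    try:
        batches.append(reader.read_next_batch())
    except StopIteration:
        break
print(pa.Table.from_batches(batches).num_rows)
```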
@@ -2415,7 +1960,8 @@ def parse_query_data_response(conn, schema, stream_ids=None, start_row_ids=None,
         if is_empty_projection: # VAST returns an empty RecordBatch, with the correct rows' count
             parsed_table = table
 
-        _logger.
+        _logger.debug("stream_id=%d rows=%d next_row_id=%d table=%s",
+                      stream_id, len(parsed_table), next_row_id, parsed_table)
         start_row_ids[stream_id] = next_row_id
         yield parsed_table # the result of a single "select_rows()" cycle
 
@@ -2564,7 +2110,6 @@ def get_field_type(builder: flatbuffers.Builder, field: pa.Field):
     return field_type, field_type_type
 
 def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
-    _logger.info(f"name={f.name}")
     children = None
     if isinstance(f.type, pa.StructType):
         children = [build_field(builder, child, child.name) for child in list(f.type)]
@@ -2591,7 +2136,6 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
         fb_field.AddName(builder, child_col_name)
         fb_field.AddChildren(builder, children)
 
-        _logger.info(f"added key and map to entries")
         children = [fb_field.End(builder)]
 
     if children is not None:
@@ -2602,13 +2146,11 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
 
     col_name = builder.CreateString(name)
     field_type, field_type_type = get_field_type(builder, f)
-    _logger.info(f"add col_name={name} type_type={field_type_type} to fb")
     fb_field.Start(builder)
     fb_field.AddName(builder, col_name)
     fb_field.AddTypeType(builder, field_type_type)
     fb_field.AddType(builder, field_type)
     if children is not None:
-        _logger.info(f"add col_name={name} childern")
         fb_field.AddChildren(builder, children)
     return fb_field.End(builder)
 
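`build_field` recurses into `pa.StructType` children when serializing the schema into flatbuffers. The flatbuffers part needs the generated vast_flatbuf modules, so here is only a pyarrow-level sketch of the same traversal, flattening nested fields into dotted leaf names (an illustrative helper, not the library's code):

```python
import pyarrow as pa


def iter_leaf_names(field, prefix=""):
    """Yield dotted leaf names by recursing into struct children, as build_field recurses."""
    full_name = prefix + field.name
    if isinstance(field.type, pa.StructType):
        for child in list(field.type):
            yield from iter_leaf_names(child, prefix=full_name + ".")
    else:
        yield full_name


schema = pa.schema([
    pa.field("id", pa.int64()),
    pa.field("address", pa.struct([("city", pa.utf8()), ("zip", pa.int32())])),
])
for field in schema:
    print(list(iter_leaf_names(field)))
```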
@@ -2625,9 +2167,7 @@ class QueryDataRequest:
         self.response_schema = response_schema
 
 
-def build_query_data_request(schema: 'pa.Schema' = pa.schema([]),
-    filters = filters or {}
-
+def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), predicate: ibis.expr.types.BooleanColumn = None, field_names: list = None):
     builder = flatbuffers.Builder(1024)
 
     source_name = builder.CreateString('') # required
@@ -2643,7 +2183,7 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
     fb_schema.AddFields(builder, fields)
     schema_obj = fb_schema.End(builder)
 
-    predicate = Predicate(schema,
+    predicate = Predicate(schema=schema, expr=predicate)
     filter_obj = predicate.serialize(builder)
 
     parser = QueryDataParser(schema)
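The new signature replaces the old dict-based filters with an ibis boolean expression (`predicate: ibis.expr.types.BooleanColumn`), which the `Predicate` class then serializes against the table's Arrow schema. A minimal sketch of building such an expression; the column names and types below are assumptions for illustration and would normally mirror the VAST table's schema:

```python
import ibis

# Stand-in table definition; in real use the schema should match the queried table.
t = ibis.table([("name", "string"), ("age", "int32")], name="people")

# A BooleanColumn expression of the kind build_query_data_request now accepts.
predicate = (t.age > 25) & (t.name != "Bob")
print(predicate)
```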
@@ -2654,10 +2194,8 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
             continue
         iter_from_root = reversed(list(descendent._iter_to_root()))
         descendent_full_name = '.'.join([n.field.name for n in iter_from_root])
-        _logger.debug(f'build_query_data_request: descendent_full_name={descendent_full_name}')
         descendent_leaves = [leaf.index for leaf in descendent._iter_leaves()]
         leaves_map[descendent_full_name] = descendent_leaves
-        _logger.debug(f'build_query_data_request: leaves_map={leaves_map}')
 
     output_field_names = None
     if field_names is None:
@@ -2668,13 +2206,11 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
         def compare_field_names_by_pos(field_name1, field_name2):
             return leaves_map[field_name1][0]-leaves_map[field_name2][0]
         field_names = sorted(field_names, key=cmp_to_key(compare_field_names_by_pos))
-        _logger.debug(f'build_query_data_request: sorted field_names={field_names} schema={schema}')
 
     projection_fields = []
     projection_positions = []
     for field_name in field_names:
         positions = leaves_map[field_name]
-        _logger.info("projecting field=%s positions=%s", field_name, positions)
         projection_positions.extend(positions)
         for leaf_position in positions:
             fb_field_index.Start(builder)
@@ -2731,11 +2267,9 @@ def convert_column_types(table: 'pa.Table') -> 'pa.Table':
             indexes_of_fields_to_change[field.name] = index
     for changing_index in ts_indexes:
         field_name = table.schema[changing_index].name
-        _logger.info(f'changing resolution for {field_name} to us')
         new_column = table[field_name].cast(pa.timestamp('us'), safe=False)
         table = table.set_column(changing_index, field_name, new_column)
     for field_name, changing_index in indexes_of_fields_to_change.items():
-        _logger.info(f'applying custom rules to {field_name}')
         new_column = table[field_name].to_pylist()
         new_column = list(map(column_matcher[field_name], new_column))
         new_column = pa.array(new_column, table[field_name].type)
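The timestamp normalization kept in `convert_column_types` downcasts timestamp columns to microsecond resolution with an unsafe cast and swaps the column back into the table. A minimal standalone illustration of that step in plain pyarrow:

```python
import pyarrow as pa

# One nanosecond-resolution timestamp column, as it might arrive from a query response.
table = pa.table({"ts": pa.array([1_700_000_000_000_000_001], type=pa.timestamp("ns"))})

idx = table.schema.get_field_index("ts")
# Downcast to microseconds, allowing lossy truncation, mirroring the cast shown above.
table = table.set_column(idx, "ts", table["ts"].cast(pa.timestamp("us"), safe=False))
print(table.schema)
```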