vastdb 0.0.5.3__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
- vast_flatbuf/tabular/VipRange.py +56 -0
- vastdb/__init__.py +7 -0
- vastdb/bench/test_perf.py +29 -0
- vastdb/bucket.py +85 -0
- vastdb/{tests/conftest.py → conftest.py} +29 -14
- vastdb/errors.py +175 -0
- vastdb/{api.py → internal_commands.py} +373 -875
- vastdb/schema.py +85 -0
- vastdb/session.py +47 -0
- vastdb/table.py +483 -0
- vastdb/tests/test_imports.py +123 -0
- vastdb/tests/test_nested.py +28 -0
- vastdb/tests/test_projections.py +42 -0
- vastdb/tests/test_sanity.py +34 -15
- vastdb/tests/test_schemas.py +30 -6
- vastdb/tests/test_tables.py +628 -13
- vastdb/tests/util.py +18 -0
- vastdb/transaction.py +54 -0
- vastdb/util.py +11 -10
- vastdb-0.1.1.dist-info/METADATA +38 -0
- {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/RECORD +26 -31
- vast_protobuf/substrait/__init__.py +0 -0
- vast_protobuf/substrait/algebra_pb2.py +0 -1344
- vast_protobuf/substrait/capabilities_pb2.py +0 -46
- vast_protobuf/substrait/ddl_pb2.py +0 -57
- vast_protobuf/substrait/extended_expression_pb2.py +0 -49
- vast_protobuf/substrait/extensions/__init__.py +0 -0
- vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
- vast_protobuf/substrait/function_pb2.py +0 -168
- vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
- vast_protobuf/substrait/plan_pb2.py +0 -67
- vast_protobuf/substrait/type_expressions_pb2.py +0 -198
- vast_protobuf/substrait/type_pb2.py +0 -350
- vast_protobuf/tabular/__init__.py +0 -0
- vast_protobuf/tabular/rpc_pb2.py +0 -344
- vastdb/bench_scan.py +0 -45
- vastdb/tests/test_create_table_from_parquets.py +0 -50
- vastdb/v2.py +0 -360
- vastdb-0.0.5.3.dist-info/METADATA +0 -47
- {vast_protobuf → vastdb/bench}/__init__.py +0 -0
- {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/LICENSE +0 -0
- {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/WHEEL +0 -0
- {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/top_level.txt +0 -0
@@ -1,29 +1,23 @@
-import
+import itertools
+import json
 import logging
+import math
+import re
 import struct
 import urllib.parse
 from collections import defaultdict, namedtuple
-from datetime import datetime
 from enum import Enum
-from
-import
-
-import threading
-import queue
-import math
-import socket
-from functools import cmp_to_key
-import pyarrow.parquet as pq
+from ipaddress import IPv4Address, IPv6Address
+from typing import Iterator, Optional, Union
+
 import flatbuffers
+import ibis
 import pyarrow as pa
+import pyarrow.parquet as pq
 import requests
-import
-import
-import hmac
-import json
-import itertools
+import urllib3
+import xmltodict
 from aws_requests_auth.aws_auth import AWSRequestsAuth
-from io import BytesIO
 
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BinaryLiteral as fb_binary_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BooleanLiteral as fb_bool_lit
@@ -35,10 +29,10 @@ import vast_flatbuf.org.apache.arrow.computeir.flatbuf.FieldIndex as fb_field_in
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.FieldRef as fb_field_ref
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Float32Literal as fb_float32_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Float64Literal as fb_float64_lit
+import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int8Literal as fb_int8_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int16Literal as fb_int16_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int32Literal as fb_int32_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int64Literal as fb_int64_lit
-import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int8Literal as fb_int8_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Literal as fb_literal
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Relation as fb_relation
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.RelationImpl as rel_impl
@@ -51,38 +45,47 @@ import vast_flatbuf.org.apache.arrow.flatbuf.Bool as fb_bool
 import vast_flatbuf.org.apache.arrow.flatbuf.Date as fb_date
 import vast_flatbuf.org.apache.arrow.flatbuf.Decimal as fb_decimal
 import vast_flatbuf.org.apache.arrow.flatbuf.Field as fb_field
+import vast_flatbuf.org.apache.arrow.flatbuf.FixedSizeBinary as fb_fixed_size_binary
 import vast_flatbuf.org.apache.arrow.flatbuf.FloatingPoint as fb_floating_point
 import vast_flatbuf.org.apache.arrow.flatbuf.Int as fb_int
-import vast_flatbuf.org.apache.arrow.flatbuf.Schema as fb_schema
-import vast_flatbuf.org.apache.arrow.flatbuf.Time as fb_time
-import vast_flatbuf.org.apache.arrow.flatbuf.Struct_ as fb_struct
 import vast_flatbuf.org.apache.arrow.flatbuf.List as fb_list
 import vast_flatbuf.org.apache.arrow.flatbuf.Map as fb_map
-import vast_flatbuf.org.apache.arrow.flatbuf.
+import vast_flatbuf.org.apache.arrow.flatbuf.Schema as fb_schema
+import vast_flatbuf.org.apache.arrow.flatbuf.Struct_ as fb_struct
+import vast_flatbuf.org.apache.arrow.flatbuf.Time as fb_time
 import vast_flatbuf.org.apache.arrow.flatbuf.Timestamp as fb_timestamp
 import vast_flatbuf.org.apache.arrow.flatbuf.Utf8 as fb_utf8
 import vast_flatbuf.tabular.AlterColumnRequest as tabular_alter_column
+import vast_flatbuf.tabular.AlterProjectionTableRequest as tabular_alter_projection
 import vast_flatbuf.tabular.AlterSchemaRequest as tabular_alter_schema
 import vast_flatbuf.tabular.AlterTableRequest as tabular_alter_table
-import vast_flatbuf.tabular.
+import vast_flatbuf.tabular.Column as tabular_projecion_column
+import vast_flatbuf.tabular.ColumnType as tabular_proj_column_type
+import vast_flatbuf.tabular.CreateProjectionRequest as tabular_create_projection
 import vast_flatbuf.tabular.CreateSchemaRequest as tabular_create_schema
 import vast_flatbuf.tabular.ImportDataRequest as tabular_import_data
 import vast_flatbuf.tabular.S3File as tabular_s3_file
-import vast_flatbuf.tabular.CreateProjectionRequest as tabular_create_projection
-import vast_flatbuf.tabular.Column as tabular_projecion_column
-import vast_flatbuf.tabular.ColumnType as tabular_proj_column_type
-
 from vast_flatbuf.org.apache.arrow.computeir.flatbuf.Deref import Deref
-from vast_flatbuf.org.apache.arrow.computeir.flatbuf.ExpressionImpl import
+from vast_flatbuf.org.apache.arrow.computeir.flatbuf.ExpressionImpl import (
+    ExpressionImpl,
+)
 from vast_flatbuf.org.apache.arrow.computeir.flatbuf.LiteralImpl import LiteralImpl
 from vast_flatbuf.org.apache.arrow.flatbuf.DateUnit import DateUnit
 from vast_flatbuf.org.apache.arrow.flatbuf.TimeUnit import TimeUnit
 from vast_flatbuf.org.apache.arrow.flatbuf.Type import Type
+from vast_flatbuf.tabular.GetProjectionTableStatsResponse import (
+    GetProjectionTableStatsResponse as get_projection_table_stats,
+)
+from vast_flatbuf.tabular.GetTableStatsResponse import (
+    GetTableStatsResponse as get_table_stats,
+)
+from vast_flatbuf.tabular.ListProjectionsResponse import (
+    ListProjectionsResponse as list_projections,
+)
 from vast_flatbuf.tabular.ListSchemasResponse import ListSchemasResponse as list_schemas
 from vast_flatbuf.tabular.ListTablesResponse import ListTablesResponse as list_tables
-
-from
-from vast_flatbuf.tabular.ListProjectionsResponse import ListProjectionsResponse as list_projections
+
+from . import errors
 
 UINT64_MAX = 18446744073709551615
 
@@ -91,30 +94,22 @@ TABULAR_QUERY_DATA_COMPLETED_STREAM_ID = 0xFFFFFFFF - 1
 TABULAR_QUERY_DATA_FAILED_STREAM_ID = 0xFFFFFFFF - 2
 TABULAR_INVALID_ROW_ID = 0xFFFFFFFFFFFF # (1<<48)-1
 ESTORE_INVALID_EHANDLE = UINT64_MAX
+IMPORTED_OBJECTS_TABLE_NAME = "vastdb-imported-objects"
 
 """
 S3 Tabular API
 """
 
 
-
-    log = logging.getLogger(name)
-    log.setLevel(logging.ERROR)
-    ch = logging.StreamHandler()
-    ch.setLevel(logging.INFO)
-    ch.set_name('tabular_stream_handler')
-    formatter = logging.Formatter("%(asctime)s:%(levelname)s:%(message)s")
-    ch.setFormatter(formatter)
-    log.addHandler(ch)
-    log.propagate = False
-    return log
-
+_logger = logging.getLogger(__name__)
 
-_logger = get_logger(__name__)
 
-
-
-
+def _flatten_args(op, op_type):
+    if isinstance(op, op_type):
+        for arg in op.args:
+            yield from _flatten_args(arg, op_type)
+    else:
+        yield op
 
 
 class AuthType(Enum):
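Note: the new module-level `_flatten_args` generator recursively unnests chained boolean operations (for example nested ibis `And`/`Or` nodes) into a flat argument list. A minimal sketch of that behaviour, using a hypothetical stand-in op type rather than real ibis nodes:

```python
from dataclasses import dataclass

@dataclass
class FakeAnd:            # hypothetical stand-in for an ibis And op exposing .args
    args: tuple

def _flatten_args(op, op_type):
    # same shape as the helper added in the hunk above
    if isinstance(op, op_type):
        for arg in op.args:
            yield from _flatten_args(arg, op_type)
    else:
        yield op

nested = FakeAnd((FakeAnd(("a", "b")), "c"))
assert list(_flatten_args(nested, FakeAnd)) == ["a", "b", "c"]
```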
@@ -123,10 +118,6 @@ class AuthType(Enum):
     BASIC = "basic"
 
 
-class TabularException(Exception):
-    pass
-
-
 def get_unit_to_flatbuff_time_unit(type):
     unit_to_flatbuff_time_unit = {
         'ns': TimeUnit.NANOSECOND,
@@ -137,18 +128,10 @@ def get_unit_to_flatbuff_time_unit(type):
     return unit_to_flatbuff_time_unit[type]
 
 class Predicate:
-
-        'ns': 1_000_000,
-        'us': 1_000,
-        'ms': 1,
-        's': 0.001
-    }
-
-    def __init__(self, schema: 'pa.Schema', filters: dict):
+    def __init__(self, schema: 'pa.Schema', expr: ibis.expr.types.BooleanColumn):
         self.schema = schema
-        self.
+        self.expr = expr
         self.builder = None
-        self._field_name_per_index = None
 
     def get_field_indexes(self, field: 'pa.Field', field_name_per_index: list) -> None:
         field_name_per_index.append(field.name)
@@ -172,7 +155,6 @@ class Predicate:
         for field in self.schema:
             self.get_field_indexes(field, _field_name_per_index)
         self._field_name_per_index = {field: index for index, field in enumerate(_field_name_per_index)}
-        _logger.debug(f'field_name_per_index: {self._field_name_per_index}')
         return self._field_name_per_index
 
     def get_projections(self, builder: 'flatbuffers.builder.Builder', field_names: list = None):
@@ -190,10 +172,87 @@ class Predicate:
         return builder.EndVector()
 
     def serialize(self, builder: 'flatbuffers.builder.Builder'):
+        from ibis.expr.operations.generic import IsNull, Literal, TableColumn
+        from ibis.expr.operations.logical import (
+            And,
+            Equals,
+            Greater,
+            GreaterEqual,
+            Less,
+            LessEqual,
+            Not,
+            NotEquals,
+            Or,
+        )
+        from ibis.expr.operations.strings import StringContains
+
+        builder_map = {
+            Greater: self.build_greater,
+            GreaterEqual: self.build_greater_equal,
+            Less: self.build_less,
+            LessEqual: self.build_less_equal,
+            Equals: self.build_equal,
+            NotEquals: self.build_not_equal,
+            IsNull: self.build_is_null,
+            Not: self.build_is_not_null,
+            StringContains: self.build_match_substring,
+        }
+
+        positions_map = dict((f.name, index) for index, f in enumerate(self.schema)) # TODO: BFS
+
         self.builder = builder
+
         offsets = []
-
-
+
+        if self.expr is not None:
+            and_args = list(_flatten_args(self.expr.op(), And))
+            _logger.debug('AND args: %s ops %s', and_args, self.expr.op())
+            for op in and_args:
+                or_args = list(_flatten_args(op, Or))
+                _logger.debug('OR args: %s op %s', or_args, op)
+                inner_offsets = []
+
+                prev_field_name = None
+                for inner_op in or_args:
+                    _logger.debug('inner_op %s', inner_op)
+                    builder_func = builder_map.get(type(inner_op))
+                    if not builder_func:
+                        raise NotImplementedError(inner_op.name)
+
+                    if builder_func == self.build_is_null:
+                        column, = inner_op.args
+                        literal = None
+                    elif builder_func == self.build_is_not_null:
+                        not_arg, = inner_op.args
+                        # currently we only support not is_null, checking we really got is_null under the not:
+                        if not builder_map.get(type(not_arg)) == self.build_is_null:
+                            raise NotImplementedError(not_arg.args[0].name)
+                        column, = not_arg.args
+                        literal = None
+                    else:
+                        column, literal = inner_op.args
+                        if not isinstance(literal, Literal):
+                            raise NotImplementedError(inner_op.name)
+
+                    if not isinstance(column, TableColumn):
+                        raise NotImplementedError(inner_op.name)
+
+                    field_name = column.name
+                    if prev_field_name is None:
+                        prev_field_name = field_name
+                    elif prev_field_name != field_name:
+                        raise NotImplementedError(op.name)
+
+                    args_offsets = [self.build_column(position=positions_map[field_name])]
+                    if literal:
+                        field = self.schema.field(field_name)
+                        args_offsets.append(self.build_literal(field=field, value=literal.value))
+
+                    inner_offsets.append(builder_func(*args_offsets))
+
+                domain_offset = self.build_or(inner_offsets)
+                offsets.append(domain_offset)
+
         return self.build_and(offsets)
 
     def build_column(self, position: int):
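`Predicate.serialize` now walks an ibis boolean expression instead of the old `filters` dict: the top-level `And` is split into per-column `Or` groups, and each leaf must compare a single `TableColumn` against a `Literal` (or be an is-null / not-is-null / substring check). A hedged sketch of an expression shape the new code accepts, assuming a recent ibis version (table and column names are invented):

```python
import ibis

t = ibis.table([("age", "int64"), ("name", "string")], name="t")

# One top-level AND of per-column OR groups; each OR group references a single
# column, matching the prev_field_name check in Predicate.serialize.
expr = (t.age > 30) & ((t.name == "Alice") | (t.name.contains("Bo")))
```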
@@ -221,7 +280,6 @@ class Predicate:
         field = self.schema.field(field_name)
         for attr in field_attrs:
             field = field.type[attr]
-        _logger.info(f'trying to append field: {field} with domains: {filters}')
         for filter_by_name in filters:
             offsets.append(self.build_range(column=column, field=field, filter_by_name=filter_by_name))
         return self.build_or(offsets)
@@ -263,11 +321,9 @@ class Predicate:
         return self.build_and(rules)
 
     def build_function(self, name: str, *offsets):
-        _logger.info(f'name: {name}, offsets: {offsets}')
         offset_name = self.builder.CreateString(name)
         fb_call.StartArgumentsVector(self.builder, len(offsets))
         for offset in reversed(offsets):
-            _logger.info(f'offset: {offset}')
             self.builder.PrependUOffsetTRelative(offset)
         offset_arguments = self.builder.EndVector()
 
@@ -282,7 +338,7 @@ class Predicate:
         fb_expression.AddImpl(self.builder, offset_call)
         return fb_expression.End(self.builder)
 
-    def build_literal(self, field: pa.Field, value
+    def build_literal(self, field: pa.Field, value):
         if field.type.equals(pa.int64()):
             literal_type = fb_int64_lit
             literal_impl = LiteralImpl.Int64Literal
@@ -356,7 +412,7 @@ class Predicate:
             field_type = fb_utf8.End(self.builder)
 
             value = self.builder.CreateString(value)
-        elif field.type.equals(pa.date32()): # pa.date64()
+        elif field.type.equals(pa.date32()): # pa.date64() is not supported
             literal_type = fb_date32_lit
             literal_impl = LiteralImpl.DateLiteral
 
@@ -364,38 +420,49 @@ class Predicate:
             fb_date.Start(self.builder)
             fb_date.AddUnit(self.builder, DateUnit.DAY)
             field_type = fb_date.End(self.builder)
-
-            start_date = datetime.fromtimestamp(0).date()
-            date_value = datetime.strptime(value, '%Y-%m-%d').date()
-            date_delta = date_value - start_date
-            value = date_delta.days
+            value, = pa.array([value], field.type).cast(pa.int32()).to_pylist()
         elif isinstance(field.type, pa.TimestampType):
             literal_type = fb_timestamp_lit
             literal_impl = LiteralImpl.TimestampLiteral
 
+            if field.type.equals(pa.timestamp('s')):
+                unit = TimeUnit.SECOND
+            if field.type.equals(pa.timestamp('ms')):
+                unit = TimeUnit.MILLISECOND
+            if field.type.equals(pa.timestamp('us')):
+                unit = TimeUnit.MICROSECOND
+            if field.type.equals(pa.timestamp('ns')):
+                unit = TimeUnit.NANOSECOND
+
             field_type_type = Type.Timestamp
             fb_timestamp.Start(self.builder)
-            fb_timestamp.AddUnit(self.builder,
+            fb_timestamp.AddUnit(self.builder, unit)
             field_type = fb_timestamp.End(self.builder)
-
-
-        elif field.type.equals(pa.time32('s')) or field.type.equals(pa.time32('ms')) or field.type.equals(pa.time64('us')) or field.type.equals(pa.time64('ns')):
-
+            value, = pa.array([value], field.type).cast(pa.int64()).to_pylist()
+        elif isinstance(field.type, (pa.Time32Type, pa.Time64Type)):
             literal_type = fb_time_lit
             literal_impl = LiteralImpl.TimeLiteral
 
-
-
-
-
+            if field.type.equals(pa.time32('s')):
+                target_type = pa.int32()
+                unit = TimeUnit.SECOND
+            if field.type.equals(pa.time32('ms')):
+                target_type = pa.int32()
+                unit = TimeUnit.MILLISECOND
+            if field.type.equals(pa.time64('us')):
+                target_type = pa.int64()
+                unit = TimeUnit.MICROSECOND
+            if field.type.equals(pa.time64('ns')):
+                target_type = pa.int64()
+                unit = TimeUnit.NANOSECOND
 
             field_type_type = Type.Time
             fb_time.Start(self.builder)
             fb_time.AddBitWidth(self.builder, field.type.bit_width)
-            fb_time.AddUnit(self.builder,
+            fb_time.AddUnit(self.builder, unit)
             field_type = fb_time.End(self.builder)
 
-            value =
+            value, = pa.array([value], field.type).cast(target_type).to_pylist()
         elif field.type.equals(pa.bool_()):
             literal_type = fb_bool_lit
             literal_impl = LiteralImpl.BooleanLiteral
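The date/timestamp/time branches above now normalize the Python literal by round-tripping it through a one-element Arrow array and casting to the underlying integer type, replacing the previous manual epoch arithmetic. A standalone illustration (values are made up):

```python
import datetime
import pyarrow as pa

# date32 -> days since the Unix epoch
days, = pa.array([datetime.date(2024, 1, 2)], pa.date32()).cast(pa.int32()).to_pylist()

# timestamp('us') -> microseconds since the Unix epoch
micros, = pa.array([datetime.datetime(2024, 1, 2, 3, 4, 5)], pa.timestamp('us')).cast(pa.int64()).to_pylist()

# time64('us') -> microseconds since midnight
tod, = pa.array([datetime.time(1, 2, 3)], pa.time64('us')).cast(pa.int64()).to_pylist()
```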
@@ -426,7 +493,7 @@ class Predicate:
             fb_binary.Start(self.builder)
             field_type = fb_binary.End(self.builder)
 
-            value = self.builder.CreateByteVector(value
+            value = self.builder.CreateByteVector(value)
         else:
             raise ValueError(f'unsupported predicate for type={field.type}, value={value}')
 
@@ -459,6 +526,9 @@ class Predicate:
     def build_equal(self, column: int, literal: int):
         return self.build_function('equal', column, literal)
 
+    def build_not_equal(self, column: int, literal: int):
+        return self.build_function('not_equal', column, literal)
+
     def build_greater(self, column: int, literal: int):
         return self.build_function('greater', column, literal)
 
@@ -477,6 +547,9 @@ class Predicate:
     def build_is_not_null(self, column: int):
         return self.build_function('is_valid', column)
 
+    def build_match_substring(self, column: int, literal: int):
+        return self.build_function('match_substring', column, literal)
+
 
 class FieldNode:
     """Helper class for representing nested Arrow fields and handling QueryData requests"""
@@ -506,8 +579,6 @@ class FieldNode:
         # will be set during by the parser (see below)
         self.buffers = None # a list of Arrow buffers (https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout)
         self.length = None # each array must have it's length specified (https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.from_buffers)
-        self.is_projected = False
-        self.projected_field = self.field
 
     def _iter_to_root(self) -> Iterator['FieldNode']:
         yield self
@@ -528,15 +599,13 @@ class FieldNode:
         for child in self.children:
             yield from child._iter_leaves()
 
-    def
+    def _iter_leaves(self) -> Iterator['FieldNode']:
         """Generate only leaf nodes (i.e. columns having scalar types)."""
         if not self.children:
-
-            yield self
+            yield self
         else:
             for child in self.children:
-
-                yield from child._iter_projected_leaves()
+                yield from child._iter_leaves()
 
     def debug_log(self, level=0):
         """Recursively dump this node state to log."""
@@ -573,28 +642,17 @@ class FieldNode:
 
     def build(self) -> pa.Array:
         """Construct an Arrow array from the collected buffers (recursively)."""
-        children = self.children and [node.build() for node in self.children
-
-                      f'self.projected_field.type={self.projected_field.type}, self.length={self.length} '
-                      f'self.buffers={self.buffers} children={children}')
-        result = pa.Array.from_buffers(self.projected_field.type, self.length, buffers=self.buffers, children=children)
+        children = self.children and [node.build() for node in self.children]
+        result = pa.Array.from_buffers(self.type, self.length, buffers=self.buffers, children=children)
         if self.debug:
             _logger.debug('%s result=%s', self.field, result)
         return result
 
-    def build_projected_field(self):
-        if isinstance(self.type, pa.StructType):
-            [child.build_projected_field() for child in self.children if child.is_projected]
-            self.projected_field = pa.field(self.field.name,
-                                            pa.struct([child.projected_field for child in self.children if child.is_projected]),
-                                            self.field.nullable,
-                                            self.field.metadata)
 
 class QueryDataParser:
     """Used to parse VAST QueryData RPC response."""
-    def __init__(self, arrow_schema: pa.Schema, *, debug=False
+    def __init__(self, arrow_schema: pa.Schema, *, debug=False):
         self.arrow_schema = arrow_schema
-        self.projection_positions = projection_positions
         index = itertools.count() # used to generate leaf column positions for VAST QueryData RPC
         self.nodes = [FieldNode(field, index, debug=debug) for field in arrow_schema]
         self.debug = debug
@@ -602,27 +660,15 @@ class QueryDataParser:
            for node in self.nodes:
                node.debug_log()
        self.leaves = [leaf for node in self.nodes for leaf in node._iter_leaves()]
-        _logger.debug(f'QueryDataParser: self.leaves = {[(leaf.field.name, leaf.index) for leaf in self.leaves]}')
-        self.mark_projected_nodes()
-        [node.build_projected_field() for node in self.nodes]
-        self.projected_leaves = [leaf for node in self.nodes for leaf in node._iter_projected_leaves()]
-        _logger.debug(f'QueryDataParser: self.projected_leaves = {[(leaf.field.name, leaf.index) for leaf in self.projected_leaves]}')
 
        self.leaf_offset = 0
 
-    def mark_projected_nodes(self):
-        for leaf in self.leaves:
-            if self.projection_positions is None or leaf.index in self.projection_positions:
-                for node in leaf._iter_to_root():
-                    node.is_projected = True
-                    _logger.debug(f'mark_projected_nodes node.field.name={node.field.name}')
-
     def parse(self, column: pa.Array):
         """Parse a single column response from VAST (see FieldNode.set for details)"""
-        if not self.leaf_offset < len(self.
+        if not self.leaf_offset < len(self.leaves):
             raise ValueError(f'self.leaf_offset: {self.leaf_offset} are not < '
                              f'than len(self.leaves): {len(self.leaves)}')
-        leaf = self.
+        leaf = self.leaves[self.leaf_offset]
 
         # A column response may be sent in multiple chunks, therefore we need to combine
         # it into a single chunk to allow reconstruction using `Array.from_buffers()`.
@@ -643,32 +689,19 @@ class QueryDataParser:
 
         self.leaf_offset += 1
 
-    def build(self
+    def build(self) -> Optional[pa.Table]:
         """Try to build the resulting Table object (if all columns were parsed)"""
-        if self.
-
-            return None
-        else:
-            if self.leaf_offset < len(self.leaves):
-                return None
+        if self.leaf_offset < len(self.leaves):
+            return None
 
         if self.debug:
             for node in self.nodes:
                 node.debug_log()
 
-        # sort resulting table according to the output field names
-        projected_nodes = [node for node in self.nodes if node.is_projected]
-        if output_field_names is not None:
-            def key_func(projected_node):
-                return output_field_names.index(projected_node.field.name)
-            sorted_projected_nodes = sorted(projected_nodes, key=key_func)
-        else:
-            sorted_projected_nodes = projected_nodes
-
         result = pa.Table.from_arrays(
-            arrays=[node.build() for node in
-            schema
-        result.validate(full=
+            arrays=[node.build() for node in self.nodes],
+            schema=self.arrow_schema)
+        result.validate(full=self.debug) # does expensive validation checks only if debug is enabled
         return result
 
 def _iter_nested_arrays(column: pa.Array) -> Iterator[pa.Array]:
@@ -693,7 +726,6 @@ def _parse_table_info(obj):
     return TableInfo(name, properties, handle, num_rows, used_bytes)
 
 def build_record_batch(column_info, column_values):
-    _logger.info(f"column_info={column_info}")
     fields = [pa.field(column_name, column_type) for column_type, column_name in column_info]
     schema = pa.schema(fields)
     arrays = [pa.array(column_values[column_type], type=column_type) for column_type, _ in column_info]
@@ -706,56 +738,30 @@ def serialize_record_batch(batch):
     writer.write(batch)
     return sink.getvalue()
 
-
-
-    start_parts = start.split('.')
-    start_last_part = int(start_parts[-1])
-    end_parts = end.split('.')
-    end_last_part = int(end_parts[-1])
-    if start_last_part>=end_last_part or True in [start_parts[i] != end_parts[i] for i in range(3)]:
-        raise ValueError(f'illegal ip range {ip_range_str}')
-    num_ips = 1 + end_last_part - start_last_part
-    ips = ['.'.join(start_parts[:-1] + [str(start_last_part + i)]) for i in range(num_ips)]
-    return ips
-
-def parse_executor_hosts(host):
-    executor_hosts_parsed = host.split(',')
-    executor_hosts_parsed = [host.strip() for host in executor_hosts_parsed]
-    executor_hosts = []
-    for executor_host in executor_hosts_parsed:
-        is_ip_range=False
-        if ':' in executor_host:
-            try:
-                socket.inet_aton(executor_host.split(':')[0])
-                socket.inet_aton(executor_host.split(':')[1])
-                is_ip_range = True
-            except:
-                pass
-        if is_ip_range:
-            executor_hosts.extend(generate_ip_range(executor_host))
-        else:
-            executor_hosts.append(executor_host)
-    return executor_hosts
+# Results that returns from tablestats
+TableStatsResult = namedtuple("TableStatsResult",["num_rows", "size_in_bytes", "is_external_rowid_alloc", "endpoints"])
 
 class VastdbApi:
-
+    # we expect the vast version to be <major>.<minor>.<patch>.<protocol>
+    VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')
+
+    def __init__(self, endpoint, access_key, secret_key, username=None, password=None,
                  secure=False, auth_type=AuthType.SIGV4):
-
-        host = executor_hosts[0]
-        self.host = host
+        url_dict = urllib3.util.parse_url(endpoint)._asdict()
         self.access_key = access_key
         self.secret_key = secret_key
         self.username = username
         self.password = password
-        self.port = port
         self.secure = secure
         self.auth_type = auth_type
-        self.executor_hosts =
+        self.executor_hosts = [endpoint] # TODO: remove
 
         username = username or ''
         password = password or ''
-        if not port:
-            port = 443 if secure else 80
+        if not url_dict['port']:
+            url_dict['port'] = 443 if secure else 80
+
+        self.port = url_dict['port']
 
         self.default_max_list_columns_page_size = 1000
         self.session = requests.Session()
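`VastdbApi.__init__` now takes a single `endpoint` URL and derives host, port and scheme from it with `urllib3.util.parse_url`, replacing the old `host`/`port` pair and the removed IP-range parsing helpers. A small sketch of that parsing with the same defaulting (the hostname is a placeholder):

```python
import urllib3

secure = False
url_dict = urllib3.util.parse_url("http://vip-pool.example.com")._asdict()
if not url_dict['port']:
    url_dict['port'] = 443 if secure else 80       # same defaulting as __init__
if not url_dict['scheme']:
    url_dict['scheme'] = "https" if secure else "http"
print(str(urllib3.util.Url(**url_dict)))           # -> http://vip-pool.example.com:80
```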
@@ -764,10 +770,10 @@ class VastdbApi:
         if auth_type == AuthType.BASIC:
             self.session.auth = requests.auth.HTTPBasicAuth(username, password)
         else:
-            if port != 80 and port != 443:
-                self.aws_host =
+            if url_dict['port'] != 80 and url_dict['port'] != 443:
+                self.aws_host = '{host}:{port}'.format(**url_dict)
             else:
-                self.aws_host =
+                self.aws_host = '{host}'.format(**url_dict)
 
             self.session.auth = AWSRequestsAuth(aws_access_key=access_key,
                                                 aws_secret_access_key=secret_key,
@@ -775,8 +781,34 @@ class VastdbApi:
                                                 aws_region='us-east-1',
                                                 aws_service='s3')
 
-
-
+        if not url_dict['scheme']:
+            url_dict['scheme'] = "https" if secure else "http"
+
+        url = urllib3.util.Url(**url_dict)
+        self.url = str(url)
+        _logger.debug('url=%s aws_host=%s', self.url, self.aws_host)
+
+        # probe the cluster for its version
+        self.vast_version = None
+        res = self.session.options(self.url)
+        server_header = res.headers.get("Server")
+        if server_header is None:
+            _logger.error("OPTIONS response doesn't contain 'Server' header")
+        else:
+            _logger.debug("Server header is '%s'", server_header)
+            if m := self.VAST_VERSION_REGEX.match(server_header):
+                self.vast_version, = m.groups()
+                return
+            else:
+                _logger.error("'Server' header '%s' doesn't match the expected pattern", server_header)
+
+        msg = (
+            f'Please use `vastdb` <= 0.0.5.x with current VAST cluster version ("{server_header or "N/A"}"). '
+            'To use the latest SDK, please upgrade your cluster to the latest service pack. '
+            'Please contact customer.support@vastdata.com for more details.'
+        )
+        _logger.critical(msg)
+        raise NotImplementedError(msg)
 
     def update_mgmt_session(self, access_key: str, secret_key: str, auth_type=AuthType.SIGV4):
         if auth_type != AuthType.BASIC:
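On construction the client now sends an `OPTIONS` request and parses the `Server` response header to discover the cluster version, raising `NotImplementedError` for clusters that predate the new API. The header matching in isolation (the header value shown is hypothetical):

```python
import re

VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')

server_header = "vast 4.7.0.100"        # hypothetical OPTIONS 'Server' header
if m := VAST_VERSION_REGEX.match(server_header):
    vast_version, = m.groups()           # -> '4.7.0.100'
else:
    raise NotImplementedError(f"unsupported cluster: {server_header!r}")
```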
@@ -821,21 +853,9 @@ class VastdbApi:
         return common_headers
 
     def _check_res(self, res, cmd="", expected_retvals=[]):
-
-
-
-            if not res.status_code in expected_retvals:
-                raise ValueError(f"Expected status code mismatch. status_code={res.status_code}")
-            else:
-                if not len(expected_retvals) == 0:
-                    raise ValueError(f"Expected {expected_retvals} but status_code={res.status_code}")
-            return res
-        except requests.HTTPError as e:
-            if res.status_code in expected_retvals:
-                _logger.info(f"{cmd} has failed as expected res={res}")
-                return res
-            else:
-                raise e
+        if exc := errors.from_response(res):
+            raise exc
+        return res
 
     def create_schema(self, bucket, name, txid=0, client_tags=[], schema_properties="", expected_retvals=[]):
         """
@@ -975,7 +995,8 @@ class VastdbApi:
         return snapshots, is_truncated, marker
 
 
-    def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[],
+    def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[],
+                     topic_partitions=0, create_imports_table=False):
         """
         Create a table, use the following request
         POST /bucket/schema/table?table HTTP/1.1
@@ -984,18 +1005,21 @@ class VastdbApi:
         tabular-txid: <integer> TransactionId
         tabular-client-tag: <string> ClientTag
 
-        The body of the POST request contains table column properties as
-
-
-
-
-
+        The body of the POST request contains table column properties as arrow schema
+        which include field_name, field_type and properties
+
+        In order to create vastdb-imported-objects table that tracks all imported files and avoid duplicate imports,
+        just set create_imports_table=True
+        The request will look like:
+        POST /bucket/schema/table?table&sub-table=vastdb-imported-objects HTTP/1.1
         """
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
 
         serialized_schema = arrow_schema.serialize()
         headers['Content-Length'] = str(len(serialized_schema))
         url_params = {'topic_partitions': str(topic_partitions)} if topic_partitions else {}
+        if create_imports_table:
+            url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
 
         res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
                                 data=serialized_schema, headers=headers)
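With `create_imports_table=True`, the same request also creates the internal `vastdb-imported-objects` sub-table by appending `sub-table=vastdb-imported-objects` to the query string. A hedged usage sketch (endpoint, credentials and names are placeholders, and the call needs a reachable cluster):

```python
import pyarrow as pa

api = VastdbApi("http://vip.example.com", "ACCESS_KEY", "SECRET_KEY")
columns = pa.schema([("id", pa.int64()), ("name", pa.utf8())])
api.create_table("mybucket", "myschema", "mytable", columns,
                 create_imports_table=True)  # also creates vastdb-imported-objects
```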
@@ -1015,7 +1039,6 @@ class VastdbApi:
             raise RuntimeError(f'invalid params parquet_path={parquet_path} parquet_bucket_name={parquet_bucket_name} parquet_object_name={parquet_object_name}')
 
         # Get the schema of the Parquet file
-        _logger.info(f'type(parquet_ds.schema) = {type(parquet_ds.schema)}')
         if isinstance(parquet_ds.schema, pq.ParquetSchema):
             arrow_schema = parquet_ds.schema.to_arrow_schema()
         elif isinstance(parquet_ds.schema, pa.Schema):
@@ -1038,13 +1061,27 @@ class VastdbApi:
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
         res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=name, command="stats"), headers=headers)
         if res.status_code == 200:
-            res_headers = res.headers
             flatbuf = b''.join(res.iter_content(chunk_size=128))
             stats = get_table_stats.GetRootAs(flatbuf)
             num_rows = stats.NumRows()
             size_in_bytes = stats.SizeInBytes()
             is_external_rowid_alloc = stats.IsExternalRowidAlloc()
-
+            endpoints = []
+            if stats.VipsLength() == 0:
+                endpoints.append(self.url)
+            else:
+                ip_cls = IPv6Address if (stats.AddressType() == "ipv6") else IPv4Address
+                vips = [stats.Vips(i) for i in range(stats.VipsLength())]
+                ips = []
+                # extract the vips into list of IPs
+                for vip in vips:
+                    start_ip = int(ip_cls(vip.StartAddress().decode()))
+                    ips.extend(ip_cls(start_ip + i) for i in range(vip.AddressCount()))
+                for ip in ips:
+                    prefix = "http" if not self.secure else "https"
+                    endpoints.append(f"{prefix}://{str(ip)}:{self.port}")
+            return TableStatsResult(num_rows, size_in_bytes, is_external_rowid_alloc, endpoints)
+
         return self._check_res(res, "get_table_stats", expected_retvals)
 
     def alter_table(self, bucket, schema, name, txid=0, client_tags=[], table_properties="",
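`get_table_stats` now returns a `TableStatsResult` that includes cluster endpoints, expanding each VIP range from the flatbuffer (start address plus address count) into concrete URLs. The expansion logic in isolation (addresses are made up):

```python
from ipaddress import IPv4Address

start_address, address_count = "172.25.1.1", 4    # as reported by one VipRange entry
secure, port = False, 80

start_ip = int(IPv4Address(start_address))
prefix = "https" if secure else "http"
endpoints = [f"{prefix}://{IPv4Address(start_ip + i)}:{port}" for i in range(address_count)]
# -> ['http://172.25.1.1:80', 'http://172.25.1.2:80', 'http://172.25.1.3:80', 'http://172.25.1.4:80']
```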
@@ -1071,22 +1108,26 @@ class VastdbApi:
 
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
         headers['Content-Length'] = str(len(alter_table_req))
-        url_params = {'tabular-new-table-name': new_name} if len(new_name) else {}
+        url_params = {'tabular-new-table-name': schema + "/" + new_name} if len(new_name) else {}
 
         res = self.session.put(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
                                data=alter_table_req, headers=headers)
 
         return self._check_res(res, "alter_table", expected_retvals)
 
-    def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[]):
+    def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[], remove_imports_table=False):
         """
         DELETE /mybucket/schema_path/mytable?table HTTP/1.1
         tabular-txid: TransactionId
         tabular-client-tag: ClientTag
+
+        To remove the internal vastdb-imported-objects table just set remove_imports_table=True
         """
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
+        url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if remove_imports_table else {}
 
-        res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table"
+        res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
+                                  headers=headers)
         return self._check_res(res, "drop_table", expected_retvals)
 
     def list_tables(self, bucket, schema, txid=0, client_tags=[], max_keys=1000, next_key=0, name_prefix="",
@@ -1210,7 +1251,7 @@ class VastdbApi:
 
     def list_columns(self, bucket, schema, table, *, txid=0, client_tags=None, max_keys=None, next_key=0,
                      count_only=False, name_prefix="", exact_match=False,
-                     expected_retvals=None, bc_list_internals=False):
+                     expected_retvals=None, bc_list_internals=False, list_imports_table=False):
         """
         GET /mybucket/myschema/mytable?columns HTTP/1.1
         tabular-txid: TransactionId
@@ -1218,6 +1259,8 @@ class VastdbApi:
         x-tabluar-name-prefix: TableNamePrefix
         tabular-max-keys: 1000
         tabular-next-key: NextColumnId
+
+        To list the columns of the internal vastdb-imported-objects table, set list_import_table=True
         """
         max_keys = max_keys or self.default_max_list_columns_page_size
         client_tags = client_tags or []
@@ -1235,7 +1278,9 @@ class VastdbApi:
         else:
             headers['tabular-name-prefix'] = name_prefix
 
-
+        url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if list_imports_table else {}
+        res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="column",
+                                                url_params=url_params),
                                headers=headers, stream=True)
         self._check_res(res, "list_columns", expected_retvals)
         if res.status_code == 200:
@@ -1247,9 +1292,7 @@ class VastdbApi:
             if not count_only:
                 schema_buf = b''.join(res.iter_content(chunk_size=128))
                 schema_out = pa.ipc.open_stream(schema_buf).schema
-
-                for f in schema_out:
-                    columns.append([f.name, f.type, f.metadata, f])
+                columns = schema_out
 
         return columns, next_key, is_truncated, count
 
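`list_columns` now returns the decoded Arrow schema itself (`columns = schema_out`) instead of a list of `[name, type, metadata, field]` entries, so callers iterate `pa.Field` objects directly. For example (a toy schema standing in for the RPC response):

```python
import pyarrow as pa

schema_out = pa.schema([("id", pa.int64()), ("name", pa.utf8())])  # stands in for pa.ipc.open_stream(buf).schema
for field in schema_out:
    print(field.name, field.type, field.metadata)
```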
@@ -1296,7 +1339,7 @@ class VastdbApi:
         return self._check_res(res, "get_transaction", expected_retvals)
 
     def select_row_ids(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[],
-                       retry_count=0, enable_sorted_projections=
+                       retry_count=0, enable_sorted_projections=True):
         """
         POST /mybucket/myschema/mytable?query-data=SelectRowIds HTTP/1.1
         """
@@ -1313,7 +1356,7 @@ class VastdbApi:
         return self._check_res(res, "query_data", expected_retvals)
 
     def read_columns_data(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
-                          retry_count=0, enable_sorted_projections=
+                          retry_count=0, enable_sorted_projections=True):
         """
         POST /mybucket/myschema/mytable?query-data=ReadColumns HTTP/1.1
         """
@@ -1329,7 +1372,7 @@ class VastdbApi:
         return self._check_res(res, "query_data", expected_retvals)
 
     def count_rows(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
-                   retry_count=0, enable_sorted_projections=
+                   retry_count=0, enable_sorted_projections=True):
         """
         POST /mybucket/myschema/mytable?query-data=CountRows HTTP/1.1
         """
@@ -1343,27 +1386,9 @@ class VastdbApi:
                                 data=params, headers=headers, stream=True)
         return self._check_res(res, "query_data", expected_retvals)
 
-    def
-
-
-                   request_format='string', response_format='string'):
-        """
-        GET /mybucket/myschema/mytable?data HTTP/1.1
-        Content-Length: ContentLength
-        tabular-txid: TransactionId
-        tabular-client-tag: ClientTag
-        tabular-split: "split_id,total_splits,num_row_groups_per_split"
-        tabular-num-of-subsplits: "total"
-        tabular-request-format: "string"
-        tabular-response-format: "string" #arrow/trino
-        tabular-schedule-id: "schedule-id"
-
-        Request Body (flatbuf)
-        projections_chunk [expressions]
-        predicate_chunk "formatted_data", (required)
-
-        """
-        # add query option select-only and read-only
+    def _build_query_data_headers(self, txid, client_tags, params, split, num_sub_splits, request_format, response_format,
+                                  enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
+                                  sub_split_start_row_ids):
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
         headers['Content-Length'] = str(len(params))
         headers['tabular-split'] = ','.join(map(str, split))
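The new `_build_query_data_headers` helper centralizes the QueryData header assembly shared by `query_data` and `legacy_query_data`; the `split` tuple and the per-sub-split start row ids are serialized as comma-separated header values, as the context lines show. In isolation:

```python
split = (0, 1, 8)                   # (split_id, total_splits, num_row_groups_per_split)
sub_split_start_row_ids = [(0, 0), (1, 0)]

headers = {'tabular-split': ','.join(map(str, split))}            # -> '0,1,8'
for sub_split_id, start_row_id in sub_split_start_row_ids:
    headers[f'tabular-start-row-id-{sub_split_id}'] = f"{sub_split_id},{start_row_id}"
```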
@@ -1388,439 +1413,80 @@ class VastdbApi:
         for sub_split_id, start_row_id in sub_split_start_row_ids:
             headers[f'tabular-start-row-id-{sub_split_id}'] = f"{sub_split_id},{start_row_id}"
 
-
+        return headers
 
-
-
-
+    def _build_query_data_url_params(self, projection, query_imports_table):
+        if query_imports_table and projection:
+            raise ValueError("Can't query both imports and projection table")
 
-
-
-
-
-
-
-        while True:
-            cur_columns, next_key, is_truncated, count = self.list_columns(
-                bucket=bucket, schema=schema, table=table, next_key=next_key, txid=txid)
-            if not cur_columns:
-                break
-            all_listed_columns.extend(cur_columns)
-            if not is_truncated:
-                break
-
-        # build a list of the queried columns
-        queried_column_names = set()
-        if filters:
-            filtered_column_names = ([column_name.split('.')[0] for column_name in filters.keys()]) # use top level of the filter column names
-            queried_column_names.update(filtered_column_names)
-            _logger.debug(f"_list_table_columns: filtered_column_names={filtered_column_names}")
-
-        if field_names:
-            field_column_names = ([column_name.split('.')[0] for column_name in field_names]) # use top level of the field column names
-        else:
-            field_column_names = [column[0] for column in all_listed_columns]
-        _logger.debug(f"_list_table_columns: field_column_names={field_column_names}")
-        queried_column_names.update(field_column_names)
-
-        all_listed_column_and_leaves_names = set()
-        for column in all_listed_columns:
-            # Collect the column and leaves names for verification below that all the filters and field names are in the table
-            column_and_leaves_names = [column[0]] + [f.name for f in column[3].flatten()]
-            all_listed_column_and_leaves_names.update(column_and_leaves_names)
-
-            # check if this column is needed for the query
-            if column[0] in queried_column_names:
-                queried_columns.append(column)
-
-        # verify that all the filters and field names are in the table
-        if filters:
-            for filter_column_name in filters.keys():
-                if filter_column_name not in all_listed_column_and_leaves_names:
-                    raise KeyError((f'filter column name: {filter_column_name} does not appear in the table'))
-        if field_names:
-            for field_name in field_names:
-                if field_name not in all_listed_column_and_leaves_names:
-                    raise ValueError((f'field name: {field_name} does not appear in the table'))
-        return list(queried_columns)
-
-    def _begin_tx_if_necessary(self, txid):
-        if not txid:
-            created_txid = True
-            res = self.begin_transaction()
-            txid = res.headers.get('tabular-txid')
-        else:
-            created_txid = False
+        url_params = {}
+        if query_imports_table:
+            url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
+        elif projection:
+            url_params['name'] = projection
+        return url_params
 
-
+    def legacy_query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
+                          txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
+                          search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
+                          request_format='string', response_format='string', query_imports_table=False):
+        """
+        POST /mybucket/myschema/mytable?query-data=LegacyQueryData HTTP/1.1
+        Content-Length: ContentLength
+        tabular-txid: TransactionId
+        tabular-client-tag: ClientTag
+        tabular-split: "split_id,total_splits,num_row_groups_per_split"
+        tabular-num-of-subsplits: "total"
+        tabular-request-format: "string"
+        tabular-response-format: "string" #arrow/trino
+        tabular-schedule-id: "schedule-id"
+
+        Request Body (flatbuf)
+        projections_chunk [expressions]
+        predicate_chunk "formatted_data", (required)
+
+        """
+        headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
+                                                 enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
+                                                 sub_split_start_row_ids)
+        url_params = self._build_query_data_url_params(projection, query_imports_table)
+
+        res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="query-data=LegacyQueryData",
+                                                 url_params=url_params), data=params, headers=headers, stream=True)
+        return self._check_res(res, "legacy_query_data", expected_retvals)
 
-    def
-
-
-
-
+    def query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
+                   txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
+                   search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
+                   request_format='string', response_format='string', query_imports_table=False):
+        """
+        GET /mybucket/myschema/mytable?data HTTP/1.1
+        Content-Length: ContentLength
+        tabular-txid: TransactionId
+        tabular-client-tag: ClientTag
+        tabular-split: "split_id,total_splits,num_row_groups_per_split"
+        tabular-num-of-subsplits: "total"
+        tabular-request-format: "string"
+        tabular-response-format: "string" #arrow/trino
+        tabular-schedule-id: "schedule-id"
 
-
-
+        Request Body (flatbuf)
+        projections_chunk [expressions]
+        predicate_chunk "formatted_data", (required)
 
-
-
+        To query the internal vastdb-imported-objects table, set query_imports_table=True
+        """
+        # add query option select-only and read-only
 
-
+        headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
+                                                 enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
+                                                 sub_split_start_row_ids)
 
-
-        if self.executor_hosts:
-            executor_hosts = self.executor_hosts
-        else:
-            executor_hosts = [self.host]
-        executor_sessions = [VastdbApi(executor_hosts[i], self.access_key, self.secret_key, self.username,
-                             self.password, self.port, self.secure, self.auth_type) for i in range(len(executor_hosts))]
-
-        return queried_columns, arrow_schema, query_data_request, executor_sessions
-
-    def _more_pages_exist(self, start_row_ids):
-        for row_id in start_row_ids.values():
-            if row_id != TABULAR_INVALID_ROW_ID:
-                return True
-        return False
-
-    def _query_page(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
-                    txid=0, limit_rows=0, sub_split_start_row_ids=[], filters=None, field_names=None):
-        res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
-                              num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
-                              limit_rows=limit_rows, sub_split_start_row_ids=sub_split_start_row_ids)
-        start_row_ids = {}
-        sub_split_tables = parse_query_data_response(res.raw, query_data_request.response_schema,
-                                                     start_row_ids=start_row_ids)
-        table_page = pa.concat_tables(sub_split_tables)
-        _logger.info("query_page: table_page num_rows=%s start_row_ids len=%s",
-                     len(table_page), len(start_row_ids))
-
-        return table_page, start_row_ids
-
-    def _query_page_iterator(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
-                             txid=0, limit_rows=0, start_row_ids={}, filters=None, field_names=None):
-        res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
-                              num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
-                              limit_rows=limit_rows, sub_split_start_row_ids=start_row_ids.items())
-        for sub_split_table in parse_query_data_response(res.raw, query_data_request.response_schema,
-                                                         start_row_ids=start_row_ids):
-            for record_batch in sub_split_table.to_batches():
-                yield record_batch
-        _logger.info(f"query_page_iterator: start_row_ids={start_row_ids}")
-
-    def query_iterator(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
-                       response_row_id=False, txid=0, limit_per_sub_split=128*1024, filters=None, field_names=None):
-        """
-        query rows into a table.
-
-        Parameters
-        ----------
-        bucket : string
-            The bucket of the table.
-        schema : string
-            The schema of the table.
-        table : string
-            The table name.
-        num_sub_splits : integer
-            The number of sub_splits per split - determines the parallelism inside a VastDB compute node
-            default: 1
-        num_row_groups_per_sub_split : integer
-            The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
-            default: 8
-        response_row_id : boolean
-            Return a column with the internal row ids of the table
-            default: False
-        txid : integer
-            A transaction id. The transaction may be initiated before the query, and if not, the query will initiate it
-            default: 0 (will be created by the api)
-        limit_per_sub_split : integer
-            Limit the number of rows from a single sub_split for a single rpc
-            default:131072
-        filters : dict
-            A dictionary whose keys are column names, and values are lists of string expressions that represent
-            filter conditions on the column. AND is applied on the conditions. The condition formats are:
-            'column_name eq some_value'
-            default: None
-        field_names : list
-            A list of column names to be returned in the output table
-            default: None
-
-        Returns
-        -------
-        Query iterator generator
-
-        Yields
-        ------
-        pyarrow.RecordBatch
-
-        Examples
-        --------
-        for record_batch in query_iterator('some_bucket', 'some_schema', 'some_table',
-                                           filters={'name': ['eq Alice', 'eq Bob']}
-                                           field_names=['name','age']):
-            ...
-
-        """
-
-        # create a transaction if necessary
-        txid, created_txid = self._begin_tx_if_necessary(txid)
-        executor_sessions = []
+        url_params = self._build_query_data_url_params(projection, query_imports_table)
 
-
-
-
-        self._prepare_query(bucket, schema, table, num_sub_splits, filters, field_names, response_row_id=response_row_id, txid=txid)
-
-        # define the per split threaded query func
-        def query_iterator_split_id(self, split_id):
-            _logger.info(f"query_iterator_split_id: split_id={split_id}")
-            try:
-                start_row_ids = {i:0 for i in range(num_sub_splits)}
-                session = executor_sessions[split_id]
-                while not next_sems[split_id].acquire(timeout=1):
-                    # check if killed externally
-                    if killall:
-                        raise RuntimeError(f'query_iterator_split_id: split_id {split_id} received killall')
-
-                while self._more_pages_exist(start_row_ids):
-                    for record_batch in session._query_page_iterator(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
-                                                                     split=(split_id, num_splits, num_row_groups_per_sub_split),
-                                                                     num_sub_splits=num_sub_splits, response_row_id=response_row_id,
-                                                                     txid=txid, limit_rows=limit_per_sub_split,
-                                                                     start_row_ids=start_row_ids):
-                        output_queue.put((split_id, record_batch))
-                        while not next_sems[split_id].acquire(timeout=1): # wait for the main thread to request the next record batch
-                            if killall:
-                                raise RuntimeError(f'split_id {split_id} received killall')
-                # end of split
-                output_queue.put((split_id,None))
-
-            except Exception as e:
-                _logger.exception('query_iterator_split_id: exception occurred')
-                try:
-                    self.rollback_transaction(txid)
-                except:
-                    _logger.exception(f'failed to rollback txid {txid}')
-                error_queue.put(None)
-                raise e
-
-        # kickoff executors
-        num_splits = len(executor_sessions)
-        output_queue = queue.Queue()
-        error_queue = queue.Queue()
-        next_sems = [threading.Semaphore(value=1) for i in range(num_splits)]
-        killall = False
-        with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
-            # start executors
-            futures = []
1617
|
-
for i in range(num_splits):
|
|
1618
|
-
futures.append(executor.submit(query_iterator_split_id, self, i))
|
|
1619
|
-
|
|
1620
|
-
# receive outputs and yield them
|
|
1621
|
-
done_count = 0
|
|
1622
|
-
while done_count < num_splits:
|
|
1623
|
-
# check for errors
|
|
1624
|
-
try:
|
|
1625
|
-
error_queue.get(block=False)
|
|
1626
|
-
_logger.error('received error from a thread')
|
|
1627
|
-
killall = True
|
|
1628
|
-
# wait for all executors to complete
|
|
1629
|
-
for future in concurrent.futures.as_completed(futures):
|
|
1630
|
-
try:
|
|
1631
|
-
future.result() # trigger an exception if occurred in any thread
|
|
1632
|
-
except Exception:
|
|
1633
|
-
_logger.exception('exception occurred')
|
|
1634
|
-
raise RuntimeError('received error from a thread')
|
|
1635
|
-
except queue.Empty:
|
|
1636
|
-
pass
|
|
1637
|
-
|
|
1638
|
-
# try to get a value from the output queue
|
|
1639
|
-
try:
|
|
1640
|
-
(split_id, record_batch) = output_queue.get(timeout=1)
|
|
1641
|
-
except queue.Empty:
|
|
1642
|
-
continue
|
|
1643
|
-
|
|
1644
|
-
if record_batch:
|
|
1645
|
-
# signal to the thread to read the next record batch and yield the current
|
|
1646
|
-
next_sems[split_id].release()
|
|
1647
|
-
try:
|
|
1648
|
-
yield record_batch
|
|
1649
|
-
except GeneratorExit:
|
|
1650
|
-
killall = True
|
|
1651
|
-
_logger.debug("cancelling query_iterator")
|
|
1652
|
-
raise
|
|
1653
|
-
else:
|
|
1654
|
-
done_count += 1
|
|
1655
|
-
|
|
1656
|
-
# wait for all executors to complete
|
|
1657
|
-
for future in concurrent.futures.as_completed(futures):
|
|
1658
|
-
try:
|
|
1659
|
-
future.result() # trigger an exception if occurred in any thread
|
|
1660
|
-
except Exception:
|
|
1661
|
-
_logger.exception('exception occurred')
|
|
1662
|
-
|
|
1663
|
-
# commit if needed
|
|
1664
|
-
if created_txid:
|
|
1665
|
-
self.commit_transaction(txid)
|
|
1666
|
-
|
|
1667
|
-
except Exception as e:
|
|
1668
|
-
_logger.exception('exception occurred')
|
|
1669
|
-
try:
|
|
1670
|
-
self.rollback_transaction(txid)
|
|
1671
|
-
except:
|
|
1672
|
-
_logger.exception(f'failed to rollback txid {txid}')
|
|
1673
|
-
raise e
|
|
1674
|
-
|
|
1675
|
-
finally:
|
|
1676
|
-
killall = True
|
|
1677
|
-
for session in executor_sessions:
|
|
1678
|
-
try:
|
|
1679
|
-
session.session.close()
|
|
1680
|
-
except Exception:
|
|
1681
|
-
_logger.exception(f'failed to close session {session}')
|
|
1682
|
-
|
|
1683
|
-
def query(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
|
|
1684
|
-
response_row_id=False, txid=0, limit=0, limit_per_sub_split=131072, filters=None, field_names=None,
|
|
1685
|
-
queried_columns=None):
|
|
1686
|
-
"""
|
|
1687
|
-
query rows into a table.
|
|
1688
|
-
|
|
1689
|
-
Parameters
|
|
1690
|
-
----------
|
|
1691
|
-
bucket : string
|
|
1692
|
-
The bucket of the table.
|
|
1693
|
-
schema : string
|
|
1694
|
-
The schema of the table.
|
|
1695
|
-
table : string
|
|
1696
|
-
The table name.
|
|
1697
|
-
num_sub_splits : integer
|
|
1698
|
-
The number of sub_splits per split - determines the parallelism inside a VastDB compute node
|
|
1699
|
-
default: 1
|
|
1700
|
-
num_row_groups_per_sub_split : integer
|
|
1701
|
-
The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
|
|
1702
|
-
default: 8
|
|
1703
|
-
response_row_id : boolean
|
|
1704
|
-
Return a column with the internal row ids of the table
|
|
1705
|
-
default: False
|
|
1706
|
-
txid : integer
|
|
1707
|
-
A transaction id. The transaction may be initiated before the query, and be used to provide
|
|
1708
|
-
multiple ACID operations
|
|
1709
|
-
default: 0 (will be created by the api)
|
|
1710
|
-
limit : integer
|
|
1711
|
-
Limit the number of rows in the response
|
|
1712
|
-
default: 0 (no limit)
|
|
1713
|
-
limit_per_sub_split : integer
|
|
1714
|
-
Limit the number of rows from a single sub_split for a single rpc
|
|
1715
|
-
default:131072
|
|
1716
|
-
filters : dict
|
|
1717
|
-
A dictionary whose keys are column names, and values are lists of string expressions that represent
|
|
1718
|
-
filter conditions on the column. AND is applied on the conditions. The condition formats are:
|
|
1719
|
-
'column_name eq some_value'
|
|
1720
|
-
default: None
|
|
1721
|
-
field_names : list
|
|
1722
|
-
A list of column names to be returned to the output table
|
|
1723
|
-
default: None
|
|
1724
|
-
queried_columns: list of pyArrow.column
|
|
1725
|
-
A list of the columns to be queried
|
|
1726
|
-
default: None
|
|
1727
|
-
|
|
1728
|
-
Returns
|
|
1729
|
-
-------
|
|
1730
|
-
pyarrow.Table
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
Examples
|
|
1734
|
-
--------
|
|
1735
|
-
table = query('some_bucket', 'some_schema', 'some_table',
|
|
1736
|
-
filters={'name': ['eq Alice', 'eq Bob']}
|
|
1737
|
-
field_names=['name','age'])
|
|
1738
|
-
|
|
1739
|
-
"""
|
|
1740
|
-
|
|
1741
|
-
# create a transaction
|
|
1742
|
-
txid, created_txid = self._begin_tx_if_necessary(txid)
|
|
1743
|
-
executor_sessions = []
|
|
1744
|
-
try:
|
|
1745
|
-
# prepare query
|
|
1746
|
-
queried_columns, arrow_schema, query_data_request, executor_sessions = \
|
|
1747
|
-
self._prepare_query(bucket, schema, table, num_sub_splits, filters, field_names, response_row_id=response_row_id, txid=txid)
|
|
1748
|
-
|
|
1749
|
-
# define the per split threaded query func
|
|
1750
|
-
def query_split_id(self, split_id):
|
|
1751
|
-
try:
|
|
1752
|
-
start_row_ids = {i:0 for i in range(num_sub_splits)}
|
|
1753
|
-
session = executor_sessions[split_id]
|
|
1754
|
-
row_count = 0
|
|
1755
|
-
while (self._more_pages_exist(start_row_ids) and
|
|
1756
|
-
(not limit or row_count < limit)):
|
|
1757
|
-
# check if killed externally
|
|
1758
|
-
if killall:
|
|
1759
|
-
raise RuntimeError(f'query_split_id: split_id {split_id} received killall')
|
|
1760
|
-
|
|
1761
|
-
# determine the limit rows
|
|
1762
|
-
if limit:
|
|
1763
|
-
limit_rows = min(limit_per_sub_split, limit-row_count)
|
|
1764
|
-
else:
|
|
1765
|
-
limit_rows = limit_per_sub_split
|
|
1766
|
-
|
|
1767
|
-
# query one page
|
|
1768
|
-
table_page, start_row_ids = session._query_page(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
|
|
1769
|
-
split=(split_id, num_splits, num_row_groups_per_sub_split),
|
|
1770
|
-
num_sub_splits=num_sub_splits, response_row_id=response_row_id,
|
|
1771
|
-
txid=txid, limit_rows=limit_rows,
|
|
1772
|
-
sub_split_start_row_ids=start_row_ids.items())
|
|
1773
|
-
with lock:
|
|
1774
|
-
table_pages.append(table_page)
|
|
1775
|
-
row_counts[split_id] += len(table_page)
|
|
1776
|
-
row_count = sum(row_counts)
|
|
1777
|
-
_logger.info(f"query_split_id: table_pages split_id={split_id} row_count={row_count}")
|
|
1778
|
-
except Exception as e:
|
|
1779
|
-
_logger.exception('query_split_id: exception occurred')
|
|
1780
|
-
try:
|
|
1781
|
-
self.rollback_transaction(txid)
|
|
1782
|
-
except:
|
|
1783
|
-
_logger.exception(f'failed to rollback txid {txid}')
|
|
1784
|
-
raise e
|
|
1785
|
-
|
|
1786
|
-
table_pages = []
|
|
1787
|
-
num_splits = len(executor_sessions)
|
|
1788
|
-
killall = False
|
|
1789
|
-
with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
|
|
1790
|
-
futures = []
|
|
1791
|
-
row_counts = [0] * num_splits
|
|
1792
|
-
lock = threading.Lock()
|
|
1793
|
-
for i in range(num_splits):
|
|
1794
|
-
futures.append(executor.submit(query_split_id, self, i))
|
|
1795
|
-
for future in concurrent.futures.as_completed(futures):
|
|
1796
|
-
future.result() # trigger an exception if occurred in any thread
|
|
1797
|
-
|
|
1798
|
-
# commit if needed
|
|
1799
|
-
if created_txid:
|
|
1800
|
-
self.commit_transaction(txid)
|
|
1801
|
-
|
|
1802
|
-
# concatenate all table pages and return result
|
|
1803
|
-
out_table = pa.concat_tables(table_pages)
|
|
1804
|
-
out_table = out_table.slice(length=limit) if limit else out_table
|
|
1805
|
-
_logger.info("query: out_table len=%s row_count=%s",
|
|
1806
|
-
len(out_table), len(out_table))
|
|
1807
|
-
return out_table
|
|
1808
|
-
|
|
1809
|
-
except Exception as e:
|
|
1810
|
-
_logger.exception('exception occurred')
|
|
1811
|
-
try:
|
|
1812
|
-
self.rollback_transaction(txid)
|
|
1813
|
-
except:
|
|
1814
|
-
_logger.exception(f'failed to rollback txid {txid}')
|
|
1815
|
-
raise e
|
|
1816
|
-
|
|
1817
|
-
finally:
|
|
1818
|
-
killall = True
|
|
1819
|
-
for session in executor_sessions:
|
|
1820
|
-
try:
|
|
1821
|
-
session.session.close()
|
|
1822
|
-
except Exception:
|
|
1823
|
-
_logger.exception(f'failed to close session {session}')
|
|
1487
|
+
res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data", url_params=url_params),
|
|
1488
|
+
data=params, headers=headers, stream=True)
|
|
1489
|
+
return self._check_res(res, "query_data", expected_retvals)
|
|
1824
1490
|
|
|
1825
1491
|
"""
|
|
1826
1492
|
source_files: list of (bucket_name, file_name)
|
|
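
Note (not part of the diff): 0.1.1 drops the `query()` and `query_iterator()` convenience wrappers from `VastdbApi`; the higher-level replacement lives in the new `vastdb/table.py`. For reference, the sketch below shows roughly how the same page-by-page flow can still be expressed with the low-level helpers that remain in this module, mirroring what the deleted `_query_page()` did. `api`, `arrow_schema` and `txid` are placeholders, and the argument values are illustrative only.

```python
# Hedged sketch: fetching one page of results with the remaining low-level helpers.
import pyarrow as pa

request = build_query_data_request(schema=arrow_schema, field_names=['name', 'age'])
start_row_ids = {}
res = api.query_data(bucket='some_bucket', schema='some_schema', table='some_table',
                     params=request.serialized, split=(0, 1, 8), num_sub_splits=1,
                     txid=txid, sub_split_start_row_ids=start_row_ids.items())
pages = parse_query_data_response(res.raw, request.response_schema,
                                  start_row_ids=start_row_ids)
result = pa.concat_tables(pages)
```
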
@@ -1874,21 +1540,22 @@ class VastdbApi:
         builder.Finish(params)
         import_req = builder.Output()
 
-        def iterate_over_import_data_response(response
+        def iterate_over_import_data_response(response):
             if response.status_code != 200:
                 return response
 
             chunk_size = 1024
-            for chunk in
+            for chunk in response.iter_content(chunk_size=chunk_size):
                 chunk_dict = json.loads(chunk)
-                _logger.
-                if chunk_dict['res']
-
-
-
-
-
-
+                _logger.debug("import data chunk=%s, result: %s", chunk_dict, chunk_dict['res'])
+                if chunk_dict['res'] != 'Success' and chunk_dict['res'] != 'TabularInProgress' and chunk_dict['res'] != 'TabularAlreadyImported':
+                    raise errors.ImportFilesError(
+                        f"Encountered an error during import_data. status: {chunk_dict['res']}, "
+                        f"error message: {chunk_dict['err_msg'] or 'Unexpected error'} during import of "
+                        f"object name: {chunk_dict['object_name']}", chunk_dict)
+                else:
+                    _logger.debug("import_data of object name '%s' is in progress. "
+                                  "status: %s", chunk_dict['object_name'], chunk_dict['res'])
             return response
 
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
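
Note (not part of the diff): the new streaming check above raises `vastdb.errors.ImportFilesError` as soon as a chunk reports a failed object, instead of silently returning the HTTP response. A hedged usage sketch around a blocking import; the `api` object, file list and keyword names other than `blocking` are illustrative, not taken from this hunk.

```python
# Hedged sketch: surfacing per-object import failures from a blocking import_data call.
from vastdb import errors

try:
    api.import_data('some_bucket', 'some_schema', 'some_table',
                    source_files=[('src-bucket', '/staging/part-0.parquet')],
                    txid=txid, blocking=True)
except errors.ImportFilesError as e:
    # The second constructor argument (the failing chunk dict) is attached to the error.
    print('import failed:', e)
```
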
@@ -1901,34 +1568,17 @@ class VastdbApi:
         res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data"),
                                 data=import_req, headers=headers, stream=True)
         if blocking:
-            res = iterate_over_import_data_response(res
+            res = iterate_over_import_data_response(res)
 
         return self._check_res(res, "import_data", expected_retvals)
 
-    def merge_data(self):
-        """
-        TODO
-
-        POST /mybucket/myschema/mytable?data HTTP/1.1
-        Content-Length: ContentLength
-        tabular-txid: TransactionId
-        tabular-client-tag: ClientTag
-
-        Request Body
-        {
-          "format": "string",
-          "select_source": "formatted data"
-          "predicate": "formatted_data"
-        }
-        """
-        pass
-
     def _record_batch_slices(self, batch, rows_per_slice=None):
         max_slice_size_in_bytes = int(0.9*5*1024*1024) # 0.9 * 5MB
         batch_len = len(batch)
         serialized_batch = serialize_record_batch(batch)
         batch_size_in_bytes = len(serialized_batch)
-        _logger.
+        _logger.debug('max_slice_size_in_bytes=%d batch_len=%d batch_size_in_bytes=%d',
+                      max_slice_size_in_bytes, batch_len, batch_size_in_bytes)
 
         if not rows_per_slice:
             if batch_size_in_bytes < max_slice_size_in_bytes:
@@ -1950,7 +1600,7 @@ class VastdbApi:
             serialized_slice_batch = serialize_record_batch(slice_batch)
             sizeof_serialized_slice_batch = len(serialized_slice_batch)
 
-            if sizeof_serialized_slice_batch <= max_slice_size_in_bytes
+            if sizeof_serialized_slice_batch <= max_slice_size_in_bytes:
                 serialized_slices.append(serialized_slice_batch)
             else:
                 _logger.info(f'Using rows_per_slice {rows_per_slice} slice {i} size {sizeof_serialized_slice_batch} exceeds {max_slice_size_in_bytes} bytes, trying smaller rows_per_slice')
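
Note (not part of the diff): `_record_batch_slices()` keeps each serialized slice under `0.9 * 5 * 1024 * 1024` bytes (~4.7 MB). When `rows_per_slice` is not given, a first guess can follow from simple proportionality between row count and serialized size; the sketch below illustrates that arithmetic and is an assumption, not the exact code, which is elided from this hunk.

```python
# Hedged sketch of the sizing arithmetic (illustrative only).
MAX_SLICE_SIZE = int(0.9 * 5 * 1024 * 1024)   # 4,718,592 bytes

def estimate_rows_per_slice(batch_len: int, batch_size_in_bytes: int) -> int:
    # Assume serialized bytes scale roughly linearly with row count.
    if batch_size_in_bytes < MAX_SLICE_SIZE:
        return batch_len                      # a single slice is enough
    return max(1, batch_len * MAX_SLICE_SIZE // batch_size_in_bytes)
```
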
@@ -1964,125 +1614,6 @@ class VastdbApi:
 
         return serialized_slices
 
-    def insert(self, bucket, schema, table, rows=None, record_batch=None, rows_per_insert=None, txid=0):
-        """
-        Insert rows into a table. The operation may be split into multiple commands, such that by default no more than 512KB will be inserted per command.
-
-        Parameters
-        ----------
-        bucket : string
-            The bucket of the table.
-        schema : string
-            The schema of the table.
-        table : string
-            The table name.
-        rows : dict
-            The rows to insert.
-            dictionary key: column name
-            dictionary value: array of cell values to insert
-            default: None (if None, record_batch must be provided)
-        record_batch : pyarrow.RecordBatch
-            A pyarrow RecordBatch
-            default: None (if None, rows dictionary must be provided)
-        rows_per_insert : integer
-            Split the operation so that each insert command will be limited to this value
-            default: None (will be selected automatically)
-        txid : integer
-            A transaction id. The transaction may be initiated before the insert, and be used to provide
-            multiple ACID operations
-            default: 0 (will be created by the api)
-
-        Returns
-        -------
-        None
-
-
-        Examples
-        --------
-        insert('some_bucket', 'some_schema', 'some_table', {'name': ['Alice','Bob'], 'age': [25,24]})
-
-        """
-        if (not rows and not record_batch) or (rows and record_batch):
-            raise ValueError(f'insert: missing argument - either rows or record_batch must be provided')
-
-        # create a transaction
-        txid, created_txid = self._begin_tx_if_necessary(txid)
-
-        if rows:
-            columns = self._list_table_columns(bucket, schema, table, field_names=rows.keys(), txid=txid)
-            columns_dict = dict([(column[0], column[1]) for column in columns])
-            arrow_schema = pa.schema([])
-            arrays = []
-            for column_name, column_values in rows.items():
-                column_type = columns_dict[column_name]
-                field = pa.field(column_name, column_type)
-                arrow_schema = arrow_schema.append(field)
-                arrays.append(pa.array(column_values, column_type))
-            record_batch = pa.record_batch(arrays, arrow_schema)
-
-        # split the record batch into multiple slices
-        serialized_slices = self._record_batch_slices(record_batch, rows_per_insert)
-        _logger.info(f'inserting record batch using {len(serialized_slices)} slices')
-
-        insert_queue = queue.Queue()
-
-        [insert_queue.put(insert_rows_req) for insert_rows_req in serialized_slices]
-
-        try:
-            executor_sessions = [VastdbApi(self.executor_hosts[i], self.access_key, self.secret_key, self.username,
-                                           self.password, self.port, self.secure, self.auth_type) for i in range(len(self.executor_hosts))]
-
-            def insert_executor(self, split_id):
-
-                try:
-                    _logger.info(f'insert_executor split_id={split_id} starting')
-                    session = executor_sessions[split_id]
-                    num_inserts = 0
-                    while not killall:
-                        try:
-                            insert_rows_req = insert_queue.get(block=False)
-                        except queue.Empty:
-                            break
-                        session.insert_rows(bucket=bucket, schema=schema,
-                                            table=table, record_batch=insert_rows_req, txid=txid)
-                        num_inserts += 1
-                    _logger.info(f'insert_executor split_id={split_id} num_inserts={num_inserts}')
-                    if killall:
-                        _logger.info('insert_executor killall=True')
-
-                except Exception as e:
-                    _logger.exception('insert_executor hit exception')
-                    raise e
-
-            num_splits = len(executor_sessions)
-            killall = False
-            with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
-                futures = []
-                for i in range(num_splits):
-                    futures.append(executor.submit(insert_executor, self, i))
-                for future in concurrent.futures.as_completed(futures):
-                    future.result() # trigger an exception if occurred in any thread
-
-            # commit if needed
-            if created_txid:
-                self.commit_transaction(txid)
-
-        except Exception as e:
-            _logger.exception('exception occurred')
-            try:
-                self.rollback_transaction(txid)
-            except:
-                _logger.exception(f'failed to rollback txid {txid}')
-            raise e
-
-        finally:
-            killall = True
-            for session in executor_sessions:
-                try:
-                    session.session.close()
-                except Exception:
-                    _logger.exception(f'failed to close session {session}')
-
     def insert_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[]):
         """
         POST /mybucket/myschema/mytable?rows HTTP/1.1
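
Note (not part of the diff): with the multi-threaded `insert()` helper removed, rows reach the server through `insert_rows()`, which posts an already-serialized Arrow record batch; the removed code built those payloads via `serialize_record_batch` and `_record_batch_slices`. A hedged sketch of the remaining path; `api` and `txid` are placeholders.

```python
# Hedged sketch: inserting a small record batch through the low-level API.
import pyarrow as pa

batch = pa.record_batch([pa.array(['Alice', 'Bob']), pa.array([25, 24], pa.int32())],
                        names=['name', 'age'])
for serialized in api._record_batch_slices(batch):   # keeps each request under ~4.7 MB
    api.insert_rows('some_bucket', 'some_schema', 'some_table',
                    record_batch=serialized, txid=txid)
```
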
@@ -2115,7 +1646,8 @@ class VastdbApi:
                                    data=record_batch, headers=headers)
         return self._check_res(res, "update_rows", expected_retvals)
 
-    def delete_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[]
+    def delete_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[],
+                    delete_from_imports_table=False):
         """
         DELETE /mybucket/myschema/mytable?rows HTTP/1.1
         Content-Length: ContentLength
@@ -2127,8 +1659,10 @@ class VastdbApi:
         """
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
         headers['Content-Length'] = str(len(record_batch))
-
-
+        url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if delete_from_imports_table else {}
+
+        res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=table, command="rows", url_params=url_params),
+                                   data=record_batch, headers=headers)
         return self._check_res(res, "delete_rows", expected_retvals)
 
     def create_projection(self, bucket, schema, table, name, columns, txid=0, client_tags=[], expected_retvals=[]):
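
Note (not part of the diff): `delete_rows()` now accepts `delete_from_imports_table=True`, which routes the DELETE to the imports sub-table by adding `sub-table=<IMPORTED_OBJECTS_TABLE_NAME>` to the URL. A hedged sketch follows; the assumption that the record batch carries an internal `$row_id` column identifying the rows to delete comes from how the SDK handles row ids elsewhere, not from this hunk.

```python
# Hedged sketch: deleting rows by internal row id.
import pyarrow as pa

row_ids = pa.record_batch([pa.array([12, 57, 103], pa.uint64())], names=['$row_id'])
api.delete_rows('some_bucket', 'some_schema', 'some_table',
                record_batch=serialize_record_batch(row_ids), txid=txid,
                delete_from_imports_table=False)
```
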
@@ -2352,41 +1886,40 @@ def _iter_query_data_response_columns(fileobj, stream_ids=None):
         if stream_ids is not None:
             stream_ids.update([stream_id]) # count stream IDs using a collections.Counter
         if stream_id == TABULAR_KEEP_ALIVE_STREAM_ID:
-            # _logger.info(f"stream_id={stream_id} (skipping)")
             continue
 
         if stream_id == TABULAR_QUERY_DATA_COMPLETED_STREAM_ID:
             # read the terminating end chunk from socket
             res = fileobj.read()
-            _logger.
+            _logger.debug("stream_id=%d res=%s (finish)", stream_id, res)
             return
 
         if stream_id == TABULAR_QUERY_DATA_FAILED_STREAM_ID:
             # read the terminating end chunk from socket
             res = fileobj.read()
-            _logger.
+            _logger.warning("stream_id=%d res=%s (failed)", stream_id, res)
             raise IOError(f"Query data stream failed res={res}")
 
         next_row_id_bytes = fileobj.read(8)
         next_row_id, = struct.unpack('<Q', next_row_id_bytes)
-        _logger.
+        _logger.debug("stream_id=%d next_row_id=%d", stream_id, next_row_id)
 
         if stream_id not in readers:
             # we implicitly read 1st message (Arrow schema) when constructing RecordBatchStreamReader
             reader = pa.ipc.RecordBatchStreamReader(fileobj)
-            _logger.
+            _logger.debug("stream_id=%d schema=%s", stream_id, reader.schema)
             readers[stream_id] = (reader, [])
             continue
 
         (reader, batches) = readers[stream_id]
         try:
             batch = reader.read_next_batch() # read single-column chunk data
-            _logger.
+            _logger.debug("stream_id=%d rows=%d chunk=%s", stream_id, len(batch), batch)
             batches.append(batch)
         except StopIteration: # we got an end-of-stream IPC message for a given stream ID
             reader, batches = readers.pop(stream_id) # end of column
             table = pa.Table.from_batches(batches) # concatenate all column chunks (as a single)
-            _logger.
+            _logger.debug("stream_id=%d rows=%d column=%s", stream_id, len(table), table)
             yield (stream_id, next_row_id, table)
 
 
@@ -2398,24 +1931,23 @@ def parse_query_data_response(conn, schema, stream_ids=None, start_row_ids=None,
     """
     if start_row_ids is None:
         start_row_ids = {}
-
-
-
-
-    is_empty_projection = (len(projection_positions) == 0)
-    parsers = defaultdict(lambda: QueryDataParser(arrow_schema, debug=debug, projection_positions=projection_positions)) # {stream_id: QueryDataParser}
+
+    is_empty_projection = (len(schema) == 0)
+    parsers = defaultdict(lambda: QueryDataParser(schema, debug=debug)) # {stream_id: QueryDataParser}
+
     for stream_id, next_row_id, table in _iter_query_data_response_columns(conn, stream_ids):
         parser = parsers[stream_id]
         for column in table.columns:
             parser.parse(column)
 
-        parsed_table = parser.build(
+        parsed_table = parser.build()
         if parsed_table is not None: # when we got all columns (and before starting a new "select_rows" cycle)
             parsers.pop(stream_id)
             if is_empty_projection: # VAST returns an empty RecordBatch, with the correct rows' count
                 parsed_table = table
 
-            _logger.
+            _logger.debug("stream_id=%d rows=%d next_row_id=%d table=%s",
+                          stream_id, len(parsed_table), next_row_id, parsed_table)
             start_row_ids[stream_id] = next_row_id
             yield parsed_table # the result of a single "select_rows()" cycle
 
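
Note (not part of the diff): `parse_query_data_response()` mutates `start_row_ids` in place; after each cycle it records, per sub-split stream, the next row id to resume from (or `TABULAR_INVALID_ROW_ID` once the sub-split is exhausted). The removed `_more_pages_exist()` captured the loop condition. A hedged sketch of the same paging pattern; `api`, `request`, `txid`, `num_sub_splits` and `process` are placeholders.

```python
# Hedged sketch: draining all pages of one split.
start_row_ids = {i: 0 for i in range(num_sub_splits)}
while any(row_id != TABULAR_INVALID_ROW_ID for row_id in start_row_ids.values()):
    res = api.query_data('some_bucket', 'some_schema', 'some_table',
                         params=request.serialized, split=(0, 1, 8),
                         num_sub_splits=num_sub_splits, txid=txid,
                         sub_split_start_row_ids=start_row_ids.items())
    for page in parse_query_data_response(res.raw, request.response_schema,
                                          start_row_ids=start_row_ids):
        process(page)  # placeholder for caller logic
```
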
@@ -2496,7 +2028,7 @@ def get_field_type(builder: flatbuffers.Builder, field: pa.Field):
         fb_utf8.Start(builder)
         field_type = fb_utf8.End(builder)
 
-    elif field.type.equals(pa.date32()): # pa.date64()
+    elif field.type.equals(pa.date32()): # pa.date64() is not supported
         field_type_type = Type.Date
         fb_date.Start(builder)
         fb_date.AddUnit(builder, DateUnit.DAY)
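
Note (not part of the diff): as the updated comment spells out, only `pa.date32()` (day resolution) maps to the tabular `Date` type, so `pa.date64()` columns need to be converted on the client before building a schema. A hedged sketch of that conversion using plain pyarrow:

```python
# Hedged sketch: down-casting a date64 column to the supported date32 type.
import datetime
import pyarrow as pa

table = pa.table({'d': pa.array([datetime.date(2024, 1, 1)], pa.date64())})
table = table.set_column(0, 'd', table['d'].cast(pa.date32()))
assert table.schema.field('d').type == pa.date32()
```
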
@@ -2564,7 +2096,6 @@ def get_field_type(builder: flatbuffers.Builder, field: pa.Field):
     return field_type, field_type_type
 
 def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
-    _logger.info(f"name={f.name}")
     children = None
     if isinstance(f.type, pa.StructType):
         children = [build_field(builder, child, child.name) for child in list(f.type)]
@@ -2591,7 +2122,6 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
         fb_field.AddName(builder, child_col_name)
         fb_field.AddChildren(builder, children)
 
-        _logger.info(f"added key and map to entries")
         children = [fb_field.End(builder)]
 
     if children is not None:
@@ -2602,32 +2132,22 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
 
     col_name = builder.CreateString(name)
     field_type, field_type_type = get_field_type(builder, f)
-    _logger.info(f"add col_name={name} type_type={field_type_type} to fb")
     fb_field.Start(builder)
     fb_field.AddName(builder, col_name)
     fb_field.AddTypeType(builder, field_type_type)
     fb_field.AddType(builder, field_type)
     if children is not None:
-        _logger.info(f"add col_name={name} childern")
         fb_field.AddChildren(builder, children)
     return fb_field.End(builder)
 
 
-class VastDBResponseSchema:
-    def __init__(self, arrow_schema, projection_positions, output_field_names):
-        self.arrow_schema = arrow_schema
-        self.projection_positions = projection_positions
-        self.output_field_names = output_field_names
-
 class QueryDataRequest:
     def __init__(self, serialized, response_schema):
         self.serialized = serialized
         self.response_schema = response_schema
 
 
-def build_query_data_request(schema: 'pa.Schema' = pa.schema([]),
-    filters = filters or {}
-
+def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), predicate: ibis.expr.types.BooleanColumn = None, field_names: list = None):
     builder = flatbuffers.Builder(1024)
 
     source_name = builder.CreateString('') # required
@@ -2643,39 +2163,21 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
     fb_schema.AddFields(builder, fields)
     schema_obj = fb_schema.End(builder)
 
-    predicate = Predicate(schema,
+    predicate = Predicate(schema=schema, expr=predicate)
     filter_obj = predicate.serialize(builder)
 
     parser = QueryDataParser(schema)
-
-    for node in parser.nodes
-
-        if descendent.parent and isinstance(descendent.parent.type, (pa.ListType, pa.MapType)):
-            continue
-        iter_from_root = reversed(list(descendent._iter_to_root()))
-        descendent_full_name = '.'.join([n.field.name for n in iter_from_root])
-        _logger.debug(f'build_query_data_request: descendent_full_name={descendent_full_name}')
-        descendent_leaves = [leaf.index for leaf in descendent._iter_leaves()]
-        leaves_map[descendent_full_name] = descendent_leaves
-        _logger.debug(f'build_query_data_request: leaves_map={leaves_map}')
-
-    output_field_names = None
+    fields_map = {node.field.name: node.field for node in parser.nodes}
+    leaves_map = {node.field.name: [leaf.index for leaf in node._iter_leaves()] for node in parser.nodes}
+
     if field_names is None:
         field_names = [field.name for field in schema]
-    else:
-        output_field_names = [f.split('.')[0] for f in field_names]
-    # sort projected field_names according to positions to maintain ordering according to the schema
-    def compare_field_names_by_pos(field_name1, field_name2):
-        return leaves_map[field_name1][0]-leaves_map[field_name2][0]
-    field_names = sorted(field_names, key=cmp_to_key(compare_field_names_by_pos))
-    _logger.debug(f'build_query_data_request: sorted field_names={field_names} schema={schema}')
 
+    response_schema = pa.schema([fields_map[name] for name in field_names])
     projection_fields = []
-    projection_positions = []
     for field_name in field_names:
+        # TODO: only root-level projection pushdown is supported (i.e. no support for SELECT s.x FROM t)
         positions = leaves_map[field_name]
-        _logger.info("projecting field=%s positions=%s", field_name, positions)
-        projection_positions.extend(positions)
         for leaf_position in positions:
             fb_field_index.Start(builder)
             fb_field_index.AddPosition(builder, leaf_position)
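
Note (not part of the diff): `build_query_data_request()` now takes an ibis boolean expression instead of the old string-based `filters` dict. A hedged sketch of constructing such a predicate against an unbound ibis table; how the SDK's higher layers (e.g. `vastdb/table.py`) build this expression is not shown in this diff, so the construction below is an assumption.

```python
# Hedged sketch: an ibis predicate equivalent to the old
# filters={'name': ['eq Alice', 'eq Bob']} example.
import ibis
import pyarrow as pa

arrow_schema = pa.schema([('name', pa.utf8()), ('age', pa.int32())])
t = ibis.table([('name', 'string'), ('age', 'int32')], name='t')
predicate = (t['name'] == 'Alice') | (t['name'] == 'Bob')

request = build_query_data_request(schema=arrow_schema,
                                   predicate=predicate,
                                   field_names=['name', 'age'])
```
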
@@ -2686,8 +2188,6 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
         builder.PrependUOffsetTRelative(offset)
     projection = builder.EndVector()
 
-    response_schema = VastDBResponseSchema(schema, projection_positions, output_field_names=output_field_names)
-
     fb_source.Start(builder)
     fb_source.AddName(builder, source_name)
     fb_source.AddSchema(builder, schema_obj)
@@ -2731,11 +2231,9 @@ def convert_column_types(table: 'pa.Table') -> 'pa.Table':
         indexes_of_fields_to_change[field.name] = index
     for changing_index in ts_indexes:
         field_name = table.schema[changing_index].name
-        _logger.info(f'changing resolution for {field_name} to us')
         new_column = table[field_name].cast(pa.timestamp('us'), safe=False)
         table = table.set_column(changing_index, field_name, new_column)
     for field_name, changing_index in indexes_of_fields_to_change.items():
-        _logger.info(f'applying custom rules to {field_name}')
         new_column = table[field_name].to_pylist()
         new_column = list(map(column_matcher[field_name], new_column))
         new_column = pa.array(new_column, table[field_name].type)