vastdb 0.0.5.3__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (44)
  1. vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
  2. vast_flatbuf/tabular/VipRange.py +56 -0
  3. vastdb/__init__.py +7 -0
  4. vastdb/bench/test_perf.py +29 -0
  5. vastdb/bucket.py +85 -0
  6. vastdb/{tests/conftest.py → conftest.py} +29 -14
  7. vastdb/errors.py +175 -0
  8. vastdb/{api.py → internal_commands.py} +373 -875
  9. vastdb/schema.py +85 -0
  10. vastdb/session.py +47 -0
  11. vastdb/table.py +483 -0
  12. vastdb/tests/test_imports.py +123 -0
  13. vastdb/tests/test_nested.py +28 -0
  14. vastdb/tests/test_projections.py +42 -0
  15. vastdb/tests/test_sanity.py +34 -15
  16. vastdb/tests/test_schemas.py +30 -6
  17. vastdb/tests/test_tables.py +628 -13
  18. vastdb/tests/util.py +18 -0
  19. vastdb/transaction.py +54 -0
  20. vastdb/util.py +11 -10
  21. vastdb-0.1.1.dist-info/METADATA +38 -0
  22. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/RECORD +26 -31
  23. vast_protobuf/substrait/__init__.py +0 -0
  24. vast_protobuf/substrait/algebra_pb2.py +0 -1344
  25. vast_protobuf/substrait/capabilities_pb2.py +0 -46
  26. vast_protobuf/substrait/ddl_pb2.py +0 -57
  27. vast_protobuf/substrait/extended_expression_pb2.py +0 -49
  28. vast_protobuf/substrait/extensions/__init__.py +0 -0
  29. vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
  30. vast_protobuf/substrait/function_pb2.py +0 -168
  31. vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
  32. vast_protobuf/substrait/plan_pb2.py +0 -67
  33. vast_protobuf/substrait/type_expressions_pb2.py +0 -198
  34. vast_protobuf/substrait/type_pb2.py +0 -350
  35. vast_protobuf/tabular/__init__.py +0 -0
  36. vast_protobuf/tabular/rpc_pb2.py +0 -344
  37. vastdb/bench_scan.py +0 -45
  38. vastdb/tests/test_create_table_from_parquets.py +0 -50
  39. vastdb/v2.py +0 -360
  40. vastdb-0.0.5.3.dist-info/METADATA +0 -47
  41. {vast_protobuf → vastdb/bench}/__init__.py +0 -0
  42. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/LICENSE +0 -0
  43. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/WHEEL +0 -0
  44. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/top_level.txt +0 -0
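
The file list reflects the 0.1.1 restructuring: the monolithic `vastdb/api.py` is renamed to the internal `internal_commands.py`, new `session.py`, `transaction.py`, `bucket.py`, `schema.py`, `table.py` and `errors.py` modules introduce a layered public API, and the old `v2.py` plus the bundled substrait/protobuf modules are removed. A hedged sketch of how the new layered API is presumably meant to be used — the `connect()` entry point, accessor names and credentials below are assumptions inferred from the new module names, not confirmed by this diff:

```python
# Hypothetical usage of the 0.1.x object model (Session -> Transaction -> Bucket -> Schema -> Table).
import vastdb  # assumed to expose a connect() entry point

session = vastdb.connect(endpoint="http://vip-pool.example.com",
                         access="ACCESS_KEY", secret="SECRET_KEY")
with session.transaction() as tx:       # vastdb/transaction.py
    bucket = tx.bucket("mybucket")      # vastdb/bucket.py
    schema = bucket.schema("myschema")  # vastdb/schema.py
    table = schema.table("mytable")     # vastdb/table.py (hypothetical accessor names)
```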
@@ -1,29 +1,23 @@
1
- import array
1
+ import itertools
2
+ import json
2
3
  import logging
4
+ import math
5
+ import re
3
6
  import struct
4
7
  import urllib.parse
5
8
  from collections import defaultdict, namedtuple
6
- from datetime import datetime
7
9
  from enum import Enum
8
- from typing import List, Union, Optional, Iterator
9
- import xmltodict
10
- import concurrent.futures
11
- import threading
12
- import queue
13
- import math
14
- import socket
15
- from functools import cmp_to_key
16
- import pyarrow.parquet as pq
10
+ from ipaddress import IPv4Address, IPv6Address
11
+ from typing import Iterator, Optional, Union
12
+
17
13
  import flatbuffers
14
+ import ibis
18
15
  import pyarrow as pa
16
+ import pyarrow.parquet as pq
19
17
  import requests
20
- import datetime
21
- import hashlib
22
- import hmac
23
- import json
24
- import itertools
18
+ import urllib3
19
+ import xmltodict
25
20
  from aws_requests_auth.aws_auth import AWSRequestsAuth
26
- from io import BytesIO
27
21
 
28
22
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BinaryLiteral as fb_binary_lit
29
23
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BooleanLiteral as fb_bool_lit
@@ -35,10 +29,10 @@ import vast_flatbuf.org.apache.arrow.computeir.flatbuf.FieldIndex as fb_field_in
35
29
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.FieldRef as fb_field_ref
36
30
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Float32Literal as fb_float32_lit
37
31
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Float64Literal as fb_float64_lit
32
+ import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int8Literal as fb_int8_lit
38
33
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int16Literal as fb_int16_lit
39
34
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int32Literal as fb_int32_lit
40
35
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int64Literal as fb_int64_lit
41
- import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int8Literal as fb_int8_lit
42
36
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Literal as fb_literal
43
37
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Relation as fb_relation
44
38
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.RelationImpl as rel_impl
@@ -51,38 +45,47 @@ import vast_flatbuf.org.apache.arrow.flatbuf.Bool as fb_bool
51
45
  import vast_flatbuf.org.apache.arrow.flatbuf.Date as fb_date
52
46
  import vast_flatbuf.org.apache.arrow.flatbuf.Decimal as fb_decimal
53
47
  import vast_flatbuf.org.apache.arrow.flatbuf.Field as fb_field
48
+ import vast_flatbuf.org.apache.arrow.flatbuf.FixedSizeBinary as fb_fixed_size_binary
54
49
  import vast_flatbuf.org.apache.arrow.flatbuf.FloatingPoint as fb_floating_point
55
50
  import vast_flatbuf.org.apache.arrow.flatbuf.Int as fb_int
56
- import vast_flatbuf.org.apache.arrow.flatbuf.Schema as fb_schema
57
- import vast_flatbuf.org.apache.arrow.flatbuf.Time as fb_time
58
- import vast_flatbuf.org.apache.arrow.flatbuf.Struct_ as fb_struct
59
51
  import vast_flatbuf.org.apache.arrow.flatbuf.List as fb_list
60
52
  import vast_flatbuf.org.apache.arrow.flatbuf.Map as fb_map
61
- import vast_flatbuf.org.apache.arrow.flatbuf.FixedSizeBinary as fb_fixed_size_binary
53
+ import vast_flatbuf.org.apache.arrow.flatbuf.Schema as fb_schema
54
+ import vast_flatbuf.org.apache.arrow.flatbuf.Struct_ as fb_struct
55
+ import vast_flatbuf.org.apache.arrow.flatbuf.Time as fb_time
62
56
  import vast_flatbuf.org.apache.arrow.flatbuf.Timestamp as fb_timestamp
63
57
  import vast_flatbuf.org.apache.arrow.flatbuf.Utf8 as fb_utf8
64
58
  import vast_flatbuf.tabular.AlterColumnRequest as tabular_alter_column
59
+ import vast_flatbuf.tabular.AlterProjectionTableRequest as tabular_alter_projection
65
60
  import vast_flatbuf.tabular.AlterSchemaRequest as tabular_alter_schema
66
61
  import vast_flatbuf.tabular.AlterTableRequest as tabular_alter_table
67
- import vast_flatbuf.tabular.AlterProjectionTableRequest as tabular_alter_projection
62
+ import vast_flatbuf.tabular.Column as tabular_projecion_column
63
+ import vast_flatbuf.tabular.ColumnType as tabular_proj_column_type
64
+ import vast_flatbuf.tabular.CreateProjectionRequest as tabular_create_projection
68
65
  import vast_flatbuf.tabular.CreateSchemaRequest as tabular_create_schema
69
66
  import vast_flatbuf.tabular.ImportDataRequest as tabular_import_data
70
67
  import vast_flatbuf.tabular.S3File as tabular_s3_file
71
- import vast_flatbuf.tabular.CreateProjectionRequest as tabular_create_projection
72
- import vast_flatbuf.tabular.Column as tabular_projecion_column
73
- import vast_flatbuf.tabular.ColumnType as tabular_proj_column_type
74
-
75
68
  from vast_flatbuf.org.apache.arrow.computeir.flatbuf.Deref import Deref
76
- from vast_flatbuf.org.apache.arrow.computeir.flatbuf.ExpressionImpl import ExpressionImpl
69
+ from vast_flatbuf.org.apache.arrow.computeir.flatbuf.ExpressionImpl import (
70
+ ExpressionImpl,
71
+ )
77
72
  from vast_flatbuf.org.apache.arrow.computeir.flatbuf.LiteralImpl import LiteralImpl
78
73
  from vast_flatbuf.org.apache.arrow.flatbuf.DateUnit import DateUnit
79
74
  from vast_flatbuf.org.apache.arrow.flatbuf.TimeUnit import TimeUnit
80
75
  from vast_flatbuf.org.apache.arrow.flatbuf.Type import Type
76
+ from vast_flatbuf.tabular.GetProjectionTableStatsResponse import (
77
+ GetProjectionTableStatsResponse as get_projection_table_stats,
78
+ )
79
+ from vast_flatbuf.tabular.GetTableStatsResponse import (
80
+ GetTableStatsResponse as get_table_stats,
81
+ )
82
+ from vast_flatbuf.tabular.ListProjectionsResponse import (
83
+ ListProjectionsResponse as list_projections,
84
+ )
81
85
  from vast_flatbuf.tabular.ListSchemasResponse import ListSchemasResponse as list_schemas
82
86
  from vast_flatbuf.tabular.ListTablesResponse import ListTablesResponse as list_tables
83
- from vast_flatbuf.tabular.GetTableStatsResponse import GetTableStatsResponse as get_table_stats
84
- from vast_flatbuf.tabular.GetProjectionTableStatsResponse import GetProjectionTableStatsResponse as get_projection_table_stats
85
- from vast_flatbuf.tabular.ListProjectionsResponse import ListProjectionsResponse as list_projections
87
+
88
+ from . import errors
86
89
 
87
90
  UINT64_MAX = 18446744073709551615
88
91
 
@@ -91,30 +94,22 @@ TABULAR_QUERY_DATA_COMPLETED_STREAM_ID = 0xFFFFFFFF - 1
91
94
  TABULAR_QUERY_DATA_FAILED_STREAM_ID = 0xFFFFFFFF - 2
92
95
  TABULAR_INVALID_ROW_ID = 0xFFFFFFFFFFFF # (1<<48)-1
93
96
  ESTORE_INVALID_EHANDLE = UINT64_MAX
97
+ IMPORTED_OBJECTS_TABLE_NAME = "vastdb-imported-objects"
94
98
 
95
99
  """
96
100
  S3 Tabular API
97
101
  """
98
102
 
99
103
 
100
- def get_logger(name):
101
- log = logging.getLogger(name)
102
- log.setLevel(logging.ERROR)
103
- ch = logging.StreamHandler()
104
- ch.setLevel(logging.INFO)
105
- ch.set_name('tabular_stream_handler')
106
- formatter = logging.Formatter("%(asctime)s:%(levelname)s:%(message)s")
107
- ch.setFormatter(formatter)
108
- log.addHandler(ch)
109
- log.propagate = False
110
- return log
111
-
104
+ _logger = logging.getLogger(__name__)
112
105
 
113
- _logger = get_logger(__name__)
114
106
 
115
-
116
- def set_tabular_log_level(level: int = logging.INFO):
117
- _logger.setLevel(level)
107
+ def _flatten_args(op, op_type):
108
+ if isinstance(op, op_type):
109
+ for arg in op.args:
110
+ yield from _flatten_args(arg, op_type)
111
+ else:
112
+ yield op
118
113
 
119
114
 
120
115
  class AuthType(Enum):
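
In 0.1.1 the module only creates a plain `logging.getLogger(__name__)` logger and no longer installs its own stream handler or exposes `set_tabular_log_level`, so log configuration becomes the application's responsibility. A minimal sketch using only the standard library (the logger name assumes the renamed module path `vastdb.internal_commands`):

```python
import logging

# Route the SDK's log records through the application's own handler and format.
logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s", level=logging.INFO)
logging.getLogger("vastdb.internal_commands").setLevel(logging.DEBUG)
```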
@@ -123,10 +118,6 @@ class AuthType(Enum):
123
118
  BASIC = "basic"
124
119
 
125
120
 
126
- class TabularException(Exception):
127
- pass
128
-
129
-
130
121
  def get_unit_to_flatbuff_time_unit(type):
131
122
  unit_to_flatbuff_time_unit = {
132
123
  'ns': TimeUnit.NANOSECOND,
@@ -137,18 +128,10 @@ def get_unit_to_flatbuff_time_unit(type):
137
128
  return unit_to_flatbuff_time_unit[type]
138
129
 
139
130
  class Predicate:
140
- unit_to_epoch = {
141
- 'ns': 1_000_000,
142
- 'us': 1_000,
143
- 'ms': 1,
144
- 's': 0.001
145
- }
146
-
147
- def __init__(self, schema: 'pa.Schema', filters: dict):
131
+ def __init__(self, schema: 'pa.Schema', expr: ibis.expr.types.BooleanColumn):
148
132
  self.schema = schema
149
- self.filters = filters
133
+ self.expr = expr
150
134
  self.builder = None
151
- self._field_name_per_index = None
152
135
 
153
136
  def get_field_indexes(self, field: 'pa.Field', field_name_per_index: list) -> None:
154
137
  field_name_per_index.append(field.name)
@@ -172,7 +155,6 @@ class Predicate:
172
155
  for field in self.schema:
173
156
  self.get_field_indexes(field, _field_name_per_index)
174
157
  self._field_name_per_index = {field: index for index, field in enumerate(_field_name_per_index)}
175
- _logger.debug(f'field_name_per_index: {self._field_name_per_index}')
176
158
  return self._field_name_per_index
177
159
 
178
160
  def get_projections(self, builder: 'flatbuffers.builder.Builder', field_names: list = None):
@@ -190,10 +172,87 @@ class Predicate:
190
172
  return builder.EndVector()
191
173
 
192
174
  def serialize(self, builder: 'flatbuffers.builder.Builder'):
175
+ from ibis.expr.operations.generic import IsNull, Literal, TableColumn
176
+ from ibis.expr.operations.logical import (
177
+ And,
178
+ Equals,
179
+ Greater,
180
+ GreaterEqual,
181
+ Less,
182
+ LessEqual,
183
+ Not,
184
+ NotEquals,
185
+ Or,
186
+ )
187
+ from ibis.expr.operations.strings import StringContains
188
+
189
+ builder_map = {
190
+ Greater: self.build_greater,
191
+ GreaterEqual: self.build_greater_equal,
192
+ Less: self.build_less,
193
+ LessEqual: self.build_less_equal,
194
+ Equals: self.build_equal,
195
+ NotEquals: self.build_not_equal,
196
+ IsNull: self.build_is_null,
197
+ Not: self.build_is_not_null,
198
+ StringContains: self.build_match_substring,
199
+ }
200
+
201
+ positions_map = dict((f.name, index) for index, f in enumerate(self.schema)) # TODO: BFS
202
+
193
203
  self.builder = builder
204
+
194
205
  offsets = []
195
- for field_name in self.filters:
196
- offsets.append(self.build_domain(self.build_column(self.field_name_per_index[field_name]), field_name))
206
+
207
+ if self.expr is not None:
208
+ and_args = list(_flatten_args(self.expr.op(), And))
209
+ _logger.debug('AND args: %s ops %s', and_args, self.expr.op())
210
+ for op in and_args:
211
+ or_args = list(_flatten_args(op, Or))
212
+ _logger.debug('OR args: %s op %s', or_args, op)
213
+ inner_offsets = []
214
+
215
+ prev_field_name = None
216
+ for inner_op in or_args:
217
+ _logger.debug('inner_op %s', inner_op)
218
+ builder_func = builder_map.get(type(inner_op))
219
+ if not builder_func:
220
+ raise NotImplementedError(inner_op.name)
221
+
222
+ if builder_func == self.build_is_null:
223
+ column, = inner_op.args
224
+ literal = None
225
+ elif builder_func == self.build_is_not_null:
226
+ not_arg, = inner_op.args
227
+ # currently we only support not is_null, checking we really got is_null under the not:
228
+ if not builder_map.get(type(not_arg)) == self.build_is_null:
229
+ raise NotImplementedError(not_arg.args[0].name)
230
+ column, = not_arg.args
231
+ literal = None
232
+ else:
233
+ column, literal = inner_op.args
234
+ if not isinstance(literal, Literal):
235
+ raise NotImplementedError(inner_op.name)
236
+
237
+ if not isinstance(column, TableColumn):
238
+ raise NotImplementedError(inner_op.name)
239
+
240
+ field_name = column.name
241
+ if prev_field_name is None:
242
+ prev_field_name = field_name
243
+ elif prev_field_name != field_name:
244
+ raise NotImplementedError(op.name)
245
+
246
+ args_offsets = [self.build_column(position=positions_map[field_name])]
247
+ if literal:
248
+ field = self.schema.field(field_name)
249
+ args_offsets.append(self.build_literal(field=field, value=literal.value))
250
+
251
+ inner_offsets.append(builder_func(*args_offsets))
252
+
253
+ domain_offset = self.build_or(inner_offsets)
254
+ offsets.append(domain_offset)
255
+
197
256
  return self.build_and(offsets)
198
257
 
199
258
  def build_column(self, position: int):
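
A runnable sketch of the flattening step used by the new `serialize()` above: the ibis boolean expression is unnested into top-level AND arguments, and each of those into OR arguments over a single column. The table and column names are made up; the helper is copied from the diff, and the imports assume an ibis version matching the ones used above:

```python
import ibis
from ibis.expr.operations.logical import And, Or

def _flatten_args(op, op_type):
    # Same helper as above: recursively unnest chained And/Or nodes.
    if isinstance(op, op_type):
        for arg in op.args:
            yield from _flatten_args(arg, op_type)
    else:
        yield op

t = ibis.table({"age": "int64", "name": "string"}, name="t")
expr = (t.age > 30) & ((t.name == "Alice") | (t.name == "Bob"))

for op in _flatten_args(expr.op(), And):  # top-level AND arguments
    print([type(o).__name__ for o in _flatten_args(op, Or)])
# prints ['Greater'] and then ['Equals', 'Equals']
```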
@@ -221,7 +280,6 @@ class Predicate:
221
280
  field = self.schema.field(field_name)
222
281
  for attr in field_attrs:
223
282
  field = field.type[attr]
224
- _logger.info(f'trying to append field: {field} with domains: {filters}')
225
283
  for filter_by_name in filters:
226
284
  offsets.append(self.build_range(column=column, field=field, filter_by_name=filter_by_name))
227
285
  return self.build_or(offsets)
@@ -263,11 +321,9 @@ class Predicate:
263
321
  return self.build_and(rules)
264
322
 
265
323
  def build_function(self, name: str, *offsets):
266
- _logger.info(f'name: {name}, offsets: {offsets}')
267
324
  offset_name = self.builder.CreateString(name)
268
325
  fb_call.StartArgumentsVector(self.builder, len(offsets))
269
326
  for offset in reversed(offsets):
270
- _logger.info(f'offset: {offset}')
271
327
  self.builder.PrependUOffsetTRelative(offset)
272
328
  offset_arguments = self.builder.EndVector()
273
329
 
@@ -282,7 +338,7 @@ class Predicate:
282
338
  fb_expression.AddImpl(self.builder, offset_call)
283
339
  return fb_expression.End(self.builder)
284
340
 
285
- def build_literal(self, field: pa.Field, value: str):
341
+ def build_literal(self, field: pa.Field, value):
286
342
  if field.type.equals(pa.int64()):
287
343
  literal_type = fb_int64_lit
288
344
  literal_impl = LiteralImpl.Int64Literal
@@ -356,7 +412,7 @@ class Predicate:
356
412
  field_type = fb_utf8.End(self.builder)
357
413
 
358
414
  value = self.builder.CreateString(value)
359
- elif field.type.equals(pa.date32()): # pa.date64()
415
+ elif field.type.equals(pa.date32()): # pa.date64() is not supported
360
416
  literal_type = fb_date32_lit
361
417
  literal_impl = LiteralImpl.DateLiteral
362
418
 
@@ -364,38 +420,49 @@ class Predicate:
364
420
  fb_date.Start(self.builder)
365
421
  fb_date.AddUnit(self.builder, DateUnit.DAY)
366
422
  field_type = fb_date.End(self.builder)
367
-
368
- start_date = datetime.fromtimestamp(0).date()
369
- date_value = datetime.strptime(value, '%Y-%m-%d').date()
370
- date_delta = date_value - start_date
371
- value = date_delta.days
423
+ value, = pa.array([value], field.type).cast(pa.int32()).to_pylist()
372
424
  elif isinstance(field.type, pa.TimestampType):
373
425
  literal_type = fb_timestamp_lit
374
426
  literal_impl = LiteralImpl.TimestampLiteral
375
427
 
428
+ if field.type.equals(pa.timestamp('s')):
429
+ unit = TimeUnit.SECOND
430
+ if field.type.equals(pa.timestamp('ms')):
431
+ unit = TimeUnit.MILLISECOND
432
+ if field.type.equals(pa.timestamp('us')):
433
+ unit = TimeUnit.MICROSECOND
434
+ if field.type.equals(pa.timestamp('ns')):
435
+ unit = TimeUnit.NANOSECOND
436
+
376
437
  field_type_type = Type.Timestamp
377
438
  fb_timestamp.Start(self.builder)
378
- fb_timestamp.AddUnit(self.builder, get_unit_to_flatbuff_time_unit(field.type.unit))
439
+ fb_timestamp.AddUnit(self.builder, unit)
379
440
  field_type = fb_timestamp.End(self.builder)
380
-
381
- value = int(int(value) * self.unit_to_epoch[field.type.unit])
382
- elif field.type.equals(pa.time32('s')) or field.type.equals(pa.time32('ms')) or field.type.equals(pa.time64('us')) or field.type.equals(pa.time64('ns')):
383
-
441
+ value, = pa.array([value], field.type).cast(pa.int64()).to_pylist()
442
+ elif isinstance(field.type, (pa.Time32Type, pa.Time64Type)):
384
443
  literal_type = fb_time_lit
385
444
  literal_impl = LiteralImpl.TimeLiteral
386
445
 
387
- field_type_str = str(field.type)
388
- start = field_type_str.index('[')
389
- end = field_type_str.index(']')
390
- unit = field_type_str[start + 1:end]
446
+ if field.type.equals(pa.time32('s')):
447
+ target_type = pa.int32()
448
+ unit = TimeUnit.SECOND
449
+ if field.type.equals(pa.time32('ms')):
450
+ target_type = pa.int32()
451
+ unit = TimeUnit.MILLISECOND
452
+ if field.type.equals(pa.time64('us')):
453
+ target_type = pa.int64()
454
+ unit = TimeUnit.MICROSECOND
455
+ if field.type.equals(pa.time64('ns')):
456
+ target_type = pa.int64()
457
+ unit = TimeUnit.NANOSECOND
391
458
 
392
459
  field_type_type = Type.Time
393
460
  fb_time.Start(self.builder)
394
461
  fb_time.AddBitWidth(self.builder, field.type.bit_width)
395
- fb_time.AddUnit(self.builder, get_unit_to_flatbuff_time_unit(unit))
462
+ fb_time.AddUnit(self.builder, unit)
396
463
  field_type = fb_time.End(self.builder)
397
464
 
398
- value = int(value) * self.unit_to_epoch[unit]
465
+ value, = pa.array([value], field.type).cast(target_type).to_pylist()
399
466
  elif field.type.equals(pa.bool_()):
400
467
  literal_type = fb_bool_lit
401
468
  literal_impl = LiteralImpl.BooleanLiteral
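
The literal conversion above now delegates the epoch arithmetic to pyarrow: a one-element array of the field's type is cast to its integer storage type. A minimal illustration of that trick with arbitrary values:

```python
import datetime
import pyarrow as pa

# date32 values are stored as days since 1970-01-01
days, = pa.array([datetime.date(2024, 1, 2)], pa.date32()).cast(pa.int32()).to_pylist()

# timestamp('us') values are stored as microseconds since the epoch
micros, = pa.array([datetime.datetime(2024, 1, 2, 3, 4, 5)],
                   pa.timestamp('us')).cast(pa.int64()).to_pylist()

print(days, micros)
```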
@@ -426,7 +493,7 @@ class Predicate:
426
493
  fb_binary.Start(self.builder)
427
494
  field_type = fb_binary.End(self.builder)
428
495
 
429
- value = self.builder.CreateByteVector(value.encode())
496
+ value = self.builder.CreateByteVector(value)
430
497
  else:
431
498
  raise ValueError(f'unsupported predicate for type={field.type}, value={value}')
432
499
 
@@ -459,6 +526,9 @@ class Predicate:
459
526
  def build_equal(self, column: int, literal: int):
460
527
  return self.build_function('equal', column, literal)
461
528
 
529
+ def build_not_equal(self, column: int, literal: int):
530
+ return self.build_function('not_equal', column, literal)
531
+
462
532
  def build_greater(self, column: int, literal: int):
463
533
  return self.build_function('greater', column, literal)
464
534
 
@@ -477,6 +547,9 @@ class Predicate:
477
547
  def build_is_not_null(self, column: int):
478
548
  return self.build_function('is_valid', column)
479
549
 
550
+ def build_match_substring(self, column: int, literal: int):
551
+ return self.build_function('match_substring', column, literal)
552
+
480
553
 
481
554
  class FieldNode:
482
555
  """Helper class for representing nested Arrow fields and handling QueryData requests"""
@@ -506,8 +579,6 @@ class FieldNode:
506
579
  # will be set during by the parser (see below)
507
580
  self.buffers = None # a list of Arrow buffers (https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout)
508
581
  self.length = None # each array must have it's length specified (https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.from_buffers)
509
- self.is_projected = False
510
- self.projected_field = self.field
511
582
 
512
583
  def _iter_to_root(self) -> Iterator['FieldNode']:
513
584
  yield self
@@ -528,15 +599,13 @@ class FieldNode:
528
599
  for child in self.children:
529
600
  yield from child._iter_leaves()
530
601
 
531
- def _iter_projected_leaves(self) -> Iterator['FieldNode']:
602
+ def _iter_leaves(self) -> Iterator['FieldNode']:
532
603
  """Generate only leaf nodes (i.e. columns having scalar types)."""
533
604
  if not self.children:
534
- if self.is_projected:
535
- yield self
605
+ yield self
536
606
  else:
537
607
  for child in self.children:
538
- if child.is_projected:
539
- yield from child._iter_projected_leaves()
608
+ yield from child._iter_leaves()
540
609
 
541
610
  def debug_log(self, level=0):
542
611
  """Recursively dump this node state to log."""
@@ -573,28 +642,17 @@ class FieldNode:
573
642
 
574
643
  def build(self) -> pa.Array:
575
644
  """Construct an Arrow array from the collected buffers (recursively)."""
576
- children = self.children and [node.build() for node in self.children if node.is_projected]
577
- _logger.debug(f'build: self.field.name={self.field.name}, '
578
- f'self.projected_field.type={self.projected_field.type}, self.length={self.length} '
579
- f'self.buffers={self.buffers} children={children}')
580
- result = pa.Array.from_buffers(self.projected_field.type, self.length, buffers=self.buffers, children=children)
645
+ children = self.children and [node.build() for node in self.children]
646
+ result = pa.Array.from_buffers(self.type, self.length, buffers=self.buffers, children=children)
581
647
  if self.debug:
582
648
  _logger.debug('%s result=%s', self.field, result)
583
649
  return result
584
650
 
585
- def build_projected_field(self):
586
- if isinstance(self.type, pa.StructType):
587
- [child.build_projected_field() for child in self.children if child.is_projected]
588
- self.projected_field = pa.field(self.field.name,
589
- pa.struct([child.projected_field for child in self.children if child.is_projected]),
590
- self.field.nullable,
591
- self.field.metadata)
592
651
 
593
652
  class QueryDataParser:
594
653
  """Used to parse VAST QueryData RPC response."""
595
- def __init__(self, arrow_schema: pa.Schema, *, debug=False, projection_positions=None):
654
+ def __init__(self, arrow_schema: pa.Schema, *, debug=False):
596
655
  self.arrow_schema = arrow_schema
597
- self.projection_positions = projection_positions
598
656
  index = itertools.count() # used to generate leaf column positions for VAST QueryData RPC
599
657
  self.nodes = [FieldNode(field, index, debug=debug) for field in arrow_schema]
600
658
  self.debug = debug
@@ -602,27 +660,15 @@ class QueryDataParser:
602
660
  for node in self.nodes:
603
661
  node.debug_log()
604
662
  self.leaves = [leaf for node in self.nodes for leaf in node._iter_leaves()]
605
- _logger.debug(f'QueryDataParser: self.leaves = {[(leaf.field.name, leaf.index) for leaf in self.leaves]}')
606
- self.mark_projected_nodes()
607
- [node.build_projected_field() for node in self.nodes]
608
- self.projected_leaves = [leaf for node in self.nodes for leaf in node._iter_projected_leaves()]
609
- _logger.debug(f'QueryDataParser: self.projected_leaves = {[(leaf.field.name, leaf.index) for leaf in self.projected_leaves]}')
610
663
 
611
664
  self.leaf_offset = 0
612
665
 
613
- def mark_projected_nodes(self):
614
- for leaf in self.leaves:
615
- if self.projection_positions is None or leaf.index in self.projection_positions:
616
- for node in leaf._iter_to_root():
617
- node.is_projected = True
618
- _logger.debug(f'mark_projected_nodes node.field.name={node.field.name}')
619
-
620
666
  def parse(self, column: pa.Array):
621
667
  """Parse a single column response from VAST (see FieldNode.set for details)"""
622
- if not self.leaf_offset < len(self.projected_leaves):
668
+ if not self.leaf_offset < len(self.leaves):
623
669
  raise ValueError(f'self.leaf_offset: {self.leaf_offset} are not < '
624
670
  f'than len(self.leaves): {len(self.leaves)}')
625
- leaf = self.projected_leaves[self.leaf_offset]
671
+ leaf = self.leaves[self.leaf_offset]
626
672
 
627
673
  # A column response may be sent in multiple chunks, therefore we need to combine
628
674
  # it into a single chunk to allow reconstruction using `Array.from_buffers()`.
@@ -643,32 +689,19 @@ class QueryDataParser:
643
689
 
644
690
  self.leaf_offset += 1
645
691
 
646
- def build(self, output_field_names=None) -> Optional[pa.Table]:
692
+ def build(self) -> Optional[pa.Table]:
647
693
  """Try to build the resulting Table object (if all columns were parsed)"""
648
- if self.projection_positions is not None:
649
- if self.leaf_offset < len(self.projection_positions):
650
- return None
651
- else:
652
- if self.leaf_offset < len(self.leaves):
653
- return None
694
+ if self.leaf_offset < len(self.leaves):
695
+ return None
654
696
 
655
697
  if self.debug:
656
698
  for node in self.nodes:
657
699
  node.debug_log()
658
700
 
659
- # sort resulting table according to the output field names
660
- projected_nodes = [node for node in self.nodes if node.is_projected]
661
- if output_field_names is not None:
662
- def key_func(projected_node):
663
- return output_field_names.index(projected_node.field.name)
664
- sorted_projected_nodes = sorted(projected_nodes, key=key_func)
665
- else:
666
- sorted_projected_nodes = projected_nodes
667
-
668
701
  result = pa.Table.from_arrays(
669
- arrays=[node.build() for node in sorted_projected_nodes],
670
- schema = pa.schema([node.projected_field for node in sorted_projected_nodes]))
671
- result.validate(full=True) # does expensive validation checks only if debug is enabled
702
+ arrays=[node.build() for node in self.nodes],
703
+ schema=self.arrow_schema)
704
+ result.validate(full=self.debug) # does expensive validation checks only if debug is enabled
672
705
  return result
673
706
 
674
707
  def _iter_nested_arrays(column: pa.Array) -> Iterator[pa.Array]:
@@ -693,7 +726,6 @@ def _parse_table_info(obj):
693
726
  return TableInfo(name, properties, handle, num_rows, used_bytes)
694
727
 
695
728
  def build_record_batch(column_info, column_values):
696
- _logger.info(f"column_info={column_info}")
697
729
  fields = [pa.field(column_name, column_type) for column_type, column_name in column_info]
698
730
  schema = pa.schema(fields)
699
731
  arrays = [pa.array(column_values[column_type], type=column_type) for column_type, _ in column_info]
@@ -706,56 +738,30 @@ def serialize_record_batch(batch):
706
738
  writer.write(batch)
707
739
  return sink.getvalue()
708
740
 
709
- def generate_ip_range(ip_range_str):
710
- start, end = ip_range_str.split(':')
711
- start_parts = start.split('.')
712
- start_last_part = int(start_parts[-1])
713
- end_parts = end.split('.')
714
- end_last_part = int(end_parts[-1])
715
- if start_last_part>=end_last_part or True in [start_parts[i] != end_parts[i] for i in range(3)]:
716
- raise ValueError(f'illegal ip range {ip_range_str}')
717
- num_ips = 1 + end_last_part - start_last_part
718
- ips = ['.'.join(start_parts[:-1] + [str(start_last_part + i)]) for i in range(num_ips)]
719
- return ips
720
-
721
- def parse_executor_hosts(host):
722
- executor_hosts_parsed = host.split(',')
723
- executor_hosts_parsed = [host.strip() for host in executor_hosts_parsed]
724
- executor_hosts = []
725
- for executor_host in executor_hosts_parsed:
726
- is_ip_range=False
727
- if ':' in executor_host:
728
- try:
729
- socket.inet_aton(executor_host.split(':')[0])
730
- socket.inet_aton(executor_host.split(':')[1])
731
- is_ip_range = True
732
- except:
733
- pass
734
- if is_ip_range:
735
- executor_hosts.extend(generate_ip_range(executor_host))
736
- else:
737
- executor_hosts.append(executor_host)
738
- return executor_hosts
741
+ # Results that returns from tablestats
742
+ TableStatsResult = namedtuple("TableStatsResult",["num_rows", "size_in_bytes", "is_external_rowid_alloc", "endpoints"])
739
743
 
740
744
  class VastdbApi:
741
- def __init__(self, host, access_key, secret_key, username=None, password=None, port=None,
745
+ # we expect the vast version to be <major>.<minor>.<patch>.<protocol>
746
+ VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')
747
+
748
+ def __init__(self, endpoint, access_key, secret_key, username=None, password=None,
742
749
  secure=False, auth_type=AuthType.SIGV4):
743
- executor_hosts = parse_executor_hosts(host)
744
- host = executor_hosts[0]
745
- self.host = host
750
+ url_dict = urllib3.util.parse_url(endpoint)._asdict()
746
751
  self.access_key = access_key
747
752
  self.secret_key = secret_key
748
753
  self.username = username
749
754
  self.password = password
750
- self.port = port
751
755
  self.secure = secure
752
756
  self.auth_type = auth_type
753
- self.executor_hosts = executor_hosts
757
+ self.executor_hosts = [endpoint] # TODO: remove
754
758
 
755
759
  username = username or ''
756
760
  password = password or ''
757
- if not port:
758
- port = 443 if secure else 80
761
+ if not url_dict['port']:
762
+ url_dict['port'] = 443 if secure else 80
763
+
764
+ self.port = url_dict['port']
759
765
 
760
766
  self.default_max_list_columns_page_size = 1000
761
767
  self.session = requests.Session()
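
The constructor now takes a single `endpoint` URL instead of `host`/`port` and uses urllib3 to parse it and fill in defaults. A small standalone sketch of that parsing with a made-up endpoint:

```python
import urllib3

secure = False  # mirrors the constructor argument above
url_dict = urllib3.util.parse_url("vip-pool.example.com")._asdict()
if not url_dict['port']:
    url_dict['port'] = 443 if secure else 80
if not url_dict['scheme']:
    url_dict['scheme'] = "https" if secure else "http"

print(urllib3.util.Url(**url_dict))  # -> http://vip-pool.example.com:80
```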
@@ -764,10 +770,10 @@ class VastdbApi:
764
770
  if auth_type == AuthType.BASIC:
765
771
  self.session.auth = requests.auth.HTTPBasicAuth(username, password)
766
772
  else:
767
- if port != 80 and port != 443:
768
- self.aws_host = f'{host}:{port}'
773
+ if url_dict['port'] != 80 and url_dict['port'] != 443:
774
+ self.aws_host = '{host}:{port}'.format(**url_dict)
769
775
  else:
770
- self.aws_host = f'{host}'
776
+ self.aws_host = '{host}'.format(**url_dict)
771
777
 
772
778
  self.session.auth = AWSRequestsAuth(aws_access_key=access_key,
773
779
  aws_secret_access_key=secret_key,
@@ -775,8 +781,34 @@ class VastdbApi:
775
781
  aws_region='us-east-1',
776
782
  aws_service='s3')
777
783
 
778
- proto = "https" if secure else "http"
779
- self.url = f"{proto}://{self.aws_host}"
784
+ if not url_dict['scheme']:
785
+ url_dict['scheme'] = "https" if secure else "http"
786
+
787
+ url = urllib3.util.Url(**url_dict)
788
+ self.url = str(url)
789
+ _logger.debug('url=%s aws_host=%s', self.url, self.aws_host)
790
+
791
+ # probe the cluster for its version
792
+ self.vast_version = None
793
+ res = self.session.options(self.url)
794
+ server_header = res.headers.get("Server")
795
+ if server_header is None:
796
+ _logger.error("OPTIONS response doesn't contain 'Server' header")
797
+ else:
798
+ _logger.debug("Server header is '%s'", server_header)
799
+ if m := self.VAST_VERSION_REGEX.match(server_header):
800
+ self.vast_version, = m.groups()
801
+ return
802
+ else:
803
+ _logger.error("'Server' header '%s' doesn't match the expected pattern", server_header)
804
+
805
+ msg = (
806
+ f'Please use `vastdb` <= 0.0.5.x with current VAST cluster version ("{server_header or "N/A"}"). '
807
+ 'To use the latest SDK, please upgrade your cluster to the latest service pack. '
808
+ 'Please contact customer.support@vastdata.com for more details.'
809
+ )
810
+ _logger.critical(msg)
811
+ raise NotImplementedError(msg)
780
812
 
781
813
  def update_mgmt_session(self, access_key: str, secret_key: str, auth_type=AuthType.SIGV4):
782
814
  if auth_type != AuthType.BASIC:
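
The OPTIONS probe above derives the cluster version from the `Server` response header, which is expected to look like `vast <major>.<minor>.<patch>.<protocol>`; otherwise the constructor raises and directs users back to `vastdb` <= 0.0.5.x. A standalone check of that regex with a made-up header value:

```python
import re

VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')  # same pattern as above

server_header = "vast 5.1.0.123"  # hypothetical Server header
if m := VAST_VERSION_REGEX.match(server_header):
    vast_version, = m.groups()
    print(vast_version)  # -> 5.1.0.123
else:
    print("incompatible cluster: fall back to vastdb <= 0.0.5.x")
```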
@@ -821,21 +853,9 @@ class VastdbApi:
821
853
  return common_headers
822
854
 
823
855
  def _check_res(self, res, cmd="", expected_retvals=[]):
824
- try:
825
- res.raise_for_status()
826
- if res.status_code != 200:
827
- if not res.status_code in expected_retvals:
828
- raise ValueError(f"Expected status code mismatch. status_code={res.status_code}")
829
- else:
830
- if not len(expected_retvals) == 0:
831
- raise ValueError(f"Expected {expected_retvals} but status_code={res.status_code}")
832
- return res
833
- except requests.HTTPError as e:
834
- if res.status_code in expected_retvals:
835
- _logger.info(f"{cmd} has failed as expected res={res}")
836
- return res
837
- else:
838
- raise e
856
+ if exc := errors.from_response(res):
857
+ raise exc
858
+ return res
839
859
 
840
860
  def create_schema(self, bucket, name, txid=0, client_tags=[], schema_properties="", expected_retvals=[]):
841
861
  """
@@ -975,7 +995,8 @@ class VastdbApi:
975
995
  return snapshots, is_truncated, marker
976
996
 
977
997
 
978
- def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[], topic_partitions=0):
998
+ def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[],
999
+ topic_partitions=0, create_imports_table=False):
979
1000
  """
980
1001
  Create a table, use the following request
981
1002
  POST /bucket/schema/table?table HTTP/1.1
@@ -984,18 +1005,21 @@ class VastdbApi:
984
1005
  tabular-txid: <integer> TransactionId
985
1006
  tabular-client-tag: <string> ClientTag
986
1007
 
987
- The body of the POST request contains table column properties as json
988
- {
989
- "format": "string",
990
- "column_names": {"name1":"type1", "name2":"type2", ...},
991
- "table_properties": {"key1":"val1", "key2":"val2", ...}
992
- }
1008
+ The body of the POST request contains table column properties as arrow schema
1009
+ which include field_name, field_type and properties
1010
+
1011
+ In order to create vastdb-imported-objects table that tracks all imported files and avoid duplicate imports,
1012
+ just set create_imports_table=True
1013
+ The request will look like:
1014
+ POST /bucket/schema/table?table&sub-table=vastdb-imported-objects HTTP/1.1
993
1015
  """
994
1016
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
995
1017
 
996
1018
  serialized_schema = arrow_schema.serialize()
997
1019
  headers['Content-Length'] = str(len(serialized_schema))
998
1020
  url_params = {'topic_partitions': str(topic_partitions)} if topic_partitions else {}
1021
+ if create_imports_table:
1022
+ url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
999
1023
 
1000
1024
  res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
1001
1025
  data=serialized_schema, headers=headers)
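
A hedged usage sketch of the extended `create_table` call: passing `create_imports_table=True` adds `sub-table=vastdb-imported-objects` to the request so the import-tracking sub-table is created alongside the table. The endpoint, credentials and bucket/schema/table names below are illustrative only:

```python
import pyarrow as pa

api = VastdbApi(endpoint="http://vip-pool.example.com",
                access_key="ACCESS_KEY", secret_key="SECRET_KEY")

api.create_table(
    bucket="mybucket",
    schema="myschema",
    name="mytable",
    arrow_schema=pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.utf8())]),
    create_imports_table=True,  # also creates the vastdb-imported-objects sub-table
)
```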
@@ -1015,7 +1039,6 @@ class VastdbApi:
1015
1039
  raise RuntimeError(f'invalid params parquet_path={parquet_path} parquet_bucket_name={parquet_bucket_name} parquet_object_name={parquet_object_name}')
1016
1040
 
1017
1041
  # Get the schema of the Parquet file
1018
- _logger.info(f'type(parquet_ds.schema) = {type(parquet_ds.schema)}')
1019
1042
  if isinstance(parquet_ds.schema, pq.ParquetSchema):
1020
1043
  arrow_schema = parquet_ds.schema.to_arrow_schema()
1021
1044
  elif isinstance(parquet_ds.schema, pa.Schema):
@@ -1038,13 +1061,27 @@ class VastdbApi:
1038
1061
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1039
1062
  res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=name, command="stats"), headers=headers)
1040
1063
  if res.status_code == 200:
1041
- res_headers = res.headers
1042
1064
  flatbuf = b''.join(res.iter_content(chunk_size=128))
1043
1065
  stats = get_table_stats.GetRootAs(flatbuf)
1044
1066
  num_rows = stats.NumRows()
1045
1067
  size_in_bytes = stats.SizeInBytes()
1046
1068
  is_external_rowid_alloc = stats.IsExternalRowidAlloc()
1047
- return num_rows, size_in_bytes, is_external_rowid_alloc
1069
+ endpoints = []
1070
+ if stats.VipsLength() == 0:
1071
+ endpoints.append(self.url)
1072
+ else:
1073
+ ip_cls = IPv6Address if (stats.AddressType() == "ipv6") else IPv4Address
1074
+ vips = [stats.Vips(i) for i in range(stats.VipsLength())]
1075
+ ips = []
1076
+ # extract the vips into list of IPs
1077
+ for vip in vips:
1078
+ start_ip = int(ip_cls(vip.StartAddress().decode()))
1079
+ ips.extend(ip_cls(start_ip + i) for i in range(vip.AddressCount()))
1080
+ for ip in ips:
1081
+ prefix = "http" if not self.secure else "https"
1082
+ endpoints.append(f"{prefix}://{str(ip)}:{self.port}")
1083
+ return TableStatsResult(num_rows, size_in_bytes, is_external_rowid_alloc, endpoints)
1084
+
1048
1085
  return self._check_res(res, "get_table_stats", expected_retvals)
1049
1086
 
1050
1087
  def alter_table(self, bucket, schema, name, txid=0, client_tags=[], table_properties="",
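
`get_table_stats` now returns a `TableStatsResult` namedtuple whose `endpoints` field expands each returned VIP range into per-node URLs. The expansion is plain integer arithmetic on `ipaddress` objects; a small illustration with a made-up range of four addresses:

```python
from ipaddress import IPv4Address

start_address, address_count = "172.16.0.10", 4  # one hypothetical VipRange entry
start_ip = int(IPv4Address(start_address))
endpoints = [f"http://{IPv4Address(start_ip + i)}:80" for i in range(address_count)]
print(endpoints)
# ['http://172.16.0.10:80', 'http://172.16.0.11:80',
#  'http://172.16.0.12:80', 'http://172.16.0.13:80']
```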
@@ -1071,22 +1108,26 @@ class VastdbApi:
1071
1108
 
1072
1109
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1073
1110
  headers['Content-Length'] = str(len(alter_table_req))
1074
- url_params = {'tabular-new-table-name': new_name} if len(new_name) else {}
1111
+ url_params = {'tabular-new-table-name': schema + "/" + new_name} if len(new_name) else {}
1075
1112
 
1076
1113
  res = self.session.put(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
1077
1114
  data=alter_table_req, headers=headers)
1078
1115
 
1079
1116
  return self._check_res(res, "alter_table", expected_retvals)
1080
1117
 
1081
- def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[]):
1118
+ def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[], remove_imports_table=False):
1082
1119
  """
1083
1120
  DELETE /mybucket/schema_path/mytable?table HTTP/1.1
1084
1121
  tabular-txid: TransactionId
1085
1122
  tabular-client-tag: ClientTag
1123
+
1124
+ To remove the internal vastdb-imported-objects table just set remove_imports_table=True
1086
1125
  """
1087
1126
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1127
+ url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if remove_imports_table else {}
1088
1128
 
1089
- res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table"), headers=headers)
1129
+ res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
1130
+ headers=headers)
1090
1131
  return self._check_res(res, "drop_table", expected_retvals)
1091
1132
 
1092
1133
  def list_tables(self, bucket, schema, txid=0, client_tags=[], max_keys=1000, next_key=0, name_prefix="",
@@ -1210,7 +1251,7 @@ class VastdbApi:
1210
1251
 
1211
1252
  def list_columns(self, bucket, schema, table, *, txid=0, client_tags=None, max_keys=None, next_key=0,
1212
1253
  count_only=False, name_prefix="", exact_match=False,
1213
- expected_retvals=None, bc_list_internals=False):
1254
+ expected_retvals=None, bc_list_internals=False, list_imports_table=False):
1214
1255
  """
1215
1256
  GET /mybucket/myschema/mytable?columns HTTP/1.1
1216
1257
  tabular-txid: TransactionId
@@ -1218,6 +1259,8 @@ class VastdbApi:
1218
1259
  x-tabluar-name-prefix: TableNamePrefix
1219
1260
  tabular-max-keys: 1000
1220
1261
  tabular-next-key: NextColumnId
1262
+
1263
+ To list the columns of the internal vastdb-imported-objects table, set list_import_table=True
1221
1264
  """
1222
1265
  max_keys = max_keys or self.default_max_list_columns_page_size
1223
1266
  client_tags = client_tags or []
@@ -1235,7 +1278,9 @@ class VastdbApi:
1235
1278
  else:
1236
1279
  headers['tabular-name-prefix'] = name_prefix
1237
1280
 
1238
- res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="column"),
1281
+ url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if list_imports_table else {}
1282
+ res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="column",
1283
+ url_params=url_params),
1239
1284
  headers=headers, stream=True)
1240
1285
  self._check_res(res, "list_columns", expected_retvals)
1241
1286
  if res.status_code == 200:
@@ -1247,9 +1292,7 @@ class VastdbApi:
1247
1292
  if not count_only:
1248
1293
  schema_buf = b''.join(res.iter_content(chunk_size=128))
1249
1294
  schema_out = pa.ipc.open_stream(schema_buf).schema
1250
- # _logger.info(f"schema={schema_out}")
1251
- for f in schema_out:
1252
- columns.append([f.name, f.type, f.metadata, f])
1295
+ columns = schema_out
1253
1296
 
1254
1297
  return columns, next_key, is_truncated, count
1255
1298
 
@@ -1296,7 +1339,7 @@ class VastdbApi:
1296
1339
  return self._check_res(res, "get_transaction", expected_retvals)
1297
1340
 
1298
1341
  def select_row_ids(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[],
1299
- retry_count=0, enable_sorted_projections=False):
1342
+ retry_count=0, enable_sorted_projections=True):
1300
1343
  """
1301
1344
  POST /mybucket/myschema/mytable?query-data=SelectRowIds HTTP/1.1
1302
1345
  """
@@ -1313,7 +1356,7 @@ class VastdbApi:
1313
1356
  return self._check_res(res, "query_data", expected_retvals)
1314
1357
 
1315
1358
  def read_columns_data(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
1316
- retry_count=0, enable_sorted_projections=False):
1359
+ retry_count=0, enable_sorted_projections=True):
1317
1360
  """
1318
1361
  POST /mybucket/myschema/mytable?query-data=ReadColumns HTTP/1.1
1319
1362
  """
@@ -1329,7 +1372,7 @@ class VastdbApi:
1329
1372
  return self._check_res(res, "query_data", expected_retvals)
1330
1373
 
1331
1374
  def count_rows(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
1332
- retry_count=0, enable_sorted_projections=False):
1375
+ retry_count=0, enable_sorted_projections=True):
1333
1376
  """
1334
1377
  POST /mybucket/myschema/mytable?query-data=CountRows HTTP/1.1
1335
1378
  """
@@ -1343,27 +1386,9 @@ class VastdbApi:
1343
1386
  data=params, headers=headers, stream=True)
1344
1387
  return self._check_res(res, "query_data", expected_retvals)
1345
1388
 
1346
- def query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1347
- txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
1348
- search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
1349
- request_format='string', response_format='string'):
1350
- """
1351
- GET /mybucket/myschema/mytable?data HTTP/1.1
1352
- Content-Length: ContentLength
1353
- tabular-txid: TransactionId
1354
- tabular-client-tag: ClientTag
1355
- tabular-split: "split_id,total_splits,num_row_groups_per_split"
1356
- tabular-num-of-subsplits: "total"
1357
- tabular-request-format: "string"
1358
- tabular-response-format: "string" #arrow/trino
1359
- tabular-schedule-id: "schedule-id"
1360
-
1361
- Request Body (flatbuf)
1362
- projections_chunk [expressions]
1363
- predicate_chunk "formatted_data", (required)
1364
-
1365
- """
1366
- # add query option select-only and read-only
1389
+ def _build_query_data_headers(self, txid, client_tags, params, split, num_sub_splits, request_format, response_format,
1390
+ enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
1391
+ sub_split_start_row_ids):
1367
1392
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1368
1393
  headers['Content-Length'] = str(len(params))
1369
1394
  headers['tabular-split'] = ','.join(map(str, split))
@@ -1388,439 +1413,80 @@ class VastdbApi:
1388
1413
  for sub_split_id, start_row_id in sub_split_start_row_ids:
1389
1414
  headers[f'tabular-start-row-id-{sub_split_id}'] = f"{sub_split_id},{start_row_id}"
1390
1415
 
1391
- url_params = {'name': projection} if projection else {}
1416
+ return headers
1392
1417
 
1393
- res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data", url_params=url_params),
1394
- data=params, headers=headers, stream=True)
1395
- return self._check_res(res, "query_data", expected_retvals)
1418
+ def _build_query_data_url_params(self, projection, query_imports_table):
1419
+ if query_imports_table and projection:
1420
+ raise ValueError("Can't query both imports and projection table")
1396
1421
 
1397
- def _list_table_columns(self, bucket, schema, table, filters=None, field_names=None, txid=0):
1398
- # build a list of the queried column names
1399
- queried_columns = []
1400
- # get all columns from the table
1401
- all_listed_columns = []
1402
- next_key = 0
1403
- while True:
1404
- cur_columns, next_key, is_truncated, count = self.list_columns(
1405
- bucket=bucket, schema=schema, table=table, next_key=next_key, txid=txid)
1406
- if not cur_columns:
1407
- break
1408
- all_listed_columns.extend(cur_columns)
1409
- if not is_truncated:
1410
- break
1411
-
1412
- # build a list of the queried columns
1413
- queried_column_names = set()
1414
- if filters:
1415
- filtered_column_names = ([column_name.split('.')[0] for column_name in filters.keys()]) # use top level of the filter column names
1416
- queried_column_names.update(filtered_column_names)
1417
- _logger.debug(f"_list_table_columns: filtered_column_names={filtered_column_names}")
1418
-
1419
- if field_names:
1420
- field_column_names = ([column_name.split('.')[0] for column_name in field_names]) # use top level of the field column names
1421
- else:
1422
- field_column_names = [column[0] for column in all_listed_columns]
1423
- _logger.debug(f"_list_table_columns: field_column_names={field_column_names}")
1424
- queried_column_names.update(field_column_names)
1425
-
1426
- all_listed_column_and_leaves_names = set()
1427
- for column in all_listed_columns:
1428
- # Collect the column and leaves names for verification below that all the filters and field names are in the table
1429
- column_and_leaves_names = [column[0]] + [f.name for f in column[3].flatten()]
1430
- all_listed_column_and_leaves_names.update(column_and_leaves_names)
1431
-
1432
- # check if this column is needed for the query
1433
- if column[0] in queried_column_names:
1434
- queried_columns.append(column)
1435
-
1436
- # verify that all the filters and field names are in the table
1437
- if filters:
1438
- for filter_column_name in filters.keys():
1439
- if filter_column_name not in all_listed_column_and_leaves_names:
1440
- raise KeyError((f'filter column name: {filter_column_name} does not appear in the table'))
1441
- if field_names:
1442
- for field_name in field_names:
1443
- if field_name not in all_listed_column_and_leaves_names:
1444
- raise ValueError((f'field name: {field_name} does not appear in the table'))
1445
- return list(queried_columns)
1446
-
1447
- def _begin_tx_if_necessary(self, txid):
1448
- if not txid:
1449
- created_txid = True
1450
- res = self.begin_transaction()
1451
- txid = res.headers.get('tabular-txid')
1452
- else:
1453
- created_txid = False
1422
+ url_params = {}
1423
+ if query_imports_table:
1424
+ url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
1425
+ elif projection:
1426
+ url_params['name'] = projection
1427
+ return url_params
1454
1428
 
1455
- return txid, created_txid
1429
+ def legacy_query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1430
+ txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
1431
+ search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
1432
+ request_format='string', response_format='string', query_imports_table=False):
1433
+ """
1434
+ POST /mybucket/myschema/mytable?query-data=LegacyQueryData HTTP/1.1
1435
+ Content-Length: ContentLength
1436
+ tabular-txid: TransactionId
1437
+ tabular-client-tag: ClientTag
1438
+ tabular-split: "split_id,total_splits,num_row_groups_per_split"
1439
+ tabular-num-of-subsplits: "total"
1440
+ tabular-request-format: "string"
1441
+ tabular-response-format: "string" #arrow/trino
1442
+ tabular-schedule-id: "schedule-id"
1443
+
1444
+ Request Body (flatbuf)
1445
+ projections_chunk [expressions]
1446
+ predicate_chunk "formatted_data", (required)
1447
+
1448
+ """
1449
+ headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
1450
+ enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
1451
+ sub_split_start_row_ids)
1452
+ url_params = self._build_query_data_url_params(projection, query_imports_table)
1453
+
1454
+ res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="query-data=LegacyQueryData",
1455
+ url_params=url_params), data=params, headers=headers, stream=True)
1456
+ return self._check_res(res, "legacy_query_data", expected_retvals)
1456
1457
 
1457
- def _prepare_query(self, bucket, schema, table, num_sub_splits, filters=None, field_names=None,
1458
- queried_columns=None, response_row_id=False, txid=0):
1459
- queried_fields = []
1460
- if response_row_id:
1461
- queried_fields.append(pa.field('$row_id', pa.uint64()))
1458
+ def query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1459
+ txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
1460
+ search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
1461
+ request_format='string', response_format='string', query_imports_table=False):
1462
+ """
1463
+ GET /mybucket/myschema/mytable?data HTTP/1.1
1464
+ Content-Length: ContentLength
1465
+ tabular-txid: TransactionId
1466
+ tabular-client-tag: ClientTag
1467
+ tabular-split: "split_id,total_splits,num_row_groups_per_split"
1468
+ tabular-num-of-subsplits: "total"
1469
+ tabular-request-format: "string"
1470
+ tabular-response-format: "string" #arrow/trino
1471
+ tabular-schedule-id: "schedule-id"
1462
1472
 
1463
- if not queried_columns:
1464
- queried_columns = self._list_table_columns(bucket, schema, table, filters, field_names, txid=txid)
1473
+ Request Body (flatbuf)
1474
+ projections_chunk [expressions]
1475
+ predicate_chunk "formatted_data", (required)
1465
1476
 
1466
- queried_fields.extend(pa.field(column[0], column[1]) for column in queried_columns)
1467
- arrow_schema = pa.schema(queried_fields)
1477
+ To query the internal vastdb-imported-objects table, set query_imports_table=True
1478
+ """
1479
+ # add query option select-only and read-only
1468
1480
 
1469
- _logger.debug(f'_prepare_query: arrow_schema = {arrow_schema}')
1481
+ headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
1482
+ enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
1483
+ sub_split_start_row_ids)
1470
1484
 
1471
- query_data_request = build_query_data_request(schema=arrow_schema, filters=filters, field_names=field_names)
1472
- if self.executor_hosts:
1473
- executor_hosts = self.executor_hosts
1474
- else:
1475
- executor_hosts = [self.host]
1476
- executor_sessions = [VastdbApi(executor_hosts[i], self.access_key, self.secret_key, self.username,
1477
- self.password, self.port, self.secure, self.auth_type) for i in range(len(executor_hosts))]
1478
-
1479
- return queried_columns, arrow_schema, query_data_request, executor_sessions
1480
-
1481
- def _more_pages_exist(self, start_row_ids):
1482
- for row_id in start_row_ids.values():
1483
- if row_id != TABULAR_INVALID_ROW_ID:
1484
- return True
1485
- return False
1486
-
1487
- def _query_page(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1488
- txid=0, limit_rows=0, sub_split_start_row_ids=[], filters=None, field_names=None):
1489
- res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
1490
- num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
1491
- limit_rows=limit_rows, sub_split_start_row_ids=sub_split_start_row_ids)
1492
- start_row_ids = {}
1493
- sub_split_tables = parse_query_data_response(res.raw, query_data_request.response_schema,
1494
- start_row_ids=start_row_ids)
1495
- table_page = pa.concat_tables(sub_split_tables)
1496
- _logger.info("query_page: table_page num_rows=%s start_row_ids len=%s",
1497
- len(table_page), len(start_row_ids))
1498
-
1499
- return table_page, start_row_ids
1500
-
1501
- def _query_page_iterator(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1502
- txid=0, limit_rows=0, start_row_ids={}, filters=None, field_names=None):
1503
- res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
1504
- num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
1505
- limit_rows=limit_rows, sub_split_start_row_ids=start_row_ids.items())
1506
- for sub_split_table in parse_query_data_response(res.raw, query_data_request.response_schema,
1507
- start_row_ids=start_row_ids):
1508
- for record_batch in sub_split_table.to_batches():
1509
- yield record_batch
1510
- _logger.info(f"query_page_iterator: start_row_ids={start_row_ids}")
1511
-
1512
- def query_iterator(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
1513
- response_row_id=False, txid=0, limit_per_sub_split=128*1024, filters=None, field_names=None):
1514
- """
1515
- query rows into a table.
1516
-
1517
- Parameters
1518
- ----------
1519
- bucket : string
1520
- The bucket of the table.
1521
- schema : string
1522
- The schema of the table.
1523
- table : string
1524
- The table name.
1525
- num_sub_splits : integer
1526
- The number of sub_splits per split - determines the parallelism inside a VastDB compute node
1527
- default: 1
1528
- num_row_groups_per_sub_split : integer
1529
- The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
1530
- default: 8
1531
- response_row_id : boolean
1532
- Return a column with the internal row ids of the table
1533
- default: False
1534
- txid : integer
1535
- A transaction id. The transaction may be initiated before the query, and if not, the query will initiate it
1536
- default: 0 (will be created by the api)
1537
- limit_per_sub_split : integer
1538
- Limit the number of rows from a single sub_split for a single rpc
1539
- default:131072
1540
- filters : dict
1541
- A dictionary whose keys are column names, and values are lists of string expressions that represent
1542
- filter conditions on the column. AND is applied on the conditions. The condition formats are:
1543
- 'column_name eq some_value'
1544
- default: None
1545
- field_names : list
1546
- A list of column names to be returned in the output table
1547
- default: None
1548
-
1549
- Returns
1550
- -------
1551
- Query iterator generator
1552
-
1553
- Yields
1554
- ------
1555
- pyarrow.RecordBatch
1556
-
1557
- Examples
1558
- --------
1559
- for record_batch in query_iterator('some_bucket', 'some_schema', 'some_table',
1560
- filters={'name': ['eq Alice', 'eq Bob']}
1561
- field_names=['name','age']):
1562
- ...
1563
-
1564
- """
1565
-
1566
- # create a transaction if necessary
1567
- txid, created_txid = self._begin_tx_if_necessary(txid)
1568
- executor_sessions = []
1485
+ url_params = self._build_query_data_url_params(projection, query_imports_table)
1569
1486
 
1570
- try:
1571
- # prepare query
1572
- queried_columns, arrow_schema, query_data_request, executor_sessions = \
1573
- self._prepare_query(bucket, schema, table, num_sub_splits, filters, field_names, response_row_id=response_row_id, txid=txid)
1574
-
1575
- # define the per split threaded query func
1576
- def query_iterator_split_id(self, split_id):
1577
- _logger.info(f"query_iterator_split_id: split_id={split_id}")
1578
- try:
1579
- start_row_ids = {i:0 for i in range(num_sub_splits)}
1580
- session = executor_sessions[split_id]
1581
- while not next_sems[split_id].acquire(timeout=1):
1582
- # check if killed externally
1583
- if killall:
1584
- raise RuntimeError(f'query_iterator_split_id: split_id {split_id} received killall')
1585
-
1586
- while self._more_pages_exist(start_row_ids):
1587
- for record_batch in session._query_page_iterator(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
1588
- split=(split_id, num_splits, num_row_groups_per_sub_split),
1589
- num_sub_splits=num_sub_splits, response_row_id=response_row_id,
1590
- txid=txid, limit_rows=limit_per_sub_split,
1591
- start_row_ids=start_row_ids):
1592
- output_queue.put((split_id, record_batch))
1593
- while not next_sems[split_id].acquire(timeout=1): # wait for the main thread to request the next record batch
1594
- if killall:
1595
- raise RuntimeError(f'split_id {split_id} received killall')
1596
- # end of split
1597
- output_queue.put((split_id,None))
1598
-
1599
- except Exception as e:
1600
- _logger.exception('query_iterator_split_id: exception occurred')
1601
- try:
1602
- self.rollback_transaction(txid)
1603
- except:
1604
- _logger.exception(f'failed to rollback txid {txid}')
1605
- error_queue.put(None)
1606
- raise e
1607
-
1608
- # kickoff executors
1609
- num_splits = len(executor_sessions)
1610
- output_queue = queue.Queue()
1611
- error_queue = queue.Queue()
1612
- next_sems = [threading.Semaphore(value=1) for i in range(num_splits)]
1613
- killall = False
1614
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
1615
- # start executors
1616
- futures = []
1617
- for i in range(num_splits):
1618
- futures.append(executor.submit(query_iterator_split_id, self, i))
1619
-
1620
- # receive outputs and yield them
1621
- done_count = 0
1622
- while done_count < num_splits:
1623
- # check for errors
1624
- try:
1625
- error_queue.get(block=False)
1626
- _logger.error('received error from a thread')
1627
- killall = True
1628
- # wait for all executors to complete
1629
- for future in concurrent.futures.as_completed(futures):
1630
- try:
1631
- future.result() # trigger an exception if occurred in any thread
1632
- except Exception:
1633
- _logger.exception('exception occurred')
1634
- raise RuntimeError('received error from a thread')
1635
- except queue.Empty:
1636
- pass
1637
-
1638
- # try to get a value from the output queue
1639
- try:
1640
- (split_id, record_batch) = output_queue.get(timeout=1)
1641
- except queue.Empty:
1642
- continue
1643
-
1644
- if record_batch:
1645
- # signal to the thread to read the next record batch and yield the current
1646
- next_sems[split_id].release()
1647
- try:
1648
- yield record_batch
1649
- except GeneratorExit:
1650
- killall = True
1651
- _logger.debug("cancelling query_iterator")
1652
- raise
1653
- else:
1654
- done_count += 1
1655
-
1656
- # wait for all executors to complete
1657
- for future in concurrent.futures.as_completed(futures):
1658
- try:
1659
- future.result() # trigger an exception if occurred in any thread
1660
- except Exception:
1661
- _logger.exception('exception occurred')
1662
-
1663
- # commit if needed
1664
- if created_txid:
1665
- self.commit_transaction(txid)
1666
-
1667
- except Exception as e:
1668
- _logger.exception('exception occurred')
1669
- try:
1670
- self.rollback_transaction(txid)
1671
- except:
1672
- _logger.exception(f'failed to rollback txid {txid}')
1673
- raise e
1674
-
1675
- finally:
1676
- killall = True
1677
- for session in executor_sessions:
1678
- try:
1679
- session.session.close()
1680
- except Exception:
1681
- _logger.exception(f'failed to close session {session}')
1682
-
1683
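The removed query_iterator() above fans the query out across one worker thread per split and throttles each worker with a per-split semaphore, so the consumer of the generator controls how fast new pages are fetched. A minimal self-contained sketch of that pattern, using toy producers instead of VAST calls (the real code additionally propagates errors through a separate queue and cancels workers via a killall flag, which this sketch omits):

    import concurrent.futures
    import queue
    import threading

    def bounded_fanout(producers):
        # One worker per "split"; each waits on its own semaphore before
        # producing the next item, so the consumer paces the producers.
        out = queue.Queue()
        sems = [threading.Semaphore(value=1) for _ in producers]

        def run(i, produce):
            for item in produce():
                sems[i].acquire()      # wait until the consumer asks for more
                out.put((i, item))
            out.put((i, None))         # end-of-split marker

        with concurrent.futures.ThreadPoolExecutor(max_workers=len(producers)) as ex:
            for i, produce in enumerate(producers):
                ex.submit(run, i, produce)
            done = 0
            while done < len(producers):
                i, item = out.get()
                if item is None:
                    done += 1
                    continue
                sems[i].release()      # let split i fetch its next page
                yield item

    # Toy usage: two splits, each yielding a few "pages".
    pages = list(bounded_fanout([lambda: iter(['a1', 'a2']), lambda: iter(['b1'])]))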
- def query(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
1684
- response_row_id=False, txid=0, limit=0, limit_per_sub_split=131072, filters=None, field_names=None,
1685
- queried_columns=None):
1686
- """
1687
- Query rows from a table.
1688
-
1689
- Parameters
1690
- ----------
1691
- bucket : string
1692
- The bucket of the table.
1693
- schema : string
1694
- The schema of the table.
1695
- table : string
1696
- The table name.
1697
- num_sub_splits : integer
1698
- The number of sub_splits per split - determines the parallelism inside a VastDB compute node
1699
- default: 1
1700
- num_row_groups_per_sub_split : integer
1701
- The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
1702
- default: 8
1703
- response_row_id : boolean
1704
- Return a column with the internal row ids of the table
1705
- default: False
1706
- txid : integer
1707
- A transaction id. The transaction may be initiated before the query, and can be used to group
1708
- multiple ACID operations
1709
- default: 0 (will be created by the api)
1710
- limit : integer
1711
- Limit the number of rows in the response
1712
- default: 0 (no limit)
1713
- limit_per_sub_split : integer
1714
- Limit the number of rows from a single sub_split for a single rpc
1715
- default:131072
1716
- filters : dict
1717
- A dictionary whose keys are column names, and values are lists of string expressions that represent
1718
- filter conditions on the column. AND is applied on the conditions. The condition formats are:
1719
- 'column_name eq some_value'
1720
- default: None
1721
- field_names : list
1722
- A list of column names to be returned in the output table
1723
- default: None
1724
- queried_columns : list of pyarrow columns
1725
- A list of the columns to be queried
1726
- default: None
1727
-
1728
- Returns
1729
- -------
1730
- pyarrow.Table
1731
-
1732
-
1733
- Examples
1734
- --------
1735
- table = query('some_bucket', 'some_schema', 'some_table',
1736
- filters={'name': ['eq Alice', 'eq Bob']},
1737
- field_names=['name','age'])
1738
-
1739
- """
1740
-
1741
- # create a transaction
1742
- txid, created_txid = self._begin_tx_if_necessary(txid)
1743
- executor_sessions = []
1744
- try:
1745
- # prepare query
1746
- queried_columns, arrow_schema, query_data_request, executor_sessions = \
1747
- self._prepare_query(bucket, schema, table, num_sub_splits, filters, field_names, response_row_id=response_row_id, txid=txid)
1748
-
1749
- # define the per split threaded query func
1750
- def query_split_id(self, split_id):
1751
- try:
1752
- start_row_ids = {i:0 for i in range(num_sub_splits)}
1753
- session = executor_sessions[split_id]
1754
- row_count = 0
1755
- while (self._more_pages_exist(start_row_ids) and
1756
- (not limit or row_count < limit)):
1757
- # check if killed externally
1758
- if killall:
1759
- raise RuntimeError(f'query_split_id: split_id {split_id} received killall')
1760
-
1761
- # determine the limit rows
1762
- if limit:
1763
- limit_rows = min(limit_per_sub_split, limit-row_count)
1764
- else:
1765
- limit_rows = limit_per_sub_split
1766
-
1767
- # query one page
1768
- table_page, start_row_ids = session._query_page(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
1769
- split=(split_id, num_splits, num_row_groups_per_sub_split),
1770
- num_sub_splits=num_sub_splits, response_row_id=response_row_id,
1771
- txid=txid, limit_rows=limit_rows,
1772
- sub_split_start_row_ids=start_row_ids.items())
1773
- with lock:
1774
- table_pages.append(table_page)
1775
- row_counts[split_id] += len(table_page)
1776
- row_count = sum(row_counts)
1777
- _logger.info(f"query_split_id: table_pages split_id={split_id} row_count={row_count}")
1778
- except Exception as e:
1779
- _logger.exception('query_split_id: exception occurred')
1780
- try:
1781
- self.rollback_transaction(txid)
1782
- except:
1783
- _logger.exception(f'failed to rollback txid {txid}')
1784
- raise e
1785
-
1786
- table_pages = []
1787
- num_splits = len(executor_sessions)
1788
- killall = False
1789
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
1790
- futures = []
1791
- row_counts = [0] * num_splits
1792
- lock = threading.Lock()
1793
- for i in range(num_splits):
1794
- futures.append(executor.submit(query_split_id, self, i))
1795
- for future in concurrent.futures.as_completed(futures):
1796
- future.result() # trigger an exception if occurred in any thread
1797
-
1798
- # commit if needed
1799
- if created_txid:
1800
- self.commit_transaction(txid)
1801
-
1802
- # concatenate all table pages and return result
1803
- out_table = pa.concat_tables(table_pages)
1804
- out_table = out_table.slice(length=limit) if limit else out_table
1805
- _logger.info("query: out_table len=%s row_count=%s",
1806
- len(out_table), len(out_table))
1807
- return out_table
1808
-
1809
- except Exception as e:
1810
- _logger.exception('exception occurred')
1811
- try:
1812
- self.rollback_transaction(txid)
1813
- except:
1814
- _logger.exception(f'failed to rollback txid {txid}')
1815
- raise e
1816
-
1817
- finally:
1818
- killall = True
1819
- for session in executor_sessions:
1820
- try:
1821
- session.session.close()
1822
- except Exception:
1823
- _logger.exception(f'failed to close session {session}')
1487
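The removed query() above collects one pyarrow.Table per fetched page into table_pages and builds the final result by concatenating them and applying the optional row limit. A minimal sketch of that last step, using made-up pages:

    import pyarrow as pa

    table_pages = [
        pa.table({'name': ['Alice', 'Bob'], 'age': [25, 24]}),
        pa.table({'name': ['Carol'], 'age': [31]}),
    ]
    limit = 2
    out_table = pa.concat_tables(table_pages)
    out_table = out_table.slice(length=limit) if limit else out_table
    assert len(out_table) == limit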
+ res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data", url_params=url_params),
1488
+ data=params, headers=headers, stream=True)
1489
+ return self._check_res(res, "query_data", expected_retvals)
1824
1490
 
1825
1491
  """
1826
1492
  source_files: list of (bucket_name, file_name)
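For reference, the source_files argument described above is a list of (bucket_name, file_name) pairs; the bucket and object names below are hypothetical:

    source_files = [
        ('source-bucket', '/staging/part-0000.parquet'),
        ('source-bucket', '/staging/part-0001.parquet'),
    ]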
@@ -1874,21 +1540,22 @@ class VastdbApi:
1874
1540
  builder.Finish(params)
1875
1541
  import_req = builder.Output()
1876
1542
 
1877
- def iterate_over_import_data_response(response, expected_retvals):
1543
+ def iterate_over_import_data_response(response):
1878
1544
  if response.status_code != 200:
1879
1545
  return response
1880
1546
 
1881
1547
  chunk_size = 1024
1882
- for chunk in res.iter_content(chunk_size=chunk_size):
1548
+ for chunk in response.iter_content(chunk_size=chunk_size):
1883
1549
  chunk_dict = json.loads(chunk)
1884
- _logger.info(f"import data chunk={chunk}, result: {chunk_dict['res']}")
1885
- if chunk_dict['res'] in expected_retvals:
1886
- _logger.info(f"import finished with expected result={chunk_dict['res']}, error message: {chunk_dict['err_msg']}")
1887
- return response
1888
- elif chunk_dict['res'] != 'Success' and chunk_dict['res'] != 'TabularInProgress':
1889
- raise TabularException(f"Received unexpected error in import_data. "
1890
- f"status: {chunk_dict['res']}, error message: {chunk_dict['err_msg']}")
1891
- _logger.info(f"import_data is in progress. status: {chunk_dict['res']}")
1550
+ _logger.debug("import data chunk=%s, result: %s", chunk_dict, chunk_dict['res'])
1551
+ if chunk_dict['res'] != 'Success' and chunk_dict['res'] != 'TabularInProgress' and chunk_dict['res'] != 'TabularAlreadyImported':
1552
+ raise errors.ImportFilesError(
1553
+ f"Encountered an error during import_data. status: {chunk_dict['res']}, "
1554
+ f"error message: {chunk_dict['err_msg'] or 'Unexpected error'} during import of "
1555
+ f"object name: {chunk_dict['object_name']}", chunk_dict)
1556
+ else:
1557
+ _logger.debug("import_data of object name '%s' is in progress. "
1558
+ "status: %s", chunk_dict['object_name'], chunk_dict['res'])
1892
1559
  return response
1893
1560
 
1894
1561
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
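The streamed response consumed by iterate_over_import_data_response() above is a sequence of small JSON chunks. The field names below are taken from that code; the values are illustrative only:

    import json

    chunk = b'{"res": "TabularInProgress", "err_msg": "", "object_name": "part-0000.parquet"}'
    chunk_dict = json.loads(chunk)
    assert chunk_dict['res'] in ('Success', 'TabularInProgress', 'TabularAlreadyImported')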
@@ -1901,34 +1568,17 @@ class VastdbApi:
1901
1568
  res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data"),
1902
1569
  data=import_req, headers=headers, stream=True)
1903
1570
  if blocking:
1904
- res = iterate_over_import_data_response(res, expected_retvals)
1571
+ res = iterate_over_import_data_response(res)
1905
1572
 
1906
1573
  return self._check_res(res, "import_data", expected_retvals)
1907
1574
 
1908
- def merge_data(self):
1909
- """
1910
- TODO
1911
-
1912
- POST /mybucket/myschema/mytable?data HTTP/1.1
1913
- Content-Length: ContentLength
1914
- tabular-txid: TransactionId
1915
- tabular-client-tag: ClientTag
1916
-
1917
- Request Body
1918
- {
1919
- "format": "string",
1920
- "select_source": "formatted data"
1921
- "predicate": "formatted_data"
1922
- }
1923
- """
1924
- pass
1925
-
1926
1575
  def _record_batch_slices(self, batch, rows_per_slice=None):
1927
1576
  max_slice_size_in_bytes = int(0.9*5*1024*1024) # 0.9 * 5MB
1928
1577
  batch_len = len(batch)
1929
1578
  serialized_batch = serialize_record_batch(batch)
1930
1579
  batch_size_in_bytes = len(serialized_batch)
1931
- _logger.info(f'max_slice_size_in_bytes={max_slice_size_in_bytes} batch_len={batch_len} batch_size_in_bytes={batch_size_in_bytes}')
1580
+ _logger.debug('max_slice_size_in_bytes=%d batch_len=%d batch_size_in_bytes=%d',
1581
+ max_slice_size_in_bytes, batch_len, batch_size_in_bytes)
1932
1582
 
1933
1583
  if not rows_per_slice:
1934
1584
  if batch_size_in_bytes < max_slice_size_in_bytes:
@@ -1950,7 +1600,7 @@ class VastdbApi:
1950
1600
  serialized_slice_batch = serialize_record_batch(slice_batch)
1951
1601
  sizeof_serialized_slice_batch = len(serialized_slice_batch)
1952
1602
 
1953
- if sizeof_serialized_slice_batch <= max_slice_size_in_bytes or rows_per_slice < 10000:
1603
+ if sizeof_serialized_slice_batch <= max_slice_size_in_bytes:
1954
1604
  serialized_slices.append(serialized_slice_batch)
1955
1605
  else:
1956
1606
  _logger.info(f'Using rows_per_slice {rows_per_slice} slice {i} size {sizeof_serialized_slice_batch} exceeds {max_slice_size_in_bytes} bytes, trying smaller rows_per_slice')
@@ -1964,125 +1614,6 @@ class VastdbApi:
1964
1614
 
1965
1615
  return serialized_slices
1966
1616
 
1967
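_record_batch_slices() above caps each serialized slice at roughly 4.5 MB (0.9 x 5 MB) and retries with fewer rows per slice when a slice still serializes too large. A simplified standalone sketch of the same idea (single pass, no retry loop, Arrow IPC stream serialization assumed):

    import math
    import pyarrow as pa

    def record_batch_slices(batch, max_bytes=int(0.9 * 5 * 1024 * 1024)):
        sink = pa.BufferOutputStream()
        with pa.ipc.new_stream(sink, batch.schema) as writer:
            writer.write_batch(batch)
        total_bytes = sink.getvalue().size
        if total_bytes <= max_bytes:
            return [batch]
        n_slices = math.ceil(total_bytes / max_bytes)
        rows_per_slice = math.ceil(len(batch) / n_slices)
        return [batch.slice(offset, rows_per_slice)
                for offset in range(0, len(batch), rows_per_slice)]

    batch = pa.RecordBatch.from_pydict({'name': ['Alice', 'Bob'], 'age': [25, 24]})
    slices = record_batch_slices(batch)   # small batch -> a single slice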
- def insert(self, bucket, schema, table, rows=None, record_batch=None, rows_per_insert=None, txid=0):
1968
- """
1969
- Insert rows into a table. The operation may be split into multiple commands, such that by default no more than 512KB will be inserted per command.
1970
-
1971
- Parameters
1972
- ----------
1973
- bucket : string
1974
- The bucket of the table.
1975
- schema : string
1976
- The schema of the table.
1977
- table : string
1978
- The table name.
1979
- rows : dict
1980
- The rows to insert.
1981
- dictionary key: column name
1982
- dictionary value: array of cell values to insert
1983
- default: None (if None, record_batch must be provided)
1984
- record_batch : pyarrow.RecordBatch
1985
- A pyarrow RecordBatch
1986
- default: None (if None, rows dictionary must be provided)
1987
- rows_per_insert : integer
1988
- Split the operation so that each insert command will be limited to this value
1989
- default: None (will be selected automatically)
1990
- txid : integer
1991
- A transaction id. The transaction may be initiated before the insert, and can be used to group
1992
- multiple ACID operations
1993
- default: 0 (will be created by the api)
1994
-
1995
- Returns
1996
- -------
1997
- None
1998
-
1999
-
2000
- Examples
2001
- --------
2002
- insert('some_bucket', 'some_schema', 'some_table', {'name': ['Alice','Bob'], 'age': [25,24]})
2003
-
2004
- """
2005
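The body below resolves the column types from the table's schema and converts the rows dict into a pyarrow.RecordBatch before slicing and sending it. A standalone sketch of that conversion, with the column types assumed rather than looked up:

    import pyarrow as pa

    rows = {'name': ['Alice', 'Bob'], 'age': [25, 24]}
    columns_dict = {'name': pa.utf8(), 'age': pa.int64()}   # normally taken from the table schema
    arrow_schema = pa.schema([pa.field(name, columns_dict[name]) for name in rows])
    arrays = [pa.array(values, columns_dict[name]) for name, values in rows.items()]
    record_batch = pa.record_batch(arrays, schema=arrow_schema)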
- if (not rows and not record_batch) or (rows and record_batch):
2006
- raise ValueError('insert: exactly one of rows or record_batch must be provided')
2007
-
2008
- # create a transaction
2009
- txid, created_txid = self._begin_tx_if_necessary(txid)
2010
-
2011
- if rows:
2012
- columns = self._list_table_columns(bucket, schema, table, field_names=rows.keys(), txid=txid)
2013
- columns_dict = dict([(column[0], column[1]) for column in columns])
2014
- arrow_schema = pa.schema([])
2015
- arrays = []
2016
- for column_name, column_values in rows.items():
2017
- column_type = columns_dict[column_name]
2018
- field = pa.field(column_name, column_type)
2019
- arrow_schema = arrow_schema.append(field)
2020
- arrays.append(pa.array(column_values, column_type))
2021
- record_batch = pa.record_batch(arrays, arrow_schema)
2022
-
2023
- # split the record batch into multiple slices
2024
- serialized_slices = self._record_batch_slices(record_batch, rows_per_insert)
2025
- _logger.info(f'inserting record batch using {len(serialized_slices)} slices')
2026
-
2027
- insert_queue = queue.Queue()
2028
-
2029
- [insert_queue.put(insert_rows_req) for insert_rows_req in serialized_slices]
2030
-
2031
- try:
2032
- executor_sessions = [VastdbApi(self.executor_hosts[i], self.access_key, self.secret_key, self.username,
2033
- self.password, self.port, self.secure, self.auth_type) for i in range(len(self.executor_hosts))]
2034
-
2035
- def insert_executor(self, split_id):
2036
-
2037
- try:
2038
- _logger.info(f'insert_executor split_id={split_id} starting')
2039
- session = executor_sessions[split_id]
2040
- num_inserts = 0
2041
- while not killall:
2042
- try:
2043
- insert_rows_req = insert_queue.get(block=False)
2044
- except queue.Empty:
2045
- break
2046
- session.insert_rows(bucket=bucket, schema=schema,
2047
- table=table, record_batch=insert_rows_req, txid=txid)
2048
- num_inserts += 1
2049
- _logger.info(f'insert_executor split_id={split_id} num_inserts={num_inserts}')
2050
- if killall:
2051
- _logger.info('insert_executor killall=True')
2052
-
2053
- except Exception as e:
2054
- _logger.exception('insert_executor hit exception')
2055
- raise e
2056
-
2057
- num_splits = len(executor_sessions)
2058
- killall = False
2059
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
2060
- futures = []
2061
- for i in range(num_splits):
2062
- futures.append(executor.submit(insert_executor, self, i))
2063
- for future in concurrent.futures.as_completed(futures):
2064
- future.result() # trigger an exception if occurred in any thread
2065
-
2066
- # commit if needed
2067
- if created_txid:
2068
- self.commit_transaction(txid)
2069
-
2070
- except Exception as e:
2071
- _logger.exception('exception occurred')
2072
- try:
2073
- self.rollback_transaction(txid)
2074
- except:
2075
- _logger.exception(f'failed to rollback txid {txid}')
2076
- raise e
2077
-
2078
- finally:
2079
- killall = True
2080
- for session in executor_sessions:
2081
- try:
2082
- session.session.close()
2083
- except Exception:
2084
- _logger.exception(f'failed to close session {session}')
2085
-
2086
1617
  def insert_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[]):
2087
1618
  """
2088
1619
  POST /mybucket/myschema/mytable?rows HTTP/1.1
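insert_rows() sends an already-serialized record batch as the request body of that POST. A hedged sketch of preparing such a payload, assuming Arrow IPC stream serialization; the URL and headers in the commented-out call are illustrative only, not the exact values built by _api_prefix() and _fill_common_headers():

    import pyarrow as pa

    batch = pa.RecordBatch.from_pydict({'name': ['Alice'], 'age': [25]})
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, batch.schema) as writer:
        writer.write_batch(batch)
    payload = sink.getvalue().to_pybytes()
    # requests.post('https://<vip>/mybucket/myschema/mytable?rows',
    #               data=payload, headers={'tabular-txid': '0'})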
@@ -2115,7 +1646,8 @@ class VastdbApi:
2115
1646
  data=record_batch, headers=headers)
2116
1647
  return self._check_res(res, "update_rows", expected_retvals)
2117
1648
 
2118
- def delete_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[]):
1649
+ def delete_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[],
1650
+ delete_from_imports_table=False):
2119
1651
  """
2120
1652
  DELETE /mybucket/myschema/mytable?rows HTTP/1.1
2121
1653
  Content-Length: ContentLength
@@ -2127,8 +1659,10 @@ class VastdbApi:
2127
1659
  """
2128
1660
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
2129
1661
  headers['Content-Length'] = str(len(record_batch))
2130
- res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=table, command="rows"),
2131
- data=record_batch, headers=headers)
1662
+ url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if delete_from_imports_table else {}
1663
+
1664
+ res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=table, command="rows", url_params=url_params),
1665
+ data=record_batch, headers=headers)
2132
1666
  return self._check_res(res, "delete_rows", expected_retvals)
2133
1667
 
2134
1668
  def create_projection(self, bucket, schema, table, name, columns, txid=0, client_tags=[], expected_retvals=[]):
@@ -2352,41 +1886,40 @@ def _iter_query_data_response_columns(fileobj, stream_ids=None):
2352
1886
  if stream_ids is not None:
2353
1887
  stream_ids.update([stream_id]) # count stream IDs using a collections.Counter
2354
1888
  if stream_id == TABULAR_KEEP_ALIVE_STREAM_ID:
2355
- # _logger.info(f"stream_id={stream_id} (skipping)")
2356
1889
  continue
2357
1890
 
2358
1891
  if stream_id == TABULAR_QUERY_DATA_COMPLETED_STREAM_ID:
2359
1892
  # read the terminating end chunk from socket
2360
1893
  res = fileobj.read()
2361
- _logger.info(f"stream_id={stream_id} res={res} (finish)")
1894
+ _logger.debug("stream_id=%d res=%s (finish)", stream_id, res)
2362
1895
  return
2363
1896
 
2364
1897
  if stream_id == TABULAR_QUERY_DATA_FAILED_STREAM_ID:
2365
1898
  # read the terminating end chunk from socket
2366
1899
  res = fileobj.read()
2367
- _logger.info(f"stream_id={stream_id} res={res} (failed)")
1900
+ _logger.warning("stream_id=%d res=%s (failed)", stream_id, res)
2368
1901
  raise IOError(f"Query data stream failed res={res}")
2369
1902
 
2370
1903
  next_row_id_bytes = fileobj.read(8)
2371
1904
  next_row_id, = struct.unpack('<Q', next_row_id_bytes)
2372
- _logger.info(f"stream_id={stream_id} next_row_id={next_row_id}")
1905
+ _logger.debug("stream_id=%d next_row_id=%d", stream_id, next_row_id)
2373
1906
 
2374
1907
  if stream_id not in readers:
2375
1908
  # we implicitly read 1st message (Arrow schema) when constructing RecordBatchStreamReader
2376
1909
  reader = pa.ipc.RecordBatchStreamReader(fileobj)
2377
- _logger.info(f"stream_id={stream_id} schema={reader.schema}")
1910
+ _logger.debug("stream_id=%d schema=%s", stream_id, reader.schema)
2378
1911
  readers[stream_id] = (reader, [])
2379
1912
  continue
2380
1913
 
2381
1914
  (reader, batches) = readers[stream_id]
2382
1915
  try:
2383
1916
  batch = reader.read_next_batch() # read single-column chunk data
2384
- _logger.info(f"stream_id={stream_id} rows={len(batch)} chunk={batch}")
1917
+ _logger.debug("stream_id=%d rows=%d chunk=%s", stream_id, len(batch), batch)
2385
1918
  batches.append(batch)
2386
1919
  except StopIteration: # we got an end-of-stream IPC message for a given stream ID
2387
1920
  reader, batches = readers.pop(stream_id) # end of column
2388
1921
  table = pa.Table.from_batches(batches) # concatenate all column chunks (as a single)
2389
- _logger.info(f"stream_id={stream_id} rows={len(table)} column={table}")
1922
+ _logger.debug("stream_id=%d rows=%d column=%s", stream_id, len(table), table)
2390
1923
  yield (stream_id, next_row_id, table)
2391
1924
 
2392
1925
 
@@ -2398,24 +1931,23 @@ def parse_query_data_response(conn, schema, stream_ids=None, start_row_ids=None,
2398
1931
  """
2399
1932
  if start_row_ids is None:
2400
1933
  start_row_ids = {}
2401
- projection_positions = schema.projection_positions
2402
- arrow_schema = schema.arrow_schema
2403
- output_field_names = schema.output_field_names
2404
- _logger.debug(f'projection_positions={projection_positions} len(arrow_schema)={len(arrow_schema)} arrow_schema={arrow_schema}')
2405
- is_empty_projection = (len(projection_positions) == 0)
2406
- parsers = defaultdict(lambda: QueryDataParser(arrow_schema, debug=debug, projection_positions=projection_positions)) # {stream_id: QueryDataParser}
1934
+
1935
+ is_empty_projection = (len(schema) == 0)
1936
+ parsers = defaultdict(lambda: QueryDataParser(schema, debug=debug)) # {stream_id: QueryDataParser}
1937
+
2407
1938
  for stream_id, next_row_id, table in _iter_query_data_response_columns(conn, stream_ids):
2408
1939
  parser = parsers[stream_id]
2409
1940
  for column in table.columns:
2410
1941
  parser.parse(column)
2411
1942
 
2412
- parsed_table = parser.build(output_field_names)
1943
+ parsed_table = parser.build()
2413
1944
  if parsed_table is not None: # when we got all columns (and before starting a new "select_rows" cycle)
2414
1945
  parsers.pop(stream_id)
2415
1946
  if is_empty_projection: # VAST returns an empty RecordBatch, with the correct rows' count
2416
1947
  parsed_table = table
2417
1948
 
2418
- _logger.info(f"stream_id={stream_id} rows={len(parsed_table)} next_row_id={next_row_id} table={parsed_table}")
1949
+ _logger.debug("stream_id=%d rows=%d next_row_id=%d table=%s",
1950
+ stream_id, len(parsed_table), next_row_id, parsed_table)
2419
1951
  start_row_ids[stream_id] = next_row_id
2420
1952
  yield parsed_table # the result of a single "select_rows()" cycle
2421
1953
 
@@ -2496,7 +2028,7 @@ def get_field_type(builder: flatbuffers.Builder, field: pa.Field):
2496
2028
  fb_utf8.Start(builder)
2497
2029
  field_type = fb_utf8.End(builder)
2498
2030
 
2499
- elif field.type.equals(pa.date32()): # pa.date64()
2031
+ elif field.type.equals(pa.date32()): # pa.date64() is not supported
2500
2032
  field_type_type = Type.Date
2501
2033
  fb_date.Start(builder)
2502
2034
  fb_date.AddUnit(builder, DateUnit.DAY)
@@ -2564,7 +2096,6 @@ def get_field_type(builder: flatbuffers.Builder, field: pa.Field):
2564
2096
  return field_type, field_type_type
2565
2097
 
2566
2098
  def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
2567
- _logger.info(f"name={f.name}")
2568
2099
  children = None
2569
2100
  if isinstance(f.type, pa.StructType):
2570
2101
  children = [build_field(builder, child, child.name) for child in list(f.type)]
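build_field() recurses into nested types; for a struct, the children are simply the struct's own fields, which the line above iterates. A small illustration of that traversal:

    import pyarrow as pa

    struct_field = pa.field('location', pa.struct([('x', pa.float64()), ('y', pa.float64())]))
    children = list(struct_field.type)      # a pa.StructType iterates over its child fields
    assert [child.name for child in children] == ['x', 'y']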
@@ -2591,7 +2122,6 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
2591
2122
  fb_field.AddName(builder, child_col_name)
2592
2123
  fb_field.AddChildren(builder, children)
2593
2124
 
2594
- _logger.info(f"added key and map to entries")
2595
2125
  children = [fb_field.End(builder)]
2596
2126
 
2597
2127
  if children is not None:
@@ -2602,32 +2132,22 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
2602
2132
 
2603
2133
  col_name = builder.CreateString(name)
2604
2134
  field_type, field_type_type = get_field_type(builder, f)
2605
- _logger.info(f"add col_name={name} type_type={field_type_type} to fb")
2606
2135
  fb_field.Start(builder)
2607
2136
  fb_field.AddName(builder, col_name)
2608
2137
  fb_field.AddTypeType(builder, field_type_type)
2609
2138
  fb_field.AddType(builder, field_type)
2610
2139
  if children is not None:
2611
- _logger.info(f"add col_name={name} childern")
2612
2140
  fb_field.AddChildren(builder, children)
2613
2141
  return fb_field.End(builder)
2614
2142
 
2615
2143
 
2616
- class VastDBResponseSchema:
2617
- def __init__(self, arrow_schema, projection_positions, output_field_names):
2618
- self.arrow_schema = arrow_schema
2619
- self.projection_positions = projection_positions
2620
- self.output_field_names = output_field_names
2621
-
2622
2144
  class QueryDataRequest:
2623
2145
  def __init__(self, serialized, response_schema):
2624
2146
  self.serialized = serialized
2625
2147
  self.response_schema = response_schema
2626
2148
 
2627
2149
 
2628
- def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict = None, field_names: list = None):
2629
- filters = filters or {}
2630
-
2150
+ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), predicate: ibis.expr.types.BooleanColumn = None, field_names: list = None):
2631
2151
  builder = flatbuffers.Builder(1024)
2632
2152
 
2633
2153
  source_name = builder.CreateString('') # required
@@ -2643,39 +2163,21 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
2643
2163
  fb_schema.AddFields(builder, fields)
2644
2164
  schema_obj = fb_schema.End(builder)
2645
2165
 
2646
- predicate = Predicate(schema, filters)
2166
+ predicate = Predicate(schema=schema, expr=predicate)
2647
2167
  filter_obj = predicate.serialize(builder)
2648
2168
 
2649
2169
  parser = QueryDataParser(schema)
2650
- leaves_map = {}
2651
- for node in parser.nodes:
2652
- for descendent in node._iter_nodes():
2653
- if descendent.parent and isinstance(descendent.parent.type, (pa.ListType, pa.MapType)):
2654
- continue
2655
- iter_from_root = reversed(list(descendent._iter_to_root()))
2656
- descendent_full_name = '.'.join([n.field.name for n in iter_from_root])
2657
- _logger.debug(f'build_query_data_request: descendent_full_name={descendent_full_name}')
2658
- descendent_leaves = [leaf.index for leaf in descendent._iter_leaves()]
2659
- leaves_map[descendent_full_name] = descendent_leaves
2660
- _logger.debug(f'build_query_data_request: leaves_map={leaves_map}')
2661
-
2662
- output_field_names = None
2170
+ fields_map = {node.field.name: node.field for node in parser.nodes}
2171
+ leaves_map = {node.field.name: [leaf.index for leaf in node._iter_leaves()] for node in parser.nodes}
2172
+
2663
2173
  if field_names is None:
2664
2174
  field_names = [field.name for field in schema]
2665
- else:
2666
- output_field_names = [f.split('.')[0] for f in field_names]
2667
- # sort projected field_names according to positions to maintain ordering according to the schema
2668
- def compare_field_names_by_pos(field_name1, field_name2):
2669
- return leaves_map[field_name1][0]-leaves_map[field_name2][0]
2670
- field_names = sorted(field_names, key=cmp_to_key(compare_field_names_by_pos))
2671
- _logger.debug(f'build_query_data_request: sorted field_names={field_names} schema={schema}')
2672
2175
 
2176
+ response_schema = pa.schema([fields_map[name] for name in field_names])
2673
2177
  projection_fields = []
2674
- projection_positions = []
2675
2178
  for field_name in field_names:
2179
+ # TODO: only root-level projection pushdown is supported (i.e. no support for SELECT s.x FROM t)
2676
2180
  positions = leaves_map[field_name]
2677
- _logger.info("projecting field=%s positions=%s", field_name, positions)
2678
- projection_positions.extend(positions)
2679
2181
  for leaf_position in positions:
2680
2182
  fb_field_index.Start(builder)
2681
2183
  fb_field_index.AddPosition(builder, leaf_position)
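In the new signature above, the predicate argument is an ibis boolean expression rather than the old filters dict. A hedged sketch of building one against an unbound ibis table whose columns mirror the Arrow schema; the final call is shown commented out because it depends on the surrounding module:

    import ibis
    import pyarrow as pa

    arrow_schema = pa.schema([('name', pa.utf8()), ('age', pa.int32())])
    t = ibis.table([('name', 'string'), ('age', 'int32')], name='t')
    predicate = (t.age > 25) & (t.name != 'Bob')
    # request = build_query_data_request(schema=arrow_schema, predicate=predicate,
    #                                    field_names=['name'])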
@@ -2686,8 +2188,6 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
2686
2188
  builder.PrependUOffsetTRelative(offset)
2687
2189
  projection = builder.EndVector()
2688
2190
 
2689
- response_schema = VastDBResponseSchema(schema, projection_positions, output_field_names=output_field_names)
2690
-
2691
2191
  fb_source.Start(builder)
2692
2192
  fb_source.AddName(builder, source_name)
2693
2193
  fb_source.AddSchema(builder, schema_obj)
@@ -2731,11 +2231,9 @@ def convert_column_types(table: 'pa.Table') -> 'pa.Table':
2731
2231
  indexes_of_fields_to_change[field.name] = index
2732
2232
  for changing_index in ts_indexes:
2733
2233
  field_name = table.schema[changing_index].name
2734
- _logger.info(f'changing resolution for {field_name} to us')
2735
2234
  new_column = table[field_name].cast(pa.timestamp('us'), safe=False)
2736
2235
  table = table.set_column(changing_index, field_name, new_column)
2737
2236
  for field_name, changing_index in indexes_of_fields_to_change.items():
2738
- _logger.info(f'applying custom rules to {field_name}')
2739
2237
  new_column = table[field_name].to_pylist()
2740
2238
  new_column = list(map(column_matcher[field_name], new_column))
2741
2239
  new_column = pa.array(new_column, table[field_name].type)