vastdb 0.0.5.2__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
  2. vast_flatbuf/tabular/VipRange.py +56 -0
  3. vastdb/__init__.py +7 -0
  4. vastdb/bucket.py +77 -0
  5. vastdb/errors.py +158 -0
  6. vastdb/{api.py → internal_commands.py} +283 -747
  7. vastdb/schema.py +77 -0
  8. vastdb/session.py +48 -0
  9. vastdb/table.py +480 -0
  10. vastdb/tests/conftest.py +46 -0
  11. vastdb/tests/test_imports.py +125 -0
  12. vastdb/tests/test_projections.py +41 -0
  13. vastdb/tests/test_sanity.py +83 -0
  14. vastdb/tests/test_schemas.py +45 -0
  15. vastdb/tests/test_tables.py +608 -0
  16. vastdb/transaction.py +55 -0
  17. vastdb/util.py +77 -0
  18. vastdb-0.1.0.dist-info/METADATA +38 -0
  19. {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/RECORD +23 -24
  20. vast_protobuf/substrait/__init__.py +0 -0
  21. vast_protobuf/substrait/algebra_pb2.py +0 -1344
  22. vast_protobuf/substrait/capabilities_pb2.py +0 -46
  23. vast_protobuf/substrait/ddl_pb2.py +0 -57
  24. vast_protobuf/substrait/extended_expression_pb2.py +0 -49
  25. vast_protobuf/substrait/extensions/__init__.py +0 -0
  26. vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
  27. vast_protobuf/substrait/function_pb2.py +0 -168
  28. vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
  29. vast_protobuf/substrait/plan_pb2.py +0 -67
  30. vast_protobuf/substrait/type_expressions_pb2.py +0 -198
  31. vast_protobuf/substrait/type_pb2.py +0 -350
  32. vast_protobuf/tabular/__init__.py +0 -0
  33. vast_protobuf/tabular/rpc_pb2.py +0 -344
  34. vastdb/v2.py +0 -108
  35. vastdb-0.0.5.2.dist-info/METADATA +0 -47
  36. {vast_protobuf → vastdb/tests}/__init__.py +0 -0
  37. {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/LICENSE +0 -0
  38. {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/WHEEL +0 -0
  39. {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/top_level.txt +0 -0
@@ -1,29 +1,26 @@
1
- import array
2
1
  import logging
3
2
  import struct
4
3
  import urllib.parse
5
4
  from collections import defaultdict, namedtuple
6
5
  from datetime import datetime
7
6
  from enum import Enum
8
- from typing import List, Union, Optional, Iterator
7
+ from typing import Union, Optional, Iterator
8
+ import ibis
9
9
  import xmltodict
10
- import concurrent.futures
11
- import threading
12
- import queue
13
10
  import math
14
- import socket
15
11
  from functools import cmp_to_key
16
12
  import pyarrow.parquet as pq
17
13
  import flatbuffers
18
14
  import pyarrow as pa
19
15
  import requests
20
- import datetime
21
- import hashlib
22
- import hmac
23
16
  import json
24
17
  import itertools
25
18
  from aws_requests_auth.aws_auth import AWSRequestsAuth
26
- from io import BytesIO
19
+ import urllib3
20
+ import re
21
+
22
+ from . import errors
23
+ from ipaddress import IPv4Address, IPv6Address
27
24
 
28
25
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BinaryLiteral as fb_binary_lit
29
26
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BooleanLiteral as fb_bool_lit
@@ -91,30 +88,22 @@ TABULAR_QUERY_DATA_COMPLETED_STREAM_ID = 0xFFFFFFFF - 1
91
88
  TABULAR_QUERY_DATA_FAILED_STREAM_ID = 0xFFFFFFFF - 2
92
89
  TABULAR_INVALID_ROW_ID = 0xFFFFFFFFFFFF # (1<<48)-1
93
90
  ESTORE_INVALID_EHANDLE = UINT64_MAX
91
+ IMPORTED_OBJECTS_TABLE_NAME = "vastdb-imported-objects"
94
92
 
95
93
  """
96
94
  S3 Tabular API
97
95
  """
98
96
 
99
97
 
100
- def get_logger(name):
101
- log = logging.getLogger(name)
102
- log.setLevel(logging.ERROR)
103
- ch = logging.StreamHandler()
104
- ch.setLevel(logging.INFO)
105
- ch.set_name('tabular_stream_handler')
106
- formatter = logging.Formatter("%(asctime)s:%(levelname)s:%(message)s")
107
- ch.setFormatter(formatter)
108
- log.addHandler(ch)
109
- log.propagate = False
110
- return log
111
-
112
-
113
- _logger = get_logger(__name__)
98
+ _logger = logging.getLogger(__name__)
114
99
 
115
100
 
116
- def set_tabular_log_level(level: int = logging.INFO):
117
- _logger.setLevel(level)
101
+ def _flatten_args(op, op_type):
102
+ if isinstance(op, op_type):
103
+ for arg in op.args:
104
+ yield from _flatten_args(arg, op_type)
105
+ else:
106
+ yield op
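A minimal, self-contained sketch of what _flatten_args does (the Op/And/Greater classes below are stand-ins, not ibis internals): it linearizes a nested binary-operator tree into its top-level conjuncts or disjuncts.

    from dataclasses import dataclass, field
    from typing import Any, List

    @dataclass
    class Op:                        # stand-in for an ibis operation node
        args: List[Any] = field(default_factory=list)

    class And(Op): pass              # stand-in for the And operation
    class Greater(Op): pass          # stand-in leaf predicate

    def _flatten_args(op, op_type):
        if isinstance(op, op_type):
            for arg in op.args:
                yield from _flatten_args(arg, op_type)
        else:
            yield op

    # And(And(p1, p2), p3) flattens to [p1, p2, p3]
    expr = And(args=[And(args=[Greater(), Greater()]), Greater()])
    assert len(list(_flatten_args(expr, And))) == 3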
118
107
 
119
108
 
120
109
  class AuthType(Enum):
@@ -123,10 +112,6 @@ class AuthType(Enum):
123
112
  BASIC = "basic"
124
113
 
125
114
 
126
- class TabularException(Exception):
127
- pass
128
-
129
-
130
115
  def get_unit_to_flatbuff_time_unit(type):
131
116
  unit_to_flatbuff_time_unit = {
132
117
  'ns': TimeUnit.NANOSECOND,
@@ -144,11 +129,10 @@ class Predicate:
144
129
  's': 0.001
145
130
  }
146
131
 
147
- def __init__(self, schema: 'pa.Schema', filters: dict):
132
+ def __init__(self, schema: 'pa.Schema', expr: ibis.expr.types.BooleanColumn):
148
133
  self.schema = schema
149
- self.filters = filters
134
+ self.expr = expr
150
135
  self.builder = None
151
- self._field_name_per_index = None
152
136
 
153
137
  def get_field_indexes(self, field: 'pa.Field', field_name_per_index: list) -> None:
154
138
  field_name_per_index.append(field.name)
@@ -172,7 +156,6 @@ class Predicate:
172
156
  for field in self.schema:
173
157
  self.get_field_indexes(field, _field_name_per_index)
174
158
  self._field_name_per_index = {field: index for index, field in enumerate(_field_name_per_index)}
175
- _logger.debug(f'field_name_per_index: {self._field_name_per_index}')
176
159
  return self._field_name_per_index
177
160
 
178
161
  def get_projections(self, builder: 'flatbuffers.builder.Builder', field_names: list = None):
@@ -190,10 +173,77 @@ class Predicate:
190
173
  return builder.EndVector()
191
174
 
192
175
  def serialize(self, builder: 'flatbuffers.builder.Builder'):
176
+ from ibis.expr.operations.generic import TableColumn, Literal, IsNull
177
+ from ibis.expr.operations.logical import Greater, GreaterEqual, Less, LessEqual, Equals, NotEquals, And, Or, Not
178
+ from ibis.expr.operations.strings import StringContains
179
+
180
+ builder_map = {
181
+ Greater: self.build_greater,
182
+ GreaterEqual: self.build_greater_equal,
183
+ Less: self.build_less,
184
+ LessEqual: self.build_less_equal,
185
+ Equals: self.build_equal,
186
+ NotEquals: self.build_not_equal,
187
+ IsNull: self.build_is_null,
188
+ Not: self.build_is_not_null,
189
+ StringContains: self.build_match_substring,
190
+ }
191
+
192
+ positions_map = dict((f.name, index) for index, f in enumerate(self.schema)) # TODO: BFS
193
+
193
194
  self.builder = builder
195
+
194
196
  offsets = []
195
- for field_name in self.filters:
196
- offsets.append(self.build_domain(self.build_column(self.field_name_per_index[field_name]), field_name))
197
+
198
+ if self.expr is not None:
199
+ and_args = list(_flatten_args(self.expr.op(), And))
200
+ _logger.debug('AND args: %s ops %s', and_args, self.expr.op())
201
+ for op in and_args:
202
+ or_args = list(_flatten_args(op, Or))
203
+ _logger.debug('OR args: %s op %s', or_args, op)
204
+ inner_offsets = []
205
+
206
+ prev_field_name = None
207
+ for inner_op in or_args:
208
+ _logger.debug('inner_op %s', inner_op)
209
+ builder_func = builder_map.get(type(inner_op))
210
+ if not builder_func:
211
+ raise NotImplementedError(inner_op.name)
212
+
213
+ if builder_func == self.build_is_null:
214
+ column, = inner_op.args
215
+ literal = None
216
+ elif builder_func == self.build_is_not_null:
217
+ not_arg, = inner_op.args
218
+ # currently we only support NOT over is_null; verify that the operand under the NOT is indeed is_null:
219
+ if not builder_map.get(type(not_arg)) == self.build_is_null:
220
+ raise NotImplementedError(not_arg.args[0].name)
221
+ column, = not_arg.args
222
+ literal = None
223
+ else:
224
+ column, literal = inner_op.args
225
+ if not isinstance(literal, Literal):
226
+ raise NotImplementedError(inner_op.name)
227
+
228
+ if not isinstance(column, TableColumn):
229
+ raise NotImplementedError(inner_op.name)
230
+
231
+ field_name = column.name
232
+ if prev_field_name is None:
233
+ prev_field_name = field_name
234
+ elif prev_field_name != field_name:
235
+ raise NotImplementedError(op.name)
236
+
237
+ args_offsets = [self.build_column(position=positions_map[field_name])]
238
+ if literal:
239
+ field = self.schema.field(field_name)
240
+ args_offsets.append(self.build_literal(field=field, value=literal.value))
241
+
242
+ inner_offsets.append(builder_func(*args_offsets))
243
+
244
+ domain_offset = self.build_or(inner_offsets)
245
+ offsets.append(domain_offset)
246
+
197
247
  return self.build_and(offsets)
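A hedged usage sketch of the new predicate form (column and value names are made up): filters are now passed as an ibis boolean expression whose AND-of-ORs shape matches the flattening above, with each OR group restricted to a single column.

    import ibis
    import pyarrow as pa

    schema = pa.schema([('name', pa.utf8()), ('age', pa.int64())])
    t = ibis.table([('name', 'string'), ('age', 'int64')], name='t')
    expr = ((t['name'] == 'Alice') | (t['name'] == 'Bob')) & (t['age'] >= 18)
    # Predicate(schema, expr).serialize(builder) would then emit roughly:
    #   and(or(equal(name, 'Alice'), equal(name, 'Bob')), greater_equal(age, 18))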
198
248
 
199
249
  def build_column(self, position: int):
@@ -221,7 +271,6 @@ class Predicate:
221
271
  field = self.schema.field(field_name)
222
272
  for attr in field_attrs:
223
273
  field = field.type[attr]
224
- _logger.info(f'trying to append field: {field} with domains: {filters}')
225
274
  for filter_by_name in filters:
226
275
  offsets.append(self.build_range(column=column, field=field, filter_by_name=filter_by_name))
227
276
  return self.build_or(offsets)
@@ -263,11 +312,9 @@ class Predicate:
263
312
  return self.build_and(rules)
264
313
 
265
314
  def build_function(self, name: str, *offsets):
266
- _logger.info(f'name: {name}, offsets: {offsets}')
267
315
  offset_name = self.builder.CreateString(name)
268
316
  fb_call.StartArgumentsVector(self.builder, len(offsets))
269
317
  for offset in reversed(offsets):
270
- _logger.info(f'offset: {offset}')
271
318
  self.builder.PrependUOffsetTRelative(offset)
272
319
  offset_arguments = self.builder.EndVector()
273
320
 
@@ -282,7 +329,7 @@ class Predicate:
282
329
  fb_expression.AddImpl(self.builder, offset_call)
283
330
  return fb_expression.End(self.builder)
284
331
 
285
- def build_literal(self, field: pa.Field, value: str):
332
+ def build_literal(self, field: pa.Field, value):
286
333
  if field.type.equals(pa.int64()):
287
334
  literal_type = fb_int64_lit
288
335
  literal_impl = LiteralImpl.Int64Literal
@@ -366,8 +413,7 @@ class Predicate:
366
413
  field_type = fb_date.End(self.builder)
367
414
 
368
415
  start_date = datetime.fromtimestamp(0).date()
369
- date_value = datetime.strptime(value, '%Y-%m-%d').date()
370
- date_delta = date_value - start_date
416
+ date_delta = value - start_date
371
417
  value = date_delta.days
372
418
  elif isinstance(field.type, pa.TimestampType):
373
419
  literal_type = fb_timestamp_lit
@@ -426,7 +472,7 @@ class Predicate:
426
472
  fb_binary.Start(self.builder)
427
473
  field_type = fb_binary.End(self.builder)
428
474
 
429
- value = self.builder.CreateByteVector(value.encode())
475
+ value = self.builder.CreateByteVector(value)
430
476
  else:
431
477
  raise ValueError(f'unsupported predicate for type={field.type}, value={value}')
432
478
 
@@ -459,6 +505,9 @@ class Predicate:
459
505
  def build_equal(self, column: int, literal: int):
460
506
  return self.build_function('equal', column, literal)
461
507
 
508
+ def build_not_equal(self, column: int, literal: int):
509
+ return self.build_function('not_equal', column, literal)
510
+
462
511
  def build_greater(self, column: int, literal: int):
463
512
  return self.build_function('greater', column, literal)
464
513
 
@@ -477,6 +526,9 @@ class Predicate:
477
526
  def build_is_not_null(self, column: int):
478
527
  return self.build_function('is_valid', column)
479
528
 
529
+ def build_match_substring(self, column: int, literal: int):
530
+ return self.build_function('match_substring', column, literal)
531
+
480
532
 
481
533
  class FieldNode:
482
534
  """Helper class for representing nested Arrow fields and handling QueryData requests"""
@@ -574,9 +626,8 @@ class FieldNode:
574
626
  def build(self) -> pa.Array:
575
627
  """Construct an Arrow array from the collected buffers (recursively)."""
576
628
  children = self.children and [node.build() for node in self.children if node.is_projected]
577
- _logger.debug(f'build: self.field.name={self.field.name}, '
578
- f'self.projected_field.type={self.projected_field.type}, self.length={self.length} '
579
- f'self.buffers={self.buffers} children={children}')
629
+ _logger.debug('build: self.field.name=%s, self.projected_field.type=%s, self.length=%s, self.buffers=%s children=%s',
630
+ self.field.name, self.projected_field.type, self.length, self.buffers, children)
580
631
  result = pa.Array.from_buffers(self.projected_field.type, self.length, buffers=self.buffers, children=children)
581
632
  if self.debug:
582
633
  _logger.debug('%s result=%s', self.field, result)
@@ -602,11 +653,9 @@ class QueryDataParser:
602
653
  for node in self.nodes:
603
654
  node.debug_log()
604
655
  self.leaves = [leaf for node in self.nodes for leaf in node._iter_leaves()]
605
- _logger.debug(f'QueryDataParser: self.leaves = {[(leaf.field.name, leaf.index) for leaf in self.leaves]}')
606
656
  self.mark_projected_nodes()
607
657
  [node.build_projected_field() for node in self.nodes]
608
658
  self.projected_leaves = [leaf for node in self.nodes for leaf in node._iter_projected_leaves()]
609
- _logger.debug(f'QueryDataParser: self.projected_leaves = {[(leaf.field.name, leaf.index) for leaf in self.projected_leaves]}')
610
659
 
611
660
  self.leaf_offset = 0
612
661
 
@@ -615,7 +664,6 @@ class QueryDataParser:
615
664
  if self.projection_positions is None or leaf.index in self.projection_positions:
616
665
  for node in leaf._iter_to_root():
617
666
  node.is_projected = True
618
- _logger.debug(f'mark_projected_nodes node.field.name={node.field.name}')
619
667
 
620
668
  def parse(self, column: pa.Array):
621
669
  """Parse a single column response from VAST (see FieldNode.set for details)"""
@@ -693,7 +741,6 @@ def _parse_table_info(obj):
693
741
  return TableInfo(name, properties, handle, num_rows, used_bytes)
694
742
 
695
743
  def build_record_batch(column_info, column_values):
696
- _logger.info(f"column_info={column_info}")
697
744
  fields = [pa.field(column_name, column_type) for column_type, column_name in column_info]
698
745
  schema = pa.schema(fields)
699
746
  arrays = [pa.array(column_values[column_type], type=column_type) for column_type, _ in column_info]
@@ -706,67 +753,42 @@ def serialize_record_batch(batch):
706
753
  writer.write(batch)
707
754
  return sink.getvalue()
708
755
 
709
- def generate_ip_range(ip_range_str):
710
- start, end = ip_range_str.split(':')
711
- start_parts = start.split('.')
712
- start_last_part = int(start_parts[-1])
713
- end_parts = end.split('.')
714
- end_last_part = int(end_parts[-1])
715
- if start_last_part>=end_last_part or True in [start_parts[i] != end_parts[i] for i in range(3)]:
716
- raise ValueError(f'illegal ip range {ip_range_str}')
717
- num_ips = 1 + end_last_part - start_last_part
718
- ips = ['.'.join(start_parts[:-1] + [str(start_last_part + i)]) for i in range(num_ips)]
719
- return ips
720
-
721
- def parse_executor_hosts(host):
722
- executor_hosts_parsed = host.split(',')
723
- executor_hosts_parsed = [host.strip() for host in executor_hosts_parsed]
724
- executor_hosts = []
725
- for executor_host in executor_hosts_parsed:
726
- is_ip_range=False
727
- if ':' in executor_host:
728
- try:
729
- socket.inet_aton(executor_host.split(':')[0])
730
- socket.inet_aton(executor_host.split(':')[1])
731
- is_ip_range = True
732
- except:
733
- pass
734
- if is_ip_range:
735
- executor_hosts.extend(generate_ip_range(executor_host))
736
- else:
737
- executor_hosts.append(executor_host)
738
- return executor_hosts
756
+ # Result returned from get_table_stats
757
+ TableStatsResult = namedtuple("TableStatsResult",["num_rows", "size_in_bytes", "is_external_rowid_alloc", "endpoints"])
739
758
 
740
759
  class VastdbApi:
741
- def __init__(self, host, access_key, secret_key, username=None, password=None, port=None,
760
+ # we expect the vast version to be <major>.<minor>.<patch>.<protocol>
761
+ VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')
762
+
763
+ def __init__(self, endpoint, access_key, secret_key, username=None, password=None,
742
764
  secure=False, auth_type=AuthType.SIGV4):
743
- executor_hosts = parse_executor_hosts(host)
744
- host = executor_hosts[0]
745
- self.host = host
765
+ url_dict = urllib3.util.parse_url(endpoint)._asdict()
746
766
  self.access_key = access_key
747
767
  self.secret_key = secret_key
748
768
  self.username = username
749
769
  self.password = password
750
- self.port = port
751
770
  self.secure = secure
752
771
  self.auth_type = auth_type
753
- self.executor_hosts = executor_hosts
772
+ self.executor_hosts = [endpoint] # TODO: remove
754
773
 
755
774
  username = username or ''
756
775
  password = password or ''
757
- if not port:
758
- port = 443 if secure else 80
776
+ if not url_dict['port']:
777
+ url_dict['port'] = 443 if secure else 80
778
+
779
+ self.port = url_dict['port']
759
780
 
781
+ self.default_max_list_columns_page_size = 1000
760
782
  self.session = requests.Session()
761
783
  self.session.verify = False
762
784
  self.session.headers['user-agent'] = "VastData Tabular API 1.0 - 2022 (c)"
763
785
  if auth_type == AuthType.BASIC:
764
786
  self.session.auth = requests.auth.HTTPBasicAuth(username, password)
765
787
  else:
766
- if port != 80 and port != 443:
767
- self.aws_host = f'{host}:{port}'
788
+ if url_dict['port'] != 80 and url_dict['port'] != 443:
789
+ self.aws_host = '{host}:{port}'.format(**url_dict)
768
790
  else:
769
- self.aws_host = f'{host}'
791
+ self.aws_host = '{host}'.format(**url_dict)
770
792
 
771
793
  self.session.auth = AWSRequestsAuth(aws_access_key=access_key,
772
794
  aws_secret_access_key=secret_key,
@@ -774,8 +796,34 @@ class VastdbApi:
774
796
  aws_region='us-east-1',
775
797
  aws_service='s3')
776
798
 
777
- proto = "https" if secure else "http"
778
- self.url = f"{proto}://{self.aws_host}"
799
+ if not url_dict['scheme']:
800
+ url_dict['scheme'] = "https" if secure else "http"
801
+
802
+ url = urllib3.util.Url(**url_dict)
803
+ self.url = str(url)
804
+ _logger.debug('url=%s aws_host=%s', self.url, self.aws_host)
805
+
806
+ # probe the cluster for its version
807
+ self.vast_version = None
808
+ res = self.session.options(self.url)
809
+ server_header = res.headers.get("Server")
810
+ if server_header is None:
811
+ _logger.error("OPTIONS response doesn't contain 'Server' header")
812
+ else:
813
+ _logger.debug("Server header is '%s'", server_header)
814
+ if m := self.VAST_VERSION_REGEX.match(server_header):
815
+ self.vast_version, = m.groups()
816
+ return
817
+ else:
818
+ _logger.error("'Server' header '%s' doesn't match the expected pattern", server_header)
819
+
820
+ msg = (
821
+ f'Please use `vastdb` <= 0.0.5.x with current VAST cluster version ("{server_header or "N/A"}"). '
822
+ 'To use the latest SDK, please upgrade your cluster to the latest service pack. '
823
+ 'Please contact customer.support@vastdata.com for more details.'
824
+ )
825
+ _logger.critical(msg)
826
+ raise NotImplementedError(msg)
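A hedged construction sketch (the endpoint and credentials are placeholders): the constructor now takes a single endpoint URL instead of host/port, and probes the cluster version via an OPTIONS request; a 'Server' header that does not match the pattern below aborts with NotImplementedError.

    import re

    VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')
    assert VAST_VERSION_REGEX.match('vast 5.0.0.1').group(1) == '5.0.0.1'
    assert VAST_VERSION_REGEX.match('AmazonS3') is None   # non-matching servers fail the probe

    api = VastdbApi(endpoint='http://vip-pool.my-cluster.example:8070',   # placeholder endpoint
                    access_key='<access-key>', secret_key='<secret-key>')  # placeholder credentials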
779
827
 
780
828
  def update_mgmt_session(self, access_key: str, secret_key: str, auth_type=AuthType.SIGV4):
781
829
  if auth_type != AuthType.BASIC:
@@ -820,21 +868,9 @@ class VastdbApi:
820
868
  return common_headers
821
869
 
822
870
  def _check_res(self, res, cmd="", expected_retvals=[]):
823
- try:
824
- res.raise_for_status()
825
- if res.status_code != 200:
826
- if not res.status_code in expected_retvals:
827
- raise ValueError(f"Expected status code mismatch. status_code={res.status_code}")
828
- else:
829
- if not len(expected_retvals) == 0:
830
- raise ValueError(f"Expected {expected_retvals} but status_code={res.status_code}")
831
- return res
832
- except requests.HTTPError as e:
833
- if res.status_code in expected_retvals:
834
- _logger.info(f"{cmd} has failed as expected res={res}")
835
- return res
836
- else:
837
- raise e
871
+ if exc := errors.from_response(res):
872
+ raise exc
873
+ return res
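With _check_res delegating to errors.from_response, callers handle typed exceptions instead of inspecting status codes. A hedged sketch (the schema name and the broad except are illustrative; the concrete exception classes live in vastdb/errors.py, added in this release, and `api` is a VastdbApi instance as in the sketch above):

    try:
        api.create_schema('my-bucket', 'already-existing-schema')
    except Exception as exc:   # narrow this to the relevant class from vastdb.errors
        print('request failed:', exc)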
838
874
 
839
875
  def create_schema(self, bucket, name, txid=0, client_tags=[], schema_properties="", expected_retvals=[]):
840
876
  """
@@ -974,7 +1010,8 @@ class VastdbApi:
974
1010
  return snapshots, is_truncated, marker
975
1011
 
976
1012
 
977
- def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[], topic_partitions=0):
1013
+ def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[],
1014
+ topic_partitions=0, create_imports_table=False):
978
1015
  """
979
1016
  Create a table, use the following request
980
1017
  POST /bucket/schema/table?table HTTP/1.1
@@ -983,18 +1020,21 @@ class VastdbApi:
983
1020
  tabular-txid: <integer> TransactionId
984
1021
  tabular-client-tag: <string> ClientTag
985
1022
 
986
- The body of the POST request contains table column properties as json
987
- {
988
- "format": "string",
989
- "column_names": {"name1":"type1", "name2":"type2", ...},
990
- "table_properties": {"key1":"val1", "key2":"val2", ...}
991
- }
1023
+ The body of the POST request contains the table column properties as an Arrow schema,
1024
+ which includes each field's name, type and properties
1025
+
1026
+ To create the vastdb-imported-objects table, which tracks all imported files and avoids duplicate imports,
1027
+ just set create_imports_table=True
1028
+ The request will look like:
1029
+ POST /bucket/schema/table?table&sub-table=vastdb-imported-objects HTTP/1.1
992
1030
  """
993
1031
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
994
1032
 
995
1033
  serialized_schema = arrow_schema.serialize()
996
1034
  headers['Content-Length'] = str(len(serialized_schema))
997
1035
  url_params = {'topic_partitions': str(topic_partitions)} if topic_partitions else {}
1036
+ if create_imports_table:
1037
+ url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
998
1038
 
999
1039
  res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
1000
1040
  data=serialized_schema, headers=headers)
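A hedged call sketch (bucket, schema, table and column names are made up; `api` is a VastdbApi instance): passing create_imports_table=True adds the sub-table query parameter described in the docstring above, so the imports-tracking table is created alongside the table.

    import pyarrow as pa

    columns = pa.schema([('object_name', pa.utf8()), ('size', pa.int64())])
    api.create_table('my-bucket', 'my-schema', 'my-table', columns,
                     create_imports_table=True)   # sends ?table&sub-table=vastdb-imported-objects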
@@ -1014,7 +1054,6 @@ class VastdbApi:
1014
1054
  raise RuntimeError(f'invalid params parquet_path={parquet_path} parquet_bucket_name={parquet_bucket_name} parquet_object_name={parquet_object_name}')
1015
1055
 
1016
1056
  # Get the schema of the Parquet file
1017
- _logger.info(f'type(parquet_ds.schema) = {type(parquet_ds.schema)}')
1018
1057
  if isinstance(parquet_ds.schema, pq.ParquetSchema):
1019
1058
  arrow_schema = parquet_ds.schema.to_arrow_schema()
1020
1059
  elif isinstance(parquet_ds.schema, pa.Schema):
@@ -1037,13 +1076,27 @@ class VastdbApi:
1037
1076
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1038
1077
  res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=name, command="stats"), headers=headers)
1039
1078
  if res.status_code == 200:
1040
- res_headers = res.headers
1041
1079
  flatbuf = b''.join(res.iter_content(chunk_size=128))
1042
1080
  stats = get_table_stats.GetRootAs(flatbuf)
1043
1081
  num_rows = stats.NumRows()
1044
1082
  size_in_bytes = stats.SizeInBytes()
1045
1083
  is_external_rowid_alloc = stats.IsExternalRowidAlloc()
1046
- return num_rows, size_in_bytes, is_external_rowid_alloc
1084
+ endpoints = []
1085
+ if stats.VipsLength() == 0:
1086
+ endpoints.append(self.url)
1087
+ else:
1088
+ ip_cls = IPv6Address if (stats.AddressType() == "ipv6") else IPv4Address
1089
+ vips = [stats.Vips(i) for i in range(stats.VipsLength())]
1090
+ ips = []
1091
+ # expand the VIP ranges into a list of IPs
1092
+ for vip in vips:
1093
+ start_ip = int(ip_cls(vip.StartAddress().decode()))
1094
+ ips.extend(ip_cls(start_ip + i) for i in range(vip.AddressCount()))
1095
+ for ip in ips:
1096
+ prefix = "http" if not self.secure else "https"
1097
+ endpoints.append(f"{prefix}://{str(ip)}:{self.port}")
1098
+ return TableStatsResult(num_rows, size_in_bytes, is_external_rowid_alloc, endpoints)
1099
+
1047
1100
  return self._check_res(res, "get_table_stats", expected_retvals)
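A standalone restatement of the endpoint expansion above (the values are made up): each VIP range is a (start_address, address_count) pair and expands into one URL per IP.

    from ipaddress import IPv4Address

    start, count, port = IPv4Address('172.16.0.1'), 4, 80
    endpoints = [f"http://{IPv4Address(int(start) + i)}:{port}" for i in range(count)]
    assert endpoints == ['http://172.16.0.1:80', 'http://172.16.0.2:80',
                         'http://172.16.0.3:80', 'http://172.16.0.4:80']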
1048
1101
 
1049
1102
  def alter_table(self, bucket, schema, name, txid=0, client_tags=[], table_properties="",
@@ -1070,22 +1123,26 @@ class VastdbApi:
1070
1123
 
1071
1124
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1072
1125
  headers['Content-Length'] = str(len(alter_table_req))
1073
- url_params = {'tabular-new-table-name': new_name} if len(new_name) else {}
1126
+ url_params = {'tabular-new-table-name': schema + "/" + new_name} if len(new_name) else {}
1074
1127
 
1075
1128
  res = self.session.put(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
1076
1129
  data=alter_table_req, headers=headers)
1077
1130
 
1078
1131
  return self._check_res(res, "alter_table", expected_retvals)
1079
1132
 
1080
- def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[]):
1133
+ def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[], remove_imports_table=False):
1081
1134
  """
1082
1135
  DELETE /mybucket/schema_path/mytable?table HTTP/1.1
1083
1136
  tabular-txid: TransactionId
1084
1137
  tabular-client-tag: ClientTag
1138
+
1139
+ To remove the internal vastdb-imported-objects table, just set remove_imports_table=True
1085
1140
  """
1086
1141
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1142
+ url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if remove_imports_table else {}
1087
1143
 
1088
- res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table"), headers=headers)
1144
+ res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
1145
+ headers=headers)
1089
1146
  return self._check_res(res, "drop_table", expected_retvals)
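A hedged call sketch (names are made up; `api` is a VastdbApi instance, and the new_name keyword is assumed from the rename handling shown above): renames now send the schema-qualified new name, and the imports sub-table can be dropped explicitly.

    api.alter_table('my-bucket', 'my-schema', 'old-name', new_name='new-name')
    api.drop_table('my-bucket', 'my-schema', 'new-name', remove_imports_table=True)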
1090
1147
 
1091
1148
  def list_tables(self, bucket, schema, txid=0, client_tags=[], max_keys=1000, next_key=0, name_prefix="",
@@ -1207,9 +1264,9 @@ class VastdbApi:
1207
1264
  data=serialized_schema, headers=headers)
1208
1265
  return self._check_res(res, "drop_columns", expected_retvals)
1209
1266
 
1210
- def list_columns(self, bucket, schema, table, *, txid=0, client_tags=None, max_keys=1000, next_key=0,
1267
+ def list_columns(self, bucket, schema, table, *, txid=0, client_tags=None, max_keys=None, next_key=0,
1211
1268
  count_only=False, name_prefix="", exact_match=False,
1212
- expected_retvals=None, bc_list_internals=False):
1269
+ expected_retvals=None, bc_list_internals=False, list_imports_table=False):
1213
1270
  """
1214
1271
  GET /mybucket/myschema/mytable?columns HTTP/1.1
1215
1272
  tabular-txid: TransactionId
@@ -1217,7 +1274,10 @@ class VastdbApi:
1217
1274
  x-tabluar-name-prefix: TableNamePrefix
1218
1275
  tabular-max-keys: 1000
1219
1276
  tabular-next-key: NextColumnId
1277
+
1278
+ To list the columns of the internal vastdb-imported-objects table, set list_imports_table=True
1220
1279
  """
1280
+ max_keys = max_keys or self.default_max_list_columns_page_size
1221
1281
  client_tags = client_tags or []
1222
1282
  expected_retvals = expected_retvals or []
1223
1283
 
@@ -1233,7 +1293,9 @@ class VastdbApi:
1233
1293
  else:
1234
1294
  headers['tabular-name-prefix'] = name_prefix
1235
1295
 
1236
- res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="column"),
1296
+ url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if list_imports_table else {}
1297
+ res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="column",
1298
+ url_params=url_params),
1237
1299
  headers=headers, stream=True)
1238
1300
  self._check_res(res, "list_columns", expected_retvals)
1239
1301
  if res.status_code == 200:
@@ -1245,9 +1307,7 @@ class VastdbApi:
1245
1307
  if not count_only:
1246
1308
  schema_buf = b''.join(res.iter_content(chunk_size=128))
1247
1309
  schema_out = pa.ipc.open_stream(schema_buf).schema
1248
- # _logger.info(f"schema={schema_out}")
1249
- for f in schema_out:
1250
- columns.append([f.name, f.type, f.metadata, f])
1310
+ columns = schema_out
1251
1311
 
1252
1312
  return columns, next_key, is_truncated, count
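A hedged behavior sketch (`api` and the names are illustrative): list_columns() now returns the Arrow schema object directly instead of a list of [name, type, metadata, field] entries, and the page size falls back to default_max_list_columns_page_size when max_keys is not given.

    columns, next_key, is_truncated, count = api.list_columns('my-bucket', 'my-schema', 'my-table')
    for f in columns:               # 'columns' is a pyarrow.Schema
        print(f.name, f.type)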
1253
1313
 
@@ -1294,7 +1354,7 @@ class VastdbApi:
1294
1354
  return self._check_res(res, "get_transaction", expected_retvals)
1295
1355
 
1296
1356
  def select_row_ids(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[],
1297
- retry_count=0, enable_sorted_projections=False):
1357
+ retry_count=0, enable_sorted_projections=True):
1298
1358
  """
1299
1359
  POST /mybucket/myschema/mytable?query-data=SelectRowIds HTTP/1.1
1300
1360
  """
@@ -1311,7 +1371,7 @@ class VastdbApi:
1311
1371
  return self._check_res(res, "query_data", expected_retvals)
1312
1372
 
1313
1373
  def read_columns_data(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
1314
- retry_count=0, enable_sorted_projections=False):
1374
+ retry_count=0, enable_sorted_projections=True):
1315
1375
  """
1316
1376
  POST /mybucket/myschema/mytable?query-data=ReadColumns HTTP/1.1
1317
1377
  """
@@ -1327,7 +1387,7 @@ class VastdbApi:
1327
1387
  return self._check_res(res, "query_data", expected_retvals)
1328
1388
 
1329
1389
  def count_rows(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
1330
- retry_count=0, enable_sorted_projections=False):
1390
+ retry_count=0, enable_sorted_projections=True):
1331
1391
  """
1332
1392
  POST /mybucket/myschema/mytable?query-data=CountRows HTTP/1.1
1333
1393
  """
@@ -1341,27 +1401,9 @@ class VastdbApi:
1341
1401
  data=params, headers=headers, stream=True)
1342
1402
  return self._check_res(res, "query_data", expected_retvals)
1343
1403
 
1344
- def query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1345
- txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
1346
- search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
1347
- request_format='string', response_format='string'):
1348
- """
1349
- GET /mybucket/myschema/mytable?data HTTP/1.1
1350
- Content-Length: ContentLength
1351
- tabular-txid: TransactionId
1352
- tabular-client-tag: ClientTag
1353
- tabular-split: "split_id,total_splits,num_row_groups_per_split"
1354
- tabular-num-of-subsplits: "total"
1355
- tabular-request-format: "string"
1356
- tabular-response-format: "string" #arrow/trino
1357
- tabular-schedule-id: "schedule-id"
1358
-
1359
- Request Body (flatbuf)
1360
- projections_chunk [expressions]
1361
- predicate_chunk "formatted_data", (required)
1362
-
1363
- """
1364
- # add query option select-only and read-only
1404
+ def _build_query_data_headers(self, txid, client_tags, params, split, num_sub_splits, request_format, response_format,
1405
+ enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
1406
+ sub_split_start_row_ids):
1365
1407
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1366
1408
  headers['Content-Length'] = str(len(params))
1367
1409
  headers['tabular-split'] = ','.join(map(str, split))
@@ -1386,439 +1428,80 @@ class VastdbApi:
1386
1428
  for sub_split_id, start_row_id in sub_split_start_row_ids:
1387
1429
  headers[f'tabular-start-row-id-{sub_split_id}'] = f"{sub_split_id},{start_row_id}"
1388
1430
 
1389
- url_params = {'name': projection} if projection else {}
1431
+ return headers
1390
1432
 
1391
- res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data", url_params=url_params),
1392
- data=params, headers=headers, stream=True)
1393
- return self._check_res(res, "query_data", expected_retvals)
1433
+ def _build_query_data_url_params(self, projection, query_imports_table):
1434
+ if query_imports_table and projection:
1435
+ raise ValueError("Can't query both imports and projection table")
1394
1436
 
1395
- def _list_table_columns(self, bucket, schema, table, filters=None, field_names=None, txid=0):
1396
- # build a list of the queried column names
1397
- queried_columns = []
1398
- # get all columns from the table
1399
- all_listed_columns = []
1400
- next_key = 0
1401
- while True:
1402
- cur_columns, next_key, is_truncated, count = self.list_columns(
1403
- bucket=bucket, schema=schema, table=table, next_key=next_key, txid=txid)
1404
- if not cur_columns:
1405
- break
1406
- all_listed_columns.extend(cur_columns)
1407
- if not is_truncated:
1408
- break
1409
-
1410
- # build a list of the queried columns
1411
- queried_column_names = set()
1412
- if filters:
1413
- filtered_column_names = ([column_name.split('.')[0] for column_name in filters.keys()]) # use top level of the filter column names
1414
- queried_column_names.update(filtered_column_names)
1415
- _logger.debug(f"_list_table_columns: filtered_column_names={filtered_column_names}")
1416
-
1417
- if field_names:
1418
- field_column_names = ([column_name.split('.')[0] for column_name in field_names]) # use top level of the field column names
1419
- else:
1420
- field_column_names = [column[0] for column in all_listed_columns]
1421
- _logger.debug(f"_list_table_columns: field_column_names={field_column_names}")
1422
- queried_column_names.update(field_column_names)
1423
-
1424
- all_listed_column_and_leaves_names = set()
1425
- for column in all_listed_columns:
1426
- # Collect the column and leaves names for verification below that all the filters and field names are in the table
1427
- column_and_leaves_names = [column[0]] + [f.name for f in column[3].flatten()]
1428
- all_listed_column_and_leaves_names.update(column_and_leaves_names)
1429
-
1430
- # check if this column is needed for the query
1431
- if column[0] in queried_column_names:
1432
- queried_columns.append(column)
1433
-
1434
- # verify that all the filters and field names are in the table
1435
- if filters:
1436
- for filter_column_name in filters.keys():
1437
- if filter_column_name not in all_listed_column_and_leaves_names:
1438
- raise KeyError((f'filter column name: {filter_column_name} does not appear in the table'))
1439
- if field_names:
1440
- for field_name in field_names:
1441
- if field_name not in all_listed_column_and_leaves_names:
1442
- raise ValueError((f'field name: {field_name} does not appear in the table'))
1443
- return list(queried_columns)
1444
-
1445
- def _begin_tx_if_necessary(self, txid):
1446
- if not txid:
1447
- created_txid = True
1448
- res = self.begin_transaction()
1449
- txid = res.headers.get('tabular-txid')
1450
- else:
1451
- created_txid = False
1437
+ url_params = {}
1438
+ if query_imports_table:
1439
+ url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
1440
+ elif projection:
1441
+ url_params['name'] = projection
1442
+ return url_params
1452
1443
 
1453
- return txid, created_txid
1444
+ def legacy_query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1445
+ txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
1446
+ search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
1447
+ request_format='string', response_format='string', query_imports_table=False):
1448
+ """
1449
+ POST /mybucket/myschema/mytable?query-data=LegacyQueryData HTTP/1.1
1450
+ Content-Length: ContentLength
1451
+ tabular-txid: TransactionId
1452
+ tabular-client-tag: ClientTag
1453
+ tabular-split: "split_id,total_splits,num_row_groups_per_split"
1454
+ tabular-num-of-subsplits: "total"
1455
+ tabular-request-format: "string"
1456
+ tabular-response-format: "string" #arrow/trino
1457
+ tabular-schedule-id: "schedule-id"
1454
1458
 
1455
- def _prepare_query(self, bucket, schema, table, num_sub_splits, filters=None, field_names=None,
1456
- queried_columns=None, response_row_id=False, txid=0):
1457
- queried_fields = []
1458
- if response_row_id:
1459
- queried_fields.append(pa.field('$row_id', pa.uint64()))
1459
+ Request Body (flatbuf)
1460
+ projections_chunk [expressions]
1461
+ predicate_chunk "formatted_data", (required)
1460
1462
 
1461
- if not queried_columns:
1462
- queried_columns = self._list_table_columns(bucket, schema, table, filters, field_names, txid=txid)
1463
+ """
1464
+ headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
1465
+ enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
1466
+ sub_split_start_row_ids)
1467
+ url_params = self._build_query_data_url_params(projection, query_imports_table)
1463
1468
 
1464
- queried_fields.extend(pa.field(column[0], column[1]) for column in queried_columns)
1465
- arrow_schema = pa.schema(queried_fields)
1469
+ res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="query-data=LegacyQueryData",
1470
+ url_params=url_params), data=params, headers=headers, stream=True)
1471
+ return self._check_res(res, "legacy_query_data", expected_retvals)
1466
1472
 
1467
- _logger.debug(f'_prepare_query: arrow_schema = {arrow_schema}')
1473
+ def query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1474
+ txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
1475
+ search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
1476
+ request_format='string', response_format='string', query_imports_table=False):
1477
+ """
1478
+ GET /mybucket/myschema/mytable?data HTTP/1.1
1479
+ Content-Length: ContentLength
1480
+ tabular-txid: TransactionId
1481
+ tabular-client-tag: ClientTag
1482
+ tabular-split: "split_id,total_splits,num_row_groups_per_split"
1483
+ tabular-num-of-subsplits: "total"
1484
+ tabular-request-format: "string"
1485
+ tabular-response-format: "string" #arrow/trino
1486
+ tabular-schedule-id: "schedule-id"
1468
1487
 
1469
- query_data_request = build_query_data_request(schema=arrow_schema, filters=filters, field_names=field_names)
1470
- if self.executor_hosts:
1471
- executor_hosts = self.executor_hosts
1472
- else:
1473
- executor_hosts = [self.host]
1474
- executor_sessions = [VastdbApi(executor_hosts[i], self.access_key, self.secret_key, self.username,
1475
- self.password, self.port, self.secure, self.auth_type) for i in range(len(executor_hosts))]
1476
-
1477
- return queried_columns, arrow_schema, query_data_request, executor_sessions
1478
-
1479
- def _more_pages_exist(self, start_row_ids):
1480
- for row_id in start_row_ids.values():
1481
- if row_id != TABULAR_INVALID_ROW_ID:
1482
- return True
1483
- return False
1484
-
1485
- def _query_page(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1486
- txid=0, limit_rows=0, sub_split_start_row_ids=[], filters=None, field_names=None):
1487
- res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
1488
- num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
1489
- limit_rows=limit_rows, sub_split_start_row_ids=sub_split_start_row_ids)
1490
- start_row_ids = {}
1491
- sub_split_tables = parse_query_data_response(res.raw, query_data_request.response_schema,
1492
- start_row_ids=start_row_ids)
1493
- table_page = pa.concat_tables(sub_split_tables)
1494
- _logger.info("query_page: table_page num_rows=%s start_row_ids len=%s",
1495
- len(table_page), len(start_row_ids))
1496
-
1497
- return table_page, start_row_ids
1498
-
1499
- def _query_page_iterator(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1500
- txid=0, limit_rows=0, start_row_ids={}, filters=None, field_names=None):
1501
- res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
1502
- num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
1503
- limit_rows=limit_rows, sub_split_start_row_ids=start_row_ids.items())
1504
- for sub_split_table in parse_query_data_response(res.raw, query_data_request.response_schema,
1505
- start_row_ids=start_row_ids):
1506
- for record_batch in sub_split_table.to_batches():
1507
- yield record_batch
1508
- _logger.info(f"query_page_iterator: start_row_ids={start_row_ids}")
1509
-
1510
- def query_iterator(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
1511
- response_row_id=False, txid=0, limit_per_sub_split=128*1024, filters=None, field_names=None):
1512
- """
1513
- query rows into a table.
1514
-
1515
- Parameters
1516
- ----------
1517
- bucket : string
1518
- The bucket of the table.
1519
- schema : string
1520
- The schema of the table.
1521
- table : string
1522
- The table name.
1523
- num_sub_splits : integer
1524
- The number of sub_splits per split - determines the parallelism inside a VastDB compute node
1525
- default: 1
1526
- num_row_groups_per_sub_split : integer
1527
- The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
1528
- default: 8
1529
- response_row_id : boolean
1530
- Return a column with the internal row ids of the table
1531
- default: False
1532
- txid : integer
1533
- A transaction id. The transaction may be initiated before the query, and if not, the query will initiate it
1534
- default: 0 (will be created by the api)
1535
- limit_per_sub_split : integer
1536
- Limit the number of rows from a single sub_split for a single rpc
1537
- default:131072
1538
- filters : dict
1539
- A dictionary whose keys are column names, and values are lists of string expressions that represent
1540
- filter conditions on the column. AND is applied on the conditions. The condition formats are:
1541
- 'column_name eq some_value'
1542
- default: None
1543
- field_names : list
1544
- A list of column names to be returned in the output table
1545
- default: None
1546
-
1547
- Returns
1548
- -------
1549
- Query iterator generator
1550
-
1551
- Yields
1552
- ------
1553
- pyarrow.RecordBatch
1554
-
1555
- Examples
1556
- --------
1557
- for record_batch in query_iterator('some_bucket', 'some_schema', 'some_table',
1558
- filters={'name': ['eq Alice', 'eq Bob']}
1559
- field_names=['name','age']):
1560
- ...
1561
-
1562
- """
1563
-
1564
- # create a transaction if necessary
1565
- txid, created_txid = self._begin_tx_if_necessary(txid)
1566
- executor_sessions = []
1488
+ Request Body (flatbuf)
1489
+ projections_chunk [expressions]
1490
+ predicate_chunk "formatted_data", (required)
1567
1491
 
1568
- try:
1569
- # prepare query
1570
- queried_columns, arrow_schema, query_data_request, executor_sessions = \
1571
- self._prepare_query(bucket, schema, table, num_sub_splits, filters, field_names, response_row_id=response_row_id, txid=txid)
1572
-
1573
- # define the per split threaded query func
1574
- def query_iterator_split_id(self, split_id):
1575
- _logger.info(f"query_iterator_split_id: split_id={split_id}")
1576
- try:
1577
- start_row_ids = {i:0 for i in range(num_sub_splits)}
1578
- session = executor_sessions[split_id]
1579
- while not next_sems[split_id].acquire(timeout=1):
1580
- # check if killed externally
1581
- if killall:
1582
- raise RuntimeError(f'query_iterator_split_id: split_id {split_id} received killall')
1583
-
1584
- while self._more_pages_exist(start_row_ids):
1585
- for record_batch in session._query_page_iterator(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
1586
- split=(split_id, num_splits, num_row_groups_per_sub_split),
1587
- num_sub_splits=num_sub_splits, response_row_id=response_row_id,
1588
- txid=txid, limit_rows=limit_per_sub_split,
1589
- start_row_ids=start_row_ids):
1590
- output_queue.put((split_id, record_batch))
1591
- while not next_sems[split_id].acquire(timeout=1): # wait for the main thread to request the next record batch
1592
- if killall:
1593
- raise RuntimeError(f'split_id {split_id} received killall')
1594
- # end of split
1595
- output_queue.put((split_id,None))
1596
-
1597
- except Exception as e:
1598
- _logger.exception('query_iterator_split_id: exception occurred')
1599
- try:
1600
- self.rollback_transaction(txid)
1601
- except:
1602
- _logger.exception(f'failed to rollback txid {txid}')
1603
- error_queue.put(None)
1604
- raise e
1605
-
1606
- # kickoff executors
1607
- num_splits = len(executor_sessions)
1608
- output_queue = queue.Queue()
1609
- error_queue = queue.Queue()
1610
- next_sems = [threading.Semaphore(value=1) for i in range(num_splits)]
1611
- killall = False
1612
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
1613
- # start executors
1614
- futures = []
1615
- for i in range(num_splits):
1616
- futures.append(executor.submit(query_iterator_split_id, self, i))
1617
-
1618
- # receive outputs and yield them
1619
- done_count = 0
1620
- while done_count < num_splits:
1621
- # check for errors
1622
- try:
1623
- error_queue.get(block=False)
1624
- _logger.error('received error from a thread')
1625
- killall = True
1626
- # wait for all executors to complete
1627
- for future in concurrent.futures.as_completed(futures):
1628
- try:
1629
- future.result() # trigger an exception if occurred in any thread
1630
- except Exception:
1631
- _logger.exception('exception occurred')
1632
- raise RuntimeError('received error from a thread')
1633
- except queue.Empty:
1634
- pass
1635
-
1636
- # try to get a value from the output queue
1637
- try:
1638
- (split_id, record_batch) = output_queue.get(timeout=1)
1639
- except queue.Empty:
1640
- continue
1641
-
1642
- if record_batch:
1643
- # signal to the thread to read the next record batch and yield the current
1644
- next_sems[split_id].release()
1645
- try:
1646
- yield record_batch
1647
- except GeneratorExit:
1648
- killall = True
1649
- _logger.debug("cancelling query_iterator")
1650
- raise
1651
- else:
1652
- done_count += 1
1653
-
1654
- # wait for all executors to complete
1655
- for future in concurrent.futures.as_completed(futures):
1656
- try:
1657
- future.result() # trigger an exception if occurred in any thread
1658
- except Exception:
1659
- _logger.exception('exception occurred')
1660
-
1661
- # commit if needed
1662
- if created_txid:
1663
- self.commit_transaction(txid)
1664
-
1665
- except Exception as e:
1666
- _logger.exception('exception occurred')
1667
- try:
1668
- self.rollback_transaction(txid)
1669
- except:
1670
- _logger.exception(f'failed to rollback txid {txid}')
1671
- raise e
1672
-
1673
- finally:
1674
- killall = True
1675
- for session in executor_sessions:
1676
- try:
1677
- session.session.close()
1678
- except Exception:
1679
- _logger.exception(f'failed to close session {session}')
1680
-
1681
- def query(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
1682
- response_row_id=False, txid=0, limit=0, limit_per_sub_split=131072, filters=None, field_names=None,
1683
- queried_columns=None):
1684
- """
1685
- query rows into a table.
1686
-
1687
- Parameters
1688
- ----------
1689
- bucket : string
1690
- The bucket of the table.
1691
- schema : string
1692
- The schema of the table.
1693
- table : string
1694
- The table name.
1695
- num_sub_splits : integer
1696
- The number of sub_splits per split - determines the parallelism inside a VastDB compute node
1697
- default: 1
1698
- num_row_groups_per_sub_split : integer
1699
- The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
1700
- default: 8
1701
- response_row_id : boolean
1702
- Return a column with the internal row ids of the table
1703
- default: False
1704
- txid : integer
1705
- A transaction id. The transaction may be initiated before the query, and be used to provide
1706
- multiple ACID operations
1707
- default: 0 (will be created by the api)
1708
- limit : integer
1709
- Limit the number of rows in the response
1710
- default: 0 (no limit)
1711
- limit_per_sub_split : integer
1712
- Limit the number of rows from a single sub_split for a single rpc
1713
- default:131072
1714
- filters : dict
1715
- A dictionary whose keys are column names, and values are lists of string expressions that represent
1716
- filter conditions on the column. AND is applied on the conditions. The condition formats are:
1717
- 'column_name eq some_value'
1718
- default: None
1719
- field_names : list
1720
- A list of column names to be returned to the output table
1721
- default: None
1722
- queried_columns: list of pyArrow.column
1723
- A list of the columns to be queried
1724
- default: None
1725
-
1726
- Returns
1727
- -------
1728
- pyarrow.Table
1729
-
1730
-
1731
- Examples
1732
- --------
1733
- table = query('some_bucket', 'some_schema', 'some_table',
1734
- filters={'name': ['eq Alice', 'eq Bob']}
1735
- field_names=['name','age'])
1736
-
1737
- """
1738
-
1739
- # create a transaction
1740
- txid, created_txid = self._begin_tx_if_necessary(txid)
1741
- executor_sessions = []
1742
- try:
1743
- # prepare query
1744
- queried_columns, arrow_schema, query_data_request, executor_sessions = \
1745
- self._prepare_query(bucket, schema, table, num_sub_splits, filters, field_names, response_row_id=response_row_id, txid=txid)
1746
-
1747
- # define the per split threaded query func
1748
- def query_split_id(self, split_id):
1749
- try:
1750
- start_row_ids = {i:0 for i in range(num_sub_splits)}
1751
- session = executor_sessions[split_id]
1752
- row_count = 0
1753
- while (self._more_pages_exist(start_row_ids) and
1754
- (not limit or row_count < limit)):
1755
- # check if killed externally
1756
- if killall:
1757
- raise RuntimeError(f'query_split_id: split_id {split_id} received killall')
1758
-
1759
- # determine the limit rows
1760
- if limit:
1761
- limit_rows = min(limit_per_sub_split, limit-row_count)
1762
- else:
1763
- limit_rows = limit_per_sub_split
1764
-
1765
- # query one page
1766
- table_page, start_row_ids = session._query_page(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
1767
- split=(split_id, num_splits, num_row_groups_per_sub_split),
1768
- num_sub_splits=num_sub_splits, response_row_id=response_row_id,
1769
- txid=txid, limit_rows=limit_rows,
1770
- sub_split_start_row_ids=start_row_ids.items())
1771
- with lock:
1772
- table_pages.append(table_page)
1773
- row_counts[split_id] += len(table_page)
1774
- row_count = sum(row_counts)
1775
- _logger.info(f"query_split_id: table_pages split_id={split_id} row_count={row_count}")
1776
- except Exception as e:
1777
- _logger.exception('query_split_id: exception occurred')
1778
- try:
1779
- self.rollback_transaction(txid)
1780
- except:
1781
- _logger.exception(f'failed to rollback txid {txid}')
1782
- raise e
1783
-
1784
- table_pages = []
1785
- num_splits = len(executor_sessions)
1786
- killall = False
1787
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
1788
- futures = []
1789
- row_counts = [0] * num_splits
1790
- lock = threading.Lock()
1791
- for i in range(num_splits):
1792
- futures.append(executor.submit(query_split_id, self, i))
1793
- for future in concurrent.futures.as_completed(futures):
1794
- future.result() # trigger an exception if occurred in any thread
1795
-
1796
- # commit if needed
1797
- if created_txid:
1798
- self.commit_transaction(txid)
1799
-
1800
- # concatenate all table pages and return result
1801
- out_table = pa.concat_tables(table_pages)
1802
- out_table = out_table.slice(length=limit) if limit else out_table
1803
- _logger.info("query: out_table len=%s row_count=%s",
1804
- len(out_table), len(out_table))
1805
- return out_table
1806
-
1807
- except Exception as e:
1808
- _logger.exception('exception occurred')
1809
- try:
1810
- self.rollback_transaction(txid)
1811
- except:
1812
- _logger.exception(f'failed to rollback txid {txid}')
1813
- raise e
1814
-
1815
- finally:
1816
- killall = True
1817
- for session in executor_sessions:
1818
- try:
1819
- session.session.close()
1820
- except Exception:
1821
- _logger.exception(f'failed to close session {session}')
1492
+ To query the internal vastdb-imported-objects table, set query_imports_table=True
1493
+ """
1494
+ # add query option select-only and read-only
1495
+
1496
+ headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
1497
+ enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
1498
+ sub_split_start_row_ids)
1499
+
1500
+ url_params = self._build_query_data_url_params(projection, query_imports_table)
1501
+
1502
+ res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data", url_params=url_params),
1503
+ data=params, headers=headers, stream=True)
1504
+ return self._check_res(res, "query_data", expected_retvals)
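A minimal restatement of the URL-parameter selection implemented by _build_query_data_url_params above (the helper name below is local to this sketch): querying the imports sub-table and a projection are mutually exclusive.

    def build_url_params(projection, query_imports_table):
        if query_imports_table and projection:
            raise ValueError("Can't query both imports and projection table")
        if query_imports_table:
            return {'sub-table': 'vastdb-imported-objects'}
        if projection:
            return {'name': projection}
        return {}

    assert build_url_params('', True) == {'sub-table': 'vastdb-imported-objects'}
    assert build_url_params('my-projection', False) == {'name': 'my-projection'}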
1822
1505
 
1823
1506
  """
1824
1507
  source_files: list of (bucket_name, file_name)
@@ -1872,21 +1555,22 @@ class VastdbApi:
1872
1555
  builder.Finish(params)
1873
1556
  import_req = builder.Output()
1874
1557
 
1875
- def iterate_over_import_data_response(response, expected_retvals):
1558
+ def iterate_over_import_data_response(response):
1876
1559
  if response.status_code != 200:
1877
1560
  return response
1878
1561
 
1879
1562
  chunk_size = 1024
1880
- for chunk in res.iter_content(chunk_size=chunk_size):
1563
+ for chunk in response.iter_content(chunk_size=chunk_size):
1881
1564
  chunk_dict = json.loads(chunk)
1882
- _logger.info(f"import data chunk={chunk}, result: {chunk_dict['res']}")
1883
- if chunk_dict['res'] in expected_retvals:
1884
- _logger.info(f"import finished with expected result={chunk_dict['res']}, error message: {chunk_dict['err_msg']}")
1885
- return response
1886
- elif chunk_dict['res'] != 'Success' and chunk_dict['res'] != 'TabularInProgress':
1887
- raise TabularException(f"Received unexpected error in import_data. "
1888
- f"status: {chunk_dict['res']}, error message: {chunk_dict['err_msg']}")
1889
- _logger.info(f"import_data is in progress. status: {chunk_dict['res']}")
1565
+ _logger.debug("import data chunk=%s, result: %s", chunk_dict, chunk_dict['res'])
1566
+ if chunk_dict['res'] != 'Success' and chunk_dict['res'] != 'TabularInProgress' and chunk_dict['res'] != 'TabularAlreadyImported':
1567
+ raise errors.ImportFilesError(
1568
+ f"Encountered an error during import_data. status: {chunk_dict['res']}, "
1569
+ f"error message: {chunk_dict['err_msg'] or 'Unexpected error'} during import of "
1570
+ f"object name: {chunk_dict['object_name']}", chunk_dict)
1571
+ else:
1572
+ _logger.debug("import_data of object name '%s' is in progress. "
1573
+ "status: %s", chunk_dict['object_name'], chunk_dict['res'])
1890
1574
  return response
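A hedged usage sketch: blocking imports now surface failures as errors.ImportFilesError (carrying the offending chunk dict as its second argument) instead of the removed TabularException. The call arguments below are illustrative, and `api` is a VastdbApi instance.

    from vastdb import errors

    try:
        api.import_data('my-bucket', 'my-schema', 'my-table',
                        source_files=[('src-bucket', '/staging/part-0001.parquet')],
                        blocking=True)
    except errors.ImportFilesError as exc:
        print('import failed:', exc)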
1891
1575
 
1892
1576
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
@@ -1899,34 +1583,17 @@ class VastdbApi:
1899
1583
  res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data"),
1900
1584
  data=import_req, headers=headers, stream=True)
1901
1585
  if blocking:
1902
- res = iterate_over_import_data_response(res, expected_retvals)
1586
+ res = iterate_over_import_data_response(res)
1903
1587
 
1904
1588
  return self._check_res(res, "import_data", expected_retvals)
1905
1589
 
1906
- def merge_data(self):
1907
- """
1908
- TODO
1909
-
1910
- POST /mybucket/myschema/mytable?data HTTP/1.1
1911
- Content-Length: ContentLength
1912
- tabular-txid: TransactionId
1913
- tabular-client-tag: ClientTag
1914
-
1915
- Request Body
1916
- {
1917
- "format": "string",
1918
- "select_source": "formatted data"
1919
- "predicate": "formatted_data"
1920
- }
1921
- """
1922
- pass
1923
-
1924
1590
  def _record_batch_slices(self, batch, rows_per_slice=None):
1925
1591
  max_slice_size_in_bytes = int(0.9*5*1024*1024) # 0.9 * 5MB
1926
1592
  batch_len = len(batch)
1927
1593
  serialized_batch = serialize_record_batch(batch)
1928
1594
  batch_size_in_bytes = len(serialized_batch)
1929
- _logger.info(f'max_slice_size_in_bytes={max_slice_size_in_bytes} batch_len={batch_len} batch_size_in_bytes={batch_size_in_bytes}')
1595
+ _logger.debug('max_slice_size_in_bytes=%d batch_len=%d batch_size_in_bytes=%d',
1596
+ max_slice_size_in_bytes, batch_len, batch_size_in_bytes)
1930
1597
 
1931
1598
  if not rows_per_slice:
1932
1599
  if batch_size_in_bytes < max_slice_size_in_bytes:
@@ -1948,7 +1615,7 @@ class VastdbApi:
  serialized_slice_batch = serialize_record_batch(slice_batch)
  sizeof_serialized_slice_batch = len(serialized_slice_batch)

- if sizeof_serialized_slice_batch <= max_slice_size_in_bytes or rows_per_slice < 10000:
+ if sizeof_serialized_slice_batch <= max_slice_size_in_bytes:
  serialized_slices.append(serialized_slice_batch)
  else:
  _logger.info(f'Using rows_per_slice {rows_per_slice} slice {i} size {sizeof_serialized_slice_batch} exceeds {max_slice_size_in_bytes} bytes, trying smaller rows_per_slice')
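The `_record_batch_slices` hunks above cap each serialized slice at roughly 0.9 × 5 MB and fall back to a smaller `rows_per_slice` when a slice overshoots. A rough sketch of that idea in plain pyarrow follows; the halving strategy and the local `serialize_record_batch` helper are illustrative stand-ins, not the package's exact algorithm.

```python
import math

import pyarrow as pa


def serialize_record_batch(batch: pa.RecordBatch) -> bytes:
    """Serialize a RecordBatch using the Arrow IPC stream format (local stand-in)."""
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, batch.schema) as writer:
        writer.write_batch(batch)
    return sink.getvalue().to_pybytes()


def record_batch_slices(batch: pa.RecordBatch,
                        max_slice_bytes: int = int(0.9 * 5 * 1024 * 1024)) -> list:
    """Split `batch` into serialized slices, shrinking rows_per_slice until each fits."""
    rows_per_slice = len(batch) or 1
    while True:
        n_slices = math.ceil(len(batch) / rows_per_slice)
        slices = [serialize_record_batch(batch.slice(i * rows_per_slice, rows_per_slice))
                  for i in range(n_slices)]
        if all(len(s) <= max_slice_bytes for s in slices):
            return slices
        rows_per_slice //= 2  # try again with half as many rows per slice
        if rows_per_slice == 0:
            raise ValueError("a single row exceeds the slice size limit")
```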
@@ -1962,125 +1629,6 @@ class VastdbApi:

  return serialized_slices

- def insert(self, bucket, schema, table, rows=None, record_batch=None, rows_per_insert=None, txid=0):
- """
- Insert rows into a table. The operation may be split into multiple commands, such that by default no more than 512KB will be inserted per command.
-
- Parameters
- ----------
- bucket : string
- The bucket of the table.
- schema : string
- The schema of the table.
- table : string
- The table name.
- rows : dict
- The rows to insert.
- dictionary key: column name
- dictionary value: array of cell values to insert
- default: None (if None, record_batch must be provided)
- record_batch : pyarrow.RecordBatch
- A pyarrow RecordBatch
- default: None (if None, rows dictionary must be provided)
- rows_per_insert : integer
- Split the operation so that each insert command will be limited to this value
- default: None (will be selected automatically)
- txid : integer
- A transaction id. The transaction may be initiated before the insert, and be used to provide
- multiple ACID operations
- default: 0 (will be created by the api)
-
- Returns
- -------
- None
-
-
- Examples
- --------
- insert('some_bucket', 'some_schema', 'some_table', {'name': ['Alice','Bob'], 'age': [25,24]})
-
- """
- if (not rows and not record_batch) or (rows and record_batch):
- raise ValueError(f'insert: missing argument - either rows or record_batch must be provided')
-
- # create a transaction
- txid, created_txid = self._begin_tx_if_necessary(txid)
-
- if rows:
- columns = self._list_table_columns(bucket, schema, table, field_names=rows.keys())
- columns_dict = dict([(column[0], column[1]) for column in columns])
- arrow_schema = pa.schema([])
- arrays = []
- for column_name, column_values in rows.items():
- column_type = columns_dict[column_name]
- field = pa.field(column_name, column_type)
- arrow_schema = arrow_schema.append(field)
- arrays.append(pa.array(column_values, column_type))
- record_batch = pa.record_batch(arrays, arrow_schema)
-
- # split the record batch into multiple slices
- serialized_slices = self._record_batch_slices(record_batch, rows_per_insert)
- _logger.info(f'inserting record batch using {len(serialized_slices)} slices')
-
- insert_queue = queue.Queue()
-
- [insert_queue.put(insert_rows_req) for insert_rows_req in serialized_slices]
-
- try:
- executor_sessions = [VastdbApi(self.executor_hosts[i], self.access_key, self.secret_key, self.username,
- self.password, self.port, self.secure, self.auth_type) for i in range(len(self.executor_hosts))]
-
- def insert_executor(self, split_id):
-
- try:
- _logger.info(f'insert_executor split_id={split_id} starting')
- session = executor_sessions[split_id]
- num_inserts = 0
- while not killall:
- try:
- insert_rows_req = insert_queue.get(block=False)
- except queue.Empty:
- break
- session.insert_rows(bucket=bucket, schema=schema,
- table=table, record_batch=insert_rows_req, txid=txid)
- num_inserts += 1
- _logger.info(f'insert_executor split_id={split_id} num_inserts={num_inserts}')
- if killall:
- _logger.info('insert_executor killall=True')
-
- except Exception as e:
- _logger.exception('insert_executor hit exception')
- raise e
-
- num_splits = len(executor_sessions)
- killall = False
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
- futures = []
- for i in range(num_splits):
- futures.append(executor.submit(insert_executor, self, i))
- for future in concurrent.futures.as_completed(futures):
- future.result() # trigger an exception if occurred in any thread
-
- # commit if needed
- if created_txid:
- self.commit_transaction(txid)
-
- except Exception as e:
- _logger.exception('exception occurred')
- try:
- self.rollback_transaction(txid)
- except:
- _logger.exception(f'failed to rollback txid {txid}')
- raise e
-
- finally:
- killall = True
- for session in executor_sessions:
- try:
- session.session.close()
- except Exception:
- _logger.exception(f'failed to close session {session}')
-
  def insert_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[]):
  """
  POST /mybucket/myschema/mytable?rows HTTP/1.1
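The removed `insert()` helper converted a plain column dictionary into a `pyarrow.RecordBatch` before slicing and posting it via `insert_rows`. A small standalone example of that conversion; the column types are hard-coded here for illustration, whereas the removed code looked them up from the table's columns.

```python
import pyarrow as pa

# Illustrative column types; the removed insert() resolved these from the table schema.
column_types = {'name': pa.string(), 'age': pa.int32()}
rows = {'name': ['Alice', 'Bob'], 'age': [25, 24]}

fields = [pa.field(name, column_types[name]) for name in rows]
arrays = [pa.array(values, column_types[name]) for name, values in rows.items()]
record_batch = pa.record_batch(arrays, schema=pa.schema(fields))
print(record_batch.num_rows)  # 2
```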
@@ -2350,41 +1898,40 @@ def _iter_query_data_response_columns(fileobj, stream_ids=None):
  if stream_ids is not None:
  stream_ids.update([stream_id]) # count stream IDs using a collections.Counter
  if stream_id == TABULAR_KEEP_ALIVE_STREAM_ID:
- # _logger.info(f"stream_id={stream_id} (skipping)")
  continue

  if stream_id == TABULAR_QUERY_DATA_COMPLETED_STREAM_ID:
  # read the terminating end chunk from socket
  res = fileobj.read()
- _logger.info(f"stream_id={stream_id} res={res} (finish)")
+ _logger.debug("stream_id=%d res=%s (finish)", stream_id, res)
  return

  if stream_id == TABULAR_QUERY_DATA_FAILED_STREAM_ID:
  # read the terminating end chunk from socket
  res = fileobj.read()
- _logger.info(f"stream_id={stream_id} res={res} (failed)")
+ _logger.warning("stream_id=%d res=%s (failed)", stream_id, res)
  raise IOError(f"Query data stream failed res={res}")

  next_row_id_bytes = fileobj.read(8)
  next_row_id, = struct.unpack('<Q', next_row_id_bytes)
- _logger.info(f"stream_id={stream_id} next_row_id={next_row_id}")
+ _logger.debug("stream_id=%d next_row_id=%d", stream_id, next_row_id)

  if stream_id not in readers:
  # we implicitly read 1st message (Arrow schema) when constructing RecordBatchStreamReader
  reader = pa.ipc.RecordBatchStreamReader(fileobj)
- _logger.info(f"stream_id={stream_id} schema={reader.schema}")
+ _logger.debug("stream_id=%d schema=%s", stream_id, reader.schema)
  readers[stream_id] = (reader, [])
  continue

  (reader, batches) = readers[stream_id]
  try:
  batch = reader.read_next_batch() # read single-column chunk data
- _logger.info(f"stream_id={stream_id} rows={len(batch)} chunk={batch}")
+ _logger.debug("stream_id=%d rows=%d chunk=%s", stream_id, len(batch), batch)
  batches.append(batch)
  except StopIteration: # we got an end-of-stream IPC message for a given stream ID
  reader, batches = readers.pop(stream_id) # end of column
  table = pa.Table.from_batches(batches) # concatenate all column chunks (as a single)
- _logger.info(f"stream_id={stream_id} rows={len(table)} column={table}")
+ _logger.debug("stream_id=%d rows=%d column=%s", stream_id, len(table), table)
  yield (stream_id, next_row_id, table)

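The loop above demultiplexes per-stream Arrow IPC messages and concatenates each stream's batches into a single-column table. A self-contained sketch of the underlying pyarrow pattern (reading an IPC stream until `StopIteration`, then assembling a `Table`), without the VAST-specific stream-ID and row-ID framing:

```python
import io

import pyarrow as pa

# Build an in-memory Arrow IPC stream standing in for one response column stream.
batch = pa.record_batch([pa.array([1, 2, 3])], names=['x'])
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, batch.schema) as writer:
    writer.write_batch(batch)
    writer.write_batch(batch)
stream = io.BytesIO(sink.getvalue().to_pybytes())

# The reader consumes the schema message up front, then yields record batches
# until StopIteration marks the end of the stream -- the same pattern as above.
reader = pa.ipc.RecordBatchStreamReader(stream)
batches = []
while True:
    try:
        batches.append(reader.read_next_batch())
    except StopIteration:
        break
table = pa.Table.from_batches(batches)
print(table.num_rows)  # 6
```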
@@ -2413,7 +1960,8 @@ def parse_query_data_response(conn, schema, stream_ids=None, start_row_ids=None,
  if is_empty_projection: # VAST returns an empty RecordBatch, with the correct rows' count
  parsed_table = table

- _logger.info(f"stream_id={stream_id} rows={len(parsed_table)} next_row_id={next_row_id} table={parsed_table}")
+ _logger.debug("stream_id=%d rows=%d next_row_id=%d table=%s",
+ stream_id, len(parsed_table), next_row_id, parsed_table)
  start_row_ids[stream_id] = next_row_id
  yield parsed_table # the result of a single "select_rows()" cycle

@@ -2562,7 +2110,6 @@ def get_field_type(builder: flatbuffers.Builder, field: pa.Field):
  return field_type, field_type_type

  def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
- _logger.info(f"name={f.name}")
  children = None
  if isinstance(f.type, pa.StructType):
  children = [build_field(builder, child, child.name) for child in list(f.type)]
@@ -2589,7 +2136,6 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
  fb_field.AddName(builder, child_col_name)
  fb_field.AddChildren(builder, children)

- _logger.info(f"added key and map to entries")
  children = [fb_field.End(builder)]

  if children is not None:
@@ -2600,13 +2146,11 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):

  col_name = builder.CreateString(name)
  field_type, field_type_type = get_field_type(builder, f)
- _logger.info(f"add col_name={name} type_type={field_type_type} to fb")
  fb_field.Start(builder)
  fb_field.AddName(builder, col_name)
  fb_field.AddTypeType(builder, field_type_type)
  fb_field.AddType(builder, field_type)
  if children is not None:
- _logger.info(f"add col_name={name} childern")
  fb_field.AddChildren(builder, children)
  return fb_field.End(builder)

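`build_field` recurses into struct, map and list children when serializing a field to flatbuffers. The sketch below shows the same traversal shape over a nested `pyarrow` field, yielding dotted names similar to the leaves map built later in this module; the helper itself is hypothetical and only meant to illustrate the recursion.

```python
import pyarrow as pa


def walk_fields(field: pa.Field, prefix: str = ''):
    """Recursively yield dotted names for a field and its nested children."""
    full_name = f"{prefix}{field.name}"
    yield full_name
    if isinstance(field.type, pa.StructType):
        for child in field.type:  # iterating a StructType yields its child fields
            yield from walk_fields(child, prefix=f"{full_name}.")
    elif isinstance(field.type, pa.MapType):
        for child in (field.type.key_field, field.type.item_field):
            yield from walk_fields(child, prefix=f"{full_name}.")
    elif isinstance(field.type, pa.ListType):
        yield from walk_fields(field.type.value_field, prefix=f"{full_name}.")


schema = pa.schema([pa.field('point', pa.struct([('x', pa.float64()), ('y', pa.float64())]))])
print(list(walk_fields(schema.field('point'))))  # ['point', 'point.x', 'point.y']
```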
@@ -2623,9 +2167,7 @@ class QueryDataRequest:
  self.response_schema = response_schema


- def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict = None, field_names: list = None):
- filters = filters or {}
-
+ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), predicate: ibis.expr.types.BooleanColumn = None, field_names: list = None):
  builder = flatbuffers.Builder(1024)

  source_name = builder.CreateString('') # required
@@ -2641,7 +2183,7 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
  fb_schema.AddFields(builder, fields)
  schema_obj = fb_schema.End(builder)

- predicate = Predicate(schema, filters)
+ predicate = Predicate(schema=schema, expr=predicate)
  filter_obj = predicate.serialize(builder)

  parser = QueryDataParser(schema)
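With this change, `build_query_data_request` accepts an ibis boolean expression instead of a `filters` dict, and wraps it in `Predicate(schema=..., expr=...)`. A minimal example of constructing such a predicate against an unbound ibis table; the column names and types are illustrative, and how the expression is serialized into flatbuffers is not shown here.

```python
import ibis

# An unbound ibis table whose columns mirror the pyarrow schema passed to the request.
t = ibis.table([('name', 'string'), ('age', 'int32')], name='t')

# A boolean column expression of the kind the new `predicate` parameter expects.
predicate = (t['age'] > 21) & t['name'].isin(['Alice', 'Bob'])
print(type(predicate))  # an ibis BooleanColumn expression
```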
@@ -2652,10 +2194,8 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
  continue
  iter_from_root = reversed(list(descendent._iter_to_root()))
  descendent_full_name = '.'.join([n.field.name for n in iter_from_root])
- _logger.debug(f'build_query_data_request: descendent_full_name={descendent_full_name}')
  descendent_leaves = [leaf.index for leaf in descendent._iter_leaves()]
  leaves_map[descendent_full_name] = descendent_leaves
- _logger.debug(f'build_query_data_request: leaves_map={leaves_map}')

  output_field_names = None
  if field_names is None:
@@ -2666,13 +2206,11 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
  def compare_field_names_by_pos(field_name1, field_name2):
  return leaves_map[field_name1][0]-leaves_map[field_name2][0]
  field_names = sorted(field_names, key=cmp_to_key(compare_field_names_by_pos))
- _logger.debug(f'build_query_data_request: sorted field_names={field_names} schema={schema}')

  projection_fields = []
  projection_positions = []
  for field_name in field_names:
  positions = leaves_map[field_name]
- _logger.info("projecting field=%s positions=%s", field_name, positions)
  projection_positions.extend(positions)
  for leaf_position in positions:
  fb_field_index.Start(builder)
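The projection logic above orders field names by the position of their first leaf column before emitting the flatbuffers field indices. A tiny standalone illustration of that ordering, using a hypothetical `leaves_map`:

```python
from functools import cmp_to_key

# Hypothetical mapping from dotted field name to its leaf column positions.
leaves_map = {'name': [0], 'age': [1], 'address.city': [2], 'address.zip': [3]}
field_names = ['address.city', 'age', 'name', 'address.zip']

ordered = sorted(field_names, key=cmp_to_key(
    lambda a, b: leaves_map[a][0] - leaves_map[b][0]))
print(ordered)  # ['name', 'age', 'address.city', 'address.zip']
```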
@@ -2729,11 +2267,9 @@ def convert_column_types(table: 'pa.Table') -> 'pa.Table':
  indexes_of_fields_to_change[field.name] = index
  for changing_index in ts_indexes:
  field_name = table.schema[changing_index].name
- _logger.info(f'changing resolution for {field_name} to us')
  new_column = table[field_name].cast(pa.timestamp('us'), safe=False)
  table = table.set_column(changing_index, field_name, new_column)
  for field_name, changing_index in indexes_of_fields_to_change.items():
- _logger.info(f'applying custom rules to {field_name}')
  new_column = table[field_name].to_pylist()
  new_column = list(map(column_matcher[field_name], new_column))
  new_column = pa.array(new_column, table[field_name].type)
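`convert_column_types` downcasts timestamp columns to microsecond resolution before applying per-column conversion rules. A minimal pyarrow example of that cast on a made-up table; only the resolution change is shown.

```python
import datetime

import pyarrow as pa

# A table with a nanosecond-resolution timestamp column, for illustration.
table = pa.table({'ts': pa.array([datetime.datetime(2024, 1, 1)], type=pa.timestamp('ns'))})

# Downcast to microsecond resolution, as the hunk above does.
idx = table.schema.get_field_index('ts')
table = table.set_column(idx, 'ts', table['ts'].cast(pa.timestamp('us'), safe=False))
print(table.schema.field('ts').type)  # timestamp[us]
```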