vastdb 0.0.5.3__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
  2. vast_flatbuf/tabular/VipRange.py +56 -0
  3. vastdb/__init__.py +7 -0
  4. vastdb/bucket.py +77 -0
  5. vastdb/errors.py +158 -0
  6. vastdb/{api.py → internal_commands.py} +280 -746
  7. vastdb/schema.py +77 -0
  8. vastdb/session.py +48 -0
  9. vastdb/table.py +480 -0
  10. vastdb/tests/conftest.py +15 -14
  11. vastdb/tests/test_imports.py +125 -0
  12. vastdb/tests/test_projections.py +41 -0
  13. vastdb/tests/test_sanity.py +36 -16
  14. vastdb/tests/test_schemas.py +12 -6
  15. vastdb/tests/test_tables.py +581 -13
  16. vastdb/transaction.py +55 -0
  17. vastdb/util.py +8 -8
  18. vastdb-0.1.0.dist-info/METADATA +38 -0
  19. {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/RECORD +22 -31
  20. vast_protobuf/__init__.py +0 -0
  21. vast_protobuf/substrait/__init__.py +0 -0
  22. vast_protobuf/substrait/algebra_pb2.py +0 -1344
  23. vast_protobuf/substrait/capabilities_pb2.py +0 -46
  24. vast_protobuf/substrait/ddl_pb2.py +0 -57
  25. vast_protobuf/substrait/extended_expression_pb2.py +0 -49
  26. vast_protobuf/substrait/extensions/__init__.py +0 -0
  27. vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
  28. vast_protobuf/substrait/function_pb2.py +0 -168
  29. vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
  30. vast_protobuf/substrait/plan_pb2.py +0 -67
  31. vast_protobuf/substrait/type_expressions_pb2.py +0 -198
  32. vast_protobuf/substrait/type_pb2.py +0 -350
  33. vast_protobuf/tabular/__init__.py +0 -0
  34. vast_protobuf/tabular/rpc_pb2.py +0 -344
  35. vastdb/bench_scan.py +0 -45
  36. vastdb/tests/test_create_table_from_parquets.py +0 -50
  37. vastdb/v2.py +0 -360
  38. vastdb-0.0.5.3.dist-info/METADATA +0 -47
  39. {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/LICENSE +0 -0
  40. {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/WHEEL +0 -0
  41. {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/top_level.txt +0 -0
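The new modules above (session.py, transaction.py, bucket.py, schema.py, table.py, errors.py) replace the removed v2.py and the old api.py surface. A speculative sketch of how the reorganized SDK is meant to be driven, assuming a vastdb.connect() entry point and the bucket/schema/table accessors suggested by the module names; the exact names and signatures below are assumptions, not taken from this diff:

    import pyarrow as pa
    import vastdb  # assumed entry point of the 0.1.0 SDK

    session = vastdb.connect(endpoint="http://vip-pool.example.com",
                             access="ACCESS_KEY", secret="SECRET_KEY")
    with session.transaction() as tx:
        table = tx.bucket("mybucket").schema("myschema").table("mytable")
        table.insert(pa.record_batch([pa.array(["Alice", "Bob"])], names=["name"]))
        reader = table.select(columns=["name"])   # assumed to return a RecordBatch stream
        print(reader.read_all())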
@@ -1,29 +1,26 @@
1
- import array
2
1
  import logging
3
2
  import struct
4
3
  import urllib.parse
5
4
  from collections import defaultdict, namedtuple
6
5
  from datetime import datetime
7
6
  from enum import Enum
8
- from typing import List, Union, Optional, Iterator
7
+ from typing import Union, Optional, Iterator
8
+ import ibis
9
9
  import xmltodict
10
- import concurrent.futures
11
- import threading
12
- import queue
13
10
  import math
14
- import socket
15
11
  from functools import cmp_to_key
16
12
  import pyarrow.parquet as pq
17
13
  import flatbuffers
18
14
  import pyarrow as pa
19
15
  import requests
20
- import datetime
21
- import hashlib
22
- import hmac
23
16
  import json
24
17
  import itertools
25
18
  from aws_requests_auth.aws_auth import AWSRequestsAuth
26
- from io import BytesIO
19
+ import urllib3
20
+ import re
21
+
22
+ from . import errors
23
+ from ipaddress import IPv4Address, IPv6Address
27
24
 
28
25
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BinaryLiteral as fb_binary_lit
29
26
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BooleanLiteral as fb_bool_lit
@@ -91,30 +88,22 @@ TABULAR_QUERY_DATA_COMPLETED_STREAM_ID = 0xFFFFFFFF - 1
91
88
  TABULAR_QUERY_DATA_FAILED_STREAM_ID = 0xFFFFFFFF - 2
92
89
  TABULAR_INVALID_ROW_ID = 0xFFFFFFFFFFFF # (1<<48)-1
93
90
  ESTORE_INVALID_EHANDLE = UINT64_MAX
91
+ IMPORTED_OBJECTS_TABLE_NAME = "vastdb-imported-objects"
94
92
 
95
93
  """
96
94
  S3 Tabular API
97
95
  """
98
96
 
99
97
 
100
- def get_logger(name):
101
- log = logging.getLogger(name)
102
- log.setLevel(logging.ERROR)
103
- ch = logging.StreamHandler()
104
- ch.setLevel(logging.INFO)
105
- ch.set_name('tabular_stream_handler')
106
- formatter = logging.Formatter("%(asctime)s:%(levelname)s:%(message)s")
107
- ch.setFormatter(formatter)
108
- log.addHandler(ch)
109
- log.propagate = False
110
- return log
111
-
112
-
113
- _logger = get_logger(__name__)
98
+ _logger = logging.getLogger(__name__)
114
99
 
115
100
 
116
- def set_tabular_log_level(level: int = logging.INFO):
117
- _logger.setLevel(level)
101
+ def _flatten_args(op, op_type):
102
+ if isinstance(op, op_type):
103
+ for arg in op.args:
104
+ yield from _flatten_args(arg, op_type)
105
+ else:
106
+ yield op
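_flatten_args() recursively unnests a binary boolean operation into its leaf operands. A minimal sketch of what it yields for a chained ibis conjunction (the table and column names below are made up for illustration):

    import ibis
    from ibis.expr.operations.logical import And

    t = ibis.table([("a", "int64"), ("b", "int64")], name="t")   # hypothetical schema
    expr = (t["a"] > 1) & (t["b"] < 2) & (t["a"] != 0)
    conjuncts = list(_flatten_args(expr.op(), And))   # three comparison ops, And nodes removed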
118
107
 
119
108
 
120
109
  class AuthType(Enum):
@@ -123,10 +112,6 @@ class AuthType(Enum):
123
112
  BASIC = "basic"
124
113
 
125
114
 
126
- class TabularException(Exception):
127
- pass
128
-
129
-
130
115
  def get_unit_to_flatbuff_time_unit(type):
131
116
  unit_to_flatbuff_time_unit = {
132
117
  'ns': TimeUnit.NANOSECOND,
@@ -144,11 +129,10 @@ class Predicate:
144
129
  's': 0.001
145
130
  }
146
131
 
147
- def __init__(self, schema: 'pa.Schema', filters: dict):
132
+ def __init__(self, schema: 'pa.Schema', expr: ibis.expr.types.BooleanColumn):
148
133
  self.schema = schema
149
- self.filters = filters
134
+ self.expr = expr
150
135
  self.builder = None
151
- self._field_name_per_index = None
152
136
 
153
137
  def get_field_indexes(self, field: 'pa.Field', field_name_per_index: list) -> None:
154
138
  field_name_per_index.append(field.name)
@@ -172,7 +156,6 @@ class Predicate:
172
156
  for field in self.schema:
173
157
  self.get_field_indexes(field, _field_name_per_index)
174
158
  self._field_name_per_index = {field: index for index, field in enumerate(_field_name_per_index)}
175
- _logger.debug(f'field_name_per_index: {self._field_name_per_index}')
176
159
  return self._field_name_per_index
177
160
 
178
161
  def get_projections(self, builder: 'flatbuffers.builder.Builder', field_names: list = None):
@@ -190,10 +173,77 @@ class Predicate:
190
173
  return builder.EndVector()
191
174
 
192
175
  def serialize(self, builder: 'flatbuffers.builder.Builder'):
176
+ from ibis.expr.operations.generic import TableColumn, Literal, IsNull
177
+ from ibis.expr.operations.logical import Greater, GreaterEqual, Less, LessEqual, Equals, NotEquals, And, Or, Not
178
+ from ibis.expr.operations.strings import StringContains
179
+
180
+ builder_map = {
181
+ Greater: self.build_greater,
182
+ GreaterEqual: self.build_greater_equal,
183
+ Less: self.build_less,
184
+ LessEqual: self.build_less_equal,
185
+ Equals: self.build_equal,
186
+ NotEquals: self.build_not_equal,
187
+ IsNull: self.build_is_null,
188
+ Not: self.build_is_not_null,
189
+ StringContains: self.build_match_substring,
190
+ }
191
+
192
+ positions_map = dict((f.name, index) for index, f in enumerate(self.schema)) # TODO: BFS
193
+
193
194
  self.builder = builder
195
+
194
196
  offsets = []
195
- for field_name in self.filters:
196
- offsets.append(self.build_domain(self.build_column(self.field_name_per_index[field_name]), field_name))
197
+
198
+ if self.expr is not None:
199
+ and_args = list(_flatten_args(self.expr.op(), And))
200
+ _logger.debug('AND args: %s ops %s', and_args, self.expr.op())
201
+ for op in and_args:
202
+ or_args = list(_flatten_args(op, Or))
203
+ _logger.debug('OR args: %s op %s', or_args, op)
204
+ inner_offsets = []
205
+
206
+ prev_field_name = None
207
+ for inner_op in or_args:
208
+ _logger.debug('inner_op %s', inner_op)
209
+ builder_func = builder_map.get(type(inner_op))
210
+ if not builder_func:
211
+ raise NotImplementedError(inner_op.name)
212
+
213
+ if builder_func == self.build_is_null:
214
+ column, = inner_op.args
215
+ literal = None
216
+ elif builder_func == self.build_is_not_null:
217
+ not_arg, = inner_op.args
218
+ # currently we only support not is_null, checking we really got is_null under the not:
219
+ if not builder_map.get(type(not_arg)) == self.build_is_null:
220
+ raise NotImplementedError(not_arg.args[0].name)
221
+ column, = not_arg.args
222
+ literal = None
223
+ else:
224
+ column, literal = inner_op.args
225
+ if not isinstance(literal, Literal):
226
+ raise NotImplementedError(inner_op.name)
227
+
228
+ if not isinstance(column, TableColumn):
229
+ raise NotImplementedError(inner_op.name)
230
+
231
+ field_name = column.name
232
+ if prev_field_name is None:
233
+ prev_field_name = field_name
234
+ elif prev_field_name != field_name:
235
+ raise NotImplementedError(op.name)
236
+
237
+ args_offsets = [self.build_column(position=positions_map[field_name])]
238
+ if literal:
239
+ field = self.schema.field(field_name)
240
+ args_offsets.append(self.build_literal(field=field, value=literal.value))
241
+
242
+ inner_offsets.append(builder_func(*args_offsets))
243
+
244
+ domain_offset = self.build_or(inner_offsets)
245
+ offsets.append(domain_offset)
246
+
197
247
  return self.build_and(offsets)
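Predicate now serializes an ibis boolean expression rather than a filters dict: top-level conjuncts are ANDed, each conjunct may be an OR over a single column, and the supported leaf operators are exactly those listed in builder_map. A minimal sketch of driving serialize() directly, with an illustrative schema and expression (not taken from the diff):

    import flatbuffers
    import ibis
    import pyarrow as pa

    schema = pa.schema([("name", pa.utf8()), ("age", pa.int32())])
    t = ibis.table([("name", "string"), ("age", "int32")], name="t")
    expr = ((t["age"] > 25) | (t["age"] == 18)) & t["name"].contains("Al")

    builder = flatbuffers.Builder(1024)
    offset = Predicate(schema, expr).serialize(builder)   # AND of per-column ORs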
198
248
 
199
249
  def build_column(self, position: int):
@@ -221,7 +271,6 @@ class Predicate:
221
271
  field = self.schema.field(field_name)
222
272
  for attr in field_attrs:
223
273
  field = field.type[attr]
224
- _logger.info(f'trying to append field: {field} with domains: {filters}')
225
274
  for filter_by_name in filters:
226
275
  offsets.append(self.build_range(column=column, field=field, filter_by_name=filter_by_name))
227
276
  return self.build_or(offsets)
@@ -263,11 +312,9 @@ class Predicate:
263
312
  return self.build_and(rules)
264
313
 
265
314
  def build_function(self, name: str, *offsets):
266
- _logger.info(f'name: {name}, offsets: {offsets}')
267
315
  offset_name = self.builder.CreateString(name)
268
316
  fb_call.StartArgumentsVector(self.builder, len(offsets))
269
317
  for offset in reversed(offsets):
270
- _logger.info(f'offset: {offset}')
271
318
  self.builder.PrependUOffsetTRelative(offset)
272
319
  offset_arguments = self.builder.EndVector()
273
320
 
@@ -282,7 +329,7 @@ class Predicate:
282
329
  fb_expression.AddImpl(self.builder, offset_call)
283
330
  return fb_expression.End(self.builder)
284
331
 
285
- def build_literal(self, field: pa.Field, value: str):
332
+ def build_literal(self, field: pa.Field, value):
286
333
  if field.type.equals(pa.int64()):
287
334
  literal_type = fb_int64_lit
288
335
  literal_impl = LiteralImpl.Int64Literal
@@ -366,8 +413,7 @@ class Predicate:
366
413
  field_type = fb_date.End(self.builder)
367
414
 
368
415
  start_date = datetime.fromtimestamp(0).date()
369
- date_value = datetime.strptime(value, '%Y-%m-%d').date()
370
- date_delta = date_value - start_date
416
+ date_delta = value - start_date
371
417
  value = date_delta.days
372
418
  elif isinstance(field.type, pa.TimestampType):
373
419
  literal_type = fb_timestamp_lit
@@ -426,7 +472,7 @@ class Predicate:
426
472
  fb_binary.Start(self.builder)
427
473
  field_type = fb_binary.End(self.builder)
428
474
 
429
- value = self.builder.CreateByteVector(value.encode())
475
+ value = self.builder.CreateByteVector(value)
430
476
  else:
431
477
  raise ValueError(f'unsupported predicate for type={field.type}, value={value}')
432
478
 
@@ -459,6 +505,9 @@ class Predicate:
459
505
  def build_equal(self, column: int, literal: int):
460
506
  return self.build_function('equal', column, literal)
461
507
 
508
+ def build_not_equal(self, column: int, literal: int):
509
+ return self.build_function('not_equal', column, literal)
510
+
462
511
  def build_greater(self, column: int, literal: int):
463
512
  return self.build_function('greater', column, literal)
464
513
 
@@ -477,6 +526,9 @@ class Predicate:
477
526
  def build_is_not_null(self, column: int):
478
527
  return self.build_function('is_valid', column)
479
528
 
529
+ def build_match_substring(self, column: int, literal: int):
530
+ return self.build_function('match_substring', column, literal)
531
+
480
532
 
481
533
  class FieldNode:
482
534
  """Helper class for representing nested Arrow fields and handling QueryData requests"""
@@ -574,9 +626,8 @@ class FieldNode:
574
626
  def build(self) -> pa.Array:
575
627
  """Construct an Arrow array from the collected buffers (recursively)."""
576
628
  children = self.children and [node.build() for node in self.children if node.is_projected]
577
- _logger.debug(f'build: self.field.name={self.field.name}, '
578
- f'self.projected_field.type={self.projected_field.type}, self.length={self.length} '
579
- f'self.buffers={self.buffers} children={children}')
629
+ _logger.debug('build: self.field.name=%s, self.projected_field.type=%s, self.length=%s, self.buffers=%s children=%s',
630
+ self.field.name, self.projected_field.type, self.length, self.buffers, children)
580
631
  result = pa.Array.from_buffers(self.projected_field.type, self.length, buffers=self.buffers, children=children)
581
632
  if self.debug:
582
633
  _logger.debug('%s result=%s', self.field, result)
@@ -602,11 +653,9 @@ class QueryDataParser:
602
653
  for node in self.nodes:
603
654
  node.debug_log()
604
655
  self.leaves = [leaf for node in self.nodes for leaf in node._iter_leaves()]
605
- _logger.debug(f'QueryDataParser: self.leaves = {[(leaf.field.name, leaf.index) for leaf in self.leaves]}')
606
656
  self.mark_projected_nodes()
607
657
  [node.build_projected_field() for node in self.nodes]
608
658
  self.projected_leaves = [leaf for node in self.nodes for leaf in node._iter_projected_leaves()]
609
- _logger.debug(f'QueryDataParser: self.projected_leaves = {[(leaf.field.name, leaf.index) for leaf in self.projected_leaves]}')
610
659
 
611
660
  self.leaf_offset = 0
612
661
 
@@ -615,7 +664,6 @@ class QueryDataParser:
615
664
  if self.projection_positions is None or leaf.index in self.projection_positions:
616
665
  for node in leaf._iter_to_root():
617
666
  node.is_projected = True
618
- _logger.debug(f'mark_projected_nodes node.field.name={node.field.name}')
619
667
 
620
668
  def parse(self, column: pa.Array):
621
669
  """Parse a single column response from VAST (see FieldNode.set for details)"""
@@ -693,7 +741,6 @@ def _parse_table_info(obj):
693
741
  return TableInfo(name, properties, handle, num_rows, used_bytes)
694
742
 
695
743
  def build_record_batch(column_info, column_values):
696
- _logger.info(f"column_info={column_info}")
697
744
  fields = [pa.field(column_name, column_type) for column_type, column_name in column_info]
698
745
  schema = pa.schema(fields)
699
746
  arrays = [pa.array(column_values[column_type], type=column_type) for column_type, _ in column_info]
@@ -706,56 +753,30 @@ def serialize_record_batch(batch):
706
753
  writer.write(batch)
707
754
  return sink.getvalue()
708
755
 
709
- def generate_ip_range(ip_range_str):
710
- start, end = ip_range_str.split(':')
711
- start_parts = start.split('.')
712
- start_last_part = int(start_parts[-1])
713
- end_parts = end.split('.')
714
- end_last_part = int(end_parts[-1])
715
- if start_last_part>=end_last_part or True in [start_parts[i] != end_parts[i] for i in range(3)]:
716
- raise ValueError(f'illegal ip range {ip_range_str}')
717
- num_ips = 1 + end_last_part - start_last_part
718
- ips = ['.'.join(start_parts[:-1] + [str(start_last_part + i)]) for i in range(num_ips)]
719
- return ips
720
-
721
- def parse_executor_hosts(host):
722
- executor_hosts_parsed = host.split(',')
723
- executor_hosts_parsed = [host.strip() for host in executor_hosts_parsed]
724
- executor_hosts = []
725
- for executor_host in executor_hosts_parsed:
726
- is_ip_range=False
727
- if ':' in executor_host:
728
- try:
729
- socket.inet_aton(executor_host.split(':')[0])
730
- socket.inet_aton(executor_host.split(':')[1])
731
- is_ip_range = True
732
- except:
733
- pass
734
- if is_ip_range:
735
- executor_hosts.extend(generate_ip_range(executor_host))
736
- else:
737
- executor_hosts.append(executor_host)
738
- return executor_hosts
756
+ # Result returned from get_table_stats
757
+ TableStatsResult = namedtuple("TableStatsResult",["num_rows", "size_in_bytes", "is_external_rowid_alloc", "endpoints"])
739
758
 
740
759
  class VastdbApi:
741
- def __init__(self, host, access_key, secret_key, username=None, password=None, port=None,
760
+ # we expect the vast version to be <major>.<minor>.<patch>.<protocol>
761
+ VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')
762
+
763
+ def __init__(self, endpoint, access_key, secret_key, username=None, password=None,
742
764
  secure=False, auth_type=AuthType.SIGV4):
743
- executor_hosts = parse_executor_hosts(host)
744
- host = executor_hosts[0]
745
- self.host = host
765
+ url_dict = urllib3.util.parse_url(endpoint)._asdict()
746
766
  self.access_key = access_key
747
767
  self.secret_key = secret_key
748
768
  self.username = username
749
769
  self.password = password
750
- self.port = port
751
770
  self.secure = secure
752
771
  self.auth_type = auth_type
753
- self.executor_hosts = executor_hosts
772
+ self.executor_hosts = [endpoint] # TODO: remove
754
773
 
755
774
  username = username or ''
756
775
  password = password or ''
757
- if not port:
758
- port = 443 if secure else 80
776
+ if not url_dict['port']:
777
+ url_dict['port'] = 443 if secure else 80
778
+
779
+ self.port = url_dict['port']
759
780
 
760
781
  self.default_max_list_columns_page_size = 1000
761
782
  self.session = requests.Session()
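The constructor now takes a single endpoint URL and normalizes it with urllib3 instead of separate host/port arguments. A small sketch of the normalization performed above (the endpoint value is hypothetical):

    import urllib3

    url_dict = urllib3.util.parse_url("vip-pool.example.com")._asdict()
    url_dict['port'] = url_dict['port'] or 80            # 443 when secure=True
    url_dict['scheme'] = url_dict['scheme'] or "http"    # "https" when secure=True
    print(urllib3.util.Url(**url_dict))                  # http://vip-pool.example.com:80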
@@ -764,10 +785,10 @@ class VastdbApi:
764
785
  if auth_type == AuthType.BASIC:
765
786
  self.session.auth = requests.auth.HTTPBasicAuth(username, password)
766
787
  else:
767
- if port != 80 and port != 443:
768
- self.aws_host = f'{host}:{port}'
788
+ if url_dict['port'] != 80 and url_dict['port'] != 443:
789
+ self.aws_host = '{host}:{port}'.format(**url_dict)
769
790
  else:
770
- self.aws_host = f'{host}'
791
+ self.aws_host = '{host}'.format(**url_dict)
771
792
 
772
793
  self.session.auth = AWSRequestsAuth(aws_access_key=access_key,
773
794
  aws_secret_access_key=secret_key,
@@ -775,8 +796,34 @@ class VastdbApi:
775
796
  aws_region='us-east-1',
776
797
  aws_service='s3')
777
798
 
778
- proto = "https" if secure else "http"
779
- self.url = f"{proto}://{self.aws_host}"
799
+ if not url_dict['scheme']:
800
+ url_dict['scheme'] = "https" if secure else "http"
801
+
802
+ url = urllib3.util.Url(**url_dict)
803
+ self.url = str(url)
804
+ _logger.debug('url=%s aws_host=%s', self.url, self.aws_host)
805
+
806
+ # probe the cluster for its version
807
+ self.vast_version = None
808
+ res = self.session.options(self.url)
809
+ server_header = res.headers.get("Server")
810
+ if server_header is None:
811
+ _logger.error("OPTIONS response doesn't contain 'Server' header")
812
+ else:
813
+ _logger.debug("Server header is '%s'", server_header)
814
+ if m := self.VAST_VERSION_REGEX.match(server_header):
815
+ self.vast_version, = m.groups()
816
+ return
817
+ else:
818
+ _logger.error("'Server' header '%s' doesn't match the expected pattern", server_header)
819
+
820
+ msg = (
821
+ f'Please use `vastdb` <= 0.0.5.x with current VAST cluster version ("{server_header or "N/A"}"). '
822
+ 'To use the latest SDK, please upgrade your cluster to the latest service pack. '
823
+ 'Please contact customer.support@vastdata.com for more details.'
824
+ )
825
+ _logger.critical(msg)
826
+ raise NotImplementedError(msg)
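The constructor probes the cluster with an OPTIONS request and proceeds only if the Server header matches VAST_VERSION_REGEX; otherwise it raises with an upgrade hint. A quick check of the pattern (the header values below are made up):

    import re

    VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')
    m = VAST_VERSION_REGEX.match("vast 5.1.0.120")        # hypothetical Server header
    assert m and m.group(1) == "5.1.0.120"
    assert VAST_VERSION_REGEX.match("AmazonS3") is None   # rejected, triggers the error path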
780
827
 
781
828
  def update_mgmt_session(self, access_key: str, secret_key: str, auth_type=AuthType.SIGV4):
782
829
  if auth_type != AuthType.BASIC:
@@ -821,21 +868,9 @@ class VastdbApi:
821
868
  return common_headers
822
869
 
823
870
  def _check_res(self, res, cmd="", expected_retvals=[]):
824
- try:
825
- res.raise_for_status()
826
- if res.status_code != 200:
827
- if not res.status_code in expected_retvals:
828
- raise ValueError(f"Expected status code mismatch. status_code={res.status_code}")
829
- else:
830
- if not len(expected_retvals) == 0:
831
- raise ValueError(f"Expected {expected_retvals} but status_code={res.status_code}")
832
- return res
833
- except requests.HTTPError as e:
834
- if res.status_code in expected_retvals:
835
- _logger.info(f"{cmd} has failed as expected res={res}")
836
- return res
837
- else:
838
- raise e
871
+ if exc := errors.from_response(res):
872
+ raise exc
873
+ return res
839
874
 
840
875
  def create_schema(self, bucket, name, txid=0, client_tags=[], schema_properties="", expected_retvals=[]):
841
876
  """
@@ -975,7 +1010,8 @@ class VastdbApi:
975
1010
  return snapshots, is_truncated, marker
976
1011
 
977
1012
 
978
- def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[], topic_partitions=0):
1013
+ def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[],
1014
+ topic_partitions=0, create_imports_table=False):
979
1015
  """
980
1016
  Create a table, use the following request
981
1017
  POST /bucket/schema/table?table HTTP/1.1
@@ -984,18 +1020,21 @@ class VastdbApi:
984
1020
  tabular-txid: <integer> TransactionId
985
1021
  tabular-client-tag: <string> ClientTag
986
1022
 
987
- The body of the POST request contains table column properties as json
988
- {
989
- "format": "string",
990
- "column_names": {"name1":"type1", "name2":"type2", ...},
991
- "table_properties": {"key1":"val1", "key2":"val2", ...}
992
- }
1023
+ The body of the POST request contains the table column properties as an Arrow schema,
1024
+ which includes each field's name, type and properties
1025
+
1026
+ In order to create the vastdb-imported-objects table, which tracks all imported files and avoids duplicate imports,
1027
+ just set create_imports_table=True
1028
+ The request will look like:
1029
+ POST /bucket/schema/table?table&sub-table=vastdb-imported-objects HTTP/1.1
993
1030
  """
994
1031
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
995
1032
 
996
1033
  serialized_schema = arrow_schema.serialize()
997
1034
  headers['Content-Length'] = str(len(serialized_schema))
998
1035
  url_params = {'topic_partitions': str(topic_partitions)} if topic_partitions else {}
1036
+ if create_imports_table:
1037
+ url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
999
1038
 
1000
1039
  res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
1001
1040
  data=serialized_schema, headers=headers)
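A minimal sketch of creating a table together with its vastdb-imported-objects sub-table via the new flag; `api` stands for a VastdbApi instance and the bucket, schema and column names are illustrative:

    import pyarrow as pa

    columns = pa.schema([("name", pa.utf8()), ("age", pa.int32())])
    api.create_table("mybucket", "myschema", "mytable", columns,
                     create_imports_table=True)   # adds sub-table=vastdb-imported-objects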
@@ -1015,7 +1054,6 @@ class VastdbApi:
1015
1054
  raise RuntimeError(f'invalid params parquet_path={parquet_path} parquet_bucket_name={parquet_bucket_name} parquet_object_name={parquet_object_name}')
1016
1055
 
1017
1056
  # Get the schema of the Parquet file
1018
- _logger.info(f'type(parquet_ds.schema) = {type(parquet_ds.schema)}')
1019
1057
  if isinstance(parquet_ds.schema, pq.ParquetSchema):
1020
1058
  arrow_schema = parquet_ds.schema.to_arrow_schema()
1021
1059
  elif isinstance(parquet_ds.schema, pa.Schema):
@@ -1038,13 +1076,27 @@ class VastdbApi:
1038
1076
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1039
1077
  res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=name, command="stats"), headers=headers)
1040
1078
  if res.status_code == 200:
1041
- res_headers = res.headers
1042
1079
  flatbuf = b''.join(res.iter_content(chunk_size=128))
1043
1080
  stats = get_table_stats.GetRootAs(flatbuf)
1044
1081
  num_rows = stats.NumRows()
1045
1082
  size_in_bytes = stats.SizeInBytes()
1046
1083
  is_external_rowid_alloc = stats.IsExternalRowidAlloc()
1047
- return num_rows, size_in_bytes, is_external_rowid_alloc
1084
+ endpoints = []
1085
+ if stats.VipsLength() == 0:
1086
+ endpoints.append(self.url)
1087
+ else:
1088
+ ip_cls = IPv6Address if (stats.AddressType() == "ipv6") else IPv4Address
1089
+ vips = [stats.Vips(i) for i in range(stats.VipsLength())]
1090
+ ips = []
1091
+ # extract the vips into list of IPs
1092
+ for vip in vips:
1093
+ start_ip = int(ip_cls(vip.StartAddress().decode()))
1094
+ ips.extend(ip_cls(start_ip + i) for i in range(vip.AddressCount()))
1095
+ for ip in ips:
1096
+ prefix = "http" if not self.secure else "https"
1097
+ endpoints.append(f"{prefix}://{str(ip)}:{self.port}")
1098
+ return TableStatsResult(num_rows, size_in_bytes, is_external_rowid_alloc, endpoints)
1099
+
1048
1100
  return self._check_res(res, "get_table_stats", expected_retvals)
1049
1101
 
1050
1102
  def alter_table(self, bucket, schema, name, txid=0, client_tags=[], table_properties="",
@@ -1071,22 +1123,26 @@ class VastdbApi:
1071
1123
 
1072
1124
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1073
1125
  headers['Content-Length'] = str(len(alter_table_req))
1074
- url_params = {'tabular-new-table-name': new_name} if len(new_name) else {}
1126
+ url_params = {'tabular-new-table-name': schema + "/" + new_name} if len(new_name) else {}
1075
1127
 
1076
1128
  res = self.session.put(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
1077
1129
  data=alter_table_req, headers=headers)
1078
1130
 
1079
1131
  return self._check_res(res, "alter_table", expected_retvals)
1080
1132
 
1081
- def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[]):
1133
+ def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[], remove_imports_table=False):
1082
1134
  """
1083
1135
  DELETE /mybucket/schema_path/mytable?table HTTP/1.1
1084
1136
  tabular-txid: TransactionId
1085
1137
  tabular-client-tag: ClientTag
1138
+
1139
+ To remove the internal vastdb-imported-objects table just set remove_imports_table=True
1086
1140
  """
1087
1141
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1142
+ url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if remove_imports_table else {}
1088
1143
 
1089
- res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table"), headers=headers)
1144
+ res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
1145
+ headers=headers)
1090
1146
  return self._check_res(res, "drop_table", expected_retvals)
1091
1147
 
1092
1148
  def list_tables(self, bucket, schema, txid=0, client_tags=[], max_keys=1000, next_key=0, name_prefix="",
@@ -1210,7 +1266,7 @@ class VastdbApi:
1210
1266
 
1211
1267
  def list_columns(self, bucket, schema, table, *, txid=0, client_tags=None, max_keys=None, next_key=0,
1212
1268
  count_only=False, name_prefix="", exact_match=False,
1213
- expected_retvals=None, bc_list_internals=False):
1269
+ expected_retvals=None, bc_list_internals=False, list_imports_table=False):
1214
1270
  """
1215
1271
  GET /mybucket/myschema/mytable?columns HTTP/1.1
1216
1272
  tabular-txid: TransactionId
@@ -1218,6 +1274,8 @@ class VastdbApi:
1218
1274
  x-tabluar-name-prefix: TableNamePrefix
1219
1275
  tabular-max-keys: 1000
1220
1276
  tabular-next-key: NextColumnId
1277
+
1278
+ To list the columns of the internal vastdb-imported-objects table, set list_imports_table=True
1221
1279
  """
1222
1280
  max_keys = max_keys or self.default_max_list_columns_page_size
1223
1281
  client_tags = client_tags or []
@@ -1235,7 +1293,9 @@ class VastdbApi:
1235
1293
  else:
1236
1294
  headers['tabular-name-prefix'] = name_prefix
1237
1295
 
1238
- res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="column"),
1296
+ url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if list_imports_table else {}
1297
+ res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="column",
1298
+ url_params=url_params),
1239
1299
  headers=headers, stream=True)
1240
1300
  self._check_res(res, "list_columns", expected_retvals)
1241
1301
  if res.status_code == 200:
@@ -1247,9 +1307,7 @@ class VastdbApi:
1247
1307
  if not count_only:
1248
1308
  schema_buf = b''.join(res.iter_content(chunk_size=128))
1249
1309
  schema_out = pa.ipc.open_stream(schema_buf).schema
1250
- # _logger.info(f"schema={schema_out}")
1251
- for f in schema_out:
1252
- columns.append([f.name, f.type, f.metadata, f])
1310
+ columns = schema_out
1253
1311
 
1254
1312
  return columns, next_key, is_truncated, count
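Note that list_columns() now returns the parsed pyarrow.Schema directly instead of a list of [name, type, metadata, field] entries, so callers iterate over fields. Sketch, where `api` is a VastdbApi instance and the names are illustrative:

    columns, next_key, is_truncated, count = api.list_columns("mybucket", "myschema", "mytable")
    for field in columns:                 # `columns` is a pyarrow.Schema
        print(field.name, field.type)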
1255
1313
 
@@ -1296,7 +1354,7 @@ class VastdbApi:
1296
1354
  return self._check_res(res, "get_transaction", expected_retvals)
1297
1355
 
1298
1356
  def select_row_ids(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[],
1299
- retry_count=0, enable_sorted_projections=False):
1357
+ retry_count=0, enable_sorted_projections=True):
1300
1358
  """
1301
1359
  POST /mybucket/myschema/mytable?query-data=SelectRowIds HTTP/1.1
1302
1360
  """
@@ -1313,7 +1371,7 @@ class VastdbApi:
1313
1371
  return self._check_res(res, "query_data", expected_retvals)
1314
1372
 
1315
1373
  def read_columns_data(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
1316
- retry_count=0, enable_sorted_projections=False):
1374
+ retry_count=0, enable_sorted_projections=True):
1317
1375
  """
1318
1376
  POST /mybucket/myschema/mytable?query-data=ReadColumns HTTP/1.1
1319
1377
  """
@@ -1329,7 +1387,7 @@ class VastdbApi:
1329
1387
  return self._check_res(res, "query_data", expected_retvals)
1330
1388
 
1331
1389
  def count_rows(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
1332
- retry_count=0, enable_sorted_projections=False):
1390
+ retry_count=0, enable_sorted_projections=True):
1333
1391
  """
1334
1392
  POST /mybucket/myschema/mytable?query-data=CountRows HTTP/1.1
1335
1393
  """
@@ -1343,27 +1401,9 @@ class VastdbApi:
1343
1401
  data=params, headers=headers, stream=True)
1344
1402
  return self._check_res(res, "query_data", expected_retvals)
1345
1403
 
1346
- def query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1347
- txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
1348
- search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
1349
- request_format='string', response_format='string'):
1350
- """
1351
- GET /mybucket/myschema/mytable?data HTTP/1.1
1352
- Content-Length: ContentLength
1353
- tabular-txid: TransactionId
1354
- tabular-client-tag: ClientTag
1355
- tabular-split: "split_id,total_splits,num_row_groups_per_split"
1356
- tabular-num-of-subsplits: "total"
1357
- tabular-request-format: "string"
1358
- tabular-response-format: "string" #arrow/trino
1359
- tabular-schedule-id: "schedule-id"
1360
-
1361
- Request Body (flatbuf)
1362
- projections_chunk [expressions]
1363
- predicate_chunk "formatted_data", (required)
1364
-
1365
- """
1366
- # add query option select-only and read-only
1404
+ def _build_query_data_headers(self, txid, client_tags, params, split, num_sub_splits, request_format, response_format,
1405
+ enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
1406
+ sub_split_start_row_ids):
1367
1407
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1368
1408
  headers['Content-Length'] = str(len(params))
1369
1409
  headers['tabular-split'] = ','.join(map(str, split))
@@ -1388,439 +1428,80 @@ class VastdbApi:
1388
1428
  for sub_split_id, start_row_id in sub_split_start_row_ids:
1389
1429
  headers[f'tabular-start-row-id-{sub_split_id}'] = f"{sub_split_id},{start_row_id}"
1390
1430
 
1391
- url_params = {'name': projection} if projection else {}
1431
+ return headers
1392
1432
 
1393
- res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data", url_params=url_params),
1394
- data=params, headers=headers, stream=True)
1395
- return self._check_res(res, "query_data", expected_retvals)
1433
+ def _build_query_data_url_params(self, projection, query_imports_table):
1434
+ if query_imports_table and projection:
1435
+ raise ValueError("Can't query both imports and projection table")
1396
1436
 
1397
- def _list_table_columns(self, bucket, schema, table, filters=None, field_names=None, txid=0):
1398
- # build a list of the queried column names
1399
- queried_columns = []
1400
- # get all columns from the table
1401
- all_listed_columns = []
1402
- next_key = 0
1403
- while True:
1404
- cur_columns, next_key, is_truncated, count = self.list_columns(
1405
- bucket=bucket, schema=schema, table=table, next_key=next_key, txid=txid)
1406
- if not cur_columns:
1407
- break
1408
- all_listed_columns.extend(cur_columns)
1409
- if not is_truncated:
1410
- break
1411
-
1412
- # build a list of the queried columns
1413
- queried_column_names = set()
1414
- if filters:
1415
- filtered_column_names = ([column_name.split('.')[0] for column_name in filters.keys()]) # use top level of the filter column names
1416
- queried_column_names.update(filtered_column_names)
1417
- _logger.debug(f"_list_table_columns: filtered_column_names={filtered_column_names}")
1418
-
1419
- if field_names:
1420
- field_column_names = ([column_name.split('.')[0] for column_name in field_names]) # use top level of the field column names
1421
- else:
1422
- field_column_names = [column[0] for column in all_listed_columns]
1423
- _logger.debug(f"_list_table_columns: field_column_names={field_column_names}")
1424
- queried_column_names.update(field_column_names)
1425
-
1426
- all_listed_column_and_leaves_names = set()
1427
- for column in all_listed_columns:
1428
- # Collect the column and leaves names for verification below that all the filters and field names are in the table
1429
- column_and_leaves_names = [column[0]] + [f.name for f in column[3].flatten()]
1430
- all_listed_column_and_leaves_names.update(column_and_leaves_names)
1431
-
1432
- # check if this column is needed for the query
1433
- if column[0] in queried_column_names:
1434
- queried_columns.append(column)
1435
-
1436
- # verify that all the filters and field names are in the table
1437
- if filters:
1438
- for filter_column_name in filters.keys():
1439
- if filter_column_name not in all_listed_column_and_leaves_names:
1440
- raise KeyError((f'filter column name: {filter_column_name} does not appear in the table'))
1441
- if field_names:
1442
- for field_name in field_names:
1443
- if field_name not in all_listed_column_and_leaves_names:
1444
- raise ValueError((f'field name: {field_name} does not appear in the table'))
1445
- return list(queried_columns)
1446
-
1447
- def _begin_tx_if_necessary(self, txid):
1448
- if not txid:
1449
- created_txid = True
1450
- res = self.begin_transaction()
1451
- txid = res.headers.get('tabular-txid')
1452
- else:
1453
- created_txid = False
1437
+ url_params = {}
1438
+ if query_imports_table:
1439
+ url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
1440
+ elif projection:
1441
+ url_params['name'] = projection
1442
+ return url_params
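The helper above enforces that a query targets either the imports sub-table or a named projection, but not both. For example (`api` is a VastdbApi instance; the projection name is illustrative):

    api._build_query_data_url_params(projection='', query_imports_table=True)
    # -> {'sub-table': 'vastdb-imported-objects'}
    api._build_query_data_url_params(projection='my_projection', query_imports_table=False)
    # -> {'name': 'my_projection'}
    api._build_query_data_url_params(projection='my_projection', query_imports_table=True)
    # raises ValueError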
1454
1443
 
1455
- return txid, created_txid
1444
+ def legacy_query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1445
+ txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
1446
+ search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
1447
+ request_format='string', response_format='string', query_imports_table=False):
1448
+ """
1449
+ POST /mybucket/myschema/mytable?query-data=LegacyQueryData HTTP/1.1
1450
+ Content-Length: ContentLength
1451
+ tabular-txid: TransactionId
1452
+ tabular-client-tag: ClientTag
1453
+ tabular-split: "split_id,total_splits,num_row_groups_per_split"
1454
+ tabular-num-of-subsplits: "total"
1455
+ tabular-request-format: "string"
1456
+ tabular-response-format: "string" #arrow/trino
1457
+ tabular-schedule-id: "schedule-id"
1456
1458
 
1457
- def _prepare_query(self, bucket, schema, table, num_sub_splits, filters=None, field_names=None,
1458
- queried_columns=None, response_row_id=False, txid=0):
1459
- queried_fields = []
1460
- if response_row_id:
1461
- queried_fields.append(pa.field('$row_id', pa.uint64()))
1459
+ Request Body (flatbuf)
1460
+ projections_chunk [expressions]
1461
+ predicate_chunk "formatted_data", (required)
1462
1462
 
1463
- if not queried_columns:
1464
- queried_columns = self._list_table_columns(bucket, schema, table, filters, field_names, txid=txid)
1463
+ """
1464
+ headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
1465
+ enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
1466
+ sub_split_start_row_ids)
1467
+ url_params = self._build_query_data_url_params(projection, query_imports_table)
1465
1468
 
1466
- queried_fields.extend(pa.field(column[0], column[1]) for column in queried_columns)
1467
- arrow_schema = pa.schema(queried_fields)
1469
+ res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="query-data=LegacyQueryData",
1470
+ url_params=url_params), data=params, headers=headers, stream=True)
1471
+ return self._check_res(res, "legacy_query_data", expected_retvals)
1468
1472
 
1469
- _logger.debug(f'_prepare_query: arrow_schema = {arrow_schema}')
1473
+ def query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1474
+ txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
1475
+ search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
1476
+ request_format='string', response_format='string', query_imports_table=False):
1477
+ """
1478
+ GET /mybucket/myschema/mytable?data HTTP/1.1
1479
+ Content-Length: ContentLength
1480
+ tabular-txid: TransactionId
1481
+ tabular-client-tag: ClientTag
1482
+ tabular-split: "split_id,total_splits,num_row_groups_per_split"
1483
+ tabular-num-of-subsplits: "total"
1484
+ tabular-request-format: "string"
1485
+ tabular-response-format: "string" #arrow/trino
1486
+ tabular-schedule-id: "schedule-id"
1470
1487
 
1471
- query_data_request = build_query_data_request(schema=arrow_schema, filters=filters, field_names=field_names)
1472
- if self.executor_hosts:
1473
- executor_hosts = self.executor_hosts
1474
- else:
1475
- executor_hosts = [self.host]
1476
- executor_sessions = [VastdbApi(executor_hosts[i], self.access_key, self.secret_key, self.username,
1477
- self.password, self.port, self.secure, self.auth_type) for i in range(len(executor_hosts))]
1478
-
1479
- return queried_columns, arrow_schema, query_data_request, executor_sessions
1480
-
1481
- def _more_pages_exist(self, start_row_ids):
1482
- for row_id in start_row_ids.values():
1483
- if row_id != TABULAR_INVALID_ROW_ID:
1484
- return True
1485
- return False
1486
-
1487
- def _query_page(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1488
- txid=0, limit_rows=0, sub_split_start_row_ids=[], filters=None, field_names=None):
1489
- res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
1490
- num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
1491
- limit_rows=limit_rows, sub_split_start_row_ids=sub_split_start_row_ids)
1492
- start_row_ids = {}
1493
- sub_split_tables = parse_query_data_response(res.raw, query_data_request.response_schema,
1494
- start_row_ids=start_row_ids)
1495
- table_page = pa.concat_tables(sub_split_tables)
1496
- _logger.info("query_page: table_page num_rows=%s start_row_ids len=%s",
1497
- len(table_page), len(start_row_ids))
1498
-
1499
- return table_page, start_row_ids
1500
-
1501
- def _query_page_iterator(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1502
- txid=0, limit_rows=0, start_row_ids={}, filters=None, field_names=None):
1503
- res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
1504
- num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
1505
- limit_rows=limit_rows, sub_split_start_row_ids=start_row_ids.items())
1506
- for sub_split_table in parse_query_data_response(res.raw, query_data_request.response_schema,
1507
- start_row_ids=start_row_ids):
1508
- for record_batch in sub_split_table.to_batches():
1509
- yield record_batch
1510
- _logger.info(f"query_page_iterator: start_row_ids={start_row_ids}")
1511
-
1512
- def query_iterator(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
1513
- response_row_id=False, txid=0, limit_per_sub_split=128*1024, filters=None, field_names=None):
1514
- """
1515
- query rows into a table.
1516
-
1517
- Parameters
1518
- ----------
1519
- bucket : string
1520
- The bucket of the table.
1521
- schema : string
1522
- The schema of the table.
1523
- table : string
1524
- The table name.
1525
- num_sub_splits : integer
1526
- The number of sub_splits per split - determines the parallelism inside a VastDB compute node
1527
- default: 1
1528
- num_row_groups_per_sub_split : integer
1529
- The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
1530
- default: 8
1531
- response_row_id : boolean
1532
- Return a column with the internal row ids of the table
1533
- default: False
1534
- txid : integer
1535
- A transaction id. The transaction may be initiated before the query, and if not, the query will initiate it
1536
- default: 0 (will be created by the api)
1537
- limit_per_sub_split : integer
1538
- Limit the number of rows from a single sub_split for a single rpc
1539
- default:131072
1540
- filters : dict
1541
- A dictionary whose keys are column names, and values are lists of string expressions that represent
1542
- filter conditions on the column. AND is applied on the conditions. The condition formats are:
1543
- 'column_name eq some_value'
1544
- default: None
1545
- field_names : list
1546
- A list of column names to be returned in the output table
1547
- default: None
1548
-
1549
- Returns
1550
- -------
1551
- Query iterator generator
1552
-
1553
- Yields
1554
- ------
1555
- pyarrow.RecordBatch
1556
-
1557
- Examples
1558
- --------
1559
- for record_batch in query_iterator('some_bucket', 'some_schema', 'some_table',
1560
- filters={'name': ['eq Alice', 'eq Bob']}
1561
- field_names=['name','age']):
1562
- ...
1563
-
1564
- """
1565
-
1566
- # create a transaction if necessary
1567
- txid, created_txid = self._begin_tx_if_necessary(txid)
1568
- executor_sessions = []
1488
+ Request Body (flatbuf)
1489
+ projections_chunk [expressions]
1490
+ predicate_chunk "formatted_data", (required)
1569
1491
 
1570
- try:
1571
- # prepare query
1572
- queried_columns, arrow_schema, query_data_request, executor_sessions = \
1573
- self._prepare_query(bucket, schema, table, num_sub_splits, filters, field_names, response_row_id=response_row_id, txid=txid)
1574
-
1575
- # define the per split threaded query func
1576
- def query_iterator_split_id(self, split_id):
1577
- _logger.info(f"query_iterator_split_id: split_id={split_id}")
1578
- try:
1579
- start_row_ids = {i:0 for i in range(num_sub_splits)}
1580
- session = executor_sessions[split_id]
1581
- while not next_sems[split_id].acquire(timeout=1):
1582
- # check if killed externally
1583
- if killall:
1584
- raise RuntimeError(f'query_iterator_split_id: split_id {split_id} received killall')
1585
-
1586
- while self._more_pages_exist(start_row_ids):
1587
- for record_batch in session._query_page_iterator(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
1588
- split=(split_id, num_splits, num_row_groups_per_sub_split),
1589
- num_sub_splits=num_sub_splits, response_row_id=response_row_id,
1590
- txid=txid, limit_rows=limit_per_sub_split,
1591
- start_row_ids=start_row_ids):
1592
- output_queue.put((split_id, record_batch))
1593
- while not next_sems[split_id].acquire(timeout=1): # wait for the main thread to request the next record batch
1594
- if killall:
1595
- raise RuntimeError(f'split_id {split_id} received killall')
1596
- # end of split
1597
- output_queue.put((split_id,None))
1598
-
1599
- except Exception as e:
1600
- _logger.exception('query_iterator_split_id: exception occurred')
1601
- try:
1602
- self.rollback_transaction(txid)
1603
- except:
1604
- _logger.exception(f'failed to rollback txid {txid}')
1605
- error_queue.put(None)
1606
- raise e
1607
-
1608
- # kickoff executors
1609
- num_splits = len(executor_sessions)
1610
- output_queue = queue.Queue()
1611
- error_queue = queue.Queue()
1612
- next_sems = [threading.Semaphore(value=1) for i in range(num_splits)]
1613
- killall = False
1614
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
1615
- # start executors
1616
- futures = []
1617
- for i in range(num_splits):
1618
- futures.append(executor.submit(query_iterator_split_id, self, i))
1619
-
1620
- # receive outputs and yield them
1621
- done_count = 0
1622
- while done_count < num_splits:
1623
- # check for errors
1624
- try:
1625
- error_queue.get(block=False)
1626
- _logger.error('received error from a thread')
1627
- killall = True
1628
- # wait for all executors to complete
1629
- for future in concurrent.futures.as_completed(futures):
1630
- try:
1631
- future.result() # trigger an exception if occurred in any thread
1632
- except Exception:
1633
- _logger.exception('exception occurred')
1634
- raise RuntimeError('received error from a thread')
1635
- except queue.Empty:
1636
- pass
1637
-
1638
- # try to get a value from the output queue
1639
- try:
1640
- (split_id, record_batch) = output_queue.get(timeout=1)
1641
- except queue.Empty:
1642
- continue
1643
-
1644
- if record_batch:
1645
- # signal to the thread to read the next record batch and yield the current
1646
- next_sems[split_id].release()
1647
- try:
1648
- yield record_batch
1649
- except GeneratorExit:
1650
- killall = True
1651
- _logger.debug("cancelling query_iterator")
1652
- raise
1653
- else:
1654
- done_count += 1
1655
-
1656
- # wait for all executors to complete
1657
- for future in concurrent.futures.as_completed(futures):
1658
- try:
1659
- future.result() # trigger an exception if occurred in any thread
1660
- except Exception:
1661
- _logger.exception('exception occurred')
1662
-
1663
- # commit if needed
1664
- if created_txid:
1665
- self.commit_transaction(txid)
1666
-
1667
- except Exception as e:
1668
- _logger.exception('exception occurred')
1669
- try:
1670
- self.rollback_transaction(txid)
1671
- except:
1672
- _logger.exception(f'failed to rollback txid {txid}')
1673
- raise e
1674
-
1675
- finally:
1676
- killall = True
1677
- for session in executor_sessions:
1678
- try:
1679
- session.session.close()
1680
- except Exception:
1681
- _logger.exception(f'failed to close session {session}')
1682
-
1683
- def query(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
1684
- response_row_id=False, txid=0, limit=0, limit_per_sub_split=131072, filters=None, field_names=None,
1685
- queried_columns=None):
1686
- """
1687
- query rows into a table.
1688
-
1689
- Parameters
1690
- ----------
1691
- bucket : string
1692
- The bucket of the table.
1693
- schema : string
1694
- The schema of the table.
1695
- table : string
1696
- The table name.
1697
- num_sub_splits : integer
1698
- The number of sub_splits per split - determines the parallelism inside a VastDB compute node
1699
- default: 1
1700
- num_row_groups_per_sub_split : integer
1701
- The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
1702
- default: 8
1703
- response_row_id : boolean
1704
- Return a column with the internal row ids of the table
1705
- default: False
1706
- txid : integer
1707
- A transaction id. The transaction may be initiated before the query, and be used to provide
1708
- multiple ACID operations
1709
- default: 0 (will be created by the api)
1710
- limit : integer
1711
- Limit the number of rows in the response
1712
- default: 0 (no limit)
1713
- limit_per_sub_split : integer
1714
- Limit the number of rows from a single sub_split for a single rpc
1715
- default:131072
1716
- filters : dict
1717
- A dictionary whose keys are column names, and values are lists of string expressions that represent
1718
- filter conditions on the column. AND is applied on the conditions. The condition formats are:
1719
- 'column_name eq some_value'
1720
- default: None
1721
- field_names : list
1722
- A list of column names to be returned to the output table
1723
- default: None
1724
- queried_columns: list of pyArrow.column
1725
- A list of the columns to be queried
1726
- default: None
1727
-
1728
- Returns
1729
- -------
1730
- pyarrow.Table
1731
-
1732
-
1733
- Examples
1734
- --------
1735
- table = query('some_bucket', 'some_schema', 'some_table',
1736
- filters={'name': ['eq Alice', 'eq Bob']}
1737
- field_names=['name','age'])
1738
-
1739
- """
1740
-
1741
- # create a transaction
1742
- txid, created_txid = self._begin_tx_if_necessary(txid)
1743
- executor_sessions = []
1744
- try:
1745
- # prepare query
1746
- queried_columns, arrow_schema, query_data_request, executor_sessions = \
1747
- self._prepare_query(bucket, schema, table, num_sub_splits, filters, field_names, response_row_id=response_row_id, txid=txid)
1748
-
1749
- # define the per split threaded query func
1750
- def query_split_id(self, split_id):
1751
- try:
1752
- start_row_ids = {i:0 for i in range(num_sub_splits)}
1753
- session = executor_sessions[split_id]
1754
- row_count = 0
1755
- while (self._more_pages_exist(start_row_ids) and
1756
- (not limit or row_count < limit)):
1757
- # check if killed externally
1758
- if killall:
1759
- raise RuntimeError(f'query_split_id: split_id {split_id} received killall')
1760
-
1761
- # determine the limit rows
1762
- if limit:
1763
- limit_rows = min(limit_per_sub_split, limit-row_count)
1764
- else:
1765
- limit_rows = limit_per_sub_split
1766
-
1767
- # query one page
1768
- table_page, start_row_ids = session._query_page(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
1769
- split=(split_id, num_splits, num_row_groups_per_sub_split),
1770
- num_sub_splits=num_sub_splits, response_row_id=response_row_id,
1771
- txid=txid, limit_rows=limit_rows,
1772
- sub_split_start_row_ids=start_row_ids.items())
1773
- with lock:
1774
- table_pages.append(table_page)
1775
- row_counts[split_id] += len(table_page)
1776
- row_count = sum(row_counts)
1777
- _logger.info(f"query_split_id: table_pages split_id={split_id} row_count={row_count}")
1778
- except Exception as e:
1779
- _logger.exception('query_split_id: exception occurred')
1780
- try:
1781
- self.rollback_transaction(txid)
1782
- except:
1783
- _logger.exception(f'failed to rollback txid {txid}')
1784
- raise e
1785
-
1786
- table_pages = []
1787
- num_splits = len(executor_sessions)
1788
- killall = False
1789
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
1790
- futures = []
1791
- row_counts = [0] * num_splits
1792
- lock = threading.Lock()
1793
- for i in range(num_splits):
1794
- futures.append(executor.submit(query_split_id, self, i))
1795
- for future in concurrent.futures.as_completed(futures):
1796
- future.result() # trigger an exception if occurred in any thread
1797
-
1798
- # commit if needed
1799
- if created_txid:
1800
- self.commit_transaction(txid)
1801
-
1802
- # concatenate all table pages and return result
1803
- out_table = pa.concat_tables(table_pages)
1804
- out_table = out_table.slice(length=limit) if limit else out_table
1805
- _logger.info("query: out_table len=%s row_count=%s",
1806
- len(out_table), len(out_table))
1807
- return out_table
1808
-
1809
- except Exception as e:
1810
- _logger.exception('exception occurred')
1811
- try:
1812
- self.rollback_transaction(txid)
1813
- except:
1814
- _logger.exception(f'failed to rollback txid {txid}')
1815
- raise e
1816
-
1817
- finally:
1818
- killall = True
1819
- for session in executor_sessions:
1820
- try:
1821
- session.session.close()
1822
- except Exception:
1823
- _logger.exception(f'failed to close session {session}')
1492
+ To query the internal vastdb-imported-objects table, set query_imports_table=True
1493
+ """
1494
+ # add query option select-only and read-only
1495
+
1496
+ headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
1497
+ enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
1498
+ sub_split_start_row_ids)
1499
+
1500
+ url_params = self._build_query_data_url_params(projection, query_imports_table)
1501
+
1502
+ res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data", url_params=url_params),
1503
+ data=params, headers=headers, stream=True)
1504
+ return self._check_res(res, "query_data", expected_retvals)
1824
1505
 
1825
1506
  """
1826
1507
  source_files: list of (bucket_name, file_name)
@@ -1874,21 +1555,22 @@ class VastdbApi:
1874
1555
  builder.Finish(params)
1875
1556
  import_req = builder.Output()
1876
1557
 
1877
- def iterate_over_import_data_response(response, expected_retvals):
1558
+ def iterate_over_import_data_response(response):
1878
1559
  if response.status_code != 200:
1879
1560
  return response
1880
1561
 
1881
1562
  chunk_size = 1024
1882
- for chunk in res.iter_content(chunk_size=chunk_size):
1563
+ for chunk in response.iter_content(chunk_size=chunk_size):
1883
1564
  chunk_dict = json.loads(chunk)
1884
- _logger.info(f"import data chunk={chunk}, result: {chunk_dict['res']}")
1885
- if chunk_dict['res'] in expected_retvals:
1886
- _logger.info(f"import finished with expected result={chunk_dict['res']}, error message: {chunk_dict['err_msg']}")
1887
- return response
1888
- elif chunk_dict['res'] != 'Success' and chunk_dict['res'] != 'TabularInProgress':
1889
- raise TabularException(f"Received unexpected error in import_data. "
1890
- f"status: {chunk_dict['res']}, error message: {chunk_dict['err_msg']}")
1891
- _logger.info(f"import_data is in progress. status: {chunk_dict['res']}")
1565
+ _logger.debug("import data chunk=%s, result: %s", chunk_dict, chunk_dict['res'])
1566
+ if chunk_dict['res'] != 'Success' and chunk_dict['res'] != 'TabularInProgress' and chunk_dict['res'] != 'TabularAlreadyImported':
1567
+ raise errors.ImportFilesError(
1568
+ f"Encountered an error during import_data. status: {chunk_dict['res']}, "
1569
+ f"error message: {chunk_dict['err_msg'] or 'Unexpected error'} during import of "
1570
+ f"object name: {chunk_dict['object_name']}", chunk_dict)
1571
+ else:
1572
+ _logger.debug("import_data of object name '%s' is in progress. "
1573
+ "status: %s", chunk_dict['object_name'], chunk_dict['res'])
1892
1574
  return response
1893
1575
 
1894
1576
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
@@ -1901,34 +1583,17 @@ class VastdbApi:
1901
1583
  res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data"),
1902
1584
  data=import_req, headers=headers, stream=True)
1903
1585
  if blocking:
1904
- res = iterate_over_import_data_response(res, expected_retvals)
1586
+ res = iterate_over_import_data_response(res)
1905
1587
 
1906
1588
  return self._check_res(res, "import_data", expected_retvals)
1907
1589
 
1908
- def merge_data(self):
1909
- """
1910
- TODO
1911
-
1912
- POST /mybucket/myschema/mytable?data HTTP/1.1
1913
- Content-Length: ContentLength
1914
- tabular-txid: TransactionId
1915
- tabular-client-tag: ClientTag
1916
-
1917
- Request Body
1918
- {
1919
- "format": "string",
1920
- "select_source": "formatted data"
1921
- "predicate": "formatted_data"
1922
- }
1923
- """
1924
- pass
1925
-
1926
1590
  def _record_batch_slices(self, batch, rows_per_slice=None):
1927
1591
  max_slice_size_in_bytes = int(0.9*5*1024*1024) # 0.9 * 5MB
1928
1592
  batch_len = len(batch)
1929
1593
  serialized_batch = serialize_record_batch(batch)
1930
1594
  batch_size_in_bytes = len(serialized_batch)
1931
- _logger.info(f'max_slice_size_in_bytes={max_slice_size_in_bytes} batch_len={batch_len} batch_size_in_bytes={batch_size_in_bytes}')
1595
+ _logger.debug('max_slice_size_in_bytes=%d batch_len=%d batch_size_in_bytes=%d',
1596
+ max_slice_size_in_bytes, batch_len, batch_size_in_bytes)
1932
1597
 
1933
1598
  if not rows_per_slice:
1934
1599
  if batch_size_in_bytes < max_slice_size_in_bytes:
@@ -1950,7 +1615,7 @@ class VastdbApi:
  serialized_slice_batch = serialize_record_batch(slice_batch)
  sizeof_serialized_slice_batch = len(serialized_slice_batch)

- if sizeof_serialized_slice_batch <= max_slice_size_in_bytes or rows_per_slice < 10000:
+ if sizeof_serialized_slice_batch <= max_slice_size_in_bytes:
  serialized_slices.append(serialized_slice_batch)
  else:
  _logger.info(f'Using rows_per_slice {rows_per_slice} slice {i} size {sizeof_serialized_slice_batch} exceeds {max_slice_size_in_bytes} bytes, trying smaller rows_per_slice')
@@ -1964,125 +1629,6 @@ class VastdbApi:

  return serialized_slices
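The helper above relies on two pyarrow primitives: RecordBatch.slice() to cut row ranges and the Arrow IPC stream writer to serialize each slice so its size can be compared against the 0.9 * 5MB budget. A stand-alone sketch of that pattern (column name and sizes are invented; this mirrors, but does not reuse, serialize_record_batch):

    import pyarrow as pa

    batch = pa.record_batch([pa.array(range(100_000))], names=['x'])
    rows_per_slice = 25_000
    slices = [batch.slice(offset, rows_per_slice)
              for offset in range(0, len(batch), rows_per_slice)]

    # Serialize one slice the same way an IPC-based serializer would,
    # so its on-the-wire size can be checked against the budget.
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, batch.schema) as writer:
        writer.write_batch(slices[0])
    serialized_slice = sink.getvalue()
    assert serialized_slice.size < int(0.9 * 5 * 1024 * 1024)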
 
- def insert(self, bucket, schema, table, rows=None, record_batch=None, rows_per_insert=None, txid=0):
- """
- Insert rows into a table. The operation may be split into multiple commands, such that by default no more than 512KB will be inserted per command.
-
- Parameters
- ----------
- bucket : string
- The bucket of the table.
- schema : string
- The schema of the table.
- table : string
- The table name.
- rows : dict
- The rows to insert.
- dictionary key: column name
- dictionary value: array of cell values to insert
- default: None (if None, record_batch must be provided)
- record_batch : pyarrow.RecordBatch
- A pyarrow RecordBatch
- default: None (if None, rows dictionary must be provided)
- rows_per_insert : integer
- Split the operation so that each insert command will be limited to this value
- default: None (will be selected automatically)
- txid : integer
- A transaction id. The transaction may be initiated before the insert, and be used to provide
- multiple ACID operations
- default: 0 (will be created by the api)
-
- Returns
- -------
- None
-
-
- Examples
- --------
- insert('some_bucket', 'some_schema', 'some_table', {'name': ['Alice','Bob'], 'age': [25,24]})
-
- """
- if (not rows and not record_batch) or (rows and record_batch):
- raise ValueError(f'insert: missing argument - either rows or record_batch must be provided')
-
- # create a transaction
- txid, created_txid = self._begin_tx_if_necessary(txid)
-
- if rows:
- columns = self._list_table_columns(bucket, schema, table, field_names=rows.keys(), txid=txid)
- columns_dict = dict([(column[0], column[1]) for column in columns])
- arrow_schema = pa.schema([])
- arrays = []
- for column_name, column_values in rows.items():
- column_type = columns_dict[column_name]
- field = pa.field(column_name, column_type)
- arrow_schema = arrow_schema.append(field)
- arrays.append(pa.array(column_values, column_type))
- record_batch = pa.record_batch(arrays, arrow_schema)
-
- # split the record batch into multiple slices
- serialized_slices = self._record_batch_slices(record_batch, rows_per_insert)
- _logger.info(f'inserting record batch using {len(serialized_slices)} slices')
-
- insert_queue = queue.Queue()
-
- [insert_queue.put(insert_rows_req) for insert_rows_req in serialized_slices]
-
- try:
- executor_sessions = [VastdbApi(self.executor_hosts[i], self.access_key, self.secret_key, self.username,
- self.password, self.port, self.secure, self.auth_type) for i in range(len(self.executor_hosts))]
-
- def insert_executor(self, split_id):
-
- try:
- _logger.info(f'insert_executor split_id={split_id} starting')
- session = executor_sessions[split_id]
- num_inserts = 0
- while not killall:
- try:
- insert_rows_req = insert_queue.get(block=False)
- except queue.Empty:
- break
- session.insert_rows(bucket=bucket, schema=schema,
- table=table, record_batch=insert_rows_req, txid=txid)
- num_inserts += 1
- _logger.info(f'insert_executor split_id={split_id} num_inserts={num_inserts}')
- if killall:
- _logger.info('insert_executor killall=True')
-
- except Exception as e:
- _logger.exception('insert_executor hit exception')
- raise e
-
- num_splits = len(executor_sessions)
- killall = False
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
- futures = []
- for i in range(num_splits):
- futures.append(executor.submit(insert_executor, self, i))
- for future in concurrent.futures.as_completed(futures):
- future.result() # trigger an exception if occurred in any thread
-
- # commit if needed
- if created_txid:
- self.commit_transaction(txid)
-
- except Exception as e:
- _logger.exception('exception occurred')
- try:
- self.rollback_transaction(txid)
- except:
- _logger.exception(f'failed to rollback txid {txid}')
- raise e
-
- finally:
- killall = True
- for session in executor_sessions:
- try:
- session.session.close()
- except Exception:
- _logger.exception(f'failed to close session {session}')
-
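Before its parallel upload machinery, the removed insert() helper's main job was turning a plain rows dict into a pyarrow RecordBatch using the column types reported by the server. That conversion in isolation looks like the sketch below (the column types are hard-coded for illustration; the real code fetched them via _list_table_columns):

    import pyarrow as pa

    rows = {'name': ['Alice', 'Bob'], 'age': [25, 24]}
    columns_dict = {'name': pa.utf8(), 'age': pa.int64()}   # stand-in for _list_table_columns()

    arrow_schema = pa.schema([])
    arrays = []
    for column_name, column_values in rows.items():
        column_type = columns_dict[column_name]
        arrow_schema = arrow_schema.append(pa.field(column_name, column_type))
        arrays.append(pa.array(column_values, column_type))

    record_batch = pa.record_batch(arrays, schema=arrow_schema)
    assert record_batch.num_rows == 2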
  def insert_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[]):
  """
  POST /mybucket/myschema/mytable?rows HTTP/1.1
@@ -2352,41 +1898,40 @@ def _iter_query_data_response_columns(fileobj, stream_ids=None):
  if stream_ids is not None:
  stream_ids.update([stream_id]) # count stream IDs using a collections.Counter
  if stream_id == TABULAR_KEEP_ALIVE_STREAM_ID:
- # _logger.info(f"stream_id={stream_id} (skipping)")
  continue

  if stream_id == TABULAR_QUERY_DATA_COMPLETED_STREAM_ID:
  # read the terminating end chunk from socket
  res = fileobj.read()
- _logger.info(f"stream_id={stream_id} res={res} (finish)")
+ _logger.debug("stream_id=%d res=%s (finish)", stream_id, res)
  return

  if stream_id == TABULAR_QUERY_DATA_FAILED_STREAM_ID:
  # read the terminating end chunk from socket
  res = fileobj.read()
- _logger.info(f"stream_id={stream_id} res={res} (failed)")
+ _logger.warning("stream_id=%d res=%s (failed)", stream_id, res)
  raise IOError(f"Query data stream failed res={res}")

  next_row_id_bytes = fileobj.read(8)
  next_row_id, = struct.unpack('<Q', next_row_id_bytes)
- _logger.info(f"stream_id={stream_id} next_row_id={next_row_id}")
+ _logger.debug("stream_id=%d next_row_id=%d", stream_id, next_row_id)

  if stream_id not in readers:
  # we implicitly read 1st message (Arrow schema) when constructing RecordBatchStreamReader
  reader = pa.ipc.RecordBatchStreamReader(fileobj)
- _logger.info(f"stream_id={stream_id} schema={reader.schema}")
+ _logger.debug("stream_id=%d schema=%s", stream_id, reader.schema)
  readers[stream_id] = (reader, [])
  continue

  (reader, batches) = readers[stream_id]
  try:
  batch = reader.read_next_batch() # read single-column chunk data
- _logger.info(f"stream_id={stream_id} rows={len(batch)} chunk={batch}")
+ _logger.debug("stream_id=%d rows=%d chunk=%s", stream_id, len(batch), batch)
  batches.append(batch)
  except StopIteration: # we got an end-of-stream IPC message for a given stream ID
  reader, batches = readers.pop(stream_id) # end of column
  table = pa.Table.from_batches(batches) # concatenate all column chunks (as a single)
- _logger.info(f"stream_id={stream_id} rows={len(table)} column={table}")
+ _logger.debug("stream_id=%d rows=%d column=%s", stream_id, len(table), table)
  yield (stream_id, next_row_id, table)
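For orientation, the per-stream bookkeeping above follows the standard Arrow IPC stream pattern: constructing RecordBatchStreamReader consumes the schema message, each subsequent read returns one batch, and StopIteration marks end-of-stream, at which point the batches are concatenated. A self-contained sketch of just that pattern (plain pyarrow, without VAST's stream-id framing):

    import io
    import pyarrow as pa

    schema = pa.schema([('x', pa.int64())])
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, schema) as writer:
        writer.write_batch(pa.record_batch([pa.array([1, 2])], schema=schema))
        writer.write_batch(pa.record_batch([pa.array([3, 4])], schema=schema))

    fileobj = io.BytesIO(sink.getvalue().to_pybytes())
    reader = pa.ipc.RecordBatchStreamReader(fileobj)   # implicitly reads the schema message
    batches = []
    try:
        while True:
            batches.append(reader.read_next_batch())   # one chunk per call
    except StopIteration:                              # end-of-stream marker
        table = pa.Table.from_batches(batches)         # concatenate all chunks
    assert table.num_rows == 4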
 
 
@@ -2415,7 +1960,8 @@ def parse_query_data_response(conn, schema, stream_ids=None, start_row_ids=None,
  if is_empty_projection: # VAST returns an empty RecordBatch, with the correct rows' count
  parsed_table = table

- _logger.info(f"stream_id={stream_id} rows={len(parsed_table)} next_row_id={next_row_id} table={parsed_table}")
+ _logger.debug("stream_id=%d rows=%d next_row_id=%d table=%s",
+ stream_id, len(parsed_table), next_row_id, parsed_table)
  start_row_ids[stream_id] = next_row_id
  yield parsed_table # the result of a single "select_rows()" cycle

@@ -2564,7 +2110,6 @@ def get_field_type(builder: flatbuffers.Builder, field: pa.Field):
  return field_type, field_type_type

  def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
- _logger.info(f"name={f.name}")
  children = None
  if isinstance(f.type, pa.StructType):
  children = [build_field(builder, child, child.name) for child in list(f.type)]
@@ -2591,7 +2136,6 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
  fb_field.AddName(builder, child_col_name)
  fb_field.AddChildren(builder, children)

- _logger.info(f"added key and map to entries")
  children = [fb_field.End(builder)]

  if children is not None:
@@ -2602,13 +2146,11 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):

  col_name = builder.CreateString(name)
  field_type, field_type_type = get_field_type(builder, f)
- _logger.info(f"add col_name={name} type_type={field_type_type} to fb")
  fb_field.Start(builder)
  fb_field.AddName(builder, col_name)
  fb_field.AddTypeType(builder, field_type_type)
  fb_field.AddType(builder, field_type)
  if children is not None:
- _logger.info(f"add col_name={name} childern")
  fb_field.AddChildren(builder, children)
  return fb_field.End(builder)

@@ -2625,9 +2167,7 @@ class QueryDataRequest:
  self.response_schema = response_schema


- def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict = None, field_names: list = None):
- filters = filters or {}
-
+ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), predicate: ibis.expr.types.BooleanColumn = None, field_names: list = None):
  builder = flatbuffers.Builder(1024)

  source_name = builder.CreateString('') # required
@@ -2643,7 +2183,7 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
  fb_schema.AddFields(builder, fields)
  schema_obj = fb_schema.End(builder)

- predicate = Predicate(schema, filters)
+ predicate = Predicate(schema=schema, expr=predicate)
  filter_obj = predicate.serialize(builder)

  parser = QueryDataParser(schema)
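With the new signature, callers pass an ibis boolean expression instead of a filters dict, and Predicate serializes that expression. A minimal sketch of building such a predicate (column names are invented, and which operators the Predicate serializer supports is not shown in this diff):

    import ibis
    import pyarrow as pa

    arrow_schema = pa.schema([('name', pa.utf8()), ('age', pa.int32())])

    # Unbound ibis table mirroring the Arrow schema, used only to express the filter.
    t = ibis.table([('name', 'string'), ('age', 'int32')], name='t')
    predicate = (t.age > 21) & (t.name == 'Alice')   # an ibis boolean expression

    request = build_query_data_request(schema=arrow_schema,
                                       predicate=predicate,
                                       field_names=['name'])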
@@ -2654,10 +2194,8 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
  continue
  iter_from_root = reversed(list(descendent._iter_to_root()))
  descendent_full_name = '.'.join([n.field.name for n in iter_from_root])
- _logger.debug(f'build_query_data_request: descendent_full_name={descendent_full_name}')
  descendent_leaves = [leaf.index for leaf in descendent._iter_leaves()]
  leaves_map[descendent_full_name] = descendent_leaves
- _logger.debug(f'build_query_data_request: leaves_map={leaves_map}')

  output_field_names = None
  if field_names is None:
@@ -2668,13 +2206,11 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
  def compare_field_names_by_pos(field_name1, field_name2):
  return leaves_map[field_name1][0]-leaves_map[field_name2][0]
  field_names = sorted(field_names, key=cmp_to_key(compare_field_names_by_pos))
- _logger.debug(f'build_query_data_request: sorted field_names={field_names} schema={schema}')

  projection_fields = []
  projection_positions = []
  for field_name in field_names:
  positions = leaves_map[field_name]
- _logger.info("projecting field=%s positions=%s", field_name, positions)
  projection_positions.extend(positions)
  for leaf_position in positions:
  fb_field_index.Start(builder)
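The comparator in the hunk above orders projected field names by the position of their first leaf column, which keeps nested (dotted) names in schema order. In isolation, with an invented leaves_map:

    from functools import cmp_to_key

    # Toy leaves_map: full field name -> leaf column positions (values are made up).
    leaves_map = {'address.city': [2], 'name': [0], 'address.zip': [3], 'age': [1]}

    def compare_field_names_by_pos(field_name1, field_name2):
        return leaves_map[field_name1][0] - leaves_map[field_name2][0]

    ordered = sorted(leaves_map, key=cmp_to_key(compare_field_names_by_pos))
    assert ordered == ['name', 'age', 'address.city', 'address.zip']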
@@ -2731,11 +2267,9 @@ def convert_column_types(table: 'pa.Table') -> 'pa.Table':
  indexes_of_fields_to_change[field.name] = index
  for changing_index in ts_indexes:
  field_name = table.schema[changing_index].name
- _logger.info(f'changing resolution for {field_name} to us')
  new_column = table[field_name].cast(pa.timestamp('us'), safe=False)
  table = table.set_column(changing_index, field_name, new_column)
  for field_name, changing_index in indexes_of_fields_to_change.items():
- _logger.info(f'applying custom rules to {field_name}')
  new_column = table[field_name].to_pylist()
  new_column = list(map(column_matcher[field_name], new_column))
  new_column = pa.array(new_column, table[field_name].type)
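The first loop above downcasts timestamp columns to microsecond resolution in place. The pyarrow pattern it uses, shown on a throwaway table (the column name and value are invented):

    import pyarrow as pa

    table = pa.table({'ts': pa.array([1_700_000_000_000_000_000], type=pa.timestamp('ns'))})
    idx = table.schema.get_field_index('ts')
    new_column = table['ts'].cast(pa.timestamp('us'), safe=False)   # ns -> us, truncating
    table = table.set_column(idx, 'ts', new_column)
    assert table.schema.field('ts').type == pa.timestamp('us')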