vastdb 0.0.5.3__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (44)
  1. vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
  2. vast_flatbuf/tabular/VipRange.py +56 -0
  3. vastdb/__init__.py +7 -0
  4. vastdb/bench/test_perf.py +29 -0
  5. vastdb/bucket.py +85 -0
  6. vastdb/{tests/conftest.py → conftest.py} +29 -14
  7. vastdb/errors.py +175 -0
  8. vastdb/{api.py → internal_commands.py} +373 -875
  9. vastdb/schema.py +85 -0
  10. vastdb/session.py +47 -0
  11. vastdb/table.py +483 -0
  12. vastdb/tests/test_imports.py +123 -0
  13. vastdb/tests/test_nested.py +28 -0
  14. vastdb/tests/test_projections.py +42 -0
  15. vastdb/tests/test_sanity.py +34 -15
  16. vastdb/tests/test_schemas.py +30 -6
  17. vastdb/tests/test_tables.py +628 -13
  18. vastdb/tests/util.py +18 -0
  19. vastdb/transaction.py +54 -0
  20. vastdb/util.py +11 -10
  21. vastdb-0.1.1.dist-info/METADATA +38 -0
  22. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/RECORD +26 -31
  23. vast_protobuf/substrait/__init__.py +0 -0
  24. vast_protobuf/substrait/algebra_pb2.py +0 -1344
  25. vast_protobuf/substrait/capabilities_pb2.py +0 -46
  26. vast_protobuf/substrait/ddl_pb2.py +0 -57
  27. vast_protobuf/substrait/extended_expression_pb2.py +0 -49
  28. vast_protobuf/substrait/extensions/__init__.py +0 -0
  29. vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
  30. vast_protobuf/substrait/function_pb2.py +0 -168
  31. vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
  32. vast_protobuf/substrait/plan_pb2.py +0 -67
  33. vast_protobuf/substrait/type_expressions_pb2.py +0 -198
  34. vast_protobuf/substrait/type_pb2.py +0 -350
  35. vast_protobuf/tabular/__init__.py +0 -0
  36. vast_protobuf/tabular/rpc_pb2.py +0 -344
  37. vastdb/bench_scan.py +0 -45
  38. vastdb/tests/test_create_table_from_parquets.py +0 -50
  39. vastdb/v2.py +0 -360
  40. vastdb-0.0.5.3.dist-info/METADATA +0 -47
  41. {vast_protobuf → vastdb/bench}/__init__.py +0 -0
  42. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/LICENSE +0 -0
  43. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/WHEEL +0 -0
  44. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/top_level.txt +0 -0
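
The file list reflects the 0.1.1 restructuring: the monolithic `vastdb/api.py` is renamed to the internal `internal_commands.py`, new `session.py`, `transaction.py`, `bucket.py`, `schema.py`, `table.py` and `errors.py` modules introduce a layered public API, and the old `v2.py` plus the bundled substrait/protobuf modules are removed. A hedged sketch of how the new layered API is presumably meant to be used — the `connect()` entry point, accessor names and credentials below are assumptions inferred from the new module names, not confirmed by this diff:

```python
# Hypothetical usage of the 0.1.x object model (Session -> Transaction -> Bucket -> Schema -> Table).
import vastdb  # assumed to expose a connect() entry point

session = vastdb.connect(endpoint="http://vip-pool.example.com",
                         access="ACCESS_KEY", secret="SECRET_KEY")
with session.transaction() as tx:       # vastdb/transaction.py
    bucket = tx.bucket("mybucket")      # vastdb/bucket.py
    schema = bucket.schema("myschema")  # vastdb/schema.py
    table = schema.table("mytable")     # vastdb/table.py (hypothetical accessor names)
```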
@@ -1,29 +1,23 @@
1
- import array
1
+ import itertools
2
+ import json
2
3
  import logging
4
+ import math
5
+ import re
3
6
  import struct
4
7
  import urllib.parse
5
8
  from collections import defaultdict, namedtuple
6
- from datetime import datetime
7
9
  from enum import Enum
8
- from typing import List, Union, Optional, Iterator
9
- import xmltodict
10
- import concurrent.futures
11
- import threading
12
- import queue
13
- import math
14
- import socket
15
- from functools import cmp_to_key
16
- import pyarrow.parquet as pq
10
+ from ipaddress import IPv4Address, IPv6Address
11
+ from typing import Iterator, Optional, Union
12
+
17
13
  import flatbuffers
14
+ import ibis
18
15
  import pyarrow as pa
16
+ import pyarrow.parquet as pq
19
17
  import requests
20
- import datetime
21
- import hashlib
22
- import hmac
23
- import json
24
- import itertools
18
+ import urllib3
19
+ import xmltodict
25
20
  from aws_requests_auth.aws_auth import AWSRequestsAuth
26
- from io import BytesIO
27
21
 
28
22
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BinaryLiteral as fb_binary_lit
29
23
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BooleanLiteral as fb_bool_lit
@@ -35,10 +29,10 @@ import vast_flatbuf.org.apache.arrow.computeir.flatbuf.FieldIndex as fb_field_in
35
29
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.FieldRef as fb_field_ref
36
30
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Float32Literal as fb_float32_lit
37
31
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Float64Literal as fb_float64_lit
32
+ import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int8Literal as fb_int8_lit
38
33
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int16Literal as fb_int16_lit
39
34
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int32Literal as fb_int32_lit
40
35
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int64Literal as fb_int64_lit
41
- import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int8Literal as fb_int8_lit
42
36
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Literal as fb_literal
43
37
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Relation as fb_relation
44
38
  import vast_flatbuf.org.apache.arrow.computeir.flatbuf.RelationImpl as rel_impl
@@ -51,38 +45,47 @@ import vast_flatbuf.org.apache.arrow.flatbuf.Bool as fb_bool
51
45
  import vast_flatbuf.org.apache.arrow.flatbuf.Date as fb_date
52
46
  import vast_flatbuf.org.apache.arrow.flatbuf.Decimal as fb_decimal
53
47
  import vast_flatbuf.org.apache.arrow.flatbuf.Field as fb_field
48
+ import vast_flatbuf.org.apache.arrow.flatbuf.FixedSizeBinary as fb_fixed_size_binary
54
49
  import vast_flatbuf.org.apache.arrow.flatbuf.FloatingPoint as fb_floating_point
55
50
  import vast_flatbuf.org.apache.arrow.flatbuf.Int as fb_int
56
- import vast_flatbuf.org.apache.arrow.flatbuf.Schema as fb_schema
57
- import vast_flatbuf.org.apache.arrow.flatbuf.Time as fb_time
58
- import vast_flatbuf.org.apache.arrow.flatbuf.Struct_ as fb_struct
59
51
  import vast_flatbuf.org.apache.arrow.flatbuf.List as fb_list
60
52
  import vast_flatbuf.org.apache.arrow.flatbuf.Map as fb_map
61
- import vast_flatbuf.org.apache.arrow.flatbuf.FixedSizeBinary as fb_fixed_size_binary
53
+ import vast_flatbuf.org.apache.arrow.flatbuf.Schema as fb_schema
54
+ import vast_flatbuf.org.apache.arrow.flatbuf.Struct_ as fb_struct
55
+ import vast_flatbuf.org.apache.arrow.flatbuf.Time as fb_time
62
56
  import vast_flatbuf.org.apache.arrow.flatbuf.Timestamp as fb_timestamp
63
57
  import vast_flatbuf.org.apache.arrow.flatbuf.Utf8 as fb_utf8
64
58
  import vast_flatbuf.tabular.AlterColumnRequest as tabular_alter_column
59
+ import vast_flatbuf.tabular.AlterProjectionTableRequest as tabular_alter_projection
65
60
  import vast_flatbuf.tabular.AlterSchemaRequest as tabular_alter_schema
66
61
  import vast_flatbuf.tabular.AlterTableRequest as tabular_alter_table
67
- import vast_flatbuf.tabular.AlterProjectionTableRequest as tabular_alter_projection
62
+ import vast_flatbuf.tabular.Column as tabular_projecion_column
63
+ import vast_flatbuf.tabular.ColumnType as tabular_proj_column_type
64
+ import vast_flatbuf.tabular.CreateProjectionRequest as tabular_create_projection
68
65
  import vast_flatbuf.tabular.CreateSchemaRequest as tabular_create_schema
69
66
  import vast_flatbuf.tabular.ImportDataRequest as tabular_import_data
70
67
  import vast_flatbuf.tabular.S3File as tabular_s3_file
71
- import vast_flatbuf.tabular.CreateProjectionRequest as tabular_create_projection
72
- import vast_flatbuf.tabular.Column as tabular_projecion_column
73
- import vast_flatbuf.tabular.ColumnType as tabular_proj_column_type
74
-
75
68
  from vast_flatbuf.org.apache.arrow.computeir.flatbuf.Deref import Deref
76
- from vast_flatbuf.org.apache.arrow.computeir.flatbuf.ExpressionImpl import ExpressionImpl
69
+ from vast_flatbuf.org.apache.arrow.computeir.flatbuf.ExpressionImpl import (
70
+ ExpressionImpl,
71
+ )
77
72
  from vast_flatbuf.org.apache.arrow.computeir.flatbuf.LiteralImpl import LiteralImpl
78
73
  from vast_flatbuf.org.apache.arrow.flatbuf.DateUnit import DateUnit
79
74
  from vast_flatbuf.org.apache.arrow.flatbuf.TimeUnit import TimeUnit
80
75
  from vast_flatbuf.org.apache.arrow.flatbuf.Type import Type
76
+ from vast_flatbuf.tabular.GetProjectionTableStatsResponse import (
77
+ GetProjectionTableStatsResponse as get_projection_table_stats,
78
+ )
79
+ from vast_flatbuf.tabular.GetTableStatsResponse import (
80
+ GetTableStatsResponse as get_table_stats,
81
+ )
82
+ from vast_flatbuf.tabular.ListProjectionsResponse import (
83
+ ListProjectionsResponse as list_projections,
84
+ )
81
85
  from vast_flatbuf.tabular.ListSchemasResponse import ListSchemasResponse as list_schemas
82
86
  from vast_flatbuf.tabular.ListTablesResponse import ListTablesResponse as list_tables
83
- from vast_flatbuf.tabular.GetTableStatsResponse import GetTableStatsResponse as get_table_stats
84
- from vast_flatbuf.tabular.GetProjectionTableStatsResponse import GetProjectionTableStatsResponse as get_projection_table_stats
85
- from vast_flatbuf.tabular.ListProjectionsResponse import ListProjectionsResponse as list_projections
87
+
88
+ from . import errors
86
89
 
87
90
  UINT64_MAX = 18446744073709551615
88
91
 
@@ -91,30 +94,22 @@ TABULAR_QUERY_DATA_COMPLETED_STREAM_ID = 0xFFFFFFFF - 1
91
94
  TABULAR_QUERY_DATA_FAILED_STREAM_ID = 0xFFFFFFFF - 2
92
95
  TABULAR_INVALID_ROW_ID = 0xFFFFFFFFFFFF # (1<<48)-1
93
96
  ESTORE_INVALID_EHANDLE = UINT64_MAX
97
+ IMPORTED_OBJECTS_TABLE_NAME = "vastdb-imported-objects"
94
98
 
95
99
  """
96
100
  S3 Tabular API
97
101
  """
98
102
 
99
103
 
100
- def get_logger(name):
101
- log = logging.getLogger(name)
102
- log.setLevel(logging.ERROR)
103
- ch = logging.StreamHandler()
104
- ch.setLevel(logging.INFO)
105
- ch.set_name('tabular_stream_handler')
106
- formatter = logging.Formatter("%(asctime)s:%(levelname)s:%(message)s")
107
- ch.setFormatter(formatter)
108
- log.addHandler(ch)
109
- log.propagate = False
110
- return log
111
-
104
+ _logger = logging.getLogger(__name__)
112
105
 
113
- _logger = get_logger(__name__)
114
106
 
115
-
116
- def set_tabular_log_level(level: int = logging.INFO):
117
- _logger.setLevel(level)
107
+ def _flatten_args(op, op_type):
108
+ if isinstance(op, op_type):
109
+ for arg in op.args:
110
+ yield from _flatten_args(arg, op_type)
111
+ else:
112
+ yield op
118
113
 
119
114
 
120
115
  class AuthType(Enum):
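
In 0.1.1 the module only creates a plain `logging.getLogger(__name__)` logger and no longer installs its own stream handler or exposes `set_tabular_log_level`, so log configuration becomes the application's responsibility. A minimal sketch using only the standard library (the logger name assumes the renamed module path `vastdb.internal_commands`):

```python
import logging

# Route the SDK's log records through the application's own handler and format.
logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s", level=logging.INFO)
logging.getLogger("vastdb.internal_commands").setLevel(logging.DEBUG)
```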
@@ -123,10 +118,6 @@ class AuthType(Enum):
123
118
  BASIC = "basic"
124
119
 
125
120
 
126
- class TabularException(Exception):
127
- pass
128
-
129
-
130
121
  def get_unit_to_flatbuff_time_unit(type):
131
122
  unit_to_flatbuff_time_unit = {
132
123
  'ns': TimeUnit.NANOSECOND,
@@ -137,18 +128,10 @@ def get_unit_to_flatbuff_time_unit(type):
137
128
  return unit_to_flatbuff_time_unit[type]
138
129
 
139
130
  class Predicate:
140
- unit_to_epoch = {
141
- 'ns': 1_000_000,
142
- 'us': 1_000,
143
- 'ms': 1,
144
- 's': 0.001
145
- }
146
-
147
- def __init__(self, schema: 'pa.Schema', filters: dict):
131
+ def __init__(self, schema: 'pa.Schema', expr: ibis.expr.types.BooleanColumn):
148
132
  self.schema = schema
149
- self.filters = filters
133
+ self.expr = expr
150
134
  self.builder = None
151
- self._field_name_per_index = None
152
135
 
153
136
  def get_field_indexes(self, field: 'pa.Field', field_name_per_index: list) -> None:
154
137
  field_name_per_index.append(field.name)
@@ -172,7 +155,6 @@ class Predicate:
172
155
  for field in self.schema:
173
156
  self.get_field_indexes(field, _field_name_per_index)
174
157
  self._field_name_per_index = {field: index for index, field in enumerate(_field_name_per_index)}
175
- _logger.debug(f'field_name_per_index: {self._field_name_per_index}')
176
158
  return self._field_name_per_index
177
159
 
178
160
  def get_projections(self, builder: 'flatbuffers.builder.Builder', field_names: list = None):
@@ -190,10 +172,87 @@ class Predicate:
190
172
  return builder.EndVector()
191
173
 
192
174
  def serialize(self, builder: 'flatbuffers.builder.Builder'):
175
+ from ibis.expr.operations.generic import IsNull, Literal, TableColumn
176
+ from ibis.expr.operations.logical import (
177
+ And,
178
+ Equals,
179
+ Greater,
180
+ GreaterEqual,
181
+ Less,
182
+ LessEqual,
183
+ Not,
184
+ NotEquals,
185
+ Or,
186
+ )
187
+ from ibis.expr.operations.strings import StringContains
188
+
189
+ builder_map = {
190
+ Greater: self.build_greater,
191
+ GreaterEqual: self.build_greater_equal,
192
+ Less: self.build_less,
193
+ LessEqual: self.build_less_equal,
194
+ Equals: self.build_equal,
195
+ NotEquals: self.build_not_equal,
196
+ IsNull: self.build_is_null,
197
+ Not: self.build_is_not_null,
198
+ StringContains: self.build_match_substring,
199
+ }
200
+
201
+ positions_map = dict((f.name, index) for index, f in enumerate(self.schema)) # TODO: BFS
202
+
193
203
  self.builder = builder
204
+
194
205
  offsets = []
195
- for field_name in self.filters:
196
- offsets.append(self.build_domain(self.build_column(self.field_name_per_index[field_name]), field_name))
206
+
207
+ if self.expr is not None:
208
+ and_args = list(_flatten_args(self.expr.op(), And))
209
+ _logger.debug('AND args: %s ops %s', and_args, self.expr.op())
210
+ for op in and_args:
211
+ or_args = list(_flatten_args(op, Or))
212
+ _logger.debug('OR args: %s op %s', or_args, op)
213
+ inner_offsets = []
214
+
215
+ prev_field_name = None
216
+ for inner_op in or_args:
217
+ _logger.debug('inner_op %s', inner_op)
218
+ builder_func = builder_map.get(type(inner_op))
219
+ if not builder_func:
220
+ raise NotImplementedError(inner_op.name)
221
+
222
+ if builder_func == self.build_is_null:
223
+ column, = inner_op.args
224
+ literal = None
225
+ elif builder_func == self.build_is_not_null:
226
+ not_arg, = inner_op.args
227
+ # currently we only support not is_null, checking we really got is_null under the not:
228
+ if not builder_map.get(type(not_arg)) == self.build_is_null:
229
+ raise NotImplementedError(not_arg.args[0].name)
230
+ column, = not_arg.args
231
+ literal = None
232
+ else:
233
+ column, literal = inner_op.args
234
+ if not isinstance(literal, Literal):
235
+ raise NotImplementedError(inner_op.name)
236
+
237
+ if not isinstance(column, TableColumn):
238
+ raise NotImplementedError(inner_op.name)
239
+
240
+ field_name = column.name
241
+ if prev_field_name is None:
242
+ prev_field_name = field_name
243
+ elif prev_field_name != field_name:
244
+ raise NotImplementedError(op.name)
245
+
246
+ args_offsets = [self.build_column(position=positions_map[field_name])]
247
+ if literal:
248
+ field = self.schema.field(field_name)
249
+ args_offsets.append(self.build_literal(field=field, value=literal.value))
250
+
251
+ inner_offsets.append(builder_func(*args_offsets))
252
+
253
+ domain_offset = self.build_or(inner_offsets)
254
+ offsets.append(domain_offset)
255
+
197
256
  return self.build_and(offsets)
198
257
 
199
258
  def build_column(self, position: int):
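
A runnable sketch of the flattening step used by the new `serialize()` above: the ibis boolean expression is unnested into top-level AND arguments, and each of those into OR arguments over a single column. The table and column names are made up; the helper is copied from the diff, and the imports assume an ibis version matching the ones used above:

```python
import ibis
from ibis.expr.operations.logical import And, Or

def _flatten_args(op, op_type):
    # Same helper as above: recursively unnest chained And/Or nodes.
    if isinstance(op, op_type):
        for arg in op.args:
            yield from _flatten_args(arg, op_type)
    else:
        yield op

t = ibis.table({"age": "int64", "name": "string"}, name="t")
expr = (t.age > 30) & ((t.name == "Alice") | (t.name == "Bob"))

for op in _flatten_args(expr.op(), And):  # top-level AND arguments
    print([type(o).__name__ for o in _flatten_args(op, Or)])
# prints ['Greater'] and then ['Equals', 'Equals']
```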
@@ -221,7 +280,6 @@ class Predicate:
221
280
  field = self.schema.field(field_name)
222
281
  for attr in field_attrs:
223
282
  field = field.type[attr]
224
- _logger.info(f'trying to append field: {field} with domains: {filters}')
225
283
  for filter_by_name in filters:
226
284
  offsets.append(self.build_range(column=column, field=field, filter_by_name=filter_by_name))
227
285
  return self.build_or(offsets)
@@ -263,11 +321,9 @@ class Predicate:
263
321
  return self.build_and(rules)
264
322
 
265
323
  def build_function(self, name: str, *offsets):
266
- _logger.info(f'name: {name}, offsets: {offsets}')
267
324
  offset_name = self.builder.CreateString(name)
268
325
  fb_call.StartArgumentsVector(self.builder, len(offsets))
269
326
  for offset in reversed(offsets):
270
- _logger.info(f'offset: {offset}')
271
327
  self.builder.PrependUOffsetTRelative(offset)
272
328
  offset_arguments = self.builder.EndVector()
273
329
 
@@ -282,7 +338,7 @@ class Predicate:
282
338
  fb_expression.AddImpl(self.builder, offset_call)
283
339
  return fb_expression.End(self.builder)
284
340
 
285
- def build_literal(self, field: pa.Field, value: str):
341
+ def build_literal(self, field: pa.Field, value):
286
342
  if field.type.equals(pa.int64()):
287
343
  literal_type = fb_int64_lit
288
344
  literal_impl = LiteralImpl.Int64Literal
@@ -356,7 +412,7 @@ class Predicate:
356
412
  field_type = fb_utf8.End(self.builder)
357
413
 
358
414
  value = self.builder.CreateString(value)
359
- elif field.type.equals(pa.date32()): # pa.date64()
415
+ elif field.type.equals(pa.date32()): # pa.date64() is not supported
360
416
  literal_type = fb_date32_lit
361
417
  literal_impl = LiteralImpl.DateLiteral
362
418
 
@@ -364,38 +420,49 @@ class Predicate:
364
420
  fb_date.Start(self.builder)
365
421
  fb_date.AddUnit(self.builder, DateUnit.DAY)
366
422
  field_type = fb_date.End(self.builder)
367
-
368
- start_date = datetime.fromtimestamp(0).date()
369
- date_value = datetime.strptime(value, '%Y-%m-%d').date()
370
- date_delta = date_value - start_date
371
- value = date_delta.days
423
+ value, = pa.array([value], field.type).cast(pa.int32()).to_pylist()
372
424
  elif isinstance(field.type, pa.TimestampType):
373
425
  literal_type = fb_timestamp_lit
374
426
  literal_impl = LiteralImpl.TimestampLiteral
375
427
 
428
+ if field.type.equals(pa.timestamp('s')):
429
+ unit = TimeUnit.SECOND
430
+ if field.type.equals(pa.timestamp('ms')):
431
+ unit = TimeUnit.MILLISECOND
432
+ if field.type.equals(pa.timestamp('us')):
433
+ unit = TimeUnit.MICROSECOND
434
+ if field.type.equals(pa.timestamp('ns')):
435
+ unit = TimeUnit.NANOSECOND
436
+
376
437
  field_type_type = Type.Timestamp
377
438
  fb_timestamp.Start(self.builder)
378
- fb_timestamp.AddUnit(self.builder, get_unit_to_flatbuff_time_unit(field.type.unit))
439
+ fb_timestamp.AddUnit(self.builder, unit)
379
440
  field_type = fb_timestamp.End(self.builder)
380
-
381
- value = int(int(value) * self.unit_to_epoch[field.type.unit])
382
- elif field.type.equals(pa.time32('s')) or field.type.equals(pa.time32('ms')) or field.type.equals(pa.time64('us')) or field.type.equals(pa.time64('ns')):
383
-
441
+ value, = pa.array([value], field.type).cast(pa.int64()).to_pylist()
442
+ elif isinstance(field.type, (pa.Time32Type, pa.Time64Type)):
384
443
  literal_type = fb_time_lit
385
444
  literal_impl = LiteralImpl.TimeLiteral
386
445
 
387
- field_type_str = str(field.type)
388
- start = field_type_str.index('[')
389
- end = field_type_str.index(']')
390
- unit = field_type_str[start + 1:end]
446
+ if field.type.equals(pa.time32('s')):
447
+ target_type = pa.int32()
448
+ unit = TimeUnit.SECOND
449
+ if field.type.equals(pa.time32('ms')):
450
+ target_type = pa.int32()
451
+ unit = TimeUnit.MILLISECOND
452
+ if field.type.equals(pa.time64('us')):
453
+ target_type = pa.int64()
454
+ unit = TimeUnit.MICROSECOND
455
+ if field.type.equals(pa.time64('ns')):
456
+ target_type = pa.int64()
457
+ unit = TimeUnit.NANOSECOND
391
458
 
392
459
  field_type_type = Type.Time
393
460
  fb_time.Start(self.builder)
394
461
  fb_time.AddBitWidth(self.builder, field.type.bit_width)
395
- fb_time.AddUnit(self.builder, get_unit_to_flatbuff_time_unit(unit))
462
+ fb_time.AddUnit(self.builder, unit)
396
463
  field_type = fb_time.End(self.builder)
397
464
 
398
- value = int(value) * self.unit_to_epoch[unit]
465
+ value, = pa.array([value], field.type).cast(target_type).to_pylist()
399
466
  elif field.type.equals(pa.bool_()):
400
467
  literal_type = fb_bool_lit
401
468
  literal_impl = LiteralImpl.BooleanLiteral
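
The literal conversion above now delegates the epoch arithmetic to pyarrow: a one-element array of the field's type is cast to its integer storage type. A minimal illustration of that trick with arbitrary values:

```python
import datetime
import pyarrow as pa

# date32 values are stored as days since 1970-01-01
days, = pa.array([datetime.date(2024, 1, 2)], pa.date32()).cast(pa.int32()).to_pylist()

# timestamp('us') values are stored as microseconds since the epoch
micros, = pa.array([datetime.datetime(2024, 1, 2, 3, 4, 5)],
                   pa.timestamp('us')).cast(pa.int64()).to_pylist()

print(days, micros)
```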
@@ -426,7 +493,7 @@ class Predicate:
426
493
  fb_binary.Start(self.builder)
427
494
  field_type = fb_binary.End(self.builder)
428
495
 
429
- value = self.builder.CreateByteVector(value.encode())
496
+ value = self.builder.CreateByteVector(value)
430
497
  else:
431
498
  raise ValueError(f'unsupported predicate for type={field.type}, value={value}')
432
499
 
@@ -459,6 +526,9 @@ class Predicate:
459
526
  def build_equal(self, column: int, literal: int):
460
527
  return self.build_function('equal', column, literal)
461
528
 
529
+ def build_not_equal(self, column: int, literal: int):
530
+ return self.build_function('not_equal', column, literal)
531
+
462
532
  def build_greater(self, column: int, literal: int):
463
533
  return self.build_function('greater', column, literal)
464
534
 
@@ -477,6 +547,9 @@ class Predicate:
477
547
  def build_is_not_null(self, column: int):
478
548
  return self.build_function('is_valid', column)
479
549
 
550
+ def build_match_substring(self, column: int, literal: int):
551
+ return self.build_function('match_substring', column, literal)
552
+
480
553
 
481
554
  class FieldNode:
482
555
  """Helper class for representing nested Arrow fields and handling QueryData requests"""
@@ -506,8 +579,6 @@ class FieldNode:
506
579
  # will be set during by the parser (see below)
507
580
  self.buffers = None # a list of Arrow buffers (https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout)
508
581
  self.length = None # each array must have it's length specified (https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.from_buffers)
509
- self.is_projected = False
510
- self.projected_field = self.field
511
582
 
512
583
  def _iter_to_root(self) -> Iterator['FieldNode']:
513
584
  yield self
@@ -528,15 +599,13 @@ class FieldNode:
528
599
  for child in self.children:
529
600
  yield from child._iter_leaves()
530
601
 
531
- def _iter_projected_leaves(self) -> Iterator['FieldNode']:
602
+ def _iter_leaves(self) -> Iterator['FieldNode']:
532
603
  """Generate only leaf nodes (i.e. columns having scalar types)."""
533
604
  if not self.children:
534
- if self.is_projected:
535
- yield self
605
+ yield self
536
606
  else:
537
607
  for child in self.children:
538
- if child.is_projected:
539
- yield from child._iter_projected_leaves()
608
+ yield from child._iter_leaves()
540
609
 
541
610
  def debug_log(self, level=0):
542
611
  """Recursively dump this node state to log."""
@@ -573,28 +642,17 @@ class FieldNode:
573
642
 
574
643
  def build(self) -> pa.Array:
575
644
  """Construct an Arrow array from the collected buffers (recursively)."""
576
- children = self.children and [node.build() for node in self.children if node.is_projected]
577
- _logger.debug(f'build: self.field.name={self.field.name}, '
578
- f'self.projected_field.type={self.projected_field.type}, self.length={self.length} '
579
- f'self.buffers={self.buffers} children={children}')
580
- result = pa.Array.from_buffers(self.projected_field.type, self.length, buffers=self.buffers, children=children)
645
+ children = self.children and [node.build() for node in self.children]
646
+ result = pa.Array.from_buffers(self.type, self.length, buffers=self.buffers, children=children)
581
647
  if self.debug:
582
648
  _logger.debug('%s result=%s', self.field, result)
583
649
  return result
584
650
 
585
- def build_projected_field(self):
586
- if isinstance(self.type, pa.StructType):
587
- [child.build_projected_field() for child in self.children if child.is_projected]
588
- self.projected_field = pa.field(self.field.name,
589
- pa.struct([child.projected_field for child in self.children if child.is_projected]),
590
- self.field.nullable,
591
- self.field.metadata)
592
651
 
593
652
  class QueryDataParser:
594
653
  """Used to parse VAST QueryData RPC response."""
595
- def __init__(self, arrow_schema: pa.Schema, *, debug=False, projection_positions=None):
654
+ def __init__(self, arrow_schema: pa.Schema, *, debug=False):
596
655
  self.arrow_schema = arrow_schema
597
- self.projection_positions = projection_positions
598
656
  index = itertools.count() # used to generate leaf column positions for VAST QueryData RPC
599
657
  self.nodes = [FieldNode(field, index, debug=debug) for field in arrow_schema]
600
658
  self.debug = debug
@@ -602,27 +660,15 @@ class QueryDataParser:
602
660
  for node in self.nodes:
603
661
  node.debug_log()
604
662
  self.leaves = [leaf for node in self.nodes for leaf in node._iter_leaves()]
605
- _logger.debug(f'QueryDataParser: self.leaves = {[(leaf.field.name, leaf.index) for leaf in self.leaves]}')
606
- self.mark_projected_nodes()
607
- [node.build_projected_field() for node in self.nodes]
608
- self.projected_leaves = [leaf for node in self.nodes for leaf in node._iter_projected_leaves()]
609
- _logger.debug(f'QueryDataParser: self.projected_leaves = {[(leaf.field.name, leaf.index) for leaf in self.projected_leaves]}')
610
663
 
611
664
  self.leaf_offset = 0
612
665
 
613
- def mark_projected_nodes(self):
614
- for leaf in self.leaves:
615
- if self.projection_positions is None or leaf.index in self.projection_positions:
616
- for node in leaf._iter_to_root():
617
- node.is_projected = True
618
- _logger.debug(f'mark_projected_nodes node.field.name={node.field.name}')
619
-
620
666
  def parse(self, column: pa.Array):
621
667
  """Parse a single column response from VAST (see FieldNode.set for details)"""
622
- if not self.leaf_offset < len(self.projected_leaves):
668
+ if not self.leaf_offset < len(self.leaves):
623
669
  raise ValueError(f'self.leaf_offset: {self.leaf_offset} are not < '
624
670
  f'than len(self.leaves): {len(self.leaves)}')
625
- leaf = self.projected_leaves[self.leaf_offset]
671
+ leaf = self.leaves[self.leaf_offset]
626
672
 
627
673
  # A column response may be sent in multiple chunks, therefore we need to combine
628
674
  # it into a single chunk to allow reconstruction using `Array.from_buffers()`.
@@ -643,32 +689,19 @@ class QueryDataParser:
643
689
 
644
690
  self.leaf_offset += 1
645
691
 
646
- def build(self, output_field_names=None) -> Optional[pa.Table]:
692
+ def build(self) -> Optional[pa.Table]:
647
693
  """Try to build the resulting Table object (if all columns were parsed)"""
648
- if self.projection_positions is not None:
649
- if self.leaf_offset < len(self.projection_positions):
650
- return None
651
- else:
652
- if self.leaf_offset < len(self.leaves):
653
- return None
694
+ if self.leaf_offset < len(self.leaves):
695
+ return None
654
696
 
655
697
  if self.debug:
656
698
  for node in self.nodes:
657
699
  node.debug_log()
658
700
 
659
- # sort resulting table according to the output field names
660
- projected_nodes = [node for node in self.nodes if node.is_projected]
661
- if output_field_names is not None:
662
- def key_func(projected_node):
663
- return output_field_names.index(projected_node.field.name)
664
- sorted_projected_nodes = sorted(projected_nodes, key=key_func)
665
- else:
666
- sorted_projected_nodes = projected_nodes
667
-
668
701
  result = pa.Table.from_arrays(
669
- arrays=[node.build() for node in sorted_projected_nodes],
670
- schema = pa.schema([node.projected_field for node in sorted_projected_nodes]))
671
- result.validate(full=True) # does expensive validation checks only if debug is enabled
702
+ arrays=[node.build() for node in self.nodes],
703
+ schema=self.arrow_schema)
704
+ result.validate(full=self.debug) # does expensive validation checks only if debug is enabled
672
705
  return result
673
706
 
674
707
  def _iter_nested_arrays(column: pa.Array) -> Iterator[pa.Array]:
@@ -693,7 +726,6 @@ def _parse_table_info(obj):
693
726
  return TableInfo(name, properties, handle, num_rows, used_bytes)
694
727
 
695
728
  def build_record_batch(column_info, column_values):
696
- _logger.info(f"column_info={column_info}")
697
729
  fields = [pa.field(column_name, column_type) for column_type, column_name in column_info]
698
730
  schema = pa.schema(fields)
699
731
  arrays = [pa.array(column_values[column_type], type=column_type) for column_type, _ in column_info]
@@ -706,56 +738,30 @@ def serialize_record_batch(batch):
706
738
  writer.write(batch)
707
739
  return sink.getvalue()
708
740
 
709
- def generate_ip_range(ip_range_str):
710
- start, end = ip_range_str.split(':')
711
- start_parts = start.split('.')
712
- start_last_part = int(start_parts[-1])
713
- end_parts = end.split('.')
714
- end_last_part = int(end_parts[-1])
715
- if start_last_part>=end_last_part or True in [start_parts[i] != end_parts[i] for i in range(3)]:
716
- raise ValueError(f'illegal ip range {ip_range_str}')
717
- num_ips = 1 + end_last_part - start_last_part
718
- ips = ['.'.join(start_parts[:-1] + [str(start_last_part + i)]) for i in range(num_ips)]
719
- return ips
720
-
721
- def parse_executor_hosts(host):
722
- executor_hosts_parsed = host.split(',')
723
- executor_hosts_parsed = [host.strip() for host in executor_hosts_parsed]
724
- executor_hosts = []
725
- for executor_host in executor_hosts_parsed:
726
- is_ip_range=False
727
- if ':' in executor_host:
728
- try:
729
- socket.inet_aton(executor_host.split(':')[0])
730
- socket.inet_aton(executor_host.split(':')[1])
731
- is_ip_range = True
732
- except:
733
- pass
734
- if is_ip_range:
735
- executor_hosts.extend(generate_ip_range(executor_host))
736
- else:
737
- executor_hosts.append(executor_host)
738
- return executor_hosts
741
+ # Results that returns from tablestats
742
+ TableStatsResult = namedtuple("TableStatsResult",["num_rows", "size_in_bytes", "is_external_rowid_alloc", "endpoints"])
739
743
 
740
744
  class VastdbApi:
741
- def __init__(self, host, access_key, secret_key, username=None, password=None, port=None,
745
+ # we expect the vast version to be <major>.<minor>.<patch>.<protocol>
746
+ VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')
747
+
748
+ def __init__(self, endpoint, access_key, secret_key, username=None, password=None,
742
749
  secure=False, auth_type=AuthType.SIGV4):
743
- executor_hosts = parse_executor_hosts(host)
744
- host = executor_hosts[0]
745
- self.host = host
750
+ url_dict = urllib3.util.parse_url(endpoint)._asdict()
746
751
  self.access_key = access_key
747
752
  self.secret_key = secret_key
748
753
  self.username = username
749
754
  self.password = password
750
- self.port = port
751
755
  self.secure = secure
752
756
  self.auth_type = auth_type
753
- self.executor_hosts = executor_hosts
757
+ self.executor_hosts = [endpoint] # TODO: remove
754
758
 
755
759
  username = username or ''
756
760
  password = password or ''
757
- if not port:
758
- port = 443 if secure else 80
761
+ if not url_dict['port']:
762
+ url_dict['port'] = 443 if secure else 80
763
+
764
+ self.port = url_dict['port']
759
765
 
760
766
  self.default_max_list_columns_page_size = 1000
761
767
  self.session = requests.Session()
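
The constructor now takes a single `endpoint` URL instead of `host`/`port` and uses urllib3 to parse it and fill in defaults. A small standalone sketch of that parsing with a made-up endpoint:

```python
import urllib3

secure = False  # mirrors the constructor argument above
url_dict = urllib3.util.parse_url("vip-pool.example.com")._asdict()
if not url_dict['port']:
    url_dict['port'] = 443 if secure else 80
if not url_dict['scheme']:
    url_dict['scheme'] = "https" if secure else "http"

print(urllib3.util.Url(**url_dict))  # -> http://vip-pool.example.com:80
```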
@@ -764,10 +770,10 @@ class VastdbApi:
764
770
  if auth_type == AuthType.BASIC:
765
771
  self.session.auth = requests.auth.HTTPBasicAuth(username, password)
766
772
  else:
767
- if port != 80 and port != 443:
768
- self.aws_host = f'{host}:{port}'
773
+ if url_dict['port'] != 80 and url_dict['port'] != 443:
774
+ self.aws_host = '{host}:{port}'.format(**url_dict)
769
775
  else:
770
- self.aws_host = f'{host}'
776
+ self.aws_host = '{host}'.format(**url_dict)
771
777
 
772
778
  self.session.auth = AWSRequestsAuth(aws_access_key=access_key,
773
779
  aws_secret_access_key=secret_key,
@@ -775,8 +781,34 @@ class VastdbApi:
775
781
  aws_region='us-east-1',
776
782
  aws_service='s3')
777
783
 
778
- proto = "https" if secure else "http"
779
- self.url = f"{proto}://{self.aws_host}"
784
+ if not url_dict['scheme']:
785
+ url_dict['scheme'] = "https" if secure else "http"
786
+
787
+ url = urllib3.util.Url(**url_dict)
788
+ self.url = str(url)
789
+ _logger.debug('url=%s aws_host=%s', self.url, self.aws_host)
790
+
791
+ # probe the cluster for its version
792
+ self.vast_version = None
793
+ res = self.session.options(self.url)
794
+ server_header = res.headers.get("Server")
795
+ if server_header is None:
796
+ _logger.error("OPTIONS response doesn't contain 'Server' header")
797
+ else:
798
+ _logger.debug("Server header is '%s'", server_header)
799
+ if m := self.VAST_VERSION_REGEX.match(server_header):
800
+ self.vast_version, = m.groups()
801
+ return
802
+ else:
803
+ _logger.error("'Server' header '%s' doesn't match the expected pattern", server_header)
804
+
805
+ msg = (
806
+ f'Please use `vastdb` <= 0.0.5.x with current VAST cluster version ("{server_header or "N/A"}"). '
807
+ 'To use the latest SDK, please upgrade your cluster to the latest service pack. '
808
+ 'Please contact customer.support@vastdata.com for more details.'
809
+ )
810
+ _logger.critical(msg)
811
+ raise NotImplementedError(msg)
780
812
 
781
813
  def update_mgmt_session(self, access_key: str, secret_key: str, auth_type=AuthType.SIGV4):
782
814
  if auth_type != AuthType.BASIC:
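
The OPTIONS probe above derives the cluster version from the `Server` response header, which is expected to look like `vast <major>.<minor>.<patch>.<protocol>`; otherwise the constructor raises and directs users back to `vastdb` <= 0.0.5.x. A standalone check of that regex with a made-up header value:

```python
import re

VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')  # same pattern as above

server_header = "vast 5.1.0.123"  # hypothetical Server header
if m := VAST_VERSION_REGEX.match(server_header):
    vast_version, = m.groups()
    print(vast_version)  # -> 5.1.0.123
else:
    print("incompatible cluster: fall back to vastdb <= 0.0.5.x")
```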
@@ -821,21 +853,9 @@ class VastdbApi:
821
853
  return common_headers
822
854
 
823
855
  def _check_res(self, res, cmd="", expected_retvals=[]):
824
- try:
825
- res.raise_for_status()
826
- if res.status_code != 200:
827
- if not res.status_code in expected_retvals:
828
- raise ValueError(f"Expected status code mismatch. status_code={res.status_code}")
829
- else:
830
- if not len(expected_retvals) == 0:
831
- raise ValueError(f"Expected {expected_retvals} but status_code={res.status_code}")
832
- return res
833
- except requests.HTTPError as e:
834
- if res.status_code in expected_retvals:
835
- _logger.info(f"{cmd} has failed as expected res={res}")
836
- return res
837
- else:
838
- raise e
856
+ if exc := errors.from_response(res):
857
+ raise exc
858
+ return res
839
859
 
840
860
  def create_schema(self, bucket, name, txid=0, client_tags=[], schema_properties="", expected_retvals=[]):
841
861
  """
@@ -975,7 +995,8 @@ class VastdbApi:
975
995
  return snapshots, is_truncated, marker
976
996
 
977
997
 
978
- def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[], topic_partitions=0):
998
+ def create_table(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[],
999
+ topic_partitions=0, create_imports_table=False):
979
1000
  """
980
1001
  Create a table, use the following request
981
1002
  POST /bucket/schema/table?table HTTP/1.1
@@ -984,18 +1005,21 @@ class VastdbApi:
984
1005
  tabular-txid: <integer> TransactionId
985
1006
  tabular-client-tag: <string> ClientTag
986
1007
 
987
- The body of the POST request contains table column properties as json
988
- {
989
- "format": "string",
990
- "column_names": {"name1":"type1", "name2":"type2", ...},
991
- "table_properties": {"key1":"val1", "key2":"val2", ...}
992
- }
1008
+ The body of the POST request contains table column properties as arrow schema
1009
+ which include field_name, field_type and properties
1010
+
1011
+ In order to create vastdb-imported-objects table that tracks all imported files and avoid duplicate imports,
1012
+ just set create_imports_table=True
1013
+ The request will look like:
1014
+ POST /bucket/schema/table?table&sub-table=vastdb-imported-objects HTTP/1.1
993
1015
  """
994
1016
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
995
1017
 
996
1018
  serialized_schema = arrow_schema.serialize()
997
1019
  headers['Content-Length'] = str(len(serialized_schema))
998
1020
  url_params = {'topic_partitions': str(topic_partitions)} if topic_partitions else {}
1021
+ if create_imports_table:
1022
+ url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
999
1023
 
1000
1024
  res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
1001
1025
  data=serialized_schema, headers=headers)
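
A hedged usage sketch of the extended `create_table` call: passing `create_imports_table=True` adds `sub-table=vastdb-imported-objects` to the request so the import-tracking sub-table is created alongside the table. The endpoint, credentials and bucket/schema/table names below are illustrative only:

```python
import pyarrow as pa

api = VastdbApi(endpoint="http://vip-pool.example.com",
                access_key="ACCESS_KEY", secret_key="SECRET_KEY")

api.create_table(
    bucket="mybucket",
    schema="myschema",
    name="mytable",
    arrow_schema=pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.utf8())]),
    create_imports_table=True,  # also creates the vastdb-imported-objects sub-table
)
```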
@@ -1015,7 +1039,6 @@ class VastdbApi:
1015
1039
  raise RuntimeError(f'invalid params parquet_path={parquet_path} parquet_bucket_name={parquet_bucket_name} parquet_object_name={parquet_object_name}')
1016
1040
 
1017
1041
  # Get the schema of the Parquet file
1018
- _logger.info(f'type(parquet_ds.schema) = {type(parquet_ds.schema)}')
1019
1042
  if isinstance(parquet_ds.schema, pq.ParquetSchema):
1020
1043
  arrow_schema = parquet_ds.schema.to_arrow_schema()
1021
1044
  elif isinstance(parquet_ds.schema, pa.Schema):
@@ -1038,13 +1061,27 @@ class VastdbApi:
1038
1061
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1039
1062
  res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=name, command="stats"), headers=headers)
1040
1063
  if res.status_code == 200:
1041
- res_headers = res.headers
1042
1064
  flatbuf = b''.join(res.iter_content(chunk_size=128))
1043
1065
  stats = get_table_stats.GetRootAs(flatbuf)
1044
1066
  num_rows = stats.NumRows()
1045
1067
  size_in_bytes = stats.SizeInBytes()
1046
1068
  is_external_rowid_alloc = stats.IsExternalRowidAlloc()
1047
- return num_rows, size_in_bytes, is_external_rowid_alloc
1069
+ endpoints = []
1070
+ if stats.VipsLength() == 0:
1071
+ endpoints.append(self.url)
1072
+ else:
1073
+ ip_cls = IPv6Address if (stats.AddressType() == "ipv6") else IPv4Address
1074
+ vips = [stats.Vips(i) for i in range(stats.VipsLength())]
1075
+ ips = []
1076
+ # extract the vips into list of IPs
1077
+ for vip in vips:
1078
+ start_ip = int(ip_cls(vip.StartAddress().decode()))
1079
+ ips.extend(ip_cls(start_ip + i) for i in range(vip.AddressCount()))
1080
+ for ip in ips:
1081
+ prefix = "http" if not self.secure else "https"
1082
+ endpoints.append(f"{prefix}://{str(ip)}:{self.port}")
1083
+ return TableStatsResult(num_rows, size_in_bytes, is_external_rowid_alloc, endpoints)
1084
+
1048
1085
  return self._check_res(res, "get_table_stats", expected_retvals)
1049
1086
 
1050
1087
  def alter_table(self, bucket, schema, name, txid=0, client_tags=[], table_properties="",
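
`get_table_stats` now returns a `TableStatsResult` namedtuple whose `endpoints` field expands each returned VIP range into per-node URLs. The expansion is plain integer arithmetic on `ipaddress` objects; a small illustration with a made-up range of four addresses:

```python
from ipaddress import IPv4Address

start_address, address_count = "172.16.0.10", 4  # one hypothetical VipRange entry
start_ip = int(IPv4Address(start_address))
endpoints = [f"http://{IPv4Address(start_ip + i)}:80" for i in range(address_count)]
print(endpoints)
# ['http://172.16.0.10:80', 'http://172.16.0.11:80',
#  'http://172.16.0.12:80', 'http://172.16.0.13:80']
```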
@@ -1071,22 +1108,26 @@ class VastdbApi:
1071
1108
 
1072
1109
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1073
1110
  headers['Content-Length'] = str(len(alter_table_req))
1074
- url_params = {'tabular-new-table-name': new_name} if len(new_name) else {}
1111
+ url_params = {'tabular-new-table-name': schema + "/" + new_name} if len(new_name) else {}
1075
1112
 
1076
1113
  res = self.session.put(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
1077
1114
  data=alter_table_req, headers=headers)
1078
1115
 
1079
1116
  return self._check_res(res, "alter_table", expected_retvals)
1080
1117
 
1081
- def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[]):
1118
+ def drop_table(self, bucket, schema, name, txid=0, client_tags=[], expected_retvals=[], remove_imports_table=False):
1082
1119
  """
1083
1120
  DELETE /mybucket/schema_path/mytable?table HTTP/1.1
1084
1121
  tabular-txid: TransactionId
1085
1122
  tabular-client-tag: ClientTag
1123
+
1124
+ To remove the internal vastdb-imported-objects table just set remove_imports_table=True
1086
1125
  """
1087
1126
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1127
+ url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if remove_imports_table else {}
1088
1128
 
1089
- res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table"), headers=headers)
1129
+ res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=name, command="table", url_params=url_params),
1130
+ headers=headers)
1090
1131
  return self._check_res(res, "drop_table", expected_retvals)
1091
1132
 
1092
1133
  def list_tables(self, bucket, schema, txid=0, client_tags=[], max_keys=1000, next_key=0, name_prefix="",
@@ -1210,7 +1251,7 @@ class VastdbApi:
1210
1251
 
1211
1252
  def list_columns(self, bucket, schema, table, *, txid=0, client_tags=None, max_keys=None, next_key=0,
1212
1253
  count_only=False, name_prefix="", exact_match=False,
1213
- expected_retvals=None, bc_list_internals=False):
1254
+ expected_retvals=None, bc_list_internals=False, list_imports_table=False):
1214
1255
  """
1215
1256
  GET /mybucket/myschema/mytable?columns HTTP/1.1
1216
1257
  tabular-txid: TransactionId
@@ -1218,6 +1259,8 @@ class VastdbApi:
1218
1259
  x-tabluar-name-prefix: TableNamePrefix
1219
1260
  tabular-max-keys: 1000
1220
1261
  tabular-next-key: NextColumnId
1262
+
1263
+ To list the columns of the internal vastdb-imported-objects table, set list_import_table=True
1221
1264
  """
1222
1265
  max_keys = max_keys or self.default_max_list_columns_page_size
1223
1266
  client_tags = client_tags or []
@@ -1235,7 +1278,9 @@ class VastdbApi:
1235
1278
  else:
1236
1279
  headers['tabular-name-prefix'] = name_prefix
1237
1280
 
1238
- res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="column"),
1281
+ url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if list_imports_table else {}
1282
+ res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="column",
1283
+ url_params=url_params),
1239
1284
  headers=headers, stream=True)
1240
1285
  self._check_res(res, "list_columns", expected_retvals)
1241
1286
  if res.status_code == 200:
@@ -1247,9 +1292,7 @@ class VastdbApi:
1247
1292
  if not count_only:
1248
1293
  schema_buf = b''.join(res.iter_content(chunk_size=128))
1249
1294
  schema_out = pa.ipc.open_stream(schema_buf).schema
1250
- # _logger.info(f"schema={schema_out}")
1251
- for f in schema_out:
1252
- columns.append([f.name, f.type, f.metadata, f])
1295
+ columns = schema_out
1253
1296
 
1254
1297
  return columns, next_key, is_truncated, count
1255
1298
 
@@ -1296,7 +1339,7 @@ class VastdbApi:
1296
1339
  return self._check_res(res, "get_transaction", expected_retvals)
1297
1340
 
1298
1341
  def select_row_ids(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[],
1299
- retry_count=0, enable_sorted_projections=False):
1342
+ retry_count=0, enable_sorted_projections=True):
1300
1343
  """
1301
1344
  POST /mybucket/myschema/mytable?query-data=SelectRowIds HTTP/1.1
1302
1345
  """
@@ -1313,7 +1356,7 @@ class VastdbApi:
1313
1356
  return self._check_res(res, "query_data", expected_retvals)
1314
1357
 
1315
1358
  def read_columns_data(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
1316
- retry_count=0, enable_sorted_projections=False):
1359
+ retry_count=0, enable_sorted_projections=True):
1317
1360
  """
1318
1361
  POST /mybucket/myschema/mytable?query-data=ReadColumns HTTP/1.1
1319
1362
  """
@@ -1329,7 +1372,7 @@ class VastdbApi:
1329
1372
  return self._check_res(res, "query_data", expected_retvals)
1330
1373
 
1331
1374
  def count_rows(self, bucket, schema, table, params, txid=0, client_tags=[], expected_retvals=[], tenant_guid=None,
1332
- retry_count=0, enable_sorted_projections=False):
1375
+ retry_count=0, enable_sorted_projections=True):
1333
1376
  """
1334
1377
  POST /mybucket/myschema/mytable?query-data=CountRows HTTP/1.1
1335
1378
  """
@@ -1343,27 +1386,9 @@ class VastdbApi:
1343
1386
  data=params, headers=headers, stream=True)
1344
1387
  return self._check_res(res, "query_data", expected_retvals)
1345
1388
 
1346
- def query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1347
- txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
1348
- search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
1349
- request_format='string', response_format='string'):
1350
- """
1351
- GET /mybucket/myschema/mytable?data HTTP/1.1
1352
- Content-Length: ContentLength
1353
- tabular-txid: TransactionId
1354
- tabular-client-tag: ClientTag
1355
- tabular-split: "split_id,total_splits,num_row_groups_per_split"
1356
- tabular-num-of-subsplits: "total"
1357
- tabular-request-format: "string"
1358
- tabular-response-format: "string" #arrow/trino
1359
- tabular-schedule-id: "schedule-id"
1360
-
1361
- Request Body (flatbuf)
1362
- projections_chunk [expressions]
1363
- predicate_chunk "formatted_data", (required)
1364
-
1365
- """
1366
- # add query option select-only and read-only
1389
+ def _build_query_data_headers(self, txid, client_tags, params, split, num_sub_splits, request_format, response_format,
1390
+ enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
1391
+ sub_split_start_row_ids):
1367
1392
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
1368
1393
  headers['Content-Length'] = str(len(params))
1369
1394
  headers['tabular-split'] = ','.join(map(str, split))
@@ -1388,439 +1413,80 @@ class VastdbApi:
1388
1413
  for sub_split_id, start_row_id in sub_split_start_row_ids:
1389
1414
  headers[f'tabular-start-row-id-{sub_split_id}'] = f"{sub_split_id},{start_row_id}"
1390
1415
 
1391
- url_params = {'name': projection} if projection else {}
1416
+ return headers
1392
1417
 
1393
- res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data", url_params=url_params),
1394
- data=params, headers=headers, stream=True)
1395
- return self._check_res(res, "query_data", expected_retvals)
1418
+ def _build_query_data_url_params(self, projection, query_imports_table):
1419
+ if query_imports_table and projection:
1420
+ raise ValueError("Can't query both imports and projection table")
1396
1421
 
1397
- def _list_table_columns(self, bucket, schema, table, filters=None, field_names=None, txid=0):
1398
- # build a list of the queried column names
1399
- queried_columns = []
1400
- # get all columns from the table
1401
- all_listed_columns = []
1402
- next_key = 0
1403
- while True:
1404
- cur_columns, next_key, is_truncated, count = self.list_columns(
1405
- bucket=bucket, schema=schema, table=table, next_key=next_key, txid=txid)
1406
- if not cur_columns:
1407
- break
1408
- all_listed_columns.extend(cur_columns)
1409
- if not is_truncated:
1410
- break
1411
-
1412
- # build a list of the queried columns
1413
- queried_column_names = set()
1414
- if filters:
1415
- filtered_column_names = ([column_name.split('.')[0] for column_name in filters.keys()]) # use top level of the filter column names
1416
- queried_column_names.update(filtered_column_names)
1417
- _logger.debug(f"_list_table_columns: filtered_column_names={filtered_column_names}")
1418
-
1419
- if field_names:
1420
- field_column_names = ([column_name.split('.')[0] for column_name in field_names]) # use top level of the field column names
1421
- else:
1422
- field_column_names = [column[0] for column in all_listed_columns]
1423
- _logger.debug(f"_list_table_columns: field_column_names={field_column_names}")
1424
- queried_column_names.update(field_column_names)
1425
-
1426
- all_listed_column_and_leaves_names = set()
1427
- for column in all_listed_columns:
1428
- # Collect the column and leaves names for verification below that all the filters and field names are in the table
1429
- column_and_leaves_names = [column[0]] + [f.name for f in column[3].flatten()]
1430
- all_listed_column_and_leaves_names.update(column_and_leaves_names)
1431
-
1432
- # check if this column is needed for the query
1433
- if column[0] in queried_column_names:
1434
- queried_columns.append(column)
1435
-
1436
- # verify that all the filters and field names are in the table
1437
- if filters:
1438
- for filter_column_name in filters.keys():
1439
- if filter_column_name not in all_listed_column_and_leaves_names:
1440
- raise KeyError((f'filter column name: {filter_column_name} does not appear in the table'))
1441
- if field_names:
1442
- for field_name in field_names:
1443
- if field_name not in all_listed_column_and_leaves_names:
1444
- raise ValueError((f'field name: {field_name} does not appear in the table'))
1445
- return list(queried_columns)
1446
-
1447
- def _begin_tx_if_necessary(self, txid):
1448
- if not txid:
1449
- created_txid = True
1450
- res = self.begin_transaction()
1451
- txid = res.headers.get('tabular-txid')
1452
- else:
1453
- created_txid = False
1422
+ url_params = {}
1423
+ if query_imports_table:
1424
+ url_params['sub-table'] = IMPORTED_OBJECTS_TABLE_NAME
1425
+ elif projection:
1426
+ url_params['name'] = projection
1427
+ return url_params
1454
1428
 
1455
- return txid, created_txid
1429
+ def legacy_query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1430
+ txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
1431
+ search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
1432
+ request_format='string', response_format='string', query_imports_table=False):
1433
+ """
1434
+ POST /mybucket/myschema/mytable?query-data=LegacyQueryData HTTP/1.1
1435
+ Content-Length: ContentLength
1436
+ tabular-txid: TransactionId
1437
+ tabular-client-tag: ClientTag
1438
+ tabular-split: "split_id,total_splits,num_row_groups_per_split"
1439
+ tabular-num-of-subsplits: "total"
1440
+ tabular-request-format: "string"
1441
+ tabular-response-format: "string" #arrow/trino
1442
+ tabular-schedule-id: "schedule-id"
1443
+
1444
+ Request Body (flatbuf)
1445
+ projections_chunk [expressions]
1446
+ predicate_chunk "formatted_data", (required)
1447
+
1448
+ """
1449
+ headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
1450
+ enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
1451
+ sub_split_start_row_ids)
1452
+ url_params = self._build_query_data_url_params(projection, query_imports_table)
1453
+
1454
+ res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="query-data=LegacyQueryData",
1455
+ url_params=url_params), data=params, headers=headers, stream=True)
1456
+ return self._check_res(res, "legacy_query_data", expected_retvals)
1456
1457
 
1457
- def _prepare_query(self, bucket, schema, table, num_sub_splits, filters=None, field_names=None,
1458
- queried_columns=None, response_row_id=False, txid=0):
1459
- queried_fields = []
1460
- if response_row_id:
1461
- queried_fields.append(pa.field('$row_id', pa.uint64()))
1458
+ def query_data(self, bucket, schema, table, params, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1459
+ txid=0, client_tags=[], expected_retvals=[], limit_rows=0, schedule_id=None, retry_count=0,
1460
+ search_path=None, sub_split_start_row_ids=[], tenant_guid=None, projection='', enable_sorted_projections=True,
1461
+ request_format='string', response_format='string', query_imports_table=False):
1462
+ """
1463
+ GET /mybucket/myschema/mytable?data HTTP/1.1
1464
+ Content-Length: ContentLength
1465
+ tabular-txid: TransactionId
1466
+ tabular-client-tag: ClientTag
1467
+ tabular-split: "split_id,total_splits,num_row_groups_per_split"
1468
+ tabular-num-of-subsplits: "total"
1469
+ tabular-request-format: "string"
1470
+ tabular-response-format: "string" #arrow/trino
1471
+ tabular-schedule-id: "schedule-id"
1462
1472
 
1463
- if not queried_columns:
1464
- queried_columns = self._list_table_columns(bucket, schema, table, filters, field_names, txid=txid)
1473
+ Request Body (flatbuf)
1474
+ projections_chunk [expressions]
1475
+ predicate_chunk "formatted_data", (required)
1465
1476
 
1466
- queried_fields.extend(pa.field(column[0], column[1]) for column in queried_columns)
1467
- arrow_schema = pa.schema(queried_fields)
1477
+ To query the internal vastdb-imported-objects table, set query_imports_table=True
1478
+ """
1479
+ # add query option select-only and read-only
1468
1480
 
1469
- _logger.debug(f'_prepare_query: arrow_schema = {arrow_schema}')
1481
+ headers = self._build_query_data_headers(txid, client_tags, params, split, num_sub_splits, request_format, response_format,
1482
+ enable_sorted_projections, limit_rows, schedule_id, retry_count, search_path, tenant_guid,
1483
+ sub_split_start_row_ids)
1470
1484
 
1471
- query_data_request = build_query_data_request(schema=arrow_schema, filters=filters, field_names=field_names)
1472
- if self.executor_hosts:
1473
- executor_hosts = self.executor_hosts
1474
- else:
1475
- executor_hosts = [self.host]
1476
- executor_sessions = [VastdbApi(executor_hosts[i], self.access_key, self.secret_key, self.username,
1477
- self.password, self.port, self.secure, self.auth_type) for i in range(len(executor_hosts))]
1478
-
1479
- return queried_columns, arrow_schema, query_data_request, executor_sessions
1480
-
1481
- def _more_pages_exist(self, start_row_ids):
1482
- for row_id in start_row_ids.values():
1483
- if row_id != TABULAR_INVALID_ROW_ID:
1484
- return True
1485
- return False
1486
-
1487
- def _query_page(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1488
- txid=0, limit_rows=0, sub_split_start_row_ids=[], filters=None, field_names=None):
1489
- res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
1490
- num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
1491
- limit_rows=limit_rows, sub_split_start_row_ids=sub_split_start_row_ids)
1492
- start_row_ids = {}
1493
- sub_split_tables = parse_query_data_response(res.raw, query_data_request.response_schema,
1494
- start_row_ids=start_row_ids)
1495
- table_page = pa.concat_tables(sub_split_tables)
1496
- _logger.info("query_page: table_page num_rows=%s start_row_ids len=%s",
1497
- len(table_page), len(start_row_ids))
1498
-
1499
- return table_page, start_row_ids
1500
-
1501
- def _query_page_iterator(self, bucket, schema, table, query_data_request, split=(0, 1, 8), num_sub_splits=1, response_row_id=False,
1502
- txid=0, limit_rows=0, start_row_ids={}, filters=None, field_names=None):
1503
- res = self.query_data(bucket=bucket, schema=schema, table=table, params=query_data_request.serialized, split=split,
1504
- num_sub_splits=num_sub_splits, response_row_id=response_row_id, txid=txid,
1505
- limit_rows=limit_rows, sub_split_start_row_ids=start_row_ids.items())
1506
- for sub_split_table in parse_query_data_response(res.raw, query_data_request.response_schema,
1507
- start_row_ids=start_row_ids):
1508
- for record_batch in sub_split_table.to_batches():
1509
- yield record_batch
1510
- _logger.info(f"query_page_iterator: start_row_ids={start_row_ids}")
1511
-
1512
- def query_iterator(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
1513
- response_row_id=False, txid=0, limit_per_sub_split=128*1024, filters=None, field_names=None):
1514
- """
1515
- query rows into a table.
1516
-
1517
- Parameters
1518
- ----------
1519
- bucket : string
1520
- The bucket of the table.
1521
- schema : string
1522
- The schema of the table.
1523
- table : string
1524
- The table name.
1525
- num_sub_splits : integer
1526
- The number of sub_splits per split - determines the parallelism inside a VastDB compute node
1527
- default: 1
1528
- num_row_groups_per_sub_split : integer
1529
- The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
1530
- default: 8
1531
- response_row_id : boolean
1532
- Return a column with the internal row ids of the table
1533
- default: False
1534
- txid : integer
1535
- A transaction id. The transaction may be initiated before the query, and if not, the query will initiate it
1536
- default: 0 (will be created by the api)
1537
- limit_per_sub_split : integer
1538
- Limit the number of rows from a single sub_split for a single rpc
1539
- default:131072
1540
- filters : dict
1541
- A dictionary whose keys are column names, and values are lists of string expressions that represent
1542
- filter conditions on the column. AND is applied on the conditions. The condition formats are:
1543
- 'column_name eq some_value'
1544
- default: None
1545
- field_names : list
1546
- A list of column names to be returned in the output table
1547
- default: None
1548
-
1549
- Returns
1550
- -------
1551
- Query iterator generator
1552
-
1553
- Yields
1554
- ------
1555
- pyarrow.RecordBatch
1556
-
1557
- Examples
1558
- --------
1559
- for record_batch in query_iterator('some_bucket', 'some_schema', 'some_table',
1560
- filters={'name': ['eq Alice', 'eq Bob']}
1561
- field_names=['name','age']):
1562
- ...
1563
-
1564
- """
1565
-
1566
- # create a transaction if necessary
1567
- txid, created_txid = self._begin_tx_if_necessary(txid)
1568
- executor_sessions = []
1485
+ url_params = self._build_query_data_url_params(projection, query_imports_table)
1569
1486
 
1570
- try:
1571
- # prepare query
1572
- queried_columns, arrow_schema, query_data_request, executor_sessions = \
1573
- self._prepare_query(bucket, schema, table, num_sub_splits, filters, field_names, response_row_id=response_row_id, txid=txid)
1574
-
1575
- # define the per split threaded query func
1576
- def query_iterator_split_id(self, split_id):
1577
- _logger.info(f"query_iterator_split_id: split_id={split_id}")
1578
- try:
1579
- start_row_ids = {i:0 for i in range(num_sub_splits)}
1580
- session = executor_sessions[split_id]
1581
- while not next_sems[split_id].acquire(timeout=1):
1582
- # check if killed externally
1583
- if killall:
1584
- raise RuntimeError(f'query_iterator_split_id: split_id {split_id} received killall')
1585
-
1586
- while self._more_pages_exist(start_row_ids):
1587
- for record_batch in session._query_page_iterator(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
1588
- split=(split_id, num_splits, num_row_groups_per_sub_split),
1589
- num_sub_splits=num_sub_splits, response_row_id=response_row_id,
1590
- txid=txid, limit_rows=limit_per_sub_split,
1591
- start_row_ids=start_row_ids):
1592
- output_queue.put((split_id, record_batch))
1593
- while not next_sems[split_id].acquire(timeout=1): # wait for the main thread to request the next record batch
1594
- if killall:
1595
- raise RuntimeError(f'split_id {split_id} received killall')
1596
- # end of split
1597
- output_queue.put((split_id,None))
1598
-
1599
- except Exception as e:
1600
- _logger.exception('query_iterator_split_id: exception occurred')
1601
- try:
1602
- self.rollback_transaction(txid)
1603
- except:
1604
- _logger.exception(f'failed to rollback txid {txid}')
1605
- error_queue.put(None)
1606
- raise e
1607
-
1608
- # kickoff executors
1609
- num_splits = len(executor_sessions)
1610
- output_queue = queue.Queue()
1611
- error_queue = queue.Queue()
1612
- next_sems = [threading.Semaphore(value=1) for i in range(num_splits)]
1613
- killall = False
1614
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
1615
- # start executors
1616
- futures = []
1617
- for i in range(num_splits):
1618
- futures.append(executor.submit(query_iterator_split_id, self, i))
1619
-
1620
- # receive outputs and yield them
1621
- done_count = 0
1622
- while done_count < num_splits:
1623
- # check for errors
1624
- try:
1625
- error_queue.get(block=False)
1626
- _logger.error('received error from a thread')
1627
- killall = True
1628
- # wait for all executors to complete
1629
- for future in concurrent.futures.as_completed(futures):
1630
- try:
1631
- future.result() # trigger an exception if occurred in any thread
1632
- except Exception:
1633
- _logger.exception('exception occurred')
1634
- raise RuntimeError('received error from a thread')
1635
- except queue.Empty:
1636
- pass
1637
-
1638
- # try to get a value from the output queue
1639
- try:
1640
- (split_id, record_batch) = output_queue.get(timeout=1)
1641
- except queue.Empty:
1642
- continue
1643
-
1644
- if record_batch:
1645
- # signal to the thread to read the next record batch and yield the current
1646
- next_sems[split_id].release()
1647
- try:
1648
- yield record_batch
1649
- except GeneratorExit:
1650
- killall = True
1651
- _logger.debug("cancelling query_iterator")
1652
- raise
1653
- else:
1654
- done_count += 1
1655
-
1656
- # wait for all executors to complete
1657
- for future in concurrent.futures.as_completed(futures):
1658
- try:
1659
- future.result() # trigger an exception if occurred in any thread
1660
- except Exception:
1661
- _logger.exception('exception occurred')
1662
-
1663
- # commit if needed
1664
- if created_txid:
1665
- self.commit_transaction(txid)
1666
-
1667
- except Exception as e:
1668
- _logger.exception('exception occurred')
1669
- try:
1670
- self.rollback_transaction(txid)
1671
- except:
1672
- _logger.exception(f'failed to rollback txid {txid}')
1673
- raise e
1674
-
1675
- finally:
1676
- killall = True
1677
- for session in executor_sessions:
1678
- try:
1679
- session.session.close()
1680
- except Exception:
1681
- _logger.exception(f'failed to close session {session}')
1682
-
1683
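The removed query_iterator() above fans the query out across one worker thread per split and throttles each worker with a per-split semaphore, so the consumer of the generator controls how fast new pages are fetched. A minimal self-contained sketch of that pattern, using toy producers instead of VAST calls (the real code additionally propagates errors through a separate queue and cancels workers via a killall flag, which this sketch omits):

    import concurrent.futures
    import queue
    import threading

    def bounded_fanout(producers):
        # One worker per "split"; each waits on its own semaphore before
        # producing the next item, so the consumer paces the producers.
        out = queue.Queue()
        sems = [threading.Semaphore(value=1) for _ in producers]

        def run(i, produce):
            for item in produce():
                sems[i].acquire()      # wait until the consumer asks for more
                out.put((i, item))
            out.put((i, None))         # end-of-split marker

        with concurrent.futures.ThreadPoolExecutor(max_workers=len(producers)) as ex:
            for i, produce in enumerate(producers):
                ex.submit(run, i, produce)
            done = 0
            while done < len(producers):
                i, item = out.get()
                if item is None:
                    done += 1
                    continue
                sems[i].release()      # let split i fetch its next page
                yield item

    # Toy usage: two splits, each yielding a few "pages".
    pages = list(bounded_fanout([lambda: iter(['a1', 'a2']), lambda: iter(['b1'])]))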
- def query(self, bucket, schema, table, num_sub_splits=1, num_row_groups_per_sub_split=8,
1684
- response_row_id=False, txid=0, limit=0, limit_per_sub_split=131072, filters=None, field_names=None,
1685
- queried_columns=None):
1686
- """
1687
- Query rows from a table.
1688
-
1689
- Parameters
1690
- ----------
1691
- bucket : string
1692
- The bucket of the table.
1693
- schema : string
1694
- The schema of the table.
1695
- table : string
1696
- The table name.
1697
- num_sub_splits : integer
1698
- The number of sub_splits per split - determines the parallelism inside a VastDB compute node
1699
- default: 1
1700
- num_row_groups_per_sub_split : integer
1701
- The number of consecutive row groups per sub_split. Each row group consists of 64K row ids.
1702
- default: 8
1703
- response_row_id : boolean
1704
- Return a column with the internal row ids of the table
1705
- default: False
1706
- txid : integer
1707
- A transaction id. The transaction may be initiated before the query, and can be used to group
1708
- multiple ACID operations
1709
- default: 0 (will be created by the api)
1710
- limit : integer
1711
- Limit the number of rows in the response
1712
- default: 0 (no limit)
1713
- limit_per_sub_split : integer
1714
- Limit the number of rows from a single sub_split for a single rpc
1715
- default:131072
1716
- filters : dict
1717
- A dictionary whose keys are column names, and values are lists of string expressions that represent
1718
- filter conditions on the column. AND is applied on the conditions. The condition formats are:
1719
- 'column_name eq some_value'
1720
- default: None
1721
- field_names : list
1722
- A list of column names to be returned in the output table
1723
- default: None
1724
- queried_columns : list of pyarrow columns
1725
- A list of the columns to be queried
1726
- default: None
1727
-
1728
- Returns
1729
- -------
1730
- pyarrow.Table
1731
-
1732
-
1733
- Examples
1734
- --------
1735
- table = query('some_bucket', 'some_schema', 'some_table',
1736
- filters={'name': ['eq Alice', 'eq Bob']},
1737
- field_names=['name','age'])
1738
-
1739
- """
1740
-
1741
- # create a transaction
1742
- txid, created_txid = self._begin_tx_if_necessary(txid)
1743
- executor_sessions = []
1744
- try:
1745
- # prepare query
1746
- queried_columns, arrow_schema, query_data_request, executor_sessions = \
1747
- self._prepare_query(bucket, schema, table, num_sub_splits, filters, field_names, response_row_id=response_row_id, txid=txid)
1748
-
1749
- # define the per split threaded query func
1750
- def query_split_id(self, split_id):
1751
- try:
1752
- start_row_ids = {i:0 for i in range(num_sub_splits)}
1753
- session = executor_sessions[split_id]
1754
- row_count = 0
1755
- while (self._more_pages_exist(start_row_ids) and
1756
- (not limit or row_count < limit)):
1757
- # check if killed externally
1758
- if killall:
1759
- raise RuntimeError(f'query_split_id: split_id {split_id} received killall')
1760
-
1761
- # determine the limit rows
1762
- if limit:
1763
- limit_rows = min(limit_per_sub_split, limit-row_count)
1764
- else:
1765
- limit_rows = limit_per_sub_split
1766
-
1767
- # query one page
1768
- table_page, start_row_ids = session._query_page(bucket=bucket, schema=schema, table=table, query_data_request=query_data_request,
1769
- split=(split_id, num_splits, num_row_groups_per_sub_split),
1770
- num_sub_splits=num_sub_splits, response_row_id=response_row_id,
1771
- txid=txid, limit_rows=limit_rows,
1772
- sub_split_start_row_ids=start_row_ids.items())
1773
- with lock:
1774
- table_pages.append(table_page)
1775
- row_counts[split_id] += len(table_page)
1776
- row_count = sum(row_counts)
1777
- _logger.info(f"query_split_id: table_pages split_id={split_id} row_count={row_count}")
1778
- except Exception as e:
1779
- _logger.exception('query_split_id: exception occurred')
1780
- try:
1781
- self.rollback_transaction(txid)
1782
- except:
1783
- _logger.exception(f'failed to rollback txid {txid}')
1784
- raise e
1785
-
1786
- table_pages = []
1787
- num_splits = len(executor_sessions)
1788
- killall = False
1789
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
1790
- futures = []
1791
- row_counts = [0] * num_splits
1792
- lock = threading.Lock()
1793
- for i in range(num_splits):
1794
- futures.append(executor.submit(query_split_id, self, i))
1795
- for future in concurrent.futures.as_completed(futures):
1796
- future.result() # trigger an exception if occurred in any thread
1797
-
1798
- # commit if needed
1799
- if created_txid:
1800
- self.commit_transaction(txid)
1801
-
1802
- # concatenate all table pages and return result
1803
- out_table = pa.concat_tables(table_pages)
1804
- out_table = out_table.slice(length=limit) if limit else out_table
1805
- _logger.info("query: out_table len=%s row_count=%s",
1806
- len(out_table), len(out_table))
1807
- return out_table
1808
-
1809
- except Exception as e:
1810
- _logger.exception('exception occurred')
1811
- try:
1812
- self.rollback_transaction(txid)
1813
- except:
1814
- _logger.exception(f'failed to rollback txid {txid}')
1815
- raise e
1816
-
1817
- finally:
1818
- killall = True
1819
- for session in executor_sessions:
1820
- try:
1821
- session.session.close()
1822
- except Exception:
1823
- _logger.exception(f'failed to close session {session}')
1487
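The removed query() above collects one pyarrow.Table per fetched page into table_pages and builds the final result by concatenating them and applying the optional row limit. A minimal sketch of that last step, using made-up pages:

    import pyarrow as pa

    table_pages = [
        pa.table({'name': ['Alice', 'Bob'], 'age': [25, 24]}),
        pa.table({'name': ['Carol'], 'age': [31]}),
    ]
    limit = 2
    out_table = pa.concat_tables(table_pages)
    out_table = out_table.slice(length=limit) if limit else out_table
    assert len(out_table) == limit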
+ res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data", url_params=url_params),
1488
+ data=params, headers=headers, stream=True)
1489
+ return self._check_res(res, "query_data", expected_retvals)
1824
1490
 
1825
1491
  """
1826
1492
  source_files: list of (bucket_name, file_name)
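For reference, the source_files argument described above is a list of (bucket_name, file_name) pairs; the bucket and object names below are hypothetical:

    source_files = [
        ('source-bucket', '/staging/part-0000.parquet'),
        ('source-bucket', '/staging/part-0001.parquet'),
    ]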
@@ -1874,21 +1540,22 @@ class VastdbApi:
1874
1540
  builder.Finish(params)
1875
1541
  import_req = builder.Output()
1876
1542
 
1877
- def iterate_over_import_data_response(response, expected_retvals):
1543
+ def iterate_over_import_data_response(response):
1878
1544
  if response.status_code != 200:
1879
1545
  return response
1880
1546
 
1881
1547
  chunk_size = 1024
1882
- for chunk in res.iter_content(chunk_size=chunk_size):
1548
+ for chunk in response.iter_content(chunk_size=chunk_size):
1883
1549
  chunk_dict = json.loads(chunk)
1884
- _logger.info(f"import data chunk={chunk}, result: {chunk_dict['res']}")
1885
- if chunk_dict['res'] in expected_retvals:
1886
- _logger.info(f"import finished with expected result={chunk_dict['res']}, error message: {chunk_dict['err_msg']}")
1887
- return response
1888
- elif chunk_dict['res'] != 'Success' and chunk_dict['res'] != 'TabularInProgress':
1889
- raise TabularException(f"Received unexpected error in import_data. "
1890
- f"status: {chunk_dict['res']}, error message: {chunk_dict['err_msg']}")
1891
- _logger.info(f"import_data is in progress. status: {chunk_dict['res']}")
1550
+ _logger.debug("import data chunk=%s, result: %s", chunk_dict, chunk_dict['res'])
1551
+ if chunk_dict['res'] != 'Success' and chunk_dict['res'] != 'TabularInProgress' and chunk_dict['res'] != 'TabularAlreadyImported':
1552
+ raise errors.ImportFilesError(
1553
+ f"Encountered an error during import_data. status: {chunk_dict['res']}, "
1554
+ f"error message: {chunk_dict['err_msg'] or 'Unexpected error'} during import of "
1555
+ f"object name: {chunk_dict['object_name']}", chunk_dict)
1556
+ else:
1557
+ _logger.debug("import_data of object name '%s' is in progress. "
1558
+ "status: %s", chunk_dict['object_name'], chunk_dict['res'])
1892
1559
  return response
1893
1560
 
1894
1561
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
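The streamed response consumed by iterate_over_import_data_response() above is a sequence of small JSON chunks. The field names below are taken from that code; the values are illustrative only:

    import json

    chunk = b'{"res": "TabularInProgress", "err_msg": "", "object_name": "part-0000.parquet"}'
    chunk_dict = json.loads(chunk)
    assert chunk_dict['res'] in ('Success', 'TabularInProgress', 'TabularAlreadyImported')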
@@ -1901,34 +1568,17 @@ class VastdbApi:
1901
1568
  res = self.session.post(self._api_prefix(bucket=bucket, schema=schema, table=table, command="data"),
1902
1569
  data=import_req, headers=headers, stream=True)
1903
1570
  if blocking:
1904
- res = iterate_over_import_data_response(res, expected_retvals)
1571
+ res = iterate_over_import_data_response(res)
1905
1572
 
1906
1573
  return self._check_res(res, "import_data", expected_retvals)
1907
1574
 
1908
- def merge_data(self):
1909
- """
1910
- TODO
1911
-
1912
- POST /mybucket/myschema/mytable?data HTTP/1.1
1913
- Content-Length: ContentLength
1914
- tabular-txid: TransactionId
1915
- tabular-client-tag: ClientTag
1916
-
1917
- Request Body
1918
- {
1919
- "format": "string",
1920
- "select_source": "formatted data"
1921
- "predicate": "formatted_data"
1922
- }
1923
- """
1924
- pass
1925
-
1926
1575
  def _record_batch_slices(self, batch, rows_per_slice=None):
1927
1576
  max_slice_size_in_bytes = int(0.9*5*1024*1024) # 0.9 * 5MB
1928
1577
  batch_len = len(batch)
1929
1578
  serialized_batch = serialize_record_batch(batch)
1930
1579
  batch_size_in_bytes = len(serialized_batch)
1931
- _logger.info(f'max_slice_size_in_bytes={max_slice_size_in_bytes} batch_len={batch_len} batch_size_in_bytes={batch_size_in_bytes}')
1580
+ _logger.debug('max_slice_size_in_bytes=%d batch_len=%d batch_size_in_bytes=%d',
1581
+ max_slice_size_in_bytes, batch_len, batch_size_in_bytes)
1932
1582
 
1933
1583
  if not rows_per_slice:
1934
1584
  if batch_size_in_bytes < max_slice_size_in_bytes:
@@ -1950,7 +1600,7 @@ class VastdbApi:
1950
1600
  serialized_slice_batch = serialize_record_batch(slice_batch)
1951
1601
  sizeof_serialized_slice_batch = len(serialized_slice_batch)
1952
1602
 
1953
- if sizeof_serialized_slice_batch <= max_slice_size_in_bytes or rows_per_slice < 10000:
1603
+ if sizeof_serialized_slice_batch <= max_slice_size_in_bytes:
1954
1604
  serialized_slices.append(serialized_slice_batch)
1955
1605
  else:
1956
1606
  _logger.info(f'Using rows_per_slice {rows_per_slice} slice {i} size {sizeof_serialized_slice_batch} exceeds {max_slice_size_in_bytes} bytes, trying smaller rows_per_slice')
@@ -1964,125 +1614,6 @@ class VastdbApi:
1964
1614
 
1965
1615
  return serialized_slices
1966
1616
 
1967
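_record_batch_slices() above caps each serialized slice at roughly 4.5 MB (0.9 x 5 MB) and retries with fewer rows per slice when a slice still serializes too large. A simplified standalone sketch of the same idea (single pass, no retry loop, Arrow IPC stream serialization assumed):

    import math
    import pyarrow as pa

    def record_batch_slices(batch, max_bytes=int(0.9 * 5 * 1024 * 1024)):
        sink = pa.BufferOutputStream()
        with pa.ipc.new_stream(sink, batch.schema) as writer:
            writer.write_batch(batch)
        total_bytes = sink.getvalue().size
        if total_bytes <= max_bytes:
            return [batch]
        n_slices = math.ceil(total_bytes / max_bytes)
        rows_per_slice = math.ceil(len(batch) / n_slices)
        return [batch.slice(offset, rows_per_slice)
                for offset in range(0, len(batch), rows_per_slice)]

    batch = pa.RecordBatch.from_pydict({'name': ['Alice', 'Bob'], 'age': [25, 24]})
    slices = record_batch_slices(batch)   # small batch -> a single slice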
- def insert(self, bucket, schema, table, rows=None, record_batch=None, rows_per_insert=None, txid=0):
1968
- """
1969
- Insert rows into a table. The operation may be split into multiple commands, such that by default no more than 512KB will be inserted per command.
1970
-
1971
- Parameters
1972
- ----------
1973
- bucket : string
1974
- The bucket of the table.
1975
- schema : string
1976
- The schema of the table.
1977
- table : string
1978
- The table name.
1979
- rows : dict
1980
- The rows to insert.
1981
- dictionary key: column name
1982
- dictionary value: array of cell values to insert
1983
- default: None (if None, record_batch must be provided)
1984
- record_batch : pyarrow.RecordBatch
1985
- A pyarrow RecordBatch
1986
- default: None (if None, rows dictionary must be provided)
1987
- rows_per_insert : integer
1988
- Split the operation so that each insert command will be limited to this value
1989
- default: None (will be selected automatically)
1990
- txid : integer
1991
- A transaction id. The transaction may be initiated before the insert, and can be used to group
1992
- multiple ACID operations
1993
- default: 0 (will be created by the api)
1994
-
1995
- Returns
1996
- -------
1997
- None
1998
-
1999
-
2000
- Examples
2001
- --------
2002
- insert('some_bucket', 'some_schema', 'some_table', {'name': ['Alice','Bob'], 'age': [25,24]})
2003
-
2004
- """
2005
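The body below resolves the column types from the table's schema and converts the rows dict into a pyarrow.RecordBatch before slicing and sending it. A standalone sketch of that conversion, with the column types assumed rather than looked up:

    import pyarrow as pa

    rows = {'name': ['Alice', 'Bob'], 'age': [25, 24]}
    columns_dict = {'name': pa.utf8(), 'age': pa.int64()}   # normally taken from the table schema
    arrow_schema = pa.schema([pa.field(name, columns_dict[name]) for name in rows])
    arrays = [pa.array(values, columns_dict[name]) for name, values in rows.items()]
    record_batch = pa.record_batch(arrays, schema=arrow_schema)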
- if (not rows and not record_batch) or (rows and record_batch):
2006
- raise ValueError('insert: exactly one of rows or record_batch must be provided')
2007
-
2008
- # create a transaction
2009
- txid, created_txid = self._begin_tx_if_necessary(txid)
2010
-
2011
- if rows:
2012
- columns = self._list_table_columns(bucket, schema, table, field_names=rows.keys(), txid=txid)
2013
- columns_dict = dict([(column[0], column[1]) for column in columns])
2014
- arrow_schema = pa.schema([])
2015
- arrays = []
2016
- for column_name, column_values in rows.items():
2017
- column_type = columns_dict[column_name]
2018
- field = pa.field(column_name, column_type)
2019
- arrow_schema = arrow_schema.append(field)
2020
- arrays.append(pa.array(column_values, column_type))
2021
- record_batch = pa.record_batch(arrays, arrow_schema)
2022
-
2023
- # split the record batch into multiple slices
2024
- serialized_slices = self._record_batch_slices(record_batch, rows_per_insert)
2025
- _logger.info(f'inserting record batch using {len(serialized_slices)} slices')
2026
-
2027
- insert_queue = queue.Queue()
2028
-
2029
- [insert_queue.put(insert_rows_req) for insert_rows_req in serialized_slices]
2030
-
2031
- try:
2032
- executor_sessions = [VastdbApi(self.executor_hosts[i], self.access_key, self.secret_key, self.username,
2033
- self.password, self.port, self.secure, self.auth_type) for i in range(len(self.executor_hosts))]
2034
-
2035
- def insert_executor(self, split_id):
2036
-
2037
- try:
2038
- _logger.info(f'insert_executor split_id={split_id} starting')
2039
- session = executor_sessions[split_id]
2040
- num_inserts = 0
2041
- while not killall:
2042
- try:
2043
- insert_rows_req = insert_queue.get(block=False)
2044
- except queue.Empty:
2045
- break
2046
- session.insert_rows(bucket=bucket, schema=schema,
2047
- table=table, record_batch=insert_rows_req, txid=txid)
2048
- num_inserts += 1
2049
- _logger.info(f'insert_executor split_id={split_id} num_inserts={num_inserts}')
2050
- if killall:
2051
- _logger.info('insert_executor killall=True')
2052
-
2053
- except Exception as e:
2054
- _logger.exception('insert_executor hit exception')
2055
- raise e
2056
-
2057
- num_splits = len(executor_sessions)
2058
- killall = False
2059
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_splits) as executor:
2060
- futures = []
2061
- for i in range(num_splits):
2062
- futures.append(executor.submit(insert_executor, self, i))
2063
- for future in concurrent.futures.as_completed(futures):
2064
- future.result() # trigger an exception if occurred in any thread
2065
-
2066
- # commit if needed
2067
- if created_txid:
2068
- self.commit_transaction(txid)
2069
-
2070
- except Exception as e:
2071
- _logger.exception('exception occurred')
2072
- try:
2073
- self.rollback_transaction(txid)
2074
- except:
2075
- _logger.exception(f'failed to rollback txid {txid}')
2076
- raise e
2077
-
2078
- finally:
2079
- killall = True
2080
- for session in executor_sessions:
2081
- try:
2082
- session.session.close()
2083
- except Exception:
2084
- _logger.exception(f'failed to close session {session}')
2085
-
2086
1617
  def insert_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[]):
2087
1618
  """
2088
1619
  POST /mybucket/myschema/mytable?rows HTTP/1.1
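insert_rows() sends an already-serialized record batch as the request body of that POST. A hedged sketch of preparing such a payload, assuming Arrow IPC stream serialization; the URL and headers in the commented-out call are illustrative only, not the exact values built by _api_prefix() and _fill_common_headers():

    import pyarrow as pa

    batch = pa.RecordBatch.from_pydict({'name': ['Alice'], 'age': [25]})
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, batch.schema) as writer:
        writer.write_batch(batch)
    payload = sink.getvalue().to_pybytes()
    # requests.post('https://<vip>/mybucket/myschema/mytable?rows',
    #               data=payload, headers={'tabular-txid': '0'})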
@@ -2115,7 +1646,8 @@ class VastdbApi:
2115
1646
  data=record_batch, headers=headers)
2116
1647
  return self._check_res(res, "update_rows", expected_retvals)
2117
1648
 
2118
- def delete_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[]):
1649
+ def delete_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[],
1650
+ delete_from_imports_table=False):
2119
1651
  """
2120
1652
  DELETE /mybucket/myschema/mytable?rows HTTP/1.1
2121
1653
  Content-Length: ContentLength
@@ -2127,8 +1659,10 @@ class VastdbApi:
2127
1659
  """
2128
1660
  headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
2129
1661
  headers['Content-Length'] = str(len(record_batch))
2130
- res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=table, command="rows"),
2131
- data=record_batch, headers=headers)
1662
+ url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if delete_from_imports_table else {}
1663
+
1664
+ res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=table, command="rows", url_params=url_params),
1665
+ data=record_batch, headers=headers)
2132
1666
  return self._check_res(res, "delete_rows", expected_retvals)
2133
1667
 
2134
1668
  def create_projection(self, bucket, schema, table, name, columns, txid=0, client_tags=[], expected_retvals=[]):
@@ -2352,41 +1886,40 @@ def _iter_query_data_response_columns(fileobj, stream_ids=None):
2352
1886
  if stream_ids is not None:
2353
1887
  stream_ids.update([stream_id]) # count stream IDs using a collections.Counter
2354
1888
  if stream_id == TABULAR_KEEP_ALIVE_STREAM_ID:
2355
- # _logger.info(f"stream_id={stream_id} (skipping)")
2356
1889
  continue
2357
1890
 
2358
1891
  if stream_id == TABULAR_QUERY_DATA_COMPLETED_STREAM_ID:
2359
1892
  # read the terminating end chunk from socket
2360
1893
  res = fileobj.read()
2361
- _logger.info(f"stream_id={stream_id} res={res} (finish)")
1894
+ _logger.debug("stream_id=%d res=%s (finish)", stream_id, res)
2362
1895
  return
2363
1896
 
2364
1897
  if stream_id == TABULAR_QUERY_DATA_FAILED_STREAM_ID:
2365
1898
  # read the terminating end chunk from socket
2366
1899
  res = fileobj.read()
2367
- _logger.info(f"stream_id={stream_id} res={res} (failed)")
1900
+ _logger.warning("stream_id=%d res=%s (failed)", stream_id, res)
2368
1901
  raise IOError(f"Query data stream failed res={res}")
2369
1902
 
2370
1903
  next_row_id_bytes = fileobj.read(8)
2371
1904
  next_row_id, = struct.unpack('<Q', next_row_id_bytes)
2372
- _logger.info(f"stream_id={stream_id} next_row_id={next_row_id}")
1905
+ _logger.debug("stream_id=%d next_row_id=%d", stream_id, next_row_id)
2373
1906
 
2374
1907
  if stream_id not in readers:
2375
1908
  # we implicitly read 1st message (Arrow schema) when constructing RecordBatchStreamReader
2376
1909
  reader = pa.ipc.RecordBatchStreamReader(fileobj)
2377
- _logger.info(f"stream_id={stream_id} schema={reader.schema}")
1910
+ _logger.debug("stream_id=%d schema=%s", stream_id, reader.schema)
2378
1911
  readers[stream_id] = (reader, [])
2379
1912
  continue
2380
1913
 
2381
1914
  (reader, batches) = readers[stream_id]
2382
1915
  try:
2383
1916
  batch = reader.read_next_batch() # read single-column chunk data
2384
- _logger.info(f"stream_id={stream_id} rows={len(batch)} chunk={batch}")
1917
+ _logger.debug("stream_id=%d rows=%d chunk=%s", stream_id, len(batch), batch)
2385
1918
  batches.append(batch)
2386
1919
  except StopIteration: # we got an end-of-stream IPC message for a given stream ID
2387
1920
  reader, batches = readers.pop(stream_id) # end of column
2388
1921
  table = pa.Table.from_batches(batches) # concatenate all column chunks (as a single)
2389
- _logger.info(f"stream_id={stream_id} rows={len(table)} column={table}")
1922
+ _logger.debug("stream_id=%d rows=%d column=%s", stream_id, len(table), table)
2390
1923
  yield (stream_id, next_row_id, table)
2391
1924
 
2392
1925
 
@@ -2398,24 +1931,23 @@ def parse_query_data_response(conn, schema, stream_ids=None, start_row_ids=None,
2398
1931
  """
2399
1932
  if start_row_ids is None:
2400
1933
  start_row_ids = {}
2401
- projection_positions = schema.projection_positions
2402
- arrow_schema = schema.arrow_schema
2403
- output_field_names = schema.output_field_names
2404
- _logger.debug(f'projection_positions={projection_positions} len(arrow_schema)={len(arrow_schema)} arrow_schema={arrow_schema}')
2405
- is_empty_projection = (len(projection_positions) == 0)
2406
- parsers = defaultdict(lambda: QueryDataParser(arrow_schema, debug=debug, projection_positions=projection_positions)) # {stream_id: QueryDataParser}
1934
+
1935
+ is_empty_projection = (len(schema) == 0)
1936
+ parsers = defaultdict(lambda: QueryDataParser(schema, debug=debug)) # {stream_id: QueryDataParser}
1937
+
2407
1938
  for stream_id, next_row_id, table in _iter_query_data_response_columns(conn, stream_ids):
2408
1939
  parser = parsers[stream_id]
2409
1940
  for column in table.columns:
2410
1941
  parser.parse(column)
2411
1942
 
2412
- parsed_table = parser.build(output_field_names)
1943
+ parsed_table = parser.build()
2413
1944
  if parsed_table is not None: # when we got all columns (and before starting a new "select_rows" cycle)
2414
1945
  parsers.pop(stream_id)
2415
1946
  if is_empty_projection: # VAST returns an empty RecordBatch, with the correct rows' count
2416
1947
  parsed_table = table
2417
1948
 
2418
- _logger.info(f"stream_id={stream_id} rows={len(parsed_table)} next_row_id={next_row_id} table={parsed_table}")
1949
+ _logger.debug("stream_id=%d rows=%d next_row_id=%d table=%s",
1950
+ stream_id, len(parsed_table), next_row_id, parsed_table)
2419
1951
  start_row_ids[stream_id] = next_row_id
2420
1952
  yield parsed_table # the result of a single "select_rows()" cycle
2421
1953
 
@@ -2496,7 +2028,7 @@ def get_field_type(builder: flatbuffers.Builder, field: pa.Field):
2496
2028
  fb_utf8.Start(builder)
2497
2029
  field_type = fb_utf8.End(builder)
2498
2030
 
2499
- elif field.type.equals(pa.date32()): # pa.date64()
2031
+ elif field.type.equals(pa.date32()): # pa.date64() is not supported
2500
2032
  field_type_type = Type.Date
2501
2033
  fb_date.Start(builder)
2502
2034
  fb_date.AddUnit(builder, DateUnit.DAY)
@@ -2564,7 +2096,6 @@ def get_field_type(builder: flatbuffers.Builder, field: pa.Field):
2564
2096
  return field_type, field_type_type
2565
2097
 
2566
2098
  def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
2567
- _logger.info(f"name={f.name}")
2568
2099
  children = None
2569
2100
  if isinstance(f.type, pa.StructType):
2570
2101
  children = [build_field(builder, child, child.name) for child in list(f.type)]
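build_field() recurses into nested types; for a struct, the children are simply the struct's own fields, which the line above iterates. A small illustration of that traversal:

    import pyarrow as pa

    struct_field = pa.field('location', pa.struct([('x', pa.float64()), ('y', pa.float64())]))
    children = list(struct_field.type)      # a pa.StructType iterates over its child fields
    assert [child.name for child in children] == ['x', 'y']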
@@ -2591,7 +2122,6 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
2591
2122
  fb_field.AddName(builder, child_col_name)
2592
2123
  fb_field.AddChildren(builder, children)
2593
2124
 
2594
- _logger.info(f"added key and map to entries")
2595
2125
  children = [fb_field.End(builder)]
2596
2126
 
2597
2127
  if children is not None:
@@ -2602,32 +2132,22 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
2602
2132
 
2603
2133
  col_name = builder.CreateString(name)
2604
2134
  field_type, field_type_type = get_field_type(builder, f)
2605
- _logger.info(f"add col_name={name} type_type={field_type_type} to fb")
2606
2135
  fb_field.Start(builder)
2607
2136
  fb_field.AddName(builder, col_name)
2608
2137
  fb_field.AddTypeType(builder, field_type_type)
2609
2138
  fb_field.AddType(builder, field_type)
2610
2139
  if children is not None:
2611
- _logger.info(f"add col_name={name} childern")
2612
2140
  fb_field.AddChildren(builder, children)
2613
2141
  return fb_field.End(builder)
2614
2142
 
2615
2143
 
2616
- class VastDBResponseSchema:
2617
- def __init__(self, arrow_schema, projection_positions, output_field_names):
2618
- self.arrow_schema = arrow_schema
2619
- self.projection_positions = projection_positions
2620
- self.output_field_names = output_field_names
2621
-
2622
2144
  class QueryDataRequest:
2623
2145
  def __init__(self, serialized, response_schema):
2624
2146
  self.serialized = serialized
2625
2147
  self.response_schema = response_schema
2626
2148
 
2627
2149
 
2628
- def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict = None, field_names: list = None):
2629
- filters = filters or {}
2630
-
2150
+ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), predicate: ibis.expr.types.BooleanColumn = None, field_names: list = None):
2631
2151
  builder = flatbuffers.Builder(1024)
2632
2152
 
2633
2153
  source_name = builder.CreateString('') # required
@@ -2643,39 +2163,21 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
2643
2163
  fb_schema.AddFields(builder, fields)
2644
2164
  schema_obj = fb_schema.End(builder)
2645
2165
 
2646
- predicate = Predicate(schema, filters)
2166
+ predicate = Predicate(schema=schema, expr=predicate)
2647
2167
  filter_obj = predicate.serialize(builder)
2648
2168
 
2649
2169
  parser = QueryDataParser(schema)
2650
- leaves_map = {}
2651
- for node in parser.nodes:
2652
- for descendent in node._iter_nodes():
2653
- if descendent.parent and isinstance(descendent.parent.type, (pa.ListType, pa.MapType)):
2654
- continue
2655
- iter_from_root = reversed(list(descendent._iter_to_root()))
2656
- descendent_full_name = '.'.join([n.field.name for n in iter_from_root])
2657
- _logger.debug(f'build_query_data_request: descendent_full_name={descendent_full_name}')
2658
- descendent_leaves = [leaf.index for leaf in descendent._iter_leaves()]
2659
- leaves_map[descendent_full_name] = descendent_leaves
2660
- _logger.debug(f'build_query_data_request: leaves_map={leaves_map}')
2661
-
2662
- output_field_names = None
2170
+ fields_map = {node.field.name: node.field for node in parser.nodes}
2171
+ leaves_map = {node.field.name: [leaf.index for leaf in node._iter_leaves()] for node in parser.nodes}
2172
+
2663
2173
  if field_names is None:
2664
2174
  field_names = [field.name for field in schema]
2665
- else:
2666
- output_field_names = [f.split('.')[0] for f in field_names]
2667
- # sort projected field_names according to positions to maintain ordering according to the schema
2668
- def compare_field_names_by_pos(field_name1, field_name2):
2669
- return leaves_map[field_name1][0]-leaves_map[field_name2][0]
2670
- field_names = sorted(field_names, key=cmp_to_key(compare_field_names_by_pos))
2671
- _logger.debug(f'build_query_data_request: sorted field_names={field_names} schema={schema}')
2672
2175
 
2176
+ response_schema = pa.schema([fields_map[name] for name in field_names])
2673
2177
  projection_fields = []
2674
- projection_positions = []
2675
2178
  for field_name in field_names:
2179
+ # TODO: only root-level projection pushdown is supported (i.e. no support for SELECT s.x FROM t)
2676
2180
  positions = leaves_map[field_name]
2677
- _logger.info("projecting field=%s positions=%s", field_name, positions)
2678
- projection_positions.extend(positions)
2679
2181
  for leaf_position in positions:
2680
2182
  fb_field_index.Start(builder)
2681
2183
  fb_field_index.AddPosition(builder, leaf_position)
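In the new signature above, the predicate argument is an ibis boolean expression rather than the old filters dict. A hedged sketch of building one against an unbound ibis table whose columns mirror the Arrow schema; the final call is shown commented out because it depends on the surrounding module:

    import ibis
    import pyarrow as pa

    arrow_schema = pa.schema([('name', pa.utf8()), ('age', pa.int32())])
    t = ibis.table([('name', 'string'), ('age', 'int32')], name='t')
    predicate = (t.age > 25) & (t.name != 'Bob')
    # request = build_query_data_request(schema=arrow_schema, predicate=predicate,
    #                                    field_names=['name'])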
@@ -2686,8 +2188,6 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), filters: dict
2686
2188
  builder.PrependUOffsetTRelative(offset)
2687
2189
  projection = builder.EndVector()
2688
2190
 
2689
- response_schema = VastDBResponseSchema(schema, projection_positions, output_field_names=output_field_names)
2690
-
2691
2191
  fb_source.Start(builder)
2692
2192
  fb_source.AddName(builder, source_name)
2693
2193
  fb_source.AddSchema(builder, schema_obj)
@@ -2731,11 +2231,9 @@ def convert_column_types(table: 'pa.Table') -> 'pa.Table':
2731
2231
  indexes_of_fields_to_change[field.name] = index
2732
2232
  for changing_index in ts_indexes:
2733
2233
  field_name = table.schema[changing_index].name
2734
- _logger.info(f'changing resolution for {field_name} to us')
2735
2234
  new_column = table[field_name].cast(pa.timestamp('us'), safe=False)
2736
2235
  table = table.set_column(changing_index, field_name, new_column)
2737
2236
  for field_name, changing_index in indexes_of_fields_to_change.items():
2738
- _logger.info(f'applying custom rules to {field_name}')
2739
2237
  new_column = table[field_name].to_pylist()
2740
2238
  new_column = list(map(column_matcher[field_name], new_column))
2741
2239
  new_column = pa.array(new_column, table[field_name].type)