vastdb 1.3.9__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vastdb/_internal.py CHANGED
@@ -69,6 +69,7 @@ import vastdb.vast_flatbuf.org.apache.arrow.flatbuf.Date as fb_date
  import vastdb.vast_flatbuf.org.apache.arrow.flatbuf.Decimal as fb_decimal
  import vastdb.vast_flatbuf.org.apache.arrow.flatbuf.Field as fb_field
  import vastdb.vast_flatbuf.org.apache.arrow.flatbuf.FixedSizeBinary as fb_fixed_size_binary
+ import vastdb.vast_flatbuf.org.apache.arrow.flatbuf.FixedSizeList as fb_fixed_size_list
  import vastdb.vast_flatbuf.org.apache.arrow.flatbuf.FloatingPoint as fb_floating_point
  import vastdb.vast_flatbuf.org.apache.arrow.flatbuf.Int as fb_int
  import vastdb.vast_flatbuf.org.apache.arrow.flatbuf.List as fb_list
@@ -497,7 +498,13 @@ class Predicate:
  fb_bool.Start(self.builder)
  field_type = fb_bool.End(self.builder)

- value = True if value == 'true' else False # not cover all cases
+ # Handle both boolean values and string representations
+ if isinstance(value, bool):
+ value = value
+ elif isinstance(value, str):
+ value = value.lower() == 'true'
+ else:
+ value = bool(value)
  elif isinstance(field.type, pa.Decimal128Type):
  literal_type = fb_decimal_lit
  literal_impl = LiteralImpl.DecimalLiteral
@@ -608,7 +615,7 @@ class FieldNode:
  self.debug = debug
  if isinstance(self.type, pa.StructType):
  self.children = [FieldNode(field, index_iter, parent=self) for field in self.type]
- elif isinstance(self.type, pa.ListType):
+ elif pa.types.is_list(self.type) or pa.types.is_fixed_size_list(self.type):
  self.children = [FieldNode(self.type.value_field, index_iter, parent=self)]
  elif isinstance(self.type, pa.MapType):
  # Map is represented as List<Struct<K, V>> in Arrow
@@ -752,7 +759,7 @@ def _iter_nested_arrays(column: pa.Array) -> Iterator[pa.Array]:
  if not column.type.num_fields == 1: # Note: VAST serializes only a single struct field at a time
  raise ValueError(f'column.type.num_fields: {column.type.num_fields} not eq to 1')
  yield from _iter_nested_arrays(column.field(0))
- elif isinstance(column.type, pa.ListType):
+ elif pa.types.is_list(column.type) or pa.types.is_fixed_size_list(column.type):
  yield from _iter_nested_arrays(column.values) # Note: Map is serialized in VAST as a List<Struct<K, V>>


@@ -853,10 +860,11 @@ class VastdbApi:
  VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')

  def __init__(self, endpoint, access_key, secret_key,
- *,
- ssl_verify=True,
- timeout=None,
- backoff_config: Optional[BackoffConfig] = None):
+ *,
+ ssl_verify=True,
+ timeout=None,
+ backoff_config: Optional[BackoffConfig] = None,
+ version_check=True):

  from . import version # import lazily here (to avoid circular dependencies)
  self.client_sdk_version = f"VAST Database Python SDK {version()} - 2024 (c)"
@@ -896,29 +904,30 @@ class VastdbApi:
  aws_region='',
  aws_service='s3')

- # probe the cluster for its version
- res = self._request(method="GET", url=self._url(command="transaction"), skip_status_check=True) # used only for the response headers
- _logger.debug("headers=%s code=%s content=%s", res.headers, res.status_code, res.content)
- server_header = res.headers.get("Server")
- if server_header is None:
- _logger.error("Response doesn't contain 'Server' header")
- else:
- if not server_header.startswith(self.VAST_SERVER_PREFIX):
- raise UnsupportedServer(f'{self.url} is not a VAST DB server endpoint ("{server_header}")')
-
- if m := self.VAST_VERSION_REGEX.match(server_header):
- self.vast_version: Tuple[int, ...] = tuple(int(v) for v in m.group(1).split("."))
- return
+ if version_check:
+ # probe the cluster for its version
+ res = self._request(method="GET", url=self._url(command="transaction"), skip_status_check=True) # used only for the response headers
+ _logger.debug("headers=%s code=%s content=%s", res.headers, res.status_code, res.content)
+ server_header = res.headers.get("Server")
+ if server_header is None:
+ _logger.error("Response doesn't contain 'Server' header")
  else:
- _logger.error("'Server' header '%s' doesn't match the expected pattern", server_header)
+ if not server_header.startswith(self.VAST_SERVER_PREFIX):
+ raise UnsupportedServer(f'{self.url} is not a VAST DB server endpoint ("{server_header}")')

- msg = (
- f'Please use `vastdb` <= 0.0.5.x with current VAST cluster version ("{server_header or "N/A"}"). '
- 'To use the latest SDK, please upgrade your cluster to the latest service pack. '
- 'Please contact customer.support@vastdata.com for more details.'
- )
- _logger.critical(msg)
- raise NotImplementedError(msg)
+ if m := self.VAST_VERSION_REGEX.match(server_header):
+ self.vast_version: Tuple[int, ...] = tuple(int(v) for v in m.group(1).split("."))
+ return
+ else:
+ _logger.error("'Server' header '%s' doesn't match the expected pattern", server_header)
+
+ msg = (
+ f'Please use `vastdb` <= 0.0.5.x with current VAST cluster version ("{server_header or "N/A"}"). '
+ 'To use the latest SDK, please upgrade your cluster to the latest service pack. '
+ 'Please contact customer.support@vastdata.com for more details.'
+ )
+ _logger.critical(msg)
+ raise NotImplementedError(msg)

  def __enter__(self):
  """Allow using this session as a context manager."""
@@ -935,7 +944,8 @@ class VastdbApi:
  secret_key=self.secret_key,
  ssl_verify=self._session.verify,
  timeout=self.timeout,
- backoff_config=self.backoff_config)
+ backoff_config=self.backoff_config,
+ version_check=False)

  def _single_request(self, *, method, url, skip_status_check=False, **kwargs):
  _logger.debug("Sending request: %s %s %s timeout=%s", method, url, kwargs, self.timeout)
@@ -1349,12 +1359,12 @@ class VastdbApi:
  lists = list_tables.GetRootAs(res.content)
  tables_length = lists.TablesLength()
  count = int(res_headers['tabular-list-count']) if 'tabular-list-count' in res_headers else tables_length
- return lists, is_truncated, count
+ return lists, next_key, is_truncated, count

  def _list_tables_internal(self, bucket, schema, parse_properties, txid=0, client_tags=[], max_keys=1000, next_key=0, name_prefix="",
  exact_match=False, expected_retvals=[], include_list_stats=False, count_only=False):
  tables = []
- lists, is_truncated, count = self._list_tables_raw(bucket, schema, txid=txid, client_tags=client_tags, max_keys=max_keys,
+ lists, next_key, is_truncated, count = self._list_tables_raw(bucket, schema, txid=txid, client_tags=client_tags, max_keys=max_keys,
  next_key=next_key, name_prefix=name_prefix, exact_match=exact_match, expected_retvals=expected_retvals,
  include_list_stats=include_list_stats, count_only=count_only)
  bucket_name = lists.BucketName().decode()
@@ -1368,7 +1378,7 @@ class VastdbApi:
  return bucket_name, schema_name, tables, next_key, is_truncated, count

  def raw_sorting_score(self, bucket, schema, txid, name):
- lists, _, _ = self._list_tables_raw(bucket, schema, txid=txid, exact_match=True, name_prefix=name, include_list_stats=True)
+ lists, _, _, _ = self._list_tables_raw(bucket, schema, txid=txid, exact_match=True, name_prefix=name, include_list_stats=True)
  bucket_name = lists.BucketName().decode()
  if not bucket.startswith(bucket_name): # ignore snapshot name
  raise ValueError(f'bucket: {bucket} did not start from {bucket_name}')
@@ -2267,11 +2277,17 @@ def get_field_type(builder: flatbuffers.Builder, field: pa.Field):
  fb_struct.Start(builder)
  field_type = fb_struct.End(builder)

- elif isinstance(field.type, pa.ListType):
+ elif pa.types.is_list(field.type):
  field_type_type = Type.List
  fb_list.Start(builder)
  field_type = fb_list.End(builder)

+ elif pa.types.is_fixed_size_list(field.type):
+ field_type_type = Type.FixedSizeList
+ fb_fixed_size_list.Start(builder)
+ fb_fixed_size_list.AddListSize(builder, field.type.list_size)
+ field_type = fb_fixed_size_list.End(builder)
+
  elif isinstance(field.type, pa.MapType):
  field_type_type = Type.Map
  fb_map.Start(builder)
@@ -2293,7 +2309,7 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
  children = None
  if isinstance(f.type, pa.StructType):
  children = [build_field(builder, child, child.name) for child in list(f.type)]
- if isinstance(f.type, pa.ListType):
+ if pa.types.is_list(f.type) or pa.types.is_fixed_size_list(f.type):
  children = [build_field(builder, f.type.value_field, "item")]
  if isinstance(f.type, pa.MapType):
  children = [
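
With the FixedSizeList handling added above (get_field_type, build_field, and the FieldNode/_iter_nested_arrays type checks), fixed-size list columns can now be expressed in a table schema. A minimal sketch, assuming the usual vastdb.connect() entry point and placeholder endpoint, credentials, and bucket/schema names:

    import pyarrow as pa
    import vastdb

    # 'embedding' is a fixed-size list of 16 float32 values; this is the Arrow type
    # that the new fb_fixed_size_list serialization covers.
    columns = pa.schema([
        ('id', pa.int64()),
        ('embedding', pa.list_(pa.float32(), 16)),
    ])

    session = vastdb.connect(endpoint='http://vip-pool.example:80',
                             access='ACCESS_KEY', secret='SECRET_KEY')
    with session.transaction() as tx:
        schema = tx.bucket('my-bucket').schema('my-schema')
        table = schema.create_table('vectors', columns)

Separately, the new version_check keyword on VastdbApi.__init__ lets internal session clones skip the extra GET probe against the transaction endpoint when the cluster version is already known.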
vastdb/bench/test_perf.py CHANGED
@@ -1,3 +1,4 @@
+ import datetime as dt
  import logging
  import time

@@ -5,6 +6,7 @@ import pytest

  from vastdb import util
  from vastdb.table import ImportConfig, QueryConfig
+ from vastdb.tests.util import compare_pyarrow_tables

  log = logging.getLogger(__name__)

@@ -12,17 +14,74 @@ log = logging.getLogger(__name__)
  @pytest.mark.benchmark
  def test_bench(session, test_bucket_name, parquets_path, crater_path):
  files = [str(parquets_path / f) for f in (parquets_path.glob('**/*.pq'))]
+ stats = None

  with session.transaction() as tx:
  b = tx.bucket(test_bucket_name)
  s = b.create_schema('s1')
- t = util.create_table_from_files(s, 't1', files, config=ImportConfig(import_concurrency=8))
+ util.create_table_from_files(s, 't1', files, config=ImportConfig(import_concurrency=8))
+ t2 = util.create_table_from_files(s, 't2', files, config=ImportConfig(import_concurrency=8))
+ # Enabling Elysium with 4 sorting keys - ts, sid, ask_open, ask_close
+ t2.add_sorting_key([2, 0, 3, 4])
+ stats = t2.get_stats()
+ log.info("Added sorting keys")
+
+ assert stats
+ # Waiting up to 2 hours for sorting to complete.
+ start_time = time.time()
+ while not stats.sorting_done:
+ if time.time() - start_time > 7200:
+ raise TimeoutError("Sorting did not complete after waiting for 2 hours.")
+ time.sleep(30)
+ with session.transaction() as tx:
+ table = tx.bucket(test_bucket_name).schema('s1').table('t2')
+ stats = table.get_stats()
+ log.info("Sorting completed")
+
+ queries = [
+ {'query_str': "select sid from {t} where sid = 10033007".format, 'columns': ['sid'],
+ 'predicate': lambda t: t['sid'] == 10033007},
+ {'query_str': "select last_trade_price from {t} where ts between "
+ "TIMESTAMP'2018-01-04 20:30:00' AND TIMESTAMP'2018-01-05 20:30:00'".format,
+ 'columns': ['last_trade_price'], 'predicate': lambda t: (t['ts'].between(
+ dt.datetime(2018, 1, 4, 20, 30, 00, 00), dt.datetime(2018, 1, 5, 20, 30, 00, 00)))},
+ {'query_str': "select ts,ask_close,ask_open from {t} where bid_qty = 684000 and ask_close > 1".format,
+ 'columns': ['ts', 'ask_close', 'ask_open'],
+ 'predicate': lambda t: ((t['bid_qty'] == 684000) & (t['ask_close'] > 1))},
+ {'query_str': "select ts,ticker from {t} where "
+ "ask_open between 4374 and 4375 OR ask_open between 380 and 381".format,
+ 'columns': ['ts', 'ticker'],
+ 'predicate': lambda t: ((t['ask_open'].between(4374, 4375)) | (t['ask_open'].between(380, 381)))},
+ {
+ 'query_str': "select trade_close, trade_high, trade_low, trade_open from {t} where ticker in ('BANR', 'KELYB')".format,
+ 'columns': ['trade_close', 'trade_high', 'trade_low', 'trade_open'],
+ 'predicate': lambda t: (t['ticker'].isin(['BANR', 'KELYB']))}
+ ]
+
+ log.info("Starting to run queries")
+ with session.transaction() as tx:
+ schema = tx.bucket(test_bucket_name).schema('s1')
+ t1 = schema.table("t1")
+ t2 = schema.table("t2")
+
  config = QueryConfig(num_splits=8, num_sub_splits=4)
- s = time.time()
- pa_table = t.select(columns=['sid'], predicate=t['sid'] == 10033007, config=config).read_all()
- e = time.time()
- log.info("'SELECT sid from TABLE WHERE sid = 10033007' returned in %s seconds.", e - s)
- if crater_path:
- with open(f'{crater_path}/bench_results', 'a') as f:
- f.write(f"'SELECT sid FROM TABLE WHERE sid = 10033007' returned in {e - s} seconds")
- assert pa_table.num_rows == 255_075
+
+ for q in queries:
+ normal_table_res, els_table_res = None, None
+ for table in [t1, t2]:
+ log.info("Starting query: %s", q['query_str'](t=table.name))
+ s = time.time()
+ res = table.select(columns=q['columns'], predicate=q['predicate'](table), config=config).read_all()
+ e = time.time()
+ if table == t1:
+ normal_table_res = res
+ else:
+ els_table_res = res
+ log.info("Query %s returned in %s seconds.", q['query_str'](t=table.name), e - s)
+ if crater_path:
+ with open(f'{crater_path}/bench_results', 'a') as f:
+ f.write(f"Query '{q['query_str'](t=table)}' returned in {e - s} seconds")
+
+ assert normal_table_res, f"missing result for {t1} table"
+ assert els_table_res, f"missing result for {t2} table"
+ assert compare_pyarrow_tables(normal_table_res, els_table_res)
vastdb/conftest.py CHANGED
@@ -6,6 +6,7 @@ import boto3
  import pytest

  import vastdb
+ import vastdb.errors


  def pytest_addoption(parser):
@@ -65,8 +66,14 @@ def clean_bucket_name(request, test_bucket_name, session):
  b = tx.bucket(test_bucket_name)
  for top_schema in b.schemas():
  for s in iter_schemas(top_schema):
- for t in s.tables():
- t.drop()
+ for t_name in s.tablenames():
+ try:
+ t = s.table(t_name)
+ t.drop()
+ except vastdb.errors.NotSupportedSchema:
+ # Use internal API to drop the table in case unsupported schema prevents creating a table
+ # object.
+ tx._rpc.api.drop_table(b.name, s.name, t_name, txid=tx.txid)
  s.drop()
  return test_bucket_name

vastdb/errors.py CHANGED
@@ -2,7 +2,9 @@ import logging
  import xml.etree.ElementTree
  from dataclasses import dataclass
  from enum import Enum
+ from typing import Optional

+ import pyarrow as pa
  import requests


@@ -89,6 +91,9 @@ class ImportFilesError(Exception):
  message: str
  error_dict: dict

+ def __post_init__(self):
+ self.args = [vars(self)]
+

  class InvalidArgument(Exception):
  pass
@@ -122,18 +127,27 @@ class NotSupported(Exception):
  class MissingBucket(Missing):
  bucket: str

+ def __post_init__(self):
+ self.args = [vars(self)]
+

  @dataclass
  class MissingSnapshot(Missing):
  bucket: str
  snapshot: str

+ def __post_init__(self):
+ self.args = [vars(self)]
+

  @dataclass
  class MissingSchema(Missing):
  bucket: str
  schema: str

+ def __post_init__(self):
+ self.args = [vars(self)]
+

  @dataclass
  class MissingTable(Missing):
@@ -141,6 +155,9 @@ class MissingTable(Missing):
  schema: str
  table: str

+ def __post_init__(self):
+ self.args = [vars(self)]
+

  @dataclass
  class MissingProjection(Missing):
@@ -149,6 +166,9 @@ class MissingProjection(Missing):
  table: str
  projection: str

+ def __post_init__(self):
+ self.args = [vars(self)]
+

  class Exists(Exception):
  pass
@@ -159,6 +179,9 @@ class SchemaExists(Exists):
  bucket: str
  schema: str

+ def __post_init__(self):
+ self.args = [vars(self)]
+

  @dataclass
  class TableExists(Exists):
@@ -166,6 +189,9 @@ class TableExists(Exists):
  schema: str
  table: str

+ def __post_init__(self):
+ self.args = [vars(self)]
+

  @dataclass
  class NotSupportedCommand(NotSupported):
@@ -173,18 +199,37 @@ class NotSupportedCommand(NotSupported):
  schema: str
  table: str

+ def __post_init__(self):
+ self.args = [vars(self)]
+

  @dataclass
  class NotSupportedVersion(NotSupported):
  err_msg: str
  version: str

+ def __post_init__(self):
+ self.args = [vars(self)]
+
+
+ @dataclass
+ class NotSupportedSchema(NotSupported):
+ message: Optional[str] = None
+ schema: Optional[pa.Schema] = None
+ cause: Optional[Exception] = None
+
+ def __post_init__(self):
+ self.args = [vars(self)]
+

  @dataclass
  class ConnectionError(Exception):
  cause: Exception
  may_retry: bool

+ def __post_init__(self):
+ self.args = [vars(self)]
+

  def handle_unavailable(**kwargs):
  if kwargs['code'] == 'SlowDown':
@@ -192,7 +237,7 @@ def handle_unavailable(**kwargs):
  raise ServiceUnavailable(**kwargs)


- ERROR_TYPES_MAP = {
+ HTTP_ERROR_TYPES_MAP = {
  HttpStatus.BAD_REQUEST: BadRequest,
  HttpStatus.FOBIDDEN: Forbidden,
  HttpStatus.NOT_FOUND: NotFound,
@@ -205,6 +250,10 @@ ERROR_TYPES_MAP = {
  HttpStatus.INSUFFICIENT_CAPACITY: InsufficientCapacity,
  }

+ SPECIFIC_ERROR_TYPES_MAP = {
+ 'TabularUnsupportedColumnType': NotSupportedSchema,
+ }
+

  def from_response(res: requests.Response):
  if res.status_code == HttpStatus.SUCCESS.value:
@@ -234,5 +283,10 @@ def from_response(res: requests.Response):
  )
  log.warning("RPC failed: %s", kwargs)
  status = HttpStatus(res.status_code)
- error_type = ERROR_TYPES_MAP.get(status, UnexpectedError)
- return error_type(**kwargs) # type: ignore
+ http_error_type = HTTP_ERROR_TYPES_MAP.get(status, UnexpectedError)
+ http_error = http_error_type(**kwargs) # type: ignore
+ # Wrap specific error types if applicable
+ if code_str in SPECIFIC_ERROR_TYPES_MAP:
+ error_type = SPECIFIC_ERROR_TYPES_MAP[code_str]
+ return error_type(message=message_str, cause=http_error)
+ return http_error
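
With the new SPECIFIC_ERROR_TYPES_MAP, a 'TabularUnsupportedColumnType' RPC failure now surfaces as errors.NotSupportedSchema (carrying the original HTTP error in .cause) instead of a bare status-code exception. A sketch of how calling code might handle it, mirroring the conftest.py change earlier in this diff; the transaction and schema objects are assumed to come from an open vastdb session:

    import vastdb.errors

    def drop_all_tables(tx, schema):
        # Fall back to the raw RPC when the table's schema cannot be represented,
        # the same pattern the test fixture above uses.
        for name in schema.tablenames():
            try:
                schema.table(name).drop()
            except vastdb.errors.NotSupportedSchema:
                tx._rpc.api.drop_table(schema.bucket.name, schema.name, name, txid=tx.txid)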
vastdb/features.py CHANGED
@@ -4,7 +4,7 @@ import logging

  from .errors import NotSupportedVersion

- log = logging.getLogger()
+ log = logging.getLogger(__name__)


  class Features:
@@ -39,6 +39,10 @@ class Features:
  "Zip import requires 5.3.1+ VAST release",
  vast_version >= (5, 3, 1))

+ self.check_timezone = self._check(
+ "Timezone support requires 5.4+ Vast release",
+ vast_version >= (5, 4))
+

  def _check(self, msg, supported):
  log.debug("%s (current version is %s): supported=%s", msg, self.vast_version, supported)
vastdb/schema.py CHANGED
@@ -91,6 +91,7 @@ class Schema:
  if use_external_row_ids_allocation:
  self.tx._rpc.features.check_external_row_ids_allocation()

+ table.Table.validate_ibis_support_schema(columns)
  self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid,
  use_external_row_ids_allocation=use_external_row_ids_allocation,
  sorting_key=sorting_key)
@@ -109,14 +110,14 @@ class Schema:
  log.debug("Found table: %s", t[0])
  return t[0]

- def _iter_tables(self, table_name=None):
+ def _iter_tables(self, table_name=None, page_size=1000):
  next_key = 0
  name_prefix = table_name if table_name else ""
  exact_match = bool(table_name)
  while True:
  _bucket_name, _schema_name, curr_tables, next_key, is_truncated, _ = \
  self.tx._rpc.api.list_tables(
- bucket=self.bucket.name, schema=self.name, next_key=next_key, txid=self.tx.txid,
+ bucket=self.bucket.name, schema=self.name, next_key=next_key, max_keys=page_size, txid=self.tx.txid,
  exact_match=exact_match, name_prefix=name_prefix, include_list_stats=exact_match)
  if not curr_tables:
  break
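
The page_size argument added here (and forwarded by tables() and tablenames() in the next hunk) only controls how many entries each list_tables RPC returns while _iter_tables paginates; the full listing is still returned. A short sketch, assuming an existing session from vastdb.connect() and placeholder bucket/schema names:

    with session.transaction() as tx:
        schema = tx.bucket('my-bucket').schema('my-schema')
        # Page through the listing 500 names per RPC instead of the default 1000.
        names = schema.tablenames(page_size=500)
        tables = schema.tables(page_size=500)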
@@ -124,19 +125,19 @@ class Schema:
  if not is_truncated:
  break

- def tables(self, table_name: str = "") -> List["Table"]:
+ def tables(self, table_name: str = "", page_size=1000) -> List["Table"]:
  """List all tables under this schema if `table_name` is empty.

  Otherwise, list only the specific table (if exists).
  """
  return [
  _parse_table_info(table_info, self)
- for table_info in self._iter_tables(table_name=table_name)
+ for table_info in self._iter_tables(table_name=table_name, page_size=page_size)
  ]

- def tablenames(self) -> List[str]:
+ def tablenames(self, page_size=1000) -> List[str]:
  """List all table names under this schema."""
- return [table_info.name for table_info in self._iter_tables()]
+ return [table_info.name for table_info in self._iter_tables(page_size=page_size)]

  def drop(self) -> None:
  """Delete this schema."""
vastdb/table.py CHANGED
@@ -1,9 +1,11 @@
  """VAST Database table."""

  import concurrent.futures
+ import copy
  import logging
  import os
  import queue
+ import sys
  from dataclasses import dataclass, field
  from math import ceil
  from threading import Event
@@ -124,11 +126,35 @@ class Table:
  _imports_table: bool
  sorted_table: bool

+ @staticmethod
+ def validate_ibis_support_schema(arrow_schema: pa.Schema):
+ """Validate that the provided Arrow schema is compatible with Ibis.
+
+ Raises NotSupportedSchema if the schema contains unsupported fields.
+ """
+ unsupported_fields = []
+ first_exception = None
+ for f in arrow_schema:
+ try:
+ ibis.Schema.from_pyarrow(pa.schema([f]))
+ except Exception as e:
+ if first_exception is None:
+ first_exception = e
+ unsupported_fields.append(f)
+
+ if unsupported_fields:
+ raise errors.NotSupportedSchema(
+ message=f"Ibis does not support the schema {unsupported_fields=}",
+ schema=arrow_schema,
+ cause=first_exception
+ )
+
  def __post_init__(self):
  """Also, load columns' metadata."""
  self.arrow_schema = self.columns()

  self._table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
+ self.validate_ibis_support_schema(self.arrow_schema)
  self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(self.arrow_schema), self._table_path)

  @property
@@ -333,7 +359,8 @@ class Table:
  predicate: Union[ibis.expr.types.BooleanColumn, ibis.common.deferred.Deferred] = None,
  config: Optional[QueryConfig] = None,
  *,
- internal_row_id: bool = False) -> pa.RecordBatchReader:
+ internal_row_id: bool = False,
+ limit_rows: Optional[int] = None) -> pa.RecordBatchReader:
  """Execute a query over this table.

  To read a subset of the columns, specify their names via `columns` argument. Otherwise, all columns will be read.
@@ -342,15 +369,13 @@ class Table:

  Query-execution configuration options can be specified via the optional `config` argument.
  """
- if config is None:
- config = QueryConfig()
+ config = copy.deepcopy(config) if config else QueryConfig()
+
+ if limit_rows:
+ config.limit_rows_per_sub_split = limit_rows

- stats = None
- # Retrieve snapshots only if needed
  if config.data_endpoints is None:
- stats = self.get_stats()
- log.debug("stats: %s", stats)
- endpoints = stats.endpoints
+ endpoints = tuple([self.tx._rpc.api.url])
  else:
  endpoints = tuple(config.data_endpoints)
  log.debug("endpoints: %s", endpoints)
@@ -380,8 +405,7 @@ class Table:
  num_rows = self._get_row_estimate(columns, predicate, query_schema)
  log.debug(f'sorted estimate: {num_rows}')
  if num_rows == 0:
- if stats is None:
- stats = self.get_stats()
+ stats = self.get_stats()
  num_rows = stats.num_rows

  config.num_splits = max(1, num_rows // config.rows_per_split)
@@ -402,7 +426,7 @@ class Table:
  for split in range(config.num_splits):
  splits_queue.put(split)

- # this queue shouldn't be large it is marely a pipe through which the results
+ # this queue shouldn't be large it is merely a pipe through which the results
  # are sent to the main thread. Most of the pages actually held in the
  # threads that fetch the pages.
  record_batches_queue: queue.Queue[pa.RecordBatch] = queue.Queue(maxsize=2)
@@ -458,8 +482,9 @@ class Table:
  if config.query_id:
  threads_prefix = threads_prefix + "-" + config.query_id

+ total_num_rows = limit_rows if limit_rows else sys.maxsize
  with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp: # TODO: concurrency == enpoints is just a heuristic
- futures = [tp.submit(single_endpoint_worker, endpoint) for endpoint in endpoints]
+ futures = [tp.submit(single_endpoint_worker, endpoint) for endpoint in endpoints[:config.num_splits]]
  tasks_running = len(futures)
  try:
  while tasks_running > 0:
@@ -467,7 +492,14 @@ class Table:

  batch = record_batches_queue.get()
  if batch is not None:
- yield batch
+ if batch.num_rows < total_num_rows:
+ yield batch
+ total_num_rows -= batch.num_rows
+ else:
+ yield batch.slice(length=total_num_rows)
+ log.info("reached limit rows per query: %d - stop query", limit_rows)
+ stop_event.set()
+ break
  else:
  tasks_running -= 1
  log.debug("one worker thread finished, remaining: %d", tasks_running)
@@ -510,6 +542,9 @@ class Table:
  """Insert a RecordBatch into this table."""
  if self._imports_table:
  raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
+ if 0 == rows.num_rows:
+ log.debug("Ignoring empty insert into %s", self.name)
+ return pa.chunked_array([], type=INTERNAL_ROW_ID_FIELD.type)
  try:
  row_ids = []
  serialized_slices = util.iter_serialized_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
@@ -522,7 +557,7 @@ class Table:
  self.tx._rpc.features.check_return_row_ids()
  except errors.NotSupportedVersion:
  return # type: ignore
- return pa.chunked_array(row_ids)
+ return pa.chunked_array(row_ids, type=INTERNAL_ROW_ID_FIELD.type)
  except errors.TooWideRow:
  self.tx._rpc.features.check_return_row_ids()
  return self.insert_in_column_batches(rows)
@@ -596,7 +631,7 @@ class Table:
  self.name = new_name

  def add_sorting_key(self, sorting_key: list) -> None:
- """Ads a sorting key to a table that doesn't have any."""
+ """Add a sorting key to a table that doesn't have any."""
  self.tx._rpc.features.check_elysium()
  self.tx._rpc.api.alter_table(
  self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, sorting_key=sorting_key)
@@ -606,6 +641,7 @@ class Table:
  """Add a new column."""
  if self._imports_table:
  raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
+ self.validate_ibis_support_schema(new_column)
  self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
  log.info("Added column(s): %s", new_column)
  self.arrow_schema = self.columns()
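
Taken together, the table.py changes above add a client-side row cap to select(): limit_rows is copied into config.limit_rows_per_sub_split for the server request, and the batch loop slices the final batch and signals the stop event once the cap is reached. A usage sketch, assuming an existing session from vastdb.connect() and placeholder bucket/schema/table names:

    with session.transaction() as tx:
        t = tx.bucket('my-bucket').schema('my-schema').table('my-table')
        # At most 10,000 matching rows are yielded; remaining splits are cancelled.
        reader = t.select(columns=['sid'], predicate=(t['sid'] > 0), limit_rows=10_000)
        sample = reader.read_all()
        assert sample.num_rows <= 10_000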