vastdb 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vastdb/bucket.py CHANGED
@@ -6,7 +6,7 @@ It is possible to list and access VAST snapshots generated over a bucket.
 
 import logging
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Iterable, Optional
 
 from . import errors, schema, transaction
 
@@ -55,7 +55,7 @@ class Bucket:
 
         return Bucket(name=f'{self.name}/{expected_name}', tx=self.tx)
 
-    def snapshots(self) -> List["Bucket"]:
+    def snapshots(self) -> Iterable["Bucket"]:
        """List bucket's snapshots."""
        snapshots = []
        next_key = 0
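
Since `snapshots()` is now annotated as returning an `Iterable` rather than a `List`, callers that relied on `len()` or indexing should materialize the result explicitly. A minimal sketch of doing so, assuming placeholder credentials and a hypothetical bucket name:

    import vastdb

    session = vastdb.connect(access='...', secret='...', endpoint='http://vast-vip-pool:80')
    with session.transaction() as tx:
        snaps = list(tx.bucket('mybucket').snapshots())  # materialize before len()/indexing
        print(len(snaps))
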
vastdb/conftest.py CHANGED
@@ -9,11 +9,15 @@ import vastdb
 
 def pytest_addoption(parser):
     parser.addoption("--tabular-bucket-name", help="Name of the S3 bucket with Tabular enabled", default="vastdb")
-    parser.addoption("--tabular-access-key", help="Access key with Tabular permissions (AWS_ACCESS_KEY_ID)", default=os.environ.get("AWS_ACCESS_KEY_ID", None))
-    parser.addoption("--tabular-secret-key", help="Secret key with Tabular permissions (AWS_SECRET_ACCESS_KEY)", default=os.environ.get("AWS_SECRET_ACCESS_KEY", None))
+    parser.addoption("--tabular-access-key", help="Access key with Tabular permissions (AWS_ACCESS_KEY_ID)",
+                     default=os.environ.get("AWS_ACCESS_KEY_ID", None))
+    parser.addoption("--tabular-secret-key", help="Secret key with Tabular permissions (AWS_SECRET_ACCESS_KEY)",
+                     default=os.environ.get("AWS_SECRET_ACCESS_KEY", None))
     parser.addoption("--tabular-endpoint-url", help="Tabular server endpoint", default="http://localhost:9090")
     parser.addoption("--data-path", help="Data files location", default=None)
     parser.addoption("--crater-path", help="Save benchmark results in a dedicated location", default=None)
+    parser.addoption("--schema-name", help="Name of schema for the test to operate on", default=None)
+    parser.addoption("--table-name", help="Name of table for the test to operate on", default=None)
 
 
 @pytest.fixture(scope="session")
@@ -67,3 +71,13 @@ def parquets_path(request):
 @pytest.fixture(scope="function")
 def crater_path(request):
     return request.config.getoption("--crater-path")
+
+
+@pytest.fixture(scope="function")
+def schema_name(request):
+    return request.config.getoption("--schema-name")
+
+
+@pytest.fixture(scope="function")
+def table_name(request):
+    return request.config.getoption("--table-name")
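
The two new options feed matching function-scoped fixtures, so a test can be pointed at a pre-existing schema and table from the command line. A hypothetical consuming test (the fixture wiring is from this diff; the `bucket().schema().table()` accessors are assumed from the SDK's fluent API and are not part of this change):

    # run with: pytest --schema-name my_schema --table-name my_table
    import pytest

    def test_existing_table(session, schema_name, table_name):
        if not (schema_name and table_name):
            pytest.skip("requires --schema-name and --table-name")
        with session.transaction() as tx:
            t = tx.bucket("vastdb").schema(schema_name).table(table_name)  # "vastdb" is the default bucket option
            assert t.name == table_name
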
vastdb/errors.py CHANGED
@@ -175,6 +175,12 @@ class NotSupportedVersion(NotSupported):
     version: str
 
 
+@dataclass
+class ConnectionError(Exception):
+    cause: Exception
+    may_retry: bool
+
+
 def handle_unavailable(**kwargs):
     if kwargs['code'] == 'SlowDown':
         raise Slowdown(**kwargs)
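
`vastdb.errors.ConnectionError` wraps the underlying transport failure and records whether a retry is considered safe; it is what surfaces once the session-level backoff gives up (see the `test_bad_endpoint` change below). A minimal sketch of application-level handling, assuming an existing `session` and hypothetical bucket/schema/table names:

    import logging

    import vastdb.errors

    log = logging.getLogger(__name__)

    try:
        with session.transaction() as tx:
            rows = tx.bucket('mybucket').schema('s').table('t').select(columns=['a']).read_all()
    except vastdb.errors.ConnectionError as e:
        # `cause` carries the original exception; `may_retry` hints whether retrying is safe
        if e.may_retry:
            log.warning("retriable connection failure: %s", e.cause)
        else:
            raise
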
vastdb/schema.py CHANGED
@@ -6,7 +6,7 @@ It is possible to list and access VAST snapshots generated over a bucket.
 
 import logging
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Iterable, List, Optional
 
 import pyarrow as pa
 
@@ -62,7 +62,7 @@ class Schema:
         assert len(names) == 1, f"Expected to receive only a single schema, but got {len(schemas)}: ({schemas})"
         return schema.Schema(name=self._subschema_full_name(names[0]), bucket=self.bucket)
 
-    def schemas(self, batch_size=None) -> List["Schema"]:
+    def schemas(self, batch_size=None) -> Iterable["Schema"]:
         """List child schemas."""
         next_key = 0
         if not batch_size:
@@ -76,14 +76,18 @@ class Schema:
                 break
         return result
 
-    def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "Table":
+    def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True, use_external_row_ids_allocation=False) -> "Table":
         """Create a new table under this schema."""
         if current := self.table(table_name, fail_if_missing=False):
             if fail_if_exists:
                 raise errors.TableExists(self.bucket.name, self.name, table_name)
             else:
                 return current
-        self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid)
+        if use_external_row_ids_allocation:
+            self.tx._rpc.features.check_external_row_ids_allocation()
+
+        self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid,
+                                      use_external_row_ids_allocation=use_external_row_ids_allocation)
         log.info("Created table: %s", table_name)
         return self.table(table_name)  # type: ignore[return-value]
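
Creating a table with externally allocated row IDs is gated behind a feature check (5.1+ clusters, per the `Features` change in session.py below). A minimal sketch of the new call, with hypothetical names:

    import pyarrow as pa

    columns = pa.schema([('a', pa.int32()), ('b', pa.float64())])
    with session.transaction() as tx:
        s = tx.bucket('mybucket').create_schema('my_schema')
        # the feature check raises if the cluster is older than 5.1
        t = s.create_table('my_table', columns, use_external_row_ids_allocation=True)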
89
93
 
vastdb/session.py CHANGED
@@ -9,10 +9,12 @@ For more details see:
 
 import logging
 import os
+from typing import Optional
 
 import boto3
 
-from . import errors, internal_commands, transaction
+from . import _internal, errors, transaction
+from ._internal import BackoffConfig
 
 log = logging.getLogger()
 
@@ -36,6 +38,10 @@ class Features:
             "Semi-sorted projection enforcement requires 5.1+ VAST release",
             vast_version >= (5, 1))
 
+        self.check_external_row_ids_allocation = self._check(
+            "External row IDs allocation requires 5.1+ VAST release",
+            vast_version >= (5, 1))
+
     def _check(self, msg, supported):
         log.debug("%s (current version is %s): supported=%s", msg, self.vast_version, supported)
         if not supported:
@@ -51,7 +57,10 @@ class Features:
 class Session:
     """VAST database session."""
 
-    def __init__(self, access=None, secret=None, endpoint=None, ssl_verify=True):
+    def __init__(self, access=None, secret=None, endpoint=None,
+                 *,
+                 ssl_verify=True,
+                 backoff_config: Optional[BackoffConfig] = None):
         """Connect to a VAST Database endpoint, using specified credentials."""
         if access is None:
             access = os.environ['AWS_ACCESS_KEY_ID']
@@ -60,9 +69,13 @@ class Session:
         if endpoint is None:
             endpoint = os.environ['AWS_S3_ENDPOINT_URL']
 
-        self.api = internal_commands.VastdbApi(endpoint, access, secret, ssl_verify=ssl_verify)
-        version_tuple = tuple(int(part) for part in self.api.vast_version.split('.'))
-        self.features = Features(version_tuple)
+        self.api = _internal.VastdbApi(
+            endpoint=endpoint,
+            access_key=access,
+            secret_key=secret,
+            ssl_verify=ssl_verify,
+            backoff_config=backoff_config)
+        self.features = Features(self.api.vast_version)
         self.s3 = boto3.client('s3',
                                aws_access_key_id=access,
                                aws_secret_access_key=secret,
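
The constructor is now keyword-only after the credentials, and retry behaviour is configured per session via `BackoffConfig` (re-exported from the renamed `_internal` module). A minimal sketch; `max_tries` is the only field exercised in the tests below, and the credentials/endpoint are placeholders:

    import vastdb
    from vastdb.session import BackoffConfig

    session = vastdb.connect(
        access='AKIA...',
        secret='...',
        endpoint='http://vast-vip-pool:80',
        backoff_config=BackoffConfig(max_tries=3),
    )
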
vastdb/table.py CHANGED
@@ -7,14 +7,13 @@ import queue
 from dataclasses import dataclass, field
 from math import ceil
 from threading import Event
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
-import backoff
 import ibis
 import pyarrow as pa
-import requests
+import urllib3
 
-from . import errors, internal_commands, schema, util
+from . import _internal, errors, schema, util
 
 log = logging.getLogger(__name__)
 
@@ -40,12 +39,6 @@ class TableStats:
     endpoints: Tuple[str, ...] = ()
 
 
-RETRIABLE_ERRORS = (
-    errors.Slowdown,
-    requests.exceptions.ConnectionError,
-)
-
-
 @dataclass
 class QueryConfig:
     """Query execution configiration."""
@@ -80,8 +73,10 @@ class QueryConfig:
     # used for worker threads' naming
     query_id: str = ""
 
-    # allows retrying QueryData when the server is overloaded
-    backoff_func: Any = field(default=backoff.on_exception(backoff.expo, RETRIABLE_ERRORS, max_tries=10))
+    # non-negative integer, used for server-side prioritization of queued requests:
+    # - requests with lower values will be served before requests with higher values.
+    # - if unset, the request will be added to the queue's end.
+    queue_priority: Optional[int] = None
 
 
 @dataclass
@@ -102,42 +97,58 @@ class SelectSplitState:
         self.query_data_request = query_data_request
         self.table = table
 
-    def batches(self, api: internal_commands.VastdbApi):
-        """Execute QueryData request, and yield parsed RecordBatch objects.
+    def process_split(self, api: _internal.VastdbApi, record_batches_queue: queue.Queue[pa.RecordBatch], check_stop: Callable):
+        """Execute a sequence of QueryData requests, and queue the parsed RecordBatch objects.
 
-        Can be called repeatedly, to allow pagination.
+        Can be called repeatedly, to support resuming the query after a disconnection / retriable error.
         """
-        while not self.done:
-            query_with_backoff = self.config.backoff_func(api.query_data)
-            response = query_with_backoff(
-                bucket=self.table.bucket.name,
-                schema=self.table.schema.name,
-                table=self.table.name,
-                params=self.query_data_request.serialized,
-                split=(self.split_id, self.config.num_splits, self.config.num_row_groups_per_sub_split),
-                num_sub_splits=self.config.num_sub_splits,
-                response_row_id=False,
-                txid=self.table.tx.txid,
-                limit_rows=self.config.limit_rows_per_sub_split,
-                sub_split_start_row_ids=self.subsplits_state.items(),
-                enable_sorted_projections=self.config.use_semi_sorted_projections,
-                query_imports_table=self.table._imports_table,
-                projection=self.config.semi_sorted_projection_name)
-            pages_iter = internal_commands.parse_query_data_response(
-                conn=response.raw,
-                schema=self.query_data_request.response_schema,
-                start_row_ids=self.subsplits_state,
-                parser=self.query_data_request.response_parser)
-
-            for page in pages_iter:
-                for batch in page.to_batches():
-                    if len(batch) > 0:
-                        yield batch
+        try:
+            # contains RecordBatch parts received from the server, must be re-created in case of a retry
+            while not self.done:
+                # raises if request parsing fails or throttled due to server load, and will be externally retried
+                response = api.query_data(
+                    bucket=self.table.bucket.name,
+                    schema=self.table.schema.name,
+                    table=self.table.name,
+                    params=self.query_data_request.serialized,
+                    split=(self.split_id, self.config.num_splits, self.config.num_row_groups_per_sub_split),
+                    num_sub_splits=self.config.num_sub_splits,
+                    response_row_id=False,
+                    txid=self.table.tx.txid,
+                    limit_rows=self.config.limit_rows_per_sub_split,
+                    sub_split_start_row_ids=self.subsplits_state.items(),
+                    schedule_id=self.config.queue_priority,
+                    enable_sorted_projections=self.config.use_semi_sorted_projections,
+                    query_imports_table=self.table._imports_table,
+                    projection=self.config.semi_sorted_projection_name)
+
+                # can raise during response parsing (e.g. due to disconnections), and will be externally retried
+                # the pagination state is stored in `self.subsplits_state` and must be correct in case of a reconnection
+                # the partial RecordBatch chunks are managed internally in `parse_query_data_response`
+                response_iter = _internal.parse_query_data_response(
+                    conn=response.raw,
+                    schema=self.query_data_request.response_schema,
+                    parser=self.query_data_request.response_parser)
+
+                for stream_id, next_row_id, table_chunk in response_iter:
+                    # in case of I/O error, `response_iter` will be closed and an appropriate exception will be thrown.
+                    self.subsplits_state[stream_id] = next_row_id
+                    # we have parsed a pyarrow.Table successfully, self.subsplits_state is now correctly updated
+                    # if the below loop fails, the query is not retried
+                    for batch in table_chunk.to_batches():
+                        check_stop()  # may raise StoppedException to early-exit the query (without retries)
+                        if batch:
+                            record_batches_queue.put(batch)
+        except urllib3.exceptions.ProtocolError as err:
+            log.warning("Failed parsing QueryData response table=%r split=%s/%s offsets=%s cause=%s",
+                        self.table, self.split_id, self.config.num_splits, self.subsplits_state, err)
+            # since this is a read-only idempotent operation, it is safe to retry
+            raise errors.ConnectionError(cause=err, may_retry=True)
 
     @property
     def done(self):
         """Returns true iff the pagination over."""
-        return all(row_id == internal_commands.TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
+        return all(row_id == _internal.TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
 
 
 @dataclass
@@ -187,14 +198,14 @@ class Table:
         """Get a specific semi-sorted projection of this table."""
         if self._imports_table:
             raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
-        projs = self.projections(projection_name=name)
+        projs = tuple(self.projections(projection_name=name))
         if not projs:
             raise errors.MissingProjection(self.bucket.name, self.schema.name, self.name, name)
         assert len(projs) == 1, f"Expected to receive only a single projection, but got: {len(projs)}. projections: {projs}"
         log.debug("Found projection: %s", projs[0])
         return projs[0]
 
-    def projections(self, projection_name=None) -> List["Projection"]:
+    def projections(self, projection_name=None) -> Iterable["Projection"]:
         """List all semi-sorted projections of this table."""
         if self._imports_table:
             raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
@@ -214,7 +225,7 @@ class Table:
                 break
         return [_parse_projection_info(projection, self) for projection in projections]
 
-    def import_files(self, files_to_import: List[str], config: Optional[ImportConfig] = None) -> None:
+    def import_files(self, files_to_import: Iterable[str], config: Optional[ImportConfig] = None) -> None:
         """Import a list of Parquet files into this table.
 
         The files must be on VAST S3 server and be accessible using current credentials.
@@ -283,7 +294,7 @@ class Table:
                 max_workers=config.import_concurrency, thread_name_prefix='import_thread') as pool:
             try:
                 for endpoint in endpoints:
-                    session = internal_commands.VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
+                    session = _internal.VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
                     futures.append(pool.submit(import_worker, files_queue, session))
 
                 log.debug("Waiting for import workers to finish")
@@ -316,10 +327,15 @@ class Table:
         if config is None:
             config = QueryConfig()
 
-        # Take a snapshot of enpoints
-        stats = self.get_stats()
-        log.debug("stats: %s", stats)
-        endpoints = stats.endpoints if config.data_endpoints is None else config.data_endpoints
+        # Retrieve snapshots only if needed
+        if config.data_endpoints is None or config.num_splits is None:
+            stats = self.get_stats()
+            log.debug("stats: %s", stats)
+
+        if config.data_endpoints is None:
+            endpoints = stats.endpoints
+        else:
+            endpoints = tuple(config.data_endpoints)
         log.debug("endpoints: %s", endpoints)
 
         if config.num_splits is None:
@@ -342,13 +358,13 @@
         if predicate is True:
             predicate = None
         if predicate is False:
-            response_schema = internal_commands.get_response_schema(schema=query_schema, field_names=columns)
+            response_schema = _internal.get_response_schema(schema=query_schema, field_names=columns)
             return pa.RecordBatchReader.from_batches(response_schema, [])
 
         if isinstance(predicate, ibis.common.deferred.Deferred):
             predicate = predicate.resolve(self._ibis_table)  # may raise if the predicate is invalid (e.g. wrong types / missing column)
 
-        query_data_request = internal_commands.build_query_data_request(
+        query_data_request = _internal.build_query_data_request(
             schema=query_schema,
             predicate=predicate,
             field_names=columns)
@@ -376,7 +392,8 @@
 
         def single_endpoint_worker(endpoint: str):
             try:
-                host_api = internal_commands.VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
+                host_api = _internal.VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
+                backoff_decorator = self.tx._rpc.api._backoff_decorator
                 while True:
                     check_stop()
                     try:
@@ -390,9 +407,9 @@
                             split_id=split,
                             config=config)
 
-                        for batch in split_state.batches(host_api):
-                            check_stop()
-                            record_batches_queue.put(batch)
+                        process_with_retries = backoff_decorator(split_state.process_split)
+                        process_with_retries(host_api, record_batches_queue, check_stop)
+
                     except StoppedException:
                         log.debug("stop signal.", exc_info=True)
                         return
@@ -473,7 +490,7 @@
         for slice in serialized_slices:
             res = self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
                                                txid=self.tx.txid)
-            (batch,) = pa.RecordBatchStreamReader(res.raw)
+            (batch,) = pa.RecordBatchStreamReader(res.content)
             row_ids.append(batch[INTERNAL_ROW_ID])
         try:
             self.tx._rpc.features.check_return_row_ids()
@@ -509,6 +526,8 @@
         else:
             update_rows_rb = rows
 
+        update_rows_rb = util.sort_record_batch_if_needed(update_rows_rb, INTERNAL_ROW_ID)
+
         serialized_slices = util.iter_serialized_slices(update_rows_rb, MAX_ROWS_PER_BATCH)
         for slice in serialized_slices:
             self.tx._rpc.api.update_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
@@ -528,6 +547,8 @@
         delete_rows_rb = pa.record_batch(schema=pa.schema([(INTERNAL_ROW_ID, pa.uint64())]),
                                          data=[_combine_chunks(rows_chunk)])
 
+        delete_rows_rb = util.sort_record_batch_if_needed(delete_rows_rb, INTERNAL_ROW_ID)
+
         serialized_slices = util.iter_serialized_slices(delete_rows_rb, MAX_ROWS_PER_BATCH)
         for slice in serialized_slices:
             self.tx._rpc.api.delete_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
@@ -593,7 +614,7 @@
         return self.imports_table()  # type: ignore[return-value]
 
     def imports_table(self) -> Optional["Table"]:
-        """Get the imports table under of this table."""
+        """Get the imports table of this table."""
         self.tx._rpc.features.check_imports_table()
         return Table(name=self.name, schema=self.schema, handle=int(self.handle), stats=self.stats, _imports_table=True)
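
With this change, retries are driven by the session-level backoff decorator instead of a per-query `backoff_func`, and `QueryConfig.queue_priority` is forwarded as the QueryData `schedule_id`. A minimal sketch of a prioritized select, mirroring the `test_select_with_priority` test further below (bucket/schema/table names are hypothetical):

    from vastdb.table import QueryConfig

    config = QueryConfig()
    config.queue_priority = 0   # lower values are served first; None appends to the queue's end
    with session.transaction() as tx:
        t = tx.bucket('mybucket').schema('s').table('t')
        result = t.select(columns=['a'], config=config).read_all()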
 
@@ -56,6 +56,6 @@ def test_closed_tx(session, clean_bucket_name):
         res = conn.execute('SELECT a FROM batches')
         log.debug("closing tx=%s after first batch=%s", t.tx, first)
 
-    # transaction is closed, collecting the result should fail
-    with pytest.raises(duckdb.InvalidInputException, match="Detail: Python exception: MissingTransaction"):
+    # transaction is closed, collecting the result should fail internally in DuckDB
+    with pytest.raises(duckdb.InvalidInputException):
         res.arrow()
@@ -105,7 +105,11 @@ def test_query_data_with_projection(session, clean_bucket_name):
         t = s.table(table_name)
         projection_actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's'], predicate=(t['b'] < 5), config=config))
         # no projection supply - need to be with p1 projeciton
-        assert expected_projection_p1 == projection_actual
+        # doing this since we also run this test against production clusters
+        if expected_projection_p1 != projection_actual:
+            config.num_row_groups_per_sub_split = 8
+            projection_actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's'], predicate=(t['b'] < 5), config=config))
+        assert expected_projection_p1 == projection_actual
 
         config.semi_sorted_projection_name = 'p1'
         projection_actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's'], predicate=(t['b'] < 5), config=config))
@@ -5,9 +5,8 @@ from http.server import BaseHTTPRequestHandler, HTTPServer
 from itertools import cycle
 
 import pytest
-import requests
 
-import vastdb
+import vastdb.errors
 
 log = logging.getLogger(__name__)
 
@@ -25,8 +24,9 @@ def test_bad_credentials(session):
 
 
 def test_bad_endpoint(session):
-    with pytest.raises(requests.exceptions.ConnectionError):
-        vastdb.connect(access='BAD', secret='BAD', endpoint='http://invalid-host-name-for-tests:12345')
+    backoff_config = vastdb.session.BackoffConfig(max_tries=3)
+    with pytest.raises(vastdb.errors.ConnectionError):
+        vastdb.connect(access='BAD', secret='BAD', endpoint='http://invalid-host-name-for-tests:12345', backoff_config=backoff_config)
 
 
 def test_version_extraction():
@@ -36,7 +36,7 @@ def test_version_extraction():
         ("5", None),  # major
         ("5.2", None),  # major.minor
         ("5.2.0", None),  # major.minor.patch
-        ("5.2.0.10", "5.2.0.10"),  # major.minor.patch.protocol
+        ("5.2.0.10", (5, 2, 0, 10)),  # major.minor.patch.protocol
         ("5.2.0.10 some other things", None),  # suffix
         ("5.2.0.10.20", None),  # extra version
     ]
@@ -58,7 +58,7 @@ def test_tables(session, clean_bucket_name):
         }
 
         columns_to_delete = pa.schema([(INTERNAL_ROW_ID, pa.uint64())])
-        rb = pa.record_batch(schema=columns_to_delete, data=[[0]])  # delete rows 0,1
+        rb = pa.record_batch(schema=columns_to_delete, data=[[0]])  # delete row 0
         t.delete(rb)
 
         selected_rows = t.select(columns=['b'], predicate=(t['a'] == 222), internal_row_id=True).read_all()
@@ -81,6 +81,19 @@ def test_insert_wide_row(session, clean_bucket_name):
     assert actual == expected
 
 
+def test_insert_empty(session, clean_bucket_name):
+    columns = pa.schema([('a', pa.int8()), ('b', pa.float32())])
+    data = [[None] * 5, [None] * 5]
+    all_nulls = pa.table(schema=columns, data=data)
+    no_columns = all_nulls.select([])
+
+    with session.transaction() as tx:
+        t = tx.bucket(clean_bucket_name).create_schema('s').create_table('t', columns)
+        t.insert(all_nulls)
+        with pytest.raises(errors.NotImplemented):
+            t.insert(no_columns)
+
+
 def test_exists(session, clean_bucket_name):
     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).create_schema('s1')
@@ -156,6 +169,27 @@ def test_update_table(session, clean_bucket_name):
             'b': [0.5, 1.5, 2.5]
         }
 
+        # test update for not sorted rows:
+        rb = pa.record_batch(schema=columns_to_update, data=[
+            [2, 0],  # update rows 0,2
+            [231, 235]
+        ])
+        t.update(rb)
+        actual = t.select(columns=['a', 'b']).read_all()
+        assert actual.to_pydict() == {
+            'a': [235, 2222, 231],
+            'b': [0.5, 1.5, 2.5]
+        }
+
+        # test delete for not sorted rows:
+        rb = pa.record_batch(schema=pa.schema([(INTERNAL_ROW_ID, pa.uint64())]), data=[[2, 0]])
+        t.delete(rb)
+        actual = t.select(columns=['a', 'b']).read_all()
+        assert actual.to_pydict() == {
+            'a': [2222],
+            'b': [1.5]
+        }
+
 
 def test_select_with_multisplits(session, clean_bucket_name):
     columns = pa.schema([
@@ -174,6 +208,25 @@ def test_select_with_multisplits(session, clean_bucket_name):
     assert actual == expected
 
 
+def test_select_with_priority(session, clean_bucket_name):
+    columns = pa.schema([
+        ('a', pa.int32())
+    ])
+    expected = pa.table(schema=columns, data=[range(100)])
+    with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+        config = QueryConfig()
+
+        config.queue_priority = 0
+        assert t.select(config=config).read_all() == expected
+
+        config.queue_priority = 12345
+        assert t.select(config=config).read_all() == expected
+
+        config.queue_priority = -1
+        with pytest.raises(errors.BadRequest):
+            t.select(config=config).read_all()
+
+
 def test_types(session, clean_bucket_name):
     columns = pa.schema([
         ('tb', pa.bool_()),
vastdb/tests/test_util.py CHANGED
@@ -33,6 +33,12 @@ def test_wide_row():
         list(util.iter_serialized_slices(t))
 
 
+def test_expand_ip_ranges():
+    endpoints = ["http://172.19.101.1-3"]
+    expected = ["http://172.19.101.1", "http://172.19.101.2", "http://172.19.101.3"]
+    assert util.expand_ip_ranges(endpoints) == expected
+
+
 def _parse(bufs):
     for buf in bufs:
         with pa.ipc.open_stream(buf) as reader:
vastdb/transaction.py CHANGED
@@ -8,7 +8,7 @@ A transcation is used as a context manager, since every Database-related operati
 
 import logging
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Iterable, Optional
 
 import botocore
 
@@ -72,7 +72,7 @@ class Transaction:
             raise
         return bucket.Bucket(name, self)
 
-    def catalog_snapshots(self) -> List["Bucket"]:
+    def catalog_snapshots(self) -> Iterable["Bucket"]:
         """Return VAST Catalog bucket snapshots."""
         return bucket.Bucket(VAST_CATALOG_BUCKET_NAME, self).snapshots()
 
vastdb/util.py CHANGED
@@ -1,7 +1,9 @@
 import logging
+import re
 from typing import TYPE_CHECKING, Callable, List, Optional, Union
 
 import pyarrow as pa
+import pyarrow.compute as pc
 import pyarrow.parquet as pq
 
 from .errors import InvalidArgument, TooWideRow
@@ -88,8 +90,11 @@ MAX_QUERY_DATA_REQUEST_SIZE = int(0.9 * MAX_TABULAR_REQUEST_SIZE)
 
 def iter_serialized_slices(batch: Union[pa.RecordBatch, pa.Table], max_rows_per_slice=None):
     """Iterate over a list of record batch slices."""
+    if batch.nbytes:
+        rows_per_slice = int(0.9 * len(batch) * MAX_RECORD_BATCH_SLICE_SIZE / batch.nbytes)
+    else:
+        rows_per_slice = len(batch)  # if the batch has no buffers (no rows/columns)
 
-    rows_per_slice = int(0.9 * len(batch) * MAX_RECORD_BATCH_SLICE_SIZE / batch.nbytes)
     if max_rows_per_slice is not None:
         rows_per_slice = min(rows_per_slice, max_rows_per_slice)
 
@@ -113,3 +118,37 @@ def serialize_record_batch(batch: Union[pa.RecordBatch, pa.Table]):
     with pa.ipc.new_stream(sink, batch.schema) as writer:
         writer.write(batch)
     return sink.getvalue()
+
+
+def expand_ip_ranges(endpoints):
+    """Expands endpoint strings that include an IP range in the format 'http://172.19.101.1-16'."""
+    expanded_endpoints = []
+    pattern = re.compile(r"(http://\d+\.\d+\.\d+)\.(\d+)-(\d+)")
+
+    for endpoint in endpoints:
+        match = pattern.match(endpoint)
+        if match:
+            base_url = match.group(1)
+            start_ip = int(match.group(2))
+            end_ip = int(match.group(3))
+            if start_ip > end_ip:
+                raise ValueError("Start IP cannot be greater than end IP in the range.")
+            expanded_endpoints.extend(f"{base_url}.{ip}" for ip in range(start_ip, end_ip + 1))
+        else:
+            expanded_endpoints.append(endpoint)
+    return expanded_endpoints
+
+
+def is_sorted(arr):
+    """Check if the array is sorted."""
+    return pc.all(pc.greater(arr[1:], arr[:-1])).as_py()
+
+
+def sort_record_batch_if_needed(record_batch, sort_column):
+    """Sort the RecordBatch by the specified column if it is not already sorted."""
+    column_data = record_batch[sort_column]
+
+    if not is_sorted(column_data):
+        return record_batch.sort_by(sort_column)
+    else:
+        return record_batch
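
`update()` and `delete()` in table.py now pass their row-id batches through `sort_record_batch_if_needed` before slicing. A small illustrative usage with made-up column names (the helpers themselves are from this diff):

    import pyarrow as pa

    from vastdb import util

    rb = pa.record_batch(schema=pa.schema([('row_id', pa.uint64()), ('v', pa.int32())]),
                         data=[[2, 0, 1], [20, 0, 10]])
    sorted_rb = util.sort_record_batch_if_needed(rb, 'row_id')
    assert util.is_sorted(sorted_rb['row_id'])  # strictly increasing row IDs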