vastdb 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vastdb/bench/test_perf.py CHANGED
@@ -1,7 +1,6 @@
  import logging
  import time

- import pyarrow as pa
  import pytest

  from vastdb import util
@@ -20,7 +19,7 @@ def test_bench(session, clean_bucket_name, parquets_path, crater_path):
  t = util.create_table_from_files(s, 't1', files, config=ImportConfig(import_concurrency=8))
  config = QueryConfig(num_splits=8, num_sub_splits=4)
  s = time.time()
- pa_table = pa.Table.from_batches(t.select(columns=['sid'], predicate=t['sid'] == 10033007, config=config))
+ pa_table = t.select(columns=['sid'], predicate=t['sid'] == 10033007, config=config).read_all()
  e = time.time()
  log.info("'SELECT sid from TABLE WHERE sid = 10033007' returned in %s seconds.", e - s)
  if crater_path:
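
Note: throughout 0.1.6, the object returned by Table.select() exposes read_all(), so callers materialize query results directly instead of wrapping the streamed batches in pa.Table.from_batches(...). A before/after sketch, assuming t is an open vastdb table handle inside a transaction:

    import pyarrow as pa

    # 0.1.5 style: collect the streamed record batches by hand
    result = pa.Table.from_batches(t.select(columns=['sid'], predicate=t['sid'] == 10033007))

    # 0.1.6 style: read_all() materializes the same pyarrow.Table from the reader
    result = t.select(columns=['sid'], predicate=t['sid'] == 10033007).read_all()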
vastdb/bucket.py CHANGED
@@ -5,7 +5,7 @@ It is possible to list and access VAST snapshots generated over a bucket.
  """

  import logging
- from dataclasses import dataclass
+ from dataclasses import dataclass, field
  from typing import TYPE_CHECKING, List, Optional

  from . import errors, schema, transaction
@@ -22,48 +22,23 @@ class Bucket:

  name: str
  tx: "transaction.Transaction"
+ _root_schema: "Schema" = field(init=False, compare=False, repr=False)

- def create_schema(self, path: str, fail_if_exists=True) -> "Schema":
+ def __post_init__(self):
+ """Root schema is empty."""
+ self._root_schema = schema.Schema(name="", bucket=self)
+
+ def create_schema(self, name: str, fail_if_exists=True) -> "Schema":
  """Create a new schema (a container of tables) under this bucket."""
- if current := self.schema(path, fail_if_missing=False):
- if fail_if_exists:
- raise errors.SchemaExists(self.name, path)
- else:
- return current
- self.tx._rpc.api.create_schema(self.name, path, txid=self.tx.txid)
- log.info("Created schema: %s", path)
- return self.schema(path) # type: ignore[return-value]
+ return self._root_schema.create_schema(name=name, fail_if_exists=fail_if_exists)

- def schema(self, path: str, fail_if_missing=True) -> Optional["Schema"]:
+ def schema(self, name: str, fail_if_missing=True) -> Optional["Schema"]:
  """Get a specific schema (a container of tables) under this bucket."""
- s = self.schemas(path)
- log.debug("schema: %s", s)
- if not s:
- if fail_if_missing:
- raise errors.MissingSchema(self.name, path)
- else:
- return None
- assert len(s) == 1, f"Expected to receive only a single schema, but got: {len(s)}. ({s})"
- log.debug("Found schema: %s", s[0].name)
- return s[0]
+ return self._root_schema.schema(name=name, fail_if_missing=fail_if_missing)

- def schemas(self, name: Optional[str] = None) -> List["Schema"]:
+ def schemas(self, batch_size=None):
  """List bucket's schemas."""
- schemas = []
- next_key = 0
- exact_match = bool(name)
- log.debug("list schemas param: schema=%s, exact_match=%s", name, exact_match)
- while True:
- _bucket_name, curr_schemas, next_key, is_truncated, _ = \
- self.tx._rpc.api.list_schemas(bucket=self.name, next_key=next_key, txid=self.tx.txid,
- name_prefix=name, exact_match=exact_match)
- if not curr_schemas:
- break
- schemas.extend(curr_schemas)
- if not is_truncated:
- break
-
- return [schema.Schema(name=name, bucket=self) for name, *_ in schemas]
+ return self._root_schema.schemas(batch_size=batch_size)

  def snapshot(self, name, fail_if_missing=True) -> Optional["Bucket"]:
  """Get snapshot by name (if exists)."""
vastdb/conftest.py CHANGED
@@ -30,14 +30,23 @@ def test_bucket_name(request):
  return request.config.getoption("--tabular-bucket-name")


+ def iter_schemas(s):
+ """Recusively scan all schemas."""
+ children = s.schemas()
+ for c in children:
+ yield from iter_schemas(c)
+ yield s
+
+
  @pytest.fixture(scope="function")
  def clean_bucket_name(request, test_bucket_name, session):
  with session.transaction() as tx:
  b = tx.bucket(test_bucket_name)
- for s in b.schemas():
- for t in s.tables():
- t.drop()
- s.drop()
+ for top_schema in b.schemas():
+ for s in iter_schemas(top_schema):
+ for t in s.tables():
+ t.drop()
+ s.drop()
  return test_bucket_name

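Note: iter_schemas yields each schema only after all of its descendants, so the fixture drops leaf schemas before their parents. A small standalone sketch of that traversal order (the FakeSchema stub is illustrative only, it merely mimics the .schemas() shape used above):

    class FakeSchema:
        def __init__(self, name, children=()):
            self.name, self._children = name, list(children)
        def schemas(self):
            return self._children

    def iter_schemas(s):
        for c in s.schemas():
            yield from iter_schemas(c)
        yield s   # parent comes last, after all children

    root = FakeSchema('s1', [FakeSchema('s1/s2'), FakeSchema('s1/s3', [FakeSchema('s1/s3/s4')])])
    print([x.name for x in iter_schemas(root)])
    # ['s1/s2', 's1/s3/s4', 's1/s3', 's1'] -- children first, so drops cascade safely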
 
vastdb/errors.py CHANGED
@@ -89,7 +89,11 @@ class InvalidArgument(Exception):
  pass


- class TooWideRow(InvalidArgument):
+ class TooLargeRequest(InvalidArgument):
+ pass
+
+
+ class TooWideRow(TooLargeRequest):
  pass

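Note: TooWideRow is re-parented under the new, broader TooLargeRequest error, which Table.select now raises when the serialized QueryData request exceeds the size limit (see the table.py hunk below). A hedged sketch of catching it, assuming t is an open vastdb table handle:

    from vastdb import errors

    try:
        # e.g. a predicate with a very large isin() list can exceed the request size limit
        result = t.select(predicate=t['a'].isin(list(range(100000)))).read_all()
    except errors.TooLargeRequest as e:
        # TooWideRow is also caught here, since it now subclasses TooLargeRequest
        print('request too large:', e)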
 
vastdb/internal_commands.py CHANGED
@@ -6,7 +6,6 @@ import struct
  import urllib.parse
  from collections import defaultdict, namedtuple
  from enum import Enum
- from ipaddress import IPv4Address, IPv6Address
  from typing import Any, Dict, Iterator, List, Optional, Union

  import flatbuffers
@@ -1031,25 +1030,7 @@ class VastdbApi:
  num_rows = stats.NumRows()
  size_in_bytes = stats.SizeInBytes()
  is_external_rowid_alloc = stats.IsExternalRowidAlloc()
- endpoints = []
- if stats.VipsLength() == 0:
- endpoints.append(self.url)
- else:
- url = urllib3.util.parse_url(self.url)
-
- ip_cls = IPv6Address if (stats.AddressType() == "ipv6") else IPv4Address
- vips = [stats.Vips(i) for i in range(stats.VipsLength())]
- ips = []
- # extract the vips into list of IPs
- for vip in vips:
- start_ip = int(ip_cls(vip.StartAddress().decode()))
- ips.extend(ip_cls(start_ip + i) for i in range(vip.AddressCount()))
- # build a list of endpoint URLs, reusing schema and port (if specified when constructing the session).
- # it is assumed that the client can access the returned IPs (e.g. if they are part of the VIP pool).
- for ip in ips:
- d = url._asdict()
- d['host'] = str(ip)
- endpoints.append(str(urllib3.util.Url(**d)))
+ endpoints = [self.url] # we cannot replace the host by a VIP address in HTTPS-based URLs
  return TableStatsResult(num_rows, size_in_bytes, is_external_rowid_alloc, tuple(endpoints))

  def alter_table(self, bucket, schema, name, txid=0, client_tags=[], table_properties="",
vastdb/schema.py CHANGED
@@ -31,6 +31,51 @@ class Schema:
  """VAST transaction used for this schema."""
  return self.bucket.tx

+ def _subschema_full_name(self, name: str) -> str:
+ return f"{self.name}/{name}" if self.name else name
+
+ def create_schema(self, name: str, fail_if_exists=True) -> "Schema":
+ """Create a new schema (a container of tables) under this schema."""
+ if current := self.schema(name, fail_if_missing=False):
+ if fail_if_exists:
+ raise errors.SchemaExists(self.bucket.name, name)
+ else:
+ return current
+ full_name = self._subschema_full_name(name)
+ self.tx._rpc.api.create_schema(self.bucket.name, full_name, txid=self.tx.txid)
+ log.info("Created schema: %s", full_name)
+ return self.schema(name) # type: ignore[return-value]
+
+ def schema(self, name: str, fail_if_missing=True) -> Optional["Schema"]:
+ """Get a specific schema (a container of tables) under this schema."""
+ _bucket_name, schemas, _next_key, _is_truncated, _ = \
+ self.tx._rpc.api.list_schemas(bucket=self.bucket.name, schema=self.name, next_key=0, txid=self.tx.txid,
+ name_prefix=name, exact_match=True, max_keys=1)
+ names = [name for name, *_ in schemas]
+ log.debug("Found schemas: %s", names)
+ if not names:
+ if fail_if_missing:
+ raise errors.MissingSchema(self.bucket.name, self._subschema_full_name(name))
+ else:
+ return None
+
+ assert len(names) == 1, f"Expected to receive only a single schema, but got {len(schemas)}: ({schemas})"
+ return schema.Schema(name=self._subschema_full_name(names[0]), bucket=self.bucket)
+
+ def schemas(self, batch_size=None) -> List["Schema"]:
+ """List child schemas."""
+ next_key = 0
+ if not batch_size:
+ batch_size = 1000
+ result: List["Schema"] = []
+ while True:
+ _bucket_name, curr_schemas, next_key, is_truncated, _ = \
+ self.tx._rpc.api.list_schemas(bucket=self.bucket.name, schema=self.name, next_key=next_key, max_keys=batch_size, txid=self.tx.txid)
+ result.extend(schema.Schema(name=self._subschema_full_name(name), bucket=self.bucket) for name, *_ in curr_schemas)
+ if not is_truncated:
+ break
+ return result
+
  def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "Table":
  """Create a new table under this schema."""
  if current := self.table(table_name, fail_if_missing=False):
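
Note: with these methods on Schema itself, schemas can now be nested and listed with pagination (see the new test_schemas.py tests below). A short usage sketch under those assumptions, where tx is an open transaction and the bucket name is a placeholder:

    b = tx.bucket('examples')                   # placeholder bucket name
    s1 = b.create_schema('s1')                  # top-level schema, child of the bucket's hidden root schema
    s2 = s1.create_schema('s2')                 # nested schema, addressed internally as 's1/s2'

    assert s1.schema('s2') == s2                # lookup is relative to the parent schema
    assert s1.schemas(batch_size=100) == [s2]   # listing pages through list_schemas via max_keys

    s2.drop()                                   # drop children before their parent
    s1.drop()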
vastdb/session.py CHANGED
@@ -7,12 +7,15 @@ For more details see:
  - [Tabular identity policy with the proper permissions](https://support.vastdata.com/s/article/UUID-14322b60-d6a2-89ac-3df0-3dfbb6974182)
  """

+ import logging
  import os

  import boto3

  from . import errors, internal_commands, transaction

+ log = logging.getLogger()
+

  class Features:
  """VAST database features - check if server is already support a feature."""
@@ -21,15 +24,28 @@ class Features:
  """Save the server version."""
  self.vast_version = vast_version

- def check_imports_table(self):
- """Check if the feature that support imports table is supported."""
- if self.vast_version < (5, 2):
- raise errors.NotSupportedVersion("import_table requires 5.2+", self.vast_version)
+ self.check_imports_table = self._check(
+ "Imported objects' table feature requires 5.2+ VAST release",
+ vast_version >= (5, 2))
+
+ self.check_return_row_ids = self._check(
+ "Returning row IDs requires 5.1+ VAST release",
+ vast_version >= (5, 1))
+
+ self.check_enforce_semisorted_projection = self._check(
+ "Semi-sorted projection enforcement requires 5.1+ VAST release",
+ vast_version >= (5, 1))
+
+ def _check(self, msg, supported):
+ log.debug("%s (current version is %s): supported=%s", msg, self.vast_version, supported)
+ if not supported:
+ def fail():
+ raise errors.NotSupportedVersion(msg, self.vast_version)
+ return fail

- def check_return_row_ids(self):
- """Check if insert/update/delete can return the row_ids."""
- if self.vast_version < (5, 1):
- raise errors.NotSupportedVersion("return_row_ids requires 5.1+", self.vast_version)
+ def noop():
+ pass
+ return noop


  class Session:
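
Note: feature checks are now built once in Features.__init__: _check() returns either a no-op or a callable that raises NotSupportedVersion, so call sites simply invoke the attribute. A minimal standalone sketch of the same closure pattern (class and names below are illustrative, not the SDK's API):

    class NotSupported(Exception):
        pass

    class Features:
        def __init__(self, version):
            self.version = version
            # each check is either a no-op or a function that raises when called
            self.check_row_ids = self._check("row IDs require 5.1+", version >= (5, 1))

        def _check(self, msg, supported):
            if not supported:
                def fail():
                    raise NotSupported(msg, self.version)
                return fail
            def noop():
                pass
            return noop

    Features((5, 2)).check_row_ids()    # no-op on a new enough server
    # Features((5, 0)).check_row_ids() # would raise NotSupported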
vastdb/table.py CHANGED
@@ -54,7 +54,8 @@ class QueryConfig:
  num_sub_splits: int = 4

  # used to split the table into disjoint subsets of rows, to be processed concurrently using multiple RPCs
- num_splits: int = 1
+ # will be estimated from the table's row count, if not explicitly set
+ num_splits: Optional[int] = None

  # each endpoint will be handled by a separate worker thread
  # a single endpoint can be specified more than once to benefit from multithreaded execution
@@ -64,12 +65,15 @@ class QueryConfig:
  limit_rows_per_sub_split: int = 128 * 1024

  # each fiber will read the following number of rowgroups coninuously before skipping
- # in order to use semi-sorted projections this value must be 8
+ # in order to use semi-sorted projections this value must be 8 (this is the hard coded size of a row groups per row block).
  num_row_groups_per_sub_split: int = 8

  # can be disabled for benchmarking purposes
  use_semi_sorted_projections: bool = True

+ # enforce using a specific semi-sorted projection (if enabled above)
+ semi_sorted_projection_name: Optional[str] = None
+
  # used to estimate the number of splits, given the table rows' count
  rows_per_split: int = 4000000

@@ -117,7 +121,8 @@ class SelectSplitState:
  limit_rows=self.config.limit_rows_per_sub_split,
  sub_split_start_row_ids=self.subsplits_state.items(),
  enable_sorted_projections=self.config.use_semi_sorted_projections,
- query_imports_table=self.table._imports_table)
+ query_imports_table=self.table._imports_table,
+ projection=self.config.semi_sorted_projection_name)
  pages_iter = internal_commands.parse_query_data_response(
  conn=response.raw,
  schema=self.query_data_request.response_schema,
@@ -313,11 +318,16 @@ class Table:

  # Take a snapshot of enpoints
  stats = self.get_stats()
+ log.debug("stats: %s", stats)
  endpoints = stats.endpoints if config.data_endpoints is None else config.data_endpoints
+ log.debug("endpoints: %s", endpoints)
+
+ if config.num_splits is None:
+ config.num_splits = max(1, stats.num_rows // config.rows_per_split)
+ log.debug("config: %s", config)

- if stats.num_rows > config.rows_per_split and config.num_splits is None:
- config.num_splits = stats.num_rows // config.rows_per_split
- log.debug(f"num_rows={stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")
+ if config.semi_sorted_projection_name:
+ self.tx._rpc.features.check_enforce_semisorted_projection()

  if columns is None:
  columns = [f.name for f in self.arrow_schema]
@@ -342,6 +352,8 @@ class Table:
  schema=query_schema,
  predicate=predicate,
  field_names=columns)
+ if len(query_data_request.serialized) > util.MAX_QUERY_DATA_REQUEST_SIZE:
+ raise errors.TooLargeRequest(f"{len(query_data_request.serialized)} bytes")

  splits_queue: queue.Queue[int] = queue.Queue()
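Note: QueryConfig now lets num_splits default to None (it is then estimated as max(1, num_rows // rows_per_split)) and adds semi_sorted_projection_name to force a specific projection, which in turn requires a 5.1+ VAST release per the new feature check. A hedged usage sketch, assuming t is an open vastdb table handle with a projection named 'p1':

    from vastdb.table import QueryConfig

    config = QueryConfig(
        num_sub_splits=4,
        # num_splits is left unset on purpose: select() estimates it from the row count
        semi_sorted_projection_name='p1',   # force projection 'p1' (placeholder name)
    )
    result = t.select(columns=['a', 'b'], predicate=(t['b'] < 5), config=config).read_all()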
 
vastdb/tests/test_imports.py CHANGED
@@ -38,13 +38,13 @@ def test_parallel_imports(session, clean_bucket_name, s3):
  t.create_imports_table()
  log.info("Starting import of %d files", num_files)
  t.import_files(files)
- arrow_table = pa.Table.from_batches(t.select(columns=['num']))
+ arrow_table = t.select(columns=['num']).read_all()
  assert arrow_table.num_rows == num_rows * num_files
- arrow_table = pa.Table.from_batches(t.select(columns=['num'], predicate=t['num'] == 100))
+ arrow_table = t.select(columns=['num'], predicate=t['num'] == 100).read_all()
  assert arrow_table.num_rows == num_files
  import_table = t.imports_table()
  # checking all imports are on the imports table:
- objects_name = pa.Table.from_batches(import_table.select(columns=["ObjectName"]))
+ objects_name = import_table.select(columns=["ObjectName"]).read_all()
  objects_name = objects_name.to_pydict()
  object_names = set(objects_name['ObjectName'])
  prefix = 'prq'
vastdb/tests/test_nested.py CHANGED
@@ -22,13 +22,13 @@ def test_nested_select(session, clean_bucket_name):
  ])

  with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
- actual = pa.Table.from_batches(t.select())
+ actual = t.select().read_all()
  assert actual == expected

  names = [f.name for f in columns]
  for n in range(len(names) + 1):
  for cols in itertools.permutations(names, n):
- actual = pa.Table.from_batches(t.select(columns=cols))
+ actual = t.select(columns=cols).read_all()
  assert actual == expected.select(cols)


@@ -53,7 +53,7 @@ def test_nested_filter(session, clean_bucket_name):
  ])

  with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
- actual = pa.Table.from_batches(t.select())
+ actual = t.select().read_all()
  assert actual == expected

  names = list('xyzw')
@@ -62,7 +62,7 @@ def test_nested_filter(session, clean_bucket_name):
  ibis_predicate = functools.reduce(
  operator.and_,
  (t[col] > 2 for col in cols))
- actual = pa.Table.from_batches(t.select(predicate=ibis_predicate), t.arrow_schema)
+ actual = t.select(predicate=ibis_predicate).read_all()

  arrow_predicate = functools.reduce(
  operator.and_,
vastdb/tests/test_projections.py CHANGED
@@ -1,7 +1,10 @@
  import logging
+ import time

  import pyarrow as pa

+ from vastdb.table import QueryConfig
+
  log = logging.getLogger(__name__)


@@ -41,3 +44,78 @@ def test_basic_projections(session, clean_bucket_name):
  projs = t.projections()
  assert len(projs) == 1
  assert projs[0].name == 'p_new'
+
+
+ def test_query_data_with_projection(session, clean_bucket_name):
+ columns = pa.schema([
+ ('a', pa.int64()),
+ ('b', pa.int64()),
+ ('s', pa.utf8()),
+ ])
+ # need to be large enough in order to consider as projection
+
+ GROUP_SIZE = 128 * 1024
+ expected = pa.table(schema=columns, data=[
+ [i for i in range(GROUP_SIZE)],
+ [i for i in reversed(range(GROUP_SIZE))],
+ [f's{i}' for i in range(GROUP_SIZE)],
+ ])
+
+ expected_projection_p1 = pa.table(schema=columns, data=[
+ [i for i in reversed(range(GROUP_SIZE - 5, GROUP_SIZE))],
+ [i for i in range(5)],
+ [f's{i}' for i in reversed(range(GROUP_SIZE - 5, GROUP_SIZE))],
+ ])
+
+ expected_projection_p2 = pa.table(schema=columns, data=[
+ [i for i in range(GROUP_SIZE - 5, GROUP_SIZE)],
+ [i for i in reversed(range(5))],
+ [f's{i}' for i in range(GROUP_SIZE - 5, GROUP_SIZE)],
+ ])
+
+ schema_name = "schema"
+ table_name = "table"
+ with session.transaction() as tx:
+ s = tx.bucket(clean_bucket_name).create_schema(schema_name)
+ t = s.create_table(table_name, expected.schema)
+
+ sorted_columns = ['b']
+ unsorted_columns = ['a', 's']
+ t.create_projection('p1', sorted_columns, unsorted_columns)
+
+ sorted_columns = ['a']
+ unsorted_columns = ['b', 's']
+ t.create_projection('p2', sorted_columns, unsorted_columns)
+
+ with session.transaction() as tx:
+ s = tx.bucket(clean_bucket_name).schema(schema_name)
+ t = s.table(table_name)
+ t.insert(expected)
+ actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's']))
+ assert actual == expected
+
+ time.sleep(3)
+
+ with session.transaction() as tx:
+ config = QueryConfig()
+ # in nfs mock server num row groups per row block is 1 so need to change this in the config
+ config.num_row_groups_per_sub_split = 1
+
+ s = tx.bucket(clean_bucket_name).schema(schema_name)
+ t = s.table(table_name)
+ projection_actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's'], predicate=(t['b'] < 5), config=config))
+ # no projection supply - need to be with p1 projeciton
+ assert expected_projection_p1 == projection_actual
+
+ config.semi_sorted_projection_name = 'p1'
+ projection_actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's'], predicate=(t['b'] < 5), config=config))
+ # expecting results of projection p1 since we asked it specificaly
+ assert expected_projection_p1 == projection_actual
+
+ config.semi_sorted_projection_name = 'p2'
+ projection_actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's'], predicate=(t['b'] < 5), config=config))
+ # expecting results of projection p2 since we asked it specificaly
+ assert expected_projection_p2 == projection_actual
+
+ t.drop()
+ s.drop()
vastdb/tests/test_schemas.py CHANGED
@@ -61,3 +61,52 @@ def test_list_snapshots(session, clean_bucket_name):
  with session.transaction() as tx:
  b = tx.bucket(clean_bucket_name)
  b.snapshots() # VAST Catalog may create some snapshots
+
+
+ def test_nested_schemas(session, clean_bucket_name):
+ with session.transaction() as tx:
+ b = tx.bucket(clean_bucket_name)
+ s1 = b.create_schema('s1')
+ s1_s2 = s1.create_schema('s2')
+ s1_s3 = s1.create_schema('s3')
+ s1_s3_s4 = s1_s3.create_schema('s4')
+ s5 = b.create_schema('s5')
+
+ assert b.schema('s1') == s1
+ assert s1.schema('s2') == s1_s2
+ assert s1.schema('s3') == s1_s3
+ assert s1_s3.schema('s4') == s1_s3_s4
+ assert b.schema('s5') == s5
+
+ assert b.schemas() == [s1, s5]
+ assert s1.schemas() == [s1_s2, s1_s3]
+ assert s1_s2.schemas() == []
+ assert s1_s3.schemas() == [s1_s3_s4]
+ assert s1_s3_s4.schemas() == []
+ assert s5.schemas() == []
+
+ s1_s3_s4.drop()
+ assert s1_s3.schemas() == []
+ s1_s3.drop()
+ assert s1.schemas() == [s1_s2]
+ s1_s2.drop()
+ assert s1.schemas() == []
+
+ assert b.schemas() == [s1, s5]
+ s1.drop()
+ assert b.schemas() == [s5]
+ s5.drop()
+ assert b.schemas() == []
+
+
+ def test_schema_pagination(session, clean_bucket_name):
+ with session.transaction() as tx:
+ b = tx.bucket(clean_bucket_name)
+ names = [f's{i}' for i in range(10)]
+ schemas = [b.create_schema(name) for name in names]
+ assert b.schemas(batch_size=3) == schemas
+
+ s0 = b.schema('s0')
+ names = [f'q{i}' for i in range(10)]
+ subschemas = [s0.create_schema(name) for name in names]
+ assert s0.schemas(batch_size=3) == subschemas
vastdb/tests/test_tables.py CHANGED
@@ -3,7 +3,6 @@ import decimal
  import logging
  import random
  import threading
- import time
  from contextlib import closing
  from tempfile import NamedTemporaryFile

@@ -33,25 +32,25 @@ def test_tables(session, clean_bucket_name):
  ['a', 'bb', 'ccc'],
  ])
  with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
- actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's']))
+ actual = t.select(columns=['a', 'b', 's']).read_all()
  assert actual == expected

- actual = pa.Table.from_batches(t.select())
+ actual = t.select().read_all()
  assert actual == expected

- actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
+ actual = t.select(columns=['a', 'b']).read_all()
  assert actual == expected.select(['a', 'b'])

- actual = pa.Table.from_batches(t.select(columns=['b', 's', 'a']))
+ actual = t.select(columns=['b', 's', 'a']).read_all()
  assert actual == expected.select(['b', 's', 'a'])

- actual = pa.Table.from_batches(t.select(columns=['s']))
+ actual = t.select(columns=['s']).read_all()
  assert actual == expected.select(['s'])

- actual = pa.Table.from_batches(t.select(columns=[]))
+ actual = t.select(columns=[]).read_all()
  assert actual == expected.select([])

- actual = pa.Table.from_batches(t.select(columns=['s'], internal_row_id=True))
+ actual = t.select(columns=['s'], internal_row_id=True).read_all()
  log.debug("actual=%s", actual)
  assert actual.to_pydict() == {
  's': ['a', 'bb', 'ccc'],
@@ -62,9 +61,9 @@ def test_tables(session, clean_bucket_name):
  rb = pa.record_batch(schema=columns_to_delete, data=[[0]]) # delete rows 0,1
  t.delete(rb)

- selected_rows = pa.Table.from_batches(t.select(columns=['b'], predicate=(t['a'] == 222), internal_row_id=True))
+ selected_rows = t.select(columns=['b'], predicate=(t['a'] == 222), internal_row_id=True).read_all()
  t.delete(selected_rows)
- actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's']))
+ actual = t.select(columns=['a', 'b', 's']).read_all()
  assert actual.to_pydict() == {
  'a': [333],
  'b': [2.5],
@@ -78,7 +77,7 @@ def test_insert_wide_row(session, clean_bucket_name):
  expected = pa.table(schema=columns, data=data)

  with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
- actual = pa.Table.from_batches(t.select())
+ actual = t.select().read_all()
  assert actual == expected


@@ -125,33 +124,33 @@ def test_update_table(session, clean_bucket_name):
  ])

  t.update(rb)
- actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
+ actual = t.select(columns=['a', 'b']).read_all()
  assert actual.to_pydict() == {
  'a': [1110, 222, 3330],
  'b': [0.5, 1.5, 2.5]
  }

- actual = pa.Table.from_batches(t.select(columns=['a', 'b'], predicate=(t['a'] < 1000), internal_row_id=True))
+ actual = t.select(columns=['a', 'b'], predicate=(t['a'] < 1000), internal_row_id=True).read_all()
  column_index = actual.column_names.index('a')
  column_field = actual.field(column_index)
  new_data = pc.add(actual.column('a'), 2000)
  update_table = actual.set_column(column_index, column_field, new_data)

  t.update(update_table, columns=['a'])
- actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
+ actual = t.select(columns=['a', 'b']).read_all()
  assert actual.to_pydict() == {
  'a': [1110, 2222, 3330],
  'b': [0.5, 1.5, 2.5]
  }

- actual = pa.Table.from_batches(t.select(columns=['a', 'b'], predicate=(t['a'] != 2222), internal_row_id=True))
+ actual = t.select(columns=['a', 'b'], predicate=(t['a'] != 2222), internal_row_id=True).read_all()
  column_index = actual.column_names.index('a')
  column_field = actual.field(column_index)
  new_data = pc.divide(actual.column('a'), 10)
  update_table = actual.set_column(column_index, column_field, new_data)

  t.update(update_table.to_batches()[0], columns=['a'])
- actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
+ actual = t.select(columns=['a', 'b']).read_all()
  assert actual.to_pydict() == {
  'a': [111, 2222, 333],
  'b': [0.5, 1.5, 2.5]
@@ -171,7 +170,7 @@ def test_select_with_multisplits(session, clean_bucket_name):
  config.rows_per_split = 1000

  with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
- actual = pa.Table.from_batches(t.select(columns=['a'], config=config))
+ actual = t.select(columns=['a'], config=config).read_all()
  assert actual == expected


@@ -218,7 +217,7 @@ def test_types(session, clean_bucket_name):

  with prepare_data(session, clean_bucket_name, 's', 't', expected) as table:
  def select(predicate):
- return pa.Table.from_batches(table.select(predicate=predicate))
+ return table.select(predicate=predicate).read_all()

  assert select(None) == expected
  for t in [table, ibis._]:
@@ -274,13 +273,20 @@ def test_filters(session, clean_bucket_name):

  with prepare_data(session, clean_bucket_name, 's', 't', expected) as table:
  def select(predicate):
- return pa.Table.from_batches(table.select(predicate=predicate), table.arrow_schema)
+ return table.select(predicate=predicate).read_all()

  assert select(None) == expected
  assert select(True) == expected
  assert select(False) == pa.Table.from_batches([], schema=columns)

  for t in [table, ibis._]:
+
+ select(t['a'].isin(list(range(100))))
+ select(t['a'].isin(list(range(1000))))
+ select(t['a'].isin(list(range(10000))))
+ with pytest.raises(errors.TooLargeRequest):
+ select(t['a'].isin(list(range(100000))))
+
  assert select(t['a'].between(222, 444)) == expected.filter((pc.field('a') >= 222) & (pc.field('a') <= 444))
  assert select((t['a'].between(222, 444)) & (t['b'] > 2.5)) == expected.filter((pc.field('a') >= 222) & (pc.field('a') <= 444) & (pc.field('b') > 2.5))

@@ -351,7 +357,7 @@ def test_parquet_export(session, clean_bucket_name):
  expected = pa.Table.from_batches([rb])
  rb = t.insert(rb)
  assert rb.to_pylist() == [0, 1]
- actual = pa.Table.from_batches(t.select())
+ actual = t.select().read_all()
  assert actual == expected

  table_batches = t.select()
@@ -667,18 +673,37 @@ def test_select_stop(session, clean_bucket_name):
  assert active_threads() == 0


- def test_big_catalog_select(session, clean_bucket_name):
+ def test_catalog_select(session, clean_bucket_name):
  with session.transaction() as tx:
  bc = tx.catalog()
- actual = pa.Table.from_batches(bc.select(['name']))
- assert actual
- log.info("actual=%s", actual)
+ assert bc.columns()
+ rows = bc.select(['name']).read_all()
+ assert len(rows) > 0, rows
+
+
+ class NotReady(Exception):
+ pass


+ @pytest.mark.flaky(retries=30, delay=1, only_on=[NotReady])
  def test_audit_log_select(session, clean_bucket_name):
  with session.transaction() as tx:
  a = tx.audit_log()
- a.columns()
- time.sleep(1)
- actual = pa.Table.from_batches(a.select(), a.arrow_schema)
- log.info("actual=%s", actual)
+ assert a.columns()
+ rows = a.select().read_all()
+ if len(rows) == 0:
+ raise NotReady
+
+
+ @pytest.mark.flaky(retries=30, delay=1, only_on=[NotReady])
+ def test_catalog_snapshots_select(session, clean_bucket_name):
+ with session.transaction() as tx:
+ snaps = tx.catalog_snapshots()
+ if not snaps:
+ raise NotReady
+ latest = snaps[-1]
+ t = tx.catalog(latest)
+ assert t.columns()
+ rows = t.select().read_all()
+ if not rows:
+ raise NotReady
vastdb/transaction.py CHANGED
@@ -8,21 +8,26 @@ A transcation is used as a context manager, since every Database-related operati

  import logging
  from dataclasses import dataclass
- from typing import Optional
+ from typing import TYPE_CHECKING, List, Optional

  import botocore

- from . import bucket, errors, schema, session, table
+ from . import bucket, errors, schema, session
+
+ if TYPE_CHECKING:
+ from bucket import Bucket
+ from table import Table
+

  log = logging.getLogger(__name__)

- TABULAR_BC_BUCKET = "vast-big-catalog-bucket"
+ VAST_CATALOG_BUCKET_NAME = "vast-big-catalog-bucket"
  VAST_CATALOG_SCHEMA_NAME = 'vast_big_catalog_schema'
  VAST_CATALOG_TABLE_NAME = 'vast_big_catalog_table'

- TABULAR_AUDERY_BUCKET = "vast-audit-log-bucket"
- AUDERY_SCHEMA_NAME = 'vast_audit_log_schema'
- AUDERY_TABLE_NAME = 'vast_audit_log_table'
+ AUDIT_LOG_BUCKET_NAME = "vast-audit-log-bucket"
+ AUDIT_LOG_SCHEMA_NAME = 'vast_audit_log_schema'
+ AUDIT_LOG_TABLE_NAME = 'vast_audit_log_table'


  @dataclass
@@ -56,7 +61,7 @@ class Transaction:
  return 'InvalidTransaction'
  return f'Transaction(id=0x{self.txid:016x})'

- def bucket(self, name: str) -> "bucket.Bucket":
+ def bucket(self, name: str) -> "Bucket":
  """Return a VAST Bucket, if exists."""
  try:
  self._rpc.s3.head_bucket(Bucket=name)
@@ -67,14 +72,18 @@ class Transaction:
  raise
  return bucket.Bucket(name, self)

- def catalog(self, fail_if_missing=True) -> Optional["table.Table"]:
+ def catalog_snapshots(self) -> List["Bucket"]:
+ """Return VAST Catalog bucket snapshots."""
+ return bucket.Bucket(VAST_CATALOG_BUCKET_NAME, self).snapshots()
+
+ def catalog(self, snapshot: Optional["Bucket"] = None, fail_if_missing=True) -> Optional["Table"]:
  """Return VAST Catalog table."""
- b = bucket.Bucket(TABULAR_BC_BUCKET, self)
+ b = snapshot or bucket.Bucket(VAST_CATALOG_BUCKET_NAME, self)
  s = schema.Schema(VAST_CATALOG_SCHEMA_NAME, b)
  return s.table(name=VAST_CATALOG_TABLE_NAME, fail_if_missing=fail_if_missing)

- def audit_log(self, fail_if_missing=True) -> Optional["table.Table"]:
- """Return VAST AuditLog table."""
- b = bucket.Bucket(TABULAR_AUDERY_BUCKET, self)
- s = schema.Schema(AUDERY_SCHEMA_NAME, b)
- return s.table(name=AUDERY_TABLE_NAME, fail_if_missing=fail_if_missing)
+ def audit_log(self, fail_if_missing=True) -> Optional["Table"]:
+ """Return VAST Audit Log table."""
+ b = bucket.Bucket(AUDIT_LOG_BUCKET_NAME, self)
+ s = schema.Schema(AUDIT_LOG_SCHEMA_NAME, b)
+ return s.table(name=AUDIT_LOG_TABLE_NAME, fail_if_missing=fail_if_missing)
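
Note: Transaction gains catalog_snapshots() and the ability to open the VAST Catalog table as of a given snapshot. A hedged usage sketch based on the new test_catalog_snapshots_select test, assuming session is an open vastdb session:

    with session.transaction() as tx:
        snaps = tx.catalog_snapshots()     # snapshots of the 'vast-big-catalog-bucket' bucket
        if snaps:
            latest = snaps[-1]
            catalog = tx.catalog(latest)   # catalog table as of that snapshot
            names = catalog.select(['name']).read_all()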
vastdb/util.py CHANGED
@@ -83,6 +83,7 @@ def union_schema_merge(current_schema: pa.Schema, new_schema: pa.Schema) -> pa.S

  MAX_TABULAR_REQUEST_SIZE = 5 << 20 # in bytes
  MAX_RECORD_BATCH_SLICE_SIZE = int(0.9 * MAX_TABULAR_REQUEST_SIZE)
+ MAX_QUERY_DATA_REQUEST_SIZE = int(0.9 * MAX_TABULAR_REQUEST_SIZE)


  def iter_serialized_slices(batch: Union[pa.RecordBatch, pa.Table], max_rows_per_slice=None):
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vastdb
- Version: 0.1.5
+ Version: 0.1.6
  Summary: VAST Data SDK
  Home-page: https://github.com/vast-data/vastdb_sdk
  Author: VAST DATA
@@ -149,29 +149,29 @@ vast_flatbuf/tabular/S3File.py,sha256=KC9c2oS5-JXwTTriUVFdjOvRG0B54Cq9kviSDZY3NI
  vast_flatbuf/tabular/VipRange.py,sha256=_BJd1RRZAcK76T9vlsHzXKYVsPVaz6WTEAqStMQCAUQ,2069
  vast_flatbuf/tabular/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vastdb/__init__.py,sha256=cMJtZuJ0IL9aKyM3DUWqTCzuP1H1MXXVivKKE1-q0DY,292
- vastdb/bucket.py,sha256=4rPEm9qlPTg7ccWO6VGmd4LKb8w-BDhJYwzXGjn03sc,3566
- vastdb/conftest.py,sha256=pKpo_46Vq4QHzTDQAFxasrVhnZ2V2L-y6IMLxojxaFM,2132
- vastdb/errors.py,sha256=fj8IlPnGi1lbJWIl1-8MSjLavL9bYQ-YUoboWbXCo54,4047
- vastdb/internal_commands.py,sha256=kIdkLHabW8r4-GSygGl1Gdrr4puxD79WPO8Jkx8aszg,98490
- vastdb/schema.py,sha256=ql4TPB1W_FQ_BHov3CKHI8JX3krXMlcKWz7dTrjpQ1w,3346
- vastdb/session.py,sha256=UTaz1Fh3u71Bnay2r6IyCHNMDrAszbzjnwylPURzhsk,2603
- vastdb/table.py,sha256=1ikj6toITImFowI2WHiimmqSiObmTfAohCdWC89q71Y,30031
- vastdb/transaction.py,sha256=u4pJBLooZQ_YGjsRgEWVL6RPAlt3lgm5oOpPHzPcayM,2852
- vastdb/util.py,sha256=rs7nLL2Qz-OVEZDSVIqAvS-uETMq-zxQs5jBksB5-JA,4276
+ vastdb/bucket.py,sha256=T0qX8efIJsQvK8Zn1_B-Np6BZqu_i9IuU3aN3JE7kyQ,2536
+ vastdb/conftest.py,sha256=D4RvOhGvMQy-JliKY-uyzcB-_mFBwI6aMF__xwHiwOM,2359
+ vastdb/errors.py,sha256=nC7d05xwe0WxMFyM3cEEqIvA09OXNqcxiUGsKov822I,4098
+ vastdb/internal_commands.py,sha256=r8EjueIaqSkdiqV6Cv7YsCiuuTO7rMCshyGExeBnXVw,97586
+ vastdb/schema.py,sha256=ro4GrVlhJEN4HT8qLdyPtiufVZrNBPGtej-z6Y-v2jg,5642
+ vastdb/session.py,sha256=zuy0wjKKB8d388KgRA77vQwxoraU0tWZacqVVMrU5dU,2984
+ vastdb/table.py,sha256=JZeAqww6dBlhXWW7J-LqC54a5IH2Dl8GTne3hU3wL2I,30646
+ vastdb/transaction.py,sha256=6mCnd43uF9AJyhhUZViD799Mde6AczBi5Cp7LGGDuoM,3178
+ vastdb/util.py,sha256=vt4LWROOFdZieJXLpQMlcnF7YWQFpPqQTVaRbmQ241o,4342
  vastdb/bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- vastdb/bench/test_perf.py,sha256=iHE3E60fvyU5SBDHPi4h03Dj6QcY6VI9l9mMhgNMtPc,1117
+ vastdb/bench/test_perf.py,sha256=yn5gE7t_nzmJHBl9bCs1hxQOgzhvFphuYElsWGko8ts,1084
  vastdb/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vastdb/tests/test_duckdb.py,sha256=KDuv4PrjGEwChCGHG36xNT2JiFlBOt6K3DQ3L06Kq-A,1913
- vastdb/tests/test_imports.py,sha256=48kbJKsa_MrEXcBYQUbUDr1e9wzjG4FHQ7C3wUEQfXA,5705
- vastdb/tests/test_nested.py,sha256=FHYMmaKYvqVh0NvsocUFLr2LDVlSfXZYgqUSopWOSM0,3512
- vastdb/tests/test_projections.py,sha256=_cDNfD5zTwbCXLk6uGpPUWGN0P-4HElu5OjubWu-Jg0,1255
+ vastdb/tests/test_imports.py,sha256=xKub3-bisFjH0BsZM8COfiUWuMrtoOoQKprF6VQT9RI,5669
+ vastdb/tests/test_nested.py,sha256=22NAxBTm7Aq-Vn6AIYbi5Cb1ET8W0XeLK3pp4D8BYWI,3448
+ vastdb/tests/test_projections.py,sha256=11a-55VbJcqaFPkOKaKDEdM5nkeI0xtUhh6cQc1upSA,4223
  vastdb/tests/test_sanity.py,sha256=ixx0QPo73hLHjAa7bByFXjS1XST0WvmSwLEpgnHh_JY,2960
- vastdb/tests/test_schemas.py,sha256=qoHTLX51D-0S4bMxdCpRh9gaYQd-BkZdT_agGOwFwTM,1739
- vastdb/tests/test_tables.py,sha256=Q3N5P-7mOPVcfAFEfpAzomqkyCJ5gKZmfE4SUW5jehk,27859
+ vastdb/tests/test_schemas.py,sha256=l70YQMlx2UL1KRQhApriiG2ZM7GJF-IzWU31H3Yqn1U,3312
+ vastdb/tests/test_tables.py,sha256=V1-WOxCOD8ELJF6Ebj57Jwtum7Z6iYG9JKo89HxC7rM,28342
  vastdb/tests/test_util.py,sha256=owRAU3TCKMq-kz54NRdA5wX2O_bZIHqG5ucUR77jm5k,1046
  vastdb/tests/util.py,sha256=dpRJYbboDnlqL4qIdvScpp8--5fxRUBIcIYitrfcj9o,555
- vastdb-0.1.5.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
- vastdb-0.1.5.dist-info/METADATA,sha256=NJzrnkyfPs4lliFamaEdJy2elLYLzYJtlCxEMRSiLtg,1350
- vastdb-0.1.5.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- vastdb-0.1.5.dist-info/top_level.txt,sha256=Vsj2MKtlhPg0J4so64slQtnwjhgoPmJgcG-6YcVAwVc,20
- vastdb-0.1.5.dist-info/RECORD,,
+ vastdb-0.1.6.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
+ vastdb-0.1.6.dist-info/METADATA,sha256=ibcsckhsDh4iGEN0xjK1_-FUrB24hNbwz9eAouRE6kY,1350
+ vastdb-0.1.6.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ vastdb-0.1.6.dist-info/top_level.txt,sha256=Vsj2MKtlhPg0J4so64slQtnwjhgoPmJgcG-6YcVAwVc,20
+ vastdb-0.1.6.dist-info/RECORD,,