vastdb 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vastdb/__init__.py +6 -2
- vastdb/bench/__init__.py +0 -0
- vastdb/bench/test_perf.py +29 -0
- vastdb/bucket.py +21 -9
- vastdb/{tests/conftest.py → conftest.py} +21 -7
- vastdb/errors.py +32 -9
- vastdb/internal_commands.py +236 -278
- vastdb/schema.py +22 -9
- vastdb/session.py +2 -3
- vastdb/table.py +57 -57
- vastdb/tests/test_duckdb.py +61 -0
- vastdb/tests/test_imports.py +3 -5
- vastdb/tests/test_nested.py +28 -0
- vastdb/tests/test_projections.py +3 -1
- vastdb/tests/test_sanity.py +5 -6
- vastdb/tests/test_schemas.py +20 -1
- vastdb/tests/test_tables.py +108 -76
- vastdb/tests/util.py +15 -0
- vastdb/transaction.py +18 -9
- vastdb/util.py +6 -4
- {vastdb-0.1.0.dist-info → vastdb-0.1.2.dist-info}/METADATA +1 -4
- {vastdb-0.1.0.dist-info → vastdb-0.1.2.dist-info}/RECORD +25 -20
- {vastdb-0.1.0.dist-info → vastdb-0.1.2.dist-info}/WHEEL +1 -1
- {vastdb-0.1.0.dist-info → vastdb-0.1.2.dist-info}/LICENSE +0 -0
- {vastdb-0.1.0.dist-info → vastdb-0.1.2.dist-info}/top_level.txt +0 -0
vastdb/schema.py
CHANGED
@@ -4,12 +4,17 @@ VAST S3 buckets can be used to create Database schemas and tables.
 It is possible to list and access VAST snapshots generated over a bucket.
 """
 
-
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
 
 import pyarrow as pa
 
-from
-
+from . import bucket, errors, schema, table
+
+if TYPE_CHECKING:
+    from .table import Table
+
 
 log = logging.getLogger(__name__)
 
@@ -26,29 +31,37 @@ class Schema:
         """VAST transaction used for this schema."""
         return self.bucket.tx
 
-    def create_table(self, table_name: str, columns: pa.Schema) -> "
+    def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "Table":
         """Create a new table under this schema."""
+        if current := self.table(table_name, fail_if_missing=False):
+            if fail_if_exists:
+                raise errors.TableExists(self.bucket.name, self.name, table_name)
+            else:
+                return current
         self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid)
         log.info("Created table: %s", table_name)
-        return self.table(table_name)
+        return self.table(table_name)  # type: ignore[return-value]
 
-    def table(self, name: str) -> "table.Table":
+    def table(self, name: str, fail_if_missing=True) -> Optional["table.Table"]:
         """Get a specific table under this schema."""
         t = self.tables(table_name=name)
         if not t:
-
+            if fail_if_missing:
+                raise errors.MissingTable(self.bucket.name, self.name, name)
+            else:
+                return None
         assert len(t) == 1, f"Expected to receive only a single table, but got: {len(t)}. tables: {t}"
         log.debug("Found table: %s", t[0])
         return t[0]
 
-    def tables(self, table_name=None) -> ["
+    def tables(self, table_name=None) -> List["Table"]:
         """List all tables under this schema."""
         tables = []
         next_key = 0
         name_prefix = table_name if table_name else ""
         exact_match = bool(table_name)
         while True:
-
+            _bucket_name, _schema_name, curr_tables, next_key, is_truncated, _ = \
                 self.tx._rpc.api.list_tables(
                     bucket=self.bucket.name, schema=self.name, next_key=next_key, txid=self.tx.txid,
                     exact_match=exact_match, name_prefix=name_prefix, include_list_stats=exact_match)
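For orientation, a minimal usage sketch of the create_table/table keyword arguments added above (a sketch only — it assumes an already-constructed `session`, and the bucket/schema/table names are hypothetical):

import pyarrow as pa

with session.transaction() as tx:
    s = tx.bucket("my-bucket").create_schema("s1")
    columns = pa.schema([("a", pa.int32())])
    t = s.create_table("t1", columns)                          # plain create; a second call would raise errors.TableExists
    t = s.create_table("t1", columns, fail_if_exists=False)    # returns the existing table instead of raising
    missing = s.table("no-such-table", fail_if_missing=False)  # returns None instead of raising errors.MissingTable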
vastdb/session.py
CHANGED
@@ -7,12 +7,11 @@ For more details see:
 - [Tabular identity policy with the proper permissions](https://support.vastdata.com/s/article/UUID-14322b60-d6a2-89ac-3df0-3dfbb6974182)
 """
 
-
-from . import transaction
+import os
 
 import boto3
 
-import
+from . import internal_commands, transaction
 
 
 class Session:
vastdb/table.py
CHANGED
@@ -1,19 +1,16 @@
-from . import errors, schema
-from .internal_commands import build_query_data_request, parse_query_data_response, \
-    TABULAR_INVALID_ROW_ID, VastdbApi
-
-import pyarrow as pa
-import ibis
-
 import concurrent.futures
+import logging
+import os
 import queue
-from
+from dataclasses import dataclass, field
 from math import ceil
+from threading import Event
+from typing import Dict, List, Optional, Tuple, Union
 
-
-
-
-import
+import ibis
+import pyarrow as pa
+
+from . import errors, internal_commands, schema
 
 log = logging.getLogger(__name__)
 
@@ -24,18 +21,20 @@ MAX_ROWS_PER_BATCH = 512 * 1024
 # for example insert of 512k uint8 result in 512k*8bytes response since row_ids are uint64
 MAX_INSERT_ROWS_PER_PATCH = 512 * 1024
 
+
 @dataclass
 class TableStats:
     num_rows: int
     size_in_bytes: int
     is_external_rowid_alloc: bool = False
-    endpoints:
+    endpoints: Tuple[str, ...] = ()
+
 
 @dataclass
 class QueryConfig:
     num_sub_splits: int = 4
     num_splits: int = 1
-    data_endpoints: [str] = None
+    data_endpoints: Optional[List[str]] = None
     limit_rows_per_sub_split: int = 128 * 1024
     num_row_groups_per_sub_split: int = 8
     use_semi_sorted_projections: bool = True
@@ -47,15 +46,16 @@ class QueryConfig:
 class ImportConfig:
     import_concurrency: int = 2
 
+
 class SelectSplitState():
-    def __init__(self, query_data_request, table
+    def __init__(self, query_data_request, table: "Table", split_id: int, config: QueryConfig) -> None:
         self.split_id = split_id
         self.subsplits_state = {i: 0 for i in range(config.num_sub_splits)}
         self.config = config
         self.query_data_request = query_data_request
         self.table = table
 
-    def batches(self, api
+    def batches(self, api: internal_commands.VastdbApi):
         while not self.done:
             response = api.query_data(
                 bucket=self.table.bucket.name,
@@ -69,20 +69,21 @@ class SelectSplitState():
                 limit_rows=self.config.limit_rows_per_sub_split,
                 sub_split_start_row_ids=self.subsplits_state.items(),
                 enable_sorted_projections=self.config.use_semi_sorted_projections)
-            pages_iter = parse_query_data_response(
+            pages_iter = internal_commands.parse_query_data_response(
                 conn=response.raw,
                 schema=self.query_data_request.response_schema,
-                start_row_ids=self.subsplits_state
+                start_row_ids=self.subsplits_state,
+                parser=self.query_data_request.response_parser)
 
             for page in pages_iter:
                 for batch in page.to_batches():
                     if len(batch) > 0:
                         yield batch
 
-
     @property
     def done(self):
-        return all(row_id == TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
+        return all(row_id == internal_commands.TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
+
 
 @dataclass
 class Table:
@@ -90,12 +91,10 @@ class Table:
     schema: "schema.Schema"
     handle: int
     stats: TableStats
-    properties: dict = None
     arrow_schema: pa.Schema = field(init=False, compare=False)
     _ibis_table: ibis.Schema = field(init=False, compare=False)
 
     def __post_init__(self):
-        self.properties = self.properties or {}
         self.arrow_schema = self.columns()
 
         table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
@@ -133,13 +132,13 @@ class Table:
         log.debug("Found projection: %s", projs[0])
         return projs[0]
 
-    def projections(self, projection_name=None) -> ["Projection"]:
+    def projections(self, projection_name=None) -> List["Projection"]:
         projections = []
         next_key = 0
         name_prefix = projection_name if projection_name else ""
         exact_match = bool(projection_name)
         while True:
-
+            _bucket_name, _schema_name, _table_name, curr_projections, next_key, is_truncated, _ = \
                 self.tx._rpc.api.list_projections(
                     bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid,
                     exact_match=exact_match, name_prefix=name_prefix)
@@ -150,7 +149,7 @@ class Table:
                 break
         return [_parse_projection_info(projection, self) for projection in projections]
 
-    def import_files(self, files_to_import: [str], config: ImportConfig = None) -> None:
+    def import_files(self, files_to_import: List[str], config: Optional[ImportConfig] = None) -> None:
         source_files = {}
         for f in files_to_import:
             bucket_name, object_path = _parse_bucket_and_object_names(f)
@@ -158,7 +157,7 @@ class Table:
 
         self._execute_import(source_files, config=config)
 
-    def import_partitioned_files(self, files_and_partitions:
+    def import_partitioned_files(self, files_and_partitions: Dict[str, pa.RecordBatch], config: Optional[ImportConfig] = None) -> None:
         source_files = {}
         for f, record_batch in files_and_partitions.items():
             bucket_name, object_path = _parse_bucket_and_object_names(f)
@@ -206,7 +205,7 @@ class Table:
                 max_workers=config.import_concurrency, thread_name_prefix='import_thread') as pool:
             try:
                 for endpoint in endpoints:
-                    session = VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
+                    session = internal_commands.VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
                     futures.append(pool.submit(import_worker, files_queue, session))
 
                 log.debug("Waiting for import workers to finish")
@@ -215,24 +214,30 @@ class Table:
             finally:
                 stop_event.set()
                 # ThreadPoolExecutor will be joined at the end of the context
-
+
+    def get_stats(self) -> TableStats:
         stats_tuple = self.tx._rpc.api.get_table_stats(
             bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid)
-
+        return TableStats(**stats_tuple._asdict())
 
-    def select(self, columns: [str] = None,
+    def select(self, columns: Optional[List[str]] = None,
               predicate: ibis.expr.types.BooleanColumn = None,
-               config: QueryConfig = None,
+               config: Optional[QueryConfig] = None,
               *,
               internal_row_id: bool = False) -> pa.RecordBatchReader:
         if config is None:
             config = QueryConfig()
 
-
+        # Take a snapshot of enpoints
+        stats = self.get_stats()
+        endpoints = stats.endpoints if config.data_endpoints is None else config.data_endpoints
+
+        if stats.num_rows > config.rows_per_split and config.num_splits is None:
+            config.num_splits = stats.num_rows // config.rows_per_split
+            log.debug(f"num_rows={stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")
 
-        if
-
-            log.debug(f"num_rows={self.stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")
+        if columns is None:
+            columns = [f.name for f in self.arrow_schema]
 
         query_schema = self.arrow_schema
         if internal_row_id:
@@ -241,12 +246,12 @@ class Table:
             query_schema = pa.schema(queried_fields)
             columns.append(INTERNAL_ROW_ID)
 
-        query_data_request = build_query_data_request(
+        query_data_request = internal_commands.build_query_data_request(
             schema=query_schema,
             predicate=predicate,
             field_names=columns)
 
-        splits_queue = queue.Queue()
+        splits_queue: queue.Queue[int] = queue.Queue()
 
         for split in range(config.num_splits):
             splits_queue.put(split)
@@ -254,8 +259,10 @@ class Table:
         # this queue shouldn't be large it is marely a pipe through which the results
         # are sent to the main thread. Most of the pages actually held in the
         # threads that fetch the pages.
-        record_batches_queue = queue.Queue(maxsize=2)
+        record_batches_queue: queue.Queue[pa.RecordBatch] = queue.Queue(maxsize=2)
+
         stop_event = Event()
+
 
         class StoppedException(Exception):
             pass
@@ -263,9 +270,9 @@ class Table:
             if stop_event.is_set():
                 raise StoppedException
 
-        def single_endpoint_worker(endpoint
+        def single_endpoint_worker(endpoint: str):
             try:
-                host_api = VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
+                host_api = internal_commands.VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
                 while True:
                     check_stop()
                     try:
@@ -290,12 +297,11 @@ class Table:
                 log.debug("exiting")
                 record_batches_queue.put(None)
 
-        # Take a snapshot of enpoints
-        endpoints = list(self.stats.endpoints) if config.data_endpoints is None else list(config.data_endpoints)
-
         def batches_iterator():
-            def propagate_first_exception(futures
+            def propagate_first_exception(futures: List[concurrent.futures.Future], block=False):
                 done, not_done = concurrent.futures.wait(futures, None if block else 0, concurrent.futures.FIRST_EXCEPTION)
+                if self.tx.txid is None:
+                    raise errors.MissingTransaction()
                 for future in done:
                     future.result()
                 return not_done
@@ -305,7 +311,7 @@ class Table:
             if config.query_id:
                 threads_prefix = threads_prefix + "-" + config.query_id
 
-            with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp:
+            with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp:  # TODO: concurrency == enpoints is just a heuristic
                 futures = [tp.submit(single_endpoint_worker, endpoint) for endpoint in endpoints]
                 tasks_running = len(futures)
                 try:
@@ -327,7 +333,7 @@ class Table:
                         if record_batches_queue.get() is None:
                             tasks_running -= 1
 
-        return pa.RecordBatchReader.from_batches(query_data_request.response_schema
+        return pa.RecordBatchReader.from_batches(query_data_request.response_schema, batches_iterator())
 
     def _combine_chunks(self, col):
         if hasattr(col, "combine_chunks"):
@@ -337,16 +343,11 @@ class Table:
 
     def insert(self, rows: pa.RecordBatch) -> pa.RecordBatch:
         serialized_slices = self.tx._rpc.api._record_batch_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
-        row_ids = []
         for slice in serialized_slices:
-
+            self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
                                          txid=self.tx.txid)
-            (batch,) = pa.RecordBatchStreamReader(res.raw)
-            row_ids.append(batch[INTERNAL_ROW_ID])
-
-        return pa.chunked_array(row_ids)
 
-    def update(self, rows: Union[pa.RecordBatch, pa.Table], columns:
+    def update(self, rows: Union[pa.RecordBatch, pa.Table], columns: Optional[List[str]] = None) -> None:
         if columns is not None:
             update_fields = [(INTERNAL_ROW_ID, pa.uint64())]
             update_values = [self._combine_chunks(rows[INTERNAL_ROW_ID])]
@@ -414,7 +415,6 @@ class Projection:
     table: Table
     handle: int
     stats: TableStats
-    properties: dict = None
 
     @property
     def bucket(self):
@@ -435,7 +435,7 @@ class Projection:
         columns = []
         next_key = 0
         while True:
-            curr_columns, next_key, is_truncated,
+            curr_columns, next_key, is_truncated, _count, _ = \
                 self.tx._rpc.api.list_projection_columns(
                     self.bucket.name, self.schema.name, self.table.name, self.name, txid=self.table.tx.txid, next_key=next_key)
             if not curr_columns:
@@ -464,9 +464,9 @@ def _parse_projection_info(projection_info, table: "Table"):
     return Projection(name=projection_info.name, table=table, stats=stats, handle=int(projection_info.handle))
 
 
-def _parse_bucket_and_object_names(path: str) ->
+def _parse_bucket_and_object_names(path: str) -> Tuple[str, str]:
     if not path.startswith('/'):
-        raise errors.
+        raise errors.InvalidArgument(f"Path {path} must start with a '/'")
     components = path.split(os.path.sep)
     bucket_name = components[1]
     object_path = os.path.sep.join(components[2:])
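A short usage sketch of the reworked select() path shown above (a sketch only — it assumes an existing `session` and an already-populated table; the bucket/schema/table and column names are hypothetical). In 0.1.2 the reader snapshots the table's endpoints via get_stats() unless QueryConfig.data_endpoints overrides them, and consuming the results after the transaction has closed raises errors.MissingTransaction:

from vastdb.table import QueryConfig

config = QueryConfig(num_sub_splits=2, limit_rows_per_sub_split=128 * 1024)
with session.transaction() as tx:
    t = tx.bucket("my-bucket").schema("s1").table("t1")
    reader = t.select(columns=["a", "b"], predicate=(t["b"] < 2), config=config)
    result = reader.read_all()  # drain the pa.RecordBatchReader while the transaction is still open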
vastdb/tests/test_duckdb.py
ADDED
@@ -0,0 +1,61 @@
+import logging
+
+import duckdb
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+
+from ..table import QueryConfig
+from .util import prepare_data
+
+log = logging.getLogger(__name__)
+
+
+def test_duckdb(session, clean_bucket_name):
+    columns = pa.schema([
+        ('a', pa.int32()),
+        ('b', pa.float64()),
+    ])
+    data = pa.table(schema=columns, data=[
+        [111, 222, 333],
+        [0.5, 1.5, 2.5],
+    ])
+    with prepare_data(session, clean_bucket_name, 's', 't', data) as t:
+        conn = duckdb.connect()
+        batches = t.select(columns=['a'], predicate=(t['b'] < 2))  # noqa: F841
+        actual = conn.execute('SELECT max(a) as "a_max" FROM batches').arrow()
+        expected = (data
+                    .filter(pc.field('b') < 2)
+                    .group_by([])
+                    .aggregate([('a', 'max')]))
+        assert actual == expected
+
+
+def test_closed_tx(session, clean_bucket_name):
+    columns = pa.schema([
+        ('a', pa.int64()),
+    ])
+    data = pa.table(schema=columns, data=[
+        list(range(10000)),
+    ])
+
+    with session.transaction() as tx:
+        t = tx.bucket(clean_bucket_name).create_schema("s1").create_table("t1", columns)
+        t.insert(data)
+
+        config = QueryConfig(
+            num_sub_splits=1,
+            num_splits=1,
+            num_row_groups_per_sub_split=1,
+            limit_rows_per_sub_split=100)
+        batches = t.select(config=config)  # noqa: F841
+        first = next(batches)  # make sure that HTTP response processing has started
+        assert first['a'].to_pylist() == list(range(100))
+
+        conn = duckdb.connect()
+        res = conn.execute('SELECT a FROM batches')
+        log.debug("closing tx=%s after first batch=%s", t.tx, first)
+
+    # transaction is closed, collecting the result should fail
+    with pytest.raises(duckdb.InvalidInputException, match="Detail: Python exception: MissingTransaction"):
+        res.arrow()
vastdb/tests/test_imports.py
CHANGED
@@ -1,14 +1,12 @@
-import pytest
-
-from tempfile import NamedTemporaryFile
 import logging
+from tempfile import NamedTemporaryFile
 
 import pyarrow as pa
 import pyarrow.parquet as pq
+import pytest
 
-from vastdb.errors import InvalidArgument, ImportFilesError
 from vastdb import util
-
+from vastdb.errors import ImportFilesError, InvalidArgument
 
 log = logging.getLogger(__name__)
 
vastdb/tests/test_nested.py
ADDED
@@ -0,0 +1,28 @@
+import itertools
+
+import pyarrow as pa
+
+from .util import prepare_data
+
+
+def test_nested(session, clean_bucket_name):
+    columns = pa.schema([
+        ('l', pa.list_(pa.int8())),
+        ('m', pa.map_(pa.utf8(), pa.float64())),
+        ('s', pa.struct([('x', pa.int16()), ('y', pa.int32())])),
+    ])
+    expected = pa.table(schema=columns, data=[
+        [[1], [], [2, 3], None],
+        [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
+        [{'x': 1, 'y': None}, None, {'x': 2, 'y': 3}, {'x': None, 'y': 4}],
+    ])
+
+    with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+        actual = pa.Table.from_batches(t.select())
+        assert actual == expected
+
+        names = [f.name for f in columns]
+        for n in range(len(names) + 1):
+            for cols in itertools.permutations(names, n):
+                actual = pa.Table.from_batches(t.select(columns=cols))
+                assert actual == expected.select(cols)
vastdb/tests/test_projections.py
CHANGED
vastdb/tests/test_sanity.py
CHANGED
@@ -1,15 +1,14 @@
-
-from itertools import cycle
+import contextlib
 import logging
 import threading
-import
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from itertools import cycle
 
 import pytest
 import requests
 
 import vastdb
 
-
 log = logging.getLogger(__name__)
 
 
@@ -58,10 +57,10 @@ def test_version_extraction():
             return f"vast {version}" if version else "vast"
 
         def log_message(self, format, *args):
-            log.debug(format
+            log.debug(format, *args)
 
     # start the server on localhost on some available port port
-    server_address =('localhost', 0)
+    server_address = ('localhost', 0)
     httpd = HTTPServer(server_address, MockOptionsHandler)
 
     def start_http_server_in_thread():
vastdb/tests/test_schemas.py
CHANGED
@@ -1,5 +1,7 @@
 import pytest
 
+from .. import errors
+
 
 def test_schemas(session, clean_bucket_name):
     with session.transaction() as tx:
@@ -19,6 +21,22 @@ def test_schemas(session, clean_bucket_name):
         assert b.schemas() == []
 
 
+def test_exists(session, clean_bucket_name):
+    with session.transaction() as tx:
+        b = tx.bucket(clean_bucket_name)
+        assert b.schemas() == []
+
+        s = b.create_schema('s1')
+
+        assert b.schemas() == [s]
+        with pytest.raises(errors.SchemaExists):
+            b.create_schema('s1')
+
+        assert b.schemas() == [s]
+        assert b.create_schema('s1', fail_if_exists=False) == s
+        assert b.schemas() == [s]
+
+
 def test_commits_and_rollbacks(session, clean_bucket_name):
     with session.transaction() as tx:
         b = tx.bucket(clean_bucket_name)
@@ -32,12 +50,13 @@ def test_commits_and_rollbacks(session, clean_bucket_name):
            b = tx.bucket(clean_bucket_name)
            b.schema("s3").drop()
            assert b.schemas() == []
-            1/0  # rollback schema dropping
+            1 / 0  # rollback schema dropping
 
     with session.transaction() as tx:
         b = tx.bucket(clean_bucket_name)
         assert b.schemas() != []
 
+
 def test_list_snapshots(session, clean_bucket_name):
     with session.transaction() as tx:
         b = tx.bucket(clean_bucket_name)