vastdb 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
vastdb/schema.py CHANGED
@@ -6,11 +6,16 @@ It is possible to list and access VAST snapshots generated over a bucket.
 
 import logging
 from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
 
 import pyarrow as pa
 
 from . import bucket, errors, schema, table
 
+if TYPE_CHECKING:
+    from .table import Table
+
+
 log = logging.getLogger(__name__)
 
 
@@ -26,7 +31,7 @@ class Schema:
         """VAST transaction used for this schema."""
         return self.bucket.tx
 
-    def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "table.Table":
+    def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "Table":
         """Create a new table under this schema."""
         if current := self.table(table_name, fail_if_missing=False):
             if fail_if_exists:
@@ -35,9 +40,9 @@ class Schema:
            return current
         self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid)
         log.info("Created table: %s", table_name)
-        return self.table(table_name)
+        return self.table(table_name)  # type: ignore[return-value]
 
-    def table(self, name: str, fail_if_missing=True) -> "table.Table":
+    def table(self, name: str, fail_if_missing=True) -> Optional["table.Table"]:
         """Get a specific table under this schema."""
         t = self.tables(table_name=name)
         if not t:
@@ -49,14 +54,14 @@ class Schema:
         log.debug("Found table: %s", t[0])
         return t[0]
 
-    def tables(self, table_name=None) -> ["table.Table"]:
+    def tables(self, table_name=None) -> List["Table"]:
        """List all tables under this schema."""
        tables = []
        next_key = 0
        name_prefix = table_name if table_name else ""
        exact_match = bool(table_name)
        while True:
-            bucket_name, schema_name, curr_tables, next_key, is_truncated, _ = \
+            _bucket_name, _schema_name, curr_tables, next_key, is_truncated, _ = \
                self.tx._rpc.api.list_tables(
                    bucket=self.bucket.name, schema=self.name, next_key=next_key, txid=self.tx.txid,
                    exact_match=exact_match, name_prefix=name_prefix, include_list_stats=exact_match)
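
With these changes, Schema.table() is annotated as returning Optional["table.Table"], so callers passing fail_if_missing=False should handle a None result explicitly. The following is a minimal sketch of that calling pattern under the 0.1.2 annotations; the session object and the bucket, schema, and table names are illustrative placeholders, not taken from this diff:

    import pyarrow as pa

    columns = pa.schema([('a', pa.int32())])

    # `session` is assumed to be an already-constructed vastdb session (placeholder).
    with session.transaction() as tx:
        schema = tx.bucket("my-bucket").create_schema("s1")
        # With fail_if_missing=False, table() may return None instead of raising.
        table = schema.table("t1", fail_if_missing=False)
        if table is None:
            table = schema.create_table("t1", columns)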
vastdb/table.py CHANGED
@@ -5,18 +5,12 @@ import queue
 from dataclasses import dataclass, field
 from math import ceil
 from threading import Event
-from typing import List, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import ibis
 import pyarrow as pa
 
-from . import errors, schema
-from .internal_commands import (
-    TABULAR_INVALID_ROW_ID,
-    VastdbApi,
-    build_query_data_request,
-    parse_query_data_response,
-)
+from . import errors, internal_commands, schema
 
 log = logging.getLogger(__name__)
 
@@ -27,18 +21,20 @@ MAX_ROWS_PER_BATCH = 512 * 1024
 # for example insert of 512k uint8 result in 512k*8bytes response since row_ids are uint64
 MAX_INSERT_ROWS_PER_PATCH = 512 * 1024
 
+
 @dataclass
 class TableStats:
     num_rows: int
     size_in_bytes: int
     is_external_rowid_alloc: bool = False
-    endpoints: List[str] = None
+    endpoints: Tuple[str, ...] = ()
+
 
 @dataclass
 class QueryConfig:
     num_sub_splits: int = 4
     num_splits: int = 1
-    data_endpoints: [str] = None
+    data_endpoints: Optional[List[str]] = None
     limit_rows_per_sub_split: int = 128 * 1024
     num_row_groups_per_sub_split: int = 8
     use_semi_sorted_projections: bool = True
@@ -50,15 +46,16 @@ class QueryConfig:
 class ImportConfig:
     import_concurrency: int = 2
 
+
 class SelectSplitState():
-    def __init__(self, query_data_request, table : "Table", split_id : int, config: QueryConfig) -> None:
+    def __init__(self, query_data_request, table: "Table", split_id: int, config: QueryConfig) -> None:
         self.split_id = split_id
         self.subsplits_state = {i: 0 for i in range(config.num_sub_splits)}
         self.config = config
         self.query_data_request = query_data_request
         self.table = table
 
-    def batches(self, api : VastdbApi):
+    def batches(self, api: internal_commands.VastdbApi):
         while not self.done:
             response = api.query_data(
                 bucket=self.table.bucket.name,
@@ -72,20 +69,21 @@ class SelectSplitState():
                 limit_rows=self.config.limit_rows_per_sub_split,
                 sub_split_start_row_ids=self.subsplits_state.items(),
                 enable_sorted_projections=self.config.use_semi_sorted_projections)
-            pages_iter = parse_query_data_response(
+            pages_iter = internal_commands.parse_query_data_response(
                 conn=response.raw,
                 schema=self.query_data_request.response_schema,
-                start_row_ids=self.subsplits_state)
+                start_row_ids=self.subsplits_state,
+                parser=self.query_data_request.response_parser)
 
             for page in pages_iter:
                 for batch in page.to_batches():
                     if len(batch) > 0:
                         yield batch
 
-
     @property
     def done(self):
-        return all(row_id == TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
+        return all(row_id == internal_commands.TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
+
 
 @dataclass
 class Table:
@@ -93,12 +91,10 @@ class Table:
     schema: "schema.Schema"
     handle: int
     stats: TableStats
-    properties: dict = None
     arrow_schema: pa.Schema = field(init=False, compare=False)
     _ibis_table: ibis.Schema = field(init=False, compare=False)
 
     def __post_init__(self):
-        self.properties = self.properties or {}
         self.arrow_schema = self.columns()
 
         table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
@@ -136,13 +132,13 @@ class Table:
         log.debug("Found projection: %s", projs[0])
         return projs[0]
 
-    def projections(self, projection_name=None) -> ["Projection"]:
+    def projections(self, projection_name=None) -> List["Projection"]:
         projections = []
         next_key = 0
         name_prefix = projection_name if projection_name else ""
         exact_match = bool(projection_name)
         while True:
-            bucket_name, schema_name, table_name, curr_projections, next_key, is_truncated, _ = \
+            _bucket_name, _schema_name, _table_name, curr_projections, next_key, is_truncated, _ = \
                self.tx._rpc.api.list_projections(
                    bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid,
                    exact_match=exact_match, name_prefix=name_prefix)
@@ -153,7 +149,7 @@
                 break
         return [_parse_projection_info(projection, self) for projection in projections]
 
-    def import_files(self, files_to_import: [str], config: ImportConfig = None) -> None:
+    def import_files(self, files_to_import: List[str], config: Optional[ImportConfig] = None) -> None:
         source_files = {}
         for f in files_to_import:
             bucket_name, object_path = _parse_bucket_and_object_names(f)
@@ -161,7 +157,7 @@
 
         self._execute_import(source_files, config=config)
 
-    def import_partitioned_files(self, files_and_partitions: {str: pa.RecordBatch}, config: ImportConfig = None) -> None:
+    def import_partitioned_files(self, files_and_partitions: Dict[str, pa.RecordBatch], config: Optional[ImportConfig] = None) -> None:
         source_files = {}
         for f, record_batch in files_and_partitions.items():
             bucket_name, object_path = _parse_bucket_and_object_names(f)
@@ -209,7 +205,7 @@
                 max_workers=config.import_concurrency, thread_name_prefix='import_thread') as pool:
             try:
                 for endpoint in endpoints:
-                    session = VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
+                    session = internal_commands.VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
                     futures.append(pool.submit(import_worker, files_queue, session))
 
                 log.debug("Waiting for import workers to finish")
@@ -218,24 +214,30 @@
             finally:
                 stop_event.set()
                 # ThreadPoolExecutor will be joined at the end of the context
-    def refresh_stats(self):
+
+    def get_stats(self) -> TableStats:
         stats_tuple = self.tx._rpc.api.get_table_stats(
             bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid)
-        self.stats = TableStats(**stats_tuple._asdict())
+        return TableStats(**stats_tuple._asdict())
 
-    def select(self, columns: [str] = None,
+    def select(self, columns: Optional[List[str]] = None,
                predicate: ibis.expr.types.BooleanColumn = None,
-               config: QueryConfig = None,
+               config: Optional[QueryConfig] = None,
                *,
               internal_row_id: bool = False) -> pa.RecordBatchReader:
        if config is None:
            config = QueryConfig()
 
-        self.refresh_stats()
+        # Take a snapshot of enpoints
+        stats = self.get_stats()
+        endpoints = stats.endpoints if config.data_endpoints is None else config.data_endpoints
 
-        if self.stats.num_rows > config.rows_per_split and config.num_splits is None:
-            config.num_splits = self.stats.num_rows // config.rows_per_split
-            log.debug(f"num_rows={self.stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")
+        if stats.num_rows > config.rows_per_split and config.num_splits is None:
+            config.num_splits = stats.num_rows // config.rows_per_split
+            log.debug(f"num_rows={stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")
+
+        if columns is None:
+            columns = [f.name for f in self.arrow_schema]
 
         query_schema = self.arrow_schema
         if internal_row_id:
@@ -244,12 +246,12 @@
             query_schema = pa.schema(queried_fields)
             columns.append(INTERNAL_ROW_ID)
 
-        query_data_request = build_query_data_request(
+        query_data_request = internal_commands.build_query_data_request(
             schema=query_schema,
             predicate=predicate,
             field_names=columns)
 
-        splits_queue = queue.Queue()
+        splits_queue: queue.Queue[int] = queue.Queue()
 
        for split in range(config.num_splits):
            splits_queue.put(split)
@@ -257,8 +259,10 @@
         # this queue shouldn't be large it is marely a pipe through which the results
         # are sent to the main thread. Most of the pages actually held in the
         # threads that fetch the pages.
-        record_batches_queue = queue.Queue(maxsize=2)
+        record_batches_queue: queue.Queue[pa.RecordBatch] = queue.Queue(maxsize=2)
+
         stop_event = Event()
+
         class StoppedException(Exception):
             pass
 
@@ -266,9 +270,9 @@
             if stop_event.is_set():
                 raise StoppedException
 
-        def single_endpoint_worker(endpoint : str):
+        def single_endpoint_worker(endpoint: str):
             try:
-                host_api = VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
+                host_api = internal_commands.VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
                 while True:
                     check_stop()
                     try:
@@ -293,12 +297,11 @@
                 log.debug("exiting")
                 record_batches_queue.put(None)
 
-        # Take a snapshot of enpoints
-        endpoints = list(self.stats.endpoints) if config.data_endpoints is None else list(config.data_endpoints)
-
         def batches_iterator():
-            def propagate_first_exception(futures : List[concurrent.futures.Future], block = False):
+            def propagate_first_exception(futures: List[concurrent.futures.Future], block=False):
                 done, not_done = concurrent.futures.wait(futures, None if block else 0, concurrent.futures.FIRST_EXCEPTION)
+                if self.tx.txid is None:
+                    raise errors.MissingTransaction()
                 for future in done:
                     future.result()
                 return not_done
@@ -308,7 +311,7 @@
             if config.query_id:
                 threads_prefix = threads_prefix + "-" + config.query_id
 
-            with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp: # TODO: concurrency == enpoints is just a heuristic
+            with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp:  # TODO: concurrency == enpoints is just a heuristic
                 futures = [tp.submit(single_endpoint_worker, endpoint) for endpoint in endpoints]
                 tasks_running = len(futures)
                 try:
@@ -340,16 +343,11 @@
 
     def insert(self, rows: pa.RecordBatch) -> pa.RecordBatch:
         serialized_slices = self.tx._rpc.api._record_batch_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
-        row_ids = []
        for slice in serialized_slices:
-            res = self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
+            self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
                                               txid=self.tx.txid)
-            (batch,) = pa.RecordBatchStreamReader(res.raw)
-            row_ids.append(batch[INTERNAL_ROW_ID])
-
-        return pa.chunked_array(row_ids)
 
-    def update(self, rows: Union[pa.RecordBatch, pa.Table], columns: list = None) -> None:
+    def update(self, rows: Union[pa.RecordBatch, pa.Table], columns: Optional[List[str]] = None) -> None:
         if columns is not None:
             update_fields = [(INTERNAL_ROW_ID, pa.uint64())]
             update_values = [self._combine_chunks(rows[INTERNAL_ROW_ID])]
@@ -417,7 +415,6 @@ class Projection:
     table: Table
     handle: int
     stats: TableStats
-    properties: dict = None
 
     @property
     def bucket(self):
@@ -438,7 +435,7 @@
         columns = []
         next_key = 0
         while True:
-            curr_columns, next_key, is_truncated, count, _ = \
+            curr_columns, next_key, is_truncated, _count, _ = \
                 self.tx._rpc.api.list_projection_columns(
                     self.bucket.name, self.schema.name, self.table.name, self.name, txid=self.table.tx.txid, next_key=next_key)
             if not curr_columns:
@@ -467,9 +464,9 @@ def _parse_projection_info(projection_info, table: "Table"):
     return Projection(name=projection_info.name, table=table, stats=stats, handle=int(projection_info.handle))
 
 
-def _parse_bucket_and_object_names(path: str) -> (str, str):
+def _parse_bucket_and_object_names(path: str) -> Tuple[str, str]:
     if not path.startswith('/'):
-        raise errors.InvalidArgumentError(f"Path {path} must start with a '/'")
+        raise errors.InvalidArgument(f"Path {path} must start with a '/'")
     components = path.split(os.path.sep)
     bucket_name = components[1]
     object_path = os.path.sep.join(components[2:])
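
Two behavioral changes in table.py are worth noting: refresh_stats(), which mutated self.stats, is replaced by get_stats(), which returns a fresh TableStats snapshot; and insert() no longer builds and returns a chunked array of row IDs. A minimal usage sketch under the 0.1.2 semantics follows; it assumes an open vastdb transaction named tx, and the bucket/schema/table names are placeholders rather than anything defined in this diff:

    import pyarrow as pa

    # `tx` is assumed to be an open vastdb transaction (placeholder).
    table = tx.bucket("my-bucket").schema("s1").table("t1")

    # get_stats() returns a TableStats snapshot instead of updating table.stats in place.
    stats = table.get_stats()
    print(stats.num_rows, stats.size_in_bytes, stats.endpoints)

    # insert() now just writes the rows; it no longer returns the inserted row IDs.
    table.insert(pa.RecordBatch.from_pydict({'a': [1, 2, 3]}))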
@@ -0,0 +1,61 @@
+import logging
+
+import duckdb
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+
+from ..table import QueryConfig
+from .util import prepare_data
+
+log = logging.getLogger(__name__)
+
+
+def test_duckdb(session, clean_bucket_name):
+    columns = pa.schema([
+        ('a', pa.int32()),
+        ('b', pa.float64()),
+    ])
+    data = pa.table(schema=columns, data=[
+        [111, 222, 333],
+        [0.5, 1.5, 2.5],
+    ])
+    with prepare_data(session, clean_bucket_name, 's', 't', data) as t:
+        conn = duckdb.connect()
+        batches = t.select(columns=['a'], predicate=(t['b'] < 2))  # noqa: F841
+        actual = conn.execute('SELECT max(a) as "a_max" FROM batches').arrow()
+        expected = (data
+                    .filter(pc.field('b') < 2)
+                    .group_by([])
+                    .aggregate([('a', 'max')]))
+        assert actual == expected
+
+
+def test_closed_tx(session, clean_bucket_name):
+    columns = pa.schema([
+        ('a', pa.int64()),
+    ])
+    data = pa.table(schema=columns, data=[
+        list(range(10000)),
+    ])
+
+    with session.transaction() as tx:
+        t = tx.bucket(clean_bucket_name).create_schema("s1").create_table("t1", columns)
+        t.insert(data)
+
+        config = QueryConfig(
+            num_sub_splits=1,
+            num_splits=1,
+            num_row_groups_per_sub_split=1,
+            limit_rows_per_sub_split=100)
+        batches = t.select(config=config)  # noqa: F841
+        first = next(batches)  # make sure that HTTP response processing has started
+        assert first['a'].to_pylist() == list(range(100))
+
+        conn = duckdb.connect()
+        res = conn.execute('SELECT a FROM batches')
+        log.debug("closing tx=%s after first batch=%s", t.tx, first)
+
+    # transaction is closed, collecting the result should fail
+    with pytest.raises(duckdb.InvalidInputException, match="Detail: Python exception: MissingTransaction"):
+        res.arrow()
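
The new DuckDB tests above rely on DuckDB's replacement scans: an Arrow object that is reachable as a local Python variable (a pyarrow Table or RecordBatchReader, such as the batches object the tests obtain from Table.select()) can be referenced by that variable's name directly in SQL. A standalone sketch of the pattern, independent of vastdb:

    import duckdb
    import pyarrow as pa

    # A RecordBatchReader standing in for the reader returned by Table.select().
    data = pa.table({'a': [111, 222, 333], 'b': [0.5, 1.5, 2.5]})
    batches = pa.RecordBatchReader.from_batches(data.schema, data.to_batches())

    conn = duckdb.connect()
    # DuckDB resolves `batches` from the enclosing Python scope (replacement scan).
    # Note that a RecordBatchReader can be consumed only once.
    print(conn.execute('SELECT max(a) AS a_max FROM batches').arrow())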
@@ -4,6 +4,7 @@ import pyarrow as pa
 
 log = logging.getLogger(__name__)
 
+
 def test_basic_projections(session, clean_bucket_name):
     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).create_schema('s1')
@@ -57,10 +57,10 @@ def test_version_extraction():
            return f"vast {version}" if version else "vast"
 
        def log_message(self, format, *args):
-            log.debug(format,*args)
+            log.debug(format, *args)
 
    # start the server on localhost on some available port port
-    server_address =('localhost', 0)
+    server_address = ('localhost', 0)
    httpd = HTTPServer(server_address, MockOptionsHandler)
 
    def start_http_server_in_thread():
@@ -50,12 +50,13 @@ def test_commits_and_rollbacks(session, clean_bucket_name):
         b = tx.bucket(clean_bucket_name)
         b.schema("s3").drop()
         assert b.schemas() == []
-        1/0  # rollback schema dropping
+        1 / 0  # rollback schema dropping
 
     with session.transaction() as tx:
         b = tx.bucket(clean_bucket_name)
         assert b.schemas() != []
 
+
 def test_list_snapshots(session, clean_bucket_name):
     with session.transaction() as tx:
         b = tx.bucket(clean_bucket_name)