vastdb 2.0.1__py3-none-any.whl → 2.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vastdb/table.py CHANGED
@@ -22,11 +22,12 @@ import ibis
  import pyarrow as pa
  import urllib3

- from vastdb._table_interface import ITable
+ from vastdb._table_interface import IbisPredicate, ITable
  from vastdb.table_metadata import TableMetadata, TableRef, TableStats, TableType

  from . import _internal, errors, util
  from ._ibis_support import validate_ibis_support_schema
+ from ._internal import VectorIndex
  from .config import ImportConfig, QueryConfig

  if TYPE_CHECKING:
@@ -213,6 +214,11 @@ class TableInTransaction(ITable):
          """Reload Sorted Columns."""
          self._metadata.load_sorted_columns(self._tx)

+     @property
+     def vector_index(self) -> Optional[VectorIndex]:
+         """Table's Vector Index if exists."""
+         return self._metadata._vector_index
+
      @property
      def path(self) -> str:
          """Return table's path."""
@@ -222,7 +228,7 @@ class TableInTransaction(ITable):
      def _internal_rowid_field(self) -> pa.Field:
          return INTERNAL_ROW_ID_SORTED_FIELD if self._is_sorted_table else INTERNAL_ROW_ID_FIELD

-     def sorted_columns(self) -> list[str]:
+     def sorted_columns(self) -> list[pa.Field]:
          """Return sorted columns' metadata."""
          return self._metadata.sorted_columns

@@ -231,11 +237,11 @@ class TableInTransaction(ITable):
              raise errors.NotSupportedCommand(
                  self.ref.bucket, self.ref.schema, self.ref.table)

-     def projection(self, name: str) -> "Projection":
+     def projection(self, name: str, include_stats: bool = True) -> "Projection":
          """Get a specific semi-sorted projection of this table."""
          self._assert_not_imports_table()

-         projs = tuple(self.projections(projection_name=name))
+         projs = tuple(self.projections(projection_name=name, include_stats=include_stats))
          if not projs:
              raise errors.MissingProjection(
                  self.ref.bucket, self.ref.schema, self.ref.table, name)
@@ -245,7 +251,7 @@ class TableInTransaction(ITable):
          log.debug("Found projection: %s", projs[0])
          return projs[0]

-     def projections(self, projection_name: str = "") -> Iterable["Projection"]:
+     def projections(self, projection_name: str = "", include_stats: bool = True) -> Iterable["Projection"]:
          """List all semi-sorted projections of this table if `projection_name` is empty.

          Otherwise, list only the specific projection (if exists).
@@ -260,7 +266,7 @@ class TableInTransaction(ITable):
              _bucket_name, _schema_name, _table_name, curr_projections, next_key, is_truncated, _ = \
                  self._tx._rpc.api.list_projections(
                      bucket=self.ref.bucket, schema=self.ref.schema, table=self.ref.table, next_key=next_key, txid=self._tx.active_txid,
-                     exact_match=exact_match, name_prefix=name_prefix)
+                     exact_match=exact_match, name_prefix=name_prefix, include_list_stats=include_stats)
              if not curr_projections:
                  break
              projections.extend(curr_projections)
@@ -631,35 +637,41 @@ class TableInTransaction(ITable):
          return pa.RecordBatchReader.from_batches(query_data_request.response_schema, batches_iterator())

      def insert_in_column_batches(self, rows: pa.RecordBatch) -> pa.ChunkedArray:
-         """Split the RecordBatch into max_columns that can be inserted in single RPC.
+         """Split the RecordBatch into an insert + updates.

+         This is both to support rows that won't fit into an RPC and for performance for wide rows.
          Insert first MAX_COLUMN_IN_BATCH columns and get the row_ids. Then loop on the rest of the columns and
          update in groups of MAX_COLUMN_IN_BATCH.
          """
-         column_record_batch = pa.RecordBatch.from_arrays([_combine_chunks(rows.column(i)) for i in range(0, MAX_COLUMN_IN_BATCH)],
-                                                          schema=pa.schema([rows.schema.field(i) for i in range(0, MAX_COLUMN_IN_BATCH)]))
-         row_ids = self.insert(rows=column_record_batch)  # type: ignore
-
          columns_names = [field.name for field in rows.schema]
-         columns = list(rows.schema)
-         arrays = [_combine_chunks(rows.column(i))
-                   for i in range(len(rows.schema))]
-         for start in range(MAX_COLUMN_IN_BATCH, len(rows.schema), MAX_COLUMN_IN_BATCH):
+         # Sorted columns must be in the first insert as those can't be updated later.
+         if self._is_sorted_table:
+             sorted_columns_names = [field.name for field in self.sorted_columns()]
+             columns_names = sorted_columns_names + [column_name for column_name in columns_names if column_name not in sorted_columns_names]
+         columns = [rows.schema.field(column_name) for column_name in columns_names]
+
+         arrays = [_combine_chunks(rows.column(column_name)) for column_name in columns_names]
+         for start in range(0, len(rows.schema), MAX_COLUMN_IN_BATCH):
              end = start + MAX_COLUMN_IN_BATCH if start + \
                  MAX_COLUMN_IN_BATCH < len(rows.schema) else len(rows.schema)
              columns_name_chunk = columns_names[start:end]
              columns_chunks = columns[start:end]
              arrays_chunks = arrays[start:end]
-             columns_chunks.append(self._internal_rowid_field)
-             arrays_chunks.append(row_ids.to_pylist())
-             column_record_batch = pa.RecordBatch.from_arrays(
-                 arrays_chunks, schema=pa.schema(columns_chunks))
-             self.update(rows=column_record_batch, columns=columns_name_chunk)
+             if start == 0:
+                 column_record_batch = pa.RecordBatch.from_arrays(
+                     arrays_chunks, schema=pa.schema(columns_chunks))
+                 row_ids = self.insert(rows=column_record_batch, by_columns=False)  # type: ignore
+             else:
+                 columns_chunks.append(self._internal_rowid_field)
+                 arrays_chunks.append(row_ids.to_pylist())
+                 column_record_batch = pa.RecordBatch.from_arrays(
+                     arrays_chunks, schema=pa.schema(columns_chunks))
+                 self.update(rows=column_record_batch, columns=columns_name_chunk)
          return row_ids

      def insert(self,
                 rows: Union[pa.RecordBatch, pa.Table],
-                by_columns: bool = False) -> pa.ChunkedArray:
+                by_columns: bool = True) -> pa.ChunkedArray:
          """Insert a RecordBatch into this table."""
          self._assert_not_imports_table()

@@ -667,9 +679,14 @@ class TableInTransaction(ITable):
              log.debug("Ignoring empty insert into %s", self.ref)
              return pa.chunked_array([], type=self._internal_rowid_field.type)

-         if by_columns:
-             self._tx._rpc.features.check_return_row_ids()
-             return self.insert_in_column_batches(rows)
+         # inserting by columns is faster, so default to doing that
+         # if the cluster supports it and there are actually columns in the rows
+         if by_columns and len(rows.schema):
+             try:
+                 self._tx._rpc.features.check_return_row_ids()
+                 return self.insert_in_column_batches(rows)
+             except errors.NotSupportedVersion:
+                 pass

          try:
              row_ids = []
@@ -802,6 +819,25 @@ class TableInTransaction(ITable):
      def _is_sorted_table(self) -> bool:
          return self._metadata.table_type is TableType.Elysium

+     def vector_search(
+         self,
+         vec: list[float],
+         columns: list[str],
+         limit: int,
+         predicate: Optional[IbisPredicate] = None,
+     ) -> pa.RecordBatchReader:
+         """Vector Search over vector indexed columns."""
+         assert self.vector_index is not None, "Table is not vector indexed (maybe try reloading the TableMetadata)"
+
+         return self._tx.adbc_conn.vector_search(
+             vec,
+             self.vector_index,
+             self.ref,
+             columns,
+             limit,
+             predicate=predicate,
+         )
+


  class Table(TableInTransaction):
      """Vast Interactive Table."""
vastdb/table_metadata.py CHANGED
@@ -4,17 +4,19 @@ import logging
  from copy import deepcopy
  from dataclasses import dataclass
  from enum import Enum
- from typing import TYPE_CHECKING, Optional, Tuple
+ from typing import TYPE_CHECKING, Optional

  import ibis
  import pyarrow as pa

  from vastdb import errors
  from vastdb._ibis_support import validate_ibis_support_schema
+ from vastdb._internal import TableStats, VectorIndex

  if TYPE_CHECKING:
      from .transaction import Transaction

+
  log = logging.getLogger(__name__)


@@ -39,26 +41,16 @@ class TableRef:
          """Table full path."""
          return f"{self.bucket}/{self.schema}/{self.table}"

+     @property
+     def query_engine_full_path(self) -> str:
+         """Table full path for VastDB Query Engine."""
+         return f'"{self.bucket}/{self.schema}".{self.table}'
+
      def __str__(self) -> str:
          """Table full path."""
          return self.full_path


- @dataclass
- class TableStats:
-     """Table-related information."""
-
-     num_rows: int
-     size_in_bytes: int
-     sorting_score: int
-     write_amplification: int
-     acummulative_row_inserition_count: int
-     is_external_rowid_alloc: bool = False
-     sorting_key_enabled: bool = False
-     sorting_done: bool = False
-     endpoints: Tuple[str, ...] = ()
-
-
  class TableMetadata:
      """Table Metadata."""

@@ -67,25 +59,29 @@ class TableMetadata:
      _sorted_columns: Optional[list[str]]
      _ibis_table: ibis.Table
      _stats: Optional[TableStats]
-
-     def __init__(self,
-                  ref: TableRef,
-                  arrow_schema: Optional[pa.Schema] = None,
-                  table_type: Optional[TableType] = None):
+     _vector_index: Optional[VectorIndex]
+
+     def __init__(
+         self,
+         ref: TableRef,
+         arrow_schema: Optional[pa.Schema] = None,
+         table_type: Optional[TableType] = None,
+         vector_index: Optional[VectorIndex] = None,
+     ):
          """Table Metadata."""
          self._ref = deepcopy(ref)
          self._table_type = table_type
          self.arrow_schema = deepcopy(arrow_schema)
          self._sorted_columns = None
          self._stats = None
+         self._vector_index = vector_index

      def __eq__(self, other: object) -> bool:
          """TableMetadata Equal."""
          if not isinstance(other, TableMetadata):
              return False

-         return (self._ref == other._ref and
-                 self._table_type == other._table_type)
+         return self._ref == other._ref and self._table_type == other._table_type

      def rename_table(self, name: str) -> None:
          """Rename table metadata's table name."""
@@ -110,7 +106,8 @@ class TableMetadata:
                  table=self.ref.table,
                  next_key=next_key,
                  txid=tx.active_txid,
-                 list_imports_table=self.is_imports_table)
+                 list_imports_table=self.is_imports_table,
+             )
              fields.extend(cur_columns)
              if not is_truncated:
                  break
@@ -123,9 +120,16 @@ class TableMetadata:
          try:
              next_key = 0
              while True:
-                 cur_columns, next_key, is_truncated, _count = tx._rpc.api.list_sorted_columns(
-                     bucket=self.ref.bucket, schema=self.ref.schema, table=self.ref.table,
-                     next_key=next_key, txid=tx.active_txid, list_imports_table=self.is_imports_table)
+                 cur_columns, next_key, is_truncated, _count = (
+                     tx._rpc.api.list_sorted_columns(
+                         bucket=self.ref.bucket,
+                         schema=self.ref.schema,
+                         table=self.ref.table,
+                         next_key=next_key,
+                         txid=tx.active_txid,
+                         list_imports_table=self.is_imports_table,
+                     )
+                 )
                  fields.extend(cur_columns)
                  if not is_truncated:
                      break
@@ -133,7 +137,9 @@ class TableMetadata:
                  raise
          except errors.InternalServerError as ise:
              log.warning(
-                 "Failed to get the sorted columns Elysium might not be supported: %s", ise)
+                 "Failed to get the sorted columns Elysium might not be supported: %s",
+                 ise,
+             )
              raise
          except errors.NotSupportedVersion:
              log.warning("Failed to get the sorted columns, Elysium not supported")
@@ -143,10 +149,13 @@ class TableMetadata:

      def load_stats(self, tx: "Transaction") -> None:
          """Load/Reload table stats."""
-         stats_tuple = tx._rpc.api.get_table_stats(
-             bucket=self.ref.bucket, schema=self.ref.schema, name=self.ref.table, txid=tx.active_txid,
-             imports_table_stats=self.is_imports_table)
-         self._stats = TableStats(**stats_tuple._asdict())
+         self._stats = tx._rpc.api.get_table_stats(
+             bucket=self.ref.bucket,
+             schema=self.ref.schema,
+             name=self.ref.table,
+             txid=tx.active_txid,
+             imports_table_stats=self.is_imports_table,
+         )

          is_elysium_table = self._stats.sorting_key_enabled

@@ -161,6 +170,18 @@ class TableMetadata:
                  "Actual table is sorted (TableType.Elysium), was not inited as TableType.Elysium"
              )

+         self._parse_stats_vector_index()
+
+     def _parse_stats_vector_index(self):
+         vector_index_is_set = self._vector_index is not None
+
+         if vector_index_is_set and self._stats.vector_index != self._vector_index:
+             raise ValueError(
+                 f"Table has index {self._stats.vector_index}, but was initialized as {self._vector_index}"
+             )
+         else:
+             self._vector_index = self._stats.vector_index
+
      def _set_sorted_table(self, tx: "Transaction"):
          self._table_type = TableType.Elysium
          tx._rpc.features.check_elysium()
@@ -184,7 +205,9 @@ class TableMetadata:
          if arrow_schema:
              validate_ibis_support_schema(arrow_schema)
              self._arrow_schema = arrow_schema
-             self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(arrow_schema), self._ref.full_path)
+             self._ibis_table = ibis.table(
+                 ibis.Schema.from_pyarrow(arrow_schema), self._ref.full_path
+             )
          else:
              self._arrow_schema = None
              self._ibis_table = None
@@ -211,7 +234,8 @@ class TableMetadata:
          """Table's type."""
          if self._table_type is None:
              raise ValueError(
-                 "TableType was not loaded. load using TableMetadata.load_stats")
+                 "TableType was not loaded. load using TableMetadata.load_stats"
+             )

          return self._table_type

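In table_metadata.py, the TableStats dataclass moves to vastdb._internal, the metadata now tracks an optional VectorIndex, and TableRef gains query_engine_full_path, which quotes the bucket/schema pair and appends the table name in the form the VastDB query engine (reached through the transaction's ADBC connection) expects. A small illustration, with placeholder names:

from vastdb.table_metadata import TableRef

ref = TableRef("my-bucket", "my-schema", "my-table")
print(ref.full_path)               # my-bucket/my-schema/my-table
print(ref.query_engine_full_path)  # "my-bucket/my-schema".my-table
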
@@ -0,0 +1,89 @@
+ import pyarrow as pa
+ import pytest
+
+ from vastdb.table_metadata import TableRef
+ from vastdb.transaction import NoAdbcConnectionError
+
+
+ def test_sanity(session_factory, clean_bucket_name: str):
+     session = session_factory(with_adbc=True)
+
+     arrow_schema = pa.schema([("n", pa.int32())])
+
+     ref = TableRef(clean_bucket_name, "s", "t")
+     data_table = pa.table(schema=arrow_schema, data=[[1, 2, 3, 4, 5]])
+
+     with session.transaction() as tx:
+         table = (
+             tx.bucket(clean_bucket_name)
+             .create_schema("s")
+             .create_table("t", arrow_schema)
+         )
+         table.insert(data_table)
+
+     with session.transaction() as tx:
+         tx.adbc_conn.cursor.execute(f"SELECT * FROM {ref.query_engine_full_path}")
+         res = tx.adbc_conn.cursor.fetchall()
+
+         assert res == [(1,), (2,), (3,), (4,), (5,)]
+
+
+ def test_adbc_shares_tx(session_factory, clean_bucket_name: str):
+     session = session_factory(with_adbc=True)
+
+     arrow_schema = pa.schema([("n", pa.int32())])
+
+     data_table = pa.table(schema=arrow_schema, data=[[1, 2, 3, 4, 5]])
+
+     with session.transaction() as tx:
+         table = (
+             tx.bucket(clean_bucket_name)
+             .create_schema("s")
+             .create_table("t", arrow_schema)
+         )
+         table.insert(data_table)
+
+         # expecting adbc execute to "see" table if it shares the transaction with the pysdk
+         tx.adbc_conn.cursor.execute(f"SELECT * FROM {table.ref.query_engine_full_path}")
+         assert tx.adbc_conn.cursor.fetchall() == [(1,), (2,), (3,), (4,), (5,)]
+
+
+ def test_adbc_conn_unreachable_tx_close(session_factory):
+     session = session_factory(with_adbc=True)
+
+     with session.transaction() as tx:
+         assert tx.adbc_conn is not None
+
+     # adbc conn should not be reachable after tx close
+     with pytest.raises(NoAdbcConnectionError):
+         tx.adbc_conn
+
+
+ def test_two_simulatnious_txs_with_adbc(session_factory, clean_bucket_name: str):
+     session = session_factory(with_adbc=True)
+
+     arrow_schema = pa.schema([("n", pa.int32())])
+
+     data_table = pa.table(schema=arrow_schema, data=[[1, 2, 3, 4, 5]])
+
+     with session.transaction() as tx:
+         table = (
+             tx.bucket(clean_bucket_name)
+             .create_schema("s")
+             .create_table("t1", arrow_schema)
+         )
+         table.insert(data_table)
+
+         # expecting adbc execute to "see" table if it shares the transaction with the pysdk
+         tx.adbc_conn.cursor.execute(f"SELECT * FROM {table.ref.query_engine_full_path}")
+         assert tx.adbc_conn.cursor.fetchall() == [(1,), (2,), (3,), (4,), (5,)]
+
+     with session.transaction() as tx:
+         table = (
+             tx.bucket(clean_bucket_name).schema("s").create_table("t2", arrow_schema)
+         )
+         table.insert(data_table)
+
+         # expecting adbc execute to "see" table if it shares the transaction with the pysdk
+         tx.adbc_conn.cursor.execute(f"SELECT * FROM {table.ref.query_engine_full_path}")
+         assert tx.adbc_conn.cursor.fetchall() == [(1,), (2,), (3,), (4,), (5,)]
@@ -8,6 +8,9 @@ from vastdb.table import QueryConfig
  log = logging.getLogger(__name__)


+ DELAY_TO_LET_SERVER_UPDATE: int = 3
+
+
  def test_basic_projections(session, clean_bucket_name):
      with session.transaction() as tx:
          s = tx.bucket(clean_bucket_name).create_schema('s1')
@@ -94,7 +97,7 @@ def test_query_data_with_projection(session, clean_bucket_name):
          actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's']))
          assert actual == expected

-     time.sleep(3)
+     time.sleep(DELAY_TO_LET_SERVER_UPDATE)

      with session.transaction() as tx:
          config = QueryConfig()
@@ -123,3 +126,48 @@ def test_query_data_with_projection(session, clean_bucket_name):

          t.drop()
          s.drop()
+
+
+ def test_projection_stats(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int64()),
+         ('b', pa.int64()),
+     ])
+
+     # min size to be considered as a projection
+     GROUP_SIZE = 64 * 1024
+     expected = pa.table(schema=columns, data=[
+         [i for i in range(GROUP_SIZE)],
+         [i for i in reversed(range(GROUP_SIZE))],
+     ])
+
+     schema_name = "schema"
+     table_name = "table"
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).create_schema(schema_name)
+         t = s.create_table(table_name, expected.schema)
+
+         sorted_columns = ['b']
+         unsorted_columns = ['a']
+         t.create_projection('p1', sorted_columns, unsorted_columns)
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema(schema_name)
+         t = s.table(table_name)
+         t.insert(expected)
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
+         assert actual == expected
+
+     time.sleep(DELAY_TO_LET_SERVER_UPDATE)
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema(schema_name)
+         t = s.table(table_name)
+         projections = t.projections()
+         assert len(projections) == 1
+         stats = projections[0].stats
+         assert stats.num_rows == GROUP_SIZE
+         assert stats.size_in_bytes > 0
+
+         t.drop()
+         s.drop()
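
The include_stats flag added to projections()/projection() in table.py is the counterpart of this test: listing projections now fetches per-projection stats by default, and callers that only need the projection handles can opt out. A minimal sketch, assuming an existing session and placeholder bucket/schema/table names:

with session.transaction() as tx:
    t = tx.bucket("my-bucket").schema("my-schema").table("my-table")
    # skip per-projection stats when they are not needed
    for p in t.projections(include_stats=False):
        print(p)
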
@@ -1,5 +1,6 @@
  import datetime as dt
  import decimal
+ import itertools
  import logging
  import random
  import threading
@@ -17,7 +18,7 @@ from requests.exceptions import HTTPError

  from vastdb import errors
  from vastdb.session import Session
- from vastdb.table import INTERNAL_ROW_ID, QueryConfig
+ from vastdb.table import INTERNAL_ROW_ID, MAX_COLUMN_IN_BATCH, QueryConfig

  from .util import assert_row_ids_ascending_on_first_insertion_to_table, prepare_data

@@ -95,6 +96,39 @@ def test_insert_wide_row(session, clean_bucket_name):
          assert actual == expected


+ @pytest.mark.parametrize("num_columns,insert_by_columns", itertools.product([
+     MAX_COLUMN_IN_BATCH // 2,
+     MAX_COLUMN_IN_BATCH - 1,
+     MAX_COLUMN_IN_BATCH,
+     MAX_COLUMN_IN_BATCH + 1,
+     MAX_COLUMN_IN_BATCH * 2,
+     MAX_COLUMN_IN_BATCH * 10,
+ ],
+     [False, True]
+ )
+ )
+ def test_insert_by_columns_variations(session, clean_bucket_name, num_columns, insert_by_columns):
+     columns = pa.schema([pa.field(f'i{i}', pa.int64()) for i in range(num_columns)])
+     data = [[i] for i in range(num_columns)]
+     expected = pa.table(schema=columns, data=data)
+
+     with prepare_data(session, clean_bucket_name, 's', 't', expected, insert_by_columns=insert_by_columns) as t:
+         actual = t.select().read_all()
+         assert actual == expected
+
+
+ @pytest.mark.parametrize("sorting_key", [0, 40, 80, 120])
+ def test_insert_by_columns_sorted(session, clean_bucket_name, sorting_key):
+     num_columns = 160
+     columns = pa.schema([pa.field(f'i{i}', pa.int64()) for i in range(num_columns)])
+     data = [[i] for i in range(num_columns)]
+     expected = pa.table(schema=columns, data=data)
+
+     with prepare_data(session, clean_bucket_name, 's', 't', expected, sorting_key=[sorting_key], insert_by_columns=True) as t:
+         actual = t.select().read_all()
+         assert actual == expected
+
+
  def test_multi_batch_table(session, clean_bucket_name):
      columns = pa.schema([pa.field('s', pa.utf8())])
      expected = pa.Table.from_batches([