vastdb 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vastdb/__init__.py +6 -2
- vastdb/bench/test_perf.py +3 -3
- vastdb/bucket.py +29 -15
- vastdb/errors.py +40 -7
- vastdb/internal_commands.py +194 -233
- vastdb/schema.py +11 -6
- vastdb/session.py +16 -1
- vastdb/table.py +181 -77
- vastdb/tests/test_duckdb.py +61 -0
- vastdb/tests/test_imports.py +13 -1
- vastdb/tests/test_projections.py +1 -0
- vastdb/tests/test_sanity.py +2 -2
- vastdb/tests/test_schemas.py +3 -3
- vastdb/tests/test_tables.py +60 -50
- vastdb/tests/test_util.py +39 -0
- vastdb/tests/util.py +1 -4
- vastdb/transaction.py +32 -6
- vastdb/util.py +42 -6
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/METADATA +2 -5
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/RECORD +23 -21
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/WHEEL +1 -1
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/LICENSE +0 -0
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/top_level.txt +0 -0
vastdb/schema.py
CHANGED
@@ -6,11 +6,16 @@ It is possible to list and access VAST snapshots generated over a bucket.
 
 import logging
 from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
 
 import pyarrow as pa
 
 from . import bucket, errors, schema, table
 
+if TYPE_CHECKING:
+    from .table import Table
+
+
 log = logging.getLogger(__name__)
 
 
@@ -26,7 +31,7 @@ class Schema:
         """VAST transaction used for this schema."""
         return self.bucket.tx
 
-    def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "
+    def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "Table":
         """Create a new table under this schema."""
         if current := self.table(table_name, fail_if_missing=False):
             if fail_if_exists:
@@ -35,9 +40,9 @@ class Schema:
             return current
         self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid)
         log.info("Created table: %s", table_name)
-        return self.table(table_name)
+        return self.table(table_name)  # type: ignore[return-value]
 
-    def table(self, name: str, fail_if_missing=True) -> "table.Table":
+    def table(self, name: str, fail_if_missing=True) -> Optional["table.Table"]:
         """Get a specific table under this schema."""
         t = self.tables(table_name=name)
         if not t:
@@ -49,14 +54,14 @@ class Schema:
         log.debug("Found table: %s", t[0])
         return t[0]
 
-    def tables(self, table_name=None) -> ["
+    def tables(self, table_name=None) -> List["Table"]:
         """List all tables under this schema."""
         tables = []
         next_key = 0
         name_prefix = table_name if table_name else ""
         exact_match = bool(table_name)
         while True:
-
+            _bucket_name, _schema_name, curr_tables, next_key, is_truncated, _ = \
                 self.tx._rpc.api.list_tables(
                     bucket=self.bucket.name, schema=self.name, next_key=next_key, txid=self.tx.txid,
                     exact_match=exact_match, name_prefix=name_prefix, include_list_stats=exact_match)
@@ -82,4 +87,4 @@ class Schema:
 
 def _parse_table_info(table_info, schema: "schema.Schema"):
     stats = table.TableStats(num_rows=table_info.num_rows, size_in_bytes=table_info.size_in_bytes)
-    return table.Table(name=table_info.name, schema=schema, handle=int(table_info.handle), stats=stats)
+    return table.Table(name=table_info.name, schema=schema, handle=int(table_info.handle), stats=stats, _imports_table=False)
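A minimal usage sketch of the reworked Schema API. The `session` object, bucket and schema names below are illustrative, and `session.transaction()`, `tx.bucket()` and `bucket.schema()` are assumed from the rest of the SDK rather than shown in this diff:

    import pyarrow as pa

    with session.transaction() as tx:  # assumed entry point; not part of this diff
        schema = tx.bucket("my-bucket").schema("my-schema")
        columns = pa.schema([("id", pa.int64()), ("name", pa.utf8())])
        events = schema.create_table("events", columns, fail_if_exists=False)
        # table() is now typed Optional: with fail_if_missing=False it returns None
        assert schema.table("no-such-table", fail_if_missing=False) is None
        print([t.name for t in schema.tables()])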
vastdb/session.py
CHANGED
@@ -11,7 +11,20 @@ import os
 
 import boto3
 
-from . import internal_commands, transaction
+from . import errors, internal_commands, transaction
+
+
+class Features:
+    """VAST database features - check if server is already support a feature."""
+
+    def __init__(self, vast_version):
+        """Save the server version."""
+        self.vast_version = vast_version
+
+    def check_imports_table(self):
+        """Check if the feature that support imports table is supported."""
+        if self.vast_version < (5, 2):
+            raise errors.NotSupportedVersion("import_table requires 5.2+", self.vast_version)
 
 
 class Session:
@@ -27,6 +40,8 @@ class Session:
         endpoint = os.environ['AWS_S3_ENDPOINT_URL']
 
         self.api = internal_commands.VastdbApi(endpoint, access, secret)
+        version_tuple = tuple(int(part) for part in self.api.vast_version.split('.'))
+        self.features = Features(version_tuple)
         self.s3 = boto3.client('s3',
                                aws_access_key_id=access,
                                aws_secret_access_key=secret,
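The new `Features` helper gates newer server functionality on the version reported by `VastdbApi`; a small sketch of the check added above (the version tuple is made up for illustration):

    from vastdb import errors
    from vastdb.session import Features

    features = Features((5, 1))          # pretend the server reported version 5.1
    try:
        features.check_imports_table()   # imports-table support requires VAST 5.2+
    except errors.NotSupportedVersion:
        print("server does not support imports tables yet")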
vastdb/table.py
CHANGED
@@ -1,3 +1,5 @@
+"""VAST Database table."""
+
 import concurrent.futures
 import logging
 import os
@@ -5,18 +7,12 @@ import queue
 from dataclasses import dataclass, field
 from math import ceil
 from threading import Event
-from typing import List, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import ibis
 import pyarrow as pa
 
-from . import errors, schema
-from .internal_commands import (
-    TABULAR_INVALID_ROW_ID,
-    VastdbApi,
-    build_query_data_request,
-    parse_query_data_response,
-)
+from . import errors, internal_commands, schema, util
 
 log = logging.getLogger(__name__)
 
@@ -27,18 +23,24 @@ MAX_ROWS_PER_BATCH = 512 * 1024
 # for example insert of 512k uint8 result in 512k*8bytes response since row_ids are uint64
 MAX_INSERT_ROWS_PER_PATCH = 512 * 1024
 
+
 @dataclass
 class TableStats:
+    """Table-related information."""
+
     num_rows: int
     size_in_bytes: int
     is_external_rowid_alloc: bool = False
-    endpoints:
+    endpoints: Tuple[str, ...] = ()
+
 
 @dataclass
 class QueryConfig:
+    """Query execution configiration."""
+
     num_sub_splits: int = 4
     num_splits: int = 1
-    data_endpoints: [str] = None
+    data_endpoints: Optional[List[str]] = None
     limit_rows_per_sub_split: int = 128 * 1024
     num_row_groups_per_sub_split: int = 8
     use_semi_sorted_projections: bool = True
@@ -48,17 +50,27 @@ class QueryConfig:
 
 @dataclass
 class ImportConfig:
+    """Import execution configiration."""
+
     import_concurrency: int = 2
 
-
-
+
+class SelectSplitState:
+    """State of a specific query split execution."""
+
+    def __init__(self, query_data_request, table: "Table", split_id: int, config: QueryConfig) -> None:
+        """Initialize query split state."""
         self.split_id = split_id
         self.subsplits_state = {i: 0 for i in range(config.num_sub_splits)}
         self.config = config
         self.query_data_request = query_data_request
         self.table = table
 
-    def batches(self, api
+    def batches(self, api: internal_commands.VastdbApi):
+        """Execute QueryData request, and yield parsed RecordBatch objects.
+
+        Can be called repeatedly, to allow pagination.
+        """
         while not self.done:
             response = api.query_data(
                 bucket=self.table.bucket.name,
@@ -71,34 +83,39 @@ class SelectSplitState():
                 txid=self.table.tx.txid,
                 limit_rows=self.config.limit_rows_per_sub_split,
                 sub_split_start_row_ids=self.subsplits_state.items(),
-                enable_sorted_projections=self.config.use_semi_sorted_projections
-
+                enable_sorted_projections=self.config.use_semi_sorted_projections,
+                query_imports_table=self.table._imports_table)
+            pages_iter = internal_commands.parse_query_data_response(
                 conn=response.raw,
                 schema=self.query_data_request.response_schema,
-                start_row_ids=self.subsplits_state
+                start_row_ids=self.subsplits_state,
+                parser=self.query_data_request.response_parser)
 
             for page in pages_iter:
                 for batch in page.to_batches():
                     if len(batch) > 0:
                         yield batch
 
-
     @property
     def done(self):
-
+        """Returns true iff the pagination over."""
+        return all(row_id == internal_commands.TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
+
 
 @dataclass
 class Table:
+    """VAST Table."""
+
     name: str
     schema: "schema.Schema"
     handle: int
     stats: TableStats
-
-
-
+    arrow_schema: pa.Schema = field(init=False, compare=False, repr=False)
+    _ibis_table: ibis.Schema = field(init=False, compare=False, repr=False)
+    _imports_table: bool
 
     def __post_init__(self):
-
+        """Also, load columns' metadata."""
         self.arrow_schema = self.columns()
 
         table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
@@ -106,21 +123,21 @@ class Table:
 
     @property
     def tx(self):
+        """Return transaction."""
         return self.schema.tx
 
     @property
     def bucket(self):
+        """Return bucket."""
         return self.schema.bucket
 
-    def __repr__(self):
-        return f"{type(self).__name__}(name={self.name})"
-
     def columns(self) -> pa.Schema:
+        """Return columns' metadata."""
        fields = []
         next_key = 0
         while True:
             cur_columns, next_key, is_truncated, _count = self.tx._rpc.api.list_columns(
-                bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid)
+                bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid, list_imports_table=self._imports_table)
             fields.extend(cur_columns)
             if not is_truncated:
                 break
@@ -129,6 +146,9 @@ class Table:
         return self.arrow_schema
 
     def projection(self, name: str) -> "Projection":
+        """Get a specific semi-sorted projection of this table."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         projs = self.projections(projection_name=name)
         if not projs:
             raise errors.MissingProjection(self.bucket.name, self.schema.name, self.name, name)
@@ -136,13 +156,16 @@ class Table:
         log.debug("Found projection: %s", projs[0])
         return projs[0]
 
-    def projections(self, projection_name=None) -> ["Projection"]:
+    def projections(self, projection_name=None) -> List["Projection"]:
+        """List all semi-sorted projections of this table."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         projections = []
         next_key = 0
         name_prefix = projection_name if projection_name else ""
         exact_match = bool(projection_name)
         while True:
-
+            _bucket_name, _schema_name, _table_name, curr_projections, next_key, is_truncated, _ = \
                 self.tx._rpc.api.list_projections(
                     bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid,
                     exact_match=exact_match, name_prefix=name_prefix)
@@ -153,7 +176,13 @@ class Table:
                 break
         return [_parse_projection_info(projection, self) for projection in projections]
 
-    def import_files(self, files_to_import: [str], config: ImportConfig = None) -> None:
+    def import_files(self, files_to_import: List[str], config: Optional[ImportConfig] = None) -> None:
+        """Import a list of Parquet files into this table.
+
+        The files must be on VAST S3 server and be accessible using current credentials.
+        """
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         source_files = {}
         for f in files_to_import:
             bucket_name, object_path = _parse_bucket_and_object_names(f)
@@ -161,7 +190,14 @@ class Table:
 
         self._execute_import(source_files, config=config)
 
-    def import_partitioned_files(self, files_and_partitions:
+    def import_partitioned_files(self, files_and_partitions: Dict[str, pa.RecordBatch], config: Optional[ImportConfig] = None) -> None:
+        """Import a list of Parquet files into this table.
+
+        The files must be on VAST S3 server and be accessible using current credentials.
+        Each file must have its own partition values defined as an Arrow RecordBatch.
+        """
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         source_files = {}
         for f, record_batch in files_and_partitions.items():
             bucket_name, object_path = _parse_bucket_and_object_names(f)
@@ -209,7 +245,7 @@ class Table:
                 max_workers=config.import_concurrency, thread_name_prefix='import_thread') as pool:
             try:
                 for endpoint in endpoints:
-                    session = VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
+                    session = internal_commands.VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
                     futures.append(pool.submit(import_worker, files_queue, session))
 
                 log.debug("Waiting for import workers to finish")
@@ -218,24 +254,40 @@ class Table:
             finally:
                 stop_event.set()
                 # ThreadPoolExecutor will be joined at the end of the context
-
+
+    def get_stats(self) -> TableStats:
+        """Get the statistics of this table."""
         stats_tuple = self.tx._rpc.api.get_table_stats(
-            bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid
-
+            bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid,
+            imports_table_stats=self._imports_table)
+        return TableStats(**stats_tuple._asdict())
 
-    def select(self, columns: [str] = None,
+    def select(self, columns: Optional[List[str]] = None,
                predicate: ibis.expr.types.BooleanColumn = None,
-               config: QueryConfig = None,
+               config: Optional[QueryConfig] = None,
                *,
                internal_row_id: bool = False) -> pa.RecordBatchReader:
+        """Execute a query over this table.
+
+        To read a subset of the columns, specify their names via `columns` argument. Otherwise, all columns will be read.
+
+        In order to apply a filter, a predicate can be specified. See https://github.com/vast-data/vastdb_sdk/blob/main/README.md#filters-and-projections for more details.
+
+        Query-execution configuration options can be specified via the optional `config` argument.
+        """
         if config is None:
             config = QueryConfig()
 
-
+        # Take a snapshot of enpoints
+        stats = self.get_stats()
+        endpoints = stats.endpoints if config.data_endpoints is None else config.data_endpoints
+
+        if stats.num_rows > config.rows_per_split and config.num_splits is None:
+            config.num_splits = stats.num_rows // config.rows_per_split
+        log.debug(f"num_rows={stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")
 
-        if
-
-        log.debug(f"num_rows={self.stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")
+        if columns is None:
+            columns = [f.name for f in self.arrow_schema]
 
         query_schema = self.arrow_schema
         if internal_row_id:
@@ -244,12 +296,12 @@ class Table:
             query_schema = pa.schema(queried_fields)
             columns.append(INTERNAL_ROW_ID)
 
-        query_data_request = build_query_data_request(
+        query_data_request = internal_commands.build_query_data_request(
            schema=query_schema,
            predicate=predicate,
            field_names=columns)
 
-        splits_queue = queue.Queue()
+        splits_queue: queue.Queue[int] = queue.Queue()
 
         for split in range(config.num_splits):
             splits_queue.put(split)
@@ -257,8 +309,10 @@ class Table:
         # this queue shouldn't be large it is marely a pipe through which the results
         # are sent to the main thread. Most of the pages actually held in the
         # threads that fetch the pages.
-        record_batches_queue = queue.Queue(maxsize=2)
+        record_batches_queue: queue.Queue[pa.RecordBatch] = queue.Queue(maxsize=2)
+
         stop_event = Event()
+
         class StoppedException(Exception):
             pass
 
@@ -266,9 +320,9 @@ class Table:
             if stop_event.is_set():
                 raise StoppedException
 
-        def single_endpoint_worker(endpoint
+        def single_endpoint_worker(endpoint: str):
            try:
-                host_api = VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
+                host_api = internal_commands.VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
                 while True:
                     check_stop()
                     try:
@@ -293,12 +347,11 @@ class Table:
             log.debug("exiting")
             record_batches_queue.put(None)
 
-        # Take a snapshot of enpoints
-        endpoints = list(self.stats.endpoints) if config.data_endpoints is None else list(config.data_endpoints)
-
         def batches_iterator():
-            def propagate_first_exception(futures
+            def propagate_first_exception(futures: List[concurrent.futures.Future], block=False):
                 done, not_done = concurrent.futures.wait(futures, None if block else 0, concurrent.futures.FIRST_EXCEPTION)
+                if self.tx.txid is None:
+                    raise errors.MissingTransaction()
                 for future in done:
                     future.result()
                 return not_done
@@ -308,7 +361,7 @@ class Table:
         if config.query_id:
             threads_prefix = threads_prefix + "-" + config.query_id
 
-        with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp:  # TODO: concurrency == enpoints is just a heuristic
            futures = [tp.submit(single_endpoint_worker, endpoint) for endpoint in endpoints]
            tasks_running = len(futures)
            try:
@@ -332,113 +385,155 @@ class Table:
 
         return pa.RecordBatchReader.from_batches(query_data_request.response_schema, batches_iterator())
 
-    def _combine_chunks(self, col):
-        if hasattr(col, "combine_chunks"):
-            return col.combine_chunks()
-        else:
-            return col
-
     def insert(self, rows: pa.RecordBatch) -> pa.RecordBatch:
-
-
+        """Insert a RecordBatch into this table."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
+        serialized_slices = util.iter_serialized_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
         for slice in serialized_slices:
-
+            self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
                                          txid=self.tx.txid)
-            (batch,) = pa.RecordBatchStreamReader(res.raw)
-            row_ids.append(batch[INTERNAL_ROW_ID])
 
-
+    def update(self, rows: Union[pa.RecordBatch, pa.Table], columns: Optional[List[str]] = None) -> None:
+        """Update a subset of cells in this table.
+
+        Row IDs are specified using a special field (named "$row_id" of uint64 type).
 
-
+        A subset of columns to be updated can be specified via the `columns` argument.
+        """
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         if columns is not None:
             update_fields = [(INTERNAL_ROW_ID, pa.uint64())]
-            update_values = [
+            update_values = [_combine_chunks(rows[INTERNAL_ROW_ID])]
             for col in columns:
                 update_fields.append(rows.field(col))
-                update_values.append(
+                update_values.append(_combine_chunks(rows[col]))
 
             update_rows_rb = pa.record_batch(schema=pa.schema(update_fields), data=update_values)
         else:
             update_rows_rb = rows
 
-        serialized_slices =
+        serialized_slices = util.iter_serialized_slices(update_rows_rb, MAX_ROWS_PER_BATCH)
         for slice in serialized_slices:
             self.tx._rpc.api.update_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
                                          txid=self.tx.txid)
 
     def delete(self, rows: Union[pa.RecordBatch, pa.Table]) -> None:
+        """Delete a subset of rows in this table.
+
+        Row IDs are specified using a special field (named "$row_id" of uint64 type).
+        """
         delete_rows_rb = pa.record_batch(schema=pa.schema([(INTERNAL_ROW_ID, pa.uint64())]),
-                                         data=[
+                                         data=[_combine_chunks(rows[INTERNAL_ROW_ID])])
 
-        serialized_slices =
+        serialized_slices = util.iter_serialized_slices(delete_rows_rb, MAX_ROWS_PER_BATCH)
         for slice in serialized_slices:
             self.tx._rpc.api.delete_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
-                                         txid=self.tx.txid)
+                                         txid=self.tx.txid, delete_from_imports_table=self._imports_table)
 
     def drop(self) -> None:
-
+        """Drop this table."""
+        self.tx._rpc.api.drop_table(self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, remove_imports_table=self._imports_table)
         log.info("Dropped table: %s", self.name)
 
     def rename(self, new_name) -> None:
+        """Rename this table."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         self.tx._rpc.api.alter_table(
             self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, new_name=new_name)
         log.info("Renamed table from %s to %s ", self.name, new_name)
         self.name = new_name
 
     def add_column(self, new_column: pa.Schema) -> None:
+        """Add a new column."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
         log.info("Added column(s): %s", new_column)
         self.arrow_schema = self.columns()
 
     def drop_column(self, column_to_drop: pa.Schema) -> None:
+        """Drop an existing column."""
+        if self._imports_table:
+            raise errors.NotSupported(self.bucket.name, self.schema.name, self.name)
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         self.tx._rpc.api.drop_columns(self.bucket.name, self.schema.name, self.name, column_to_drop, txid=self.tx.txid)
         log.info("Dropped column(s): %s", column_to_drop)
         self.arrow_schema = self.columns()
 
     def rename_column(self, current_column_name: str, new_column_name: str) -> None:
+        """Rename an existing column."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         self.tx._rpc.api.alter_column(self.bucket.name, self.schema.name, self.name, name=current_column_name,
                                       new_name=new_column_name, txid=self.tx.txid)
         log.info("Renamed column: %s to %s", current_column_name, new_column_name)
         self.arrow_schema = self.columns()
 
     def create_projection(self, projection_name: str, sorted_columns: List[str], unsorted_columns: List[str]) -> "Projection":
+        """Create a new semi-sorted projection."""
+        if self._imports_table:
+            raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         columns = [(sorted_column, "Sorted") for sorted_column in sorted_columns] + [(unsorted_column, "Unorted") for unsorted_column in unsorted_columns]
         self.tx._rpc.api.create_projection(self.bucket.name, self.schema.name, self.name, projection_name, columns=columns, txid=self.tx.txid)
         log.info("Created projection: %s", projection_name)
         return self.projection(projection_name)
 
+    def create_imports_table(self, fail_if_exists=True) -> "Table":
+        """Create imports table."""
+        self.tx._rpc.features.check_imports_table()
+        empty_schema = pa.schema([])
+        self.tx._rpc.api.create_table(self.bucket.name, self.schema.name, self.name, empty_schema, txid=self.tx.txid,
+                                      create_imports_table=True)
+        log.info("Created imports table for table: %s", self.name)
+        return self.imports_table()  # type: ignore[return-value]
+
+    def imports_table(self) -> Optional["Table"]:
+        """Get the imports table under of this table."""
+        self.tx._rpc.features.check_imports_table()
+        return Table(name=self.name, schema=self.schema, handle=int(self.handle), stats=self.stats, _imports_table=True)
+
     def __getitem__(self, col_name):
+        """Allow constructing ibis-like column expressions from this table.
+
+        It is useful for constructing expressions for predicate pushdown in `Table.select()` method.
+        """
         return self._ibis_table[col_name]
 
 
 @dataclass
 class Projection:
+    """VAST semi-sorted projection."""
+
     name: str
     table: Table
     handle: int
     stats: TableStats
-    properties: dict = None
 
     @property
     def bucket(self):
+        """Return bucket."""
         return self.table.schema.bucket
 
     @property
     def schema(self):
+        """Return schema."""
         return self.table.schema
 
     @property
     def tx(self):
+        """Return transaction."""
         return self.table.schema.tx
 
-    def __repr__(self):
-        return f"{type(self).__name__}(name={self.name})"
-
     def columns(self) -> pa.Schema:
+        """Return this projections' columns as an Arrow schema."""
         columns = []
         next_key = 0
         while True:
-            curr_columns, next_key, is_truncated,
+            curr_columns, next_key, is_truncated, _count, _ = \
                 self.tx._rpc.api.list_projection_columns(
                     self.bucket.name, self.schema.name, self.table.name, self.name, txid=self.table.tx.txid, next_key=next_key)
             if not curr_columns:
@@ -450,12 +545,14 @@ class Projection:
         return self.arrow_schema
 
     def rename(self, new_name) -> None:
+        """Rename this projection."""
         self.tx._rpc.api.alter_projection(self.bucket.name, self.schema.name,
                                           self.table.name, self.name, txid=self.tx.txid, new_name=new_name)
         log.info("Renamed projection from %s to %s ", self.name, new_name)
         self.name = new_name
 
     def drop(self) -> None:
+        """Drop this projection."""
         self.tx._rpc.api.drop_projection(self.bucket.name, self.schema.name, self.table.name,
                                          self.name, txid=self.tx.txid)
         log.info("Dropped projection: %s", self.name)
@@ -467,9 +564,9 @@ def _parse_projection_info(projection_info, table: "Table"):
     return Projection(name=projection_info.name, table=table, stats=stats, handle=int(projection_info.handle))
 
 
-def _parse_bucket_and_object_names(path: str) ->
+def _parse_bucket_and_object_names(path: str) -> Tuple[str, str]:
     if not path.startswith('/'):
-        raise errors.
+        raise errors.InvalidArgument(f"Path {path} must start with a '/'")
     components = path.split(os.path.sep)
     bucket_name = components[1]
     object_path = os.path.sep.join(components[2:])
@@ -481,3 +578,10 @@ def _serialize_record_batch(record_batch: pa.RecordBatch) -> pa.lib.Buffer:
     with pa.ipc.new_stream(sink, record_batch.schema) as writer:
         writer.write(record_batch)
     return sink.getvalue()
+
+
+def _combine_chunks(col):
+    if hasattr(col, "combine_chunks"):
+        return col.combine_chunks()
+    else:
+        return col
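Taken together, the new Table surface can be exercised roughly as below; a hedged sketch that assumes `table` was obtained inside a transaction (for example `tx.bucket("b").schema("s").table("t")`) and uses illustrative column names:

    from vastdb.table import QueryConfig

    # Predicates are ibis-style expressions built via Table.__getitem__.
    reader = table.select(
        columns=["id", "name"],
        predicate=table["id"] > 100,
        config=QueryConfig(num_sub_splits=4),
    )
    result = reader.read_all()           # pyarrow.Table with the matching rows

    # Imports table introduced in this release (requires a VAST 5.2+ server).
    imports = table.imports_table()
    print(imports.get_stats())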