vastdb 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vastdb/__init__.py +6 -3
- vastdb/_internal.py +9 -16
- vastdb/bench/test_perf.py +2 -2
- vastdb/bench/test_sample.py +217 -0
- vastdb/config.py +65 -0
- vastdb/conftest.py +28 -6
- vastdb/errors.py +0 -6
- vastdb/features.py +42 -0
- vastdb/schema.py +1 -2
- vastdb/session.py +12 -45
- vastdb/table.py +8 -52
- vastdb/tests/metrics.py +43 -0
- vastdb/tests/test_sanity.py +1 -1
- vastdb/tests/test_tables.py +29 -40
- vastdb/util.py +1 -8
- vastdb/vast_tests/test_scale.py +68 -0
- {vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/METADATA +1 -1
- {vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/RECORD +21 -16
- {vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/WHEEL +1 -1
- {vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/LICENSE +0 -0
- {vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/top_level.txt +0 -0
vastdb/__init__.py
CHANGED
@@ -1,9 +1,6 @@
 """VAST Database Python SDK."""
 
 import functools
-import importlib.metadata
-
-__version__ = importlib.metadata.distribution(__package__).version
 
 from . import session
 
@@ -12,3 +9,9 @@ from . import session
 @functools.wraps(session.Session)
 def connect(*args, **kwargs):  # noqa: D103
     return session.Session(*args, **kwargs)
+
+
+def version():
+    """Return VAST DB SDK version."""
+    import importlib
+    return importlib.metadata.distribution(__package__).version
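The module-level `__version__` attribute is replaced by a lazy `version()` helper. A minimal usage sketch (assuming the wheel is installed in the current environment):

    import vastdb

    # Reads the installed distribution's metadata at call time,
    # so the result reflects the wheel that is actually installed.
    print(vastdb.version())  # e.g. '0.1.11'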
vastdb/_internal.py
CHANGED
@@ -5,9 +5,8 @@ import re
 import struct
 import urllib.parse
 from collections import defaultdict, namedtuple
-from dataclasses import dataclass, field
 from enum import Enum
-from typing import Any,
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 
 import backoff
 import flatbuffers
@@ -104,6 +103,7 @@ from vast_flatbuf.tabular.ListSchemasResponse import ListSchemasResponse as list
 from vast_flatbuf.tabular.ListTablesResponse import ListTablesResponse as list_tables
 
 from . import errors
+from .config import BackoffConfig
 
 UINT64_MAX = 18446744073709551615
 
@@ -742,15 +742,6 @@ def _backoff_giveup(exc: Exception) -> bool:
     return True  # give up in case of other exceptions
 
 
-@dataclass
-class BackoffConfig:
-    wait_gen: Callable = field(default=backoff.expo)
-    max_value: Optional[float] = None  # max duration for a single wait period
-    max_tries: int = 10
-    max_time: float = 60.0  # in seconds
-    backoff_log_level: int = logging.DEBUG
-
-
 class VastdbApi:
     # we expect the vast version to be <major>.<minor>.<patch>.<protocol>
     VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')
@@ -759,15 +750,17 @@ class VastdbApi:
                  *,
                  auth_type=AuthType.SIGV4,
                  ssl_verify=True,
+                 timeout=None,
                  backoff_config: Optional[BackoffConfig] = None):
 
-        from . import
-        self.client_sdk_version = f"VAST Database Python SDK {
+        from . import version  # import lazily here (to avoid circular dependencies)
+        self.client_sdk_version = f"VAST Database Python SDK {version()} - 2024 (c)"
 
         url = urllib3.util.parse_url(endpoint)
         self.access_key = access_key
        self.secret_key = secret_key
 
+        self.timeout = timeout
         self.default_max_list_columns_page_size = 1000
         self._session = requests.Session()
         self._session.verify = ssl_verify
@@ -820,9 +813,9 @@ class VastdbApi:
            raise NotImplementedError(msg)
 
     def _single_request(self, *, method, url, skip_status_check=False, **kwargs):
-        _logger.debug("Sending request: %s %s %s", method, url, kwargs)
+        _logger.debug("Sending request: %s %s %s timeout=%s", method, url, kwargs, self.timeout)
         try:
-            res = self._session.request(method=method, url=url, **kwargs)
+            res = self._session.request(method=method, url=url, timeout=self.timeout, **kwargs)
         except requests.exceptions.ConnectionError as err:
             # low-level connection issue, it is safe to retry only read-only requests
             may_retry = (method == "GET")
@@ -1807,7 +1800,7 @@ def _iter_query_data_response_columns(fileobj, stream_ids=None):
                 batches.append(batch)
             except StopIteration:  # we got an end-of-stream IPC message for a given stream ID
                 reader, batches = readers.pop(stream_id)  # end of column
-                table = pa.Table.from_batches(batches)  # concatenate all column chunks (as a single)
+                table = pa.Table.from_batches(batches=batches, schema=reader.schema)  # concatenate all column chunks (as a single)
                 _logger.debug("stream_id=%d rows=%d column=%s", stream_id, len(table), table)
                 yield (stream_id, next_row_id, table)
 
vastdb/bench/test_perf.py
CHANGED
@@ -10,11 +10,11 @@ log = logging.getLogger(__name__)
 
 
 @pytest.mark.benchmark
-def test_bench(session,
+def test_bench(session, bucket_name, parquets_path, crater_path):
     files = [str(parquets_path / f) for f in (parquets_path.glob('**/*.pq'))]
 
     with session.transaction() as tx:
-        b = tx.bucket(
+        b = tx.bucket(bucket_name)
         s = b.create_schema('s1')
         t = util.create_table_from_files(s, 't1', files, config=ImportConfig(import_concurrency=8))
         config = QueryConfig(num_splits=8, num_sub_splits=4)
vastdb/bench/test_sample.py
ADDED
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+
+import functools
+import itertools
+import logging
+import os
+import random
+import threading
+import time
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+import numpy as np
+import pyarrow as pa
+
+import vastdb.errors
+from vastdb.table import INTERNAL_ROW_ID
+from vastdb.tests import metrics
+
+logging.basicConfig(
+    level="INFO",
+    format="%(asctime)s %(levelname)-10s %(process)d/%(thread)d %(filename)s:%(lineno)d %(message)s")
+
+log = logging.getLogger()
+
+sdk_version = vastdb.version()
+log.info("Python SDK version: %s", sdk_version)
+
+NUM_COLUMNS = 10_000
+COLUMNS_BATCH = 10
+
+NUM_ROW_GROUPS = 100
+ROW_GROUP_SIZE = 100_000
+
+
+INTERNAL_ROWID_FIELD = pa.field(INTERNAL_ROW_ID, pa.uint64())  # used for UPDATE
+EXTERNAL_ROWID_FIELD = pa.field("vastdb_rowid", pa.int64())  # used for INSERT & SELECT
+
+SCHEMA = "perf"
+TABLE = "sample"
+
+SCHEMA_ARROW = pa.schema(
+    [pa.field(f'c{i}', pa.float32()) for i in range(NUM_COLUMNS)]
+)
+
+
+def load_batch(bucket, session_kwargs, offset, limit):
+    log.info('loading into [%d..%d)', offset, limit)
+
+    # Iterate over all row-groups in this file
+    rowids_range = range(offset, limit)
+    rowids = pa.array(rowids_range, INTERNAL_ROWID_FIELD.type)
+
+    session = vastdb.connect(**session_kwargs)
+    metrics_rows = []
+
+    with session.transaction() as tx:
+        table = tx.bucket(bucket).schema(SCHEMA).table(TABLE)
+
+        col = table[EXTERNAL_ROWID_FIELD.name]
+        pred = (col >= rowids_range[0]) & (col <= rowids_range[-1])
+        count = sum(len(rb) for rb in table.select(columns=[], predicate=pred))
+        log.info("%d rows exist at %s", count, rowids_range)
+        if count == len(rowids_range):
+            # skip already loaded rows
+            log.info('skipping [%d..%d)', offset, limit)
+
+        pid = os.getpid()
+        tid = threading.get_native_id()
+        total_nbytes = 0
+        calls = 0
+        t0 = time.time()
+        # Insert/update every chunk of columns in this rowgroup
+        for j in range(0, len(SCHEMA_ARROW), COLUMNS_BATCH):
+            cols_batch = list(SCHEMA_ARROW)[j:j + COLUMNS_BATCH]
+            arrays = [
+                pa.array(np.float32(np.random.uniform(size=[ROW_GROUP_SIZE])))
+                for _ in cols_batch
+            ]
+            chunk = pa.table(data=arrays, schema=pa.schema(cols_batch))
+            nbytes = chunk.get_total_buffer_size()
+            start = time.perf_counter()
+            if j == 0:
+                chunk = chunk.add_column(0, EXTERNAL_ROWID_FIELD, rowids.cast(EXTERNAL_ROWID_FIELD.type))
+                op = 'insert'
+                table.insert(chunk)
+            else:
+                chunk = chunk.add_column(0, INTERNAL_ROWID_FIELD, rowids)
+                op = 'update'
+                table.update(chunk)
+            finish = time.perf_counter()
+
+            metrics_rows.append(metrics.Row(
+                start=start, finish=finish, table_path=table.path, op=op,
+                nbytes=nbytes, rows=len(chunk), cols=len(cols_batch),
+                pid=pid, tid=tid, sdk_version=sdk_version))
+
+            total_nbytes += nbytes
+            calls += 1
+            log.debug("%s into %s: %d rows x %d cols, %.3f MB",
+                      op, rowids_range, len(chunk), len(chunk.schema),
+                      chunk.get_total_buffer_size() / 1e6)
+
+        dt = time.time() - t0
+
+        log.info('loaded into [%d..%d): %d rows x %d cols, %.3f MB, %d RPCs, %.3f seconds',
+                 offset, limit, limit - offset, NUM_COLUMNS, total_nbytes / 1e6, calls, dt)
+    return metrics_rows
+
+
+def test_ingest(test_bucket_name, session_kwargs, tabular_endpoint_urls, num_workers, perf_metrics_db):
+    session = vastdb.connect(**session_kwargs)
+    metrics_table = metrics.Table(perf_metrics_db, "ingest")
+
+    with session.transaction() as tx:
+        b = tx.bucket(test_bucket_name)
+        try:
+            s = b.schema(SCHEMA)
+        except vastdb.errors.MissingSchema:
+            s = b.create_schema(SCHEMA)
+
+        try:
+            s.table(TABLE)
+        except vastdb.errors.MissingTable:
+            s.create_table(TABLE, pa.schema([EXTERNAL_ROWID_FIELD] + list(SCHEMA_ARROW)))
+
+    ranges = [
+        (i * ROW_GROUP_SIZE, (i + 1) * ROW_GROUP_SIZE)
+        for i in range(NUM_ROW_GROUPS)
+    ]
+
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        futures = [
+            executor.submit(load_batch, test_bucket_name, session_kwargs | {'endpoint': url}, offset, limit)
+            for (offset, limit), url in zip(ranges, itertools.cycle(tabular_endpoint_urls))
+        ]
+        log.info("spawned %d futures", len(futures))
+        for future in as_completed(futures):
+            metrics_table.insert(future.result())
+
+    with session.transaction() as tx:
+        t = tx.bucket(test_bucket_name).schema(SCHEMA).table(TABLE)
+        count = sum(len(rb) for rb in t.select([]))
+        log.info("%s has %d rows: %s", t, count, t.stats)
+
+
+def run_query(session_kwargs, i, bucket_name, endpoint_url):
+    num_columns = 2000
+    row_groups_per_query = 10
+
+    config = vastdb.table.QueryConfig(
+        num_sub_splits=1,
+        num_splits=1,
+        limit_rows_per_sub_split=ROW_GROUP_SIZE,
+        num_row_groups_per_sub_split=1)
+
+    row_group_indices = list(range(NUM_ROW_GROUPS))
+    r = random.Random(i)
+    r.shuffle(row_group_indices)
+
+    pid = os.getpid()
+    tid = threading.get_native_id()
+    metrics_rows = []
+
+    session = vastdb.connect(**(session_kwargs | {"endpoint": endpoint_url}))
+    with session.transaction() as tx:
+        t = tx.bucket(bucket_name).schema(SCHEMA).table(TABLE)
+
+        fields = list(t.arrow_schema)[1:]
+        r.shuffle(fields)
+        cols = [f.name for f in fields[:num_columns]]
+
+        vastdb_rowid = t['vastdb_rowid']
+        preds = []
+        for offset in range(0, len(row_group_indices), row_groups_per_query):
+            rowid_ranges = (
+                vastdb_rowid.between(j * ROW_GROUP_SIZE, (j + 1) * ROW_GROUP_SIZE - 1)
+                for j in row_group_indices[offset:offset + row_groups_per_query]
+            )
+            pred = functools.reduce((lambda x, y: x | y), rowid_ranges)
+            preds.append(pred)
+
+        for j, pred in enumerate(preds):
+            log.info("%d) starting query #%d on %s", i, j, endpoint_url)
+
+            start = time.perf_counter()
+            res = t.select(columns=cols, predicate=pred, config=config)
+            rows = 0
+            data = 0
+            for rb in res:
+                rows += len(rb)
+                data += rb.nbytes
+                dt = time.perf_counter() - start
+                log.info("%d) got query #%d batch %.3f[s], %.3f[GB] %.3f[MB/s], %.3f[Mrows]", i, j, dt, data / 1e9, data / 1e6 / dt, rows / 1e6)
+
+            finish = time.perf_counter()
+            dt = finish - start
+            log.info("%d) finished query #%d %.3f[s], %.3f[GB], %.3f[MB/s], %.3f[Mrows]", i, j, dt, data / 1e9, data / 1e6 / dt, rows / 1e6)
+
+            metrics_rows.append(metrics.Row(
+                start=start, finish=finish, table_path=t.path, op="select",
+                nbytes=data, rows=rows, cols=len(cols),
+                pid=pid, tid=tid, sdk_version=sdk_version))
+
+
+def test_scan(test_bucket_name, session, num_workers, session_kwargs, tabular_endpoint_urls, perf_metrics_db):
+    metrics_table = metrics.Table(perf_metrics_db, "query")
+
+    log.info("starting %d workers, endpoints=%s", num_workers, tabular_endpoint_urls)
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        futures = [
+            executor.submit(run_query, session_kwargs, i, test_bucket_name, url)
+            for i, url in zip(range(num_workers), itertools.cycle(tabular_endpoint_urls))
+        ]
+        for future in as_completed(futures):
+            metrics_table.insert(future.result())
+
+    log.info("finished %d workers", num_workers)
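The loader above checks for already-ingested rows by selecting an empty column list under a row-ID predicate and summing batch lengths. A hedged sketch of that pattern outside the benchmark (credentials, endpoint and the bucket/schema/table names are placeholders; a reachable VAST endpoint is assumed):

    import vastdb

    session = vastdb.connect(access="...", secret="...", endpoint="http://vip-pool.example:80")
    with session.transaction() as tx:
        table = tx.bucket("bucket").schema("perf").table("sample")
        col = table["vastdb_rowid"]
        pred = (col >= 0) & (col <= 99_999)
        # No column data is fetched; the batch lengths alone give the matching row count.
        count = sum(len(rb) for rb in table.select(columns=[], predicate=pred))
        print(count)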
vastdb/config.py
ADDED
@@ -0,0 +1,65 @@
+"""Configuration-related dataclasses."""
+
+import logging
+from dataclasses import dataclass, field
+from typing import Callable, List, Optional
+
+import backoff
+
+
+@dataclass
+class BackoffConfig:
+    """Retry configuration."""
+
+    wait_gen: Callable = field(default=backoff.expo)
+    max_value: Optional[float] = None  # max duration for a single wait period
+    max_tries: int = 10
+    max_time: float = 60.0  # in seconds
+    backoff_log_level: int = logging.DEBUG
+
+
+@dataclass
+class QueryConfig:
+    """Query execution configiration."""
+
+    # allows server-side parallel processing by issuing multiple reads concurrently for a single RPC
+    num_sub_splits: int = 4
+
+    # used to split the table into disjoint subsets of rows, to be processed concurrently using multiple RPCs
+    # will be estimated from the table's row count, if not explicitly set
+    num_splits: Optional[int] = None
+
+    # each endpoint will be handled by a separate worker thread
+    # a single endpoint can be specified more than once to benefit from multithreaded execution
+    data_endpoints: Optional[List[str]] = None
+
+    # a subsplit fiber will finish after sending this number of rows back to the client
+    limit_rows_per_sub_split: int = 128 * 1024
+
+    # each fiber will read the following number of rowgroups coninuously before skipping
+    # in order to use semi-sorted projections this value must be 8 (this is the hard coded size of a row groups per row block).
+    num_row_groups_per_sub_split: int = 8
+
+    # can be disabled for benchmarking purposes
+    use_semi_sorted_projections: bool = True
+
+    # enforce using a specific semi-sorted projection (if enabled above)
+    semi_sorted_projection_name: Optional[str] = None
+
+    # used to estimate the number of splits, given the table rows' count
+    rows_per_split: int = 4000000
+
+    # used for worker threads' naming
+    query_id: str = ""
+
+    # non-negative integer, used for server-side prioritization of queued requests:
+    # - requests with lower values will be served before requests with higher values.
+    # - if unset, the request will be added to the queue's end.
+    queue_priority: Optional[int] = None
+
+
+@dataclass
+class ImportConfig:
+    """Import execution configiration."""
+
+    import_concurrency: int = 2
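Since these are plain dataclasses, they can be constructed directly and handed to `vastdb.connect()` and `Table.select()`. A hedged sketch (credentials, endpoint and table names are placeholders; a reachable endpoint is assumed):

    import vastdb
    from vastdb.config import BackoffConfig, QueryConfig

    # Cap retries at 5 attempts and 30 seconds overall.
    session = vastdb.connect(access="...", secret="...",
                             endpoint="http://vip-pool.example:80",
                             backoff_config=BackoffConfig(max_tries=5, max_time=30.0))

    # Fan the scan out over 8 splits with 4 sub-splits each.
    config = QueryConfig(num_splits=8, num_sub_splits=4)
    with session.transaction() as tx:
        table = tx.bucket("bucket").schema("schema").table("table")
        print(table.select(config=config).read_all().num_rows)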
vastdb/conftest.py
CHANGED
@@ -1,4 +1,5 @@
 import os
+import sqlite3
 from pathlib import Path
 
 import boto3
@@ -13,27 +14,43 @@ def pytest_addoption(parser):
                      default=os.environ.get("AWS_ACCESS_KEY_ID", None))
     parser.addoption("--tabular-secret-key", help="Secret key with Tabular permissions (AWS_SECRET_ACCESS_KEY)",
                      default=os.environ.get("AWS_SECRET_ACCESS_KEY", None))
-    parser.addoption("--tabular-endpoint-url", help="Tabular server endpoint", default="
+    parser.addoption("--tabular-endpoint-url", help="Tabular server endpoint", default=[], action="append")
     parser.addoption("--data-path", help="Data files location", default=None)
     parser.addoption("--crater-path", help="Save benchmark results in a dedicated location", default=None)
     parser.addoption("--schema-name", help="Name of schema for the test to operate on", default=None)
     parser.addoption("--table-name", help="Name of table for the test to operate on", default=None)
+    parser.addoption("--num-workers", help="Number of concurrent workers", default=1)
 
 
 @pytest.fixture(scope="session")
-def
-    return
+def session_kwargs(request, tabular_endpoint_urls):
+    return dict(
         access=request.config.getoption("--tabular-access-key"),
         secret=request.config.getoption("--tabular-secret-key"),
-        endpoint=
+        endpoint=tabular_endpoint_urls[0],
     )
 
 
+@pytest.fixture(scope="session")
+def session(session_kwargs):
+    return vastdb.connect(**session_kwargs)
+
+
+@pytest.fixture(scope="session")
+def num_workers(request):
+    return int(request.config.getoption("--num-workers"))
+
+
 @pytest.fixture(scope="session")
 def test_bucket_name(request):
     return request.config.getoption("--tabular-bucket-name")
 
 
+@pytest.fixture(scope="session")
+def tabular_endpoint_urls(request):
+    return request.config.getoption("--tabular-endpoint-url") or ["http://localhost:9090"]
+
+
 def iter_schemas(s):
     """Recusively scan all schemas."""
     children = s.schemas()
@@ -55,12 +72,12 @@ def clean_bucket_name(request, test_bucket_name, session):
 
 
 @pytest.fixture(scope="session")
-def s3(request):
+def s3(request, tabular_endpoint_urls):
     return boto3.client(
         's3',
         aws_access_key_id=request.config.getoption("--tabular-access-key"),
         aws_secret_access_key=request.config.getoption("--tabular-secret-key"),
-        endpoint_url=
+        endpoint_url=tabular_endpoint_urls[0])
 
 
 @pytest.fixture(scope="function")
@@ -81,3 +98,8 @@ def schema_name(request):
 @pytest.fixture(scope="function")
 def table_name(request):
     return request.config.getoption("--table-name")
+
+
+@pytest.fixture(scope="function")
+def perf_metrics_db(crater_path):
+    return sqlite3.connect(f"{crater_path}/metrics.sqlite")
vastdb/errors.py
CHANGED
@@ -3,7 +3,6 @@ import xml.etree.ElementTree
 from dataclasses import dataclass
 from enum import Enum
 
-import pyarrow as pa
 import requests
 
 
@@ -170,11 +169,6 @@ class NotSupportedCommand(NotSupported):
     table: str
 
 
-@dataclass
-class NotSupportedType(NotSupported):
-    field: pa.Field
-
-
 @dataclass
 class NotSupportedVersion(NotSupported):
     err_msg: str
vastdb/features.py
ADDED
@@ -0,0 +1,42 @@
+"""Version-dependent features."""
+
+import logging
+
+from .errors import NotSupportedVersion
+
+log = logging.getLogger()
+
+
+class Features:
+    """VAST database features - check if server is already support a feature."""
+
+    def __init__(self, vast_version):
+        """Save the server version."""
+        self.vast_version = vast_version
+
+        self.check_imports_table = self._check(
+            "Imported objects' table feature requires 5.2+ VAST release",
+            vast_version >= (5, 2))
+
+        self.check_return_row_ids = self._check(
+            "Returning row IDs requires 5.1+ VAST release",
+            vast_version >= (5, 1))
+
+        self.check_enforce_semisorted_projection = self._check(
+            "Semi-sorted projection enforcement requires 5.1+ VAST release",
+            vast_version >= (5, 1))
+
+        self.check_external_row_ids_allocation = self._check(
+            "External row IDs allocation requires 5.1+ VAST release",
+            vast_version >= (5, 1))
+
+    def _check(self, msg, supported):
+        log.debug("%s (current version is %s): supported=%s", msg, self.vast_version, supported)
+        if not supported:
+            def fail():
+                raise NotSupportedVersion(msg, self.vast_version)
+            return fail
+
+        def noop():
+            pass
+        return noop
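Each `check_*` attribute is bound at construction time to either a no-op or a closure that raises `NotSupportedVersion`, so call sites simply invoke it before using a gated feature. An illustrative sketch that needs no server (the version tuples mimic what the API client parses):

    from vastdb.errors import NotSupportedVersion
    from vastdb.features import Features

    old = Features(vast_version=(5, 0, 0, 1))
    try:
        old.check_return_row_ids()  # raises: requires a 5.1+ VAST release
    except NotSupportedVersion as exc:
        print(exc)

    new = Features(vast_version=(5, 2, 0, 1))
    new.check_imports_table()  # no-op on a 5.2+ release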
vastdb/schema.py
CHANGED
@@ -10,7 +10,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional
 
 import pyarrow as pa
 
-from . import bucket, errors, schema, table
+from . import bucket, errors, schema, table
 
 if TYPE_CHECKING:
     from .table import Table
@@ -86,7 +86,6 @@ class Schema:
         if use_external_row_ids_allocation:
             self.tx._rpc.features.check_external_row_ids_allocation()
 
-        util.check_supported_types(columns)
         self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid,
                                       use_external_row_ids_allocation=use_external_row_ids_allocation)
         log.info("Created table: %s", table_name)
vastdb/session.py
CHANGED
@@ -7,51 +7,11 @@ For more details see:
  - [Tabular identity policy with the proper permissions](https://support.vastdata.com/s/article/UUID-14322b60-d6a2-89ac-3df0-3dfbb6974182)
 """
 
-import logging
 import os
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
-
-
-from . import _internal, errors, transaction
-from ._internal import BackoffConfig
-
-log = logging.getLogger()
-
-
-class Features:
-    """VAST database features - check if server is already support a feature."""
-
-    def __init__(self, vast_version):
-        """Save the server version."""
-        self.vast_version = vast_version
-
-        self.check_imports_table = self._check(
-            "Imported objects' table feature requires 5.2+ VAST release",
-            vast_version >= (5, 2))
-
-        self.check_return_row_ids = self._check(
-            "Returning row IDs requires 5.1+ VAST release",
-            vast_version >= (5, 1))
-
-        self.check_enforce_semisorted_projection = self._check(
-            "Semi-sorted projection enforcement requires 5.1+ VAST release",
-            vast_version >= (5, 1))
-
-        self.check_external_row_ids_allocation = self._check(
-            "External row IDs allocation requires 5.1+ VAST release",
-            vast_version >= (5, 1))
-
-    def _check(self, msg, supported):
-        log.debug("%s (current version is %s): supported=%s", msg, self.vast_version, supported)
-        if not supported:
-            def fail():
-                raise errors.NotSupportedVersion(msg, self.vast_version)
-            return fail
-
-        def noop():
-            pass
-        return noop
+if TYPE_CHECKING:
+    from .config import BackoffConfig
 
 
 class Session:
@@ -60,8 +20,13 @@ class Session:
     def __init__(self, access=None, secret=None, endpoint=None,
                  *,
                  ssl_verify=True,
-
+                 timeout=None,
+                 backoff_config: Optional["BackoffConfig"] = None):
         """Connect to a VAST Database endpoint, using specified credentials."""
+        import boto3
+
+        from . import _internal, features
+
         if access is None:
             access = os.environ['AWS_ACCESS_KEY_ID']
         if secret is None:
@@ -74,8 +39,9 @@ class Session:
             access_key=access,
             secret_key=secret,
             ssl_verify=ssl_verify,
+            timeout=timeout,
             backoff_config=backoff_config)
-        self.features = Features(self.api.vast_version)
+        self.features = features.Features(self.api.vast_version)
         self.s3 = boto3.client('s3',
                                aws_access_key_id=access,
                                aws_secret_access_key=secret,
@@ -93,4 +59,5 @@ class Session:
            with session.transaction() as tx:
                tx.bucket("bucket").create_schema("schema")
        """
+        from . import transaction
        return transaction.Transaction(self)
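`Session` now forwards a `timeout` down to `VastdbApi`, which applies it to every underlying `requests` call (see the _internal.py hunk above). A hedged sketch (placeholder credentials and endpoint):

    import vastdb
    from vastdb.config import BackoffConfig

    # Abort any single RPC that hangs for more than 30 seconds,
    # and stop retrying after 3 attempts.
    session = vastdb.connect(
        access="...",
        secret="...",
        endpoint="http://vip-pool.example:80",
        timeout=30,
        backoff_config=BackoffConfig(max_tries=3),
    )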
vastdb/table.py
CHANGED
@@ -14,6 +14,7 @@ import pyarrow as pa
 import urllib3
 
 from . import _internal, errors, schema, util
+from .config import ImportConfig, QueryConfig
 
 log = logging.getLogger(__name__)
 
@@ -39,53 +40,6 @@ class TableStats:
     endpoints: Tuple[str, ...] = ()
 
 
-@dataclass
-class QueryConfig:
-    """Query execution configiration."""
-
-    # allows server-side parallel processing by issuing multiple reads concurrently for a single RPC
-    num_sub_splits: int = 4
-
-    # used to split the table into disjoint subsets of rows, to be processed concurrently using multiple RPCs
-    # will be estimated from the table's row count, if not explicitly set
-    num_splits: Optional[int] = None
-
-    # each endpoint will be handled by a separate worker thread
-    # a single endpoint can be specified more than once to benefit from multithreaded execution
-    data_endpoints: Optional[List[str]] = None
-
-    # a subsplit fiber will finish after sending this number of rows back to the client
-    limit_rows_per_sub_split: int = 128 * 1024
-
-    # each fiber will read the following number of rowgroups coninuously before skipping
-    # in order to use semi-sorted projections this value must be 8 (this is the hard coded size of a row groups per row block).
-    num_row_groups_per_sub_split: int = 8
-
-    # can be disabled for benchmarking purposes
-    use_semi_sorted_projections: bool = True
-
-    # enforce using a specific semi-sorted projection (if enabled above)
-    semi_sorted_projection_name: Optional[str] = None
-
-    # used to estimate the number of splits, given the table rows' count
-    rows_per_split: int = 4000000
-
-    # used for worker threads' naming
-    query_id: str = ""
-
-    # non-negative integer, used for server-side prioritization of queued requests:
-    # - requests with lower values will be served before requests with higher values.
-    # - if unset, the request will be added to the queue's end.
-    queue_priority: Optional[int] = None
-
-
-@dataclass
-class ImportConfig:
-    """Import execution configiration."""
-
-    import_concurrency: int = 2
-
-
 class SelectSplitState:
     """State of a specific query split execution."""
 
@@ -167,8 +121,13 @@ class Table:
         """Also, load columns' metadata."""
         self.arrow_schema = self.columns()
 
-
-        self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(self.arrow_schema),
+        self._table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
+        self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(self.arrow_schema), self._table_path)
+
+    @property
+    def path(self):
+        """Return table's path."""
+        return self._table_path
 
     @property
     def tx(self):
@@ -486,7 +445,6 @@ class Table:
             raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         try:
             row_ids = []
-            util.check_supported_types(rows.schema)
             serialized_slices = util.iter_serialized_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
             for slice in serialized_slices:
                 res = self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
@@ -529,7 +487,6 @@ class Table:
 
         update_rows_rb = util.sort_record_batch_if_needed(update_rows_rb, INTERNAL_ROW_ID)
 
-        util.check_supported_types(update_rows_rb.schema)
         serialized_slices = util.iter_serialized_slices(update_rows_rb, MAX_ROWS_PER_BATCH)
         for slice in serialized_slices:
             self.tx._rpc.api.update_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
@@ -574,7 +531,6 @@ class Table:
         """Add a new column."""
         if self._imports_table:
             raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
-        util.check_supported_types(new_column)
        self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
        log.info("Added column(s): %s", new_column)
        self.arrow_schema = self.columns()
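Because table.py now re-imports the dataclasses from vastdb/config.py, existing references such as `vastdb.table.QueryConfig` (used by test_sample.py above) keep resolving to the same class, and the new `Table.path` property exposes the `bucket/schema/table` string that the benchmark records in its metrics rows. A small sketch of the import equivalence (assuming the package and its dependencies are installed):

    import vastdb.config
    import vastdb.table

    # The re-import makes both names point at the same dataclass objects.
    assert vastdb.table.QueryConfig is vastdb.config.QueryConfig
    assert vastdb.table.ImportConfig is vastdb.config.ImportConfig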
vastdb/tests/metrics.py
ADDED
@@ -0,0 +1,43 @@
+import dataclasses
+import sqlite3
+from typing import List
+
+_MAP_SQLITE_TYPES = {
+    str: "TEXT",
+    float: "REAL",
+    int: "INTEGER",
+}
+
+
+@dataclasses.dataclass
+class Row:
+    start: float
+    finish: float
+    table_path: str
+    op: str
+    nbytes: int
+    rows: int
+    cols: int
+    pid: int
+    tid: int
+    sdk_version: str
+
+
+class Table:
+    def __init__(self, conn: sqlite3.Connection, name: str):
+        self.fields = dataclasses.fields(Row)
+        self.conn = conn
+        self.name = name
+        columns = ", ".join(
+            f"{f.name} {_MAP_SQLITE_TYPES[f.type]}"
+            for f in self.fields
+        )
+        cmd = f"CREATE TABLE {self.name} ({columns})"
+        self.conn.execute(cmd).fetchall()
+
+    def insert(self, rows: List[Row]):
+        args = ", ".join(["?"] * len(self.fields))
+        cmd = f"INSERT INTO {self.name} VALUES ({args})"
+        data = [dataclasses.astuple(row) for row in rows]
+        self.conn.executemany(cmd, data).fetchall()
+        self.conn.commit()
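The helper maps the `Row` dataclass onto a SQLite table, one column per field, using the field annotations to pick SQLite types. A self-contained sketch against an in-memory database (the values are made up):

    import sqlite3

    from vastdb.tests import metrics

    conn = sqlite3.connect(":memory:")
    table = metrics.Table(conn, "ingest")  # CREATE TABLE ingest (start REAL, finish REAL, ...)
    table.insert([
        metrics.Row(start=0.0, finish=1.5, table_path="bucket/perf/sample", op="insert",
                    nbytes=1_000_000, rows=100_000, cols=10, pid=1234, tid=1234,
                    sdk_version="0.1.11"),
    ])
    print(conn.execute("SELECT op, rows, nbytes FROM ingest").fetchall())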
vastdb/tests/test_sanity.py
CHANGED
@@ -24,7 +24,7 @@ def test_bad_credentials(session):
 
 
 def test_bad_endpoint(session):
-    backoff_config = vastdb.
+    backoff_config = vastdb.config.BackoffConfig(max_tries=3)
     with pytest.raises(vastdb.errors.ConnectionError):
         vastdb.connect(access='BAD', secret='BAD', endpoint='http://invalid-host-name-for-tests:12345', backoff_config=backoff_config)
 
vastdb/tests/test_tables.py
CHANGED
@@ -227,6 +227,35 @@ def test_select_with_priority(session, clean_bucket_name):
         t.select(config=config).read_all()
 
 
+def test_timezones(session, clean_bucket_name):
+    columns_with_tz = pa.schema([
+        ('ts0', pa.timestamp('s', tz='+00:00')),
+        ('ts3', pa.timestamp('ms', tz='UTC')),
+        ('ts6', pa.timestamp('us', tz='GMT')),
+        ('ts9', pa.timestamp('ns', tz='Universal')),
+    ])
+
+    # currently timezone information is not stored
+    columns_without_tz = pa.schema([
+        ('ts0', pa.timestamp('s')),
+        ('ts3', pa.timestamp('ms')),
+        ('ts6', pa.timestamp('us')),
+        ('ts9', pa.timestamp('ns')),
+    ])
+
+    data = [
+        [dt.datetime(2024, 4, 10, 12, 34, 56), dt.datetime(2025, 4, 10, 12, 34, 56), dt.datetime(2026, 4, 10, 12, 34, 56)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789000), dt.datetime(2025, 4, 10, 12, 34, 56, 789000), dt.datetime(2026, 4, 10, 12, 34, 56, 789000)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789789), dt.datetime(2025, 4, 10, 12, 34, 56, 789789), dt.datetime(2026, 4, 10, 12, 34, 56, 789789)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789789), dt.datetime(2025, 4, 10, 12, 34, 56, 789789), dt.datetime(2026, 4, 10, 12, 34, 56, 789789)],
+    ]
+
+    inserted = pa.table(schema=columns_with_tz, data=data)
+    with prepare_data(session, clean_bucket_name, 's', 't', inserted) as table:
+        assert table.arrow_schema == columns_without_tz
+        assert table.select().read_all() == pa.table(schema=columns_without_tz, data=data)
+
+
 def test_types(session, clean_bucket_name):
     columns = pa.schema([
         ('tb', pa.bool_()),
@@ -311,46 +340,6 @@ def test_types(session, clean_bucket_name):
     assert select(t['ts9'] == ts_literal) == expected.filter(pc.field('ts9') == ts_literal)
 
 
-TIMESTAMP_UNITS = ['s', 'ms', 'us', 'ns']
-
-
-def test_unsupported_timezone(session, clean_bucket_name):
-    with session.transaction() as tx:
-        s = tx.bucket(clean_bucket_name).create_schema('s1')
-        for unit in TIMESTAMP_UNITS:
-            col_type = pa.timestamp(unit, 'UTC')
-            with pytest.raises(errors.NotSupportedType):
-                s.create_table('t1', pa.schema([('ts', col_type)]))
-            assert s.tables() == []
-
-        cols = [('c', pa.int64())]
-        t1 = s.create_table('t1', pa.schema(cols))
-        for unit in TIMESTAMP_UNITS:
-            col_type = pa.timestamp(unit, 'UTC')
-            with pytest.raises(errors.NotSupportedType):
-                t1.add_column(pa.schema([('ts', col_type)]))
-
-        cols = [(f'c_{unit}', pa.timestamp(unit)) for unit in TIMESTAMP_UNITS]
-        t2 = s.create_table('t2', pa.schema(cols))
-
-        for unit in TIMESTAMP_UNITS:
-            col_type = pa.timestamp(unit, 'UTC')
-
-            rb = pa.record_batch(
-                data=[[None]],
-                schema=pa.schema([(f'c_{unit}', col_type)]))
-            with pytest.raises(errors.NotSupportedType):
-                t2.insert(rb)
-
-            rb = pa.record_batch(
-                data=[[0], [None]],
-                schema=pa.schema([
-                    (INTERNAL_ROW_ID, pa.uint64()),
-                    (f'c_{unit}', col_type)]))
-            with pytest.raises(errors.NotSupportedType):
-                t2.update(rb)
-
-
 def test_filters(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int32()),
vastdb/util.py
CHANGED
@@ -6,7 +6,7 @@ import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.parquet as pq
 
-from .errors import InvalidArgument,
+from .errors import InvalidArgument, TooWideRow
 
 log = logging.getLogger(__name__)
 
@@ -152,10 +152,3 @@ def sort_record_batch_if_needed(record_batch, sort_column):
         return record_batch.sort_by(sort_column)
     else:
         return record_batch
-
-
-def check_supported_types(fields: pa.Schema):
-    for f in fields:
-        if isinstance(f.type, pa.TimestampType):
-            if f.type.tz:
-                raise NotSupportedType(f)
vastdb/vast_tests/test_scale.py
ADDED
@@ -0,0 +1,68 @@
+import logging
+import random
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+import pyarrow as pa
+
+from vastdb.table import QueryConfig
+
+logger = logging.getLogger(__name__)
+
+
+def test_concurrent_query(session, test_bucket_name, schema_name, table_name):
+    """
+    This test runs several selective queries in parallel. It is used to check various internal VAST scenarios.
+    """
+    amount_of_queries_in_parallel = 10  # due to limit on requests connection-pool
+    config = QueryConfig(num_splits=1, num_sub_splits=1)
+
+    def _execute_single_query():
+        with session.transaction() as tx:
+            t = tx.bucket(test_bucket_name).schema(schema_name).table(table_name)
+            pred = (t["a"] == 0)  # 0 is in the min-max range
+            s = time.time()
+            t.select(config=config, predicate=pred).read_all()
+            e = time.time()
+            logger.info(f"Query took {e - s}")
+
+    logger.info(f"about to submit {amount_of_queries_in_parallel} queries in parallel")
+    with ThreadPoolExecutor() as executor:
+        futures = [executor.submit(_execute_single_query) for _ in range(amount_of_queries_in_parallel)]
+        for future in futures:
+            future.result()
+    logger.info(f"finished running {amount_of_queries_in_parallel} queries")
+
+
+def test_table_stats(session, test_bucket_name, schema_name, table_name):
+    """
+    Testing stats integrity while altering table
+    """
+    NUM_TIMES_TO_INSERT = 1000
+    seed = random.randint(0, 10)
+    logger.info(f"random seed is {seed}")
+    r = random.Random(seed)
+
+    with session.transaction() as tx:
+        t = tx.bucket(test_bucket_name).schema(schema_name).table(table_name)
+        initial_stat = t.get_stats()
+        table_fields = t.columns()
+
+    rand_values = {}  # create a dict with a random value from each column
+    with session.transaction() as tx:
+        t = tx.bucket(test_bucket_name).schema(schema_name).table(table_name)
+        for col in table_fields:
+            res = t.select(columns=[col.name]).read_all().column(col.name)
+            rand_values[col.name] = res[int(r.uniform(0, len(res)))].as_py()
+
+    logger.info(f"rand row to insert to the table - {rand_values}, {NUM_TIMES_TO_INSERT} times")
+    rb = pa.RecordBatch.from_pylist([rand_values] * NUM_TIMES_TO_INSERT)
+    with session.transaction() as tx:
+        t = tx.bucket(test_bucket_name).schema(schema_name).table(table_name)
+        t.insert(rb)
+        time.sleep(2)  # waiting for stats to get updated
+        new_stat = t.get_stats()
+
+    logger.info("inserted to table")
+    assert new_stat.size_in_bytes != initial_stat.size_in_bytes
+    assert new_stat.num_rows - NUM_TIMES_TO_INSERT == initial_stat.num_rows
{vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/RECORD
CHANGED
@@ -148,32 +148,37 @@ vast_flatbuf/tabular/ObjectDetails.py,sha256=qW0WtbkCYYE_L-Kw6VNRDCLYaRm5lKvTbLN
 vast_flatbuf/tabular/S3File.py,sha256=KC9c2oS5-JXwTTriUVFdjOvRG0B54Cq9kviSDZY3NI0,4450
 vast_flatbuf/tabular/VipRange.py,sha256=_BJd1RRZAcK76T9vlsHzXKYVsPVaz6WTEAqStMQCAUQ,2069
 vast_flatbuf/tabular/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vastdb/__init__.py,sha256=
-vastdb/_internal.py,sha256=
+vastdb/__init__.py,sha256=J1JjKiFkKC95BHowfh9kJfQFTjRce-QMsc6zF_FfxC0,432
+vastdb/_internal.py,sha256=6Z0pkMCZNInJPFmWl9UvcLxBEX8CJZjV0hIsi_9jib0,89808
 vastdb/bucket.py,sha256=5KuKhPjZOevznZqWHDVVocejvAy7dcwobPuV6BJCfPc,2544
-vastdb/
-vastdb/
-vastdb/
-vastdb/
-vastdb/
+vastdb/config.py,sha256=1tMYtzKXerGcIUjH4tIGEvZNWvO4fviCEdcNCnELJZo,2269
+vastdb/conftest.py,sha256=ePzQiEQmlNGcM2T4GZevE4XuvcnFWfnTSzr8IVZpVKk,3438
+vastdb/errors.py,sha256=2XR1ko7J5nkfiHSAgwuVAADw0SsyqxOwSeFaGgKZEXM,4186
+vastdb/features.py,sha256=DxV746LSkORwVSD6MP2hdXRfnyoLkJwtOwGmp1dnquo,1322
+vastdb/schema.py,sha256=X7IRrogXH7Z0kes-DsDh1bRqIhvjH6owlFigGBXy7XQ,5913
+vastdb/session.py,sha256=ZrQf8cecVIHIBUOPNg4ed8ZCnEEu0QW1OBxQgz_ia80,2241
+vastdb/table.py,sha256=2z5zpnBc5iM5ZqELCVg6wEIdYcPVm6UW_4Xm55S8ZXg,31078
 vastdb/transaction.py,sha256=qu2rOlR7AS1ojMOzgWapQMpcorrutelZZLH1mLmTHxk,3186
-vastdb/util.py,sha256=
+vastdb/util.py,sha256=4LTYBBR13na376AmDm5lQILJzLcfelIKdkNPy0IqI0o,5684
 vastdb/bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vastdb/bench/test_perf.py,sha256=
+vastdb/bench/test_perf.py,sha256=gZIqfHva6lNFpD-9bHAe7M8COBjUyrPkHu3E7F8J2L0,1072
+vastdb/bench/test_sample.py,sha256=bFmw7BOCr5FoGn4TY9pQGd6_cVNK4uBeSRi33tTubyk,7847
 vastdb/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vastdb/tests/metrics.py,sha256=N6ELJUmZubhAMmUtDbisXr6TFhSDgVCTTU05gBVxHRA,1010
 vastdb/tests/test_duckdb.py,sha256=STw_1PwTQR8Naz6s0p6lQTV1ZTKKhe3LPBUbhqzTCu0,1880
 vastdb/tests/test_imports.py,sha256=xKub3-bisFjH0BsZM8COfiUWuMrtoOoQKprF6VQT9RI,5669
 vastdb/tests/test_nested.py,sha256=22NAxBTm7Aq-Vn6AIYbi5Cb1ET8W0XeLK3pp4D8BYWI,3448
 vastdb/tests/test_projections.py,sha256=3y1kubwVrzO-xoR0hyps7zrjOJI8niCYspaFTN16Q9w,4540
-vastdb/tests/test_sanity.py,sha256=
+vastdb/tests/test_sanity.py,sha256=oiV2gb05aPyG5RMNUQZlyjNlg3T7Fig1_8OJzpAgcsk,3038
 vastdb/tests/test_schemas.py,sha256=l70YQMlx2UL1KRQhApriiG2ZM7GJF-IzWU31H3Yqn1U,3312
-vastdb/tests/test_tables.py,sha256=
+vastdb/tests/test_tables.py,sha256=qWicD0BYuhrh1kRVqkHMJNsxcHxDcCprbEXuZJm1wm4,31529
 vastdb/tests/test_util.py,sha256=Ok_sAEBJsRGF5Voa_v5eu3eAd52GWu8jMjjQbadwW-s,1260
 vastdb/tests/util.py,sha256=dpRJYbboDnlqL4qIdvScpp8--5fxRUBIcIYitrfcj9o,555
 vastdb/vast_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/vast_tests/test_ha.py,sha256=744P4G6VJ09RIkHhMQL4wlipCBJWQVMhyvUrSc4k1HQ,975
-vastdb
-vastdb-0.1.
-vastdb-0.1.
-vastdb-0.1.
-vastdb-0.1.
+vastdb/vast_tests/test_scale.py,sha256=EpjCJmVAQrNBxVnHGJ-KHCoxevhqOcyqYFPMIIY9s60,2714
+vastdb-0.1.11.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
+vastdb-0.1.11.dist-info/METADATA,sha256=11xuX_TRPnPWsTe6bDgBx-EM--9zLolqog9Z3NhDpno,1351
+vastdb-0.1.11.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
+vastdb-0.1.11.dist-info/top_level.txt,sha256=Vsj2MKtlhPg0J4so64slQtnwjhgoPmJgcG-6YcVAwVc,20
+vastdb-0.1.11.dist-info/RECORD,,
{vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/LICENSE: file without changes
{vastdb-0.1.9.dist-info → vastdb-0.1.11.dist-info}/top_level.txt: file without changes