vastdb 1.3.8__py3-none-any.whl → 1.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vastdb/__init__.py CHANGED
@@ -13,5 +13,5 @@ def connect(*args, **kwargs): # noqa: D103
 
 def version():
     """Return VAST DB SDK version."""
-    import importlib
-    return importlib.metadata.distribution(__package__).version
+    from importlib import metadata
+    return metadata.distribution(__package__).version
vastdb/_internal.py CHANGED
@@ -794,7 +794,7 @@ def _decode_table_props(s):
     return {y: _prop_coding[x][1](z) for x, y, z in triplets if z != ''}
 
 
-TableInfo = namedtuple('TableInfo', 'name properties handle num_rows size_in_bytes num_partitions sorting_key_enabled')
+TableInfo = namedtuple('TableInfo', 'name properties handle num_rows size_in_bytes num_partitions sorting_key_enabled sorting_score write_amplification acummulative_row_insertion_count sorting_done')
 
 
 def _parse_table_info(obj, parse_properties):
@@ -806,13 +806,20 @@ def _parse_table_info(obj, parse_properties):
     num_partitions = obj.NumPartitions()
     properties = parse_properties(properties)
     sorting_key_enabled = obj.SortingKeyEnabled()
-    return TableInfo(name, properties, handle, num_rows, used_bytes, num_partitions, sorting_key_enabled)
+    sorting_score_raw = obj.SortingScore()
+    write_amplification = obj.WriteAmplification()
+    acummulative_row_insertion_count = obj.AcummulativeRowInseritionCount()
+
+    sorting_score = sorting_score_raw & ((1 << 63) - 1)
+    sorting_done = bool(sorting_score_raw >> 63)
+    return TableInfo(name, properties, handle, num_rows, used_bytes, num_partitions, sorting_key_enabled,
+                     sorting_score, write_amplification, acummulative_row_insertion_count, sorting_done)
 
 
 # Results that returns from tablestats
 
 
-TableStatsResult = namedtuple("TableStatsResult", ["num_rows", "size_in_bytes", "is_external_rowid_alloc", "endpoints"])
+TableStatsResult = namedtuple("TableStatsResult", 'num_rows size_in_bytes is_external_rowid_alloc sorting_key_enabled sorting_score write_amplification acummulative_row_inserition_count sorting_done endpoints')
 
 
 _RETRIABLE_EXCEPTIONS = (
@@ -1213,8 +1220,16 @@ class VastdbApi:
         num_rows = stats.NumRows()
         size_in_bytes = stats.SizeInBytes()
         is_external_rowid_alloc = stats.IsExternalRowidAlloc()
+        sorting_key_enabled = stats.SortingKeyEnabled()
+        sorting_score_raw = stats.SortingScore()
+        write_amplification = stats.WriteAmplification()
+        acummulative_row_inserition_count = stats.AcummulativeRowInseritionCount()
+
+        sorting_score = sorting_score_raw & ((1 << 63) - 1)
+        sorting_done = bool(sorting_score_raw >> 63)
+
         endpoints = [self.url]  # we cannot replace the host by a VIP address in HTTPS-based URLs
-        return TableStatsResult(num_rows, size_in_bytes, is_external_rowid_alloc, tuple(endpoints))
+        return TableStatsResult(num_rows, size_in_bytes, is_external_rowid_alloc, sorting_key_enabled, sorting_score, write_amplification, acummulative_row_inserition_count, sorting_done, tuple(endpoints))
 
     def alter_topic(self, bucket, name,
                     new_name="", expected_retvals=[],
@@ -1302,8 +1317,8 @@ class VastdbApi:
                                  expected_retvals=expected_retvals,
                                  include_list_stats=include_list_stats, count_only=count_only)
 
-    def _list_tables_internal(self, bucket, schema, parse_properties, txid=0, client_tags=[], max_keys=1000, next_key=0, name_prefix="",
-                              exact_match=False, expected_retvals=[], include_list_stats=False, count_only=False):
+    def _list_tables_raw(self, bucket, schema, txid=0, client_tags=[], max_keys=1000, next_key=0, name_prefix="",
+                         exact_match=False, expected_retvals=[], include_list_stats=False, count_only=False):
         """
         GET /mybucket/schema_path?table HTTP/1.1
         tabular-txid: TransactionId
@@ -1323,7 +1338,6 @@ class VastdbApi:
         headers['tabular-list-count-only'] = str(count_only)
         headers['tabular-include-list-stats'] = str(include_list_stats)
 
-        tables = []
         res = self._request(
             method="GET",
             url=self._url(bucket=bucket, schema=schema, command="table"),
@@ -1333,17 +1347,36 @@ class VastdbApi:
         next_key = int(res_headers['tabular-next-key'])
         is_truncated = res_headers['tabular-is-truncated'] == 'true'
         lists = list_tables.GetRootAs(res.content)
+        tables_length = lists.TablesLength()
+        count = int(res_headers['tabular-list-count']) if 'tabular-list-count' in res_headers else tables_length
+        return lists, is_truncated, count
+
+    def _list_tables_internal(self, bucket, schema, parse_properties, txid=0, client_tags=[], max_keys=1000, next_key=0, name_prefix="",
+                              exact_match=False, expected_retvals=[], include_list_stats=False, count_only=False):
+        tables = []
+        lists, is_truncated, count = self._list_tables_raw(bucket, schema, txid=txid, client_tags=client_tags, max_keys=max_keys,
+                                                           next_key=next_key, name_prefix=name_prefix, exact_match=exact_match, expected_retvals=expected_retvals,
+                                                           include_list_stats=include_list_stats, count_only=count_only)
         bucket_name = lists.BucketName().decode()
         schema_name = lists.SchemaName().decode()
         if not bucket.startswith(bucket_name):  # ignore snapshot name
             raise ValueError(f'bucket: {bucket} did not start from {bucket_name}')
         tables_length = lists.TablesLength()
-        count = int(res_headers['tabular-list-count']) if 'tabular-list-count' in res_headers else tables_length
         for i in range(tables_length):
             tables.append(_parse_table_info(lists.Tables(i), parse_properties))
 
         return bucket_name, schema_name, tables, next_key, is_truncated, count
 
+    def raw_sorting_score(self, bucket, schema, txid, name):
+        lists, _, _ = self._list_tables_raw(bucket, schema, txid=txid, exact_match=True, name_prefix=name, include_list_stats=True)
+        bucket_name = lists.BucketName().decode()
+        if not bucket.startswith(bucket_name):  # ignore snapshot name
+            raise ValueError(f'bucket: {bucket} did not start from {bucket_name}')
+        tables_length = lists.TablesLength()
+        if tables_length != 1:
+            raise ValueError(f'table: {name} received {tables_length} response')
+        return lists.Tables(0).SortingScore()
+
     def add_columns(self, bucket, schema, name, arrow_schema, txid=0, client_tags=[], expected_retvals=[]):
         """
         Add a column to table, use the following request
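Note: the sorting fields added above are decoded from a single packed 64-bit SortingScore value, with the top bit marking whether sorting has finished and the lower 63 bits carrying the score (the SORTING_SCORE_BITS constant added to table.py later in this diff uses the same width). A minimal sketch of that decoding; the helper name and the sample value are illustrative, not part of the SDK:

    SORTING_SCORE_BITS = 63

    def decode_sorting_score(raw: int):
        """Split the packed value the same way _parse_table_info does."""
        score = raw & ((1 << SORTING_SCORE_BITS) - 1)  # lower 63 bits
        done = bool(raw >> SORTING_SCORE_BITS)         # top bit
        return score, done

    # Example: top bit set plus a score of 42 decodes to (42, True).
    assert decode_sorting_score((1 << 63) | 42) == (42, True)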
vastdb/bench/test_perf.py CHANGED
@@ -1,3 +1,4 @@
+import datetime as dt
 import logging
 import time
 
@@ -5,6 +6,7 @@ import pytest
 
 from vastdb import util
 from vastdb.table import ImportConfig, QueryConfig
+from vastdb.tests.util import compare_pyarrow_tables
 
 log = logging.getLogger(__name__)
 
@@ -12,17 +14,74 @@ log = logging.getLogger(__name__)
 @pytest.mark.benchmark
 def test_bench(session, test_bucket_name, parquets_path, crater_path):
     files = [str(parquets_path / f) for f in (parquets_path.glob('**/*.pq'))]
+    stats = None
 
     with session.transaction() as tx:
         b = tx.bucket(test_bucket_name)
         s = b.create_schema('s1')
-        t = util.create_table_from_files(s, 't1', files, config=ImportConfig(import_concurrency=8))
+        util.create_table_from_files(s, 't1', files, config=ImportConfig(import_concurrency=8))
+        t2 = util.create_table_from_files(s, 't2', files, config=ImportConfig(import_concurrency=8))
+        # Enabling Elysium with 4 sorting keys - ts, sid, ask_open, ask_close
+        t2.add_sorting_key([2, 0, 3, 4])
+        stats = t2.get_stats()
+        log.info("Added sorting keys")
+
+    assert stats
+    # Waiting up to 2 hours for sorting to complete.
+    start_time = time.time()
+    while not stats.sorting_done:
+        if time.time() - start_time > 7200:
+            raise TimeoutError("Sorting did not complete after waiting for 2 hours.")
+        time.sleep(30)
+        with session.transaction() as tx:
+            table = tx.bucket(test_bucket_name).schema('s1').table('t2')
+            stats = table.get_stats()
+    log.info("Sorting completed")
+
+    queries = [
+        {'query_str': "select sid from {t} where sid = 10033007".format, 'columns': ['sid'],
+         'predicate': lambda t: t['sid'] == 10033007},
+        {'query_str': "select last_trade_price from {t} where ts between "
+                      "TIMESTAMP'2018-01-04 20:30:00' AND TIMESTAMP'2018-01-05 20:30:00'".format,
+         'columns': ['last_trade_price'], 'predicate': lambda t: (t['ts'].between(
+             dt.datetime(2018, 1, 4, 20, 30, 00, 00), dt.datetime(2018, 1, 5, 20, 30, 00, 00)))},
+        {'query_str': "select ts,ask_close,ask_open from {t} where bid_qty = 684000 and ask_close > 1".format,
+         'columns': ['ts', 'ask_close', 'ask_open'],
+         'predicate': lambda t: ((t['bid_qty'] == 684000) & (t['ask_close'] > 1))},
+        {'query_str': "select ts,ticker from {t} where "
+                      "ask_open between 4374 and 4375 OR ask_open between 380 and 381".format,
+         'columns': ['ts', 'ticker'],
+         'predicate': lambda t: ((t['ask_open'].between(4374, 4375)) | (t['ask_open'].between(380, 381)))},
+        {
+            'query_str': "select trade_close, trade_high, trade_low, trade_open from {t} where ticker in ('BANR', 'KELYB')".format,
+            'columns': ['trade_close', 'trade_high', 'trade_low', 'trade_open'],
+            'predicate': lambda t: (t['ticker'].isin(['BANR', 'KELYB']))}
+    ]
+
+    log.info("Starting to run queries")
+    with session.transaction() as tx:
+        schema = tx.bucket(test_bucket_name).schema('s1')
+        t1 = schema.table("t1")
+        t2 = schema.table("t2")
+
         config = QueryConfig(num_splits=8, num_sub_splits=4)
-        s = time.time()
-        pa_table = t.select(columns=['sid'], predicate=t['sid'] == 10033007, config=config).read_all()
-        e = time.time()
-        log.info("'SELECT sid from TABLE WHERE sid = 10033007' returned in %s seconds.", e - s)
-        if crater_path:
-            with open(f'{crater_path}/bench_results', 'a') as f:
-                f.write(f"'SELECT sid FROM TABLE WHERE sid = 10033007' returned in {e - s} seconds")
-        assert pa_table.num_rows == 255_075
+
+        for q in queries:
+            normal_table_res, els_table_res = None, None
+            for table in [t1, t2]:
+                log.info("Starting query: %s", q['query_str'](t=table.name))
+                s = time.time()
+                res = table.select(columns=q['columns'], predicate=q['predicate'](table), config=config).read_all()
+                e = time.time()
+                if table == t1:
+                    normal_table_res = res
+                else:
+                    els_table_res = res
+                log.info("Query %s returned in %s seconds.", q['query_str'](t=table.name), e - s)
+                if crater_path:
+                    with open(f'{crater_path}/bench_results', 'a') as f:
+                        f.write(f"Query '{q['query_str'](t=table)}' returned in {e - s} seconds")
+
+            assert normal_table_res, f"missing result for {t1} table"
+            assert els_table_res, f"missing result for {t2} table"
+            assert compare_pyarrow_tables(normal_table_res, els_table_res)
vastdb/features.py CHANGED
@@ -4,7 +4,7 @@ import logging
 
 from .errors import NotSupportedVersion
 
-log = logging.getLogger()
+log = logging.getLogger(__name__)
 
 
 class Features:
@@ -33,12 +33,16 @@ class Features:
 
         self.check_elysium = self._check(
             "Elysium requires 5.3.5+ VAST release",
-            vast_version >= (5, 3, 5))
+            vast_version >= (5, 3))  # TODO: make this validation stricter for v5.4 (beta/poc version is 5.3.0.x)
 
         self.check_zip_import = self._check(
             "Zip import requires 5.3.1+ VAST release",
             vast_version >= (5, 3, 1))
 
+        self.check_timezone = self._check(
+            "Timezone support requires 5.4+ Vast release",
+            vast_version >= (5, 4))
+
     def _check(self, msg, supported):
         log.debug("%s (current version is %s): supported=%s", msg, self.vast_version, supported)
         if not supported:
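Note: check_timezone follows the same pattern as the other capability checks, and the updated test_timezones later in this diff treats it as raising NotSupportedVersion on clusters older than 5.4. A minimal sketch of gating on it; the helper function and the table object here are assumptions for illustration, not SDK API:

    from vastdb import errors

    def cluster_keeps_timezones(table) -> bool:
        """Return True if the connected cluster preserves timezone metadata."""
        try:
            table.tx._rpc.features.check_timezone()  # assumed to raise on pre-5.4 clusters
            return True
        except errors.NotSupportedVersion:
            return False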
vastdb/table.py CHANGED
@@ -1,9 +1,11 @@
 """VAST Database table."""
 
 import concurrent.futures
+import copy
 import logging
 import os
 import queue
+import sys
 from dataclasses import dataclass, field
 from math import ceil
 from threading import Event
@@ -29,6 +31,7 @@ MAX_ROWS_PER_BATCH = 512 * 1024
 MAX_INSERT_ROWS_PER_PATCH = 512 * 1024
 # in case insert has TooWideRow - need to insert in smaller batches - each cell could contain up to 128K, and our wire is limited to 5MB
 MAX_COLUMN_IN_BATCH = int(5 * 1024 / 128)
+SORTING_SCORE_BITS = 63
 
 
 @dataclass
@@ -37,7 +40,12 @@ class TableStats:
 
     num_rows: int
     size_in_bytes: int
+    sorting_score: int
+    write_amplification: int
+    acummulative_row_inserition_count: int
     is_external_rowid_alloc: bool = False
+    sorting_key_enabled: bool = False
+    sorting_done: bool = False
     endpoints: Tuple[str, ...] = ()
 
 
@@ -277,8 +285,8 @@ class Table:
             except queue.Empty:
                 pass
             if files_batch:
-                log.debug("Starting import batch of %s files", len(files_batch))
-                log.info(f"starting import of {files_batch}")
+                log.info("Starting import batch of %s files", len(files_batch))
+                log.debug(f"starting import of {files_batch}")
                 session.import_data(
                     self.bucket.name, self.schema.name, self.name, files_batch, txid=self.tx.txid,
                     key_names=key_names)
@@ -327,7 +335,8 @@ class Table:
                predicate: Union[ibis.expr.types.BooleanColumn, ibis.common.deferred.Deferred] = None,
                config: Optional[QueryConfig] = None,
                *,
-               internal_row_id: bool = False) -> pa.RecordBatchReader:
+               internal_row_id: bool = False,
+               limit_rows: Optional[int] = None) -> pa.RecordBatchReader:
         """Execute a query over this table.
 
         To read a subset of the columns, specify their names via `columns` argument. Otherwise, all columns will be read.
@@ -336,8 +345,10 @@ class Table:
 
         Query-execution configuration options can be specified via the optional `config` argument.
         """
-        if config is None:
-            config = QueryConfig()
+        config = copy.deepcopy(config) if config else QueryConfig()
+
+        if limit_rows:
+            config.limit_rows_per_sub_split = limit_rows
 
         stats = None
         # Retrieve snapshots only if needed
@@ -372,7 +383,7 @@ class Table:
         num_rows = 0
         if self.sorted_table:
             num_rows = self._get_row_estimate(columns, predicate, query_schema)
-            log.info(f'sorted estimate: {num_rows}')
+            log.debug(f'sorted estimate: {num_rows}')
         if num_rows == 0:
             if stats is None:
                 stats = self.get_stats()
@@ -396,7 +407,7 @@ class Table:
         for split in range(config.num_splits):
             splits_queue.put(split)
 
-        # this queue shouldn't be large it is marely a pipe through which the results
+        # this queue shouldn't be large it is merely a pipe through which the results
         # are sent to the main thread. Most of the pages actually held in the
         # threads that fetch the pages.
         record_batches_queue: queue.Queue[pa.RecordBatch] = queue.Queue(maxsize=2)
@@ -452,6 +463,7 @@ class Table:
         if config.query_id:
             threads_prefix = threads_prefix + "-" + config.query_id
 
+        total_num_rows = limit_rows if limit_rows else sys.maxsize
         with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp:  # TODO: concurrency == enpoints is just a heuristic
             futures = [tp.submit(single_endpoint_worker, endpoint) for endpoint in endpoints]
             tasks_running = len(futures)
@@ -461,7 +473,14 @@ class Table:
 
                 batch = record_batches_queue.get()
                 if batch is not None:
-                    yield batch
+                    if batch.num_rows < total_num_rows:
+                        yield batch
+                        total_num_rows -= batch.num_rows
+                    else:
+                        yield batch.slice(length=total_num_rows)
+                        log.info("reached limit rows per query: %d - stop query", limit_rows)
+                        stop_event.set()
+                        break
                 else:
                     tasks_running -= 1
                     log.debug("one worker thread finished, remaining: %d", tasks_running)
@@ -590,7 +609,7 @@ class Table:
         self.name = new_name
 
     def add_sorting_key(self, sorting_key: list) -> None:
-        """Ads a sorting key to a table that doesn't have any."""
+        """Add a sorting key to a table that doesn't have any."""
        self.tx._rpc.features.check_elysium()
        self.tx._rpc.api.alter_table(
            self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, sorting_key=sorting_key)
@@ -653,6 +672,20 @@ class Table:
         """
         return self._ibis_table[col_name]
 
+    def sorting_done(self) -> int:
+        """Sorting done indicator for the table. Always False for unsorted tables."""
+        if not self.sorted_table:
+            return False
+        raw_sorting_score = self.tx._rpc.api.raw_sorting_score(self.schema.bucket.name, self.schema.name, self.schema.tx.txid, self.name)
+        return bool(raw_sorting_score >> SORTING_SCORE_BITS)
+
+    def sorting_score(self) -> int:
+        """Sorting score for the table. Always 0 for unsorted tables."""
+        if not self.sorted_table:
+            return 0
+        raw_sorting_score = self.tx._rpc.api.raw_sorting_score(self.schema.bucket.name, self.schema.name, self.schema.tx.txid, self.name)
+        return raw_sorting_score & ((1 << SORTING_SCORE_BITS) - 1)
+
 
 @dataclass
 class Projection:
@@ -710,7 +743,8 @@ class Projection:
 
 def _parse_projection_info(projection_info, table: "Table"):
     log.info("Projection info %s", str(projection_info))
-    stats = TableStats(num_rows=projection_info.num_rows, size_in_bytes=projection_info.size_in_bytes)
+    stats = TableStats(num_rows=projection_info.num_rows, size_in_bytes=projection_info.size_in_bytes,
+                       sorting_score=0, write_amplification=0, acummulative_row_inserition_count=0)
     return Projection(name=projection_info.name, table=table, stats=stats, handle=int(projection_info.handle))
 
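Note: a usage sketch of the new table.py surface; a connected session plus the bucket, schema, and table names below are assumed for illustration. select() now accepts limit_rows to cap the number of returned rows (the reader slices the final batch), and sorted tables expose sorting_done() and sorting_score():

    with session.transaction() as tx:
        table = tx.bucket('mybucket').schema('myschema').table('mytable')

        # At most 10 rows come back, even if more rows match.
        first_rows = table.select(columns=['a'], limit_rows=10).read_all()
        assert first_rows.num_rows <= 10

        # Elysium sorting state, only meaningful for tables with a sorting key.
        if table.sorted_table:
            print('sorting done:', table.sorting_done())
            print('sorting score:', table.sorting_score())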
vastdb/tests/test_imports.py CHANGED
@@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from tempfile import NamedTemporaryFile
 
 import pyarrow as pa
@@ -70,59 +71,6 @@ def test_parallel_imports(session, clean_bucket_name, s3):
     assert len(object_names) == len(objects_name['ObjectName'])
 
 
-def test_zip_imports(zip_import_session, clean_bucket_name, s3):
-    num_rows = 10
-    num_files = 5
-    files = []
-    ids = [i for i in range(num_rows)]
-    symbols = [chr(c) for c in range(ord('a'), ord('a') + num_rows)]
-    for i in range(num_files):
-        ds = {'id': ids,
-              'symbol': symbols,
-              f'feature{i}': [i * 10 + k for k in range(num_rows)]}
-        table = pa.Table.from_pydict(ds)
-        with NamedTemporaryFile() as f:
-            pq.write_table(table, f.name)
-            pname = f'prq{i}'
-            s3.put_object(Bucket=clean_bucket_name, Key=pname, Body=f)
-            files.append(f'/{clean_bucket_name}/{pname}')
-
-    with zip_import_session.transaction() as tx:
-        b = tx.bucket(clean_bucket_name)
-        s = b.create_schema('s1')
-        t = s.create_table('t1', pa.schema([('vastdb_rowid', pa.int64()), ('id', pa.int64()), ('symbol', pa.string())]))
-        columns = pa.schema([
-            ('vastdb_rowid', pa.int64()),
-            ('id', pa.int64()),
-            ('symbol', pa.string()),
-        ])
-        ext_row_ids = [10 + i for i in range(num_rows)]
-        arrow_table = pa.table(schema=columns, data=[
-            ext_row_ids,
-            ids,
-            symbols,
-        ])
-        row_ids_array = t.insert(arrow_table)
-        row_ids = row_ids_array.to_pylist()
-        assert row_ids == ext_row_ids
-
-    with zip_import_session.transaction() as tx:
-        s = tx.bucket(clean_bucket_name).schema('s1')
-        t = s.table('t1')
-        log.info("Starting import of %d files", num_files)
-        config = ImportConfig()
-        config.key_names = ['id', 'symbol']
-        t.import_files(files, config=config)
-
-    with zip_import_session.transaction() as tx:
-        s = tx.bucket(clean_bucket_name).schema('s1')
-        t = s.table('t1')
-        arrow_table = t.select(columns=['feature0']).read_all()
-        assert arrow_table.num_rows == num_rows
-        log.debug(f"table schema={t.arrow_schema}")
-        assert len(t.arrow_schema) == 8
-
-
 def test_create_table_from_files(session, clean_bucket_name, s3):
     datasets = [
         {'num': [0],
@@ -202,3 +150,368 @@ def test_import_type_mismatch_error(session, clean_bucket_name, s3):
     assert exc.value.error_dict['object_name'] == prq_name
     assert exc.value.error_dict['res'] == 'TabularMismatchColumnType'
     assert 'num_type_mismatch' in exc.value.error_dict['err_msg']
+
+
+def create_parquet_file(s3, bucket_name, file_key, data):
+    """Creates a Parquet file and uploads it to S3."""
+    parquet_table = pa.Table.from_pydict(data)
+    with NamedTemporaryFile(delete=False) as f:
+        pq.write_table(parquet_table, f.name)
+        with open(f.name, 'rb') as file_data:
+            s3.put_object(Bucket=bucket_name, Key=file_key, Body=file_data)
+    return f'/{bucket_name}/{file_key}'
+
+
+def create_table_with_data(session, bucket_name, schema_name, table_name, schema, data=None):
+    """Creates a table with the specified schema and optional initial data."""
+    with session.transaction() as tx:
+        b = tx.bucket(bucket_name)
+        s = b.create_schema(schema_name)
+        t = s.create_table(table_name, schema)
+        if data:
+            arrow_table = pa.table(schema=schema, data=data)
+            t.insert(arrow_table)
+    return t
+
+
+def attempt_import(session, bucket_name, schema_name, table_name, files, key_names, expected_error=None):
+    """Attempts to import files into a table and handles expected errors."""
+    with session.transaction() as tx:
+        t = tx.bucket(bucket_name).schema(schema_name).table(table_name)
+        config = ImportConfig()
+        config.key_names = key_names
+
+        if expected_error:
+            try:
+                t.import_files(files, config=config)
+            except Exception as e:
+                log.info(f"Caught expected error: {e}")
+                assert expected_error in str(e)
+        else:
+            t.import_files(files, config=config)
+
+
+def test_zip_imports(zip_import_session, clean_bucket_name, s3):
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+    num_rows = 10
+    num_files = 5
+
+    # Step 1: Generate and upload Parquet files
+    files = []
+    for i in range(num_files):
+        data = {
+            'id': [k for k in range(num_rows)],
+            'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+            f'feature{i}': [i * 10 + k for k in range(num_rows)],
+        }
+        file_key = f'prq{i}'
+        files.append(create_parquet_file(s3, clean_bucket_name, file_key, data))
+
+    # Step 2: Create table and insert initial data
+    data = {
+        'vastdb_rowid': [10 + i for i in range(num_rows)],
+        'id': [i for i in range(num_rows)],
+        'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+    }
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, data)
+
+    # Step 3: Import files into the table
+    attempt_import(zip_import_session, clean_bucket_name, 's1', 't1', files, key_names=['id', 'symbol'])
+
+
+def test_zip_imports_scale(zip_import_session, clean_bucket_name, s3):
+    """Verify that many key names, and large amounts of data of different kind work as expected."""
+    # Step 1: Create and upload Parquet data
+    log.info("Step 1: Creating and uploading Parquet data")
+    num_rows = 1_000_000
+    data = {
+        'id': [i for i in range(num_rows)],
+        'symbol': [chr((i % 26) + ord('a')) for i in range(num_rows)],
+        'feature': [i * 10 for i in range(num_rows)],  # Extra column not in the initial table
+        'col_0': [datetime.now() for _ in range(num_rows)],
+        'col_1': [1 for _ in range(num_rows)],
+        'col_2': [2 for _ in range(num_rows)],
+        'col_3': [3 for _ in range(num_rows)],
+        'col_4': [4 for _ in range(num_rows)],
+        'col_5': [5 for _ in range(num_rows)],  # Extra column not in the initial table
+    }
+    file_key = 'large_data.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 2: Create table and insert initial data
+    log.info("Step 2: Creating table and inserting initial data")
+    table_data = {
+        'vastdb_rowid': [10 + i for i in range(num_rows)],
+        'id': data['id'],
+        'symbol': data['symbol'],
+        'col_0': data['col_0'],
+        'col_1': data['col_1'],
+        'col_2': data['col_2'],
+        'col_3': data['col_3'],
+        'col_4': data['col_4'],
+    }
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+        ('col_0', pa.timestamp('s')),
+        ('col_1', pa.int64()),
+        ('col_2', pa.int64()),
+        ('col_3', pa.int64()),
+        ('col_4', pa.int64()),
+    ])
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+
+    # Step 3: Import the Parquet file into the table
+    log.info("Step 3: Importing Parquet file into the table")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol', 'col_0', 'col_1', 'col_2', 'col_3', 'col_4']
+    )
+
+    # Step 4: Verify schema and row count
+    log.info("Step 4: Verifying schema and row count")
+    with (zip_import_session.transaction() as tx):
+        table = tx.bucket(clean_bucket_name).schema('s1').table('t1')
+        updated_schema = table.arrow_schema
+        updated_data = table.select().read_all()
+
+        # Verify schema
+        expected_schema = pa.schema([
+            ('vastdb_rowid', pa.int64()),
+            ('id', pa.int64()),
+            ('symbol', pa.string()),
+            ('col_0', pa.timestamp('s')),
+            ('col_1', pa.int64()),
+            ('col_2', pa.int64()),
+            ('col_3', pa.int64()),
+            ('col_4', pa.int64()),
+            ('feature', pa.int64()),  # Added during import
+            ('col_5', pa.int64()),  # Added during import
+        ])
+        assert updated_schema == expected_schema, \
+            "The table schema does not match the expected schema."
+
+        assert updated_data.num_rows == num_rows, \
+            f"Expected {num_rows} rows, but got {updated_data.num_rows}."
+
+        assert len(updated_schema.names) == 10, \
+            "The table should have exactly 10 columns"
+
+
+def test_zip_imports_missing_columns(zip_import_session, clean_bucket_name, s3):
+    """Verify that importing Parquet data with missing columns fails as expected."""
+    # Step 1: Create and upload Parquet data missing key columns
+    log.info("Step 1: Creating and uploading Parquet data without key columns")
+    data = {
+        'feature': [i * 10 for i in range(10)],  # Only feature column, no 'id' or 'symbol'
+    }
+    file_key = 'missing_keys.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 2: Create table with key columns
+    log.info("Step 2: Creating table with key columns")
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema)
+
+    # Step 3: Attempt to import Parquet data missing key columns
+    log.info("Step 3: Attempting to import data without key columns")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol'],
+        expected_error="Failed to verify import keys"
+    )
+
+
+def test_zip_imports_missing_key_values(zip_import_session, clean_bucket_name, s3):
+    """Verify that importing Parquet data with extra key values fails as expected
+    and that importing a subset of key values fails as expected."""
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+    num_rows = 5
+
+    # Step 1: Create Parquet data with keys 0-4
+    data = {
+        'id': [i for i in range(num_rows)],
+        'symbol': [chr((i % 26) + ord('a')) for i in range(num_rows)],
+        'feature': [i * 10 for i in range(num_rows)],
+    }
+    file_key = 'missing_key_values.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 2: Create a table with non-overlapping keys 3-7
+    table_data = {
+        'vastdb_rowid': [i + 3 for i in range(num_rows)],
+        'id': [i + 3 for i in range(num_rows)],
+        'symbol': [chr(((i + 3) % 26) + ord('k')) for i in range(num_rows)],
+    }
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+
+    # Step 3: Attempt to import Parquet data with mismatched keys
+    log.info("Step 3: Attempting to import Parquet data with keys that do not match the table")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol'],
+        expected_error="Failed to get row_ids to update on table"
+    )
+
+    # Step 4: Create and upload Parquet data with fewer rows but all key values present in the table
+    log.info("Step 4: Creating and uploading Parquet data with fewer rows, but matching all table keys")
+    smaller_data = {
+        'id': [3, 4],  # Subset of the table keys
+        'symbol': ['k', 'l'],  # Matching symbols for keys 3 and 4
+        'feature': [300, 400],  # Example new feature data
+    }
+    smaller_file_key = 'subset_matching_keys.parquet'
+    smaller_file_path = create_parquet_file(s3, clean_bucket_name, smaller_file_key, smaller_data)
+
+    # Step 5: Attempt to import the Parquet data with fewer rows but all key values present
+    log.info("Step 5: Attempting to import smaller Parquet data with all table keys")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [smaller_file_path],
+        key_names=['id', 'symbol'],
+        expected_error='Failed to get row_ids to update on table'
+    )
+
+
+def test_zip_imports_nested_keys(zip_import_session, clean_bucket_name, s3):
+    """Verify that importing Parquet data with nested key columns fails as expected."""
+    # Step 1: Creating Parquet data with nested key columns
+    log.info("Step 1: Creating Parquet data with nested key columns")
+    num_rows = 10
+    nested_keys = [{'id': i, 'symbol': chr(ord('a') + i)} for i in range(num_rows)]
+    feature_column = [i * 10 for i in range(num_rows)]
+
+    ds = {
+        'nested_key': nested_keys,
+        'feature': feature_column,
+    }
+
+    # Use create_parquet_file helper
+    file_key = 'nested_keys.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, ds)
+
+    # Step 2: Creating table with flat key columns
+    log.info("Step 2: Creating table with flat key columns")
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+
+    # Use create_table_with_data helper
+    create_table_with_data(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        schema
+    )
+
+    # Step 3: Attempt to import Parquet data with nested key columns
+    log.info("Step 3: Attempting to import data with nested key columns")
+
+    # Use attempt_import helper with expected error
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        ['id', 'symbol'],
+        expected_error="Failed to verify import keys"
+    )
+
+
+def test_zip_imports_type_mismatch(zip_import_session, clean_bucket_name, s3):
+    """Verify behavior when key column data types in the Parquet file do not match the table schema."""
+    # Step 1: Define table schema with id as string
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.string()),  # Expecting strings here
+        ('symbol', pa.string()),
+    ])
+    num_rows = 10
+
+    # Step 2: Generate and upload a single Parquet file with mismatched id type (integers)
+    log.info("Step 2: Creating a Parquet file with mismatched key column data types")
+    data = {
+        'id': [k for k in range(num_rows)],  # Integers, causing the type mismatch
+        'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+        'feature': [k * 10 for k in range(num_rows)],
+    }
+    file_key = 'mismatched_data.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 3: Create table with string id column and insert valid initial data
+    log.info("Step 3: Creating table with string key column and valid initial data")
+    table_data = {
+        'vastdb_rowid': [10 + i for i in range(num_rows)],
+        'id': [str(i) for i in range(num_rows)],  # Strings to match schema
+        'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+    }
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+
+    # Step 4: Attempt to import the file into the table
+    log.info("Step 4: Attempting to import the Parquet file with mismatched key column data types")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol'],
+        expected_error="TabularMismatchColumnType"
+    )
+
+
+def test_zip_imports_duplicate_key_values(zip_import_session, clean_bucket_name):
+    """Verify that creating a table with duplicate key values fails as expected,
+    also show that it has to be in same order."""
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+
+    # Data with duplicate keys
+    table_data = {
+        'vastdb_rowid': [1, 2, 2, 4, 5],
+        'id': [1, 2, 2, 4, 5],
+        'symbol': ['a', 'b', 'b', 'd', 'e'],
+    }
+
+    try:
+        # Attempt to create the table
+        create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+        assert False, "Expected an error due to duplicate keys, but the table was created successfully."
+    except Exception as e:
+        # Verify the exception is due to duplicate row IDs
+        assert "Found duplicate row ids or not in ascending order" in str(e), f"Unexpected error: {e}"
vastdb/tests/test_tables.py CHANGED
@@ -269,6 +269,24 @@ def test_select_with_multisplits(session, clean_bucket_name):
     assert actual == expected
 
 
+def test_select_with_limit(session, clean_bucket_name):
+    columns = pa.schema([
+        ('a', pa.int32())
+    ])
+
+    data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    data = data * 1000
+    expected = pa.table(schema=columns, data=[data])
+    limit_rows = 10
+
+    with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+        start = time.time()
+        actual = t.select(predicate=(t['a'] < 3), limit_rows=limit_rows).read_all()
+        end = time.time()
+        log.info(f"actual: {actual} elapsed time: {end - start}")
+        assert len(actual) == limit_rows
+
+
 def test_select_with_priority(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int32())
@@ -313,8 +331,13 @@ def test_timezones(session, clean_bucket_name):
 
     inserted = pa.table(schema=columns_with_tz, data=data)
     with prepare_data(session, clean_bucket_name, 's', 't', inserted) as table:
-        assert table.arrow_schema == columns_without_tz
-        assert table.select().read_all() == pa.table(schema=columns_without_tz, data=data)
+        try:
+            table.tx._rpc.features.check_timezone()
+            assert table.arrow_schema == columns_with_tz
+            assert table.select().read_all() == pa.table(schema=columns_with_tz, data=data)
+        except errors.NotSupportedVersion:
+            assert table.arrow_schema == columns_without_tz
+            assert table.select().read_all() == pa.table(schema=columns_without_tz, data=data)
 
 
 def test_types(session, clean_bucket_name):
vastdb/tests/util.py CHANGED
@@ -1,6 +1,8 @@
 import logging
 from contextlib import contextmanager
 
+import pyarrow as pa
+
 log = logging.getLogger(__name__)
 
 
@@ -15,3 +17,22 @@ def prepare_data(session, clean_bucket_name, schema_name, table_name, arrow_tabl
         yield t
         t.drop()
     s.drop()
+
+
+def compare_pyarrow_tables(t1, t2):
+
+    def sort_table(table):
+        return table.sort_by([(col, 'ascending') for col in table.schema.names])
+
+    def compare_tables(table1, table2):
+        if table1.schema != table2.schema:
+            raise RuntimeError(f"Schema mismatch. {table1.schema} vs {table2.schema}")
+
+        for t1_col, t2_col in zip(table1.columns, table2.columns):
+            if not pa.compute.equal(t1_col, t2_col).to_pandas().all():
+                raise RuntimeError(f"Data mismatch in column {t1_col} vs {t2_col}.")
+        return True
+
+    sorted_table1 = sort_table(t1)
+    sorted_table2 = sort_table(t2)
+    return compare_tables(sorted_table1, sorted_table2)
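Note: compare_pyarrow_tables treats tables as order-insensitive; both sides are sorted by every column before the column-wise comparison, and any mismatch raises RuntimeError. A small usage sketch with in-memory tables (the sample data is illustrative):

    import pyarrow as pa
    from vastdb.tests.util import compare_pyarrow_tables

    t1 = pa.table({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
    t2 = pa.table({'a': [3, 1, 2], 'b': ['z', 'x', 'y']})
    assert compare_pyarrow_tables(t1, t2)  # same rows, different order, still equal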
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vastdb
-Version: 1.3.8
+Version: 1.3.10
 Summary: VAST Data SDK
 Home-page: https://github.com/vast-data/vastdb_sdk
 Author: VAST DATA
@@ -1,17 +1,17 @@
-vastdb/__init__.py,sha256=J1JjKiFkKC95BHowfh9kJfQFTjRce-QMsc6zF_FfxC0,432
-vastdb/_internal.py,sha256=-TSpq5nYQPrPdB2nsnt7DWDDaD6HC8iOnI5yXRiM3Ao,104965
+vastdb/__init__.py,sha256=uf-AXdzsD4nPxFP7WxkcAXGG0whv8BHLrrXCJtsPGaQ,436
+vastdb/_internal.py,sha256=tGNU-9wOtRoK7OXFmX1-uEgQRjpKQXPA0H4rZy86-JM,107257
 vastdb/bucket.py,sha256=aomUbrfK5Oa6FdGPVsoBXgRW39IzYnmsorF8642r990,2549
 vastdb/config.py,sha256=OehnsWrjzv0-SUouEXmkrKBugiWyhXOn4XiSLV3s9yk,2342
 vastdb/conftest.py,sha256=X2kVveySPQYZlVBXUMoo7Oea5IsvmJzjdqq3fpH2kVw,3469
 vastdb/errors.py,sha256=B_FNFONDE8apoTRL8wkMNjUJWAjXu36mO0HI4cGSBgY,4328
-vastdb/features.py,sha256=3QRyIMUDovLcOTDVM_4qYFHmKtzCDtlkdlbhbK1a8rY,1652
+vastdb/features.py,sha256=ivYbvhiGA858B00vhs_CNzlVV9QDUe53yW6V3J5EoxM,1874
 vastdb/schema.py,sha256=UR1WzQvfAdnpDaNsEaGZLYGC65Blri5MYOWinCcl8Hc,6552
 vastdb/session.py,sha256=toMR0BXwTaECdWDKnIZky1F3MA1SmelRBiqCrqQ3GCM,2067
-vastdb/table.py,sha256=1QSvZDhpaOjRsEu_FU8di3STUrbsRmGW4VFx4g4FYFs,34237
+vastdb/table.py,sha256=NGImmz_KltU80B0u-CYDgEdGOMHSppf7mmVs72WD8wM,35937
 vastdb/transaction.py,sha256=NlVkEowJ_pmtffjWBBDaKExYDKPekjSZyj_fK_bZPJE,3026
 vastdb/util.py,sha256=8CUnVRsJukC3uNHNoB5D0qPf0FxS8OSdVB84nNoLJKc,6290
 vastdb/bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vastdb/bench/test_perf.py,sha256=nyK-BM1HJhPHrcNa2pLNmxqcC_CG2UsJogE92EvN-UM,1082
+vastdb/bench/test_perf.py,sha256=0kbCxK8U9vYO0zCMUYcZHzEICaaII3I0-6FeR5-CNtM,4040
 vastdb/bench/test_sample.py,sha256=LgF4syzij09sH3Noiv1EyCAJ9pvrUE5bxR4RJTVEYag,7881
 vastdb/bench/perf_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/bench/perf_bench/cli.py,sha256=NtaPEBTDI6PWgEtwI1wVbwmUeA5bwGqAj_Z_2lDJ28I,5931
@@ -41,14 +41,14 @@ vastdb/bench/perf_bench/query/query_vastdb.py,sha256=SZYem_EmsaynEftAa_VFobjSJZD
 vastdb/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/tests/metrics.py,sha256=ZCSeBYFSPMG3yI0JrAHs2CrY6wFjx_5GwRTYHVAwLKA,1026
 vastdb/tests/test_duckdb.py,sha256=STw_1PwTQR8Naz6s0p6lQTV1ZTKKhe3LPBUbhqzTCu0,1880
-vastdb/tests/test_imports.py,sha256=ICI9EWFgKf9TbphFRhFifqZoESKWIM3_zb53U7-jOSo,8058
+vastdb/tests/test_imports.py,sha256=R-ExC6IYaf4REGQw0v7iVAz7TPY9vd8S3S892vy86R0,20011
 vastdb/tests/test_nested.py,sha256=LPU6uV3Ri23dBzAEMFQqRPbqapV5LfmiHSHkhILPIY0,6332
 vastdb/tests/test_projections.py,sha256=3y1kubwVrzO-xoR0hyps7zrjOJI8niCYspaFTN16Q9w,4540
 vastdb/tests/test_sanity.py,sha256=bv1ypGDzvOgmMvGbucDYiLQu8krQLlE6NB3M__q87x8,3303
 vastdb/tests/test_schemas.py,sha256=l70YQMlx2UL1KRQhApriiG2ZM7GJF-IzWU31H3Yqn1U,3312
-vastdb/tests/test_tables.py,sha256=D6eHSDjC4SJGFA91qJO56SoVPE040rN37uOrDWRDthk,47634
+vastdb/tests/test_tables.py,sha256=wBPUewfJVEJNyDHwO49qld3lMVjVjUiAzP7ngX07fFA,48478
 vastdb/tests/test_util.py,sha256=n7gvT5Wg6b6bxgqkFXkYqvFd_W1GlUdVfmPv66XYXyA,1956
-vastdb/tests/util.py,sha256=O2bgB5403meX69vVY1gWACOtWLOoXE5yQA00ppk4WN8,596
+vastdb/tests/util.py,sha256=YsCBCcx7n1QOH-IPDpCsl6KEaUQQJRZwGPeayijHNb4,1307
 vastdb/vast_flatbuf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/vast_flatbuf/org/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/vast_flatbuf/org/apache/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -209,8 +209,8 @@ vastdb/vast_flatbuf/tabular/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
 vastdb/vast_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/vast_tests/test_ha.py,sha256=744P4G6VJ09RIkHhMQL4wlipCBJWQVMhyvUrSc4k1HQ,975
 vastdb/vast_tests/test_scale.py,sha256=5jGwOdZH6Tv5tPdZYPWoqcxOceI2jA5i2D1zNKZHER4,3958
-vastdb-1.3.8.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
-vastdb-1.3.8.dist-info/METADATA,sha256=JO1YYjtkqWE7VLusG8OkWUNHw4Osq6hduCUlg6xIU7g,1340
-vastdb-1.3.8.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
-vastdb-1.3.8.dist-info/top_level.txt,sha256=nnKAaZaQa8GFbYpWAexr_B9HrhonZbUlX6hL6AC--yA,7
-vastdb-1.3.8.dist-info/RECORD,,
+vastdb-1.3.10.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
+vastdb-1.3.10.dist-info/METADATA,sha256=BFeEhZ0mgwoCyAKM_EkijrPcI5RWTME4tDtdq-fcWwc,1341
+vastdb-1.3.10.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+vastdb-1.3.10.dist-info/top_level.txt,sha256=nnKAaZaQa8GFbYpWAexr_B9HrhonZbUlX6hL6AC--yA,7
+vastdb-1.3.10.dist-info/RECORD,,