vastdb 1.3.9__py3-none-any.whl → 1.3.10__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in one of the supported public registries. It is provided for informational purposes only.
vastdb/bench/test_perf.py CHANGED
@@ -1,3 +1,4 @@
+import datetime as dt
 import logging
 import time
 
@@ -5,6 +6,7 @@ import pytest
 
 from vastdb import util
 from vastdb.table import ImportConfig, QueryConfig
+from vastdb.tests.util import compare_pyarrow_tables
 
 log = logging.getLogger(__name__)
 
@@ -12,17 +14,74 @@ log = logging.getLogger(__name__)
 @pytest.mark.benchmark
 def test_bench(session, test_bucket_name, parquets_path, crater_path):
     files = [str(parquets_path / f) for f in (parquets_path.glob('**/*.pq'))]
+    stats = None
 
     with session.transaction() as tx:
         b = tx.bucket(test_bucket_name)
         s = b.create_schema('s1')
-        t = util.create_table_from_files(s, 't1', files, config=ImportConfig(import_concurrency=8))
+        util.create_table_from_files(s, 't1', files, config=ImportConfig(import_concurrency=8))
+        t2 = util.create_table_from_files(s, 't2', files, config=ImportConfig(import_concurrency=8))
+        # Enabling Elysium with 4 sorting keys - ts, sid, ask_open, ask_close
+        t2.add_sorting_key([2, 0, 3, 4])
+        stats = t2.get_stats()
+        log.info("Added sorting keys")
+
+    assert stats
+    # Waiting up to 2 hours for sorting to complete.
+    start_time = time.time()
+    while not stats.sorting_done:
+        if time.time() - start_time > 7200:
+            raise TimeoutError("Sorting did not complete after waiting for 2 hours.")
+        time.sleep(30)
+        with session.transaction() as tx:
+            table = tx.bucket(test_bucket_name).schema('s1').table('t2')
+            stats = table.get_stats()
+    log.info("Sorting completed")
+
+    queries = [
+        {'query_str': "select sid from {t} where sid = 10033007".format, 'columns': ['sid'],
+         'predicate': lambda t: t['sid'] == 10033007},
+        {'query_str': "select last_trade_price from {t} where ts between "
+                      "TIMESTAMP'2018-01-04 20:30:00' AND TIMESTAMP'2018-01-05 20:30:00'".format,
+         'columns': ['last_trade_price'], 'predicate': lambda t: (t['ts'].between(
+             dt.datetime(2018, 1, 4, 20, 30, 00, 00), dt.datetime(2018, 1, 5, 20, 30, 00, 00)))},
+        {'query_str': "select ts,ask_close,ask_open from {t} where bid_qty = 684000 and ask_close > 1".format,
+         'columns': ['ts', 'ask_close', 'ask_open'],
+         'predicate': lambda t: ((t['bid_qty'] == 684000) & (t['ask_close'] > 1))},
+        {'query_str': "select ts,ticker from {t} where "
+                      "ask_open between 4374 and 4375 OR ask_open between 380 and 381".format,
+         'columns': ['ts', 'ticker'],
+         'predicate': lambda t: ((t['ask_open'].between(4374, 4375)) | (t['ask_open'].between(380, 381)))},
+        {
+            'query_str': "select trade_close, trade_high, trade_low, trade_open from {t} where ticker in ('BANR', 'KELYB')".format,
+            'columns': ['trade_close', 'trade_high', 'trade_low', 'trade_open'],
+            'predicate': lambda t: (t['ticker'].isin(['BANR', 'KELYB']))}
+    ]
+
+    log.info("Starting to run queries")
+    with session.transaction() as tx:
+        schema = tx.bucket(test_bucket_name).schema('s1')
+        t1 = schema.table("t1")
+        t2 = schema.table("t2")
+
         config = QueryConfig(num_splits=8, num_sub_splits=4)
-        s = time.time()
-        pa_table = t.select(columns=['sid'], predicate=t['sid'] == 10033007, config=config).read_all()
-        e = time.time()
-        log.info("'SELECT sid from TABLE WHERE sid = 10033007' returned in %s seconds.", e - s)
-        if crater_path:
-            with open(f'{crater_path}/bench_results', 'a') as f:
-                f.write(f"'SELECT sid FROM TABLE WHERE sid = 10033007' returned in {e - s} seconds")
-        assert pa_table.num_rows == 255_075
+
+        for q in queries:
+            normal_table_res, els_table_res = None, None
+            for table in [t1, t2]:
+                log.info("Starting query: %s", q['query_str'](t=table.name))
+                s = time.time()
+                res = table.select(columns=q['columns'], predicate=q['predicate'](table), config=config).read_all()
+                e = time.time()
+                if table == t1:
+                    normal_table_res = res
+                else:
+                    els_table_res = res
+                log.info("Query %s returned in %s seconds.", q['query_str'](t=table.name), e - s)
+                if crater_path:
+                    with open(f'{crater_path}/bench_results', 'a') as f:
+                        f.write(f"Query '{q['query_str'](t=table)}' returned in {e - s} seconds")
+
+            assert normal_table_res, f"missing result for {t1} table"
+            assert els_table_res, f"missing result for {t2} table"
+            assert compare_pyarrow_tables(normal_table_res, els_table_res)
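For reference, the updated benchmark exercises the new sorting-key ("Elysium") flow end to end: build a table from Parquet files, attach a sorting key, then poll get_stats().sorting_done until background sorting finishes. A minimal sketch of that flow outside pytest, assuming an already-configured session, a bucket named 'my-bucket', and a list of Parquet paths in files (all placeholder names), on a cluster that supports sorting keys:

import time

from vastdb import util
from vastdb.table import ImportConfig

with session.transaction() as tx:
    schema = tx.bucket('my-bucket').create_schema('s1')
    table = util.create_table_from_files(schema, 't2', files, config=ImportConfig(import_concurrency=8))
    table.add_sorting_key([2, 0, 3, 4])  # column indices to sort by, as in the benchmark above
    stats = table.get_stats()

# Sorting runs in the background; re-read the stats in fresh transactions until it completes.
while not stats.sorting_done:
    time.sleep(30)
    with session.transaction() as tx:
        stats = tx.bucket('my-bucket').schema('s1').table('t2').get_stats()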
vastdb/features.py CHANGED
@@ -4,7 +4,7 @@ import logging
 
 from .errors import NotSupportedVersion
 
-log = logging.getLogger()
+log = logging.getLogger(__name__)
 
 
 class Features:
@@ -39,6 +39,10 @@ class Features:
             "Zip import requires 5.3.1+ VAST release",
             vast_version >= (5, 3, 1))
 
+        self.check_timezone = self._check(
+            "Timezone support requires 5.4+ Vast release",
+            vast_version >= (5, 4))
+
     def _check(self, msg, supported):
         log.debug("%s (current version is %s): supported=%s", msg, self.vast_version, supported)
         if not supported:
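The new check_timezone entry follows the same _check pattern as the existing checks: it logs the capability and raises NotSupportedVersion when the connected cluster is older than 5.4. A hedged sketch of how a caller can branch on it, mirroring the test_timezones change further below (bucket/schema/table names are placeholders; the check is reached through the internal tx._rpc handle, exactly as the SDK's own tests do):

from vastdb import errors

with session.transaction() as tx:
    table = tx.bucket('my-bucket').schema('s').table('t')
    try:
        table.tx._rpc.features.check_timezone()
        tz_preserved = True   # 5.4+ cluster: timestamp timezones are kept
    except errors.NotSupportedVersion:
        tz_preserved = False  # older cluster: timezone information is dropped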
vastdb/table.py CHANGED
@@ -1,9 +1,11 @@
 """VAST Database table."""
 
 import concurrent.futures
+import copy
 import logging
 import os
 import queue
+import sys
 from dataclasses import dataclass, field
 from math import ceil
 from threading import Event
@@ -333,7 +335,8 @@ class Table:
                predicate: Union[ibis.expr.types.BooleanColumn, ibis.common.deferred.Deferred] = None,
                config: Optional[QueryConfig] = None,
                *,
-               internal_row_id: bool = False) -> pa.RecordBatchReader:
+               internal_row_id: bool = False,
+               limit_rows: Optional[int] = None) -> pa.RecordBatchReader:
         """Execute a query over this table.
 
         To read a subset of the columns, specify their names via `columns` argument. Otherwise, all columns will be read.
@@ -342,8 +345,10 @@
 
         Query-execution configuration options can be specified via the optional `config` argument.
         """
-        if config is None:
-            config = QueryConfig()
+        config = copy.deepcopy(config) if config else QueryConfig()
+
+        if limit_rows:
+            config.limit_rows_per_sub_split = limit_rows
 
         stats = None
         # Retrieve snapshots only if needed
@@ -402,7 +407,7 @@
         for split in range(config.num_splits):
             splits_queue.put(split)
 
-        # this queue shouldn't be large it is marely a pipe through which the results
+        # this queue shouldn't be large it is merely a pipe through which the results
         # are sent to the main thread. Most of the pages actually held in the
         # threads that fetch the pages.
         record_batches_queue: queue.Queue[pa.RecordBatch] = queue.Queue(maxsize=2)
@@ -458,6 +463,7 @@
         if config.query_id:
             threads_prefix = threads_prefix + "-" + config.query_id
 
+        total_num_rows = limit_rows if limit_rows else sys.maxsize
         with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp:  # TODO: concurrency == enpoints is just a heuristic
             futures = [tp.submit(single_endpoint_worker, endpoint) for endpoint in endpoints]
             tasks_running = len(futures)
@@ -467,7 +473,14 @@
 
                 batch = record_batches_queue.get()
                 if batch is not None:
-                    yield batch
+                    if batch.num_rows < total_num_rows:
+                        yield batch
+                        total_num_rows -= batch.num_rows
+                    else:
+                        yield batch.slice(length=total_num_rows)
+                        log.info("reached limit rows per query: %d - stop query", limit_rows)
+                        stop_event.set()
+                        break
                 else:
                     tasks_running -= 1
                     log.debug("one worker thread finished, remaining: %d", tasks_running)
@@ -596,7 +609,7 @@
         self.name = new_name
 
     def add_sorting_key(self, sorting_key: list) -> None:
-        """Ads a sorting key to a table that doesn't have any."""
+        """Add a sorting key to a table that doesn't have any."""
        self.tx._rpc.features.check_elysium()
        self.tx._rpc.api.alter_table(
            self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, sorting_key=sorting_key)
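Taken together, the select() changes above add an optional limit_rows argument: the per-query config is deep-copied, limit_rows_per_sub_split is set from it, and the batch loop slices the final batch and signals the workers to stop once the limit is reached. A minimal usage sketch, assuming a configured session and an existing table with an integer column a (placeholder names; the corresponding test is test_select_with_limit further below):

with session.transaction() as tx:
    t = tx.bucket('my-bucket').schema('s').table('t')
    # The reader yields at most 100 rows; the last batch is sliced to fit the limit.
    reader = t.select(columns=['a'], predicate=(t['a'] < 3), limit_rows=100)
    result = reader.read_all()
    assert len(result) <= 100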
vastdb/tests/test_imports.py CHANGED
@@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from tempfile import NamedTemporaryFile
 
 import pyarrow as pa
@@ -70,59 +71,6 @@ def test_parallel_imports(session, clean_bucket_name, s3):
     assert len(object_names) == len(objects_name['ObjectName'])
 
 
-def test_zip_imports(zip_import_session, clean_bucket_name, s3):
-    num_rows = 10
-    num_files = 5
-    files = []
-    ids = [i for i in range(num_rows)]
-    symbols = [chr(c) for c in range(ord('a'), ord('a') + num_rows)]
-    for i in range(num_files):
-        ds = {'id': ids,
-              'symbol': symbols,
-              f'feature{i}': [i * 10 + k for k in range(num_rows)]}
-        table = pa.Table.from_pydict(ds)
-        with NamedTemporaryFile() as f:
-            pq.write_table(table, f.name)
-            pname = f'prq{i}'
-            s3.put_object(Bucket=clean_bucket_name, Key=pname, Body=f)
-            files.append(f'/{clean_bucket_name}/{pname}')
-
-    with zip_import_session.transaction() as tx:
-        b = tx.bucket(clean_bucket_name)
-        s = b.create_schema('s1')
-        t = s.create_table('t1', pa.schema([('vastdb_rowid', pa.int64()), ('id', pa.int64()), ('symbol', pa.string())]))
-        columns = pa.schema([
-            ('vastdb_rowid', pa.int64()),
-            ('id', pa.int64()),
-            ('symbol', pa.string()),
-        ])
-        ext_row_ids = [10 + i for i in range(num_rows)]
-        arrow_table = pa.table(schema=columns, data=[
-            ext_row_ids,
-            ids,
-            symbols,
-        ])
-        row_ids_array = t.insert(arrow_table)
-        row_ids = row_ids_array.to_pylist()
-        assert row_ids == ext_row_ids
-
-    with zip_import_session.transaction() as tx:
-        s = tx.bucket(clean_bucket_name).schema('s1')
-        t = s.table('t1')
-        log.info("Starting import of %d files", num_files)
-        config = ImportConfig()
-        config.key_names = ['id', 'symbol']
-        t.import_files(files, config=config)
-
-    with zip_import_session.transaction() as tx:
-        s = tx.bucket(clean_bucket_name).schema('s1')
-        t = s.table('t1')
-        arrow_table = t.select(columns=['feature0']).read_all()
-        assert arrow_table.num_rows == num_rows
-        log.debug(f"table schema={t.arrow_schema}")
-        assert len(t.arrow_schema) == 8
-
-
 def test_create_table_from_files(session, clean_bucket_name, s3):
     datasets = [
         {'num': [0],
@@ -202,3 +150,368 @@ def test_import_type_mismatch_error(session, clean_bucket_name, s3):
     assert exc.value.error_dict['object_name'] == prq_name
     assert exc.value.error_dict['res'] == 'TabularMismatchColumnType'
     assert 'num_type_mismatch' in exc.value.error_dict['err_msg']
+
+
+def create_parquet_file(s3, bucket_name, file_key, data):
+    """Creates a Parquet file and uploads it to S3."""
+    parquet_table = pa.Table.from_pydict(data)
+    with NamedTemporaryFile(delete=False) as f:
+        pq.write_table(parquet_table, f.name)
+        with open(f.name, 'rb') as file_data:
+            s3.put_object(Bucket=bucket_name, Key=file_key, Body=file_data)
+    return f'/{bucket_name}/{file_key}'
+
+
+def create_table_with_data(session, bucket_name, schema_name, table_name, schema, data=None):
+    """Creates a table with the specified schema and optional initial data."""
+    with session.transaction() as tx:
+        b = tx.bucket(bucket_name)
+        s = b.create_schema(schema_name)
+        t = s.create_table(table_name, schema)
+        if data:
+            arrow_table = pa.table(schema=schema, data=data)
+            t.insert(arrow_table)
+        return t
+
+
+def attempt_import(session, bucket_name, schema_name, table_name, files, key_names, expected_error=None):
+    """Attempts to import files into a table and handles expected errors."""
+    with session.transaction() as tx:
+        t = tx.bucket(bucket_name).schema(schema_name).table(table_name)
+        config = ImportConfig()
+        config.key_names = key_names
+
+        if expected_error:
+            try:
+                t.import_files(files, config=config)
+            except Exception as e:
+                log.info(f"Caught expected error: {e}")
+                assert expected_error in str(e)
+        else:
+            t.import_files(files, config=config)
+
+
+def test_zip_imports(zip_import_session, clean_bucket_name, s3):
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+    num_rows = 10
+    num_files = 5
+
+    # Step 1: Generate and upload Parquet files
+    files = []
+    for i in range(num_files):
+        data = {
+            'id': [k for k in range(num_rows)],
+            'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+            f'feature{i}': [i * 10 + k for k in range(num_rows)],
+        }
+        file_key = f'prq{i}'
+        files.append(create_parquet_file(s3, clean_bucket_name, file_key, data))
+
+    # Step 2: Create table and insert initial data
+    data = {
+        'vastdb_rowid': [10 + i for i in range(num_rows)],
+        'id': [i for i in range(num_rows)],
+        'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+    }
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, data)
+
+    # Step 3: Import files into the table
+    attempt_import(zip_import_session, clean_bucket_name, 's1', 't1', files, key_names=['id', 'symbol'])
+
+
+def test_zip_imports_scale(zip_import_session, clean_bucket_name, s3):
+    """Verify that many key names, and large amounts of data of different kind work as expected."""
+    # Step 1: Create and upload Parquet data
+    log.info("Step 1: Creating and uploading Parquet data")
+    num_rows = 1_000_000
+    data = {
+        'id': [i for i in range(num_rows)],
+        'symbol': [chr((i % 26) + ord('a')) for i in range(num_rows)],
+        'feature': [i * 10 for i in range(num_rows)],  # Extra column not in the initial table
+        'col_0': [datetime.now() for _ in range(num_rows)],
+        'col_1': [1 for _ in range(num_rows)],
+        'col_2': [2 for _ in range(num_rows)],
+        'col_3': [3 for _ in range(num_rows)],
+        'col_4': [4 for _ in range(num_rows)],
+        'col_5': [5 for _ in range(num_rows)],  # Extra column not in the initial table
+    }
+    file_key = 'large_data.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 2: Create table and insert initial data
+    log.info("Step 2: Creating table and inserting initial data")
+    table_data = {
+        'vastdb_rowid': [10 + i for i in range(num_rows)],
+        'id': data['id'],
+        'symbol': data['symbol'],
+        'col_0': data['col_0'],
+        'col_1': data['col_1'],
+        'col_2': data['col_2'],
+        'col_3': data['col_3'],
+        'col_4': data['col_4'],
+    }
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+        ('col_0', pa.timestamp('s')),
+        ('col_1', pa.int64()),
+        ('col_2', pa.int64()),
+        ('col_3', pa.int64()),
+        ('col_4', pa.int64()),
+    ])
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+
+    # Step 3: Import the Parquet file into the table
+    log.info("Step 3: Importing Parquet file into the table")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol', 'col_0', 'col_1', 'col_2', 'col_3', 'col_4']
+    )
+
+    # Step 4: Verify schema and row count
+    log.info("Step 4: Verifying schema and row count")
+    with (zip_import_session.transaction() as tx):
+        table = tx.bucket(clean_bucket_name).schema('s1').table('t1')
+        updated_schema = table.arrow_schema
+        updated_data = table.select().read_all()
+
+        # Verify schema
+        expected_schema = pa.schema([
+            ('vastdb_rowid', pa.int64()),
+            ('id', pa.int64()),
+            ('symbol', pa.string()),
+            ('col_0', pa.timestamp('s')),
+            ('col_1', pa.int64()),
+            ('col_2', pa.int64()),
+            ('col_3', pa.int64()),
+            ('col_4', pa.int64()),
+            ('feature', pa.int64()),  # Added during import
+            ('col_5', pa.int64()),  # Added during import
+        ])
+        assert updated_schema == expected_schema, \
+            "The table schema does not match the expected schema."
+
+        assert updated_data.num_rows == num_rows, \
+            f"Expected {num_rows} rows, but got {updated_data.num_rows}."
+
+        assert len(updated_schema.names) == 10, \
+            "The table should have exactly 10 columns"
+
+
+def test_zip_imports_missing_columns(zip_import_session, clean_bucket_name, s3):
+    """Verify that importing Parquet data with missing columns fails as expected."""
+    # Step 1: Create and upload Parquet data missing key columns
+    log.info("Step 1: Creating and uploading Parquet data without key columns")
+    data = {
+        'feature': [i * 10 for i in range(10)],  # Only feature column, no 'id' or 'symbol'
+    }
+    file_key = 'missing_keys.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 2: Create table with key columns
+    log.info("Step 2: Creating table with key columns")
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema)
+
+    # Step 3: Attempt to import Parquet data missing key columns
+    log.info("Step 3: Attempting to import data without key columns")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol'],
+        expected_error="Failed to verify import keys"
+    )
+
+
+def test_zip_imports_missing_key_values(zip_import_session, clean_bucket_name, s3):
+    """Verify that importing Parquet data with extra key values fails as expected
+    and that importing a subset of key values fails as expected."""
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+    num_rows = 5
+
+    # Step 1: Create Parquet data with keys 0-4
+    data = {
+        'id': [i for i in range(num_rows)],
+        'symbol': [chr((i % 26) + ord('a')) for i in range(num_rows)],
+        'feature': [i * 10 for i in range(num_rows)],
+    }
+    file_key = 'missing_key_values.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 2: Create a table with non-overlapping keys 3-7
+    table_data = {
+        'vastdb_rowid': [i + 3 for i in range(num_rows)],
+        'id': [i + 3 for i in range(num_rows)],
+        'symbol': [chr(((i + 3) % 26) + ord('k')) for i in range(num_rows)],
+    }
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+
+    # Step 3: Attempt to import Parquet data with mismatched keys
+    log.info("Step 3: Attempting to import Parquet data with keys that do not match the table")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol'],
+        expected_error="Failed to get row_ids to update on table"
+    )
+
+    # Step 4: Create and upload Parquet data with fewer rows but all key values present in the table
+    log.info("Step 4: Creating and uploading Parquet data with fewer rows, but matching all table keys")
+    smaller_data = {
+        'id': [3, 4],  # Subset of the table keys
+        'symbol': ['k', 'l'],  # Matching symbols for keys 3 and 4
+        'feature': [300, 400],  # Example new feature data
+    }
+    smaller_file_key = 'subset_matching_keys.parquet'
+    smaller_file_path = create_parquet_file(s3, clean_bucket_name, smaller_file_key, smaller_data)
+
+    # Step 5: Attempt to import the Parquet data with fewer rows but all key values present
+    log.info("Step 5: Attempting to import smaller Parquet data with all table keys")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [smaller_file_path],
+        key_names=['id', 'symbol'],
+        expected_error='Failed to get row_ids to update on table'
+    )
+
+
+def test_zip_imports_nested_keys(zip_import_session, clean_bucket_name, s3):
+    """Verify that importing Parquet data with nested key columns fails as expected."""
+    # Step 1: Creating Parquet data with nested key columns
+    log.info("Step 1: Creating Parquet data with nested key columns")
+    num_rows = 10
+    nested_keys = [{'id': i, 'symbol': chr(ord('a') + i)} for i in range(num_rows)]
+    feature_column = [i * 10 for i in range(num_rows)]
+
+    ds = {
+        'nested_key': nested_keys,
+        'feature': feature_column,
+    }
+
+    # Use create_parquet_file helper
+    file_key = 'nested_keys.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, ds)
+
+    # Step 2: Creating table with flat key columns
+    log.info("Step 2: Creating table with flat key columns")
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+
+    # Use create_table_with_data helper
+    create_table_with_data(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        schema
+    )
+
+    # Step 3: Attempt to import Parquet data with nested key columns
+    log.info("Step 3: Attempting to import data with nested key columns")
+
+    # Use attempt_import helper with expected error
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        ['id', 'symbol'],
+        expected_error="Failed to verify import keys"
+    )
+
+
+def test_zip_imports_type_mismatch(zip_import_session, clean_bucket_name, s3):
+    """Verify behavior when key column data types in the Parquet file do not match the table schema."""
+    # Step 1: Define table schema with id as string
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.string()),  # Expecting strings here
+        ('symbol', pa.string()),
+    ])
+    num_rows = 10
+
+    # Step 2: Generate and upload a single Parquet file with mismatched id type (integers)
+    log.info("Step 2: Creating a Parquet file with mismatched key column data types")
+    data = {
+        'id': [k for k in range(num_rows)],  # Integers, causing the type mismatch
+        'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+        'feature': [k * 10 for k in range(num_rows)],
+    }
+    file_key = 'mismatched_data.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 3: Create table with string id column and insert valid initial data
+    log.info("Step 3: Creating table with string key column and valid initial data")
+    table_data = {
+        'vastdb_rowid': [10 + i for i in range(num_rows)],
+        'id': [str(i) for i in range(num_rows)],  # Strings to match schema
+        'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+    }
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+
+    # Step 4: Attempt to import the file into the table
+    log.info("Step 4: Attempting to import the Parquet file with mismatched key column data types")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol'],
+        expected_error="TabularMismatchColumnType"
+    )
+
+
+def test_zip_imports_duplicate_key_values(zip_import_session, clean_bucket_name):
+    """Verify that creating a table with duplicate key values fails as expected,
+    also show that it has to be in same order."""
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+
+    # Data with duplicate keys
+    table_data = {
+        'vastdb_rowid': [1, 2, 2, 4, 5],
+        'id': [1, 2, 2, 4, 5],
+        'symbol': ['a', 'b', 'b', 'd', 'e'],
+    }
+
+    try:
+        # Attempt to create the table
+        create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+        assert False, "Expected an error due to duplicate keys, but the table was created successfully."
+    except Exception as e:
+        # Verify the exception is due to duplicate row IDs
+        assert "Found duplicate row ids or not in ascending order" in str(e), f"Unexpected error: {e}"
vastdb/tests/test_tables.py CHANGED
@@ -269,6 +269,24 @@ def test_select_with_multisplits(session, clean_bucket_name):
         assert actual == expected
 
 
+def test_select_with_limit(session, clean_bucket_name):
+    columns = pa.schema([
+        ('a', pa.int32())
+    ])
+
+    data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    data = data * 1000
+    expected = pa.table(schema=columns, data=[data])
+    limit_rows = 10
+
+    with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+        start = time.time()
+        actual = t.select(predicate=(t['a'] < 3), limit_rows=limit_rows).read_all()
+        end = time.time()
+        log.info(f"actual: {actual} elapsed time: {end - start}")
+        assert len(actual) == limit_rows
+
+
 def test_select_with_priority(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int32())
@@ -313,8 +331,13 @@ def test_timezones(session, clean_bucket_name):
 
     inserted = pa.table(schema=columns_with_tz, data=data)
     with prepare_data(session, clean_bucket_name, 's', 't', inserted) as table:
-        assert table.arrow_schema == columns_without_tz
-        assert table.select().read_all() == pa.table(schema=columns_without_tz, data=data)
+        try:
+            table.tx._rpc.features.check_timezone()
+            assert table.arrow_schema == columns_with_tz
+            assert table.select().read_all() == pa.table(schema=columns_with_tz, data=data)
+        except errors.NotSupportedVersion:
+            assert table.arrow_schema == columns_without_tz
+            assert table.select().read_all() == pa.table(schema=columns_without_tz, data=data)
 
 
 def test_types(session, clean_bucket_name):
vastdb/tests/util.py CHANGED
@@ -1,6 +1,8 @@
 import logging
 from contextlib import contextmanager
 
+import pyarrow as pa
+
 log = logging.getLogger(__name__)
 
 
@@ -15,3 +17,22 @@ def prepare_data(session, clean_bucket_name, schema_name, table_name, arrow_tabl
         yield t
         t.drop()
         s.drop()
+
+
+def compare_pyarrow_tables(t1, t2):
+
+    def sort_table(table):
+        return table.sort_by([(col, 'ascending') for col in table.schema.names])
+
+    def compare_tables(table1, table2):
+        if table1.schema != table2.schema:
+            raise RuntimeError(f"Schema mismatch. {table1.schema} vs {table2.schema}")
+
+        for t1_col, t2_col in zip(table1.columns, table2.columns):
+            if not pa.compute.equal(t1_col, t2_col).to_pandas().all():
+                raise RuntimeError(f"Data mismatch in column {t1_col} vs {t2_col}.")
+        return True
+
+    sorted_table1 = sort_table(t1)
+    sorted_table2 = sort_table(t2)
+    return compare_tables(sorted_table1, sorted_table2)
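The new compare_pyarrow_tables helper is order-insensitive: both tables are sorted by all of their columns, then compared column by column, and a RuntimeError is raised on any schema or data mismatch. A small self-contained example with toy tables (not the benchmark data):

import pyarrow as pa

from vastdb.tests.util import compare_pyarrow_tables

left = pa.table({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
right = pa.table({'a': [3, 1, 2], 'b': ['z', 'x', 'y']})  # same rows, different order

# True: both tables are sorted on all columns before the column-wise comparison.
assert compare_pyarrow_tables(left, right)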
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vastdb
-Version: 1.3.9
+Version: 1.3.10
 Summary: VAST Data SDK
 Home-page: https://github.com/vast-data/vastdb_sdk
 Author: VAST DATA
@@ -4,14 +4,14 @@ vastdb/bucket.py,sha256=aomUbrfK5Oa6FdGPVsoBXgRW39IzYnmsorF8642r990,2549
 vastdb/config.py,sha256=OehnsWrjzv0-SUouEXmkrKBugiWyhXOn4XiSLV3s9yk,2342
 vastdb/conftest.py,sha256=X2kVveySPQYZlVBXUMoo7Oea5IsvmJzjdqq3fpH2kVw,3469
 vastdb/errors.py,sha256=B_FNFONDE8apoTRL8wkMNjUJWAjXu36mO0HI4cGSBgY,4328
-vastdb/features.py,sha256=6OAyTGxpOlMYqkcX2IfuG_ihJC8qrmraKdnef_B3xuo,1727
+vastdb/features.py,sha256=ivYbvhiGA858B00vhs_CNzlVV9QDUe53yW6V3J5EoxM,1874
 vastdb/schema.py,sha256=UR1WzQvfAdnpDaNsEaGZLYGC65Blri5MYOWinCcl8Hc,6552
 vastdb/session.py,sha256=toMR0BXwTaECdWDKnIZky1F3MA1SmelRBiqCrqQ3GCM,2067
-vastdb/table.py,sha256=V7LymaLfirOiAbBj68M_29ijOeSZKD0_gYU44OGkkac,35278
+vastdb/table.py,sha256=NGImmz_KltU80B0u-CYDgEdGOMHSppf7mmVs72WD8wM,35937
 vastdb/transaction.py,sha256=NlVkEowJ_pmtffjWBBDaKExYDKPekjSZyj_fK_bZPJE,3026
 vastdb/util.py,sha256=8CUnVRsJukC3uNHNoB5D0qPf0FxS8OSdVB84nNoLJKc,6290
 vastdb/bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vastdb/bench/test_perf.py,sha256=nyK-BM1HJhPHrcNa2pLNmxqcC_CG2UsJogE92EvN-UM,1082
+vastdb/bench/test_perf.py,sha256=0kbCxK8U9vYO0zCMUYcZHzEICaaII3I0-6FeR5-CNtM,4040
 vastdb/bench/test_sample.py,sha256=LgF4syzij09sH3Noiv1EyCAJ9pvrUE5bxR4RJTVEYag,7881
 vastdb/bench/perf_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/bench/perf_bench/cli.py,sha256=NtaPEBTDI6PWgEtwI1wVbwmUeA5bwGqAj_Z_2lDJ28I,5931
@@ -41,14 +41,14 @@ vastdb/bench/perf_bench/query/query_vastdb.py,sha256=SZYem_EmsaynEftAa_VFobjSJZD
 vastdb/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/tests/metrics.py,sha256=ZCSeBYFSPMG3yI0JrAHs2CrY6wFjx_5GwRTYHVAwLKA,1026
 vastdb/tests/test_duckdb.py,sha256=STw_1PwTQR8Naz6s0p6lQTV1ZTKKhe3LPBUbhqzTCu0,1880
-vastdb/tests/test_imports.py,sha256=ICI9EWFgKf9TbphFRhFifqZoESKWIM3_zb53U7-jOSo,8058
+vastdb/tests/test_imports.py,sha256=R-ExC6IYaf4REGQw0v7iVAz7TPY9vd8S3S892vy86R0,20011
 vastdb/tests/test_nested.py,sha256=LPU6uV3Ri23dBzAEMFQqRPbqapV5LfmiHSHkhILPIY0,6332
 vastdb/tests/test_projections.py,sha256=3y1kubwVrzO-xoR0hyps7zrjOJI8niCYspaFTN16Q9w,4540
 vastdb/tests/test_sanity.py,sha256=bv1ypGDzvOgmMvGbucDYiLQu8krQLlE6NB3M__q87x8,3303
 vastdb/tests/test_schemas.py,sha256=l70YQMlx2UL1KRQhApriiG2ZM7GJF-IzWU31H3Yqn1U,3312
-vastdb/tests/test_tables.py,sha256=D6eHSDjC4SJGFA91qJO56SoVPE040rN37uOrDWRDthk,47634
+vastdb/tests/test_tables.py,sha256=wBPUewfJVEJNyDHwO49qld3lMVjVjUiAzP7ngX07fFA,48478
 vastdb/tests/test_util.py,sha256=n7gvT5Wg6b6bxgqkFXkYqvFd_W1GlUdVfmPv66XYXyA,1956
-vastdb/tests/util.py,sha256=O2bgB5403meX69vVY1gWACOtWLOoXE5yQA00ppk4WN8,596
+vastdb/tests/util.py,sha256=YsCBCcx7n1QOH-IPDpCsl6KEaUQQJRZwGPeayijHNb4,1307
 vastdb/vast_flatbuf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/vast_flatbuf/org/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/vast_flatbuf/org/apache/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -209,8 +209,8 @@ vastdb/vast_flatbuf/tabular/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
 vastdb/vast_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/vast_tests/test_ha.py,sha256=744P4G6VJ09RIkHhMQL4wlipCBJWQVMhyvUrSc4k1HQ,975
 vastdb/vast_tests/test_scale.py,sha256=5jGwOdZH6Tv5tPdZYPWoqcxOceI2jA5i2D1zNKZHER4,3958
-vastdb-1.3.9.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
-vastdb-1.3.9.dist-info/METADATA,sha256=szfHdpcb7zy4H49PgDYqVnn2J5UCAu0azRHkfXJkEpY,1340
-vastdb-1.3.9.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
-vastdb-1.3.9.dist-info/top_level.txt,sha256=nnKAaZaQa8GFbYpWAexr_B9HrhonZbUlX6hL6AC--yA,7
-vastdb-1.3.9.dist-info/RECORD,,
+vastdb-1.3.10.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
+vastdb-1.3.10.dist-info/METADATA,sha256=BFeEhZ0mgwoCyAKM_EkijrPcI5RWTME4tDtdq-fcWwc,1341
+vastdb-1.3.10.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+vastdb-1.3.10.dist-info/top_level.txt,sha256=nnKAaZaQa8GFbYpWAexr_B9HrhonZbUlX6hL6AC--yA,7
+vastdb-1.3.10.dist-info/RECORD,,