vastdb 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff shows the contents of publicly released versions of the package, as published to the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
@@ -0,0 +1,61 @@
+import logging
+
+import duckdb
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+
+from ..table import QueryConfig
+from .util import prepare_data
+
+log = logging.getLogger(__name__)
+
+
+def test_duckdb(session, clean_bucket_name):
+    columns = pa.schema([
+        ('a', pa.int32()),
+        ('b', pa.float64()),
+    ])
+    data = pa.table(schema=columns, data=[
+        [111, 222, 333],
+        [0.5, 1.5, 2.5],
+    ])
+    with prepare_data(session, clean_bucket_name, 's', 't', data) as t:
+        conn = duckdb.connect()
+        batches = t.select(columns=['a'], predicate=(t['b'] < 2))  # noqa: F841
+        actual = conn.execute('SELECT max(a) as "a_max" FROM batches').arrow()
+        expected = (data
+                    .filter(pc.field('b') < 2)
+                    .group_by([])
+                    .aggregate([('a', 'max')]))
+        assert actual == expected
+
+
+def test_closed_tx(session, clean_bucket_name):
+    columns = pa.schema([
+        ('a', pa.int64()),
+    ])
+    data = pa.table(schema=columns, data=[
+        list(range(10000)),
+    ])
+
+    with session.transaction() as tx:
+        t = tx.bucket(clean_bucket_name).create_schema("s1").create_table("t1", columns)
+        t.insert(data)
+
+        config = QueryConfig(
+            num_sub_splits=1,
+            num_splits=1,
+            num_row_groups_per_sub_split=1,
+            limit_rows_per_sub_split=100)
+        batches = t.select(config=config)  # noqa: F841
+        first = next(batches)  # make sure that HTTP response processing has started
+        assert first['a'].to_pylist() == list(range(100))
+
+        conn = duckdb.connect()
+        res = conn.execute('SELECT a FROM batches')
+        log.debug("closing tx=%s after first batch=%s", t.tx, first)
+
+    # transaction is closed, collecting the result should fail
+    with pytest.raises(duckdb.InvalidInputException, match="Detail: Python exception: MissingTransaction"):
+        res.arrow()
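
A note on the `# noqa: F841` markers above: the `batches` variable looks unused, but DuckDB's Python client resolves table names in SQL against local variables (its replacement-scan mechanism), so `FROM batches` reads the Arrow record-batch reader returned by `Table.select()` directly. A minimal standalone sketch of the same mechanism, using an in-memory reader instead of a VAST table (all names here are illustrative, not part of the SDK):

    import duckdb
    import pyarrow as pa

    # A small reader standing in for the output of Table.select().
    schema = pa.schema([('a', pa.int32())])
    batches = pa.RecordBatchReader.from_batches(
        schema, [pa.record_batch([[1, 2, 3]], schema=schema)])

    conn = duckdb.connect()
    # DuckDB resolves the identifier "batches" to the local variable above,
    # which is why the tests keep the otherwise-unused name around.
    print(conn.execute('SELECT max(a) AS a_max FROM batches').arrow())
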
@@ -6,7 +6,7 @@ import pyarrow.parquet as pq
 import pytest
 
 from vastdb import util
-from vastdb.errors import ImportFilesError, InvalidArgument
+from vastdb.errors import ImportFilesError, InternalServerError, InvalidArgument
 
 log = logging.getLogger(__name__)
 
@@ -34,12 +34,24 @@ def test_parallel_imports(session, clean_bucket_name, s3):
         b = tx.bucket(clean_bucket_name)
         s = b.create_schema('s1')
         t = s.create_table('t1', pa.schema([('num', pa.int64())]))
+        with pytest.raises(InternalServerError):
+            t.create_imports_table()
         log.info("Starting import of %d files", num_files)
         t.import_files(files)
         arrow_table = pa.Table.from_batches(t.select(columns=['num']))
         assert arrow_table.num_rows == num_rows * num_files
         arrow_table = pa.Table.from_batches(t.select(columns=['num'], predicate=t['num'] == 100))
         assert arrow_table.num_rows == num_files
+        import_table = t.imports_table()
+        # checking all imports are on the imports table:
+        objects_name = pa.Table.from_batches(import_table.select(columns=["ObjectName"]))
+        objects_name = objects_name.to_pydict()
+        object_names = set(objects_name['ObjectName'])
+        prefix = 'prq'
+        numbers = set(range(53))
+        assert all(name.startswith(prefix) for name in object_names)
+        numbers.issubset(int(name.replace(prefix, '')) for name in object_names)
+        assert len(object_names) == len(objects_name['ObjectName'])
 
 
 def test_create_table_from_files(session, clean_bucket_name, s3):
@@ -4,6 +4,7 @@ import pyarrow as pa
 
 log = logging.getLogger(__name__)
 
+
 def test_basic_projections(session, clean_bucket_name):
     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).create_schema('s1')
@@ -57,10 +57,10 @@ def test_version_extraction():
             return f"vast {version}" if version else "vast"
 
         def log_message(self, format, *args):
-            log.debug(format,*args)
+            log.debug(format, *args)
 
     # start the server on localhost on some available port port
-    server_address =('localhost', 0)
+    server_address = ('localhost', 0)
     httpd = HTTPServer(server_address, MockOptionsHandler)
 
     def start_http_server_in_thread():
@@ -50,14 +50,14 @@ def test_commits_and_rollbacks(session, clean_bucket_name):
             b = tx.bucket(clean_bucket_name)
             b.schema("s3").drop()
             assert b.schemas() == []
-            1/0 # rollback schema dropping
+            1 / 0  # rollback schema dropping
 
     with session.transaction() as tx:
         b = tx.bucket(clean_bucket_name)
         assert b.schemas() != []
 
+
 def test_list_snapshots(session, clean_bucket_name):
     with session.transaction() as tx:
         b = tx.bucket(clean_bucket_name)
-        s = b.snapshots()
-        assert s == []
+        b.snapshots()  # VAST Catalog may create some snapshots
@@ -3,10 +3,10 @@ import decimal
 import logging
 import random
 import threading
+import time
 from contextlib import closing
 from tempfile import NamedTemporaryFile
 
-import duckdb
 import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.parquet as pq
@@ -91,7 +91,6 @@ def test_exists(session, clean_bucket_name):
         assert s.tables() == [t]
 
 
-
 def test_update_table(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int64()),
@@ -147,12 +146,13 @@ def test_update_table(session, clean_bucket_name):
             'b': [0.5, 1.5, 2.5]
         }
 
+
 def test_select_with_multisplits(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int32())
     ])
 
-    data = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     data = data * 1000
     expected = pa.table(schema=columns, data=[data])
 
@@ -179,15 +179,15 @@ def test_types(session, clean_bucket_name):
         ('t3', pa.time32('ms')),
         ('t6', pa.time64('us')),
         ('t9', pa.time64('ns')),
-        ('ts0' ,pa.timestamp('s')),
-        ('ts3' ,pa.timestamp('ms')),
-        ('ts6' ,pa.timestamp('us')),
-        ('ts9' ,pa.timestamp('ns')),
+        ('ts0', pa.timestamp('s')),
+        ('ts3', pa.timestamp('ms')),
+        ('ts6', pa.timestamp('us')),
+        ('ts9', pa.timestamp('ns')),
     ])
 
     expected = pa.table(schema=columns, data=[
         [True, True, False],
-        [1 , 2, 4],
+        [1, 2, 4],
         [1999, 2000, 2001],
         [11122221, 222111122, 333333],
         [0.5, 1.5, 2.5],
@@ -262,7 +262,7 @@ def test_filters(session, clean_bucket_name):
 
     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
         def select(predicate):
-            return pa.Table.from_batches(t.select(predicate=predicate))
+            return pa.Table.from_batches(t.select(predicate=predicate), t.arrow_schema)
 
         assert select(None) == expected
 
@@ -295,7 +295,7 @@ def test_filters(session, clean_bucket_name):
         assert select(((t['a'] > 111) | (t['a'] < 333)) & (t['b'] < 2.5)) == expected.filter(((pc.field('a') > 111) | (pc.field('a') < 333)) & (pc.field('b') < 2.5))
         with pytest.raises(NotImplementedError):
             assert select((t['a'] > 111) | (t['b'] > 0) | (t['s'] < 'ccc')) == expected.filter((pc.field('a') > 111) | (pc.field('b') > 0) | (pc.field('s') < 'ccc'))
-        assert select((t['a'] > 111) | (t['a'] < 333) | (t['a'] == 777) ) == expected.filter((pc.field('a') > 111) | (pc.field('a') < 333) | (pc.field('a') == 777))
+        assert select((t['a'] > 111) | (t['a'] < 333) | (t['a'] == 777)) == expected.filter((pc.field('a') > 111) | (pc.field('a') < 333) | (pc.field('a') == 777))
 
         assert select(t['s'].isnull()) == expected.filter(pc.field('s').is_null())
         assert select((t['s'].isnull()) | (t['s'] == 'bb')) == expected.filter((pc.field('s').is_null()) | (pc.field('s') == 'bb'))
@@ -305,25 +305,12 @@ def test_filters(session, clean_bucket_name):
         assert select(t['s'].contains('b')) == expected.filter(pc.field('s') == 'bb')
         assert select(t['s'].contains('y')) == expected.filter(pc.field('s') == 'xyz')
 
+        assert select(t['a'].isin([555])) == expected.filter(pc.field('a').isin([555]))
+        assert select(t['a'].isin([111, 222, 999])) == expected.filter(pc.field('a').isin([111, 222, 999]))
+        assert select((t['a'] == 111) | t['a'].isin([333, 444]) | (t['a'] > 600)) == expected.filter((pc.field('a') == 111) | pc.field('a').isin([333, 444]) | (pc.field('a') > 600))
 
-def test_duckdb(session, clean_bucket_name):
-    columns = pa.schema([
-        ('a', pa.int32()),
-        ('b', pa.float64()),
-    ])
-    data = pa.table(schema=columns, data=[
-        [111, 222, 333],
-        [0.5, 1.5, 2.5],
-    ])
-    with prepare_data(session, clean_bucket_name, 's', 't', data) as t:
-        conn = duckdb.connect()
-        batches = t.select(columns=['a'], predicate=(t['b'] < 2))  # noqa: F841
-        actual = conn.execute('SELECT max(a) as "a_max" FROM batches').arrow()
-        expected = (data
-                    .filter(pc.field('b') < 2)
-                    .group_by([])
-                    .aggregate([('a', 'max')]))
-        assert actual == expected
+        with pytest.raises(NotImplementedError):
+            select(t['a'].isin([]))
 
 
 def test_parquet_export(session, clean_bucket_name):
@@ -344,8 +331,7 @@ def test_parquet_export(session, clean_bucket_name):
             ['a', 'b'],
         ])
         expected = pa.Table.from_batches([rb])
-        rb = t.insert(rb)
-        assert rb.to_pylist() == [0, 1]
+        t.insert(rb)
         actual = pa.Table.from_batches(t.select())
         assert actual == expected
 
@@ -359,6 +345,7 @@ def test_parquet_export(session, clean_bucket_name):
 
         assert expected == pq.read_table(parquet_file.name)
 
+
 def test_errors(session, clean_bucket_name):
     with pytest.raises(errors.MissingSchema):
         with session.transaction() as tx:
@@ -378,7 +365,8 @@ def test_errors(session, clean_bucket_name):
                 ('s', pa.utf8()),
             ])
             s.create_table('t1', columns)
-            s.drop() # cannot drop schema without dropping its tables first
+            s.drop()  # cannot drop schema without dropping its tables first
+
 
 def test_rename_schema(session, clean_bucket_name):
 
@@ -436,20 +424,21 @@ def test_rename_table(session, clean_bucket_name):
             s.table('t')
         t = s.table('t2')
 
-        #assert that other transactions are isolated
+        # assert that other transactions are isolated
         with pytest.raises(errors.MissingTable):
             tx2.bucket(clean_bucket_name).schema('s').table('t2')
         tx2.bucket(clean_bucket_name).schema('s').table('t')
 
     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).schema('s')
-        #assert that new transactions see the change
+        # assert that new transactions see the change
         with pytest.raises(errors.MissingTable):
             s.table('t')
         t = s.table('t2')
         t.drop()
         s.drop()
 
+
 def test_add_column(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int16()),
@@ -472,18 +461,18 @@ def test_add_column(session, clean_bucket_name):
         # in which it was added
         assert t.arrow_schema == new_schema
 
-        #assert that other transactions are isolated
+        # assert that other transactions are isolated
         assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns
 
-
     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).schema('s')
         t = s.table('t')
-        #assert that new transactions see the change
+        # assert that new transactions see the change
         assert t.arrow_schema == new_schema
         t.drop()
         s.drop()
 
+
 def test_drop_column(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int16()),
@@ -507,31 +496,32 @@ def test_drop_column(session, clean_bucket_name):
         # in which it was added
         assert t.arrow_schema == new_schema
 
-        #assert that other transactions are isolated
+        # assert that other transactions are isolated
         assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns
 
-
     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).schema('s')
         t = s.table('t')
-        #assert that new transactions see the change
+        # assert that new transactions see the change
        assert t.arrow_schema == new_schema
         t.drop()
         s.drop()
 
+
 def test_rename_column(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int16()),
         ('b', pa.float32()),
         ('s', pa.utf8()),
     ])
-    def prepare_rename_column(schema : pa.Schema, old_name : str, new_name : str) -> pa.Schema:
+
+    def prepare_rename_column(schema: pa.Schema, old_name: str, new_name: str) -> pa.Schema:
         field_idx = schema.get_field_index(old_name)
         column_to_rename = schema.field(field_idx)
         renamed_column = column_to_rename.with_name(new_name)
         return schema.set(field_idx, renamed_column)
 
-    new_schema = prepare_rename_column(columns,'a','aaa')
+    new_schema = prepare_rename_column(columns, 'a', 'aaa')
 
     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).create_schema('s')
@@ -546,10 +536,10 @@ def test_rename_column(session, clean_bucket_name):
         # in which it was added
         assert t.arrow_schema == new_schema
 
-        #assert that other transactions are isolated
+        # assert that other transactions are isolated
         assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns
 
-    #assert that new transactions see the change
+    # assert that new transactions see the change
     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).schema('s')
         t = s.table('t')
@@ -564,7 +554,7 @@ def test_rename_column(session, clean_bucket_name):
         t1 = tx1.bucket(clean_bucket_name).schema('s').table('t')
         t2 = tx2.bucket(clean_bucket_name).schema('s').table('t')
         t1.rename_column('b', 'bb')
-        with pytest.raises(HTTPError, match = '409 Client Error: Conflict'):
+        with pytest.raises(HTTPError, match='409 Client Error: Conflict'):
             t2.rename_column('b', 'bbb')
 
     with session.transaction() as tx:
@@ -580,6 +570,7 @@ def test_rename_column(session, clean_bucket_name):
         t.drop()
         s.drop()
 
+
 def test_select_stop(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.uint8()),
@@ -602,15 +593,16 @@ def test_select_stop(session, clean_bucket_name):
     qc = QueryConfig(num_sub_splits=2, num_splits=4, num_row_groups_per_sub_split=1)
     with session.transaction() as tx:
         t = tx.bucket(clean_bucket_name).schema('s').table('t')
-        t.refresh_stats()
-        qc.data_endpoints = list(t.stats.endpoints) * 2
+        qc.data_endpoints = list(t.get_stats().endpoints) * 2
 
     # Duplicate the table until it is large enough to generate enough batches
     while num_rows < (qc.num_sub_splits * qc.num_splits) * ROWS_PER_GROUP:
+        # We need two separate transactions to prevent an infinite loop that may happen
+        # while appending and reading the same table using a single transaction.
         with session.transaction() as tx_read, session.transaction() as tx_write:
             t_read = tx_read.bucket(clean_bucket_name).schema('s').table('t')
             t_write = tx_write.bucket(clean_bucket_name).schema('s').table('t')
-            for batch in t_read.select(['a'],config=qc):
+            for batch in t_read.select(['a'], config=qc):
                 t_write.insert(batch)
         num_rows = num_rows * 2
         log.info("Num rows: %d", num_rows)
@@ -627,11 +619,12 @@ def test_select_stop(session, clean_bucket_name):
     # If this assert triggers it just means that the test assumptions about how
     # the tabular server splits the batches is not true anymore and we need to
     # rewrite the test.
-    assert read_batches == qc.num_splits*qc.num_sub_splits
-    qc.query_id = str(random.randint(0,2**32))
+    assert read_batches == qc.num_splits * qc.num_sub_splits
+    qc.query_id = str(random.randint(0, 2**32))
     log.info("query id is: %s", qc.query_id)
+
     def active_threads():
-        log.debug("%s",[t.getName() for t in threading.enumerate() if t.is_alive()])
+        log.debug("%s", [t.getName() for t in threading.enumerate() if t.is_alive()])
         return sum([1 if t.is_alive() and qc.query_id in t.getName() else 0 for t in threading.enumerate()])
 
     assert active_threads() == 0
@@ -653,3 +646,20 @@ def test_select_stop(session, clean_bucket_name):
 
     # validate that all query threads were killed.
     assert active_threads() == 0
+
+
+def test_big_catalog_select(session, clean_bucket_name):
+    with session.transaction() as tx:
+        bc = tx.catalog()
+        actual = pa.Table.from_batches(bc.select(['name']))
+        assert actual
+        log.info("actual=%s", actual)
+
+
+def test_audit_log_select(session, clean_bucket_name):
+    with session.transaction() as tx:
+        a = tx.audit_log()
+        a.columns()
+        time.sleep(1)
+        actual = pa.Table.from_batches(a.select(), a.arrow_schema)
+        log.info("actual=%s", actual)
@@ -0,0 +1,39 @@
+import pyarrow as pa
+import pytest
+
+from .. import errors, util
+
+
+def test_slices():
+    ROWS = 1 << 20
+    t = pa.table({"x": range(ROWS), "y": [i / 1000 for i in range(ROWS)]})
+
+    chunks = list(util.iter_serialized_slices(t))
+    assert len(chunks) > 1
+    sizes = [len(c) for c in chunks]
+
+    assert max(sizes) < util.MAX_RECORD_BATCH_SLICE_SIZE
+    assert t == pa.Table.from_batches(_parse(chunks))
+
+    chunks = list(util.iter_serialized_slices(t, 1000))
+    assert len(chunks) > 1
+    sizes = [len(c) for c in chunks]
+
+    assert max(sizes) < util.MAX_RECORD_BATCH_SLICE_SIZE
+    assert t == pa.Table.from_batches(_parse(chunks))
+
+
+def test_wide_row():
+    cols = [pa.field(f"x{i}", pa.utf8()) for i in range(1000)]
+    values = [['a' * 10000]] * len(cols)
+    t = pa.table(values, schema=pa.schema(cols))
+    assert len(t) == 1
+
+    with pytest.raises(errors.TooWideRow):
+        list(util.iter_serialized_slices(t))
+
+
+def _parse(bufs):
+    for buf in bufs:
+        with pa.ipc.open_stream(buf) as reader:
+            yield from reader
vastdb/tests/util.py CHANGED
@@ -9,10 +9,7 @@ def prepare_data(session, clean_bucket_name, schema_name, table_name, arrow_tabl
     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).create_schema(schema_name)
         t = s.create_table(table_name, arrow_table.schema)
-        row_ids_array = t.insert(arrow_table)
-        row_ids = row_ids_array.to_pylist()
-        log.debug("row_ids=%s" % row_ids)
-        assert row_ids == list(range(arrow_table.num_rows))
+        t.insert(arrow_table)
         yield t
         t.drop()
         s.drop()
vastdb/transaction.py CHANGED
@@ -8,19 +8,29 @@ A transcation is used as a context manager, since every Database-related operati
 
 import logging
 from dataclasses import dataclass
+from typing import Optional
 
 import botocore
 
-from . import bucket, errors, session
+from . import bucket, errors, schema, session, table
 
 log = logging.getLogger(__name__)
 
+TABULAR_BC_BUCKET = "vast-big-catalog-bucket"
+VAST_CATALOG_SCHEMA_NAME = 'vast_big_catalog_schema'
+VAST_CATALOG_TABLE_NAME = 'vast_big_catalog_table'
+
+TABULAR_AUDERY_BUCKET = "vast-audit-log-bucket"
+AUDERY_SCHEMA_NAME = 'vast_audit_log_schema'
+AUDERY_TABLE_NAME = 'vast_audit_log_table'
+
+
 @dataclass
 class Transaction:
     """A holder of a single VAST transaction."""
 
     _rpc: "session.Session"
-    txid: int = None
+    txid: Optional[int] = None
 
     def __enter__(self):
         """Create a transaction and store its ID."""
@@ -31,15 +41,19 @@ class Transaction:
 
     def __exit__(self, exc_type, exc_value, exc_traceback):
         """On success, the transaction is committed. Otherwise, it is rolled back."""
+        txid = self.txid
+        self.txid = None
         if (exc_type, exc_value, exc_traceback) == (None, None, None):
-            log.debug("committing txid=%016x", self.txid)
-            self._rpc.api.commit_transaction(self.txid)
+            log.debug("committing txid=%016x", txid)
+            self._rpc.api.commit_transaction(txid)
         else:
-            log.exception("rolling back txid=%016x due to:", self.txid)
-            self._rpc.api.rollback_transaction(self.txid)
+            log.exception("rolling back txid=%016x due to:", txid)
+            self._rpc.api.rollback_transaction(txid)
 
     def __repr__(self):
         """Don't show the session details."""
+        if self.txid is None:
+            return 'InvalidTransaction'
         return f'Transaction(id=0x{self.txid:016x})'
 
     def bucket(self, name: str) -> "bucket.Bucket":
@@ -52,3 +66,15 @@ class Transaction:
                 raise errors.MissingBucket(name)
             raise
         return bucket.Bucket(name, self)
+
+    def catalog(self, fail_if_missing=True) -> Optional["table.Table"]:
+        """Return VAST Catalog table."""
+        b = bucket.Bucket(TABULAR_BC_BUCKET, self)
+        s = schema.Schema(VAST_CATALOG_SCHEMA_NAME, b)
+        return s.table(name=VAST_CATALOG_TABLE_NAME, fail_if_missing=fail_if_missing)
+
+    def audit_log(self, fail_if_missing=True) -> Optional["table.Table"]:
+        """Return VAST AuditLog table."""
+        b = bucket.Bucket(TABULAR_AUDERY_BUCKET, self)
+        s = schema.Schema(AUDERY_SCHEMA_NAME, b)
+        return s.table(name=AUDERY_TABLE_NAME, fail_if_missing=fail_if_missing)
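
The new `catalog()` and `audit_log()` helpers simply resolve the well-known bucket/schema/table names defined above and return a regular `Table` handle, so they can be queried like any other table. A minimal sketch (assuming `session` is an already-configured vastdb Session, as in the test fixtures; the `None`-on-missing behaviour of `fail_if_missing=False` is inferred from the `Optional` return type, not confirmed here):

    import pyarrow as pa

    with session.transaction() as tx:
        # Resolves vast-big-catalog-bucket/vast_big_catalog_schema/vast_big_catalog_table.
        catalog = tx.catalog()
        names = pa.Table.from_batches(catalog.select(['name']))
        print(names.num_rows)

        # Presumably returns None when the audit-log table has not been enabled.
        audit = tx.audit_log(fail_if_missing=False)
        if audit is not None:
            rows = pa.Table.from_batches(audit.select(), audit.arrow_schema)
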
vastdb/util.py CHANGED
@@ -1,19 +1,22 @@
 import logging
-from typing import Callable
+from typing import TYPE_CHECKING, Callable, List, Optional, Union
 
 import pyarrow as pa
 import pyarrow.parquet as pq
 
-from .errors import InvalidArgument
-from .schema import Schema
-from .table import ImportConfig, Table
+from .errors import InvalidArgument, TooWideRow
 
 log = logging.getLogger(__name__)
 
+if TYPE_CHECKING:
+    from .schema import Schema
+    from .table import ImportConfig, Table
+
 
 def create_table_from_files(
-        schema: Schema, table_name: str, parquet_files: [str], schema_merge_func: Callable = None,
-        config: ImportConfig = None) -> Table:
+        schema: "Schema", table_name: str, parquet_files: List[str],
+        schema_merge_func: Optional[Callable] = None,
+        config: Optional["ImportConfig"] = None) -> "Table":
     if not schema_merge_func:
         schema_merge_func = default_schema_merge
     else:
@@ -76,3 +79,36 @@ def union_schema_merge(current_schema: pa.Schema, new_schema: pa.Schema) -> pa.S
     This function returns a unified schema from potentially two different schemas.
     """
     return pa.unify_schemas([current_schema, new_schema])
+
+
+MAX_TABULAR_REQUEST_SIZE = 5 << 20  # in bytes
+MAX_RECORD_BATCH_SLICE_SIZE = int(0.9 * MAX_TABULAR_REQUEST_SIZE)
+
+
+def iter_serialized_slices(batch: Union[pa.RecordBatch, pa.Table], max_rows_per_slice=None):
+    """Iterate over a list of record batch slices."""
+
+    rows_per_slice = int(0.9 * len(batch) * MAX_RECORD_BATCH_SLICE_SIZE / batch.nbytes)
+    if max_rows_per_slice is not None:
+        rows_per_slice = min(rows_per_slice, max_rows_per_slice)
+
+    offset = 0
+    while offset < len(batch):
+        if rows_per_slice < 1:
+            raise TooWideRow(batch)
+
+        batch_slice = batch.slice(offset, rows_per_slice)
+        serialized_slice_batch = serialize_record_batch(batch_slice)
+        if len(serialized_slice_batch) <= MAX_RECORD_BATCH_SLICE_SIZE:
+            yield serialized_slice_batch
+            offset += rows_per_slice
+        else:
+            rows_per_slice = rows_per_slice // 2
+
+
+def serialize_record_batch(batch: Union[pa.RecordBatch, pa.Table]):
+    """Serialize a RecordBatch using Arrow IPC format."""
+    sink = pa.BufferOutputStream()
+    with pa.ipc.new_stream(sink, batch.schema) as writer:
+        writer.write(batch)
+    return sink.getvalue()
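
These slicing helpers keep serialized insert payloads below `MAX_TABULAR_REQUEST_SIZE` (5 MiB): each yielded buffer is a self-contained Arrow IPC stream of at most `MAX_RECORD_BATCH_SLICE_SIZE` bytes, and data too wide to fit even one row per slice raises `TooWideRow`. A minimal round-trip sketch mirroring the new test file above:

    import pyarrow as pa

    from vastdb import util

    # ~8 MB of int64 values, comfortably above the ~4.5 MiB slice limit.
    t = pa.table({"x": list(range(1 << 20))})

    chunks = list(util.iter_serialized_slices(t))
    assert len(chunks) > 1
    assert all(len(c) <= util.MAX_RECORD_BATCH_SLICE_SIZE for c in chunks)

    # Every chunk is a complete IPC stream; parse them back and reassemble the table.
    batches = []
    for chunk in chunks:
        with pa.ipc.open_stream(chunk) as reader:
            batches.extend(reader)
    assert pa.Table.from_batches(batches) == t
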
@@ -1,12 +1,11 @@
 Metadata-Version: 2.1
 Name: vastdb
-Version: 0.1.1
+Version: 0.1.3
 Summary: VAST Data SDK
 Home-page: https://github.com/vast-data/vastdb_sdk
 Author: VAST DATA
 Author-email: hello@vastdata.com
 License: Copyright (C) VAST Data Ltd.
-Platform: UNKNOWN
 Classifier: Development Status :: 4 - Beta
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Programming Language :: Python :: 3
@@ -22,7 +21,7 @@ License-File: LICENSE
 Requires-Dist: aws-requests-auth
 Requires-Dist: boto3
 Requires-Dist: flatbuffers
-Requires-Dist: ibis-framework
+Requires-Dist: ibis-framework ==8.0.0
 Requires-Dist: pyarrow
 Requires-Dist: requests
 Requires-Dist: xmltodict
@@ -34,5 +33,3 @@ and [VAST Catalog](https://vastdata.com/blog/vast-catalog-treat-your-file-system
 enabling schema and table management, efficient ingest, query and modification of columnar data.
 
 For more details, see [our whitepaper](https://vastdata.com/whitepaper/#TheVASTDataBase).
-
-