vastdb 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- vastdb/__init__.py +6 -2
- vastdb/bench/test_perf.py +3 -3
- vastdb/bucket.py +29 -15
- vastdb/errors.py +40 -7
- vastdb/internal_commands.py +194 -233
- vastdb/schema.py +11 -6
- vastdb/session.py +16 -1
- vastdb/table.py +181 -77
- vastdb/tests/test_duckdb.py +61 -0
- vastdb/tests/test_imports.py +13 -1
- vastdb/tests/test_projections.py +1 -0
- vastdb/tests/test_sanity.py +2 -2
- vastdb/tests/test_schemas.py +3 -3
- vastdb/tests/test_tables.py +60 -50
- vastdb/tests/test_util.py +39 -0
- vastdb/tests/util.py +1 -4
- vastdb/transaction.py +32 -6
- vastdb/util.py +42 -6
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/METADATA +2 -5
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/RECORD +23 -21
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/WHEEL +1 -1
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/LICENSE +0 -0
- {vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/top_level.txt +0 -0
vastdb/tests/test_duckdb.py
ADDED

@@ -0,0 +1,61 @@
+import logging
+
+import duckdb
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+
+from ..table import QueryConfig
+from .util import prepare_data
+
+log = logging.getLogger(__name__)
+
+
+def test_duckdb(session, clean_bucket_name):
+    columns = pa.schema([
+        ('a', pa.int32()),
+        ('b', pa.float64()),
+    ])
+    data = pa.table(schema=columns, data=[
+        [111, 222, 333],
+        [0.5, 1.5, 2.5],
+    ])
+    with prepare_data(session, clean_bucket_name, 's', 't', data) as t:
+        conn = duckdb.connect()
+        batches = t.select(columns=['a'], predicate=(t['b'] < 2))  # noqa: F841
+        actual = conn.execute('SELECT max(a) as "a_max" FROM batches').arrow()
+        expected = (data
+                    .filter(pc.field('b') < 2)
+                    .group_by([])
+                    .aggregate([('a', 'max')]))
+        assert actual == expected
+
+
+def test_closed_tx(session, clean_bucket_name):
+    columns = pa.schema([
+        ('a', pa.int64()),
+    ])
+    data = pa.table(schema=columns, data=[
+        list(range(10000)),
+    ])
+
+    with session.transaction() as tx:
+        t = tx.bucket(clean_bucket_name).create_schema("s1").create_table("t1", columns)
+        t.insert(data)
+
+        config = QueryConfig(
+            num_sub_splits=1,
+            num_splits=1,
+            num_row_groups_per_sub_split=1,
+            limit_rows_per_sub_split=100)
+        batches = t.select(config=config)  # noqa: F841
+        first = next(batches)  # make sure that HTTP response processing has started
+        assert first['a'].to_pylist() == list(range(100))
+
+    conn = duckdb.connect()
+    res = conn.execute('SELECT a FROM batches')
+    log.debug("closing tx=%s after first batch=%s", t.tx, first)
+
+    # transaction is closed, collecting the result should fail
+    with pytest.raises(duckdb.InvalidInputException, match="Detail: Python exception: MissingTransaction"):
+        res.arrow()
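The new test relies on DuckDB's replacement scan: a SQL query can reference a local Python variable that holds a PyArrow reader, so the batches streamed by select() are aggregated without materializing the whole table first. A minimal sketch of that pattern, assuming an existing session (as provided by the test fixtures); the bucket/schema/table names are placeholders:

    import duckdb

    with session.transaction() as tx:
        t = tx.bucket('my-bucket').schema('s').table('t')
        # Record batches are streamed lazily from the server.
        batches = t.select(columns=['a'], predicate=(t['b'] < 2))
        # DuckDB resolves the Python variable `batches` via its replacement scan.
        result = duckdb.connect().execute('SELECT max(a) AS a_max FROM batches').arrow()
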
vastdb/tests/test_imports.py
CHANGED

@@ -6,7 +6,7 @@ import pyarrow.parquet as pq
 import pytest

 from vastdb import util
-from vastdb.errors import ImportFilesError, InvalidArgument
+from vastdb.errors import ImportFilesError, InternalServerError, InvalidArgument

 log = logging.getLogger(__name__)

@@ -34,12 +34,24 @@ def test_parallel_imports(session, clean_bucket_name, s3):
         b = tx.bucket(clean_bucket_name)
         s = b.create_schema('s1')
         t = s.create_table('t1', pa.schema([('num', pa.int64())]))
+        with pytest.raises(InternalServerError):
+            t.create_imports_table()
         log.info("Starting import of %d files", num_files)
         t.import_files(files)
         arrow_table = pa.Table.from_batches(t.select(columns=['num']))
         assert arrow_table.num_rows == num_rows * num_files
         arrow_table = pa.Table.from_batches(t.select(columns=['num'], predicate=t['num'] == 100))
         assert arrow_table.num_rows == num_files
+        import_table = t.imports_table()
+        # checking all imports are on the imports table:
+        objects_name = pa.Table.from_batches(import_table.select(columns=["ObjectName"]))
+        objects_name = objects_name.to_pydict()
+        object_names = set(objects_name['ObjectName'])
+        prefix = 'prq'
+        numbers = set(range(53))
+        assert all(name.startswith(prefix) for name in object_names)
+        numbers.issubset(int(name.replace(prefix, '')) for name in object_names)
+        assert len(object_names) == len(objects_name['ObjectName'])


 def test_create_table_from_files(session, clean_bucket_name, s3):
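A rough sketch of the import flow exercised above, assuming an existing session and Parquet objects already uploaded to the bucket (the bucket/schema/table names and object paths are placeholders):

    import pyarrow as pa

    with session.transaction() as tx:
        t = tx.bucket('my-bucket').schema('s1').table('t1')
        t.import_files(['/my-bucket/prq0', '/my-bucket/prq1'])  # server-side Parquet import
        imports = t.imports_table()  # rows describing the imported objects (ObjectName column used above)
        names = pa.Table.from_batches(imports.select(columns=['ObjectName']))
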
vastdb/tests/test_projections.py
CHANGED
vastdb/tests/test_sanity.py
CHANGED

@@ -57,10 +57,10 @@ def test_version_extraction():
         return f"vast {version}" if version else "vast"

     def log_message(self, format, *args):
-        log.debug(format
+        log.debug(format, *args)

     # start the server on localhost on some available port port
-    server_address =('localhost', 0)
+    server_address = ('localhost', 0)
     httpd = HTTPServer(server_address, MockOptionsHandler)

 def start_http_server_in_thread():
vastdb/tests/test_schemas.py
CHANGED

@@ -50,14 +50,14 @@ def test_commits_and_rollbacks(session, clean_bucket_name):
         b = tx.bucket(clean_bucket_name)
         b.schema("s3").drop()
         assert b.schemas() == []
-        1/0  # rollback schema dropping
+        1 / 0  # rollback schema dropping

     with session.transaction() as tx:
         b = tx.bucket(clean_bucket_name)
         assert b.schemas() != []

+
 def test_list_snapshots(session, clean_bucket_name):
     with session.transaction() as tx:
         b = tx.bucket(clean_bucket_name)
-
-        assert s == []
+        b.snapshots()  # VAST Catalog may create some snapshots
vastdb/tests/test_tables.py
CHANGED

@@ -3,10 +3,10 @@ import decimal
 import logging
 import random
 import threading
+import time
 from contextlib import closing
 from tempfile import NamedTemporaryFile

-import duckdb
 import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.parquet as pq
@@ -91,7 +91,6 @@ def test_exists(session, clean_bucket_name):
         assert s.tables() == [t]


-
 def test_update_table(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int64()),
@@ -147,12 +146,13 @@ def test_update_table(session, clean_bucket_name):
         'b': [0.5, 1.5, 2.5]
     }

+
 def test_select_with_multisplits(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int32())
     ])

-    data = [
+    data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     data = data * 1000
     expected = pa.table(schema=columns, data=[data])

@@ -179,15 +179,15 @@ def test_types(session, clean_bucket_name):
         ('t3', pa.time32('ms')),
         ('t6', pa.time64('us')),
         ('t9', pa.time64('ns')),
-        ('ts0'
-        ('ts3'
-        ('ts6'
-        ('ts9'
+        ('ts0', pa.timestamp('s')),
+        ('ts3', pa.timestamp('ms')),
+        ('ts6', pa.timestamp('us')),
+        ('ts9', pa.timestamp('ns')),
     ])

     expected = pa.table(schema=columns, data=[
         [True, True, False],
-        [1
+        [1, 2, 4],
         [1999, 2000, 2001],
         [11122221, 222111122, 333333],
         [0.5, 1.5, 2.5],
@@ -262,7 +262,7 @@ def test_filters(session, clean_bucket_name):

     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
         def select(predicate):
-            return pa.Table.from_batches(t.select(predicate=predicate))
+            return pa.Table.from_batches(t.select(predicate=predicate), t.arrow_schema)

         assert select(None) == expected

@@ -295,7 +295,7 @@ def test_filters(session, clean_bucket_name):
         assert select(((t['a'] > 111) | (t['a'] < 333)) & (t['b'] < 2.5)) == expected.filter(((pc.field('a') > 111) | (pc.field('a') < 333)) & (pc.field('b') < 2.5))
         with pytest.raises(NotImplementedError):
             assert select((t['a'] > 111) | (t['b'] > 0) | (t['s'] < 'ccc')) == expected.filter((pc.field('a') > 111) | (pc.field('b') > 0) | (pc.field('s') < 'ccc'))
-        assert select((t['a'] > 111) | (t['a'] < 333) | (t['a'] == 777)
+        assert select((t['a'] > 111) | (t['a'] < 333) | (t['a'] == 777)) == expected.filter((pc.field('a') > 111) | (pc.field('a') < 333) | (pc.field('a') == 777))

         assert select(t['s'].isnull()) == expected.filter(pc.field('s').is_null())
         assert select((t['s'].isnull()) | (t['s'] == 'bb')) == expected.filter((pc.field('s').is_null()) | (pc.field('s') == 'bb'))
@@ -305,25 +305,12 @@ def test_filters(session, clean_bucket_name):
         assert select(t['s'].contains('b')) == expected.filter(pc.field('s') == 'bb')
         assert select(t['s'].contains('y')) == expected.filter(pc.field('s') == 'xyz')

+        assert select(t['a'].isin([555])) == expected.filter(pc.field('a').isin([555]))
+        assert select(t['a'].isin([111, 222, 999])) == expected.filter(pc.field('a').isin([111, 222, 999]))
+        assert select((t['a'] == 111) | t['a'].isin([333, 444]) | (t['a'] > 600)) == expected.filter((pc.field('a') == 111) | pc.field('a').isin([333, 444]) | (pc.field('a') > 600))

-
-
-        ('a', pa.int32()),
-        ('b', pa.float64()),
-    ])
-    data = pa.table(schema=columns, data=[
-        [111, 222, 333],
-        [0.5, 1.5, 2.5],
-    ])
-    with prepare_data(session, clean_bucket_name, 's', 't', data) as t:
-        conn = duckdb.connect()
-        batches = t.select(columns=['a'], predicate=(t['b'] < 2))  # noqa: F841
-        actual = conn.execute('SELECT max(a) as "a_max" FROM batches').arrow()
-        expected = (data
-                    .filter(pc.field('b') < 2)
-                    .group_by([])
-                    .aggregate([('a', 'max')]))
-        assert actual == expected
+        with pytest.raises(NotImplementedError):
+            select(t['a'].isin([]))


 def test_parquet_export(session, clean_bucket_name):
@@ -344,8 +331,7 @@ def test_parquet_export(session, clean_bucket_name):
         ['a', 'b'],
     ])
     expected = pa.Table.from_batches([rb])
-
-    assert rb.to_pylist() == [0, 1]
+    t.insert(rb)
     actual = pa.Table.from_batches(t.select())
     assert actual == expected

@@ -359,6 +345,7 @@ def test_parquet_export(session, clean_bucket_name):

     assert expected == pq.read_table(parquet_file.name)

+
 def test_errors(session, clean_bucket_name):
     with pytest.raises(errors.MissingSchema):
         with session.transaction() as tx:
@@ -378,7 +365,8 @@ def test_errors(session, clean_bucket_name):
                 ('s', pa.utf8()),
             ])
             s.create_table('t1', columns)
-            s.drop()
+            s.drop()  # cannot drop schema without dropping its tables first
+

 def test_rename_schema(session, clean_bucket_name):

@@ -436,20 +424,21 @@ def test_rename_table(session, clean_bucket_name):
             s.table('t')
         t = s.table('t2')

-        #assert that other transactions are isolated
+        # assert that other transactions are isolated
         with pytest.raises(errors.MissingTable):
             tx2.bucket(clean_bucket_name).schema('s').table('t2')
         tx2.bucket(clean_bucket_name).schema('s').table('t')

     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).schema('s')
-        #assert that new transactions see the change
+        # assert that new transactions see the change
         with pytest.raises(errors.MissingTable):
             s.table('t')
         t = s.table('t2')
         t.drop()
         s.drop()

+
 def test_add_column(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int16()),
@@ -472,18 +461,18 @@ def test_add_column(session, clean_bucket_name):
         # in which it was added
         assert t.arrow_schema == new_schema

-        #assert that other transactions are isolated
+        # assert that other transactions are isolated
         assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns

-
     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).schema('s')
         t = s.table('t')
-        #assert that new transactions see the change
+        # assert that new transactions see the change
         assert t.arrow_schema == new_schema
         t.drop()
         s.drop()

+
 def test_drop_column(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int16()),
@@ -507,31 +496,32 @@ def test_drop_column(session, clean_bucket_name):
         # in which it was added
         assert t.arrow_schema == new_schema

-        #assert that other transactions are isolated
+        # assert that other transactions are isolated
         assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns

-
     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).schema('s')
         t = s.table('t')
-        #assert that new transactions see the change
+        # assert that new transactions see the change
         assert t.arrow_schema == new_schema
         t.drop()
         s.drop()

+
 def test_rename_column(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int16()),
         ('b', pa.float32()),
         ('s', pa.utf8()),
     ])
-
+
+    def prepare_rename_column(schema: pa.Schema, old_name: str, new_name: str) -> pa.Schema:
         field_idx = schema.get_field_index(old_name)
         column_to_rename = schema.field(field_idx)
         renamed_column = column_to_rename.with_name(new_name)
         return schema.set(field_idx, renamed_column)

-    new_schema = prepare_rename_column(columns,'a','aaa')
+    new_schema = prepare_rename_column(columns, 'a', 'aaa')

     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).create_schema('s')
@@ -546,10 +536,10 @@ def test_rename_column(session, clean_bucket_name):
         # in which it was added
         assert t.arrow_schema == new_schema

-        #assert that other transactions are isolated
+        # assert that other transactions are isolated
         assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns

-    #assert that new transactions see the change
+    # assert that new transactions see the change
     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).schema('s')
         t = s.table('t')
@@ -564,7 +554,7 @@ def test_rename_column(session, clean_bucket_name):
         t1 = tx1.bucket(clean_bucket_name).schema('s').table('t')
         t2 = tx2.bucket(clean_bucket_name).schema('s').table('t')
         t1.rename_column('b', 'bb')
-        with pytest.raises(HTTPError, match
+        with pytest.raises(HTTPError, match='409 Client Error: Conflict'):
            t2.rename_column('b', 'bbb')

     with session.transaction() as tx:
@@ -580,6 +570,7 @@ def test_rename_column(session, clean_bucket_name):
         t.drop()
         s.drop()

+
 def test_select_stop(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.uint8()),
@@ -602,15 +593,16 @@ def test_select_stop(session, clean_bucket_name):
     qc = QueryConfig(num_sub_splits=2, num_splits=4, num_row_groups_per_sub_split=1)
     with session.transaction() as tx:
         t = tx.bucket(clean_bucket_name).schema('s').table('t')
-        t.
-        qc.data_endpoints = list(t.stats.endpoints) * 2
+        qc.data_endpoints = list(t.get_stats().endpoints) * 2

     # Duplicate the table until it is large enough to generate enough batches
     while num_rows < (qc.num_sub_splits * qc.num_splits) * ROWS_PER_GROUP:
+        # We need two separate transactions to prevent an infinite loop that may happen
+        # while appending and reading the same table using a single transaction.
         with session.transaction() as tx_read, session.transaction() as tx_write:
             t_read = tx_read.bucket(clean_bucket_name).schema('s').table('t')
             t_write = tx_write.bucket(clean_bucket_name).schema('s').table('t')
-            for batch in t_read.select(['a'],config=qc):
+            for batch in t_read.select(['a'], config=qc):
                 t_write.insert(batch)
         num_rows = num_rows * 2
         log.info("Num rows: %d", num_rows)
@@ -627,11 +619,12 @@ def test_select_stop(session, clean_bucket_name):
     # If this assert triggers it just means that the test assumptions about how
     # the tabular server splits the batches is not true anymore and we need to
     # rewrite the test.
-    assert read_batches == qc.num_splits*qc.num_sub_splits
-    qc.query_id = str(random.randint(0,2**32))
+    assert read_batches == qc.num_splits * qc.num_sub_splits
+    qc.query_id = str(random.randint(0, 2**32))
     log.info("query id is: %s", qc.query_id)
+
     def active_threads():
-        log.debug("%s",[t.getName() for t in threading.enumerate() if t.is_alive()])
+        log.debug("%s", [t.getName() for t in threading.enumerate() if t.is_alive()])
         return sum([1 if t.is_alive() and qc.query_id in t.getName() else 0 for t in threading.enumerate()])

     assert active_threads() == 0
@@ -653,3 +646,20 @@ def test_select_stop(session, clean_bucket_name):

     # validate that all query threads were killed.
     assert active_threads() == 0
+
+
+def test_big_catalog_select(session, clean_bucket_name):
+    with session.transaction() as tx:
+        bc = tx.catalog()
+        actual = pa.Table.from_batches(bc.select(['name']))
+        assert actual
+        log.info("actual=%s", actual)
+
+
+def test_audit_log_select(session, clean_bucket_name):
+    with session.transaction() as tx:
+        a = tx.audit_log()
+        a.columns()
+        time.sleep(1)
+        actual = pa.Table.from_batches(a.select(), a.arrow_schema)
+        log.info("actual=%s", actual)
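Two API changes in this file are worth calling out: select() results are now materialized with an explicit schema (pa.Table.from_batches(..., t.arrow_schema)), and predicate pushdown gained isin() support. A short hedged sketch combining both, assuming an existing session; the bucket/schema/table names are placeholders:

    import pyarrow as pa

    with session.transaction() as tx:
        t = tx.bucket('my-bucket').schema('s').table('t')
        reader = t.select(predicate=t['a'].isin([111, 222, 999]))
        # Passing the schema keeps the result typed even if no batches come back.
        result = pa.Table.from_batches(reader, t.arrow_schema)
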
vastdb/tests/test_util.py
ADDED

@@ -0,0 +1,39 @@
+import pyarrow as pa
+import pytest
+
+from .. import errors, util
+
+
+def test_slices():
+    ROWS = 1 << 20
+    t = pa.table({"x": range(ROWS), "y": [i / 1000 for i in range(ROWS)]})
+
+    chunks = list(util.iter_serialized_slices(t))
+    assert len(chunks) > 1
+    sizes = [len(c) for c in chunks]
+
+    assert max(sizes) < util.MAX_RECORD_BATCH_SLICE_SIZE
+    assert t == pa.Table.from_batches(_parse(chunks))
+
+    chunks = list(util.iter_serialized_slices(t, 1000))
+    assert len(chunks) > 1
+    sizes = [len(c) for c in chunks]
+
+    assert max(sizes) < util.MAX_RECORD_BATCH_SLICE_SIZE
+    assert t == pa.Table.from_batches(_parse(chunks))
+
+
+def test_wide_row():
+    cols = [pa.field(f"x{i}", pa.utf8()) for i in range(1000)]
+    values = [['a' * 10000]] * len(cols)
+    t = pa.table(values, schema=pa.schema(cols))
+    assert len(t) == 1
+
+    with pytest.raises(errors.TooWideRow):
+        list(util.iter_serialized_slices(t))
+
+
+def _parse(bufs):
+    for buf in bufs:
+        with pa.ipc.open_stream(buf) as reader:
+            yield from reader
vastdb/tests/util.py
CHANGED

@@ -9,10 +9,7 @@ def prepare_data(session, clean_bucket_name, schema_name, table_name, arrow_tabl
     with session.transaction() as tx:
         s = tx.bucket(clean_bucket_name).create_schema(schema_name)
         t = s.create_table(table_name, arrow_table.schema)
-
-        row_ids = row_ids_array.to_pylist()
-        log.debug("row_ids=%s" % row_ids)
-        assert row_ids == list(range(arrow_table.num_rows))
+        t.insert(arrow_table)
         yield t
         t.drop()
         s.drop()
vastdb/transaction.py
CHANGED

@@ -8,19 +8,29 @@ A transcation is used as a context manager, since every Database-related operat

 import logging
 from dataclasses import dataclass
+from typing import Optional

 import botocore

-from . import bucket, errors, session
+from . import bucket, errors, schema, session, table

 log = logging.getLogger(__name__)

+TABULAR_BC_BUCKET = "vast-big-catalog-bucket"
+VAST_CATALOG_SCHEMA_NAME = 'vast_big_catalog_schema'
+VAST_CATALOG_TABLE_NAME = 'vast_big_catalog_table'
+
+TABULAR_AUDERY_BUCKET = "vast-audit-log-bucket"
+AUDERY_SCHEMA_NAME = 'vast_audit_log_schema'
+AUDERY_TABLE_NAME = 'vast_audit_log_table'
+
+
 @dataclass
 class Transaction:
     """A holder of a single VAST transaction."""

     _rpc: "session.Session"
-    txid: int = None
+    txid: Optional[int] = None

     def __enter__(self):
         """Create a transaction and store its ID."""
@@ -31,15 +41,19 @@ class Transaction:

     def __exit__(self, exc_type, exc_value, exc_traceback):
         """On success, the transaction is committed. Otherwise, it is rolled back."""
+        txid = self.txid
+        self.txid = None
         if (exc_type, exc_value, exc_traceback) == (None, None, None):
-            log.debug("committing txid=%016x",
-            self._rpc.api.commit_transaction(
+            log.debug("committing txid=%016x", txid)
+            self._rpc.api.commit_transaction(txid)
         else:
-            log.exception("rolling back txid=%016x due to:",
-            self._rpc.api.rollback_transaction(
+            log.exception("rolling back txid=%016x due to:", txid)
+            self._rpc.api.rollback_transaction(txid)

     def __repr__(self):
         """Don't show the session details."""
+        if self.txid is None:
+            return 'InvalidTransaction'
         return f'Transaction(id=0x{self.txid:016x})'

     def bucket(self, name: str) -> "bucket.Bucket":
@@ -52,3 +66,15 @@ class Transaction:
                 raise errors.MissingBucket(name)
             raise
         return bucket.Bucket(name, self)
+
+    def catalog(self, fail_if_missing=True) -> Optional["table.Table"]:
+        """Return VAST Catalog table."""
+        b = bucket.Bucket(TABULAR_BC_BUCKET, self)
+        s = schema.Schema(VAST_CATALOG_SCHEMA_NAME, b)
+        return s.table(name=VAST_CATALOG_TABLE_NAME, fail_if_missing=fail_if_missing)
+
+    def audit_log(self, fail_if_missing=True) -> Optional["table.Table"]:
+        """Return VAST AuditLog table."""
+        b = bucket.Bucket(TABULAR_AUDERY_BUCKET, self)
+        s = schema.Schema(AUDERY_SCHEMA_NAME, b)
+        return s.table(name=AUDERY_TABLE_NAME, fail_if_missing=fail_if_missing)
vastdb/util.py
CHANGED

@@ -1,19 +1,22 @@
 import logging
-from typing import Callable
+from typing import TYPE_CHECKING, Callable, List, Optional, Union

 import pyarrow as pa
 import pyarrow.parquet as pq

-from .errors import InvalidArgument
-from .schema import Schema
-from .table import ImportConfig, Table
+from .errors import InvalidArgument, TooWideRow

 log = logging.getLogger(__name__)

+if TYPE_CHECKING:
+    from .schema import Schema
+    from .table import ImportConfig, Table
+

 def create_table_from_files(
-        schema: Schema, table_name: str, parquet_files: [str],
-
+        schema: "Schema", table_name: str, parquet_files: List[str],
+        schema_merge_func: Optional[Callable] = None,
+        config: Optional["ImportConfig"] = None) -> "Table":
     if not schema_merge_func:
         schema_merge_func = default_schema_merge
     else:
@@ -76,3 +79,36 @@ def union_schema_merge(current_schema: pa.Schema, new_schema: pa.Schema) -> pa.S
     This function returns a unified schema from potentially two different schemas.
     """
     return pa.unify_schemas([current_schema, new_schema])
+
+
+MAX_TABULAR_REQUEST_SIZE = 5 << 20  # in bytes
+MAX_RECORD_BATCH_SLICE_SIZE = int(0.9 * MAX_TABULAR_REQUEST_SIZE)
+
+
+def iter_serialized_slices(batch: Union[pa.RecordBatch, pa.Table], max_rows_per_slice=None):
+    """Iterate over a list of record batch slices."""
+    rows_per_slice = int(0.9 * len(batch) * MAX_RECORD_BATCH_SLICE_SIZE / batch.nbytes)
+    if max_rows_per_slice is not None:
+        rows_per_slice = min(rows_per_slice, max_rows_per_slice)
+
+    offset = 0
+    while offset < len(batch):
+        if rows_per_slice < 1:
+            raise TooWideRow(batch)
+
+        batch_slice = batch.slice(offset, rows_per_slice)
+        serialized_slice_batch = serialize_record_batch(batch_slice)
+        if len(serialized_slice_batch) <= MAX_RECORD_BATCH_SLICE_SIZE:
+            yield serialized_slice_batch
+            offset += rows_per_slice
+        else:
+            rows_per_slice = rows_per_slice // 2
+
+
+def serialize_record_batch(batch: Union[pa.RecordBatch, pa.Table]):
+    """Serialize a RecordBatch using Arrow IPC format."""
+    sink = pa.BufferOutputStream()
+    with pa.ipc.new_stream(sink, batch.schema) as writer:
+        writer.write(batch)
+    return sink.getvalue()
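iter_serialized_slices() caps each serialized Arrow IPC buffer at MAX_RECORD_BATCH_SLICE_SIZE (90% of the 5 MiB request limit, i.e. 4,718,592 bytes): it starts from a row count estimated from batch.nbytes, halves it whenever a serialized slice still overshoots, and raises TooWideRow once even a single row cannot fit. A round-trip sketch based on the new test_util.py above:

    import pyarrow as pa

    from vastdb import util

    t = pa.table({"x": range(1 << 20), "y": [i / 1000 for i in range(1 << 20)]})
    chunks = list(util.iter_serialized_slices(t))  # Arrow IPC stream buffers
    assert all(len(c) <= util.MAX_RECORD_BATCH_SLICE_SIZE for c in chunks)

    def parse(bufs):
        for buf in bufs:
            with pa.ipc.open_stream(buf) as reader:
                yield from reader

    assert pa.Table.from_batches(parse(chunks)) == t  # lossless round trip
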
{vastdb-0.1.1.dist-info → vastdb-0.1.3.dist-info}/METADATA
CHANGED

@@ -1,12 +1,11 @@
 Metadata-Version: 2.1
 Name: vastdb
-Version: 0.1.1
+Version: 0.1.3
 Summary: VAST Data SDK
 Home-page: https://github.com/vast-data/vastdb_sdk
 Author: VAST DATA
 Author-email: hello@vastdata.com
 License: Copyright (C) VAST Data Ltd.
-Platform: UNKNOWN
 Classifier: Development Status :: 4 - Beta
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Programming Language :: Python :: 3
@@ -22,7 +21,7 @@ License-File: LICENSE
 Requires-Dist: aws-requests-auth
 Requires-Dist: boto3
 Requires-Dist: flatbuffers
-Requires-Dist: ibis-framework
+Requires-Dist: ibis-framework ==8.0.0
 Requires-Dist: pyarrow
 Requires-Dist: requests
 Requires-Dist: xmltodict
@@ -34,5 +33,3 @@ and [VAST Catalog](https://vastdata.com/blog/vast-catalog-treat-your-file-system
 enabling schema and table management, efficient ingest, query and modification of columnar data.

 For more details, see [our whitepaper](https://vastdata.com/whitepaper/#TheVASTDataBase).
-
-