vastdb 0.0.5.3__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff compares two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
Files changed (41)
  1. vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
  2. vast_flatbuf/tabular/VipRange.py +56 -0
  3. vastdb/__init__.py +7 -0
  4. vastdb/bucket.py +77 -0
  5. vastdb/errors.py +158 -0
  6. vastdb/{api.py → internal_commands.py} +280 -746
  7. vastdb/schema.py +77 -0
  8. vastdb/session.py +48 -0
  9. vastdb/table.py +480 -0
  10. vastdb/tests/conftest.py +15 -14
  11. vastdb/tests/test_imports.py +125 -0
  12. vastdb/tests/test_projections.py +41 -0
  13. vastdb/tests/test_sanity.py +36 -16
  14. vastdb/tests/test_schemas.py +12 -6
  15. vastdb/tests/test_tables.py +581 -13
  16. vastdb/transaction.py +55 -0
  17. vastdb/util.py +8 -8
  18. vastdb-0.1.0.dist-info/METADATA +38 -0
  19. {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/RECORD +22 -31
  20. vast_protobuf/__init__.py +0 -0
  21. vast_protobuf/substrait/__init__.py +0 -0
  22. vast_protobuf/substrait/algebra_pb2.py +0 -1344
  23. vast_protobuf/substrait/capabilities_pb2.py +0 -46
  24. vast_protobuf/substrait/ddl_pb2.py +0 -57
  25. vast_protobuf/substrait/extended_expression_pb2.py +0 -49
  26. vast_protobuf/substrait/extensions/__init__.py +0 -0
  27. vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
  28. vast_protobuf/substrait/function_pb2.py +0 -168
  29. vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
  30. vast_protobuf/substrait/plan_pb2.py +0 -67
  31. vast_protobuf/substrait/type_expressions_pb2.py +0 -198
  32. vast_protobuf/substrait/type_pb2.py +0 -350
  33. vast_protobuf/tabular/__init__.py +0 -0
  34. vast_protobuf/tabular/rpc_pb2.py +0 -344
  35. vastdb/bench_scan.py +0 -45
  36. vastdb/tests/test_create_table_from_parquets.py +0 -50
  37. vastdb/v2.py +0 -360
  38. vastdb-0.0.5.3.dist-info/METADATA +0 -47
  39. {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/LICENSE +0 -0
  40. {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/WHEEL +0 -0
  41. {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/top_level.txt +0 -0
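
The main change in 0.1.0 is a reorganized, object-oriented client API: api.py is renamed to the internal internal_commands.py, the old v2.py entry point is removed, and new session.py, transaction.py, bucket.py, schema.py, table.py and errors.py modules are added. The rewritten tests below exercise that surface. As a rough sketch of the new call chain, pieced together from those tests (how a Session is constructed is not shown in this diff, so session stands for the object provided by the test fixture, and the bucket/schema/table names are placeholders):

    import pyarrow as pa

    columns = pa.schema([('a', pa.int64()), ('s', pa.utf8())])
    data = pa.table(schema=columns, data=[[1, 2, 3], ['x', 'y', 'z']])

    with session.transaction() as tx:                      # vastdb/transaction.py
        bucket = tx.bucket('my-bucket')                    # raises errors.MissingBucket if absent
        schema = bucket.create_schema('my-schema')         # vastdb/schema.py
        table = schema.create_table('my-table', columns)   # vastdb/table.py
        table.insert(data)                                 # returns the inserted row ids
        for batch in table.select(columns=['a'], predicate=(table['a'] > 1)):
            print(batch.to_pydict())                       # select() streams Arrow record batches
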
@@ -1,8 +1,286 @@
+ import duckdb
+ import pytest
+ import threading
+ import random
  import pyarrow as pa
+ import pyarrow.compute as pc
+ import pyarrow.parquet as pq
+ import decimal
+ import datetime as dt

+ from tempfile import NamedTemporaryFile
+ from contextlib import contextmanager, closing

- def test_tables(rpc, clean_bucket_name):
-     with rpc.transaction() as tx:
+ from requests.exceptions import HTTPError
+ import logging
+
+ from ..table import INTERNAL_ROW_ID, QueryConfig
+ from .. import errors
+
+
+ log = logging.getLogger(__name__)
+
+
+ @contextmanager
+ def prepare_data(session, clean_bucket_name, schema_name, table_name, arrow_table):
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).create_schema(schema_name)
+         t = s.create_table(table_name, arrow_table.schema)
+         row_ids_array = t.insert(arrow_table)
+         row_ids = row_ids_array.to_pylist()
+         log.debug("row_ids=%s" % row_ids)
+         assert row_ids == list(range(arrow_table.num_rows))
+         yield t
+         t.drop()
+         s.drop()
+
+ log = logging.getLogger(__name__)
+
+ def test_tables(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int64()),
+         ('b', pa.float32()),
+         ('s', pa.utf8()),
+     ])
+     expected = pa.table(schema=columns, data=[
+         [111, 222, 333],
+         [0.5, 1.5, 2.5],
+         ['a', 'bb', 'ccc'],
+     ])
+     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's']))
+         assert actual == expected
+
+         actual = pa.Table.from_batches(t.select())
+         assert actual == expected
+
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
+         assert actual == expected.select(['a', 'b'])
+
+         actual = pa.Table.from_batches(t.select(columns=['b', 's', 'a']))
+         assert actual == expected.select(['b', 's', 'a'])
+
+         actual = pa.Table.from_batches(t.select(columns=['s']))
+         assert actual == expected.select(['s'])
+
+         actual = pa.Table.from_batches(t.select(columns=[]))
+         assert actual == expected.select([])
+
+         actual = pa.Table.from_batches(t.select(columns=['s'], internal_row_id=True))
+         log.debug("actual=%s", actual)
+         assert actual.to_pydict() == {
+             's': ['a', 'bb', 'ccc'],
+             INTERNAL_ROW_ID: [0, 1, 2]
+         }
+
+         columns_to_delete = pa.schema([(INTERNAL_ROW_ID, pa.uint64())])
+         rb = pa.record_batch(schema=columns_to_delete, data=[[0]]) # delete rows 0,1
+         t.delete(rb)
+
+         selected_rows = pa.Table.from_batches(t.select(columns=['b'], predicate=(t['a'] == 222), internal_row_id=True))
+         t.delete(selected_rows)
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's']))
+         assert actual.to_pydict() == {
+             'a': [333],
+             'b': [2.5],
+             's': ['ccc']
+         }
+
+ def test_update_table(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int64()),
+         ('b', pa.float32()),
+         ('s', pa.utf8()),
+     ])
+     expected = pa.table(schema=columns, data=[
+         [111, 222, 333],
+         [0.5, 1.5, 2.5],
+         ['a', 'bb', 'ccc'],
+     ])
+     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+         columns_to_update = pa.schema([
+             (INTERNAL_ROW_ID, pa.uint64()),
+             ('a', pa.int64())
+         ])
+
+         rb = pa.record_batch(schema=columns_to_update, data=[
+             [0, 2], # update rows 0,2
+             [1110, 3330]
+         ])
+
+         t.update(rb)
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
+         assert actual.to_pydict() == {
+             'a': [1110, 222, 3330],
+             'b': [0.5, 1.5, 2.5]
+         }
+
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b'], predicate=(t['a'] < 1000), internal_row_id=True))
+         column_index = actual.column_names.index('a')
+         column_field = actual.field(column_index)
+         new_data = pc.add(actual.column('a'), 2000)
+         update_table = actual.set_column(column_index, column_field, new_data)
+
+         t.update(update_table, columns=['a'])
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
+         assert actual.to_pydict() == {
+             'a': [1110, 2222, 3330],
+             'b': [0.5, 1.5, 2.5]
+         }
+
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b'], predicate=(t['a'] != 2222), internal_row_id=True))
+         column_index = actual.column_names.index('a')
+         column_field = actual.field(column_index)
+         new_data = pc.divide(actual.column('a'), 10)
+         update_table = actual.set_column(column_index, column_field, new_data)
+
+         t.update(update_table.to_batches()[0], columns=['a'])
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
+         assert actual.to_pydict() == {
+             'a': [111, 2222, 333],
+             'b': [0.5, 1.5, 2.5]
+         }
+
+ def test_select_with_multisplits(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int32())
+     ])
+
+     data = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+     data = data * 1000
+     expected = pa.table(schema=columns, data=[data])
+
+     config = QueryConfig()
+     config.rows_per_split = 1000
+
+     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+         actual = pa.Table.from_batches(t.select(columns=['a'], config=config))
+         assert actual == expected
+
+
+ def test_types(session, clean_bucket_name):
+     columns = pa.schema([
+         ('tb', pa.bool_()),
+         ('a1', pa.int8()),
+         ('a2', pa.int16()),
+         ('a4', pa.int64()),
+         ('b', pa.float32()),
+         ('s', pa.string()),
+         ('d', pa.decimal128(7, 3)),
+         ('bin', pa.binary()),
+         ('date', pa.date32()),
+         ('ts' ,pa.timestamp('s')),
+     ])
+
+     expected = pa.table(schema=columns, data=[
+         [True, True, False],
+         [1 , 2, 4],
+         [1999, 2000, 2001],
+         [11122221, 222111122, 333333],
+         [0.5, 1.5, 2.5],
+         ["a", "v", "s"],
+         [decimal.Decimal('110.52'), decimal.Decimal('231.15'), decimal.Decimal('3332.44')],
+         [b"\x01\x02", b"\x01\x05", b"\x01\x07"],
+         [dt.datetime.now().date(), dt.datetime.now().date(), dt.datetime.now().date()],
+         [dt.datetime.fromtimestamp(10000), dt.datetime.fromtimestamp(100), dt.datetime.fromtimestamp(0)]
+     ])
+     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+         def select(predicate):
+             return pa.Table.from_batches(t.select(predicate=predicate))
+
+         assert select(None) == expected
+         assert select(t['tb'] == False) == expected.filter(pc.field('tb') == False) # noqa: E712
+         assert select(t['a1'] == 2) == expected.filter(pc.field('a1') == 2)
+         assert select(t['a2'] == 2000) == expected.filter(pc.field('a2') == 2000)
+         assert select(t['a4'] == 222111122) == expected.filter(pc.field('a4') == 222111122)
+         assert select(t['b'] == 1.5) == expected.filter(pc.field('b') == 1.5)
+         assert select(t['s'] == "v") == expected.filter(pc.field('s') == "v")
+         assert select(t['d'] == 231.15) == expected.filter(pc.field('d') == 231.15)
+         assert select(t['bin'] == b"\x01\x02") == expected.filter(pc.field('bin') == b"\x01\x02")
+         assert select(t['date'] == dt.datetime.now().date()) == expected.filter(pc.field('date') == dt.datetime.now().date())
+
+
+ def test_filters(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int32()),
+         ('b', pa.float64()),
+         ('s', pa.utf8()),
+     ])
+
+     expected = pa.table(schema=columns, data=[
+         [111, 222, 333, 444, 555],
+         [0.5, 1.5, 2.5, 3.5, 4.5],
+         ['a', 'bb', 'ccc', None, 'xyz'],
+     ])
+
+     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+         def select(predicate):
+             return pa.Table.from_batches(t.select(predicate=predicate))
+
+         assert select(None) == expected
+
+         assert select(t['a'] > 222) == expected.filter(pc.field('a') > 222)
+         assert select(t['a'] < 222) == expected.filter(pc.field('a') < 222)
+         assert select(t['a'] == 222) == expected.filter(pc.field('a') == 222)
+         assert select(t['a'] != 222) == expected.filter(pc.field('a') != 222)
+         assert select(t['a'] <= 222) == expected.filter(pc.field('a') <= 222)
+         assert select(t['a'] >= 222) == expected.filter(pc.field('a') >= 222)
+
+         assert select(t['b'] > 1.5) == expected.filter(pc.field('b') > 1.5)
+         assert select(t['b'] < 1.5) == expected.filter(pc.field('b') < 1.5)
+         assert select(t['b'] == 1.5) == expected.filter(pc.field('b') == 1.5)
+         assert select(t['b'] != 1.5) == expected.filter(pc.field('b') != 1.5)
+         assert select(t['b'] <= 1.5) == expected.filter(pc.field('b') <= 1.5)
+         assert select(t['b'] >= 1.5) == expected.filter(pc.field('b') >= 1.5)
+
+         assert select(t['s'] > 'bb') == expected.filter(pc.field('s') > 'bb')
+         assert select(t['s'] < 'bb') == expected.filter(pc.field('s') < 'bb')
+         assert select(t['s'] == 'bb') == expected.filter(pc.field('s') == 'bb')
+         assert select(t['s'] != 'bb') == expected.filter(pc.field('s') != 'bb')
+         assert select(t['s'] <= 'bb') == expected.filter(pc.field('s') <= 'bb')
+         assert select(t['s'] >= 'bb') == expected.filter(pc.field('s') >= 'bb')
+
+         assert select((t['a'] > 111) & (t['b'] > 0) & (t['s'] < 'ccc')) == expected.filter((pc.field('a') > 111) & (pc.field('b') > 0) & (pc.field('s') < 'ccc'))
+         assert select((t['a'] > 111) & (t['b'] < 2.5)) == expected.filter((pc.field('a') > 111) & (pc.field('b') < 2.5))
+         assert select((t['a'] > 111) & (t['a'] < 333)) == expected.filter((pc.field('a') > 111) & (pc.field('a') < 333))
+
+         assert select((t['a'] > 111) | (t['a'] < 333)) == expected.filter((pc.field('a') > 111) | (pc.field('a') < 333))
+         assert select(((t['a'] > 111) | (t['a'] < 333)) & (t['b'] < 2.5)) == expected.filter(((pc.field('a') > 111) | (pc.field('a') < 333)) & (pc.field('b') < 2.5))
+         with pytest.raises(NotImplementedError):
+             assert select((t['a'] > 111) | (t['b'] > 0) | (t['s'] < 'ccc')) == expected.filter((pc.field('a') > 111) | (pc.field('b') > 0) | (pc.field('s') < 'ccc'))
+         assert select((t['a'] > 111) | (t['a'] < 333) | (t['a'] == 777) ) == expected.filter((pc.field('a') > 111) | (pc.field('a') < 333) | (pc.field('a') == 777))
+
+         assert select(t['s'].isnull()) == expected.filter(pc.field('s').is_null())
+         assert select((t['s'].isnull()) | (t['s'] == 'bb')) == expected.filter((pc.field('s').is_null()) | (pc.field('s') == 'bb'))
+         assert select((t['s'].isnull()) & (t['b'] == 3.5)) == expected.filter((pc.field('s').is_null()) & (pc.field('b') == 3.5))
+
+         assert select(~t['s'].isnull()) == expected.filter(~pc.field('s').is_null())
+         assert select(t['s'].contains('b')) == expected.filter(pc.field('s') == 'bb')
+         assert select(t['s'].contains('y')) == expected.filter(pc.field('s') == 'xyz')
+
+
+ def test_duckdb(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int32()),
+         ('b', pa.float64()),
+     ])
+     data = pa.table(schema=columns, data=[
+         [111, 222, 333],
+         [0.5, 1.5, 2.5],
+     ])
+     with prepare_data(session, clean_bucket_name, 's', 't', data) as t:
+         conn = duckdb.connect()
+         batches = t.select(columns=['a'], predicate=(t['b'] < 2)) # noqa: F841
+         actual = conn.execute('SELECT max(a) as "a_max" FROM batches').arrow()
+         expected = (data
+             .filter(pc.field('b') < 2)
+             .group_by([])
+             .aggregate([('a', 'max')]))
+         assert actual == expected
+
+
+ def test_parquet_export(session, clean_bucket_name):
+     with session.transaction() as tx:
          s = tx.bucket(clean_bucket_name).create_schema('s1')
          columns = pa.schema([
              ('a', pa.int16()),
@@ -19,22 +297,312 @@ def test_tables(rpc, clean_bucket_name):
              ['a', 'b'],
          ])
          expected = pa.Table.from_batches([rb])
-         t.insert(rb)
-
-         actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's']))
+         rb = t.insert(rb)
+         assert rb.to_pylist() == [0, 1]
+         actual = pa.Table.from_batches(t.select())
          assert actual == expected

-         actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
-         assert actual == expected.select(['a', 'b'])
+         table_batches = t.select()

-         actual = pa.Table.from_batches(t.select(columns=['b', 's', 'a']))
-         assert actual == expected.select(['b', 's', 'a'])
+         with NamedTemporaryFile() as parquet_file:
+             log.info("Writing table into parquet file: '%s'", parquet_file.name)
+             with closing(pq.ParquetWriter(parquet_file.name, table_batches.schema)) as parquet_writer:
+                 for batch in table_batches:
+                     parquet_writer.write_batch(batch)

-         actual = pa.Table.from_batches(t.select(columns=['s']))
-         assert actual == expected.select(['s'])
+             assert expected == pq.read_table(parquet_file.name)

-         actual = pa.Table.from_batches(t.select(columns=[]))
-         assert actual == expected.select([])
+ def test_errors(session, clean_bucket_name):
+     with pytest.raises(errors.MissingSchema):
+         with session.transaction() as tx:
+             tx.bucket(clean_bucket_name).schema('s1')
+
+     with pytest.raises(errors.MissingBucket):
+         with session.transaction() as tx:
+             tx.bucket("bla")
+
+     with pytest.raises(errors.Conflict):
+         with session.transaction() as tx:
+             b = tx.bucket(clean_bucket_name)
+             s = b.create_schema('s1')
+             columns = pa.schema([
+                 ('a', pa.int16()),
+                 ('b', pa.float32()),
+                 ('s', pa.utf8()),
+             ])
+             s.create_table('t1', columns)
+             s.drop() # cannot drop schema without dropping its tables first
+
+ def test_rename_schema(session, clean_bucket_name):
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).create_schema('s')
+
+     with session.transaction() as tx, session.transaction() as tx2:
+         b = tx.bucket(clean_bucket_name)
+         # assert that there is only one schema in this bucket - pre rename
+         assert [s.name for s in b.schemas()] == ['s']
+
+         s = b.schema('s')
+         s.rename('ss')
+
+         # assert the table was renamed in the transaction context
+         # where it was renamed
+         assert s.name == 'ss'
+         with pytest.raises(errors.MissingSchema):
+             tx.bucket(clean_bucket_name).schema('s')
+
+         # assert that other transactions are isolated
+         tx2.bucket(clean_bucket_name).schema('s')
+         with pytest.raises(errors.MissingSchema):
+             tx2.bucket(clean_bucket_name).schema('ss')
+
+     # assert that new transactions see the updated schema name
+     with session.transaction() as tx:
+         b = tx.bucket(clean_bucket_name)
+         with pytest.raises(errors.MissingSchema):
+             b.schema('s')
+         s = b.schema('ss')
+         # assert that we still have only one schema and it is the one that was renamed
+         assert [s.name for s in b.schemas()] == ['ss']
+         s.drop()
+
+
+ def test_rename_table(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int16()),
+         ('b', pa.float32()),
+         ('s', pa.utf8()),
+     ])
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).create_schema('s')
+         t = s.create_table('t', columns)
+
+     with session.transaction() as tx, session.transaction() as tx2:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         t = s.table('t')
+         t.rename('t2')
+         # assert that the new table name is seen in the context
+         # in which it was renamed
+         assert t.name == 't2'
+         with pytest.raises(errors.MissingTable):
+             s.table('t')
+         t = s.table('t2')
+
+         #assert that other transactions are isolated
+         with pytest.raises(errors.MissingTable):
+             tx2.bucket(clean_bucket_name).schema('s').table('t2')
+         tx2.bucket(clean_bucket_name).schema('s').table('t')
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         #assert that new transactions see the change
+         with pytest.raises(errors.MissingTable):
+             s.table('t')
+         t = s.table('t2')
+         t.drop()
+         s.drop()
+
+ def test_add_column(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int16()),
+         ('b', pa.float32()),
+         ('s', pa.utf8()),
+     ])
+     new_column = pa.field('aa', pa.int16())
+     new_schema = columns.append(new_column)
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).create_schema('s')
+         s.create_table('t', columns)
+
+     with session.transaction() as tx, session.transaction() as tx2:
+         t = tx.bucket(clean_bucket_name).schema('s').table('t')
+         assert t.arrow_schema == columns
+
+         t.add_column(pa.schema([new_column]))
+         # assert that the column is seen in the context
+         # in which it was added
+         assert t.arrow_schema == new_schema
+
+         #assert that other transactions are isolated
+         assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns
+
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         t = s.table('t')
+         #assert that new transactions see the change
+         assert t.arrow_schema == new_schema
+         t.drop()
+         s.drop()
+
+ def test_drop_column(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int16()),
+         ('b', pa.float32()),
+         ('s', pa.utf8()),
+     ])
+     field_idx = columns.get_field_index('a')
+     new_schema = columns.remove(field_idx)
+     column_to_drop = columns.field(field_idx)
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).create_schema('s')
+         s.create_table('t', columns)
+
+     with session.transaction() as tx, session.transaction() as tx2:
+         t = tx.bucket(clean_bucket_name).schema('s').table('t')
+         assert t.arrow_schema == columns
+
+         t.drop_column(pa.schema([column_to_drop]))
+         # assert that the column is seen in the context
+         # in which it was added
+         assert t.arrow_schema == new_schema
+
+         #assert that other transactions are isolated
+         assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns
+
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         t = s.table('t')
+         #assert that new transactions see the change
+         assert t.arrow_schema == new_schema
+         t.drop()
+         s.drop()
+
+ def test_rename_column(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int16()),
+         ('b', pa.float32()),
+         ('s', pa.utf8()),
+     ])
+     def prepare_rename_column(schema : pa.Schema, old_name : str, new_name : str) -> pa.Schema:
+         field_idx = schema.get_field_index(old_name)
+         column_to_rename = schema.field(field_idx)
+         renamed_column = column_to_rename.with_name(new_name)
+         return schema.set(field_idx, renamed_column)
+
+     new_schema = prepare_rename_column(columns,'a','aaa')
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).create_schema('s')
+         s.create_table('t', columns)
+
+     with session.transaction() as tx, session.transaction() as tx2:
+         t = tx.bucket(clean_bucket_name).schema('s').table('t')
+         assert t.arrow_schema == columns

+         t.rename_column('a', 'aaa')
+         # assert that the column is seen in the context
+         # in which it was added
+         assert t.arrow_schema == new_schema
+
+         #assert that other transactions are isolated
+         assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns
+
+     #assert that new transactions see the change
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         t = s.table('t')
+
+         assert t.arrow_schema == new_schema
+
+     # simultaneos renames of the same column
+     new_schema_tx1 = prepare_rename_column(new_schema, 'b', 'bb')
+     new_schema_tx2 = prepare_rename_column(new_schema, 'b', 'bbb')
+     with pytest.raises(errors.Conflict):
+         with session.transaction() as tx1, session.transaction() as tx2:
+             t1 = tx1.bucket(clean_bucket_name).schema('s').table('t')
+             t2 = tx2.bucket(clean_bucket_name).schema('s').table('t')
+             t1.rename_column('b', 'bb')
+             with pytest.raises(HTTPError, match = '409 Client Error: Conflict'):
+                 t2.rename_column('b', 'bbb')
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         t = s.table('t')
+         # validate that the rename conflicted and rolled back
+         assert (t.arrow_schema != new_schema_tx1) and \
+                (t.arrow_schema != new_schema_tx2)
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         t = s.table('t')
          t.drop()
          s.drop()
+
+ def test_select_stop(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.uint8()),
+     ])
+
+     rb = pa.record_batch(schema=columns, data=[
+         list(range(256)),
+     ])
+
+     num_rows = 0
+     with session.transaction() as tx:
+         b = tx.bucket(clean_bucket_name)
+         s = b.create_schema('s')
+         t = s.create_table('t', columns)
+         t.insert(rb)
+
+         num_rows = 2**8
+
+     ROWS_PER_GROUP = 2**16
+     qc = QueryConfig(num_sub_splits=2, num_splits=4, num_row_groups_per_sub_split=1)
+     with session.transaction() as tx:
+         t = tx.bucket(clean_bucket_name).schema('s').table('t')
+         t.refresh_stats()
+         qc.data_endpoints = list(t.stats.endpoints) * 2
+
+     # Duplicate the table until it is large enough to generate enough batches
+     while num_rows < (qc.num_sub_splits * qc.num_splits) * ROWS_PER_GROUP:
+         with session.transaction() as tx_read, session.transaction() as tx_write:
+             t_read = tx_read.bucket(clean_bucket_name).schema('s').table('t')
+             t_write = tx_write.bucket(clean_bucket_name).schema('s').table('t')
+             for batch in t_read.select(['a'],config=qc):
+                 t_write.insert(batch)
+             num_rows = num_rows * 2
+             log.info("Num rows: %d", num_rows)
+
+     # Validate the number of batches and the number of rows
+     read_rows = 0
+     read_batches = 0
+     with session.transaction() as tx:
+         t = tx.bucket(clean_bucket_name).schema('s').table('t')
+         for batch in t.select(['a'], config=qc):
+             read_batches += 1
+             read_rows += len(batch)
+     assert read_rows == num_rows
+     # If this assert triggers it just means that the test assumptions about how
+     # the tabular server splits the batches is not true anymore and we need to
+     # rewrite the test.
+     assert read_batches == qc.num_splits*qc.num_sub_splits
+     qc.query_id = str(random.randint(0,2**32))
+     log.info("query id is: %s", qc.query_id)
+     def active_threads():
+         log.debug("%s",[t.getName() for t in threading.enumerate() if t.is_alive()])
+         return sum([1 if t.is_alive() and qc.query_id in t.getName() else 0 for t in threading.enumerate()])
+
+     assert active_threads() == 0
+
+     with session.transaction() as tx:
+         t = tx.bucket(clean_bucket_name).schema('s').table('t')
+         batches = iter(t.select(['a'], config=qc))
+         next(batches)
+         log.info("Active threads: %d", active_threads())
+         try:
+             assert active_threads() > 0
+         finally:
+             # If we dont delete the iterator, the threads will hang in a
+             # zombie state.
+             del batches
+
+     # Check that all threads were killed
+     log.info("Active threads: %d", active_threads())
+
+     # validate that all query threads were killed.
+     assert active_threads() == 0
vastdb/transaction.py ADDED
@@ -0,0 +1,55 @@
+ """VAST Database transaction.
+
+ A transcation is used as a context manager, since every Database-related operation in VAST requires a transaction.
+
+     with session.transaction() as tx:
+         tx.bucket("bucket").create_schema("schema")
+ """
+
+ from . import bucket, errors, session
+
+ import botocore
+
+ from dataclasses import dataclass
+ import logging
+
+
+ log = logging.getLogger(__name__)
+
+ @dataclass
+ class Transaction:
+     """A holder of a single VAST transaction."""
+
+     _rpc: "session.Session"
+     txid: int = None
+
+     def __enter__(self):
+         """Create a transaction and store its ID."""
+         response = self._rpc.api.begin_transaction()
+         self.txid = int(response.headers['tabular-txid'])
+         log.debug("opened txid=%016x", self.txid)
+         return self
+
+     def __exit__(self, exc_type, exc_value, exc_traceback):
+         """On success, the transaction is committed. Otherwise, it is rolled back."""
+         if (exc_type, exc_value, exc_traceback) == (None, None, None):
+             log.debug("committing txid=%016x", self.txid)
+             self._rpc.api.commit_transaction(self.txid)
+         else:
+             log.exception("rolling back txid=%016x due to:", self.txid)
+             self._rpc.api.rollback_transaction(self.txid)
+
+     def __repr__(self):
+         """Don't show the session details."""
+         return f'Transaction(id=0x{self.txid:016x})'
+
+     def bucket(self, name: str) -> "bucket.Bucket":
+         """Return a VAST Bucket, if exists."""
+         try:
+             self._rpc.s3.head_bucket(Bucket=name)
+         except botocore.exceptions.ClientError as e:
+             log.warning("res: %s", e.response)
+             if e.response['Error']['Code'] == '404':
+                 raise errors.MissingBucket(name)
+             raise
+         return bucket.Bucket(name, self)
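
Transaction.__exit__ above commits when the with-block exits cleanly and rolls back when the body raises, so callers can rely on the context manager for cleanup. A minimal sketch of that contract, assuming a session object as in the tests (the bucket name is a placeholder):

    from vastdb import errors

    try:
        with session.transaction() as tx:   # __enter__ calls begin_transaction()
            tx.bucket('my-bucket').create_schema('s1')
            # a normal exit from this block commits the transaction
    except errors.MissingBucket:
        # the exception escaped the block, so __exit__ rolled the transaction back
        print('bucket does not exist; nothing was committed')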