vastdb 0.0.5.2__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff compares publicly available package versions as released to one of the supported registries; it is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
- vast_flatbuf/tabular/VipRange.py +56 -0
- vastdb/__init__.py +7 -0
- vastdb/bucket.py +77 -0
- vastdb/errors.py +158 -0
- vastdb/{api.py → internal_commands.py} +283 -747
- vastdb/schema.py +77 -0
- vastdb/session.py +48 -0
- vastdb/table.py +480 -0
- vastdb/tests/conftest.py +46 -0
- vastdb/tests/test_imports.py +125 -0
- vastdb/tests/test_projections.py +41 -0
- vastdb/tests/test_sanity.py +83 -0
- vastdb/tests/test_schemas.py +45 -0
- vastdb/tests/test_tables.py +608 -0
- vastdb/transaction.py +55 -0
- vastdb/util.py +77 -0
- vastdb-0.1.0.dist-info/METADATA +38 -0
- {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/RECORD +23 -24
- vast_protobuf/substrait/__init__.py +0 -0
- vast_protobuf/substrait/algebra_pb2.py +0 -1344
- vast_protobuf/substrait/capabilities_pb2.py +0 -46
- vast_protobuf/substrait/ddl_pb2.py +0 -57
- vast_protobuf/substrait/extended_expression_pb2.py +0 -49
- vast_protobuf/substrait/extensions/__init__.py +0 -0
- vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
- vast_protobuf/substrait/function_pb2.py +0 -168
- vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
- vast_protobuf/substrait/plan_pb2.py +0 -67
- vast_protobuf/substrait/type_expressions_pb2.py +0 -198
- vast_protobuf/substrait/type_pb2.py +0 -350
- vast_protobuf/tabular/__init__.py +0 -0
- vast_protobuf/tabular/rpc_pb2.py +0 -344
- vastdb/v2.py +0 -108
- vastdb-0.0.5.2.dist-info/METADATA +0 -47
- {vast_protobuf → vastdb/tests}/__init__.py +0 -0
- {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/LICENSE +0 -0
- {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/WHEEL +0 -0
- {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/top_level.txt +0 -0
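
The headline change in 0.1.0 is a reorganized, transaction-scoped client API: api.py becomes the internal internal_commands.py, the old v2.py entry point is removed, and the new session.py, transaction.py, bucket.py, schema.py and table.py modules carry the public surface. A minimal usage sketch, pieced together from the tests below; the connect() helper and its endpoint/credential arguments are assumptions (the actual entry point lives in vastdb/__init__.py and vastdb/session.py, which this diff only summarizes):

import pyarrow as pa
import vastdb

# Assumed entry point and arguments; the exact signature is defined in
# vastdb/__init__.py and vastdb/session.py, not shown in this diff.
session = vastdb.connect(endpoint='http://vip.example.com',
                         access='<access-key>', secret='<secret-key>')

with session.transaction() as tx:
    schema = tx.bucket('my-bucket').create_schema('my_schema')
    table = schema.create_table('my_table', pa.schema([('a', pa.int64())]))
    table.insert(pa.record_batch(schema=table.arrow_schema, data=[[1, 2, 3]]))
    # Column projection and predicate pushdown, as exercised by the tests below:
    batches = table.select(columns=['a'], predicate=(table['a'] > 1))
    print(pa.Table.from_batches(batches))
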
vastdb/tests/test_tables.py
ADDED
@@ -0,0 +1,608 @@
import duckdb
import pytest
import threading
import random
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
import decimal
import datetime as dt

from tempfile import NamedTemporaryFile
from contextlib import contextmanager, closing

from requests.exceptions import HTTPError
import logging

from ..table import INTERNAL_ROW_ID, QueryConfig
from .. import errors


log = logging.getLogger(__name__)


@contextmanager
def prepare_data(session, clean_bucket_name, schema_name, table_name, arrow_table):
    with session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).create_schema(schema_name)
        t = s.create_table(table_name, arrow_table.schema)
        row_ids_array = t.insert(arrow_table)
        row_ids = row_ids_array.to_pylist()
        log.debug("row_ids=%s", row_ids)
        assert row_ids == list(range(arrow_table.num_rows))
        yield t
        t.drop()
        s.drop()

log = logging.getLogger(__name__)

def test_tables(session, clean_bucket_name):
    columns = pa.schema([
        ('a', pa.int64()),
        ('b', pa.float32()),
        ('s', pa.utf8()),
    ])
    expected = pa.table(schema=columns, data=[
        [111, 222, 333],
        [0.5, 1.5, 2.5],
        ['a', 'bb', 'ccc'],
    ])
    with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
        actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's']))
        assert actual == expected

        actual = pa.Table.from_batches(t.select())
        assert actual == expected

        actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
        assert actual == expected.select(['a', 'b'])

        actual = pa.Table.from_batches(t.select(columns=['b', 's', 'a']))
        assert actual == expected.select(['b', 's', 'a'])

        actual = pa.Table.from_batches(t.select(columns=['s']))
        assert actual == expected.select(['s'])

        actual = pa.Table.from_batches(t.select(columns=[]))
        assert actual == expected.select([])

        actual = pa.Table.from_batches(t.select(columns=['s'], internal_row_id=True))
        log.debug("actual=%s", actual)
        assert actual.to_pydict() == {
            's': ['a', 'bb', 'ccc'],
            INTERNAL_ROW_ID: [0, 1, 2]
        }

        columns_to_delete = pa.schema([(INTERNAL_ROW_ID, pa.uint64())])
        rb = pa.record_batch(schema=columns_to_delete, data=[[0]])  # delete row 0
        t.delete(rb)

        selected_rows = pa.Table.from_batches(t.select(columns=['b'], predicate=(t['a'] == 222), internal_row_id=True))
        t.delete(selected_rows)
        actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's']))
        assert actual.to_pydict() == {
            'a': [333],
            'b': [2.5],
            's': ['ccc']
        }

def test_update_table(session, clean_bucket_name):
    columns = pa.schema([
        ('a', pa.int64()),
        ('b', pa.float32()),
        ('s', pa.utf8()),
    ])
    expected = pa.table(schema=columns, data=[
        [111, 222, 333],
        [0.5, 1.5, 2.5],
        ['a', 'bb', 'ccc'],
    ])
    with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
        columns_to_update = pa.schema([
            (INTERNAL_ROW_ID, pa.uint64()),
            ('a', pa.int64())
        ])

        rb = pa.record_batch(schema=columns_to_update, data=[
            [0, 2],  # update rows 0,2
            [1110, 3330]
        ])

        t.update(rb)
        actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
        assert actual.to_pydict() == {
            'a': [1110, 222, 3330],
            'b': [0.5, 1.5, 2.5]
        }

        actual = pa.Table.from_batches(t.select(columns=['a', 'b'], predicate=(t['a'] < 1000), internal_row_id=True))
        column_index = actual.column_names.index('a')
        column_field = actual.field(column_index)
        new_data = pc.add(actual.column('a'), 2000)
        update_table = actual.set_column(column_index, column_field, new_data)

        t.update(update_table, columns=['a'])
        actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
        assert actual.to_pydict() == {
            'a': [1110, 2222, 3330],
            'b': [0.5, 1.5, 2.5]
        }

        actual = pa.Table.from_batches(t.select(columns=['a', 'b'], predicate=(t['a'] != 2222), internal_row_id=True))
        column_index = actual.column_names.index('a')
        column_field = actual.field(column_index)
        new_data = pc.divide(actual.column('a'), 10)
        update_table = actual.set_column(column_index, column_field, new_data)

        t.update(update_table.to_batches()[0], columns=['a'])
        actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
        assert actual.to_pydict() == {
            'a': [111, 2222, 333],
            'b': [0.5, 1.5, 2.5]
        }

def test_select_with_multisplits(session, clean_bucket_name):
    columns = pa.schema([
        ('a', pa.int32())
    ])

    data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    data = data * 1000
    expected = pa.table(schema=columns, data=[data])

    config = QueryConfig()
    config.rows_per_split = 1000

    with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
        actual = pa.Table.from_batches(t.select(columns=['a'], config=config))
        assert actual == expected


def test_types(session, clean_bucket_name):
    columns = pa.schema([
        ('tb', pa.bool_()),
        ('a1', pa.int8()),
        ('a2', pa.int16()),
        ('a4', pa.int64()),
        ('b', pa.float32()),
        ('s', pa.string()),
        ('d', pa.decimal128(7, 3)),
        ('bin', pa.binary()),
        ('date', pa.date32()),
        ('ts', pa.timestamp('s')),
    ])

    expected = pa.table(schema=columns, data=[
        [True, True, False],
        [1, 2, 4],
        [1999, 2000, 2001],
        [11122221, 222111122, 333333],
        [0.5, 1.5, 2.5],
        ["a", "v", "s"],
        [decimal.Decimal('110.52'), decimal.Decimal('231.15'), decimal.Decimal('3332.44')],
        [b"\x01\x02", b"\x01\x05", b"\x01\x07"],
        [dt.datetime.now().date(), dt.datetime.now().date(), dt.datetime.now().date()],
        [dt.datetime.fromtimestamp(10000), dt.datetime.fromtimestamp(100), dt.datetime.fromtimestamp(0)]
    ])
    with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
        def select(predicate):
            return pa.Table.from_batches(t.select(predicate=predicate))

        assert select(None) == expected
        assert select(t['tb'] == False) == expected.filter(pc.field('tb') == False)  # noqa: E712
        assert select(t['a1'] == 2) == expected.filter(pc.field('a1') == 2)
        assert select(t['a2'] == 2000) == expected.filter(pc.field('a2') == 2000)
        assert select(t['a4'] == 222111122) == expected.filter(pc.field('a4') == 222111122)
        assert select(t['b'] == 1.5) == expected.filter(pc.field('b') == 1.5)
        assert select(t['s'] == "v") == expected.filter(pc.field('s') == "v")
        assert select(t['d'] == 231.15) == expected.filter(pc.field('d') == 231.15)
        assert select(t['bin'] == b"\x01\x02") == expected.filter(pc.field('bin') == b"\x01\x02")
        assert select(t['date'] == dt.datetime.now().date()) == expected.filter(pc.field('date') == dt.datetime.now().date())


def test_filters(session, clean_bucket_name):
    columns = pa.schema([
        ('a', pa.int32()),
        ('b', pa.float64()),
        ('s', pa.utf8()),
    ])

    expected = pa.table(schema=columns, data=[
        [111, 222, 333, 444, 555],
        [0.5, 1.5, 2.5, 3.5, 4.5],
        ['a', 'bb', 'ccc', None, 'xyz'],
    ])

    with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
        def select(predicate):
            return pa.Table.from_batches(t.select(predicate=predicate))

        assert select(None) == expected

        assert select(t['a'] > 222) == expected.filter(pc.field('a') > 222)
        assert select(t['a'] < 222) == expected.filter(pc.field('a') < 222)
        assert select(t['a'] == 222) == expected.filter(pc.field('a') == 222)
        assert select(t['a'] != 222) == expected.filter(pc.field('a') != 222)
        assert select(t['a'] <= 222) == expected.filter(pc.field('a') <= 222)
        assert select(t['a'] >= 222) == expected.filter(pc.field('a') >= 222)

        assert select(t['b'] > 1.5) == expected.filter(pc.field('b') > 1.5)
        assert select(t['b'] < 1.5) == expected.filter(pc.field('b') < 1.5)
        assert select(t['b'] == 1.5) == expected.filter(pc.field('b') == 1.5)
        assert select(t['b'] != 1.5) == expected.filter(pc.field('b') != 1.5)
        assert select(t['b'] <= 1.5) == expected.filter(pc.field('b') <= 1.5)
        assert select(t['b'] >= 1.5) == expected.filter(pc.field('b') >= 1.5)

        assert select(t['s'] > 'bb') == expected.filter(pc.field('s') > 'bb')
        assert select(t['s'] < 'bb') == expected.filter(pc.field('s') < 'bb')
        assert select(t['s'] == 'bb') == expected.filter(pc.field('s') == 'bb')
        assert select(t['s'] != 'bb') == expected.filter(pc.field('s') != 'bb')
        assert select(t['s'] <= 'bb') == expected.filter(pc.field('s') <= 'bb')
        assert select(t['s'] >= 'bb') == expected.filter(pc.field('s') >= 'bb')

        assert select((t['a'] > 111) & (t['b'] > 0) & (t['s'] < 'ccc')) == expected.filter((pc.field('a') > 111) & (pc.field('b') > 0) & (pc.field('s') < 'ccc'))
        assert select((t['a'] > 111) & (t['b'] < 2.5)) == expected.filter((pc.field('a') > 111) & (pc.field('b') < 2.5))
        assert select((t['a'] > 111) & (t['a'] < 333)) == expected.filter((pc.field('a') > 111) & (pc.field('a') < 333))

        assert select((t['a'] > 111) | (t['a'] < 333)) == expected.filter((pc.field('a') > 111) | (pc.field('a') < 333))
        assert select(((t['a'] > 111) | (t['a'] < 333)) & (t['b'] < 2.5)) == expected.filter(((pc.field('a') > 111) | (pc.field('a') < 333)) & (pc.field('b') < 2.5))
        with pytest.raises(NotImplementedError):
            assert select((t['a'] > 111) | (t['b'] > 0) | (t['s'] < 'ccc')) == expected.filter((pc.field('a') > 111) | (pc.field('b') > 0) | (pc.field('s') < 'ccc'))
        assert select((t['a'] > 111) | (t['a'] < 333) | (t['a'] == 777)) == expected.filter((pc.field('a') > 111) | (pc.field('a') < 333) | (pc.field('a') == 777))

        assert select(t['s'].isnull()) == expected.filter(pc.field('s').is_null())
        assert select((t['s'].isnull()) | (t['s'] == 'bb')) == expected.filter((pc.field('s').is_null()) | (pc.field('s') == 'bb'))
        assert select((t['s'].isnull()) & (t['b'] == 3.5)) == expected.filter((pc.field('s').is_null()) & (pc.field('b') == 3.5))

        assert select(~t['s'].isnull()) == expected.filter(~pc.field('s').is_null())
        assert select(t['s'].contains('b')) == expected.filter(pc.field('s') == 'bb')
        assert select(t['s'].contains('y')) == expected.filter(pc.field('s') == 'xyz')


def test_duckdb(session, clean_bucket_name):
    columns = pa.schema([
        ('a', pa.int32()),
        ('b', pa.float64()),
    ])
    data = pa.table(schema=columns, data=[
        [111, 222, 333],
        [0.5, 1.5, 2.5],
    ])
    with prepare_data(session, clean_bucket_name, 's', 't', data) as t:
        conn = duckdb.connect()  # DuckDB scans the `batches` local below via replacement scans
        batches = t.select(columns=['a'], predicate=(t['b'] < 2))  # noqa: F841
        actual = conn.execute('SELECT max(a) as "a_max" FROM batches').arrow()
        expected = (data
                    .filter(pc.field('b') < 2)
                    .group_by([])
                    .aggregate([('a', 'max')]))
        assert actual == expected


def test_parquet_export(session, clean_bucket_name):
    with session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).create_schema('s1')
        columns = pa.schema([
            ('a', pa.int16()),
            ('b', pa.float32()),
            ('s', pa.utf8()),
        ])
        assert s.tables() == []
        t = s.create_table('t1', columns)
        assert s.tables() == [t]

        rb = pa.record_batch(schema=columns, data=[
            [111, 222],
            [0.5, 1.5],
            ['a', 'b'],
        ])
        expected = pa.Table.from_batches([rb])
        rb = t.insert(rb)
        assert rb.to_pylist() == [0, 1]
        actual = pa.Table.from_batches(t.select())
        assert actual == expected

        table_batches = t.select()

        with NamedTemporaryFile() as parquet_file:
            log.info("Writing table into parquet file: '%s'", parquet_file.name)
            with closing(pq.ParquetWriter(parquet_file.name, table_batches.schema)) as parquet_writer:
                for batch in table_batches:
                    parquet_writer.write_batch(batch)

            assert expected == pq.read_table(parquet_file.name)

def test_errors(session, clean_bucket_name):
    with pytest.raises(errors.MissingSchema):
        with session.transaction() as tx:
            tx.bucket(clean_bucket_name).schema('s1')

    with pytest.raises(errors.MissingBucket):
        with session.transaction() as tx:
            tx.bucket("bla")

    with pytest.raises(errors.Conflict):
        with session.transaction() as tx:
            b = tx.bucket(clean_bucket_name)
            s = b.create_schema('s1')
            columns = pa.schema([
                ('a', pa.int16()),
                ('b', pa.float32()),
                ('s', pa.utf8()),
            ])
            s.create_table('t1', columns)
            s.drop()  # cannot drop a schema without dropping its tables first

def test_rename_schema(session, clean_bucket_name):

    with session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).create_schema('s')

    with session.transaction() as tx, session.transaction() as tx2:
        b = tx.bucket(clean_bucket_name)
        # assert that there is only one schema in this bucket - pre rename
        assert [s.name for s in b.schemas()] == ['s']

        s = b.schema('s')
        s.rename('ss')

        # assert the schema was renamed in the transaction context
        # where it was renamed
        assert s.name == 'ss'
        with pytest.raises(errors.MissingSchema):
            tx.bucket(clean_bucket_name).schema('s')

        # assert that other transactions are isolated
        tx2.bucket(clean_bucket_name).schema('s')
        with pytest.raises(errors.MissingSchema):
            tx2.bucket(clean_bucket_name).schema('ss')

    # assert that new transactions see the updated schema name
    with session.transaction() as tx:
        b = tx.bucket(clean_bucket_name)
        with pytest.raises(errors.MissingSchema):
            b.schema('s')
        s = b.schema('ss')
        # assert that we still have only one schema and it is the one that was renamed
        assert [s.name for s in b.schemas()] == ['ss']
        s.drop()


def test_rename_table(session, clean_bucket_name):
    columns = pa.schema([
        ('a', pa.int16()),
        ('b', pa.float32()),
        ('s', pa.utf8()),
    ])
    with session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).create_schema('s')
        t = s.create_table('t', columns)

    with session.transaction() as tx, session.transaction() as tx2:
        s = tx.bucket(clean_bucket_name).schema('s')
        t = s.table('t')
        t.rename('t2')
        # assert that the new table name is seen in the context
        # in which it was renamed
        assert t.name == 't2'
        with pytest.raises(errors.MissingTable):
            s.table('t')
        t = s.table('t2')

        # assert that other transactions are isolated
        with pytest.raises(errors.MissingTable):
            tx2.bucket(clean_bucket_name).schema('s').table('t2')
        tx2.bucket(clean_bucket_name).schema('s').table('t')

    with session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).schema('s')
        # assert that new transactions see the change
        with pytest.raises(errors.MissingTable):
            s.table('t')
        t = s.table('t2')
        t.drop()
        s.drop()

def test_add_column(session, clean_bucket_name):
    columns = pa.schema([
        ('a', pa.int16()),
        ('b', pa.float32()),
        ('s', pa.utf8()),
    ])
    new_column = pa.field('aa', pa.int16())
    new_schema = columns.append(new_column)

    with session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).create_schema('s')
        s.create_table('t', columns)

    with session.transaction() as tx, session.transaction() as tx2:
        t = tx.bucket(clean_bucket_name).schema('s').table('t')
        assert t.arrow_schema == columns

        t.add_column(pa.schema([new_column]))
        # assert that the column is seen in the context
        # in which it was added
        assert t.arrow_schema == new_schema

        # assert that other transactions are isolated
        assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns


    with session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).schema('s')
        t = s.table('t')
        # assert that new transactions see the change
        assert t.arrow_schema == new_schema
        t.drop()
        s.drop()

def test_drop_column(session, clean_bucket_name):
    columns = pa.schema([
        ('a', pa.int16()),
        ('b', pa.float32()),
        ('s', pa.utf8()),
    ])
    field_idx = columns.get_field_index('a')
    new_schema = columns.remove(field_idx)
    column_to_drop = columns.field(field_idx)

    with session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).create_schema('s')
        s.create_table('t', columns)

    with session.transaction() as tx, session.transaction() as tx2:
        t = tx.bucket(clean_bucket_name).schema('s').table('t')
        assert t.arrow_schema == columns

        t.drop_column(pa.schema([column_to_drop]))
        # assert that the drop is seen in the context
        # in which the column was dropped
        assert t.arrow_schema == new_schema

        # assert that other transactions are isolated
        assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns


    with session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).schema('s')
        t = s.table('t')
        # assert that new transactions see the change
        assert t.arrow_schema == new_schema
        t.drop()
        s.drop()

def test_rename_column(session, clean_bucket_name):
    columns = pa.schema([
        ('a', pa.int16()),
        ('b', pa.float32()),
        ('s', pa.utf8()),
    ])
    def prepare_rename_column(schema: pa.Schema, old_name: str, new_name: str) -> pa.Schema:
        field_idx = schema.get_field_index(old_name)
        column_to_rename = schema.field(field_idx)
        renamed_column = column_to_rename.with_name(new_name)
        return schema.set(field_idx, renamed_column)

    new_schema = prepare_rename_column(columns, 'a', 'aaa')

    with session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).create_schema('s')
        s.create_table('t', columns)

    with session.transaction() as tx, session.transaction() as tx2:
        t = tx.bucket(clean_bucket_name).schema('s').table('t')
        assert t.arrow_schema == columns

        t.rename_column('a', 'aaa')
        # assert that the rename is seen in the context
        # in which it was performed
        assert t.arrow_schema == new_schema

        # assert that other transactions are isolated
        assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns

    # assert that new transactions see the change
    with session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).schema('s')
        t = s.table('t')

        assert t.arrow_schema == new_schema

    # simultaneous renames of the same column
    new_schema_tx1 = prepare_rename_column(new_schema, 'b', 'bb')
    new_schema_tx2 = prepare_rename_column(new_schema, 'b', 'bbb')
    with pytest.raises(errors.Conflict):
        with session.transaction() as tx1, session.transaction() as tx2:
            t1 = tx1.bucket(clean_bucket_name).schema('s').table('t')
            t2 = tx2.bucket(clean_bucket_name).schema('s').table('t')
            t1.rename_column('b', 'bb')
            with pytest.raises(HTTPError, match='409 Client Error: Conflict'):
                t2.rename_column('b', 'bbb')

    with session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).schema('s')
        t = s.table('t')
        # validate that the rename conflicted and rolled back
        assert (t.arrow_schema != new_schema_tx1) and \
               (t.arrow_schema != new_schema_tx2)

    with session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).schema('s')
        t = s.table('t')
        t.drop()
        s.drop()

def test_select_stop(session, clean_bucket_name):
    columns = pa.schema([
        ('a', pa.uint8()),
    ])

    rb = pa.record_batch(schema=columns, data=[
        list(range(256)),
    ])

    num_rows = 0
    with session.transaction() as tx:
        b = tx.bucket(clean_bucket_name)
        s = b.create_schema('s')
        t = s.create_table('t', columns)
        t.insert(rb)

        num_rows = 2**8

    ROWS_PER_GROUP = 2**16
    qc = QueryConfig(num_sub_splits=2, num_splits=4, num_row_groups_per_sub_split=1)
    with session.transaction() as tx:
        t = tx.bucket(clean_bucket_name).schema('s').table('t')
        t.refresh_stats()
        qc.data_endpoints = list(t.stats.endpoints) * 2

    # Duplicate the table until it is large enough to generate enough batches
    while num_rows < (qc.num_sub_splits * qc.num_splits) * ROWS_PER_GROUP:
        with session.transaction() as tx_read, session.transaction() as tx_write:
            t_read = tx_read.bucket(clean_bucket_name).schema('s').table('t')
            t_write = tx_write.bucket(clean_bucket_name).schema('s').table('t')
            for batch in t_read.select(['a'], config=qc):
                t_write.insert(batch)
        num_rows = num_rows * 2
        log.info("Num rows: %d", num_rows)

    # Validate the number of batches and the number of rows
    read_rows = 0
    read_batches = 0
    with session.transaction() as tx:
        t = tx.bucket(clean_bucket_name).schema('s').table('t')
        for batch in t.select(['a'], config=qc):
            read_batches += 1
            read_rows += len(batch)
    assert read_rows == num_rows
    # If this assert triggers, it just means that the test's assumptions about
    # how the tabular server splits the batches no longer hold, and the test
    # needs to be rewritten.
    assert read_batches == qc.num_splits * qc.num_sub_splits
    qc.query_id = str(random.randint(0, 2**32))
    log.info("query id is: %s", qc.query_id)
    def active_threads():
        log.debug("%s", [t.getName() for t in threading.enumerate() if t.is_alive()])
        return sum([1 if t.is_alive() and qc.query_id in t.getName() else 0 for t in threading.enumerate()])

    assert active_threads() == 0

    with session.transaction() as tx:
        t = tx.bucket(clean_bucket_name).schema('s').table('t')
        batches = iter(t.select(['a'], config=qc))
        next(batches)
        log.info("Active threads: %d", active_threads())
        try:
            assert active_threads() > 0
        finally:
            # If we don't delete the iterator, the threads will hang in a
            # zombie state.
            del batches

        # Check that all threads were killed
        log.info("Active threads: %d", active_threads())

        # validate that all query threads were killed.
        assert active_threads() == 0
vastdb/transaction.py
ADDED
@@ -0,0 +1,55 @@
"""VAST Database transaction.

A transaction is used as a context manager, since every database-related operation in VAST requires a transaction.

    with session.transaction() as tx:
        tx.bucket("bucket").create_schema("schema")
"""

from . import bucket, errors, session

import botocore

from dataclasses import dataclass
import logging


log = logging.getLogger(__name__)

@dataclass
class Transaction:
    """A holder of a single VAST transaction."""

    _rpc: "session.Session"
    txid: int = None

    def __enter__(self):
        """Create a transaction and store its ID."""
        response = self._rpc.api.begin_transaction()
        self.txid = int(response.headers['tabular-txid'])
        log.debug("opened txid=%016x", self.txid)
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """On success, the transaction is committed. Otherwise, it is rolled back."""
        if (exc_type, exc_value, exc_traceback) == (None, None, None):
            log.debug("committing txid=%016x", self.txid)
            self._rpc.api.commit_transaction(self.txid)
        else:
            log.exception("rolling back txid=%016x due to:", self.txid)
            self._rpc.api.rollback_transaction(self.txid)

    def __repr__(self):
        """Don't show the session details."""
        return f'Transaction(id=0x{self.txid:016x})'

    def bucket(self, name: str) -> "bucket.Bucket":
        """Return a VAST bucket, if it exists."""
        try:
            self._rpc.s3.head_bucket(Bucket=name)
        except botocore.exceptions.ClientError as e:
            log.warning("res: %s", e.response)
            if e.response['Error']['Code'] == '404':
                raise errors.MissingBucket(name)
            raise
        return bucket.Bucket(name, self)