vastdb 0.0.5.3__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (44)
  1. vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
  2. vast_flatbuf/tabular/VipRange.py +56 -0
  3. vastdb/__init__.py +7 -0
  4. vastdb/bench/test_perf.py +29 -0
  5. vastdb/bucket.py +85 -0
  6. vastdb/{tests/conftest.py → conftest.py} +29 -14
  7. vastdb/errors.py +175 -0
  8. vastdb/{api.py → internal_commands.py} +373 -875
  9. vastdb/schema.py +85 -0
  10. vastdb/session.py +47 -0
  11. vastdb/table.py +483 -0
  12. vastdb/tests/test_imports.py +123 -0
  13. vastdb/tests/test_nested.py +28 -0
  14. vastdb/tests/test_projections.py +42 -0
  15. vastdb/tests/test_sanity.py +34 -15
  16. vastdb/tests/test_schemas.py +30 -6
  17. vastdb/tests/test_tables.py +628 -13
  18. vastdb/tests/util.py +18 -0
  19. vastdb/transaction.py +54 -0
  20. vastdb/util.py +11 -10
  21. vastdb-0.1.1.dist-info/METADATA +38 -0
  22. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/RECORD +26 -31
  23. vast_protobuf/substrait/__init__.py +0 -0
  24. vast_protobuf/substrait/algebra_pb2.py +0 -1344
  25. vast_protobuf/substrait/capabilities_pb2.py +0 -46
  26. vast_protobuf/substrait/ddl_pb2.py +0 -57
  27. vast_protobuf/substrait/extended_expression_pb2.py +0 -49
  28. vast_protobuf/substrait/extensions/__init__.py +0 -0
  29. vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
  30. vast_protobuf/substrait/function_pb2.py +0 -168
  31. vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
  32. vast_protobuf/substrait/plan_pb2.py +0 -67
  33. vast_protobuf/substrait/type_expressions_pb2.py +0 -198
  34. vast_protobuf/substrait/type_pb2.py +0 -350
  35. vast_protobuf/tabular/__init__.py +0 -0
  36. vast_protobuf/tabular/rpc_pb2.py +0 -344
  37. vastdb/bench_scan.py +0 -45
  38. vastdb/tests/test_create_table_from_parquets.py +0 -50
  39. vastdb/v2.py +0 -360
  40. vastdb-0.0.5.3.dist-info/METADATA +0 -47
  41. {vast_protobuf → vastdb/bench}/__init__.py +0 -0
  42. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/LICENSE +0 -0
  43. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/WHEEL +0 -0
  44. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/top_level.txt +0 -0
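The file list sketches the shape of the rewrite: the monolithic `vastdb/api.py` becomes an internal module, the `v2.py` prototype and the bundled substrait/protobuf stubs are gone, and the public surface is split across `session.py`, `transaction.py`, `bucket.py`, `schema.py`, `table.py`, and `errors.py`. Pieced together from the tests below, 0.1.x usage looks roughly like this; the `vastdb.connect()` entry point, its argument names, and the endpoint/credential values are assumptions (the tests receive a ready-made `session` fixture from `conftest.py`), while the transaction/bucket/schema/table calls appear verbatim in the diff:

```python
import pyarrow as pa

import vastdb

# Assumed entry point and placeholder credentials -- the tests get a
# pre-built `session` fixture instead of constructing one inline.
session = vastdb.connect(endpoint='http://vip.example.com',
                         access='ACCESS_KEY', secret='SECRET_KEY')

with session.transaction() as tx:  # commits on success, rolls back on error
    schema = tx.bucket('my-bucket').create_schema('s')
    table = schema.create_table('t', pa.schema([('x', pa.int64())]))
    # insert() returns the row ids of the inserted rows
    row_ids = table.insert(pa.record_batch(schema=table.arrow_schema, data=[[1, 2, 3]]))
    # select() yields record batches; predicates use the table[...] syntax
    result = pa.Table.from_batches(table.select(predicate=(table['x'] > 1)))
```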
@@ -1,8 +1,333 @@
+ import datetime as dt
+ import decimal
+ import logging
+ import random
+ import threading
+ from contextlib import closing
+ from tempfile import NamedTemporaryFile
+
+ import duckdb
  import pyarrow as pa
+ import pyarrow.compute as pc
+ import pyarrow.parquet as pq
+ import pytest
+ from requests.exceptions import HTTPError
+
+ from .. import errors
+ from ..table import INTERNAL_ROW_ID, QueryConfig
+ from .util import prepare_data
+
+ log = logging.getLogger(__name__)
+
+
+ def test_tables(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int64()),
+         ('b', pa.float32()),
+         ('s', pa.utf8()),
+     ])
+     expected = pa.table(schema=columns, data=[
+         [111, 222, 333],
+         [0.5, 1.5, 2.5],
+         ['a', 'bb', 'ccc'],
+     ])
+     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's']))
+         assert actual == expected
+
+         actual = pa.Table.from_batches(t.select())
+         assert actual == expected
+
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
+         assert actual == expected.select(['a', 'b'])
+
+         actual = pa.Table.from_batches(t.select(columns=['b', 's', 'a']))
+         assert actual == expected.select(['b', 's', 'a'])
+
+         actual = pa.Table.from_batches(t.select(columns=['s']))
+         assert actual == expected.select(['s'])
+
+         actual = pa.Table.from_batches(t.select(columns=[]))
+         assert actual == expected.select([])
+
+         actual = pa.Table.from_batches(t.select(columns=['s'], internal_row_id=True))
+         log.debug("actual=%s", actual)
+         assert actual.to_pydict() == {
+             's': ['a', 'bb', 'ccc'],
+             INTERNAL_ROW_ID: [0, 1, 2]
+         }
+
+         columns_to_delete = pa.schema([(INTERNAL_ROW_ID, pa.uint64())])
+         rb = pa.record_batch(schema=columns_to_delete, data=[[0]])  # delete row 0
+         t.delete(rb)
+
+         selected_rows = pa.Table.from_batches(t.select(columns=['b'], predicate=(t['a'] == 222), internal_row_id=True))
+         t.delete(selected_rows)
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's']))
+         assert actual.to_pydict() == {
+             'a': [333],
+             'b': [2.5],
+             's': ['ccc']
+         }
+
+
+ def test_exists(session, clean_bucket_name):
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).create_schema('s1')
+         assert s.tables() == []
+
+         t = s.create_table('t', pa.schema([('x', pa.int64())]))
+
+         assert s.tables() == [t]
+         with pytest.raises(errors.TableExists):
+             s.create_table('t', pa.schema([('x', pa.int64())]))
+
+         assert s.tables() == [t]
+         assert s.create_table('t', pa.schema([('x', pa.int64())]), fail_if_exists=False) == t
+         assert s.tables() == [t]
+         assert s.create_table('t', pa.schema([('y', pa.int64())]), fail_if_exists=False) == t
+         assert s.tables() == [t]
+         assert s.create_table('t', pa.schema([('x', pa.int64())]), fail_if_exists=False) == t
+         assert s.tables() == [t]
+
+
+ def test_update_table(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int64()),
+         ('b', pa.float32()),
+         ('s', pa.utf8()),
+     ])
+     expected = pa.table(schema=columns, data=[
+         [111, 222, 333],
+         [0.5, 1.5, 2.5],
+         ['a', 'bb', 'ccc'],
+     ])
+     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+         columns_to_update = pa.schema([
+             (INTERNAL_ROW_ID, pa.uint64()),
+             ('a', pa.int64())
+         ])
+
+         rb = pa.record_batch(schema=columns_to_update, data=[
+             [0, 2],  # update rows 0,2
+             [1110, 3330]
+         ])
+
+         t.update(rb)
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
+         assert actual.to_pydict() == {
+             'a': [1110, 222, 3330],
+             'b': [0.5, 1.5, 2.5]
+         }
+
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b'], predicate=(t['a'] < 1000), internal_row_id=True))
+         column_index = actual.column_names.index('a')
+         column_field = actual.field(column_index)
+         new_data = pc.add(actual.column('a'), 2000)
+         update_table = actual.set_column(column_index, column_field, new_data)
+
+         t.update(update_table, columns=['a'])
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
+         assert actual.to_pydict() == {
+             'a': [1110, 2222, 3330],
+             'b': [0.5, 1.5, 2.5]
+         }
+
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b'], predicate=(t['a'] != 2222), internal_row_id=True))
+         column_index = actual.column_names.index('a')
+         column_field = actual.field(column_index)
+         new_data = pc.divide(actual.column('a'), 10)
+         update_table = actual.set_column(column_index, column_field, new_data)
+
+         t.update(update_table.to_batches()[0], columns=['a'])
+         actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
+         assert actual.to_pydict() == {
+             'a': [111, 2222, 333],
+             'b': [0.5, 1.5, 2.5]
+         }
+
+
+ def test_select_with_multisplits(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int32())
+     ])
+
+     data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+     data = data * 1000
+     expected = pa.table(schema=columns, data=[data])
+
+     config = QueryConfig()
+     config.rows_per_split = 1000
+
+     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+         actual = pa.Table.from_batches(t.select(columns=['a'], config=config))
+         assert actual == expected
+
+
+ def test_types(session, clean_bucket_name):
+     columns = pa.schema([
+         ('tb', pa.bool_()),
+         ('a1', pa.int8()),
+         ('a2', pa.int16()),
+         ('a4', pa.int64()),
+         ('b', pa.float32()),
+         ('s', pa.string()),
+         ('d', pa.decimal128(7, 3)),
+         ('bin', pa.binary()),
+         ('date', pa.date32()),
+         ('t0', pa.time32('s')),
+         ('t3', pa.time32('ms')),
+         ('t6', pa.time64('us')),
+         ('t9', pa.time64('ns')),
+         ('ts0', pa.timestamp('s')),
+         ('ts3', pa.timestamp('ms')),
+         ('ts6', pa.timestamp('us')),
+         ('ts9', pa.timestamp('ns')),
+     ])
+
+     expected = pa.table(schema=columns, data=[
+         [True, True, False],
+         [1, 2, 4],
+         [1999, 2000, 2001],
+         [11122221, 222111122, 333333],
+         [0.5, 1.5, 2.5],
+         ["a", "v", "s"],
+         [decimal.Decimal('110.52'), decimal.Decimal('231.15'), decimal.Decimal('3332.44')],
+         [b"\x01\x02", b"\x01\x05", b"\x01\x07"],
+         [dt.date(2024, 4, 10), dt.date(2024, 4, 11), dt.date(2024, 4, 12)],
+         [dt.time(12, 34, 56), dt.time(12, 34, 57), dt.time(12, 34, 58)],
+         [dt.time(12, 34, 56, 789000), dt.time(12, 34, 57, 789000), dt.time(12, 34, 58, 789000)],
+         [dt.time(12, 34, 56, 789789), dt.time(12, 34, 57, 789789), dt.time(12, 34, 58, 789789)],
+         [dt.time(12, 34, 56, 789789), dt.time(12, 34, 57, 789789), dt.time(12, 34, 58, 789789)],
+         [dt.datetime(2024, 4, 10, 12, 34, 56), dt.datetime(2025, 4, 10, 12, 34, 56), dt.datetime(2026, 4, 10, 12, 34, 56)],
+         [dt.datetime(2024, 4, 10, 12, 34, 56, 789000), dt.datetime(2025, 4, 10, 12, 34, 56, 789000), dt.datetime(2026, 4, 10, 12, 34, 56, 789000)],
+         [dt.datetime(2024, 4, 10, 12, 34, 56, 789789), dt.datetime(2025, 4, 10, 12, 34, 56, 789789), dt.datetime(2026, 4, 10, 12, 34, 56, 789789)],
+         [dt.datetime(2024, 4, 10, 12, 34, 56, 789789), dt.datetime(2025, 4, 10, 12, 34, 56, 789789), dt.datetime(2026, 4, 10, 12, 34, 56, 789789)],
+     ])
+
+     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+         def select(predicate):
+             return pa.Table.from_batches(t.select(predicate=predicate))
+
+         assert select(None) == expected
+         assert select(t['tb'] == False) == expected.filter(pc.field('tb') == False)  # noqa: E712
+         assert select(t['a1'] == 2) == expected.filter(pc.field('a1') == 2)
+         assert select(t['a2'] == 2000) == expected.filter(pc.field('a2') == 2000)
+         assert select(t['a4'] == 222111122) == expected.filter(pc.field('a4') == 222111122)
+         assert select(t['b'] == 1.5) == expected.filter(pc.field('b') == 1.5)
+         assert select(t['s'] == "v") == expected.filter(pc.field('s') == "v")
+         assert select(t['d'] == 231.15) == expected.filter(pc.field('d') == 231.15)
+         assert select(t['bin'] == b"\x01\x02") == expected.filter(pc.field('bin') == b"\x01\x02")
+
+         date_literal = dt.date(2024, 4, 10)
+         assert select(t['date'] == date_literal) == expected.filter(pc.field('date') == date_literal)
+
+         time_literal = dt.time(12, 34, 56)
+         assert select(t['t0'] == time_literal) == expected.filter(pc.field('t0') == time_literal)
+
+         time_literal = dt.time(12, 34, 56, 789000)
+         assert select(t['t3'] == time_literal) == expected.filter(pc.field('t3') == time_literal)
+
+         time_literal = dt.time(12, 34, 56, 789789)
+         assert select(t['t6'] == time_literal) == expected.filter(pc.field('t6') == time_literal)
+
+         time_literal = dt.time(12, 34, 56, 789789)
+         assert select(t['t9'] == time_literal) == expected.filter(pc.field('t9') == time_literal)
+
+         ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56)
+         assert select(t['ts0'] == ts_literal) == expected.filter(pc.field('ts0') == ts_literal)
+
+         ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56, 789000)
+         assert select(t['ts3'] == ts_literal) == expected.filter(pc.field('ts3') == ts_literal)
+
+         ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56, 789789)
+         assert select(t['ts6'] == ts_literal) == expected.filter(pc.field('ts6') == ts_literal)
+
+         ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56, 789789)
+         assert select(t['ts9'] == ts_literal) == expected.filter(pc.field('ts9') == ts_literal)
+
+
+ def test_filters(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int32()),
+         ('b', pa.float64()),
+         ('s', pa.utf8()),
+     ])
+
+     expected = pa.table(schema=columns, data=[
+         [111, 222, 333, 444, 555],
+         [0.5, 1.5, 2.5, 3.5, 4.5],
+         ['a', 'bb', 'ccc', None, 'xyz'],
+     ])
+
+     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+         def select(predicate):
+             return pa.Table.from_batches(t.select(predicate=predicate))
+
+         assert select(None) == expected
+
+         assert select(t['a'] > 222) == expected.filter(pc.field('a') > 222)
+         assert select(t['a'] < 222) == expected.filter(pc.field('a') < 222)
+         assert select(t['a'] == 222) == expected.filter(pc.field('a') == 222)
+         assert select(t['a'] != 222) == expected.filter(pc.field('a') != 222)
+         assert select(t['a'] <= 222) == expected.filter(pc.field('a') <= 222)
+         assert select(t['a'] >= 222) == expected.filter(pc.field('a') >= 222)
+
+         assert select(t['b'] > 1.5) == expected.filter(pc.field('b') > 1.5)
+         assert select(t['b'] < 1.5) == expected.filter(pc.field('b') < 1.5)
+         assert select(t['b'] == 1.5) == expected.filter(pc.field('b') == 1.5)
+         assert select(t['b'] != 1.5) == expected.filter(pc.field('b') != 1.5)
+         assert select(t['b'] <= 1.5) == expected.filter(pc.field('b') <= 1.5)
+         assert select(t['b'] >= 1.5) == expected.filter(pc.field('b') >= 1.5)
+
+         assert select(t['s'] > 'bb') == expected.filter(pc.field('s') > 'bb')
+         assert select(t['s'] < 'bb') == expected.filter(pc.field('s') < 'bb')
+         assert select(t['s'] == 'bb') == expected.filter(pc.field('s') == 'bb')
+         assert select(t['s'] != 'bb') == expected.filter(pc.field('s') != 'bb')
+         assert select(t['s'] <= 'bb') == expected.filter(pc.field('s') <= 'bb')
+         assert select(t['s'] >= 'bb') == expected.filter(pc.field('s') >= 'bb')
+
+         assert select((t['a'] > 111) & (t['b'] > 0) & (t['s'] < 'ccc')) == expected.filter((pc.field('a') > 111) & (pc.field('b') > 0) & (pc.field('s') < 'ccc'))
+         assert select((t['a'] > 111) & (t['b'] < 2.5)) == expected.filter((pc.field('a') > 111) & (pc.field('b') < 2.5))
+         assert select((t['a'] > 111) & (t['a'] < 333)) == expected.filter((pc.field('a') > 111) & (pc.field('a') < 333))
+
+         assert select((t['a'] > 111) | (t['a'] < 333)) == expected.filter((pc.field('a') > 111) | (pc.field('a') < 333))
+         assert select(((t['a'] > 111) | (t['a'] < 333)) & (t['b'] < 2.5)) == expected.filter(((pc.field('a') > 111) | (pc.field('a') < 333)) & (pc.field('b') < 2.5))
+         with pytest.raises(NotImplementedError):
+             assert select((t['a'] > 111) | (t['b'] > 0) | (t['s'] < 'ccc')) == expected.filter((pc.field('a') > 111) | (pc.field('b') > 0) | (pc.field('s') < 'ccc'))
+         assert select((t['a'] > 111) | (t['a'] < 333) | (t['a'] == 777)) == expected.filter((pc.field('a') > 111) | (pc.field('a') < 333) | (pc.field('a') == 777))
+
+         assert select(t['s'].isnull()) == expected.filter(pc.field('s').is_null())
+         assert select((t['s'].isnull()) | (t['s'] == 'bb')) == expected.filter((pc.field('s').is_null()) | (pc.field('s') == 'bb'))
+         assert select((t['s'].isnull()) & (t['b'] == 3.5)) == expected.filter((pc.field('s').is_null()) & (pc.field('b') == 3.5))
+
+         assert select(~t['s'].isnull()) == expected.filter(~pc.field('s').is_null())
+         assert select(t['s'].contains('b')) == expected.filter(pc.field('s') == 'bb')
+         assert select(t['s'].contains('y')) == expected.filter(pc.field('s') == 'xyz')
+
+
+ def test_duckdb(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int32()),
+         ('b', pa.float64()),
+     ])
+     data = pa.table(schema=columns, data=[
+         [111, 222, 333],
+         [0.5, 1.5, 2.5],
+     ])
+     with prepare_data(session, clean_bucket_name, 's', 't', data) as t:
+         conn = duckdb.connect()
+         batches = t.select(columns=['a'], predicate=(t['b'] < 2))  # noqa: F841
+         actual = conn.execute('SELECT max(a) as "a_max" FROM batches').arrow()
+         expected = (data
+                     .filter(pc.field('b') < 2)
+                     .group_by([])
+                     .aggregate([('a', 'max')]))
+         assert actual == expected
 
 
- def test_tables(rpc, clean_bucket_name):
-     with rpc.transaction() as tx:
+ def test_parquet_export(session, clean_bucket_name):
+     with session.transaction() as tx:
          s = tx.bucket(clean_bucket_name).create_schema('s1')
          columns = pa.schema([
              ('a', pa.int16()),
@@ -19,22 +344,312 @@ def test_tables(rpc, clean_bucket_name):
              ['a', 'b'],
          ])
          expected = pa.Table.from_batches([rb])
-         t.insert(rb)
-
-         actual = pa.Table.from_batches(t.select(columns=['a', 'b', 's']))
+         rb = t.insert(rb)
+         assert rb.to_pylist() == [0, 1]
+         actual = pa.Table.from_batches(t.select())
          assert actual == expected
 
-         actual = pa.Table.from_batches(t.select(columns=['a', 'b']))
-         assert actual == expected.select(['a', 'b'])
+         table_batches = t.select()
 
-         actual = pa.Table.from_batches(t.select(columns=['b', 's', 'a']))
-         assert actual == expected.select(['b', 's', 'a'])
+         with NamedTemporaryFile() as parquet_file:
+             log.info("Writing table into parquet file: '%s'", parquet_file.name)
+             with closing(pq.ParquetWriter(parquet_file.name, table_batches.schema)) as parquet_writer:
+                 for batch in table_batches:
+                     parquet_writer.write_batch(batch)
 
-         actual = pa.Table.from_batches(t.select(columns=['s']))
-         assert actual == expected.select(['s'])
+             assert expected == pq.read_table(parquet_file.name)
+
+
363
+ with pytest.raises(errors.MissingSchema):
364
+ with session.transaction() as tx:
365
+ tx.bucket(clean_bucket_name).schema('s1')
366
+
367
+ with pytest.raises(errors.MissingBucket):
368
+ with session.transaction() as tx:
369
+ tx.bucket("bla")
370
+
371
+ with pytest.raises(errors.Conflict):
372
+ with session.transaction() as tx:
373
+ b = tx.bucket(clean_bucket_name)
374
+ s = b.create_schema('s1')
375
+ columns = pa.schema([
376
+ ('a', pa.int16()),
377
+ ('b', pa.float32()),
378
+ ('s', pa.utf8()),
379
+ ])
380
+ s.create_table('t1', columns)
381
+ s.drop() # cannot drop schema without dropping its tables first
382
+
383
+ def test_rename_schema(session, clean_bucket_name):
384
+
385
+ with session.transaction() as tx:
386
+ s = tx.bucket(clean_bucket_name).create_schema('s')
387
+
388
+ with session.transaction() as tx, session.transaction() as tx2:
389
+ b = tx.bucket(clean_bucket_name)
390
+ # assert that there is only one schema in this bucket - pre rename
391
+ assert [s.name for s in b.schemas()] == ['s']
392
+
393
+ s = b.schema('s')
394
+ s.rename('ss')
395
+
396
+ # assert the table was renamed in the transaction context
397
+ # where it was renamed
398
+ assert s.name == 'ss'
399
+ with pytest.raises(errors.MissingSchema):
400
+ tx.bucket(clean_bucket_name).schema('s')
401
+
402
+ # assert that other transactions are isolated
403
+ tx2.bucket(clean_bucket_name).schema('s')
404
+ with pytest.raises(errors.MissingSchema):
405
+ tx2.bucket(clean_bucket_name).schema('ss')
406
+
407
+ # assert that new transactions see the updated schema name
408
+ with session.transaction() as tx:
409
+ b = tx.bucket(clean_bucket_name)
410
+ with pytest.raises(errors.MissingSchema):
411
+ b.schema('s')
412
+ s = b.schema('ss')
413
+ # assert that we still have only one schema and it is the one that was renamed
414
+ assert [s.name for s in b.schemas()] == ['ss']
415
+ s.drop()
35
416
 
36
- actual = pa.Table.from_batches(t.select(columns=[]))
37
- assert actual == expected.select([])
38
417
 
418
+ def test_rename_table(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int16()),
+         ('b', pa.float32()),
+         ('s', pa.utf8()),
+     ])
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).create_schema('s')
+         t = s.create_table('t', columns)
+
+     with session.transaction() as tx, session.transaction() as tx2:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         t = s.table('t')
+         t.rename('t2')
+         # assert that the new table name is seen in the context
+         # in which it was renamed
+         assert t.name == 't2'
+         with pytest.raises(errors.MissingTable):
+             s.table('t')
+         t = s.table('t2')
+
+         # assert that other transactions are isolated
+         with pytest.raises(errors.MissingTable):
+             tx2.bucket(clean_bucket_name).schema('s').table('t2')
+         tx2.bucket(clean_bucket_name).schema('s').table('t')
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         # assert that new transactions see the change
+         with pytest.raises(errors.MissingTable):
+             s.table('t')
+         t = s.table('t2')
          t.drop()
          s.drop()
+
+
+ def test_add_column(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int16()),
+         ('b', pa.float32()),
+         ('s', pa.utf8()),
+     ])
+     new_column = pa.field('aa', pa.int16())
+     new_schema = columns.append(new_column)
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).create_schema('s')
+         s.create_table('t', columns)
+
+     with session.transaction() as tx, session.transaction() as tx2:
+         t = tx.bucket(clean_bucket_name).schema('s').table('t')
+         assert t.arrow_schema == columns
+
+         t.add_column(pa.schema([new_column]))
+         # assert that the column is seen in the context
+         # in which it was added
+         assert t.arrow_schema == new_schema
+
+         # assert that other transactions are isolated
+         assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         t = s.table('t')
+         # assert that new transactions see the change
+         assert t.arrow_schema == new_schema
+         t.drop()
+         s.drop()
+
+
+ def test_drop_column(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int16()),
+         ('b', pa.float32()),
+         ('s', pa.utf8()),
+     ])
+     field_idx = columns.get_field_index('a')
+     new_schema = columns.remove(field_idx)
+     column_to_drop = columns.field(field_idx)
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).create_schema('s')
+         s.create_table('t', columns)
+
+     with session.transaction() as tx, session.transaction() as tx2:
+         t = tx.bucket(clean_bucket_name).schema('s').table('t')
+         assert t.arrow_schema == columns
+
+         t.drop_column(pa.schema([column_to_drop]))
+         # assert that the dropped column is gone in the context
+         # in which it was dropped
+         assert t.arrow_schema == new_schema
+
+         # assert that other transactions are isolated
+         assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         t = s.table('t')
+         # assert that new transactions see the change
+         assert t.arrow_schema == new_schema
+         t.drop()
+         s.drop()
+
+
+ def test_rename_column(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.int16()),
+         ('b', pa.float32()),
+         ('s', pa.utf8()),
+     ])
+
+     def prepare_rename_column(schema: pa.Schema, old_name: str, new_name: str) -> pa.Schema:
+         field_idx = schema.get_field_index(old_name)
+         column_to_rename = schema.field(field_idx)
+         renamed_column = column_to_rename.with_name(new_name)
+         return schema.set(field_idx, renamed_column)
+
+     new_schema = prepare_rename_column(columns, 'a', 'aaa')
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).create_schema('s')
+         s.create_table('t', columns)
+
+     with session.transaction() as tx, session.transaction() as tx2:
+         t = tx.bucket(clean_bucket_name).schema('s').table('t')
+         assert t.arrow_schema == columns
+
+         t.rename_column('a', 'aaa')
+         # assert that the new column name is seen in the context
+         # in which it was renamed
+         assert t.arrow_schema == new_schema
+
+         # assert that other transactions are isolated
+         assert tx2.bucket(clean_bucket_name).schema('s').table('t').arrow_schema == columns
+
+     # assert that new transactions see the change
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         t = s.table('t')
+
+         assert t.arrow_schema == new_schema
+
+     # simultaneous renames of the same column
+     new_schema_tx1 = prepare_rename_column(new_schema, 'b', 'bb')
+     new_schema_tx2 = prepare_rename_column(new_schema, 'b', 'bbb')
+     with pytest.raises(errors.Conflict):
+         with session.transaction() as tx1, session.transaction() as tx2:
+             t1 = tx1.bucket(clean_bucket_name).schema('s').table('t')
+             t2 = tx2.bucket(clean_bucket_name).schema('s').table('t')
+             t1.rename_column('b', 'bb')
+             with pytest.raises(HTTPError, match='409 Client Error: Conflict'):
+                 t2.rename_column('b', 'bbb')
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         t = s.table('t')
+         # validate that the rename conflicted and was rolled back
+         assert (t.arrow_schema != new_schema_tx1) and \
+                (t.arrow_schema != new_schema_tx2)
+
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).schema('s')
+         t = s.table('t')
+         t.drop()
+         s.drop()
+
+
+ def test_select_stop(session, clean_bucket_name):
+     columns = pa.schema([
+         ('a', pa.uint8()),
+     ])
+
+     rb = pa.record_batch(schema=columns, data=[
+         list(range(256)),
+     ])
+
+     num_rows = 0
+     with session.transaction() as tx:
+         b = tx.bucket(clean_bucket_name)
+         s = b.create_schema('s')
+         t = s.create_table('t', columns)
+         t.insert(rb)
+
+     num_rows = 2**8
+
+     ROWS_PER_GROUP = 2**16
+     qc = QueryConfig(num_sub_splits=2, num_splits=4, num_row_groups_per_sub_split=1)
+     with session.transaction() as tx:
+         t = tx.bucket(clean_bucket_name).schema('s').table('t')
+         t.refresh_stats()
+         qc.data_endpoints = list(t.stats.endpoints) * 2
+
+     # Duplicate the table until it is large enough to generate enough batches
+     while num_rows < (qc.num_sub_splits * qc.num_splits) * ROWS_PER_GROUP:
+         with session.transaction() as tx_read, session.transaction() as tx_write:
+             t_read = tx_read.bucket(clean_bucket_name).schema('s').table('t')
+             t_write = tx_write.bucket(clean_bucket_name).schema('s').table('t')
+             for batch in t_read.select(['a'], config=qc):
+                 t_write.insert(batch)
+             num_rows = num_rows * 2
+             log.info("Num rows: %d", num_rows)
+
+     # Validate the number of batches and the number of rows
+     read_rows = 0
+     read_batches = 0
+     with session.transaction() as tx:
+         t = tx.bucket(clean_bucket_name).schema('s').table('t')
+         for batch in t.select(['a'], config=qc):
+             read_batches += 1
+             read_rows += len(batch)
+     assert read_rows == num_rows
+     # If this assert fires, the test's assumptions about how the tabular
+     # server splits the batches no longer hold, and the test needs to be
+     # rewritten.
+     assert read_batches == qc.num_splits * qc.num_sub_splits
+
+     qc.query_id = str(random.randint(0, 2**32))
+     log.info("query id is: %s", qc.query_id)
+
+     def active_threads():
+         log.debug("%s", [t.getName() for t in threading.enumerate() if t.is_alive()])
+         return sum([1 if t.is_alive() and qc.query_id in t.getName() else 0 for t in threading.enumerate()])
+
+     assert active_threads() == 0
+
+     with session.transaction() as tx:
+         t = tx.bucket(clean_bucket_name).schema('s').table('t')
+         batches = iter(t.select(['a'], config=qc))
+         next(batches)
+         log.info("Active threads: %d", active_threads())
+         try:
+             assert active_threads() > 0
+         finally:
+             # If we don't delete the iterator, the threads will hang in a
+             # zombie state.
+             del batches
+
+     # Check that all threads were killed
+     log.info("Active threads: %d", active_threads())
+
+     # validate that all query threads were killed.
+     assert active_threads() == 0
vastdb/tests/util.py ADDED
@@ -0,0 +1,18 @@
+ import logging
+ from contextlib import contextmanager
+
+ log = logging.getLogger(__name__)
+
+
+ @contextmanager
+ def prepare_data(session, clean_bucket_name, schema_name, table_name, arrow_table):
+     with session.transaction() as tx:
+         s = tx.bucket(clean_bucket_name).create_schema(schema_name)
+         t = s.create_table(table_name, arrow_table.schema)
+         row_ids_array = t.insert(arrow_table)
+         row_ids = row_ids_array.to_pylist()
+         log.debug("row_ids=%s", row_ids)
+         assert row_ids == list(range(arrow_table.num_rows))
+         yield t
+         t.drop()
+         s.drop()
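For reference, the tests above consume this helper as a context manager; a trimmed example, with names as in `test_tables` (`session` and `clean_bucket_name` come from the `conftest.py` fixtures):

```python
# Creates schema 's' and table 't' in a fresh transaction, inserts
# `expected` (validating the returned row ids), yields the table to the
# test body, and drops the table and schema on exit.
with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
    actual = pa.Table.from_batches(t.select())
    assert actual == expected
```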