vastdb 1.3.11__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vastdb/_ibis_support.py +28 -0
- vastdb/_internal.py +167 -180
- vastdb/_table_interface.py +136 -0
- vastdb/bench/perf_bench/orchestrate/results_helpers.py +1 -1
- vastdb/bucket.py +1 -1
- vastdb/conftest.py +42 -19
- vastdb/schema.py +15 -3
- vastdb/session.py +3 -1
- vastdb/table.py +599 -339
- vastdb/table_metadata.py +221 -0
- vastdb/tests/test_duckdb.py +30 -30
- vastdb/tests/test_fixed_list.py +56 -6
- vastdb/tests/test_imports.py +2 -1
- vastdb/tests/test_nested.py +0 -5
- vastdb/tests/test_table_in_tx.py +249 -0
- vastdb/tests/test_tables.py +63 -16
- vastdb/tests/util.py +109 -2
- vastdb/transaction.py +27 -0
- {vastdb-1.3.11.dist-info → vastdb-2.0.0.dist-info}/METADATA +21 -6
- {vastdb-1.3.11.dist-info → vastdb-2.0.0.dist-info}/RECORD +23 -19
- {vastdb-1.3.11.dist-info → vastdb-2.0.0.dist-info}/WHEEL +1 -1
- {vastdb-1.3.11.dist-info → vastdb-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {vastdb-1.3.11.dist-info → vastdb-2.0.0.dist-info}/top_level.txt +0 -0
vastdb/table_metadata.py
ADDED
@@ -0,0 +1,221 @@
+"""VAST Database table metadata."""
+
+import logging
+from copy import deepcopy
+from dataclasses import dataclass
+from enum import Enum
+from typing import TYPE_CHECKING, Optional, Tuple
+
+import ibis
+import pyarrow as pa
+
+from vastdb import errors
+from vastdb._ibis_support import validate_ibis_support_schema
+
+if TYPE_CHECKING:
+    from .transaction import Transaction
+
+log = logging.getLogger(__name__)
+
+
+class TableType(Enum):
+    """Table Type."""
+
+    Regular = 1
+    Elysium = 2
+    TableImports = 3
+
+
+@dataclass
+class TableRef:
+    """Represents a table ref (table's full path)."""
+
+    bucket: str
+    schema: str
+    table: str
+
+    @property
+    def full_path(self) -> str:
+        """Table full path."""
+        return f"{self.bucket}/{self.schema}/{self.table}"
+
+    def __str__(self) -> str:
+        """Table full path."""
+        return self.full_path
+
+
+@dataclass
+class TableStats:
+    """Table-related information."""
+
+    num_rows: int
+    size_in_bytes: int
+    sorting_score: int
+    write_amplification: int
+    acummulative_row_inserition_count: int
+    is_external_rowid_alloc: bool = False
+    sorting_key_enabled: bool = False
+    sorting_done: bool = False
+    endpoints: Tuple[str, ...] = ()
+
+
+class TableMetadata:
+    """Table Metadata."""
+
+    _ref: TableRef
+    _arrow_schema: Optional[pa.Schema]
+    _sorted_columns: Optional[list[str]]
+    _ibis_table: ibis.Table
+    _stats: Optional[TableStats]
+
+    def __init__(self,
+                 ref: TableRef,
+                 arrow_schema: Optional[pa.Schema] = None,
+                 table_type: Optional[TableType] = None):
+        """Table Metadata."""
+        self._ref = deepcopy(ref)
+        self._table_type = table_type
+        self.arrow_schema = deepcopy(arrow_schema)
+        self._sorted_columns = None
+        self._stats = None
+
+    def __eq__(self, other: object) -> bool:
+        """TableMetadata Equal."""
+        if not isinstance(other, TableMetadata):
+            return False
+
+        return (self._ref == other._ref and
+                self._table_type == other._table_type)
+
+    def rename_table(self, name: str) -> None:
+        """Rename table metadata's table name."""
+        self._ref.table = name
+
+    def load(self, tx: "Transaction") -> None:
+        """Load/Reload table metadata."""
+        self.load_stats(tx)
+        self.load_schema(tx)
+
+        if self._table_type is TableType.Elysium:
+            self.load_sorted_columns(tx)
+
+    def load_schema(self, tx: "Transaction") -> None:
+        """Load/Reload table schema."""
+        fields = []
+        next_key = 0
+        while True:
+            cur_columns, next_key, is_truncated, _count = tx._rpc.api.list_columns(
+                bucket=self.ref.bucket,
+                schema=self.ref.schema,
+                table=self.ref.table,
+                next_key=next_key,
+                txid=tx.active_txid,
+                list_imports_table=self.is_imports_table)
+            fields.extend(cur_columns)
+            if not is_truncated:
+                break
+
+        self.arrow_schema = pa.schema(fields)
+
+    def load_sorted_columns(self, tx: "Transaction") -> None:
+        """Return sorted columns' metadata."""
+        fields = []
+        try:
+            next_key = 0
+            while True:
+                cur_columns, next_key, is_truncated, _count = tx._rpc.api.list_sorted_columns(
+                    bucket=self.ref.bucket, schema=self.ref.schema, table=self.ref.table,
+                    next_key=next_key, txid=tx.active_txid, list_imports_table=self.is_imports_table)
+                fields.extend(cur_columns)
+                if not is_truncated:
+                    break
+        except errors.BadRequest:
+            raise
+        except errors.InternalServerError as ise:
+            log.warning(
+                "Failed to get the sorted columns, Elysium might not be supported: %s", ise)
+            raise
+        except errors.NotSupportedVersion:
+            log.warning("Failed to get the sorted columns, Elysium not supported")
+            raise
+        finally:
+            self._sorted_columns = fields
+
+    def load_stats(self, tx: "Transaction") -> None:
+        """Load/Reload table stats."""
+        stats_tuple = tx._rpc.api.get_table_stats(
+            bucket=self.ref.bucket, schema=self.ref.schema, name=self.ref.table, txid=tx.active_txid,
+            imports_table_stats=self.is_imports_table)
+        self._stats = TableStats(**stats_tuple._asdict())
+
+        is_elysium_table = self._stats.sorting_key_enabled
+
+        if self._table_type is None:
+            if is_elysium_table:
+                self._set_sorted_table(tx)
+            else:
+                self._set_regular_table()
+        else:
+            if is_elysium_table and self.table_type is not TableType.Elysium:
+                raise ValueError(
+                    "Actual table is sorted (TableType.Elysium), was not inited as TableType.Elysium"
+                )
+
+    def _set_sorted_table(self, tx: "Transaction"):
+        self._table_type = TableType.Elysium
+        tx._rpc.features.check_elysium()
+
+    def _set_regular_table(self):
+        self._table_type = TableType.Regular
+
+    @property
+    def stats(self) -> Optional[TableStats]:
+        """Get table's stats."""
+        return self._stats
+
+    @property
+    def arrow_schema(self) -> pa.Schema:
+        """Table's arrow schema."""
+        return self._arrow_schema
+
+    @arrow_schema.setter
+    def arrow_schema(self, arrow_schema: Optional[pa.Schema]):
+        """Set arrow schema."""
+        if arrow_schema:
+            validate_ibis_support_schema(arrow_schema)
+            self._arrow_schema = arrow_schema
+            self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(arrow_schema), self._ref.full_path)
+        else:
+            self._arrow_schema = None
+            self._ibis_table = None
+
+    @property
+    def sorted_columns(self) -> list:
+        """Sorted columns."""
+        if self._sorted_columns is None:
+            raise ValueError("sorted columns not loaded")
+        return self._sorted_columns
+
+    @property
+    def ibis_table(self) -> ibis.Table:
+        """Ibis table."""
+        return self._ibis_table
+
+    @property
+    def ref(self) -> TableRef:
+        """Table's reference."""
+        return self._ref
+
+    @property
+    def table_type(self) -> TableType:
+        """Table's type."""
+        if self._table_type is None:
+            raise ValueError(
+                "TableType was not loaded. load using TableMetadata.load_stats")
+
+        return self._table_type
+
+    @property
+    def is_imports_table(self) -> bool:
+        """Is table an imports table."""
+        return self._table_type is TableType.TableImports
vastdb/tests/test_duckdb.py
CHANGED
@@ -3,9 +3,7 @@ import logging
 import duckdb
 import pyarrow as pa
 import pyarrow.compute as pc
-import pytest
 
-from ..table import QueryConfig
 from .util import prepare_data
 
 log = logging.getLogger(__name__)
@@ -31,31 +29,33 @@ def test_duckdb(session, clean_bucket_name):
     assert actual == expected
 
 
-def test_closed_tx(session, clean_bucket_name):
-    columns = pa.schema([
-        ('a', pa.int64()),
-    ])
-    data = pa.table(schema=columns, data=[
-        list(range(10000)),
-    ])
-
-    with session.transaction() as tx:
-        t = tx.bucket(clean_bucket_name).create_schema("s1").create_table("t1", columns)
-        t.insert(data)
-
-        config = QueryConfig(
-            num_sub_splits=1,
-            num_splits=1,
-            num_row_groups_per_sub_split=1,
-            limit_rows_per_sub_split=100)
-        batches = t.select(config=config)  # noqa: F841
-        first = next(batches)  # make sure that HTTP response processing has started
-        assert first['a'].to_pylist() == list(range(100))
-
-        conn = duckdb.connect()
-        res = conn.execute('SELECT a FROM batches')
-        log.debug("closing tx=%s after first batch=%s", t.tx, first)
-
-    # transaction is closed, collecting the result should fail internally in DuckDB
-    with pytest.raises(duckdb.InvalidInputException):
-        res.arrow()
+# def test_closed_tx(session, clean_bucket_name):
+#     assert duckdb.__version__ == "1.0.0", "doesn't reproduce with newer duckdb versions, when updating duckdb in tests/when relevant need to update this test accordingly."
+
+#     columns = pa.schema([
+#         ('a', pa.int64()),
+#     ])
+#     data = pa.table(schema=columns, data=[
+#         list(range(10000)),
+#     ])
+
+#     with session.transaction() as tx:
+#         t = tx.bucket(clean_bucket_name).create_schema("s1").create_table("t1", columns)
+#         t.insert(data)
+
+#         config = QueryConfig(
+#             num_sub_splits=1,
+#             num_splits=1,
+#             num_row_groups_per_sub_split=1,
+#             limit_rows_per_sub_split=100)
+#         batches = t.select(config=config)  # noqa: F841
+#         first = next(batches)  # make sure that HTTP response processing has started
+#         assert first['a'].to_pylist() == list(range(100))
+
+#         conn = duckdb.connect()
+#         res = conn.execute('SELECT a FROM batches')
+#         log.debug("closing tx=%s after first batch=%s", t.tx, first)
+
+#     # transaction is closed, collecting the result should fail internally in DuckDB
+#     with pytest.raises(duckdb.InvalidInputException):
+#         res.arrow()
vastdb/tests/test_fixed_list.py
CHANGED
@@ -11,7 +11,11 @@ import pytest
 
 import vastdb.errors
 
-from .util import
+from .util import (
+    assert_pandas_df_equal,
+    convert_pandas_df_to_hashable_values,
+    prepare_data,
+)
 
 supported_fixed_list_element_types = [
     pa.uint8(),
@@ -85,7 +89,7 @@ def test_vectors(session, clean_bucket_name):
     columns = pa.schema(
         [("id", pa.int64()), ("vec", pa.list_(pa.field(name="item", type=element_type, nullable=False), dimension),)]
     )
-    ids = range(num_rows)
+    ids = list(range(num_rows))
     expected = pa.table(
         schema=columns,
         data=[
@@ -102,7 +106,7 @@ def test_vectors(session, clean_bucket_name):
     assert actual == expected
 
     # Select by id.
-    select_id = random.
+    select_id = random.choice(ids)
     actual = t.select(predicate=(t["id"] == select_id)).read_all()
     assert actual.to_pydict()["vec"] == [[select_id] * dimension]
     assert actual == expected.filter(pc.field("id") == select_id)
@@ -221,24 +225,70 @@ def generate_random_pyarrow_value(
 
 @pytest.mark.parametrize("element_field", supported_fixed_list_element_fields)
 def test_fixed_list_type_values(session, clean_bucket_name, element_field):
-    list_size =
-    num_rows =
+    list_size = 250
+    num_rows = 100
 
     vec_type = pa.list_(element_field, list_size)
     schema = pa.schema(
         {"id": pa.int64(), "vec": vec_type, "random_int": pa.int64()})
+    ids = list(range(num_rows))
     expected = pa.table(
         schema=schema,
-        data=[
+        data=[ids] + [[generate_random_pyarrow_value(schema.field(col_name)) for _ in range(num_rows)]
                      for col_name in
                      schema.names[1:]],
     )
+    # Convert the list to tuple in order to support comparison as a whole.
+    pd_expected = convert_pandas_df_to_hashable_values(expected.to_pandas())
 
     with prepare_data(session, clean_bucket_name, "s", "t", expected) as table:
         assert table.arrow_schema == schema
         actual = table.select().read_all()
         assert actual == expected
 
+        # Select by id.
+        id_to_select = random.choice(ids)
+        select_by_id = table.select(predicate=(table["id"] == id_to_select)).read_all()
+        assert len(select_by_id) == 1  # ID is unique.
+        assert select_by_id == expected.filter(pc.field("id") == id_to_select)
+
+        # Choose a random vector which is not null. Nulls should not be selected using ==, != operators, but by isnull.
+        # In addition, nulls are discarded unless isnull is used (meaning != 1 will return both not nulls and not 1).
+        vector_to_select = random.choice(expected.filter(~pc.field('vec').is_null())['vec'].to_numpy())
+
+        # TODO VSDK-36: Remove this workaround when the issue with negative decimals in predicate is fixed.
+        if pa.types.is_decimal(element_field.type):
+            vector_to_select = abs(vector_to_select)
+
+        # Dtype is not asserted since pandas converts the dtype of integer to float when there are (or could be)
+        # NaN/None values.
+        # Select by vector value.
+        select_by_vector = table.select(predicate=(table["vec"] == vector_to_select)).read_all()
+        assert_pandas_df_equal(select_by_vector.to_pandas(),
+                               pd_expected.loc[pd_expected['vec'] == tuple(vector_to_select)], check_dtype=False)
+
+        # Not equal to vector value.
+        select_by_vector = table.select(predicate=(table["vec"] != vector_to_select)).read_all()
+        assert_pandas_df_equal(select_by_vector.to_pandas(),
+                               pd_expected.loc[(pd_expected['vec'] != tuple(vector_to_select)) &
+                                               pd_expected['vec'].notnull()], check_dtype=False)
+
+        # Not equal to vector value or null.
+        select_by_vector = table.select(
+            predicate=((table["vec"] != vector_to_select) | (table['vec'].isnull()))).read_all()
+        assert_pandas_df_equal(select_by_vector.to_pandas(),
+                               pd_expected.loc[pd_expected['vec'] != tuple(vector_to_select)], check_dtype=False)
+
+        # Lexicographically greater than vector.
+        select_by_vector = table.select(predicate=(table["vec"] > vector_to_select)).read_all()
+        assert_pandas_df_equal(select_by_vector.to_pandas(), pd_expected.loc[
+            pd_expected['vec'].notnull() & (pd_expected['vec'] > tuple(vector_to_select))], check_dtype=False)
+
+        # Lexicographically less than vector.
+        select_by_vector = table.select(predicate=(table["vec"] < vector_to_select)).read_all()
+        assert_pandas_df_equal(select_by_vector.to_pandas(), pd_expected.loc[
+            pd_expected['vec'].notnull() & (pd_expected['vec'] < tuple(vector_to_select))], check_dtype=False)
+
 
 @pytest.mark.parametrize("list_type", unsupported_fixed_list_types)
 def test_unsupported_fixed_list_types(session, clean_bucket_name, list_type):
vastdb/tests/test_imports.py
CHANGED
@@ -14,6 +14,7 @@ from vastdb.errors import (
     InvalidArgument,
     NotSupportedVersion,
 )
+from vastdb.session import Session
 
 log = logging.getLogger(__name__)
 
@@ -28,7 +29,7 @@ def zip_import_session(session):
         pytest.skip("Skipped because this test requires version 5.3.1")
 
 
-def test_parallel_imports(session, clean_bucket_name, s3):
+def test_parallel_imports(session: Session, clean_bucket_name: str, s3):
     num_rows = 1000
     num_files = 53
     ds = {'num': [i for i in range(num_rows)]}
vastdb/tests/test_nested.py
CHANGED
@@ -79,13 +79,11 @@ def test_nested_filter(session, clean_bucket_name):
 def test_nested_unsupported_filter(session, clean_bucket_name):
     columns = pa.schema([
         ('l', pa.list_(pa.int8())),
-        ('fl', pa.list_(pa.field(name='item', type=pa.int64(), nullable=False), 2)),
         ('m', pa.map_(pa.utf8(), pa.float64())),
         ('s', pa.struct([('x', pa.int16()), ('y', pa.int32())])),
     ])
     expected = pa.table(schema=columns, data=[
         [[1], [], [2, 3], None],
-        [[1, 2], None, [3, 4], None],
         [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
         [{'x': 1, 'y': None}, None, {'x': 2, 'y': 3}, {'x': None, 'y': 4}],
     ])
@@ -95,9 +93,6 @@ def test_nested_unsupported_filter(session, clean_bucket_name):
     with pytest.raises(NotImplementedError):
         list(t.select(predicate=(t['l'].isnull())))
 
-    with pytest.raises(NotImplementedError):
-        list(t.select(predicate=(t['fl'].isnull())))
-
     with pytest.raises(NotImplementedError):
         list(t.select(predicate=(t['m'].isnull())))
 
vastdb/tests/test_table_in_tx.py
ADDED
@@ -0,0 +1,249 @@
+from dataclasses import dataclass
+from typing import Generator, Optional
+
+import ibis
+import pyarrow as pa
+import pytest
+
+from vastdb.session import Session
+from vastdb.table import INTERNAL_ROW_ID, ITable
+from vastdb.table_metadata import TableMetadata, TableRef, TableType
+from vastdb.transaction import Transaction
+
+from .util import compare_pyarrow_tables, prepare_data_get_tx
+
+
+def test_sanity(session: Session, clean_bucket_name):
+    columns = pa.schema([
+        ('a', pa.int64()),
+        ('b', pa.float32()),
+        ('s', pa.utf8()),
+    ])
+    expected = pa.table(schema=columns, data=[
+        [111, 222, 333],
+        [0.5, 1.5, 2.5],
+        ['a', 'bb', 'ccc'],
+    ])
+    with prepare_data_get_tx(session, clean_bucket_name, 's', 't', expected) as tx:
+        ref = TableRef(clean_bucket_name, 's', 't')
+        table_md = TableMetadata(ref, columns, TableType.Regular)
+
+        table_md.load_stats(tx)
+
+        t = tx.table_from_metadata(table_md)
+
+        actual = t.select(columns=['a', 'b', 's']).read_all()
+        assert actual == expected
+
+
+@dataclass
+class SimpleDbSetup:
+    tx: Transaction
+    ref: TableRef
+    table_type: TableType
+    arrow_schema: Optional[pa.Schema] = None
+
+
+@pytest.fixture(scope="function")
+def simple_db_setup(session: Session, clean_bucket_name: str) -> Generator[SimpleDbSetup, None, None]:
+    arrow_schema = pa.schema([
+        ('a', pa.int64()),
+        ('b', pa.float32()),
+        ('s', pa.utf8()),
+    ])
+    expected = pa.table(schema=arrow_schema, data=[
+        [111, 222, 333],
+        [0.5, 1.5, 2.5],
+        ['a', 'bb', 'ccc'],
+    ])
+    with prepare_data_get_tx(session, clean_bucket_name, 's', 't', expected) as tx:
+        yield SimpleDbSetup(tx=tx,
+                            arrow_schema=arrow_schema,
+                            ref=TableRef(clean_bucket_name, 's', 't'),
+                            table_type=TableType.Regular)
+
+
+def test_schema_load_through_metadata(simple_db_setup: SimpleDbSetup):
+    table_md = TableMetadata(simple_db_setup.ref,
+                             table_type=simple_db_setup.table_type)
+
+    table = simple_db_setup.tx.table_from_metadata(table_md)
+    assert table.arrow_schema is None
+    table.reload_schema()
+    assert table.arrow_schema is not None
+
+
+def test_metadata_init_with_schema(simple_db_setup: SimpleDbSetup):
+    table_md = TableMetadata(ref=simple_db_setup.ref,
+                             arrow_schema=simple_db_setup.arrow_schema,
+                             table_type=simple_db_setup.table_type)
+
+    table = simple_db_setup.tx.table_from_metadata(table_md)
+    assert table.arrow_schema is not None
+
+
+def test_path(simple_db_setup: SimpleDbSetup):
+    table_md = TableMetadata(ref=simple_db_setup.ref,
+                             arrow_schema=simple_db_setup.arrow_schema,
+                             table_type=simple_db_setup.table_type)
+
+    table = simple_db_setup.tx.table_from_metadata(table_md)
+    assert table.path == simple_db_setup.ref.full_path
+
+
+def test_name(simple_db_setup: SimpleDbSetup):
+    table_md = TableMetadata(ref=simple_db_setup.ref,
+                             arrow_schema=simple_db_setup.arrow_schema,
+                             table_type=simple_db_setup.table_type)
+    table = simple_db_setup.tx.table_from_metadata(table_md)
+    assert table.name == simple_db_setup.ref.table
+
+
+def test_arrow_schema(simple_db_setup: SimpleDbSetup):
+    table_md = TableMetadata(ref=simple_db_setup.ref,
+                             arrow_schema=simple_db_setup.arrow_schema,
+                             table_type=simple_db_setup.table_type)
+    table = simple_db_setup.tx.table_from_metadata(table_md)
+    assert table.arrow_schema == simple_db_setup.arrow_schema
+
+
+def test_eq(simple_db_setup: SimpleDbSetup):
+    table_md1 = TableMetadata(ref=simple_db_setup.ref, table_type=simple_db_setup.table_type)
+    table1 = simple_db_setup.tx.table_from_metadata(table_md1)
+
+    table_md2 = TableMetadata(ref=simple_db_setup.ref, table_type=simple_db_setup.table_type)
+    table2 = simple_db_setup.tx.table_from_metadata(table_md2)
+
+    assert table1 == table2
+
+    other_ref = TableRef(simple_db_setup.ref.bucket, simple_db_setup.ref.schema, "other_table")
+    table_md3 = TableMetadata(ref=other_ref, table_type=simple_db_setup.table_type)
+    table3 = simple_db_setup.tx.table_from_metadata(table_md3)
+    assert table1 != table3
+
+
+def test_insert_and_select(simple_db_setup: SimpleDbSetup):
+    table_md = TableMetadata(ref=simple_db_setup.ref,
+                             arrow_schema=simple_db_setup.arrow_schema,
+                             table_type=simple_db_setup.table_type)
+    table_md.load_stats(simple_db_setup.tx)  # the next select requires stats loaded
+
+    table = simple_db_setup.tx.table_from_metadata(table_md)
+
+    initial_data = table.select().read_all()
+
+    assert initial_data.num_rows == 3
+
+    new_rows = pa.table(schema=simple_db_setup.arrow_schema, data=[[444], [4.5], ["dddd"]])
+    table.insert(new_rows)
+
+    all_data = table.select().read_all()
+    assert all_data.num_rows == 4
+
+    t = ibis.table(table.arrow_schema, name=table.name)
+    reader = table.select(predicate=t.a > 300)
+    filtered_data = reader.read_all()
+    assert filtered_data.num_rows == 2
+
+
+def test_sorting_status(simple_db_setup: SimpleDbSetup):
+    table_md = TableMetadata(ref=simple_db_setup.ref, table_type=simple_db_setup.table_type)
+    table = simple_db_setup.tx.table_from_metadata(table_md)
+
+    is_done = table.sorting_done()
+    assert isinstance(is_done, bool)
+
+    score = table.sorting_score()
+    assert isinstance(score, int)
+
+
+def test_projections(simple_db_setup: SimpleDbSetup):
+    table_md = TableMetadata(ref=simple_db_setup.ref, table_type=simple_db_setup.table_type)
+    table: ITable = simple_db_setup.tx.table_from_metadata(table_md)
+
+    ref = simple_db_setup.ref
+    legacy_table = simple_db_setup.tx.bucket(ref.bucket).schema(ref.schema).table(ref.table)
+
+    initial_projections = list(table.projections())
+    proj_name = "my_proj"
+    proj = legacy_table.create_projection(
+        projection_name=proj_name, sorted_columns=["a"], unsorted_columns=["s"]
+    )
+    assert proj.name == proj_name
+
+    retrieved_proj = table.projection(proj_name)
+    assert retrieved_proj == proj
+
+    all_projections = list(table.projections())
+    assert len(all_projections) == len(initial_projections) + 1
+
+
+def test_update(simple_db_setup: SimpleDbSetup):
+    table_md = TableMetadata(ref=simple_db_setup.ref,
+                             arrow_schema=simple_db_setup.arrow_schema,
+                             table_type=simple_db_setup.table_type)
+    table_md.load_stats(simple_db_setup.tx)
+    table = simple_db_setup.tx.table_from_metadata(table_md)
+
+    # 1. Select a row to update
+    row_to_update = table.select(predicate=table['a'] == 222, internal_row_id=True).read_all()
+    assert row_to_update.num_rows == 1
+
+    # 2. Create a modified version in a new RecordBatch
+    update_data = pa.table({
+        INTERNAL_ROW_ID: row_to_update[INTERNAL_ROW_ID],
+        's': ['updated_bb']
+    })
+
+    # 3. Call table.update()
+    table.update(update_data)
+
+    # 4. Select the row again and verify changes
+    updated_row = table.select(predicate=table['a'] == 222).read_all()
+    assert updated_row.to_pydict()['s'] == ['updated_bb']
+
+    remaining_rows = table.select(predicate=table['a'] != 222).read_all()
+    expected_remaining = pa.table({
+        'a': pa.array([111, 333], type=pa.int64()),
+        'b': pa.array([0.5, 2.5], type=pa.float32()),
+        's': pa.array(['a', 'ccc'], type=pa.utf8()),
+    })
+    assert compare_pyarrow_tables(remaining_rows, expected_remaining)
+
+
+def test_delete(simple_db_setup: SimpleDbSetup):
+    table_md = TableMetadata(ref=simple_db_setup.ref,
+                             arrow_schema=simple_db_setup.arrow_schema,
+                             table_type=simple_db_setup.table_type)
+    table_md.load_stats(simple_db_setup.tx)
+    table = simple_db_setup.tx.table_from_metadata(table_md)
+
+    # 1. Identify a row to delete
+    row_to_delete = table.select(predicate=table['a'] == 333, internal_row_id=True).read_all()
+    assert row_to_delete.num_rows == 1
+
+    # 2. Create a RecordBatch with the key of the row
+    delete_data = pa.table({
+        INTERNAL_ROW_ID: row_to_delete[INTERNAL_ROW_ID]
+    })
+
+    # 3. Call table.delete()
+    table.delete(delete_data)
+
+    # 4. Select to verify the row is gone
+    all_data = table.select().read_all()
+    assert all_data.num_rows == 2
+    assert 333 not in all_data.to_pydict()['a']
+    expected_remaining = pa.table({
+        'a': pa.array([111, 222], type=pa.int64()),
+        'b': pa.array([0.5, 1.5], type=pa.float32()),
+        's': pa.array(['a', 'bb'], type=pa.utf8()),
+    })
+    assert compare_pyarrow_tables(all_data, expected_remaining)
+
+
+def test_sanity_load(simple_db_setup: SimpleDbSetup):
+    table_md = TableMetadata(TableRef(simple_db_setup.ref.bucket,
+                                      simple_db_setup.ref.schema,
+                                      simple_db_setup.ref.table))
+    table_md.load(simple_db_setup.tx)