vastdb 1.4.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,221 @@
1
+ """VAST Database table metadata."""
2
+
3
+ import logging
4
+ from copy import deepcopy
5
+ from dataclasses import dataclass
6
+ from enum import Enum
7
+ from typing import TYPE_CHECKING, Optional, Tuple
8
+
9
+ import ibis
10
+ import pyarrow as pa
11
+
12
+ from vastdb import errors
13
+ from vastdb._ibis_support import validate_ibis_support_schema
14
+
15
+ if TYPE_CHECKING:
16
+ from .transaction import Transaction
17
+
18
+ log = logging.getLogger(__name__)
19
+
20
+
21
+ class TableType(Enum):
22
+ """Table Type."""
23
+
24
+ Regular = 1
25
+ Elysium = 2
26
+ TableImports = 3
27
+
28
+
29
+ @dataclass
30
+ class TableRef:
31
+ """Represents a table ref (table's full path)."""
32
+
33
+ bucket: str
34
+ schema: str
35
+ table: str
36
+
37
+ @property
38
+ def full_path(self) -> str:
39
+ """Table full path."""
40
+ return f"{self.bucket}/{self.schema}/{self.table}"
41
+
42
+ def __str__(self) -> str:
43
+ """Table full path."""
44
+ return self.full_path
45
+
46
+
47
+ @dataclass
48
+ class TableStats:
49
+ """Table-related information."""
50
+
51
+ num_rows: int
52
+ size_in_bytes: int
53
+ sorting_score: int
54
+ write_amplification: int
55
+ acummulative_row_inserition_count: int
56
+ is_external_rowid_alloc: bool = False
57
+ sorting_key_enabled: bool = False
58
+ sorting_done: bool = False
59
+ endpoints: Tuple[str, ...] = ()
60
+
61
+
62
+ class TableMetadata:
63
+ """Table Metadata."""
64
+
65
+ _ref: TableRef
66
+ _arrow_schema: Optional[pa.Schema]
67
+ _sorted_columns: Optional[list[str]]
68
+ _ibis_table: ibis.Table
69
+ _stats: Optional[TableStats]
70
+
71
+ def __init__(self,
72
+ ref: TableRef,
73
+ arrow_schema: Optional[pa.Schema] = None,
74
+ table_type: Optional[TableType] = None):
75
+ """Table Metadata."""
76
+ self._ref = deepcopy(ref)
77
+ self._table_type = table_type
78
+ self.arrow_schema = deepcopy(arrow_schema)
79
+ self._sorted_columns = None
80
+ self._stats = None
81
+
82
+ def __eq__(self, other: object) -> bool:
83
+ """TableMetadata Equal."""
84
+ if not isinstance(other, TableMetadata):
85
+ return False
86
+
87
+ return (self._ref == other._ref and
88
+ self._table_type == other._table_type)
89
+
90
+ def rename_table(self, name: str) -> None:
91
+ """Rename table metadata's table name."""
92
+ self._ref.table = name
93
+
94
+ def load(self, tx: "Transaction") -> None:
95
+ """Load/Reload table metadata."""
96
+ self.load_stats(tx)
97
+ self.load_schema(tx)
98
+
99
+ if self._table_type is TableType.Elysium:
100
+ self.load_sorted_columns(tx)
101
+
102
+ def load_schema(self, tx: "Transaction") -> None:
103
+ """Load/Reload table schema."""
104
+ fields = []
105
+ next_key = 0
106
+ while True:
107
+ cur_columns, next_key, is_truncated, _count = tx._rpc.api.list_columns(
108
+ bucket=self.ref.bucket,
109
+ schema=self.ref.schema,
110
+ table=self.ref.table,
111
+ next_key=next_key,
112
+ txid=tx.active_txid,
113
+ list_imports_table=self.is_imports_table)
114
+ fields.extend(cur_columns)
115
+ if not is_truncated:
116
+ break
117
+
118
+ self.arrow_schema = pa.schema(fields)
119
+
120
+ def load_sorted_columns(self, tx: "Transaction") -> None:
121
+ """Return sorted columns' metadata."""
122
+ fields = []
123
+ try:
124
+ next_key = 0
125
+ while True:
126
+ cur_columns, next_key, is_truncated, _count = tx._rpc.api.list_sorted_columns(
127
+ bucket=self.ref.bucket, schema=self.ref.schema, table=self.ref.table,
128
+ next_key=next_key, txid=tx.active_txid, list_imports_table=self.is_imports_table)
129
+ fields.extend(cur_columns)
130
+ if not is_truncated:
131
+ break
132
+ except errors.BadRequest:
133
+ raise
134
+ except errors.InternalServerError as ise:
135
+ log.warning(
136
+ "Failed to get the sorted columns Elysium might not be supported: %s", ise)
137
+ raise
138
+ except errors.NotSupportedVersion:
139
+ log.warning("Failed to get the sorted columns, Elysium not supported")
140
+ raise
141
+ finally:
142
+ self._sorted_columns = fields
143
+
144
+ def load_stats(self, tx: "Transaction") -> None:
145
+ """Load/Reload table stats."""
146
+ stats_tuple = tx._rpc.api.get_table_stats(
147
+ bucket=self.ref.bucket, schema=self.ref.schema, name=self.ref.table, txid=tx.active_txid,
148
+ imports_table_stats=self.is_imports_table)
149
+ self._stats = TableStats(**stats_tuple._asdict())
150
+
151
+ is_elysium_table = self._stats.sorting_key_enabled
152
+
153
+ if self._table_type is None:
154
+ if is_elysium_table:
155
+ self._set_sorted_table(tx)
156
+ else:
157
+ self._set_regular_table()
158
+ else:
159
+ if is_elysium_table and self.table_type is not TableType.Elysium:
160
+ raise ValueError(
161
+ "Actual table is sorted (TableType.Elysium), was not inited as TableType.Elysium"
162
+ )
163
+
164
+ def _set_sorted_table(self, tx: "Transaction"):
165
+ self._table_type = TableType.Elysium
166
+ tx._rpc.features.check_elysium()
167
+
168
+ def _set_regular_table(self):
169
+ self._table_type = TableType.Regular
170
+
171
+ @property
172
+ def stats(self) -> Optional[TableStats]:
173
+ """Get table's stats."""
174
+ return self._stats
175
+
176
+ @property
177
+ def arrow_schema(self) -> pa.Schema:
178
+ """Table's arrow schema."""
179
+ return self._arrow_schema
180
+
181
+ @arrow_schema.setter
182
+ def arrow_schema(self, arrow_schema: Optional[pa.Schema]):
183
+ """Set arrow schema."""
184
+ if arrow_schema:
185
+ validate_ibis_support_schema(arrow_schema)
186
+ self._arrow_schema = arrow_schema
187
+ self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(arrow_schema), self._ref.full_path)
188
+ else:
189
+ self._arrow_schema = None
190
+ self._ibis_table = None
191
+
192
+ @property
193
+ def sorted_columns(self) -> list:
194
+ """Sorted columns."""
195
+ if self._sorted_columns is None:
196
+ raise ValueError("sorted columns not loaded")
197
+ return self._sorted_columns
198
+
199
+ @property
200
+ def ibis_table(self) -> ibis.Table:
201
+ """Ibis table."""
202
+ return self._ibis_table
203
+
204
+ @property
205
+ def ref(self) -> TableRef:
206
+ """Table's reference."""
207
+ return self._ref
208
+
209
+ @property
210
+ def table_type(self) -> TableType:
211
+ """Table's type."""
212
+ if self._table_type is None:
213
+ raise ValueError(
214
+ "TableType was not loaded. load using TableMetadata.load_stats")
215
+
216
+ return self._table_type
217
+
218
+ @property
219
+ def is_imports_table(self) -> bool:
220
+ """Is table an imports table."""
221
+ return self._table_type is TableType.TableImports
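
The TableMetadata / TableRef / TableType API added above is exercised by the test module added at the end of this diff. A minimal usage sketch based on that test code, assuming session is an open vastdb Session and that the bucket, schema and table (placeholder names below) already exist:

    from vastdb.table_metadata import TableMetadata, TableRef, TableType

    with session.transaction() as tx:
        # Describe the table by its full path; schema and type are optional here.
        ref = TableRef("my-bucket", "my-schema", "my-table")
        table_md = TableMetadata(ref, table_type=TableType.Regular)

        # load() fetches stats, the Arrow schema and, for Elysium tables, the
        # sorted columns; load_stats()/load_schema() load the individual parts.
        table_md.load(tx)

        # Bind a table handle to this metadata and query it.
        t = tx.table_from_metadata(table_md)
        print(t.select().read_all())
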
@@ -3,9 +3,7 @@ import logging
3
3
  import duckdb
4
4
  import pyarrow as pa
5
5
  import pyarrow.compute as pc
6
- import pytest
7
6
 
8
- from ..table import QueryConfig
9
7
  from .util import prepare_data
10
8
 
11
9
  log = logging.getLogger(__name__)
@@ -31,31 +29,33 @@ def test_duckdb(session, clean_bucket_name):
31
29
  assert actual == expected
32
30
 
33
31
 
34
- def test_closed_tx(session, clean_bucket_name):
35
- columns = pa.schema([
36
- ('a', pa.int64()),
37
- ])
38
- data = pa.table(schema=columns, data=[
39
- list(range(10000)),
40
- ])
41
-
42
- with session.transaction() as tx:
43
- t = tx.bucket(clean_bucket_name).create_schema("s1").create_table("t1", columns)
44
- t.insert(data)
45
-
46
- config = QueryConfig(
47
- num_sub_splits=1,
48
- num_splits=1,
49
- num_row_groups_per_sub_split=1,
50
- limit_rows_per_sub_split=100)
51
- batches = t.select(config=config) # noqa: F841
52
- first = next(batches) # make sure that HTTP response processing has started
53
- assert first['a'].to_pylist() == list(range(100))
54
-
55
- conn = duckdb.connect()
56
- res = conn.execute('SELECT a FROM batches')
57
- log.debug("closing tx=%s after first batch=%s", t.tx, first)
58
-
59
- # transaction is closed, collecting the result should fail internally in DuckDB
60
- with pytest.raises(duckdb.InvalidInputException):
61
- res.arrow()
32
+ # def test_closed_tx(session, clean_bucket_name):
33
+ # assert duckdb.__version__ == "1.0.0", "does not reproduce with newer duckdb versions; when updating duckdb in tests, update this test accordingly."
34
+
35
+ # columns = pa.schema([
36
+ # ('a', pa.int64()),
37
+ # ])
38
+ # data = pa.table(schema=columns, data=[
39
+ # list(range(10000)),
40
+ # ])
41
+
42
+ # with session.transaction() as tx:
43
+ # t = tx.bucket(clean_bucket_name).create_schema("s1").create_table("t1", columns)
44
+ # t.insert(data)
45
+
46
+ # config = QueryConfig(
47
+ # num_sub_splits=1,
48
+ # num_splits=1,
49
+ # num_row_groups_per_sub_split=1,
50
+ # limit_rows_per_sub_split=100)
51
+ # batches = t.select(config=config) # noqa: F841
52
+ # first = next(batches) # make sure that HTTP response processing has started
53
+ # assert first['a'].to_pylist() == list(range(100))
54
+
55
+ # conn = duckdb.connect()
56
+ # res = conn.execute('SELECT a FROM batches')
57
+ # log.debug("closing tx=%s after first batch=%s", t.tx, first)
58
+
59
+ # # transaction is closed, collecting the result should fail internally in DuckDB
60
+ # with pytest.raises(duckdb.InvalidInputException):
61
+ # res.arrow()
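
For context on the test disabled above: conn.execute('SELECT a FROM batches') works because DuckDB's Python client can resolve an in-scope Arrow object (here the batches reader) by its variable name, via its replacement-scan mechanism. A minimal sketch of that mechanism, independent of vastdb:

    import duckdb
    import pyarrow as pa

    batches = pa.table({"a": list(range(100))}).to_reader()

    conn = duckdb.connect()
    res = conn.execute("SELECT a FROM batches")  # 'batches' is resolved via replacement scan
    print(res.arrow().num_rows)                  # 100
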
@@ -11,7 +11,11 @@ import pytest
11
11
 
12
12
  import vastdb.errors
13
13
 
14
- from .util import prepare_data
14
+ from .util import (
15
+ assert_pandas_df_equal,
16
+ convert_pandas_df_to_hashable_values,
17
+ prepare_data,
18
+ )
15
19
 
16
20
  supported_fixed_list_element_types = [
17
21
  pa.uint8(),
@@ -85,7 +89,7 @@ def test_vectors(session, clean_bucket_name):
85
89
  columns = pa.schema(
86
90
  [("id", pa.int64()), ("vec", pa.list_(pa.field(name="item", type=element_type, nullable=False), dimension),)]
87
91
  )
88
- ids = range(num_rows)
92
+ ids = list(range(num_rows))
89
93
  expected = pa.table(
90
94
  schema=columns,
91
95
  data=[
@@ -102,7 +106,7 @@ def test_vectors(session, clean_bucket_name):
102
106
  assert actual == expected
103
107
 
104
108
  # Select by id.
105
- select_id = random.randint(0, num_rows)
109
+ select_id = random.choice(ids)
106
110
  actual = t.select(predicate=(t["id"] == select_id)).read_all()
107
111
  assert actual.to_pydict()["vec"] == [[select_id] * dimension]
108
112
  assert actual == expected.filter(pc.field("id") == select_id)
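
The switch from random.randint to random.choice above closes an off-by-one: random.randint(0, num_rows) is inclusive of both endpoints, so it could return num_rows itself, an id that was never inserted. For example:

    import random

    num_rows = 10
    ids = list(range(num_rows))        # valid ids: 0..9
    assert random.choice(ids) in ids   # always an existing id
    # random.randint(0, num_rows) may return 10, which matches no row
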
@@ -221,24 +225,70 @@ def generate_random_pyarrow_value(
221
225
 
222
226
  @pytest.mark.parametrize("element_field", supported_fixed_list_element_fields)
223
227
  def test_fixed_list_type_values(session, clean_bucket_name, element_field):
224
- list_size = random.randint(1, 1000)
225
- num_rows = random.randint(1, 100)
228
+ list_size = 250
229
+ num_rows = 100
226
230
 
227
231
  vec_type = pa.list_(element_field, list_size)
228
232
  schema = pa.schema(
229
233
  {"id": pa.int64(), "vec": vec_type, "random_int": pa.int64()})
234
+ ids = list(range(num_rows))
230
235
  expected = pa.table(
231
236
  schema=schema,
232
- data=[list(range(num_rows))] + [[generate_random_pyarrow_value(schema.field(col_name)) for _ in range(num_rows)]
237
+ data=[ids] + [[generate_random_pyarrow_value(schema.field(col_name)) for _ in range(num_rows)]
233
238
  for col_name in
234
239
  schema.names[1:]],
235
240
  )
241
+ # Convert the list to tuple in order to support comparison as a whole.
242
+ pd_expected = convert_pandas_df_to_hashable_values(expected.to_pandas())
236
243
 
237
244
  with prepare_data(session, clean_bucket_name, "s", "t", expected) as table:
238
245
  assert table.arrow_schema == schema
239
246
  actual = table.select().read_all()
240
247
  assert actual == expected
241
248
 
249
+ # Select by id.
250
+ id_to_select = random.choice(ids)
251
+ select_by_id = table.select(predicate=(table["id"] == id_to_select)).read_all()
252
+ assert len(select_by_id) == 1 # ID is unique.
253
+ assert select_by_id == expected.filter(pc.field("id") == id_to_select)
254
+
255
+ # Choose a random vector which is not null. Nulls should not be selected using == , != operators, but by isnull.
256
+ # In addition, nulls are discarded unless isnull is used (meaning != 1 returns only values that are both non-null and not 1).
257
+ vector_to_select = random.choice(expected.filter(~pc.field('vec').is_null())['vec'].to_numpy())
258
+
259
+ # TODO VSDK-36: Remove this workaround when the issue with negative decimals in predicates is fixed.
260
+ if pa.types.is_decimal(element_field.type):
261
+ vector_to_select = abs(vector_to_select)
262
+
263
+ # Dtype is not asserted since pandas converts integer dtypes to float when there are (or could be)
264
+ # NaN/None values.
265
+ # Select by vector value.
266
+ select_by_vector = table.select(predicate=(table["vec"] == vector_to_select)).read_all()
267
+ assert_pandas_df_equal(select_by_vector.to_pandas(),
268
+ pd_expected.loc[pd_expected['vec'] == tuple(vector_to_select)], check_dtype=False)
269
+
270
+ # Not equal to vector value.
271
+ select_by_vector = table.select(predicate=(table["vec"] != vector_to_select)).read_all()
272
+ assert_pandas_df_equal(select_by_vector.to_pandas(),
273
+ pd_expected.loc[(pd_expected['vec'] != tuple(vector_to_select)) &
274
+ pd_expected['vec'].notnull()], check_dtype=False)
275
+
276
+ # Not equal to vector value or null.
277
+ select_by_vector = table.select(
278
+ predicate=((table["vec"] != vector_to_select) | (table['vec'].isnull()))).read_all()
279
+ assert_pandas_df_equal(select_by_vector.to_pandas(),
280
+ pd_expected.loc[pd_expected['vec'] != tuple(vector_to_select)], check_dtype=False)
281
+
282
+ # Lexicographically greater than vector.
283
+ select_by_vector = table.select(predicate=(table["vec"] > vector_to_select)).read_all()
284
+ assert_pandas_df_equal(select_by_vector.to_pandas(), pd_expected.loc[
285
+ pd_expected['vec'].notnull() & (pd_expected['vec'] > tuple(vector_to_select))], check_dtype=False)
286
+
287
+ # Lexicographically less than vector.
288
+ select_by_vector = table.select(predicate=(table["vec"] < vector_to_select)).read_all()
289
+ assert_pandas_df_equal(select_by_vector.to_pandas(), pd_expected.loc[
290
+ pd_expected['vec'].notnull() & (pd_expected['vec'] < tuple(vector_to_select))], check_dtype=False)
291
+
242
292
 
243
293
  @pytest.mark.parametrize("list_type", unsupported_fixed_list_types)
244
294
  def test_unsupported_fixed_list_types(session, clean_bucket_name, list_type):
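
The new predicate assertions above compare query results against a pandas frame passed through convert_pandas_df_to_hashable_values, whose implementation is not part of this diff. Per the in-test comment, it presumably turns list-valued cells into tuples so that whole vectors can be compared with == and !=. A hypothetical sketch of that kind of helper (name and behavior assumed, not taken from the package):

    import numpy as np
    import pandas as pd

    def to_hashable(df: pd.DataFrame) -> pd.DataFrame:
        """Replace list/ndarray cells with tuples so they compare as whole values."""
        def convert(value):
            if isinstance(value, (list, np.ndarray)):
                return tuple(value)
            return value
        # applymap applies convert element-wise (renamed to DataFrame.map in pandas >= 2.1).
        return df.applymap(convert)
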
@@ -14,6 +14,7 @@ from vastdb.errors import (
14
14
  InvalidArgument,
15
15
  NotSupportedVersion,
16
16
  )
17
+ from vastdb.session import Session
17
18
 
18
19
  log = logging.getLogger(__name__)
19
20
 
@@ -28,7 +29,7 @@ def zip_import_session(session):
28
29
  pytest.skip("Skipped because this test requires version 5.3.1")
29
30
 
30
31
 
31
- def test_parallel_imports(session, clean_bucket_name, s3):
32
+ def test_parallel_imports(session: Session, clean_bucket_name: str, s3):
32
33
  num_rows = 1000
33
34
  num_files = 53
34
35
  ds = {'num': [i for i in range(num_rows)]}
@@ -79,13 +79,11 @@ def test_nested_filter(session, clean_bucket_name):
79
79
  def test_nested_unsupported_filter(session, clean_bucket_name):
80
80
  columns = pa.schema([
81
81
  ('l', pa.list_(pa.int8())),
82
- ('fl', pa.list_(pa.field(name='item', type=pa.int64(), nullable=False), 2)),
83
82
  ('m', pa.map_(pa.utf8(), pa.float64())),
84
83
  ('s', pa.struct([('x', pa.int16()), ('y', pa.int32())])),
85
84
  ])
86
85
  expected = pa.table(schema=columns, data=[
87
86
  [[1], [], [2, 3], None],
88
- [[1, 2], None, [3, 4], None],
89
87
  [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
90
88
  [{'x': 1, 'y': None}, None, {'x': 2, 'y': 3}, {'x': None, 'y': 4}],
91
89
  ])
@@ -95,9 +93,6 @@ def test_nested_unsupported_filter(session, clean_bucket_name):
95
93
  with pytest.raises(NotImplementedError):
96
94
  list(t.select(predicate=(t['l'].isnull())))
97
95
 
98
- with pytest.raises(NotImplementedError):
99
- list(t.select(predicate=(t['fl'].isnull())))
100
-
101
96
  with pytest.raises(NotImplementedError):
102
97
  list(t.select(predicate=(t['m'].isnull())))
103
98
 
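
The removals above and in the previous hunk drop the fixed-size-list column from the "unsupported filter" test: predicates on fixed-size-list columns, including isnull, are now exercised as supported by the test_fixed_list_type_values changes earlier in this diff. Roughly, assuming t is a table handle with a 2-element fixed-size-list column named 'vec':

    import numpy as np

    # Both now go through the regular predicate path instead of raising
    # NotImplementedError (see the updated test_fixed_list_type_values above).
    with_null_vec = t.select(predicate=t['vec'].isnull()).read_all()
    target = np.array([1, 2])  # a whole vector value, as compared in the updated test
    matching_vec = t.select(predicate=(t['vec'] == target)).read_all()
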
@@ -0,0 +1,249 @@
1
+ from dataclasses import dataclass
2
+ from typing import Generator, Optional
3
+
4
+ import ibis
5
+ import pyarrow as pa
6
+ import pytest
7
+
8
+ from vastdb.session import Session
9
+ from vastdb.table import INTERNAL_ROW_ID, ITable
10
+ from vastdb.table_metadata import TableMetadata, TableRef, TableType
11
+ from vastdb.transaction import Transaction
12
+
13
+ from .util import compare_pyarrow_tables, prepare_data_get_tx
14
+
15
+
16
+ def test_sanity(session: Session, clean_bucket_name):
17
+ columns = pa.schema([
18
+ ('a', pa.int64()),
19
+ ('b', pa.float32()),
20
+ ('s', pa.utf8()),
21
+ ])
22
+ expected = pa.table(schema=columns, data=[
23
+ [111, 222, 333],
24
+ [0.5, 1.5, 2.5],
25
+ ['a', 'bb', 'ccc'],
26
+ ])
27
+ with prepare_data_get_tx(session, clean_bucket_name, 's', 't', expected) as tx:
28
+ ref = TableRef(clean_bucket_name, 's', 't')
29
+ table_md = TableMetadata(ref, columns, TableType.Regular)
30
+
31
+ table_md.load_stats(tx)
32
+
33
+ t = tx.table_from_metadata(table_md)
34
+
35
+ actual = t.select(columns=['a', 'b', 's']).read_all()
36
+ assert actual == expected
37
+
38
+
39
+ @dataclass
40
+ class SimpleDbSetup:
41
+ tx: Transaction
42
+ ref: TableRef
43
+ table_type: TableType
44
+ arrow_schema: Optional[pa.Schema] = None
45
+
46
+
47
+ @pytest.fixture(scope="function")
48
+ def simple_db_setup(session: Session, clean_bucket_name: str) -> Generator[SimpleDbSetup, None, None]:
49
+ arrow_schema = pa.schema([
50
+ ('a', pa.int64()),
51
+ ('b', pa.float32()),
52
+ ('s', pa.utf8()),
53
+ ])
54
+ expected = pa.table(schema=arrow_schema, data=[
55
+ [111, 222, 333],
56
+ [0.5, 1.5, 2.5],
57
+ ['a', 'bb', 'ccc'],
58
+ ])
59
+ with prepare_data_get_tx(session, clean_bucket_name, 's', 't', expected) as tx:
60
+ yield SimpleDbSetup(tx=tx,
61
+ arrow_schema=arrow_schema,
62
+ ref=TableRef(clean_bucket_name, 's', 't'),
63
+ table_type=TableType.Regular)
64
+
65
+
66
+ def test_schema_load_through_metadata(simple_db_setup: SimpleDbSetup):
67
+ table_md = TableMetadata(simple_db_setup.ref,
68
+ table_type=simple_db_setup.table_type)
69
+
70
+ table = simple_db_setup.tx.table_from_metadata(table_md)
71
+ assert table.arrow_schema is None
72
+ table.reload_schema()
73
+ assert table.arrow_schema is not None
74
+
75
+
76
+ def test_metadata_init_with_schema(simple_db_setup: SimpleDbSetup):
77
+ table_md = TableMetadata(ref=simple_db_setup.ref,
78
+ arrow_schema=simple_db_setup.arrow_schema,
79
+ table_type=simple_db_setup.table_type)
80
+
81
+ table = simple_db_setup.tx.table_from_metadata(table_md)
82
+ assert table.arrow_schema is not None
83
+
84
+
85
+ def test_path(simple_db_setup: SimpleDbSetup):
86
+ table_md = TableMetadata(ref=simple_db_setup.ref,
87
+ arrow_schema=simple_db_setup.arrow_schema,
88
+ table_type=simple_db_setup.table_type)
89
+
90
+ table = simple_db_setup.tx.table_from_metadata(table_md)
91
+ assert table.path == simple_db_setup.ref.full_path
92
+
93
+
94
+ def test_name(simple_db_setup: SimpleDbSetup):
95
+ table_md = TableMetadata(ref=simple_db_setup.ref,
96
+ arrow_schema=simple_db_setup.arrow_schema,
97
+ table_type=simple_db_setup.table_type)
98
+ table = simple_db_setup.tx.table_from_metadata(table_md)
99
+ assert table.name == simple_db_setup.ref.table
100
+
101
+
102
+ def test_arrow_schema(simple_db_setup: SimpleDbSetup):
103
+ table_md = TableMetadata(ref=simple_db_setup.ref,
104
+ arrow_schema=simple_db_setup.arrow_schema,
105
+ table_type=simple_db_setup.table_type)
106
+ table = simple_db_setup.tx.table_from_metadata(table_md)
107
+ assert table.arrow_schema == simple_db_setup.arrow_schema
108
+
109
+
110
+ def test_eq(simple_db_setup: SimpleDbSetup):
111
+ table_md1 = TableMetadata(ref=simple_db_setup.ref, table_type=simple_db_setup.table_type)
112
+ table1 = simple_db_setup.tx.table_from_metadata(table_md1)
113
+
114
+ table_md2 = TableMetadata(ref=simple_db_setup.ref, table_type=simple_db_setup.table_type)
115
+ table2 = simple_db_setup.tx.table_from_metadata(table_md2)
116
+
117
+ assert table1 == table2
118
+
119
+ other_ref = TableRef(simple_db_setup.ref.bucket, simple_db_setup.ref.schema, "other_table")
120
+ table_md3 = TableMetadata(ref=other_ref, table_type=simple_db_setup.table_type)
121
+ table3 = simple_db_setup.tx.table_from_metadata(table_md3)
122
+ assert table1 != table3
123
+
124
+
125
+ def test_insert_and_select(simple_db_setup: SimpleDbSetup):
126
+ table_md = TableMetadata(ref=simple_db_setup.ref,
127
+ arrow_schema=simple_db_setup.arrow_schema,
128
+ table_type=simple_db_setup.table_type)
129
+ table_md.load_stats(simple_db_setup.tx) # the next select requires stats loaded
130
+
131
+ table = simple_db_setup.tx.table_from_metadata(table_md)
132
+
133
+ initial_data = table.select().read_all()
134
+
135
+ assert initial_data.num_rows == 3
136
+
137
+ new_rows = pa.table(schema=simple_db_setup.arrow_schema, data=[[444], [4.5], ["dddd"]])
138
+ table.insert(new_rows)
139
+
140
+ all_data = table.select().read_all()
141
+ assert all_data.num_rows == 4
142
+
143
+ t = ibis.table(table.arrow_schema, name=table.name)
144
+ reader = table.select(predicate=t.a > 300)
145
+ filtered_data = reader.read_all()
146
+ assert filtered_data.num_rows == 2
147
+
148
+
149
+ def test_sorting_status(simple_db_setup: SimpleDbSetup):
150
+ table_md = TableMetadata(ref=simple_db_setup.ref, table_type=simple_db_setup.table_type)
151
+ table = simple_db_setup.tx.table_from_metadata(table_md)
152
+
153
+ is_done = table.sorting_done()
154
+ assert isinstance(is_done, bool)
155
+
156
+ score = table.sorting_score()
157
+ assert isinstance(score, int)
158
+
159
+
160
+ def test_projections(simple_db_setup: SimpleDbSetup):
161
+ table_md = TableMetadata(ref=simple_db_setup.ref, table_type=simple_db_setup.table_type)
162
+ table: ITable = simple_db_setup.tx.table_from_metadata(table_md)
163
+
164
+ ref = simple_db_setup.ref
165
+ legacy_table = simple_db_setup.tx.bucket(ref.bucket).schema(ref.schema).table(ref.table)
166
+
167
+ initial_projections = list(table.projections())
168
+ proj_name = "my_proj"
169
+ proj = legacy_table.create_projection(
170
+ projection_name=proj_name, sorted_columns=["a"], unsorted_columns=["s"]
171
+ )
172
+ assert proj.name == proj_name
173
+
174
+ retrieved_proj = table.projection(proj_name)
175
+ assert retrieved_proj == proj
176
+
177
+ all_projections = list(table.projections())
178
+ assert len(all_projections) == len(initial_projections) + 1
179
+
180
+
181
+ def test_update(simple_db_setup: SimpleDbSetup):
182
+ table_md = TableMetadata(ref=simple_db_setup.ref,
183
+ arrow_schema=simple_db_setup.arrow_schema,
184
+ table_type=simple_db_setup.table_type)
185
+ table_md.load_stats(simple_db_setup.tx)
186
+ table = simple_db_setup.tx.table_from_metadata(table_md)
187
+
188
+ # 1. Select a row to update
189
+ row_to_update = table.select(predicate=table['a'] == 222, internal_row_id=True).read_all()
190
+ assert row_to_update.num_rows == 1
191
+
192
+ # 2. Create a modified version in a new RecordBatch
193
+ update_data = pa.table({
194
+ INTERNAL_ROW_ID: row_to_update[INTERNAL_ROW_ID],
195
+ 's': ['updated_bb']
196
+ })
197
+
198
+ # 3. Call table.update()
199
+ table.update(update_data)
200
+
201
+ # 4. Select the row again and verify changes
202
+ updated_row = table.select(predicate=table['a'] == 222).read_all()
203
+ assert updated_row.to_pydict()['s'] == ['updated_bb']
204
+
205
+ remaining_rows = table.select(predicate=table['a'] != 222).read_all()
206
+ expected_remaining = pa.table({
207
+ 'a': pa.array([111, 333], type=pa.int64()),
208
+ 'b': pa.array([0.5, 2.5], type=pa.float32()),
209
+ 's': pa.array(['a', 'ccc'], type=pa.utf8()),
210
+ })
211
+ assert compare_pyarrow_tables(remaining_rows, expected_remaining)
212
+
213
+
214
+ def test_delete(simple_db_setup: SimpleDbSetup):
215
+ table_md = TableMetadata(ref=simple_db_setup.ref,
216
+ arrow_schema=simple_db_setup.arrow_schema,
217
+ table_type=simple_db_setup.table_type)
218
+ table_md.load_stats(simple_db_setup.tx)
219
+ table = simple_db_setup.tx.table_from_metadata(table_md)
220
+
221
+ # 1. Identify a row to delete
222
+ row_to_delete = table.select(predicate=table['a'] == 333, internal_row_id=True).read_all()
223
+ assert row_to_delete.num_rows == 1
224
+
225
+ # 2. Create a RecordBatch with the key of the row
226
+ delete_data = pa.table({
227
+ INTERNAL_ROW_ID: row_to_delete[INTERNAL_ROW_ID]
228
+ })
229
+
230
+ # 3. Call table.delete()
231
+ table.delete(delete_data)
232
+
233
+ # 4. Select to verify the row is gone
234
+ all_data = table.select().read_all()
235
+ assert all_data.num_rows == 2
236
+ assert 333 not in all_data.to_pydict()['a']
237
+ expected_remaining = pa.table({
238
+ 'a': pa.array([111, 222], type=pa.int64()),
239
+ 'b': pa.array([0.5, 1.5], type=pa.float32()),
240
+ 's': pa.array(['a', 'bb'], type=pa.utf8()),
241
+ })
242
+ assert compare_pyarrow_tables(all_data, expected_remaining)
243
+
244
+
245
+ def test_sanity_load(simple_db_setup: SimpleDbSetup):
246
+ table_md = TableMetadata(TableRef(simple_db_setup.ref.bucket,
247
+ simple_db_setup.ref.schema,
248
+ simple_db_setup.ref.table))
249
+ table_md.load(simple_db_setup.tx)
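
test_sanity_load above constructs TableMetadata from a TableRef alone and relies on load() to fill in the rest. Per the table_metadata module at the top of this diff, stats, schema and table type are populated by load(), after which the lazy properties no longer raise; a hypothetical continuation of the test could assert that:

    # These would raise ValueError (or be None) before load(tx) is called.
    assert table_md.table_type in (TableType.Regular, TableType.Elysium)
    assert table_md.arrow_schema is not None
    assert table_md.stats is not None
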