vastdb 1.4.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in a supported public registry; it is provided for informational purposes only.
vastdb/table.py CHANGED
@@ -1,29 +1,44 @@
1
1
  """VAST Database table."""
2
2
 
3
3
  import concurrent.futures
4
- import copy
4
+ import itertools
5
5
  import logging
6
6
  import os
7
7
  import queue
8
8
  import sys
9
- from dataclasses import dataclass, field
9
+ from dataclasses import dataclass
10
10
  from math import ceil
11
+ from queue import Queue
11
12
  from threading import Event
12
- from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
13
+ from typing import (
14
+ TYPE_CHECKING,
15
+ Callable,
16
+ Iterable,
17
+ Optional,
18
+ Union,
19
+ )
13
20
 
14
21
  import ibis
15
22
  import pyarrow as pa
16
23
  import urllib3
17
24
 
18
- from . import _internal, errors, schema, util
25
+ from vastdb._table_interface import ITable
26
+ from vastdb.table_metadata import TableMetadata, TableRef, TableStats, TableType
27
+
28
+ from . import _internal, errors, util
29
+ from ._ibis_support import validate_ibis_support_schema
19
30
  from .config import ImportConfig, QueryConfig
20
31
 
32
+ if TYPE_CHECKING:
33
+ from .transaction import Transaction
34
+
21
35
  log = logging.getLogger(__name__)
22
36
 
23
37
 
24
38
  INTERNAL_ROW_ID = "$row_id"
25
39
  INTERNAL_ROW_ID_FIELD = pa.field(INTERNAL_ROW_ID, pa.uint64())
26
- INTERNAL_ROW_ID_SORTED_FIELD = pa.field(INTERNAL_ROW_ID, pa.decimal128(38, 0)) # Sorted tables have longer row ids
40
+ INTERNAL_ROW_ID_SORTED_FIELD = pa.field(
41
+ INTERNAL_ROW_ID, pa.decimal128(38, 0)) # Sorted tables have longer row ids
27
42
 
28
43
  MAX_ROWS_PER_BATCH = 512 * 1024
29
44
  # for insert we need a smaller limit due to response amplification
@@ -34,33 +49,52 @@ MAX_COLUMN_IN_BATCH = int(5 * 1024 / 128)
34
49
  SORTING_SCORE_BITS = 63
35
50
 
36
51
 
52
+ class _EmptyResultException(Exception):
53
+ response_schema: pa.Schema
54
+
55
+ def __init__(self, response_schema: pa.Schema):
56
+ self.response_schema = response_schema
57
+
58
+
37
59
  @dataclass
38
- class TableStats:
39
- """Table-related information."""
60
+ class SplitWorkerConfig:
61
+ """Split worker configuration."""
40
62
 
41
- num_rows: int
42
- size_in_bytes: int
43
- sorting_score: int
44
- write_amplification: int
45
- acummulative_row_inserition_count: int
46
- is_external_rowid_alloc: bool = False
47
- sorting_key_enabled: bool = False
48
- sorting_done: bool = False
49
- endpoints: Tuple[str, ...] = ()
63
+ num_splits: int
64
+ num_sub_splits: int
65
+ num_row_groups_per_sub_split: int
66
+ limit_rows_per_sub_split: int
67
+ use_semi_sorted_projections: bool
68
+ queue_priority: Optional[int]
69
+ semi_sorted_projection_name: Optional[str]
50
70
 
51
71
 
52
- class SelectSplitState:
72
+ class SplitWorker:
53
73
  """State of a specific query split execution."""
54
74
 
55
- def __init__(self, query_data_request, table: "Table", split_id: int, config: QueryConfig) -> None:
75
+ def __init__(self,
76
+ api: _internal.VastdbApi,
77
+ query_data_request: _internal.QueryDataRequest,
78
+ bucket_name: str,
79
+ schema_name: str,
80
+ table_name: str,
81
+ txid: Optional[int],
82
+ query_imports_table: bool,
83
+ split_id: int,
84
+ config: SplitWorkerConfig) -> None:
56
85
  """Initialize query split state."""
86
+ self.api = api
57
87
  self.split_id = split_id
58
88
  self.subsplits_state = {i: 0 for i in range(config.num_sub_splits)}
59
- self.config = config
60
89
  self.query_data_request = query_data_request
61
- self.table = table
90
+ self.bucket_name = bucket_name
91
+ self.schema_name = schema_name
92
+ self.table_name = table_name
93
+ self.txid = txid
94
+ self.query_imports_table = query_imports_table
95
+ self.config = config
62
96
 
63
- def process_split(self, api: _internal.VastdbApi, record_batches_queue: queue.Queue[pa.RecordBatch], check_stop: Callable):
97
+ def __iter__(self):
64
98
  """Execute a sequence of QueryData requests, and queue the parsed RecordBatch objects.
65
99
 
66
100
  Can be called repeatedly, to support resuming the query after a disconnection / retriable error.
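
The hunk above replaces the old `SelectSplitState` (which carried a reference to the whole `Table` plus its `QueryConfig`) with a self-contained `SplitWorker` driven by the new `SplitWorkerConfig` dataclass. A minimal sketch of filling in that config; the values below are illustrative, not package defaults:

from vastdb.table import SplitWorkerConfig

split_config = SplitWorkerConfig(
    num_splits=8,                        # how many splits the query is divided into
    num_sub_splits=4,                    # sub-splits paginated independently inside each split
    num_row_groups_per_sub_split=8,
    limit_rows_per_sub_split=128 * 1024,
    use_semi_sorted_projections=True,
    queue_priority=None,                 # forwarded to the server as schedule_id
    semi_sorted_projection_name=None,    # force a specific projection if desired
)
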
@@ -69,21 +103,22 @@ class SelectSplitState:
69
103
  # contains RecordBatch parts received from the server, must be re-created in case of a retry
70
104
  while not self.done:
71
105
  # raises if request parsing fails or throttled due to server load, and will be externally retried
72
- response = api.query_data(
73
- bucket=self.table.bucket.name,
74
- schema=self.table.schema.name,
75
- table=self.table.name,
76
- params=self.query_data_request.serialized,
77
- split=(self.split_id, self.config.num_splits, self.config.num_row_groups_per_sub_split),
78
- num_sub_splits=self.config.num_sub_splits,
79
- response_row_id=False,
80
- txid=self.table.tx.txid,
81
- limit_rows=self.config.limit_rows_per_sub_split,
82
- sub_split_start_row_ids=self.subsplits_state.items(),
83
- schedule_id=self.config.queue_priority,
84
- enable_sorted_projections=self.config.use_semi_sorted_projections,
85
- query_imports_table=self.table._imports_table,
86
- projection=self.config.semi_sorted_projection_name)
106
+ response = self.api.query_data(
107
+ bucket=self.bucket_name,
108
+ schema=self.schema_name,
109
+ table=self.table_name,
110
+ params=self.query_data_request.serialized,
111
+ split=(self.split_id, self.config.num_splits,
112
+ self.config.num_row_groups_per_sub_split),
113
+ num_sub_splits=self.config.num_sub_splits,
114
+ response_row_id=False,
115
+ txid=self.txid,
116
+ limit_rows=self.config.limit_rows_per_sub_split,
117
+ sub_split_start_row_ids=self.subsplits_state.items(),
118
+ schedule_id=self.config.queue_priority,
119
+ enable_sorted_projections=self.config.use_semi_sorted_projections,
120
+ query_imports_table=self.query_imports_table,
121
+ projection=self.config.semi_sorted_projection_name)
87
122
 
88
123
  # can raise during response parsing (e.g. due to disconnections), and will be externally retried
89
124
  # the pagination state is stored in `self.subsplits_state` and must be correct in case of a reconnection
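
The `sub_split_start_row_ids` argument above is what makes a retried request resumable: `subsplits_state` maps each sub-split to the next row id to fetch, and (per the `done` property later in this class) a sub-split is finished once its entry equals `_internal.TABULAR_INVALID_ROW_ID`. A small sketch of that bookkeeping:

from vastdb import _internal

subsplits_state = {i: 0 for i in range(4)}             # four sub-splits, all starting at row 0
subsplits_state[0] = 120_000                           # sub-split 0 will resume from row id 120000
subsplits_state[1] = _internal.TABULAR_INVALID_ROW_ID  # sub-split 1 is exhausted
done = all(v == _internal.TABULAR_INVALID_ROW_ID
           for v in subsplits_state.values())          # False: sub-splits 0, 2 and 3 still have data
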
@@ -99,129 +134,114 @@ class SelectSplitState:
99
134
  # we have parsed a pyarrow.Table successfully, self.subsplits_state is now correctly updated
100
135
  # if the below loop fails, the query is not retried
101
136
  for batch in table_chunk.to_batches():
102
- check_stop() # may raise StoppedException to early-exit the query (without retries)
103
- if batch:
104
- record_batches_queue.put(batch)
137
+ yield batch
105
138
  except urllib3.exceptions.ProtocolError as err:
106
- log.warning("Failed parsing QueryData response table=%r split=%s/%s offsets=%s cause=%s",
107
- self.table, self.split_id, self.config.num_splits, self.subsplits_state, err)
139
+ fully_qualified_table_name = f"\"{self.bucket_name}/{self.schema_name}\".{self.table_name}"
140
+ log.warning("Failed parsing QueryData response table=%s txid=%s split=%s/%s offsets=%s cause=%s",
141
+ fully_qualified_table_name, self.txid,
142
+ self.split_id, self.config.num_splits, self.subsplits_state, err)
108
143
  # since this is a read-only idempotent operation, it is safe to retry
109
144
  raise errors.ConnectionError(cause=err, may_retry=True)
110
145
 
146
+ def split_record_batch_reader(self) -> pa.RecordBatchReader:
147
+ """Return pa.RecordBatchReader for split."""
148
+ return pa.RecordBatchReader.from_batches(self.query_data_request.response_schema,
149
+ self)
150
+
151
+ def _process_split(self, record_batches_queue: Queue[pa.RecordBatch], check_stop: Callable):
152
+ """Process split and enqueues batches into the queue."""
153
+ for batch in self:
154
+ check_stop() # may raise StoppedException to early-exit the query (without retries)
155
+ if batch:
156
+ record_batches_queue.put(batch)
157
+
111
158
  @property
112
159
  def done(self):
113
160
  """Returns true iff the pagination over."""
114
161
  return all(row_id == _internal.TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
115
162
 
116
163
 
117
- @dataclass
118
- class Table:
164
+ class TableInTransaction(ITable):
119
165
  """VAST Table."""
120
166
 
121
- name: str
122
- schema: "schema.Schema"
123
- handle: int
124
- arrow_schema: pa.Schema = field(init=False, compare=False, repr=False)
125
- _ibis_table: ibis.Schema = field(init=False, compare=False, repr=False)
126
- _imports_table: bool
127
- sorted_table: bool
128
-
129
- @staticmethod
130
- def validate_ibis_support_schema(arrow_schema: pa.Schema):
131
- """Validate that the provided Arrow schema is compatible with Ibis.
167
+ _metadata: TableMetadata
168
+ _tx: "Transaction"
132
169
 
133
- Raises NotSupportedSchema if the schema contains unsupported fields.
134
- """
135
- unsupported_fields = []
136
- first_exception = None
137
- for f in arrow_schema:
138
- try:
139
- ibis.Schema.from_pyarrow(pa.schema([f]))
140
- except Exception as e:
141
- if first_exception is None:
142
- first_exception = e
143
- unsupported_fields.append(f)
144
-
145
- if unsupported_fields:
146
- raise errors.NotSupportedSchema(
147
- message=f"Ibis does not support the schema {unsupported_fields=}",
148
- schema=arrow_schema,
149
- cause=first_exception
150
- )
151
-
152
- def __post_init__(self):
153
- """Also, load columns' metadata."""
154
- self.arrow_schema = self.columns()
155
-
156
- self._table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
157
- self.validate_ibis_support_schema(self.arrow_schema)
158
- self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(self.arrow_schema), self._table_path)
170
+ def __init__(self,
171
+ metadata: TableMetadata,
172
+ tx: "Transaction"):
173
+ """VastDB Table."""
174
+ self._metadata = metadata
175
+ self._tx = tx
159
176
 
160
177
  @property
161
- def path(self):
162
- """Return table's path."""
163
- return self._table_path
178
+ def ref(self) -> TableRef:
179
+ """Table Reference."""
180
+ return self._metadata.ref
181
+
182
+ def __eq__(self, other: object) -> bool:
183
+ """Table __eq__."""
184
+ if not isinstance(other, type(self)):
185
+ return False
186
+
187
+ return self.ref == other.ref
164
188
 
165
189
  @property
166
- def tx(self):
167
- """Return transaction."""
168
- return self.schema.tx
190
+ def name(self) -> str:
191
+ """Table name."""
192
+ return self.ref.table
169
193
 
170
194
  @property
171
- def bucket(self):
172
- """Return bucket."""
173
- return self.schema.bucket
195
+ def arrow_schema(self) -> pa.Schema:
196
+ """Table arrow schema."""
197
+ return self._metadata.arrow_schema
174
198
 
175
199
  @property
176
- def stats(self):
177
- """Fetch table's statistics from server."""
178
- return self.get_stats()
200
+ def stats(self) -> Optional[TableStats]:
201
+ """Table's statistics."""
202
+ return self._metadata.stats
179
203
 
180
- def columns(self) -> pa.Schema:
181
- """Return columns' metadata."""
182
- fields = []
183
- next_key = 0
184
- while True:
185
- cur_columns, next_key, is_truncated, _count = self.tx._rpc.api.list_columns(
186
- bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid, list_imports_table=self._imports_table)
187
- fields.extend(cur_columns)
188
- if not is_truncated:
189
- break
204
+ def reload_schema(self) -> None:
205
+ """Reload Arrow Schema."""
206
+ self._metadata.load_schema(self._tx)
190
207
 
191
- self.arrow_schema = pa.schema(fields)
192
- return self.arrow_schema
208
+ def reload_stats(self) -> None:
209
+ """Reload Table Stats."""
210
+ self._metadata.load_stats(self._tx)
193
211
 
194
- def sorted_columns(self) -> list:
212
+ def reload_sorted_columns(self) -> None:
213
+ """Reload Sorted Columns."""
214
+ self._metadata.load_sorted_columns(self._tx)
215
+
216
+ @property
217
+ def path(self) -> str:
218
+ """Return table's path."""
219
+ return self.ref.full_path
220
+
221
+ @property
222
+ def _internal_rowid_field(self) -> pa.Field:
223
+ return INTERNAL_ROW_ID_SORTED_FIELD if self._is_sorted_table else INTERNAL_ROW_ID_FIELD
224
+
225
+ def sorted_columns(self) -> list[str]:
195
226
  """Return sorted columns' metadata."""
196
- fields = []
197
- try:
198
- self.tx._rpc.features.check_elysium()
199
- next_key = 0
200
- while True:
201
- cur_columns, next_key, is_truncated, _count = self.tx._rpc.api.list_sorted_columns(
202
- bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid, list_imports_table=self._imports_table)
203
- fields.extend(cur_columns)
204
- if not is_truncated:
205
- break
206
- except errors.BadRequest:
207
- pass
208
- except errors.InternalServerError as ise:
209
- log.warning("Failed to get the sorted columns Elysium might not be supported: %s", ise)
210
- pass
211
- except errors.NotSupportedVersion:
212
- log.warning("Failed to get the sorted columns, Elysium not supported")
213
- pass
227
+ return self._metadata.sorted_columns
214
228
 
215
- return fields
229
+ def _assert_not_imports_table(self):
230
+ if self._metadata.is_imports_table:
231
+ raise errors.NotSupportedCommand(
232
+ self.ref.bucket, self.ref.schema, self.ref.table)
216
233
 
217
234
  def projection(self, name: str) -> "Projection":
218
235
  """Get a specific semi-sorted projection of this table."""
219
- if self._imports_table:
220
- raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
236
+ self._assert_not_imports_table()
237
+
221
238
  projs = tuple(self.projections(projection_name=name))
222
239
  if not projs:
223
- raise errors.MissingProjection(self.bucket.name, self.schema.name, self.name, name)
224
- assert len(projs) == 1, f"Expected to receive only a single projection, but got: {len(projs)}. projections: {projs}"
240
+ raise errors.MissingProjection(
241
+ self.ref.bucket, self.ref.schema, self.ref.table, name)
242
+ if len(projs) != 1:
243
+ raise AssertionError(
244
+ f"Expected to receive only a single projection, but got: {len(projs)}. projections: {projs}")
225
245
  log.debug("Found projection: %s", projs[0])
226
246
  return projs[0]
227
247
 
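
The hunk above turns the table object into a thin wrapper over a cached `TableMetadata`: the Arrow schema, stats and sorted columns are read from that cache and refreshed only through the explicit `reload_*()` methods. A usage sketch, assuming the usual `vastdb.connect()` session and bucket/schema/table accessors, which are outside this diff:

import vastdb

session = vastdb.connect(endpoint="http://vast.example.com",
                         access="...", secret="...")   # assumed connection details
with session.transaction() as tx:
    t = tx.bucket("my-bucket").schema("my-schema").table("my-table")

    print(t.path)          # TableRef.full_path, e.g. "my-bucket/my-schema/my-table"
    print(t.arrow_schema)  # served from the cached TableMetadata, no extra RPC

    t.reload_stats()       # explicitly refresh TableStats from the server
    if t.stats is not None:
        print(t.stats.num_rows, t.stats.size_in_bytes)
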
@@ -230,31 +250,33 @@ class Table:
230
250
 
231
251
  Otherwise, list only the specific projection (if exists).
232
252
  """
233
- if self._imports_table:
234
- raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
253
+ self._assert_not_imports_table()
254
+
235
255
  projections = []
236
256
  next_key = 0
237
257
  name_prefix = projection_name if projection_name else ""
238
258
  exact_match = bool(projection_name)
239
259
  while True:
240
260
  _bucket_name, _schema_name, _table_name, curr_projections, next_key, is_truncated, _ = \
241
- self.tx._rpc.api.list_projections(
242
- bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid,
261
+ self._tx._rpc.api.list_projections(
262
+ bucket=self.ref.bucket, schema=self.ref.schema, table=self.ref.table, next_key=next_key, txid=self._tx.active_txid,
243
263
  exact_match=exact_match, name_prefix=name_prefix)
244
264
  if not curr_projections:
245
265
  break
246
266
  projections.extend(curr_projections)
247
267
  if not is_truncated:
248
268
  break
249
- return [_parse_projection_info(projection, self) for projection in projections]
269
+ return [_parse_projection_info(projection, self._metadata, self._tx) for projection in projections]
250
270
 
251
- def import_files(self, files_to_import: Iterable[str], config: Optional[ImportConfig] = None) -> None:
271
+ def import_files(self,
272
+ files_to_import: Iterable[str],
273
+ config: Optional[ImportConfig] = None) -> None:
252
274
  """Import a list of Parquet files into this table.
253
275
 
254
276
  The files must be on VAST S3 server and be accessible using current credentials.
255
277
  """
256
- if self._imports_table:
257
- raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
278
+ self._assert_not_imports_table()
279
+
258
280
  source_files = {}
259
281
  for f in files_to_import:
260
282
  bucket_name, object_path = _parse_bucket_and_object_names(f)
@@ -262,42 +284,50 @@ class Table:
262
284
 
263
285
  self._execute_import(source_files, config=config)
264
286
 
265
- def import_partitioned_files(self, files_and_partitions: Dict[str, pa.RecordBatch], config: Optional[ImportConfig] = None) -> None:
287
+ def import_partitioned_files(self,
288
+ files_and_partitions: dict[str, pa.RecordBatch],
289
+ config: Optional[ImportConfig] = None) -> None:
266
290
  """Import a list of Parquet files into this table.
267
291
 
268
292
  The files must be on VAST S3 server and be accessible using current credentials.
269
293
  Each file must have its own partition values defined as an Arrow RecordBatch.
270
294
  """
271
- if self._imports_table:
272
- raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
295
+ self._assert_not_imports_table()
296
+
273
297
  source_files = {}
274
298
  for f, record_batch in files_and_partitions.items():
275
299
  bucket_name, object_path = _parse_bucket_and_object_names(f)
276
300
  serialized_batch = _serialize_record_batch(record_batch)
277
- source_files = {(bucket_name, object_path): serialized_batch.to_pybytes()}
301
+ source_files[(bucket_name, object_path)] = serialized_batch.to_pybytes()
278
302
 
279
303
  self._execute_import(source_files, config=config)
280
304
 
281
- def _execute_import(self, source_files, config):
305
+ def _execute_import(self,
306
+ source_files: dict[tuple[str, str], bytes],
307
+ config: Optional[ImportConfig]):
282
308
  config = config or ImportConfig()
283
- assert config.import_concurrency > 0 # TODO: Do we want to validate concurrency isn't too high?
309
+ # TODO: Do we want to validate concurrency isn't too high?
310
+ assert config.import_concurrency > 0
284
311
  max_batch_size = 10 # Enforced in server side.
285
- endpoints = [self.tx._rpc.api.url for _ in range(config.import_concurrency)] # TODO: use valid endpoints...
286
- files_queue = queue.Queue()
312
+ # TODO: use valid endpoints...
313
+ endpoints = [self._tx._rpc.api.url for _ in range(
314
+ config.import_concurrency)]
315
+ files_queue: Queue = Queue()
287
316
 
288
317
  key_names = config.key_names or []
289
318
  if key_names:
290
- self.tx._rpc.features.check_zip_import()
319
+ self._tx._rpc.features.check_zip_import()
291
320
 
292
321
  for source_file in source_files.items():
293
322
  files_queue.put(source_file)
294
323
 
295
324
  stop_event = Event()
296
- num_files_in_batch = min(ceil(len(source_files) / len(endpoints)), max_batch_size)
325
+ num_files_in_batch = min(
326
+ ceil(len(source_files) / len(endpoints)), max_batch_size)
297
327
 
298
328
  def import_worker(q, endpoint):
299
329
  try:
300
- with self.tx._rpc.api.with_endpoint(endpoint) as session:
330
+ with self._tx._rpc.api.with_endpoint(endpoint) as session:
301
331
  while not q.empty():
302
332
  if stop_event.is_set():
303
333
  log.debug("stop_event is set, exiting")
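
`import_files()` and `import_partitioned_files()` above keep their Parquet-on-VAST-S3 semantics but now go through `TableRef` and `active_txid`; 2.0.0 also fixes `import_partitioned_files()` to accumulate every entry in `source_files` instead of overwriting the dict on each iteration. A usage sketch, reusing the session/transaction assumptions from the earlier example (paths must start with '/' and name a bucket plus object key):

import pyarrow as pa

with session.transaction() as tx:
    t = tx.bucket("my-bucket").schema("my-schema").table("my-table")

    # Plain import: the Parquet objects must already exist on the VAST S3 server.
    t.import_files(["/staging-bucket/data/part-0001.parquet",
                    "/staging-bucket/data/part-0002.parquet"])

    # Partitioned import: each file carries its partition values as a one-row RecordBatch.
    partition = pa.record_batch([pa.array(["2024-01-01"])], names=["date"])
    t.import_partitioned_files({"/staging-bucket/data/part-0003.parquet": partition})
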
@@ -309,10 +339,11 @@ class Table:
309
339
  except queue.Empty:
310
340
  pass
311
341
  if files_batch:
312
- log.info("Starting import batch of %s files", len(files_batch))
342
+ log.info(
343
+ "Starting import batch of %s files", len(files_batch))
313
344
  log.debug(f"starting import of {files_batch}")
314
345
  session.import_data(
315
- self.bucket.name, self.schema.name, self.name, files_batch, txid=self.tx.txid,
346
+ self.ref.bucket, self.ref.schema, self.ref.table, files_batch, txid=self._tx.active_txid,
316
347
  key_names=key_names)
317
348
  except (Exception, KeyboardInterrupt) as e:
318
349
  stop_event.set()
@@ -324,7 +355,8 @@ class Table:
324
355
  max_workers=config.import_concurrency, thread_name_prefix='import_thread') as pool:
325
356
  try:
326
357
  for endpoint in endpoints:
327
- futures.append(pool.submit(import_worker, files_queue, endpoint))
358
+ futures.append(pool.submit(
359
+ import_worker, files_queue, endpoint))
328
360
 
329
361
  log.debug("Waiting for import workers to finish")
330
362
  for future in concurrent.futures.as_completed(futures):
@@ -333,49 +365,35 @@ class Table:
333
365
  stop_event.set()
334
366
  # ThreadPoolExecutor will be joined at the end of the context
335
367
 
336
- def get_stats(self) -> TableStats:
337
- """Get the statistics of this table."""
338
- stats_tuple = self.tx._rpc.api.get_table_stats(
339
- bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid,
340
- imports_table_stats=self._imports_table)
341
- return TableStats(**stats_tuple._asdict())
342
-
343
- def _get_row_estimate(self, columns: List[str], predicate: ibis.expr.types.BooleanColumn, arrow_schema: pa.Schema):
368
+ def _get_row_estimate(self,
369
+ columns: list[str],
370
+ predicate: ibis.expr.types.BooleanColumn,
371
+ arrow_schema: pa.Schema):
344
372
  query_data_request = _internal.build_query_data_request(
345
373
  schema=arrow_schema,
346
374
  predicate=predicate,
347
375
  field_names=columns)
348
- response = self.tx._rpc.api.query_data(
349
- bucket=self.bucket.name,
350
- schema=self.schema.name,
351
- table=self.name,
376
+ response = self._tx._rpc.api.query_data(
377
+ bucket=self.ref.bucket,
378
+ schema=self.ref.schema,
379
+ table=self.ref.table,
352
380
  params=query_data_request.serialized,
353
381
  split=(0xffffffff - 3, 1, 1),
354
- txid=self.tx.txid)
382
+ txid=self._tx.active_txid)
355
383
  batch = _internal.read_first_batch(response.raw)
356
384
  return batch.num_rows * 2**16 if batch is not None else 0
357
385
 
358
- def select(self, columns: Optional[List[str]] = None,
359
- predicate: Union[ibis.expr.types.BooleanColumn, ibis.common.deferred.Deferred] = None,
360
- config: Optional[QueryConfig] = None,
361
- *,
362
- internal_row_id: bool = False,
363
- limit_rows: Optional[int] = None) -> pa.RecordBatchReader:
364
- """Execute a query over this table.
365
-
366
- To read a subset of the columns, specify their names via `columns` argument. Otherwise, all columns will be read.
367
-
368
- In order to apply a filter, a predicate can be specified. See https://github.com/vast-data/vastdb_sdk/blob/main/README.md#filters-and-projections for more details.
369
-
370
- Query-execution configuration options can be specified via the optional `config` argument.
371
- """
372
- config = copy.deepcopy(config) if config else QueryConfig()
373
-
374
- if limit_rows:
375
- config.limit_rows_per_sub_split = limit_rows
386
+ def _select_prepare(self,
387
+ config: QueryConfig,
388
+ columns: Optional[list[str]] = None,
389
+ predicate: Union[ibis.expr.types.BooleanColumn,
390
+ ibis.common.deferred.Deferred] = None,
391
+ *,
392
+ internal_row_id: bool = False,
393
+ limit_rows: Optional[int] = None) -> tuple[SplitWorkerConfig, _internal.QueryDataRequest, tuple[str, ...]]:
376
394
 
377
395
  if config.data_endpoints is None:
378
- endpoints = tuple([self.tx._rpc.api.url])
396
+ endpoints = tuple([self._tx._rpc.api.url])
379
397
  else:
380
398
  endpoints = tuple(config.data_endpoints)
381
399
  log.debug("endpoints: %s", endpoints)
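
The estimate above is coarse: the probe response's row count is scaled by 2**16, and `_select_prepare()` below divides the result by `config.rows_per_split` to pick the number of splits. For example:

probe_rows = 3                                     # rows returned by the estimation query
estimated_rows = probe_rows * 2**16                # 196,608
num_splits = max(1, estimated_rows // 4_000_000)   # -> 1 (the rows_per_split value here is illustrative)
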
@@ -385,7 +403,7 @@ class Table:
385
403
 
386
404
  query_schema = self.arrow_schema
387
405
  if internal_row_id:
388
- queried_fields = [INTERNAL_ROW_ID_SORTED_FIELD if self.sorted_table else INTERNAL_ROW_ID_FIELD]
406
+ queried_fields = [self._internal_rowid_field]
389
407
  queried_fields.extend(column for column in self.arrow_schema)
390
408
  query_schema = pa.schema(queried_fields)
391
409
  columns.append(INTERNAL_ROW_ID)
@@ -393,43 +411,129 @@ class Table:
393
411
  if predicate is True:
394
412
  predicate = None
395
413
  if predicate is False:
396
- response_schema = _internal.get_response_schema(schema=query_schema, field_names=columns)
397
- return pa.RecordBatchReader.from_batches(response_schema, [])
414
+ raise _EmptyResultException(
415
+ response_schema=_internal.get_response_schema(schema=query_schema, field_names=columns))
398
416
 
399
417
  if isinstance(predicate, ibis.common.deferred.Deferred):
400
- predicate = predicate.resolve(self._ibis_table) # may raise if the predicate is invalid (e.g. wrong types / missing column)
418
+ # may raise if the predicate is invalid (e.g. wrong types / missing column)
419
+ predicate = predicate.resolve(self._metadata.ibis_table)
401
420
 
402
- if config.num_splits is None:
421
+ if config.num_splits:
422
+ num_splits = config.num_splits
423
+ else:
403
424
  num_rows = 0
404
- if self.sorted_table:
405
- num_rows = self._get_row_estimate(columns, predicate, query_schema)
425
+ if self._is_sorted_table:
426
+ num_rows = self._get_row_estimate(
427
+ columns, predicate, query_schema)
406
428
  log.debug(f'sorted estimate: {num_rows}')
429
+
407
430
  if num_rows == 0:
408
- stats = self.get_stats()
409
- num_rows = stats.num_rows
431
+ if self.stats is None:
432
+ raise AssertionError("Select requires either config.num_splits or loaded stats.")
433
+
434
+ num_rows = self.stats.num_rows
435
+
436
+ num_splits = max(1, num_rows // config.rows_per_split)
410
437
 
411
- config.num_splits = max(1, num_rows // config.rows_per_split)
412
438
  log.debug("config: %s", config)
413
439
 
414
440
  if config.semi_sorted_projection_name:
415
- self.tx._rpc.features.check_enforce_semisorted_projection()
441
+ self._tx._rpc.features.check_enforce_semisorted_projection()
416
442
 
417
443
  query_data_request = _internal.build_query_data_request(
418
444
  schema=query_schema,
419
445
  predicate=predicate,
420
446
  field_names=columns)
421
447
  if len(query_data_request.serialized) > util.MAX_QUERY_DATA_REQUEST_SIZE:
422
- raise errors.TooLargeRequest(f"{len(query_data_request.serialized)} bytes")
448
+ raise errors.TooLargeRequest(
449
+ f"{len(query_data_request.serialized)} bytes")
450
+
451
+ split_config = SplitWorkerConfig(
452
+ num_splits=num_splits,
453
+ num_sub_splits=config.num_sub_splits,
454
+ num_row_groups_per_sub_split=config.num_row_groups_per_sub_split,
455
+ limit_rows_per_sub_split=limit_rows or config.limit_rows_per_sub_split,
456
+ use_semi_sorted_projections=config.use_semi_sorted_projections,
457
+ queue_priority=config.queue_priority,
458
+ semi_sorted_projection_name=config.semi_sorted_projection_name)
459
+
460
+ return split_config, query_data_request, endpoints
461
+
462
+ def select_splits(self, columns: Optional[list[str]] = None,
463
+ predicate: Union[ibis.expr.types.BooleanColumn,
464
+ ibis.common.deferred.Deferred] = None,
465
+ config: Optional[QueryConfig] = None,
466
+ *,
467
+ internal_row_id: bool = False,
468
+ limit_rows: Optional[int] = None) -> list[pa.RecordBatchReader]:
469
+ """Return pa.RecordBatchReader for each split."""
470
+ config = config or QueryConfig()
471
+
472
+ try:
473
+ split_config, query_data_request, endpoints = self._select_prepare(
474
+ config, columns, predicate, internal_row_id=internal_row_id, limit_rows=limit_rows)
475
+ except _EmptyResultException:
476
+ return []
477
+
478
+ endpoint_api = itertools.cycle([
479
+ self._tx._rpc.api.with_endpoint(endpoint)
480
+ for endpoint in endpoints])
481
+
482
+ return [
483
+ SplitWorker(
484
+ api=next(endpoint_api),
485
+ query_data_request=query_data_request,
486
+ bucket_name=self.ref.bucket,
487
+ schema_name=self.ref.schema,
488
+ table_name=self.ref.table,
489
+ txid=self._tx.active_txid,
490
+ query_imports_table=self._metadata.is_imports_table,
491
+ split_id=split,
492
+ config=split_config
493
+ ).split_record_batch_reader()
494
+ for split in range(split_config.num_splits)
495
+ ]
496
+
497
+ def select(self, columns: Optional[list[str]] = None,
498
+ predicate: Union[ibis.expr.types.BooleanColumn,
499
+ ibis.common.deferred.Deferred] = None,
500
+ config: Optional[QueryConfig] = None,
501
+ *,
502
+ internal_row_id: bool = False,
503
+ limit_rows: Optional[int] = None) -> pa.RecordBatchReader:
504
+ """Execute a query over this table.
505
+
506
+ To read a subset of the columns, specify their names via `columns` argument. Otherwise, all columns will be read.
507
+
508
+ In order to apply a filter, a predicate can be specified. See https://github.com/vast-data/vastdb_sdk/blob/main/README.md#filters-and-projections for more details.
509
+
510
+ Query-execution configuration options can be specified via the optional `config` argument.
511
+ """
512
+ config = config or QueryConfig()
513
+
514
+ try:
515
+ split_config, query_data_request, endpoints = self._select_prepare(config,
516
+ columns,
517
+ predicate,
518
+ internal_row_id=internal_row_id,
519
+ limit_rows=limit_rows)
520
+ except _EmptyResultException as e:
521
+ return pa.RecordBatchReader.from_batches(e.response_schema, [])
423
522
 
424
- splits_queue: queue.Queue[int] = queue.Queue()
523
+ splits_queue: Queue[int] = Queue()
425
524
 
426
- for split in range(config.num_splits):
525
+ for split in range(split_config.num_splits):
427
526
  splits_queue.put(split)
428
527
 
429
528
  # this queue shouldn't be large it is merely a pipe through which the results
430
529
  # are sent to the main thread. Most of the pages actually held in the
431
530
  # threads that fetch the pages.
432
- record_batches_queue: queue.Queue[pa.RecordBatch] = queue.Queue(maxsize=2)
531
+ # also, this queue should be at least the amount of workers. otherwise a deadlock may arise.
532
+ # each worker must be able to send the final None message without blocking.
533
+ log.warn("Using the number of endpoints as a heuristic for concurrency.")
534
+ max_workers = len(endpoints)
535
+ record_batches_queue: Queue[pa.RecordBatch] = Queue(
536
+ maxsize=max_workers)
433
537
 
434
538
  stop_event = Event()
435
539
 
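
The hunk above factors the old `select()` body into `_select_prepare()` plus two public entry points: the streaming `select()` (same signature as 1.4.0) and the new `select_splits()`, which returns one `pa.RecordBatchReader` per split so callers can drain the splits themselves, for example in parallel. A sketch under the same session/transaction assumptions as above, for a table assumed to have `id` and `price` columns:

from concurrent.futures import ThreadPoolExecutor

with session.transaction() as tx:
    t = tx.bucket("my-bucket").schema("my-schema").table("my-table")

    # Streaming select(), as in 1.4.0: a single reader over all splits.
    reader = t.select(columns=["id", "price"], predicate=t["price"] > 100, limit_rows=1_000)
    print(reader.read_all().num_rows)

    # New in 2.0.0: one reader per split; here each split is drained in its own thread.
    readers = t.select_splits(columns=["id", "price"], predicate=t["price"] > 100)
    with ThreadPoolExecutor() as pool:
        split_tables = list(pool.map(lambda r: r.read_all(), readers))
    print(sum(tbl.num_rows for tbl in split_tables))
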
@@ -442,8 +546,8 @@ class Table:
442
546
 
443
547
  def single_endpoint_worker(endpoint: str):
444
548
  try:
445
- with self.tx._rpc.api.with_endpoint(endpoint) as host_api:
446
- backoff_decorator = self.tx._rpc.api._backoff_decorator
549
+ with self._tx._rpc.api.with_endpoint(endpoint) as host_api:
550
+ backoff_decorator = self._tx._rpc.api._backoff_decorator
447
551
  while True:
448
552
  check_stop()
449
553
  try:
@@ -452,13 +556,20 @@ class Table:
452
556
  log.debug("splits queue is empty")
453
557
  break
454
558
 
455
- split_state = SelectSplitState(query_data_request=query_data_request,
456
- table=self,
457
- split_id=split,
458
- config=config)
459
-
460
- process_with_retries = backoff_decorator(split_state.process_split)
461
- process_with_retries(host_api, record_batches_queue, check_stop)
559
+ split_state = SplitWorker(
560
+ api=host_api,
561
+ query_data_request=query_data_request,
562
+ bucket_name=self.ref.bucket,
563
+ schema_name=self.ref.schema,
564
+ table_name=self.ref.table,
565
+ txid=self._tx.active_txid,
566
+ query_imports_table=self._metadata.is_imports_table,
567
+ split_id=split,
568
+ config=split_config)
569
+
570
+ process_with_retries = backoff_decorator(
571
+ split_state._process_split)
572
+ process_with_retries(record_batches_queue, check_stop)
462
573
 
463
574
  except StoppedException:
464
575
  log.debug("stop signal.", exc_info=True)
@@ -468,10 +579,11 @@ class Table:
468
579
  log.debug("exiting")
469
580
  record_batches_queue.put(None)
470
581
 
471
- def batches_iterator():
472
- def propagate_first_exception(futures: List[concurrent.futures.Future], block=False):
473
- done, not_done = concurrent.futures.wait(futures, None if block else 0, concurrent.futures.FIRST_EXCEPTION)
474
- if self.tx.txid is None:
582
+ def batches_iterator() -> Iterable[pa.RecordBatch]:
583
+ def propagate_first_exception(futures: set[concurrent.futures.Future], block=False) -> set[concurrent.futures.Future]:
584
+ done, not_done = concurrent.futures.wait(
585
+ futures, None if block else 0, concurrent.futures.FIRST_EXCEPTION)
586
+ if not self._tx.is_active:
475
587
  raise errors.MissingTransaction()
476
588
  for future in done:
477
589
  future.result()
@@ -483,12 +595,14 @@ class Table:
483
595
  threads_prefix = threads_prefix + "-" + config.query_id
484
596
 
485
597
  total_num_rows = limit_rows if limit_rows else sys.maxsize
486
- with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp: # TODO: concurrency == enpoints is just a heuristic
487
- futures = [tp.submit(single_endpoint_worker, endpoint) for endpoint in endpoints[:config.num_splits]]
598
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix=threads_prefix) as tp:
599
+ futures: set[concurrent.futures.Future] = {tp.submit(single_endpoint_worker, endpoint)
600
+ for endpoint in endpoints[:config.num_splits]}
488
601
  tasks_running = len(futures)
489
602
  try:
490
603
  while tasks_running > 0:
491
- futures = propagate_first_exception(futures, block=False)
604
+ futures = propagate_first_exception(
605
+ futures, block=False)
492
606
 
493
607
  batch = record_batches_queue.get()
494
608
  if batch is not None:
@@ -497,12 +611,14 @@ class Table:
497
611
  total_num_rows -= batch.num_rows
498
612
  else:
499
613
  yield batch.slice(length=total_num_rows)
500
- log.info("reached limit rows per query: %d - stop query", limit_rows)
614
+ log.info(
615
+ "reached limit rows per query: %d - stop query", limit_rows)
501
616
  stop_event.set()
502
617
  break
503
618
  else:
504
619
  tasks_running -= 1
505
- log.debug("one worker thread finished, remaining: %d", tasks_running)
620
+ log.debug(
621
+ "one worker thread finished, remaining: %d", tasks_running)
506
622
 
507
623
  # all host threads ended - wait for all futures to complete
508
624
  propagate_first_exception(futures, block=True)
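
`batches_iterator()` above enforces `limit_rows` by counting down a row budget and slicing the final batch. A small standalone illustration of that slicing:

import pyarrow as pa

limit_rows = 100
batch = pa.record_batch([pa.array(range(60))], names=["n"])   # pretend two 60-row batches arrive

total_num_rows = limit_rows
out = []
for b in [batch, batch]:
    if b.num_rows < total_num_rows:
        out.append(b)                                # first batch: budget drops 100 -> 40
        total_num_rows -= b.num_rows
    else:
        out.append(b.slice(length=total_num_rows))   # second batch is cut down to 40 rows
        break
assert sum(b.num_rows for b in out) == limit_rows
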
@@ -514,7 +630,7 @@ class Table:
514
630
 
515
631
  return pa.RecordBatchReader.from_batches(query_data_request.response_schema, batches_iterator())
516
632
 
517
- def insert_in_column_batches(self, rows: pa.RecordBatch):
633
+ def insert_in_column_batches(self, rows: pa.RecordBatch) -> pa.ChunkedArray:
518
634
  """Split the RecordBatch into max_columns that can be inserted in single RPC.
519
635
 
520
636
  Insert first MAX_COLUMN_IN_BATCH columns and get the row_ids. Then loop on the rest of the columns and
@@ -526,48 +642,59 @@ class Table:
526
642
 
527
643
  columns_names = [field.name for field in rows.schema]
528
644
  columns = list(rows.schema)
529
- arrays = [_combine_chunks(rows.column(i)) for i in range(len(rows.schema))]
645
+ arrays = [_combine_chunks(rows.column(i))
646
+ for i in range(len(rows.schema))]
530
647
  for start in range(MAX_COLUMN_IN_BATCH, len(rows.schema), MAX_COLUMN_IN_BATCH):
531
- end = start + MAX_COLUMN_IN_BATCH if start + MAX_COLUMN_IN_BATCH < len(rows.schema) else len(rows.schema)
648
+ end = start + MAX_COLUMN_IN_BATCH if start + \
649
+ MAX_COLUMN_IN_BATCH < len(rows.schema) else len(rows.schema)
532
650
  columns_name_chunk = columns_names[start:end]
533
651
  columns_chunks = columns[start:end]
534
652
  arrays_chunks = arrays[start:end]
535
- columns_chunks.append(INTERNAL_ROW_ID_SORTED_FIELD if self.sorted_table else INTERNAL_ROW_ID_FIELD)
653
+ columns_chunks.append(self._internal_rowid_field)
536
654
  arrays_chunks.append(row_ids.to_pylist())
537
- column_record_batch = pa.RecordBatch.from_arrays(arrays_chunks, schema=pa.schema(columns_chunks))
655
+ column_record_batch = pa.RecordBatch.from_arrays(
656
+ arrays_chunks, schema=pa.schema(columns_chunks))
538
657
  self.update(rows=column_record_batch, columns=columns_name_chunk)
539
658
  return row_ids
540
659
 
541
- def insert(self, rows: Union[pa.RecordBatch, pa.Table], by_columns: bool = False):
660
+ def insert(self,
661
+ rows: Union[pa.RecordBatch, pa.Table],
662
+ by_columns: bool = False) -> pa.ChunkedArray:
542
663
  """Insert a RecordBatch into this table."""
543
- if self._imports_table:
544
- raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
664
+ self._assert_not_imports_table()
665
+
545
666
  if 0 == rows.num_rows:
546
- log.debug("Ignoring empty insert into %s", self.name)
547
- return pa.chunked_array([], type=(INTERNAL_ROW_ID_SORTED_FIELD if self.sorted_table else INTERNAL_ROW_ID_FIELD).type)
667
+ log.debug("Ignoring empty insert into %s", self.ref)
668
+ return pa.chunked_array([], type=self._internal_rowid_field.type)
548
669
 
549
670
  if by_columns:
550
- self.tx._rpc.features.check_return_row_ids()
671
+ self._tx._rpc.features.check_return_row_ids()
551
672
  return self.insert_in_column_batches(rows)
552
673
 
553
674
  try:
554
675
  row_ids = []
555
- serialized_slices = util.iter_serialized_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
676
+ serialized_slices = util.iter_serialized_slices(
677
+ rows, MAX_INSERT_ROWS_PER_PATCH)
556
678
  for slice in serialized_slices:
557
- res = self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
558
- txid=self.tx.txid)
679
+ res = self._tx._rpc.api.insert_rows(self.ref.bucket,
680
+ self.ref.schema,
681
+ self.ref.table,
682
+ record_batch=slice,
683
+ txid=self._tx.active_txid)
559
684
  (batch,) = pa.RecordBatchStreamReader(res.content)
560
685
  row_ids.append(batch[INTERNAL_ROW_ID])
561
686
  try:
562
- self.tx._rpc.features.check_return_row_ids()
687
+ self._tx._rpc.features.check_return_row_ids()
563
688
  except errors.NotSupportedVersion:
564
689
  return # type: ignore
565
- return pa.chunked_array(row_ids, type=(INTERNAL_ROW_ID_SORTED_FIELD if self.sorted_table else INTERNAL_ROW_ID_FIELD).type)
690
+ return pa.chunked_array(row_ids, type=self._internal_rowid_field.type)
566
691
  except errors.TooWideRow:
567
- self.tx._rpc.features.check_return_row_ids()
692
+ self._tx._rpc.features.check_return_row_ids()
568
693
  return self.insert_in_column_batches(rows)
569
694
 
570
- def update(self, rows: Union[pa.RecordBatch, pa.Table], columns: Optional[List[str]] = None) -> None:
695
+ def update(self,
696
+ rows: Union[pa.RecordBatch, pa.Table],
697
+ columns: Optional[list[str]] = None) -> None:
571
698
  """Update a subset of cells in this table.
572
699
 
573
700
  Row IDs are specified using a special field (named "$row_id" of uint64 type) - this function assume that this
@@ -575,169 +702,275 @@ class Table:
575
702
 
576
703
  A subset of columns to be updated can be specified via the `columns` argument.
577
704
  """
578
- if self._imports_table:
579
- raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
705
+ self._assert_not_imports_table()
706
+
580
707
  try:
581
708
  rows_chunk = rows[INTERNAL_ROW_ID]
582
709
  except KeyError:
583
710
  raise errors.MissingRowIdColumn
584
711
 
585
712
  if columns is None:
586
- columns = [name for name in rows.schema.names if name != INTERNAL_ROW_ID]
713
+ columns = [
714
+ name for name in rows.schema.names if name != INTERNAL_ROW_ID]
587
715
 
588
- update_fields = [INTERNAL_ROW_ID_SORTED_FIELD if self.sorted_table else INTERNAL_ROW_ID_FIELD]
716
+ update_fields = [self._internal_rowid_field]
589
717
  update_values = [_combine_chunks(rows_chunk)]
590
718
  for col in columns:
591
719
  update_fields.append(rows.field(col))
592
720
  update_values.append(_combine_chunks(rows[col]))
593
721
 
594
- update_rows_rb = pa.record_batch(schema=pa.schema(update_fields), data=update_values)
722
+ update_rows_rb = pa.record_batch(
723
+ schema=pa.schema(update_fields), data=update_values)
595
724
 
596
- update_rows_rb = util.sort_record_batch_if_needed(update_rows_rb, INTERNAL_ROW_ID)
725
+ update_rows_rb = util.sort_record_batch_if_needed(
726
+ update_rows_rb, INTERNAL_ROW_ID)
597
727
 
598
- serialized_slices = util.iter_serialized_slices(update_rows_rb, MAX_ROWS_PER_BATCH)
728
+ serialized_slices = util.iter_serialized_slices(
729
+ update_rows_rb, MAX_ROWS_PER_BATCH)
599
730
  for slice in serialized_slices:
600
- self.tx._rpc.api.update_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
601
- txid=self.tx.txid)
731
+ self._tx._rpc.api.update_rows(self.ref.bucket, self.ref.schema, self.ref.table, record_batch=slice,
732
+ txid=self._tx.active_txid)
602
733
 
603
734
  def delete(self, rows: Union[pa.RecordBatch, pa.Table]) -> None:
604
735
  """Delete a subset of rows in this table.
605
736
 
606
737
  Row IDs are specified using a special field (named "$row_id" of uint64 type).
607
738
  """
608
- if self._imports_table:
609
- raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
739
+ self._assert_not_imports_table()
740
+
610
741
  try:
611
742
  rows_chunk = rows[INTERNAL_ROW_ID]
612
743
  except KeyError:
613
744
  raise errors.MissingRowIdColumn
614
- delete_rows_rb = pa.record_batch(schema=pa.schema([INTERNAL_ROW_ID_SORTED_FIELD if self.sorted_table else INTERNAL_ROW_ID_FIELD]),
745
+ delete_rows_rb = pa.record_batch(schema=pa.schema([self._internal_rowid_field]),
615
746
  data=[_combine_chunks(rows_chunk)])
616
747
 
617
- delete_rows_rb = util.sort_record_batch_if_needed(delete_rows_rb, INTERNAL_ROW_ID)
748
+ delete_rows_rb = util.sort_record_batch_if_needed(
749
+ delete_rows_rb, INTERNAL_ROW_ID)
618
750
 
619
- serialized_slices = util.iter_serialized_slices(delete_rows_rb, MAX_ROWS_PER_BATCH)
751
+ serialized_slices = util.iter_serialized_slices(
752
+ delete_rows_rb, MAX_ROWS_PER_BATCH)
620
753
  for slice in serialized_slices:
621
- self.tx._rpc.api.delete_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
622
- txid=self.tx.txid, delete_from_imports_table=self._imports_table)
754
+ self._tx._rpc.api.delete_rows(self.ref.bucket,
755
+ self.ref.schema,
756
+ self.ref.table,
757
+ record_batch=slice,
758
+ txid=self._tx.active_txid,
759
+ delete_from_imports_table=self._metadata.is_imports_table)
760
+
761
+ def imports_table(self) -> Optional[ITable]:
762
+ """Get the imports table of this table."""
763
+ imports_table_metadata = self.imports_table_metadata()
764
+ return TableInTransaction(metadata=imports_table_metadata,
765
+ tx=self._tx)
766
+
767
+ def imports_table_metadata(self) -> TableMetadata:
768
+ """Get TableMetadata for import table."""
769
+ self._tx._rpc.features.check_imports_table()
770
+
771
+ return TableMetadata(ref=self.ref,
772
+ table_type=TableType.TableImports)
773
+
774
+ def __getitem__(self, col_name: str) -> ibis.Column:
775
+ """Allow constructing ibis-like column expressions from this table.
776
+
777
+ It is useful for constructing expressions for predicate pushdown in `Table.select()` method.
778
+ """
779
+ return self._metadata.ibis_table[col_name]
780
+
781
+ def sorting_done(self) -> bool:
782
+ """Sorting done indicator for the table. Always False for unsorted tables."""
783
+ if not self._is_sorted_table:
784
+ return False
785
+ raw_sorting_score = self._tx._rpc.api.raw_sorting_score(self.ref.bucket,
786
+ self.ref.schema,
787
+ self._tx.active_txid,
788
+ self.ref.table)
789
+ return bool(raw_sorting_score >> SORTING_SCORE_BITS)
790
+
791
+ def sorting_score(self) -> int:
792
+ """Sorting score for the table. Always 0 for unsorted tables."""
793
+ if not self._is_sorted_table:
794
+ return 0
795
+ raw_sorting_score = self._tx._rpc.api.raw_sorting_score(self.ref.bucket,
796
+ self.ref.schema,
797
+ self._tx.active_txid,
798
+ self.ref.table)
799
+ return raw_sorting_score & ((1 << SORTING_SCORE_BITS) - 1)
800
+
801
+ @property
802
+ def _is_sorted_table(self) -> bool:
803
+ return self._metadata.table_type is TableType.Elysium
804
+
805
+
806
+ class Table(TableInTransaction):
807
+ """Vast Interactive Table."""
808
+
809
+ _handle: int
810
+
811
+ def __init__(self,
812
+ metadata: TableMetadata,
813
+ handle: int,
814
+ tx: "Transaction"):
815
+ """Vast Interactive Table."""
816
+ super().__init__(metadata, tx)
817
+ self._metadata.load_schema(tx)
818
+
819
+ self._handle = handle
820
+
821
+ @property
822
+ def handle(self) -> int:
823
+ """Table Handle."""
824
+ return self._handle
825
+
826
+ @property
827
+ def tx(self):
828
+ """Return transaction."""
829
+ return self._tx
830
+
831
+ @property
832
+ def stats(self) -> TableStats:
833
+ """Fetch table's statistics from server."""
834
+ self.reload_stats()
835
+ assert self._metadata.stats is not None
836
+ return self._metadata.stats
837
+
838
+ def columns(self) -> pa.Schema:
839
+ """Return columns' metadata."""
840
+ self.reload_schema()
841
+ return self._metadata.arrow_schema
842
+
843
+ def sorted_columns(self) -> list:
844
+ """Return sorted columns' metadata."""
845
+ try:
846
+ self.reload_sorted_columns()
847
+ except Exception:
848
+ pass
849
+
850
+ return self._metadata.sorted_columns
851
+
852
+ def get_stats(self) -> TableStats:
853
+ """Get the statistics of this table."""
854
+ return self.stats
855
+
856
+ def imports_table(self) -> Optional["Table"]:
857
+ """Get the imports table of this table."""
858
+ imports_table_metadata = self.imports_table_metadata()
859
+ imports_table_metadata.load(self.tx)
860
+ return Table(handle=self.handle,
861
+ metadata=imports_table_metadata,
862
+ tx=self.tx)
863
+
864
+ @property
865
+ def sorted_table(self) -> bool:
866
+ """Is table a sorted table."""
867
+ return self._is_sorted_table
868
+
869
+ def __getitem__(self, col_name: str):
870
+ """Allow constructing ibis-like column expressions from this table.
871
+
872
+ It is useful for constructing expressions for predicate pushdown in `Table.select()` method.
873
+ """
874
+ return self._metadata.ibis_table[col_name]
623
875
 
624
876
  def drop(self) -> None:
625
877
  """Drop this table."""
626
- self.tx._rpc.api.drop_table(self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, remove_imports_table=self._imports_table)
627
- log.info("Dropped table: %s", self.name)
878
+ self._tx._rpc.api.drop_table(self.ref.bucket,
879
+ self.ref.schema,
880
+ self.ref.table,
881
+ txid=self._tx.active_txid,
882
+ remove_imports_table=self._metadata.is_imports_table)
883
+ log.info("Dropped table: %s", self.ref.table)
628
884
 
629
885
  def rename(self, new_name: str) -> None:
630
886
  """Rename this table."""
631
- if self._imports_table:
632
- raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
633
- self.tx._rpc.api.alter_table(
634
- self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, new_name=new_name)
635
- log.info("Renamed table from %s to %s ", self.name, new_name)
636
- self.name = new_name
887
+ self._assert_not_imports_table()
637
888
 
638
- def add_sorting_key(self, sorting_key: list) -> None:
889
+ self._tx._rpc.api.alter_table(self.ref.bucket,
890
+ self.ref.schema,
891
+ self.ref.table,
892
+ txid=self._tx.active_txid,
893
+ new_name=new_name)
894
+ log.info("Renamed table from %s to %s ", self.ref.table, new_name)
895
+ self._metadata.rename_table(new_name)
896
+
897
+ def add_sorting_key(self, sorting_key: list[int]) -> None:
639
898
  """Add a sorting key to a table that doesn't have any."""
640
- self.tx._rpc.features.check_elysium()
641
- self.tx._rpc.api.alter_table(
642
- self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, sorting_key=sorting_key)
643
- log.info("Enabled Elysium for table %s with sorting key %s ", self.name, str(sorting_key))
899
+ self._tx._rpc.features.check_elysium()
900
+ self._tx._rpc.api.alter_table(self.ref.bucket,
901
+ self.ref.schema,
902
+ self.ref.table,
903
+ txid=self._tx.active_txid,
904
+ sorting_key=sorting_key)
905
+ log.info("Enabled Elysium for table %s with sorting key %s ",
906
+ self.ref.table, str(sorting_key))
644
907
 
645
908
  def add_column(self, new_column: pa.Schema) -> None:
646
909
  """Add a new column."""
647
- if self._imports_table:
648
- raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
649
- self.validate_ibis_support_schema(new_column)
650
- self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
910
+ self._assert_not_imports_table()
911
+
912
+ validate_ibis_support_schema(new_column)
913
+ self._tx._rpc.api.add_columns(
914
+ self.ref.bucket, self.ref.schema, self.ref.table, new_column, txid=self._tx.active_txid)
651
915
  log.info("Added column(s): %s", new_column)
652
- self.arrow_schema = self.columns()
916
+ self._metadata.load_schema(self._tx)
653
917
 
654
918
  def drop_column(self, column_to_drop: pa.Schema) -> None:
655
919
  """Drop an existing column."""
656
- if self._imports_table:
657
- raise errors.NotSupported(self.bucket.name, self.schema.name, self.name)
658
- if self._imports_table:
659
- raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
660
- self.tx._rpc.api.drop_columns(self.bucket.name, self.schema.name, self.name, column_to_drop, txid=self.tx.txid)
920
+ self._tx._rpc.api.drop_columns(self.ref.bucket,
921
+ self.ref.schema,
922
+ self.ref.table, column_to_drop, txid=self._tx.active_txid)
661
923
  log.info("Dropped column(s): %s", column_to_drop)
662
- self.arrow_schema = self.columns()
924
+ self._metadata.load_schema(self._tx)
663
925
 
664
926
  def rename_column(self, current_column_name: str, new_column_name: str) -> None:
665
927
  """Rename an existing column."""
666
- if self._imports_table:
667
- raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
668
- self.tx._rpc.api.alter_column(self.bucket.name, self.schema.name, self.name, name=current_column_name,
669
- new_name=new_column_name, txid=self.tx.txid)
670
- log.info("Renamed column: %s to %s", current_column_name, new_column_name)
671
- self.arrow_schema = self.columns()
672
-
673
- def create_projection(self, projection_name: str, sorted_columns: List[str], unsorted_columns: List[str]) -> "Projection":
928
+ self._assert_not_imports_table()
929
+
930
+ self._tx._rpc.api.alter_column(self.ref.bucket,
931
+ self.ref.schema,
932
+ self.ref.table, name=current_column_name,
933
+ new_name=new_column_name, txid=self._tx.active_txid)
934
+ log.info("Renamed column: %s to %s",
935
+ current_column_name, new_column_name)
936
+ self._metadata.load_schema(self._tx)
937
+
938
+ def create_projection(self, projection_name: str, sorted_columns: list[str], unsorted_columns: list[str]) -> "Projection":
674
939
  """Create a new semi-sorted projection."""
675
- if self._imports_table:
676
- raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
677
- columns = [(sorted_column, "Sorted") for sorted_column in sorted_columns] + [(unsorted_column, "Unorted") for unsorted_column in unsorted_columns]
678
- self.tx._rpc.api.create_projection(self.bucket.name, self.schema.name, self.name, projection_name, columns=columns, txid=self.tx.txid)
940
+ self._assert_not_imports_table()
941
+
942
+ columns = [(sorted_column, "Sorted") for sorted_column in sorted_columns] + \
943
+ [(unsorted_column, "Unorted")
944
+ for unsorted_column in unsorted_columns]
945
+ self._tx._rpc.api.create_projection(self.ref.bucket,
946
+ self.ref.schema,
947
+ self.ref.table, projection_name, columns=columns, txid=self._tx.active_txid)
679
948
  log.info("Created projection: %s", projection_name)
680
949
  return self.projection(projection_name)
681
950
 
682
- def create_imports_table(self, fail_if_exists=True) -> "Table":
951
+ def create_imports_table(self, fail_if_exists=True) -> ITable:
683
952
  """Create imports table."""
684
- self.tx._rpc.features.check_imports_table()
953
+ self._tx._rpc.features.check_imports_table()
685
954
  empty_schema = pa.schema([])
686
- self.tx._rpc.api.create_table(self.bucket.name, self.schema.name, self.name, empty_schema, txid=self.tx.txid,
687
- create_imports_table=True)
955
+ self._tx._rpc.api.create_table(self.ref.bucket,
956
+ self.ref.schema,
957
+ self.ref.table,
958
+ empty_schema,
959
+ txid=self._tx.active_txid,
960
+ create_imports_table=True)
688
961
  log.info("Created imports table for table: %s", self.name)
689
962
  return self.imports_table() # type: ignore[return-value]
690
963
 
691
- def imports_table(self) -> Optional["Table"]:
692
- """Get the imports table of this table."""
693
- self.tx._rpc.features.check_imports_table()
694
- return Table(name=self.name, schema=self.schema, handle=int(self.handle), _imports_table=True, sorted_table=self.sorted_table)
695
-
696
- def __getitem__(self, col_name: str):
697
- """Allow constructing ibis-like column expressions from this table.
698
-
699
- It is useful for constructing expressions for predicate pushdown in `Table.select()` method.
700
- """
701
- return self._ibis_table[col_name]
702
-
703
- def sorting_done(self) -> int:
704
- """Sorting done indicator for the table. Always False for unsorted tables."""
705
- if not self.sorted_table:
706
- return False
707
- raw_sorting_score = self.tx._rpc.api.raw_sorting_score(self.schema.bucket.name, self.schema.name, self.schema.tx.txid, self.name)
708
- return bool(raw_sorting_score >> SORTING_SCORE_BITS)
709
-
710
- def sorting_score(self) -> int:
711
- """Sorting score for the table. Always 0 for unsorted tables."""
712
- if not self.sorted_table:
713
- return 0
714
- raw_sorting_score = self.tx._rpc.api.raw_sorting_score(self.schema.bucket.name, self.schema.name, self.schema.tx.txid, self.name)
715
- return raw_sorting_score & ((1 << SORTING_SCORE_BITS) - 1)
716
-
717
964
 
718
965
  @dataclass
719
966
  class Projection:
720
967
  """VAST semi-sorted projection."""
721
968
 
722
969
  name: str
723
- table: Table
724
- handle: int
970
+ table_metadata: TableMetadata
725
971
  stats: TableStats
726
-
727
- @property
728
- def bucket(self):
729
- """Return bucket."""
730
- return self.table.schema.bucket
731
-
732
- @property
733
- def schema(self):
734
- """Return schema."""
735
- return self.table.schema
736
-
737
- @property
738
- def tx(self):
739
- """Return transaction."""
740
- return self.table.schema.tx
972
+ handle: int
973
+ tx: "Transaction"
741
974
 
742
975
  def columns(self) -> pa.Schema:
743
976
  """Return this projections' columns as an Arrow schema."""
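
The hunk above routes `insert()`, `update()` and `delete()` through `TableRef`, `active_txid` and the shared `_internal_rowid_field`, but the calling pattern is unchanged: fetch `$row_id` via `select(..., internal_row_id=True)` and hand it back. A round-trip sketch, under the same session assumptions as above and for a table assumed to have an integer `id` and a float `price` column:

import pyarrow as pa

with session.transaction() as tx:
    t = tx.bucket("my-bucket").schema("my-schema").table("my-table")

    # insert() returns the new row ids as a pa.ChunkedArray.
    rows = pa.record_batch([pa.array([1, 2]), pa.array([9.5, 3.0])], names=["id", "price"])
    row_ids = t.insert(rows)
    print(len(row_ids))

    # Read matching rows back together with the internal "$row_id" column.
    hits = t.select(columns=["price"], predicate=t["id"] == 1, internal_row_id=True).read_all()

    # Update the price of those rows, then delete them.
    t.update(rows=pa.table({"$row_id": hits["$row_id"],
                            "price": pa.array([10.0] * hits.num_rows)}))
    t.delete(rows=hits.select(["$row_id"]))
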
@@ -746,7 +979,12 @@ class Projection:
746
979
  while True:
747
980
  curr_columns, next_key, is_truncated, _count, _ = \
748
981
  self.tx._rpc.api.list_projection_columns(
749
- self.bucket.name, self.schema.name, self.table.name, self.name, txid=self.table.tx.txid, next_key=next_key)
982
+ self.table_metadata.ref.bucket,
983
+ self.table_metadata.ref.schema,
984
+ self.table_metadata.ref.table,
985
+ self.name,
986
+ txid=self.tx.active_txid,
987
+ next_key=next_key)
750
988
  if not curr_columns:
751
989
  break
752
990
  columns.extend(curr_columns)
@@ -757,26 +995,37 @@ class Projection:
757
995
 
758
996
  def rename(self, new_name: str) -> None:
759
997
  """Rename this projection."""
760
- self.tx._rpc.api.alter_projection(self.bucket.name, self.schema.name,
761
- self.table.name, self.name, txid=self.tx.txid, new_name=new_name)
998
+ self.tx._rpc.api.alter_projection(self.table_metadata.ref.bucket,
999
+ self.table_metadata.ref.schema,
1000
+ self.table_metadata.ref.table,
1001
+ self.name,
1002
+ txid=self.tx.active_txid,
1003
+ new_name=new_name)
762
1004
  log.info("Renamed projection from %s to %s ", self.name, new_name)
763
1005
  self.name = new_name
764
1006
 
765
1007
  def drop(self) -> None:
766
1008
  """Drop this projection."""
767
- self.tx._rpc.api.drop_projection(self.bucket.name, self.schema.name, self.table.name,
768
- self.name, txid=self.tx.txid)
1009
+ self.tx._rpc.api.drop_projection(self.table_metadata.ref.bucket,
1010
+ self.table_metadata.ref.schema,
1011
+ self.table_metadata.ref.table,
1012
+ self.name,
1013
+ txid=self.tx.active_txid)
769
1014
  log.info("Dropped projection: %s", self.name)
770
1015
 
771
1016
 
772
- def _parse_projection_info(projection_info, table: "Table"):
1017
+ def _parse_projection_info(projection_info, table_metadata: "TableMetadata", tx: "Transaction"):
773
1018
  log.info("Projection info %s", str(projection_info))
774
1019
  stats = TableStats(num_rows=projection_info.num_rows, size_in_bytes=projection_info.size_in_bytes,
775
1020
  sorting_score=0, write_amplification=0, acummulative_row_inserition_count=0)
776
- return Projection(name=projection_info.name, table=table, stats=stats, handle=int(projection_info.handle))
1021
+ return Projection(name=projection_info.name,
1022
+ table_metadata=table_metadata,
1023
+ stats=stats,
1024
+ handle=int(projection_info.handle),
1025
+ tx=tx)
777
1026
 
778
1027
 
779
- def _parse_bucket_and_object_names(path: str) -> Tuple[str, str]:
1028
+ def _parse_bucket_and_object_names(path: str) -> tuple[str, str]:
780
1029
  if not path.startswith('/'):
781
1030
  raise errors.InvalidArgument(f"Path {path} must start with a '/'")
782
1031
  components = path.split(os.path.sep)
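
`Projection` now carries the owning table's `TableMetadata` and the transaction directly instead of a back-reference to `Table`, but the table-side projection API is unchanged. A sketch of the projection round trip, under the same assumptions as above:

with session.transaction() as tx:
    t = tx.bucket("my-bucket").schema("my-schema").table("my-table")

    p = t.create_projection("by_id", sorted_columns=["id"], unsorted_columns=["price"])
    print(p.columns())        # the projection's columns as a pa.Schema
    print(p.stats.num_rows)   # TableStats snapshot parsed from list_projections

    p.rename("by_id_v2")
    print([proj.name for proj in t.projections()])
    p.drop()
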
@@ -797,3 +1046,9 @@ def _combine_chunks(col):
797
1046
  return col.combine_chunks()
798
1047
  else:
799
1048
  return col
1049
+
1050
+
1051
+ __all__ = ["ITable",
1052
+ "Table",
1053
+ "TableInTransaction",
1054
+ "Projection"]
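
As far as this diff shows, the import-level migration for user code is small: `TableStats` (together with `TableRef`, `TableMetadata` and `TableType`) now lives in `vastdb.table_metadata`, while `vastdb.table` exports the `ITable` interface, the two table classes and `Projection`. A sketch of the corresponding imports:

# vastdb 1.4.0
# from vastdb.table import Table, TableStats

# vastdb 2.0.0
from vastdb.table import ITable, Projection, Table, TableInTransaction
from vastdb.table_metadata import TableMetadata, TableRef, TableStats, TableType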