vastdb 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vastdb/schema.py CHANGED
@@ -6,11 +6,16 @@ It is possible to list and access VAST snapshots generated over a bucket.
 
  import logging
  from dataclasses import dataclass
+ from typing import TYPE_CHECKING, List, Optional
 
  import pyarrow as pa
 
  from . import bucket, errors, schema, table
 
+ if TYPE_CHECKING:
+ from .table import Table
+
+
  log = logging.getLogger(__name__)
 
 
@@ -26,7 +31,7 @@ class Schema:
  """VAST transaction used for this schema."""
  return self.bucket.tx
 
- def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "table.Table":
+ def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "Table":
  """Create a new table under this schema."""
  if current := self.table(table_name, fail_if_missing=False):
  if fail_if_exists:
@@ -35,9 +40,9 @@ class Schema:
  return current
  self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid)
  log.info("Created table: %s", table_name)
- return self.table(table_name)
+ return self.table(table_name) # type: ignore[return-value]
 
- def table(self, name: str, fail_if_missing=True) -> "table.Table":
+ def table(self, name: str, fail_if_missing=True) -> Optional["table.Table"]:
  """Get a specific table under this schema."""
  t = self.tables(table_name=name)
  if not t:
@@ -49,14 +54,14 @@ class Schema:
  log.debug("Found table: %s", t[0])
  return t[0]
 
- def tables(self, table_name=None) -> ["table.Table"]:
+ def tables(self, table_name=None) -> List["Table"]:
  """List all tables under this schema."""
  tables = []
  next_key = 0
  name_prefix = table_name if table_name else ""
  exact_match = bool(table_name)
  while True:
- bucket_name, schema_name, curr_tables, next_key, is_truncated, _ = \
+ _bucket_name, _schema_name, curr_tables, next_key, is_truncated, _ = \
  self.tx._rpc.api.list_tables(
  bucket=self.bucket.name, schema=self.name, next_key=next_key, txid=self.tx.txid,
  exact_match=exact_match, name_prefix=name_prefix, include_list_stats=exact_match)
@@ -82,4 +87,4 @@ class Schema:
 
  def _parse_table_info(table_info, schema: "schema.Schema"):
  stats = table.TableStats(num_rows=table_info.num_rows, size_in_bytes=table_info.size_in_bytes)
- return table.Table(name=table_info.name, schema=schema, handle=int(table_info.handle), stats=stats)
+ return table.Table(name=table_info.name, schema=schema, handle=int(table_info.handle), stats=stats, _imports_table=False)
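Note on the schema.py change: `Schema.table()` now returns `Optional["table.Table"]`, so callers that pass `fail_if_missing=False` must handle a `None` result. A minimal sketch of that pattern, assuming `schema` is a `vastdb.schema.Schema` obtained from an open transaction (the table name and column layout below are placeholders, not taken from this diff):

    import pyarrow as pa

    # Sketch only: `schema` is assumed to be an existing vastdb.schema.Schema object.
    tbl = schema.table("events", fail_if_missing=False)  # may return None as of 0.1.3
    if tbl is None:
        # create_table() is shown in the diff above; the column layout is illustrative.
        tbl = schema.create_table("events", pa.schema([("id", pa.int64())]))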
vastdb/session.py CHANGED
@@ -11,7 +11,20 @@ import os
 
  import boto3
 
- from . import internal_commands, transaction
+ from . import errors, internal_commands, transaction
+
+
+ class Features:
+ """VAST database features - check if server is already support a feature."""
+
+ def __init__(self, vast_version):
+ """Save the server version."""
+ self.vast_version = vast_version
+
+ def check_imports_table(self):
+ """Check if the feature that support imports table is supported."""
+ if self.vast_version < (5, 2):
+ raise errors.NotSupportedVersion("import_table requires 5.2+", self.vast_version)
 
 
  class Session:
@@ -27,6 +40,8 @@ class Session:
  endpoint = os.environ['AWS_S3_ENDPOINT_URL']
 
  self.api = internal_commands.VastdbApi(endpoint, access, secret)
+ version_tuple = tuple(int(part) for part in self.api.vast_version.split('.'))
+ self.features = Features(version_tuple)
  self.s3 = boto3.client('s3',
  aws_access_key_id=access,
  aws_secret_access_key=secret,
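Note on the session.py change: the session now parses the server version into a tuple and exposes it through `Features`, whose `check_imports_table()` raises `errors.NotSupportedVersion` for servers older than 5.2. A small illustration of the gate, using only names introduced in this diff (the version tuples are arbitrary examples):

    from vastdb import errors
    from vastdb.session import Features

    features = Features((5, 1))          # version tuple, as parsed in Session.__init__
    try:
        features.check_imports_table()   # (5, 1) < (5, 2), so this raises
    except errors.NotSupportedVersion:
        print("imports table requires VAST 5.2 or newer")

    Features((5, 2)).check_imports_table()  # no error on 5.2 and newer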
vastdb/table.py CHANGED
@@ -1,3 +1,5 @@
+ """VAST Database table."""
+
  import concurrent.futures
  import logging
  import os
@@ -5,18 +7,12 @@ import queue
  from dataclasses import dataclass, field
  from math import ceil
  from threading import Event
- from typing import List, Union
+ from typing import Dict, List, Optional, Tuple, Union
 
  import ibis
  import pyarrow as pa
 
- from . import errors, schema
- from .internal_commands import (
- TABULAR_INVALID_ROW_ID,
- VastdbApi,
- build_query_data_request,
- parse_query_data_response,
- )
+ from . import errors, internal_commands, schema, util
 
  log = logging.getLogger(__name__)
 
@@ -27,18 +23,24 @@ MAX_ROWS_PER_BATCH = 512 * 1024
  # for example insert of 512k uint8 result in 512k*8bytes response since row_ids are uint64
  MAX_INSERT_ROWS_PER_PATCH = 512 * 1024
 
+
  @dataclass
  class TableStats:
+ """Table-related information."""
+
  num_rows: int
  size_in_bytes: int
  is_external_rowid_alloc: bool = False
- endpoints: List[str] = None
+ endpoints: Tuple[str, ...] = ()
+
 
  @dataclass
  class QueryConfig:
+ """Query execution configiration."""
+
  num_sub_splits: int = 4
  num_splits: int = 1
- data_endpoints: [str] = None
+ data_endpoints: Optional[List[str]] = None
  limit_rows_per_sub_split: int = 128 * 1024
  num_row_groups_per_sub_split: int = 8
  use_semi_sorted_projections: bool = True
@@ -48,17 +50,27 @@ class QueryConfig:
 
  @dataclass
  class ImportConfig:
+ """Import execution configiration."""
+
  import_concurrency: int = 2
 
- class SelectSplitState():
- def __init__(self, query_data_request, table : "Table", split_id : int, config: QueryConfig) -> None:
+
+ class SelectSplitState:
+ """State of a specific query split execution."""
+
+ def __init__(self, query_data_request, table: "Table", split_id: int, config: QueryConfig) -> None:
+ """Initialize query split state."""
  self.split_id = split_id
  self.subsplits_state = {i: 0 for i in range(config.num_sub_splits)}
  self.config = config
  self.query_data_request = query_data_request
  self.table = table
 
- def batches(self, api : VastdbApi):
+ def batches(self, api: internal_commands.VastdbApi):
+ """Execute QueryData request, and yield parsed RecordBatch objects.
+
+ Can be called repeatedly, to allow pagination.
+ """
  while not self.done:
  response = api.query_data(
  bucket=self.table.bucket.name,
@@ -71,34 +83,39 @@ class SelectSplitState():
  txid=self.table.tx.txid,
  limit_rows=self.config.limit_rows_per_sub_split,
  sub_split_start_row_ids=self.subsplits_state.items(),
- enable_sorted_projections=self.config.use_semi_sorted_projections)
- pages_iter = parse_query_data_response(
+ enable_sorted_projections=self.config.use_semi_sorted_projections,
+ query_imports_table=self.table._imports_table)
+ pages_iter = internal_commands.parse_query_data_response(
  conn=response.raw,
  schema=self.query_data_request.response_schema,
- start_row_ids=self.subsplits_state)
+ start_row_ids=self.subsplits_state,
+ parser=self.query_data_request.response_parser)
 
  for page in pages_iter:
  for batch in page.to_batches():
  if len(batch) > 0:
  yield batch
 
-
  @property
  def done(self):
- return all(row_id == TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
+ """Returns true iff the pagination over."""
+ return all(row_id == internal_commands.TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
+
 
  @dataclass
  class Table:
+ """VAST Table."""
+
  name: str
  schema: "schema.Schema"
  handle: int
  stats: TableStats
- properties: dict = None
- arrow_schema: pa.Schema = field(init=False, compare=False)
- _ibis_table: ibis.Schema = field(init=False, compare=False)
+ arrow_schema: pa.Schema = field(init=False, compare=False, repr=False)
+ _ibis_table: ibis.Schema = field(init=False, compare=False, repr=False)
+ _imports_table: bool
 
  def __post_init__(self):
- self.properties = self.properties or {}
+ """Also, load columns' metadata."""
  self.arrow_schema = self.columns()
 
  table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
@@ -106,21 +123,21 @@ class Table:
 
  @property
  def tx(self):
+ """Return transaction."""
  return self.schema.tx
 
  @property
  def bucket(self):
+ """Return bucket."""
  return self.schema.bucket
 
- def __repr__(self):
- return f"{type(self).__name__}(name={self.name})"
-
  def columns(self) -> pa.Schema:
+ """Return columns' metadata."""
  fields = []
  next_key = 0
  while True:
  cur_columns, next_key, is_truncated, _count = self.tx._rpc.api.list_columns(
- bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid)
+ bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid, list_imports_table=self._imports_table)
  fields.extend(cur_columns)
  if not is_truncated:
  break
@@ -129,6 +146,9 @@ class Table:
  return self.arrow_schema
 
  def projection(self, name: str) -> "Projection":
+ """Get a specific semi-sorted projection of this table."""
+ if self._imports_table:
+ raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
  projs = self.projections(projection_name=name)
  if not projs:
  raise errors.MissingProjection(self.bucket.name, self.schema.name, self.name, name)
@@ -136,13 +156,16 @@ class Table:
  log.debug("Found projection: %s", projs[0])
  return projs[0]
 
- def projections(self, projection_name=None) -> ["Projection"]:
+ def projections(self, projection_name=None) -> List["Projection"]:
+ """List all semi-sorted projections of this table."""
+ if self._imports_table:
+ raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
  projections = []
  next_key = 0
  name_prefix = projection_name if projection_name else ""
  exact_match = bool(projection_name)
  while True:
- bucket_name, schema_name, table_name, curr_projections, next_key, is_truncated, _ = \
+ _bucket_name, _schema_name, _table_name, curr_projections, next_key, is_truncated, _ = \
  self.tx._rpc.api.list_projections(
  bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid,
  exact_match=exact_match, name_prefix=name_prefix)
@@ -153,7 +176,13 @@ class Table:
  break
  return [_parse_projection_info(projection, self) for projection in projections]
 
- def import_files(self, files_to_import: [str], config: ImportConfig = None) -> None:
+ def import_files(self, files_to_import: List[str], config: Optional[ImportConfig] = None) -> None:
+ """Import a list of Parquet files into this table.
+
+ The files must be on VAST S3 server and be accessible using current credentials.
+ """
+ if self._imports_table:
+ raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
  source_files = {}
  for f in files_to_import:
  bucket_name, object_path = _parse_bucket_and_object_names(f)
@@ -161,7 +190,14 @@ class Table:
 
  self._execute_import(source_files, config=config)
 
- def import_partitioned_files(self, files_and_partitions: {str: pa.RecordBatch}, config: ImportConfig = None) -> None:
+ def import_partitioned_files(self, files_and_partitions: Dict[str, pa.RecordBatch], config: Optional[ImportConfig] = None) -> None:
+ """Import a list of Parquet files into this table.
+
+ The files must be on VAST S3 server and be accessible using current credentials.
+ Each file must have its own partition values defined as an Arrow RecordBatch.
+ """
+ if self._imports_table:
+ raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
  source_files = {}
  for f, record_batch in files_and_partitions.items():
  bucket_name, object_path = _parse_bucket_and_object_names(f)
@@ -209,7 +245,7 @@ class Table:
  max_workers=config.import_concurrency, thread_name_prefix='import_thread') as pool:
  try:
  for endpoint in endpoints:
- session = VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
+ session = internal_commands.VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
  futures.append(pool.submit(import_worker, files_queue, session))
 
  log.debug("Waiting for import workers to finish")
@@ -218,24 +254,40 @@ class Table:
  finally:
  stop_event.set()
  # ThreadPoolExecutor will be joined at the end of the context
- def refresh_stats(self):
+
+ def get_stats(self) -> TableStats:
+ """Get the statistics of this table."""
  stats_tuple = self.tx._rpc.api.get_table_stats(
- bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid)
- self.stats = TableStats(**stats_tuple._asdict())
+ bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid,
+ imports_table_stats=self._imports_table)
+ return TableStats(**stats_tuple._asdict())
 
- def select(self, columns: [str] = None,
+ def select(self, columns: Optional[List[str]] = None,
  predicate: ibis.expr.types.BooleanColumn = None,
- config: QueryConfig = None,
+ config: Optional[QueryConfig] = None,
  *,
  internal_row_id: bool = False) -> pa.RecordBatchReader:
+ """Execute a query over this table.
+
+ To read a subset of the columns, specify their names via `columns` argument. Otherwise, all columns will be read.
+
+ In order to apply a filter, a predicate can be specified. See https://github.com/vast-data/vastdb_sdk/blob/main/README.md#filters-and-projections for more details.
+
+ Query-execution configuration options can be specified via the optional `config` argument.
+ """
  if config is None:
  config = QueryConfig()
 
- self.refresh_stats()
+ # Take a snapshot of enpoints
+ stats = self.get_stats()
+ endpoints = stats.endpoints if config.data_endpoints is None else config.data_endpoints
+
+ if stats.num_rows > config.rows_per_split and config.num_splits is None:
+ config.num_splits = stats.num_rows // config.rows_per_split
+ log.debug(f"num_rows={stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")
 
- if self.stats.num_rows > config.rows_per_split and config.num_splits is None:
- config.num_splits = self.stats.num_rows // config.rows_per_split
- log.debug(f"num_rows={self.stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")
+ if columns is None:
+ columns = [f.name for f in self.arrow_schema]
 
  query_schema = self.arrow_schema
  if internal_row_id:
@@ -244,12 +296,12 @@ class Table:
  query_schema = pa.schema(queried_fields)
  columns.append(INTERNAL_ROW_ID)
 
- query_data_request = build_query_data_request(
+ query_data_request = internal_commands.build_query_data_request(
  schema=query_schema,
  predicate=predicate,
  field_names=columns)
 
- splits_queue = queue.Queue()
+ splits_queue: queue.Queue[int] = queue.Queue()
 
  for split in range(config.num_splits):
  splits_queue.put(split)
@@ -257,8 +309,10 @@ class Table:
  # this queue shouldn't be large it is marely a pipe through which the results
  # are sent to the main thread. Most of the pages actually held in the
  # threads that fetch the pages.
- record_batches_queue = queue.Queue(maxsize=2)
+ record_batches_queue: queue.Queue[pa.RecordBatch] = queue.Queue(maxsize=2)
+
  stop_event = Event()
+
  class StoppedException(Exception):
  pass
 
@@ -266,9 +320,9 @@ class Table:
  if stop_event.is_set():
  raise StoppedException
 
- def single_endpoint_worker(endpoint : str):
+ def single_endpoint_worker(endpoint: str):
  try:
- host_api = VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
+ host_api = internal_commands.VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
  while True:
  check_stop()
  try:
@@ -293,12 +347,11 @@ class Table:
  log.debug("exiting")
  record_batches_queue.put(None)
 
- # Take a snapshot of enpoints
- endpoints = list(self.stats.endpoints) if config.data_endpoints is None else list(config.data_endpoints)
-
  def batches_iterator():
- def propagate_first_exception(futures : List[concurrent.futures.Future], block = False):
+ def propagate_first_exception(futures: List[concurrent.futures.Future], block=False):
  done, not_done = concurrent.futures.wait(futures, None if block else 0, concurrent.futures.FIRST_EXCEPTION)
+ if self.tx.txid is None:
+ raise errors.MissingTransaction()
  for future in done:
  future.result()
  return not_done
@@ -308,7 +361,7 @@ class Table:
  if config.query_id:
  threads_prefix = threads_prefix + "-" + config.query_id
 
- with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp: # TODO: concurrency == enpoints is just a heuristic
+ with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp:  # TODO: concurrency == enpoints is just a heuristic
  futures = [tp.submit(single_endpoint_worker, endpoint) for endpoint in endpoints]
  tasks_running = len(futures)
  try:
@@ -332,113 +385,155 @@ class Table:
 
  return pa.RecordBatchReader.from_batches(query_data_request.response_schema, batches_iterator())
 
- def _combine_chunks(self, col):
- if hasattr(col, "combine_chunks"):
- return col.combine_chunks()
- else:
- return col
-
  def insert(self, rows: pa.RecordBatch) -> pa.RecordBatch:
- serialized_slices = self.tx._rpc.api._record_batch_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
- row_ids = []
+ """Insert a RecordBatch into this table."""
+ if self._imports_table:
+ raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
+ serialized_slices = util.iter_serialized_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
  for slice in serialized_slices:
- res = self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
+ self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
  txid=self.tx.txid)
- (batch,) = pa.RecordBatchStreamReader(res.raw)
- row_ids.append(batch[INTERNAL_ROW_ID])
 
- return pa.chunked_array(row_ids)
+ def update(self, rows: Union[pa.RecordBatch, pa.Table], columns: Optional[List[str]] = None) -> None:
+ """Update a subset of cells in this table.
+
+ Row IDs are specified using a special field (named "$row_id" of uint64 type).
 
- def update(self, rows: Union[pa.RecordBatch, pa.Table], columns: list = None) -> None:
+ A subset of columns to be updated can be specified via the `columns` argument.
+ """
+ if self._imports_table:
+ raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
  if columns is not None:
  update_fields = [(INTERNAL_ROW_ID, pa.uint64())]
- update_values = [self._combine_chunks(rows[INTERNAL_ROW_ID])]
+ update_values = [_combine_chunks(rows[INTERNAL_ROW_ID])]
  for col in columns:
  update_fields.append(rows.field(col))
- update_values.append(self._combine_chunks(rows[col]))
+ update_values.append(_combine_chunks(rows[col]))
 
  update_rows_rb = pa.record_batch(schema=pa.schema(update_fields), data=update_values)
  else:
  update_rows_rb = rows
 
- serialized_slices = self.tx._rpc.api._record_batch_slices(update_rows_rb, MAX_ROWS_PER_BATCH)
+ serialized_slices = util.iter_serialized_slices(update_rows_rb, MAX_ROWS_PER_BATCH)
  for slice in serialized_slices:
  self.tx._rpc.api.update_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
  txid=self.tx.txid)
 
  def delete(self, rows: Union[pa.RecordBatch, pa.Table]) -> None:
+ """Delete a subset of rows in this table.
+
+ Row IDs are specified using a special field (named "$row_id" of uint64 type).
+ """
  delete_rows_rb = pa.record_batch(schema=pa.schema([(INTERNAL_ROW_ID, pa.uint64())]),
- data=[self._combine_chunks(rows[INTERNAL_ROW_ID])])
+ data=[_combine_chunks(rows[INTERNAL_ROW_ID])])
 
- serialized_slices = self.tx._rpc.api._record_batch_slices(delete_rows_rb, MAX_ROWS_PER_BATCH)
+ serialized_slices = util.iter_serialized_slices(delete_rows_rb, MAX_ROWS_PER_BATCH)
  for slice in serialized_slices:
  self.tx._rpc.api.delete_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
- txid=self.tx.txid)
+ txid=self.tx.txid, delete_from_imports_table=self._imports_table)
 
  def drop(self) -> None:
- self.tx._rpc.api.drop_table(self.bucket.name, self.schema.name, self.name, txid=self.tx.txid)
+ """Drop this table."""
+ self.tx._rpc.api.drop_table(self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, remove_imports_table=self._imports_table)
  log.info("Dropped table: %s", self.name)
 
  def rename(self, new_name) -> None:
+ """Rename this table."""
+ if self._imports_table:
+ raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
  self.tx._rpc.api.alter_table(
  self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, new_name=new_name)
  log.info("Renamed table from %s to %s ", self.name, new_name)
  self.name = new_name
 
  def add_column(self, new_column: pa.Schema) -> None:
+ """Add a new column."""
+ if self._imports_table:
+ raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
  self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
  log.info("Added column(s): %s", new_column)
  self.arrow_schema = self.columns()
 
  def drop_column(self, column_to_drop: pa.Schema) -> None:
+ """Drop an existing column."""
+ if self._imports_table:
+ raise errors.NotSupported(self.bucket.name, self.schema.name, self.name)
+ if self._imports_table:
+ raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
  self.tx._rpc.api.drop_columns(self.bucket.name, self.schema.name, self.name, column_to_drop, txid=self.tx.txid)
  log.info("Dropped column(s): %s", column_to_drop)
  self.arrow_schema = self.columns()
 
  def rename_column(self, current_column_name: str, new_column_name: str) -> None:
+ """Rename an existing column."""
+ if self._imports_table:
+ raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
  self.tx._rpc.api.alter_column(self.bucket.name, self.schema.name, self.name, name=current_column_name,
  new_name=new_column_name, txid=self.tx.txid)
  log.info("Renamed column: %s to %s", current_column_name, new_column_name)
  self.arrow_schema = self.columns()
 
  def create_projection(self, projection_name: str, sorted_columns: List[str], unsorted_columns: List[str]) -> "Projection":
+ """Create a new semi-sorted projection."""
+ if self._imports_table:
+ raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
  columns = [(sorted_column, "Sorted") for sorted_column in sorted_columns] + [(unsorted_column, "Unorted") for unsorted_column in unsorted_columns]
  self.tx._rpc.api.create_projection(self.bucket.name, self.schema.name, self.name, projection_name, columns=columns, txid=self.tx.txid)
  log.info("Created projection: %s", projection_name)
  return self.projection(projection_name)
 
+ def create_imports_table(self, fail_if_exists=True) -> "Table":
+ """Create imports table."""
+ self.tx._rpc.features.check_imports_table()
+ empty_schema = pa.schema([])
+ self.tx._rpc.api.create_table(self.bucket.name, self.schema.name, self.name, empty_schema, txid=self.tx.txid,
+ create_imports_table=True)
+ log.info("Created imports table for table: %s", self.name)
+ return self.imports_table() # type: ignore[return-value]
+
+ def imports_table(self) -> Optional["Table"]:
+ """Get the imports table under of this table."""
+ self.tx._rpc.features.check_imports_table()
+ return Table(name=self.name, schema=self.schema, handle=int(self.handle), stats=self.stats, _imports_table=True)
+
  def __getitem__(self, col_name):
+ """Allow constructing ibis-like column expressions from this table.
+
+ It is useful for constructing expressions for predicate pushdown in `Table.select()` method.
+ """
  return self._ibis_table[col_name]
 
 
  @dataclass
  class Projection:
+ """VAST semi-sorted projection."""
+
  name: str
  table: Table
  handle: int
  stats: TableStats
- properties: dict = None
 
  @property
  def bucket(self):
+ """Return bucket."""
  return self.table.schema.bucket
 
  @property
  def schema(self):
+ """Return schema."""
  return self.table.schema
 
  @property
  def tx(self):
+ """Return transaction."""
  return self.table.schema.tx
 
- def __repr__(self):
- return f"{type(self).__name__}(name={self.name})"
-
  def columns(self) -> pa.Schema:
+ """Return this projections' columns as an Arrow schema."""
  columns = []
  next_key = 0
  while True:
- curr_columns, next_key, is_truncated, count, _ = \
+ curr_columns, next_key, is_truncated, _count, _ = \
  self.tx._rpc.api.list_projection_columns(
  self.bucket.name, self.schema.name, self.table.name, self.name, txid=self.table.tx.txid, next_key=next_key)
  if not curr_columns:
@@ -450,12 +545,14 @@ class Projection:
  return self.arrow_schema
 
  def rename(self, new_name) -> None:
+ """Rename this projection."""
  self.tx._rpc.api.alter_projection(self.bucket.name, self.schema.name,
  self.table.name, self.name, txid=self.tx.txid, new_name=new_name)
  log.info("Renamed projection from %s to %s ", self.name, new_name)
  self.name = new_name
 
  def drop(self) -> None:
+ """Drop this projection."""
  self.tx._rpc.api.drop_projection(self.bucket.name, self.schema.name, self.table.name,
  self.name, txid=self.tx.txid)
  log.info("Dropped projection: %s", self.name)
@@ -467,9 +564,9 @@ def _parse_projection_info(projection_info, table: "Table"):
  return Projection(name=projection_info.name, table=table, stats=stats, handle=int(projection_info.handle))
 
 
- def _parse_bucket_and_object_names(path: str) -> (str, str):
+ def _parse_bucket_and_object_names(path: str) -> Tuple[str, str]:
  if not path.startswith('/'):
- raise errors.InvalidArgumentError(f"Path {path} must start with a '/'")
+ raise errors.InvalidArgument(f"Path {path} must start with a '/'")
  components = path.split(os.path.sep)
  bucket_name = components[1]
  object_path = os.path.sep.join(components[2:])
@@ -481,3 +578,10 @@ def _serialize_record_batch(record_batch: pa.RecordBatch) -> pa.lib.Buffer:
  with pa.ipc.new_stream(sink, record_batch.schema) as writer:
  writer.write(record_batch)
  return sink.getvalue()
+
+
+ def _combine_chunks(col):
+ if hasattr(col, "combine_chunks"):
+ return col.combine_chunks()
+ else:
+ return col
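Note on the table.py changes as a whole: statistics are now fetched with `get_stats()` (the mutating `refresh_stats()` is gone), `select()` defaults to all columns, and the new `_imports_table` flag plus `create_imports_table()` / `imports_table()` expose the imports table on 5.2+ servers. A rough usage sketch, assuming `table` is a `vastdb.table.Table` obtained from an open transaction against a 5.2+ server (the column name and predicate are placeholders):

    # Sketch only: `table` is assumed to be an existing vastdb.table.Table object.
    stats = table.get_stats()              # returns a TableStats value instead of mutating table.stats
    print(stats.num_rows, stats.endpoints)

    # select() fills `columns` with all columns when omitted; predicates are built
    # from the ibis expressions exposed via table[...].
    reader = table.select(columns=["id"], predicate=table["id"] > 100)
    result = reader.read_all()

    # Imports-table helpers, gated by Features.check_imports_table() (VAST 5.2+).
    imports = table.create_imports_table()
    print(imports.get_stats())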