vastdb 0.0.5.3__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
  2. vast_flatbuf/tabular/VipRange.py +56 -0
  3. vastdb/__init__.py +7 -0
  4. vastdb/bench/test_perf.py +29 -0
  5. vastdb/bucket.py +85 -0
  6. vastdb/{tests/conftest.py → conftest.py} +29 -14
  7. vastdb/errors.py +175 -0
  8. vastdb/{api.py → internal_commands.py} +373 -875
  9. vastdb/schema.py +85 -0
  10. vastdb/session.py +47 -0
  11. vastdb/table.py +483 -0
  12. vastdb/tests/test_imports.py +123 -0
  13. vastdb/tests/test_nested.py +28 -0
  14. vastdb/tests/test_projections.py +42 -0
  15. vastdb/tests/test_sanity.py +34 -15
  16. vastdb/tests/test_schemas.py +30 -6
  17. vastdb/tests/test_tables.py +628 -13
  18. vastdb/tests/util.py +18 -0
  19. vastdb/transaction.py +54 -0
  20. vastdb/util.py +11 -10
  21. vastdb-0.1.1.dist-info/METADATA +38 -0
  22. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/RECORD +26 -31
  23. vast_protobuf/substrait/__init__.py +0 -0
  24. vast_protobuf/substrait/algebra_pb2.py +0 -1344
  25. vast_protobuf/substrait/capabilities_pb2.py +0 -46
  26. vast_protobuf/substrait/ddl_pb2.py +0 -57
  27. vast_protobuf/substrait/extended_expression_pb2.py +0 -49
  28. vast_protobuf/substrait/extensions/__init__.py +0 -0
  29. vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
  30. vast_protobuf/substrait/function_pb2.py +0 -168
  31. vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
  32. vast_protobuf/substrait/plan_pb2.py +0 -67
  33. vast_protobuf/substrait/type_expressions_pb2.py +0 -198
  34. vast_protobuf/substrait/type_pb2.py +0 -350
  35. vast_protobuf/tabular/__init__.py +0 -0
  36. vast_protobuf/tabular/rpc_pb2.py +0 -344
  37. vastdb/bench_scan.py +0 -45
  38. vastdb/tests/test_create_table_from_parquets.py +0 -50
  39. vastdb/v2.py +0 -360
  40. vastdb-0.0.5.3.dist-info/METADATA +0 -47
  41. {vast_protobuf → vastdb/bench}/__init__.py +0 -0
  42. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/LICENSE +0 -0
  43. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/WHEEL +0 -0
  44. {vastdb-0.0.5.3.dist-info → vastdb-0.1.1.dist-info}/top_level.txt +0 -0
vastdb/schema.py ADDED
@@ -0,0 +1,85 @@
+ """VAST Database schema (a container of tables).
+
+ VAST S3 buckets can be used to create Database schemas and tables.
+ It is possible to list and access VAST snapshots generated over a bucket.
+ """
+
+ import logging
+ from dataclasses import dataclass
+
+ import pyarrow as pa
+
+ from . import bucket, errors, schema, table
+
+ log = logging.getLogger(__name__)
+
+
+ @dataclass
+ class Schema:
+     """VAST Schema."""
+
+     name: str
+     bucket: "bucket.Bucket"
+
+     @property
+     def tx(self):
+         """VAST transaction used for this schema."""
+         return self.bucket.tx
+
+     def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "table.Table":
+         """Create a new table under this schema."""
+         if current := self.table(table_name, fail_if_missing=False):
+             if fail_if_exists:
+                 raise errors.TableExists(self.bucket.name, self.name, table_name)
+             else:
+                 return current
+         self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid)
+         log.info("Created table: %s", table_name)
+         return self.table(table_name)
+
+     def table(self, name: str, fail_if_missing=True) -> "table.Table":
+         """Get a specific table under this schema."""
+         t = self.tables(table_name=name)
+         if not t:
+             if fail_if_missing:
+                 raise errors.MissingTable(self.bucket.name, self.name, name)
+             else:
+                 return None
+         assert len(t) == 1, f"Expected to receive only a single table, but got: {len(t)}. tables: {t}"
+         log.debug("Found table: %s", t[0])
+         return t[0]
+
+     def tables(self, table_name=None) -> ["table.Table"]:
+         """List all tables under this schema."""
+         tables = []
+         next_key = 0
+         name_prefix = table_name if table_name else ""
+         exact_match = bool(table_name)
+         while True:
+             bucket_name, schema_name, curr_tables, next_key, is_truncated, _ = \
+                 self.tx._rpc.api.list_tables(
+                     bucket=self.bucket.name, schema=self.name, next_key=next_key, txid=self.tx.txid,
+                     exact_match=exact_match, name_prefix=name_prefix, include_list_stats=exact_match)
+             if not curr_tables:
+                 break
+             tables.extend(curr_tables)
+             if not is_truncated:
+                 break
+
+         return [_parse_table_info(table, self) for table in tables]
+
+     def drop(self) -> None:
+         """Delete this schema."""
+         self.tx._rpc.api.drop_schema(self.bucket.name, self.name, txid=self.tx.txid)
+         log.info("Dropped schema: %s", self.name)
+
+     def rename(self, new_name) -> None:
+         """Rename this schema."""
+         self.tx._rpc.api.alter_schema(self.bucket.name, self.name, txid=self.tx.txid, new_name=new_name)
+         log.info("Renamed schema: %s to %s", self.name, new_name)
+         self.name = new_name
+
+
+ def _parse_table_info(table_info, schema: "schema.Schema"):
+     stats = table.TableStats(num_rows=table_info.num_rows, size_in_bytes=table_info.size_in_bytes)
+     return table.Table(name=table_info.name, schema=schema, handle=int(table_info.handle), stats=stats)
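
A minimal usage sketch of the Schema API added above (not part of the package diff). The bucket, schema, table, and column names are placeholders, and it assumes that create_schema() (used in the session.transaction() docstring below) returns the new Schema object.

    import pyarrow as pa
    from vastdb.session import Session

    session = Session()  # credentials and endpoint taken from the AWS_* environment variables
    with session.transaction() as tx:
        schema = tx.bucket("my-bucket").create_schema("my-schema")  # assumed to return the new Schema
        columns = pa.schema([("id", pa.int64()), ("name", pa.utf8())])
        table = schema.create_table("my-table", columns, fail_if_exists=False)
        print([t.name for t in schema.tables()])  # list the tables under this schema
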
vastdb/session.py ADDED
@@ -0,0 +1,47 @@
+ """VAST database session.
+
+ It should be used to interact with a specific VAST cluster.
+ For more details see:
+ - [Virtual IP pool configured with DNS service](https://support.vastdata.com/s/topic/0TOV40000000FThOAM/configuring-network-access-v50)
+ - [S3 access & secret keys on VAST cluster](https://support.vastdata.com/s/article/UUID-4d2e7e23-b2fb-7900-d98f-96c31a499626)
+ - [Tabular identity policy with the proper permissions](https://support.vastdata.com/s/article/UUID-14322b60-d6a2-89ac-3df0-3dfbb6974182)
+ """
+
+ import os
+
+ import boto3
+
+ from . import internal_commands, transaction
+
+
+ class Session:
+     """VAST database session."""
+
+     def __init__(self, access=None, secret=None, endpoint=None):
+         """Connect to a VAST Database endpoint, using specified credentials."""
+         if access is None:
+             access = os.environ['AWS_ACCESS_KEY_ID']
+         if secret is None:
+             secret = os.environ['AWS_SECRET_ACCESS_KEY']
+         if endpoint is None:
+             endpoint = os.environ['AWS_S3_ENDPOINT_URL']
+
+         self.api = internal_commands.VastdbApi(endpoint, access, secret)
+         self.s3 = boto3.client('s3',
+                                aws_access_key_id=access,
+                                aws_secret_access_key=secret,
+                                endpoint_url=endpoint)
+
+     def __repr__(self):
+         """Don't show the secret key."""
+         return f'{self.__class__.__name__}(endpoint={self.api.url}, access={self.api.access_key})'
+
+     def transaction(self):
+         """Create a non-initialized transaction object.
+
+         It should be used as a context manager:
+
+             with session.transaction() as tx:
+                 tx.bucket("bucket").create_schema("schema")
+         """
+         return transaction.Transaction(self)
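
A short sketch of constructing the Session defined above (not part of the package diff). The endpoint and credential values are placeholders; omitting them falls back to the environment variables read in __init__.

    from vastdb.session import Session

    # Explicit credentials (placeholders); omit them to fall back to the
    # AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_S3_ENDPOINT_URL environment variables.
    session = Session(access="AKIA...", secret="...", endpoint="http://vip-pool.example.com")
    print(session)  # __repr__ shows the endpoint and access key, never the secret

    with session.transaction() as tx:
        tx.bucket("my-bucket").create_schema("my-schema")  # placeholder names, from the transaction() docstring
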
vastdb/table.py ADDED
@@ -0,0 +1,483 @@
+ import concurrent.futures
+ import logging
+ import os
+ import queue
+ from dataclasses import dataclass, field
+ from math import ceil
+ from threading import Event
+ from typing import List, Union
+
+ import ibis
+ import pyarrow as pa
+
+ from . import errors, schema
+ from .internal_commands import (
+     TABULAR_INVALID_ROW_ID,
+     VastdbApi,
+     build_query_data_request,
+     parse_query_data_response,
+ )
+
+ log = logging.getLogger(__name__)
+
+
+ INTERNAL_ROW_ID = "$row_id"
+ MAX_ROWS_PER_BATCH = 512 * 1024
+ # For inserts we need a smaller limit due to response amplification:
+ # e.g., inserting 512k uint8 values yields a 512k * 8-byte response, since row IDs are uint64.
+ MAX_INSERT_ROWS_PER_PATCH = 512 * 1024
+
+ @dataclass
+ class TableStats:
+     num_rows: int
+     size_in_bytes: int
+     is_external_rowid_alloc: bool = False
+     endpoints: List[str] = None
+
+ @dataclass
+ class QueryConfig:
+     num_sub_splits: int = 4
+     num_splits: int = 1
+     data_endpoints: [str] = None
+     limit_rows_per_sub_split: int = 128 * 1024
+     num_row_groups_per_sub_split: int = 8
+     use_semi_sorted_projections: bool = True
+     rows_per_split: int = 4000000
+     query_id: str = ""
+
+
+ @dataclass
+ class ImportConfig:
+     import_concurrency: int = 2
+
+ class SelectSplitState():
+     def __init__(self, query_data_request, table: "Table", split_id: int, config: QueryConfig) -> None:
+         self.split_id = split_id
+         self.subsplits_state = {i: 0 for i in range(config.num_sub_splits)}
+         self.config = config
+         self.query_data_request = query_data_request
+         self.table = table
+
+     def batches(self, api: VastdbApi):
+         while not self.done:
+             response = api.query_data(
+                 bucket=self.table.bucket.name,
+                 schema=self.table.schema.name,
+                 table=self.table.name,
+                 params=self.query_data_request.serialized,
+                 split=(self.split_id, self.config.num_splits, self.config.num_row_groups_per_sub_split),
+                 num_sub_splits=self.config.num_sub_splits,
+                 response_row_id=False,
+                 txid=self.table.tx.txid,
+                 limit_rows=self.config.limit_rows_per_sub_split,
+                 sub_split_start_row_ids=self.subsplits_state.items(),
+                 enable_sorted_projections=self.config.use_semi_sorted_projections)
+             pages_iter = parse_query_data_response(
+                 conn=response.raw,
+                 schema=self.query_data_request.response_schema,
+                 start_row_ids=self.subsplits_state)
+
+             for page in pages_iter:
+                 for batch in page.to_batches():
+                     if len(batch) > 0:
+                         yield batch
+
+
+     @property
+     def done(self):
+         return all(row_id == TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
+
+ @dataclass
+ class Table:
+     name: str
+     schema: "schema.Schema"
+     handle: int
+     stats: TableStats
+     properties: dict = None
+     arrow_schema: pa.Schema = field(init=False, compare=False)
+     _ibis_table: ibis.Schema = field(init=False, compare=False)
+
+     def __post_init__(self):
+         self.properties = self.properties or {}
+         self.arrow_schema = self.columns()
+
+         table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
+         self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(self.arrow_schema), table_path)
+
+     @property
+     def tx(self):
+         return self.schema.tx
+
+     @property
+     def bucket(self):
+         return self.schema.bucket
+
+     def __repr__(self):
+         return f"{type(self).__name__}(name={self.name})"
+
+     def columns(self) -> pa.Schema:
+         fields = []
+         next_key = 0
+         while True:
+             cur_columns, next_key, is_truncated, _count = self.tx._rpc.api.list_columns(
+                 bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid)
+             fields.extend(cur_columns)
+             if not is_truncated:
+                 break
+
+         self.arrow_schema = pa.schema(fields)
+         return self.arrow_schema
+
+     def projection(self, name: str) -> "Projection":
+         projs = self.projections(projection_name=name)
+         if not projs:
+             raise errors.MissingProjection(self.bucket.name, self.schema.name, self.name, name)
+         assert len(projs) == 1, f"Expected to receive only a single projection, but got: {len(projs)}. projections: {projs}"
+         log.debug("Found projection: %s", projs[0])
+         return projs[0]
+
+     def projections(self, projection_name=None) -> ["Projection"]:
+         projections = []
+         next_key = 0
+         name_prefix = projection_name if projection_name else ""
+         exact_match = bool(projection_name)
+         while True:
+             bucket_name, schema_name, table_name, curr_projections, next_key, is_truncated, _ = \
+                 self.tx._rpc.api.list_projections(
+                     bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid,
+                     exact_match=exact_match, name_prefix=name_prefix)
+             if not curr_projections:
+                 break
+             projections.extend(curr_projections)
+             if not is_truncated:
+                 break
+         return [_parse_projection_info(projection, self) for projection in projections]
+
+     def import_files(self, files_to_import: [str], config: ImportConfig = None) -> None:
+         source_files = {}
+         for f in files_to_import:
+             bucket_name, object_path = _parse_bucket_and_object_names(f)
+             source_files[(bucket_name, object_path)] = b''
+
+         self._execute_import(source_files, config=config)
+
+     def import_partitioned_files(self, files_and_partitions: {str: pa.RecordBatch}, config: ImportConfig = None) -> None:
+         source_files = {}
+         for f, record_batch in files_and_partitions.items():
+             bucket_name, object_path = _parse_bucket_and_object_names(f)
+             serialized_batch = _serialize_record_batch(record_batch)
+             source_files = {(bucket_name, object_path): serialized_batch.to_pybytes()}
+
+         self._execute_import(source_files, config=config)
+
+     def _execute_import(self, source_files, config):
+         config = config or ImportConfig()
+         assert config.import_concurrency > 0  # TODO: Do we want to validate concurrency isn't too high?
+         max_batch_size = 10  # Enforced on the server side.
+         endpoints = [self.tx._rpc.api.url for _ in range(config.import_concurrency)]  # TODO: use valid endpoints...
+         files_queue = queue.Queue()
+
+         for source_file in source_files.items():
+             files_queue.put(source_file)
+
+         stop_event = Event()
+         num_files_in_batch = min(ceil(len(source_files) / len(endpoints)), max_batch_size)
+
+         def import_worker(q, session):
+             try:
+                 while not q.empty():
+                     if stop_event.is_set():
+                         log.debug("stop_event is set, exiting")
+                         break
+                     files_batch = {}
+                     try:
+                         for _ in range(num_files_in_batch):
+                             files_batch.update({q.get(block=False)})
+                     except queue.Empty:
+                         pass
+                     if files_batch:
+                         log.debug("Starting import batch of %s files", len(files_batch))
+                         session.import_data(
+                             self.bucket.name, self.schema.name, self.name, files_batch, txid=self.tx.txid)
+             except (Exception, KeyboardInterrupt) as e:
+                 stop_event.set()
+                 log.error("Got exception inside import_worker. exception: %s", e)
+                 raise
+
+         futures = []
+         with concurrent.futures.ThreadPoolExecutor(
+                 max_workers=config.import_concurrency, thread_name_prefix='import_thread') as pool:
+             try:
+                 for endpoint in endpoints:
+                     session = VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
+                     futures.append(pool.submit(import_worker, files_queue, session))
+
+                 log.debug("Waiting for import workers to finish")
+                 for future in concurrent.futures.as_completed(futures):
+                     future.result()
+             finally:
+                 stop_event.set()
+             # ThreadPoolExecutor will be joined at the end of the context
+     def refresh_stats(self):
+         stats_tuple = self.tx._rpc.api.get_table_stats(
+             bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid)
+         self.stats = TableStats(**stats_tuple._asdict())
+
+     def select(self, columns: [str] = None,
+                predicate: ibis.expr.types.BooleanColumn = None,
+                config: QueryConfig = None,
+                *,
+                internal_row_id: bool = False) -> pa.RecordBatchReader:
+         if config is None:
+             config = QueryConfig()
+
+         self.refresh_stats()
+
+         if self.stats.num_rows > config.rows_per_split and config.num_splits is None:
+             config.num_splits = self.stats.num_rows // config.rows_per_split
+         log.debug(f"num_rows={self.stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")
+
+         query_schema = self.arrow_schema
+         if internal_row_id:
+             queried_fields = [pa.field(INTERNAL_ROW_ID, pa.uint64())]
+             queried_fields.extend(column for column in self.arrow_schema)
+             query_schema = pa.schema(queried_fields)
+             columns.append(INTERNAL_ROW_ID)
+
+         query_data_request = build_query_data_request(
+             schema=query_schema,
+             predicate=predicate,
+             field_names=columns)
+
+         splits_queue = queue.Queue()
+
+         for split in range(config.num_splits):
+             splits_queue.put(split)
+
+         # This queue shouldn't be large; it is merely a pipe through which the results
+         # are sent to the main thread. Most of the pages are actually held in the
+         # threads that fetch the pages.
+         record_batches_queue = queue.Queue(maxsize=2)
+         stop_event = Event()
+         class StoppedException(Exception):
+             pass
+
+         def check_stop():
+             if stop_event.is_set():
+                 raise StoppedException
+
+         def single_endpoint_worker(endpoint: str):
+             try:
+                 host_api = VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
+                 while True:
+                     check_stop()
+                     try:
+                         split = splits_queue.get_nowait()
+                     except queue.Empty:
+                         log.debug("splits queue is empty")
+                         break
+
+                     split_state = SelectSplitState(query_data_request=query_data_request,
+                                                    table=self,
+                                                    split_id=split,
+                                                    config=config)
+
+                     for batch in split_state.batches(host_api):
+                         check_stop()
+                         record_batches_queue.put(batch)
+             except StoppedException:
+                 log.debug("stop signal.", exc_info=True)
+                 return
+             finally:
+                 # signal that this thread has ended
+                 log.debug("exiting")
+                 record_batches_queue.put(None)
+
+         # Take a snapshot of the endpoints
+         endpoints = list(self.stats.endpoints) if config.data_endpoints is None else list(config.data_endpoints)
+
+         def batches_iterator():
+             def propagate_first_exception(futures: List[concurrent.futures.Future], block=False):
+                 done, not_done = concurrent.futures.wait(futures, None if block else 0, concurrent.futures.FIRST_EXCEPTION)
+                 for future in done:
+                     future.result()
+                 return not_done
+
+             threads_prefix = "query-data"
+             # This is mainly for testing; it helps to identify running threads at runtime.
+             if config.query_id:
+                 threads_prefix = threads_prefix + "-" + config.query_id
+
+             with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp:  # TODO: concurrency == endpoints is just a heuristic
+                 futures = [tp.submit(single_endpoint_worker, endpoint) for endpoint in endpoints]
+                 tasks_running = len(futures)
+                 try:
+                     while tasks_running > 0:
+                         futures = propagate_first_exception(futures, block=False)
+
+                         batch = record_batches_queue.get()
+                         if batch is not None:
+                             yield batch
+                         else:
+                             tasks_running -= 1
+                             log.debug("one worker thread finished, remaining: %d", tasks_running)
+
+                     # all host threads ended - wait for all futures to complete
+                     propagate_first_exception(futures, block=True)
+                 finally:
+                     stop_event.set()
+                     while tasks_running > 0:
+                         if record_batches_queue.get() is None:
+                             tasks_running -= 1
+
+         return pa.RecordBatchReader.from_batches(query_data_request.response_schema, batches_iterator())
+
+     def _combine_chunks(self, col):
+         if hasattr(col, "combine_chunks"):
+             return col.combine_chunks()
+         else:
+             return col
+
+     def insert(self, rows: pa.RecordBatch) -> pa.RecordBatch:
+         serialized_slices = self.tx._rpc.api._record_batch_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
+         row_ids = []
+         for slice in serialized_slices:
+             res = self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
+                                                txid=self.tx.txid)
+             (batch,) = pa.RecordBatchStreamReader(res.raw)
+             row_ids.append(batch[INTERNAL_ROW_ID])
+
+         return pa.chunked_array(row_ids)
+
+     def update(self, rows: Union[pa.RecordBatch, pa.Table], columns: list = None) -> None:
+         if columns is not None:
+             update_fields = [(INTERNAL_ROW_ID, pa.uint64())]
+             update_values = [self._combine_chunks(rows[INTERNAL_ROW_ID])]
+             for col in columns:
+                 update_fields.append(rows.field(col))
+                 update_values.append(self._combine_chunks(rows[col]))
+
+             update_rows_rb = pa.record_batch(schema=pa.schema(update_fields), data=update_values)
+         else:
+             update_rows_rb = rows
+
+         serialized_slices = self.tx._rpc.api._record_batch_slices(update_rows_rb, MAX_ROWS_PER_BATCH)
+         for slice in serialized_slices:
+             self.tx._rpc.api.update_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
+                                          txid=self.tx.txid)
+
+     def delete(self, rows: Union[pa.RecordBatch, pa.Table]) -> None:
+         delete_rows_rb = pa.record_batch(schema=pa.schema([(INTERNAL_ROW_ID, pa.uint64())]),
+                                          data=[self._combine_chunks(rows[INTERNAL_ROW_ID])])
+
+         serialized_slices = self.tx._rpc.api._record_batch_slices(delete_rows_rb, MAX_ROWS_PER_BATCH)
+         for slice in serialized_slices:
+             self.tx._rpc.api.delete_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
+                                          txid=self.tx.txid)
+
+     def drop(self) -> None:
+         self.tx._rpc.api.drop_table(self.bucket.name, self.schema.name, self.name, txid=self.tx.txid)
+         log.info("Dropped table: %s", self.name)
+
+     def rename(self, new_name) -> None:
+         self.tx._rpc.api.alter_table(
+             self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, new_name=new_name)
+         log.info("Renamed table from %s to %s ", self.name, new_name)
+         self.name = new_name
+
+     def add_column(self, new_column: pa.Schema) -> None:
+         self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
+         log.info("Added column(s): %s", new_column)
+         self.arrow_schema = self.columns()
+
+     def drop_column(self, column_to_drop: pa.Schema) -> None:
+         self.tx._rpc.api.drop_columns(self.bucket.name, self.schema.name, self.name, column_to_drop, txid=self.tx.txid)
+         log.info("Dropped column(s): %s", column_to_drop)
+         self.arrow_schema = self.columns()
+
+     def rename_column(self, current_column_name: str, new_column_name: str) -> None:
+         self.tx._rpc.api.alter_column(self.bucket.name, self.schema.name, self.name, name=current_column_name,
+                                       new_name=new_column_name, txid=self.tx.txid)
+         log.info("Renamed column: %s to %s", current_column_name, new_column_name)
+         self.arrow_schema = self.columns()
+
+     def create_projection(self, projection_name: str, sorted_columns: List[str], unsorted_columns: List[str]) -> "Projection":
+         columns = [(sorted_column, "Sorted") for sorted_column in sorted_columns] + [(unsorted_column, "Unorted") for unsorted_column in unsorted_columns]
+         self.tx._rpc.api.create_projection(self.bucket.name, self.schema.name, self.name, projection_name, columns=columns, txid=self.tx.txid)
+         log.info("Created projection: %s", projection_name)
+         return self.projection(projection_name)
+
+     def __getitem__(self, col_name):
+         return self._ibis_table[col_name]
+
+
+ @dataclass
+ class Projection:
+     name: str
+     table: Table
+     handle: int
+     stats: TableStats
+     properties: dict = None
+
+     @property
+     def bucket(self):
+         return self.table.schema.bucket
+
+     @property
+     def schema(self):
+         return self.table.schema
+
+     @property
+     def tx(self):
+         return self.table.schema.tx
+
+     def __repr__(self):
+         return f"{type(self).__name__}(name={self.name})"
+
+     def columns(self) -> pa.Schema:
+         columns = []
+         next_key = 0
+         while True:
+             curr_columns, next_key, is_truncated, count, _ = \
+                 self.tx._rpc.api.list_projection_columns(
+                     self.bucket.name, self.schema.name, self.table.name, self.name, txid=self.table.tx.txid, next_key=next_key)
+             if not curr_columns:
+                 break
+             columns.extend(curr_columns)
+             if not is_truncated:
+                 break
+         self.arrow_schema = pa.schema([(col[0], col[1]) for col in columns])
+         return self.arrow_schema
+
+     def rename(self, new_name) -> None:
+         self.tx._rpc.api.alter_projection(self.bucket.name, self.schema.name,
+                                           self.table.name, self.name, txid=self.tx.txid, new_name=new_name)
+         log.info("Renamed projection from %s to %s ", self.name, new_name)
+         self.name = new_name
+
+     def drop(self) -> None:
+         self.tx._rpc.api.drop_projection(self.bucket.name, self.schema.name, self.table.name,
+                                          self.name, txid=self.tx.txid)
+         log.info("Dropped projection: %s", self.name)
+
+
+ def _parse_projection_info(projection_info, table: "Table"):
+     log.info("Projection info %s", str(projection_info))
+     stats = TableStats(num_rows=projection_info.num_rows, size_in_bytes=projection_info.size_in_bytes)
+     return Projection(name=projection_info.name, table=table, stats=stats, handle=int(projection_info.handle))
+
+
+ def _parse_bucket_and_object_names(path: str) -> (str, str):
+     if not path.startswith('/'):
+         raise errors.InvalidArgumentError(f"Path {path} must start with a '/'")
+     components = path.split(os.path.sep)
+     bucket_name = components[1]
+     object_path = os.path.sep.join(components[2:])
+     return bucket_name, object_path
+
+
+ def _serialize_record_batch(record_batch: pa.RecordBatch) -> pa.lib.Buffer:
+     sink = pa.BufferOutputStream()
+     with pa.ipc.new_stream(sink, record_batch.schema) as writer:
+         writer.write(record_batch)
+     return sink.getvalue()
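
A hedged end-to-end sketch of the Table API added above (not part of the package diff): create a table, insert a RecordBatch, and read it back with select(), filtering via the ibis predicate exposed by Table.__getitem__. All names are placeholders, and create_schema() returning the new Schema is an assumption.

    import pyarrow as pa
    from vastdb.session import Session

    session = Session()  # credentials and endpoint taken from the AWS_* environment variables
    with session.transaction() as tx:
        schema = tx.bucket("my-bucket").create_schema("my-schema")  # assumed to return the new Schema
        table = schema.create_table("events", pa.schema([("id", pa.int64()), ("name", pa.utf8())]))
        row_ids = table.insert(pa.record_batch([pa.array([1, 2, 3]), pa.array(["a", "b", "c"])],
                                               names=["id", "name"]))  # returns the inserted $row_id values
        reader = table.select(columns=["id", "name"], predicate=table["id"] > 1)  # pa.RecordBatchReader
        print(reader.read_all().to_pydict())
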