vastdb 0.0.5.2__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (39)
  1. vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
  2. vast_flatbuf/tabular/VipRange.py +56 -0
  3. vastdb/__init__.py +7 -0
  4. vastdb/bucket.py +77 -0
  5. vastdb/errors.py +158 -0
  6. vastdb/{api.py → internal_commands.py} +283 -747
  7. vastdb/schema.py +77 -0
  8. vastdb/session.py +48 -0
  9. vastdb/table.py +480 -0
  10. vastdb/tests/conftest.py +46 -0
  11. vastdb/tests/test_imports.py +125 -0
  12. vastdb/tests/test_projections.py +41 -0
  13. vastdb/tests/test_sanity.py +83 -0
  14. vastdb/tests/test_schemas.py +45 -0
  15. vastdb/tests/test_tables.py +608 -0
  16. vastdb/transaction.py +55 -0
  17. vastdb/util.py +77 -0
  18. vastdb-0.1.0.dist-info/METADATA +38 -0
  19. {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/RECORD +23 -24
  20. vast_protobuf/substrait/__init__.py +0 -0
  21. vast_protobuf/substrait/algebra_pb2.py +0 -1344
  22. vast_protobuf/substrait/capabilities_pb2.py +0 -46
  23. vast_protobuf/substrait/ddl_pb2.py +0 -57
  24. vast_protobuf/substrait/extended_expression_pb2.py +0 -49
  25. vast_protobuf/substrait/extensions/__init__.py +0 -0
  26. vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
  27. vast_protobuf/substrait/function_pb2.py +0 -168
  28. vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
  29. vast_protobuf/substrait/plan_pb2.py +0 -67
  30. vast_protobuf/substrait/type_expressions_pb2.py +0 -198
  31. vast_protobuf/substrait/type_pb2.py +0 -350
  32. vast_protobuf/tabular/__init__.py +0 -0
  33. vast_protobuf/tabular/rpc_pb2.py +0 -344
  34. vastdb/v2.py +0 -108
  35. vastdb-0.0.5.2.dist-info/METADATA +0 -47
  36. {vast_protobuf → vastdb/tests}/__init__.py +0 -0
  37. {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/LICENSE +0 -0
  38. {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/WHEEL +0 -0
  39. {vastdb-0.0.5.2.dist-info → vastdb-0.1.0.dist-info}/top_level.txt +0 -0
vastdb/schema.py ADDED
@@ -0,0 +1,77 @@
+ """VAST Database schema (a container of tables).
+
+ VAST S3 buckets can be used to create Database schemas and tables.
+ It is possible to list and access VAST snapshots generated over a bucket.
+ """
+
+ from . import bucket, errors, schema, table
+
+ import pyarrow as pa
+
+ from dataclasses import dataclass
+ import logging
+
+ log = logging.getLogger(__name__)
+
+
+ @dataclass
+ class Schema:
+     """VAST Schema."""
+
+     name: str
+     bucket: "bucket.Bucket"
+
+     @property
+     def tx(self):
+         """VAST transaction used for this schema."""
+         return self.bucket.tx
+
+     def create_table(self, table_name: str, columns: pa.Schema) -> "table.Table":
+         """Create a new table under this schema."""
+         self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid)
+         log.info("Created table: %s", table_name)
+         return self.table(table_name)
+
+     def table(self, name: str) -> "table.Table":
+         """Get a specific table under this schema."""
+         t = self.tables(table_name=name)
+         if not t:
+             raise errors.MissingTable(self.bucket.name, self.name, name)
+         assert len(t) == 1, f"Expected to receive only a single table, but got: {len(t)}. tables: {t}"
+         log.debug("Found table: %s", t[0])
+         return t[0]
+
+     def tables(self, table_name=None) -> ["table.Table"]:
+         """List all tables under this schema."""
+         tables = []
+         next_key = 0
+         name_prefix = table_name if table_name else ""
+         exact_match = bool(table_name)
+         while True:
+             bucket_name, schema_name, curr_tables, next_key, is_truncated, _ = \
+                 self.tx._rpc.api.list_tables(
+                     bucket=self.bucket.name, schema=self.name, next_key=next_key, txid=self.tx.txid,
+                     exact_match=exact_match, name_prefix=name_prefix, include_list_stats=exact_match)
+             if not curr_tables:
+                 break
+             tables.extend(curr_tables)
+             if not is_truncated:
+                 break
+
+         return [_parse_table_info(table, self) for table in tables]
+
+     def drop(self) -> None:
+         """Delete this schema."""
+         self.tx._rpc.api.drop_schema(self.bucket.name, self.name, txid=self.tx.txid)
+         log.info("Dropped schema: %s", self.name)
+
+     def rename(self, new_name) -> None:
+         """Rename this schema."""
+         self.tx._rpc.api.alter_schema(self.bucket.name, self.name, txid=self.tx.txid, new_name=new_name)
+         log.info("Renamed schema: %s to %s", self.name, new_name)
+         self.name = new_name
+
+
+ def _parse_table_info(table_info, schema: "schema.Schema"):
+     stats = table.TableStats(num_rows=table_info.num_rows, size_in_bytes=table_info.size_in_bytes)
+     return table.Table(name=table_info.name, schema=schema, handle=int(table_info.handle), stats=stats)
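The new Schema object wraps the schema-level DDL operations (create_table, table/tables lookup, rename, drop) behind the transaction it was obtained from. A rough usage sketch, assuming a Session object named session (constructed as shown in vastdb/session.py below) and a Tabular-enabled bucket; the bucket, schema, table and column names are invented for the example:

    import pyarrow as pa

    with session.transaction() as tx:
        bucket = tx.bucket("my-bucket")
        schema = bucket.create_schema("demo")          # as in the session.transaction() docstring
        table = schema.create_table(
            "events", pa.schema([("id", pa.int64()), ("msg", pa.utf8())]))
        print([t.name for t in schema.tables()])       # -> ['events']
        schema.rename("demo_v2")                       # DDL runs inside the same transaction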
vastdb/session.py ADDED
@@ -0,0 +1,48 @@
+ """VAST database session.
+
+ It should be used to interact with a specific VAST cluster.
+ For more details see:
+ - [Virtual IP pool configured with DNS service](https://support.vastdata.com/s/topic/0TOV40000000FThOAM/configuring-network-access-v50)
+ - [S3 access & secret keys on VAST cluster](https://support.vastdata.com/s/article/UUID-4d2e7e23-b2fb-7900-d98f-96c31a499626)
+ - [Tabular identity policy with the proper permissions](https://support.vastdata.com/s/article/UUID-14322b60-d6a2-89ac-3df0-3dfbb6974182)
+ """
+
+ from . import internal_commands
+ from . import transaction
+
+ import boto3
+
+ import os
+
+
+ class Session:
+     """VAST database session."""
+
+     def __init__(self, access=None, secret=None, endpoint=None):
+         """Connect to a VAST Database endpoint, using specified credentials."""
+         if access is None:
+             access = os.environ['AWS_ACCESS_KEY_ID']
+         if secret is None:
+             secret = os.environ['AWS_SECRET_ACCESS_KEY']
+         if endpoint is None:
+             endpoint = os.environ['AWS_S3_ENDPOINT_URL']
+
+         self.api = internal_commands.VastdbApi(endpoint, access, secret)
+         self.s3 = boto3.client('s3',
+                                aws_access_key_id=access,
+                                aws_secret_access_key=secret,
+                                endpoint_url=endpoint)
+
+     def __repr__(self):
+         """Don't show the secret key."""
+         return f'{self.__class__.__name__}(endpoint={self.api.url}, access={self.api.access_key})'
+
+     def transaction(self):
+         """Create a non-initialized transaction object.
+
+         It should be used as a context manager:
+
+             with session.transaction() as tx:
+                 tx.bucket("bucket").create_schema("schema")
+         """
+         return transaction.Transaction(self)
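Credentials and the endpoint may be passed explicitly or picked up from the AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_ENDPOINT_URL environment variables, as the constructor above shows. A minimal connection sketch (the endpoint URL is illustrative, and the commit/rollback behaviour of the context manager is assumed from the docstring, since transaction.py is not shown in this diff):

    from vastdb.session import Session

    session = Session(access="AKIA...", secret="...", endpoint="http://vip-pool.vast.example")
    # Equivalent, relying on the environment-variable fallbacks:
    #     session = Session()
    # or via the package-level helper used by vastdb/tests/conftest.py:
    #     session = vastdb.connect(access=..., secret=..., endpoint=...)

    with session.transaction() as tx:
        tx.bucket("my-bucket").create_schema("demo")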
vastdb/table.py ADDED
@@ -0,0 +1,480 @@
+ from . import errors, schema
+ from .internal_commands import build_query_data_request, parse_query_data_response, \
+     TABULAR_INVALID_ROW_ID, VastdbApi
+
+ import pyarrow as pa
+ import ibis
+
+ import concurrent.futures
+ import queue
+ from threading import Event
+ from math import ceil
+
+ from dataclasses import dataclass, field
+ from typing import List, Union
+ import logging
+ import os
+
+ log = logging.getLogger(__name__)
+
+
+ INTERNAL_ROW_ID = "$row_id"
+ MAX_ROWS_PER_BATCH = 512 * 1024
+ # For inserts we need a smaller limit, due to response amplification:
+ # e.g. an insert of 512k uint8 values results in a 512k*8-byte response, since row_ids are uint64.
+ MAX_INSERT_ROWS_PER_PATCH = 512 * 1024
+
+ @dataclass
+ class TableStats:
+     num_rows: int
+     size_in_bytes: int
+     is_external_rowid_alloc: bool = False
+     endpoints: List[str] = None
+
+ @dataclass
+ class QueryConfig:
+     num_sub_splits: int = 4
+     num_splits: int = 1
+     data_endpoints: [str] = None
+     limit_rows_per_sub_split: int = 128 * 1024
+     num_row_groups_per_sub_split: int = 8
+     use_semi_sorted_projections: bool = True
+     rows_per_split: int = 4000000
+     query_id: str = ""
+
+
+ @dataclass
+ class ImportConfig:
+     import_concurrency: int = 2
+
+ class SelectSplitState():
+     def __init__(self, query_data_request, table : "Table", split_id : int, config: QueryConfig) -> None:
+         self.split_id = split_id
+         self.subsplits_state = {i: 0 for i in range(config.num_sub_splits)}
+         self.config = config
+         self.query_data_request = query_data_request
+         self.table = table
+
+     def batches(self, api : VastdbApi):
+         while not self.done:
+             response = api.query_data(
+                 bucket=self.table.bucket.name,
+                 schema=self.table.schema.name,
+                 table=self.table.name,
+                 params=self.query_data_request.serialized,
+                 split=(self.split_id, self.config.num_splits, self.config.num_row_groups_per_sub_split),
+                 num_sub_splits=self.config.num_sub_splits,
+                 response_row_id=False,
+                 txid=self.table.tx.txid,
+                 limit_rows=self.config.limit_rows_per_sub_split,
+                 sub_split_start_row_ids=self.subsplits_state.items(),
+                 enable_sorted_projections=self.config.use_semi_sorted_projections)
+             pages_iter = parse_query_data_response(
+                 conn=response.raw,
+                 schema=self.query_data_request.response_schema,
+                 start_row_ids=self.subsplits_state)
+
+             for page in pages_iter:
+                 for batch in page.to_batches():
+                     if len(batch) > 0:
+                         yield batch
+
+
+     @property
+     def done(self):
+         return all(row_id == TABULAR_INVALID_ROW_ID for row_id in self.subsplits_state.values())
+
+ @dataclass
+ class Table:
+     name: str
+     schema: "schema.Schema"
+     handle: int
+     stats: TableStats
+     properties: dict = None
+     arrow_schema: pa.Schema = field(init=False, compare=False)
+     _ibis_table: ibis.Schema = field(init=False, compare=False)
+
+     def __post_init__(self):
+         self.properties = self.properties or {}
+         self.arrow_schema = self.columns()
+
+         table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
+         self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(self.arrow_schema), table_path)
+
+     @property
+     def tx(self):
+         return self.schema.tx
+
+     @property
+     def bucket(self):
+         return self.schema.bucket
+
+     def __repr__(self):
+         return f"{type(self).__name__}(name={self.name})"
+
+     def columns(self) -> pa.Schema:
+         fields = []
+         next_key = 0
+         while True:
+             cur_columns, next_key, is_truncated, _count = self.tx._rpc.api.list_columns(
+                 bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid)
+             fields.extend(cur_columns)
+             if not is_truncated:
+                 break
+
+         self.arrow_schema = pa.schema(fields)
+         return self.arrow_schema
+
+     def projection(self, name: str) -> "Projection":
+         projs = self.projections(projection_name=name)
+         if not projs:
+             raise errors.MissingProjection(self.bucket.name, self.schema.name, self.name, name)
+         assert len(projs) == 1, f"Expected to receive only a single projection, but got: {len(projs)}. projections: {projs}"
+         log.debug("Found projection: %s", projs[0])
+         return projs[0]
+
+     def projections(self, projection_name=None) -> ["Projection"]:
+         projections = []
+         next_key = 0
+         name_prefix = projection_name if projection_name else ""
+         exact_match = bool(projection_name)
+         while True:
+             bucket_name, schema_name, table_name, curr_projections, next_key, is_truncated, _ = \
+                 self.tx._rpc.api.list_projections(
+                     bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid,
+                     exact_match=exact_match, name_prefix=name_prefix)
+             if not curr_projections:
+                 break
+             projections.extend(curr_projections)
+             if not is_truncated:
+                 break
+         return [_parse_projection_info(projection, self) for projection in projections]
+
+     def import_files(self, files_to_import: [str], config: ImportConfig = None) -> None:
+         source_files = {}
+         for f in files_to_import:
+             bucket_name, object_path = _parse_bucket_and_object_names(f)
+             source_files[(bucket_name, object_path)] = b''
+
+         self._execute_import(source_files, config=config)
+
+     def import_partitioned_files(self, files_and_partitions: {str: pa.RecordBatch}, config: ImportConfig = None) -> None:
+         source_files = {}
+         for f, record_batch in files_and_partitions.items():
+             bucket_name, object_path = _parse_bucket_and_object_names(f)
+             serialized_batch = _serialize_record_batch(record_batch)
+             source_files = {(bucket_name, object_path): serialized_batch.to_pybytes()}
+
+         self._execute_import(source_files, config=config)
+
+     def _execute_import(self, source_files, config):
+         config = config or ImportConfig()
+         assert config.import_concurrency > 0  # TODO: Do we want to validate concurrency isn't too high?
+         max_batch_size = 10  # Enforced in server side.
+         endpoints = [self.tx._rpc.api.url for _ in range(config.import_concurrency)]  # TODO: use valid endpoints...
+         files_queue = queue.Queue()
+
+         for source_file in source_files.items():
+             files_queue.put(source_file)
+
+         stop_event = Event()
+         num_files_in_batch = min(ceil(len(source_files) / len(endpoints)), max_batch_size)
+
+         def import_worker(q, session):
+             try:
+                 while not q.empty():
+                     if stop_event.is_set():
+                         log.debug("stop_event is set, exiting")
+                         break
+                     files_batch = {}
+                     try:
+                         for _ in range(num_files_in_batch):
+                             files_batch.update({q.get(block=False)})
+                     except queue.Empty:
+                         pass
+                     if files_batch:
+                         log.debug("Starting import batch of %s files", len(files_batch))
+                         session.import_data(
+                             self.bucket.name, self.schema.name, self.name, files_batch, txid=self.tx.txid)
+             except (Exception, KeyboardInterrupt) as e:
+                 stop_event.set()
+                 log.error("Got exception inside import_worker. exception: %s", e)
+                 raise
+
+         futures = []
+         with concurrent.futures.ThreadPoolExecutor(
+                 max_workers=config.import_concurrency, thread_name_prefix='import_thread') as pool:
+             try:
+                 for endpoint in endpoints:
+                     session = VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
+                     futures.append(pool.submit(import_worker, files_queue, session))
+
+                 log.debug("Waiting for import workers to finish")
+                 for future in concurrent.futures.as_completed(futures):
+                     future.result()
+             finally:
+                 stop_event.set()
+                 # ThreadPoolExecutor will be joined at the end of the context
+     def refresh_stats(self):
+         stats_tuple = self.tx._rpc.api.get_table_stats(
+             bucket=self.bucket.name, schema=self.schema.name, name=self.name, txid=self.tx.txid)
+         self.stats = TableStats(**stats_tuple._asdict())
+
+     def select(self, columns: [str] = None,
+                predicate: ibis.expr.types.BooleanColumn = None,
+                config: QueryConfig = None,
+                *,
+                internal_row_id: bool = False) -> pa.RecordBatchReader:
+         if config is None:
+             config = QueryConfig()
+
+         self.refresh_stats()
+
+         if self.stats.num_rows > config.rows_per_split and config.num_splits is None:
+             config.num_splits = self.stats.num_rows // config.rows_per_split
+         log.debug(f"num_rows={self.stats.num_rows} rows_per_splits={config.rows_per_split} num_splits={config.num_splits} ")
+
+         query_schema = self.arrow_schema
+         if internal_row_id:
+             queried_fields = [pa.field(INTERNAL_ROW_ID, pa.uint64())]
+             queried_fields.extend(column for column in self.arrow_schema)
+             query_schema = pa.schema(queried_fields)
+             columns.append(INTERNAL_ROW_ID)
+
+         query_data_request = build_query_data_request(
+             schema=query_schema,
+             predicate=predicate,
+             field_names=columns)
+
+         splits_queue = queue.Queue()
+
+         for split in range(config.num_splits):
+             splits_queue.put(split)
+
+         # This queue shouldn't be large; it is merely a pipe through which the results
+         # are sent to the main thread. Most of the pages are actually held in the
+         # threads that fetch the pages.
+         record_batches_queue = queue.Queue(maxsize=2)
+         stop_event = Event()
+         class StoppedException(Exception):
+             pass
+
+         def check_stop():
+             if stop_event.is_set():
+                 raise StoppedException
+
+         def single_endpoint_worker(endpoint : str):
+             try:
+                 host_api = VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
+                 while True:
+                     check_stop()
+                     try:
+                         split = splits_queue.get_nowait()
+                     except queue.Empty:
+                         log.debug("splits queue is empty")
+                         break
+
+                     split_state = SelectSplitState(query_data_request=query_data_request,
+                                                    table=self,
+                                                    split_id=split,
+                                                    config=config)
+
+                     for batch in split_state.batches(host_api):
+                         check_stop()
+                         record_batches_queue.put(batch)
+             except StoppedException:
+                 log.debug("stop signal.", exc_info=True)
+                 return
+             finally:
+                 # signal that this thread has ended
+                 log.debug("exiting")
+                 record_batches_queue.put(None)
+
+         # Take a snapshot of the endpoints
+         endpoints = list(self.stats.endpoints) if config.data_endpoints is None else list(config.data_endpoints)
+
+         def batches_iterator():
+             def propagate_first_exception(futures : List[concurrent.futures.Future], block = False):
+                 done, not_done = concurrent.futures.wait(futures, None if block else 0, concurrent.futures.FIRST_EXCEPTION)
+                 for future in done:
+                     future.result()
+                 return not_done
+
+             threads_prefix = "query-data"
+             # This is mainly for testing; it helps to identify the running threads at runtime.
+             if config.query_id:
+                 threads_prefix = threads_prefix + "-" + config.query_id
+
+             with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints), thread_name_prefix=threads_prefix) as tp:  # TODO: concurrency == endpoints is just a heuristic
+                 futures = [tp.submit(single_endpoint_worker, endpoint) for endpoint in endpoints]
+                 tasks_running = len(futures)
+                 try:
+                     while tasks_running > 0:
+                         futures = propagate_first_exception(futures, block=False)
+
+                         batch = record_batches_queue.get()
+                         if batch is not None:
+                             yield batch
+                         else:
+                             tasks_running -= 1
+                             log.debug("one worker thread finished, remaining: %d", tasks_running)
+
+                     # all host threads ended - wait for all futures to complete
+                     propagate_first_exception(futures, block=True)
+                 finally:
+                     stop_event.set()
+                     while tasks_running > 0:
+                         if record_batches_queue.get() is None:
+                             tasks_running -= 1
+
+         return pa.RecordBatchReader.from_batches(query_data_request.response_schema.arrow_schema, batches_iterator())
+
+     def _combine_chunks(self, col):
+         if hasattr(col, "combine_chunks"):
+             return col.combine_chunks()
+         else:
+             return col
+
+     def insert(self, rows: pa.RecordBatch) -> pa.RecordBatch:
+         serialized_slices = self.tx._rpc.api._record_batch_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
+         row_ids = []
+         for slice in serialized_slices:
+             res = self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
+                                                txid=self.tx.txid)
+             (batch,) = pa.RecordBatchStreamReader(res.raw)
+             row_ids.append(batch[INTERNAL_ROW_ID])
+
+         return pa.chunked_array(row_ids)
+
+     def update(self, rows: Union[pa.RecordBatch, pa.Table], columns: list = None) -> None:
+         if columns is not None:
+             update_fields = [(INTERNAL_ROW_ID, pa.uint64())]
+             update_values = [self._combine_chunks(rows[INTERNAL_ROW_ID])]
+             for col in columns:
+                 update_fields.append(rows.field(col))
+                 update_values.append(self._combine_chunks(rows[col]))
+
+             update_rows_rb = pa.record_batch(schema=pa.schema(update_fields), data=update_values)
+         else:
+             update_rows_rb = rows
+
+         serialized_slices = self.tx._rpc.api._record_batch_slices(update_rows_rb, MAX_ROWS_PER_BATCH)
+         for slice in serialized_slices:
+             self.tx._rpc.api.update_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
+                                          txid=self.tx.txid)
+
+     def delete(self, rows: Union[pa.RecordBatch, pa.Table]) -> None:
+         delete_rows_rb = pa.record_batch(schema=pa.schema([(INTERNAL_ROW_ID, pa.uint64())]),
+                                          data=[self._combine_chunks(rows[INTERNAL_ROW_ID])])
+
+         serialized_slices = self.tx._rpc.api._record_batch_slices(delete_rows_rb, MAX_ROWS_PER_BATCH)
+         for slice in serialized_slices:
+             self.tx._rpc.api.delete_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
+                                          txid=self.tx.txid)
+
+     def drop(self) -> None:
+         self.tx._rpc.api.drop_table(self.bucket.name, self.schema.name, self.name, txid=self.tx.txid)
+         log.info("Dropped table: %s", self.name)
+
+     def rename(self, new_name) -> None:
+         self.tx._rpc.api.alter_table(
+             self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, new_name=new_name)
+         log.info("Renamed table from %s to %s ", self.name, new_name)
+         self.name = new_name
+
+     def add_column(self, new_column: pa.Schema) -> None:
+         self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
+         log.info("Added column(s): %s", new_column)
+         self.arrow_schema = self.columns()
+
+     def drop_column(self, column_to_drop: pa.Schema) -> None:
+         self.tx._rpc.api.drop_columns(self.bucket.name, self.schema.name, self.name, column_to_drop, txid=self.tx.txid)
+         log.info("Dropped column(s): %s", column_to_drop)
+         self.arrow_schema = self.columns()
+
+     def rename_column(self, current_column_name: str, new_column_name: str) -> None:
+         self.tx._rpc.api.alter_column(self.bucket.name, self.schema.name, self.name, name=current_column_name,
+                                       new_name=new_column_name, txid=self.tx.txid)
+         log.info("Renamed column: %s to %s", current_column_name, new_column_name)
+         self.arrow_schema = self.columns()
+
+     def create_projection(self, projection_name: str, sorted_columns: List[str], unsorted_columns: List[str]) -> "Projection":
+         columns = [(sorted_column, "Sorted") for sorted_column in sorted_columns] + [(unsorted_column, "Unorted") for unsorted_column in unsorted_columns]
+         self.tx._rpc.api.create_projection(self.bucket.name, self.schema.name, self.name, projection_name, columns=columns, txid=self.tx.txid)
+         log.info("Created projection: %s", projection_name)
+         return self.projection(projection_name)
+
+     def __getitem__(self, col_name):
+         return self._ibis_table[col_name]
+
+
+ @dataclass
+ class Projection:
+     name: str
+     table: Table
+     handle: int
+     stats: TableStats
+     properties: dict = None
+
+     @property
+     def bucket(self):
+         return self.table.schema.bucket
+
+     @property
+     def schema(self):
+         return self.table.schema
+
+     @property
+     def tx(self):
+         return self.table.schema.tx
+
+     def __repr__(self):
+         return f"{type(self).__name__}(name={self.name})"
+
+     def columns(self) -> pa.Schema:
+         columns = []
+         next_key = 0
+         while True:
+             curr_columns, next_key, is_truncated, count, _ = \
+                 self.tx._rpc.api.list_projection_columns(
+                     self.bucket.name, self.schema.name, self.table.name, self.name, txid=self.table.tx.txid, next_key=next_key)
+             if not curr_columns:
+                 break
+             columns.extend(curr_columns)
+             if not is_truncated:
+                 break
+         self.arrow_schema = pa.schema([(col[0], col[1]) for col in columns])
+         return self.arrow_schema
+
+     def rename(self, new_name) -> None:
+         self.tx._rpc.api.alter_projection(self.bucket.name, self.schema.name,
+                                           self.table.name, self.name, txid=self.tx.txid, new_name=new_name)
+         log.info("Renamed projection from %s to %s ", self.name, new_name)
+         self.name = new_name
+
+     def drop(self) -> None:
+         self.tx._rpc.api.drop_projection(self.bucket.name, self.schema.name, self.table.name,
+                                          self.name, txid=self.tx.txid)
+         log.info("Dropped projection: %s", self.name)
+
+
+ def _parse_projection_info(projection_info, table: "Table"):
+     log.info("Projection info %s", str(projection_info))
+     stats = TableStats(num_rows=projection_info.num_rows, size_in_bytes=projection_info.size_in_bytes)
+     return Projection(name=projection_info.name, table=table, stats=stats, handle=int(projection_info.handle))
+
+
+ def _parse_bucket_and_object_names(path: str) -> (str, str):
+     if not path.startswith('/'):
+         raise errors.InvalidArgumentError(f"Path {path} must start with a '/'")
+     components = path.split(os.path.sep)
+     bucket_name = components[1]
+     object_path = os.path.sep.join(components[2:])
+     return bucket_name, object_path
+
+
+ def _serialize_record_batch(record_batch: pa.RecordBatch) -> pa.lib.Buffer:
+     sink = pa.BufferOutputStream()
+     with pa.ipc.new_stream(sink, record_batch.schema) as writer:
+         writer.write(record_batch)
+     return sink.getvalue()
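Table is where the data path lives: select() streams results back as a pyarrow RecordBatchReader, fetching splits in parallel across endpoints, while insert/update/delete ship Arrow record batches and use the hidden "$row_id" column to address rows. A rough end-to-end sketch; the bucket, schema, table and column names are invented, and the bucket.schema() lookup is assumed to mirror Schema.table(), since bucket.py is not shown in this hunk:

    import pyarrow as pa

    with session.transaction() as tx:
        # Assumed lookup chain; names are illustrative.
        table = tx.bucket("my-bucket").schema("demo").table("events")

        # Insert a batch; the returned chunked array holds the new internal row IDs.
        batch = pa.record_batch([[1, 2, 3], ["a", "b", "c"]], schema=table.arrow_schema)
        row_ids = table.insert(batch)

        # Project two columns and push an ibis predicate down to the server.
        reader = table.select(columns=["id", "msg"], predicate=table["id"] > 1)
        print(reader.read_all())

        # Deletes are keyed by "$row_id", obtained via select(..., internal_row_id=True).
        doomed = table.select(columns=["id"], predicate=table["id"] == 3, internal_row_id=True).read_all()
        table.delete(doomed)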
vastdb/tests/conftest.py ADDED
@@ -0,0 +1,46 @@
+ import vastdb
+
+ import pytest
+ import boto3
+ import os
+
+
+ def pytest_addoption(parser):
+     parser.addoption("--tabular-bucket-name", help="Name of the S3 bucket with Tabular enabled", default="vastdb")
+     parser.addoption("--tabular-access-key", help="Access key with Tabular permissions (AWS_ACCESS_KEY_ID)", default=os.environ.get("AWS_ACCESS_KEY_ID", None))
+     parser.addoption("--tabular-secret-key", help="Secret key with Tabular permissions (AWS_SECRET_ACCESS_KEY)", default=os.environ.get("AWS_SECRET_ACCESS_KEY", None))
+     parser.addoption("--tabular-endpoint-url", help="Tabular server endpoint", default="http://localhost:9090")
+
+
+ @pytest.fixture(scope="session")
+ def session(request):
+     return vastdb.connect(
+         access=request.config.getoption("--tabular-access-key"),
+         secret=request.config.getoption("--tabular-secret-key"),
+         endpoint=request.config.getoption("--tabular-endpoint-url"),
+     )
+
+
+ @pytest.fixture(scope="session")
+ def test_bucket_name(request):
+     return request.config.getoption("--tabular-bucket-name")
+
+
+ @pytest.fixture(scope="function")
+ def clean_bucket_name(request, test_bucket_name, session):
+     with session.transaction() as tx:
+         b = tx.bucket(test_bucket_name)
+         for s in b.schemas():
+             for t in s.tables():
+                 t.drop()
+             s.drop()
+     return test_bucket_name
+
+
+ @pytest.fixture(scope="session")
+ def s3(request):
+     return boto3.client(
+         's3',
+         aws_access_key_id=request.config.getoption("--tabular-access-key"),
+         aws_secret_access_key=request.config.getoption("--tabular-secret-key"),
+         endpoint_url=request.config.getoption("--tabular-endpoint-url"))
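These fixtures give each test a connected Session, a cleaned-out bucket to work in, and a plain boto3 client for raw S3 access. A sketch of how a test might combine them; the schema and table names and the column layout are invented for illustration:

    import pyarrow as pa


    def test_insert_and_select(session, clean_bucket_name):
        columns = pa.schema([("id", pa.int64()), ("name", pa.utf8())])
        with session.transaction() as tx:
            schema = tx.bucket(clean_bucket_name).create_schema("demo_schema")
            table = schema.create_table("demo_table", columns)
            table.insert(pa.record_batch([[1, 2], ["a", "b"]], schema=columns))
            result = table.select(columns=["id", "name"]).read_all()
            assert result.num_rows == 2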