vastdb 0.0.5.3__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. vast_flatbuf/tabular/GetTableStatsResponse.py +45 -1
  2. vast_flatbuf/tabular/VipRange.py +56 -0
  3. vastdb/__init__.py +7 -0
  4. vastdb/bucket.py +77 -0
  5. vastdb/errors.py +158 -0
  6. vastdb/{api.py → internal_commands.py} +280 -746
  7. vastdb/schema.py +77 -0
  8. vastdb/session.py +48 -0
  9. vastdb/table.py +480 -0
  10. vastdb/tests/conftest.py +15 -14
  11. vastdb/tests/test_imports.py +125 -0
  12. vastdb/tests/test_projections.py +41 -0
  13. vastdb/tests/test_sanity.py +36 -16
  14. vastdb/tests/test_schemas.py +12 -6
  15. vastdb/tests/test_tables.py +581 -13
  16. vastdb/transaction.py +55 -0
  17. vastdb/util.py +8 -8
  18. vastdb-0.1.0.dist-info/METADATA +38 -0
  19. {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/RECORD +22 -31
  20. vast_protobuf/__init__.py +0 -0
  21. vast_protobuf/substrait/__init__.py +0 -0
  22. vast_protobuf/substrait/algebra_pb2.py +0 -1344
  23. vast_protobuf/substrait/capabilities_pb2.py +0 -46
  24. vast_protobuf/substrait/ddl_pb2.py +0 -57
  25. vast_protobuf/substrait/extended_expression_pb2.py +0 -49
  26. vast_protobuf/substrait/extensions/__init__.py +0 -0
  27. vast_protobuf/substrait/extensions/extensions_pb2.py +0 -89
  28. vast_protobuf/substrait/function_pb2.py +0 -168
  29. vast_protobuf/substrait/parameterized_types_pb2.py +0 -181
  30. vast_protobuf/substrait/plan_pb2.py +0 -67
  31. vast_protobuf/substrait/type_expressions_pb2.py +0 -198
  32. vast_protobuf/substrait/type_pb2.py +0 -350
  33. vast_protobuf/tabular/__init__.py +0 -0
  34. vast_protobuf/tabular/rpc_pb2.py +0 -344
  35. vastdb/bench_scan.py +0 -45
  36. vastdb/tests/test_create_table_from_parquets.py +0 -50
  37. vastdb/v2.py +0 -360
  38. vastdb-0.0.5.3.dist-info/METADATA +0 -47
  39. {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/LICENSE +0 -0
  40. {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/WHEEL +0 -0
  41. {vastdb-0.0.5.3.dist-info → vastdb-0.1.0.dist-info}/top_level.txt +0 -0
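The file list above shows the 0.0.5.x layout (a monolithic `api.py` plus the `v2.py` wrapper) being split into dedicated modules: `session.py`, `transaction.py`, `bucket.py`, `schema.py`, `table.py` and `errors.py`, with the old `api.py` renamed to `internal_commands.py`. A minimal sketch of how the restructured 0.1.0 package is likely used, assuming it keeps the `connect()` / `transaction()` / `bucket()` / `schema()` / `table()` flow of the removed `v2.py` shown below (the endpoint, credentials, bucket and schema names are illustrative, not taken from this diff):

```python
import pyarrow as pa
import vastdb  # 0.1.0: session.py / transaction.py / bucket.py / schema.py / table.py

# Assumed entry point, mirroring the connect() helper of the removed v2.py.
session = vastdb.connect(
    endpoint='http://vip-pool.example.com',
    access='ACCESS_KEY',
    secret='SECRET_KEY',
)

with session.transaction() as tx:      # commit on success, rollback on error
    bucket = tx.bucket('vast-db')      # illustrative bucket name
    schema = bucket.create_schema('my_schema')
    table = schema.create_table('events', pa.schema([('id', pa.int64())]))
    table.insert(pa.record_batch([pa.array([1, 2, 3])], names=['id']))
```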
vastdb/v2.py DELETED
@@ -1,360 +0,0 @@
- from dataclasses import dataclass, field
- import logging
- import os
-
- import boto3
- import botocore
- import ibis
- import pyarrow as pa
- import requests
-
- from vastdb.api import VastdbApi, serialize_record_batch, build_query_data_request, parse_query_data_response, TABULAR_INVALID_ROW_ID
-
-
- log = logging.getLogger(__name__)
-
-
- class VastException(Exception):
-     pass
-
-
- class NotFoundError(VastException):
-     pass
-
-
- class AccessDeniedError(VastException):
-     pass
-
-
- class ImportFilesError(VastException):
-     pass
-
-
- class InvalidArgumentError(VastException):
-     pass
-
-
- class RPC:
-     def __init__(self, access=None, secret=None, endpoint=None):
-         if access is None:
-             access = os.environ['AWS_ACCESS_KEY_ID']
-         if secret is None:
-             secret = os.environ['AWS_SECRET_ACCESS_KEY']
-         if endpoint is None:
-             endpoint = os.environ['AWS_S3_ENDPOINT_URL']
-
-         self.api = VastdbApi(endpoint, access, secret)
-         self.s3 = boto3.client('s3',
-                                aws_access_key_id=access,
-                                aws_secret_access_key=secret,
-                                endpoint_url=endpoint)
-
-     def __repr__(self):
-         return f'RPC(endpoint={self.api.url}, access={self.api.access_key})'
-
-     def transaction(self):
-         return Transaction(self)
-
-
- def connect(*args, **kw):
-     return RPC(*args, **kw)
-
-
- @dataclass
- class Transaction:
-     _rpc: RPC
-     txid: int = None
-
-     def __enter__(self):
-         response = self._rpc.api.begin_transaction()
-         self.txid = int(response.headers['tabular-txid'])
-         log.debug("opened txid=%016x", self.txid)
-         return self
-
-     def __exit__(self, *args):
-         if args == (None, None, None):
-             log.debug("committing txid=%016x", self.txid)
-             self._rpc.api.commit_transaction(self.txid)
-         else:
-             log.exception("rolling back txid=%016x", self.txid)
-             self._rpc.api.rollback_transaction(self.txid)
-
-     def __repr__(self):
-         return f'Transaction(id=0x{self.txid:016x})'
-
-     def bucket(self, name: str) -> "Bucket":
-         try:
-             self._rpc.s3.head_bucket(Bucket=name)
-             return Bucket(name, self)
-         except botocore.exceptions.ClientError as e:
-             if e.response['Error']['Code'] == 403:
-                 raise AccessDeniedError(f"Access is denied to bucket: {name}") from e
-             else:
-                 raise NotFoundError(f"Bucket {name} does not exist") from e
-
-
- @dataclass
- class Bucket:
-     name: str
-     tx: Transaction
-
-     def create_schema(self, path: str) -> "Schema":
-         self.tx._rpc.api.create_schema(self.name, path, txid=self.tx.txid)
-         log.info("Created schema: %s", path)
-         return self.schema(path)
-
-     def schema(self, path: str) -> "Schema":
-         schema = self.schemas(path)
-         log.debug("schema: %s", schema)
-         if not schema:
-             raise NotFoundError(f"Schema '{path}' was not found in bucket: {self.name}")
-         assert len(schema) == 1, f"Expected to receive only a single schema, but got: {len(schema)}. ({schema})"
-         log.debug("Found schema: %s", schema[0].name)
-         return schema[0]
-
-     def schemas(self, schema: str = None) -> ["Schema"]:
-         schemas = []
-         next_key = 0
-         exact_match = bool(schema)
-         log.debug("list schemas param: schema=%s, exact_match=%s", schema, exact_match)
-         while True:
-             bucket_name, curr_schemas, next_key, is_truncated, _ = \
-                 self.tx._rpc.api.list_schemas(bucket=self.name, next_key=next_key, txid=self.tx.txid,
-                                               name_prefix=schema, exact_match=exact_match)
-             if not curr_schemas:
-                 break
-             schemas.extend(curr_schemas)
-             if not is_truncated:
-                 break
-
-         return [Schema(name=name, bucket=self) for name, *_ in schemas]
-
-
- @dataclass
- class Schema:
-     name: str
-     bucket: Bucket
-
-     @property
-     def tx(self):
-         return self.bucket.tx
-
-     def create_table(self, table_name: str, columns: pa.Schema) -> "Table":
-         self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid)
-         log.info("Created table: %s", table_name)
-         return self.table(table_name)
-
-     def table(self, name: str) -> "Table":
-         t = self.tables(table_name=name)
-         if not t:
-             raise NotFoundError(f"Table '{name}' was not found under schema: {self.name}")
-         assert len(t) == 1, f"Expected to receive only a single table, but got: {len(t)}. tables: {t}"
-         log.debug("Found table: %s", t[0])
-         return t[0]
-
-     def tables(self, table_name=None) -> ["Table"]:
-         tables = []
-         next_key = 0
-         name_prefix = table_name if table_name else ""
-         exact_match = bool(table_name)
-         while True:
-             bucket_name, schema_name, curr_tables, next_key, is_truncated, _ = \
-                 self.tx._rpc.api.list_tables(
-                     bucket=self.bucket.name, schema=self.name, next_key=next_key, txid=self.tx.txid,
-                     exact_match=exact_match, name_prefix=name_prefix)
-             if not curr_tables:
-                 break
-             tables.extend(curr_tables)
-             if not is_truncated:
-                 break
-
-         return [_parse_table_info(table, self) for table in tables]
-
-     def drop(self) -> None:
-         self.tx._rpc.api.drop_schema(self.bucket.name, self.name, txid=self.tx.txid)
-         log.info("Dropped schema: %s", self.name)
-
-     def rename(self, new_name) -> None:
-         self.tx._rpc.api.alter_schema(self.bucket.name, self.name, txid=self.tx.txid, new_name=new_name)
-         log.info("Renamed schema: %s to %s", self.name, new_name)
-         self.name = new_name
-
-
- @dataclass
- class TableStats:
-     num_rows: int
-     size: int
-
-
- @dataclass
- class QueryConfig:
-     num_sub_splits: int = 4
-     num_splits: int = 1
-     data_endpoints: [str] = None
-     limit_per_sub_split: int = 128 * 1024
-     num_row_groups_per_sub_split: int = 8
-
-
- @dataclass
- class Table:
-     name: str
-     schema: pa.Schema
-     handle: int
-     stats: TableStats
-     properties: dict = None
-     arrow_schema: pa.Schema = field(init=False, compare=False)
-     _ibis_table: ibis.Schema = field(init=False, compare=False)
-
-     def __post_init__(self):
-         self.properties = self.properties or {}
-         self.arrow_schema = self.columns()
-         self._ibis_table = ibis.Schema.from_pyarrow(self.arrow_schema)
-
-     @property
-     def tx(self):
-         return self.schema.tx
-
-     @property
-     def bucket(self):
-         return self.schema.bucket
-
-     def __repr__(self):
-         return f"{type(self).__name__}(name={self.name})"
-
-     def columns(self) -> pa.Schema:
-         cols = self.tx._rpc.api._list_table_columns(self.bucket.name, self.schema.name, self.name, txid=self.tx.txid)
-         self.arrow_schema = pa.schema([(col[0], col[1]) for col in cols])
-         return self.arrow_schema
-
-     def import_files(self, files_to_import: [str]) -> None:
-         source_files = {}
-         for f in files_to_import:
-             bucket_name, object_path = _parse_bucket_and_object_names(f)
-             source_files[(bucket_name, object_path)] = b''
-
-         self._execute_import(source_files)
-
-     def import_partitioned_files(self, files_and_partitions: {str: pa.RecordBatch}) -> None:
-         source_files = {}
-         for f, record_batch in files_and_partitions.items():
-             bucket_name, object_path = _parse_bucket_and_object_names(f)
-             serialized_batch = _serialize_record_batch(record_batch)
-             source_files = {(bucket_name, object_path): serialized_batch.to_pybytes()}
-
-         self._execute_import(source_files)
-
-     def _execute_import(self, source_files):
-         try:
-             self.tx._rpc.api.import_data(
-                 self.bucket.name, self.schema.name, self.name, source_files, txid=self.tx.txid)
-         except requests.HTTPError as e:
-             raise ImportFilesError(f"import_files failed with status: {e.response.status_code}, reason: {e.response.reason}")
-         except Exception as e:
-             # TODO: investigate and raise proper error in case of failure mid import.
-             raise ImportFilesError("import_files failed") from e
-
-     def select(self, columns: [str], predicate: ibis.expr.types.BooleanColumn = None,
-                config: "QueryConfig" = None):
-         if config is None:
-             config = QueryConfig()
-
-         api = self.tx._rpc.api
-         field_names = columns
-         filters = []
-         bucket = self.bucket.name
-         schema = self.schema.name
-         table = self.name
-         query_data_request = build_query_data_request(
-             schema=self.arrow_schema, filters=filters, field_names=field_names)
-
-         start_row_ids = {i: 0 for i in range(config.num_sub_splits)}
-         assert config.num_splits == 1  # TODO()
-         split = (0, 1, config.num_row_groups_per_sub_split)
-         response_row_id = False
-
-         while not all(row_id == TABULAR_INVALID_ROW_ID for row_id in start_row_ids.values()):
-             response = api.query_data(
-                 bucket=bucket,
-                 schema=schema,
-                 table=table,
-                 params=query_data_request.serialized,
-                 split=split,
-                 num_sub_splits=config.num_sub_splits,
-                 response_row_id=response_row_id,
-                 txid=self.tx.txid,
-                 limit_rows=config.limit_per_sub_split,
-                 sub_split_start_row_ids=start_row_ids.items())
-
-             pages_iter = parse_query_data_response(
-                 conn=response.raw,
-                 schema=query_data_request.response_schema,
-                 start_row_ids=start_row_ids)
-
-             for page in pages_iter:
-                 for batch in page.to_batches():
-                     if len(batch) > 0:
-                         yield batch
-
-     def insert(self, rows: pa.RecordBatch) -> None:
-         blob = serialize_record_batch(rows)
-         self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=blob, txid=self.tx.txid)
-
-     def drop(self) -> None:
-         self.tx._rpc.api.drop_table(self.bucket.name, self.schema.name, self.name, txid=self.tx.txid)
-         log.info("Dropped table: %s", self.name)
-
-     def rename(self, new_name) -> None:
-         self.tx._rpc.api.alter_table(
-             self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, new_name=new_name)
-         log.info("Renamed table from %s to %s ", self.name, new_name)
-         self.name = new_name
-
-     def add_column(self, new_column: pa.Schema) -> None:
-         self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
-         log.info("Added column(s): %s", new_column)
-         self.arrow_schema = self.columns()
-
-     def drop_column(self, column_to_drop: pa.Schema) -> None:
-         self.tx._rpc.api.drop_columns(self.bucket.name, self.schema.name, self.name, column_to_drop, txid=self.tx.txid)
-         log.info("Dropped column(s): %s", column_to_drop)
-         self.arrow_schema = self.columns()
-
-     def rename_column(self, current_column_name: str, new_column_name: str) -> None:
-         self.tx._rpc.api.alter_column(self.bucket.name, self.schema.name, self.name, name=current_column_name,
-                                       new_name=new_column_name, txid=self.tx.txid)
-         log.info("Renamed column: %s to %s", current_column_name, new_column_name)
-         self.arrow_schema = self.columns()
-
-     def __getitem__(self, col_name):
-         return self._ibis_table[col_name]
-
-
- def _parse_table_info(table_info, schema: "Schema"):
-     stats = TableStats(num_rows=table_info.num_rows, size=table_info.size_in_bytes)
-     return Table(name=table_info.name, schema=schema, handle=int(table_info.handle), stats=stats)
-
-
- def _parse_bucket_and_object_names(path: str) -> (str, str):
-     if not path.startswith('/'):
-         raise InvalidArgumentError(f"Path {path} must start with a '/'")
-     components = path.split(os.path.sep)
-     bucket_name = components[1]
-     object_path = os.path.sep.join(components[2:])
-     return bucket_name, object_path
-
-
- def _serialize_record_batch(record_batch: pa.RecordBatch) -> pa.lib.Buffer:
-     sink = pa.BufferOutputStream()
-     with pa.ipc.new_stream(sink, record_batch.schema) as writer:
-         writer.write(record_batch)
-     return sink.getvalue()
-
-
- def _parse_endpoint(endpoint):
-     if ":" in endpoint:
-         endpoint, port = endpoint.split(":")
-         port = int(port)
-     else:
-         port = 80
-     log.debug("endpoint: %s, port: %d", endpoint, port)
-     return endpoint, port
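For reference, a short usage sketch of the removed `v2` module, based only on the deleted code above (the credentials, bucket, schema and table names are illustrative):

```python
import pyarrow as pa
from vastdb import v2  # removed in 0.1.0

# connect() builds an RPC wrapping VastdbApi and a boto3 S3 client.
rpc = v2.connect(access='...', secret='...', endpoint='http://vip-pool.example.com')

with rpc.transaction() as tx:                   # __enter__ begins, __exit__ commits or rolls back
    bucket = tx.bucket('vast-db')               # verified via s3.head_bucket
    schema = bucket.create_schema('my_schema')
    table = schema.create_table('events', pa.schema([('id', pa.int64())]))
    table.insert(pa.record_batch([pa.array([1, 2, 3])], names=['id']))
    for batch in table.select(columns=['id']):  # generator of pyarrow RecordBatches
        print(batch.num_rows)
```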
@@ -1,47 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: vastdb
3
- Version: 0.0.5.3
4
- Summary: VAST Data SDK
5
- Home-page: https://github.com/vast-data/vastdb_sdk
6
- Author: VAST DATA
7
- Author-email: hello@vastdata.com
8
- License: Copyright (C) VAST Data Ltd.
9
- Platform: UNKNOWN
10
- Description-Content-Type: text/markdown
11
- License-File: LICENSE
12
- Requires-Dist: flatbuffers
13
- Requires-Dist: pyarrow
14
- Requires-Dist: requests
15
- Requires-Dist: aws-requests-auth
16
- Requires-Dist: xmltodict
17
-
18
-
19
- `VastdbApi` is a Python based API designed for interacting with *VastDB* & *Vast Catalog*, enabling operations such as schema and table management, data querying, and transaction handling.
20
- Key libraries used in this API include requests for HTTP requests, pyarrow for handling Apache Arrow data formats, and flatbuffers for efficient serialization of data structures.
21
-
22
-
23
- ```
24
- pip install vastdb
25
- ```
26
-
27
- ## Creating the initial session with VastdbApi:
28
-
29
- ```python
30
- from vastdb import api
31
- import pyarrow as pa
32
- import vast_flatbuf
33
- from vastdb.api import VastdbApi
34
-
35
- def create_vastdb_session(access_key, secret_key):
36
- return VastdbApi(host='VAST_VIP_POOL_DNS_NAME', access_key=access_key, secret_key=secret_key)
37
-
38
-
39
- access_key='D8UDFDF...'
40
- secret_key='B7bqMegmj+TDN..'
41
- vastdb_session = create_vastdb_session(access_key, secret_key)
42
-
43
- ```
44
- #### For the complete Guide for the SDK please go to VastData github: https://github.com/vast-data/vastdb_sdk
45
-
46
-
47
-
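The deleted METADATA only shows how to open a `VastdbApi` session. A rough sketch of how that session was then driven directly, using the method names the removed `v2.py` calls against `VastdbApi` (`begin_transaction`, `create_schema`, `create_table`, `commit_transaction`); argument details may differ, and the bucket/schema/table names are illustrative:

```python
import pyarrow as pa

# vastdb_session is the VastdbApi instance from the README snippet above.
txid = int(vastdb_session.begin_transaction().headers['tabular-txid'])
vastdb_session.create_schema('vast-db', 'my_schema', txid=txid)
vastdb_session.create_table('vast-db', 'my_schema', 'events',
                            pa.schema([('id', pa.int64())]), txid=txid)
vastdb_session.commit_transaction(txid)
```

In 0.1.0 this low-level surface moves to `vastdb/internal_commands.py`, with the transaction and table operations exposed through the new `session.py`, `transaction.py` and `table.py` modules listed above.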