vastdb 0.0.5.3__py3-none-any.whl → 0.0.5.4__py3-none-any.whl

This diff compares publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
vastdb/util.py DELETED
@@ -1,77 +0,0 @@
- import logging
- from typing import Callable
-
- import pyarrow as pa
- import pyarrow.parquet as pq
-
- from vastdb.v2 import InvalidArgumentError, Table, Schema
-
-
- log = logging.getLogger(__name__)
-
-
- def create_table_from_files(
-         schema: Schema, table_name: str, parquet_files: [str], schema_merge_func: Callable = None) -> Table:
-     if not schema_merge_func:
-         schema_merge_func = default_schema_merge
-     else:
-         assert schema_merge_func in [default_schema_merge, strict_schema_merge, union_schema_merge]
-     tx = schema.tx
-     current_schema = pa.schema([])
-     s3fs = pa.fs.S3FileSystem(
-         access_key=tx._rpc.api.access_key, secret_key=tx._rpc.api.secret_key, endpoint_override=tx._rpc.api.url)
-     for prq_file in parquet_files:
-         if not prq_file.startswith('/'):
-             raise InvalidArgumentError(f"Path {prq_file} must start with a '/'")
-         parquet_ds = pq.ParquetDataset(prq_file.lstrip('/'), filesystem=s3fs)
-         current_schema = schema_merge_func(current_schema, parquet_ds.schema)
-
-
-     log.info("Creating table %s from %d Parquet files, with columns: %s",
-              table_name, len(parquet_files), list(current_schema))
-     table = schema.create_table(table_name, current_schema)
-
-     log.info("Starting import of %d files to table: %s", len(parquet_files), table)
-     table.import_files(parquet_files)
-     log.info("Finished import of %d files to table: %s", len(parquet_files), table)
-     return table
-
-
- def default_schema_merge(current_schema: pa.Schema, new_schema: pa.Schema) -> pa.Schema:
-     """
-     This function validates a schema is contained in another schema
-     Raises an InvalidArgumentError if a certain field does not exist in the target schema
-     """
-     if not current_schema.names:
-         return new_schema
-     s1 = set(current_schema)
-     s2 = set(new_schema)
-
-     if len(s1) > len(s2):
-         s1, s2 = s2, s1
-         result = current_schema  # We need this variable in order to preserve the original fields order
-     else:
-         result = new_schema
-
-     if not s1.issubset(s2):
-         log.error("Schema mismatch. schema: %s isn't contained in schema: %s.", s1, s2)
-         raise InvalidArgumentError("Found mismatch in parquet files schemas.")
-     return result
-
-
- def strict_schema_merge(current_schema: pa.Schema, new_schema: pa.Schema) -> pa.Schema:
-     """
-     This function validates two Schemas are identical.
-     Raises an InvalidArgumentError if schemas aren't identical.
-     """
-     if current_schema.names and current_schema != new_schema:
-         raise InvalidArgumentError(f"Schemas are not identical. \n {current_schema} \n vs \n {new_schema}")
-
-     return new_schema
-
-
- def union_schema_merge(current_schema: pa.Schema, new_schema: pa.Schema) -> pa.Schema:
-     """
-     This function returns a unified schema from potentially two different schemas.
-     """
-     return pa.unify_schemas([current_schema, new_schema])
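For context, a minimal usage sketch (not part of the package) of the helper removed above, written against vastdb 0.0.5.3 where vastdb.util and vastdb.v2 still exist; the bucket, schema, table name, and Parquet paths are hypothetical placeholders:

import vastdb.v2 as v2
from vastdb.util import create_table_from_files, union_schema_merge

rpc = v2.connect()  # with no arguments, credentials come from the AWS_* environment variables, per the deleted RPC.__init__
with rpc.transaction() as tx:
    schema = tx.bucket('my-bucket').schema('my_schema')  # hypothetical, pre-existing bucket and schema
    create_table_from_files(
        schema, 'events',                                  # hypothetical table name
        parquet_files=['/my-bucket/staging/part-0.parquet',
                       '/my-bucket/staging/part-1.parquet'],  # paths must start with '/'
        schema_merge_func=union_schema_merge)              # unify differing file schemas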
vastdb/v2.py DELETED
@@ -1,360 +0,0 @@
- from dataclasses import dataclass, field
- import logging
- import os
-
- import boto3
- import botocore
- import ibis
- import pyarrow as pa
- import requests
-
- from vastdb.api import VastdbApi, serialize_record_batch, build_query_data_request, parse_query_data_response, TABULAR_INVALID_ROW_ID
-
-
- log = logging.getLogger(__name__)
-
-
- class VastException(Exception):
-     pass
-
-
- class NotFoundError(VastException):
-     pass
-
-
- class AccessDeniedError(VastException):
-     pass
-
-
- class ImportFilesError(VastException):
-     pass
-
-
- class InvalidArgumentError(VastException):
-     pass
-
-
- class RPC:
-     def __init__(self, access=None, secret=None, endpoint=None):
-         if access is None:
-             access = os.environ['AWS_ACCESS_KEY_ID']
-         if secret is None:
-             secret = os.environ['AWS_SECRET_ACCESS_KEY']
-         if endpoint is None:
-             endpoint = os.environ['AWS_S3_ENDPOINT_URL']
-
-         self.api = VastdbApi(endpoint, access, secret)
-         self.s3 = boto3.client('s3',
-                                aws_access_key_id=access,
-                                aws_secret_access_key=secret,
-                                endpoint_url=endpoint)
-
-     def __repr__(self):
-         return f'RPC(endpoint={self.api.url}, access={self.api.access_key})'
-
-     def transaction(self):
-         return Transaction(self)
-
-
- def connect(*args, **kw):
-     return RPC(*args, **kw)
-
-
- @dataclass
- class Transaction:
-     _rpc: RPC
-     txid: int = None
-
-     def __enter__(self):
-         response = self._rpc.api.begin_transaction()
-         self.txid = int(response.headers['tabular-txid'])
-         log.debug("opened txid=%016x", self.txid)
-         return self
-
-     def __exit__(self, *args):
-         if args == (None, None, None):
-             log.debug("committing txid=%016x", self.txid)
-             self._rpc.api.commit_transaction(self.txid)
-         else:
-             log.exception("rolling back txid=%016x", self.txid)
-             self._rpc.api.rollback_transaction(self.txid)
-
-     def __repr__(self):
-         return f'Transaction(id=0x{self.txid:016x})'
-
-     def bucket(self, name: str) -> "Bucket":
-         try:
-             self._rpc.s3.head_bucket(Bucket=name)
-             return Bucket(name, self)
-         except botocore.exceptions.ClientError as e:
-             if e.response['Error']['Code'] == 403:
-                 raise AccessDeniedError(f"Access is denied to bucket: {name}") from e
-             else:
-                 raise NotFoundError(f"Bucket {name} does not exist") from e
-
-
- @dataclass
- class Bucket:
-     name: str
-     tx: Transaction
-
-     def create_schema(self, path: str) -> "Schema":
-         self.tx._rpc.api.create_schema(self.name, path, txid=self.tx.txid)
-         log.info("Created schema: %s", path)
-         return self.schema(path)
-
-     def schema(self, path: str) -> "Schema":
-         schema = self.schemas(path)
-         log.debug("schema: %s", schema)
-         if not schema:
-             raise NotFoundError(f"Schema '{path}' was not found in bucket: {self.name}")
-         assert len(schema) == 1, f"Expected to receive only a single schema, but got: {len(schema)}. ({schema})"
-         log.debug("Found schema: %s", schema[0].name)
-         return schema[0]
-
-     def schemas(self, schema: str = None) -> ["Schema"]:
-         schemas = []
-         next_key = 0
-         exact_match = bool(schema)
-         log.debug("list schemas param: schema=%s, exact_match=%s", schema, exact_match)
-         while True:
-             bucket_name, curr_schemas, next_key, is_truncated, _ = \
-                 self.tx._rpc.api.list_schemas(bucket=self.name, next_key=next_key, txid=self.tx.txid,
-                                               name_prefix=schema, exact_match=exact_match)
-             if not curr_schemas:
-                 break
-             schemas.extend(curr_schemas)
-             if not is_truncated:
-                 break
-
-         return [Schema(name=name, bucket=self) for name, *_ in schemas]
-
-
- @dataclass
- class Schema:
-     name: str
-     bucket: Bucket
-
-     @property
-     def tx(self):
-         return self.bucket.tx
-
-     def create_table(self, table_name: str, columns: pa.Schema) -> "Table":
-         self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid)
-         log.info("Created table: %s", table_name)
-         return self.table(table_name)
-
-     def table(self, name: str) -> "Table":
-         t = self.tables(table_name=name)
-         if not t:
-             raise NotFoundError(f"Table '{name}' was not found under schema: {self.name}")
-         assert len(t) == 1, f"Expected to receive only a single table, but got: {len(t)}. tables: {t}"
-         log.debug("Found table: %s", t[0])
-         return t[0]
-
-     def tables(self, table_name=None) -> ["Table"]:
-         tables = []
-         next_key = 0
-         name_prefix = table_name if table_name else ""
-         exact_match = bool(table_name)
-         while True:
-             bucket_name, schema_name, curr_tables, next_key, is_truncated, _ = \
-                 self.tx._rpc.api.list_tables(
-                     bucket=self.bucket.name, schema=self.name, next_key=next_key, txid=self.tx.txid,
-                     exact_match=exact_match, name_prefix=name_prefix)
-             if not curr_tables:
-                 break
-             tables.extend(curr_tables)
-             if not is_truncated:
-                 break
-
-         return [_parse_table_info(table, self) for table in tables]
-
-     def drop(self) -> None:
-         self.tx._rpc.api.drop_schema(self.bucket.name, self.name, txid=self.tx.txid)
-         log.info("Dropped schema: %s", self.name)
-
-     def rename(self, new_name) -> None:
-         self.tx._rpc.api.alter_schema(self.bucket.name, self.name, txid=self.tx.txid, new_name=new_name)
-         log.info("Renamed schema: %s to %s", self.name, new_name)
-         self.name = new_name
-
-
- @dataclass
- class TableStats:
-     num_rows: int
-     size: int
-
-
- @dataclass
- class QueryConfig:
-     num_sub_splits: int = 4
-     num_splits: int = 1
-     data_endpoints: [str] = None
-     limit_per_sub_split: int = 128 * 1024
-     num_row_groups_per_sub_split: int = 8
-
-
- @dataclass
- class Table:
-     name: str
-     schema: pa.Schema
-     handle: int
-     stats: TableStats
-     properties: dict = None
-     arrow_schema: pa.Schema = field(init=False, compare=False)
-     _ibis_table: ibis.Schema = field(init=False, compare=False)
-
-     def __post_init__(self):
-         self.properties = self.properties or {}
-         self.arrow_schema = self.columns()
-         self._ibis_table = ibis.Schema.from_pyarrow(self.arrow_schema)
-
-     @property
-     def tx(self):
-         return self.schema.tx
-
-     @property
-     def bucket(self):
-         return self.schema.bucket
-
-     def __repr__(self):
-         return f"{type(self).__name__}(name={self.name})"
-
-     def columns(self) -> pa.Schema:
-         cols = self.tx._rpc.api._list_table_columns(self.bucket.name, self.schema.name, self.name, txid=self.tx.txid)
-         self.arrow_schema = pa.schema([(col[0], col[1]) for col in cols])
-         return self.arrow_schema
-
-     def import_files(self, files_to_import: [str]) -> None:
-         source_files = {}
-         for f in files_to_import:
-             bucket_name, object_path = _parse_bucket_and_object_names(f)
-             source_files[(bucket_name, object_path)] = b''
-
-         self._execute_import(source_files)
-
-     def import_partitioned_files(self, files_and_partitions: {str: pa.RecordBatch}) -> None:
-         source_files = {}
-         for f, record_batch in files_and_partitions.items():
-             bucket_name, object_path = _parse_bucket_and_object_names(f)
-             serialized_batch = _serialize_record_batch(record_batch)
-             source_files = {(bucket_name, object_path): serialized_batch.to_pybytes()}
-
-         self._execute_import(source_files)
-
-     def _execute_import(self, source_files):
-         try:
-             self.tx._rpc.api.import_data(
-                 self.bucket.name, self.schema.name, self.name, source_files, txid=self.tx.txid)
-         except requests.HTTPError as e:
-             raise ImportFilesError(f"import_files failed with status: {e.response.status_code}, reason: {e.response.reason}")
-         except Exception as e:
-             # TODO: investigate and raise proper error in case of failure mid import.
-             raise ImportFilesError("import_files failed") from e
-
-     def select(self, columns: [str], predicate: ibis.expr.types.BooleanColumn = None,
-                config: "QueryConfig" = None):
-         if config is None:
-             config = QueryConfig()
-
-         api = self.tx._rpc.api
-         field_names = columns
-         filters = []
-         bucket = self.bucket.name
-         schema = self.schema.name
-         table = self.name
-         query_data_request = build_query_data_request(
-             schema=self.arrow_schema, filters=filters, field_names=field_names)
-
-         start_row_ids = {i: 0 for i in range(config.num_sub_splits)}
-         assert config.num_splits == 1  # TODO()
-         split = (0, 1, config.num_row_groups_per_sub_split)
-         response_row_id = False
-
-         while not all(row_id == TABULAR_INVALID_ROW_ID for row_id in start_row_ids.values()):
-             response = api.query_data(
-                 bucket=bucket,
-                 schema=schema,
-                 table=table,
-                 params=query_data_request.serialized,
-                 split=split,
-                 num_sub_splits=config.num_sub_splits,
-                 response_row_id=response_row_id,
-                 txid=self.tx.txid,
-                 limit_rows=config.limit_per_sub_split,
-                 sub_split_start_row_ids=start_row_ids.items())
-
-             pages_iter = parse_query_data_response(
-                 conn=response.raw,
-                 schema=query_data_request.response_schema,
-                 start_row_ids=start_row_ids)
-
-             for page in pages_iter:
-                 for batch in page.to_batches():
-                     if len(batch) > 0:
-                         yield batch
-
-     def insert(self, rows: pa.RecordBatch) -> None:
-         blob = serialize_record_batch(rows)
-         self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=blob, txid=self.tx.txid)
-
-     def drop(self) -> None:
-         self.tx._rpc.api.drop_table(self.bucket.name, self.schema.name, self.name, txid=self.tx.txid)
-         log.info("Dropped table: %s", self.name)
-
-     def rename(self, new_name) -> None:
-         self.tx._rpc.api.alter_table(
-             self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, new_name=new_name)
-         log.info("Renamed table from %s to %s ", self.name, new_name)
-         self.name = new_name
-
-     def add_column(self, new_column: pa.Schema) -> None:
-         self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
-         log.info("Added column(s): %s", new_column)
-         self.arrow_schema = self.columns()
-
-     def drop_column(self, column_to_drop: pa.Schema) -> None:
-         self.tx._rpc.api.drop_columns(self.bucket.name, self.schema.name, self.name, column_to_drop, txid=self.tx.txid)
-         log.info("Dropped column(s): %s", column_to_drop)
-         self.arrow_schema = self.columns()
-
-     def rename_column(self, current_column_name: str, new_column_name: str) -> None:
-         self.tx._rpc.api.alter_column(self.bucket.name, self.schema.name, self.name, name=current_column_name,
-                                       new_name=new_column_name, txid=self.tx.txid)
-         log.info("Renamed column: %s to %s", current_column_name, new_column_name)
-         self.arrow_schema = self.columns()
-
-     def __getitem__(self, col_name):
-         return self._ibis_table[col_name]
-
-
- def _parse_table_info(table_info, schema: "Schema"):
-     stats = TableStats(num_rows=table_info.num_rows, size=table_info.size_in_bytes)
-     return Table(name=table_info.name, schema=schema, handle=int(table_info.handle), stats=stats)
-
-
- def _parse_bucket_and_object_names(path: str) -> (str, str):
-     if not path.startswith('/'):
-         raise InvalidArgumentError(f"Path {path} must start with a '/'")
-     components = path.split(os.path.sep)
-     bucket_name = components[1]
-     object_path = os.path.sep.join(components[2:])
-     return bucket_name, object_path
-
-
- def _serialize_record_batch(record_batch: pa.RecordBatch) -> pa.lib.Buffer:
-     sink = pa.BufferOutputStream()
-     with pa.ipc.new_stream(sink, record_batch.schema) as writer:
-         writer.write(record_batch)
-     return sink.getvalue()
-
-
- def _parse_endpoint(endpoint):
-     if ":" in endpoint:
-         endpoint, port = endpoint.split(":")
-         port = int(port)
-     else:
-         port = 80
-     log.debug("endpoint: %s, port: %d", endpoint, port)
-     return endpoint, port
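Likewise, a hypothetical end-to-end sketch of the removed vastdb.v2 interface, derived only from the deleted code above and runnable only against vastdb 0.0.5.3; the endpoint, credentials, bucket, schema, table, and column names are placeholders:

import pyarrow as pa
import vastdb.v2 as v2

# Explicit credentials/endpoint are placeholders; omitting them falls back to the AWS_* environment variables.
rpc = v2.connect(access='my-access-key', secret='my-secret-key', endpoint='http://vast.example.com')
with rpc.transaction() as tx:                    # commits on success, rolls back on exception
    bucket = tx.bucket('my-bucket')              # existence is verified with an S3 HEAD request
    schema = bucket.create_schema('my_schema')
    columns = pa.schema([('id', pa.int64()), ('name', pa.utf8())])
    table = schema.create_table('users', columns)
    table.insert(pa.record_batch([[1, 2], ['alice', 'bob']], schema=columns))
    for batch in table.select(columns=['id', 'name']):   # select() is a generator of RecordBatches
        print(batch.to_pydict())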