deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +3 -2
- deltacat/aws/clients.py +123 -3
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat/benchmarking/conftest.py +61 -0
- deltacat/catalog/delegate.py +1 -1
- deltacat/catalog/interface.py +1 -1
- deltacat/compute/compactor/__init__.py +0 -3
- deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- deltacat/compute/compactor/model/primary_key_index.py +1 -1
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +5 -3
- deltacat/compute/compactor/steps/dedupe.py +10 -8
- deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat/compute/compactor/utils/sort_key.py +57 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/__init__.py +0 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +11 -2
- deltacat/storage/model/sort_key.py +33 -0
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/__init__.py +0 -0
- deltacat/tests/aws/test_clients.py +80 -0
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/performance.py +4 -2
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
- deltacat-0.1.18b15.dist-info/RECORD +176 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13.dist-info/RECORD +0 -136
- /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/utils/primary_key_index.py
@@ -0,0 +1,308 @@
+import logging
+from typing import List, Optional, Iterable
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.compute as pc
+import uuid
+import hashlib
+from deltacat.compute.compactor_v2.constants import (
+    TOTAL_BYTES_IN_SHA1_HASH,
+    PK_DELIMITER,
+    MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
+)
+import time
+from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
+from deltacat import logs
+from deltacat.compute.compactor.utils import system_columns as sc
+from deltacat.io.object_store import IObjectStore
+from deltacat.utils.performance import timed_invocation
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def _append_sha1_hash_to_table(table: pa.Table, hash_column: pa.Array) -> pa.Table:
+    hash_column_np = hash_column.to_numpy()
+
+    result = []
+    for hash_value in hash_column_np:
+        result.append(hashlib.sha1(hash_value.encode("utf-8")).hexdigest())
+
+    return sc.append_pk_hash_string_column(table, result)
+
+
+def _is_sha1_desired(hash_column: pa.Array) -> bool:
+    return hash_column.nbytes > TOTAL_BYTES_IN_SHA1_HASH * len(hash_column)
+
+
+def _append_table_by_hash_bucket(
+    pki_table: pa.Table, hash_bucket_to_table: np.ndarray
+) -> int:
+
+    hb_pk_table, sort_latency = timed_invocation(
+        lambda: pki_table.sort_by(sc._HASH_BUCKET_IDX_COLUMN_NAME)
+    )
+    logger.info(f"Sorting a pk table of length {len(pki_table)} took {sort_latency}s")
+
+    hb_pk_grouped_by, groupby_latency = timed_invocation(
+        lambda: hb_pk_table.group_by(sc._HASH_BUCKET_IDX_COLUMN_NAME).aggregate(
+            [(sc._HASH_BUCKET_IDX_COLUMN_NAME, "count")]
+        )
+    )
+
+    logger.info(
+        f"Grouping a pki table of length {len(pki_table)} took {groupby_latency}s"
+    )
+
+    group_count_array = hb_pk_grouped_by[f"{sc._HASH_BUCKET_IDX_COLUMN_NAME}_count"]
+    hb_group_array = hb_pk_grouped_by[sc._HASH_BUCKET_IDX_COLUMN_NAME]
+
+    result_len = 0
+    for i, group_count in enumerate(group_count_array):
+        hb_idx = hb_group_array[i].as_py()
+        pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count.as_py())
+        pyarrow_table = pyarrow_table.drop([sc._HASH_BUCKET_IDX_COLUMN_NAME])
+        if hash_bucket_to_table[hb_idx] is None:
+            hash_bucket_to_table[hb_idx] = []
+        hash_bucket_to_table[hb_idx].append(pyarrow_table)
+        result_len += len(pyarrow_table)
+
+    return result_len
+
+
+def _optimized_group_record_batches_by_hash_bucket(
+    pki_table: pa.Table, num_buckets: int
+):
+
+    input_table_len = len(pki_table)
+
+    hash_bucket_to_tables = np.empty([num_buckets], dtype="object")
+    hb_to_table = np.empty([num_buckets], dtype="object")
+
+    # This split will ensure that the sort is not performed on a very huge table
+    # resulting in ArrowInvalid: offset overflow while concatenating arrays
+    # Known issue with Arrow: https://github.com/apache/arrow/issues/25822
+    table_batches, to_batches_latency = timed_invocation(lambda: pki_table.to_batches())
+
+    logger.info(f"to_batches took {to_batches_latency} for {len(pki_table)} rows")
+
+    current_bytes = 0
+    record_batches = []
+    result_len = 0
+    for record_batch in table_batches:
+        current_bytes += record_batch.nbytes
+        record_batches.append(record_batch)
+        if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
+            logger.info(
+                f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
+                f"is {len(record_batches)} and size {current_bytes}"
+            )
+            appended_len, append_latency = timed_invocation(
+                _append_table_by_hash_bucket,
+                pa.Table.from_batches(record_batches),
+                hash_bucket_to_tables,
+            )
+            logger.info(
+                f"Appended the hash bucketed batch of {appended_len} in {append_latency}s"
+            )
+
+            result_len += appended_len
+            current_bytes = 0
+            record_batches.clear()
+
+    if record_batches:
+        appended_len, append_latency = timed_invocation(
+            _append_table_by_hash_bucket,
+            pa.Table.from_batches(record_batches),
+            hash_bucket_to_tables,
+        )
+        result_len += appended_len
+        current_bytes = 0
+        record_batches.clear()
+
+    concat_start = time.monotonic()
+    for hb, tables in enumerate(hash_bucket_to_tables):
+        if tables:
+            assert hb_to_table[hb] is None, f"The HB index is repeated {hb}"
+            hb_to_table[hb] = pa.concat_tables(tables)
+
+    concat_end = time.monotonic()
+    logger.info(
+        f"Total time taken to concat all record batches with length "
+        f"{input_table_len}: {concat_end - concat_start}s"
+    )
+
+    assert (
+        input_table_len == result_len
+    ), f"Grouping has resulted in record loss as {result_len} != {input_table_len}"
+
+    return hb_to_table
+
+
+def group_by_pk_hash_bucket(
+    table: pa.Table, num_buckets: int, primary_keys: List[str]
+) -> np.ndarray:
+    table = generate_pk_hash_column(table, primary_keys, requires_sha1=True)
+
+    # group hash bucket record indices
+    result = group_record_indices_by_hash_bucket(
+        table,
+        num_buckets,
+    )
+
+    return result
+
+
+def generate_pk_hash_column(
+    table: pa.Table,
+    primary_keys: Optional[List[str]] = None,
+    requires_sha1: bool = False,
+) -> pa.Table:
+    """
+    Returns a new table after generating the primary key hash if desired.
+
+    1. If there are no primary keys, each hash will be unique uuid/sha1 hex
+    2. If there are more than 0 primary keys, returns a table with new columns appended.
+    """
+
+    start = time.monotonic()
+
+    can_sha1 = False
+    if primary_keys:
+        pk_columns = []
+        for pk_name in primary_keys:
+            pk_columns.append(pc.cast(table[pk_name], pa.string()))
+
+        pk_columns.append(PK_DELIMITER)
+        hash_column = pc.binary_join_element_wise(*pk_columns)
+
+        can_sha1 = requires_sha1 or _is_sha1_desired(hash_column)
+    else:
+        hash_column = pa.array(
+            [uuid.uuid4().hex for _ in range(len(table))], pa.string()
+        )
+
+    logger.info(
+        f"can_generate_sha1={can_sha1} for the table with hash column size"
+        f"={hash_column.nbytes} bytes, num_rows={len(hash_column)}, "
+        f"and requires_sha1={requires_sha1}"
+    )
+
+    if can_sha1:
+        table = _append_sha1_hash_to_table(table, hash_column)
+    else:
+        table = table.append_column(sc._PK_HASH_STRING_COLUMN_FIELD, hash_column)
+
+    end = time.monotonic()
+
+    logger.info(
+        f"Took {end - start}s to generate pk hash of len: {len(hash_column)}"
+        f" and size: {hash_column.nbytes} bytes"
+    )
+
+    return table
+
+
+def group_record_indices_by_hash_bucket(
+    pki_table: pa.Table, num_buckets: int
+) -> np.ndarray:
+    """
+    Groups the record indices by it's corresponding hash bucket. Hence, this method may
+    create num_buckets tables as a result.
+    """
+
+    input_table_len = len(pki_table)
+
+    hash_bucket_id_col_list = np.empty([input_table_len], dtype="int32")
+    bucketing_start_time = time.monotonic()
+
+    for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
+        hash_bucket = pk_digest_to_hash_bucket_index(hash_value, num_buckets)
+        hash_bucket_id_col_list[index] = hash_bucket
+
+    pki_table = sc.append_hash_bucket_idx_col(pki_table, hash_bucket_id_col_list)
+    bucketing_end_time = time.monotonic()
+
+    logger.info(
+        f"Took {bucketing_end_time - bucketing_start_time}s to generate the "
+        f"hb index for {len(pki_table)} rows"
+    )
+
+    result, group_latency = timed_invocation(
+        _optimized_group_record_batches_by_hash_bucket,
+        pki_table=pki_table,
+        num_buckets=num_buckets,
+    )
+
+    logger.info(
+        f"Final grouping of table with {input_table_len} records took: {group_latency}s"
+    )
+
+    return result
+
+
+def group_hash_bucket_indices(
+    hash_bucket_object_groups: np.ndarray,
+    num_buckets: int,
+    num_groups: int,
+    object_store: Optional[IObjectStore] = None,
+) -> np.ndarray:
+    """
+    This method persists all tables for a given hash bucket into the object store
+    and returns the object references for each hash group.
+    """
+
+    hash_bucket_group_to_obj_id_size_tuple = np.empty([num_groups], dtype="object")
+
+    if hash_bucket_object_groups is None:
+        return hash_bucket_group_to_obj_id_size_tuple
+
+    hb_group_to_object = np.empty([num_groups], dtype="object")
+    hash_group_to_size = np.empty([num_groups], dtype="int64")
+    hash_group_to_num_rows = np.empty([num_groups], dtype="int64")
+
+    for hb_index, obj in enumerate(hash_bucket_object_groups):
+        if obj:
+            hb_group = hash_bucket_index_to_hash_group_index(hb_index, num_groups)
+            if hb_group_to_object[hb_group] is None:
+                hb_group_to_object[hb_group] = np.empty([num_buckets], dtype="object")
+                hash_group_to_size[hb_group] = np.int64(0)
+                hash_group_to_num_rows[hb_group] = np.int64(0)
+            hb_group_to_object[hb_group][hb_index] = obj
+            for dfe in obj:
+                casted_dfe: DeltaFileEnvelope = dfe
+                hash_group_to_size[hb_group] += casted_dfe.table_size_bytes
+                hash_group_to_num_rows[hb_group] += casted_dfe.table_num_rows
+
+    for hb_group, obj in enumerate(hb_group_to_object):
+        if obj is None:
+            continue
+        object_ref = object_store.put(obj)
+        hash_bucket_group_to_obj_id_size_tuple[hb_group] = (
+            object_ref,
+            hash_group_to_size[hb_group],
+            hash_group_to_num_rows[hb_group],
+        )
+        del object_ref
+    return hash_bucket_group_to_obj_id_size_tuple
+
+
+def hash_bucket_index_to_hash_group_index(hb_index: int, num_groups: int) -> int:
+    return hb_index % num_groups
+
+
+def hash_group_index_to_hash_bucket_indices(
+    hb_group: int, num_buckets: int, num_groups: int
+) -> Iterable[int]:
+
+    if hb_group > num_buckets:
+        return []
+
+    return range(hb_group, num_groups, num_buckets)
+
+
+def pk_digest_to_hash_bucket_index(digest: str, num_buckets: int) -> int:
+    """
+    Generates the hash bucket index from the given digest.
+    """
+    return int(digest, 16) % num_buckets
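Note: the bucketing arithmetic in the new module above reduces to two modulo operations. The following minimal sketch (with arbitrary example values for num_buckets and num_groups, which are not package defaults) shows how a SHA-1 primary key digest maps to a hash bucket and how a bucket maps to a hash group:

import hashlib

# Arbitrary example values; the real values come from compaction parameters.
num_buckets = 8
num_groups = 3

# Same arithmetic as pk_digest_to_hash_bucket_index: interpret the SHA-1 hex
# digest as an integer and take it modulo the number of hash buckets.
digest = hashlib.sha1("pk_value_1|pk_value_2".encode("utf-8")).hexdigest()
hash_bucket = int(digest, 16) % num_buckets

# Same arithmetic as hash_bucket_index_to_hash_group_index: buckets are
# distributed round-robin across hash groups.
hash_group = hash_bucket % num_groups

print(hash_bucket, hash_group)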
deltacat/compute/compactor_v2/utils/task_options.py
@@ -0,0 +1,228 @@
+from typing import Dict, Optional, List, Tuple
+from deltacat.types.media import ContentEncoding, ContentType
+from deltacat.types.partial_download import PartialParquetParameters
+from deltacat.storage import (
+    Delta,
+    ManifestEntry,
+    interface as unimplemented_deltacat_storage,
+)
+from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
+from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
+from deltacat.compute.compactor_v2.utils.primary_key_index import (
+    hash_group_index_to_hash_bucket_indices,
+)
+from deltacat.compute.compactor_v2.utils.content_type_params import (
+    append_content_type_params,
+)
+from deltacat.compute.compactor_v2.constants import TOTAL_MEMORY_BUFFER_PERCENTAGE
+
+
+def _get_parquet_type_params_if_exist(
+    entry: ManifestEntry,
+) -> Optional[PartialParquetParameters]:
+    if (
+        entry.meta
+        and entry.meta.content_type == ContentType.PARQUET
+        and entry.meta.content_encoding == ContentEncoding.IDENTITY
+    ):
+        for type_params in entry.meta.content_type_parameters:
+            if isinstance(type_params, PartialParquetParameters):
+                return type_params
+    return None
+
+
+def _calculate_parquet_column_size(
+    type_params: PartialParquetParameters, columns: List[str]
+):
+    column_size = 0.0
+    for rg in type_params.row_groups_to_download:
+        columns_found = 0
+        row_group_meta = type_params.pq_metadata.row_group(rg)
+        for col in range(row_group_meta.num_columns):
+            column_meta = row_group_meta.column(col)
+            if column_meta.path_in_schema in columns:
+                columns_found += 1
+                column_size += column_meta.total_uncompressed_size
+        assert columns_found == len(columns), (
+            "Columns not found in the parquet data as "
+            f"{columns_found} != {len(columns)}"
+        )
+    return column_size
+
+
+def estimate_manifest_entry_size_bytes(
+    entry: ManifestEntry, previous_inflation: float, **kwargs
+) -> float:
+    if entry.meta.source_content_length:
+        return entry.meta.source_content_length
+
+    type_params = _get_parquet_type_params_if_exist(entry=entry)
+
+    if type_params:
+        return type_params.in_memory_size_bytes
+
+    return entry.meta.content_length * previous_inflation
+
+
+def estimate_manifest_entry_num_rows(
+    entry: ManifestEntry,
+    average_record_size_bytes: float,
+    previous_inflation: float,
+    **kwargs,
+) -> int:
+    if entry.meta.record_count:
+        return entry.meta.record_count
+
+    type_params = _get_parquet_type_params_if_exist(entry=entry)
+
+    if type_params:
+        return type_params.num_rows
+
+    total_size_bytes = estimate_manifest_entry_size_bytes(
+        entry=entry, previous_inflation=previous_inflation, **kwargs
+    )
+
+    return int(total_size_bytes / average_record_size_bytes)
+
+
+def estimate_manifest_entry_column_size_bytes(
+    entry: ManifestEntry, columns: Optional[List[str]] = None
+) -> Optional[float]:
+    if not columns:
+        return 0
+
+    type_params = _get_parquet_type_params_if_exist(entry=entry)
+
+    if type_params.pq_metadata:
+        return _calculate_parquet_column_size(type_params=type_params, columns=columns)
+
+    return None
+
+
+def hash_bucket_resource_options_provider(
+    index: int,
+    item: DeltaAnnotated,
+    previous_inflation: float,
+    average_record_size_bytes: float,
+    primary_keys: List[str] = None,
+    **kwargs,
+) -> Dict:
+    size_bytes = 0.0
+    num_rows = 0
+    total_pk_size = 0
+
+    if not item.manifest or not item.manifest.entries:
+        return {"CPU": 0.01}
+
+    for entry in item.manifest.entries:
+        entry_size = estimate_manifest_entry_size_bytes(
+            entry=entry, previous_inflation=previous_inflation
+        )
+        num_rows += estimate_manifest_entry_num_rows(
+            entry=entry,
+            previous_inflation=previous_inflation,
+            average_record_size_bytes=average_record_size_bytes,
+        )
+        size_bytes += entry_size
+
+        if primary_keys:
+            pk_size = estimate_manifest_entry_column_size_bytes(
+                entry=entry,
+                columns=primary_keys,
+            )
+
+            if pk_size is None:
+                total_pk_size += entry_size
+            else:
+                total_pk_size += pk_size
+
+    # total size + pk size + pk hash column + hash bucket index column
+    # Refer to hash_bucket step for more details.
+    total_memory = size_bytes + total_pk_size + num_rows * 20 + num_rows * 4
+
+    # Consider buffer
+    total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+
+    return {"num_cpus": 0.01, "memory": total_memory}
+
+
+def merge_resource_options_provider(
+    index: int,
+    item: Tuple[int, List],
+    num_hash_groups: int,
+    hash_group_size_bytes: Dict[int, int],
+    hash_group_num_rows: Dict[int, int],
+    round_completion_info: Optional[RoundCompletionInfo] = None,
+    compacted_delta: Optional[Delta] = None,
+    primary_keys: Optional[List[str]] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict] = {},
+    **kwargs,
+) -> Dict:
+    hb_group_idx = item[0]
+
+    data_size = hash_group_size_bytes.get(hb_group_idx, 0)
+    num_rows = hash_group_num_rows.get(hb_group_idx, 0)
+
+    pk_size_bytes = 0
+
+    if (
+        round_completion_info
+        and compacted_delta
+        and round_completion_info.hb_index_to_entry_range_both_inclusive
+    ):
+
+        previous_inflation = (
+            round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+            / round_completion_info.compacted_pyarrow_write_result.file_bytes
+        )
+        average_record_size = (
+            round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+            / round_completion_info.compacted_pyarrow_write_result.records
+        )
+
+        iterable = hash_group_index_to_hash_bucket_indices(
+            hb_group_idx, round_completion_info.hash_bucket_count, num_hash_groups
+        )
+
+        for hb_idx in iterable:
+            entry_start, entry_end = round_completion_info.hb_index_to_entry_range[
+                hb_idx
+            ]
+            for entry_index in range(entry_start, entry_end):
+                entry = append_content_type_params(
+                    compacted_delta,
+                    entry_index=entry_index,
+                    deltacat_storage=deltacat_storage,
+                    deltacat_storage_kwargs=deltacat_storage_kwargs,
+                )
+
+                current_entry_size = estimate_manifest_entry_size_bytes(
+                    entry=entry, previous_inflation=previous_inflation
+                )
+                current_entry_rows = estimate_manifest_entry_num_rows(
+                    entry=entry,
+                    average_record_size_bytes=average_record_size,
+                    previous_inflation=previous_inflation,
+                )
+
+                data_size += current_entry_size
+                num_rows += current_entry_rows
+
+                if primary_keys:
+                    pk_size = estimate_manifest_entry_column_size_bytes(
+                        entry=entry,
+                        columns=primary_keys,
+                    )
+
+                    if pk_size is None:
+                        pk_size_bytes += current_entry_size
+                    else:
+                        pk_size_bytes += pk_size
+
+    # total data downloaded + primary key hash column + primary key column + dict size for merge
+    total_memory = data_size + pk_size_bytes + num_rows * 20 + num_rows * 20
+
+    total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+
+    return {"num_cpus": 0.01, "memory": total_memory}
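Note: the resource estimation in the new module above is a simple linear formula. The sketch below walks through hash_bucket_resource_options_provider's memory estimate with made-up input sizes and an assumed 30% value for TOTAL_MEMORY_BUFFER_PERCENTAGE (the real constant lives in deltacat/compute/compactor_v2/constants.py and may differ):

# Made-up inputs for illustration only.
size_bytes = 512 * 1024**2      # estimated in-memory size of the annotated delta
total_pk_size = 64 * 1024**2    # estimated size of the primary key columns
num_rows = 10_000_000
TOTAL_MEMORY_BUFFER_PERCENTAGE = 30  # assumed value for this sketch

# Mirrors the formula above: data size + pk column size + 20 bytes/row for the
# pk hash column + 4 bytes/row for the hash bucket index column, plus a buffer.
total_memory = size_bytes + total_pk_size + num_rows * 20 + num_rows * 4
total_memory *= 1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0

# The provider hands these back as Ray task options.
print({"num_cpus": 0.01, "memory": total_memory})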
deltacat/compute/metastats/meta_stats.py
@@ -5,7 +5,7 @@ import functools
 import logging
 import os
 import pathlib
-from typing import Dict, List, Optional, Set
+from typing import Any, Dict, List, Optional, Set

 import ray
 from ray.types import ObjectRef
@@ -118,10 +118,12 @@ def collect_from_partition(
     stat_results_s3_bucket: Optional[str] = None,
     metastats_results_s3_bucket: Optional[str] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
     *args,
     **kwargs,
 ) -> ObjectRef[Dict[int, DeltaStats]]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     if not columns:
         columns = deltacat_storage.get_table_version_column_names(
             source_partition_locator.namespace,
deltacat/compute/metastats/stats.py
@@ -33,6 +33,7 @@ def start_stats_collection(
     stat_results_s3_bucket: Optional[str] = None,
     metastats_results_s3_bucket: Optional[str] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    **kwargs,
 ) -> Dict[str, List[DeltaStats]]:
     """Collects statistics on deltas, given a set of delta stream position ranges.
     Example:
deltacat/compute/metastats/utils/io.py
@@ -171,6 +171,7 @@ def collect_stats_by_columns(
     delta_annotated: DeltaAnnotated,
     columns_to_compute: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Dict[str, Any]:
     """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.

@@ -182,6 +183,8 @@ def collect_stats_by_columns(
     Returns:
         A delta wide stats container
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     total_tables_size = 0

     # Mapping of column_name -> [stats_file_idx_1, stats_file_idx_2, ... stats_file_idx_n]
@@ -198,6 +201,7 @@ def collect_stats_by_columns(
                 TableType.PYARROW,
                 columns_to_compute,
                 equivalent_table_types="uncompacted",
+                **deltacat_storage_kwargs,
             )
         )
         assert isinstance(entry_pyarrow_table, pyarrow.Table), (
deltacat/compute/stats/utils/io.py
@@ -1,6 +1,6 @@
 import logging
 from collections import defaultdict
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional

 import pyarrow
 import ray
@@ -83,6 +83,7 @@ def get_delta_stats(
     delta_locator: DeltaLocator,
     columns: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ) -> DeltaStats:
     """Ray distributed task to compute and collect stats for a requested delta.
     If no columns are requested, stats will be computed for all columns.
@@ -93,10 +94,15 @@ def get_delta_stats(
     Returns:
         A delta wide stats container
     """
-
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+    manifest = deltacat_storage.get_delta_manifest(
+        delta_locator, **deltacat_storage_kwargs
+    )
     delta = Delta.of(delta_locator, None, None, None, manifest)
-    return _collect_stats_by_columns(
+    return _collect_stats_by_columns(
+        delta, columns, deltacat_storage, deltacat_storage_kwargs
+    )


 @ray.remote
@@ -105,6 +111,7 @@ def get_deltas_from_range(
     start_position_inclusive: DeltaRange,
     end_position_inclusive: DeltaRange,
     deltacat_storage=unimplemented_deltacat_storage,
+    **kwargs,
 ) -> List[Delta]:
     """Looks up deltas in the specified partition using Ray, given both starting and ending delta stream positions.

@@ -137,6 +144,7 @@ def get_deltas_from_range(
         end_position_inclusive,
         ascending_order=True,
         include_manifest=False,
+        **kwargs,
     )
     return deltas_list_result.all_items()

@@ -145,6 +153,7 @@ def _collect_stats_by_columns(
     delta: Delta,
     columns_to_compute: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ) -> DeltaStats:
     """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.
     Args:
@@ -154,6 +163,8 @@ def _collect_stats_by_columns(
     Returns:
         A delta wide stats container
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     assert (
         delta.manifest is not None
     ), f"Manifest should not be missing from delta for stats calculation: {delta}"
@@ -167,7 +178,11 @@ def _collect_stats_by_columns(
     for file_idx, manifest in enumerate(delta.manifest.entries):
         entry_pyarrow_table: LocalTable = (
             deltacat_storage.download_delta_manifest_entry(
-                delta,
+                delta,
+                file_idx,
+                TableType.PYARROW,
+                columns_to_compute,
+                **deltacat_storage_kwargs,
             )
         )
         assert isinstance(entry_pyarrow_table, pyarrow.Table), (
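Note: the stats changes above repeatedly introduce a deltacat_storage_kwargs parameter that defaults to None and is replaced with a fresh dict inside the function body. A minimal standalone sketch of that pattern, using a hypothetical storage function rather than the real deltacat storage interface:

from typing import Any, Dict, Optional


def get_delta_manifest(delta_locator: str, **kwargs) -> str:
    # Hypothetical stand-in for a deltacat storage implementation.
    return f"manifest({delta_locator}, {kwargs})"


def collect_stats(
    delta_locator: str,
    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
) -> str:
    # Default to None and build a fresh dict per call: a `={}` default would be
    # shared across calls and could accumulate state between invocations.
    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}
    return get_delta_manifest(delta_locator, **deltacat_storage_kwargs)


print(collect_stats("my_delta", {"catalog": "local"}))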
|