deltacat 0.1.6__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +183 -194
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +249 -198
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +153 -260
- deltacat/compute/compactor/steps/hash_bucket.py +56 -56
- deltacat/compute/compactor/steps/materialize.py +139 -100
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +131 -90
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -42
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +8 -10
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +276 -228
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +36 -29
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/METADATA +21 -11
- deltacat-0.1.11.dist-info/RECORD +110 -0
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
- deltacat-0.1.6.dist-info/RECORD +0 -108
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/utils/io.py
@@ -1,24 +1,23 @@
 import logging
-import time
 import math
-from
-from deltacat.constants import PYARROW_INFLATION_MULTIPLIER, BYTES_PER_MEBIBYTE
+from typing import Dict, List, Optional, Tuple

-from deltacat.storage import PartitionLocator, Delta, \
-    interface as unimplemented_deltacat_storage
 from deltacat import logs
 from deltacat.compute.compactor import DeltaAnnotated
-
-from
+from deltacat.compute.stats.models.delta_stats import DeltaStats
+from deltacat.constants import BYTES_PER_MEBIBYTE, PYARROW_INFLATION_MULTIPLIER
+from deltacat.storage import Delta, PartitionLocator
+from deltacat.storage import interface as unimplemented_deltacat_storage

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 def discover_deltas(
-
-
-
-
+    source_partition_locator: PartitionLocator,
+    start_position_exclusive: Optional[int],
+    end_position_inclusive: int,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> List[Delta]:

     stream_locator = source_partition_locator.stream_locator
     namespace = stream_locator.namespace
@@ -36,32 +35,38 @@ def discover_deltas(
     )
     deltas = deltas_list_result.all_items()
     if not deltas:
-        raise RuntimeError(
-
-
-
-
+        raise RuntimeError(
+            f"Unexpected Error: Couldn't find any deltas to "
+            f"compact in delta stream position range "
+            f"('{start_position_exclusive}', "
+            f"'{end_position_inclusive}']. Source partition: "
+            f"{source_partition_locator}"
+        )
     if start_position_exclusive:
         first_delta = deltas.pop(0)
-        logger.info(
-
-
-
-
-
-
+        logger.info(
+            f"Removed exclusive start delta w/ expected stream "
+            f"position '{start_position_exclusive}' from deltas to "
+            f"compact: {first_delta}"
+        )
+        logger.info(
+            f"Count of deltas to compact in delta stream "
+            f"position range ('{start_position_exclusive}', "
+            f"'{end_position_inclusive}']: {len(deltas)}. Source "
+            f"partition: '{source_partition_locator}'"
+        )
     return deltas


 def limit_input_deltas(
-
-
-
-
-
-
-
-
+    input_deltas: List[Delta],
+    cluster_resources: Dict[str, float],
+    hash_bucket_count: int,
+    min_pk_index_pa_bytes: int,
+    user_hash_bucket_chunk_size: int,
+    input_deltas_stats: Dict[int, DeltaStats],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Tuple[List[DeltaAnnotated], int, int]:

     # TODO (pdames): when row counts are available in metadata, use them
     # instead of bytes - memory consumption depends more on number of
@@ -78,9 +83,10 @@ def limit_input_deltas(
     # )
     if min_pk_index_pa_bytes > 0:
         required_heap_mem_for_dedupe = worker_obj_store_mem - min_pk_index_pa_bytes
-        assert required_heap_mem_for_dedupe > 0,
-            f"Not enough required memory available to re-batch input deltas"
+        assert required_heap_mem_for_dedupe > 0, (
+            f"Not enough required memory available to re-batch input deltas"
             f"and initiate the dedupe step."
+        )
     # Size of batched deltas must also be reduced to have enough space for primary
     # key index files (from earlier compaction rounds) in the dedupe step, since
     # they will be loaded into worker heap memory.
@@ -88,8 +94,7 @@ def limit_input_deltas(

     logger.info(f"Total worker object store memory: {worker_obj_store_mem}")
     worker_obj_store_mem_per_task = worker_obj_store_mem / worker_cpus
-    logger.info(f"Worker object store memory/task: "
-                f"{worker_obj_store_mem_per_task}")
+    logger.info(f"Worker object store memory/task: " f"{worker_obj_store_mem_per_task}")
     worker_task_mem = cluster_resources["memory"]
     logger.info(f"Total worker memory: {worker_task_mem}")
     # TODO (pdames): ensure fixed memory per CPU in heterogenous clusters
@@ -105,8 +110,10 @@ def limit_input_deltas(
     if input_deltas_stats is None:
         input_deltas_stats = {}

-    input_deltas_stats = {
-
+    input_deltas_stats = {
+        int(stream_pos): DeltaStats(delta_stats)
+        for stream_pos, delta_stats in input_deltas_stats.items()
+    }
     for delta in input_deltas:
         manifest = deltacat_storage.get_delta_manifest(delta)
         delta.manifest = manifest
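For orientation, the comprehension shown in this hunk only normalizes the shape of the caller-supplied stats: keys are coerced to int stream positions and values are wrapped in DeltaStats. A minimal, self-contained sketch of the same key normalization, with made-up stream positions and plain dicts standing in for serialized DeltaStats payloads:

# Made-up stats keyed by stream-position strings (e.g., after a JSON round trip).
raw_stats = {
    "101": {"row_count": 10},
    "102": {"row_count": 20},
}

# Mirror of the comprehension above, minus the DeltaStats wrapper.
normalized = {int(stream_pos): delta_stats for stream_pos, delta_stats in raw_stats.items()}

assert sorted(normalized) == [101, 102]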
@@ -118,7 +125,8 @@ def limit_input_deltas(
             # TODO (pdames): ensure pyarrow object fits in per-task obj store mem
             logger.warning(
                 f"Stats are missing for delta stream position {delta.stream_position}, "
-                f"materialized delta may not fit in per-task object store memory."
+                f"materialized delta may not fit in per-task object store memory."
+            )
         manifest_entries = delta.manifest.entries
         delta_manifest_entries += len(manifest_entries)
         for entry in manifest_entries:
@@ -130,13 +138,13 @@ def limit_input_deltas(
             logger.info(
                 f"Input deltas limited to "
                 f"{len(limited_input_da_list)} by object store mem "
-                f"({delta_bytes_pyarrow} > {worker_obj_store_mem})"
+                f"({delta_bytes_pyarrow} > {worker_obj_store_mem})"
+            )
             break
         delta_annotated = DeltaAnnotated.of(delta)
         limited_input_da_list.append(delta_annotated)

-    logger.info(f"Input deltas to compact this round: "
-                f"{len(limited_input_da_list)}")
+    logger.info(f"Input deltas to compact this round: " f"{len(limited_input_da_list)}")
     logger.info(f"Input delta bytes to compact: {delta_bytes}")
     logger.info(f"Input delta files to compact: {delta_manifest_entries}")
     logger.info(f"Latest input delta stream position: {latest_stream_position}")
@@ -146,10 +154,12 @@ def limit_input_deltas(

     # TODO (pdames): determine min hash buckets from size of all deltas
     # (not just deltas for this round)
-    min_hash_bucket_count = int(
-
-
-
+    min_hash_bucket_count = int(
+        max(
+            math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),
+            min(worker_cpus, 256),
+        )
+    )
     logger.info(f"Minimum recommended hash buckets: {min_hash_bucket_count}")

     if hash_bucket_count is None:
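The expression added above derives a minimum hash bucket count from the estimated PyArrow size of the input deltas divided by the object store memory available per task, with a floor of min(worker_cpus, 256). A small worked example with invented numbers (none of the byte counts or CPU counts below come from deltacat itself):

import math

# Hypothetical inputs, for illustration only.
delta_bytes_pyarrow = 2 * 1024**4             # ~2 TiB of input after PyArrow inflation
worker_obj_store_mem_per_task = 16 * 1024**3  # ~16 GiB of object store memory per task
worker_cpus = 64

min_hash_bucket_count = int(
    max(
        # Enough buckets that each task's share of the data fits in its object store slice.
        math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),
        # Floor: at least min(worker_cpus, 256) buckets.
        min(worker_cpus, 256),
    )
)
assert min_hash_bucket_count == 128  # the memory term (128) dominates the CPU floor (64)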
@@ -168,7 +178,8 @@ def limit_input_deltas(
             f"resolve this problem either specify a larger number of hash "
             f"buckets when running compaction, omit a custom hash bucket "
             f"count when running compaction, or provision workers with more "
-            f"task memory per CPU."
+            f"task memory per CPU."
+        )

     hash_bucket_chunk_size = user_hash_bucket_chunk_size
     max_hash_bucket_chunk_size = math.ceil(
@@ -185,7 +196,8 @@ def limit_input_deltas(
             f"specify a smaller hash bucket chunk size when running "
             f"compaction, omit a custom hash bucket chunk size when running "
             f"compaction, or provision workers with more task and object "
-            f"store memory per CPU."
+            f"store memory per CPU."
+        )
     elif not hash_bucket_chunk_size:
         hash_bucket_chunk_size_load_balanced = max(
             math.ceil(max(delta_bytes, delta_bytes_pyarrow) / worker_cpus),
deltacat/compute/compactor/utils/primary_key_index.py
@@ -1,48 +1,54 @@
-import logging
 import json
-import
-import pyarrow as pa
-import numpy as np
-import s3fs
+import logging
 from collections import defaultdict
+from typing import Any, Callable, Dict, List, Optional, Tuple

-
+import numpy as np
+import pyarrow as pa
+import ray
+import s3fs
 from ray import cloudpickle
+from ray.types import ObjectRef

-from deltacat
-from deltacat.
-
-
-
-
+from deltacat import logs
+from deltacat.aws import s3u
+from deltacat.compute.compactor import (
+    PrimaryKeyIndexLocator,
+    PrimaryKeyIndexMeta,
+    PrimaryKeyIndexVersionLocator,
+    PrimaryKeyIndexVersionMeta,
+    PyArrowWriteResult,
+    RoundCompletionInfo,
+)
+from deltacat.compute.compactor.steps.rehash import rehash_bucket as rb
+from deltacat.compute.compactor.steps.rehash import rewrite_index as ri
 from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.compute.compactor.utils import system_columns as sc
-from deltacat.
-
-from deltacat.types.
-from deltacat.types.
-from deltacat.
-from deltacat import
-
-from typing import Any, Callable, Dict, List, Optional, Tuple
-
-from ray.types import ObjectRef
+from deltacat.constants import PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
+from deltacat.storage import Manifest, PartitionLocator
+from deltacat.types.media import ContentEncoding, ContentType
+from deltacat.types.tables import get_table_slicer, get_table_writer
+from deltacat.utils.common import ReadKwargsProvider
+from deltacat.utils.ray_utils.concurrency import invoke_parallel

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 def rehash(
-
-
-
-
-
-
-
-
-
-
-
+    options_provider: Callable[[int, Any], Dict[str, Any]],
+    s3_bucket: str,
+    source_partition_locator: PartitionLocator,
+    old_rci: RoundCompletionInfo,
+    new_hash_bucket_count: int,
+    hash_bucket_index_group_count: int,
+    records_per_primary_key_index_file: int,
+    delete_old_primary_key_index: bool,
+) -> RoundCompletionInfo:
+
+    logger.info(
+        f"Rehashing primary key index. Old round completion info: "
+        f"{old_rci}. New hash bucket count: {new_hash_bucket_count}"
+    )

     # collect old primary key index information
     old_pki_version_locator = old_rci.primary_key_index_version_locator
@@ -50,10 +56,12 @@ def rehash(
     old_pki_meta = old_pkiv_meta.primary_key_index_meta
     old_compacted_partition_locator = old_pki_meta.compacted_partition_locator
     if old_pkiv_meta.hash_bucket_count == new_hash_bucket_count:
-        raise ValueError(
-
-
-
+        raise ValueError(
+            f"Primary key index rehash failed. Old hash bucket "
+            f"count ({new_hash_bucket_count}) is "
+            f"equal to new hash bucket count. Partition: "
+            f"{old_compacted_partition_locator}."
+        )

     # generate a new unique primary key index version locator to rehash into
     new_pki_meta = PrimaryKeyIndexMeta.of(
@@ -68,7 +76,8 @@ def rehash(
         new_hash_bucket_count,
     )
     rehashed_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-        new_pki_version_meta
+        new_pki_version_meta
+    )

     # launch a rehash task for each bucket of the old primary key index version
     old_hash_bucket_count = old_pkiv_meta.hash_bucket_count
@@ -114,6 +123,7 @@ def rehash(
         PyArrowWriteResult.union(pki_stats),
         old_rci.sort_keys_bit_width,
         rehashed_pki_version_locator,
+        old_rci.rebase_source_partition_locator,
     )
     rcf.write_round_completion_file(
         s3_bucket,
@@ -126,41 +136,48 @@ def rehash(
         s3_bucket,
         old_pki_version_locator,
     )
-    logger.info(
-
+    logger.info(
+        f"Rehashed primary key index. New round completion info: "
+        f"{round_completion_info}."
+    )
     return round_completion_info


 def download_hash_bucket_entries(
-
-
-
-
-
-
-    pk_index_manifest_s3_url =
-        .get_pkiv_hb_index_manifest_s3_url(
+    s3_bucket: str,
+    hash_bucket_index: int,
+    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> List[pa.Table]:
+
+    pk_index_manifest_s3_url = (
+        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
             s3_bucket,
             hash_bucket_index,
         )
+    )
     result = s3u.download(pk_index_manifest_s3_url, False)
-    logger.info(
-
-
+    logger.info(
+        f"Downloading primary key index hash bucket manifest entries: "
+        f"{pk_index_manifest_s3_url}. Primary key index version "
+        f"locator: {primary_key_index_version_locator}"
+    )
     pk_index_manifest = Manifest(json.loads(result["Body"].read().decode("utf-8")))
-    tables = s3u.download_manifest_entries(
-
+    tables = s3u.download_manifest_entries(
+        pk_index_manifest, file_reader_kwargs_provider=file_reader_kwargs_provider
+    )
     if not tables:
         logger.warning(
             f"Primary key index manifest is empty at: "
             f"{pk_index_manifest_s3_url}. Primary key index version "
-            f"locator: {primary_key_index_version_locator}"
+            f"locator: {primary_key_index_version_locator}"
+        )
     return tables


 def delete_primary_key_index_version(
-
-
+    s3_bucket: str, pki_version_locator: PrimaryKeyIndexVersionLocator
+) -> None:

     logger.info(f"Deleting primary key index: {pki_version_locator}")
     s3u.delete_files_by_prefix(
@@ -171,8 +188,8 @@ def delete_primary_key_index_version(


 def group_record_indices_by_hash_bucket(
-
-
+    pki_table: pa.Table, num_buckets: int
+) -> np.ndarray:

     hash_bucket_to_indices = np.empty([num_buckets], dtype="object")
     record_index = 0
@@ -186,9 +203,11 @@ def group_record_indices_by_hash_bucket(


 def group_hash_bucket_indices(
-
-
-
+    hash_bucket_object_groups: np.ndarray, num_buckets: int, num_groups: int
+) -> Tuple[np.ndarray, List[ObjectRef]]:
+    """
+    Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
+    """

     object_refs = []
     hash_bucket_group_to_obj_id = np.empty([num_groups], dtype="object")
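For context on what these two helpers group: each record's primary key digest is mapped to a hash bucket, and buckets are in turn spread across a fixed number of hash bucket groups. A minimal sketch of that two-level mapping, using SHA-1 digests purely for illustration (deltacat's actual digest function may differ, and the key strings, bucket count, and group count are invented):

import hashlib

num_buckets = 8
num_groups = 3

for pk in ["user-1", "user-2", "user-3"]:
    digest = hashlib.sha1(pk.encode("utf-8")).digest()
    # Same bucket assignment as pk_digest_to_hash_bucket_index in the next hunk.
    hb_index = int.from_bytes(digest, "big") % num_buckets
    # Buckets are then spread across groups via `hb_index % num_groups`, as in the next hunk.
    hb_group = hb_index % num_groups
    print(f"{pk}: hash bucket {hb_index}, hash bucket group {hb_group}")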
@@ -201,50 +220,70 @@ def group_hash_bucket_indices(
         if obj:
             hb_group = hb_index % num_groups
             if hb_group_to_object[hb_group] is None:
-                hb_group_to_object[hb_group] = np.empty(
-                    [num_buckets], dtype="object")
+                hb_group_to_object[hb_group] = np.empty([num_buckets], dtype="object")
             hb_group_to_object[hb_group][hb_index] = obj

     for hb_group, obj in enumerate(hb_group_to_object):
-        if obj is
-
-
-
-
+        if obj is None:
+            continue
+        obj_ref = ray.put(obj)
+        pickled_obj_ref = cloudpickle.dumps(obj_ref)
+        object_refs.append(pickled_obj_ref)
+        hash_bucket_group_to_obj_id[hb_group] = pickled_obj_ref
+        # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
+        # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
+        # (e.g., if the ObjectRef is deserialized by a non-Ray process).
+        # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
+        # The object now has a permanent reference and the data can't be freed from Ray's object store.
+        # Manually deleting the untrackable object references offsets these permanent references and
+        # helps to allow these objects to be garbage collected normally.
+        del obj_ref
+        del pickled_obj_ref
     return hash_bucket_group_to_obj_id, object_refs


-def pk_digest_to_hash_bucket_index(
-
-
+def pk_digest_to_hash_bucket_index(digest, num_buckets: int) -> int:
+    """
+    Deterministically get the hash bucket a particular digest belongs to
+    based on number of total hash buckets.
+    """

     return int.from_bytes(digest, "big") % num_buckets


 def write_primary_key_index_files(
-
-
-
-
-
+    table: pa.Table,
+    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    s3_bucket: str,
+    hb_index: int,
+    records_per_index_file: int,
+) -> PyArrowWriteResult:
     """
     Writes primary key index files for the given hash bucket index out to the
     specified S3 bucket at the path identified by the given primary key index
     version locator. Output is written as 1 or more Parquet files with the
     given maximum number of records per file.
+
+    TODO(raghumdani): Support writing primary key index to any data catalog
     """
-    logger.info(
-
-
+    logger.info(
+        f"Writing primary key index files for hash bucket {hb_index}. "
+        f"Primary key index version locator: "
+        f"{primary_key_index_version_locator}."
+    )
     s3_file_system = s3fs.S3FileSystem(
         anon=False,
         s3_additional_kwargs={
             "ContentType": ContentType.PARQUET.value,
             "ContentEncoding": ContentEncoding.IDENTITY.value,
-        }
+        },
+        config_kwargs=PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG,
+    )
+    pkiv_hb_index_s3_url_base = (
+        primary_key_index_version_locator.get_pkiv_hb_index_s3_url_base(
+            s3_bucket, hb_index
+        )
     )
-    pkiv_hb_index_s3_url_base = primary_key_index_version_locator\
-        .get_pkiv_hb_index_s3_url_base(s3_bucket, hb_index)
     manifest_entries = s3u.upload_sliced_table(
         table,
         pkiv_hb_index_s3_url_base,
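The NOTE block added above is the reasoning behind the two del statements: once an ObjectRef is serialized out of band with cloudpickle.dumps, Ray's distributed reference counter can no longer track that copy, so the local handles are dropped deliberately. A standalone sketch of the same put/pickle/delete pattern (the payload, its size, and the ray.init options are arbitrary):

import numpy as np
import ray
from ray import cloudpickle

ray.init(ignore_reinit_error=True)

# Arbitrary payload standing in for one hash bucket group's object array.
payload = np.arange(1_000)

obj_ref = ray.put(payload)                    # store the payload in Ray's object store
pickled_obj_ref = cloudpickle.dumps(obj_ref)  # out-of-band copy that Ray can no longer track

# ... hand pickled_obj_ref to downstream consumers, e.g. append it to a results list ...

# Drop the local handles so the only remaining references are the ones that were
# intentionally passed along; the pickled copy otherwise pins the object in the store.
del obj_ref
del pickled_obj_ref

A consumer that receives the pickled bytes can later rebuild the reference with cloudpickle.loads and fetch the data with ray.get.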
@@ -254,19 +293,21 @@ def write_primary_key_index_files(
         get_table_slicer(table),
     )
     manifest = Manifest.of(manifest_entries)
-    pkiv_hb_index_s3_manifest_s3_url =
-        .get_pkiv_hb_index_manifest_s3_url(
-
-
-        str(json.dumps(manifest))
+    pkiv_hb_index_s3_manifest_s3_url = (
+        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
+            s3_bucket, hb_index
+        )
     )
+    s3u.upload(pkiv_hb_index_s3_manifest_s3_url, str(json.dumps(manifest)))
     result = PyArrowWriteResult.of(
         len(manifest_entries),
         table.nbytes,
         manifest.meta.content_length,
         len(table),
     )
-    logger.info(
-
-
+    logger.info(
+        f"Wrote primary key index files for hash bucket {hb_index}. "
+        f"Primary key index version locator: "
+        f"{primary_key_index_version_locator}. Result: {result}"
+    )
     return result
deltacat/compute/compactor/utils/round_completion_file.py
@@ -1,35 +1,35 @@
-import logging
 import json
+import logging

-from deltacat.storage import PartitionLocator
-from deltacat.compute.compactor import RoundCompletionInfo
 from deltacat import logs
+from deltacat.compute.compactor import RoundCompletionInfo
+from deltacat.storage import PartitionLocator

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 def get_round_completion_file_s3_url(
-
-
-        pki_root_path: str) -> str:
+    bucket: str, source_partition_locator: PartitionLocator, pki_root_path: str
+) -> str:

     base_url = source_partition_locator.path(f"s3://{bucket}")
     return f"{base_url}/{pki_root_path}.json"


 def read_round_completion_file(
-
-
-
+    bucket: str,
+    source_partition_locator: PartitionLocator,
+    primary_key_index_root_path: str,
+) -> RoundCompletionInfo:

     from deltacat.aws import s3u as s3_utils
+
     round_completion_file_url = get_round_completion_file_s3_url(
         bucket,
         source_partition_locator,
         primary_key_index_root_path,
     )
-    logger.info(
-        f"reading round completion file from: {round_completion_file_url}")
+    logger.info(f"reading round completion file from: {round_completion_file_url}")
     round_completion_info = None
     result = s3_utils.download(round_completion_file_url, False)
     if result:
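As the hunk above shows, the round completion file URL is just the source partition's path rendered under the bucket prefix, with the primary key index root path and a .json suffix appended. A minimal sketch of that composition with a stand-in locator (the class, bucket name, and paths below are invented; the real PartitionLocator.path() renders deltacat's canonical partition path):

class StubPartitionLocator:
    """Invented stand-in for deltacat's PartitionLocator, for illustration only."""

    def path(self, prefix: str) -> str:
        # The real locator renders its canonical partition path under `prefix`.
        return f"{prefix}/my_namespace/my_table/1/my_partition"


def get_round_completion_file_s3_url(bucket, source_partition_locator, pki_root_path):
    # Mirrors the helper shown in the hunk above.
    base_url = source_partition_locator.path(f"s3://{bucket}")
    return f"{base_url}/{pki_root_path}.json"


url = get_round_completion_file_s3_url("example-bucket", StubPartitionLocator(), "pki-root")
assert url == "s3://example-bucket/my_namespace/my_table/1/my_partition/pki-root.json"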
@@ -40,24 +40,23 @@ def read_round_completion_file(


 def write_round_completion_file(
-
-
-
-
+    bucket: str,
+    source_partition_locator: PartitionLocator,
+    primary_key_index_root_path: str,
+    round_completion_info: RoundCompletionInfo,
+) -> str:

     from deltacat.aws import s3u as s3_utils
-
-
+
+    logger.info(f"writing round completion file contents: {round_completion_info}")
     round_completion_file_s3_url = get_round_completion_file_s3_url(
         bucket,
         source_partition_locator,
         primary_key_index_root_path,
     )
-    logger.info(
-        f"writing round completion file to: {round_completion_file_s3_url}")
+    logger.info(f"writing round completion file to: {round_completion_file_s3_url}")
     s3_utils.upload(
-        round_completion_file_s3_url,
-        str(json.dumps(round_completion_info))
+        round_completion_file_s3_url, str(json.dumps(round_completion_info))
     )
-    logger.info(
-
+    logger.info(f"round completion file written to: {round_completion_file_s3_url}")
+    return round_completion_file_s3_url