deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff shows the changes between publicly available package versions released to one of the supported registries, as they appear in those registries. It is provided for informational purposes only.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/compaction_session.py

@@ -1,51 +1,66 @@
-import logging
 import functools
-import
-
+import logging
 from collections import defaultdict
+from typing import Dict, List, Optional, Set, Tuple
+
+import pyarrow as pa
+import ray
 
 from deltacat import logs
+from deltacat.compute.compactor import (
+    PrimaryKeyIndexLocator,
+    PrimaryKeyIndexMeta,
+    PrimaryKeyIndexVersionLocator,
+    PrimaryKeyIndexVersionMeta,
+    PyArrowWriteResult,
+    RoundCompletionInfo,
+    SortKey,
+)
+from deltacat.compute.compactor.steps import dedupe as dd
+from deltacat.compute.compactor.steps import hash_bucket as hb
+from deltacat.compute.compactor.steps import materialize as mat
+from deltacat.compute.compactor.utils import io
+from deltacat.compute.compactor.utils import primary_key_index as pki
+from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.compute.stats.models.delta_stats import DeltaStats
-from deltacat.storage import Delta, DeltaLocator, Partition,
-
-from deltacat.utils.ray_utils.concurrency import invoke_parallel, \
-    round_robin_options_provider
-from deltacat.utils.ray_utils.runtime import live_node_resource_keys
-from deltacat.compute.compactor.steps import hash_bucket as hb, dedupe as dd, \
-    materialize as mat
-from deltacat.compute.compactor import SortKey, PrimaryKeyIndexMeta, \
-    PrimaryKeyIndexLocator, PrimaryKeyIndexVersionMeta, \
-    PrimaryKeyIndexVersionLocator, RoundCompletionInfo, \
-    PyArrowWriteResult
-from deltacat.compute.compactor.utils import round_completion_file as rcf, io, \
-    primary_key_index as pki
+from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
+from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.types.media import ContentType
 from deltacat.utils.placement import PlacementGroupConfig
-from
+from deltacat.utils.ray_utils.concurrency import (
+    invoke_parallel,
+    round_robin_options_provider,
+)
+from deltacat.utils.ray_utils.runtime import live_node_resource_keys
 
-import pyarrow as pa
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 _PRIMARY_KEY_INDEX_ALGORITHM_VERSION: str = "1.0"
 
 
 def check_preconditions(
-
-
-
-
-
-
-
-
-
-
+    source_partition_locator: PartitionLocator,
+    compacted_partition_locator: PartitionLocator,
+    sort_keys: List[SortKey],
+    max_records_per_output_file: int,
+    new_hash_bucket_count: Optional[int],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> int:
+
+    assert (
+        source_partition_locator.partition_values
+        == compacted_partition_locator.partition_values
+    ), (
+        "In-place compaction must use the same partition values for the "
         "source and destination."
-
-
+    )
+    assert (
+        max_records_per_output_file >= 1
+    ), "Max records per output file must be a positive value"
     if new_hash_bucket_count is not None:
-        assert
-
+        assert (
+            new_hash_bucket_count >= 1
+        ), "New hash bucket count must be a positive value"
     return SortKey.validate_sort_keys(
         source_partition_locator,
         sort_keys,
@@ -54,95 +69,110 @@ def check_preconditions(
 
 
 def compact_partition(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    source_partition_locator: PartitionLocator,
+    destination_partition_locator: PartitionLocator,
+    primary_keys: Set[str],
+    compaction_artifact_s3_bucket: str,
+    last_stream_position_to_compact: int,
+    *,
+    hash_bucket_count: Optional[int] = None,
+    sort_keys: List[SortKey] = None,
+    records_per_primary_key_index_file: int = 38_000_000,
+    records_per_compacted_file: int = 4_000_000,
+    input_deltas_stats: Dict[int, DeltaStats] = None,
+    min_pk_index_pa_bytes: int = 0,
+    min_hash_bucket_chunk_size: int = 0,
+    compacted_file_content_type: ContentType = ContentType.PARQUET,
+    delete_prev_primary_key_index: bool = False,
+    pg_config: Optional[PlacementGroupConfig] = None,
+    schema_on_read: Optional[
+        pa.schema
+    ] = None,  # TODO (ricmiyam): Remove this and retrieve schema from storage API
+    rebase_source_partition_locator: Optional[PartitionLocator] = None,
+    rebase_source_partition_high_watermark: Optional[int] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Optional[str]:
 
     logger.info(f"Starting compaction session for: {source_partition_locator}")
     partition = None
     compaction_rounds_executed = 0
     has_next_compaction_round = True
+    new_rcf_s3_url = None
     while has_next_compaction_round:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        (
+            has_next_compaction_round,
+            new_partition,
+            new_rci,
+            new_rcf_s3_url,
+        ) = _execute_compaction_round(
+            source_partition_locator,
+            destination_partition_locator,
+            primary_keys,
+            compaction_artifact_s3_bucket,
+            last_stream_position_to_compact,
+            hash_bucket_count,
+            sort_keys,
+            records_per_primary_key_index_file,
+            records_per_compacted_file,
+            input_deltas_stats,
+            min_pk_index_pa_bytes,
+            min_hash_bucket_chunk_size,
+            compacted_file_content_type,
+            delete_prev_primary_key_index,
+            pg_config,
+            schema_on_read,
+            rebase_source_partition_locator,
+            rebase_source_partition_high_watermark,
+            deltacat_storage,
+        )
         if new_partition:
             partition = new_partition
-
+            destination_partition_locator = new_partition.locator
             compaction_rounds_executed += 1
         # Take new primary key index sizes into account for subsequent compaction rounds and their dedupe steps
         if new_rci:
             min_pk_index_pa_bytes = new_rci.pk_index_pyarrow_write_result.pyarrow_bytes
 
-    logger.info(
-
+    logger.info(
+        f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed in "
+        f"{compaction_rounds_executed} rounds."
+    )
     if partition:
         logger.info(f"Committing compacted partition to: {partition.locator}")
         partition = deltacat_storage.commit_partition(partition)
         logger.info(f"Committed compacted partition: {partition}")
     logger.info(f"Completed compaction session for: {source_partition_locator}")
+    return new_rcf_s3_url
 
 
 def _execute_compaction_round(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    source_partition_locator: PartitionLocator,
+    compacted_partition_locator: PartitionLocator,
+    primary_keys: Set[str],
+    compaction_artifact_s3_bucket: str,
+    last_stream_position_to_compact: int,
+    new_hash_bucket_count: Optional[int],
+    sort_keys: List[SortKey],
+    records_per_primary_key_index_file: int,
+    records_per_compacted_file: int,
+    input_deltas_stats: Dict[int, DeltaStats],
+    min_pk_index_pa_bytes: int,
+    min_hash_bucket_chunk_size: int,
+    compacted_file_content_type: ContentType,
+    delete_prev_primary_key_index: bool,
+    pg_config: Optional[PlacementGroupConfig],
+    schema_on_read: Optional[pa.schema],
+    rebase_source_partition_locator: Optional[PartitionLocator],
+    rebase_source_partition_high_watermark: Optional[int],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Tuple[bool, Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
 
     if not primary_keys:
         # TODO (pdames): run simple rebatch to reduce all deltas into 1 delta
         # with normalized manifest entry sizes
         raise NotImplementedError(
-            "Compaction only supports tables with 1 or more primary keys"
+            "Compaction only supports tables with 1 or more primary keys"
+        )
     if sort_keys is None:
         sort_keys = []
     # TODO (pdames): detect and handle schema evolution (at least ensure that
@@ -166,23 +196,25 @@ def _execute_compaction_round(
     cluster_resources = ray.cluster_resources()
     logger.info(f"Total cluster resources: {cluster_resources}")
     node_resource_keys = None
-    if pg_config:
+    if pg_config:  # use resource in each placement group
         cluster_resources = pg_config.resource
-        cluster_cpus = cluster_resources[
-    else:
+        cluster_cpus = cluster_resources["CPU"]
+    else:  # use all cluster resource
         logger.info(f"Available cluster resources: {ray.available_resources()}")
         cluster_cpus = int(cluster_resources["CPU"])
         logger.info(f"Total cluster CPUs: {cluster_cpus}")
         node_resource_keys = live_node_resource_keys()
-        logger.info(
-
+        logger.info(
+            f"Found {len(node_resource_keys)} live cluster nodes: "
+            f"{node_resource_keys}"
+        )
 
     # create a remote options provider to round-robin tasks across all nodes or allocated bundles
     logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
     round_robin_opt_provider = functools.partial(
         round_robin_options_provider,
         resource_keys=node_resource_keys,
-        pg_config
+        pg_config=pg_config.opts if pg_config else None,
     )
 
     # assign a distinct index to each node in the cluster
@@ -206,16 +238,20 @@ def _execute_compaction_round(
         _PRIMARY_KEY_INDEX_ALGORITHM_VERSION,
     )
     compatible_primary_key_index_locator = PrimaryKeyIndexLocator.of(
-        compatible_primary_key_index_meta
-
+        compatible_primary_key_index_meta
+    )
+    compatible_primary_key_index_root_path = (
         compatible_primary_key_index_locator.primary_key_index_root_path
+    )
 
     # read the results from any previously completed compaction round that used
     # a compatible primary key index
     round_completion_info = None
-    if
-        logger.info(
-
+    if not rebase_source_partition_locator:
+        logger.info(
+            f"Reading round completion file for compatible "
+            f"primary key index root path: {compatible_primary_key_index_root_path}"
+        )
         round_completion_info = rcf.read_round_completion_file(
             compaction_artifact_s3_bucket,
             source_partition_locator,
@@ -226,21 +262,34 @@ def _execute_compaction_round(
     # read the previous compaction round's hash bucket count, if any
     old_hash_bucket_count = None
     if round_completion_info:
-        old_pki_version_locator =
-            .primary_key_index_version_locator
-
-
-            .hash_bucket_count
-
+        old_pki_version_locator = (
+            round_completion_info.primary_key_index_version_locator
+        )
+        old_hash_bucket_count = (
+            old_pki_version_locator.primary_key_index_version_meta.hash_bucket_count
+        )
+        min_pk_index_pa_bytes = (
+            round_completion_info.pk_index_pyarrow_write_result.pyarrow_bytes
+        )
+    else:
+        logger.info(
+            f"No prior round info read. Source partition: "
+            f"{source_partition_locator}. Primary key index locator: "
+            f"{compatible_primary_key_index_locator}. Rebase source "
+            f"partition locator: {rebase_source_partition_locator}"
+        )
 
     # use the new hash bucket count if provided, or fall back to old count
-    hash_bucket_count =
-
+    hash_bucket_count = (
+        new_hash_bucket_count
+        if new_hash_bucket_count is not None
         else old_hash_bucket_count
+    )
 
     # discover input delta files
-    high_watermark =
-        if round_completion_info else None
+    high_watermark = (
+        round_completion_info.high_watermark if round_completion_info else None
+    )
 
     input_deltas = io.discover_deltas(
         source_partition_locator,
@@ -251,25 +300,29 @@ def _execute_compaction_round(
 
     if not input_deltas:
         logger.info("No input deltas found to compact.")
-        return False, None, None
+        return False, None, None, None
 
     # limit the input deltas to fit on this cluster and convert them to
    # annotated deltas of equivalent size for easy parallel distribution
 
-
-
-
-
-
-
-
-
-
-
+    (
+        uniform_deltas,
+        hash_bucket_count,
+        last_stream_position_compacted,
+    ) = io.limit_input_deltas(
+        input_deltas,
+        cluster_resources,
+        hash_bucket_count,
+        min_pk_index_pa_bytes,
+        min_hash_bucket_chunk_size,
+        input_deltas_stats=input_deltas_stats,
+        deltacat_storage=deltacat_storage,
+    )
 
-    assert hash_bucket_count is not None and hash_bucket_count > 0,
-        f"
-        f"
+    assert hash_bucket_count is not None and hash_bucket_count > 0, (
+        f"Expected hash bucket count to be a positive integer, but found "
+        f"`{hash_bucket_count}`"
+    )
 
     # rehash the primary key index if necessary
     if round_completion_info:
@@ -277,8 +330,8 @@ def _execute_compaction_round(
         # the previous primary key index is compatible with the current, but
         # will need to be rehashed if the hash bucket count has changed
         if hash_bucket_count != old_hash_bucket_count:
-            # TODO(draghave): manually test the path after prior primary key
-            #
+            # TODO(draghave): manually test the path after prior primary key
+            # index was already built
             round_completion_info = pki.rehash(
                 round_robin_opt_provider,
                 compaction_artifact_s3_bucket,
@@ -289,10 +342,6 @@ def _execute_compaction_round(
                 records_per_primary_key_index_file,
                 delete_prev_primary_key_index,
             )
-    else:
-        logger.info(f"No prior round completion file found. Source partition: "
-                    f"{source_partition_locator}. Primary key index locator: "
-                    f"{compatible_primary_key_index_locator}")
 
     # parallel step 1:
     # group like primary keys together by hashing them into buckets
@@ -315,7 +364,7 @@ def _execute_compaction_round(
     for hash_group_index, object_id in enumerate(hash_group_idx_to_obj_id):
         if object_id:
             all_hash_group_idx_to_obj_id[hash_group_index].append(object_id)
-    hash_group_count =
+    hash_group_count = len(all_hash_group_idx_to_obj_id)
     logger.info(f"Hash bucket groups created: {hash_group_count}")
 
     # TODO (pdames): when resources are freed during the last round of hash
@@ -343,9 +392,11 @@ def _execute_compaction_round(
         _PRIMARY_KEY_INDEX_ALGORITHM_VERSION,
     )
     new_primary_key_index_locator = PrimaryKeyIndexLocator.of(
-        new_primary_key_index_meta
-
-
+        new_primary_key_index_meta
+    )
+    new_primary_key_index_root_path = (
+        new_primary_key_index_locator.primary_key_index_root_path
+    )
 
     # generate a new primary key index version locator for this round
     new_primary_key_index_version_meta = PrimaryKeyIndexVersionMeta.of(
@@ -353,8 +404,8 @@ def _execute_compaction_round(
         hash_bucket_count,
     )
     new_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-        new_primary_key_index_version_meta
-
+        new_primary_key_index_version_meta
+    )
 
     # parallel step 2:
     # discover records with duplicate primary keys in each hash bucket, and
@@ -366,30 +417,34 @@ def _execute_compaction_round(
         ray_task=dd.dedupe,
         max_parallelism=max_parallelism,
         options_provider=round_robin_opt_provider,
-        kwargs_provider=lambda index, item: {
-
+        kwargs_provider=lambda index, item: {
+            "dedupe_task_index": index,
+            "object_ids": item,
+        },
         compaction_artifact_s3_bucket=compaction_artifact_s3_bucket,
         round_completion_info=round_completion_info,
         new_primary_key_index_version_locator=new_pki_version_locator,
         sort_keys=sort_keys,
         max_records_per_index_file=records_per_primary_key_index_file,
         num_materialize_buckets=num_materialize_buckets,
-        delete_old_primary_key_index=delete_prev_primary_key_index
+        delete_old_primary_key_index=delete_prev_primary_key_index,
     )
     logger.info(f"Getting {len(dd_tasks_pending)} dedupe results...")
     dd_results = ray.get([t[0] for t in dd_tasks_pending])
     logger.info(f"Got {len(dd_results)} dedupe results.")
     all_mat_buckets_to_obj_id = defaultdict(list)
     for mat_bucket_idx_to_obj_id in dd_results:
-        for
-
+        for (
+            bucket_idx,
+            dd_task_index_and_object_id_tuple,
+        ) in mat_bucket_idx_to_obj_id.items():
             all_mat_buckets_to_obj_id[bucket_idx].append(
-                dd_task_index_and_object_id_tuple
+                dd_task_index_and_object_id_tuple
+            )
     logger.info(f"Getting {len(dd_tasks_pending)} dedupe result stat(s)...")
     pki_stats = ray.get([t[2] for t in dd_tasks_pending])
     logger.info(f"Got {len(pki_stats)} dedupe result stat(s).")
-    logger.info(f"Materialize buckets created: "
-                f"{len(all_mat_buckets_to_obj_id)}")
+    logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")
 
     # TODO(pdames): when resources are freed during the last round of deduping
     # start running materialize tasks that read materialization source file
@@ -408,9 +463,9 @@ def _execute_compaction_round(
         ray_task=mat.materialize,
         max_parallelism=max_parallelism,
         options_provider=round_robin_opt_provider,
-        kwargs_provider=lambda index,
-            "mat_bucket_index":
-            "dedupe_task_idx_and_obj_id_tuples":
+        kwargs_provider=lambda index, mat_bucket_index_to_obj_id: {
+            "mat_bucket_index": mat_bucket_index_to_obj_id[0],
+            "dedupe_task_idx_and_obj_id_tuples": mat_bucket_index_to_obj_id[1],
         },
         schema=schema_on_read,
         round_completion_info=round_completion_info,
@@ -435,24 +490,40 @@ def _execute_compaction_round(
         compacted_delta.stream_position,
     )
 
-
-
+    rci_high_watermark = (
+        rebase_source_partition_high_watermark
+        if rebase_source_partition_high_watermark
+        else last_stream_position_compacted
+    )
+    new_round_completion_info = RoundCompletionInfo.of(
+        rci_high_watermark,
         new_compacted_delta_locator,
-        PyArrowWriteResult.union([m.pyarrow_write_result
-                                  for m in mat_results]),
+        PyArrowWriteResult.union([m.pyarrow_write_result for m in mat_results]),
         PyArrowWriteResult.union(pki_stats),
         bit_width_of_sort_keys,
         new_pki_version_locator,
+        rebase_source_partition_locator
+        or round_completion_info.rebase_source_partition_locator,
     )
-
+    rcf_source_partition_locator = (
+        rebase_source_partition_locator
+        if rebase_source_partition_locator
+        else source_partition_locator
+    )
+    round_completion_file_s3_url = rcf.write_round_completion_file(
         compaction_artifact_s3_bucket,
-
+        rcf_source_partition_locator,
         new_primary_key_index_root_path,
-
+        new_round_completion_info,
+    )
+    logger.info(
+        f"partition-{source_partition_locator.partition_values},"
+        f"compacted at: {last_stream_position_compacted},"
+        f"last position: {last_stream_position_to_compact}"
+    )
+    return (
+        (last_stream_position_compacted < last_stream_position_to_compact),
+        partition,
+        new_round_completion_info,
+        round_completion_file_s3_url,
     )
-    logger.info(f"partition-{source_partition_locator.partition_values},compacted at:{last_stream_position_compacted}, last position:{last_stream_position_to_compact}")
-    return \
-        (last_stream_position_compacted < last_stream_position_to_compact), \
-        partition, \
-        round_completion_info
-