deltacat 0.1.6__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +183 -194
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +249 -198
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +153 -260
  22. deltacat/compute/compactor/steps/hash_bucket.py +56 -56
  23. deltacat/compute/compactor/steps/materialize.py +139 -100
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +131 -90
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -42
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +8 -10
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +276 -228
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +36 -29
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/METADATA +21 -11
  79. deltacat-0.1.11.dist-info/RECORD +110 -0
  80. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
  81. deltacat-0.1.6.dist-info/RECORD +0 -108
  82. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
  83. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
@@ -1,54 +1,66 @@
- import logging
- import time
  import functools
- import ray
-
+ import logging
  from collections import defaultdict
+ from typing import Dict, List, Optional, Set, Tuple
+
+ import pyarrow as pa
+ import ray

  from deltacat import logs
+ from deltacat.compute.compactor import (
+     PrimaryKeyIndexLocator,
+     PrimaryKeyIndexMeta,
+     PrimaryKeyIndexVersionLocator,
+     PrimaryKeyIndexVersionMeta,
+     PyArrowWriteResult,
+     RoundCompletionInfo,
+     SortKey,
+ )
+ from deltacat.compute.compactor.steps import dedupe as dd
+ from deltacat.compute.compactor.steps import hash_bucket as hb
+ from deltacat.compute.compactor.steps import materialize as mat
+ from deltacat.compute.compactor.utils import io
+ from deltacat.compute.compactor.utils import primary_key_index as pki
+ from deltacat.compute.compactor.utils import round_completion_file as rcf
  from deltacat.compute.stats.models.delta_stats import DeltaStats
- from deltacat.storage import Delta, DeltaLocator, Partition, \
-     PartitionLocator, interface as unimplemented_deltacat_storage
- from deltacat.utils.ray_utils.concurrency import invoke_parallel, \
-     round_robin_options_provider
- from deltacat.utils.ray_utils.runtime import live_node_resource_keys
- from deltacat.compute.compactor.steps import hash_bucket as hb, dedupe as dd, \
-     materialize as mat
- from deltacat.compute.compactor import SortKey, PrimaryKeyIndexMeta, \
-     PrimaryKeyIndexLocator, PrimaryKeyIndexVersionMeta, \
-     PrimaryKeyIndexVersionLocator, RoundCompletionInfo, \
-     PyArrowWriteResult
- from deltacat.compute.compactor.utils import round_completion_file as rcf, io, \
-     primary_key_index as pki
+ from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
+ from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.types.media import ContentType
+ from deltacat.utils.placement import PlacementGroupConfig
+ from deltacat.utils.ray_utils.concurrency import (
+     invoke_parallel,
+     round_robin_options_provider,
+ )
+ from deltacat.utils.ray_utils.runtime import live_node_resource_keys

- from typing import List, Set, Optional, Tuple, Dict, Union, Any
-
- import pyarrow as pa
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

- _SORT_KEY_NAME_INDEX: int = 0
- _SORT_KEY_ORDER_INDEX: int = 1
  _PRIMARY_KEY_INDEX_ALGORITHM_VERSION: str = "1.0"


  def check_preconditions(
-         source_partition_locator: PartitionLocator,
-         compacted_partition_locator: PartitionLocator,
-         sort_keys: List[SortKey],
-         max_records_per_output_file: int,
-         new_hash_bucket_count: Optional[int],
-         deltacat_storage=unimplemented_deltacat_storage) -> int:
-
-     assert source_partition_locator.partition_values \
-         == compacted_partition_locator.partition_values, \
-         "In-place compaction must use the same partition values for the " \
+     source_partition_locator: PartitionLocator,
+     compacted_partition_locator: PartitionLocator,
+     sort_keys: List[SortKey],
+     max_records_per_output_file: int,
+     new_hash_bucket_count: Optional[int],
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> int:
+
+     assert (
+         source_partition_locator.partition_values
+         == compacted_partition_locator.partition_values
+     ), (
+         "In-place compaction must use the same partition values for the "
          "source and destination."
-     assert max_records_per_output_file >= 1, \
-         "Max records per output file must be a positive value"
+     )
+     assert (
+         max_records_per_output_file >= 1
+     ), "Max records per output file must be a positive value"
      if new_hash_bucket_count is not None:
-         assert new_hash_bucket_count >= 1, \
-             "New hash bucket count must be a positive value"
+         assert (
+             new_hash_bucket_count >= 1
+         ), "New hash bucket count must be a positive value"
      return SortKey.validate_sort_keys(
          source_partition_locator,
          sort_keys,
@@ -57,101 +69,111 @@ def check_preconditions(


  def compact_partition(
-         source_partition_locator: PartitionLocator,
-         compacted_partition_locator: PartitionLocator,
-         primary_keys: Set[str],
-         compaction_artifact_s3_bucket: str,
-         last_stream_position_to_compact: int,
-         hash_bucket_count: Optional[int] = None,
-         sort_keys: List[SortKey] = None,
-         records_per_primary_key_index_file: int = 38_000_000,
-         records_per_compacted_file: int = 4_000_000,
-         input_deltas_stats: Dict[int, DeltaStats] = None,
-         min_pk_index_pa_bytes: int = 0,
-         min_hash_bucket_chunk_size: int = 0,
-         compacted_file_content_type: ContentType = ContentType.PARQUET,
-         delete_prev_primary_key_index: bool = False,
-         read_round_completion: bool = False,
-         pg_config: Optional[List[Dict[str, Any]]] = None,
-         schema_on_read: Optional[pa.schema] = None, # TODO (ricmiyam): Remove this and retrieve schema from storage API
-         deltacat_storage=unimplemented_deltacat_storage):
+     source_partition_locator: PartitionLocator,
+     destination_partition_locator: PartitionLocator,
+     primary_keys: Set[str],
+     compaction_artifact_s3_bucket: str,
+     last_stream_position_to_compact: int,
+     *,
+     hash_bucket_count: Optional[int] = None,
+     sort_keys: List[SortKey] = None,
+     records_per_primary_key_index_file: int = 38_000_000,
+     records_per_compacted_file: int = 4_000_000,
+     input_deltas_stats: Dict[int, DeltaStats] = None,
+     min_pk_index_pa_bytes: int = 0,
+     min_hash_bucket_chunk_size: int = 0,
+     compacted_file_content_type: ContentType = ContentType.PARQUET,
+     delete_prev_primary_key_index: bool = False,
+     pg_config: Optional[PlacementGroupConfig] = None,
+     schema_on_read: Optional[
+         pa.schema
+     ] = None,  # TODO (ricmiyam): Remove this and retrieve schema from storage API
+     rebase_source_partition_locator: Optional[PartitionLocator] = None,
+     rebase_source_partition_high_watermark: Optional[int] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Optional[str]:

      logger.info(f"Starting compaction session for: {source_partition_locator}")
      partition = None
      compaction_rounds_executed = 0
      has_next_compaction_round = True
-     opts={}
-     if pg_config:
-         opts=pg_config[0]
+     new_rcf_s3_url = None
      while has_next_compaction_round:
-         has_next_compaction_round_obj, new_partition_obj, new_rci_obj = \
-             _execute_compaction_round.options(**opts).remote(
-                 source_partition_locator,
-                 compacted_partition_locator,
-                 primary_keys,
-                 compaction_artifact_s3_bucket,
-                 last_stream_position_to_compact,
-                 hash_bucket_count,
-                 sort_keys,
-                 records_per_primary_key_index_file,
-                 records_per_compacted_file,
-                 input_deltas_stats,
-                 min_pk_index_pa_bytes,
-                 min_hash_bucket_chunk_size,
-                 compacted_file_content_type,
-                 delete_prev_primary_key_index,
-                 read_round_completion,
-                 schema_on_read,
-                 deltacat_storage=deltacat_storage,
-                 pg_config=pg_config
-             )
-         has_next_compaction_round = ray.get(has_next_compaction_round_obj)
-         new_partition = ray.get(new_partition_obj)
-         new_rci = ray.get(new_rci_obj)
+         (
+             has_next_compaction_round,
+             new_partition,
+             new_rci,
+             new_rcf_s3_url,
+         ) = _execute_compaction_round(
+             source_partition_locator,
+             destination_partition_locator,
+             primary_keys,
+             compaction_artifact_s3_bucket,
+             last_stream_position_to_compact,
+             hash_bucket_count,
+             sort_keys,
+             records_per_primary_key_index_file,
+             records_per_compacted_file,
+             input_deltas_stats,
+             min_pk_index_pa_bytes,
+             min_hash_bucket_chunk_size,
+             compacted_file_content_type,
+             delete_prev_primary_key_index,
+             pg_config,
+             schema_on_read,
+             rebase_source_partition_locator,
+             rebase_source_partition_high_watermark,
+             deltacat_storage,
+         )
          if new_partition:
              partition = new_partition
-             compacted_partition_locator = new_partition.locator
+             destination_partition_locator = new_partition.locator
              compaction_rounds_executed += 1
          # Take new primary key index sizes into account for subsequent compaction rounds and their dedupe steps
          if new_rci:
              min_pk_index_pa_bytes = new_rci.pk_index_pyarrow_write_result.pyarrow_bytes

-     logger.info(f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed in "
-                 f"{compaction_rounds_executed} rounds.")
+     logger.info(
+         f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed in "
+         f"{compaction_rounds_executed} rounds."
+     )
      if partition:
          logger.info(f"Committing compacted partition to: {partition.locator}")
          partition = deltacat_storage.commit_partition(partition)
          logger.info(f"Committed compacted partition: {partition}")
      logger.info(f"Completed compaction session for: {source_partition_locator}")
+     return new_rcf_s3_url

- @ray.remote(num_cpus=0.1,num_returns=3)
- def _execute_compaction_round(
-         source_partition_locator: PartitionLocator,
-         compacted_partition_locator: PartitionLocator,
-         primary_keys: Set[str],
-         compaction_artifact_s3_bucket: str,
-         last_stream_position_to_compact: int,
-         new_hash_bucket_count: Optional[int],
-         sort_keys: List[SortKey],
-         records_per_primary_key_index_file: int,
-         records_per_compacted_file: int,
-         input_deltas_stats: Dict[int, DeltaStats],
-         min_pk_index_pa_bytes: int,
-         min_hash_bucket_chunk_size: int,
-         compacted_file_content_type: ContentType,
-         delete_prev_primary_key_index: bool,
-         read_round_completion: bool,
-         schema_on_read: Optional[pa.schema],
-         deltacat_storage = unimplemented_deltacat_storage,
-         pg_config: Optional[List[Dict[str, Any]]] = None) \
-         -> Tuple[bool, Optional[Partition], Optional[RoundCompletionInfo]]:

+ @ray.remote(num_cpus=0.1, num_returns=3)
+ def _execute_compaction_round(
+     source_partition_locator: PartitionLocator,
+     compacted_partition_locator: PartitionLocator,
+     primary_keys: Set[str],
+     compaction_artifact_s3_bucket: str,
+     last_stream_position_to_compact: int,
+     new_hash_bucket_count: Optional[int],
+     sort_keys: List[SortKey],
+     records_per_primary_key_index_file: int,
+     records_per_compacted_file: int,
+     input_deltas_stats: Dict[int, DeltaStats],
+     min_pk_index_pa_bytes: int,
+     min_hash_bucket_chunk_size: int,
+     compacted_file_content_type: ContentType,
+     delete_prev_primary_key_index: bool,
+     pg_config: Optional[PlacementGroupConfig],
+     schema_on_read: Optional[pa.schema],
+     rebase_source_partition_locator: Optional[PartitionLocator],
+     rebase_source_partition_high_watermark: Optional[int],
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Tuple[bool, Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:

      if not primary_keys:
          # TODO (pdames): run simple rebatch to reduce all deltas into 1 delta
          # with normalized manifest entry sizes
          raise NotImplementedError(
-             "Compaction only supports tables with 1 or more primary keys")
+             "Compaction only supports tables with 1 or more primary keys"
+         )
      if sort_keys is None:
          sort_keys = []
      # TODO (pdames): detect and handle schema evolution (at least ensure that
@@ -172,39 +194,30 @@ def _execute_compaction_round(
      # sort primary keys to produce the same pk digest regardless of input order
      primary_keys = sorted(primary_keys)

-     # collect cluster resource stats
-     # cluster_resources = ray.cluster_resources()
-     # logger.info(f"Total cluster resources: {cluster_resources}")
-     # logger.info(f"Available cluster resources: {ray.available_resources()}")
-     # cluster_cpus = int(cluster_resources["CPU"])
-     # logger.info(f"Total cluster CPUs: {cluster_cpus}")
-
-     # collect node group resources
-
      cluster_resources = ray.cluster_resources()
      logger.info(f"Total cluster resources: {cluster_resources}")
-     if pg_config: # use resource in each placement group
-         node_resource_keys=None
-         cluster_resources = pg_config[1]
-         cluster_cpus = cluster_resources['CPU']
-     else: # use all cluster resource
+     node_resource_keys = None
+     if pg_config:  # use resource in each placement group
+         cluster_resources = pg_config.resource
+         cluster_cpus = cluster_resources["CPU"]
+     else:  # use all cluster resource
          logger.info(f"Available cluster resources: {ray.available_resources()}")
          cluster_cpus = int(cluster_resources["CPU"])
          logger.info(f"Total cluster CPUs: {cluster_cpus}")
          node_resource_keys = live_node_resource_keys()
-         logger.info(f"Found {len(node_resource_keys)} live cluster nodes: "
-                     f"{node_resource_keys}")
-
-     if node_resource_keys:
-         # create a remote options provider to round-robin tasks across all nodes
-         logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
-         round_robin_opt_provider = functools.partial(
-             round_robin_options_provider,
-             resource_keys=node_resource_keys,
+         logger.info(
+             f"Found {len(node_resource_keys)} live cluster nodes: "
+             f"{node_resource_keys}"
          )
-     else:
-         logger.info("Setting round robin scheduling to None")
-         round_robin_opt_provider = None
+
+     # create a remote options provider to round-robin tasks across all nodes or allocated bundles
+     logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
+     round_robin_opt_provider = functools.partial(
+         round_robin_options_provider,
+         resource_keys=node_resource_keys,
+         pg_config=pg_config.opts if pg_config else None,
+     )
+
      # assign a distinct index to each node in the cluster
      # head_node_ip = urllib.request.urlopen(
      #     "http://169.254.169.254/latest/meta-data/local-ipv4"
@@ -226,38 +239,58 @@ def _execute_compaction_round(
          _PRIMARY_KEY_INDEX_ALGORITHM_VERSION,
      )
      compatible_primary_key_index_locator = PrimaryKeyIndexLocator.of(
-         compatible_primary_key_index_meta)
-     compatible_primary_key_index_root_path = \
+         compatible_primary_key_index_meta
+     )
+     compatible_primary_key_index_root_path = (
          compatible_primary_key_index_locator.primary_key_index_root_path
+     )

      # read the results from any previously completed compaction round that used
      # a compatible primary key index
      round_completion_info = None
-     if read_round_completion:
+     if not rebase_source_partition_locator:
+         logger.info(
+             f"Reading round completion file for compatible "
+             f"primary key index root path: {compatible_primary_key_index_root_path}"
+         )
          round_completion_info = rcf.read_round_completion_file(
              compaction_artifact_s3_bucket,
              source_partition_locator,
              compatible_primary_key_index_root_path,
          )
+         logger.info(f"Round completion file: {round_completion_info}")

      # read the previous compaction round's hash bucket count, if any
      old_hash_bucket_count = None
      if round_completion_info:
-         old_pki_version_locator = round_completion_info\
-             .primary_key_index_version_locator
-         old_hash_bucket_count = old_pki_version_locator\
-             .primary_key_index_version_meta \
-             .hash_bucket_count
-         min_pk_index_pa_bytes = round_completion_info.pk_index_pyarrow_write_result.pyarrow_bytes
+         old_pki_version_locator = (
+             round_completion_info.primary_key_index_version_locator
+         )
+         old_hash_bucket_count = (
+             old_pki_version_locator.primary_key_index_version_meta.hash_bucket_count
+         )
+         min_pk_index_pa_bytes = (
+             round_completion_info.pk_index_pyarrow_write_result.pyarrow_bytes
+         )
+     else:
+         logger.info(
+             f"No prior round info read. Source partition: "
+             f"{source_partition_locator}. Primary key index locator: "
+             f"{compatible_primary_key_index_locator}. Rebase source "
+             f"partition locator: {rebase_source_partition_locator}"
+         )

      # use the new hash bucket count if provided, or fall back to old count
-     hash_bucket_count = new_hash_bucket_count \
-         if new_hash_bucket_count is not None \
+     hash_bucket_count = (
+         new_hash_bucket_count
+         if new_hash_bucket_count is not None
          else old_hash_bucket_count
+     )

      # discover input delta files
-     high_watermark = round_completion_info.high_watermark \
-         if round_completion_info else None
+     high_watermark = (
+         round_completion_info.high_watermark if round_completion_info else None
+     )

      input_deltas = io.discover_deltas(
          source_partition_locator,
@@ -268,33 +301,38 @@ def _execute_compaction_round(

      if not input_deltas:
          logger.info("No input deltas found to compact.")
-         return False, None, None
+         return False, None, None, None

      # limit the input deltas to fit on this cluster and convert them to
      # annotated deltas of equivalent size for easy parallel distribution

-     uniform_deltas, hash_bucket_count, last_stream_position_compacted = \
-         io.limit_input_deltas(
-             input_deltas,
-             cluster_resources,
-             hash_bucket_count,
-             min_pk_index_pa_bytes,
-             min_hash_bucket_chunk_size,
-             input_deltas_stats=input_deltas_stats,
-             deltacat_storage=deltacat_storage
-         )
+     (
+         uniform_deltas,
+         hash_bucket_count,
+         last_stream_position_compacted,
+     ) = io.limit_input_deltas(
+         input_deltas,
+         cluster_resources,
+         hash_bucket_count,
+         min_pk_index_pa_bytes,
+         min_hash_bucket_chunk_size,
+         input_deltas_stats=input_deltas_stats,
+         deltacat_storage=deltacat_storage,
+     )

-     assert hash_bucket_count is not None and hash_bucket_count > 0, \
-         f"Unexpected Error: Default hash bucket count ({hash_bucket_count}) " \
-         f"is invalid."
+     assert hash_bucket_count is not None and hash_bucket_count > 0, (
+         f"Expected hash bucket count to be a positive integer, but found "
+         f"`{hash_bucket_count}`"
+     )

      # rehash the primary key index if necessary
-     round_completion_info = None
      if round_completion_info:
          logger.info(f"Round completion file contents: {round_completion_info}")
          # the previous primary key index is compatible with the current, but
          # will need to be rehashed if the hash bucket count has changed
          if hash_bucket_count != old_hash_bucket_count:
+             # TODO(draghave): manually test the path after prior primary key
+             # index was already built
              round_completion_info = pki.rehash(
                  round_robin_opt_provider,
                  compaction_artifact_s3_bucket,
@@ -305,10 +343,6 @@ def _execute_compaction_round(
                  records_per_primary_key_index_file,
                  delete_prev_primary_key_index,
              )
-     else:
-         logger.info(f"No prior round completion file found. Source partition: "
-                     f"{source_partition_locator}. Primary key index locator: "
-                     f"{compatible_primary_key_index_locator}")

      # parallel step 1:
      # group like primary keys together by hashing them into buckets
@@ -331,7 +365,7 @@ def _execute_compaction_round(
          for hash_group_index, object_id in enumerate(hash_group_idx_to_obj_id):
              if object_id:
                  all_hash_group_idx_to_obj_id[hash_group_index].append(object_id)
-     hash_group_count = dedupe_task_count = len(all_hash_group_idx_to_obj_id)
+     hash_group_count = len(all_hash_group_idx_to_obj_id)
      logger.info(f"Hash bucket groups created: {hash_group_count}")

      # TODO (pdames): when resources are freed during the last round of hash
@@ -359,9 +393,11 @@ def _execute_compaction_round(
          _PRIMARY_KEY_INDEX_ALGORITHM_VERSION,
      )
      new_primary_key_index_locator = PrimaryKeyIndexLocator.of(
-         new_primary_key_index_meta)
-     new_primary_key_index_root_path = new_primary_key_index_locator\
-         .primary_key_index_root_path
+         new_primary_key_index_meta
+     )
+     new_primary_key_index_root_path = (
+         new_primary_key_index_locator.primary_key_index_root_path
+     )

      # generate a new primary key index version locator for this round
      new_primary_key_index_version_meta = PrimaryKeyIndexVersionMeta.of(
@@ -369,47 +405,47 @@ def _execute_compaction_round(
          hash_bucket_count,
      )
      new_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-         new_primary_key_index_version_meta)
-
+         new_primary_key_index_version_meta
+     )

      # parallel step 2:
      # discover records with duplicate primary keys in each hash bucket, and
      # identify the index of records to keep or drop based on sort keys
      num_materialize_buckets = max_parallelism
      logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")
-     record_counts_pending_materialize = \
-         dd.RecordCountsPendingMaterialize.remote(dedupe_task_count)
      dd_tasks_pending = invoke_parallel(
          items=all_hash_group_idx_to_obj_id.values(),
          ray_task=dd.dedupe,
          max_parallelism=max_parallelism,
          options_provider=round_robin_opt_provider,
-         kwargs_provider=lambda index, item: {"dedupe_task_index": index,
-                                              "object_ids": item},
+         kwargs_provider=lambda index, item: {
+             "dedupe_task_index": index,
+             "object_ids": item,
+         },
          compaction_artifact_s3_bucket=compaction_artifact_s3_bucket,
          round_completion_info=round_completion_info,
          new_primary_key_index_version_locator=new_pki_version_locator,
          sort_keys=sort_keys,
          max_records_per_index_file=records_per_primary_key_index_file,
-         max_records_per_materialized_file=records_per_compacted_file,
          num_materialize_buckets=num_materialize_buckets,
          delete_old_primary_key_index=delete_prev_primary_key_index,
-         record_counts_pending_materialize=record_counts_pending_materialize,
      )
      logger.info(f"Getting {len(dd_tasks_pending)} dedupe results...")
      dd_results = ray.get([t[0] for t in dd_tasks_pending])
      logger.info(f"Got {len(dd_results)} dedupe results.")
      all_mat_buckets_to_obj_id = defaultdict(list)
      for mat_bucket_idx_to_obj_id in dd_results:
-         for bucket_idx, dd_task_index_and_object_id_tuple in \
-                 mat_bucket_idx_to_obj_id.items():
+         for (
+             bucket_idx,
+             dd_task_index_and_object_id_tuple,
+         ) in mat_bucket_idx_to_obj_id.items():
              all_mat_buckets_to_obj_id[bucket_idx].append(
-                 dd_task_index_and_object_id_tuple)
+                 dd_task_index_and_object_id_tuple
+             )
      logger.info(f"Getting {len(dd_tasks_pending)} dedupe result stat(s)...")
      pki_stats = ray.get([t[2] for t in dd_tasks_pending])
      logger.info(f"Got {len(pki_stats)} dedupe result stat(s).")
-     logger.info(f"Materialize buckets created: "
-                 f"{len(all_mat_buckets_to_obj_id)}")
+     logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")

      # TODO(pdames): when resources are freed during the last round of deduping
      # start running materialize tasks that read materialization source file
@@ -428,9 +464,9 @@ def _execute_compaction_round(
          ray_task=mat.materialize,
          max_parallelism=max_parallelism,
          options_provider=round_robin_opt_provider,
-         kwargs_provider=lambda index, mat_bucket_idx_to_obj_id: {
-             "mat_bucket_index": mat_bucket_idx_to_obj_id[0],
-             "dedupe_task_idx_and_obj_id_tuples": mat_bucket_idx_to_obj_id[1],
+         kwargs_provider=lambda index, mat_bucket_index_to_obj_id: {
+             "mat_bucket_index": mat_bucket_index_to_obj_id[0],
+             "dedupe_task_idx_and_obj_id_tuples": mat_bucket_index_to_obj_id[1],
          },
          schema=schema_on_read,
          round_completion_info=round_completion_info,
@@ -455,25 +491,40 @@ def _execute_compaction_round(
          compacted_delta.stream_position,
      )

-     round_completion_info = RoundCompletionInfo.of(
-         last_stream_position_compacted,
+     rci_high_watermark = (
+         rebase_source_partition_high_watermark
+         if rebase_source_partition_high_watermark
+         else last_stream_position_compacted
+     )
+     new_round_completion_info = RoundCompletionInfo.of(
+         rci_high_watermark,
          new_compacted_delta_locator,
-         PyArrowWriteResult.union([m.pyarrow_write_result
-                                   for m in mat_results]),
+         PyArrowWriteResult.union([m.pyarrow_write_result for m in mat_results]),
          PyArrowWriteResult.union(pki_stats),
          bit_width_of_sort_keys,
          new_pki_version_locator,
+         rebase_source_partition_locator
+         or round_completion_info.rebase_source_partition_locator,
+     )
+     rcf_source_partition_locator = (
+         rebase_source_partition_locator
+         if rebase_source_partition_locator
+         else source_partition_locator
      )
-     rcf.write_round_completion_file(
+     round_completion_file_s3_url = rcf.write_round_completion_file(
          compaction_artifact_s3_bucket,
-         source_partition_locator,
+         rcf_source_partition_locator,
          new_primary_key_index_root_path,
-         round_completion_info,
+         new_round_completion_info,
+     )
+     logger.info(
+         f"partition-{source_partition_locator.partition_values},"
+         f"compacted at: {last_stream_position_compacted},"
+         f"last position: {last_stream_position_to_compact}"
+     )
+     return (
+         (last_stream_position_compacted < last_stream_position_to_compact),
+         partition,
+         new_round_completion_info,
+         round_completion_file_s3_url,
      )
-     time_mat_e = time.time()
-     logger.info(f"partition-{source_partition_locator.partition_values},compacted at:{last_stream_position_compacted}, last position:{last_stream_position_to_compact}")
-     return \
-         (last_stream_position_compacted < last_stream_position_to_compact), \
-         partition, \
-         round_completion_info
-
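
The hunks above rework the public compact_partition entry point in deltacat/compute/compactor/compaction_session.py: the destination locator parameter is renamed, everything after last_stream_position_to_compact becomes keyword-only, read_round_completion is replaced by rebase-aware parameters (rebase_source_partition_locator, rebase_source_partition_high_watermark), pg_config is now a PlacementGroupConfig, and the session returns the S3 URL of the round completion file it writes. A minimal calling sketch under those assumptions follows; the wrapper function, primary key, bucket name, and locators below are placeholders rather than anything shipped in the package.

# Sketch only: illustrates the 0.1.11 call shape shown in the diff above.
from typing import Optional

from deltacat.compute.compactor.compaction_session import compact_partition
from deltacat.storage import PartitionLocator
from deltacat.storage import interface as unimplemented_deltacat_storage
from deltacat.types.media import ContentType


def run_compaction_session(
    source: PartitionLocator,
    destination: PartitionLocator,
    last_stream_position: int,
) -> Optional[str]:
    # Returns the S3 URL of the round completion file written by the session,
    # or None when there were no input deltas to compact.
    return compact_partition(
        source,
        destination,
        primary_keys={"id"},  # assumption: table keyed on a single "id" column
        compaction_artifact_s3_bucket="my-compaction-artifacts",  # hypothetical bucket
        last_stream_position_to_compact=last_stream_position,
        # everything below this point is keyword-only in 0.1.11
        records_per_compacted_file=4_000_000,
        compacted_file_content_type=ContentType.PARQUET,
        deltacat_storage=unimplemented_deltacat_storage,  # swap in a real storage impl
    )

Per the diff, the session relies on only two attributes of the PlacementGroupConfig passed as pg_config: .resource (a bundle resource dict with a "CPU" entry, used for sizing) and .opts (Ray options forwarded to round_robin_options_provider).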