deltacat 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +188 -218
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +259 -316
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +152 -259
- deltacat/compute/compactor/steps/hash_bucket.py +57 -73
- deltacat/compute/compactor/steps/materialize.py +138 -99
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +131 -90
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -42
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +8 -10
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +276 -231
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +38 -32
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA +22 -12
- deltacat-0.1.11.dist-info/RECORD +110 -0
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
- deltacat/autoscaler/events/__init__.py +0 -0
- deltacat/autoscaler/events/compaction/__init__.py +0 -0
- deltacat/autoscaler/events/compaction/cluster.py +0 -82
- deltacat/autoscaler/events/compaction/collections/__init__.py +0 -0
- deltacat/autoscaler/events/compaction/collections/partition_key_value.py +0 -36
- deltacat/autoscaler/events/compaction/dispatcher.py +0 -28
- deltacat/autoscaler/events/compaction/input.py +0 -27
- deltacat/autoscaler/events/compaction/process.py +0 -25
- deltacat/autoscaler/events/compaction/session_manager.py +0 -13
- deltacat/autoscaler/events/compaction/utils.py +0 -216
- deltacat/autoscaler/events/compaction/workflow.py +0 -303
- deltacat/autoscaler/events/dispatcher.py +0 -95
- deltacat/autoscaler/events/dynamodb/__init__.py +0 -0
- deltacat/autoscaler/events/dynamodb/event_store.py +0 -164
- deltacat/autoscaler/events/event_store.py +0 -55
- deltacat/autoscaler/events/exceptions.py +0 -6
- deltacat/autoscaler/events/processor.py +0 -177
- deltacat/autoscaler/events/session_manager.py +0 -25
- deltacat/autoscaler/events/states.py +0 -88
- deltacat/autoscaler/events/workflow.py +0 -54
- deltacat/autoscaler/node_group.py +0 -230
- deltacat/autoscaler/utils.py +0 -69
- deltacat-0.1.8.dist-info/RECORD +0 -131
- /deltacat/{autoscaler → tests/utils}/__init__.py +0 -0
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/compaction_session.py

@@ -1,83 +1,66 @@
-import logging
-import time
 import functools
-import
-
+import logging
 from collections import defaultdict
+from typing import Dict, List, Optional, Set, Tuple
+
+import pyarrow as pa
+import ray
 
 from deltacat import logs
+from deltacat.compute.compactor import (
+    PrimaryKeyIndexLocator,
+    PrimaryKeyIndexMeta,
+    PrimaryKeyIndexVersionLocator,
+    PrimaryKeyIndexVersionMeta,
+    PyArrowWriteResult,
+    RoundCompletionInfo,
+    SortKey,
+)
+from deltacat.compute.compactor.steps import dedupe as dd
+from deltacat.compute.compactor.steps import hash_bucket as hb
+from deltacat.compute.compactor.steps import materialize as mat
+from deltacat.compute.compactor.utils import io
+from deltacat.compute.compactor.utils import primary_key_index as pki
+from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.compute.stats.models.delta_stats import DeltaStats
-from deltacat.storage import Delta, DeltaLocator, Partition,
-
-from deltacat.utils.ray_utils.concurrency import invoke_parallel, \
-    round_robin_options_provider
-from deltacat.utils.ray_utils.runtime import live_node_resource_keys
-from deltacat.compute.compactor.steps import hash_bucket as hb, dedupe as dd, \
-    materialize as mat
-from deltacat.compute.compactor import SortKey, PrimaryKeyIndexMeta, \
-    PrimaryKeyIndexLocator, PrimaryKeyIndexVersionMeta, \
-    PrimaryKeyIndexVersionLocator, RoundCompletionInfo, \
-    PyArrowWriteResult
-from deltacat.compute.compactor.utils import round_completion_file as rcf, io, \
-    primary_key_index as pki
+from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
+from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.types.media import ContentType
+from deltacat.utils.placement import PlacementGroupConfig
+from deltacat.utils.ray_utils.concurrency import (
+    invoke_parallel,
+    round_robin_options_provider,
+)
+from deltacat.utils.ray_utils.runtime import live_node_resource_keys
 
-from typing import List, Set, Optional, Tuple, Dict, Union, Any
-
-import pyarrow as pa
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-
-
-
-@ray.remote(num_cpus=0.01)
-class STATES_ACTOR:
-    def __init__(self):
-        self._SORT_KEY_NAME_INDEX: int = 0
-        self._SORT_KEY_ORDER_INDEX: int = 1
-        self._PRIMARY_KEY_INDEX_ALGORITHM_VERSION: str = "1.0"
-        self._TOTAL_ENTRIES: int = 0
-        self._TOTAL_DELTAS: int = 0
-        self._TOTAL_ROUNDS: float = 0
-
-    def SORT_KEY_NAME_INDEX(self):
-        return self._SORT_KEY_NAME_INDEX
-    def SORT_KEY_ORDER_INDEX(self):
-        return self._SORT_KEY_NAME_INDEX
-    def PRIMARY_KEY_INDEX_ALGORITHM_VERSION(self):
-        return self._PRIMARY_KEY_INDEX_ALGORITHM_VERSION
-    def TOTAL_ROUNDS(self):
-        return self._TOTAL_ROUNDS
-    def TOTAL_ENTRIES(self):
-        return self._TOTAL_ENTRIES
-    def TOTAL_DELTAS(self):
-        return self._TOTAL_DELTAS
-
-    def update_delta(self, delta):
-        self._TOTAL_DELTAS = delta
-    def update_entry(self, entry):
-        self._TOTAL_ENTRIES = entry
-    def update_round(self, round):
-        self._TOTAL_ROUNDS = round
+_PRIMARY_KEY_INDEX_ALGORITHM_VERSION: str = "1.0"
 
 
 def check_preconditions(
-
-
-
-
-
-
-
-
-
-
+    source_partition_locator: PartitionLocator,
+    compacted_partition_locator: PartitionLocator,
+    sort_keys: List[SortKey],
+    max_records_per_output_file: int,
+    new_hash_bucket_count: Optional[int],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> int:
+
+    assert (
+        source_partition_locator.partition_values
+        == compacted_partition_locator.partition_values
+    ), (
+        "In-place compaction must use the same partition values for the "
         "source and destination."
-
-
+    )
+    assert (
+        max_records_per_output_file >= 1
+    ), "Max records per output file must be a positive value"
     if new_hash_bucket_count is not None:
-        assert
-
+        assert (
+            new_hash_bucket_count >= 1
+        ), "New hash bucket count must be a positive value"
     return SortKey.validate_sort_keys(
         source_partition_locator,
         sort_keys,
@@ -86,129 +69,111 @@ def check_preconditions(
 
 
 def compact_partition(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    source_partition_locator: PartitionLocator,
+    destination_partition_locator: PartitionLocator,
+    primary_keys: Set[str],
+    compaction_artifact_s3_bucket: str,
+    last_stream_position_to_compact: int,
+    *,
+    hash_bucket_count: Optional[int] = None,
+    sort_keys: List[SortKey] = None,
+    records_per_primary_key_index_file: int = 38_000_000,
+    records_per_compacted_file: int = 4_000_000,
+    input_deltas_stats: Dict[int, DeltaStats] = None,
+    min_pk_index_pa_bytes: int = 0,
+    min_hash_bucket_chunk_size: int = 0,
+    compacted_file_content_type: ContentType = ContentType.PARQUET,
+    delete_prev_primary_key_index: bool = False,
+    pg_config: Optional[PlacementGroupConfig] = None,
+    schema_on_read: Optional[
+        pa.schema
+    ] = None,  # TODO (ricmiyam): Remove this and retrieve schema from storage API
+    rebase_source_partition_locator: Optional[PartitionLocator] = None,
+    rebase_source_partition_high_watermark: Optional[int] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Optional[str]:
 
     logger.info(f"Starting compaction session for: {source_partition_locator}")
     partition = None
     compaction_rounds_executed = 0
     has_next_compaction_round = True
-
-    if pg_config:
-        opts=pg_config[0]
-    round_id = 1
-    states = STATES_ACTOR.remote()
+    new_rcf_s3_url = None
     while has_next_compaction_round:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            deltacat_storage=deltacat_storage,
-            pg_config=pg_config
-        )
-        round_id +=1
-        has_next_compaction_round = ray.get(has_next_compaction_round_obj)
-        round_end = time.time()
-        TOTAL_ROUNDS = ray.get(states.TOTAL_ROUNDS.remote())
-        logger.info(f"Round {round_id}/{TOTAL_ROUNDS} took {round_end-round_start} seconds, estimated time to finish:{(TOTAL_ROUNDS-round_id)*(round_end-round_start)}")
-        print(f"Round {round_id}/{TOTAL_ROUNDS} took {round_end-round_start} seconds, estimated time to finish:{(TOTAL_ROUNDS-round_id)*(round_end-round_start)}")
-        new_partition = ray.get(new_partition_obj)
-        new_rci = ray.get(new_rci_obj)
+        (
+            has_next_compaction_round,
+            new_partition,
+            new_rci,
+            new_rcf_s3_url,
+        ) = _execute_compaction_round(
+            source_partition_locator,
+            destination_partition_locator,
+            primary_keys,
+            compaction_artifact_s3_bucket,
+            last_stream_position_to_compact,
+            hash_bucket_count,
+            sort_keys,
+            records_per_primary_key_index_file,
+            records_per_compacted_file,
+            input_deltas_stats,
+            min_pk_index_pa_bytes,
+            min_hash_bucket_chunk_size,
+            compacted_file_content_type,
+            delete_prev_primary_key_index,
+            pg_config,
+            schema_on_read,
+            rebase_source_partition_locator,
+            rebase_source_partition_high_watermark,
+            deltacat_storage,
+        )
         if new_partition:
             partition = new_partition
-
+            destination_partition_locator = new_partition.locator
             compaction_rounds_executed += 1
         # Take new primary key index sizes into account for subsequent compaction rounds and their dedupe steps
         if new_rci:
             min_pk_index_pa_bytes = new_rci.pk_index_pyarrow_write_result.pyarrow_bytes
 
-    logger.info(
-
+    logger.info(
+        f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed in "
+        f"{compaction_rounds_executed} rounds."
+    )
     if partition:
         logger.info(f"Committing compacted partition to: {partition.locator}")
         partition = deltacat_storage.commit_partition(partition)
         logger.info(f"Committed compacted partition: {partition}")
     logger.info(f"Completed compaction session for: {source_partition_locator}")
+    return new_rcf_s3_url
 
 
-@ray.remote
-def get_metadata(deltacat_storage, delta):
-    return len(deltacat_storage.get_delta_manifest(delta).entries)
-@ray.remote(num_cpus=1,num_returns=3,max_retries=1)
+@ray.remote(num_cpus=0.1, num_returns=3)
 def _execute_compaction_round(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    deltacat_storage = unimplemented_deltacat_storage,
-    pg_config: Optional[List[Dict[str, Any]]] = None) \
-        -> Tuple[bool, Optional[Partition], Optional[RoundCompletionInfo]]:
-
-
-    pre_hb_start = time.time()
+    source_partition_locator: PartitionLocator,
+    compacted_partition_locator: PartitionLocator,
+    primary_keys: Set[str],
+    compaction_artifact_s3_bucket: str,
+    last_stream_position_to_compact: int,
+    new_hash_bucket_count: Optional[int],
+    sort_keys: List[SortKey],
+    records_per_primary_key_index_file: int,
+    records_per_compacted_file: int,
+    input_deltas_stats: Dict[int, DeltaStats],
+    min_pk_index_pa_bytes: int,
+    min_hash_bucket_chunk_size: int,
+    compacted_file_content_type: ContentType,
+    delete_prev_primary_key_index: bool,
+    pg_config: Optional[PlacementGroupConfig],
+    schema_on_read: Optional[pa.schema],
+    rebase_source_partition_locator: Optional[PartitionLocator],
+    rebase_source_partition_high_watermark: Optional[int],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Tuple[bool, Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
+
     if not primary_keys:
         # TODO (pdames): run simple rebatch to reduce all deltas into 1 delta
         # with normalized manifest entry sizes
         raise NotImplementedError(
-            "Compaction only supports tables with 1 or more primary keys"
+            "Compaction only supports tables with 1 or more primary keys"
+        )
     if sort_keys is None:
         sort_keys = []
     # TODO (pdames): detect and handle schema evolution (at least ensure that
@@ -229,40 +194,30 @@ def _execute_compaction_round(
     # sort primary keys to produce the same pk digest regardless of input order
     primary_keys = sorted(primary_keys)
 
-    # collect cluster resource stats
-    # cluster_resources = ray.cluster_resources()
-    # logger.info(f"Total cluster resources: {cluster_resources}")
-    # logger.info(f"Available cluster resources: {ray.available_resources()}")
-    # cluster_cpus = int(cluster_resources["CPU"])
-    # logger.info(f"Total cluster CPUs: {cluster_cpus}")
-
-    # collect node group resources
-
     cluster_resources = ray.cluster_resources()
     logger.info(f"Total cluster resources: {cluster_resources}")
-
-
-        cluster_resources = pg_config
-        cluster_cpus = cluster_resources[
-
-    else: # use all cluster resource
+    node_resource_keys = None
+    if pg_config:  # use resource in each placement group
+        cluster_resources = pg_config.resource
+        cluster_cpus = cluster_resources["CPU"]
+    else:  # use all cluster resource
         logger.info(f"Available cluster resources: {ray.available_resources()}")
         cluster_cpus = int(cluster_resources["CPU"])
         logger.info(f"Total cluster CPUs: {cluster_cpus}")
         node_resource_keys = live_node_resource_keys()
-        logger.info(
-
-
-    if node_resource_keys:
-        # create a remote options provider to round-robin tasks across all nodes
-        logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
-        round_robin_opt_provider = functools.partial(
-            round_robin_options_provider,
-            resource_keys=node_resource_keys,
+        logger.info(
+            f"Found {len(node_resource_keys)} live cluster nodes: "
+            f"{node_resource_keys}"
         )
-
-
-
+
+    # create a remote options provider to round-robin tasks across all nodes or allocated bundles
+    logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
+    round_robin_opt_provider = functools.partial(
+        round_robin_options_provider,
+        resource_keys=node_resource_keys,
+        pg_config=pg_config.opts if pg_config else None,
+    )
+
     # assign a distinct index to each node in the cluster
     # head_node_ip = urllib.request.urlopen(
     #     "http://169.254.169.254/latest/meta-data/local-ipv4"
@@ -273,20 +228,10 @@ def _execute_compaction_round(
     # we assume here that we're running on a fixed-size cluster - this
     # assumption could be removed but we'd still need to know the maximum
     # "safe" number of parallel tasks that our autoscaling cluster could handle
-
-
-    else:
-        max_parallelism = [int(cluster_cpus) for _ in range(3)]
-        logger.info(f"Max parallelism for each steps: {max_parallelism}")
-
-
-    if not num_cpus:
-        num_cpus=[1,1,1] # allocate 1 cpu for each task (hb, dd or mat)
-
-    logger.info(f"Number of cpus for each steps: {num_cpus}")
+    max_parallelism = int(cluster_cpus)
+    logger.info(f"Max parallelism: {max_parallelism}")
 
     # get the root path of a compatible primary key index for this round
-    _PRIMARY_KEY_INDEX_ALGORITHM_VERSION=ray.get(states.PRIMARY_KEY_INDEX_ALGORITHM_VERSION.remote())
     compatible_primary_key_index_meta = PrimaryKeyIndexMeta.of(
         compacted_partition_locator,
         primary_keys,
@@ -294,38 +239,58 @@ def _execute_compaction_round(
         _PRIMARY_KEY_INDEX_ALGORITHM_VERSION,
     )
     compatible_primary_key_index_locator = PrimaryKeyIndexLocator.of(
-        compatible_primary_key_index_meta
-
+        compatible_primary_key_index_meta
+    )
+    compatible_primary_key_index_root_path = (
         compatible_primary_key_index_locator.primary_key_index_root_path
+    )
 
     # read the results from any previously completed compaction round that used
     # a compatible primary key index
     round_completion_info = None
-    if
+    if not rebase_source_partition_locator:
+        logger.info(
+            f"Reading round completion file for compatible "
+            f"primary key index root path: {compatible_primary_key_index_root_path}"
+        )
         round_completion_info = rcf.read_round_completion_file(
             compaction_artifact_s3_bucket,
             source_partition_locator,
             compatible_primary_key_index_root_path,
         )
+        logger.info(f"Round completion file: {round_completion_info}")
 
     # read the previous compaction round's hash bucket count, if any
     old_hash_bucket_count = None
     if round_completion_info:
-        old_pki_version_locator =
-            .primary_key_index_version_locator
-
-
-            .hash_bucket_count
-
+        old_pki_version_locator = (
+            round_completion_info.primary_key_index_version_locator
+        )
+        old_hash_bucket_count = (
+            old_pki_version_locator.primary_key_index_version_meta.hash_bucket_count
+        )
+        min_pk_index_pa_bytes = (
+            round_completion_info.pk_index_pyarrow_write_result.pyarrow_bytes
+        )
+    else:
+        logger.info(
+            f"No prior round info read. Source partition: "
+            f"{source_partition_locator}. Primary key index locator: "
+            f"{compatible_primary_key_index_locator}. Rebase source "
+            f"partition locator: {rebase_source_partition_locator}"
+        )
 
     # use the new hash bucket count if provided, or fall back to old count
-    hash_bucket_count =
-
+    hash_bucket_count = (
+        new_hash_bucket_count
+        if new_hash_bucket_count is not None
         else old_hash_bucket_count
+    )
 
     # discover input delta files
-    high_watermark =
-        if round_completion_info else None
+    high_watermark = (
+        round_completion_info.high_watermark if round_completion_info else None
+    )
 
     input_deltas = io.discover_deltas(
         source_partition_locator,
@@ -336,99 +301,72 @@ def _execute_compaction_round(
 
     if not input_deltas:
         logger.info("No input deltas found to compact.")
-        return False, None, None
+        return False, None, None, None
 
     # limit the input deltas to fit on this cluster and convert them to
     # annotated deltas of equivalent size for easy parallel distribution
 
-
-
-
-
-
-
-
-
-
-
+    (
+        uniform_deltas,
+        hash_bucket_count,
+        last_stream_position_compacted,
+    ) = io.limit_input_deltas(
+        input_deltas,
+        cluster_resources,
+        hash_bucket_count,
+        min_pk_index_pa_bytes,
+        min_hash_bucket_chunk_size,
+        input_deltas_stats=input_deltas_stats,
+        deltacat_storage=deltacat_storage,
+    )
 
-
-
-
-
-    #TODO: use stats, otherwise too slow to get all manifest's metadata
-    TOTAL_ENTRIES = 722451
-    TOTAL_DELTAS = len(input_deltas)
-    ray.get(states.update_entry.remote(TOTAL_ENTRIES))
-    ray.get(states.update_delta.remote(TOTAL_DELTAS))
-    logger.info(f"Estimated Rounds:{TOTAL_ENTRIES/uniform_deltas_entries}")
-    TOTAL_ROUNDS = TOTAL_ENTRIES/uniform_deltas_entries
-    ray.get(states.update_round.remote(TOTAL_ROUNDS))
-    TOTAL_ROUNDS = ray.get(states.TOTAL_ROUNDS.remote())
-    TOTAL_ENTRIES = ray.get(states.TOTAL_ENTRIES.remote())
-    TOTAL_DELTAS = ray.get(states.TOTAL_DELTAS.remote())
-    logger.info(f"Round {round_id}/{TOTAL_ROUNDS}: {uniform_deltas_entries}/{TOTAL_ENTRIES} entries in total deltas {TOTAL_DELTAS}")
-    print(f"Round {round_id}/{TOTAL_ROUNDS}: {uniform_deltas_entries}/{TOTAL_ENTRIES} entries in total deltas {TOTAL_DELTAS}")
-
-    assert hash_bucket_count is not None and hash_bucket_count > 0, \
-        f"Unexpected Error: Default hash bucket count ({hash_bucket_count}) " \
-        f"is invalid."
+    assert hash_bucket_count is not None and hash_bucket_count > 0, (
+        f"Expected hash bucket count to be a positive integer, but found "
+        f"`{hash_bucket_count}`"
+    )
 
     # rehash the primary key index if necessary
-    round_completion_info = None
     if round_completion_info:
         logger.info(f"Round completion file contents: {round_completion_info}")
         # the previous primary key index is compatible with the current, but
         # will need to be rehashed if the hash bucket count has changed
         if hash_bucket_count != old_hash_bucket_count:
+            # TODO(draghave): manually test the path after prior primary key
+            # index was already built
             round_completion_info = pki.rehash(
                 round_robin_opt_provider,
                 compaction_artifact_s3_bucket,
                 source_partition_locator,
                 round_completion_info,
                 hash_bucket_count,
-                max_parallelism
+                max_parallelism,
                 records_per_primary_key_index_file,
                 delete_prev_primary_key_index,
             )
-    else:
-        logger.info(f"No prior round completion file found. Source partition: "
-                    f"{source_partition_locator}. Primary key index locator: "
-                    f"{compatible_primary_key_index_locator}")
 
-
-    hb_start = time.time()
-    logger.info(f"adhoc_rootliu, Round {round_id} Pre-Hash bucket took:{hb_start-pre_hb_start} seconds")
-    print(f"adhoc_rootliu, Round {round_id} Pre-Hash bucket took:{hb_start-pre_hb_start} seconds")
     # parallel step 1:
     # group like primary keys together by hashing them into buckets
     hb_tasks_pending = invoke_parallel(
         items=uniform_deltas,
         ray_task=hb.hash_bucket,
-        max_parallelism=max_parallelism
-        num_cpus = num_cpus[0],
+        max_parallelism=max_parallelism,
         options_provider=round_robin_opt_provider,
         primary_keys=primary_keys,
         sort_keys=sort_keys,
         num_buckets=hash_bucket_count,
-        num_groups=max_parallelism
-        ignore_missing_manifest=ignore_missing_manifest,
+        num_groups=max_parallelism,
         deltacat_storage=deltacat_storage,
     )
     logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
     hb_results = ray.get([t[0] for t in hb_tasks_pending])
-    print(f"adhoc_rootliu, Round {round_id} Got {len(hb_results)} hash bucket results.")
     logger.info(f"Got {len(hb_results)} hash bucket results.")
     all_hash_group_idx_to_obj_id = defaultdict(list)
     for hash_group_idx_to_obj_id in hb_results:
         for hash_group_index, object_id in enumerate(hash_group_idx_to_obj_id):
             if object_id:
                 all_hash_group_idx_to_obj_id[hash_group_index].append(object_id)
-    hash_group_count =
+    hash_group_count = len(all_hash_group_idx_to_obj_id)
     logger.info(f"Hash bucket groups created: {hash_group_count}")
-    hb_end = time.time()
-    logger.info(f"adhoc_rootliu, Round {round_id} Hash bucket took:{hb_end-hb_start} seconds")
-    print(f"adhoc_rootliu, Round {round_id} Hash bucket took:{hb_end-hb_start} seconds")
 
     # TODO (pdames): when resources are freed during the last round of hash
     # bucketing, start running dedupe tasks that read existing dedupe
@@ -455,9 +393,11 @@ def _execute_compaction_round(
         _PRIMARY_KEY_INDEX_ALGORITHM_VERSION,
     )
     new_primary_key_index_locator = PrimaryKeyIndexLocator.of(
-        new_primary_key_index_meta
-
-
+        new_primary_key_index_meta
+    )
+    new_primary_key_index_root_path = (
+        new_primary_key_index_locator.primary_key_index_root_path
+    )
 
     # generate a new primary key index version locator for this round
     new_primary_key_index_version_meta = PrimaryKeyIndexVersionMeta.of(
@@ -465,53 +405,48 @@ def _execute_compaction_round(
         hash_bucket_count,
     )
     new_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-        new_primary_key_index_version_meta
-
+        new_primary_key_index_version_meta
+    )
 
     # parallel step 2:
     # discover records with duplicate primary keys in each hash bucket, and
     # identify the index of records to keep or drop based on sort keys
-    num_materialize_buckets = max_parallelism
+    num_materialize_buckets = max_parallelism
     logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")
-    record_counts_pending_materialize = \
-        dd.RecordCountsPendingMaterialize.remote(dedupe_task_count)
     dd_tasks_pending = invoke_parallel(
         items=all_hash_group_idx_to_obj_id.values(),
         ray_task=dd.dedupe,
-        max_parallelism=max_parallelism
-        num_cpus = num_cpus[1],
+        max_parallelism=max_parallelism,
         options_provider=round_robin_opt_provider,
-        kwargs_provider=lambda index, item: {
-
+        kwargs_provider=lambda index, item: {
+            "dedupe_task_index": index,
+            "object_ids": item,
+        },
         compaction_artifact_s3_bucket=compaction_artifact_s3_bucket,
         round_completion_info=round_completion_info,
         new_primary_key_index_version_locator=new_pki_version_locator,
         sort_keys=sort_keys,
         max_records_per_index_file=records_per_primary_key_index_file,
-        max_records_per_materialized_file=records_per_compacted_file,
         num_materialize_buckets=num_materialize_buckets,
         delete_old_primary_key_index=delete_prev_primary_key_index,
-        record_counts_pending_materialize=record_counts_pending_materialize,
     )
     logger.info(f"Getting {len(dd_tasks_pending)} dedupe results...")
     dd_results = ray.get([t[0] for t in dd_tasks_pending])
     logger.info(f"Got {len(dd_results)} dedupe results.")
-    print((f"adhoc_rootliu, Round {round_id} Got {len(dd_results)} dedupe results."))
     all_mat_buckets_to_obj_id = defaultdict(list)
     for mat_bucket_idx_to_obj_id in dd_results:
-        for
-
+        for (
+            bucket_idx,
+            dd_task_index_and_object_id_tuple,
+        ) in mat_bucket_idx_to_obj_id.items():
             all_mat_buckets_to_obj_id[bucket_idx].append(
-                dd_task_index_and_object_id_tuple
+                dd_task_index_and_object_id_tuple
+            )
     logger.info(f"Getting {len(dd_tasks_pending)} dedupe result stat(s)...")
     pki_stats = ray.get([t[2] for t in dd_tasks_pending])
     logger.info(f"Got {len(pki_stats)} dedupe result stat(s).")
-    logger.info(f"Materialize buckets created: "
-                f"{len(all_mat_buckets_to_obj_id)}")
+    logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")
 
-    dd_end = time.time()
-    logger.info(f"adhoc_rootliu, Round {round_id} dedupe took:{dd_end-hb_end} seconds")
-    print(f"adhoc_rootliu, Round {round_id} dedupe took:{dd_end-hb_end} seconds")
     # TODO(pdames): when resources are freed during the last round of deduping
     # start running materialize tasks that read materialization source file
     # tables from S3 then wait for deduping to finish before continuing
@@ -527,12 +462,11 @@ def _execute_compaction_round(
     mat_tasks_pending = invoke_parallel(
         items=all_mat_buckets_to_obj_id.items(),
         ray_task=mat.materialize,
-        max_parallelism=max_parallelism
-        num_cpus = num_cpus[2],
+        max_parallelism=max_parallelism,
        options_provider=round_robin_opt_provider,
-        kwargs_provider=lambda index,
-            "mat_bucket_index":
-            "dedupe_task_idx_and_obj_id_tuples":
+        kwargs_provider=lambda index, mat_bucket_index_to_obj_id: {
+            "mat_bucket_index": mat_bucket_index_to_obj_id[0],
+            "dedupe_task_idx_and_obj_id_tuples": mat_bucket_index_to_obj_id[1],
         },
         schema=schema_on_read,
         round_completion_info=round_completion_info,
@@ -545,43 +479,52 @@ def _execute_compaction_round(
     logger.info(f"Getting {len(mat_tasks_pending)} materialize result(s)...")
     mat_results = ray.get(mat_tasks_pending)
     logger.info(f"Got {len(mat_results)} materialize result(s).")
-    print(f"adhoc_rootliu, Round {round_id} Got {len(mat_results)} materialize result(s).")
 
-    mat_end = time.time()
-    logger.info(f"adhoc_rootliu, Round {round_id} mat took:{mat_end-dd_end} seconds")
-    print(f"adhoc_rootliu, Round {round_id} mat took:{mat_end-dd_end} seconds")
     mat_results = sorted(mat_results, key=lambda m: m.task_index)
     deltas = [m.delta for m in mat_results]
     merged_delta = Delta.merge_deltas(deltas)
     compacted_delta = deltacat_storage.commit_delta(merged_delta)
     logger.info(f"Committed compacted delta: {compacted_delta}")
-
-    logger.info(f"adhoc_rootliu, Round {round_id} commit took:{commit_end-mat_end} seconds")
-    print(f"adhoc_rootliu, Round {round_id} commit took:{commit_end-mat_end} seconds")
+
     new_compacted_delta_locator = DeltaLocator.of(
         new_compacted_partition_locator,
         compacted_delta.stream_position,
     )
 
-
-
+    rci_high_watermark = (
+        rebase_source_partition_high_watermark
+        if rebase_source_partition_high_watermark
+        else last_stream_position_compacted
+    )
+    new_round_completion_info = RoundCompletionInfo.of(
+        rci_high_watermark,
         new_compacted_delta_locator,
-        PyArrowWriteResult.union([m.pyarrow_write_result
-                                  for m in mat_results]),
+        PyArrowWriteResult.union([m.pyarrow_write_result for m in mat_results]),
         PyArrowWriteResult.union(pki_stats),
         bit_width_of_sort_keys,
         new_pki_version_locator,
+        rebase_source_partition_locator
+        or round_completion_info.rebase_source_partition_locator,
     )
-
+    rcf_source_partition_locator = (
+        rebase_source_partition_locator
+        if rebase_source_partition_locator
+        else source_partition_locator
+    )
+    round_completion_file_s3_url = rcf.write_round_completion_file(
         compaction_artifact_s3_bucket,
-
+        rcf_source_partition_locator,
         new_primary_key_index_root_path,
-
+        new_round_completion_info,
+    )
+    logger.info(
+        f"partition-{source_partition_locator.partition_values},"
+        f"compacted at: {last_stream_position_compacted},"
+        f"last position: {last_stream_position_to_compact}"
+    )
+    return (
+        (last_stream_position_compacted < last_stream_position_to_compact),
+        partition,
+        new_round_completion_info,
+        round_completion_file_s3_url,
     )
-    time_mat_e = time.time()
-    logger.info(f"partition-{source_partition_locator.partition_values},compacted at:{last_stream_position_compacted}, last position:{last_stream_position_to_compact}")
-    return \
-        (last_stream_position_compacted < last_stream_position_to_compact), \
-        partition, \
-        round_completion_info
-