deltacat 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +188 -218
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +259 -316
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +152 -259
- deltacat/compute/compactor/steps/hash_bucket.py +57 -73
- deltacat/compute/compactor/steps/materialize.py +138 -99
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +131 -90
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -42
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +8 -10
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +276 -231
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +38 -32
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA +22 -12
- deltacat-0.1.11.dist-info/RECORD +110 -0
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
- deltacat/autoscaler/events/__init__.py +0 -0
- deltacat/autoscaler/events/compaction/__init__.py +0 -0
- deltacat/autoscaler/events/compaction/cluster.py +0 -82
- deltacat/autoscaler/events/compaction/collections/__init__.py +0 -0
- deltacat/autoscaler/events/compaction/collections/partition_key_value.py +0 -36
- deltacat/autoscaler/events/compaction/dispatcher.py +0 -28
- deltacat/autoscaler/events/compaction/input.py +0 -27
- deltacat/autoscaler/events/compaction/process.py +0 -25
- deltacat/autoscaler/events/compaction/session_manager.py +0 -13
- deltacat/autoscaler/events/compaction/utils.py +0 -216
- deltacat/autoscaler/events/compaction/workflow.py +0 -303
- deltacat/autoscaler/events/dispatcher.py +0 -95
- deltacat/autoscaler/events/dynamodb/__init__.py +0 -0
- deltacat/autoscaler/events/dynamodb/event_store.py +0 -164
- deltacat/autoscaler/events/event_store.py +0 -55
- deltacat/autoscaler/events/exceptions.py +0 -6
- deltacat/autoscaler/events/processor.py +0 -177
- deltacat/autoscaler/events/session_manager.py +0 -25
- deltacat/autoscaler/events/states.py +0 -88
- deltacat/autoscaler/events/workflow.py +0 -54
- deltacat/autoscaler/node_group.py +0 -230
- deltacat/autoscaler/utils.py +0 -69
- deltacat-0.1.8.dist-info/RECORD +0 -131
- /deltacat/{autoscaler → tests/utils}/__init__.py +0 -0
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/dedupe.py

@@ -1,26 +1,29 @@
 import logging
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
 import pyarrow as pa
-import ray
-import time
 import pyarrow.compute as pc
-import
-from deltacat.compute.compactor.utils.system_columns import get_minimal_hb_schema
-from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
+import ray
 from ray import cloudpickle
 from ray.types import ObjectRef
 
 from deltacat import logs
-from
-
-
-
-
-
-
-
-
-from
-from
+from deltacat.compute.compactor import (
+    DeltaFileEnvelope,
+    DeltaFileLocator,
+    PrimaryKeyIndexVersionLocator,
+    PyArrowWriteResult,
+    RoundCompletionInfo,
+    SortKey,
+    SortOrder,
+)
+from deltacat.compute.compactor.utils import primary_key_index as pki
+from deltacat.compute.compactor.utils import system_columns as sc
+from deltacat.compute.compactor.utils.system_columns import get_minimal_hb_schema
+from deltacat.utils.performance import timed_invocation
+from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -32,18 +35,21 @@ DedupeTaskIndexWithObjectId = Tuple[DedupeTaskIndex, PickledObjectRef]
 DedupeResult = Tuple[
     Dict[MaterializeBucketIndex, DedupeTaskIndexWithObjectId],
     List[ObjectRef[DeltaFileLocatorToRecords]],
-    PyArrowWriteResult
+    PyArrowWriteResult,
 ]
 
 
-def
-
-
-
-
+def _union_primary_key_indices(
+    s3_bucket: str,
+    round_completion_info: RoundCompletionInfo,
+    hash_bucket_index: int,
+    df_envelopes_list: List[List[DeltaFileEnvelope]],
+) -> pa.Table:
 
-    logger.info(
-
+    logger.info(
+        f"[Hash bucket index {hash_bucket_index}] Reading dedupe input for "
+        f"{len(df_envelopes_list)} delta file envelope lists..."
+    )
     # read compacted input parquet files first
     # (which implicitly have older stream positions than deltas)
     hb_tables = []
@@ -53,37 +59,15 @@ def union_primary_key_indices(
         hash_bucket_index,
         round_completion_info.primary_key_index_version_locator,
         # Enforce consistent column ordering by reading from a schema, to prevent schema mismatch errors
-        file_reader_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
+        file_reader_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
+            schema=get_minimal_hb_schema()
+        ),
     )
     if tables:
-        prev_compacted_delta_stream_pos = round_completion_info\
-            .compacted_delta_locator \
-            .stream_position
-        if prev_compacted_delta_stream_pos is None:
-            raise ValueError(f"Unexpected Error: No previous compacted "
-                             f"delta stream position found in round "
-                             f"completion info: {round_completion_info}")
         prior_pk_index_table = pa.concat_tables(tables)
-
-
-
-                prev_compacted_delta_stream_pos,
-                len(prior_pk_index_table),
-            ),
-        )
-        prior_pk_index_table = sc.append_delta_type_col(
-            prior_pk_index_table,
-            repeat(
-                sc.delta_type_to_field(DeltaType.UPSERT),
-                len(prior_pk_index_table),
-            )
-        )
-        prior_pk_index_table = sc.append_is_source_col(
-            prior_pk_index_table,
-            repeat(
-                False,
-                len(prior_pk_index_table),
-            )
+        logger.info(
+            f"Number of records in prior primary index for hash bucket"
+            f" {hash_bucket_index}: {prior_pk_index_table.num_rows}"
         )
         hb_tables.append(prior_pk_index_table)
 
@@ -99,114 +83,56 @@ def union_primary_key_indices(
 
     hb_table = pa.concat_tables(hb_tables)
 
+    logger.info(
+        f"Total records in hash bucket {hash_bucket_index} is {hb_table.num_rows}"
+    )
     return hb_table
 
 
-def
+def _drop_duplicates_by_primary_key_hash(table: pa.Table) -> pa.Table:
     value_to_last_row_idx = {}
-
-
-
-
+
+    pk_hash_np = sc.pk_hash_column_np(table)
+    op_type_np = sc.delta_type_column_np(table)
+
+    assert len(pk_hash_np) == len(op_type_np), (
+        f"Primary key digest column length ({len(pk_hash_np)}) doesn't "
+        f"match delta type column length ({len(op_type_np)})."
     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # TODO(raghumdani): move the dedupe to C++ using arrow methods or similar.
+    row_idx = 0
+    pk_op_val_iter = zip(pk_hash_np, op_type_np)
+    for (pk_val, op_val) in pk_op_val_iter:
+
+        # operation type is True for `UPSERT` and False for `DELETE`
+        if op_val:
+            # UPSERT this row
+            value_to_last_row_idx[pk_val] = row_idx
+        else:
+            # DELETE this row
+            value_to_last_row_idx.pop(pk_val, None)
+
+        row_idx += 1
+
     return table.take(list(value_to_last_row_idx.values()))
 
 
-def
-
-
-
-
-
-
-
-
-
-
-    logger.info(f"Writing new deduped primary key index: "
-                f"{new_primary_key_index_version_locator}")
-    # TODO (pdames): move to RecordCountsPendingMaterialize.finalize()?
-    file_idx = 0
-    prev_file_idx = 0
-    dest_file_indices = defaultdict(
-        lambda: defaultdict(
-            lambda: defaultdict(int)
-        )
-    )
-    dest_file_row_indices = defaultdict(
-        lambda: defaultdict(
-            lambda: defaultdict(int)
-        )
+def _write_new_primary_key_index(
+    s3_bucket: str,
+    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    max_rows_per_index_file: int,
+    dedupe_task_index: int,
+    deduped_tables: List[Tuple[int, pa.Table]],
+) -> PyArrowWriteResult:
+
+    logger.info(
+        f"[Dedupe task index {dedupe_task_index}] Writing new deduped primary key index: "
+        f"{new_primary_key_index_version_locator}"
     )
-    for mat_bucket in sorted(row_counts.keys()):
-        mat_bucket_row_idx = 0
-        sorted_src_dfls = sorted(row_counts[mat_bucket].keys())
-        for src_dfl in sorted_src_dfls:
-            sorted_dd_tasks = sorted(row_counts[mat_bucket][src_dfl].keys())
-            for dd_task_idx in sorted_dd_tasks:
-                dest_file_row_indices[mat_bucket][src_dfl][dd_task_idx] = \
-                    mat_bucket_row_idx % max_rows_per_mat_file
-                file_idx = prev_file_idx + int(
-                    mat_bucket_row_idx / max_rows_per_mat_file
-                )
-                dest_file_indices[mat_bucket][src_dfl][dd_task_idx] = file_idx
-                row_count = row_counts[mat_bucket][src_dfl][dd_task_idx]
-                mat_bucket_row_idx += row_count
-            prev_file_idx = file_idx + 1
 
     pki_results = []
-    src_dfl_row_counts = defaultdict(int)
     for hb_index, table in deduped_tables:
-        is_source_col = sc.is_source_column_np(table)
-        stream_pos_col = sc.stream_position_column_np(table)
-        file_idx_col = sc.file_index_column_np(table)
-        dest_file_idx_col = []
-        dest_file_row_idx_col = []
-        for row_idx in range(len(table)):
-            src_dfl = DeltaFileLocator.of(
-                is_source_col[row_idx],
-                stream_pos_col[row_idx],
-                file_idx_col[row_idx],
-            )
-            mat_bucket = delta_file_locator_to_mat_bucket_index(
-                src_dfl,
-                num_materialize_buckets,
-            )
-            dest_file_start_idx = \
-                dest_file_indices[mat_bucket][src_dfl][dedupe_task_index]
-            dest_file_row_idx_offset = src_dfl_row_counts[src_dfl] + \
-                dest_file_row_indices[mat_bucket][src_dfl][dedupe_task_index]
-            dest_file_idx_offset = int(
-                dest_file_row_idx_offset / max_rows_per_mat_file
-            )
-            dest_file_idx = dest_file_start_idx + dest_file_idx_offset
-            dest_file_idx_col.append(dest_file_idx)
-            dest_file_row_idx = dest_file_row_idx_offset % max_rows_per_mat_file
-            dest_file_row_idx_col.append(dest_file_row_idx)
-            src_dfl_row_counts[src_dfl] += 1
-        table = table.drop([
-            sc._IS_SOURCE_COLUMN_NAME,
-            sc._PARTITION_STREAM_POSITION_COLUMN_NAME,
-            sc._ORDERED_FILE_IDX_COLUMN_NAME,
-            sc._ORDERED_RECORD_IDX_COLUMN_NAME,
-        ])
-        table = sc.append_file_idx_column(table, dest_file_idx_col)
-        table = sc.append_record_idx_col(table, dest_file_row_idx_col)
-
         hb_pki_result = pki.write_primary_key_index_files(
             table,
             new_primary_key_index_version_locator,
@@ -217,77 +143,44 @@ def write_new_primary_key_index(
         pki_results.append(hb_pki_result)
 
     result = PyArrowWriteResult.union(pki_results)
-    logger.info(
-
+    logger.info(
+        f"[Dedupe task index {dedupe_task_index}] Wrote new deduped primary key index: "
+        f"{new_primary_key_index_version_locator}. Result: {result}"
+    )
     return result
 
 
 def delta_file_locator_to_mat_bucket_index(
-
-
+    df_locator: DeltaFileLocator, materialize_bucket_count: int
+) -> int:
     digest = df_locator.digest()
     return int.from_bytes(digest, "big") % materialize_bucket_count
 
 
-@ray.remote(num_cpus=0.1)
-class RecordCountsPendingMaterialize:
-    def __init__(self, expected_result_count: int):
-        # materialize_bucket -> src_file_id
-        self.record_counts = defaultdict(
-            # delta_file_locator -> dedupe task index
-            lambda: defaultdict(
-                # dedupe task index -> row count
-                lambda: defaultdict(int)
-            )
-        )
-        self.expected_result_count = expected_result_count
-        self.actual_result_count = 0
-
-    def add_record_counts(
-            self,
-            result_idx: int,
-            record_counts:
-            Dict[int, Dict[Tuple[np.bool_, np.int64, np.int32], int]]) -> None:
-        for mat_bucket, df_locator_rows in record_counts.items():
-            for df_locator, rows in df_locator_rows.items():
-                self.record_counts[mat_bucket][df_locator][result_idx] += rows
-        self.actual_result_count += 1
-
-    def get_record_counts(self) -> \
-            Dict[int, Dict[Tuple[np.bool_, np.int64, np.int32],
-                           Dict[int, int]]]:
-        return self.record_counts
-
-    def get_expected_result_count(self) -> int:
-        return self.expected_result_count
-
-    def get_actual_result_count(self) -> int:
-        return self.actual_result_count
-
-    def is_finalized(self) -> bool:
-        return self.actual_result_count == self.expected_result_count
-
-
 @ray.remote(num_returns=3)
 def dedupe(
-
-
-
-
-
-
-
-
-
-
-
-
-    logger.info(f"Starting dedupe task...")
+    compaction_artifact_s3_bucket: str,
+    round_completion_info: Optional[RoundCompletionInfo],
+    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    object_ids: List[Any],
+    sort_keys: List[SortKey],
+    max_records_per_index_file: int,
+    num_materialize_buckets: int,
+    dedupe_task_index: int,
+    delete_old_primary_key_index: bool,
+) -> DedupeResult:
+
+    logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     # TODO (pdames): mitigate risk of running out of memory here in cases of
     # severe skew of primary key updates in deltas
     src_file_records_obj_refs = [
-        cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
-
+        cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
+    ]
+    logger.info(
+        f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
+        f"groups for {len(src_file_records_obj_refs)} object refs..."
+    )
+
     delta_file_envelope_groups_list = ray.get(src_file_records_obj_refs)
     hb_index_to_delta_file_envelopes_list = defaultdict(list)
     for delta_file_envelope_groups in delta_file_envelope_groups_list:
@@ -296,36 +189,51 @@ def dedupe(
                 hb_index_to_delta_file_envelopes_list[hb_idx].append(dfes)
     src_file_id_to_row_indices = defaultdict(list)
     deduped_tables = []
-    logger.info(
-
+    logger.info(
+        f"[Dedupe task {dedupe_task_index}] Running {len(hb_index_to_delta_file_envelopes_list)} "
+        f"dedupe rounds..."
+    )
     for hb_idx, dfe_list in hb_index_to_delta_file_envelopes_list.items():
-
-
-
-
-
+        logger.info(f"{dedupe_task_index}: union primary keys for hb_index: {hb_idx}")
+
+        table, union_time = timed_invocation(
+            func=_union_primary_key_indices,
+            s3_bucket=compaction_artifact_s3_bucket,
+            round_completion_info=round_completion_info,
+            hash_bucket_index=hb_idx,
+            df_envelopes_list=dfe_list,
+        )
+        logger.info(
+            f"[Dedupe {dedupe_task_index}] Dedupe round input "
+            f"record count: {len(table)}, took {union_time}s"
         )
-        logger.info(f"Dedupe round input record count: {len(table)}")
 
         # sort by sort keys
         if len(sort_keys):
             # TODO (pdames): convert to O(N) dedupe w/ sort keys
-            sort_keys.extend(
-
-
-
-
-
-
-
-                ),
-            ])
+            sort_keys.extend(
+                [
+                    SortKey.of(
+                        sc._PARTITION_STREAM_POSITION_COLUMN_NAME, SortOrder.ASCENDING
+                    ),
+                    SortKey.of(sc._ORDERED_FILE_IDX_COLUMN_NAME, SortOrder.ASCENDING),
+                ]
+            )
             table = table.take(pc.sort_indices(table, sort_keys=sort_keys))
 
         # drop duplicates by primary key hash column
-
-
-
+        logger.info(
+            f"[Dedupe task index {dedupe_task_index}] Dropping duplicates for {hb_idx}"
+        )
+
+        table, drop_time = timed_invocation(
+            func=_drop_duplicates_by_primary_key_hash, table=table
+        )
+
+        logger.info(
+            f"[Dedupe task index {dedupe_task_index}] Dedupe round output "
+            f"record count: {len(table)}, took: {drop_time}s"
        )
 
         deduped_tables.append((hb_idx, table))
 
@@ -344,7 +252,9 @@ def dedupe(
 
     logger.info(f"Finished all dedupe rounds...")
     mat_bucket_to_src_file_record_count = defaultdict(dict)
-    mat_bucket_to_src_file_records: Dict[
+    mat_bucket_to_src_file_records: Dict[
+        MaterializeBucketIndex, DeltaFileLocatorToRecords
+    ] = defaultdict(dict)
     for src_dfl, src_row_indices in src_file_id_to_row_indices.items():
         mat_bucket = delta_file_locator_to_mat_bucket_index(
             src_dfl,
@@ -353,48 +263,33 @@ def dedupe(
         mat_bucket_to_src_file_records[mat_bucket][src_dfl] = np.array(
             src_row_indices,
         )
-        mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] =
-            len(src_row_indices)
+        mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = len(src_row_indices)
 
-    mat_bucket_to_dd_idx_obj_id: Dict[
+    mat_bucket_to_dd_idx_obj_id: Dict[
+        MaterializeBucketIndex, DedupeTaskIndexWithObjectId
+    ] = {}
     src_file_records_obj_refs: List[ObjectRef[DeltaFileLocatorToRecords]] = []
     for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
         object_ref = ray.put(src_file_records)
-
+        pickled_object_ref = cloudpickle.dumps(object_ref)
+        src_file_records_obj_refs.append(pickled_object_ref)
         mat_bucket_to_dd_idx_obj_id[mat_bucket] = (
             dedupe_task_index,
-
+            pickled_object_ref,
         )
-
-
-
-    record_counts_pending_materialize.add_record_counts.remote(
-        dedupe_task_index,
-        mat_bucket_to_src_file_record_count,
-    )
-
-    # wait for all dedupe tasks to reach this point before continuing
+        del object_ref
+        del pickled_object_ref
     logger.info(
-        f"
-
-    while not finalized:
-        finalized = ray.get(
-            record_counts_pending_materialize.is_finalized.remote()
-        )
-        time.sleep(0.25)
-    record_counts = ray.get(
-        record_counts_pending_materialize.get_record_counts.remote()
+        f"Count of materialize buckets with object refs: "
+        f"{len(mat_bucket_to_dd_idx_obj_id)}"
     )
 
-    write_pki_result: PyArrowWriteResult =
+    write_pki_result: PyArrowWriteResult = _write_new_primary_key_index(
         compaction_artifact_s3_bucket,
         new_primary_key_index_version_locator,
         max_records_per_index_file,
-        max_records_per_materialized_file,
-        num_materialize_buckets,
         dedupe_task_index,
         deduped_tables,
-        record_counts,
     )
 
     if delete_old_primary_key_index:
@@ -402,7 +297,5 @@ def dedupe(
             compaction_artifact_s3_bucket,
             round_completion_info.primary_key_index_version_locator,
         )
-    logger.info(f"Finished dedupe task...")
-    return mat_bucket_to_dd_idx_obj_id,
-        src_file_records_obj_refs, \
-        write_pki_result
+    logger.info(f"[Dedupe task index {dedupe_task_index}] Finished dedupe task...")
+    return mat_bucket_to_dd_idx_obj_id, src_file_records_obj_refs, write_pki_result