deltacat 0.1.18b3__py3-none-any.whl → 0.1.18b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/compaction_session.py +184 -29
  3. deltacat/compute/compactor/model/compact_partition_params.py +153 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +725 -0
  5. deltacat/compute/compactor/model/dedupe_result.py +3 -0
  6. deltacat/compute/compactor/model/delta_file_envelope.py +8 -0
  7. deltacat/compute/compactor/model/delta_file_locator.py +11 -6
  8. deltacat/compute/compactor/model/hash_bucket_result.py +3 -0
  9. deltacat/compute/compactor/model/materialize_result.py +27 -6
  10. deltacat/compute/compactor/model/round_completion_info.py +9 -0
  11. deltacat/compute/compactor/steps/dedupe.py +35 -19
  12. deltacat/compute/compactor/steps/hash_bucket.py +41 -16
  13. deltacat/compute/compactor/steps/materialize.py +73 -70
  14. deltacat/compute/compactor/utils/io.py +15 -0
  15. deltacat/compute/compactor/utils/primary_key_index.py +9 -15
  16. deltacat/compute/compactor/utils/round_completion_file.py +13 -4
  17. deltacat/compute/compactor/utils/system_columns.py +32 -0
  18. deltacat/io/__init__.py +0 -7
  19. deltacat/io/file_object_store.py +48 -0
  20. deltacat/io/memcached_object_store.py +121 -0
  21. deltacat/io/object_store.py +51 -0
  22. deltacat/io/ray_plasma_object_store.py +23 -0
  23. deltacat/io/redis_object_store.py +114 -0
  24. deltacat/io/s3_object_store.py +44 -0
  25. deltacat/storage/model/delta.py +2 -1
  26. deltacat/tests/compactor/test_compact_partition_params.py +237 -0
  27. deltacat/tests/compactor/utils/test_io.py +27 -5
  28. deltacat/tests/io/__init__.py +0 -0
  29. deltacat/tests/io/test_file_object_store.py +86 -0
  30. deltacat/tests/io/test_memcached_object_store.py +158 -0
  31. deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
  32. deltacat/tests/io/test_redis_object_store.py +103 -0
  33. deltacat/tests/io/test_s3_object_store.py +59 -0
  34. deltacat/tests/utils/test_record_batch_tables.py +1 -1
  35. deltacat/tests/utils/test_resources.py +9 -0
  36. deltacat/utils/ray_utils/concurrency.py +0 -2
  37. deltacat/utils/resources.py +30 -18
  38. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/METADATA +3 -1
  39. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/RECORD +42 -27
  40. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/LICENSE +0 -0
  41. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/WHEEL +0 -0
  42. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/top_level.txt +0 -0

deltacat/compute/compactor/model/dedupe_result.py
@@ -6,3 +6,6 @@ import numpy as np
 class DedupeResult(NamedTuple):
     mat_bucket_idx_to_obj_id: Dict[int, Tuple]
     deduped_record_count: np.int64
+    peak_memory_usage_bytes: np.double
+    telemetry_time_in_seconds: np.double
+    task_completed_at: np.double
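
The new trailing fields on DedupeResult (and the matching HashBucketResult below) carry per-task telemetry back to the compaction session. A minimal, self-contained sketch of how a result carrying them might be built and read; the NamedTuple shape mirrors this diff, while the values are illustrative:

    import time
    from typing import Dict, NamedTuple, Tuple

    import numpy as np


    class DedupeResult(NamedTuple):
        # Shape mirrors deltacat/compute/compactor/model/dedupe_result.py after this diff.
        mat_bucket_idx_to_obj_id: Dict[int, Tuple]
        deduped_record_count: np.int64
        peak_memory_usage_bytes: np.double
        telemetry_time_in_seconds: np.double
        task_completed_at: np.double


    result = DedupeResult(
        mat_bucket_idx_to_obj_id={0: (0, "object-ref")},  # illustrative object ref
        deduped_record_count=np.int64(128),
        peak_memory_usage_bytes=np.double(512 * 1024 * 1024),
        telemetry_time_in_seconds=np.double(0.02),
        task_completed_at=np.double(time.time()),
    )
    print(result.deduped_record_count, result.peak_memory_usage_bytes)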

deltacat/compute/compactor/model/delta_file_envelope.py
@@ -5,6 +5,8 @@ import numpy as np
 
 from deltacat.storage import DeltaType, LocalTable
 
+from typing import Optional
+
 DeltaFileEnvelopeGroups = np.ndarray
 
 
@@ -16,6 +18,7 @@ class DeltaFileEnvelope(dict):
         delta_type: DeltaType,
         table: LocalTable,
         is_src_delta: np.bool_ = True,
+        file_record_count: Optional[int] = None,
     ) -> DeltaFileEnvelope:
         """Static factory builder for a Delta File Envelope
         `
@@ -46,6 +49,7 @@ class DeltaFileEnvelope(dict):
         delta_file_envelope["deltaType"] = delta_type.value
         delta_file_envelope["table"] = table
         delta_file_envelope["is_src_delta"] = is_src_delta
+        delta_file_envelope["file_record_count"] = file_record_count
         return delta_file_envelope
 
     @property
@@ -67,3 +71,7 @@ class DeltaFileEnvelope(dict):
     @property
     def is_src_delta(self) -> np.bool_:
         return self["is_src_delta"]
+
+    @property
+    def file_record_count(self) -> int:
+        return self["file_record_count"]
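
A hedged sketch of the widened factory call, assuming deltacat 0.1.18b7 and pyarrow are installed; the import path follows the module shown in this diff and the table contents are illustrative:

    import pyarrow as pa

    from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
    from deltacat.storage import DeltaType

    table = pa.table({"pk": [1, 2, 3]})
    dfe = DeltaFileEnvelope.of(
        stream_position=1,
        file_index=0,
        delta_type=DeltaType.UPSERT,
        table=table,
        file_record_count=len(table),  # new optional field added in this diff
    )
    assert dfe.file_record_count == 3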

deltacat/compute/compactor/model/delta_file_locator.py
@@ -5,11 +5,16 @@ import numpy as np
 
 from deltacat.storage import Locator
 
+from typing import Optional
+
 
 class DeltaFileLocator(Locator, tuple):
     @staticmethod
     def of(
-        is_src_delta: np.bool_, stream_position: np.int64, file_index: np.int32
+        is_src_delta: np.bool_,
+        stream_position: np.int64,
+        file_index: np.int32,
+        file_record_count: Optional[np.int64] = None,
     ) -> DeltaFileLocator:
         """
         Create a Delta File Locator tuple that can be used to uniquely identify
@@ -31,11 +36,7 @@ class DeltaFileLocator(Locator, tuple):
             (is_source_delta, stream_position, file_index).
         """
         return DeltaFileLocator(
-            (
-                is_src_delta,
-                stream_position,
-                file_index,
-            )
+            (is_src_delta, stream_position, file_index, file_record_count)
         )
 
     @property
@@ -50,6 +51,10 @@ class DeltaFileLocator(Locator, tuple):
     def file_index(self) -> np.int32:
         return self[2]
 
+    @property
+    def file_record_count(self) -> np.int64:
+        return self[3]
+
     def canonical_string(self) -> str:
         """
         Returns a unique string for the given locator that can be used
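
A small sketch of the extended locator tuple, assuming deltacat 0.1.18b7; the numeric values are illustrative:

    import numpy as np

    from deltacat.compute.compactor.model.delta_file_locator import DeltaFileLocator

    dfl = DeltaFileLocator.of(
        np.bool_(True),   # is_src_delta
        np.int64(100),    # stream_position
        np.int32(0),      # file_index
        np.int64(1024),   # file_record_count (new fourth element, defaults to None)
    )
    assert dfl.file_record_count == 1024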

deltacat/compute/compactor/model/hash_bucket_result.py
@@ -6,3 +6,6 @@ import numpy as np
 class HashBucketResult(NamedTuple):
     hash_bucket_group_to_obj_id: np.ndarray
     hb_record_count: np.int64
+    peak_memory_usage_bytes: np.double
+    telemetry_time_in_seconds: np.double
+    task_completed_at: np.double

deltacat/compute/compactor/model/materialize_result.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 from typing import Any, Dict, Optional
+import numpy as np
 
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
 from deltacat.storage import Delta
@@ -13,15 +14,19 @@ class MaterializeResult(dict):
         delta: Delta,
         task_index: int,
         pyarrow_write_result: PyArrowWriteResult,
-        count_of_src_dfl_not_touched: Optional[int] = 0,
-        count_of_src_dfl: Optional[int] = 0,
+        referenced_pyarrow_write_result: Optional[PyArrowWriteResult] = None,
+        peak_memory_usage_bytes: Optional[np.double] = None,
+        telemetry_time_in_seconds: Optional[np.double] = None,
+        task_completed_at: Optional[np.double] = None,
     ) -> MaterializeResult:
         materialize_result = MaterializeResult()
         materialize_result["delta"] = delta
         materialize_result["taskIndex"] = task_index
         materialize_result["paWriteResult"] = pyarrow_write_result
-        materialize_result["countOfSrcFileNotTouched"] = count_of_src_dfl_not_touched
-        materialize_result["countOfSrcFile"] = count_of_src_dfl
+        materialize_result["referencedPaWriteResult"] = referenced_pyarrow_write_result
+        materialize_result["peakMemoryUsageBytes"] = peak_memory_usage_bytes
+        materialize_result["telemetryTimeInSeconds"] = telemetry_time_in_seconds
+        materialize_result["taskCompletedAt"] = task_completed_at
         return materialize_result
 
     @property
@@ -35,6 +40,14 @@ class MaterializeResult(dict):
     def task_index(self) -> int:
         return self["taskIndex"]
 
+    @property
+    def peak_memory_usage_bytes(self) -> Optional[np.double]:
+        return self["peakMemoryUsageBytes"]
+
+    @property
+    def telemetry_time_in_seconds(self) -> Optional[np.double]:
+        return self["telemetryTimeInSeconds"]
+
     @property
     def pyarrow_write_result(self) -> PyArrowWriteResult:
         val: Dict[str, Any] = self.get("paWriteResult")
@@ -47,5 +60,13 @@
         return self["countOfSrcFileNotTouched"]
 
     @property
-    def count_of_src_dfl(self) -> int:
-        return self["countOfSrcFile"]
+    def referenced_pyarrow_write_result(self) -> PyArrowWriteResult:
+        val: Dict[str, Any] = self.get("referencedPaWriteResult")
+        if val is not None and not isinstance(val, PyArrowWriteResult):
+            self["referencedPaWriteResult"] = val = PyArrowWriteResult(val)
+
+        return val
+
+    @property
+    def task_completed_at(self) -> Optional[np.double]:
+        return self["taskCompletedAt"]
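
Because MaterializeResult is a dict subclass, the new referenced_pyarrow_write_result property re-wraps a value that round-tripped through plain-dict serialization, mirroring the existing paWriteResult behavior. A sketch under that assumption; the keys passed to PyArrowWriteResult here are illustrative:

    from deltacat.compute.compactor.model.materialize_result import MaterializeResult
    from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult

    # A result whose nested write stats came back as a plain dict (illustrative keys).
    mr = MaterializeResult({"referencedPaWriteResult": {"files": 1, "records": 10}})

    ref = mr.referenced_pyarrow_write_result
    assert isinstance(ref, PyArrowWriteResult)  # wrapped lazily on first access
    assert isinstance(mr["referencedPaWriteResult"], PyArrowWriteResult)  # cached back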

deltacat/compute/compactor/model/round_completion_info.py
@@ -3,6 +3,9 @@ from __future__ import annotations
 
 from deltacat.storage import DeltaLocator, PartitionLocator
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
+from deltacat.compute.compactor.model.compaction_session_audit_info import (
+    CompactionSessionAuditInfo,
+)
 from typing import Any, Dict, Optional
 
 
@@ -39,6 +42,7 @@ class RoundCompletionInfo(dict):
         sort_keys_bit_width: int,
         rebase_source_partition_locator: Optional[PartitionLocator],
         manifest_entry_copied_by_reference_ratio: Optional[float] = None,
+        compaction_audit_url: Optional[str] = None,
     ) -> RoundCompletionInfo:
 
         rci = RoundCompletionInfo()
@@ -50,6 +54,7 @@
         rci[
             "manifestEntryCopiedByReferenceRatio"
         ] = manifest_entry_copied_by_reference_ratio
+        rci["compactionAuditUrl"] = compaction_audit_url
         return rci
 
     @property
@@ -81,6 +86,10 @@ class RoundCompletionInfo(dict):
     def sort_keys_bit_width(self) -> int:
         return self["sortKeysBitWidth"]
 
+    @property
+    def compaction_audit(self) -> Optional[CompactionSessionAuditInfo]:
+        return self.get("compactionAudit")
+
     @property
     def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
         return self.get("rebaseSourcePartitionLocator")

deltacat/compute/compactor/steps/dedupe.py
@@ -1,14 +1,14 @@
 import importlib
 import logging
+from typing import Optional
+import time
 from collections import defaultdict
 from contextlib import nullcontext
 from typing import Any, Dict, List, Tuple
-
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
 import ray
-from ray import cloudpickle
 
 from deltacat import logs
 from deltacat.compute.compactor import (
@@ -25,6 +25,8 @@ from deltacat.utils.ray_utils.runtime import (
 )
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.io.object_store import IObjectStore
+from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
 
 if importlib.util.find_spec("memray"):
     import memray
@@ -105,6 +107,7 @@ def _timed_dedupe(
     num_materialize_buckets: int,
     dedupe_task_index: int,
     enable_profiler: bool,
+    object_store: Optional[IObjectStore],
 ):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
@@ -113,15 +116,12 @@ def _timed_dedupe(
     ) if enable_profiler else nullcontext():
         # TODO (pdames): mitigate risk of running out of memory here in cases of
         # severe skew of primary key updates in deltas
-        src_file_records_obj_refs = [
-            cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
-        ]
         logger.info(
             f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
-            f"groups for {len(src_file_records_obj_refs)} object refs..."
+            f"groups for {len(object_ids)} object refs..."
         )
 
-        delta_file_envelope_groups_list = ray.get(src_file_records_obj_refs)
+        delta_file_envelope_groups_list = object_store.get_many(object_ids)
         hb_index_to_delta_file_envelopes_list = defaultdict(list)
         for delta_file_envelope_groups in delta_file_envelope_groups_list:
             for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -188,17 +188,18 @@ def _timed_dedupe(
             file_idx_col = sc.file_index_column_np(table)
             row_idx_col = sc.record_index_column_np(table)
            is_source_col = sc.is_source_column_np(table)
+            file_record_count_col = sc.file_record_count_column_np(table)
             for row_idx in range(len(table)):
                 src_dfl = DeltaFileLocator.of(
                     is_source_col[row_idx],
                     stream_position_col[row_idx],
                     file_idx_col[row_idx],
+                    file_record_count_col[row_idx],
                 )
                 # TODO(pdames): merge contiguous record number ranges
                 src_file_id_to_row_indices[src_dfl].append(row_idx_col[row_idx])
 
         logger.info(f"Finished all dedupe rounds...")
-        mat_bucket_to_src_file_record_count = defaultdict(dict)
         mat_bucket_to_src_file_records: Dict[
             MaterializeBucketIndex, DeltaFileLocatorToRecords
         ] = defaultdict(dict)
@@ -210,29 +211,30 @@ def _timed_dedupe(
             mat_bucket_to_src_file_records[mat_bucket][src_dfl] = np.array(
                 src_row_indices,
             )
-            mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = len(
-                src_row_indices
-            )
 
         mat_bucket_to_dd_idx_obj_id: Dict[
             MaterializeBucketIndex, DedupeTaskIndexWithObjectId
         ] = {}
         for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
-            object_ref = ray.put(src_file_records)
-            pickled_object_ref = cloudpickle.dumps(object_ref)
+            object_ref = object_store.put(src_file_records)
             mat_bucket_to_dd_idx_obj_id[mat_bucket] = (
                 dedupe_task_index,
-                pickled_object_ref,
+                object_ref,
             )
             del object_ref
-            del pickled_object_ref
         logger.info(
             f"Count of materialize buckets with object refs: "
             f"{len(mat_bucket_to_dd_idx_obj_id)}"
         )
 
+        peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
+
         return DedupeResult(
-            mat_bucket_to_dd_idx_obj_id, np.int64(total_deduped_records)
+            mat_bucket_to_dd_idx_obj_id,
+            np.int64(total_deduped_records),
+            np.double(peak_memory_usage_bytes),
+            np.double(0.0),
+            np.double(time.time()),
        )
 
 
@@ -244,6 +246,7 @@ def dedupe(
     dedupe_task_index: int,
     enable_profiler: bool,
     metrics_config: MetricsConfig,
+    object_store: Optional[IObjectStore],
 ) -> DedupeResult:
     logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     dedupe_result, duration = timed_invocation(
@@ -253,11 +256,24 @@ def dedupe(
         num_materialize_buckets=num_materialize_buckets,
         dedupe_task_index=dedupe_task_index,
         enable_profiler=enable_profiler,
+        object_store=object_store,
     )
+
+    emit_metrics_time = 0.0
     if metrics_config:
-        emit_timer_metrics(
-            metrics_name="dedupe", value=duration, metrics_config=metrics_config
+        emit_result, latency = timed_invocation(
+            func=emit_timer_metrics,
+            metrics_name="dedupe",
+            value=duration,
+            metrics_config=metrics_config,
         )
+        emit_metrics_time = latency
 
     logger.info(f"[Dedupe task index {dedupe_task_index}] Finished dedupe task...")
-    return dedupe_result
+    return DedupeResult(
+        dedupe_result[0],
+        dedupe_result[1],
+        dedupe_result[2],
+        np.double(emit_metrics_time),
+        dedupe_result[4],
+    )
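
The dedupe step no longer pickles Ray object refs by hand; it delegates to the pluggable object store introduced under deltacat/io. A sketch of the put/get_many round trip used above, assuming the new module exposes a RayPlasmaObjectStore class (the interface methods are taken from this diff, the class name is inferred from the new file's name):

    import ray

    from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore

    ray.init(ignore_reinit_error=True)
    object_store = RayPlasmaObjectStore()

    ref = object_store.put({"hb_0": [1, 2, 3]})  # store one payload, get a reference back
    payloads = object_store.get_many([ref])      # resolve a batch of references at once
    assert payloads == [{"hb_0": [1, 2, 3]}]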

deltacat/compute/compactor/steps/hash_bucket.py
@@ -1,5 +1,6 @@
 import importlib
 import logging
+import time
 from contextlib import nullcontext
 from itertools import chain
 from typing import Generator, List, Optional, Tuple
@@ -30,6 +31,8 @@ from deltacat.utils.ray_utils.runtime import (
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.io.object_store import IObjectStore
+from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
 
 if importlib.util.find_spec("memray"):
     import memray
@@ -114,11 +117,12 @@ def _group_file_records_by_pk_hash_bucket(
                 hb_to_delta_file_envelopes[hb] = []
             hb_to_delta_file_envelopes[hb].append(
                 DeltaFileEnvelope.of(
-                    dfe.stream_position,
-                    dfe.file_index,
-                    dfe.delta_type,
-                    table,
-                    is_src_delta,
+                    stream_position=dfe.stream_position,
+                    file_index=dfe.file_index,
+                    delta_type=dfe.delta_type,
+                    table=table,
+                    is_src_delta=is_src_delta,
+                    file_record_count=dfe.file_record_count,
                 )
             )
     return hb_to_delta_file_envelopes, total_record_count
@@ -157,10 +161,11 @@ def _read_delta_file_envelopes(
     for i, table in enumerate(tables):
         total_record_count += len(table)
         delta_file = DeltaFileEnvelope.of(
-            annotations[i].annotation_stream_position,
-            annotations[i].annotation_file_index,
-            annotations[i].annotation_delta_type,
-            table,
+            stream_position=annotations[i].annotation_stream_position,
+            file_index=annotations[i].annotation_file_index,
+            delta_type=annotations[i].annotation_delta_type,
+            table=table,
+            file_record_count=len(table),
         )
         delta_file_envelopes.append(delta_file)
     return delta_file_envelopes, total_record_count
@@ -175,6 +180,7 @@ def _timed_hash_bucket(
     num_groups: int,
     enable_profiler: bool,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
 ):
     task_id = get_current_ray_task_id()
@@ -203,12 +209,16 @@ def _timed_hash_bucket(
             deltacat_storage,
         )
         hash_bucket_group_to_obj_id, _ = group_hash_bucket_indices(
-            delta_file_envelope_groups,
-            num_buckets,
-            num_groups,
+            delta_file_envelope_groups, num_buckets, num_groups, object_store
         )
+
+        peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
         return HashBucketResult(
-            hash_bucket_group_to_obj_id, np.int64(total_record_count)
+            hash_bucket_group_to_obj_id,
+            np.int64(total_record_count),
+            np.double(peak_memory_usage_bytes),
+            np.double(0.0),
+            np.double(time.time()),
         )
 
 
@@ -223,6 +233,7 @@ def hash_bucket(
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     read_kwargs_provider: Optional[ReadKwargsProvider],
+    object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
 ) -> HashBucketResult:
 
@@ -237,11 +248,25 @@ def hash_bucket(
         num_groups=num_groups,
         enable_profiler=enable_profiler,
         read_kwargs_provider=read_kwargs_provider,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )
+
+    emit_metrics_time = 0.0
     if metrics_config:
-        emit_timer_metrics(
-            metrics_name="hash_bucket", value=duration, metrics_config=metrics_config
+        emit_result, latency = timed_invocation(
+            func=emit_timer_metrics,
+            metrics_name="hash_bucket",
+            value=duration,
+            metrics_config=metrics_config,
        )
+        emit_metrics_time = latency
+
     logger.info(f"Finished hash bucket task...")
-    return hash_bucket_result
+    return HashBucketResult(
+        hash_bucket_result[0],
+        hash_bucket_result[1],
+        hash_bucket_result[2],
+        np.double(emit_metrics_time),
+        hash_bucket_result[4],
+    )
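
Both task entry points now wrap emit_timer_metrics in timed_invocation so that the cost of emitting metrics is itself captured and reported as telemetry_time_in_seconds. A sketch of that pattern with a stand-in metric emitter (emit_example_metric is hypothetical; the func/keyword-argument usage of timed_invocation follows this diff):

    import time

    from deltacat.utils.performance import timed_invocation


    def emit_example_metric(metrics_name: str, value: float) -> None:
        # Stand-in for emit_timer_metrics; simulates the emission work being timed.
        time.sleep(0.01)


    emit_result, latency = timed_invocation(
        func=emit_example_metric,
        metrics_name="hash_bucket",
        value=1.23,
    )
    emit_metrics_time = latency  # reported back as the result tuple's telemetry field
    print(f"metric emission took {emit_metrics_time:.4f}s")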