PyPI - deltacat - Versions diffs - 0.1.18b3__tar.gz → 0.1.18b6__tar.gz - Mend

deltacat 0.1.18b3__tar.gz → 0.1.18b6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (146) hide show

{deltacat-0.1.18b3/deltacat.egg-info → deltacat-0.1.18b6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 0.1.18b3
+Version: 0.1.18b6
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team

{deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/__init__.py RENAMED Viewed

@@ -43,7 +43,7 @@ from deltacat.types.tables import TableWriteMode
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
-__version__ = "0.1.18b3"
+__version__ = "0.1.18b6"
 __all__ = [

{deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/compaction_session.py RENAMED Viewed

@@ -3,6 +3,10 @@ from contextlib import nullcontext
 import functools
 import logging
 import ray
+import time
+import json
+from deltacat.aws import s3u as s3_utils
+import deltacat
 from deltacat import logs
 import pyarrow as pa
 from deltacat.compute.compactor import (
@@ -12,6 +16,9 @@ from deltacat.compute.compactor import (
 )
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
+from deltacat.io.object_store import IObjectStore
+from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
+from deltacat.compute.compactor.model.materialize_result import MaterializeResult
 from deltacat.compute.stats.models.delta_stats import DeltaStats
 from deltacat.storage import (
     Delta,
@@ -20,6 +27,9 @@ from deltacat.storage import (
     PartitionLocator,
     interface as unimplemented_deltacat_storage,
 )
+from deltacat.compute.compactor.model.compact_partition_params import (
+    CompactPartitionParams,
+)
 from deltacat.utils.ray_utils.concurrency import (
     invoke_parallel,
     round_robin_options_provider,
@@ -37,7 +47,11 @@ from deltacat.utils.placement import PlacementGroupConfig
 from typing import List, Set, Optional, Tuple, Dict, Any
 from collections import defaultdict
 from deltacat.utils.metrics import MetricsConfig
-from deltacat.utils.resources import log_current_cluster_utilization
+from deltacat.compute.compactor.model.compaction_session_audit_info import (
+    CompactionSessionAuditInfo,
+)
+from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
 if importlib.util.find_spec("memray"):
     import memray
@@ -100,6 +114,7 @@ def compact_partition(
     list_deltas_kwargs: Optional[Dict[str, Any]] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Optional[str]:
@@ -139,6 +154,7 @@ def compact_partition(
             list_deltas_kwargs,
             read_kwargs_provider,
             s3_table_writer_kwargs,
+            object_store,
             deltacat_storage,
             **kwargs,
         )
@@ -184,10 +200,28 @@ def _execute_compaction_round(
     list_deltas_kwargs: Optional[Dict[str, Any]],
     read_kwargs_provider: Optional[ReadKwargsProvider],
     s3_table_writer_kwargs: Optional[Dict[str, Any]],
+    object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
+    rcf_source_partition_locator = (
+        rebase_source_partition_locator
+        if rebase_source_partition_locator
+        else source_partition_locator
+    )
+    base_audit_url = rcf_source_partition_locator.path(
+        f"s3://{compaction_artifact_s3_bucket}/compaction-audit"
+    )
+    audit_url = f"{base_audit_url}.json"
+    logger.info(f"Compaction audit will be written to {audit_url}")
+    compaction_audit = CompactionSessionAuditInfo(deltacat.__version__, audit_url)
+    compaction_start = time.monotonic()
     if not primary_keys:
         # TODO (pdames): run simple rebatch to reduce all deltas into 1 delta
         #  with normalized manifest entry sizes
@@ -230,6 +264,7 @@ def _execute_compaction_round(
             f"{node_resource_keys}"
         )
+    compaction_audit.set_cluster_cpu_max(cluster_cpus)
     # create a remote options provider to round-robin tasks across all nodes or allocated bundles
     logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
     round_robin_opt_provider = functools.partial(
@@ -257,6 +292,13 @@ def _execute_compaction_round(
             )
         logger.info(f"Round completion file: {round_completion_info}")
+    enable_manifest_entry_copy_by_reference = (
+        False if rebase_source_partition_locator else True
+    )
+    logger.info(
+        f"Enable manifest entry copy by reference is set to: {enable_manifest_entry_copy_by_reference}"
+    )
     # discover input delta files
     # For rebase:
     # Copy the old compacted table to a new destination, plus any new deltas from rebased source
@@ -268,6 +310,7 @@ def _execute_compaction_round(
         round_completion_info.high_watermark if round_completion_info else None
     )
+    delta_discovery_start = time.monotonic()
     (
         input_deltas,
         previous_last_stream_position_compacted_on_destination_table,
@@ -282,6 +325,13 @@ def _execute_compaction_round(
         **list_deltas_kwargs,
     )
+    delta_discovery_end = time.monotonic()
+    compaction_audit.set_delta_discovery_time_in_seconds(
+        delta_discovery_end - delta_discovery_start
+    )
+    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
     if not input_deltas:
         logger.info("No input deltas found to compact.")
         return None, None, None
@@ -298,6 +348,7 @@ def _execute_compaction_round(
         io.fit_input_deltas(
             input_deltas,
             cluster_resources,
+            compaction_audit,
             hash_bucket_count,
             deltacat_storage=deltacat_storage,
         )
@@ -307,11 +358,14 @@ def _execute_compaction_round(
             cluster_resources,
             hash_bucket_count,
             min_hash_bucket_chunk_size,
+            compaction_audit=compaction_audit,
             input_deltas_stats=input_deltas_stats,
             deltacat_storage=deltacat_storage,
         )
     )
+    compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
     assert hash_bucket_count is not None and hash_bucket_count > 0, (
         f"Expected hash bucket count to be a positive integer, but found "
         f"`{hash_bucket_count}`"
@@ -335,6 +389,8 @@ def _execute_compaction_round(
             "Multiple rounds are not supported. Please increase the cluster size and run again."
         )
+    hb_start = time.monotonic()
     hb_tasks_pending = invoke_parallel(
         items=uniform_deltas,
         ray_task=hb.hash_bucket,
@@ -348,11 +404,28 @@ def _execute_compaction_round(
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )
+    hb_invoke_end = time.monotonic()
     logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
     hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
     logger.info(f"Got {len(hb_results)} hash bucket results.")
+    hb_end = time.monotonic()
+    hb_results_retrieved_at = time.time()
+    telemetry_time_hb = compaction_audit.save_step_stats(
+        CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
+        hb_results,
+        hb_results_retrieved_at,
+        hb_invoke_end - hb_start,
+        hb_end - hb_start,
+    )
+    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
     all_hash_group_idx_to_obj_id = defaultdict(list)
     for hb_result in hb_results:
         for hash_group_index, object_id in enumerate(
@@ -367,6 +440,8 @@ def _execute_compaction_round(
         f"Got {total_hb_record_count} hash bucket records from hash bucketing step..."
     )
+    compaction_audit.set_input_records(total_hb_record_count.item())
     # TODO (pdames): when resources are freed during the last round of hash
     #  bucketing, start running dedupe tasks that read existing dedupe
     #  output from S3 then wait for hash bucketing to finish before continuing
@@ -389,10 +464,18 @@ def _execute_compaction_round(
     # identify the index of records to keep or drop based on sort keys
     num_materialize_buckets = max_parallelism
     logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")
+    dedupe_start = time.monotonic()
+    dd_max_parallelism = int(
+        max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1)
+    )
+    logger.info(
+        f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
+    )
     dd_tasks_pending = invoke_parallel(
         items=all_hash_group_idx_to_obj_id.values(),
         ray_task=dd.dedupe,
-        max_parallelism=max_parallelism,
+        max_parallelism=dd_max_parallelism,
         options_provider=round_robin_opt_provider,
         kwargs_provider=lambda index, item: {
             "dedupe_task_index": index,
@@ -402,12 +485,33 @@ def _execute_compaction_round(
         num_materialize_buckets=num_materialize_buckets,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
+        object_store=object_store,
     )
+    dedupe_invoke_end = time.monotonic()
     logger.info(f"Getting {len(dd_tasks_pending)} dedupe results...")
     dd_results: List[DedupeResult] = ray.get(dd_tasks_pending)
     logger.info(f"Got {len(dd_results)} dedupe results.")
+    # we use time.time() here because time.monotonic() has no reference point
+    # whereas time.time() measures epoch seconds. Hence, it will be reasonable
+    # to compare time.time()s captured in different nodes.
+    dedupe_results_retrieved_at = time.time()
+    dedupe_end = time.monotonic()
     total_dd_record_count = sum([ddr.deduped_record_count for ddr in dd_results])
     logger.info(f"Deduped {total_dd_record_count} records...")
+    telemetry_time_dd = compaction_audit.save_step_stats(
+        CompactionSessionAuditInfo.DEDUPE_STEP_NAME,
+        dd_results,
+        dedupe_results_retrieved_at,
+        dedupe_invoke_end - dedupe_start,
+        dedupe_end - dedupe_start,
+    )
+    compaction_audit.set_records_deduped(total_dd_record_count.item())
     all_mat_buckets_to_obj_id = defaultdict(list)
     for dd_result in dd_results:
         for (
@@ -420,6 +524,8 @@ def _execute_compaction_round(
     logger.info(f"Getting {len(dd_tasks_pending)} dedupe result stat(s)...")
     logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")
+    compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
     # TODO(pdames): when resources are freed during the last round of deduping
     #  start running materialize tasks that read materialization source file
     #  tables from S3 then wait for deduping to finish before continuing
@@ -432,6 +538,11 @@ def _execute_compaction_round(
     # parallel step 3:
     # materialize records to keep by index
+    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+    materialize_start = time.monotonic()
     mat_tasks_pending = invoke_parallel(
         items=all_mat_buckets_to_obj_id.items(),
         ray_task=mat.materialize,
@@ -445,38 +556,34 @@ def _execute_compaction_round(
         round_completion_info=round_completion_info,
         source_partition_locator=source_partition_locator,
         partition=partition,
+        enable_manifest_entry_copy_by_reference=enable_manifest_entry_copy_by_reference,
         max_records_per_output_file=records_per_compacted_file,
         compacted_file_content_type=compacted_file_content_type,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
         s3_table_writer_kwargs=s3_table_writer_kwargs,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )
+    materialize_invoke_end = time.monotonic()
     logger.info(f"Getting {len(mat_tasks_pending)} materialize result(s)...")
-    mat_results = ray.get(mat_tasks_pending)
-    total_count_of_src_dfl_not_touched = sum(
-        m.count_of_src_dfl_not_touched for m in mat_results
-    )
-    total_length_src_dfl = sum(m.count_of_src_dfl for m in mat_results)
-    logger.info(
-        f"Got total of {total_count_of_src_dfl_not_touched} manifest files not touched."
-    )
-    logger.info(
-        f"Got total of {total_length_src_dfl} manifest files during compaction."
-    )
-    manifest_entry_copied_by_reference_ratio = (
-        (round(total_count_of_src_dfl_not_touched / total_length_src_dfl, 4) * 100)
-        if total_length_src_dfl != 0
-        else None
-    )
-    logger.info(
-        f"{manifest_entry_copied_by_reference_ratio} percent of manifest files are copied by reference during materialize."
-    )
+    mat_results: List[MaterializeResult] = ray.get(mat_tasks_pending)
     logger.info(f"Got {len(mat_results)} materialize result(s).")
-    log_current_cluster_utilization(log_identifier="post_materialize")
+    materialize_end = time.monotonic()
+    materialize_results_retrieved_at = time.time()
+    telemetry_time_materialize = compaction_audit.save_step_stats(
+        CompactionSessionAuditInfo.MATERIALIZE_STEP_NAME,
+        mat_results,
+        materialize_results_retrieved_at,
+        materialize_invoke_end - materialize_start,
+        materialize_end - materialize_start,
+    )
     mat_results = sorted(mat_results, key=lambda m: m.task_index)
     deltas = [m.delta for m in mat_results]
@@ -494,6 +601,7 @@ def _execute_compaction_round(
         f" Materialized records: {merged_delta.meta.record_count}"
     )
     logger.info(record_info_msg)
     assert (
         total_hb_record_count - total_dd_record_count == merged_delta.meta.record_count
     ), (
@@ -506,6 +614,9 @@ def _execute_compaction_round(
     )
     logger.info(f"Committed compacted delta: {compacted_delta}")
+    compaction_end = time.monotonic()
+    compaction_audit.set_compaction_time_in_seconds(compaction_end - compaction_start)
     new_compacted_delta_locator = DeltaLocator.of(
         new_compacted_partition_locator,
         compacted_delta.stream_position,
@@ -516,26 +627,51 @@ def _execute_compaction_round(
         if round_completion_info
         else None
     )
+    pyarrow_write_result = PyArrowWriteResult.union(
+        [m.pyarrow_write_result for m in mat_results]
+    )
+    session_peak_memory = get_current_node_peak_memory_usage_in_bytes()
+    compaction_audit.set_peak_memory_used_bytes_by_compaction_session_process(
+        session_peak_memory
+    )
+    compaction_audit.save_round_completion_stats(
+        mat_results, telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
+    )
+    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
     new_round_completion_info = RoundCompletionInfo.of(
         last_stream_position_compacted,
         new_compacted_delta_locator,
-        PyArrowWriteResult.union([m.pyarrow_write_result for m in mat_results]),
+        pyarrow_write_result,
         bit_width_of_sort_keys,
         last_rebase_source_partition_locator,
-        manifest_entry_copied_by_reference_ratio,
-    )
-    rcf_source_partition_locator = (
-        rebase_source_partition_locator
-        if rebase_source_partition_locator
-        else source_partition_locator
+        compaction_audit.untouched_file_ratio,
+        audit_url,
     )
     logger.info(
         f"partition-{source_partition_locator.partition_values},"
         f"compacted at: {last_stream_position_compacted},"
         f"last position: {last_stream_position_to_compact}"
     )
     return (
         partition,
         new_round_completion_info,
         rcf_source_partition_locator,
     )
+def compact_partition_from_request(
+    compact_partition_params: CompactPartitionParams,
+) -> Optional[str]:
+    """
+    Wrapper for compact_partition that allows for the compact_partition parameters to be
+    passed in as a custom dictionary-like CompactPartitionParams object.
+    :param compact_partition_params:
+    """
+    return compact_partition(**compact_partition_params)

deltacat-0.1.18b6/deltacat/compute/compactor/model/compact_partition_params.py ADDED Viewed

@@ -0,0 +1,153 @@
+from __future__ import annotations
+import copy
+import json
+from typing import Any, Dict, List, Optional
+from deltacat.types.media import ContentType
+class CompactPartitionParams(dict):
+    """
+    This class represents the parameters passed to compact_partition (deltacat/compute/compactor/compaction_session.py)
+    """
+    @staticmethod
+    def of(params: Optional[Dict]) -> CompactPartitionParams:
+        if params is None:
+            params = {}
+        compact_partition_params = CompactPartitionParams()
+        compact_partition_params["destination_partition_locator"] = params.get(
+            "destination_partition_locator"
+        )
+        compact_partition_params["last_stream_position_to_compact"] = params.get(
+            "last_stream_position_to_compact"
+        )
+        compact_partition_params["source_partition_locator"] = params.get(
+            "source_partition_locator"
+        )
+        compact_partition_params["primary_keys"] = params.get("primary_keys")
+        compact_partition_params["rebase_source_partition_locator"] = params.get(
+            "rebase_source_partition_locator"
+        )
+        compact_partition_params["rebase_source_partition_high_watermark"] = params.get(
+            "rebase_source_partition_high_watermark"
+        )
+        compact_partition_params["hash_bucket_count"] = params.get("hash_bucket_count")
+        compact_partition_params["deltacat_storage"] = params.get("deltacat_storage")
+        compact_partition_params["compaction_artifact_s3_bucket"] = params.get(
+            "compaction_artifact_s3_bucket"
+        )
+        compact_partition_params["properties"] = params.get("properties")
+        compact_partition_params["compacted_file_content_type"] = params.get(
+            "compacted_file_content_type"
+        )
+        compact_partition_params["list_deltas_kwargs"] = params.get(
+            "list_deltas_kwargs"
+        )
+        compact_partition_params["pg_config"] = params.get("pg_config")
+        compact_partition_params["read_kwargs_provider"] = params.get(
+            "read_kwargs_provider"
+        )
+        compact_partition_params["s3_table_writer_kwargs"] = params.get(
+            "s3_table_writer_kwargs"
+        )
+        return compact_partition_params
+    @property
+    def destination_partition_locator(self) -> Optional[dict]:
+        return self["destination_partition_locator"]
+    @property
+    def last_stream_position_to_compact(self) -> Optional[int]:
+        return self["last_stream_position_to_compact"]
+    @property
+    def source_partition_locator(self) -> Optional[dict]:
+        return self["source_partition_locator"]
+    @property
+    def primary_keys(self) -> Optional[List[str]]:
+        return list(self["primary_keys"])
+    @property
+    def rebase_source_partition_locator(self) -> Optional[dict]:
+        return self["rebase_source_partition_locator"]
+    @property
+    def rebase_source_partition_high_watermark(self) -> Optional[int]:
+        return self["rebase_source_partition_high_watermark"]
+    @property
+    def hash_bucket_count(self) -> Optional[int]:
+        return self["hash_bucket_count"]
+    @property
+    def deltacat_storage(self) -> Optional[str]:
+        return self["deltacat_storage"]
+    @property
+    def compaction_artifact_s3_bucket(self) -> Optional[str]:
+        return self["compaction_artifact_s3_bucket"]
+    @property
+    def properties(self) -> Optional[Dict[str, str]]:
+        return self["properties"]
+    @property
+    def compacted_file_content_type(self) -> Optional[ContentType]:
+        return self["compacted_file_content_type"]
+    @property
+    def list_deltas_kwargs(self) -> Optional[dict]:
+        return self["list_deltas_kwargs"]
+    @property
+    def pg_config(self) -> Optional[Any]:
+        return self["pg_config"]
+    @property
+    def read_kwargs_provider(self) -> Optional[Any]:
+        return self["read_kwargs_provider"]
+    @property
+    def s3_table_writer_kwargs(self) -> Optional[Any]:
+        return self["s3_table_writer_kwargs"]
+    @staticmethod
+    def json_handler_for_compact_partition_params(obj):
+        """
+        A handler for the `json.dumps()` function that can be used to serialize sets to JSON.
+        If the `set_default()` handler is passed as the `default` argument to the `json.dumps()` function, it will be called whenever a set object is encountered.
+        The `set_default()` handler will then serialize the set as a list.
+        """
+        try:
+            if isinstance(obj, set):
+                return list(obj)
+            elif hasattr(obj, "toJSON"):
+                return obj.toJSON()
+            else:
+                return obj.__dict__
+        except Exception:
+            return obj.__class__.__name__
+    def serialize(self) -> str:
+        """
+        Serializes itself to a json-formatted string
+        Returns:
+            The serialized object.
+        """
+        to_serialize: Dict[str, Any] = {}
+        # individually try deepcopy the values from the self dictionary and just use the class name for the value when it is not possible to deepcopy
+        for attr, value in self.items():
+            try:
+                to_serialize[attr] = copy.deepcopy(value)
+            except Exception:  # if unable to deep copy the objects like module objects for example then just provide the class name at minimum
+                to_serialize[attr] = value.__class__.__name__
+        serialized_arguments_compact_partition_args: str = json.dumps(
+            to_serialize,
+            default=CompactPartitionParams.json_handler_for_compact_partition_params,
+        )
+        return serialized_arguments_compact_partition_args

deltacat 0.1.18b3__tar.gz → 0.1.18b6__tar.gz

deltacat 0.1.18b3__tar.gz → 0.1.18b6__tar.gz