deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl

This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (61)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +0 -1
  3. deltacat/compute/compactor/model/compact_partition_params.py +76 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
  5. deltacat/compute/compactor/model/delta_annotated.py +16 -9
  6. deltacat/compute/compactor_v2/constants.py +3 -0
  7. deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
  8. deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
  9. deltacat/compute/compactor_v2/utils/io.py +28 -14
  10. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  11. deltacat/compute/compactor_v2/utils/task_options.py +128 -183
  12. deltacat/compute/resource_estimation/__init__.py +27 -0
  13. deltacat/compute/resource_estimation/delta.py +271 -0
  14. deltacat/compute/resource_estimation/manifest.py +394 -0
  15. deltacat/compute/resource_estimation/model.py +165 -0
  16. deltacat/compute/resource_estimation/parquet.py +108 -0
  17. deltacat/constants.py +5 -0
  18. deltacat/exceptions.py +2 -4
  19. deltacat/logs.py +8 -0
  20. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
  21. deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
  22. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
  23. deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
  24. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
  25. deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
  26. deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
  27. deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
  28. deltacat/tests/compute/test_util_common.py +2 -0
  29. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
  30. deltacat/tests/test_logs.py +34 -0
  31. deltacat/tests/test_utils/pyarrow.py +15 -5
  32. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
  33. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
  34. deltacat/compute/metastats/meta_stats.py +0 -479
  35. deltacat/compute/metastats/model/__init__.py +0 -0
  36. deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
  37. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
  38. deltacat/compute/metastats/stats.py +0 -182
  39. deltacat/compute/metastats/utils/__init__.py +0 -0
  40. deltacat/compute/metastats/utils/constants.py +0 -16
  41. deltacat/compute/metastats/utils/io.py +0 -223
  42. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
  43. deltacat/compute/metastats/utils/ray_utils.py +0 -129
  44. deltacat/compute/stats/basic.py +0 -226
  45. deltacat/compute/stats/models/__init__.py +0 -0
  46. deltacat/compute/stats/models/delta_column_stats.py +0 -98
  47. deltacat/compute/stats/models/delta_stats.py +0 -233
  48. deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
  49. deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
  50. deltacat/compute/stats/models/stats_result.py +0 -104
  51. deltacat/compute/stats/utils/__init__.py +0 -0
  52. deltacat/compute/stats/utils/intervals.py +0 -94
  53. deltacat/compute/stats/utils/io.py +0 -230
  54. deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
  55. deltacat/tests/stats/__init__.py +0 -0
  56. deltacat/tests/stats/test_intervals.py +0 -49
  57. /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
  58. /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
  59. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
  60. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
  61. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "1.1.17"
+__version__ = "1.1.19"
 
 
 __all__ = [
deltacat/aws/constants.py CHANGED
@@ -1,7 +1,6 @@
 import botocore
 from typing import Set
 from daft.exceptions import DaftTransientError
-
 from deltacat.utils.common import env_integer, env_string
 
 
deltacat/compute/compactor/model/compact_partition_params.py CHANGED
@@ -13,6 +13,10 @@ from deltacat.storage import (
     PartitionLocator,
     SortKey,
 )
+from deltacat.compute.resource_estimation import (
+    ResourceEstimationMethod,
+    EstimateResourcesParams,
+)
 from deltacat.compute.compactor_v2.constants import (
     MAX_RECORDS_PER_COMPACTED_FILE,
     MIN_DELTA_BYTES_IN_BATCH,
@@ -23,6 +27,8 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_MEMORY_BUFFER_PERCENTAGE,
     DEFAULT_DISABLE_COPY_BY_REFERENCE,
     DEFAULT_NUM_ROUNDS,
+    PARQUET_TO_PYARROW_INFLATION,
+    MAX_PARQUET_METADATA_SIZE,
 )
 from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
 from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -104,6 +110,22 @@ class CompactPartitionParams(dict):
         result.metrics_config = params.get("metrics_config")
 
         result.num_rounds = params.get("num_rounds", DEFAULT_NUM_ROUNDS)
+        result.parquet_to_pyarrow_inflation = params.get(
+            "parquet_to_pyarrow_inflation", PARQUET_TO_PYARROW_INFLATION
+        )
+        result.resource_estimation_method = ResourceEstimationMethod[
+            params.get(
+                "resource_estimation_method", ResourceEstimationMethod.DEFAULT.value
+            )
+        ]
+
+        # disable input split during rebase as the rebase files are already uniform
+        result.enable_input_split = (
+            params.get("rebase_source_partition_locator") is None
+        )
+        result.max_parquet_meta_size_bytes = params.get(
+            "max_parquet_meta_size_bytes", MAX_PARQUET_METADATA_SIZE
+        )
 
         if not importlib.util.find_spec("memray"):
             result.enable_profiler = False
@@ -413,6 +435,60 @@ class CompactPartitionParams(dict):
     def num_rounds(self, num_rounds: int) -> None:
         self["num_rounds"] = num_rounds
 
+    @property
+    def parquet_to_pyarrow_inflation(self) -> float:
+        """
+        The inflation factor for the parquet uncompressed_size_bytes to pyarrow table size.
+        """
+        return self["parquet_to_pyarrow_inflation"]
+
+    @parquet_to_pyarrow_inflation.setter
+    def parquet_to_pyarrow_inflation(self, value: float) -> None:
+        self["parquet_to_pyarrow_inflation"] = value
+
+    @property
+    def enable_input_split(self) -> bool:
+        """
+        When this is True, the input split will be always enabled for parquet files.
+        The input split feature will split the parquet files into individual row groups
+        so that we could process them in different nodes in parallel.
+        By default, input split is enabled for incremental compaction and disabled for rebase or backfill.
+        """
+        return self["enable_input_split"]
+
+    @enable_input_split.setter
+    def enable_input_split(self, value: bool) -> None:
+        self["enable_input_split"] = value
+
+    @property
+    def max_parquet_meta_size_bytes(self) -> int:
+        """
+        The maximum size of the parquet metadata in bytes. Used for allocating tasks
+        to fetch parquet metadata.
+        """
+        return self["max_parquet_meta_size_bytes"]
+
+    @max_parquet_meta_size_bytes.setter
+    def max_parquet_meta_size_bytes(self, value: int) -> None:
+        self["max_parquet_meta_size_bytes"] = value
+
+    @property
+    def resource_estimation_method(self) -> ResourceEstimationMethod:
+        return self["resource_estimation_method"]
+
+    @resource_estimation_method.setter
+    def resource_estimation_method(self, value: ResourceEstimationMethod) -> None:
+        self["resource_estimation_method"] = value
+
+    @property
+    def estimate_resources_params(self) -> EstimateResourcesParams:
+        return EstimateResourcesParams.of(
+            resource_estimation_method=self.resource_estimation_method,
+            previous_inflation=self.previous_inflation,
+            parquet_to_pyarrow_inflation=self.parquet_to_pyarrow_inflation,
+            average_record_size_bytes=self.average_record_size_bytes,
+        )
+
     @staticmethod
     def json_handler_for_compact_partition_params(obj):
         """
deltacat/compute/compactor/model/compaction_session_audit_info.py CHANGED
@@ -436,6 +436,22 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("compactorVersion")
 
+    @property
+    def observed_input_inflation(self) -> float:
+        """
+        The average inflation observed for input files only.
+        This only accounts for files in the source.
+        """
+        return self.get("observedInputInflation")
+
+    @property
+    def observed_input_average_record_size_bytes(self) -> float:
+        """
+        The average record size observed for input files only.
+        This only accounts for files in the source.
+        """
+        return self.get("observedInputAverageRecordSizeBytes")
+
     # Setters follow
 
     def set_audit_url(self, audit_url: str) -> CompactionSessionAuditInfo:
@@ -756,6 +772,16 @@ class CompactionSessionAuditInfo(dict):
         self["compactorVersion"] = value
         return self
 
+    def set_observed_input_inflation(self, value: float) -> CompactionSessionAuditInfo:
+        self["observedInputInflation"] = value
+        return self
+
+    def set_observed_input_average_record_size_bytes(
+        self, value: float
+    ) -> CompactionSessionAuditInfo:
+        self["observedInputAverageRecordSizeBytes"] = value
+        return self
+
     # High level methods to save stats
     def save_step_stats(
         self,
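
Note: the two new audit fields pair naturally with the estimation inputs added to CompactPartitionParams above. A hedged sketch of that feedback loop (the JSON key names come from this diff; treating them as inputs to a later run is an assumption, not something this release does automatically):

    # Values a previous round might have recorded in its audit (hypothetical numbers).
    audit = {
        "observedInputInflation": 2.7,
        "observedInputAverageRecordSizeBytes": 512.0,
    }

    # Candidate overrides for the next compaction run's estimation inputs.
    next_run_overrides = {
        "previous_inflation": audit["observedInputInflation"],
        "average_record_size_bytes": audit["observedInputAverageRecordSizeBytes"],
    }
    print(next_run_overrides)
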
deltacat/compute/compactor/model/delta_annotated.py CHANGED
@@ -69,6 +69,7 @@ class DeltaAnnotated(Delta):
         estimation_function: Optional[
             Callable[[ManifestEntry], float]
         ] = lambda entry: entry.meta.content_length,
+        enable_input_split: Optional[bool] = False,
     ) -> List[DeltaAnnotated]:
         """
         Simple greedy algorithm to split/merge 1 or more annotated deltas into
@@ -86,13 +87,19 @@
         new_da_bytes = 0
         da_group_entry_count = 0
 
-        for delta_annotated in annotated_deltas:
-            split_annotated_deltas.extend(DeltaAnnotated._split_single(delta_annotated))
+        if enable_input_split:
+            for delta_annotated in annotated_deltas:
+                split_annotated_deltas.extend(
+                    DeltaAnnotated._split_single(delta_annotated)
+                )
 
-        logger.info(
-            f"Split the {len(annotated_deltas)} annotated deltas "
-            f"into {len(split_annotated_deltas)} groups."
-        )
+            logger.info(
+                f"Split the {len(annotated_deltas)} annotated deltas "
+                f"into {len(split_annotated_deltas)} groups."
+            )
+        else:
+            logger.info("Skipping input split as it is disabled...")
+            split_annotated_deltas = annotated_deltas
 
         for src_da in split_annotated_deltas:
             src_da_annotations = src_da.annotations
@@ -107,7 +114,7 @@
             # (i.e. the previous compaction round ran a rebase)
             if new_da and src_da.locator != new_da.locator:
                 groups.append(new_da)
-                logger.info(
+                logger.debug(
                     f"Due to different delta locator, Appending group of {da_group_entry_count} elements "
                     f"and {new_da_bytes} bytes"
                 )
@@ -126,12 +133,12 @@
                 or da_group_entry_count >= min_file_counts
             ):
                 if new_da_bytes >= min_delta_bytes:
-                    logger.info(
+                    logger.debug(
                         f"Appending group of {da_group_entry_count} elements "
                         f"and {new_da_bytes} bytes to meet file size limit"
                     )
                 if da_group_entry_count >= min_file_counts:
-                    logger.info(
+                    logger.debug(
                         f"Appending group of {da_group_entry_count} elements "
                         f"and {da_group_entry_count} files to meet file count limit"
                     )
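
Note: the rebatch loop above groups the (optionally row-group-split) annotated deltas greedily until either the byte threshold or the file-count threshold is met. A standalone sketch of that rule — illustrative only, not the deltacat implementation:

    # Accumulate entry sizes until either limit is reached, then start a new group.
    def greedy_rebatch(sizes, min_bytes, min_files):
        groups, current, current_bytes = [], [], 0
        for size in sizes:
            current.append(size)
            current_bytes += size
            if current_bytes >= min_bytes or len(current) >= min_files:
                groups.append(current)
                current, current_bytes = [], 0
        if current:
            groups.append(current)
        return groups

    print(greedy_rebatch([10, 20, 70, 5, 5], min_bytes=100, min_files=3))
    # [[10, 20, 70], [5, 5]]
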
deltacat/compute/compactor_v2/constants.py CHANGED
@@ -41,6 +41,9 @@ DROP_DUPLICATES = True
 # size in metadata to pyarrow table size.
 PARQUET_TO_PYARROW_INFLATION = 4
 
+# Maximum size of the parquet metadata
+MAX_PARQUET_METADATA_SIZE = 100_000_000  # 100 MB
+
 # By default, copy by reference is enabled
 DEFAULT_DISABLE_COPY_BY_REFERENCE = False
 
deltacat/compute/compactor_v2/private/compaction_utils.py CHANGED
@@ -148,12 +148,8 @@ def _build_uniform_deltas(
         input_deltas=input_deltas,
         hash_bucket_count=params.hash_bucket_count,
         compaction_audit=mutable_compaction_audit,
+        compact_partition_params=params,
         deltacat_storage=params.deltacat_storage,
-        previous_inflation=params.previous_inflation,
-        min_delta_bytes=params.min_delta_bytes_in_batch,
-        min_file_counts=params.min_files_in_batch,
-        # disable input split during rebase as the rebase files are already uniform
-        enable_input_split=params.rebase_source_partition_locator is None,
         deltacat_storage_kwargs=params.deltacat_storage_kwargs,
     )
     delta_discovery_end: float = time.monotonic()
@@ -400,6 +396,7 @@ def _merge(
         deltacat_storage_kwargs=params.deltacat_storage_kwargs,
         ray_custom_resources=params.ray_custom_resources,
         memory_logs_enabled=params.memory_logs_enabled,
+        estimate_resources_params=params.estimate_resources_params,
     )
 
     def merge_input_provider(index, item) -> dict[str, MergeInput]:
@@ -463,6 +460,7 @@ def _hash_bucket(
         primary_keys=params.primary_keys,
         ray_custom_resources=params.ray_custom_resources,
         memory_logs_enabled=params.memory_logs_enabled,
+        estimate_resources_params=params.estimate_resources_params,
     )
 
     def hash_bucket_input_provider(index, item) -> dict[str, HashBucketInput]:
@@ -537,6 +535,7 @@ def _run_local_merge(
         ray_custom_resources=params.ray_custom_resources,
         primary_keys=params.primary_keys,
         memory_logs_enabled=params.memory_logs_enabled,
+        estimate_resources_params=params.estimate_resources_params,
     )
     local_merge_result = ray.get(
         mg.merge.options(**local_merge_options).remote(local_merge_input)
@@ -666,6 +665,11 @@ def _write_new_round_completion_file(
         f" and average record size={input_average_record_size_bytes}"
     )
 
+    mutable_compaction_audit.set_observed_input_inflation(input_inflation)
+    mutable_compaction_audit.set_observed_input_average_record_size_bytes(
+        input_average_record_size_bytes
+    )
+
     _update_and_upload_compaction_audit(
         params,
         mutable_compaction_audit,
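
Note: the round-completion path above now persists the observed input inflation and average record size into the audit. A hedged arithmetic sketch of what those two numbers plausibly represent (the exact formula is not shown in this diff and is an assumption):

    # Hypothetical totals gathered over the source (input) files of one compaction round.
    input_on_disk_bytes = 1_000_000_000    # sum of manifest entry content_length
    input_in_memory_bytes = 2_500_000_000  # sum of loaded pyarrow table sizes
    input_record_count = 5_000_000

    input_inflation = input_in_memory_bytes / input_on_disk_bytes                   # 2.5
    input_average_record_size_bytes = input_in_memory_bytes / input_record_count    # 500.0
    print(input_inflation, input_average_record_size_bytes)
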
deltacat/compute/compactor_v2/utils/content_type_params.py CHANGED
@@ -1,66 +1,217 @@
 import logging
+import ray
+import functools
+from deltacat.compute.compactor_v2.constants import (
+    TASK_MAX_PARALLELISM,
+    MAX_PARQUET_METADATA_SIZE,
+)
+from deltacat.utils.ray_utils.concurrency import invoke_parallel
 from deltacat import logs
 from deltacat.storage import (
     Delta,
+    ManifestEntry,
     interface as unimplemented_deltacat_storage,
 )
 from typing import Dict, Optional, Any
-from deltacat.types.media import TableType, StorageType
+from deltacat.types.media import TableType
 from deltacat.types.media import ContentType
 from deltacat.types.partial_download import PartialParquetParameters
+from deltacat.exceptions import RetryableError
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
+def append_content_type_params_options_provider(
+    index: int, item: Any, max_parquet_meta_size_bytes: int, **kwargs
+) -> Dict:
+    task_opts = {
+        "num_cpus": 0.01,
+        "memory": max_parquet_meta_size_bytes,
+        "scheduling_strategy": "DEFAULT",
+    }
+
+    task_opts["max_retries"] = 3
+    task_opts["retry_exceptions"] = [RetryableError]
+
+    return task_opts
+
+
+def _contains_partial_parquet_parameters(entry: ManifestEntry) -> bool:
+    return (
+        entry.meta
+        and entry.meta.content_type_parameters
+        and any(
+            isinstance(type_params, PartialParquetParameters)
+            for type_params in entry.meta.content_type_parameters
+        )
+    )
+
+
+APPEND_CONTENT_TYPE_PARAMS_CACHE = "append_content_type_params_cache"
+# At this point, it's better to fetch all parquet than to cache and
+# call actor which is not expected to support high throughput.
+MINIMUM_ENTRIES_TO_CACHE = 10
+
+
+@ray.remote
+class AppendContentTypeParamsCache:
+    """
+    This actor caches the delta that contains content type meta.
+    """
+
+    def __init__(self):
+        self.cache = {}
+
+    def get(self, key):
+        return self.cache.get(key)
+
+    def put(self, key, value):
+        self.cache[key] = value
+
+
+@ray.remote
+def _download_parquet_metadata_for_manifest_entry(
+    delta: Delta,
+    entry_index: int,
+    deltacat_storage: unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
+) -> Dict[str, Any]:
+    pq_file = deltacat_storage.download_delta_manifest_entry(
+        delta,
+        entry_index=entry_index,
+        table_type=TableType.PYARROW_PARQUET,
+        **deltacat_storage_kwargs,
+    )
+
+    return {
+        "entry_index": entry_index,
+        "partial_parquet_params": PartialParquetParameters.of(
+            pq_metadata=pq_file.metadata
+        ),
+    }
+
+
 def append_content_type_params(
     delta: Delta,
-    entry_index: Optional[int] = None,
+    task_max_parallelism: int = TASK_MAX_PARALLELISM,
+    max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
 ) -> None:
+    """
+    This operation appends content type params into the delta entry. Note
+    that this operation can be time consuming, hence we cache it in a Ray actor.
+    """
 
-    if delta.meta.content_type != ContentType.PARQUET.value:
-        logger.info(
-            f"Delta with locator {delta.locator} is not a parquet delta, "
-            "skipping appending content type parameters."
-        )
+    if not delta.meta:
+        logger.warning(f"Delta with locator {delta.locator} doesn't contain meta.")
         return
 
-    manifest_entries = delta.manifest.entries
-    ordered_pq_meta = []
-
-    if entry_index is not None:
-        manifest_entries = [delta.manifest.entries[entry_index]]
+    entry_indices_to_download = []
+    for entry_index, entry in enumerate(delta.manifest.entries):
+        if (
+            not _contains_partial_parquet_parameters(entry)
+            and entry.meta
+            and entry.meta.content_type == ContentType.PARQUET.value
+        ):
+            entry_indices_to_download.append(entry_index)
 
-        pq_file = deltacat_storage.download_delta_manifest_entry(
-            delta,
-            entry_index=entry_index,
-            table_type=TableType.PYARROW_PARQUET,
-            **deltacat_storage_kwargs,
+    if not entry_indices_to_download:
+        logger.info(
+            f"No parquet type params to download for delta with locator {delta.locator}."
         )
+        return None
 
-        partial_file_meta = PartialParquetParameters.of(pq_metadata=pq_file.metadata)
-        ordered_pq_meta.append(partial_file_meta)
+    ray_namespace = ray.get_runtime_context().namespace
+    logger.info(
+        f"Got Ray namespace: {ray_namespace}. "
+        "Note that caching only works with non-anonymous namespace."
+        "To set a non-anonymous namespace, call ray.init(namespace='X')."
+    )
+    if len(entry_indices_to_download) >= MINIMUM_ENTRIES_TO_CACHE:
+        logger.info(
+            f"Checking if cache contains parquet meta in namespace {ray_namespace} for "
+            f"delta locator {delta.locator} and digest {delta.locator.hexdigest()}..."
+        )
+        cache = AppendContentTypeParamsCache.options(
+            name=APPEND_CONTENT_TYPE_PARAMS_CACHE,
+            namespace=ray_namespace,
+            get_if_exists=True,
+        ).remote()
 
-    else:
-        pq_files = deltacat_storage.download_delta(
-            delta,
-            table_type=TableType.PYARROW_PARQUET,
-            storage_type=StorageType.LOCAL,
-            **deltacat_storage_kwargs,
+        logger.info(f"Got cache actor: {cache}")
+        cached_value = ray.get(cache.get.remote(delta.locator.hexdigest()))
+        if cached_value is not None:
+            logger.info(
+                "Using cached parquet meta for delta with locator"
+                f" {delta.locator} and digest {delta.locator.hexdigest()}."
+            )
+            delta.manifest = cached_value.manifest
+            return
+        logger.info(
+            f"Cache doesn't contain parquet meta for delta with locator {delta.locator}."
        )
 
-        assert len(pq_files) == len(
-            manifest_entries
-        ), f"Expected {len(manifest_entries)} pq files, got {len(pq_files)}"
+    options_provider = functools.partial(
+        append_content_type_params_options_provider,
+        max_parquet_meta_size_bytes=max_parquet_meta_size_bytes,
+    )
 
-        ordered_pq_meta = [
-            PartialParquetParameters.of(pq_metadata=pq_file.metadata)
-            for pq_file in pq_files
-        ]
+    def input_provider(index, item) -> Dict:
+        return {
+            "deltacat_storage_kwargs": deltacat_storage_kwargs,
+            "deltacat_storage": deltacat_storage,
+            "delta": delta,
+            "entry_index": item,
+        }
 
-    for entry_index, entry in enumerate(manifest_entries):
+    logger.info(
+        f"Downloading parquet meta for {len(entry_indices_to_download)} manifest entries..."
+    )
+    pq_files_promise = invoke_parallel(
+        entry_indices_to_download,
+        ray_task=_download_parquet_metadata_for_manifest_entry,
+        max_parallelism=task_max_parallelism,
+        options_provider=options_provider,
+        kwargs_provider=input_provider,
+    )
+
+    partial_file_meta_list = ray.get(pq_files_promise)
+
+    logger.info(
+        f"Downloaded parquet meta for {len(entry_indices_to_download)} manifest entries"
+    )
+
+    assert len(partial_file_meta_list) == len(
+        entry_indices_to_download
+    ), f"Expected {len(entry_indices_to_download)} pq files, got {len(partial_file_meta_list)}"
+
+    for index, entry_index in enumerate(entry_indices_to_download):
+        assert (
+            entry_index == partial_file_meta_list[index]["entry_index"]
+        ), "entry_index must match with the associated parquet meta"
+        entry = delta.manifest.entries[entry_index]
         if not entry.meta.content_type_parameters:
             entry.meta.content_type_parameters = []
+        entry.meta.content_type_parameters.append(
+            partial_file_meta_list[index]["partial_parquet_params"]
+        )
+
+    for entry_index, entry in enumerate(delta.manifest.entries):
+        assert _contains_partial_parquet_parameters(
+            entry
+        ), "partial parquet params validation failed."
 
-        entry.meta.content_type_parameters.append(ordered_pq_meta[entry_index])
+    if len(entry_indices_to_download) >= MINIMUM_ENTRIES_TO_CACHE:
+        cache = AppendContentTypeParamsCache.options(
+            name=APPEND_CONTENT_TYPE_PARAMS_CACHE,
+            namespace=ray_namespace,
+            get_if_exists=True,
+        ).remote()
+        logger.info(f"Got cache actor when writing: {cache}")
+        logger.info(
+            f"Caching parquet meta for delta with locator {delta.locator} "
+            f"and digest {delta.locator.hexdigest()}..."
+        )
+        ray.get(cache.put.remote(delta.locator.hexdigest(), delta))
+        assert ray.get(cache.get.remote(delta.locator.hexdigest())) is not None
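
Note: the caching above relies on Ray's get-or-create named actor pattern, so every task running in the same (non-anonymous) namespace resolves the same cache instance. A standalone sketch of that pattern, assuming a running Ray cluster (the actor and key names here are illustrative, not deltacat's):

    import ray

    ray.init(namespace="compaction", ignore_reinit_error=True)

    @ray.remote
    class KeyValueCache:
        def __init__(self):
            self.cache = {}

        def get(self, key):
            return self.cache.get(key)

        def put(self, key, value):
            self.cache[key] = value

    # Any worker executing this line gets a handle to the same actor instance:
    # the actor is created on first use and looked up by name afterwards.
    cache = KeyValueCache.options(name="demo_cache", get_if_exists=True).remote()
    ray.get(cache.put.remote("delta-digest", {"rows": 42}))
    print(ray.get(cache.get.remote("delta-digest")))  # {'rows': 42}
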
deltacat/compute/compactor_v2/utils/io.py CHANGED
@@ -1,6 +1,5 @@
 import logging
 import functools
-from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
 from deltacat.storage import (
     PartitionLocator,
     Delta,
@@ -9,11 +8,10 @@ from deltacat.storage import (
 from deltacat import logs
 from deltacat.compute.compactor.utils import io as io_v1
 from deltacat.compute.compactor import DeltaAnnotated
-from typing import Dict, List, Optional, Any
-from deltacat.compute.compactor_v2.constants import (
-    MIN_FILES_IN_BATCH,
-    MIN_DELTA_BYTES_IN_BATCH,
+from deltacat.compute.compactor.model.compact_partition_params import (
+    CompactPartitionParams,
 )
+from typing import Dict, List, Optional, Any
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
 )
@@ -25,6 +23,10 @@ from deltacat.compute.compactor_v2.utils.content_type_params import (
 )
 from deltacat.utils.metrics import metrics
 from deltacat.compute.compactor_v2.constants import DISCOVER_DELTAS_METRIC_PREFIX
+from deltacat.compute.resource_estimation.manifest import (
+    does_require_content_type_params,
+)
+from deltacat.compute.resource_estimation.model import OperationType
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -90,10 +92,7 @@ def create_uniform_input_deltas(
     input_deltas: List[Delta],
     hash_bucket_count: int,
     compaction_audit: CompactionSessionAuditInfo,
-    min_delta_bytes: Optional[float] = MIN_DELTA_BYTES_IN_BATCH,
-    min_file_counts: Optional[float] = MIN_FILES_IN_BATCH,
-    previous_inflation: Optional[float] = PYARROW_INFLATION_MULTIPLIER,
-    enable_input_split: Optional[bool] = False,
+    compact_partition_params: CompactPartitionParams,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
 ) -> List[DeltaAnnotated]:
@@ -104,11 +103,21 @@
     input_da_list = []
 
     for delta in input_deltas:
-        if enable_input_split:
+        if (
+            compact_partition_params.enable_input_split
+            or does_require_content_type_params(
+                compact_partition_params.resource_estimation_method
+            )
+        ):
+            logger.debug(
+                f"Delta with locator: {delta.locator} requires content type params..."
+            )
             append_content_type_params(
                 delta=delta,
                 deltacat_storage=deltacat_storage,
                 deltacat_storage_kwargs=deltacat_storage_kwargs,
+                task_max_parallelism=compact_partition_params.task_max_parallelism,
+                max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
             )
 
         manifest_entries = delta.manifest.entries
@@ -118,7 +127,9 @@
             entry = manifest_entries[entry_index]
             delta_bytes += entry.meta.content_length
            estimated_da_bytes += estimate_manifest_entry_size_bytes(
-                entry=entry, previous_inflation=previous_inflation
+                entry=entry,
+                operation_type=OperationType.PYARROW_DOWNLOAD,
+                estimate_resources_params=compact_partition_params.estimate_resources_params,
             )
 
         delta_annotated = DeltaAnnotated.of(delta)
@@ -129,13 +140,16 @@
     logger.info(f"Input delta files to compact: {delta_manifest_entries_count}")
 
     size_estimation_function = functools.partial(
-        estimate_manifest_entry_size_bytes, previous_inflation=previous_inflation
+        estimate_manifest_entry_size_bytes,
+        operation_type=OperationType.PYARROW_DOWNLOAD,
+        estimate_resources_params=compact_partition_params.estimate_resources_params,
    )
    rebatched_da_list = DeltaAnnotated.rebatch(
        input_da_list,
-        min_delta_bytes=min_delta_bytes,
-        min_file_counts=min_file_counts,
+        min_delta_bytes=compact_partition_params.min_delta_bytes_in_batch,
+        min_file_counts=compact_partition_params.min_files_in_batch,
        estimation_function=size_estimation_function,
+        enable_input_split=compact_partition_params.enable_input_split,
    )

    compaction_audit.set_input_size_bytes(delta_bytes)
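
Note: `functools.partial` is used above to pre-bind `operation_type` and `estimate_resources_params` so that `DeltaAnnotated.rebatch` still receives a single-argument `estimation_function`. A standalone sketch of that binding with a fake estimator (the real `estimate_manifest_entry_size_bytes` is a deltacat helper not shown in this hunk):

    import functools

    def fake_estimate_entry_size_bytes(entry, operation_type, estimate_resources_params):
        # Stand-in estimator: scale the on-disk size by a previous-inflation factor.
        return entry["content_length"] * estimate_resources_params["previous_inflation"]

    size_estimation_function = functools.partial(
        fake_estimate_entry_size_bytes,
        operation_type="PYARROW_DOWNLOAD",
        estimate_resources_params={"previous_inflation": 2.5},
    )

    # Rebatch-style callers only ever pass the entry itself.
    print(size_estimation_function({"content_length": 1_000}))  # 2500.0
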
deltacat/compute/compactor_v2/utils/primary_key_index.py CHANGED
@@ -27,8 +27,11 @@ def _append_sha1_hash_to_table(table: pa.Table, hash_column: pa.Array) -> pa.Tab
 
     result = []
     for hash_value in hash_column_np:
-        assert hash_value is not None, f"Expected non-null primary key"
-        result.append(hashlib.sha1(hash_value.encode("utf-8")).hexdigest())
+        if hash_value is None:
+            result.append(None)
+            logger.info("A primary key hash is null")
+        else:
+            result.append(hashlib.sha1(hash_value.encode("utf-8")).hexdigest())
 
     return sc.append_pk_hash_string_column(table, result)
 
@@ -191,7 +194,7 @@
             pk_columns.append(sliced_string_cast(table[pk_name]))
 
         pk_columns.append(PK_DELIMITER)
-        hash_column = pc.binary_join_element_wise(*pk_columns)
+        hash_column = pc.binary_join_element_wise(*pk_columns, null_handling="replace")
         return hash_column
 
     def _generate_uuid(table: pa.Table) -> pa.Array:
@@ -345,8 +348,10 @@
     return range(hb_group, num_buckets, num_groups)
 
 
-def pk_digest_to_hash_bucket_index(digest: str, num_buckets: int) -> int:
+def pk_digest_to_hash_bucket_index(digest: Optional[str], num_buckets: int) -> int:
     """
     Generates the hash bucket index from the given digest.
     """
+    if digest is None:
+        return 0
     return int(digest, 16) % num_buckets
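
Note: with the change above, a null primary-key hash now maps to bucket 0 instead of failing an assertion. A standalone sketch of that null-tolerant path (logic mirrored from the hunk, not imported from deltacat):

    import hashlib
    from typing import Optional

    def pk_digest_to_hash_bucket_index(digest: Optional[str], num_buckets: int) -> int:
        # A missing primary-key digest is routed to bucket 0.
        if digest is None:
            return 0
        return int(digest, 16) % num_buckets

    digest = hashlib.sha1("pk1|pk2".encode("utf-8")).hexdigest()
    print(pk_digest_to_hash_bucket_index(digest, num_buckets=8))
    print(pk_digest_to_hash_bucket_index(None, num_buckets=8))  # 0
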