deltacat 1.1.18__py3-none-any.whl → 1.1.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/model/compact_partition_params.py +76 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
- deltacat/compute/compactor/model/delta_annotated.py +16 -9
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +11 -5
- deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
- deltacat/compute/compactor_v2/utils/io.py +28 -14
- deltacat/compute/compactor_v2/utils/task_options.py +128 -183
- deltacat/compute/resource_estimation/__init__.py +27 -0
- deltacat/compute/resource_estimation/delta.py +271 -0
- deltacat/compute/resource_estimation/manifest.py +394 -0
- deltacat/compute/resource_estimation/model.py +165 -0
- deltacat/compute/resource_estimation/parquet.py +108 -0
- deltacat/constants.py +5 -0
- deltacat/logs.py +8 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
- deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
- deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
- deltacat/tests/compute/test_util_common.py +2 -0
- deltacat/tests/test_logs.py +34 -0
- deltacat/tests/test_utils/pyarrow.py +15 -5
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/METADATA +2 -2
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/RECORD +30 -46
- deltacat/compute/metastats/meta_stats.py +0 -479
- deltacat/compute/metastats/model/__init__.py +0 -0
- deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
- deltacat/compute/metastats/stats.py +0 -182
- deltacat/compute/metastats/utils/__init__.py +0 -0
- deltacat/compute/metastats/utils/constants.py +0 -16
- deltacat/compute/metastats/utils/io.py +0 -223
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
- deltacat/compute/metastats/utils/ray_utils.py +0 -129
- deltacat/compute/stats/basic.py +0 -226
- deltacat/compute/stats/models/__init__.py +0 -0
- deltacat/compute/stats/models/delta_column_stats.py +0 -98
- deltacat/compute/stats/models/delta_stats.py +0 -233
- deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
- deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
- deltacat/compute/stats/models/stats_result.py +0 -104
- deltacat/compute/stats/utils/__init__.py +0 -0
- deltacat/compute/stats/utils/intervals.py +0 -94
- deltacat/compute/stats/utils/io.py +0 -230
- deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
- deltacat/tests/stats/__init__.py +0 -0
- deltacat/tests/stats/test_intervals.py +0 -49
- /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
- /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/LICENSE +0 -0
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/WHEEL +0 -0
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/top_level.txt +0 -0
deltacat/compute/resource_estimation/model.py
ADDED
@@ -0,0 +1,165 @@
+from __future__ import annotations
+from enum import Enum
+from typing import Optional
+
+
+class ResourceEstimationMethod(str, Enum):
+    """
+    The default approach executes certain methods in a specific order until the size
+    is estimated by any. The order is as follows:
+    1. CONTENT_TYPE_META
+    2. PREVIOUS_INFLATION
+    This method expects previous inflation and average record bytes to be passed.
+    """
+
+    DEFAULT = "DEFAULT"
+
+    """
+    This approach combines intelligent estimation and inflation based methods
+    and runs them in the order specified below:
+    1. INTELLIGENT_ESTIMATION
+    2. FILE_SAMPLING
+    3. PREVIOUS_INFLATION
+    """
+    DEFAULT_V2 = "DEFAULT_V2"
+
+    """
+    This approach strictly uses previous inflation and average record size to arrive
+    at a resource estimate. It requires users to pass in previous inflation and average
+    record sizes.
+    """
+    PREVIOUS_INFLATION = "PREVIOUS_INFLATION"
+
+    """
+    This approach is similar to PREVIOUS_INFLATION, but it determines average record size
+    and previous inflation by sampling few files in the given set of files.
+    """
+    FILE_SAMPLING = "FILE_SAMPLING"
+
+    """
+    This approach leverages metadata present in content type params.
+    """
+    CONTENT_TYPE_META = "CONTENT_TYPE_META"
+
+    """
+    This approach leverages parquet metadata and granularly estimate resources for each column and
+    then aggregate to arrive at most accurate estimation.
+    """
+    INTELLIGENT_ESTIMATION = "INTELLIGENT_ESTIMATION"
+
+
+class EstimateResourcesParams(dict):
+    """
+    This class represents the parameters required for estimating resources.
+    """
+
+    @staticmethod
+    def of(
+        resource_estimation_method: ResourceEstimationMethod = ResourceEstimationMethod.DEFAULT,
+        previous_inflation: Optional[float] = None,
+        parquet_to_pyarrow_inflation: Optional[float] = None,
+        average_record_size_bytes: Optional[float] = None,
+        max_files_to_sample: Optional[int] = None,
+    ) -> EstimateResourcesParams:
+        result = EstimateResourcesParams()
+        result["previous_inflation"] = previous_inflation
+        result["parquet_to_pyarrow_inflation"] = parquet_to_pyarrow_inflation
+        result["resource_estimation_method"] = resource_estimation_method
+        result["max_files_to_sample"] = max_files_to_sample
+        result["average_record_size_bytes"] = average_record_size_bytes
+        return result
+
+    @property
+    def resource_estimation_method(self) -> ResourceEstimationMethod:
+        return self["resource_estimation_method"]
+
+    @property
+    def max_files_to_sample(self) -> Optional[int]:
+        """
+        Applicable only for FILE_SAMPLING method. This parameter controls the
+        number of files to sample to arrive at average record sizes and previous inflation.
+        """
+        return self.get("max_files_to_sample")
+
+    @property
+    def previous_inflation(self) -> Optional[float]:
+        """
+        This parameter is required for PREVIOUS_INFLATION method. The inflation factor determines
+        a ratio of in-memory size to the on-disk size.
+        """
+        return self.get("previous_inflation")
+
+    @property
+    def parquet_to_pyarrow_inflation(self) -> Optional[float]:
+        """
+        This parameter is required for INTELLIGENT_ESTIMATION or CONTENT_TYPE_META method.
+        This determines inflation factor for parquet estimated size to pyarrow in-memory table size.
+        """
+        return self.get("parquet_to_pyarrow_inflation")
+
+    @property
+    def average_record_size_bytes(self) -> Optional[float]:
+        """
+        This parameter is required for PREVIOUS_INFLATION method. This determines average size of
+        records in bytes in a given file or entity.
+        """
+        return self.get("average_record_size_bytes")
+
+
+class OperationType(str, Enum):
+    """
+    This operation type is used when user would download the given entities using pyarrow library.
+    """
+
+    PYARROW_DOWNLOAD = "DOWNLOAD"
+
+
+class EstimatedResources(dict):
+    """
+    This class represents the resource requirements for a certain type of operation.
+    For example, downloading a delta requires certain amount of memory.
+    """
+
+    @staticmethod
+    def of(memory_bytes: float, statistics: Statistics = None) -> EstimatedResources:
+        result = EstimatedResources()
+        result["memory_bytes"] = memory_bytes
+        result["statistics"] = statistics
+        return result
+
+    @property
+    def memory_bytes(self) -> float:
+        return self["memory_bytes"]
+
+    @property
+    def statistics(self) -> Optional[Statistics]:
+        return self.get("statistics")
+
+
+class Statistics(dict):
+    """
+    This class represents the statistics of underlying objects that was used
+    to estimate the resource required.
+    """
+
+    @staticmethod
+    def of(
+        in_memory_size_bytes: float, record_count: int, on_disk_size_bytes: float
+    ) -> Statistics:
+        result = Statistics()
+        result["in_memory_size_bytes"] = in_memory_size_bytes
+        result["record_count"] = record_count
+        result["on_disk_size_bytes"] = on_disk_size_bytes
+        return result
+
+    @property
+    def in_memory_size_bytes(self) -> float:
+        return self["in_memory_size_bytes"]
+
+    @property
+    def record_count(self) -> int:
+        return self["record_count"]
+
+    @property
+    def on_disk_size_bytes(self) -> float:
+        return self["on_disk_size_bytes"]
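The new model classes above are plain dicts exposing of() factories and typed property accessors, matching the convention used elsewhere in DeltaCAT. A minimal sketch of how they compose, assuming direct imports from the new module (all concrete values below are illustrative, not defaults):

from deltacat.compute.resource_estimation.model import (
    EstimatedResources,
    EstimateResourcesParams,
    ResourceEstimationMethod,
    Statistics,
)

# Parameters for a purely inflation-based estimate.
params = EstimateResourcesParams.of(
    resource_estimation_method=ResourceEstimationMethod.PREVIOUS_INFLATION,
    previous_inflation=2.5,  # ratio of in-memory size to on-disk size
    average_record_size_bytes=512.0,
)

# The shape an estimator would return: required memory plus the statistics
# of the underlying object used to derive it.
estimate = EstimatedResources.of(
    memory_bytes=2 * 1024**3,
    statistics=Statistics.of(
        in_memory_size_bytes=2 * 1024**3,
        record_count=1_000_000,
        on_disk_size_bytes=820 * 1024**2,
    ),
)

assert params.resource_estimation_method == ResourceEstimationMethod.PREVIOUS_INFLATION
assert estimate.statistics.record_count == 1_000_000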
deltacat/compute/resource_estimation/parquet.py
ADDED
@@ -0,0 +1,108 @@
+import logging
+from typing import Optional
+from deltacat import logs
+from pyarrow.parquet import ColumnChunkMetaData
+from deltacat.constants import NULL_SIZE_BYTES
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def _observed_string_size(min_value: str, max_value: str) -> float:
+    """
+    Pyarrow uses few additional bytes to store each string.
+    """
+    return (len(min_value) + len(max_value)) / 2 + 4
+
+
+def _int96_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return column_chunk_metadata.num_values * 12
+
+
+def _int64_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return column_chunk_metadata.num_values * 8
+
+
+def _int32_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return column_chunk_metadata.num_values * 4
+
+
+def _boolean_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return column_chunk_metadata.num_values
+
+
+def _double_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return column_chunk_metadata.num_values * 8
+
+
+def _float_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return column_chunk_metadata.num_values * 4
+
+
+def _byte_array_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    uncompressed_size = column_chunk_metadata.total_uncompressed_size
+    if column_chunk_metadata.is_stats_set:
+        statistics = column_chunk_metadata.statistics
+        if (
+            statistics.has_min_max
+            and isinstance(statistics.min, str)
+            and isinstance(statistics.max, str)
+        ):
+            return max(
+                uncompressed_size,
+                (
+                    statistics.num_values
+                    * _observed_string_size(statistics.min, statistics.max)
+                    + statistics.null_count * NULL_SIZE_BYTES
+                ),
+            )
+        else:
+            # A case of decimal
+            return max(column_chunk_metadata.num_values * 16, uncompressed_size)
+    else:
+        return uncompressed_size
+
+
+def _fixed_len_byte_array_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return _byte_array_size_estimator(column_chunk_metadata)
+
+
+_PHYSICAL_TYPE_TO_SIZE_ESTIMATOR = {
+    "INT96": _int96_size_estimator,
+    "INT64": _int64_size_estimator,
+    "INT32": _int32_size_estimator,
+    "BOOLEAN": _boolean_size_estimator,
+    "DOUBLE": _double_size_estimator,
+    "FLOAT": _float_size_estimator,
+    "BYTE_ARRAY": _byte_array_size_estimator,
+    "FIXED_LEN_BYTE_ARRAY": _fixed_len_byte_array_size_estimator,
+}
+
+
+def parquet_column_chunk_size_estimator(
+    column_meta: ColumnChunkMetaData,
+) -> Optional[float]:
+    physical_type = column_meta.physical_type
+    if physical_type in _PHYSICAL_TYPE_TO_SIZE_ESTIMATOR:
+        return _PHYSICAL_TYPE_TO_SIZE_ESTIMATOR[physical_type](column_meta)
+    else:
+        logger.warning(
+            f"Unsupported physical type: {physical_type}. "
+            "Returning total_uncompressed_size."
+        )
+        return column_meta.total_uncompressed_size
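These per-physical-type estimators each operate on a single parquet column chunk. A sketch of how they might be driven across a whole file via pyarrow's metadata API ("data.parquet" is a placeholder path; whether manifest.py aggregates exactly this way is not shown in this diff):

import pyarrow.parquet as pq

from deltacat.compute.resource_estimation.parquet import (
    parquet_column_chunk_size_estimator,
)

metadata = pq.ParquetFile("data.parquet").metadata

estimated_in_memory_bytes = 0.0
for rg in range(metadata.num_row_groups):
    row_group = metadata.row_group(rg)
    for col in range(row_group.num_columns):
        # Dispatches on the chunk's physical type; unsupported types fall
        # back to total_uncompressed_size with a warning.
        estimated_in_memory_bytes += parquet_column_chunk_size_estimator(
            row_group.column(col)
        )

print(f"estimated pyarrow table size: {estimated_in_memory_bytes:.0f} bytes")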
deltacat/constants.py
CHANGED
@@ -28,6 +28,8 @@ DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME = env_string(
     "DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME",
     "application.debug.log",
 )
+# A json context which will be logged along with other context args.
+DELTACAT_LOGGER_CONTEXT = env_string("DELTACAT_LOGGER_CONTEXT", None)
 
 # Byte Units
 BYTES_PER_KIBIBYTE = 2**10
@@ -53,3 +55,6 @@ PYARROW_INFLATION_MULTIPLIER = 2.5
 PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS = 6
 
 MEMORY_TO_HASH_BUCKET_COUNT_RATIO = 0.0512 * BYTES_PER_TEBIBYTE
+
+# The number of bytes allocated to null values in string physical type in parquet
+NULL_SIZE_BYTES = 4
deltacat/logs.py
CHANGED
@@ -17,6 +17,7 @@ from deltacat.constants import (
     DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME,
     DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
     DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
+    DELTACAT_LOGGER_CONTEXT,
 )
 
 DEFAULT_LOG_LEVEL = "INFO"
@@ -66,6 +67,13 @@ class JsonFormatter(logging.Formatter):
         self.ray_runtime_ctx = None
         self.context = {}
 
+        if DELTACAT_LOGGER_CONTEXT is not None:
+            try:
+                env_context = json.loads(DELTACAT_LOGGER_CONTEXT)
+                self.additional_context.update(env_context)
+            except Exception:
+                pass
+
     def usesTime(self) -> bool:
         """
         Overwritten to look for the attribute in the format dict values instead of the fmt string.
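Together, the constants.py and logs.py changes let operators inject static fields into every JSON-formatted log record through the environment. A hedged sketch of opting in (the key names are illustrative): since DELTACAT_LOGGER_CONTEXT is read via env_string at import time, the variable must be set before deltacat is first imported, and a malformed value is silently ignored by the try/except above.

import json
import os

# Any JSON object works; these keys are illustrative.
os.environ["DELTACAT_LOGGER_CONTEXT"] = json.dumps(
    {"job_id": "backfill-2024-10-01", "cluster": "dev"}
)

import deltacat  # noqa: E402  # JsonFormatter now merges the fields above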
deltacat/tests/compute/compactor_v2/test_compaction_session.py
CHANGED
@@ -19,6 +19,7 @@ from deltacat.tests.test_utils.utils import read_s3_contents
 from deltacat.tests.compute.test_util_constant import (
     TEST_S3_RCF_BUCKET_NAME,
 )
+from deltacat.compute.resource_estimation import ResourceEstimationMethod
 from deltacat.tests.compute.test_util_common import get_rcf
 from deltacat.tests.test_utils.pyarrow import (
     stage_partition_from_file_paths,
@@ -399,3 +400,159 @@ class TestCompactionSession:
         assert compaction_audit.output_file_count == 2
         assert abs(compaction_audit.output_size_bytes - 1843) / 1843 <= self.ERROR_RATE
         assert abs(compaction_audit.input_size_bytes - 2748) / 2748 <= self.ERROR_RATE
+
+    def test_compact_partition_when_incremental_then_intelligent_estimation_sanity(
+        self, s3_resource, local_deltacat_storage_kwargs
+    ):
+        """
+        A test case which asserts the RCF stats are correctly generated for
+        a rebase and incremental use-case.
+        """
+
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # action
+        compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 2,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
+                }
+            )
+        )
+
+    def test_compact_partition_when_incremental_then_content_type_meta_estimation_sanity(
+        self, s3_resource, local_deltacat_storage_kwargs
+    ):
+        """
+        A test case which asserts the RCF stats are correctly generated for
+        a rebase and incremental use-case.
+        """
+
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # action
+        compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 2,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.CONTENT_TYPE_META,
+                }
+            )
+        )
+
+    def test_compact_partition_when_incremental_then_previous_inflation_estimation_sanity(
+        self, s3_resource, local_deltacat_storage_kwargs
+    ):
+        """
+        A test case which asserts the RCF stats are correctly generated for
+        a rebase and incremental use-case.
+        """
+
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # action
+        compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 2,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
deltacat/tests/compute/compactor_v2/utils/test_task_options.py
CHANGED
@@ -1,6 +1,6 @@
 import unittest
 import ray
-from deltacat.compute.compactor_v2.utils.task_options import
+from deltacat.compute.compactor_v2.utils.task_options import _get_task_options
 
 
 @ray.remote
@@ -20,14 +20,14 @@ class TestTaskOptions(unittest.TestCase):
         super().setUpClass()
 
     def test_get_task_options_sanity(self):
-        opts =
+        opts = _get_task_options(0.01, 0.01)
         result_ref = valid_func.options(**opts).remote()
         result = ray.get(result_ref)
 
         self.assertEqual(result, 2)
 
     def test_get_task_options_when_exception_is_thrown(self):
-        opts =
+        opts = _get_task_options(0.01, 0.01)
         result_ref = throwing_func.options(**opts).remote()
 
         self.assertRaises(ConnectionAbortedError, lambda: ray.get(result_ref))
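The updated tests pin down the contract of _get_task_options: it returns a plain options dict that can be splatted into Ray's .options(). A sketch mirroring the sanity test above (the meaning of the two 0.01 arguments, presumably small CPU and memory requests, is an assumption not spelled out in this diff):

import ray

from deltacat.compute.compactor_v2.utils.task_options import _get_task_options

ray.init(ignore_reinit_error=True)


@ray.remote
def add_one(x: int) -> int:
    return x + 1


# Same arguments as the tests above; assumed to be tiny resource requests.
opts = _get_task_options(0.01, 0.01)
result = ray.get(add_one.options(**opts).remote(41))
assert result == 42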