deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +3 -2
- deltacat/aws/clients.py +123 -3
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat/benchmarking/conftest.py +61 -0
- deltacat/catalog/delegate.py +1 -1
- deltacat/catalog/interface.py +1 -1
- deltacat/compute/compactor/__init__.py +0 -3
- deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- deltacat/compute/compactor/model/primary_key_index.py +1 -1
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +5 -3
- deltacat/compute/compactor/steps/dedupe.py +10 -8
- deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat/compute/compactor/utils/sort_key.py +57 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/__init__.py +0 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +11 -2
- deltacat/storage/model/sort_key.py +33 -0
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/__init__.py +0 -0
- deltacat/tests/aws/test_clients.py +80 -0
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/performance.py +4 -2
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
- deltacat-0.1.18b15.dist-info/RECORD +176 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13.dist-info/RECORD +0 -136
- /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -28,7 +28,6 @@ from deltacat.catalog.model.catalog import ( # noqa: F401
     init,
 )
 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.compute.compactor import SortKey, SortOrder
 from deltacat.storage import (
     DistributedDataset,
     LifecycleState,
@@ -37,13 +36,15 @@ from deltacat.storage import (
     LocalTable,
     Namespace,
     SchemaConsistencyType,
+    SortKey,
+    SortOrder,
 )
 from deltacat.types.media import ContentEncoding, ContentType, TableType
 from deltacat.types.tables import TableWriteMode

 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))

-__version__ = "0.1.18b13"
+__version__ = "0.1.18b15"


 __all__ = [
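The same relocation of `SortKey`/`SortOrder` appears throughout this release (the file list above shows `deltacat/storage/model/sort_key.py` added and `deltacat/compute/compactor/model/sort_key.py` removed). A minimal, hedged migration sketch for downstream imports, assuming only the old and new locations shown in this diff:

```python
# Hedged sketch: prefer the new deltacat.storage export (0.1.18b15+), falling back
# to the compactor export that 0.1.18b13 still provided.
try:
    from deltacat.storage import SortKey, SortOrder  # new location in this release
except ImportError:
    from deltacat.compute.compactor import SortKey, SortOrder  # pre-0.1.18b15 location

print(SortKey, SortOrder)
```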
deltacat/aws/clients.py CHANGED
@@ -1,22 +1,142 @@
 import logging
 from functools import lru_cache
-from typing import Optional
+from typing import Optional, FrozenSet
+from http import HTTPStatus

 import boto3
 from boto3.exceptions import ResourceNotExistsError
 from boto3.resources.base import ServiceResource
 from botocore.client import BaseClient
 from botocore.config import Config
+from requests.adapters import Response
+from tenacity import (
+    RetryError,
+    Retrying,
+    wait_fixed,
+    retry_if_exception,
+    stop_after_delay,
+)

 from deltacat import logs
 from deltacat.aws.constants import BOTO_MAX_RETRIES
+import requests
+

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

 BOTO3_PROFILE_NAME_KWARG_KEY = "boto3_profile_name"
+INSTANCE_METADATA_SERVICE_IPV4_URI = "http://169.254.169.254/latest/meta-data/"  # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+RETRYABLE_HTTP_STATUS_CODES = [
+    # 429
+    HTTPStatus.TOO_MANY_REQUESTS,
+    # 5xx
+    HTTPStatus.INTERNAL_SERVER_ERROR,
+    HTTPStatus.NOT_IMPLEMENTED,
+    HTTPStatus.BAD_GATEWAY,
+    HTTPStatus.SERVICE_UNAVAILABLE,
+    HTTPStatus.GATEWAY_TIMEOUT,
+]
+
+
+class RetryIfRetryableHTTPStatusCode(retry_if_exception):
+    """
+    Retry strategy that retries if the exception is an ``HTTPError`` with
+    a status code in the retryable errors list.
+    """
+
+    def __init__(self):
+        def is_retryable_error(exception):
+            return (
+                isinstance(exception, requests.exceptions.HTTPError)
+                and exception.response.status_code in RETRYABLE_HTTP_STATUS_CODES
+            )
+
+        super().__init__(predicate=is_retryable_error)
+
+
+def _log_attempt_number(retry_state):
+    """return the result of the last call attempt"""
+    logger.warning(f"Retrying: {retry_state.attempt_number}...")
+
+
+def _get_url(url: str, get_url_kwargs=None):
+    if get_url_kwargs is None:
+        get_url_kwargs = {}
+    resp = requests.get(url, **get_url_kwargs)
+    resp.raise_for_status()
+    return resp
+
+
+def retrying_get(
+    url: str,
+    retry_strategy,
+    wait_strategy,
+    stop_strategy,
+    short_circuit_on_status: FrozenSet[int] = {HTTPStatus.OK},
+) -> Optional[Response]:
+    """Retries a request to the given URL until it succeeds.
+
+    Args:
+        retry_strategy (Callable): A function that returns a retry strategy.
+        wait_strategy (Callable): A function that returns a wait strategy.
+        stop_strategy (Callable): A function that returns a stop strategy.
+        url (str): The URL to retry.
+
+    Returns:
+        Optional[Response]: The response from the URL, or None if the request
+        failed after the maximum number of retries.
+    """
+    try:
+        resp = _get_url(url)
+        if resp.status_code in short_circuit_on_status:
+            return resp
+        for attempt in Retrying(
+            retry=retry_strategy(),
+            wait=wait_strategy,
+            stop=stop_strategy,
+            after=_log_attempt_number,
+        ):
+            with attempt:
+                resp = _get_url(url)
+        return resp
+    except RetryError as re:
+        logger.error(f"Failed to retry URL: {url} - {re}")
+    logger.info(f"Unable to get from URL: {url}")
+    return None
+
+
+def block_until_instance_metadata_service_returns_success(
+    url=INSTANCE_METADATA_SERVICE_IPV4_URI,
+    retry_strategy=RetryIfRetryableHTTPStatusCode,
+    wait_strategy=wait_fixed(2),  # wait 2 seconds before retrying,
+    stop_strategy=stop_after_delay(60 * 10),  # stop trying after 10 minutes
+) -> Optional[Response]:
+    """Blocks until the instance metadata service returns a successful response.
+
+    Args:
+        retry_strategy (Callable): A function that returns a retry strategy.
+        wait_strategy (Callable): A function that returns a wait strategy.
+        stop_strategy (Callable): A function that returns a stop strategy.
+        url (str): The URL of the instance metadata service.
+
+    Returns:
+        Optional[Response]: The response from the instance metadata service,
+        or None if the request failed after the maximum number of retries.
+
+    https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+    """
+    # We will get a 403 HTTP status code if running deltacat not in an EC2 instance. In that case we won't want to block.
+    return retrying_get(
+        url,
+        retry_strategy,
+        wait_strategy,
+        stop_strategy,
+        short_circuit_on_status={HTTPStatus.OK, HTTPStatus.FORBIDDEN},
+    )


 def _get_session_from_kwargs(input_kwargs):
+    block_until_instance_metadata_service_returns_success()
     if input_kwargs.get(BOTO3_PROFILE_NAME_KWARG_KEY) is not None:
         boto3_session = boto3.Session(
             profile_name=input_kwargs.get(BOTO3_PROFILE_NAME_KWARG_KEY)
@@ -30,7 +150,7 @@ def _get_session_from_kwargs(input_kwargs):
 def _resource(name: str, region: Optional[str], **kwargs) -> ServiceResource:
     boto3_session = _get_session_from_kwargs(kwargs)

-    boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "
+    boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
     return boto3_session.resource(
         name,
         region,
@@ -47,7 +167,7 @@ def _client(name: str, region: Optional[str], **kwargs) -> BaseClient:
     # fall back for clients without an associated resource
     boto3_session = _get_session_from_kwargs(kwargs)
     boto_config = Config(
-        retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "
+        retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"}
     )
     return boto3_session.client(
         name,

deltacat/aws/redshift/model/manifest.py CHANGED
@@ -170,6 +170,10 @@ class ManifestMeta(dict):
     def content_type_parameters(self) -> Optional[List[Dict[str, str]]]:
         return self.get("content_type_parameters")

+    @content_type_parameters.setter
+    def content_type_parameters(self, params: List[Dict[str, str]]) -> None:
+        self["content_type_parameters"] = params
+
     @property
     def credentials(self) -> Optional[Dict[str, str]]:
         return self.get("credentials")
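A hedged usage sketch of the retry gate added to deltacat/aws/clients.py above; it assumes the helpers are importable exactly as introduced in this diff and uses the same tenacity strategy objects the new code uses. This is illustrative only, not part of the package:

```python
# Hedged sketch: exercise the new instance-metadata gate with tighter limits than
# the defaults shown above (2s fixed wait, 10 minute stop).
from http import HTTPStatus

from tenacity import stop_after_delay, wait_fixed

from deltacat.aws.clients import (
    RetryIfRetryableHTTPStatusCode,
    block_until_instance_metadata_service_returns_success,
    retrying_get,
)

# Per the comment in the diff, a 403 from the metadata service (e.g. when not on
# EC2) is treated as a short-circuit status rather than a retryable failure.
# Note: on hosts where 169.254.169.254 is unreachable, the underlying
# requests.get has no timeout, so this call may block on the initial connect.
resp = block_until_instance_metadata_service_returns_success(
    wait_strategy=wait_fixed(1),
    stop_strategy=stop_after_delay(30),
)
print(None if resp is None else resp.status_code)

# retrying_get is the general-purpose helper underneath; it retries only on the
# status codes listed in RETRYABLE_HTTP_STATUS_CODES and returns None on RetryError.
resp = retrying_get(
    "http://169.254.169.254/latest/meta-data/",
    RetryIfRetryableHTTPStatusCode,
    wait_fixed(2),
    stop_after_delay(60),
    short_circuit_on_status={HTTPStatus.OK, HTTPStatus.FORBIDDEN},
)
```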
deltacat/aws/s3u.py CHANGED
@@ -3,6 +3,8 @@ import multiprocessing
 from functools import partial
 from typing import Any, Callable, Dict, Generator, List, Optional, Union
 from uuid import uuid4
+from botocore.config import Config
+from deltacat.aws.constants import BOTO_MAX_RETRIES

 import pyarrow as pa
 import ray
@@ -39,6 +41,7 @@ from deltacat.types.tables import (
     TABLE_TYPE_TO_READER_FUNC,
     get_table_length,
 )
+from deltacat.types.partial_download import PartialFileDownloadParams
 from deltacat.utils.common import ReadKwargsProvider

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -197,6 +200,7 @@ def read_file(
     column_names: Optional[List[str]] = None,
     include_columns: Optional[List[str]] = None,
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
     **s3_client_kwargs,
 ) -> LocalTable:

@@ -209,6 +213,7 @@ def read_file(
             column_names,
             include_columns,
             file_reader_kwargs_provider,
+            partial_file_download_params,
             **s3_client_kwargs,
         )
         return table
@@ -217,6 +222,13 @@ def read_file(
             # Timeout error not caught by botocore
             raise RetryableError(f"Retry table download from: {s3_url}") from e
         raise NonRetryableError(f"Failed table download from: {s3_url}") from e
+    except BaseException as e:
+        logger.warn(
+            f"Read has failed for {s3_url} and content_type={content_type} "
+            f"and encoding={content_encoding}. Error: {e}",
+            exc_info=True,
+        )
+        raise e


 def upload_sliced_table(
@@ -385,14 +397,16 @@ def download_manifest_entry(
     content_encoding: Optional[ContentEncoding] = None,
 ) -> LocalTable:

+    conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
     s3_client_kwargs = (
         {
             "aws_access_key_id": token_holder["accessKeyId"],
             "aws_secret_access_key": token_holder["secretAccessKey"],
             "aws_session_token": token_holder["sessionToken"],
+            "config": conf,
         }
         if token_holder
-        else {}
+        else {"config": conf}
     )
     if not content_type:
         content_type = manifest_entry.meta.content_type
@@ -409,6 +423,14 @@ def download_manifest_entry(
     s3_url = manifest_entry.uri
     if s3_url is None:
         s3_url = manifest_entry.url
+
+    partial_file_download_params = None
+    if manifest_entry.meta and manifest_entry.meta.content_type_parameters:
+        for type_params in manifest_entry.meta.content_type_parameters:
+            if isinstance(type_params, PartialFileDownloadParams):
+                partial_file_download_params = type_params
+                break
+
     # @retry decorator can't be pickled by Ray, so wrap download in Retrying
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
@@ -424,6 +446,7 @@ def download_manifest_entry(
         column_names,
         include_columns,
         file_reader_kwargs_provider,
+        partial_file_download_params,
         **s3_client_kwargs,
     )
     return table

deltacat/benchmarking/benchmark_parquet_reads.py ADDED
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import pytest
+
+
+# Benchmarks for retrieving a single column in the Parquet file
+SINGLE_COLUMN_BENCHMARKS = {
+    "mvp": ("s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet", ["a"]),
+    "TPCH-lineitems-200MB-2RG": (
+        "s3://daft-public-data/test_fixtures/parquet-dev/daft_200MB_lineitem_chunk.RG-2.parquet",
+        ["L_ORDERKEY"],
+    ),
+}
+
+# Benchmarks for retrieving all columns in the Parquet file
+ALL_COLUMN_BENCHMARKS = {
+    "mvp": ("s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet", None),
+    "TPCH-lineitems-200MB-2RG": (
+        "s3://daft-public-data/test_fixtures/parquet-dev/daft_200MB_lineitem_chunk.RG-2.parquet",
+        None,
+    ),
+}
+
+
+@pytest.mark.benchmark(group="num_rowgroups_single_column")
+@pytest.mark.parametrize(
+    ["name", "path", "columns"],
+    [
+        (name, path, columns)
+        for name, (path, columns) in SINGLE_COLUMN_BENCHMARKS.items()
+    ],
+    ids=[name for name in SINGLE_COLUMN_BENCHMARKS],
+)
+def test_read_parquet_num_rowgroups_single_column(
+    name, path, columns, read_fn, benchmark
+):
+    data = benchmark(read_fn, path, columns=columns)
+    if columns is not None:
+        assert data.column_names == columns
+
+
+@pytest.mark.benchmark(group="num_rowgroups_all_columns")
+@pytest.mark.parametrize(
+    ["name", "path", "columns"],
+    [(name, path, columns) for name, (path, columns) in ALL_COLUMN_BENCHMARKS.items()],
+    ids=[name for name in ALL_COLUMN_BENCHMARKS],
)
+def test_read_parquet_num_rowgroups_all_columns(
+    name, path, columns, read_fn, benchmark
+):
+    data = benchmark(read_fn, path, columns=columns)
+    if columns is not None:
+        assert data.column_names == columns

deltacat/benchmarking/conftest.py ADDED
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+import pyarrow as pa
+import pyarrow.fs as pafs
+import pyarrow.parquet as papq
+import pytest
+
+from deltacat.utils.pyarrow import s3_file_to_table
+from deltacat.types.media import (
+    ContentEncoding,
+    ContentType,
+)
+
+
+def pyarrow_read(path: str, columns: list[str] | None = None) -> pa.Table:
+    assert path.startswith(
+        "s3://"
+    ), f"Expected file path to start with 's3://', but got {path}."
+    fs = pafs.S3FileSystem()
+    path = path.replace("s3://", "")
+    return papq.read_table(path, columns=columns, filesystem=fs)
+
+
+def deltacat_read(path: str, columns: list[str] | None = None) -> pa.Table:
+    assert path.startswith("s3://")
+    return s3_file_to_table(
+        path,
+        content_type=ContentType.PARQUET,
+        content_encoding=ContentEncoding.IDENTITY,
+        column_names=None,  # Parquet files are schemaful
+        include_columns=columns,
+    )
+
+
+def daft_table_read(path: str, columns: list[str] | None = None) -> pa.Table:
+    try:
+        import daft
+    except ImportError:
+        raise ImportError(
+            "Daft not installed. Install Daft using pip to run these benchmarks: `pip install getdaft`"
+        )
+
+    tbl = daft.table.Table.read_parquet(path, columns=columns)
+    return tbl.to_arrow()
+
+
+@pytest.fixture(
+    params=[
+        daft_table_read,
+        pyarrow_read,
+        deltacat_read,
+    ],
+    ids=[
+        "daft_table",
+        "pyarrow",
+        "deltacat",
+    ],
+)
+def read_fn(request):
+    """Fixture which returns the function to read a PyArrow table from a path"""
+    return request.param
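A hedged sketch of how the two new benchmarking modules fit together: the `read_fn` fixture in conftest.py parametrizes each test in benchmark_parquet_reads.py across the three readers. It assumes pytest with the pytest-benchmark plugin and access to the public S3 fixtures; the readers can also be called directly:

```python
# Illustrative only; not part of the package. Requires S3 access to the
# daft-public-data fixtures referenced in benchmark_parquet_reads.py.
import pytest

from deltacat.benchmarking.conftest import pyarrow_read

tbl = pyarrow_read(
    "s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet", columns=["a"]
)
print(tbl.num_rows, tbl.column_names)

# Run the benchmarks themselves (path is relative to the source checkout);
# --benchmark-only and --benchmark-group-by are pytest-benchmark options.
pytest.main(
    [
        "deltacat/benchmarking/benchmark_parquet_reads.py",
        "--benchmark-only",
        "--benchmark-group-by=group",
    ]
)
```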
deltacat/catalog/delegate.py CHANGED
@@ -5,7 +5,7 @@ import ray

 from deltacat.catalog.model.catalog import Catalog, all_catalogs
 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.storage.model.list_result import ListResult
 from deltacat.storage.model.namespace import Namespace
 from deltacat.storage.model.types import (
deltacat/catalog/interface.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Set, Union
 import pyarrow as pa

 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.storage.model.list_result import ListResult
 from deltacat.storage.model.namespace import Namespace
 from deltacat.storage.model.types import (

deltacat/compute/compactor/__init__.py CHANGED
@@ -13,7 +13,6 @@ from deltacat.compute.compactor.model.round_completion_info import (
     RoundCompletionInfo,
     HighWatermark,
 )
-from deltacat.compute.compactor.model.sort_key import SortKey, SortOrder

 __all__ = [
     "DeltaAnnotated",
@@ -27,6 +26,4 @@ __all__ = [
     "PyArrowWriteResult",
     "RoundCompletionInfo",
     "HighWatermark",
-    "SortKey",
-    "SortOrder",
 ]

deltacat/compute/compactor/compaction_session.py CHANGED
@@ -12,8 +12,8 @@ import pyarrow as pa
 from deltacat.compute.compactor import (
     PyArrowWriteResult,
     RoundCompletionInfo,
-    SortKey,
 )
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
 from deltacat.io.object_store import IObjectStore
@@ -50,6 +50,7 @@ from deltacat.utils.metrics import MetricsConfig
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
 )
+from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
 from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes


@@ -59,6 +60,9 @@ if importlib.util.find_spec("memray"):

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

+DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG: int = 1
+DEFAULT_PROPERTIES_ARG: Dict[str, Any] = {}
+

 def check_preconditions(
     source_partition_locator: PartitionLocator,
@@ -67,8 +71,11 @@ def check_preconditions(
     max_records_per_output_file: int,
     new_hash_bucket_count: Optional[int],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> int:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     assert (
         source_partition_locator.partition_values
         == destination_partition_locator.partition_values
@@ -83,10 +90,12 @@ def check_preconditions(
     assert (
         new_hash_bucket_count >= 1
     ), "New hash bucket count must be a positive value"
-    return
+    return validate_sort_keys(
         source_partition_locator,
         sort_keys,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )


@@ -117,9 +126,11 @@ def compact_partition(
     object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
     s3_client_kwargs: Optional[Dict[str, Any]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
     **kwargs,
 ) -> Optional[str]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     if not importlib.util.find_spec("memray"):
         logger.info(f"memray profiler not available, disabling all profiling")
         enable_profiler = False
@@ -161,6 +172,7 @@ def compact_partition(
             object_store,
             s3_client_kwargs,
             deltacat_storage,
+            deltacat_storage_kwargs,
             **kwargs,
         )
         if new_partition:
@@ -172,7 +184,9 @@ def compact_partition(
     round_completion_file_s3_url = None
     if partition:
         logger.info(f"Committing compacted partition to: {partition.locator}")
-        partition = deltacat_storage.commit_partition(
+        partition = deltacat_storage.commit_partition(
+            partition, **deltacat_storage_kwargs
+        )
         logger.info(f"Committed compacted partition: {partition}")

         round_completion_file_s3_url = rcf.write_round_completion_file(
@@ -209,15 +223,16 @@ def _execute_compaction_round(
     object_store: Optional[IObjectStore],
     s3_client_kwargs: Optional[Dict[str, Any]],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
     **kwargs,
 ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     rcf_source_partition_locator = (
         rebase_source_partition_locator
         if rebase_source_partition_locator
         else source_partition_locator
     )
-
     base_audit_url = rcf_source_partition_locator.path(
         f"s3://{compaction_artifact_s3_bucket}/compaction-audit"
     )
@@ -250,6 +265,8 @@ def _execute_compaction_round(
         records_per_compacted_file,
         hash_bucket_count,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )

     # sort primary keys to produce the same pk digest regardless of input order
@@ -329,7 +346,8 @@ def _execute_compaction_round(
         rebase_source_partition_locator,
         rebase_source_partition_high_watermark,
         deltacat_storage,
-
+        deltacat_storage_kwargs,
+        list_deltas_kwargs,
     )

     delta_discovery_end = time.monotonic()
@@ -362,6 +380,8 @@ def _execute_compaction_round(
             compaction_audit,
             hash_bucket_count,
             deltacat_storage=deltacat_storage,
+            deltacat_storage_kwargs=deltacat_storage_kwargs,
+            **kwargs,
         )
         if input_deltas_stats is None
         else io.limit_input_deltas(
@@ -372,6 +392,8 @@ def _execute_compaction_round(
             compaction_audit=compaction_audit,
             input_deltas_stats=input_deltas_stats,
             deltacat_storage=deltacat_storage,
+            deltacat_storage_kwargs=deltacat_storage_kwargs,
+            **kwargs,
         )
     )

@@ -399,9 +421,7 @@ def _execute_compaction_round(
         raise AssertionError(
             "Multiple rounds are not supported. Please increase the cluster size and run again."
         )
-
     hb_start = time.monotonic()
-
     hb_tasks_pending = invoke_parallel(
         items=uniform_deltas,
         ray_task=hb.hash_bucket,
@@ -417,8 +437,9 @@ def _execute_compaction_round(
         read_kwargs_provider=read_kwargs_provider,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
+        **kwargs,
     )
-
     hb_invoke_end = time.monotonic()

     logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
@@ -456,7 +477,6 @@ def _execute_compaction_round(
     )

     compaction_audit.set_input_records(total_hb_record_count.item())
-
     # TODO (pdames): when resources are freed during the last round of hash
     #  bucketing, start running dedupe tasks that read existing dedupe
     #  output from S3 then wait for hash bucketing to finish before continuing
@@ -467,13 +487,14 @@ def _execute_compaction_round(
         compacted_stream_locator.namespace,
         compacted_stream_locator.table_name,
         compacted_stream_locator.table_version,
+        **deltacat_storage_kwargs,
     )
     partition = deltacat_storage.stage_partition(
         stream,
         destination_partition_locator.partition_values,
+        **deltacat_storage_kwargs,
     )
     new_compacted_partition_locator = partition.locator
-
     # parallel step 2:
     # discover records with duplicate primary keys in each hash bucket, and
     # identify the index of records to keep or drop based on sort keys
@@ -482,7 +503,10 @@ def _execute_compaction_round(

     dedupe_start = time.monotonic()
     dd_max_parallelism = int(
-        max_parallelism
+        max_parallelism
+        * kwargs.get(
+            "dd_max_parallelism_ratio", DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG
+        )
     )
     logger.info(
         f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
@@ -526,7 +550,6 @@ def _execute_compaction_round(
     )

     compaction_audit.set_records_deduped(total_dd_record_count.item())
-
     all_mat_buckets_to_obj_id = defaultdict(list)
     for dd_result in dd_results:
         for (
@@ -540,7 +563,6 @@ def _execute_compaction_round(
     logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")

     compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
-
     # TODO(pdames): when resources are freed during the last round of deduping
     #  start running materialize tasks that read materialization source file
     #  tables from S3 then wait for deduping to finish before continuing
@@ -561,7 +583,6 @@ def _execute_compaction_round(
     )

     materialize_start = time.monotonic()
-
     mat_tasks_pending = invoke_parallel(
         items=all_mat_buckets_to_obj_id.items(),
         ray_task=mat.materialize,
@@ -584,6 +605,7 @@ def _execute_compaction_round(
         s3_table_writer_kwargs=s3_table_writer_kwargs,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
     )

     materialize_invoke_end = time.monotonic()
@@ -629,7 +651,9 @@ def _execute_compaction_round(
         f" {record_info_msg}"
     )
     compacted_delta = deltacat_storage.commit_delta(
-        merged_delta,
+        merged_delta,
+        properties=kwargs.get("properties", DEFAULT_PROPERTIES_ARG),
+        **deltacat_storage_kwargs,
     )
     logger.info(f"Committed compacted delta: {compacted_delta}")

@@ -691,10 +715,11 @@ def _execute_compaction_round(

 def compact_partition_from_request(
     compact_partition_params: CompactPartitionParams,
+    *compact_partition_pos_args,
 ) -> Optional[str]:
     """
     Wrapper for compact_partition that allows for the compact_partition parameters to be
-    passed in as a custom dictionary-like CompactPartitionParams object.
+    passed in as a custom dictionary-like CompactPartitionParams object along with any compact_partition positional arguments.
     :param compact_partition_params:
     """
-    return compact_partition(**compact_partition_params)
+    return compact_partition(*compact_partition_pos_args, **compact_partition_params)
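Among the keyword arguments now threaded through the compactor above, `dd_max_parallelism_ratio` and `properties` are new caller-facing knobs: the former scales dedupe parallelism relative to `max_parallelism`, the latter is forwarded to `commit_delta`. A small standalone sketch of the dedupe-parallelism arithmetic exactly as it appears in the diff; the helper function is illustrative, not a deltacat API:

```python
# Reproduces the dd_max_parallelism computation from compaction_session.py above.
DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG: int = 1


def dedupe_parallelism(max_parallelism: int, **kwargs) -> int:
    return int(
        max_parallelism
        * kwargs.get(
            "dd_max_parallelism_ratio", DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG
        )
    )


assert dedupe_parallelism(100) == 100  # default ratio of 1 leaves parallelism unchanged
assert dedupe_parallelism(100, dd_max_parallelism_ratio=0.5) == 50  # run half as many dedupe tasks
```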