deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- deltacat/__init__.py +3 -2
- deltacat/aws/clients.py +123 -3
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat/benchmarking/conftest.py +61 -0
- deltacat/catalog/delegate.py +1 -1
- deltacat/catalog/interface.py +1 -1
- deltacat/compute/compactor/__init__.py +0 -3
- deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- deltacat/compute/compactor/model/primary_key_index.py +1 -1
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +5 -3
- deltacat/compute/compactor/steps/dedupe.py +10 -8
- deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat/compute/compactor/utils/sort_key.py +57 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/__init__.py +0 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +11 -2
- deltacat/storage/model/sort_key.py +33 -0
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/__init__.py +0 -0
- deltacat/tests/aws/test_clients.py +80 -0
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/performance.py +4 -2
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
- deltacat-0.1.18b15.dist-info/RECORD +176 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13.dist-info/RECORD +0 -136
- /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,9 @@
 from __future__ import annotations
 
 import logging
-
+import copy
+from deltacat.types.media import ContentType, ContentEncoding
+from deltacat.types.partial_download import PartialParquetParameters
 from typing import Callable, List, Optional, Union
 
 from deltacat import logs
@@ -64,7 +66,9 @@ class DeltaAnnotated(Delta):
         annotated_deltas: List[DeltaAnnotated],
         min_delta_bytes: float,
         min_file_counts: Optional[Union[int, float]] = float("inf"),
-        estimation_function: Optional[
+        estimation_function: Optional[
+            Callable[[ManifestEntry], float]
+        ] = lambda entry: entry.meta.content_length,
     ) -> List[DeltaAnnotated]:
         """
         Simple greedy algorithm to split/merge 1 or more annotated deltas into
@@ -76,11 +80,16 @@ class DeltaAnnotated(Delta):
         of bytes at rest for the associated object. Returns the list of annotated
         delta groups.
         """
-
+        split_annotated_deltas: List[DeltaAnnotated] = []
+        groups: List[DeltaAnnotated] = []
         new_da = DeltaAnnotated()
         new_da_bytes = 0
         da_group_entry_count = 0
-
+
+        for delta_annotated in annotated_deltas:
+            split_annotated_deltas.extend(DeltaAnnotated._split_single(delta_annotated))
+
+        for src_da in split_annotated_deltas:
             src_da_annotations = src_da.annotations
             src_da_entries = src_da.manifest.entries
             assert (
@@ -105,11 +114,7 @@ class DeltaAnnotated(Delta):
                     src_da, new_da, src_entry, src_da_annotations[i]
                 )
                 # TODO: Fetch s3_obj["Size"] if entry content length undefined?
-                estimated_new_da_bytes = (
-                    estimation_function(src_entry.meta.content_length)
-                    if type(estimation_function) is FunctionType
-                    else src_entry.meta.content_length
-                )
+                estimated_new_da_bytes = estimation_function(src_entry)
                 new_da_bytes += estimated_new_da_bytes
                 da_group_entry_count += 1
                 if (
@@ -132,6 +137,7 @@ class DeltaAnnotated(Delta):
                     da_group_entry_count = 0
         if new_da:
             groups.append(new_da)
+
         return groups
 
     @staticmethod
@@ -207,3 +213,79 @@ class DeltaAnnotated(Delta):
                 dst_da.type = None
             entries.append(src_entry)
             dst_da.annotations.append(src_annotation)
+
+    @staticmethod
+    def _split_single(delta_annotated: DeltaAnnotated) -> List[DeltaAnnotated]:
+        """
+        Split a single delta annotated into multiple granular
+        annotated entries. Note that split is not always guaranteed.
+
+        Note: Currently we are only able to split the Parquet File downloads.
+        """
+
+        result = []
+
+        if (
+            delta_annotated.meta
+            and delta_annotated.manifest
+            and delta_annotated.meta.content_type == ContentType.PARQUET
+            and delta_annotated.meta.content_encoding == ContentEncoding.IDENTITY
+        ):
+            # we split by row groups
+            for entry_index, entry in enumerate(delta_annotated.manifest.entries):
+                input_split_params = None
+                if entry.meta and entry.meta.content_type_parameters:
+                    for type_params in entry.meta.content_type_parameters:
+                        if (
+                            isinstance(type_params, PartialParquetParameters)
+                            and type_params.num_row_groups > 1
+                            and type_params.pq_metadata
+                        ):
+                            input_split_params = type_params
+                            break
+
+                if input_split_params:
+                    logger.info(
+                        f"Splitting input file with URI: {entry.uri} into "
+                        f"different {input_split_params.num_row_groups} entries"
+                    )
+
+                    for rg in input_split_params.row_groups_to_download:
+                        new_da = DeltaAnnotated()
+                        new_entry_dict = copy.deepcopy(entry)
+                        new_entry = ManifestEntry(new_entry_dict)
+
+                        row_group_meta = input_split_params.pq_metadata.row_group(rg)
+
+                        new_partial_params = PartialParquetParameters.of(
+                            row_groups_to_download=[rg],
+                            num_row_groups=1,
+                            num_rows=row_group_meta.num_rows,
+                            in_memory_size_bytes=row_group_meta.total_byte_size,
+                            pq_metadata=input_split_params.pq_metadata,
+                        )
+
+                        new_entry.meta.content_type_parameters = [new_partial_params]
+                        for type_params in entry.meta.content_type_parameters:
+                            if not isinstance(type_params, PartialParquetParameters):
+                                new_entry.meta.content_type_parameters.append(
+                                    type_params
+                                )
+
+                        DeltaAnnotated._append_annotated_entry(
+                            delta_annotated,
+                            new_da,
+                            new_entry,
+                            delta_annotated.annotations[entry_index],
+                        )
+
+                        result.append(new_da)
+
+        if result:
+            return result
+        else:
+            logger.info(
+                f"Split was not performed on the delta with locator: {delta_annotated.locator}"
+            )
+
+        return [delta_annotated]
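The hunks above change the delta-batching size callback so that it receives the full `ManifestEntry` instead of a raw content length, defaulting to `entry.meta.content_length`. Below is a minimal usage sketch; the method name `rebatch` and the inflation multiplier are assumptions, since the hunks only show the parameter list and the call site.

```python
# Hedged sketch: supply a custom size estimator when grouping annotated deltas.
# `DeltaAnnotated.rebatch` is assumed to be the method whose signature changed
# above; the 4x inflation factor is an arbitrary illustrative value.
from deltacat.compute.compactor import DeltaAnnotated

PARQUET_INFLATION_GUESS = 4.0  # assumed decompression/decoding multiplier


def estimate_in_memory_bytes(entry) -> float:
    # The callback now receives the whole ManifestEntry, so any metadata on it
    # (here, bytes at rest) can feed the estimate.
    return entry.meta.content_length * PARQUET_INFLATION_GUESS


annotated_deltas = []  # in practice, DeltaAnnotated objects from discovered deltas
uniform_deltas = DeltaAnnotated.rebatch(
    annotated_deltas,
    min_delta_bytes=2 * 1024**3,  # target roughly 2 GiB per group
    estimation_function=estimate_in_memory_bytes,
)
```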
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import numpy as np
+import pyarrow as pa
 
 from deltacat.storage import DeltaType, LocalTable
 
@@ -14,9 +15,9 @@ class DeltaFileEnvelope(dict):
     @staticmethod
     def of(
         stream_position: int,
-        file_index: int,
         delta_type: DeltaType,
         table: LocalTable,
+        file_index: int = None,
         is_src_delta: np.bool_ = True,
         file_record_count: Optional[int] = None,
     ) -> DeltaFileEnvelope:
@@ -37,8 +38,6 @@ class DeltaFileEnvelope(dict):
         """
         if stream_position is None:
             raise ValueError("Missing delta file envelope stream position.")
-        if file_index is None:
-            raise ValueError("Missing delta file envelope file index.")
         if delta_type is None:
             raise ValueError("Missing Delta file envelope delta type.")
         if table is None:
@@ -75,3 +74,16 @@ class DeltaFileEnvelope(dict):
     @property
     def file_record_count(self) -> int:
         return self["file_record_count"]
+
+    @property
+    def table_size_bytes(self) -> int:
+        if isinstance(self.table, pa.Table):
+            return self.table.nbytes
+        else:
+            raise ValueError(
+                f"Table type: {type(self.table)} not for supported for size method."
+            )
+
+    @property
+    def table_num_rows(self) -> int:
+        return len(self.table)
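As a quick orientation to the reordered `DeltaFileEnvelope.of` signature and the two new properties, here is a minimal sketch; the local PyArrow table and the `DeltaType.UPSERT` member are assumptions that do not appear in the hunks above.

```python
# Minimal sketch of the updated DeltaFileEnvelope API: file_index is now
# optional, and table_size_bytes/table_num_rows report on the wrapped table.
import pyarrow as pa

from deltacat.compute.compactor import DeltaFileEnvelope
from deltacat.storage import DeltaType

table = pa.table({"pk": [1, 2, 3], "value": ["a", "b", "c"]})

dfe = DeltaFileEnvelope.of(
    stream_position=1,
    delta_type=DeltaType.UPSERT,  # assumed member name
    table=table,
    # file_index may now be omitted; it defaults to None
)

print(dfe.table_num_rows)    # 3
print(dfe.table_size_bytes)  # pa.Table.nbytes; other table types raise ValueError
```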
@@ -4,7 +4,7 @@ from __future__ import annotations
 from typing import Any, Dict, List
 from uuid import uuid4
 
-from deltacat.
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.storage import Locator, PartitionLocator
 from deltacat.utils.common import sha1_hexdigest
 
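This one-line change, together with matching edits in the compaction step modules below, moves `SortKey` (and `SortOrder`) from `deltacat.compute.compactor` to `deltacat.storage.model.sort_key` (see the new `deltacat/storage/model/sort_key.py` and the removed `deltacat/compute/compactor/model/sort_key.py` in the file list). A hedged snippet of the updated import for downstream code; only the import path is confirmed by the hunks, while the `SortKey.of` keyword names and the `SortOrder.ASCENDING` member are assumptions.

```python
# Assumed usage of the relocated sort key model.
from deltacat.storage.model.sort_key import SortKey, SortOrder

sort_keys = [SortKey.of(key_name="last_updated_ts", sort_order=SortOrder.ASCENDING)]
```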
@@ -1,6 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+from typing import Tuple
 from deltacat.storage import DeltaLocator, PartitionLocator
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -40,9 +41,11 @@ class RoundCompletionInfo(dict):
         compacted_delta_locator: DeltaLocator,
         compacted_pyarrow_write_result: PyArrowWriteResult,
         sort_keys_bit_width: int,
-        rebase_source_partition_locator: Optional[PartitionLocator],
+        rebase_source_partition_locator: Optional[PartitionLocator] = None,
         manifest_entry_copied_by_reference_ratio: Optional[float] = None,
         compaction_audit_url: Optional[str] = None,
+        hash_bucket_count: Optional[int] = None,
+        hb_index_to_entry_range: Optional[Dict[int, Tuple[int, int]]] = None,
     ) -> RoundCompletionInfo:
 
         rci = RoundCompletionInfo()
@@ -55,6 +58,8 @@ class RoundCompletionInfo(dict):
             "manifestEntryCopiedByReferenceRatio"
         ] = manifest_entry_copied_by_reference_ratio
         rci["compactionAuditUrl"] = compaction_audit_url
+        rci["hashBucketCount"] = hash_bucket_count
+        rci["hbIndexToEntryRange"] = hb_index_to_entry_range
         return rci
 
     @property
@@ -97,3 +102,14 @@ class RoundCompletionInfo(dict):
     @property
     def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
         return self["manifestEntryCopiedByReferenceRatio"]
+
+    @property
+    def hash_bucket_count(self) -> Optional[int]:
+        return self["hashBucketCount"]
+
+    @property
+    def hb_index_to_entry_range(self) -> Optional[Dict[int, Tuple[int, int]]]:
+        """
+        The start index is inclusive and end index is exclusive by default.
+        """
+        return self["hbIndexToEntryRange"]
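A small, self-contained sketch of how the two new round completion fields read back; the values are hand-built for illustration, whereas real instances are produced by `RoundCompletionInfo.of(...)` during compaction.

```python
from deltacat.compute.compactor import RoundCompletionInfo

# Hand-built for illustration; real values come from RoundCompletionInfo.of(...).
rci = RoundCompletionInfo()
rci["hashBucketCount"] = 2
rci["hbIndexToEntryRange"] = {0: (0, 3), 1: (3, 5)}

assert rci.hash_bucket_count == 2
for hb_index, (start, end) in rci.hb_index_to_entry_range.items():
    # start is inclusive, end is exclusive, per the property docstring above
    print(f"hash bucket {hb_index} owns compacted manifest entries [{start}, {end})")
```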
@@ -7,8 +7,8 @@ import functools
 import itertools
 from deltacat.compute.compactor import (
     RoundCompletionInfo,
-    SortKey,
 )
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.types.media import ContentType
 from deltacat.compute.compactor import DeltaAnnotated
 from deltacat.utils.ray_utils.concurrency import (
@@ -31,6 +31,7 @@ from deltacat.storage import (
     interface as unimplemented_deltacat_storage,
 )
 from deltacat.utils.metrics import MetricsConfig
+from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -90,7 +91,7 @@ def repartition(
             source_partition_locator.partition_values,
         ).stream_position,
         deltacat_storage,
-
+        list_deltas_kwargs,
     )
 
     uniform_deltas = []
@@ -157,10 +158,11 @@ def repartition(
         new_compacted_partition_locator,
         compacted_delta.stream_position,
     )
-    bit_width_of_sort_keys =
+    bit_width_of_sort_keys = validate_sort_keys(
        source_partition_locator,
        sort_keys,
        deltacat_storage,
+       deltacat_storage_kwargs={},
    )
    repartition_completion_info = RoundCompletionInfo.of(
        last_stream_position_to_compact,
@@ -12,11 +12,10 @@ import ray
 
 from deltacat import logs
 from deltacat.compute.compactor import (
-    SortKey,
-    SortOrder,
     DeltaFileEnvelope,
     DeltaFileLocator,
 )
+from deltacat.storage.model.sort_key import SortKey, SortOrder
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.utils import system_columns as sc
 from deltacat.utils.ray_utils.runtime import (
@@ -108,20 +107,21 @@ def _timed_dedupe(
     dedupe_task_index: int,
     enable_profiler: bool,
     object_store: Optional[IObjectStore],
+    **kwargs,
 ):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
         f"dedupe_{worker_id}_{task_id}.bin"
     ) if enable_profiler else nullcontext():
-        # TODO (pdames): mitigate risk of running out of memory here in cases of
-        # severe skew of primary key updates in deltas
+        # TODO (pdames): mitigate risk of running out of memory here in cases of severe skew of primary key updates in deltas
         logger.info(
             f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
             f"groups for {len(object_ids)} object refs..."
         )
-
-
+        delta_file_envelope_groups_list: List[object] = object_store.get_many(
+            object_ids
+        )
         hb_index_to_delta_file_envelopes_list = defaultdict(list)
         for delta_file_envelope_groups in delta_file_envelope_groups_list:
             for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -172,7 +172,8 @@ def _timed_dedupe(
 
         hb_table_record_count = len(table)
         table, drop_time = timed_invocation(
-            func=_drop_duplicates_by_primary_key_hash,
+            func=_drop_duplicates_by_primary_key_hash,
+            table=table,
         )
         deduped_record_count = hb_table_record_count - len(table)
         total_deduped_records += deduped_record_count
@@ -228,7 +229,6 @@ def _timed_dedupe(
         )
 
         peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
-
         return DedupeResult(
             mat_bucket_to_dd_idx_obj_id,
             np.int64(total_deduped_records),
@@ -247,6 +247,7 @@ def dedupe(
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     object_store: Optional[IObjectStore],
+    **kwargs,
 ) -> DedupeResult:
     logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     dedupe_result, duration = timed_invocation(
@@ -257,6 +258,7 @@ def dedupe(
         dedupe_task_index=dedupe_task_index,
         enable_profiler=enable_profiler,
         object_store=object_store,
+        **kwargs,
     )
 
     emit_metrics_time = 0.0
@@ -3,7 +3,7 @@ import logging
 import time
 from contextlib import nullcontext
 from itertools import chain
-from typing import Generator, List, Optional, Tuple
+from typing import Any, Dict, Generator, List, Optional, Tuple
 import numpy as np
 import pyarrow as pa
 import ray
@@ -11,9 +11,9 @@ from deltacat import logs
 from deltacat.compute.compactor import (
     DeltaAnnotated,
     DeltaFileEnvelope,
-    SortKey,
     RoundCompletionInfo,
 )
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
 from deltacat.compute.compactor.utils import system_columns as sc
@@ -91,7 +91,11 @@ def _group_file_records_by_pk_hash_bucket(
     is_src_delta: np.bool_ = True,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[Optional[DeltaFileEnvelopeGroups], int]:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     # read input parquet s3 objects into a list of delta file envelopes
     delta_file_envelopes, total_record_count = _read_delta_file_envelopes(
         annotated_delta,
@@ -99,6 +103,8 @@ def _group_file_records_by_pk_hash_bucket(
         sort_key_names,
         read_kwargs_provider,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )
     if delta_file_envelopes is None:
         return None, 0
@@ -134,8 +140,11 @@ def _read_delta_file_envelopes(
     sort_key_names: List[str],
     read_kwargs_provider: Optional[ReadKwargsProvider],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[Optional[List[DeltaFileEnvelope]], int]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     columns_to_read = list(chain(primary_keys, sort_key_names))
     # TODO (rootliu) compare performance of column read from unpartitioned vs partitioned file
     # https://arrow.apache.org/docs/python/parquet.html#writing-to-partitioned-datasets
@@ -145,6 +154,7 @@ def _read_delta_file_envelopes(
         columns=columns_to_read,
         file_reader_kwargs_provider=read_kwargs_provider,
         storage_type=StorageType.LOCAL,
+        **deltacat_storage_kwargs,
     )
     annotations = annotated_delta.annotations
     assert (
@@ -182,7 +192,11 @@ def _timed_hash_bucket(
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ):
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
@@ -207,6 +221,8 @@ def _timed_hash_bucket(
         is_src_delta,
         read_kwargs_provider,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )
     hash_bucket_group_to_obj_id, _ = group_hash_bucket_indices(
         delta_file_envelope_groups, num_buckets, num_groups, object_store
@@ -235,8 +251,11 @@ def hash_bucket(
     read_kwargs_provider: Optional[ReadKwargsProvider],
     object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> HashBucketResult:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     logger.info(f"Starting hash bucket task...")
     hash_bucket_result, duration = timed_invocation(
         func=_timed_hash_bucket,
@@ -250,6 +269,8 @@ def hash_bucket(
         read_kwargs_provider=read_kwargs_provider,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
+        **kwargs,
     )
 
     emit_metrics_time = 0.0
@@ -69,7 +69,11 @@ def materialize(
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ):
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+
     def _stage_delta_from_manifest_entry_reference_list(
         manifest_entry_list_reference: List[ManifestEntry],
         partition: Partition,
@@ -105,6 +109,7 @@ def materialize(
             max_records_per_entry=max_records_per_output_file,
             content_type=compacted_file_content_type,
             s3_table_writer_kwargs=s3_table_writer_kwargs,
+            **deltacat_storage_kwargs,
         )
         compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
             compacted_table
@@ -116,11 +121,10 @@ def materialize(
         )
         manifest = delta.manifest
         manifest_records = manifest.meta.record_count
-        assert (
-            manifest_records == len(compacted_table),
+        assert manifest_records == len(compacted_table), (
             f"Unexpected Error: Materialized delta manifest record count "
             f"({manifest_records}) does not equal compacted table record count "
-            f"({len(compacted_table)})"
+            f"({len(compacted_table)})"
         )
         materialize_result = MaterializeResult.of(
             delta=delta,
@@ -187,10 +191,11 @@ def materialize(
             src_stream_position_np.item(),
         )
         dl_digest = delta_locator.digest()
-
         manifest = manifest_cache.setdefault(
             dl_digest,
-            deltacat_storage.get_delta_manifest(
+            deltacat_storage.get_delta_manifest(
+                delta_locator, **deltacat_storage_kwargs
+            ),
         )
 
         if read_kwargs_provider is None:
@@ -236,6 +241,7 @@ def materialize(
             Delta.of(delta_locator, None, None, None, manifest),
             src_file_idx_np.item(),
             file_reader_kwargs_provider=read_kwargs_provider,
+            **deltacat_storage_kwargs,
         )
         logger.debug(
             f"Time taken for materialize task"
@@ -253,7 +259,6 @@ def materialize(
     materialized_results.append(_materialize(record_batch_tables.remaining))
 
     logger.info(f"Got {count_of_src_dfl} source delta files during materialize")
-
     referenced_manifest_delta = (
         _stage_delta_from_manifest_entry_reference_list(
             manifest_entry_list_reference, partition
@@ -4,7 +4,7 @@ from contextlib import nullcontext
 import pyarrow.compute as pc
 from deltacat.constants import SIGNED_INT64_MIN_VALUE, SIGNED_INT64_MAX_VALUE
 import pyarrow as pa
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 from deltacat.types.media import StorageType, ContentType
 import ray
 from deltacat import logs
@@ -58,6 +58,8 @@ def repartition_range(
     max_records_per_output_file: int,
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ):
     """
     Repartitions a list of Arrow tables based on specified ranges and stores the repartitioned tables.
@@ -85,6 +87,8 @@ def repartition_range(
     in the tables, an error will be raised. For each partition range, a new file is created. This could result in
     more output files than input files.
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     column: str = repartition_args["column"]
     partition_ranges: List = repartition_args["ranges"]
     if len(partition_ranges) == 0:
@@ -141,6 +145,7 @@ def repartition_range(
             destination_partition,
             max_records_per_entry=max_records_per_output_file,
             content_type=repartitioned_file_content_type,
+            **deltacat_storage_kwargs,
         )
         partition_deltas.append(partition_delta)
 
@@ -163,7 +168,11 @@ def _timed_repartition(
     read_kwargs_provider: Optional[ReadKwargsProvider],
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> RepartitionResult:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
@@ -182,6 +191,7 @@ def _timed_repartition(
             max_records_per_output_file=max_records_per_output_file,
             repartitioned_file_content_type=repartitioned_file_content_type,
             deltacat_storage=deltacat_storage,
+            deltacat_storage_kwargs=deltacat_storage_kwargs,
         )
     else:
         raise NotImplementedError(
@@ -201,7 +211,11 @@ def repartition(
     read_kwargs_provider: Optional[ReadKwargsProvider],
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> RepartitionResult:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     logger.info(f"Starting repartition task...")
     repartition_result, duration = timed_invocation(
         func=_timed_repartition,
@@ -214,6 +228,7 @@ def repartition(
         read_kwargs_provider=read_kwargs_provider,
         repartitioned_file_content_type=repartitioned_file_content_type,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
     )
     if metrics_config:
         emit_timer_metrics(
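The hash bucket, materialize, and repartition hunks above all thread the same new parameter through: each task accepts an optional `deltacat_storage_kwargs` dict, defaults it to `{}` inside the function body (avoiding a mutable default argument), and fans it out into each `deltacat_storage` call with `**deltacat_storage_kwargs`. A toy sketch of that pattern, using made-up function names rather than deltacat's actual storage interface:

```python
from typing import Any, Dict, Optional


def fake_storage_call(table: str, *, db_uri: str = "memory://") -> str:
    # Stand-in for a deltacat_storage method; the real calls and their keyword
    # arguments are implementation-specific and not shown in this diff.
    return f"read {table} from {db_uri}"


def compaction_task(
    table: str,
    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> str:
    if deltacat_storage_kwargs is None:  # default in the body, not the signature
        deltacat_storage_kwargs = {}
    return fake_storage_call(table, **deltacat_storage_kwargs)


print(compaction_task("events"))
print(compaction_task("events", deltacat_storage_kwargs={"db_uri": "sqlite:///test.db"}))
```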