deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/clients.py +17 -6
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/compute/compactor/compaction_session.py +42 -18
  6. deltacat/compute/compactor/model/compact_partition_params.py +297 -58
  7. deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
  8. deltacat/compute/compactor/model/delta_annotated.py +95 -9
  9. deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
  10. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  11. deltacat/compute/compactor/repartition_session.py +4 -1
  12. deltacat/compute/compactor/steps/dedupe.py +9 -6
  13. deltacat/compute/compactor/steps/hash_bucket.py +24 -3
  14. deltacat/compute/compactor/steps/materialize.py +11 -6
  15. deltacat/compute/compactor/steps/repartition.py +22 -1
  16. deltacat/compute/compactor/utils/io.py +40 -23
  17. deltacat/compute/compactor/utils/sort_key.py +5 -0
  18. deltacat/compute/compactor/utils/system_columns.py +43 -0
  19. deltacat/compute/compactor_v2/compaction_session.py +509 -0
  20. deltacat/compute/compactor_v2/constants.py +37 -0
  21. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  22. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  23. deltacat/compute/compactor_v2/model/merge_input.py +143 -0
  24. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  25. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  26. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  27. deltacat/compute/compactor_v2/steps/merge.py +469 -0
  28. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
  30. deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
  31. deltacat/compute/compactor_v2/utils/io.py +152 -0
  32. deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
  33. deltacat/compute/compactor_v2/utils/task_options.py +221 -0
  34. deltacat/compute/metastats/meta_stats.py +4 -2
  35. deltacat/compute/metastats/stats.py +1 -0
  36. deltacat/compute/metastats/utils/io.py +4 -0
  37. deltacat/compute/stats/utils/io.py +20 -5
  38. deltacat/exceptions.py +4 -0
  39. deltacat/io/memcached_object_store.py +37 -14
  40. deltacat/logs.py +4 -3
  41. deltacat/storage/interface.py +8 -1
  42. deltacat/storage/model/types.py +2 -1
  43. deltacat/tests/aws/test_clients.py +16 -3
  44. deltacat/tests/compute/__init__.py +0 -0
  45. deltacat/tests/compute/common.py +96 -0
  46. deltacat/tests/compute/compactor/__init__.py +0 -0
  47. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  48. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
  49. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  50. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  51. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  52. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  53. deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
  54. deltacat/tests/compute/testcases.py +395 -0
  55. deltacat/tests/io/test_memcached_object_store.py +5 -4
  56. deltacat/tests/local_deltacat_storage/__init__.py +62 -19
  57. deltacat/tests/test_utils/pyarrow.py +49 -0
  58. deltacat/tests/test_utils/utils.py +13 -0
  59. deltacat/tests/utils/data/__init__.py +0 -0
  60. deltacat/tests/utils/test_daft.py +76 -0
  61. deltacat/tests/utils/test_pyarrow.py +133 -0
  62. deltacat/tests/utils/test_resources.py +23 -20
  63. deltacat/types/media.py +1 -0
  64. deltacat/types/partial_download.py +83 -0
  65. deltacat/types/tables.py +6 -0
  66. deltacat/utils/arguments.py +25 -0
  67. deltacat/utils/daft.py +87 -0
  68. deltacat/utils/placement.py +20 -3
  69. deltacat/utils/pyarrow.py +218 -1
  70. deltacat/utils/ray_utils/concurrency.py +26 -1
  71. deltacat/utils/resources.py +72 -1
  72. deltacat/utils/s3fs.py +21 -0
  73. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
  74. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
  75. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
  76. /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
  77. /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
  78. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
  79. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/hash_bucket.py

@@ -3,7 +3,7 @@ import logging
 import time
 from contextlib import nullcontext
 from itertools import chain
-from typing import Generator, List, Optional, Tuple
+from typing import Any, Dict, Generator, List, Optional, Tuple
 import numpy as np
 import pyarrow as pa
 import ray
@@ -91,7 +91,11 @@ def _group_file_records_by_pk_hash_bucket(
     is_src_delta: np.bool_ = True,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[Optional[DeltaFileEnvelopeGroups], int]:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     # read input parquet s3 objects into a list of delta file envelopes
     delta_file_envelopes, total_record_count = _read_delta_file_envelopes(
         annotated_delta,
@@ -99,6 +103,8 @@ def _group_file_records_by_pk_hash_bucket(
         sort_key_names,
         read_kwargs_provider,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )
     if delta_file_envelopes is None:
         return None, 0
@@ -134,8 +140,11 @@ def _read_delta_file_envelopes(
     sort_key_names: List[str],
     read_kwargs_provider: Optional[ReadKwargsProvider],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[Optional[List[DeltaFileEnvelope]], int]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     columns_to_read = list(chain(primary_keys, sort_key_names))
     # TODO (rootliu) compare performance of column read from unpartitioned vs partitioned file
     # https://arrow.apache.org/docs/python/parquet.html#writing-to-partitioned-datasets
@@ -145,6 +154,7 @@ def _read_delta_file_envelopes(
         columns=columns_to_read,
         file_reader_kwargs_provider=read_kwargs_provider,
         storage_type=StorageType.LOCAL,
+        **deltacat_storage_kwargs,
     )
     annotations = annotated_delta.annotations
     assert (
@@ -182,7 +192,11 @@ def _timed_hash_bucket(
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ):
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
@@ -207,6 +221,8 @@ def _timed_hash_bucket(
             is_src_delta,
             read_kwargs_provider,
             deltacat_storage,
+            deltacat_storage_kwargs,
+            **kwargs,
         )
         hash_bucket_group_to_obj_id, _ = group_hash_bucket_indices(
             delta_file_envelope_groups, num_buckets, num_groups, object_store
@@ -235,8 +251,11 @@ def hash_bucket(
     read_kwargs_provider: Optional[ReadKwargsProvider],
     object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> HashBucketResult:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     logger.info(f"Starting hash bucket task...")
     hash_bucket_result, duration = timed_invocation(
         func=_timed_hash_bucket,
@@ -250,6 +269,8 @@ def hash_bucket(
         read_kwargs_provider=read_kwargs_provider,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
+        **kwargs,
     )

     emit_metrics_time = 0.0
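
Note: the recurring change across these hash bucket hunks is threading an optional deltacat_storage_kwargs dict (plus pass-through **kwargs) from the public task entry point down to every deltacat_storage call. A minimal sketch of that pattern, using a hypothetical storage method name rather than deltacat's exact signatures:

from typing import Any, Dict, Optional


def read_delta_table(
    delta,
    deltacat_storage,
    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
):
    # Normalize the optional dict instead of relying on a mutable {} default.
    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}
    # Forward caller-supplied storage options (credentials, endpoints, etc.)
    # into the pluggable storage implementation.
    return deltacat_storage.download_delta(delta, **deltacat_storage_kwargs)

The None default avoids sharing one mutable dict across calls; a few signatures elsewhere in this diff still use Optional[Dict[str, Any]] = {} defaults.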
deltacat/compute/compactor/steps/materialize.py

@@ -69,7 +69,11 @@ def materialize(
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ):
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+
     def _stage_delta_from_manifest_entry_reference_list(
         manifest_entry_list_reference: List[ManifestEntry],
         partition: Partition,
@@ -105,6 +109,7 @@ def materialize(
             max_records_per_entry=max_records_per_output_file,
             content_type=compacted_file_content_type,
             s3_table_writer_kwargs=s3_table_writer_kwargs,
+            **deltacat_storage_kwargs,
         )
         compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
             compacted_table
@@ -116,11 +121,10 @@ def materialize(
         )
         manifest = delta.manifest
         manifest_records = manifest.meta.record_count
-        assert (
-            manifest_records == len(compacted_table),
+        assert manifest_records == len(compacted_table), (
             f"Unexpected Error: Materialized delta manifest record count "
             f"({manifest_records}) does not equal compacted table record count "
-            f"({len(compacted_table)})",
+            f"({len(compacted_table)})"
         )
         materialize_result = MaterializeResult.of(
             delta=delta,
@@ -187,10 +191,11 @@ def materialize(
             src_stream_position_np.item(),
         )
         dl_digest = delta_locator.digest()
-
         manifest = manifest_cache.setdefault(
             dl_digest,
-            deltacat_storage.get_delta_manifest(delta_locator),
+            deltacat_storage.get_delta_manifest(
+                delta_locator, **deltacat_storage_kwargs
+            ),
         )

         if read_kwargs_provider is None:
@@ -236,6 +241,7 @@ def materialize(
             Delta.of(delta_locator, None, None, None, manifest),
             src_file_idx_np.item(),
             file_reader_kwargs_provider=read_kwargs_provider,
+            **deltacat_storage_kwargs,
         )
         logger.debug(
             f"Time taken for materialize task"
@@ -253,7 +259,6 @@ def materialize(
         materialized_results.append(_materialize(record_batch_tables.remaining))

     logger.info(f"Got {count_of_src_dfl} source delta files during materialize")
-
     referenced_manifest_delta = (
         _stage_delta_from_manifest_entry_reference_list(
             manifest_entry_list_reference, partition
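
The assert rewrite in the hunk above fixes a subtle bug: assert (condition, message) asserts a non-empty tuple, which is always truthy, so the check can never fail. A quick self-contained illustration:

records, table_len = 10, 12

# Broken form: the parentheses create a 2-tuple, and a non-empty tuple is
# always truthy, so this "assertion" silently passes (CPython even emits
# SyntaxWarning: "assertion is always true, perhaps remove parentheses?").
assert (
    records == table_len,
    f"record count ({records}) does not equal table length ({table_len})",
)

# Fixed form: assert the comparison itself, with the message as the second operand.
try:
    assert records == table_len, (
        f"record count ({records}) does not equal table length ({table_len})"
    )
except AssertionError as error:
    print(error)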
deltacat/compute/compactor/steps/repartition.py

@@ -4,7 +4,7 @@ from contextlib import nullcontext
 import pyarrow.compute as pc
 from deltacat.constants import SIGNED_INT64_MIN_VALUE, SIGNED_INT64_MAX_VALUE
 import pyarrow as pa
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 from deltacat.types.media import StorageType, ContentType
 import ray
 from deltacat import logs
@@ -56,8 +56,11 @@ def repartition_range(
     destination_partition: Partition,
     repartition_args: dict,
     max_records_per_output_file: int,
+    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ):
     """
     Repartitions a list of Arrow tables based on specified ranges and stores the repartitioned tables.
@@ -85,6 +88,8 @@ def repartition_range(
     in the tables, an error will be raised. For each partition range, a new file is created. This could result in
     more output files than input files.
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     column: str = repartition_args["column"]
     partition_ranges: List = repartition_args["ranges"]
     if len(partition_ranges) == 0:
@@ -141,6 +146,8 @@ def repartition_range(
             destination_partition,
             max_records_per_entry=max_records_per_output_file,
             content_type=repartitioned_file_content_type,
+            s3_table_writer_kwargs=s3_table_writer_kwargs,
+            **deltacat_storage_kwargs,
         )
         partition_deltas.append(partition_delta)

@@ -161,9 +168,14 @@ def _timed_repartition(
     max_records_per_output_file: int,
     enable_profiler: bool,
     read_kwargs_provider: Optional[ReadKwargsProvider],
+    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> RepartitionResult:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
@@ -180,8 +192,10 @@ def _timed_repartition(
                 destination_partition=destination_partition,
                 repartition_args=repartition_args,
                 max_records_per_output_file=max_records_per_output_file,
+                s3_table_writer_kwargs=s3_table_writer_kwargs,
                 repartitioned_file_content_type=repartitioned_file_content_type,
                 deltacat_storage=deltacat_storage,
+                deltacat_storage_kwargs=deltacat_storage_kwargs,
             )
         else:
             raise NotImplementedError(
@@ -199,9 +213,14 @@ def repartition(
     enable_profiler: bool,
     metrics_config: Optional[MetricsConfig],
     read_kwargs_provider: Optional[ReadKwargsProvider],
+    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> RepartitionResult:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     logger.info(f"Starting repartition task...")
     repartition_result, duration = timed_invocation(
         func=_timed_repartition,
@@ -212,8 +231,10 @@ def repartition(
         max_records_per_output_file=max_records_per_output_file,
         enable_profiler=enable_profiler,
         read_kwargs_provider=read_kwargs_provider,
+        s3_table_writer_kwargs=s3_table_writer_kwargs,
         repartitioned_file_content_type=repartitioned_file_content_type,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
     )
     if metrics_config:
         emit_timer_metrics(
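
For context on what repartition_range does with the "column" and "ranges" arguments documented above, here is a rough standalone sketch of range-based splitting with pyarrow. The boundary handling (padding with the signed int64 min/max and using half-open intervals) is an assumption for illustration, not deltacat's exact logic:

import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"event_time": [1, 5, 9, 14, 20], "value": ["a", "b", "c", "d", "e"]})
column = "event_time"  # repartition_args["column"]
ranges = [10]          # repartition_args["ranges"]: one boundary yields two ranges

# Pad the user-supplied boundaries with the int64 domain edges, then emit one
# table slice per (lower, upper] interval; each slice would become its own file.
INT64_MIN, INT64_MAX = -(2**63), 2**63 - 1
bounds = [INT64_MIN] + sorted(ranges) + [INT64_MAX]
partitioned_tables = []
for lower, upper in zip(bounds, bounds[1:]):
    mask = pc.and_(
        pc.greater(table[column], lower), pc.less_equal(table[column], upper)
    )
    partitioned_tables.append(table.filter(mask))

for i, part in enumerate(partitioned_tables):
    print(i, part.num_rows)  # 0 -> 3 rows (1, 5, 9), 1 -> 2 rows (14, 20)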
deltacat/compute/compactor/utils/io.py

@@ -10,11 +10,12 @@ from deltacat.constants import (
 from deltacat.storage import (
     PartitionLocator,
     Delta,
+    ManifestEntry,
     interface as unimplemented_deltacat_storage,
 )
 from deltacat import logs
 from deltacat.compute.compactor import DeltaAnnotated
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union, Any
 from deltacat.compute.compactor import HighWatermark
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
@@ -31,23 +32,30 @@ def discover_deltas(
     rebase_source_partition_locator: Optional[PartitionLocator],
     rebase_source_partition_high_watermark: Optional[int],
     deltacat_storage=unimplemented_deltacat_storage,
-    **kwargs,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+    list_deltas_kwargs: Optional[Dict[str, Any]] = {},
 ) -> Tuple[List[Delta], int]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     # Source One: new deltas from uncompacted table for incremental compaction or deltas from compacted table for rebase
-    input_deltas = _discover_deltas(
-        source_partition_locator,
+    start_position_exclusive = (
         high_watermark.get(source_partition_locator)
         if isinstance(high_watermark, dict)
-        else high_watermark,
+        else high_watermark
+    )
+    input_deltas = _discover_deltas(
+        source_partition_locator,
+        start_position_exclusive,
         last_stream_position_to_compact
        if not rebase_source_partition_locator
        else deltacat_storage.get_partition(
            source_partition_locator.stream_locator,
            source_partition_locator.partition_values,
+           **deltacat_storage_kwargs,
        ).stream_position,
        deltacat_storage,
-       **kwargs,
+       deltacat_storage_kwargs,
+       list_deltas_kwargs,
    )

     # Source Two: delta from compacted table for incremental compaction or new deltas from uncompacted table for rebase
@@ -56,6 +64,7 @@ def discover_deltas(
         compacted_partition = deltacat_storage.get_partition(
             compacted_partition_locator.stream_locator,
             compacted_partition_locator.partition_values,
+            **deltacat_storage_kwargs,
         )
         previous_last_stream_position_compacted = (
             compacted_partition.stream_position if compacted_partition else -1
@@ -67,7 +76,8 @@ def discover_deltas(
             None,
             previous_last_stream_position_compacted,
             deltacat_storage,
-            **kwargs,
+            deltacat_storage_kwargs,
+            list_deltas_kwargs,
         )
         logger.info(
             f"Length of input deltas from uncompacted table {len(input_deltas)} up to {last_stream_position_to_compact},"
@@ -80,7 +90,8 @@ def discover_deltas(
             rebase_source_partition_high_watermark,
             last_stream_position_to_compact,
             deltacat_storage,
-            **kwargs,
+            deltacat_storage_kwargs,
+            list_deltas_kwargs,
         )
         logger.info(
             f"Length of input deltas from uncompacted table {len(input_deltas_new)} up to {last_stream_position_to_compact},"
@@ -99,6 +110,8 @@ def limit_input_deltas(
     input_deltas_stats: Dict[int, DeltaStats],
     compaction_audit: CompactionSessionAuditInfo,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
     # TODO (pdames): when row counts are available in metadata, use them
     # instead of bytes - memory consumption depends more on number of
@@ -108,6 +121,8 @@ def limit_input_deltas(
     # this assumption could be removed, but we'd still need to know the max
     # resources we COULD get for this cluster, and the amount of memory
     # available per CPU should remain fixed across the cluster.
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     worker_cpus = int(cluster_resources["CPU"])
     worker_obj_store_mem = float(cluster_resources["object_store_memory"])
     logger.info(f"Total worker object store memory: {worker_obj_store_mem}")
@@ -135,7 +150,7 @@ def limit_input_deltas(
         for stream_pos, delta_stats in input_deltas_stats.items()
     }
     for delta in input_deltas:
-        manifest = deltacat_storage.get_delta_manifest(delta)
+        manifest = deltacat_storage.get_delta_manifest(delta, **deltacat_storage_kwargs)
         delta.manifest = manifest
         position = delta.stream_position
         delta_stats = input_deltas_stats.get(delta.stream_position, DeltaStats())
@@ -258,6 +273,8 @@ def fit_input_deltas(
     compaction_audit: CompactionSessionAuditInfo,
     hash_bucket_count: Optional[int],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
     """
     This method tries to fit all the input deltas to run into the existing cluster. Contrary to
@@ -277,6 +294,8 @@ def fit_input_deltas(
     Tuple of list of annotated deltas, recommended hash bucket count, high watermark,
     and whether multiple rounds are required (which is always False)
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     worker_cpus = int(cluster_resources["CPU"])
     total_memory = float(cluster_resources["memory"])
     high_watermark = HighWatermark()
@@ -306,8 +325,8 @@ def fit_input_deltas(
     # We assume that the cluster is capable of distributing all tasks
     # correctly. Hence, the correct in-memory size will be in the ratio of
     # in-disk size.
-    def estimate_size(content_length):
-        return (content_length * 1.0 / delta_bytes) * total_memory
+    def estimate_size(manifest_entry: ManifestEntry):
+        return (manifest_entry.meta.content_length * 1.0 / delta_bytes) * total_memory

     # Assuming each CPU consumes equal amount of memory
     min_delta_bytes = total_memory / worker_cpus
@@ -341,18 +360,16 @@ def _discover_deltas(
     start_position_exclusive: Optional[int],
     end_position_inclusive: int,
     deltacat_storage=unimplemented_deltacat_storage,
-    **kwargs,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+    list_deltas_kwargs: Optional[Dict[str, Any]] = {},
 ) -> List[Delta]:
-    stream_locator = source_partition_locator.stream_locator
-    namespace = stream_locator.namespace
-    table_name = stream_locator.table_name
-    table_version = stream_locator.table_version
-    partition_values = source_partition_locator.partition_values
-    deltas_list_result = deltacat_storage.list_deltas(
-        namespace=namespace,
-        table_name=table_name,
-        partition_values=partition_values,
-        table_version=table_version,
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+
+    kwargs = {**deltacat_storage_kwargs, **list_deltas_kwargs}
+
+    deltas_list_result = deltacat_storage.list_partition_deltas(
+        partition_like=source_partition_locator,
         first_stream_position=start_position_exclusive,
         last_stream_position=end_position_inclusive,
         ascending_order=True,
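
The estimate_size change in fit_input_deltas now takes a ManifestEntry and reads its meta.content_length, but the formula is unchanged: each entry's in-memory size is estimated as its share of total on-disk delta bytes, scaled by total cluster memory. A toy worked example of that proportional estimate (the numbers are made up):

# Three manifest entries totalling 300 MB on disk, on a cluster with 1200 MB of memory.
content_lengths = [100e6, 150e6, 50e6]  # bytes per entry (stand-in for meta.content_length)
delta_bytes = sum(content_lengths)
total_memory = 1200e6


def estimate_size(content_length: float) -> float:
    # Same ratio as the diff: (content_length / delta_bytes) * total_memory.
    return (content_length * 1.0 / delta_bytes) * total_memory


print([estimate_size(c) / 1e6 for c in content_lengths])  # [400.0, 600.0, 200.0] (MB)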
deltacat/compute/compactor/utils/sort_key.py

@@ -9,6 +9,8 @@ def validate_sort_keys(
     source_partition_locator: PartitionLocator,
     sort_keys: List[SortKey],
     deltacat_storage,
+    deltacat_storage_kwargs,
+    **kwargs,
 ) -> int:
     """
     Validates the input sort keys to ensure that they are unique, are using
@@ -16,6 +18,8 @@ def validate_sort_keys(
     sum of bit widths across sort key data types is less-than-or-equal-to
     256. Returns the sum of bit widths across all sort keys.
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     total_sort_keys_bit_width = 0
     if sort_keys:
         sort_key_names = [key.key_name for key in sort_keys]
@@ -27,6 +31,7 @@ def validate_sort_keys(
             stream_locator.namespace,
             stream_locator.table_name,
             stream_locator.table_version,
+            **deltacat_storage_kwargs,
         )
         if isinstance(table_version_schema, pa.Schema):
             for sort_key_name in sort_key_names:
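
A small sketch of the bit-width check that the validate_sort_keys docstring describes, using pyarrow's fixed-width type metadata. The schema and sort key names are made up; only the 256-bit ceiling comes from the docstring above:

import pyarrow as pa

schema = pa.schema([("event_time", pa.timestamp("us")), ("device_id", pa.int64())])
sort_key_names = ["event_time", "device_id"]

# Sum the bit widths of the sort key columns' (fixed-width) data types.
total_sort_keys_bit_width = sum(
    schema.field(name).type.bit_width for name in sort_key_names
)
assert total_sort_keys_bit_width <= 256, (
    f"Sort keys use {total_sort_keys_bit_width} bits, which exceeds the 256-bit limit"
)
print(total_sort_keys_bit_width)  # 128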
deltacat/compute/compactor/utils/system_columns.py

@@ -22,6 +22,13 @@ _PK_HASH_COLUMN_FIELD = pa.field(
     _PK_HASH_COLUMN_TYPE,
 )

+_PK_HASH_STRING_COLUMN_NAME = _get_sys_col_name("hash_str")
+_PK_HASH_STRING_COLUMN_TYPE = pa.string()
+_PK_HASH_STRING_COLUMN_FIELD = pa.field(
+    _PK_HASH_STRING_COLUMN_NAME,
+    _PK_HASH_STRING_COLUMN_TYPE,
+)
+
 _DEDUPE_TASK_IDX_COLUMN_NAME = _get_sys_col_name("dedupe_task_idx")
 _DEDUPE_TASK_IDX_COLUMN_TYPE = pa.int32()
 _DEDUPE_TASK_IDX_COLUMN_FIELD = pa.field(
@@ -36,6 +43,12 @@ _PARTITION_STREAM_POSITION_COLUMN_FIELD = pa.field(
     _PARTITION_STREAM_POSITION_COLUMN_TYPE,
 )

+_HASH_BUCKET_IDX_COLUMN_NAME = _get_sys_col_name("hash_bucket_idx")
+_HASH_BUCKET_IDX_COLUMN_TYPE = pa.int32()
+_HASH_BUCKET_IDX_COLUMN_FIELD = pa.field(
+    _HASH_BUCKET_IDX_COLUMN_NAME, _HASH_BUCKET_IDX_COLUMN_TYPE
+)
+
 _ORDERED_FILE_IDX_COLUMN_NAME = _get_sys_col_name("file_index")
 _ORDERED_FILE_IDX_COLUMN_TYPE = pa.int32()
 _ORDERED_FILE_IDX_COLUMN_FIELD = pa.field(
@@ -76,10 +89,18 @@ def get_pk_hash_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(obj, _PK_HASH_COLUMN_TYPE)


+def get_pk_hash_string_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
+    return pa.array(obj, _PK_HASH_STRING_COLUMN_TYPE)
+
+
 def pk_hash_column_np(table: pa.Table) -> np.ndarray:
     return table[_PK_HASH_COLUMN_NAME].to_numpy()


+def pk_hash_string_column_np(table: pa.Table) -> np.ndarray:
+    return table[_PK_HASH_STRING_COLUMN_NAME].to_numpy()
+
+
 def pk_hash_column(table: pa.Table) -> pa.ChunkedArray:
     return table[_PK_HASH_COLUMN_NAME]

@@ -143,6 +164,10 @@ def get_delta_type_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     )


+def get_hash_bucket_idx_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
+    return pa.array(obj, _HASH_BUCKET_IDX_COLUMN_TYPE)
+
+
 def get_is_source_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(
         obj,
@@ -232,6 +257,24 @@ def append_pk_hash_column(table: pa.Table, pk_hashes) -> pa.Table:
     return table


+def append_pk_hash_string_column(table: pa.Table, pk_hashes) -> pa.Table:
+
+    table = table.append_column(
+        _PK_HASH_STRING_COLUMN_FIELD, get_pk_hash_string_column_array(pk_hashes)
+    )
+    return table
+
+
+def append_hash_bucket_idx_col(table: pa.Table, hash_bucket_indexes) -> pa.Table:
+
+    table = table.append_column(
+        _HASH_BUCKET_IDX_COLUMN_FIELD,
+        get_hash_bucket_idx_column_array(hash_bucket_indexes),
+    )
+
+    return table
+
+
 def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:

     table = table.append_column(
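
The new append_pk_hash_string_column and append_hash_bucket_idx_col helpers follow the module's existing pattern: build a typed pyarrow array and append it to the table under a reserved system-column field. A minimal standalone sketch of that pattern (the literal column name below is a stand-in for whatever _get_sys_col_name returns):

import pyarrow as pa

# Stand-in for _HASH_BUCKET_IDX_COLUMN_FIELD; the real name comes from
# _get_sys_col_name("hash_bucket_idx").
hash_bucket_idx_field = pa.field("_hash_bucket_idx", pa.int32())

table = pa.table({"pk": ["a", "b", "c"]})
hash_bucket_indexes = [0, 3, 1]

# Build a typed array and append it to the table under the system column field.
table = table.append_column(
    hash_bucket_idx_field, pa.array(hash_bucket_indexes, pa.int32())
)
print(table.column_names)  # ['pk', '_hash_bucket_idx']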