deltacat 1.1.27__py3-none-any.whl → 1.1.29__py3-none-any.whl
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/constants.py +15 -1
- deltacat/compute/compactor_v2/steps/merge.py +30 -5
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +15 -3
- deltacat/tests/compute/compact_partition_test_cases.py +32 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +133 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/test_utils/pyarrow.py +15 -8
- deltacat/tests/utils/test_pyarrow.py +278 -0
- deltacat/utils/pyarrow.py +162 -31
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/METADATA +1 -1
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/RECORD +16 -15
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/WHEEL +1 -1
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/LICENSE +0 -0
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/compute/compactor_v2/constants.py
CHANGED
@@ -1,3 +1,5 @@
+from deltacat.utils.common import env_bool, env_integer
+
 TOTAL_BYTES_IN_SHA1_HASH = 20
 
 PK_DELIMITER = "L6kl7u5f"
@@ -31,7 +33,9 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
 # The total size of records that will be hash bucketed at once
 # Since, sorting is nlogn, we ensure that is not performed
 # on a very large dataset for best performance.
-MAX_SIZE_OF_RECORD_BATCH_IN_GIB =
+MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
+    "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
+)
 
 # Whether to drop duplicates during merge.
 DROP_DUPLICATES = True
@@ -78,3 +82,13 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
 # Number of rounds to run hash/merge for a single
 # partition. (For large table support)
 DEFAULT_NUM_ROUNDS = 1
+
+# Whether to perform sha1 hashing when required to
+# optimize memory. For example, hashing is always
+# required for bucketing where it's not mandatory
+# when dropping duplicates. Setting this to True
+# will disable sha1 hashing in cases where it isn't
+# mandatory. This flag is False by default.
+SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
+    "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
+)
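Both constants above are now environment-driven (env_integer and env_bool come from deltacat.utils.common), so the record-batch size cap and the sha1-hashing opt-out can be tuned without code changes. A minimal sketch of overriding them, assuming the variables are exported before the constants module is first imported and that env_bool treats a non-empty string such as "True" as truthy:

```python
import os

# Assumption: set these before deltacat.compute.compactor_v2.constants is imported,
# since env_bool/env_integer are evaluated at module import time.
os.environ["SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED"] = "True"
os.environ["MAX_SIZE_OF_RECORD_BATCH_IN_GIB"] = str(1 * 1024 * 1024 * 1024)  # 1 GiB cap

from deltacat.compute.compactor_v2 import constants

print(constants.SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED)  # expected: truthy
print(constants.MAX_SIZE_OF_RECORD_BATCH_IN_GIB)                # expected: 1073741824
```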
deltacat/compute/compactor_v2/steps/merge.py
CHANGED
@@ -7,6 +7,7 @@ import ray
 import itertools
 import time
 import pyarrow.compute as pc
+from deltacat.utils.pyarrow import MAX_INT_BYTES
 import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
 from deltacat import logs
@@ -147,10 +148,32 @@ def _merge_tables(
     if compacted_table:
         compacted_table = all_tables[0]
 
+        compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
+        incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
+
+        logger.info(
+            f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
+            f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
+        )
+
+        if (
+            compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+            or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+        ):
+            logger.info("Casting compacted and incremental pk hash to large_string...")
+            # is_in combines the chunks of the chunked array passed which can cause
+            # ArrowCapacityError if the total size of string array is over 2GB.
+            # Using a large_string would resolve this issue.
+            # The cast here should be zero-copy in most cases.
+            compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
+            incremental_pk_hash_str = pc.cast(
+                incremental_pk_hash_str, pa.large_string()
+            )
+
         records_to_keep = pc.invert(
             pc.is_in(
-
-
+                compacted_pk_hash_str,
+                incremental_pk_hash_str,
             )
         )
 
@@ -492,9 +515,11 @@ def _copy_manifests_from_hash_bucketing(
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
-    with
-        f"merge_{worker_id}_{task_id}.bin"
-
+    with (
+        memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
+        if input.enable_profiler
+        else nullcontext()
+    ):
         total_input_records, total_deduped_records = 0, 0
         total_dropped_records = 0
         materialized_results: List[MaterializeResult] = []
deltacat/compute/compactor_v2/utils/dedupe.py
CHANGED
@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
         result[index] = np.arange(cl, dtype="int32")
 
     chunk_lengths = ([0] + chunk_lengths)[:-1]
-    result = pa.chunked_array(result + np.cumsum(chunk_lengths))
+    result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
     return result
 
 
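A short standalone sketch (plain numpy/pyarrow, not deltacat's helper) of what the explicit type=pa.int32() above buys: the chunked index is built from int32 ranges shifted by numpy cumsum offsets, and without an explicit type pa.chunked_array infers the integer width from whatever dtype numpy's promotion rules produce for those sums.

```python
import numpy as np
import pyarrow as pa

chunk_lengths = [3, 2]
chunks = [np.arange(cl, dtype="int32") for cl in chunk_lengths]
offsets = np.cumsum(([0] + chunk_lengths)[:-1])  # [0, 3], int64 by default

# Shift each chunk by its starting offset so indices are global across chunks.
shifted = [chunk + offset for chunk, offset in zip(chunks, offsets)]

# Pinning int32 keeps the index column's type stable instead of depending on how
# numpy promoted the shifted chunks.
chunked = pa.chunked_array(shifted, type=pa.int32())
print(chunked.type)         # int32
print(chunked.to_pylist())  # [0, 1, 2, 3, 4]
```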
deltacat/compute/compactor_v2/utils/primary_key_index.py
CHANGED
@@ -10,6 +10,7 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_BYTES_IN_SHA1_HASH,
     PK_DELIMITER,
     MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
+    SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
 )
 import time
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -48,6 +49,13 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
         f"Found total length of hash column={total_len} and total_size={total_size}"
     )
 
+    if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
+        logger.info(
+            f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
+            f"Returning False for is_sha1_desired"
+        )
+        return False
+
     return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
 
 
@@ -108,9 +116,10 @@ def _optimized_group_record_batches_by_hash_bucket(
     record_batches = []
     result_len = 0
     for record_batch in table_batches:
-
-
-
+        if (
+            record_batches
+            and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
+        ):
             logger.info(
                 f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
                 f"is {len(record_batches)} and size {current_bytes}"
@@ -128,6 +137,9 @@ def _optimized_group_record_batches_by_hash_bucket(
             current_bytes = 0
             record_batches.clear()
 
+        current_bytes += record_batch.nbytes
+        record_batches.append(record_batch)
+
     if record_batches:
         appended_len, append_latency = timed_invocation(
             _append_table_by_hash_bucket,
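The loop above now flushes the accumulated record batches before appending one that would push the group past MAX_SIZE_OF_RECORD_BATCH_IN_GIB, then starts the next group with that batch. A miniature sketch of the same accumulate-then-flush pattern with a tiny byte cap (plain pyarrow, not the deltacat function):

```python
import pyarrow as pa

batch = pa.record_batch([pa.array([1, 2, 3, 4])], names=["x"])
table_batches = [batch] * 5

# Cap sized so three toy batches fit per group; the compactor's cap is ~2 GiB.
MAX_GROUP_BYTES = batch.nbytes * 3 + 1

groups, record_batches, current_bytes = [], [], 0
for record_batch in table_batches:
    # Flush the current group first if adding this batch would cross the cap.
    if record_batches and current_bytes + record_batch.nbytes >= MAX_GROUP_BYTES:
        groups.append(pa.Table.from_batches(record_batches))
        record_batches, current_bytes = [], 0
    current_bytes += record_batch.nbytes
    record_batches.append(record_batch)

if record_batches:
    groups.append(pa.Table.from_batches(record_batches))

print([g.num_rows for g in groups])  # [12, 8]: three batches, then the remaining two
```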
deltacat/tests/compute/compact_partition_test_cases.py
CHANGED
@@ -601,6 +601,38 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
         skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
         assert_compaction_audit=None,
     ),
+    "15-incremental-empty-input-with-single-hash-bucket": IncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[SortKey.of(key_name="sk_col_1")],
+        partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
+        partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        expected_terminal_exception=None,
+        expected_terminal_exception_message=None,
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        is_inplace=False,
+        add_late_deltas=None,
+        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
+        assert_compaction_audit=assert_compaction_audit_no_hash_bucket,
+    ),
 }
 
 INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
deltacat/tests/compute/compactor_v2/test_compaction_session.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import Dict, Any
 import ray
 import os
+import pyarrow as pa
 import pytest
 import boto3
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -76,6 +77,17 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(scope="function")
+def disable_sha1(monkeypatch):
+    import deltacat.compute.compactor_v2.utils.primary_key_index
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.utils.primary_key_index,
+        "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED",
+        True,
+    )
+
+
 class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
@@ -556,3 +568,124 @@
                 }
             )
         )
+
+    def test_compact_partition_when_incremental_pk_hash_is_over_2gb(
+        self, s3_resource, local_deltacat_storage_kwargs, disable_sha1
+    ):
+        """
+        A test case which ensures the compaction succeeds even if the incremental
+        arrow table size is over 2GB. It is added to prevent ArrowCapacityError
+        when running is_in operation during merge.
+
+        Note that we set SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED to bypass sha1 hashing
+        which truncates the lengths of pk strings when deduping.
+        """
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+        # we create chunked array to avoid ArrowCapacityError
+        chunked_pk_array = pa.chunked_array([["13bytesstring"], ["12bytestring"]])
+        table = pa.table([chunked_pk_array], names=["pk"])
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, pa_table=table, **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # rebase first
+        rebase_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        rebased_rcf = get_rcf(s3_resource, rebase_url)
+
+        assert rebased_rcf.compacted_pyarrow_write_result.files == 1
+        assert rebased_rcf.compacted_pyarrow_write_result.records == 2
+
+        # Run incremental with a small delta on source
+        chunked_pk_array = pa.chunked_array(
+            [["13bytesstring" * 95_000_000], ["12bytestring" * 95_000_000]]
+        )  # 2.3GB
+        table = pa.table([chunked_pk_array], names=["pk"])
+
+        incremental_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            pa_table=table,
+            **local_deltacat_storage_kwargs,
+        )
+        assert (
+            incremental_source_delta.partition_locator == source_delta.partition_locator
+        ), "source partition locator should not change"
+        dest_partition = ds.get_partition(
+            dest_partition.stream_locator,
+            dest_partition.partition_values,
+            **local_deltacat_storage_kwargs,
+        )
+
+        assert (
+            dest_partition.locator
+            == rebased_rcf.compacted_delta_locator.partition_locator
+        ), "The new destination partition should be same as compacted partition"
+
+        # Run incremental
+        incremental_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": incremental_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": incremental_source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        incremental_rcf = get_rcf(s3_resource, incremental_url)
+
+        assert incremental_rcf.compacted_pyarrow_write_result.files == 1
+        assert (
+            incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
+        )
+        assert incremental_rcf.compacted_pyarrow_write_result.records == 4
deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py
ADDED
@@ -0,0 +1,45 @@
+import pyarrow as pa
+from deltacat.compute.compactor_v2.utils.primary_key_index import (
+    group_by_pk_hash_bucket,
+)
+
+
+class TestGroupByPkHashBucket:
+    def test_sanity(self):
+        record = pa.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+        pk = pa.array(["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"])
+        record_batch = pa.RecordBatch.from_arrays([record, pk], names=["record", "pk"])
+        table = pa.Table.from_batches([record_batch])
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        total_records = 0
+        for arr in grouped_array:
+            if arr is not None:
+                total_records += len(arr[1])
+
+        assert total_records == len(table)
+
+    def test_when_record_batches_exceed_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # two record batches are preserved as combining them
+        # would exceed 2GB.
+        assert len(grouped_array[2].to_batches()) == 2
+
+    def test_when_record_batches_less_than_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # Combined the arrays into one record batch as the size
+        # would not exceed 2GB.
+        assert len(grouped_array[1].to_batches()) == 1
deltacat/tests/test_utils/pyarrow.py
CHANGED
@@ -47,7 +47,8 @@ def stage_partition_from_file_paths(
 
 def commit_delta_to_staged_partition(
     staged_partition,
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
@@ -57,6 +58,7 @@ def commit_delta_to_staged_partition(
         *args,
        file_paths=file_paths,
        content_type=content_type,
+        pa_table=pa_table,
        **kwargs,
    )
    ds.commit_partition(staged_partition, **kwargs)
@@ -76,23 +78,28 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
 
 def commit_delta_to_partition(
     partition: Union[Partition, PartitionLocator],
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
 ) -> Delta:
-    tables = []
 
     if isinstance(partition, PartitionLocator):
         partition = ds.get_partition(
             partition.stream_locator, partition.partition_values, *args, **kwargs
         )
+    if pa_table is None:
+        assert file_paths is not None, "One of pa_table or file_paths must be passed."
+        tables = []
+        for file_path in file_paths:
+            table = pa.csv.read_csv(file_path)
+            tables.append(table)
 
-
-        table = pa.csv.read_csv(file_path)
-        tables.append(table)
+        pa_table = pa.concat_tables(tables)
 
-
-
+    staged_delta = ds.stage_delta(
+        pa_table, partition, content_type=content_type, **kwargs
+    )
 
     return ds.commit_delta(staged_delta, **kwargs)
deltacat/tests/utils/test_pyarrow.py
CHANGED
@@ -7,15 +7,24 @@ from deltacat.utils.pyarrow import (
     s3_file_to_table,
     ReadKwargsProviderPyArrowSchemaOverride,
     RAISE_ON_EMPTY_CSV_KWARG,
+    RAISE_ON_DECIMAL_OVERFLOW,
 )
+import decimal
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from pyarrow.parquet import ParquetFile
 import pyarrow as pa
 
 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
+PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
 EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
 NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
+OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
+    "deltacat/tests/utils/data/overflowing_decimal_precision.csv"
+)
+OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
+    "deltacat/tests/utils/data/overflowing_decimal_scale.csv"
+)
 GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
 BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
 
@@ -407,6 +416,253 @@ class TestReadCSV(TestCase):
             ),
         )
 
+    def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_precision_overflows_sanity(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_sanity(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
+
+    def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(),
+            decimal.Decimal("322200"),  # consequence of negative scale
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
+
+    def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
+
+    def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
+        self,
+    ):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowNotImplementedError,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        # The default behavior of pyarrow is to invalid skip rows
+        self.assertEqual(len(result), 2)
+        self.assertEqual(result[1][0].as_py(), 32.33)  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), 0.4)  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.float64())
+
+    def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
+        self,
+    ):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
+        )
+
+    def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
+            result = pyarrow_read_csv(file, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounding decimal
+        self.assertEqual(
+            result[1][1].as_py(), decimal.Decimal("32.33")
+        )  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
+
 
 class TestS3FileToTable(TestCase):
     def test_s3_file_to_table_identity_sanity(self):
@@ -534,3 +790,25 @@ class TestS3FileToTable(TestCase):
         self.assertEqual(field.name, schema.field(index).name)
 
         self.assertEqual(result.schema.field(1).type, "string")
+
+    def test_s3_file_to_table_when_parquet_gzip(self):
+
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+
+        result = s3_file_to_table(
+            PARQUET_GZIP_COMPRESSED_FILE_PATH,
+            ContentType.PARQUET.value,
+            ContentEncoding.GZIP.value,
+            ["n_legs", "animal"],
+            ["n_legs"],
+            pa_read_func_kwargs_provider=pa_kwargs_provider,
+        )
+
+        self.assertEqual(len(result), 6)
+        self.assertEqual(len(result.column_names), 1)
+        schema = result.schema
+        schema_index = schema.get_field_index("n_legs")
+        self.assertEqual(schema.field(schema_index).type, "int64")
deltacat/utils/pyarrow.py
CHANGED
@@ -1,6 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+import copy
 import bz2
 import gzip
 import io
@@ -47,6 +48,18 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
 READER_TYPE_KWARG = "reader_type"
 
+"""
+By default, round decimal values using half_to_even round mode when
+rescaling a decimal to the given scale and precision in the schema would cause
+data loss. Setting any non null value of this argument will result
+in an error instead.
+"""
+RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
+# Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
+DECIMAL256_DEFAULT_SCALE = 38
+DECIMAL256_MAX_PRECISION = 76
+MAX_INT_BYTES = 2147483646
+
 
 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
 
@@ -64,45 +77,164 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
     return target_schema
 
 
-def
-
-
-
+def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
+    schema = None
+    if (
+        "convert_options" in kwargs
+        and kwargs["convert_options"].column_types is not None
+    ):
+        schema = kwargs["convert_options"].column_types
+        if not isinstance(schema, pa.Schema):
+            schema = pa.schema(schema)
+        if kwargs["convert_options"].include_columns:
+            schema = _filter_schema_for_columns(
+                schema, kwargs["convert_options"].include_columns
+            )
+    elif (
+        kwargs.get("read_options") is not None
+        and kwargs["read_options"].column_names
+    ):
+        schema = _filter_schema_for_columns(
+            schema, kwargs["read_options"].column_names
+        )
+    else:
+        logger.debug(
+            "Schema not specified in the kwargs."
+            " Hence, schema could not be inferred from the empty CSV."
         )
+
+    return schema
+
+
+def _new_schema_with_replaced_fields(
+    schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
+) -> pa.Schema:
+    if schema is None:
+        return None
+
+    new_schema_fields = []
+    for field in schema:
+        new_field = field_to_replace(field)
+        if new_field is not None:
+            new_schema_fields.append(new_field)
+        else:
+            new_schema_fields.append(field)
+
+    return pa.schema(new_schema_fields, metadata=schema.metadata)
+
+
+def _read_csv_rounding_decimal_columns_to_fit_scale(
+    schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
+) -> pa.Table:
+    # Note: We read decimals as strings first because CSV
+    # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
+    new_schema = _new_schema_with_replaced_fields(
+        schema,
+        lambda fld: (
+            pa.field(fld.name, pa.string(), metadata=fld.metadata)
+            if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+            else None
+        ),
+    )
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"],
+        reader_kwargs,
+    )
+    # Creating a shallow copy for efficiency
+    new_convert_options = copy.copy(new_kwargs["convert_options"])
+    new_convert_options.column_types = new_schema
+    new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
+    arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
+
+    for column_index, field in enumerate(schema):
+        if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
+            column_array = arrow_table[field.name]
+            # We always cast to decimal256 to accomodate fixed scale of 38
+            cast_to_type = pa.decimal256(
+                DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
+            )
+            casted_decimal_array = pc.cast(column_array, cast_to_type)
+            # Note that scale can be negative
+            rounded_column_array = pc.round(
+                casted_decimal_array, ndigits=field.type.scale
+            )
+            final_decimal_array = pc.cast(rounded_column_array, field.type)
+            arrow_table = arrow_table.set_column(
+                column_index,
+                field,
+                final_decimal_array,
+            )
+            logger.debug(
+                f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
+                f" {field.type.precision} precision"
+            )
+
+    return arrow_table
+
+
+def pyarrow_read_csv_default(*args, **kwargs):
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+    )
+
+    try:
         return pacsv.read_csv(*args, **new_kwargs)
     except pa.lib.ArrowInvalid as e:
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-
+        error_str = e.__str__()
+        schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+        if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
+            logger.debug(f"Read CSV empty schema being used: {schema}")
+            return pa.Table.from_pylist([], schema=schema)
+        if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+            # Note, this logic requires expensive casting. To prevent downgrading performance
+            # for happy path reads, we are handling this case in response to an error.
+            logger.warning(
+                "Rescaling Decimal to the given scale in the schema. "
+                f"Original error: {error_str}"
+            )
+
+            if schema is not None and "convert_options" in kwargs:
+                if (
+                    "Rescaling Decimal" in error_str
+                    and "value would cause data loss" in error_str
                 ):
-
-
+                    logger.debug(f"Checking if the file: {args[0]}...")
+                    # Since we are re-reading the file, we have to seek to beginning
+                    if isinstance(args[0], io.IOBase) and args[0].seekable():
+                        logger.debug(f"Seeking to the beginning of the file {args[0]}")
+                        args[0].seek(0)
+                    return _read_csv_rounding_decimal_columns_to_fit_scale(
+                        schema=schema, reader_args=args, reader_kwargs=kwargs
                     )
-
             else:
                 logger.debug(
-                    "Schema
-                    "
+                    "Schema is None when trying to adjust decimal values. "
+                    "Hence, bubbling up exception..."
                 )
 
-            logger.debug(f"Read CSV empty schema being used: {schema}")
-            return pa.Table.from_pylist([], schema=schema)
        raise e
 
 
+def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
+    schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+    # CSV conversion to decimal256 isn't supported as of pyarrow=12.0.1
+    # Below ensures decimal256 is casted properly.
+    schema_includes_decimal256 = (
+        (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
+        if schema is not None
+        else None
+    )
+    if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+        # falling back to expensive method of reading CSV
+        return _read_csv_rounding_decimal_columns_to_fit_scale(
+            schema, reader_args=args, reader_kwargs=kwargs
+        )
+    else:
+        return pyarrow_read_csv_default(*args, **kwargs)
+
+
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
     ContentType.TSV.value: pyarrow_read_csv,
@@ -440,8 +572,8 @@ def s3_file_to_table(
         **s3_client_kwargs,
     )
 
-
-
+    if READER_TYPE_KWARG in kwargs:
+        kwargs.pop(READER_TYPE_KWARG)
 
     filesystem = io
     if s3_url.startswith("s3://"):
@@ -783,7 +915,6 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
     TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
     """
     dtype = array.type
-    MAX_BYTES = 2147483646
     max_str_len = None
     if pa.types.is_integer(dtype):
        max_str_len = _int_max_string_len()
@@ -795,7 +926,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
        max_str_len = _max_decimal256_string_len()
 
    if max_str_len is not None:
-        max_elems_per_chunk =
+        max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len)  # safety factor of 2
        all_chunks = []
        for chunk in array.chunks:
            if len(chunk) < max_elems_per_chunk:
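The decimal fallback above reads the affected columns as strings, widens them to decimal256 at the maximum scale, rounds to the schema's scale, and casts down to the requested decimal type. A standalone sketch of just that cast-round-cast step (not the deltacat reader itself), assuming a pyarrow version where string-to-decimal256 casts and decimal rounding are available, as the code above relies on:

```python
import pyarrow as pa
import pyarrow.compute as pc

DECIMAL256_MAX_PRECISION = 76
DECIMAL256_DEFAULT_SCALE = 38

# A value with more fractional digits than the target decimal128(20, 2) allows.
raw = pa.array(["322236.6666"], type=pa.string())
target_type = pa.decimal128(20, 2)

# Widen through decimal256 at the maximum scale so no digits are lost yet...
wide = pc.cast(raw, pa.decimal256(DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE))
# ...round to the target scale (half_to_even by default)...
rounded = pc.round(wide, ndigits=target_type.scale)
# ...then cast down to the schema's decimal type.
final = pc.cast(rounded, target_type)
print(final[0])  # 322236.67
```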
{deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=
+deltacat/__init__.py,sha256=DoUiDxmgMh8HUGOEAG7CUY0Q9Ip-S7gePDsL8XQO5kk,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -51,7 +51,7 @@ deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZR
 deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
 deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/compaction_session.py,sha256=COtol2s63DRPbd-AN9KCiWr4exLX8x5Tvxea_7cOGEQ,8078
-deltacat/compute/compactor_v2/constants.py,sha256=
+deltacat/compute/compactor_v2/constants.py,sha256=wvd34d7RGdniGbbiJcMljxRrRas4_uy9F9UaqXfS_Ag,3034
 deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
 deltacat/compute/compactor_v2/deletes/delete_strategy.py,sha256=SMEJOxR-5r92kvKNqtu2w6HmwtmhljcZX1wcNEuS-4w,2833
@@ -69,14 +69,14 @@ deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
 deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
-deltacat/compute/compactor_v2/steps/merge.py,sha256=
+deltacat/compute/compactor_v2/steps/merge.py,sha256=qxmb3cmiKvOgfuOzlJT4Q60zOyWNjsiuZSzxdh6KTm8,22909
 deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=1P9CDpuWErsFcTTlRCeuUQHDokVI92he_MsL82uRAdA,7424
-deltacat/compute/compactor_v2/utils/dedupe.py,sha256=
+deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hdLNZ_AF7bJ0wficr0,1949
 deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
 deltacat/compute/compactor_v2/utils/io.py,sha256=3m4dorxj-WD6Yu9_3gRE6gz3C-eNJA7nn02sHKwo-J8,6018
 deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
-deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=
+deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
 deltacat/compute/compactor_v2/utils/task_options.py,sha256=W0jyWIIZ0tcSAGp8mhpnu1G8p3rmX4d3juCPpAJxnDM,12649
 deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
 deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
@@ -136,7 +136,7 @@ deltacat/tests/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kWyIJQMyF9oBemvgOp3ngGhMpH9zjkznV-67ewELgHQ,37719
 deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
 deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
-deltacat/tests/compute/compact_partition_test_cases.py,sha256=
+deltacat/tests/compute/compact_partition_test_cases.py,sha256=HJ15Xyawv8ImFju8wDwt22fH5okoPhyS-QAygkXDb7Q,27422
 deltacat/tests/compute/test_compact_partition_incremental.py,sha256=lkfAraOJmEmieesf7b1BqlfTS26YjYM5xXOXoTMrsos,14989
 deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=xXBA66TTfARR90m5KQs31nmiokuMy9iGQt7Z9evyG7M,12950
 deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
@@ -152,9 +152,10 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
 deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
 deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=
+deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=y8nNHq9ADHENzUKMQYguB45zOD7F2lZgcBYYTvbTsdM,28957
 deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
 deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
 deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=37DkR1u_XwhedV9cGed6FFuJTC0XmuiowHJIa_Op6uA,865
 deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
@@ -171,7 +172,7 @@ deltacat/tests/local_deltacat_storage/__init__.py,sha256=5T9ubNIS42-BotEH0yrUiWE
 deltacat/tests/local_deltacat_storage/exceptions.py,sha256=oxZ0psmrEO0M6P2r8gHQ2E8E-Y8UBfUCBUIwfuHcx38,251
 deltacat/tests/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/test_utils/constants.py,sha256=UYe--9T_clYjiOpv0M7TtAMGdpje_SMZ-w8n0IeCAjc,214
-deltacat/tests/test_utils/pyarrow.py,sha256=
+deltacat/tests/test_utils/pyarrow.py,sha256=QDdGilzsJ2xUESiGotdNVZde9yD7ja9MvNhhssnox-E,3083
 deltacat/tests/test_utils/storage.py,sha256=93GEn4A5WbMHWk0Ec4Bd7RxeHoSEnBfSarfWhKOSNtM,972
 deltacat/tests/test_utils/utils.py,sha256=a32qEwcSSd1lvRi0aJJ4ZLnc1ZyXmoQF_K95zaQRk2M,455
 deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -179,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
 deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
 deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
 deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
-deltacat/tests/utils/test_pyarrow.py,sha256=
+deltacat/tests/utils/test_pyarrow.py,sha256=fDjDkGPjdRZA3kgjgiQRym9shdeDYgkdDPYU2a7IEUk,30790
 deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
 deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
 deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
 deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
 deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
 deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
-deltacat/utils/pyarrow.py,sha256=
+deltacat/utils/pyarrow.py,sha256=R3KkJPenE48rS3VrfFKSkJerX94f4e7X2dUPBQg44DY,34339
 deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
 deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
 deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -210,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
+deltacat-1.1.29.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.29.dist-info/METADATA,sha256=ZverlgFUJV4wGJao8tusRCv_sRNX4KJ4RTNAGvBCJes,1733
+deltacat-1.1.29.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.29.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.29.dist-info/RECORD,,
{deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/LICENSE
File without changes
{deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/top_level.txt
File without changes