deltacat 0.1.18b15__py3-none-any.whl → 0.1.18b16__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/model/compact_partition_params.py +11 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +13 -0
- deltacat/compute/compactor/model/delta_annotated.py +10 -6
- deltacat/compute/compactor/repartition_session.py +2 -0
- deltacat/compute/compactor/steps/repartition.py +6 -0
- deltacat/compute/compactor_v2/compaction_session.py +72 -69
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/model/merge_input.py +17 -1
- deltacat/compute/compactor_v2/steps/merge.py +430 -2
- deltacat/compute/compactor_v2/utils/content_type_params.py +43 -14
- deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
- deltacat/compute/compactor_v2/utils/io.py +11 -8
- deltacat/compute/compactor_v2/utils/primary_key_index.py +58 -25
- deltacat/compute/compactor_v2/utils/task_options.py +8 -15
- deltacat/tests/compute/common.py +1 -1
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -0
- deltacat/tests/compute/test_compaction_session_incremental.py +16 -1
- deltacat/tests/compute/testcases.py +7 -2
- deltacat/tests/test_utils/pyarrow.py +23 -6
- deltacat/types/partial_download.py +1 -0
- deltacat/types/tables.py +5 -0
- deltacat/utils/arguments.py +1 -2
- deltacat/utils/pyarrow.py +5 -0
- {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +1 -1
- {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +29 -30
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +0 -199
- {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +0 -0
- {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/utils/primary_key_index.py
CHANGED

@@ -31,8 +31,19 @@ def _append_sha1_hash_to_table(table: pa.Table, hash_column: pa.Array) -> pa.Tab
     return sc.append_pk_hash_string_column(table, result)
 
 
-def _is_sha1_desired(
-
+def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
+    total_size = 0
+    total_len = 0
+
+    for hash_column in hash_columns:
+        total_size += hash_column.nbytes
+        total_len += len(hash_column)
+
+    logger.info(
+        f"Found total length of hash column={total_len} and total_size={total_size}"
+    )
+
+    return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
 
 
 def _append_table_by_hash_bucket(
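The new `_is_sha1_desired` helper decides whether the concatenated primary-key strings should be replaced by a SHA-1 digest, based on their total byte size. A minimal standalone sketch of that heuristic follows; the value of `TOTAL_BYTES_IN_SHA1_HASH` is an assumption here (20 bytes, the size of a raw SHA-1 digest), the real constant is defined in `compactor_v2/constants.py`.

```python
import pyarrow as pa

TOTAL_BYTES_IN_SHA1_HASH = 20  # assumed value, see compactor_v2/constants.py


def is_sha1_desired(hash_columns) -> bool:
    total_size = sum(col.nbytes for col in hash_columns)
    total_len = sum(len(col) for col in hash_columns)
    # Hash only when the raw concatenated key strings average more than one
    # digest's worth of bytes per row, i.e. when hashing would shrink them.
    return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len


short_keys = [pa.array(["a|1", "b|2"])]       # a few bytes per row -> keep raw strings
long_keys = [pa.array(["x" * 64, "y" * 64])]  # 64 bytes per row -> SHA-1 saves space
print(is_sha1_desired(short_keys), is_sha1_desired(long_keys))  # False True
```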
@@ -61,7 +72,9 @@ def _append_table_by_hash_bucket(
     for i, group_count in enumerate(group_count_array):
         hb_idx = hb_group_array[i].as_py()
         pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count.as_py())
-        pyarrow_table = pyarrow_table.drop(
+        pyarrow_table = pyarrow_table.drop(
+            [sc._HASH_BUCKET_IDX_COLUMN_NAME, sc._PK_HASH_STRING_COLUMN_NAME]
+        )
         if hash_bucket_to_table[hb_idx] is None:
             hash_bucket_to_table[hb_idx] = []
         hash_bucket_to_table[hb_idx].append(pyarrow_table)

@@ -142,7 +155,7 @@ def _optimized_group_record_batches_by_hash_bucket(
 def group_by_pk_hash_bucket(
     table: pa.Table, num_buckets: int, primary_keys: List[str]
 ) -> np.ndarray:
-    table = generate_pk_hash_column(table, primary_keys, requires_sha1=True)
+    table = generate_pk_hash_column([table], primary_keys, requires_sha1=True)[0]
 
     # group hash bucket record indices
     result = group_record_indices_by_hash_bucket(
@@ -154,53 +167,73 @@ def group_by_pk_hash_bucket(
 
 
 def generate_pk_hash_column(
-
+    tables: List[pa.Table],
     primary_keys: Optional[List[str]] = None,
     requires_sha1: bool = False,
-) -> pa.Table:
+) -> List[pa.Table]:
     """
-    Returns a new table after generating the primary key hash if desired.
+    Returns a new table list after generating the primary key hash if desired.
 
     1. If there are no primary keys, each hash will be unique uuid/sha1 hex
-    2. If there are more than 0 primary keys, returns a table with
+    2. If there are more than 0 primary keys, returns a table with pk hash column appended.
     """
 
-
-
-    can_sha1 = False
-    if primary_keys:
+    def _generate_pk_hash(table: pa.Table) -> pa.Array:
         pk_columns = []
         for pk_name in primary_keys:
             pk_columns.append(pc.cast(table[pk_name], pa.string()))
 
         pk_columns.append(PK_DELIMITER)
         hash_column = pc.binary_join_element_wise(*pk_columns)
+        return hash_column
 
-
-    else:
+    def _generate_uuid(table: pa.Table) -> pa.Array:
         hash_column = pa.array(
             [uuid.uuid4().hex for _ in range(len(table))], pa.string()
         )
+        return hash_column
+
+    start = time.monotonic()
+
+    hash_column_list = []
+
+    can_sha1 = False
+    if primary_keys:
+        hash_column_list = [_generate_pk_hash(table) for table in tables]
+
+        can_sha1 = requires_sha1 or _is_sha1_desired(hash_column_list)
+    else:
+        hash_column_list = [_generate_uuid(table) for table in tables]
 
     logger.info(
-        f"can_generate_sha1={can_sha1} for the table
-        f"={hash_column.nbytes} bytes, num_rows={len(hash_column)}, "
-        f"and requires_sha1={requires_sha1}"
+        f"can_generate_sha1={can_sha1} for the table and requires_sha1={requires_sha1}"
     )
 
-
-
-
-
+    result = []
+
+    total_len = 0
+    total_size = 0
+    for index, table in enumerate(tables):
+        if can_sha1:
+            table = _append_sha1_hash_to_table(table, hash_column_list[index])
+        else:
+            table = table.append_column(
+                sc._PK_HASH_STRING_COLUMN_FIELD, hash_column_list[index]
+            )
+
+        total_len += len(table)
+        total_size += hash_column_list[index].nbytes
+
+        result.append(table)
 
     end = time.monotonic()
 
     logger.info(
-        f"Took {end - start}s to generate pk hash of len: {
-        f"
+        f"Took {end - start}s to generate pk hash of len: {total_len}"
+        f" for size: {total_size} bytes"
     )
 
-    return
+    return result
 
 
 def group_record_indices_by_hash_bucket(
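`generate_pk_hash_column` now accepts a list of tables and returns a list, so the SHA-1 decision can be made once across all inputs. The following is a simplified, standalone sketch of the per-table hash-column construction it performs; the delimiter value and the `pk_hash_columns` helper name are illustrative, not deltacat's actual identifiers.

```python
from typing import List

import pyarrow as pa
import pyarrow.compute as pc

PK_DELIMITER = "|"  # illustrative delimiter; deltacat defines its own PK_DELIMITER


def pk_hash_columns(tables: List[pa.Table], primary_keys: List[str]) -> List[pa.Array]:
    """Concatenate each table's primary-key columns into one string column per table."""
    results = []
    for table in tables:
        pk_columns = [pc.cast(table[pk], pa.string()) for pk in primary_keys]
        pk_columns.append(PK_DELIMITER)
        results.append(pc.binary_join_element_wise(*pk_columns))
    return results


tables = [pa.table({"id": [1, 2], "region": ["us", "eu"]})]
hashes = pk_hash_columns(tables, ["id", "region"])
print(hashes[0])  # ["1|us", "2|eu"]
```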
@@ -298,7 +331,7 @@ def hash_group_index_to_hash_bucket_indices(
     if hb_group > num_buckets:
         return []
 
-    return range(hb_group,
+    return range(hb_group, num_buckets, num_groups)
 
 
 def pk_digest_to_hash_bucket_index(digest: str, num_buckets: int) -> int:
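The fix above restores the full `range(start, stop, step)` call. A small sketch of the round-robin mapping this implies, assuming a hash bucket is assigned to group `bucket % num_groups` (the helper names below are illustrative):

```python
def bucket_to_group(hb_index: int, num_groups: int) -> int:
    # Assumed assignment rule: buckets are spread over groups round-robin.
    return hb_index % num_groups


def group_to_buckets(hb_group: int, num_buckets: int, num_groups: int) -> range:
    # Inverse of the rule above: every num_groups-th bucket starting at hb_group.
    return range(hb_group, num_buckets, num_groups)


# With 8 buckets spread over 3 groups:
print([bucket_to_group(b, 3) for b in range(8)])  # [0, 1, 2, 0, 1, 2, 0, 1]
print(list(group_to_buckets(1, 8, 3)))            # [1, 4, 7]
```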
deltacat/compute/compactor_v2/utils/task_options.py
CHANGED

@@ -2,7 +2,7 @@ from typing import Dict, Optional, List, Tuple
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from deltacat.storage import (
-
+    Manifest,
     ManifestEntry,
     interface as unimplemented_deltacat_storage,
 )

@@ -11,9 +11,6 @@ from deltacat.compute.compactor.model.round_completion_info import RoundCompleti
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     hash_group_index_to_hash_bucket_indices,
 )
-from deltacat.compute.compactor_v2.utils.content_type_params import (
-    append_content_type_params,
-)
 from deltacat.compute.compactor_v2.constants import TOTAL_MEMORY_BUFFER_PERCENTAGE
 
 

@@ -24,6 +21,7 @@ def _get_parquet_type_params_if_exist(
         entry.meta
         and entry.meta.content_type == ContentType.PARQUET
         and entry.meta.content_encoding == ContentEncoding.IDENTITY
+        and entry.meta.content_type_parameters
     ):
         for type_params in entry.meta.content_type_parameters:
             if isinstance(type_params, PartialParquetParameters):

@@ -93,7 +91,7 @@ def estimate_manifest_entry_column_size_bytes(
 
     type_params = _get_parquet_type_params_if_exist(entry=entry)
 
-    if type_params.pq_metadata:
+    if type_params and type_params.pq_metadata:
        return _calculate_parquet_column_size(type_params=type_params, columns=columns)
 
     return None
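The extra guard skips estimation when no `PartialParquetParameters` are attached to the manifest entry. For readers unfamiliar with the parquet-metadata path, here is a hedged, self-contained sketch of estimating a column's size from row-group metadata; it mirrors the idea behind `_calculate_parquet_column_size` but is not deltacat's implementation.

```python
import io

import pyarrow as pa
import pyarrow.parquet as papq

# Build a small parquet file in memory just to have real metadata to inspect.
table = pa.table({"id": list(range(1000)), "payload": ["x" * 32] * 1000})
buf = io.BytesIO()
papq.write_table(table, buf)
buf.seek(0)

meta = papq.ParquetFile(buf).metadata
wanted = {"payload"}

estimate = 0
for rg in range(meta.num_row_groups):
    row_group = meta.row_group(rg)
    for col in range(row_group.num_columns):
        chunk = row_group.column(col)
        if chunk.path_in_schema in wanted:
            # total_uncompressed_size approximates the in-memory footprint
            estimate += chunk.total_uncompressed_size

print(estimate)
```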
@@ -153,7 +151,7 @@ def merge_resource_options_provider(
     hash_group_size_bytes: Dict[int, int],
     hash_group_num_rows: Dict[int, int],
     round_completion_info: Optional[RoundCompletionInfo] = None,
-
+    compacted_delta_manifest: Optional[Manifest] = None,
     primary_keys: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict] = {},

@@ -168,8 +166,8 @@ def merge_resource_options_provider(
 
     if (
         round_completion_info
-        and
-        and round_completion_info.
+        and compacted_delta_manifest
+        and round_completion_info.hb_index_to_entry_range
     ):
 
         previous_inflation = (

@@ -187,15 +185,10 @@ def merge_resource_options_provider(
 
         for hb_idx in iterable:
             entry_start, entry_end = round_completion_info.hb_index_to_entry_range[
-                hb_idx
+                str(hb_idx)
             ]
             for entry_index in range(entry_start, entry_end):
-                entry =
-                    compacted_delta,
-                    entry_index=entry_index,
-                    deltacat_storage=deltacat_storage,
-                    deltacat_storage_kwargs=deltacat_storage_kwargs,
-                )
+                entry = compacted_delta_manifest.entries[entry_index]
 
                 current_entry_size = estimate_manifest_entry_size_bytes(
                     entry=entry, previous_inflation=previous_inflation
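The resource provider now reads entries directly from the compacted delta manifest and looks up the entry range with `str(hb_idx)`. A plausible reason for the string key (an assumption, not stated in the diff) is that the round completion info is persisted as JSON, where integer dict keys round-trip as strings. A small sketch of that effect:

```python
import json

# Hypothetical round-completion payload: after a JSON round trip, the integer
# hash-bucket keys become strings, which is why the lookup would use str(hb_idx).
hb_index_to_entry_range = json.loads(json.dumps({0: [0, 2], 1: [2, 5]}))
print(hb_index_to_entry_range)  # {'0': [0, 2], '1': [2, 5]}

hb_idx = 1
entry_start, entry_end = hb_index_to_entry_range[str(hb_idx)]
for entry_index in range(entry_start, entry_end):
    # in deltacat this would read compacted_delta_manifest.entries[entry_index]
    print("estimate size of entry", entry_index)
```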
deltacat/tests/compute/common.py
CHANGED

@@ -15,7 +15,7 @@ BASE_TEST_DESTINATION_NAMESPACE = "destination_test_namespace"
 BASE_TEST_DESTINATION_TABLE_NAME = "destination_test_table_RAY"
 BASE_TEST_DESTINATION_TABLE_VERSION = "1"
 
-HASH_BUCKET_COUNT: int =
+HASH_BUCKET_COUNT: int = 3
 
 MAX_RECORDS_PER_FILE: int = 1
 

deltacat/tests/compute/compactor/steps/test_repartition.py
CHANGED

@@ -49,6 +49,7 @@ class TestRepartitionRange(unittest.TestCase):
         self.destination_partition: PartitionLocator = MagicMock()
         self.repartition_args = {"column": "last_updated", "ranges": [1678665487112747]}
         self.max_records_per_output_file = 2
+        self.s3_table_writer_kwargs = {}
         self.repartitioned_file_content_type = ContentType.PARQUET
         self.deltacat_storage = MagicMock()
         self.deltacat_storage_kwargs = MagicMock()

@@ -59,6 +60,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -85,6 +87,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -98,6 +101,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -110,6 +114,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -123,6 +128,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -137,6 +143,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -151,6 +158,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -167,6 +175,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -180,6 +189,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -196,6 +206,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
         ),

@@ -222,6 +233,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,
deltacat/tests/compute/test_compaction_session_incremental.py
CHANGED

@@ -238,7 +238,8 @@ def test_compact_partition_incremental(
     rebase_source_partition_locator_param,
     partition_values_param,
     expected_result,
-
+    # use and implement func and func_kwargs if you want to run additional validations apart from the ones in the test
+    validation_callback_func,
     validation_callback_func_kwargs,
     do_teardown_local_deltacat_storage_db,
     use_prev_compacted,

@@ -281,6 +282,10 @@ def test_compact_partition_incremental(
         arrow_arrays_param,
         partition_values_param,
         ds_mock_kwargs,
+        f"{test_name}_src_namespace",
+        f"{test_name}_table_src",
+        f"{test_name}_dest_namespace",
+        f"{test_name}_table_dest",
     )
     ray.shutdown()
     ray.init(local_mode=True)

@@ -334,6 +339,16 @@ def test_compact_partition_incremental(
     compacted_delta_locator = round_completion_info.compacted_delta_locator
     tables = ds.download_delta(compacted_delta_locator, **ds_mock_kwargs)
     compacted_table = pa.concat_tables(tables)
+
+    # the compacted table may contain multiple files and chunks
+    # and order of records may be incorrect due to multiple files.
+    expected_result = expected_result.combine_chunks().sort_by(
+        [(val, "ascending") for val in primary_keys_param]
+    )
+    compacted_table = compacted_table.combine_chunks().sort_by(
+        [(val, "ascending") for val in primary_keys_param]
+    )
+
     assert compacted_table.equals(
         expected_result
     ), f"{compacted_table} does not match {expected_result}"
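The test now compares the compacted output order-insensitively, since the compacted delta can span multiple files whose rows arrive in any order. A standalone sketch of the same comparison pattern:

```python
import pyarrow as pa

# Sorting both tables by the primary keys makes equals() ignore row order and chunking.
primary_keys = ["pk"]
expected = pa.table({"pk": [1, 2, 3], "value": ["a", "b", "c"]})
actual = pa.concat_tables(
    [pa.table({"pk": [3], "value": ["c"]}), pa.table({"pk": [1, 2], "value": ["a", "b"]})]
)

sort_keys = [(pk, "ascending") for pk in primary_keys]
assert not actual.equals(expected)  # differs only by row order and chunking
assert actual.combine_chunks().sort_by(sort_keys).equals(
    expected.combine_chunks().sort_by(sort_keys)
)
```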
deltacat/tests/compute/testcases.py
CHANGED

@@ -12,13 +12,18 @@ from deltacat.tests.compute.common import (
 from deltacat.compute.compactor.compaction_session import (
     compact_partition_from_request as compact_partition_v1,
 )
+from deltacat.compute.compactor_v2.compaction_session import (
+    compact_partition as compact_partition_v2,
+)
 
 
 def create_tests_cases_for_all_compactor_versions(test_cases: Dict[str, List]):
     final_cases = {}
-    for version, compact_partition_func in enumerate(
+    for version, compact_partition_func in enumerate(
+        [compact_partition_v1, compact_partition_v2]
+    ):
         for case_name, case_value in test_cases.items():
-            final_cases[f"{case_name}_v{version}"] = [
+            final_cases[f"{case_name}_v{version + 1}"] = [
                 *case_value,
                 compact_partition_func,
             ]
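A runnable sketch of how this loop fans each case out across both compactor versions; the v1/v2 stubs stand in for the imported compaction functions and are not deltacat's implementations.

```python
def compact_partition_v1(**kwargs): ...
def compact_partition_v2(**kwargs): ...


def create_tests_cases_for_all_compactor_versions(test_cases):
    final_cases = {}
    for version, compact_partition_func in enumerate(
        [compact_partition_v1, compact_partition_v2]
    ):
        for case_name, case_value in test_cases.items():
            # enumerate() starts at 0, so "+ 1" produces the human-facing _v1/_v2 suffix
            final_cases[f"{case_name}_v{version + 1}"] = [*case_value, compact_partition_func]
    return final_cases


print(list(create_tests_cases_for_all_compactor_versions({"basic": [1]}).keys()))
# ['basic_v1', 'basic_v2']
```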
deltacat/tests/test_utils/pyarrow.py
CHANGED

@@ -1,24 +1,42 @@
 from typing import List
 import pyarrow as pa
-from deltacat.storage import Delta
+from deltacat.storage import Delta, Partition
 import deltacat.tests.local_deltacat_storage as ds
 
 
 def create_delta_from_csv_file(
     namespace: str, file_paths: List[str], *args, **kwargs
 ) -> Delta:
-
+    staged_partition = stage_partition_from_csv_file(
+        namespace, file_paths, *args, **kwargs
+    )
 
-
-
-
+    committed_delta = commit_delta_to_staged_partition(
+        staged_partition, file_paths, *args, **kwargs
+    )
 
+    return committed_delta
+
+
+def stage_partition_from_csv_file(
+    namespace: str, file_paths: List[str], *args, **kwargs
+) -> Partition:
     ds.create_namespace(namespace, {}, **kwargs)
     table_name = "-".join(file_paths).replace("/", "_")
     ds.create_table_version(namespace, table_name, "1", **kwargs)
     stream = ds.get_stream(namespace, table_name, "1", **kwargs)
     staged_partition = ds.stage_partition(stream, [], **kwargs)
+    return staged_partition
 
+
+def commit_delta_to_staged_partition(
+    staged_partition, file_paths: List[str], *args, **kwargs
+) -> Delta:
+    tables = []
+
+    for file_path in file_paths:
+        table = pa.csv.read_csv(file_path)
+        tables.append(table)
     deltas = []
 
     for table in tables:

@@ -28,5 +46,4 @@ def create_delta_from_csv_file(
     merged_delta = Delta.merge_deltas(deltas=deltas)
     committed_delta = ds.commit_delta(merged_delta, **kwargs)
     ds.commit_partition(staged_partition, **kwargs)
-
     return committed_delta

deltacat/types/partial_download.py
CHANGED

@@ -38,6 +38,7 @@ class PartialParquetParameters(PartialFileDownloadParams):
         num_row_groups = pq_metadata.num_row_groups
         row_groups_to_download = [rg for rg in range(num_row_groups)]
         in_memory_size_bytes = 0.0
+        num_rows = pq_metadata.num_rows
 
         for rg in row_groups_to_download:
             row_group_meta = pq_metadata.row_group(rg)
deltacat/types/tables.py
CHANGED

@@ -4,6 +4,7 @@ from typing import Callable, Dict, Type, Union
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import pyarrow.parquet as papq
 from ray.data.dataset import Dataset
 from ray.data.read_api import (
     from_arrow,

@@ -49,6 +50,7 @@ TABLE_CLASS_TO_SIZE_FUNC: Dict[
     Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
 ] = {
     pa.Table: pa_utils.table_size,
+    papq.ParquetFile: pa_utils.parquet_file_size,
     pd.DataFrame: pd_utils.dataframe_size,
     np.ndarray: np_utils.ndarray_size,
     Dataset: ds_utils.dataset_size,

@@ -56,18 +58,21 @@ TABLE_CLASS_TO_SIZE_FUNC: Dict[
 
 TABLE_CLASS_TO_TABLE_TYPE: Dict[Type[dcs.LocalTable], str] = {
     pa.Table: TableType.PYARROW.value,
+    papq.ParquetFile: TableType.PYARROW_PARQUET.value,
     pd.DataFrame: TableType.PANDAS.value,
     np.ndarray: TableType.NUMPY.value,
 }
 
 TABLE_TYPE_TO_DATASET_CREATE_FUNC: Dict[str, Callable] = {
     TableType.PYARROW.value: from_arrow,
+    TableType.PYARROW_PARQUET.value: from_arrow,
     TableType.NUMPY.value: from_numpy,
     TableType.PANDAS.value: from_pandas,
 }
 
 TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS: Dict[str, Callable] = {
     TableType.PYARROW.value: from_arrow_refs,
+    TableType.PYARROW_PARQUET.value: from_arrow_refs,
     TableType.NUMPY.value: from_numpy,
     TableType.PANDAS.value: from_pandas_refs,
 }
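These registries key behavior off the Python class of a local table, so adding `papq.ParquetFile` entries is enough to teach the rest of the library about the new table type. A hedged sketch of that dispatch pattern; the dict below uses stand-in lambdas, not deltacat's actual size functions.

```python
from typing import Callable, Dict, Type

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as papq

# Illustrative registry mapping a table class to a size function.
TABLE_CLASS_TO_SIZE_FUNC: Dict[Type, Callable] = {
    pa.Table: lambda t: t.nbytes,
    papq.ParquetFile: lambda f: f.metadata.serialized_size,
    pd.DataFrame: lambda df: int(df.memory_usage(deep=True).sum()),
}


def table_size(table) -> int:
    size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
    if size_func is None:
        raise ValueError(f"No size function registered for {type(table)}")
    return size_func(table)


print(table_size(pa.table({"a": [1, 2, 3]})))
```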
deltacat/utils/arguments.py
CHANGED

@@ -1,5 +1,4 @@
 import inspect
-import copy
 from typing import Any, Dict
 
 

@@ -13,7 +12,7 @@ def sanitize_kwargs_to_callable(callable: Any, kwargs: Dict) -> Dict:
     signature = inspect.signature(callable)
     params = signature.parameters
 
-    new_kwargs =
+    new_kwargs = {**kwargs}
 
     for key in params:
         if params[key].kind == inspect.Parameter.VAR_KEYWORD:
deltacat/utils/pyarrow.py
CHANGED

@@ -294,6 +294,7 @@ def s3_partial_parquet_file_to_table(
         content_type=content_type,
         content_encoding=content_encoding,
         partial_file_download_params=partial_file_download_params,
+        pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
         **s3_client_kwargs,
     )
 

@@ -488,6 +489,10 @@ def table_size(table: pa.Table) -> int:
     return table.nbytes
 
 
+def parquet_file_size(table: papq.ParquetFile) -> int:
+    return table.metadata.serialized_size
+
+
 def table_to_file(
     table: pa.Table,
     base_path: str,