deltacat 0.1.18b15__py3-none-any.whl → 0.1.18b16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/model/compact_partition_params.py +11 -1
  3. deltacat/compute/compactor/model/compaction_session_audit_info.py +13 -0
  4. deltacat/compute/compactor/model/delta_annotated.py +10 -6
  5. deltacat/compute/compactor/repartition_session.py +2 -0
  6. deltacat/compute/compactor/steps/repartition.py +6 -0
  7. deltacat/compute/compactor_v2/compaction_session.py +72 -69
  8. deltacat/compute/compactor_v2/constants.py +3 -0
  9. deltacat/compute/compactor_v2/model/merge_input.py +17 -1
  10. deltacat/compute/compactor_v2/steps/merge.py +430 -2
  11. deltacat/compute/compactor_v2/utils/content_type_params.py +43 -14
  12. deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
  13. deltacat/compute/compactor_v2/utils/io.py +11 -8
  14. deltacat/compute/compactor_v2/utils/primary_key_index.py +58 -25
  15. deltacat/compute/compactor_v2/utils/task_options.py +8 -15
  16. deltacat/tests/compute/common.py +1 -1
  17. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -0
  18. deltacat/tests/compute/test_compaction_session_incremental.py +16 -1
  19. deltacat/tests/compute/testcases.py +7 -2
  20. deltacat/tests/test_utils/pyarrow.py +23 -6
  21. deltacat/types/partial_download.py +1 -0
  22. deltacat/types/tables.py +5 -0
  23. deltacat/utils/arguments.py +1 -2
  24. deltacat/utils/pyarrow.py +5 -0
  25. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +1 -1
  26. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +29 -30
  27. deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  28. deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +0 -199
  29. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
  30. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +0 -0
  31. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
- __version__ = "0.1.18b15"
+ __version__ = "0.1.18b16"
 
 
__all__ = [
deltacat/compute/compactor/model/compact_partition_params.py CHANGED
@@ -19,6 +19,7 @@ from deltacat.compute.compactor_v2.constants import (
MIN_FILES_IN_BATCH,
AVERAGE_RECORD_SIZE_BYTES,
TASK_MAX_PARALLELISM,
+ DROP_DUPLICATES,
)
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -88,6 +89,7 @@ class CompactPartitionParams(dict):
result.hash_group_count = params.get(
"hash_group_count", result.hash_bucket_count
)
+ result.drop_duplicates = params.get("drop_duplicates", DROP_DUPLICATES)
 
if not importlib.util.find_spec("memray"):
result.enable_profiler = False
@@ -196,7 +198,7 @@ class CompactPartitionParams(dict):
 
@property
def min_delta_bytes_in_batch(self) -> float:
- return self["min_files_in_batch"]
+ return self["min_delta_bytes_in_batch"]
 
@min_delta_bytes_in_batch.setter
def min_delta_bytes_in_batch(self, min_delta_bytes_in_batch: float) -> None:
@@ -258,6 +260,14 @@
def records_per_compacted_file(self, count: int) -> None:
self["records_per_compacted_file"] = count
 
+ @property
+ def drop_duplicates(self) -> bool:
+ return self["drop_duplicates"]
+
+ @drop_duplicates.setter
+ def drop_duplicates(self, value: bool):
+ self["drop_duplicates"] = value
+
@property
def bit_width_of_sort_keys(self) -> int:
return self["bit_width_of_sort_keys"]
deltacat/compute/compactor/model/compaction_session_audit_info.py CHANGED
@@ -1,5 +1,6 @@
# Allow classes to use self-referencing Type hints in Python 3.7.
from __future__ import annotations
+ import pyarrow as pa
import logging
from deltacat import logs
from typing import List, Union
@@ -419,6 +420,13 @@ class CompactionSessionAuditInfo(dict):
"""
return self.get("usedCPUSeconds")
 
+ @property
+ def pyarrow_version(self) -> str:
+ """
+ The version of PyArrow used.
+ """
+ return self.get("pyarrowVersion")
+
# Setters follow
 
def set_audit_url(self, audit_url: str) -> CompactionSessionAuditInfo:
@@ -735,6 +743,10 @@ class CompactionSessionAuditInfo(dict):
self["usedCPUSeconds"] = value
return self
 
+ def set_pyarrow_version(self, value: str) -> CompactionSessionAuditInfo:
+ self["pyarrowVersion"] = value
+ return self
+
# High level methods to save stats
def save_step_stats(
self,
@@ -863,4 +875,5 @@ class CompactionSessionAuditInfo(dict):
)
)
 
+ self.set_pyarrow_version(pa.__version__)
self.set_telemetry_time_in_seconds(total_telemetry_time)
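For context, a minimal sketch of the new audit field, assuming CompactionSessionAuditInfo can be constructed directly as the bare dict subclass shown above (in practice the compaction session populates it):

    import pyarrow as pa
    from deltacat.compute.compactor.model.compaction_session_audit_info import (
        CompactionSessionAuditInfo,
    )

    audit = CompactionSessionAuditInfo()       # assumed bare construction
    audit.set_pyarrow_version(pa.__version__)  # stored under the "pyarrowVersion" key
    print(audit.pyarrow_version)               # e.g. "12.0.1"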
deltacat/compute/compactor/model/delta_annotated.py CHANGED
@@ -89,6 +89,11 @@ class DeltaAnnotated(Delta):
for delta_annotated in annotated_deltas:
split_annotated_deltas.extend(DeltaAnnotated._split_single(delta_annotated))
 
+ logger.info(
+ f"Split the {len(annotated_deltas)} annotated deltas "
+ f"into {len(split_annotated_deltas)} groups."
+ )
+
for src_da in split_annotated_deltas:
src_da_annotations = src_da.annotations
src_da_entries = src_da.manifest.entries
@@ -280,12 +285,11 @@
)
 
result.append(new_da)
+ else:
+ return [delta_annotated]
 
- if result:
- return result
- else:
- logger.info(
- f"Split was not performed on the delta with locator: {delta_annotated.locator}"
- )
+ logger.info(
+ f"Split was not performed on the delta with locator: {delta_annotated.locator}"
+ )
 
return [delta_annotated]
deltacat/compute/compactor/repartition_session.py CHANGED
@@ -54,6 +54,7 @@ def repartition(
pg_config: Optional[PlacementGroupConfig] = None,
list_deltas_kwargs: Optional[Dict[str, Any]] = None,
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+ s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
s3_client_kwargs: Optional[Dict[str, Any]] = None,
deltacat_storage=unimplemented_deltacat_storage,
**kwargs,
@@ -131,6 +132,7 @@
enable_profiler=enable_profiler,
metrics_config=metrics_config,
read_kwargs_provider=read_kwargs_provider,
+ s3_table_writer_kwargs=s3_table_writer_kwargs,
repartitioned_file_content_type=repartitioned_file_content_type,
deltacat_storage=deltacat_storage,
)
deltacat/compute/compactor/steps/repartition.py CHANGED
@@ -56,6 +56,7 @@ def repartition_range(
destination_partition: Partition,
repartition_args: dict,
max_records_per_output_file: int,
+ s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
deltacat_storage=unimplemented_deltacat_storage,
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
@@ -145,6 +146,7 @@
destination_partition,
max_records_per_entry=max_records_per_output_file,
content_type=repartitioned_file_content_type,
+ s3_table_writer_kwargs=s3_table_writer_kwargs,
**deltacat_storage_kwargs,
)
partition_deltas.append(partition_delta)
@@ -166,6 +168,7 @@ def _timed_repartition(
max_records_per_output_file: int,
enable_profiler: bool,
read_kwargs_provider: Optional[ReadKwargsProvider],
+ s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
deltacat_storage=unimplemented_deltacat_storage,
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
@@ -189,6 +192,7 @@
destination_partition=destination_partition,
repartition_args=repartition_args,
max_records_per_output_file=max_records_per_output_file,
+ s3_table_writer_kwargs=s3_table_writer_kwargs,
repartitioned_file_content_type=repartitioned_file_content_type,
deltacat_storage=deltacat_storage,
deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -209,6 +213,7 @@ def repartition(
enable_profiler: bool,
metrics_config: Optional[MetricsConfig],
read_kwargs_provider: Optional[ReadKwargsProvider],
+ s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
deltacat_storage=unimplemented_deltacat_storage,
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
@@ -226,6 +231,7 @@
max_records_per_output_file=max_records_per_output_file,
enable_profiler=enable_profiler,
read_kwargs_provider=read_kwargs_provider,
+ s3_table_writer_kwargs=s3_table_writer_kwargs,
repartitioned_file_content_type=repartitioned_file_content_type,
deltacat_storage=deltacat_storage,
deltacat_storage_kwargs=deltacat_storage_kwargs,
deltacat/compute/compactor_v2/compaction_session.py CHANGED
@@ -133,7 +133,7 @@ def _execute_compaction(
# read the results from any previously completed compaction round
round_completion_info = None
high_watermark = None
- previous_compacted_delta = None
+ previous_compacted_delta_manifest = None
 
if not params.rebase_source_partition_locator:
round_completion_info = rcf.read_round_completion_file(
@@ -147,13 +147,11 @@
)
else:
compacted_delta_locator = round_completion_info.compacted_delta_locator
- previous_compacted_delta = params.deltacat_storage.get_delta(
- namespace=compacted_delta_locator.namespace,
- table_name=compacted_delta_locator.table_name,
- table_version=compacted_delta_locator.table_version,
- stream_position=compacted_delta_locator.stream_position,
- include_manifest=True,
- **params.deltacat_storage_kwargs,
+
+ previous_compacted_delta_manifest = (
+ params.deltacat_storage.get_delta_manifest(
+ compacted_delta_locator, **params.deltacat_storage_kwargs
+ )
)
 
high_watermark = round_completion_info.high_watermark
@@ -182,7 +180,22 @@
params.list_deltas_kwargs,
)
 
+ uniform_deltas = io.create_uniform_input_deltas(
+ input_deltas=input_deltas,
+ hash_bucket_count=params.hash_bucket_count,
+ compaction_audit=compaction_audit,
+ deltacat_storage=params.deltacat_storage,
+ previous_inflation=params.previous_inflation,
+ min_delta_bytes=params.min_delta_bytes_in_batch,
+ min_file_counts=params.min_files_in_batch,
+ # disable input split during rebase as the rebase files are already uniform
+ enable_input_split=params.rebase_source_partition_locator is None,
+ deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+ )
+
delta_discovery_end = time.monotonic()
+
+ compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
compaction_audit.set_delta_discovery_time_in_seconds(
delta_discovery_end - delta_discovery_start
)
@@ -197,19 +210,6 @@
logger.info("No input deltas found to compact.")
return None, None, None
 
- uniform_deltas = io.create_uniform_input_deltas(
- input_deltas=input_deltas,
- hash_bucket_count=params.hash_bucket_count,
- compaction_audit=compaction_audit,
- deltacat_storage=params.deltacat_storage,
- previous_inflation=params.previous_inflation,
- min_delta_bytes=params.min_delta_bytes_in_batch,
- min_file_counts=params.min_files_in_batch,
- deltacat_storage_kwargs=params.deltacat_storage_kwargs,
- )
-
- compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
-
hb_options_provider = functools.partial(
task_resource_options_provider,
pg_config=params.pg_config,
@@ -221,20 +221,21 @@
 
hb_start = time.monotonic()
 
- hash_bucket_input_provider = lambda index, item: {
- "input": HashBucketInput.of(
- item,
- primary_keys=params.primary_keys,
- num_hash_buckets=params.hash_bucket_count,
- num_hash_groups=params.hash_group_count,
- enable_profiler=params.enable_profiler,
- metrics_config=params.metrics_config,
- read_kwargs_provider=params.read_kwargs_provider,
- object_store=params.object_store,
- deltacat_storage=params.deltacat_storage,
- deltacat_storage_kwargs=params.deltacat_storage_kwargs,
- )
- }
+ def hash_bucket_input_provider(index, item):
+ return {
+ "input": HashBucketInput.of(
+ item,
+ primary_keys=params.primary_keys,
+ num_hash_buckets=params.hash_bucket_count,
+ num_hash_groups=params.hash_group_count,
+ enable_profiler=params.enable_profiler,
+ metrics_config=params.metrics_config,
+ read_kwargs_provider=params.read_kwargs_provider,
+ object_store=params.object_store,
+ deltacat_storage=params.deltacat_storage,
+ deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+ )
+ }
 
hb_tasks_pending = invoke_parallel(
items=uniform_deltas,
@@ -332,33 +333,36 @@
hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
hash_group_num_rows=all_hash_group_idx_to_num_rows,
round_completion_info=round_completion_info,
- compacted_delta=previous_compacted_delta,
+ compacted_delta_manifest=previous_compacted_delta_manifest,
primary_keys=params.primary_keys,
deltacat_storage=params.deltacat_storage,
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
)
 
- merge_input_provider = lambda index, item: {
- "input": MergeInput.of(
- dfe_groups_refs=item[1],
- write_to_partition=compacted_partition,
- compacted_file_content_type=params.compacted_file_content_type,
- primary_keys=params.primary_keys,
- sort_keys=params.sort_keys,
- merge_task_index=index,
- hash_group_index=item[0],
- num_hash_groups=params.hash_group_count,
- max_records_per_output_file=params.records_per_compacted_file,
- enable_profiler=params.enable_profiler,
- metrics_config=params.metrics_config,
- s3_table_writer_kwargs=params.s3_table_writer_kwargs,
- read_kwargs_provider=params.read_kwargs_provider,
- round_completion_info=round_completion_info,
- object_store=params.object_store,
- deltacat_storage=params.deltacat_storage,
- deltacat_storage_kwargs=params.deltacat_storage_kwargs,
- )
- }
+ def merge_input_provider(index, item):
+ return {
+ "input": MergeInput.of(
+ dfe_groups_refs=item[1],
+ write_to_partition=compacted_partition,
+ compacted_file_content_type=params.compacted_file_content_type,
+ primary_keys=params.primary_keys,
+ sort_keys=params.sort_keys,
+ merge_task_index=index,
+ hash_bucket_count=params.hash_bucket_count,
+ drop_duplicates=params.drop_duplicates,
+ hash_group_index=item[0],
+ num_hash_groups=params.hash_group_count,
+ max_records_per_output_file=params.records_per_compacted_file,
+ enable_profiler=params.enable_profiler,
+ metrics_config=params.metrics_config,
+ s3_table_writer_kwargs=params.s3_table_writer_kwargs,
+ read_kwargs_provider=params.read_kwargs_provider,
+ round_completion_info=round_completion_info,
+ object_store=params.object_store,
+ deltacat_storage=params.deltacat_storage,
+ deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+ )
+ }
 
merge_start = time.monotonic()
 
@@ -399,25 +403,25 @@
mat_results, key=lambda m: m.task_index
)
 
- deltas = [m.delta for m in mat_results]
-
hb_id_to_entry_indices_range = {}
file_index = 0
previous_task_index = -1
 
- for m in mat_results:
- assert m.pyarrow_write_result.files >= 1, "Atleast file must be materialized"
- assert m.task_index != previous_task_index, (
- "Multiple materialize results found for a " f"hash bucket: {m.task_index}"
- )
+ for mat_result in mat_results:
+ assert (
+ mat_result.pyarrow_write_result.files >= 1
+ ), "Atleast one file must be materialized"
+ assert (
+ mat_result.task_index != previous_task_index
+ ), f"Multiple materialize results found for a hash bucket: {mat_result.task_index}"
 
- hb_id_to_entry_indices_range[str(m.task_index)] = (
+ hb_id_to_entry_indices_range[str(mat_result.task_index)] = (
file_index,
- file_index + m.pyarrow_write_result.files - 1,
+ file_index + mat_result.pyarrow_write_result.files,
)
 
- file_index += m.pyarrow_write_result.files
- previous_task_index = m.task_index
+ file_index += mat_result.pyarrow_write_result.files
+ previous_task_index = mat_result.task_index
 
s3_utils.upload(
compaction_audit.audit_url,
@@ -425,7 +429,6 @@
**params.s3_client_kwargs,
)
 
- mat_results = sorted(mat_results, key=lambda m: m.task_index)
deltas = [m.delta for m in mat_results]
 
# Note: An appropriate last stream position must be set
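One behavioral detail in the hunk above: the per-hash-bucket manifest entry ranges switch from an inclusive end (files - 1) to an exclusive end (files). A small, self-contained example mirroring the new loop:

    # Two materialize results that wrote 3 and 2 files, respectively.
    mat_files = [(0, 3), (1, 2)]  # (task_index, pyarrow_write_result.files)
    hb_id_to_entry_indices_range = {}
    file_index = 0
    for task_index, files in mat_files:
        # End index is now exclusive: (start, start + files).
        hb_id_to_entry_indices_range[str(task_index)] = (file_index, file_index + files)
        file_index += files
    print(hb_id_to_entry_indices_range)  # {'0': (0, 3), '1': (3, 5)}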
deltacat/compute/compactor_v2/constants.py CHANGED
@@ -32,3 +32,6 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 20
# Since, sorting is nlogn, we ensure that is not performed
# on a very large dataset for best performance.
MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
+
+ # Whether to drop duplicates during merge.
+ DROP_DUPLICATES = True
deltacat/compute/compactor_v2/model/merge_input.py CHANGED
@@ -10,6 +10,10 @@ from deltacat.storage import (
SortKey,
interface as unimplemented_deltacat_storage,
)
+ from deltacat.compute.compactor_v2.constants import (
+ DROP_DUPLICATES,
+ MAX_RECORDS_PER_COMPACTED_FILE,
+ )
from deltacat.types.media import ContentType
from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
@@ -24,9 +28,11 @@ class MergeInput(Dict):
primary_keys: List[str],
hash_group_index: int,
num_hash_groups: int,
+ hash_bucket_count: int,
+ drop_duplicates: Optional[bool] = DROP_DUPLICATES,
sort_keys: Optional[List[SortKey]] = None,
merge_task_index: Optional[int] = 0,
- max_records_per_output_file: Optional[int] = 4_000_000,
+ max_records_per_output_file: Optional[int] = MAX_RECORDS_PER_COMPACTED_FILE,
enable_profiler: Optional[bool] = False,
metrics_config: Optional[MetricsConfig] = None,
s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
@@ -44,6 +50,8 @@ class MergeInput(Dict):
result["primary_keys"] = primary_keys
result["hash_group_index"] = hash_group_index
result["num_hash_groups"] = num_hash_groups
+ result["hash_bucket_count"] = hash_bucket_count
+ result["drop_duplicates"] = drop_duplicates
result["sort_keys"] = sort_keys
result["merge_task_index"] = merge_task_index
result["max_records_per_output_file"] = max_records_per_output_file
@@ -82,6 +90,14 @@ class MergeInput(Dict):
def num_hash_groups(self) -> int:
return self["num_hash_groups"]
 
+ @property
+ def hash_bucket_count(self) -> int:
+ return self["hash_bucket_count"]
+
+ @property
+ def drop_duplicates(self) -> int:
+ return self["drop_duplicates"]
+
@property
def sort_keys(self) -> Optional[List[SortKey]]:
return self.get("sort_keys")