deltacat 0.1.18b1__py3-none-any.whl → 0.1.18b3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (28)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/compaction_session.py +62 -25
  3. deltacat/compute/compactor/model/delta_annotated.py +1 -1
  4. deltacat/compute/compactor/model/materialize_result.py +16 -2
  5. deltacat/compute/compactor/model/repartition_result.py +6 -0
  6. deltacat/compute/compactor/model/round_completion_info.py +8 -0
  7. deltacat/compute/compactor/repartition_session.py +174 -0
  8. deltacat/compute/compactor/steps/materialize.py +116 -27
  9. deltacat/compute/compactor/steps/repartition.py +210 -0
  10. deltacat/compute/compactor/utils/io.py +131 -49
  11. deltacat/compute/compactor/utils/round_completion_file.py +14 -16
  12. deltacat/constants.py +2 -0
  13. deltacat/storage/interface.py +1 -1
  14. deltacat/storage/model/types.py +10 -2
  15. deltacat/tests/compactor/utils/__init__.py +0 -0
  16. deltacat/tests/compactor/utils/test_io.py +69 -0
  17. deltacat/tests/test_repartition.py +193 -0
  18. deltacat/tests/test_utils/__init__.py +0 -0
  19. deltacat/tests/test_utils/constants.py +7 -0
  20. deltacat/tests/utils/test_resources.py +36 -0
  21. deltacat/utils/ray_utils/concurrency.py +2 -0
  22. deltacat/utils/resources.py +72 -0
  23. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/METADATA +2 -5
  24. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/RECORD +28 -18
  25. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/WHEEL +1 -1
  26. /deltacat/{utils/profiling.py → tests/compactor/__init__.py} +0 -0
  27. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/LICENSE +0 -0
  28. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -43,7 +43,7 @@ from deltacat.types.tables import TableWriteMode
 
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
- __version__ = "0.1.18.beta1"
+ __version__ = "0.1.18b3"
 
 
  __all__ = [
deltacat/compute/compactor/compaction_session.py CHANGED
@@ -37,6 +37,7 @@ from deltacat.utils.placement import PlacementGroupConfig
  from typing import List, Set, Optional, Tuple, Dict, Any
  from collections import defaultdict
  from deltacat.utils.metrics import MetricsConfig
+ from deltacat.utils.resources import log_current_cluster_utilization
 
  if importlib.util.find_spec("memray"):
      import memray
@@ -113,8 +114,11 @@ def compact_partition(
          f"compaction_partition.bin"
      ) if enable_profiler else nullcontext():
          partition = None
-         new_rcf_s3_url = None
-         (new_partition, new_rci, new_rcf_s3_url,) = _execute_compaction_round(
+         (
+             new_partition,
+             new_rci,
+             new_rcf_partition_locator,
+         ) = _execute_compaction_round(
              source_partition_locator,
              destination_partition_locator,
              primary_keys,
@@ -144,12 +148,19 @@ def compact_partition(
      logger.info(
          f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed"
      )
+     round_completion_file_s3_url = None
      if partition:
          logger.info(f"Committing compacted partition to: {partition.locator}")
          partition = deltacat_storage.commit_partition(partition)
          logger.info(f"Committed compacted partition: {partition}")
+
+         round_completion_file_s3_url = rcf.write_round_completion_file(
+             compaction_artifact_s3_bucket,
+             new_rcf_partition_locator,
+             new_rci,
+         )
      logger.info(f"Completed compaction session for: {source_partition_locator}")
-     return new_rcf_s3_url
+     return round_completion_file_s3_url
 
 
  def _execute_compaction_round(
@@ -283,13 +294,22 @@ def _execute_compaction_round(
          hash_bucket_count,
          last_stream_position_compacted,
          require_multiple_rounds,
-     ) = io.limit_input_deltas(
-         input_deltas,
-         cluster_resources,
-         hash_bucket_count,
-         min_hash_bucket_chunk_size,
-         input_deltas_stats=input_deltas_stats,
-         deltacat_storage=deltacat_storage,
+     ) = (
+         io.fit_input_deltas(
+             input_deltas,
+             cluster_resources,
+             hash_bucket_count,
+             deltacat_storage=deltacat_storage,
+         )
+         if input_deltas_stats is None
+         else io.limit_input_deltas(
+             input_deltas,
+             cluster_resources,
+             hash_bucket_count,
+             min_hash_bucket_chunk_size,
+             input_deltas_stats=input_deltas_stats,
+             deltacat_storage=deltacat_storage,
+         )
      )
 
      assert hash_bucket_count is not None and hash_bucket_count > 0, (
@@ -435,11 +455,39 @@ def _execute_compaction_round(
      )
      logger.info(f"Getting {len(mat_tasks_pending)} materialize result(s)...")
      mat_results = ray.get(mat_tasks_pending)
+     total_count_of_src_dfl_not_touched = sum(
+         m.count_of_src_dfl_not_touched for m in mat_results
+     )
+     total_length_src_dfl = sum(m.count_of_src_dfl for m in mat_results)
+     logger.info(
+         f"Got total of {total_count_of_src_dfl_not_touched} manifest files not touched."
+     )
+     logger.info(
+         f"Got total of {total_length_src_dfl} manifest files during compaction."
+     )
+     manifest_entry_copied_by_reference_ratio = (
+         (round(total_count_of_src_dfl_not_touched / total_length_src_dfl, 4) * 100)
+         if total_length_src_dfl != 0
+         else None
+     )
+     logger.info(
+         f"{manifest_entry_copied_by_reference_ratio} percent of manifest files are copied by reference during materialize."
+     )
+
      logger.info(f"Got {len(mat_results)} materialize result(s).")
 
+     log_current_cluster_utilization(log_identifier="post_materialize")
+
      mat_results = sorted(mat_results, key=lambda m: m.task_index)
      deltas = [m.delta for m in mat_results]
-     merged_delta = Delta.merge_deltas(deltas)
+
+     # Note: An appropriate last stream position must be set
+     # to avoid correctness issue.
+     merged_delta = Delta.merge_deltas(
+         deltas,
+         stream_position=last_stream_position_to_compact,
+     )
+
      record_info_msg = (
          f"Hash bucket records: {total_hb_record_count},"
          f" Deduped records: {total_dd_record_count}, "
@@ -463,35 +511,24 @@ def _execute_compaction_round(
          compacted_delta.stream_position,
      )
 
-     rci_high_watermark = (
-         rebase_source_partition_high_watermark
-         if rebase_source_partition_high_watermark
-         else last_stream_position_compacted
-     )
-
      last_rebase_source_partition_locator = rebase_source_partition_locator or (
          round_completion_info.rebase_source_partition_locator
          if round_completion_info
          else None
      )
      new_round_completion_info = RoundCompletionInfo.of(
-         rci_high_watermark,
+         last_stream_position_compacted,
          new_compacted_delta_locator,
          PyArrowWriteResult.union([m.pyarrow_write_result for m in mat_results]),
          bit_width_of_sort_keys,
          last_rebase_source_partition_locator,
+         manifest_entry_copied_by_reference_ratio,
      )
      rcf_source_partition_locator = (
          rebase_source_partition_locator
          if rebase_source_partition_locator
          else source_partition_locator
      )
-     round_completion_file_s3_url = rcf.write_round_completion_file(
-         compaction_artifact_s3_bucket,
-         rcf_source_partition_locator,
-         new_round_completion_info,
-     )
-
      logger.info(
          f"partition-{source_partition_locator.partition_values},"
          f"compacted at: {last_stream_position_compacted},"
@@ -500,5 +537,5 @@ def _execute_compaction_round(
      return (
          partition,
          new_round_completion_info,
-         round_completion_file_s3_url,
+         rcf_source_partition_locator,
      )
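
The new manifest_entry_copied_by_reference_ratio logged above is simply the share of source manifest files that were carried over untouched, expressed as a percentage. A minimal standalone sketch of the same arithmetic, with made-up counts:

# Illustrative only: recompute the copy-by-reference ratio with invented counts.
total_count_of_src_dfl_not_touched = 75  # manifest files referenced as-is
total_length_src_dfl = 120               # all source manifest files seen by materialize

manifest_entry_copied_by_reference_ratio = (
    round(total_count_of_src_dfl_not_touched / total_length_src_dfl, 4) * 100
    if total_length_src_dfl != 0
    else None
)
print(manifest_entry_copied_by_reference_ratio)  # 62.5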
deltacat/compute/compactor/model/delta_annotated.py CHANGED
@@ -62,7 +62,7 @@ class DeltaAnnotated(Delta):
      @staticmethod
      def rebatch(
          annotated_deltas: List[DeltaAnnotated],
-         min_delta_bytes,
+         min_delta_bytes: float,
          min_file_counts: Optional[Union[int, float]] = float("inf"),
          estimation_function: Optional[Callable] = None,
      ) -> List[DeltaAnnotated]:
deltacat/compute/compactor/model/materialize_result.py CHANGED
@@ -1,7 +1,7 @@
  # Allow classes to use self-referencing Type hints in Python 3.7.
  from __future__ import annotations
 
- from typing import Any, Dict
+ from typing import Any, Dict, Optional
 
  from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
  from deltacat.storage import Delta
@@ -10,12 +10,18 @@ from deltacat.storage import Delta
  class MaterializeResult(dict):
      @staticmethod
      def of(
-         delta: Delta, task_index: int, pyarrow_write_result: PyArrowWriteResult
+         delta: Delta,
+         task_index: int,
+         pyarrow_write_result: PyArrowWriteResult,
+         count_of_src_dfl_not_touched: Optional[int] = 0,
+         count_of_src_dfl: Optional[int] = 0,
      ) -> MaterializeResult:
          materialize_result = MaterializeResult()
          materialize_result["delta"] = delta
          materialize_result["taskIndex"] = task_index
          materialize_result["paWriteResult"] = pyarrow_write_result
+         materialize_result["countOfSrcFileNotTouched"] = count_of_src_dfl_not_touched
+         materialize_result["countOfSrcFile"] = count_of_src_dfl
          return materialize_result
 
      @property
@@ -35,3 +41,11 @@ class MaterializeResult(dict):
          if val is not None and not isinstance(val, PyArrowWriteResult):
              self["paWriteResult"] = val = PyArrowWriteResult(val)
          return val
+
+     @property
+     def count_of_src_dfl_not_touched(self) -> int:
+         return self["countOfSrcFileNotTouched"]
+
+     @property
+     def count_of_src_dfl(self) -> int:
+         return self["countOfSrcFile"]
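
Because MaterializeResult is a plain dict subclass, the two new counters round-trip through the countOfSrcFileNotTouched and countOfSrcFile keys. A minimal usage sketch, assuming deltacat 0.1.18b3 is installed; the None placeholders stand in for a real Delta and PyArrowWriteResult and are never dereferenced here:

from deltacat.compute.compactor.model.materialize_result import MaterializeResult

# of() only stores the values it is given, so None placeholders are safe
# as long as the delta/paWriteResult accessors are not exercised.
result = MaterializeResult.of(
    delta=None,
    task_index=0,
    pyarrow_write_result=None,
    count_of_src_dfl_not_touched=3,
    count_of_src_dfl=10,
)
assert result.count_of_src_dfl_not_touched == 3
assert result["countOfSrcFile"] == 10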
deltacat/compute/compactor/model/repartition_result.py ADDED
@@ -0,0 +1,6 @@
+ from typing import NamedTuple, List
+ from deltacat.storage import Delta
+
+
+ class RepartitionResult(NamedTuple):
+     range_deltas: List[Delta]
deltacat/compute/compactor/model/round_completion_info.py CHANGED
@@ -38,6 +38,7 @@ class RoundCompletionInfo(dict):
          compacted_pyarrow_write_result: PyArrowWriteResult,
          sort_keys_bit_width: int,
          rebase_source_partition_locator: Optional[PartitionLocator],
+         manifest_entry_copied_by_reference_ratio: Optional[float] = None,
      ) -> RoundCompletionInfo:
 
          rci = RoundCompletionInfo()
@@ -46,6 +47,9 @@ class RoundCompletionInfo(dict):
          rci["compactedPyarrowWriteResult"] = compacted_pyarrow_write_result
          rci["sortKeysBitWidth"] = sort_keys_bit_width
          rci["rebaseSourcePartitionLocator"] = rebase_source_partition_locator
+         rci[
+             "manifestEntryCopiedByReferenceRatio"
+         ] = manifest_entry_copied_by_reference_ratio
          return rci
 
      @property
@@ -80,3 +84,7 @@ class RoundCompletionInfo(dict):
      @property
      def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
          return self.get("rebaseSourcePartitionLocator")
+
+     @property
+     def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
+         return self["manifestEntryCopiedByReferenceRatio"]
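
The ratio rides along as an optional trailing argument and is stored under manifestEntryCopiedByReferenceRatio. A minimal sketch under the same assumptions as above (None placeholders, values not dereferenced); note that the new property indexes the dict directly, so round completion files written before this key existed would raise KeyError rather than return None:

from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo

rci = RoundCompletionInfo.of(
    1000,  # high watermark (last compacted stream position)
    None,  # compacted delta locator placeholder
    None,  # compacted pyarrow write result placeholder
    8,     # sort keys bit width
    None,  # rebase source partition locator
    62.5,  # manifest_entry_copied_by_reference_ratio
)
assert rci.manifest_entry_copied_by_reference_ratio == 62.5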
deltacat/compute/compactor/repartition_session.py ADDED
@@ -0,0 +1,174 @@
+ import ray
+ import time
+ import logging
+ from deltacat import logs
+ from deltacat.utils.common import ReadKwargsProvider
+ import functools
+ import itertools
+ from deltacat.compute.compactor import (
+     RoundCompletionInfo,
+     SortKey,
+ )
+ from deltacat.types.media import ContentType
+ from deltacat.compute.compactor import DeltaAnnotated
+ from deltacat.utils.ray_utils.concurrency import (
+     invoke_parallel,
+     round_robin_options_provider,
+ )
+
+ from deltacat.compute.compactor.model.repartition_result import RepartitionResult
+ from deltacat.utils.placement import PlacementGroupConfig
+ from typing import List, Optional, Dict, Any
+ from deltacat.utils.ray_utils.runtime import live_node_resource_keys
+ from deltacat.compute.compactor.utils import io
+ from deltacat.compute.compactor.utils import round_completion_file as rcf
+ from deltacat.compute.compactor.steps import repartition as repar
+ from deltacat.compute.compactor.steps.repartition import RepartitionType
+ from deltacat.storage import (
+     Delta,
+     DeltaLocator,
+     PartitionLocator,
+     interface as unimplemented_deltacat_storage,
+ )
+ from deltacat.utils.metrics import MetricsConfig
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ # TODO: move this repartition function to a separate module under compute
+ def repartition(
+     source_partition_locator: PartitionLocator,
+     destination_partition_locator: PartitionLocator,
+     repartition_args: Any,
+     repartition_completion_file_s3_url: str,
+     last_stream_position_to_compact: int,
+     repartition_type: RepartitionType = RepartitionType.RANGE,
+     sort_keys: List[SortKey] = None,
+     records_per_repartitioned_file: int = 4_000_000,
+     min_file_count: int = 1000,
+     min_delta_bytes: int = 200 * 2**20,
+     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
+     enable_profiler: bool = False,
+     metrics_config: Optional[MetricsConfig] = None,
+     pg_config: Optional[PlacementGroupConfig] = None,
+     list_deltas_kwargs: Optional[Dict[str, Any]] = None,
+     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+     **kwargs,
+ ) -> Optional[str]:
+
+     node_resource_keys = None
+     if pg_config: # use resource in each placement group
+         cluster_resources = pg_config.resource
+         cluster_cpus = cluster_resources["CPU"]
+     else: # use all cluster resource
+         cluster_resources = ray.cluster_resources()
+         logger.info(f"Total cluster resources: {cluster_resources}")
+         logger.info(f"Available cluster resources: {ray.available_resources()}")
+         cluster_cpus = int(cluster_resources["CPU"])
+         logger.info(f"Total cluster CPUs: {cluster_cpus}")
+         node_resource_keys = live_node_resource_keys()
+         logger.info(
+             f"Found {len(node_resource_keys)} live cluster nodes: "
+             f"{node_resource_keys}"
+         )
+
+     # create a remote options provider to round-robin tasks across all nodes or allocated bundles
+     logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
+     round_robin_opt_provider = functools.partial(
+         round_robin_options_provider,
+         resource_keys=node_resource_keys,
+         pg_config=pg_config.opts if pg_config else None,
+     )
+
+     deltas = io._discover_deltas(
+         source_partition_locator,
+         None,
+         deltacat_storage.get_partition(
+             source_partition_locator.stream_locator,
+             source_partition_locator.partition_values,
+         ).stream_position,
+         deltacat_storage,
+         **list_deltas_kwargs,
+     )
+
+     uniform_deltas = []
+     for delta in deltas:
+         uniform_deltas_part = DeltaAnnotated.rebatch(
+             [DeltaAnnotated.of(delta)],
+             min_delta_bytes=min_delta_bytes,
+             min_file_counts=min_file_count,
+         )
+         uniform_deltas.extend(uniform_deltas_part)
+
+     logger.info(f"Retrieved a total of {len(uniform_deltas)} uniform deltas.")
+
+     max_parallelism = cluster_cpus
+     # create a new stream for this round
+     compacted_stream_locator = destination_partition_locator.stream_locator
+     stream = deltacat_storage.get_stream(
+         compacted_stream_locator.namespace,
+         compacted_stream_locator.table_name,
+         compacted_stream_locator.table_version,
+     )
+     partition = deltacat_storage.stage_partition(
+         stream,
+         destination_partition_locator.partition_values,
+     )
+     new_compacted_partition_locator = partition.locator
+     repar_start = time.time()
+     repar_tasks_pending = invoke_parallel(
+         items=uniform_deltas,
+         ray_task=repar.repartition,
+         max_parallelism=max_parallelism,
+         options_provider=round_robin_opt_provider,
+         repartition_type=repartition_type,
+         repartition_args=repartition_args,
+         max_records_per_output_file=records_per_repartitioned_file,
+         destination_partition=partition,
+         enable_profiler=enable_profiler,
+         metrics_config=metrics_config,
+         read_kwargs_provider=read_kwargs_provider,
+         repartitioned_file_content_type=repartitioned_file_content_type,
+         deltacat_storage=deltacat_storage,
+     )
+     logger.info(f"Getting {len(repar_tasks_pending)} task results...")
+     repar_results: List[RepartitionResult] = ray.get(repar_tasks_pending)
+     repar_results: List[Delta] = [rp.range_deltas for rp in repar_results]
+     transposed = list(itertools.zip_longest(*repar_results, fillvalue=None))
+     ordered_deltas: List[Delta] = [
+         i for sublist in transposed for i in sublist if i is not None
+     ]
+     repar_end = time.time()
+     logger.info(f"repartition {repar_end - repar_start} seconds")
+     logger.info(f"Got {len(ordered_deltas)} task results.")
+     # ordered_deltas are ordered as [cold1, cold2, coldN, hot1, hot2, hotN]
+     merged_delta = Delta.merge_deltas(ordered_deltas)
+     compacted_delta = deltacat_storage.commit_delta(
+         merged_delta, properties=kwargs.get("properties", {})
+     )
+     deltacat_storage.commit_partition(partition)
+     logger.info(f"Committed final delta: {compacted_delta}")
+     logger.info(f"Job run completed successfully!")
+     new_compacted_delta_locator = DeltaLocator.of(
+         new_compacted_partition_locator,
+         compacted_delta.stream_position,
+     )
+     bit_width_of_sort_keys = SortKey.validate_sort_keys(
+         source_partition_locator,
+         sort_keys,
+         deltacat_storage,
+     )
+     repartition_completion_info = RoundCompletionInfo.of(
+         last_stream_position_to_compact,
+         new_compacted_delta_locator,
+         None,
+         bit_width_of_sort_keys,
+         None,
+     )
+     return rcf.write_round_completion_file(
+         None,
+         None,
+         repartition_completion_info,
+         repartition_completion_file_s3_url,
+     )
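
The non-obvious step above is the zip_longest transpose: each repartition task returns its deltas in range order (for example [cold_i, hot_i]), and transposing before flattening groups all cold deltas ahead of all hot deltas, as the inline comment notes. A minimal sketch with strings standing in for Delta objects:

import itertools

# Each task's range_deltas, in range order; a task may not produce every range.
repar_results = [
    ["cold1", "hot1"],
    ["cold2", "hot2"],
    ["cold3"],
]

transposed = list(itertools.zip_longest(*repar_results, fillvalue=None))
ordered_deltas = [i for sublist in transposed for i in sublist if i is not None]
print(ordered_deltas)  # ['cold1', 'cold2', 'cold3', 'hot1', 'hot2']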
deltacat/compute/compactor/steps/materialize.py CHANGED
@@ -1,10 +1,11 @@
  import importlib
  import logging
  import time
+ from uuid import uuid4
  from collections import defaultdict
  from contextlib import nullcontext
  from itertools import chain, repeat
- from typing import List, Optional, Tuple, Dict, Any
+ from typing import List, Optional, Tuple, Dict, Any, Union
  import pyarrow as pa
  import ray
  from ray import cloudpickle
@@ -18,7 +19,18 @@ from deltacat.compute.compactor.steps.dedupe import (
      DedupeTaskIndexWithObjectId,
      DeltaFileLocatorToRecords,
  )
- from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
+ from deltacat.storage import (
+     Delta,
+     DeltaLocator,
+     DeltaType,
+     Partition,
+     PartitionLocator,
+     Manifest,
+     ManifestEntry,
+     LocalDataset,
+     LocalTable,
+     DistributedDataset,
+ )
  from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
@@ -56,12 +68,44 @@ def materialize(
      read_kwargs_provider: Optional[ReadKwargsProvider] = None,
      s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
      deltacat_storage=unimplemented_deltacat_storage,
- ) -> MaterializeResult:
+ ):
+     def _stage_delta_implementation(
+         data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
+         partition: Partition,
+         stage_delta_from_existing_manifest: Optional[bool],
+     ) -> Delta:
+         if stage_delta_from_existing_manifest:
+             delta = Delta.of(
+                 locator=DeltaLocator.of(partition.locator),
+                 delta_type=DeltaType.UPSERT,
+                 meta=manifest.meta,
+                 manifest=data,
+                 previous_stream_position=partition.stream_position,
+                 properties={},
+             )
+         return delta
+
+     def _stage_delta_from_manifest_entry_reference_list(
+         manifest_entry_list_reference: List[ManifestEntry],
+         partition: Partition,
+         delta_type: DeltaType = DeltaType.UPSERT,
+     ) -> Delta:
+         assert (
+             delta_type == DeltaType.UPSERT
+         ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
+         manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
+         delta = _stage_delta_implementation(
+             data=manifest,
+             partition=partition,
+             delta_type=delta_type,
+             stage_delta_from_existing_manifest=True,
+         )
+         return delta
+
      # TODO (rkenmi): Add docstrings for the steps in the compaction workflow
      # https://github.com/ray-project/deltacat/issues/79
      def _materialize(compacted_tables: List[pa.Table]) -> MaterializeResult:
          compacted_table = pa.concat_tables(compacted_tables)
-
          if compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
              # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
              # TODO (pdames): compare performance to pandas-native materialize path
@@ -92,11 +136,11 @@ def materialize(
              f"({len(compacted_table)})",
          )
          materialize_result = MaterializeResult.of(
-             delta,
-             mat_bucket_index,
+             delta=delta,
+             task_index=mat_bucket_index,
              # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
              # and in-memory-table-bytes instead of tight coupling to paBytes
-             PyArrowWriteResult.of(
+             pyarrow_write_result=PyArrowWriteResult.of(
                  len(manifest.entries),
                  TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
                  manifest.meta.content_length,
@@ -138,6 +182,9 @@ def materialize(
      manifest_cache = {}
      materialized_results: List[MaterializeResult] = []
      record_batch_tables = RecordBatchTables(max_records_per_output_file)
+     count_of_src_dfl = 0
+     manifest_entry_list_reference = []
+     referenced_pyarrow_write_results = []
      for src_dfl in sorted(all_src_file_records.keys()):
          record_numbers_dd_task_idx_tpl_list: List[
              Tuple[DeltaFileLocatorToRecords, repeat]
@@ -148,11 +195,13 @@ def materialize(
          is_src_partition_file_np = src_dfl.is_source_delta
          src_stream_position_np = src_dfl.stream_position
          src_file_idx_np = src_dfl.file_index
+         count_of_src_dfl += 1
          src_file_partition_locator = (
              source_partition_locator
              if is_src_partition_file_np
              else round_completion_info.compacted_delta_locator.partition_locator
          )
+
          delta_locator = DeltaLocator.of(
              src_file_partition_locator,
              src_stream_position_np.item(),
@@ -185,39 +234,79 @@ def materialize(
              f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
              f" is: {download_delta_manifest_entry_time}s"
          )
-         mask_pylist = list(repeat(False, len(pa_table)))
          record_numbers = chain.from_iterable(record_numbers_tpl)
-         # TODO(raghumdani): reference the same file URIs while writing the files
-         # instead of copying the data over and creating new files.
+         record_numbers_length = 0
+         mask_pylist = list(repeat(False, len(pa_table)))
          for record_number in record_numbers:
+             record_numbers_length += 1
              mask_pylist[record_number] = True
-         mask = pa.array(mask_pylist)
-         pa_table = pa_table.filter(mask)
-         record_batch_tables.append(pa_table)
-         if record_batch_tables.has_batches():
-             batched_tables = record_batch_tables.evict()
-             materialized_results.append(_materialize(batched_tables))
+         if (
+             record_numbers_length == len(pa_table)
+             and src_file_partition_locator
+             == round_completion_info.compacted_delta_locator.partition_locator
+         ):
+             logger.debug(
+                 f"Untouched manifest file found, "
+                 f"record numbers length: {record_numbers_length} "
+                 f"same as downloaded table length: {len(pa_table)}"
+             )
+             untouched_src_manifest_entry = manifest.entries[src_file_idx_np.item()]
+             manifest_entry_list_reference.append(untouched_src_manifest_entry)
+             referenced_pyarrow_write_result = PyArrowWriteResult.of(
+                 len(untouched_src_manifest_entry.entries),
+                 TABLE_CLASS_TO_SIZE_FUNC[type(pa_table)](pa_table),
+                 manifest.meta.content_length,
+                 len(pa_table),
+             )
+             referenced_pyarrow_write_results.append(referenced_pyarrow_write_result)
+         else:
+             mask = pa.array(mask_pylist)
+             pa_table = pa_table.filter(mask)
+             record_batch_tables.append(pa_table)
+             if record_batch_tables.has_batches():
+                 batched_tables = record_batch_tables.evict()
+                 materialized_results.append(_materialize(batched_tables))
 
      if record_batch_tables.has_remaining():
          materialized_results.append(_materialize(record_batch_tables.remaining))
 
-     merged_delta = Delta.merge_deltas([mr.delta for mr in materialized_results])
-     assert (
-         materialized_results and len(materialized_results) > 0
-     ), f"Expected at least one materialized result in materialize step."
+     logger.info(f"Got {count_of_src_dfl} source delta files during materialize")
+
+     referenced_manifest_delta = (
+         _stage_delta_from_manifest_entry_reference_list(
+             manifest_entry_list_reference
+         )
+         if manifest_entry_list_reference
+         else None
+     )
+     if referenced_manifest_delta:
+         logger.info(
+             f"Got delta with {len(referenced_manifest_delta.manifest.entries)} referenced manifest entries"
+         )
+
+     merged_materialized_delta = [mr.delta for mr in materialized_results]
+     merged_materialized_delta.append(referenced_manifest_delta)
+     merged_delta = Delta.merge_deltas(
+         [d for d in merged_materialized_delta if d is not None]
+     )
+
+     write_results_union = referenced_pyarrow_write_results
+     if materialized_results:
+         for mr in materialized_results:
+             write_results_union.append(mr.pyarrow_write_result)
+     write_result = PyArrowWriteResult.union(write_results_union)
 
-     write_results = [mr.pyarrow_write_result for mr in materialized_results]
      logger.debug(
-         f"{len(write_results)} files written"
-         f" with records: {[wr.records for wr in write_results]}"
+         f"{len(write_results_union)} files written"
+         f" with records: {[wr.records for wr in write_results_union]}"
      )
      # Merge all new deltas into one for this materialize bucket index
      merged_materialize_result = MaterializeResult.of(
          merged_delta,
-         materialized_results[0].task_index,
-         PyArrowWriteResult.union(
-             [mr.pyarrow_write_result for mr in materialized_results]
-         ),
+         mat_bucket_index,
+         write_result,
+         len(manifest_entry_list_reference),
+         count_of_src_dfl,
      )
 
      logger.info(f"Finished materialize task...")