deltacat 0.2.9__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/redshift/__init__.py +4 -0
- deltacat/aws/redshift/model/manifest.py +93 -1
- deltacat/aws/s3u.py +250 -111
- deltacat/catalog/default_catalog_impl/__init__.py +369 -0
- deltacat/compute/compactor_v2/compaction_session.py +175 -152
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
- deltacat/compute/compactor_v2/model/merge_input.py +8 -24
- deltacat/compute/compactor_v2/model/merge_result.py +1 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
- deltacat/compute/compactor_v2/steps/merge.py +106 -171
- deltacat/compute/compactor_v2/utils/delta.py +97 -0
- deltacat/compute/compactor_v2/utils/merge.py +126 -0
- deltacat/compute/compactor_v2/utils/task_options.py +47 -4
- deltacat/compute/merge_on_read/__init__.py +4 -0
- deltacat/compute/merge_on_read/daft.py +40 -0
- deltacat/compute/merge_on_read/model/__init__.py +0 -0
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
- deltacat/compute/merge_on_read/utils/__init__.py +0 -0
- deltacat/compute/merge_on_read/utils/delta.py +42 -0
- deltacat/storage/interface.py +10 -2
- deltacat/storage/model/types.py +3 -11
- deltacat/tests/catalog/__init__.py +0 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
- deltacat/tests/compute/compact_partition_test_cases.py +126 -1
- deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
- deltacat/tests/local_deltacat_storage/__init__.py +19 -2
- deltacat/tests/test_utils/pyarrow.py +33 -14
- deltacat/tests/utils/test_daft.py +42 -2
- deltacat/types/media.py +5 -0
- deltacat/types/tables.py +7 -1
- deltacat/utils/daft.py +78 -13
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/METADATA +2 -2
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/RECORD +39 -27
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/LICENSE +0 -0
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/WHEEL +0 -0
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/top_level.txt +0 -0
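The one-line change to deltacat/__init__.py above is most likely the version bump itself. A quick, hypothetical way to confirm which side of this diff an environment is running, assuming (as is typical, and as that one-line change suggests) that the package exposes a __version__ attribute from its top-level __init__.py:

# Hypothetical check, not part of the diff: confirm the installed deltacat version.
# Assumes deltacat exposes __version__ from deltacat/__init__.py.
import deltacat

print(deltacat.__version__)  # expect "0.2.9" before the upgrade, "1.0.0" after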
deltacat/compute/compactor_v2/steps/hash_bucket.py
@@ -5,7 +5,6 @@ from contextlib import nullcontext
 from typing import List, Optional, Tuple
 from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
 import numpy as np
-import pyarrow as pa
 import ray
 from deltacat import logs
 from deltacat.compute.compactor import (
@@ -14,12 +13,12 @@ from deltacat.compute.compactor import (
 )
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
 from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
+from deltacat.compute.compactor_v2.utils.delta import read_delta_file_envelopes
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     group_hash_bucket_indices,
     group_by_pk_hash_bucket,
 )
 from deltacat.storage import interface as unimplemented_deltacat_storage
-from deltacat.types.media import StorageType
 from deltacat.utils.ray_utils.runtime import (
     get_current_ray_task_id,
     get_current_ray_worker_id,
@@ -39,57 +38,6 @@ if importlib.util.find_spec("memray"):
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


-def _read_delta_file_envelopes(
-    annotated_delta: DeltaAnnotated,
-    read_kwargs_provider: Optional[ReadKwargsProvider],
-    deltacat_storage=unimplemented_deltacat_storage,
-    deltacat_storage_kwargs: Optional[dict] = None,
-) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
-
-    tables = deltacat_storage.download_delta(
-        annotated_delta,
-        max_parallelism=1,
-        file_reader_kwargs_provider=read_kwargs_provider,
-        storage_type=StorageType.LOCAL,
-        **deltacat_storage_kwargs,
-    )
-    annotations = annotated_delta.annotations
-    assert (
-        len(tables) == len(annotations),
-        f"Unexpected Error: Length of downloaded delta manifest tables "
-        f"({len(tables)}) doesn't match the length of delta manifest "
-        f"annotations ({len(annotations)}).",
-    )
-    if not tables:
-        return None, 0, 0
-
-    delta_stream_position = annotations[0].annotation_stream_position
-    delta_type = annotations[0].annotation_delta_type
-
-    for annotation in annotations:
-        assert annotation.annotation_stream_position == delta_stream_position, (
-            f"Annotation stream position does not match - {annotation.annotation_stream_position} "
-            f"!= {delta_stream_position}"
-        )
-        assert annotation.annotation_delta_type == delta_type, (
-            f"Annotation delta type does not match - {annotation.annotation_delta_type} "
-            f"!= {delta_type}"
-        )
-
-    delta_file_envelopes = []
-    table = pa.concat_tables(tables)
-    total_record_count = len(table)
-    total_size_bytes = int(table.nbytes)
-
-    delta_file = DeltaFileEnvelope.of(
-        stream_position=delta_stream_position,
-        delta_type=delta_type,
-        table=table,
-    )
-    delta_file_envelopes.append(delta_file)
-    return delta_file_envelopes, total_record_count, total_size_bytes
-
-
 def _group_file_records_by_pk_hash_bucket(
     annotated_delta: DeltaAnnotated,
     num_hash_buckets: int,
@@ -103,7 +51,7 @@ def _group_file_records_by_pk_hash_bucket(
         delta_file_envelopes,
         total_record_count,
         total_size_bytes,
-    ) = _read_delta_file_envelopes(
+    ) = read_delta_file_envelopes(
         annotated_delta,
         read_kwargs_provider,
         deltacat_storage,
@@ -187,7 +135,7 @@ def _timed_hash_bucket(input: HashBucketInput):
 @ray.remote
 def hash_bucket(input: HashBucketInput) -> HashBucketResult:
     with ProcessUtilizationOverTimeRange() as process_util:
-        logger.info(f"Starting hash bucket task...")
+        logger.info(f"Starting hash bucket task {input.hb_task_index}...")

         # Log node peak memory utilization every 10 seconds
         def log_peak_memory():
@@ -212,7 +160,7 @@ def hash_bucket(input: HashBucketInput) -> HashBucketResult:
             )
             emit_metrics_time = latency

-        logger.info(f"Finished hash bucket task...")
+        logger.info(f"Finished hash bucket task {input.hb_task_index}...")
         return HashBucketResult(
             hash_bucket_result[0],
             hash_bucket_result[1],
deltacat/compute/compactor_v2/steps/merge.py
@@ -6,28 +6,21 @@ import pyarrow as pa
 import ray
 import time
 import pyarrow.compute as pc
+import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
-from collections import defaultdict
 from deltacat import logs
-from typing import List, Optional
-from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
+from typing import List, Optional, Tuple
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
-from deltacat.compute.compactor import (
-    RoundCompletionInfo,
-    DeltaFileEnvelope,
-)
+from deltacat.compute.compactor import RoundCompletionInfo, DeltaFileEnvelope
 from deltacat.utils.common import ReadKwargsProvider
-
 from contextlib import nullcontext
-from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
 from deltacat.utils.ray_utils.runtime import (
     get_current_ray_task_id,
     get_current_ray_worker_id,
 )
 from deltacat.compute.compactor.utils import system_columns as sc
-
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics
 from deltacat.utils.resources import (
@@ -36,7 +29,6 @@ from deltacat.utils.resources import (
 )
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
-    hash_group_index_to_hash_bucket_indices,
 )
 from deltacat.storage import (
     Delta,
@@ -77,14 +69,9 @@ def _drop_delta_type_rows(table: pa.Table, delta_type: DeltaType) -> pa.Table:


 def _build_incremental_table(
-    hash_bucket_index: int,
     df_envelopes_list: List[List[DeltaFileEnvelope]],
 ) -> pa.Table:

-    logger.info(
-        f"[Hash bucket index {hash_bucket_index}] Reading dedupe input for "
-        f"{len(df_envelopes_list)} delta file envelope lists..."
-    )
     hb_tables = []
     # sort by delta file stream position now instead of sorting every row later
     df_envelopes = [d for dfe_list in df_envelopes_list for d in dfe_list]
@@ -270,174 +257,120 @@ def _copy_all_manifest_files_from_old_hash_buckets(
     return materialize_result_list


[... 27 removed lines (old 273-299) are not fully preserved in this rendering ...]
+def _compact_tables(
+    input: MergeInput, dfe_list: List[List[DeltaFileEnvelope]], hb_idx: int
+) -> Tuple[pa.Table, int, int]:
+    logger.info(
+        f"[Hash bucket index {hb_idx}] Reading dedupe input for "
+        f"{len(dfe_list)} delta file envelope lists..."
+    )
+    table = _build_incremental_table(dfe_list)
+
+    incremental_len = len(table)
+    logger.info(
+        f"[Hash bucket index {hb_idx}] Got the incremental table of length {incremental_len}"
+    )
+
+    if input.sort_keys:
+        # Incremental is sorted and merged, as sorting
+        # on non event based sort key does not produce consistent
+        # compaction results. E.g., compaction(delta1, delta2, delta3)
+        # will not be equal to compaction(compaction(delta1, delta2), delta3).
+        table = table.sort_by(input.sort_keys)
+
+    compacted_table = None
+
+    if (
+        input.round_completion_info
+        and input.round_completion_info.hb_index_to_entry_range
+        and input.round_completion_info.hb_index_to_entry_range.get(str(hb_idx))
+        is not None
+    ):
+        compacted_table = _download_compacted_table(
+            hb_index=hb_idx,
+            rcf=input.round_completion_info,
+            read_kwargs_provider=input.read_kwargs_provider,
+            deltacat_storage=input.deltacat_storage,
+            deltacat_storage_kwargs=input.deltacat_storage_kwargs,
         )
[... 6 removed lines (old 301-306) are not preserved in this rendering ...]
+
+    hb_table_record_count = len(table) + (
+        len(compacted_table) if compacted_table else 0
+    )
+
+    table, merge_time = timed_invocation(
+        func=_merge_tables,
+        table=table,
+        primary_keys=input.primary_keys,
+        can_drop_duplicates=input.drop_duplicates,
+        compacted_table=compacted_table,
+    )
+    total_deduped_records = hb_table_record_count - len(table)
+
+    logger.info(
+        f"[Merge task index {input.merge_task_index}] Merged "
+        f"record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
+    )
+
+    return table, incremental_len, total_deduped_records
+
+
+def _copy_manifests_from_hash_bucketing(
+    input: MergeInput, hb_index_copy_by_reference_ids: List[int]
+) -> List[MaterializeResult]:
+    materialized_results: List[MaterializeResult] = []
+
+    if input.round_completion_info:
+        referenced_materialized_results = (
+            _copy_all_manifest_files_from_old_hash_buckets(
+                hb_index_copy_by_reference_ids,
+                input.round_completion_info,
+                input.write_to_partition,
+                input.deltacat_storage,
+                input.deltacat_storage_kwargs,
+            )
         )
-
-
-            task_index=hash_bucket_index,
-            # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
-            # and in-memory-table-bytes instead of tight coupling to paBytes
-            pyarrow_write_result=PyArrowWriteResult.of(
-                len(manifest.entries),
-                TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
-                manifest.meta.content_length,
-                len(compacted_table),
-            ),
+        logger.info(
+            f"Copying {len(referenced_materialized_results)} manifest files by reference..."
         )
-
-        return materialize_result
+        materialized_results.extend(referenced_materialized_results)

+    return materialized_results
+
+
+def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
         f"merge_{worker_id}_{task_id}.bin"
     ) if input.enable_profiler else nullcontext():
-
-
-
-
-            f"[Merge task {input.merge_task_index}] Getting delta file envelope "
-            f"groups for {len(input.dfe_groups_refs)} object refs..."
-        )
-
-        delta_file_envelope_groups_list = input.object_store.get_many(
-            input.dfe_groups_refs
-        )
-        hb_index_to_delta_file_envelopes_list = defaultdict(list)
-        for delta_file_envelope_groups in delta_file_envelope_groups_list:
-            assert input.hash_bucket_count == len(delta_file_envelope_groups), (
-                f"The hash bucket count must match the dfe size as {input.hash_bucket_count}"
-                f" != {len(delta_file_envelope_groups)}"
-            )
-
-            for hb_idx, dfes in enumerate(delta_file_envelope_groups):
-                if dfes:
-                    hb_index_to_delta_file_envelopes_list[hb_idx].append(dfes)
-
-        valid_hb_indices_iterable = hash_group_index_to_hash_bucket_indices(
-            input.hash_group_index, input.hash_bucket_count, input.num_hash_groups
-        )
+        total_input_records, total_deduped_records = 0, 0
+        materialized_results: List[MaterializeResult] = []
+        merge_file_groups = input.merge_file_groups_provider.create()
+        hb_index_copy_by_ref_ids = []

-
-
+        for merge_file_group in merge_file_groups:
+            if not merge_file_group.dfe_groups:
+                hb_index_copy_by_ref_ids.append(merge_file_group.hb_index)
+                continue

-
-
-        for hb_idx in valid_hb_indices_iterable:
-            dfe_list = hb_index_to_delta_file_envelopes_list.get(hb_idx)
-
-            if dfe_list:
-                total_dfes_found += 1
-                table = _build_incremental_table(hb_idx, dfe_list)
-
-                incremental_len = len(table)
-                logger.info(
-                    f"Got the incremental table of length {incremental_len} for hash bucket {hb_idx}"
-                )
-
-                if input.sort_keys:
-                    # Incremental is sorted and merged, as sorting
-                    # on non event based sort key does not produce consistent
-                    # compaction results. E.g., compaction(delta1, delta2, delta3)
-                    # will not be equal to compaction(compaction(delta1, delta2), delta3).
-                    table = table.sort_by(input.sort_keys)
-
-                compacted_table = None
-                if (
-                    input.round_completion_info
-                    and input.round_completion_info.hb_index_to_entry_range
-                    and input.round_completion_info.hb_index_to_entry_range.get(
-                        str(hb_idx)
-                    )
-                    is not None
-                ):
-
-                    compacted_table = _download_compacted_table(
-                        hb_index=hb_idx,
-                        rcf=input.round_completion_info,
-                        read_kwargs_provider=input.read_kwargs_provider,
-                        deltacat_storage=input.deltacat_storage,
-                        deltacat_storage_kwargs=input.deltacat_storage_kwargs,
-                    )
-
-                hb_table_record_count = len(table) + (
-                    len(compacted_table) if compacted_table else 0
-                )
-
-                table, merge_time = timed_invocation(
-                    func=_merge_tables,
-                    table=table,
-                    primary_keys=input.primary_keys,
-                    can_drop_duplicates=input.drop_duplicates,
-                    compacted_table=compacted_table,
-                )
-                total_deduped_records += hb_table_record_count - len(table)
-
-                logger.info(
-                    f"[Merge task index {input.merge_task_index}] Merged "
-                    f"record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
-                )
-
-                materialized_results.append(_materialize(hb_idx, [table]))
-            else:
-                hb_index_copy_by_reference.append(hb_idx)
-
-        if input.round_completion_info and hb_index_copy_by_reference:
-            referenced_materialized_results = (
-                _copy_all_manifest_files_from_old_hash_buckets(
-                    hb_index_copy_by_reference,
-                    input.round_completion_info,
-                    input.write_to_partition,
-                    input.deltacat_storage,
-                    input.deltacat_storage_kwargs,
-                )
+            table, input_records, deduped_records = _compact_tables(
+                input, merge_file_group.dfe_groups, merge_file_group.hb_index
             )
-
-
+            total_input_records += input_records
+            total_deduped_records += deduped_records
+            materialized_results.append(
+                merge_utils.materialize(input, merge_file_group.hb_index, [table])
             )
-            materialized_results.extend(referenced_materialized_results)

-
-
-
-
+        if hb_index_copy_by_ref_ids:
+            materialized_results.extend(
+                _copy_manifests_from_hash_bucketing(input, hb_index_copy_by_ref_ids)
+            )

-
[... 2 removed lines (old 439-440) are not fully preserved in this rendering ...]
+        logger.info(
+            f"[Hash group index: {input.merge_file_groups_provider.hash_group_index}]"
+            f" Total number of materialized results produced: {len(materialized_results)} "
         )

     peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
@@ -447,6 +380,7 @@ def _timed_merge(input: MergeInput) -> MergeResult:

     return MergeResult(
         materialized_results,
+        np.int64(total_input_records),
         np.int64(total_deduped_records),
         np.double(peak_memory_usage_bytes),
         np.double(0.0),
@@ -457,7 +391,7 @@ def _timed_merge(input: MergeInput) -> MergeResult:
 @ray.remote
 def merge(input: MergeInput) -> MergeResult:
     with ProcessUtilizationOverTimeRange() as process_util:
-        logger.info(f"Starting merge task...")
+        logger.info(f"Starting merge task {input.merge_task_index}...")

         # Log node peak memory utilization every 10 seconds
         def log_peak_memory():
@@ -480,11 +414,12 @@ def merge(input: MergeInput) -> MergeResult:
             )
             emit_metrics_time = latency

-        logger.info(f"Finished merge task...")
+        logger.info(f"Finished merge task {input.merge_task_index}...")
         return MergeResult(
             merge_result[0],
             merge_result[1],
             merge_result[2],
+            merge_result[3],
             np.double(emit_metrics_time),
             merge_result[4],
         )
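The record accounting in the new _compact_tables above is: rows going into the merge (the incremental table plus any previously compacted table) minus rows coming out equals the number of deduplicated records. A minimal, self-contained sketch of that idea in plain pyarrow; this is an illustration only, not DeltaCat's _merge_tables, and the merge_on_primary_key helper below is invented for the example:

import pyarrow as pa


def merge_on_primary_key(compacted: pa.Table, incremental: pa.Table, pk: str) -> pa.Table:
    # Append incremental rows after the previously compacted rows, then keep
    # only the last occurrence of each primary key so newer rows win.
    combined = pa.concat_tables([compacted, incremental])
    last_index_per_key = {key: i for i, key in enumerate(combined.column(pk).to_pylist())}
    return combined.take(sorted(last_index_per_key.values()))


compacted = pa.table({"pk": [1, 2, 3], "val": ["a", "b", "c"]})
incremental = pa.table({"pk": [2, 3, 4], "val": ["b2", "c2", "d"]})

merged = merge_on_primary_key(compacted, incremental, "pk")
records_in = len(compacted) + len(incremental)  # 6, the hb_table_record_count analogue
deduped = records_in - len(merged)              # 2 rows superseded by newer versions
print(merged.to_pydict(), deduped)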
deltacat/compute/compactor_v2/utils/delta.py (new file)
@@ -0,0 +1,97 @@
+import time
+from typing import List, Optional, Tuple
+
+from deltacat.compute.compactor import (
+    DeltaAnnotated,
+    DeltaFileEnvelope,
+)
+
+from deltacat.storage import interface as unimplemented_deltacat_storage
+from deltacat.types.media import StorageType
+from deltacat.utils.common import ReadKwargsProvider
+from deltacat import logs
+
+import pyarrow as pa
+import logging
+
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def read_delta_file_envelopes(
+    annotated_delta: DeltaAnnotated,
+    read_kwargs_provider: Optional[ReadKwargsProvider],
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[dict] = None,
+) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
+    tables = deltacat_storage.download_delta(
+        annotated_delta,
+        max_parallelism=1,
+        file_reader_kwargs_provider=read_kwargs_provider,
+        storage_type=StorageType.LOCAL,
+        **deltacat_storage_kwargs,
+    )
+    annotations = annotated_delta.annotations
+    assert (
+        len(tables) == len(annotations),
+        f"Unexpected Error: Length of downloaded delta manifest tables "
+        f"({len(tables)}) doesn't match the length of delta manifest "
+        f"annotations ({len(annotations)}).",
+    )
+    if not tables:
+        return None, 0, 0
+
+    delta_stream_position = annotations[0].annotation_stream_position
+    delta_type = annotations[0].annotation_delta_type
+
+    for annotation in annotations:
+        assert annotation.annotation_stream_position == delta_stream_position, (
+            f"Annotation stream position does not match - {annotation.annotation_stream_position} "
+            f"!= {delta_stream_position}"
+        )
+        assert annotation.annotation_delta_type == delta_type, (
+            f"Annotation delta type does not match - {annotation.annotation_delta_type} "
+            f"!= {delta_type}"
+        )
+
+    delta_file_envelopes = []
+    table = pa.concat_tables(tables)
+    total_record_count = len(table)
+    total_size_bytes = int(table.nbytes)
+
+    delta_file = DeltaFileEnvelope.of(
+        stream_position=delta_stream_position,
+        delta_type=delta_type,
+        table=table,
+    )
+    delta_file_envelopes.append(delta_file)
+    return delta_file_envelopes, total_record_count, total_size_bytes
+
+
+def get_local_delta_file_envelopes(
+    uniform_deltas: List[DeltaAnnotated],
+    read_kwargs_provider: Optional[ReadKwargsProvider],
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[dict] = None,
+) -> Tuple[List[DeltaFileEnvelope], int]:
+    local_dfe_list = []
+    input_records_count = 0
+    logger.info(f"Getting {len(uniform_deltas)} DFE Tasks.")
+    dfe_start = time.monotonic()
+    for annotated_delta in uniform_deltas:
+        (
+            delta_file_envelopes,
+            total_record_count,
+            total_size_bytes,
+        ) = read_delta_file_envelopes(
+            annotated_delta,
+            read_kwargs_provider,
+            deltacat_storage,
+            deltacat_storage_kwargs,
+        )
+        if delta_file_envelopes:
+            local_dfe_list.extend(delta_file_envelopes)
+            input_records_count += total_record_count
+    dfe_end = time.monotonic()
+    logger.info(f"Retrieved {len(local_dfe_list)} DFE Tasks in {dfe_end - dfe_start}s.")
+    return local_dfe_list, input_records_count