deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/materialize.py

@@ -1,14 +1,11 @@
  import logging
- import ray
- import pyarrow as pa
-
+ import time
  from collections import defaultdict
  from itertools import chain, repeat
  from typing import List, Optional, Tuple

  import pyarrow as pa
  import ray
- from pyarrow import compute as pc
  from ray import cloudpickle

  from deltacat import logs
@@ -21,14 +18,15 @@ from deltacat.compute.compactor.steps.dedupe import (
      DedupeTaskIndexWithObjectId,
      DeltaFileLocatorToRecords,
  )
- from deltacat.compute.compactor.utils import system_columns as sc
  from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
  from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
  from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
+ from deltacat.utils.performance import timed_invocation
  from deltacat.utils.pyarrow import (
      ReadKwargsProviderPyArrowCsvPureUtf8,
      ReadKwargsProviderPyArrowSchemaOverride,
+     RecordBatchTables,
  )

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
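
The two new imports above drive the rest of this file's diff: timed_invocation wraps the storage calls below with latency logging, and RecordBatchTables replaces the hand-rolled output batching. As a rough sketch of the timing pattern (an illustration only, assuming the helper simply returns the wrapped call's result together with its elapsed seconds, which is the shape the new call sites imply):

    import time
    from typing import Any, Callable, Tuple

    def timed_invocation(func: Callable, *args, **kwargs) -> Tuple[Any, float]:
        # Run the wrapped callable and report wall-clock latency alongside its result.
        start = time.time()
        result = func(*args, **kwargs)
        return result, time.time() - start

    # Mirrors the call sites added in this diff, e.g.:
    #   delta, stage_delta_time = timed_invocation(
    #       deltacat_storage.stage_delta, compacted_table, partition, ...
    #   )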
@@ -36,47 +34,49 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

  @ray.remote
  def materialize(
-         source_partition_locator: PartitionLocator,
-         round_completion_info: Optional[RoundCompletionInfo],
-         partition: Partition,
-         mat_bucket_index: int,
-         dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
-         max_records_per_output_file: int,
-         compacted_file_content_type: ContentType,
-         schema: Optional[pa.Schema] = None,
-         deltacat_storage=unimplemented_deltacat_storage) -> MaterializeResult:
-
-     def _materialize(
-             compacted_tables: List[pa.Table],
-             compacted_tables_record_count: int) -> MaterializeResult:
-         compacted_tables_size = sum([TABLE_CLASS_TO_SIZE_FUNC[type(tbl)](tbl)
-                                      for tbl in compacted_tables])
-         logger.debug(f"Uploading {len(compacted_tables)} compacted tables "
-                      f"with size: {compacted_tables_size} bytes "
-                      f"and record count: {compacted_tables_record_count}")
+     source_partition_locator: PartitionLocator,
+     round_completion_info: Optional[RoundCompletionInfo],
+     partition: Partition,
+     mat_bucket_index: int,
+     dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
+     max_records_per_output_file: int,
+     compacted_file_content_type: ContentType,
+     schema: Optional[pa.Schema] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> MaterializeResult:
+     # TODO (rkenmi): Add docstrings for the steps in the compaction workflow
+     # https://github.com/ray-project/deltacat/issues/79
+     def _materialize(compacted_tables: List[pa.Table]) -> MaterializeResult:
          compacted_table = pa.concat_tables(compacted_tables)
+
          if compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
-             # TODO (ricmiyam): Investigate if we still need to convert this table to pandas DataFrame
+             # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
              # TODO (pdames): compare performance to pandas-native materialize path
-             df = compacted_table.to_pandas(
-                 split_blocks=True,
-                 self_destruct=True,
-                 zero_copy_only=True
-             )
+             df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
              compacted_table = df
-         delta = deltacat_storage.stage_delta(
+         delta, stage_delta_time = timed_invocation(
+             deltacat_storage.stage_delta,
              compacted_table,
              partition,
              max_records_per_entry=max_records_per_output_file,
              content_type=compacted_file_content_type,
          )
+         compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
+             compacted_table
+         )
+         logger.debug(
+             f"Time taken for materialize task"
+             f" to upload {len(compacted_table)} records"
+             f" of size {compacted_table_size} is: {stage_delta_time}s"
+         )
          manifest = delta.manifest
          manifest_records = manifest.meta.record_count
-         assert(manifest_records == len(compacted_table),
-                f"Unexpected Error: Materialized delta manifest record count "
-                f"({manifest_records}) does not equal compacted table record count "
-                f"({len(compacted_table)})")
-
+         assert (
+             manifest_records == len(compacted_table),
+             f"Unexpected Error: Materialized delta manifest record count "
+             f"({manifest_records}) does not equal compacted table record count "
+             f"({len(compacted_table)})",
+         )
          materialize_result = MaterializeResult.of(
              delta,
              mat_bucket_index,
@@ -92,17 +92,20 @@ def materialize(
          logger.info(f"Materialize result: {materialize_result}")
          return materialize_result

-     logger.info(f"Starting materialize task...")
+     logger.info(
+         f"Starting materialize task with"
+         f" materialize bucket index: {mat_bucket_index}..."
+     )
+     start = time.time()
      dedupe_task_idx_and_obj_ref_tuples = [
          (
              t1,
              cloudpickle.loads(t2),
-         ) for t1, t2 in dedupe_task_idx_and_obj_id_tuples
+         )
+         for t1, t2 in dedupe_task_idx_and_obj_id_tuples
      ]
      logger.info(f"Resolved materialize task obj refs...")
-     dedupe_task_indices, obj_refs = zip(
-         *dedupe_task_idx_and_obj_ref_tuples
-     )
+     dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
      # this depends on `ray.get` result order matching input order, as per the
      # contract established in: https://github.com/ray-project/ray/pull/16763
      src_file_records_list = ray.get(list(obj_refs))
@@ -114,21 +117,23 @@ def materialize(
                  (record_numbers, repeat(dedupe_task_idx, len(record_numbers)))
              )
      manifest_cache = {}
-     compacted_tables = []
      materialized_results: List[MaterializeResult] = []
-     total_record_count = 0
+     record_batch_tables = RecordBatchTables(max_records_per_output_file)
      for src_dfl in sorted(all_src_file_records.keys()):
-         record_numbers_dd_task_idx_tpl_list: List[Tuple[DeltaFileLocatorToRecords, repeat]] = \
-             all_src_file_records[src_dfl]
+         record_numbers_dd_task_idx_tpl_list: List[
+             Tuple[DeltaFileLocatorToRecords, repeat]
+         ] = all_src_file_records[src_dfl]
          record_numbers_tpl, dedupe_task_idx_iter_tpl = zip(
              *record_numbers_dd_task_idx_tpl_list
          )
          is_src_partition_file_np = src_dfl.is_source_delta
          src_stream_position_np = src_dfl.stream_position
          src_file_idx_np = src_dfl.file_index
-         src_file_partition_locator = source_partition_locator \
-             if is_src_partition_file_np \
+         src_file_partition_locator = (
+             source_partition_locator
+             if is_src_partition_file_np
              else round_completion_info.compacted_delta_locator.partition_locator
+         )
          delta_locator = DeltaLocator.of(
              src_file_partition_locator,
              src_stream_position_np.item(),
@@ -148,65 +153,54 @@ def materialize(
          # enforce a consistent schema if provided, when reading files into PyArrow tables
          elif schema is not None:
              read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
-                 schema=schema)
-         pa_table = deltacat_storage.download_delta_manifest_entry(
+                 schema=schema
+             )
+         pa_table, download_delta_manifest_entry_time = timed_invocation(
+             deltacat_storage.download_delta_manifest_entry,
              Delta.of(delta_locator, None, None, None, manifest),
              src_file_idx_np.item(),
              file_reader_kwargs_provider=read_kwargs_provider,
          )
-         record_count = len(pa_table)
-         mask_pylist = list(repeat(False, record_count))
+         logger.debug(
+             f"Time taken for materialize task"
+             f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
+             f" is: {download_delta_manifest_entry_time}s"
+         )
+         mask_pylist = list(repeat(False, len(pa_table)))
          record_numbers = chain.from_iterable(record_numbers_tpl)
          # TODO(raghumdani): reference the same file URIs while writing the files
-         # instead of copying the data over and creating new files.
+         #  instead of copying the data over and creating new files.
          for record_number in record_numbers:
              mask_pylist[record_number] = True
          mask = pa.array(mask_pylist)
          pa_table = pa_table.filter(mask)
+         record_batch_tables.append(pa_table)
+         if record_batch_tables.has_batches():
+             batched_tables = record_batch_tables.evict()
+             materialized_results.append(_materialize(batched_tables))

-         # appending, sorting, taking, and dropping has 2-3X latency of a
-         # single filter on average, and thus provides better average
-         # performance than repeatedly filtering the table in dedupe task index
-         # order
-         dedupe_task_indices = chain.from_iterable(dedupe_task_idx_iter_tpl)
-         pa_table = sc.append_dedupe_task_idx_col(
-             pa_table,
-             dedupe_task_indices,
-         )
-         pa_sort_keys = [(sc._DEDUPE_TASK_IDX_COLUMN_NAME, "ascending")]
-         pa_table = pa_table.take(
-             pc.sort_indices(pa_table, sort_keys=pa_sort_keys),
-         )
-         pa_table = pa_table.drop(
-             [sc._DEDUPE_TASK_IDX_COLUMN_NAME]
-         )
-
-         # Write manifests up to max_records_per_output_file
-         # TODO(raghumdani): Write exactly the same number of records into each file to
-         # produce a read-optimized view of the tables.
-         if compacted_tables and \
-                 total_record_count + record_count > max_records_per_output_file:
-             materialized_results.append(_materialize(compacted_tables, total_record_count))
-             # Free up written tables in memory
-             compacted_tables.clear()
-             total_record_count = 0
-
-         total_record_count += record_count
-         compacted_tables.append(pa_table)
-
-     materialized_results.append(_materialize(compacted_tables, total_record_count))
-     # Free up written tables in memory
-     compacted_tables.clear()
+     if record_batch_tables.has_remaining():
+         materialized_results.append(_materialize(record_batch_tables.remaining))

      merged_delta = Delta.merge_deltas([mr.delta for mr in materialized_results])
-     assert materialized_results and len(materialized_results) > 0, \
-         f"Expected at least one materialized result in materialize step."
-
+     assert (
+         materialized_results and len(materialized_results) > 0
+     ), f"Expected at least one materialized result in materialize step."
+
+     write_results = [mr.pyarrow_write_result for mr in materialized_results]
+     logger.debug(
+         f"{len(write_results)} files written"
+         f" with records: {[wr.records for wr in write_results]}"
+     )
      # Merge all new deltas into one for this materialize bucket index
-     merged_materialize_result = MaterializeResult.of(merged_delta,
-                                                      materialized_results[0].task_index,
-                                                      PyArrowWriteResult.union([mr.pyarrow_write_result
-                                                                                for mr in materialized_results]))
-
+     merged_materialize_result = MaterializeResult.of(
+         merged_delta,
+         materialized_results[0].task_index,
+         PyArrowWriteResult.union(
+             [mr.pyarrow_write_result for mr in materialized_results]
+         ),
+     )
      logger.info(f"Finished materialize task...")
+     end = time.time()
+     logger.info(f"Materialize task ended in {end - start}s")
      return merged_materialize_result
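
The batching change above is the behavioral core of this file's diff: filtered source tables are appended to a RecordBatchTables (new in deltacat.utils.pyarrow and covered by the added deltacat/tests/utils/test_record_batch_tables.py), full batches are evicted and handed to _materialize as they accumulate, and any remainder is flushed once at the end. A minimal usage sketch with invented toy tables; the eviction semantics here are assumed from the call sites above rather than from the class itself:

    import pyarrow as pa
    from deltacat.utils.pyarrow import RecordBatchTables

    max_records_per_output_file = 4
    record_batch_tables = RecordBatchTables(max_records_per_output_file)

    for chunk in ([1, 2], [3, 4, 5], [6]):  # stand-ins for filtered source tables
        record_batch_tables.append(pa.table({"pk": chunk}))
        if record_batch_tables.has_batches():
            # Assumed: evict() hands back the tables that currently fill whole
            # batches of max_records_per_output_file, keeping any remainder.
            full_batches = record_batch_tables.evict()
            # ... write full_batches out as one compacted delta ...

    if record_batch_tables.has_remaining():
        leftover = record_batch_tables.remaining
        # ... write the final partial batch ...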

deltacat/compute/compactor/steps/rehash/rehash_bucket.py

@@ -1,22 +1,21 @@
  import logging
- import ray
- import pyarrow as pa
- import numpy as np
+ from typing import List, Tuple

+ import numpy as np
+ import pyarrow as pa
+ import ray
  from ray.types import ObjectRef

  from deltacat import logs
  from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator
  from deltacat.compute.compactor.utils import primary_key_index as pki

- from typing import List, Tuple
-
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


  def group_file_records_by_pk_hash_bucket(
-         pki_table: pa.Table,
-         num_buckets: int) -> np.ndarray:
+     pki_table: pa.Table, num_buckets: int
+ ) -> np.ndarray:
      # generate the new table for each new hash bucket
      hash_bucket_to_indices = pki.group_record_indices_by_hash_bucket(
          pki_table,
@@ -29,13 +28,14 @@ def group_file_records_by_pk_hash_bucket(
      return hash_bucket_to_table


- @ray.remote(num_cpus=1,num_returns=2)
+ @ray.remote(num_cpus=1, num_returns=2)
  def rehash_bucket(
-         hash_bucket_index: int,
-         s3_bucket: str,
-         old_pki_version_locator: PrimaryKeyIndexVersionLocator,
-         num_buckets: int,
-         num_groups: int) -> Tuple[np.ndarray, List[ObjectRef]]:
+     hash_bucket_index: int,
+     s3_bucket: str,
+     old_pki_version_locator: PrimaryKeyIndexVersionLocator,
+     num_buckets: int,
+     num_groups: int,
+ ) -> Tuple[np.ndarray, List[ObjectRef]]:

      logger.info(f"Starting rehash bucket task...")
      tables = pki.download_hash_bucket_entries(

deltacat/compute/compactor/steps/rehash/rewrite_index.py

@@ -16,11 +16,11 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

  @ray.remote(num_cpus=1, num_returns=2)
  def rewrite_index(
-         object_ids: List[Any],
-         s3_bucket: str,
-         new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-         max_records_per_index_file: int) -> \
-         Tuple[PyArrowWriteResult, List[ObjectRef]]:
+     object_ids: List[Any],
+     s3_bucket: str,
+     new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+     max_records_per_index_file: int,
+ ) -> Tuple[PyArrowWriteResult, List[ObjectRef]]:

      logger.info(f"Starting rewrite primary key index task...")
      object_refs = [cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]

deltacat/compute/compactor/utils/io.py

@@ -1,24 +1,23 @@
  import logging
- import time
  import math
- from deltacat.compute.stats.models.delta_stats import DeltaStats
- from deltacat.constants import PYARROW_INFLATION_MULTIPLIER, BYTES_PER_MEBIBYTE
+ from typing import Dict, List, Optional, Tuple

- from deltacat.storage import PartitionLocator, Delta, \
-     interface as unimplemented_deltacat_storage
  from deltacat import logs
  from deltacat.compute.compactor import DeltaAnnotated
-
- from typing import Dict, List, Optional, Tuple
+ from deltacat.compute.stats.models.delta_stats import DeltaStats
+ from deltacat.constants import BYTES_PER_MEBIBYTE, PYARROW_INFLATION_MULTIPLIER
+ from deltacat.storage import Delta, PartitionLocator
+ from deltacat.storage import interface as unimplemented_deltacat_storage

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


  def discover_deltas(
-         source_partition_locator: PartitionLocator,
-         start_position_exclusive: Optional[int],
-         end_position_inclusive: int,
-         deltacat_storage=unimplemented_deltacat_storage) -> List[Delta]:
+     source_partition_locator: PartitionLocator,
+     start_position_exclusive: Optional[int],
+     end_position_inclusive: int,
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> List[Delta]:

      stream_locator = source_partition_locator.stream_locator
      namespace = stream_locator.namespace
@@ -36,32 +35,38 @@ def discover_deltas(
      )
      deltas = deltas_list_result.all_items()
      if not deltas:
-         raise RuntimeError(f"Unexpected Error: Couldn't find any deltas to "
-                            f"compact in delta stream position range "
-                            f"('{start_position_exclusive}', "
-                            f"'{end_position_inclusive}']. Source partition: "
-                            f"{source_partition_locator}")
+         raise RuntimeError(
+             f"Unexpected Error: Couldn't find any deltas to "
+             f"compact in delta stream position range "
+             f"('{start_position_exclusive}', "
+             f"'{end_position_inclusive}']. Source partition: "
+             f"{source_partition_locator}"
+         )
      if start_position_exclusive:
          first_delta = deltas.pop(0)
-         logger.info(f"Removed exclusive start delta w/ expected stream "
-                     f"position '{start_position_exclusive}' from deltas to "
-                     f"compact: {first_delta}")
-     logger.info(f"Count of deltas to compact in delta stream "
-                 f"position range ('{start_position_exclusive}', "
-                 f"'{end_position_inclusive}']: {len(deltas)}. Source "
-                 f"partition: '{source_partition_locator}'")
+         logger.info(
+             f"Removed exclusive start delta w/ expected stream "
+             f"position '{start_position_exclusive}' from deltas to "
+             f"compact: {first_delta}"
+         )
+     logger.info(
+         f"Count of deltas to compact in delta stream "
+         f"position range ('{start_position_exclusive}', "
+         f"'{end_position_inclusive}']: {len(deltas)}. Source "
+         f"partition: '{source_partition_locator}'"
+     )
      return deltas


  def limit_input_deltas(
-         input_deltas: List[Delta],
-         cluster_resources: Dict[str, float],
-         hash_bucket_count: int,
-         min_pk_index_pa_bytes: int,
-         user_hash_bucket_chunk_size: int,
-         input_deltas_stats: Dict[int, DeltaStats],
-         deltacat_storage=unimplemented_deltacat_storage) \
-         -> Tuple[List[DeltaAnnotated], int, int]:
+     input_deltas: List[Delta],
+     cluster_resources: Dict[str, float],
+     hash_bucket_count: int,
+     min_pk_index_pa_bytes: int,
+     user_hash_bucket_chunk_size: int,
+     input_deltas_stats: Dict[int, DeltaStats],
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Tuple[List[DeltaAnnotated], int, int]:

      # TODO (pdames): when row counts are available in metadata, use them
      # instead of bytes - memory consumption depends more on number of
@@ -78,9 +83,10 @@ def limit_input_deltas(
      # )
      if min_pk_index_pa_bytes > 0:
          required_heap_mem_for_dedupe = worker_obj_store_mem - min_pk_index_pa_bytes
-         assert required_heap_mem_for_dedupe > 0, \
-             f"Not enough required memory available to re-batch input deltas" \
+         assert required_heap_mem_for_dedupe > 0, (
+             f"Not enough required memory available to re-batch input deltas"
              f"and initiate the dedupe step."
+         )
      # Size of batched deltas must also be reduced to have enough space for primary
      # key index files (from earlier compaction rounds) in the dedupe step, since
      # they will be loaded into worker heap memory.
@@ -88,8 +94,7 @@

      logger.info(f"Total worker object store memory: {worker_obj_store_mem}")
      worker_obj_store_mem_per_task = worker_obj_store_mem / worker_cpus
-     logger.info(f"Worker object store memory/task: "
-                 f"{worker_obj_store_mem_per_task}")
+     logger.info(f"Worker object store memory/task: " f"{worker_obj_store_mem_per_task}")
      worker_task_mem = cluster_resources["memory"]
      logger.info(f"Total worker memory: {worker_task_mem}")
      # TODO (pdames): ensure fixed memory per CPU in heterogenous clusters
@@ -105,8 +110,10 @@
      if input_deltas_stats is None:
          input_deltas_stats = {}

-     input_deltas_stats = {int(stream_pos): DeltaStats(delta_stats)
-                           for stream_pos, delta_stats in input_deltas_stats.items()}
+     input_deltas_stats = {
+         int(stream_pos): DeltaStats(delta_stats)
+         for stream_pos, delta_stats in input_deltas_stats.items()
+     }
      for delta in input_deltas:
          manifest = deltacat_storage.get_delta_manifest(delta)
          delta.manifest = manifest
@@ -118,7 +125,8 @@
              # TODO (pdames): ensure pyarrow object fits in per-task obj store mem
              logger.warning(
                  f"Stats are missing for delta stream position {delta.stream_position}, "
-                 f"materialized delta may not fit in per-task object store memory.")
+                 f"materialized delta may not fit in per-task object store memory."
+             )
          manifest_entries = delta.manifest.entries
          delta_manifest_entries += len(manifest_entries)
          for entry in manifest_entries:
@@ -130,13 +138,13 @@
              logger.info(
                  f"Input deltas limited to "
                  f"{len(limited_input_da_list)} by object store mem "
-                 f"({delta_bytes_pyarrow} > {worker_obj_store_mem})")
+                 f"({delta_bytes_pyarrow} > {worker_obj_store_mem})"
+             )
              break
          delta_annotated = DeltaAnnotated.of(delta)
          limited_input_da_list.append(delta_annotated)

-     logger.info(f"Input deltas to compact this round: "
-                 f"{len(limited_input_da_list)}")
+     logger.info(f"Input deltas to compact this round: " f"{len(limited_input_da_list)}")
      logger.info(f"Input delta bytes to compact: {delta_bytes}")
      logger.info(f"Input delta files to compact: {delta_manifest_entries}")
      logger.info(f"Latest input delta stream position: {latest_stream_position}")
@@ -146,10 +154,12 @@

      # TODO (pdames): determine min hash buckets from size of all deltas
      # (not just deltas for this round)
-     min_hash_bucket_count = int(max(
-         math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),
-         min(worker_cpus, 256),
-     ))
+     min_hash_bucket_count = int(
+         max(
+             math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),
+             min(worker_cpus, 256),
+         )
+     )
      logger.info(f"Minimum recommended hash buckets: {min_hash_bucket_count}")

      if hash_bucket_count is None:
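
To make the sizing logic above concrete, here is a worked example of the minimum hash bucket recommendation; the cluster numbers are illustrative assumptions, not values taken from this release:

    import math

    # Assumed example cluster: 32 worker CPUs sharing 64 GiB of object store
    # memory, with input deltas estimated at 512 GiB once decoded into PyArrow.
    worker_cpus = 32
    worker_obj_store_mem = 64 * 1024**3
    worker_obj_store_mem_per_task = worker_obj_store_mem / worker_cpus  # 2 GiB/task
    delta_bytes_pyarrow = 512 * 1024**3

    min_hash_bucket_count = int(
        max(
            math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),  # 256
            min(worker_cpus, 256),  # 32
        )
    )
    print(min_hash_bucket_count)  # 256 buckets, so each bucket fits per-task object store memory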
@@ -168,7 +178,8 @@
              f"resolve this problem either specify a larger number of hash "
              f"buckets when running compaction, omit a custom hash bucket "
              f"count when running compaction, or provision workers with more "
-             f"task memory per CPU.")
+             f"task memory per CPU."
+         )

      hash_bucket_chunk_size = user_hash_bucket_chunk_size
      max_hash_bucket_chunk_size = math.ceil(
@@ -185,7 +196,8 @@
              f"specify a smaller hash bucket chunk size when running "
              f"compaction, omit a custom hash bucket chunk size when running "
              f"compaction, or provision workers with more task and object "
-             f"store memory per CPU.")
+             f"store memory per CPU."
+         )
      elif not hash_bucket_chunk_size:
          hash_bucket_chunk_size_load_balanced = max(
              math.ceil(max(delta_bytes, delta_bytes_pyarrow) / worker_cpus),