deltacat 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +188 -218
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +259 -316
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +152 -259
  22. deltacat/compute/compactor/steps/hash_bucket.py +57 -73
  23. deltacat/compute/compactor/steps/materialize.py +138 -99
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +131 -90
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -42
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +8 -10
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  64. deltacat/types/media.py +3 -4
  65. deltacat/types/tables.py +31 -21
  66. deltacat/utils/common.py +5 -11
  67. deltacat/utils/numpy.py +20 -22
  68. deltacat/utils/pandas.py +73 -100
  69. deltacat/utils/performance.py +3 -9
  70. deltacat/utils/placement.py +276 -231
  71. deltacat/utils/pyarrow.py +302 -89
  72. deltacat/utils/ray_utils/collections.py +2 -1
  73. deltacat/utils/ray_utils/concurrency.py +38 -32
  74. deltacat/utils/ray_utils/dataset.py +28 -28
  75. deltacat/utils/ray_utils/performance.py +5 -9
  76. deltacat/utils/ray_utils/runtime.py +9 -10
  77. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA +22 -12
  78. deltacat-0.1.11.dist-info/RECORD +110 -0
  79. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
  80. deltacat/autoscaler/events/__init__.py +0 -0
  81. deltacat/autoscaler/events/compaction/__init__.py +0 -0
  82. deltacat/autoscaler/events/compaction/cluster.py +0 -82
  83. deltacat/autoscaler/events/compaction/collections/__init__.py +0 -0
  84. deltacat/autoscaler/events/compaction/collections/partition_key_value.py +0 -36
  85. deltacat/autoscaler/events/compaction/dispatcher.py +0 -28
  86. deltacat/autoscaler/events/compaction/input.py +0 -27
  87. deltacat/autoscaler/events/compaction/process.py +0 -25
  88. deltacat/autoscaler/events/compaction/session_manager.py +0 -13
  89. deltacat/autoscaler/events/compaction/utils.py +0 -216
  90. deltacat/autoscaler/events/compaction/workflow.py +0 -303
  91. deltacat/autoscaler/events/dispatcher.py +0 -95
  92. deltacat/autoscaler/events/dynamodb/__init__.py +0 -0
  93. deltacat/autoscaler/events/dynamodb/event_store.py +0 -164
  94. deltacat/autoscaler/events/event_store.py +0 -55
  95. deltacat/autoscaler/events/exceptions.py +0 -6
  96. deltacat/autoscaler/events/processor.py +0 -177
  97. deltacat/autoscaler/events/session_manager.py +0 -25
  98. deltacat/autoscaler/events/states.py +0 -88
  99. deltacat/autoscaler/events/workflow.py +0 -54
  100. deltacat/autoscaler/node_group.py +0 -230
  101. deltacat/autoscaler/utils.py +0 -69
  102. deltacat-0.1.8.dist-info/RECORD +0 -131
  103. /deltacat/{autoscaler → tests/utils}/__init__.py +0 -0
  104. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
  105. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/hash_bucket.py
@@ -1,37 +1,37 @@
-import ray
-import pyarrow as pa
-import numpy as np
 import logging
-
-from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
 from itertools import chain
+from typing import Generator, List, Optional, Tuple
+
+import numpy as np
+import pyarrow as pa
+import ray
+from ray.types import ObjectRef

 from deltacat import logs
-from deltacat.compute.compactor import DeltaAnnotated, DeltaFileEnvelope, \
-    SortKey
-from deltacat.compute.compactor.utils.primary_key_index import \
-    group_hash_bucket_indices, group_record_indices_by_hash_bucket
+from deltacat.compute.compactor import DeltaAnnotated, DeltaFileEnvelope, SortKey
+from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
+from deltacat.compute.compactor.utils import system_columns as sc
+from deltacat.compute.compactor.utils.primary_key_index import (
+    group_hash_bucket_indices,
+    group_record_indices_by_hash_bucket,
+)
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.types.media import StorageType
 from deltacat.utils.common import sha1_digest
-from deltacat.compute.compactor.utils import system_columns as sc
-
-from typing import List, Optional, Generator, Tuple
-
-from ray.types import ObjectRef

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

-_PK_BYTES_DELIMITER = b'L6kl7u5f'
+_PK_BYTES_DELIMITER = b"L6kl7u5f"

 HashBucketGroupToObjectId = np.ndarray
-HashBucketResult = Tuple[HashBucketGroupToObjectId, List[ObjectRef[DeltaFileEnvelopeGroups]]]
+HashBucketResult = Tuple[
+    HashBucketGroupToObjectId, List[ObjectRef[DeltaFileEnvelopeGroups]]
+]


-def group_by_pk_hash_bucket(
-        table: pa.Table,
-        num_buckets: int,
-        primary_keys: List[str]) -> np.ndarray:
+def _group_by_pk_hash_bucket(
+    table: pa.Table, num_buckets: int, primary_keys: List[str]
+) -> np.ndarray:

     # generate the primary key digest column
     all_pk_column_fields = []
@@ -39,7 +39,7 @@ def group_by_pk_hash_bucket(
         # casting a primary key column to numpy also ensures no nulls exist
         column_fields = table[pk_name].to_numpy()
         all_pk_column_fields.append(column_fields)
-    hash_column_generator = hash_pk_bytes_generator(all_pk_column_fields)
+    hash_column_generator = _hash_pk_bytes_generator(all_pk_column_fields)
     table = sc.append_pk_hash_column(table, hash_column_generator)

     # drop primary key columns to free up memory
@@ -62,31 +62,27 @@ def group_by_pk_hash_bucket(
     return hash_bucket_to_table


-def hash_pk_bytes_generator(all_column_fields) -> Generator[bytes, None, None]:
+def _hash_pk_bytes_generator(all_column_fields) -> Generator[bytes, None, None]:
     for field_index in range(len(all_column_fields[0])):
         bytes_to_join = []
         for column_fields in all_column_fields:
-            bytes_to_join.append(
-                bytes(str(column_fields[field_index]), "utf-8")
-            )
+            bytes_to_join.append(bytes(str(column_fields[field_index]), "utf-8"))
         yield sha1_digest(_PK_BYTES_DELIMITER.join(bytes_to_join))


-def group_file_records_by_pk_hash_bucket(
-        annotated_delta: DeltaAnnotated,
-        num_hash_buckets: int,
-        primary_keys: List[str],
-        sort_key_names: List[str],
-        ignore_missing_manifest: bool = False,
-        deltacat_storage=unimplemented_deltacat_storage) \
-        -> Optional[DeltaFileEnvelopeGroups]:
+def _group_file_records_by_pk_hash_bucket(
+    annotated_delta: DeltaAnnotated,
+    num_hash_buckets: int,
+    primary_keys: List[str],
+    sort_key_names: List[str],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Optional[DeltaFileEnvelopeGroups]:

     # read input parquet s3 objects into a list of delta file envelopes
-    delta_file_envelopes = read_delta_file_envelopes(
+    delta_file_envelopes = _read_delta_file_envelopes(
         annotated_delta,
         primary_keys,
         sort_key_names,
-        ignore_missing_manifest,
         deltacat_storage,
     )
     if delta_file_envelopes is None:
@@ -95,7 +91,7 @@ def group_file_records_by_pk_hash_bucket(
     # group the data by primary key hash value
     hb_to_delta_file_envelopes = np.empty([num_hash_buckets], dtype="object")
     for dfe in delta_file_envelopes:
-        hash_bucket_to_table = group_by_pk_hash_bucket(
+        hash_bucket_to_table = _group_by_pk_hash_bucket(
             dfe.table,
             num_hash_buckets,
             primary_keys,
@@ -106,44 +102,33 @@
                    hb_to_delta_file_envelopes[hb] = []
                hb_to_delta_file_envelopes[hb].append(
                    DeltaFileEnvelope.of(
-                        dfe.stream_position,
-                        dfe.file_index,
-                        dfe.delta_type,
-                        table))
+                        dfe.stream_position, dfe.file_index, dfe.delta_type, table
+                    )
+                )
     return hb_to_delta_file_envelopes


-def read_delta_file_envelopes(
-        annotated_delta: DeltaAnnotated,
-        primary_keys: List[str],
-        sort_key_names: List[str],
-        ignore_missing_manifest: bool = False,
-        deltacat_storage=unimplemented_deltacat_storage) \
-        -> Optional[List[DeltaFileEnvelope]]:
+def _read_delta_file_envelopes(
+    annotated_delta: DeltaAnnotated,
+    primary_keys: List[str],
+    sort_key_names: List[str],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Optional[List[DeltaFileEnvelope]]:

     columns_to_read = list(chain(primary_keys, sort_key_names))
-    missing_ids=[]
-    tables_and_missing_ids = deltacat_storage.download_delta(
+    tables = deltacat_storage.download_delta(
         annotated_delta,
-        max_parallelism=1, # if >1, will use python multiprocessing
+        max_parallelism=1,
         columns=columns_to_read,
         storage_type=StorageType.LOCAL,
-        ignore_missing_manifest=ignore_missing_manifest,
     )
-    if ignore_missing_manifest:
-        missing_ids = tables_and_missing_ids[1]
-        tables=tables_and_missing_ids[0]
-    else:
-        tables = tables_and_missing_ids
     annotations = annotated_delta.annotations
-    if len(missing_ids)>0:
-        print(f"missing files:{len(missing_ids)}")
-        for id_missing in sorted(missing_ids, reverse=True):
-            del annotations[id_missing]
-    assert(len(tables) == len(annotations),
-           f"Unexpected Error: Length of downloaded delta manifest tables "
-           f"({len(tables)}) doesn't match the length of delta manifest "
-           f"annotations ({len(annotations)}).")
+    assert (
+        len(tables) == len(annotations),
+        f"Unexpected Error: Length of downloaded delta manifest tables "
+        f"({len(tables)}) doesn't match the length of delta manifest "
+        f"annotations ({len(annotations)}).",
+    )
     if not tables:
         return None

@@ -161,22 +146,21 @@ def read_delta_file_envelopes(

 @ray.remote(num_returns=2)
 def hash_bucket(
-        annotated_delta: DeltaAnnotated,
-        primary_keys: List[str],
-        sort_keys: List[SortKey],
-        num_buckets: int,
-        num_groups: int,
-        ignore_missing_manifest: bool = False,
-        deltacat_storage=unimplemented_deltacat_storage) -> HashBucketResult:
+    annotated_delta: DeltaAnnotated,
+    primary_keys: List[str],
+    sort_keys: List[SortKey],
+    num_buckets: int,
+    num_groups: int,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> HashBucketResult:

     logger.info(f"Starting hash bucket task...")
     sort_key_names = [key.key_name for key in sort_keys]
-    delta_file_envelope_groups = group_file_records_by_pk_hash_bucket(
+    delta_file_envelope_groups = _group_file_records_by_pk_hash_bucket(
         annotated_delta,
         num_buckets,
         primary_keys,
         sort_key_names,
-        ignore_missing_manifest,
         deltacat_storage,
     )
     hash_bucket_group_to_obj_id, object_refs = group_hash_bucket_indices(
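
One pattern worth flagging in the reformatted code above: the new parenthesized assert in _read_delta_file_envelopes (and the matching one in _materialize in materialize.py below) wraps the condition and the message in a single pair of parentheses with a trailing comma. Python evaluates that as a non-empty tuple, and non-empty tuples are always truthy, so the check can never fail. A minimal illustration of the semantics (plain Python, not deltacat code):

# A parenthesized condition-plus-message builds a 2-tuple; non-empty tuples are
# truthy, so this assert always passes (recent CPython versions emit a
# SyntaxWarning: "assertion is always true, perhaps remove parentheses?").
assert (
    1 + 1 == 3,
    "never raised",
)

# Keeping the message outside the parentheses restores the intended check; this
# form raises AssertionError with the given message when the condition is false.
assert 1 + 1 == 3, "raised as AssertionError on failure"
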
deltacat/compute/compactor/steps/materialize.py
@@ -1,57 +1,113 @@
-import logging,time
-import ray
-import pyarrow as pa
-
+import logging
+import time
 from collections import defaultdict
-
-from deltacat.compute.compactor.steps.dedupe import DedupeTaskIndexWithObjectId, \
-    DeltaFileLocatorToRecords
 from itertools import chain, repeat
+from typing import List, Optional, Tuple

-from pyarrow import compute as pc
-
+import pyarrow as pa
+import ray
 from ray import cloudpickle

 from deltacat import logs
-from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator, \
-    interface as unimplemented_deltacat_storage
-from deltacat.compute.compactor import MaterializeResult, PyArrowWriteResult, \
-    RoundCompletionInfo
-from deltacat.compute.compactor.utils import system_columns as sc
-from deltacat.types.media import ContentType, DELIMITED_TEXT_CONTENT_TYPES
-from typing import List, Tuple, Optional
-
-from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
-
+from deltacat.compute.compactor import (
+    MaterializeResult,
+    PyArrowWriteResult,
+    RoundCompletionInfo,
+)
+from deltacat.compute.compactor.steps.dedupe import (
+    DedupeTaskIndexWithObjectId,
+    DeltaFileLocatorToRecords,
+)
+from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
+from deltacat.storage import interface as unimplemented_deltacat_storage
+from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
 from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
-from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowCsvPureUtf8
+from deltacat.utils.performance import timed_invocation
+from deltacat.utils.pyarrow import (
+    ReadKwargsProviderPyArrowCsvPureUtf8,
+    ReadKwargsProviderPyArrowSchemaOverride,
+    RecordBatchTables,
+)

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 @ray.remote
 def materialize(
-        source_partition_locator: PartitionLocator,
-        round_completion_info: Optional[RoundCompletionInfo],
-        partition: Partition,
-        mat_bucket_index: int,
-        dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
-        max_records_per_output_file: int,
-        compacted_file_content_type: ContentType,
-        schema: Optional[pa.Schema] = None,
-        deltacat_storage=unimplemented_deltacat_storage) -> MaterializeResult:
+    source_partition_locator: PartitionLocator,
+    round_completion_info: Optional[RoundCompletionInfo],
+    partition: Partition,
+    mat_bucket_index: int,
+    dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
+    max_records_per_output_file: int,
+    compacted_file_content_type: ContentType,
+    schema: Optional[pa.Schema] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> MaterializeResult:
+    # TODO (rkenmi): Add docstrings for the steps in the compaction workflow
+    # https://github.com/ray-project/deltacat/issues/79
+    def _materialize(compacted_tables: List[pa.Table]) -> MaterializeResult:
+        compacted_table = pa.concat_tables(compacted_tables)
+
+        if compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
+            # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
+            # TODO (pdames): compare performance to pandas-native materialize path
+            df = compacted_table.to_pandas(
+                split_blocks=True, self_destruct=True, zero_copy_only=True
+            )
+            compacted_table = df
+        delta, stage_delta_time = timed_invocation(
+            deltacat_storage.stage_delta,
+            compacted_table,
+            partition,
+            max_records_per_entry=max_records_per_output_file,
+            content_type=compacted_file_content_type,
+        )
+        compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
+            compacted_table
+        )
+        logger.debug(
+            f"Time taken for materialize task"
+            f" to upload {len(compacted_table)} records"
+            f" of size {compacted_table_size} is: {stage_delta_time}s"
+        )
+        manifest = delta.manifest
+        manifest_records = manifest.meta.record_count
+        assert (
+            manifest_records == len(compacted_table),
+            f"Unexpected Error: Materialized delta manifest record count "
+            f"({manifest_records}) does not equal compacted table record count "
+            f"({len(compacted_table)})",
+        )
+        materialize_result = MaterializeResult.of(
+            delta,
+            mat_bucket_index,
+            # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
+            # and in-memory-table-bytes instead of tight coupling to paBytes
+            PyArrowWriteResult.of(
+                len(manifest.entries),
+                TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
+                manifest.meta.content_length,
+                len(compacted_table),
+            ),
+        )
+        logger.info(f"Materialize result: {materialize_result}")
+        return materialize_result

-    logger.info(f"Starting materialize task...")
+    logger.info(
+        f"Starting materialize task with"
+        f" materialize bucket index: {mat_bucket_index}..."
+    )
+    start = time.time()
     dedupe_task_idx_and_obj_ref_tuples = [
         (
-            t[0],
-            cloudpickle.loads(t[1])
-        ) for t in dedupe_task_idx_and_obj_id_tuples
+            t1,
+            cloudpickle.loads(t2),
+        )
+        for t1, t2 in dedupe_task_idx_and_obj_id_tuples
     ]
     logger.info(f"Resolved materialize task obj refs...")
-    dedupe_task_indices, obj_refs = zip(
-        *dedupe_task_idx_and_obj_ref_tuples
-    )
+    dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
     # this depends on `ray.get` result order matching input order, as per the
     # contract established in: https://github.com/ray-project/ray/pull/16763
     src_file_records_list = ray.get(list(obj_refs))
@@ -63,19 +119,23 @@ def materialize(
                 (record_numbers, repeat(dedupe_task_idx, len(record_numbers)))
             )
     manifest_cache = {}
-    compacted_tables = []
+    materialized_results: List[MaterializeResult] = []
+    record_batch_tables = RecordBatchTables(max_records_per_output_file)
     for src_dfl in sorted(all_src_file_records.keys()):
-        record_numbers_dd_task_idx_tpl_list: List[Tuple[DeltaFileLocatorToRecords, repeat]] = \
-            all_src_file_records[src_dfl]
+        record_numbers_dd_task_idx_tpl_list: List[
+            Tuple[DeltaFileLocatorToRecords, repeat]
+        ] = all_src_file_records[src_dfl]
         record_numbers_tpl, dedupe_task_idx_iter_tpl = zip(
             *record_numbers_dd_task_idx_tpl_list
         )
         is_src_partition_file_np = src_dfl.is_source_delta
         src_stream_position_np = src_dfl.stream_position
         src_file_idx_np = src_dfl.file_index
-        src_file_partition_locator = source_partition_locator \
-            if is_src_partition_file_np \
+        src_file_partition_locator = (
+            source_partition_locator
+            if is_src_partition_file_np
             else round_completion_info.compacted_delta_locator.partition_locator
+        )
         delta_locator = DeltaLocator.of(
             src_file_partition_locator,
             src_stream_position_np.item(),
@@ -95,75 +155,54 @@ def materialize(
         # enforce a consistent schema if provided, when reading files into PyArrow tables
         elif schema is not None:
             read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
-                schema=schema)
-        pa_table = deltacat_storage.download_delta_manifest_entry(
+                schema=schema
+            )
+        pa_table, download_delta_manifest_entry_time = timed_invocation(
+            deltacat_storage.download_delta_manifest_entry,
             Delta.of(delta_locator, None, None, None, manifest),
             src_file_idx_np.item(),
-            file_reader_kwargs=read_kwargs_provider,
+            file_reader_kwargs_provider=read_kwargs_provider,
+        )
+        logger.debug(
+            f"Time taken for materialize task"
+            f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
+            f" is: {download_delta_manifest_entry_time}s"
         )
         mask_pylist = list(repeat(False, len(pa_table)))
         record_numbers = chain.from_iterable(record_numbers_tpl)
+        # TODO(raghumdani): reference the same file URIs while writing the files
+        # instead of copying the data over and creating new files.
         for record_number in record_numbers:
             mask_pylist[record_number] = True
         mask = pa.array(mask_pylist)
-        compacted_table = pa_table.filter(mask)
+        pa_table = pa_table.filter(mask)
+        record_batch_tables.append(pa_table)
+        if record_batch_tables.has_batches():
+            batched_tables = record_batch_tables.evict()
+            materialized_results.append(_materialize(batched_tables))

-        # appending, sorting, taking, and dropping has 2-3X latency of a
-        # single filter on average, and thus provides better average
-        # performance than repeatedly filtering the table in dedupe task index
-        # order
-        dedupe_task_indices = chain.from_iterable(dedupe_task_idx_iter_tpl)
-        compacted_table = sc.append_dedupe_task_idx_col(
-            compacted_table,
-            dedupe_task_indices,
-        )
-        pa_sort_keys = [(sc._DEDUPE_TASK_IDX_COLUMN_NAME, "ascending")]
-        compacted_table = compacted_table.take(
-            pc.sort_indices(compacted_table, sort_keys=pa_sort_keys),
-        )
-        compacted_table = compacted_table.drop(
-            [sc._DEDUPE_TASK_IDX_COLUMN_NAME]
-        )
-        compacted_tables.append(compacted_table)
+    if record_batch_tables.has_remaining():
+        materialized_results.append(_materialize(record_batch_tables.remaining))

-    # TODO (pdames): save memory by writing output files eagerly whenever
-    # len(compacted_table) >= max_records_per_output_file (but don't write
-    # partial slices from the compacted_table remainder every time!)
-    compacted_table = pa.concat_tables(compacted_tables)
-    if compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
-        # convert to pandas since pyarrow doesn't support custom delimiters
-        # and doesn't support utf-8 conversion of all types (e.g. Decimal128)
-        # TODO (pdames): compare performance to pandas-native materialize path
-        df = compacted_table.to_pandas(
-            split_blocks=True,
-            self_destruct=True,
-        )
-        del compacted_table
-        compacted_table = df
-    delta = deltacat_storage.stage_delta(
-        compacted_table,
-        partition,
-        max_records_per_entry=max_records_per_output_file,
-        content_type=compacted_file_content_type,
+    merged_delta = Delta.merge_deltas([mr.delta for mr in materialized_results])
+    assert (
+        materialized_results and len(materialized_results) > 0
+    ), f"Expected at least one materialized result in materialize step."
+
+    write_results = [mr.pyarrow_write_result for mr in materialized_results]
+    logger.debug(
+        f"{len(write_results)} files written"
+        f" with records: {[wr.records for wr in write_results]}"
     )
-    manifest = delta.manifest
-    manifest_records = manifest.meta.record_count
-    assert(manifest_records == len(compacted_table),
-           f"Unexpected Error: Materialized delta manifest record count "
-           f"({manifest_records}) does not equal compacted table record count "
-           f"({len(compacted_table)})")
-    materialize_result = MaterializeResult.of(
-        delta,
-        mat_bucket_index,
-        # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
-        # and in-memory-table-bytes instead of tight coupling to paBytes
-        PyArrowWriteResult.of(
-            len(manifest.entries),
-            TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
-            manifest.meta.content_length,
-            len(compacted_table),
+    # Merge all new deltas into one for this materialize bucket index
+    merged_materialize_result = MaterializeResult.of(
+        merged_delta,
+        materialized_results[0].task_index,
+        PyArrowWriteResult.union(
+            [mr.pyarrow_write_result for mr in materialized_results]
         ),
     )
-    logger.info(f"Materialize result: {materialize_result}")
     logger.info(f"Finished materialize task...")
-    return materialize_result
+    end = time.time()
+    logger.info(f"Materialize task ended in {end - start}s")
+    return merged_materialize_result
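
The rewritten materialize task above stops buffering every filtered table for a single write at the end and instead batches eagerly with RecordBatchTables: filtered tables are appended, batches of up to max_records_per_output_file records are evicted and materialized inside the loop, any remainder is flushed afterwards, and the per-batch results are merged via Delta.merge_deltas and PyArrowWriteResult.union. A rough sketch of that batching contract is below; SimpleRecordBatcher is an illustrative stand-in, not the deltacat RecordBatchTables class (which presumably evicts only full batches and keeps any partial remainder buffered):

import pyarrow as pa


class SimpleRecordBatcher:
    """Illustrative stand-in for the eager batching pattern used above."""

    def __init__(self, max_records: int):
        self._max_records = max_records
        self._tables = []
        self._records = 0

    def append(self, table: pa.Table) -> None:
        self._tables.append(table)
        self._records += len(table)

    def has_batches(self) -> bool:
        return self._records >= self._max_records

    def evict(self):
        # Hand back everything buffered so far and reset; the real implementation
        # is assumed to slice off full batches and keep any partial remainder.
        tables, self._tables, self._records = self._tables, [], 0
        return tables

    def has_remaining(self) -> bool:
        return self._records > 0

    @property
    def remaining(self):
        return self._tables


# Usage mirrors the loop in the new materialize step: write a batch as soon as
# enough records accumulate, then flush whatever is left after the loop.
batcher = SimpleRecordBatcher(max_records=2)
for t in [pa.table({"a": [1]}), pa.table({"a": [2]}), pa.table({"a": [3]})]:
    batcher.append(t)
    if batcher.has_batches():
        print("write batch:", pa.concat_tables(batcher.evict()).num_rows, "rows")
if batcher.has_remaining():
    print("write remainder:", pa.concat_tables(batcher.remaining).num_rows, "rows")
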
deltacat/compute/compactor/steps/rehash/rehash_bucket.py
@@ -1,22 +1,21 @@
 import logging
-import ray
-import pyarrow as pa
-import numpy as np
+from typing import List, Tuple

+import numpy as np
+import pyarrow as pa
+import ray
 from ray.types import ObjectRef

 from deltacat import logs
 from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator
 from deltacat.compute.compactor.utils import primary_key_index as pki

-from typing import List, Tuple
-
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 def group_file_records_by_pk_hash_bucket(
-        pki_table: pa.Table,
-        num_buckets: int) -> np.ndarray:
+    pki_table: pa.Table, num_buckets: int
+) -> np.ndarray:
     # generate the new table for each new hash bucket
     hash_bucket_to_indices = pki.group_record_indices_by_hash_bucket(
         pki_table,
@@ -29,13 +28,14 @@ def group_file_records_by_pk_hash_bucket(
     return hash_bucket_to_table


-@ray.remote(num_cpus=1,num_returns=2)
+@ray.remote(num_cpus=1, num_returns=2)
 def rehash_bucket(
-        hash_bucket_index: int,
-        s3_bucket: str,
-        old_pki_version_locator: PrimaryKeyIndexVersionLocator,
-        num_buckets: int,
-        num_groups: int) -> Tuple[np.ndarray, List[ObjectRef]]:
+    hash_bucket_index: int,
+    s3_bucket: str,
+    old_pki_version_locator: PrimaryKeyIndexVersionLocator,
+    num_buckets: int,
+    num_groups: int,
+) -> Tuple[np.ndarray, List[ObjectRef]]:

     logger.info(f"Starting rehash bucket task...")
     tables = pki.download_hash_bucket_entries(
deltacat/compute/compactor/steps/rehash/rewrite_index.py
@@ -1,28 +1,26 @@
-import ray
 import logging
-import pyarrow as pa
 from collections import defaultdict
-from ray import cloudpickle
-from deltacat import logs
+from typing import Any, List, Tuple

+import pyarrow as pa
+import ray
+from ray import cloudpickle
 from ray.types import ObjectRef

-from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator, \
-    PyArrowWriteResult
+from deltacat import logs
+from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator, PyArrowWriteResult
 from deltacat.compute.compactor.utils import primary_key_index as pki

-from typing import Any, List, Tuple
-
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 @ray.remote(num_cpus=1, num_returns=2)
 def rewrite_index(
-        object_ids: List[Any],
-        s3_bucket: str,
-        new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-        max_records_per_index_file: int) -> \
-        Tuple[PyArrowWriteResult, List[ObjectRef]]:
+    object_ids: List[Any],
+    s3_bucket: str,
+    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    max_records_per_index_file: int,
+) -> Tuple[PyArrowWriteResult, List[ObjectRef]]:

     logger.info(f"Starting rewrite primary key index task...")
     object_refs = [cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]