deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff compares the content of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in that registry.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/steps/merge.py

@@ -7,10 +7,11 @@ import ray
 import itertools
 import time
 import pyarrow.compute as pc
+from deltacat.utils.pyarrow import MAX_INT_BYTES
 import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
 from deltacat import logs
-from typing import Callable, Iterator, List, Optional, Tuple
+from typing import Callable, Iterator, List, Optional, Tuple, Set
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
 from deltacat.compute.compactor_v2.model.merge_file_group import MergeFileGroup
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
@@ -31,13 +32,14 @@ from deltacat.utils.resources import (
 )
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
+    pk_digest_to_hash_bucket_index,
 )
 from deltacat.storage import (
     Delta,
     DeltaLocator,
     DeltaType,
     Partition,
-
+    metastore,
 )
 from deltacat.storage.model.manifest import Manifest
 from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
@@ -46,6 +48,9 @@ from deltacat.compute.compactor_v2.constants import (
     MERGE_TIME_IN_SECONDS,
     MERGE_SUCCESS_COUNT,
     MERGE_FAILURE_COUNT,
+    BUCKETING_SPEC_COMPLIANCE_PROFILE,
+    BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
 )
 from deltacat.exceptions import (
     categorize_errors,
@@ -57,6 +62,10 @@ if importlib.util.find_spec("memray"):
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
+_EXISTING_VARIANT_LOG_PREFIX = "Existing variant "
+_INCREMENTAL_TABLE_LOG_PREFIX = "Incremental table "
+
+
 def _append_delta_type_column(table: pa.Table, value: np.bool_):
     return table.append_column(
         sc._DELTA_TYPE_COLUMN_FIELD,
@@ -85,9 +94,12 @@ def _build_incremental_table(
     # sort by delta file stream position now instead of sorting every row later
     is_delete = False
     for df_envelope in df_envelopes:
-
-
-
+        # Allow APPEND, UPSERT, and DELETE delta types
+        assert df_envelope.delta_type in (
+            DeltaType.APPEND,
+            DeltaType.UPSERT,
+            DeltaType.DELETE,
+        ), "Only APPEND, UPSERT, and DELETE delta types are supported"
         if df_envelope.delta_type == DeltaType.DELETE:
             is_delete = True
 
@@ -99,14 +111,35 @@ def _build_incremental_table(
         )
 
         hb_tables.append(table)
-    result =
+    result = _concat_or_coerce_tables(hb_tables)
     return result
 
 
+def _concat_or_coerce_tables(all_tables: List[pa.Table]) -> pa.Table:
+    try:
+        return pa.concat_tables(all_tables)
+    except pa.ArrowInvalid:
+        # Fallback path: schema evolution needed - try PyArrow's built-in unification
+        if all_tables:
+            try:
+                return pa.concat_tables(
+                    all_tables, promote_options="permissive", unify_schemas=True
+                )
+            except (pa.ArrowInvalid, TypeError, pa.ArrowNotImplementedError):
+                # If PyArrow unification fails, re-raise the original error
+                raise
+        else:
+            # Empty table list - should not happen but handle gracefully
+            raise RuntimeError("Expected at least one table to merge, but found none.")
+
+
 def _merge_tables(
     table: pa.Table,
     primary_keys: List[str],
     can_drop_duplicates: bool,
+    hb_index: int,
+    num_buckets: int,
+    original_fields: Set[str],
     compacted_table: Optional[pa.Table] = None,
 ) -> pa.Table:
     """
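The new `_concat_or_coerce_tables` helper above relies on PyArrow's schema promotion as its fallback. A minimal standalone sketch of that behavior (illustrative only, not deltacat code; assumes pyarrow >= 14.0 for `promote_options`):

```python
import pyarrow as pa

t1 = pa.table({"pk": ["a", "b"], "x": [1, 2]})
t2 = pa.table({"pk": ["c"], "x": [3], "y": [0.5]})  # evolved schema: extra column "y"

try:
    # Default behavior (promote_options="none") rejects mismatched schemas.
    pa.concat_tables([t1, t2])
except pa.ArrowInvalid as err:
    print("plain concat failed:", err)

# Permissive promotion unifies the schemas and null-fills the missing column.
unified = pa.concat_tables([t1, t2], promote_options="permissive")
print(unified.schema)             # pk: string, x: int64, y: double
print(unified["y"].to_pylist())   # [None, None, 0.5]
```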
@@ -125,6 +158,20 @@ def _merge_tables(
 
     all_tables.append(table)
 
+    check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
+        BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
+        BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    ]
+
+    if primary_keys and check_bucketing_spec:
+        _validate_bucketing_spec_compliance(
+            table=all_tables[incremental_idx],
+            num_buckets=num_buckets,
+            primary_keys=primary_keys,
+            hb_index=hb_index,
+            log_prefix=_INCREMENTAL_TABLE_LOG_PREFIX,
+        )
+
     if not primary_keys or not can_drop_duplicates:
         logger.info(
             f"Not dropping duplicates for primary keys={primary_keys} "
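The compliance check wired in above enforces that every record found in a hash bucket still maps back to that bucket's index. A hedged sketch of the invariant; the digest and bucketing below are illustrative stand-ins, not the package's `pk_digest_to_hash_bucket_index`:

```python
import hashlib

def illustrative_bucket_index(pk_digest: bytes, num_buckets: int) -> int:
    # Any stable digest -> integer -> modulo scheme gives a deterministic bucket;
    # deltacat's real digest and bucketing may differ.
    return int.from_bytes(hashlib.sha1(pk_digest).digest()[:8], "big") % num_buckets

num_buckets = 8
pk_digest = b"customer-42"
assigned = illustrative_bucket_index(pk_digest, num_buckets)
# Re-deriving the bucket for a record stored under `assigned` must yield the
# same index; anything else is the "hash bucket drift" the new check reports.
assert illustrative_bucket_index(pk_digest, num_buckets) == assigned
```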
@@ -134,7 +181,7 @@ def _merge_tables(
             all_tables[incremental_idx], DeltaType.DELETE
         )
         # we need not drop duplicates
-        return
+        return _concat_or_coerce_tables(all_tables)
 
     all_tables = generate_pk_hash_column(all_tables, primary_keys=primary_keys)
 
@@ -144,36 +191,170 @@ def _merge_tables(
         all_tables[incremental_idx], on=sc._PK_HASH_STRING_COLUMN_NAME
     )
 
+    # Always drop DELETE rows from incremental table
+    incremental_table = _drop_delta_type_rows(incremental_table, DeltaType.DELETE)
+
+    # Default to using incremental records as-is, override only if merging is needed
+    incremental_data = incremental_table
+
     if compacted_table:
         compacted_table = all_tables[0]
 
-
-
-
-
+        compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
+        incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
+
+        logger.info(
+            f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
+            f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
+        )
+
+        if (
+            compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+            or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+        ):
+            logger.info("Casting compacted and incremental pk hash to large_string...")
+            # is_in combines the chunks of the chunked array passed which can cause
+            # ArrowCapacityError if the total size of string array is over 2GB.
+            # Using a large_string would resolve this issue.
+            # The cast here should be zero-copy in most cases.
+            compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
+            incremental_pk_hash_str = pc.cast(
+                incremental_pk_hash_str, pa.large_string()
             )
+
+        records_to_update = pc.is_in(
+            compacted_pk_hash_str,
+            incremental_pk_hash_str,
         )
 
+        records_to_keep = pc.invert(records_to_update)
+
+        # Keep records that don't have updates
         result_table_list.append(compacted_table.filter(records_to_keep))
 
-
-
+        # Override default if merging is needed
+        if pc.sum(records_to_update).as_py() > 0:  # There are records to update
+            old_records_to_update = compacted_table.filter(records_to_update)
+            # Perform partial UPSERT: merge old and new records field by field
+            incremental_data = _merge_records_partially(
+                old_records=old_records_to_update,
+                new_records=incremental_table,
+                original_fields=original_fields,
+            )
+
+    # Add the determined incremental data
+    result_table_list.append(incremental_data)
 
-    final_table =
+    final_table = _concat_or_coerce_tables(result_table_list)
     final_table = final_table.drop([sc._PK_HASH_STRING_COLUMN_NAME])
 
     return final_table
 
 
+def _merge_records_partially(
+    old_records: pa.Table, new_records: pa.Table, original_fields: Set[str]
+) -> pa.Table:
+    """
+    Merge records field by field for partial UPSERT behavior. Fills missing
+    fields in new_records with values from old_records.
+
+    Args:
+        old_records: Records from the compacted table that need updates
+        new_records: New records with potential partial field updates
+
+    Returns:
+        Table with merged records where missing fields preserve old values
+    """
+    # Get field sets (excluding hash column which is used for joining)
+    old_fields = set(old_records.column_names) - {sc._PK_HASH_STRING_COLUMN_NAME}
+    new_fields = set(new_records.column_names) - {sc._PK_HASH_STRING_COLUMN_NAME}
+
+    # Find fields that are missing from new_records but exist in old_records
+    missing_fields = old_fields - new_fields
+
+    # Find fields that were auto-added by schema coercion (missing from original user data)
+    # These should be treated as missing fields and filled from old_records
+    auto_added_null_fields = set()
+
+    # Use definitive information about which fields were originally provided
+    # Any field that exists in both tables but was NOT in the original user data
+    # should be treated as auto-added by schema coercion
+    for field_name in old_fields & new_fields:  # Fields that exist in both
+        if field_name not in original_fields:
+            auto_added_null_fields.add(field_name)
+
+    # Combine missing fields with auto-added null fields
+    fields_to_fill = missing_fields | auto_added_null_fields
+
+    # Start with new_records and add missing fields from old_records
+    result_columns = {}
+
+    # Copy all existing columns from new_records
+    for column_name in new_records.column_names:
+        result_columns[column_name] = new_records[column_name]
+
+    # Fill in missing/auto-added null fields with values from old_records
+    for field_name in fields_to_fill:
+        # For missing fields, use the old values entirely
+        result_columns[field_name] = old_records[field_name]
+
+    # Create the enhanced new_records table with all fields filled
+    enhanced_new_records = pa.table(result_columns)
+
+    # Now we can return the enhanced table - it has all the fields with proper values
+    # Missing fields are filled with old values, explicitly null fields remain null
+    return enhanced_new_records
+
+
+def _validate_bucketing_spec_compliance(
+    table: pa.Table,
+    num_buckets: int,
+    hb_index: int,
+    primary_keys: List[str],
+    rci: Optional[RoundCompletionInfo] = None,
+    log_prefix=None,
+) -> None:
+    if rci is not None:
+        message_prefix = f"{log_prefix}{rci.compacted_delta_locator.namespace}.{rci.compacted_delta_locator.table_name}.{rci.compacted_delta_locator.table_version}.{rci.compacted_delta_locator.partition_id}.{rci.compacted_delta_locator.partition_values}"
+    else:
+        message_prefix = f"{log_prefix}"
+    pki_table = generate_pk_hash_column(
+        [table], primary_keys=primary_keys, requires_hash=True
+    )[0]
+    is_not_compliant: bool = False
+    for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
+        hash_bucket: int = pk_digest_to_hash_bucket_index(hash_value, num_buckets)
+        if hash_bucket != hb_index:
+            is_not_compliant = True
+            logger.info(
+                f"{message_prefix} has non-compliant bucketing spec at index: {index} "
+                f"Expected hash bucket is {hb_index} but found {hash_bucket}."
+            )
+            if BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_ASSERT:
+                raise AssertionError(
+                    f"Hash bucket drift detected at index: {index}. Expected hash bucket index"
+                    f" to be {hb_index} but found {hash_bucket}"
+                )
+            # No further checks necessary
+            break
+    if not is_not_compliant:
+        logger.debug(
+            f"{message_prefix} has compliant bucketing spec for hb_index: {hb_index}"
+        )
+
+
 def _download_compacted_table(
     hb_index: int,
-
+    rci: RoundCompletionInfo,
+    primary_keys: List[str],
+    all_column_names: List[str],
+    compacted_delta_manifest: Optional[Manifest] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    deltacat_storage=
+    deltacat_storage: metastore = metastore,
     deltacat_storage_kwargs: Optional[dict] = None,
 ) -> pa.Table:
     tables = []
-    hb_index_to_indices =
+    hb_index_to_indices = rci.hb_index_to_entry_range
 
     if str(hb_index) not in hb_index_to_indices:
         return None
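The rewritten upsert path above splits the compacted table with a hash-membership test before appending the incremental records. A self-contained sketch of that pattern with toy column names (not the package's schema):

```python
import pyarrow as pa
import pyarrow.compute as pc

compacted = pa.table({"pk_hash": ["h1", "h2", "h3"], "v": [1, 2, 3]})
incremental = pa.table({"pk_hash": ["h2"], "v": [20]})

# Compacted rows whose key reappears in the incremental table are "updates";
# everything else is kept as-is.
updated_mask = pc.is_in(compacted["pk_hash"], value_set=incremental["pk_hash"])
kept = compacted.filter(pc.invert(updated_mask))
merged = pa.concat_tables([kept, incremental])
print(merged.to_pylist())
# [{'pk_hash': 'h1', 'v': 1}, {'pk_hash': 'h3', 'v': 3}, {'pk_hash': 'h2', 'v': 20}]

# For key columns holding >= 2 GiB of string data, casting to large_string
# (as the diff does when nbytes >= MAX_INT_BYTES) avoids ArrowCapacityError
# when is_in combines the chunks into a single value set.
big_keys = pc.cast(compacted["pk_hash"], pa.large_string())
```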
@@ -183,30 +364,52 @@ def _download_compacted_table(
     ), "indices should not be none and contains exactly two elements"
     for offset in range(indices[1] - indices[0]):
         table = deltacat_storage.download_delta_manifest_entry(
-
+            Delta.of(
+                rci.compacted_delta_locator,
+                DeltaType.APPEND,
+                compacted_delta_manifest.meta,
+                None,
+                compacted_delta_manifest,
+            ),
             entry_index=(indices[0] + offset),
             file_reader_kwargs_provider=read_kwargs_provider,
+            all_column_names=all_column_names,
             **deltacat_storage_kwargs,
         )
 
         tables.append(table)
 
-
+    compacted_table = pa.concat_tables(tables)
+    check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
+        BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
+        BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    ]
+
+    logger.debug(
+        f"Value of BUCKETING_SPEC_COMPLIANCE_PROFILE, check_bucketing_spec:"
+        f" {BUCKETING_SPEC_COMPLIANCE_PROFILE}, {check_bucketing_spec}"
+    )
+
+    # Bucketing spec compliance isn't required without primary keys
+    if primary_keys and check_bucketing_spec:
+        _validate_bucketing_spec_compliance(
+            compacted_table,
+            rci.hash_bucket_count,
+            hb_index,
+            primary_keys,
+            rci=rci,
+            log_prefix=_EXISTING_VARIANT_LOG_PREFIX,
+        )
+    return compacted_table
 
 
 def _copy_all_manifest_files_from_old_hash_buckets(
     hb_index_copy_by_reference: List[int],
     round_completion_info: RoundCompletionInfo,
     write_to_partition: Partition,
-
-    deltacat_storage_kwargs: Optional[dict] = None,
+    compacted_manifest: Optional[Manifest] = None,
 ) -> List[MaterializeResult]:
 
-    compacted_delta_locator = round_completion_info.compacted_delta_locator
-    manifest = deltacat_storage.get_delta_manifest(
-        compacted_delta_locator, **deltacat_storage_kwargs
-    )
-
     manifest_entry_referenced_list = []
     materialize_result_list = []
     hb_index_to_indices = round_completion_info.hb_index_to_entry_range
@@ -223,27 +426,27 @@ def _copy_all_manifest_files_from_old_hash_buckets(
         for offset in range(indices[1] - indices[0]):
             entry_index = indices[0] + offset
             assert entry_index < len(
-
-            ), f"entry index: {entry_index} >= {len(
-            manifest_entry =
+                compacted_manifest.entries
+            ), f"entry index: {entry_index} >= {len(compacted_manifest.entries)}"
+            manifest_entry = compacted_manifest.entries[entry_index]
             manifest_entry_referenced_list.append(manifest_entry)
 
-
+        compacted_manifest = Manifest.of(
             entries=manifest_entry_referenced_list, uuid=str(uuid4())
         )
         delta = Delta.of(
             locator=DeltaLocator.of(write_to_partition.locator),
-            delta_type=DeltaType.
-            meta=
-            manifest=
+            delta_type=DeltaType.APPEND,  # Compaction always produces APPEND deltas
+            meta=compacted_manifest.meta,
+            manifest=compacted_manifest,
            previous_stream_position=write_to_partition.stream_position,
            properties={},
        )
        referenced_pyarrow_write_result = PyArrowWriteResult.of(
            len(manifest_entry_referenced_list),
-
-
-
+            compacted_manifest.meta.source_content_length,
+            compacted_manifest.meta.content_length,
+            compacted_manifest.meta.record_count,
        )
        materialize_result = MaterializeResult.of(
            delta=delta,
@@ -268,6 +471,7 @@ def _has_previous_compacted_table(input: MergeInput, hb_idx: int) -> bool:
     """
     return (
         input.round_completion_info
+        and input.compacted_manifest is not None
         and input.round_completion_info.hb_index_to_entry_range
         and input.round_completion_info.hb_index_to_entry_range.get(str(hb_idx))
         is not None
@@ -285,6 +489,7 @@ def _can_copy_by_reference(
         not has_delete
         and not merge_file_group.dfe_groups
         and input.round_completion_info is not None
+        and input.compacted_manifest is not None
     )
 
     if input.disable_copy_by_reference:
@@ -383,9 +588,9 @@ def _compact_tables(
         delete_file_envelopes + df_envelopes
     )
     assert all(
-        dfe.delta_type in (DeltaType.UPSERT, DeltaType.DELETE)
+        dfe.delta_type in (DeltaType.APPEND, DeltaType.UPSERT, DeltaType.DELETE)
         for dfe in reordered_all_dfes
-    ), "All reordered delta file envelopes must be of the UPSERT or DELETE"
+    ), "All reordered delta file envelopes must be of the APPEND, UPSERT or DELETE"
     table = compacted_table
     aggregated_incremental_len = 0
     aggregated_deduped_records = 0
@@ -393,13 +598,13 @@ def _compact_tables(
     for i, (delta_type, delta_type_sequence) in enumerate(
         _group_sequence_by_delta_type(reordered_all_dfes)
     ):
-        if delta_type is DeltaType.UPSERT:
-            (
-
-
-
-
-            )
+        if delta_type is DeltaType.UPSERT or delta_type is DeltaType.APPEND:
+            (table, incremental_len, deduped_records, merge_time,) = _apply_upserts(
+                input=input,
+                dfe_list=delta_type_sequence,
+                hb_idx=hb_idx,
+                prev_table=table,
+            )
             logger.info(
                 f" [Merge task index {input.merge_task_index}] Merged"
                 f" record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
@@ -434,8 +639,9 @@ def _apply_upserts(
     prev_table=None,
 ) -> Tuple[pa.Table, int, int, int]:
     assert all(
-        dfe.delta_type is DeltaType.UPSERT
-
+        dfe.delta_type is DeltaType.UPSERT or dfe.delta_type is DeltaType.APPEND
+        for dfe in dfe_list
+    ), "All incoming delta file envelopes must of the DeltaType.UPSERT or DeltaType.APPEND"
     logger.info(
         f"[Hash bucket index {hb_idx}] Reading dedupe input for "
         f"{len(dfe_list)} delta file envelope lists..."
@@ -459,6 +665,9 @@ def _apply_upserts(
         table=table,
         primary_keys=input.primary_keys,
         can_drop_duplicates=input.drop_duplicates,
+        hb_index=hb_idx,
+        num_buckets=input.hash_bucket_count,
+        original_fields=input.original_fields,
         compacted_table=prev_table,
     )
     deduped_records = hb_table_record_count - len(table)
@@ -476,8 +685,7 @@ def _copy_manifests_from_hash_bucketing(
             hb_index_copy_by_reference_ids,
             input.round_completion_info,
             input.write_to_partition,
-            input.
-            input.deltacat_storage_kwargs,
+            input.compacted_manifest,
         )
     )
     logger.info(
@@ -494,9 +702,11 @@
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
-    with
-        f"merge_{worker_id}_{task_id}.bin"
-
+    with (
+        memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
+        if input.enable_profiler
+        else nullcontext()
+    ):
         total_input_records, total_deduped_records = 0, 0
         total_dropped_records = 0
         materialized_results: List[MaterializeResult] = []
@@ -515,11 +725,13 @@ def _timed_merge(input: MergeInput) -> MergeResult:
         ):
             hb_index_copy_by_ref_ids.append(merge_file_group.hb_index)
             continue
-
         if _has_previous_compacted_table(input, merge_file_group.hb_index):
             compacted_table = _download_compacted_table(
                 hb_index=merge_file_group.hb_index,
-
+                rci=input.round_completion_info,
+                primary_keys=input.primary_keys,
+                all_column_names=input.all_column_names,
+                compacted_delta_manifest=input.compacted_manifest,
                 read_kwargs_provider=input.read_kwargs_provider,
                 deltacat_storage=input.deltacat_storage,
                 deltacat_storage_kwargs=input.deltacat_storage_kwargs,
@@ -604,5 +816,5 @@ def merge(input: MergeInput) -> MergeResult:
         merge_result[3],
         merge_result[4],
         np.double(emit_metrics_time),
-        merge_result[
+        merge_result[6],
     )
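`_timed_merge` above now wraps its body in a conditional context manager so profiling is a no-op unless requested. A minimal sketch of the pattern (the `run` function and `profile_path` argument are illustrative, not deltacat APIs):

```python
from contextlib import nullcontext

def run(enable_profiler: bool, profile_path: str = "merge_profile.bin"):
    try:
        import memray
        # Only attach the tracker when profiling was asked for; nullcontext()
        # is a do-nothing stand-in that still satisfies the `with` protocol.
        tracker = memray.Tracker(profile_path) if enable_profiler else nullcontext()
    except ImportError:
        tracker = nullcontext()
    with tracker:
        return sum(i * i for i in range(1_000))

run(enable_profiler=False)
```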
deltacat/compute/compactor_v2/utils/content_type_params.py

@@ -1,19 +1,21 @@
 import logging
 import ray
 import functools
+from typing import List
 from deltacat.compute.compactor_v2.constants import (
     TASK_MAX_PARALLELISM,
     MAX_PARQUET_METADATA_SIZE,
 )
+from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
 from deltacat import logs
 from deltacat.storage import (
     Delta,
     ManifestEntry,
-
+    metastore,
 )
 from typing import Dict, Optional, Any
-from deltacat.types.media import
+from deltacat.types.media import DatasetType
 from deltacat.types.media import ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from deltacat.exceptions import RetryableError
@@ -73,13 +75,26 @@ class AppendContentTypeParamsCache:
 def _download_parquet_metadata_for_manifest_entry(
     delta: Delta,
     entry_index: int,
-
+    all_column_names: List[str],
+    deltacat_storage: metastore,
     deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
 ) -> Dict[str, Any]:
+    logger.info(
+        f"Downloading the parquet metadata for Delta with locator {delta.locator} and entry_index: {entry_index}"
+    )
+    if "file_reader_kwargs_provider" in deltacat_storage_kwargs:
+        logger.info(
+            "'file_reader_kwargs_provider' is also present in deltacat_storage_kwargs. Removing to prevent multiple values for keyword argument"
+        )
+        deltacat_storage_kwargs.pop("file_reader_kwargs_provider")
+
     pq_file = deltacat_storage.download_delta_manifest_entry(
         delta,
         entry_index=entry_index,
-        table_type=
+        table_type=DatasetType.PYARROW_PARQUET,
+        file_reader_kwargs_provider=file_reader_kwargs_provider,
+        all_column_names=all_column_names,
         **deltacat_storage_kwargs,
     )
 
@@ -93,15 +108,20 @@ def _download_parquet_metadata_for_manifest_entry(
 
 def append_content_type_params(
     delta: Delta,
+    all_column_names: List[str],
     task_max_parallelism: int = TASK_MAX_PARALLELISM,
     max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
-    deltacat_storage=
+    deltacat_storage: metastore = metastore,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
 ) -> bool:
     """
     This operation appends content type params into the delta entry. Note
     that this operation can be time consuming, hence we cache it in a Ray actor.
     """
+    logger.info(
+        f"Appending the content type params for Delta with locator {delta.locator}..."
+    )
 
     if not delta.meta:
         logger.warning(f"Delta with locator {delta.locator} doesn't contain meta.")
@@ -157,17 +177,25 @@ def append_content_type_params(
         max_parquet_meta_size_bytes=max_parquet_meta_size_bytes,
     )
 
+    # create a copy of deltacat_storage_kwargs without transaction key
+    deltacat_storage_kwargs_copy = {
+        k: v for k, v in deltacat_storage_kwargs.items() if k != "transaction"
+    }
+
     def input_provider(index, item) -> Dict:
         return {
-            "
+            "file_reader_kwargs_provider": file_reader_kwargs_provider,
+            "deltacat_storage_kwargs": deltacat_storage_kwargs_copy,
             "deltacat_storage": deltacat_storage,
             "delta": delta,
             "entry_index": item,
+            "all_column_names": all_column_names,
         }
 
     logger.info(
         f"Downloading parquet meta for {len(entry_indices_to_download)} manifest entries..."
     )
+
     pq_files_promise = invoke_parallel(
         entry_indices_to_download,
         ray_task=_download_parquet_metadata_for_manifest_entry,

deltacat/compute/compactor_v2/utils/primary_key_index.py

@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
         result[index] = np.arange(cl, dtype="int32")
 
     chunk_lengths = ([0] + chunk_lengths)[:-1]
-    result = pa.chunked_array(result + np.cumsum(chunk_lengths))
+    result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
     return result
 
 
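The `type=pa.int32()` pin added to `_create_chunked_index_array` above makes the chunked array's Arrow type explicit instead of leaving it to inference from the NumPy chunks. A small illustration (assumes a platform where NumPy's default integer is int64):

```python
import numpy as np
import pyarrow as pa

chunks = [np.array([0, 1, 2]), np.array([3, 4])]       # NumPy default integer dtype
print(pa.chunked_array(chunks).type)                   # int64 -- inferred from chunks
print(pa.chunked_array(chunks, type=pa.int32()).type)  # int32 -- pinned explicitly
```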
deltacat/compute/compactor_v2/utils/delta.py

@@ -9,7 +9,7 @@ from deltacat.storage import (
     Delta,
 )
 from deltacat.storage.model.delta import DeltaType
-from deltacat.storage import
+from deltacat.storage import metastore
 from deltacat.types.media import StorageType
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat import logs
@@ -30,8 +30,9 @@ def contains_delete_deltas(deltas: List[Delta]) -> bool:
 
 def read_delta_file_envelopes(
     annotated_delta: DeltaAnnotated,
+    all_column_names: List[str],
     read_kwargs_provider: Optional[ReadKwargsProvider],
-    deltacat_storage=
+    deltacat_storage: metastore = metastore,
     deltacat_storage_kwargs: Optional[dict] = None,
 ) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
     tables = deltacat_storage.download_delta(
@@ -39,6 +40,7 @@ def read_delta_file_envelopes(
         max_parallelism=1,
         file_reader_kwargs_provider=read_kwargs_provider,
         storage_type=StorageType.LOCAL,
+        all_column_names=all_column_names,
         **deltacat_storage_kwargs,
     )
     annotations = annotated_delta.annotations
@@ -80,7 +82,7 @@ def read_delta_file_envelopes(
 def get_local_delta_file_envelopes(
     uniform_deltas: List[DeltaAnnotated],
     read_kwargs_provider: Optional[ReadKwargsProvider],
-    deltacat_storage=
+    deltacat_storage=metastore,
     deltacat_storage_kwargs: Optional[dict] = None,
 ) -> Tuple[List[DeltaFileEnvelope], int]:
     local_dfe_list = []