deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/compute/converter/steps/convert.py

@@ -8,24 +8,33 @@ import ray
 import logging
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.steps.dedupe import dedupe_data_files
-from deltacat.compute.converter.utils.s3u import upload_table_with_retry
+from deltacat.compute.converter.utils.io import write_sliced_table
 from deltacat.compute.converter.utils.io import (
     download_data_table_and_append_iceberg_columns,
 )
 from deltacat.compute.converter.utils.converter_session_utils import (
     partition_value_record_to_partition_value_string,
+    sort_data_files_maintaining_order,
 )
 from deltacat.compute.converter.pyiceberg.overrides import (
     parquet_files_dict_to_iceberg_data_files,
 )
 from deltacat.compute.converter.model.convert_result import ConvertResult
+from pyiceberg.manifest import DataFileContent
 from deltacat import logs
+from fsspec import AbstractFileSystem
+from typing import List, Dict, Tuple, Optional, Any
+from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
+from deltacat.compute.converter.model.convert_input_files import (
+    DataFileList,
+    DataFileListGroup,
+)
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 @ray.remote
-def convert(convert_input: ConvertInput):
+def convert(convert_input: ConvertInput) -> ConvertResult:
     convert_input_files = convert_input.convert_input_files
     convert_task_index = convert_input.convert_task_index
     iceberg_table_warehouse_prefix = convert_input.iceberg_table_warehouse_prefix
@@ -39,8 +48,10 @@ def convert(convert_input: ConvertInput):
         convert_input.position_delete_for_multiple_data_files
     )
     max_parallel_data_file_download = convert_input.max_parallel_data_file_download
-    s3_file_system = convert_input.s3_file_system
+    filesystem = convert_input.filesystem
     s3_client_kwargs = convert_input.s3_client_kwargs
+    task_memory = convert_input.task_memory
+
     if not position_delete_for_multiple_data_files:
         raise NotImplementedError(
             f"Distributed file level position delete compute is not supported yet"
@@ -54,6 +65,7 @@ def convert(convert_input: ConvertInput):
     applicable_equality_delete_files = (
         convert_input_files.applicable_equality_delete_files
     )
+
     all_data_files_for_this_bucket = convert_input_files.all_data_files_for_dedupe
 
     partition_value_str = partition_value_record_to_partition_value_string(
@@ -69,11 +81,14 @@ def convert(convert_input: ConvertInput):
     iceberg_table_warehouse_prefix_with_partition = (
         f"{iceberg_table_warehouse_prefix}"
     )
+
     enforce_primary_key_uniqueness = convert_input.enforce_primary_key_uniqueness
     total_pos_delete_table = []
+    data_table_after_converting_equality_delete = []
     if applicable_equality_delete_files:
         (
-            pos_delete_after_converting_equality_delete
+            pos_delete_after_converting_equality_delete,
+            data_table_after_converting_equality_delete,
         ) = compute_pos_delete_with_limited_parallelism(
             data_files_list=applicable_data_files,
             identifier_columns=identifier_fields,
@@ -81,20 +96,35 @@ def convert(convert_input: ConvertInput):
             iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
             convert_task_index=convert_task_index,
             max_parallel_data_file_download=max_parallel_data_file_download,
-            s3_file_system=s3_file_system,
+            s3_file_system=filesystem,
             s3_client_kwargs=s3_client_kwargs,
         )
         if pos_delete_after_converting_equality_delete:
             total_pos_delete_table.append(pos_delete_after_converting_equality_delete)
 
     if enforce_primary_key_uniqueness:
+        data_files_downloaded_during_convert = []
+        if applicable_data_files:
+            for file_list in applicable_data_files:
+                for file in file_list:
+                    data_files_downloaded_during_convert.append(file)
+
         data_files_to_dedupe = get_additional_applicable_data_files(
             all_data_files=all_data_files_for_this_bucket,
-            data_files_downloaded=applicable_data_files,
+            data_files_downloaded=data_files_downloaded_during_convert,
+        )
+
+        dedupe_file_size_bytes = sum(
+            data_file.file_size_in_bytes for _, data_file in data_files_to_dedupe
+        )
+        logger.info(
+            f"Total on-disk size of files to dedupe: {dedupe_file_size_bytes} bytes"
        )
+
         logger.info(
             f"[Convert task {convert_task_index}]: Got {len(data_files_to_dedupe)} files to dedupe."
         )
+
         (
             pos_delete_after_dedupe,
             data_file_to_dedupe_record_count,
@@ -102,6 +132,7 @@ def convert(convert_input: ConvertInput):
         ) = dedupe_data_files(
             data_file_to_dedupe=data_files_to_dedupe,
             identifier_columns=identifier_fields,
+            remaining_data_table_after_convert=data_table_after_converting_equality_delete,
             merge_sort_column=sc._ORDERED_RECORD_IDX_COLUMN_NAME,
             s3_client_kwargs=s3_client_kwargs,
         )
@@ -118,11 +149,11 @@ def convert(convert_input: ConvertInput):
 
     to_be_added_files_list = []
     if total_pos_delete:
-        to_be_added_files_list_parquet = upload_table_with_retry(
+        to_be_added_files_list_parquet = write_sliced_table(
             table=total_pos_delete,
-            s3_url_prefix=iceberg_table_warehouse_prefix_with_partition,
-            s3_table_writer_kwargs={},
-            s3_file_system=s3_file_system,
+            base_path=iceberg_table_warehouse_prefix_with_partition,
+            table_writer_kwargs={},
+            filesystem=filesystem,
         )
 
         to_be_added_files_dict = defaultdict()
@@ -131,19 +162,39 @@ def convert(convert_input: ConvertInput):
         logger.info(
             f"[Convert task {convert_task_index}]: Produced {len(to_be_added_files_list_parquet)} position delete files."
         )
+        file_content_type = DataFileContent.POSITION_DELETES
         to_be_added_files_list = parquet_files_dict_to_iceberg_data_files(
             io=table_io,
             table_metadata=table_metadata,
             files_dict=to_be_added_files_dict,
+            file_content_type=file_content_type,
         )
 
     to_be_delete_files_dict = defaultdict()
+
     if applicable_equality_delete_files:
         to_be_delete_files_dict[partition_value] = [
             equality_delete_file[1]
-            for equality_delete_file in applicable_equality_delete_files
+            for equality_delete_list in applicable_equality_delete_files
+            for equality_delete_file in equality_delete_list
         ]
 
+    if not enforce_primary_key_uniqueness:
+        data_file_to_dedupe_record_count = 0
+        data_file_to_dedupe_size = 0
+
+    peak_memory_usage_bytes = (
+        get_current_process_peak_memory_usage_in_bytes()
+    )  # Convert KB to bytes
+    memory_usage_percentage = (peak_memory_usage_bytes / task_memory) * 100
+
+    logger.info(
+        f"[Convert task {convert_task_index}]: Memory usage stats - "
+        f"Peak memory usage: {peak_memory_usage_bytes} bytes, "
+        f"Allocated task memory: {convert_input.task_memory} bytes, "
+        f"Usage percentage: {memory_usage_percentage:.2f}%"
+    )
+
     convert_res = ConvertResult.of(
         convert_task_index=convert_task_index,
         to_be_added_files=to_be_added_files_list,
@@ -155,38 +206,73 @@ def convert(convert_input: ConvertInput):
         position_delete_on_disk_sizes=sum(
             file.file_size_in_bytes for file in to_be_added_files_list
         ),
+        input_data_files_on_disk_size=dedupe_file_size_bytes,
+        peak_memory_usage_bytes=peak_memory_usage_bytes,
+        memory_usage_percentage=memory_usage_percentage,
     )
     return convert_res
 
 
-def get_additional_applicable_data_files(all_data_files, data_files_downloaded):
-    data_file_to_dedupe = all_data_files
+def get_additional_applicable_data_files(
+    all_data_files: DataFileList,
+    data_files_downloaded: DataFileList,
+) -> DataFileList:
+    data_file_to_dedupe = []
+    assert len(set(all_data_files)) >= len(set(data_files_downloaded)), (
+        f"Length of all data files ({len(set(all_data_files))}) should never be less than "
+        f"the length of candidate equality delete data files ({len(set(data_files_downloaded))})"
+    )
    if data_files_downloaded:
-        data_file_to_dedupe = list(set(all_data_files) - set(data_files_downloaded))
+        # set1.difference(set2) returns elements in set1 but not in set2
+        data_file_to_dedupe.extend(
+            list(set(data_file_to_dedupe).difference(set(data_files_downloaded)))
+        )
+    else:
+        data_file_to_dedupe = all_data_files
     return data_file_to_dedupe
 
 
 def filter_rows_to_be_deleted(
-    equality_delete_table, data_file_table, identifier_columns
-):
-    identifier_column = identifier_columns[0]
+    equality_delete_table: Optional[pa.Table],
+    data_file_table: Optional[pa.Table],
+    identifier_columns: List[str],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    identifier_column = sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME
     if equality_delete_table and data_file_table:
         equality_deletes = pc.is_in(
             data_file_table[identifier_column],
             equality_delete_table[identifier_column],
         )
+        data_file_record_remaining = pc.invert(
+            pc.is_in(
+                data_file_table[identifier_column],
+                equality_delete_table[identifier_column],
+            )
+        )
         position_delete_table = data_file_table.filter(equality_deletes)
-    return position_delete_table
+        remaining_data_table = data_file_table.filter(data_file_record_remaining)
+
+        position_delete_table = position_delete_table.drop(
+            [sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME]
+        )
+        assert len(position_delete_table) + len(remaining_data_table) == len(
+            data_file_table
+        ), (
+            f"Expected undeleted data file record count plus length of pos deletes to match original data file record count of {len(data_file_table)}, "
+            f"but found {len(position_delete_table)} pos deletes + {len(remaining_data_table)} equality deletes."
+        )
+
+    return position_delete_table, remaining_data_table
 
 
 def compute_pos_delete_converting_equality_deletes(
-    equality_delete_table,
-    data_file_table,
-    identifier_columns,
-    iceberg_table_warehouse_prefix_with_partition,
-    s3_file_system,
-):
-    new_position_delete_table = filter_rows_to_be_deleted(
+    equality_delete_table: Optional[pa.Table],
+    data_file_table: Optional[pa.Table],
+    identifier_columns: List[str],
+    iceberg_table_warehouse_prefix_with_partition: str,
+    s3_file_system: Optional[AbstractFileSystem],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    new_position_delete_table, remaining_data_table = filter_rows_to_be_deleted(
         data_file_table=data_file_table,
         equality_delete_table=equality_delete_table,
         identifier_columns=identifier_columns,
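
The split that the updated `filter_rows_to_be_deleted` performs can be reproduced on a toy table with the same PyArrow compute calls. This is a minimal sketch for illustration only; the table contents and the `pk_hash` column name are made up, not taken from deltacat:

```python
# Minimal sketch of the is_in / invert split used above, on a hypothetical table.
import pyarrow as pa
import pyarrow.compute as pc

data_file_table = pa.table({"pk_hash": ["a", "b", "c", "d"], "pos": [0, 1, 2, 3]})
equality_delete_table = pa.table({"pk_hash": ["b", "d"]})

# Rows whose identifier hash appears in the equality deletes become position deletes;
# the inverted mask keeps the rows that survive.
deleted_mask = pc.is_in(data_file_table["pk_hash"], equality_delete_table["pk_hash"])
position_delete_table = data_file_table.filter(deleted_mask)             # rows b, d
remaining_data_table = data_file_table.filter(pc.invert(deleted_mask))   # rows a, c

# Every input row lands in exactly one of the two outputs.
assert len(position_delete_table) + len(remaining_data_table) == len(data_file_table)
```
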
@@ -195,34 +281,46 @@ def compute_pos_delete_converting_equality_deletes(
         logger.info(
             f"Length of position delete table after converting from equality deletes:{len(new_position_delete_table)}"
         )
+        return new_position_delete_table, remaining_data_table
+    elif not remaining_data_table:
+        return None, None
     else:
-        return None
-    return new_position_delete_table
+        return None, remaining_data_table
 
 
 def compute_pos_delete_with_limited_parallelism(
-    data_files_list,
-    identifier_columns,
-    equality_delete_files_list,
-    iceberg_table_warehouse_prefix_with_partition,
-    convert_task_index,
-    max_parallel_data_file_download,
-    s3_file_system,
-    s3_client_kwargs,
-):
+    data_files_list: DataFileListGroup,
+    identifier_columns: List[str],
+    equality_delete_files_list: DataFileListGroup,
+    iceberg_table_warehouse_prefix_with_partition: str,
+    convert_task_index: int,
+    max_parallel_data_file_download: int,
+    s3_file_system: Optional[AbstractFileSystem],
+    s3_client_kwargs: Optional[Dict[str, Any]],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    assert len(data_files_list) == len(equality_delete_files_list), (
+        f"Number of lists of data files should equal to number of list of equality delete files, "
+        f"But got {len(data_files_list)} data files lists vs {len(equality_delete_files_list)}."
+    )
+
+    new_pos_delete_table_total = []
     for data_files, equality_delete_files in zip(
         data_files_list, equality_delete_files_list
     ):
         data_table_total = []
+
+        # Sort data files by file sequence number first, then file path to
+        # make sure files having same sequence number are deterministically sorted
+        data_files = sort_data_files_maintaining_order(data_files=data_files)
+
         for data_file in data_files:
             data_table = download_data_table_and_append_iceberg_columns(
-                data_files=data_file[1],
+                file=data_file[1],
                 columns_to_download=identifier_columns,
                 additional_columns_to_append=[
                     sc._FILE_PATH_COLUMN_NAME,
                     sc._ORDERED_RECORD_IDX_COLUMN_NAME,
                 ],
-                sequence_number=data_file[0],
                 s3_client_kwargs=s3_client_kwargs,
             )
             data_table_total.append(data_table)
@@ -231,29 +329,38 @@ def compute_pos_delete_with_limited_parallelism(
         equality_delete_table_total = []
         for equality_delete in equality_delete_files:
             equality_delete_table = download_data_table_and_append_iceberg_columns(
-                data_files=equality_delete[1],
+                file=equality_delete[1],
                 columns_to_download=identifier_columns,
                 s3_client_kwargs=s3_client_kwargs,
             )
             equality_delete_table_total.append(equality_delete_table)
         equality_delete_table_total = pa.concat_tables(equality_delete_table_total)
 
-        new_pos_delete_table = compute_pos_delete_converting_equality_deletes(
-            equality_delete_table=equality_delete_table_total,
-            data_file_table=data_table_total,
-            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
-            identifier_columns=identifier_columns,
-            s3_file_system=s3_file_system,
-            s3_client_kwargs=s3_client_kwargs,
-        )
+        (
+            new_pos_delete_table,
+            remaining_data_table,
+        ) = compute_pos_delete_converting_equality_deletes(
+            equality_delete_table=equality_delete_table_total,
+            data_file_table=data_table_total,
+            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
+            identifier_columns=identifier_columns,
+            s3_file_system=s3_file_system,
+        )
+        new_pos_delete_table_total.append(new_pos_delete_table)
+
+    if new_pos_delete_table_total:
+        new_pos_delete_table_total = pa.concat_tables(new_pos_delete_table_total)
 
     logger.info(
         f"[Convert task {convert_task_index}]: Find deletes got {len(data_table_total)} data table records, "
         f"{len(equality_delete_table_total)} equality deletes as input, "
-        f"Produced {len(new_pos_delete_table)} position deletes based off find deletes input."
+        f"Produced {len(new_pos_delete_table_total)} position deletes based off find deletes input."
    )
 
-    if not new_pos_delete_table:
+    if not new_pos_delete_table_total:
         logger.info("No records deleted based on equality delete convertion")
 
-    return new_pos_delete_table
+    if not remaining_data_table:
+        logger.info("No data table remaining after converting equality deletes")
+
+    return new_pos_delete_table_total, remaining_data_table
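
The new memory accounting at the end of `convert()` compares the process's peak RSS against the memory the Ray task was scheduled with. The following is a rough standalone sketch of that pattern using only the standard library instead of `deltacat.utils.resources`; the 4 GiB task budget is a made-up figure and `resource` is POSIX-only:

```python
# Approximation of the peak-memory check added to convert(); ru_maxrss is reported
# in KB on Linux and in bytes on macOS, hence the platform branch below.
import resource
import sys

def peak_memory_usage_in_bytes() -> int:
    rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    return rss if sys.platform == "darwin" else rss * 1024

task_memory = 4 * 1024**3  # hypothetical memory allocated to the Ray task
peak = peak_memory_usage_in_bytes()
print(
    f"Peak memory usage: {peak} bytes, "
    f"Allocated task memory: {task_memory} bytes, "
    f"Usage percentage: {peak / task_memory * 100:.2f}%"
)
```
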
deltacat/compute/converter/steps/dedupe.py

@@ -4,25 +4,33 @@ import deltacat.compute.converter.utils.iceberg_columns as sc
 from deltacat.compute.converter.utils.io import (
     download_data_table_and_append_iceberg_columns,
 )
+from deltacat.compute.converter.utils.converter_session_utils import (
+    sort_data_files_maintaining_order,
+)
 import logging
 from deltacat import logs
+from typing import List, Dict, Tuple, Optional, Any
+from pyiceberg.manifest import DataFile
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def dedupe_data_files(
-    data_file_to_dedupe,
-    identifier_columns,
-    merge_sort_column,
-    s3_client_kwargs,
-):
+    data_file_to_dedupe: List[Tuple[int, DataFile]],
+    identifier_columns: List[str],
+    remaining_data_table_after_convert: Optional[pa.Table],
+    merge_sort_column: str,
+    s3_client_kwargs: Optional[Dict[str, Any]],
+) -> Tuple[pa.Table, int, int]:
     data_file_table = []
+    if remaining_data_table_after_convert:
+        data_file_table.append(remaining_data_table_after_convert)
 
+    data_file_to_dedupe = sort_data_files_maintaining_order(
+        data_files=data_file_to_dedupe
+    )
     downloaded_data_file_record_count = 0
-    # Sort data files by file sequence number first
-    data_file_to_dedupe = sorted(data_file_to_dedupe, key=lambda f: f[0])
     for file_tuple in data_file_to_dedupe:
-        sequence_number = file_tuple[0]
         data_file = file_tuple[1]
         data_file_to_dedupe_table = download_data_table_and_append_iceberg_columns(
             file=data_file,
@@ -31,17 +39,22 @@ def dedupe_data_files(
                 sc._FILE_PATH_COLUMN_NAME,
                 sc._ORDERED_RECORD_IDX_COLUMN_NAME,
             ],
-            sequence_number=sequence_number,
             s3_client_kwargs=s3_client_kwargs,
         )
+        logger.info(
+            f"Length of downloaded data file table: {len(data_file_to_dedupe_table)}"
+        )
         downloaded_data_file_record_count += len(data_file_to_dedupe_table)
         data_file_table.append(data_file_to_dedupe_table)
 
     final_data_to_dedupe = pa.concat_tables(data_file_table)
 
-    assert len(final_data_to_dedupe) == downloaded_data_file_record_count, (
+    dedupe_input_record_count = downloaded_data_file_record_count
+    if remaining_data_table_after_convert:
+        dedupe_input_record_count += len(remaining_data_table_after_convert)
+    assert len(final_data_to_dedupe) == dedupe_input_record_count, (
         f"Mismatch record count while performing table concat, Got {len(final_data_to_dedupe)} in final table, "
-        f"while input table length is: {downloaded_data_file_record_count}"
+        f"while input table length is: {dedupe_input_record_count}"
     )
 
     logger.info(f"Length of pyarrow table to dedupe:{len(final_data_to_dedupe)}")
deltacat/compute/converter/utils/convert_task_options.py

@@ -1,20 +1,27 @@
-from typing import Optional, Dict
+from typing import Optional, Dict, List, Tuple, Any
 from deltacat.exceptions import RetryableError
+from pyiceberg.manifest import DataFile
+from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
 
-AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 80
+AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 160
 AVERAGE_POS_COLUMN_SIZE_BYTES = 4
 XXHASH_BYTE_PER_RECORD = 8
 MEMORY_BUFFER_RATE = 2
-# TODO: Add audit info to check this number in practice
 # Worst case 2 as no duplicates exists across all pk
 PYARROW_AGGREGATE_MEMORY_MULTIPLIER = 2
+# Observed base memory usage at the beginning of each worker process
+BASE_MEMORY_BUFFER = 0.3 * 1024 * 1024 * 1024
 
 
-def estimate_fixed_hash_columns(hash_value_size_bytes_per_record, total_record_count):
+def estimate_fixed_hash_columns(
+    hash_value_size_bytes_per_record: int, total_record_count: int
+) -> int:
     return hash_value_size_bytes_per_record * total_record_count
 
 
-def get_total_record_from_iceberg_files(iceberg_files_list):
+def get_total_record_from_iceberg_files(
+    iceberg_files_list: List[Tuple[int, DataFile]]
+) -> int:
     total_record_count = 0
     # file are in form of tuple (sequence_number, DataFile)
     total_record_count += sum(file[1].record_count for file in iceberg_files_list)
@@ -22,8 +29,8 @@ def get_total_record_from_iceberg_files(iceberg_files_list):
 
 
 def estimate_iceberg_pos_delete_additional_columns(
-    include_columns, num_of_record_count
-):
+    include_columns: List[str], num_of_record_count: int
+) -> int:
     total_additional_columns_sizes = 0
     if "file_path" in include_columns:
         total_additional_columns_sizes += (
@@ -36,7 +43,10 @@
     return total_additional_columns_sizes
 
 
-def estimate_convert_remote_option_resources(data_files, equality_delete_files):
+def estimate_convert_remote_option_resources(
+    data_files: List[Tuple[int, DataFile]],
+    equality_delete_files: List[Tuple[int, DataFile]],
+) -> float:
     data_file_record_count = get_total_record_from_iceberg_files(data_files)
     equality_delete_record_count = get_total_record_from_iceberg_files(
         equality_delete_files
@@ -53,9 +63,9 @@ def estimate_convert_remote_option_resources(data_files, equality_delete_files):
 
 def _get_task_options(
     memory: float,
-    ray_custom_resources: Optional[Dict] = None,
+    ray_custom_resources: Optional[Dict[str, Any]] = None,
     scheduling_strategy: str = "SPREAD",
-) -> Dict:
+) -> Dict[str, Any]:
 
     # NOTE: With DEFAULT scheduling strategy in Ray 2.20.0, autoscaler does
     # not spin up enough nodes fast and hence we see only approximately
@@ -80,7 +90,9 @@ def _get_task_options(
     return task_opts
 
 
-def estimate_dedupe_memory(all_data_files_for_dedupe):
+def estimate_dedupe_memory(
+    all_data_files_for_dedupe: List[Tuple[int, DataFile]]
+) -> float:
     dedupe_record_count = get_total_record_from_iceberg_files(all_data_files_for_dedupe)
     produced_pos_memory_required = estimate_iceberg_pos_delete_additional_columns(
         ["file_path", "pos"], dedupe_record_count
@@ -95,13 +107,16 @@
     return memory_with_buffer
 
 
-def convert_resource_options_provider(index, convert_input_files):
+def convert_resource_options_provider(
+    index: int, convert_input_files: ConvertInputFiles
+) -> Dict[str, Any]:
     applicable_data_files = convert_input_files.applicable_data_files
     applicable_equality_delete_files = (
         convert_input_files.applicable_equality_delete_files
     )
     all_data_files_for_dedupe = convert_input_files.all_data_files_for_dedupe
     total_memory_required = 0
+    total_memory_required += BASE_MEMORY_BUFFER
     if applicable_data_files and applicable_equality_delete_files:
         memory_requirement_for_convert_equality_deletes = (
             estimate_convert_remote_option_resources(
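
For a sense of scale, the constants above translate into per-record byte costs that the estimators multiply out before adding the new 0.3 GiB worker baseline. The arithmetic below is illustrative only: the 10M-record input is hypothetical, and the real `estimate_dedupe_memory` combines terms that are not shown in this hunk:

```python
# Back-of-envelope use of the constants defined above; not the library's exact formula.
AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 160
AVERAGE_POS_COLUMN_SIZE_BYTES = 4
XXHASH_BYTE_PER_RECORD = 8
MEMORY_BUFFER_RATE = 2
BASE_MEMORY_BUFFER = 0.3 * 1024 * 1024 * 1024  # ~322 MiB observed worker baseline

records = 10_000_000  # hypothetical record count across the files to dedupe
pos_delete_bytes = records * (
    AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES + AVERAGE_POS_COLUMN_SIZE_BYTES
)
hash_bytes = records * XXHASH_BYTE_PER_RECORD
rough_estimate = BASE_MEMORY_BUFFER + (pos_delete_bytes + hash_bytes) * MEMORY_BUFFER_RATE
print(f"Rough task memory estimate: {rough_estimate / 1024**3:.2f} GiB")
```
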