deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/compute/converter/utils/converter_session_utils.py

@@ -1,97 +1,102 @@
 from collections import defaultdict
 import logging
 from deltacat import logs
-from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
+from deltacat.compute.converter.model.convert_input_files import (
+    ConvertInputFiles,
+    DataFileList,
+    DataFileListGroup,
+)
+from typing import List, Dict, Tuple, Any
+from enum import Enum
+from pyiceberg.manifest import DataFile
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-def check_data_files_sequence_number(data_files_list, equality_delete_files_list):
+def check_data_files_sequence_number(
+    data_files_list: DataFileList,
+    equality_delete_files_list: DataFileList,
+) -> Tuple[DataFileListGroup, DataFileListGroup]:
     # Sort by file sequence number
     data_files_list.sort(key=lambda file_tuple: file_tuple[0])
     equality_delete_files_list.sort(key=lambda file_tuple: file_tuple[0])
-
-    equality_delete_files = []
-    result_data_file = []
-
-    # Pointer for list data_file
-    data_file_pointer = 0
+    data_file_delete_applicable = []
+    result_eq_files_list = []
 
     # Loop through each value in equality_delete_file
-    for equality_file_tuple in equality_delete_files_list:
-        # Find all values in data_file that are smaller than val_equality
-        valid_values = []
+    for data_file_tuple in data_files_list:
+
+        # Find all values in equality delete file that having a larger sequence number than current data file
+        valid_values_eq = []
 
+        # Pointer for equality delete file
+        eq_file_pointer = 0
         # Move data_file_pointer to the first value in data_file that is smaller than val_equality
         while (
-            data_file_pointer < len(data_files_list)
-            and data_files_list[data_file_pointer][0] < equality_file_tuple[0]
+            eq_file_pointer < len(equality_delete_files_list)
+            and equality_delete_files_list[eq_file_pointer][0] > data_file_tuple[0]
         ):
-            valid_values.append(data_files_list[data_file_pointer])
-            data_file_pointer += 1
-        equality_delete_files.append(equality_file_tuple)
-
-        # Append the value from equality_delete_file and the corresponding valid values from data_file
-        if valid_values:
-            result_data_file.append(valid_values)
-
-    result_equality_delete_file = append_larger_sequence_number_data_files(
-        equality_delete_files
+            valid_values_eq.append(equality_delete_files_list[eq_file_pointer])
+            eq_file_pointer += 1
+
+        if valid_values_eq:
+            # Append the value for both applicable eq files list and applicable data files list
+            data_file_delete_applicable.append(data_file_tuple)
+            result_eq_files_list.append(valid_values_eq)
+
+    res_data_file_list = []
+    res_equality_delete_file_list = []
+    merged_file_dict = defaultdict(list)
+    for data_file_sublist, eq_delete_sublist in zip(
+        data_file_delete_applicable, result_eq_files_list
+    ):
+        merged_file_dict[tuple(eq_delete_sublist)].append(data_file_sublist)
+    for eq_file_list, data_file_list in merged_file_dict.items():
+        res_data_file_list.append(list(set(data_file_list)))
+        res_equality_delete_file_list.append(list(set(eq_file_list)))
+
+    assert len(res_data_file_list) == len(res_equality_delete_file_list), (
+        f"length of applicable data files list: {len(res_data_file_list)} "
+        f"should equal to length of equality delete files list:{len(res_equality_delete_file_list)}"
     )
 
-    return result_equality_delete_file, result_data_file
-
-
-def append_larger_sequence_number_data_files(data_files_list):
-    result = []
-    # Iterate over the input list
-    for i in range(len(data_files_list)):
-        sublist = data_files_list[i:]
-        sublist_file_list = []
-        for file in sublist:
-            sublist_file_list.append(file)
-        result.append(sublist_file_list)
-    return result
+    return res_equality_delete_file_list, res_data_file_list
 
 
 def construct_iceberg_table_prefix(
-    iceberg_warehouse_bucket_name, table_name, iceberg_namespace
-):
+    iceberg_warehouse_bucket_name: str, table_name: str, iceberg_namespace: str
+) -> str:
     return f"{iceberg_warehouse_bucket_name}/{iceberg_namespace}/{table_name}/data"
 
 
-def partition_value_record_to_partition_value_string(partition):
+def partition_value_record_to_partition_value_string(partition: Any) -> str:
     # Get string representation of partition value out of Record[partition_value]
     partition_value_str = partition.__repr__().split("[", 1)[1].split("]")[0]
     return partition_value_str
 
 
 def group_all_files_to_each_bucket(
-    data_file_dict, equality_delete_dict, pos_delete_dict
-):
+    data_file_dict: Dict[Any, DataFileList],
+    equality_delete_dict: Dict[Any, DataFileList],
+    pos_delete_dict: Dict[Any, DataFileList],
+) -> List[ConvertInputFiles]:
     convert_input_files_for_all_buckets = []
     files_for_each_bucket_for_deletes = defaultdict(tuple)
     if equality_delete_dict:
         for partition_value, equality_delete_file_list in equality_delete_dict.items():
-            (
-                result_equality_delete_file,
-                result_data_file,
-            ) = check_data_files_sequence_number(
-                data_files_list=data_file_dict[partition_value],
-                equality_delete_files_list=equality_delete_dict[partition_value],
-            )
-            files_for_each_bucket_for_deletes[partition_value] = (
-                result_data_file,
-                result_equality_delete_file,
-                [],
-            )
-            if partition_value not in data_file_dict:
-                convert_input_file = ConvertInputFiles.of(
-                    partition_value=partition_value,
-                    applicable_data_files=result_data_file,
-                    applicable_equalitu_delete_files=result_equality_delete_file,
+            if partition_value in data_file_dict:
+                (
+                    result_equality_delete_file,
+                    result_data_file,
+                ) = check_data_files_sequence_number(
+                    data_files_list=data_file_dict[partition_value],
+                    equality_delete_files_list=equality_delete_dict[partition_value],
+                )
+                files_for_each_bucket_for_deletes[partition_value] = (
+                    result_data_file,
+                    result_equality_delete_file,
+                    [],
                 )
-                convert_input_files_for_all_buckets.append(convert_input_file)
 
     for partition_value, all_data_files_for_each_bucket in data_file_dict.items():
         convert_input_file = ConvertInputFiles.of(
@@ -102,8 +107,69 @@ def group_all_files_to_each_bucket(
         convert_input_file.applicable_data_files = (
             files_for_each_bucket_for_deletes[partition_value][0]
         )
-        convert_input_file.applicable_delete_files = (
+        convert_input_file.applicable_equality_delete_files = (
             files_for_each_bucket_for_deletes[partition_value][1]
         )
         convert_input_files_for_all_buckets.append(convert_input_file)
     return convert_input_files_for_all_buckets
+
+
+def sort_data_files_maintaining_order(data_files: DataFileList) -> DataFileList:
+    """
+    Sort data files deterministically based on two criterias:
+    1. Sequence number: Newly added files will have a higher sequence number
+    2. File path: If file sequence is the same, files are guaranteed to be returned in a deterministic order since file path is unique.
+    """
+    if data_files:
+        data_files = sorted(data_files, key=lambda f: (f[0], f[1].file_path))
+    return data_files
+
+
+class SnapshotType(Enum):
+    """Enumeration of possible snapshot types."""
+
+    NONE = "none"
+    APPEND = "append"
+    REPLACE = "replace"
+    DELETE = "delete"
+
+
+def _get_snapshot_action_description(
+    snapshot_type: SnapshotType,
+    files_to_delete: List[List[DataFile]],
+    files_to_add: List[DataFile],
+) -> str:
+    """Get a human-readable description of the snapshot action."""
+    descriptions = {
+        SnapshotType.NONE: "No changes needed",
+        SnapshotType.APPEND: f"Adding {len(files_to_add)} new files",
+        SnapshotType.REPLACE: f"Replacing {sum(len(files) for files in files_to_delete)} files with {len(files_to_add)} new files",
+        SnapshotType.DELETE: f"Deleting {sum(len(files) for files in files_to_delete)} files",
+    }
+    return descriptions[snapshot_type]
+
+
+def _determine_snapshot_type(
+    to_be_deleted_files: List[List[DataFile]], to_be_added_files: List[DataFile]
+) -> SnapshotType:
+    """
+    Determine the snapshot type based on file changes.
+
+    Args:
+        to_be_deleted_files: List of files to be deleted
+        to_be_added_files: List of files to be added
+
+    Returns:
+        SnapshotType indicating what kind of snapshot to commit
+    """
+    has_files_to_delete = bool(to_be_deleted_files)
+    has_files_to_add = bool(to_be_added_files)
+
+    if not has_files_to_delete and not has_files_to_add:
+        return SnapshotType.NONE
+    elif not has_files_to_delete and has_files_to_add:
+        return SnapshotType.APPEND
+    elif has_files_to_delete and has_files_to_add:
+        return SnapshotType.REPLACE
+    else:  # has_files_to_delete and not has_files_to_add
+        return SnapshotType.DELETE
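
Note: a minimal standalone trace of the matching loop in the rewritten check_data_files_sequence_number above, using (sequence_number, file_name) tuples as stand-ins for the real (sequence number, DataFile) pairs; the sample values are hypothetical. The loop collects, for each data file, the leading run of equality delete files whose sequence number is strictly greater:

    # Stand-in inputs: (sequence_number, file_name) tuples, hypothetical values.
    data_files = [(1, "d1.parquet"), (3, "d2.parquet")]
    eq_deletes = [(2, "eq1.parquet"), (4, "eq2.parquet")]

    # Same steps as the diff: sort both lists by sequence number, then scan
    # the equality delete list from the front for each data file.
    data_files.sort(key=lambda t: t[0])
    eq_deletes.sort(key=lambda t: t[0])
    for data_file_tuple in data_files:
        valid_values_eq, eq_file_pointer = [], 0
        while (
            eq_file_pointer < len(eq_deletes)
            and eq_deletes[eq_file_pointer][0] > data_file_tuple[0]
        ):
            valid_values_eq.append(eq_deletes[eq_file_pointer])
            eq_file_pointer += 1
        print(data_file_tuple[1], "->", [name for _, name in valid_values_eq])
    # d1.parquet -> ['eq1.parquet', 'eq2.parquet']
    # d2.parquet -> []  (the front-of-list scan stops at eq1's lower sequence number)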
deltacat/compute/converter/utils/iceberg_columns.py

@@ -1,5 +1,5 @@
 import pyarrow as pa
-from typing import Union
+from typing import Union, Iterator, Any
 import numpy as np
 
 # Refer to: https://iceberg.apache.org/spec/#reserved-field-ids for reserved field ids
@@ -9,7 +9,7 @@ ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN = 2147483546
 ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN = 2147483545
 
 
-def _get_iceberg_col_name(suffix):
+def _get_iceberg_col_name(suffix: str) -> str:
     return suffix
 
 
@@ -26,15 +26,16 @@ _ORDERED_RECORD_IDX_COLUMN_FIELD = pa.field(
 )
 
 
-def get_record_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
+def get_record_index_column_array(obj: Any) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(
         obj,
         _ORDERED_RECORD_IDX_COLUMN_TYPE,
     )
 
 
-def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
-
+def append_record_idx_col(
+    table: pa.Table, ordered_record_indices: Iterator[int]
+) -> pa.Table:
     table = table.append_column(
         _ORDERED_RECORD_IDX_COLUMN_FIELD,
         get_record_index_column_array(ordered_record_indices),
@@ -55,7 +56,7 @@ _FILE_PATH_COLUMN_FIELD = pa.field(
 )
 
 
-def append_file_path_column(table: pa.Table, file_path: str):
+def append_file_path_column(table: pa.Table, file_path: str) -> pa.Table:
     table = table.append_column(
         _FILE_PATH_COLUMN_FIELD,
         pa.array(np.repeat(file_path, len(table)), _FILE_PATH_COLUMN_TYPE),
@@ -72,11 +73,15 @@ _GLOBAL_RECORD_IDX_COLUMN_FIELD = pa.field(
 
 
 def append_global_record_idx_column(
-    table: pa.Table, ordered_record_indices
+    table: pa.Table, ordered_record_indices: Iterator[int]
 ) -> pa.Table:
-
     table = table.append_column(
         _GLOBAL_RECORD_IDX_COLUMN_NAME,
         pa.array(ordered_record_indices, _GLOBAL_RECORD_IDX_COLUMN_TYPE),
     )
     return table
+
+
+_IDENTIFIER_COLUMNS_HASH_COLUMN_NAME = _get_iceberg_col_name(
+    "identifier_columns_hashed"
+)
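
Note: the append_file_path_column change above broadcasts a single file path across every row of a pyarrow Table. A short self-contained sketch of that pattern; the column name and values here are illustrative, not the module's reserved Iceberg field:

    import numpy as np
    import pyarrow as pa

    table = pa.table({"id": [1, 2, 3]})
    file_path = "s3://bucket/data/part-0001.parquet"  # hypothetical path
    # Repeat the scalar path len(table) times and append it as a new column.
    table = table.append_column(
        pa.field("file_path", pa.string()),
        pa.array(np.repeat(file_path, len(table)), pa.string()),
    )
    print(table.column("file_path").to_pylist())  # the same path, three times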
deltacat/compute/converter/utils/io.py

@@ -1,26 +1,57 @@
+import logging
+
+from fsspec import AbstractFileSystem
+from deltacat import logs
 import deltacat.compute.converter.utils.iceberg_columns as sc
 import daft
+from deltacat.utils.daft import _get_s3_io_config
+from daft import TimeUnit, DataFrame
+import pyarrow as pa
+from typing import Callable, Optional, List, Dict, Any
+from deltacat.utils.pyarrow import sliced_string_cast
+from deltacat.compute.converter.constants import IDENTIFIER_FIELD_DELIMITER
+from deltacat.compute.converter.utils.s3u import upload_table_with_retry
+from pyiceberg.manifest import DataFile
+import pyarrow.compute as pc
+from deltacat.types.media import ContentType
+from deltacat.types.tables import (
+    get_table_writer,
+    get_table_slicer,
+    write_sliced_table as types_write_sliced_table,
+)
+from deltacat.storage import LocalTable, DistributedDataset
+from typing import Union
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def download_data_table_and_append_iceberg_columns(
-    file, columns_to_download, additional_columns_to_append, sequence_number
-):
-    # TODO; add S3 client kwargs
+    file: DataFile,
+    columns_to_download: List[str],
+    additional_columns_to_append: Optional[List[str]] = [],
+    s3_client_kwargs: Optional[Dict[str, Any]] = None,
+) -> pa.Table:
     table = download_parquet_with_daft_hash_applied(
-        identify_columns=columns_to_download, file=file, s3_client_kwargs={}
+        identifier_columns=columns_to_download,
+        file=file,
+        s3_client_kwargs=s3_client_kwargs,
     )
+
     if sc._FILE_PATH_COLUMN_NAME in additional_columns_to_append:
         table = sc.append_file_path_column(table, file.file_path)
     if sc._ORDERED_RECORD_IDX_COLUMN_NAME in additional_columns_to_append:
         record_idx_iterator = iter(range(len(table)))
         table = sc.append_record_idx_col(table, record_idx_iterator)
+
     return table
 
 
 def download_parquet_with_daft_hash_applied(
-    identify_columns, file, s3_client_kwargs, **kwargs
-):
-    from daft import TimeUnit
+    identifier_columns: List[str],
+    file: DataFile,
+    s3_client_kwargs: Optional[Dict[str, Any]],
+    **kwargs: Any,
+) -> pa.Table:
 
     # TODO: Add correct read kwargs as in:
     # https://github.com/ray-project/deltacat/blob/383855a4044e4dfe03cf36d7738359d512a517b4/deltacat/utils/daft.py#L97
@@ -29,15 +60,144 @@ def download_parquet_with_daft_hash_applied(
         kwargs.get("coerce_int96_timestamp_unit", "ms")
     )
 
-    from deltacat.utils.daft import _get_s3_io_config
-
     # TODO: Use Daft SHA1 hash instead to minimize probably of data corruption
     io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
-    df = daft.read_parquet(
+    df = daft_read_parquet(
         path=file.file_path,
         io_config=io_config,
        coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
     )
-    df = df.select(daft.col(identify_columns[0]).hash())
-    arrow_table = df.to_arrow()
-    return arrow_table
+
+    hash_column = concatenate_hashed_identifier_columns(
+        df=df, identifier_columns=identifier_columns
+    )
+
+    table = pa.Table.from_arrays(
+        [hash_column], names=[sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME]
+    )
+
+    return table
+
+
+def daft_read_parquet(
+    path: str, io_config: Dict[str, Any], coerce_int96_timestamp_unit: TimeUnit
+) -> DataFrame:
+    df = daft.read_parquet(
+        path=path,
+        io_config=io_config,
+        coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+    )
+    return df
+
+
+def concatenate_hashed_identifier_columns(
+    df: DataFrame, identifier_columns: List[str]
+) -> pa.Array:
+    pk_hash_columns = []
+    previous_hash_column_length = None
+    for i in range(len(identifier_columns)):
+        pk_hash_column = df.select(daft.col(identifier_columns[i]).hash())
+        pk_hash_column_arrow = pk_hash_column.to_arrow()
+
+        # Assert that each hash column downloaded are same length to ensure we don't create mismatch between columns.
+        if not previous_hash_column_length:
+            previous_hash_column_length = len(pk_hash_column_arrow)
+        else:
+            assert previous_hash_column_length == len(pk_hash_column_arrow), (
+                f"Identifier column Length mismatch: {identifier_columns[i]} has length {len(pk_hash_column_arrow)} "
+                f"but expected {previous_hash_column_length}."
+            )
+            previous_hash_column_length = len(pk_hash_column_arrow)
+
+        # Convert identifier from different datatypes to string here
+        pk_hash_column_str = sliced_string_cast(
+            pk_hash_column_arrow[identifier_columns[i]]
+        )
+        assert len(pk_hash_column_str) == previous_hash_column_length, (
+            f"Casting column Length mismatch: {identifier_columns[i]} has length {len(pk_hash_column_str)} after casting, "
+            f"before casting length: {previous_hash_column_length}."
+        )
+
+        pk_hash_columns.append(pk_hash_column_str)
+
+    pk_hash_columns.append(IDENTIFIER_FIELD_DELIMITER)
+    pk_hash_columns_concatenated = pc.binary_join_element_wise(
+        *pk_hash_columns, null_handling="replace"
+    )
+    assert len(pk_hash_columns_concatenated) == previous_hash_column_length, (
+        f"Concatenated column Length mismatch: Final concatenated identifier column has length {len(pk_hash_columns_concatenated)}, "
+        f"before concatenating length: {previous_hash_column_length}."
+    )
+
+    return pk_hash_columns_concatenated
+
+
+def write_sliced_table(
+    table: Union[LocalTable, DistributedDataset],
+    base_path: str,
+    table_writer_kwargs: Optional[Dict[str, Any]],
+    content_type: ContentType = ContentType.PARQUET,
+    max_records_per_file: Optional[int] = 4000000,
+    filesystem: Optional[Union[AbstractFileSystem, pa.fs.FileSystem]] = None,
+    **kwargs,
+) -> List[str]:
+    """
+    Writes the given table to 1 or more files and return the paths
+    of the files written.
+    """
+    if isinstance(filesystem, pa.fs.FileSystem):
+        table_writer_fn = get_table_writer(table)
+        table_slicer_fn = get_table_slicer(table)
+
+        # Create a wrapper for the table writer that ensures directory creation
+        def table_writer_with_dir_creation(
+            dataframe: Any,
+            base_path: str,
+            filesystem: Optional[Union[AbstractFileSystem, pa.fs.FileSystem]],
+            block_path_provider: Callable,
+            content_type: str = ContentType.PARQUET.value,
+            **kwargs,
+        ):
+            try:
+                # Ensure base path directory exists
+                if isinstance(base_path, str):
+                    # Normalize the base path and ensure it's treated as a directory path
+                    base_dir = base_path.rstrip("/")
+                    filesystem.create_dir(base_dir, recursive=True)
+            except Exception:
+                # Directory might already exist or there might be permission issues
+                # Let the original write attempt proceed
+                pass
+            return table_writer_fn(
+                dataframe,
+                base_path,
+                filesystem,
+                block_path_provider,
+                content_type,
+                **kwargs,
+            )
+
+        # TODO(pdames): Disable redundant file info fetch currently
+        # used to construct unused manifest entry metadata.
+        manifest_entry_list = types_write_sliced_table(
+            table=table,
+            base_path=base_path,
+            filesystem=filesystem,
+            max_records_per_entry=max_records_per_file,
+            table_writer_fn=table_writer_with_dir_creation,
+            table_slicer_fn=table_slicer_fn,
+            table_writer_kwargs=table_writer_kwargs,
+            content_type=content_type,
+        )
+        paths = [entry.uri for entry in manifest_entry_list]
+        return paths
+    else:
+        return upload_table_with_retry(
+            table=table,
+            s3_url_prefix=base_path,
+            s3_table_writer_kwargs=table_writer_kwargs,
+            content_type=content_type,
+            max_records_per_file=max_records_per_file,
+            s3_file_system=filesystem,
+            **kwargs,
+        )
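
Note: the new concatenate_hashed_identifier_columns above leans on pyarrow's binary_join_element_wise, where the final positional argument is the per-row separator. A minimal sketch with stand-in hash strings (the real code joins Daft-hashed identifier columns using IDENTIFIER_FIELD_DELIMITER):

    import pyarrow.compute as pc

    hashes_a = ["h1", "h2", None]  # stand-in per-row hashes for one identifier column
    hashes_b = ["x1", "x2", "x3"]  # stand-in per-row hashes for another
    # The trailing argument is the separator; null_handling="replace" substitutes
    # an empty string for null inputs instead of emitting a null result row.
    joined = pc.binary_join_element_wise(hashes_a, hashes_b, "|", null_handling="replace")
    print(joined.to_pylist())  # ['h1|x1', 'h2|x2', '|x3']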
deltacat/compute/converter/utils/s3u.py

@@ -4,14 +4,16 @@ from tenacity import (
     stop_after_delay,
     wait_random_exponential,
 )
-from typing import Union
-from deltacat.aws.s3u import CapturedBlockWritePaths, UuidBlockWritePathProvider
+from typing import Union, Optional, Dict, Any, List, Callable
+from deltacat.types.tables import (
+    CapturedBlockWritePaths,
+    UuidBlockWritePathProvider,
+)
 from deltacat.types.tables import (
     get_table_writer,
     get_table_length,
     TABLE_CLASS_TO_SLICER_FUNC,
 )
-from typing import Optional, Dict, Any, List
 from deltacat.exceptions import RetryableError
 from deltacat.storage import (
     DistributedDataset,
@@ -21,19 +23,22 @@ from deltacat.types.media import (
     ContentEncoding,
     ContentType,
 )
-from deltacat.aws.s3u import UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY
+from deltacat.constants import UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY
 import s3fs
+import boto3
+from boto3.session import Session
+from botocore.credentials import Credentials
 
 
-def get_credential():
-    import boto3
-
-    boto3_session = boto3.Session()
-    credentials = boto3_session.get_credentials()
+def get_credential() -> Credentials:
+    boto3_session: Session = boto3.Session()
+    credentials: Credentials = boto3_session.get_credentials()
     return credentials
 
 
 def get_s3_file_system(content_type):
+    import s3fs  # noqa: F401
+
     token_holder = get_credential()
     content_encoding = ContentEncoding.IDENTITY
 
@@ -57,12 +62,12 @@ def upload_table_with_retry(
     s3_table_writer_kwargs: Optional[Dict[str, Any]],
     content_type: ContentType = ContentType.PARQUET,
     max_records_per_file: Optional[int] = 4000000,
-    s3_file_system=None,
-    **s3_client_kwargs,
+    filesystem: Optional[s3fs.S3FileSystem] = None,
+    **s3_client_kwargs: Any,
 ) -> List[str]:
     """
-    Writes the given table to 1 or more S3 files and return Redshift
-    manifest entries describing the uploaded files.
+    Writes the given table to 1 or more S3 files and return the paths
+    of the S3 files written.
     """
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
@@ -73,11 +78,11 @@ def upload_table_with_retry(
     if s3_table_writer_kwargs is None:
         s3_table_writer_kwargs = {}
 
-    if not s3_file_system:
-        s3_file_system = get_s3_file_system(content_type=content_type)
+    if not filesystem:
+        filesystem = get_s3_file_system(content_type=content_type)
     capture_object = CapturedBlockWritePaths()
     block_write_path_provider = UuidBlockWritePathProvider(
-        capture_object=capture_object
+        capture_object=capture_object, base_path=s3_url_prefix
     )
     s3_table_writer_func = get_table_writer(table)
     table_record_count = get_table_length(table)
@@ -86,7 +91,7 @@ def upload_table_with_retry(
            fn=upload_table,
            table_slices=table,
            s3_base_url=f"{s3_url_prefix}",
-            s3_file_system=s3_file_system,
+            s3_file_system=filesystem,
            s3_table_writer_func=s3_table_writer_func,
            s3_table_writer_kwargs=s3_table_writer_kwargs,
            block_write_path_provider=block_write_path_provider,
@@ -101,7 +106,7 @@ def upload_table_with_retry(
                fn=upload_table,
                table_slices=table_slice,
                s3_base_url=f"{s3_url_prefix}",
-                s3_file_system=s3_file_system,
+                s3_file_system=filesystem,
                s3_table_writer_func=s3_table_writer_func,
                s3_table_writer_kwargs=s3_table_writer_kwargs,
                block_write_path_provider=block_write_path_provider,
@@ -110,18 +115,28 @@ def upload_table_with_retry(
     )
     del block_write_path_provider
     write_paths = capture_object.write_paths()
-    return write_paths
+    s3_write_paths = []
+    for path in write_paths:
+        s3_write_path = construct_s3_url(path)
+        s3_write_paths.append(s3_write_path)
+    return s3_write_paths
+
+
+def construct_s3_url(path: Optional[str]) -> Optional[str]:
+    if path:
+        return f"s3://{path}"
+    return None
 
 
 def upload_table(
-    table_slices,
-    s3_base_url,
-    s3_file_system,
-    s3_table_writer_func,
-    block_write_path_provider,
-    content_type,
-    s3_table_writer_kwargs,
-):
+    table_slices: Union[LocalTable, DistributedDataset],
+    s3_base_url: str,
+    s3_file_system: s3fs.S3FileSystem,
+    s3_table_writer_func: Callable,
+    block_write_path_provider: UuidBlockWritePathProvider,
+    content_type: ContentType,
+    s3_table_writer_kwargs: Dict[str, Any],
+) -> None:
     s3_table_writer_func(
         table_slices,
         s3_base_url,
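
Note: the diff truncates inside upload_table, but the retry scaffolding in upload_table_with_retry is fully visible above: a tenacity Retrying object wrapping each upload call, with captured write paths mapped through the new construct_s3_url. A self-contained sketch of that pattern using a stand-in flaky upload; the retry predicate and the stop delay are assumptions, since the diff only shows the wait strategy and the RetryableError and UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY imports:

    from tenacity import (
        Retrying,
        retry_if_exception_type,
        stop_after_delay,
        wait_random_exponential,
    )

    class RetryableError(Exception):  # stand-in for deltacat.exceptions.RetryableError
        pass

    attempts = {"count": 0}

    def flaky_upload():
        attempts["count"] += 1
        if attempts["count"] < 3:
            raise RetryableError("transient S3 failure")  # fails twice, then succeeds
        return ["my-bucket/prefix/part-0.parquet"]

    retrying = Retrying(
        wait=wait_random_exponential(multiplier=1, max=60),
        stop=stop_after_delay(30 * 60),  # stand-in for the imported delay constant
        retry=retry_if_exception_type(RetryableError),
    )
    write_paths = retrying(flaky_upload)
    # Mirrors construct_s3_url(): bare bucket/key paths become s3:// URLs.
    print([f"s3://{p}" for p in write_paths if p])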