deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/compute/converter/utils/convert_task_options.py ADDED
@@ -0,0 +1,88 @@
+ from typing import Optional, Dict
+ from deltacat.exceptions import RetryableError
+
+ AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 80
+ AVERAGE_POS_COLUMN_SIZE_BYTES = 4
+ XXHASH_BYTE_PER_RECORD = 8
+ MEMORY_BUFFER_RATE = 1.2
+
+
+ def estimate_fixed_hash_columns(hash_value_size_bytes_per_record, total_record_count):
+     return hash_value_size_bytes_per_record * total_record_count
+
+
+ def get_total_record_from_iceberg_files(iceberg_files_list):
+     total_record_count = 0
+     for iceberg_files in iceberg_files_list:
+         total_record_count += sum(file.record_count for file in iceberg_files)
+     return total_record_count
+
+
+ def estimate_iceberg_pos_delete_additional_columns(
+     include_columns, num_of_record_count
+ ):
+     total_additional_columns_sizes = 0
+     if "file_path" in include_columns:
+         total_additional_columns_sizes += (
+             AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES * num_of_record_count
+         )
+     elif "pos" in include_columns:
+         total_additional_columns_sizes += (
+             AVERAGE_POS_COLUMN_SIZE_BYTES * num_of_record_count
+         )
+     return total_additional_columns_sizes
+
+
+ def estimate_convert_remote_option_resources(data_files, equality_delete_files):
+     data_file_record_count = get_total_record_from_iceberg_files(data_files)
+     equality_delete_record_count = get_total_record_from_iceberg_files(
+         equality_delete_files
+     )
+     hash_column_sizes = estimate_fixed_hash_columns(
+         XXHASH_BYTE_PER_RECORD, data_file_record_count + equality_delete_record_count
+     )
+     pos_delete_sizes = estimate_iceberg_pos_delete_additional_columns(
+         ["file_path", "pos"], data_file_record_count + equality_delete_record_count
+     )
+     total_memory_required = hash_column_sizes + pos_delete_sizes
+     return total_memory_required * MEMORY_BUFFER_RATE
+
+
+ def _get_task_options(
+     memory: float,
+     ray_custom_resources: Optional[Dict] = None,
+     scheduling_strategy: str = "SPREAD",
+ ) -> Dict:
+
+     # NOTE: With the DEFAULT scheduling strategy in Ray 2.20.0, the autoscaler
+     # does not spin up new nodes fast enough, so only approximately 20 out of
+     # 100 queued tasks get scheduled. Hence, we use SPREAD, which is also
+     # ideal for merge and hash bucket tasks.
+     # https://docs.ray.io/en/latest/ray-core/scheduling/index.html
+     task_opts = {
+         "memory": memory,
+         "scheduling_strategy": scheduling_strategy,
+     }
+
+     if ray_custom_resources:
+         task_opts["resources"] = ray_custom_resources
+
+     task_opts["max_retries"] = 3
+
+     # A list of possible botocore exceptions is available at
+     # https://github.com/boto/botocore/blob/develop/botocore/exceptions.py
+     task_opts["retry_exceptions"] = [RetryableError]
+
+     return task_opts
+
+
+ def convert_resource_options_provider(index, files_for_each_bucket):
+     (
+         data_files_list,
+         equality_delete_files_list,
+         position_delete_files_list,
+     ) = files_for_each_bucket[1]
+     memory_requirement = estimate_convert_remote_option_resources(
+         data_files_list, equality_delete_files_list
+     )
+     return _get_task_options(memory=memory_requirement)
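The estimator above prices a convert task as 8 bytes of xxhash per record plus 80 bytes of `file_path` per record across all data and equality delete records, padded by a 1.2x memory buffer. A minimal sketch of the arithmetic (not part of the diff), using hypothetical `SimpleNamespace` stubs since the estimator only reads `record_count`:

```python
from types import SimpleNamespace

from deltacat.compute.converter.utils.convert_task_options import (
    estimate_convert_remote_option_resources,
)

# Stub Iceberg file entries; record_count is the only attribute read.
data_files = [[SimpleNamespace(record_count=1_000_000)]]
equality_delete_files = [[SimpleNamespace(record_count=500_000)]]

# hash columns: 8 B * 1.5M records = 12 MB
# additional columns: 80 B * 1.5M records = 120 MB ("file_path" branch; note
# the elif means "pos" is not also counted when "file_path" is requested)
# total: (12 MB + 120 MB) * 1.2 buffer = 158.4 MB
memory = estimate_convert_remote_option_resources(data_files, equality_delete_files)
print(memory)  # 158400000.0
```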
deltacat/compute/converter/utils/converter_session_utils.py ADDED
@@ -0,0 +1,109 @@
+ from collections import defaultdict
+ import logging
+ from deltacat import logs
+ from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def check_data_files_sequence_number(data_files_list, equality_delete_files_list):
+     # Sort both lists by file sequence number
+     data_files_list.sort(key=lambda file_tuple: file_tuple[0])
+     equality_delete_files_list.sort(key=lambda file_tuple: file_tuple[0])
+
+     equality_delete_files = []
+     result_data_file = []
+
+     # Pointer into data_files_list
+     data_file_pointer = 0
+
+     # Loop through each equality delete file
+     for equality_file_tuple in equality_delete_files_list:
+         # Collect all data files with a smaller sequence number than the
+         # current equality delete file
+         valid_values = []
+
+         # Advance data_file_pointer past every data file whose sequence
+         # number is smaller than the current equality delete file's
+         while (
+             data_file_pointer < len(data_files_list)
+             and data_files_list[data_file_pointer][0] < equality_file_tuple[0]
+         ):
+             valid_values.append(data_files_list[data_file_pointer])
+             data_file_pointer += 1
+         equality_delete_files.append(equality_file_tuple)
+
+         # Record the data files that this equality delete file applies to
+         if valid_values:
+             result_data_file.append(valid_values)
+
+     result_equality_delete_file = append_larger_sequence_number_data_files(
+         equality_delete_files
+     )
+
+     return result_equality_delete_file, result_data_file
+
+
+ def append_larger_sequence_number_data_files(data_files_list):
+     result = []
+     # For each file, emit the suffix of the sorted list starting at that file,
+     # i.e., the file plus all files with a larger sequence number
+     for i in range(len(data_files_list)):
+         sublist = data_files_list[i:]
+         sublist_file_list = []
+         for file in sublist:
+             sublist_file_list.append(file)
+         result.append(sublist_file_list)
+     return result
+
+
+ def construct_iceberg_table_prefix(
+     iceberg_warehouse_bucket_name, table_name, iceberg_namespace
+ ):
+     return f"{iceberg_warehouse_bucket_name}/{iceberg_namespace}/{table_name}/data"
+
+
+ def partition_value_record_to_partition_value_string(partition):
+     # Get the string representation of the partition value out of
+     # Record[partition_value]
+     partition_value_str = partition.__repr__().split("[", 1)[1].split("]")[0]
+     return partition_value_str
+
+
+ def group_all_files_to_each_bucket(
+     data_file_dict, equality_delete_dict, pos_delete_dict
+ ):
+     convert_input_files_for_all_buckets = []
+     files_for_each_bucket_for_deletes = defaultdict(tuple)
+     if equality_delete_dict:
+         for partition_value, equality_delete_file_list in equality_delete_dict.items():
+             (
+                 result_equality_delete_file,
+                 result_data_file,
+             ) = check_data_files_sequence_number(
+                 data_files_list=data_file_dict[partition_value],
+                 equality_delete_files_list=equality_delete_dict[partition_value],
+             )
+             files_for_each_bucket_for_deletes[partition_value] = (
+                 result_data_file,
+                 result_equality_delete_file,
+                 [],
+             )
+             if partition_value not in data_file_dict:
+                 convert_input_file = ConvertInputFiles.of(
+                     partition_value=partition_value,
+                     applicable_data_files=result_data_file,
+                     applicable_equality_delete_files=result_equality_delete_file,
+                 )
+                 convert_input_files_for_all_buckets.append(convert_input_file)
+
+     for partition_value, all_data_files_for_each_bucket in data_file_dict.items():
+         convert_input_file = ConvertInputFiles.of(
+             partition_value=partition_value,
+             all_data_files_for_dedupe=all_data_files_for_each_bucket,
+         )
+         if partition_value in files_for_each_bucket_for_deletes:
+             convert_input_file.applicable_data_files = (
+                 files_for_each_bucket_for_deletes[partition_value][0]
+             )
+             convert_input_file.applicable_delete_files = (
+                 files_for_each_bucket_for_deletes[partition_value][1]
+             )
+         convert_input_files_for_all_buckets.append(convert_input_file)
+     return convert_input_files_for_all_buckets
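A minimal sketch (not part of the diff) of the sequence-number matching above, using hypothetical `(sequence_number, file_name)` tuples. An equality delete file applies only to data files committed with a smaller sequence number:

```python
from deltacat.compute.converter.utils.converter_session_utils import (
    check_data_files_sequence_number,
)

data = [(1, "data-a"), (3, "data-b"), (5, "data-c")]
deletes = [(2, "eq-del-x"), (4, "eq-del-y")]

applicable_deletes, applicable_data = check_data_files_sequence_number(
    data_files_list=data, equality_delete_files_list=deletes
)
print(applicable_data)
# [[(1, 'data-a')], [(3, 'data-b')]] -- the data files older than each delete
print(applicable_deletes)
# [[(2, 'eq-del-x'), (4, 'eq-del-y')], [(4, 'eq-del-y')]] -- each delete plus
# every delete with a larger sequence number
```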
deltacat/compute/converter/utils/iceberg_columns.py ADDED
@@ -0,0 +1,82 @@
+ import pyarrow as pa
+ from typing import Union
+ import numpy as np
+
+ # Refer to https://iceberg.apache.org/spec/#reserved-field-ids for the
+ # reserved field ids used below
+ ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN = 2147483546
+ ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN = 2147483545
+
+
+ def _get_iceberg_col_name(suffix):
+     return suffix
+
+
+ _ORDERED_RECORD_IDX_COLUMN_NAME = _get_iceberg_col_name("pos")
+ _ORDERED_RECORD_IDX_COLUMN_TYPE = pa.int64()
+ _ORDERED_RECORD_IDX_FIELD_METADATA = {
+     b"PARQUET:field_id": f"{ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN}"
+ }
+ _ORDERED_RECORD_IDX_COLUMN_FIELD = pa.field(
+     _ORDERED_RECORD_IDX_COLUMN_NAME,
+     _ORDERED_RECORD_IDX_COLUMN_TYPE,
+     metadata=_ORDERED_RECORD_IDX_FIELD_METADATA,
+     nullable=False,
+ )
+
+
+ def get_record_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
+     return pa.array(
+         obj,
+         _ORDERED_RECORD_IDX_COLUMN_TYPE,
+     )
+
+
+ def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
+     table = table.append_column(
+         _ORDERED_RECORD_IDX_COLUMN_FIELD,
+         get_record_index_column_array(ordered_record_indices),
+     )
+     return table
+
+
+ _FILE_PATH_COLUMN_NAME = _get_iceberg_col_name("file_path")
+ _FILE_PATH_COLUMN_TYPE = pa.string()
+ _FILE_PATH_FIELD_METADATA = {
+     b"PARQUET:field_id": f"{ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN}"
+ }
+ _FILE_PATH_COLUMN_FIELD = pa.field(
+     _FILE_PATH_COLUMN_NAME,
+     _FILE_PATH_COLUMN_TYPE,
+     metadata=_FILE_PATH_FIELD_METADATA,
+     nullable=False,
+ )
+
+
+ def append_file_path_column(table: pa.Table, file_path: str):
+     table = table.append_column(
+         _FILE_PATH_COLUMN_FIELD,
+         pa.array(np.repeat(file_path, len(table)), _FILE_PATH_COLUMN_TYPE),
+     )
+     return table
+
+
+ _GLOBAL_RECORD_IDX_COLUMN_NAME = _get_iceberg_col_name("global_record_index")
+ _GLOBAL_RECORD_IDX_COLUMN_TYPE = pa.int64()
+ _GLOBAL_RECORD_IDX_COLUMN_FIELD = pa.field(
+     _GLOBAL_RECORD_IDX_COLUMN_NAME,
+     _GLOBAL_RECORD_IDX_COLUMN_TYPE,
+ )
+
+
+ def append_global_record_idx_column(
+     table: pa.Table, ordered_record_indices
+ ) -> pa.Table:
+     table = table.append_column(
+         _GLOBAL_RECORD_IDX_COLUMN_NAME,
+         pa.array(ordered_record_indices, _GLOBAL_RECORD_IDX_COLUMN_TYPE),
+     )
+     return table
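A minimal usage sketch (not part of the diff) of the helpers above, tagging a table with the Iceberg positional-delete columns; the S3 path is hypothetical:

```python
import pyarrow as pa
import deltacat.compute.converter.utils.iceberg_columns as sc

table = pa.table({"pk": ["a", "b", "c"]})
# Stamp every row with the source file path, then its ordinal position.
table = sc.append_file_path_column(table, "s3://bucket/ns/tbl/data/00000.parquet")
table = sc.append_record_idx_col(table, range(len(table)))
print(table.column_names)  # ['pk', 'file_path', 'pos']
```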
deltacat/compute/converter/utils/io.py ADDED
@@ -0,0 +1,43 @@
+ import deltacat.compute.converter.utils.iceberg_columns as sc
+ import daft
+
+
+ def download_data_table_and_append_iceberg_columns(
+     file, columns_to_download, additional_columns_to_append, sequence_number
+ ):
+     # TODO: add S3 client kwargs
+     table = download_parquet_with_daft_hash_applied(
+         identify_columns=columns_to_download, file=file, s3_client_kwargs={}
+     )
+     if sc._FILE_PATH_COLUMN_NAME in additional_columns_to_append:
+         table = sc.append_file_path_column(table, file.file_path)
+     if sc._ORDERED_RECORD_IDX_COLUMN_NAME in additional_columns_to_append:
+         record_idx_iterator = iter(range(len(table)))
+         table = sc.append_record_idx_col(table, record_idx_iterator)
+     return table
+
+
+ def download_parquet_with_daft_hash_applied(
+     identify_columns, file, s3_client_kwargs, **kwargs
+ ):
+     from daft import TimeUnit
+
+     # TODO: Add correct read kwargs as in:
+     # https://github.com/ray-project/deltacat/blob/383855a4044e4dfe03cf36d7738359d512a517b4/deltacat/utils/daft.py#L97
+     coerce_int96_timestamp_unit = TimeUnit.from_str(
+         kwargs.get("coerce_int96_timestamp_unit", "ms")
+     )
+
+     from deltacat.utils.daft import _get_s3_io_config
+
+     # TODO: Use a Daft SHA1 hash instead to minimize the probability of data corruption
+     io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
+     df = daft.read_parquet(
+         path=file.file_path,
+         io_config=io_config,
+         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+     )
+     df = df.select(daft.col(identify_columns[0]).hash())
+     arrow_table = df.to_arrow()
+     return arrow_table
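A minimal sketch (not part of the diff) of the Daft read-and-hash core of `download_parquet_with_daft_hash_applied`, run against a hypothetical local file so the S3 `IOConfig` plumbing can be skipped; assumes a `pk` column exists:

```python
import daft

df = daft.read_parquet("sample.parquet")
df = df.select(daft.col("pk").hash())  # one 64-bit hash per record
arrow_table = df.to_arrow()
```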
deltacat/compute/converter/utils/s3u.py ADDED
@@ -0,0 +1,133 @@
+ from tenacity import (
+     Retrying,
+     retry_if_exception_type,
+     stop_after_delay,
+     wait_random_exponential,
+ )
+ from typing import Union
+ from deltacat.aws.s3u import CapturedBlockWritePaths, UuidBlockWritePathProvider
+ from deltacat.types.tables import (
+     get_table_writer,
+     get_table_length,
+     TABLE_CLASS_TO_SLICER_FUNC,
+ )
+ from typing import Optional, Dict, Any, List
+ from deltacat.exceptions import RetryableError
+ from deltacat.storage import (
+     DistributedDataset,
+     LocalTable,
+ )
+ from deltacat.types.media import (
+     ContentEncoding,
+     ContentType,
+ )
+ from deltacat.aws.s3u import UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY
+ import s3fs
+
+
+ def get_credential():
+     import boto3
+
+     boto3_session = boto3.Session()
+     credentials = boto3_session.get_credentials()
+     return credentials
+
+
+ def get_s3_file_system(content_type):
+     token_holder = get_credential()
+     content_encoding = ContentEncoding.IDENTITY
+
+     s3_file_system = s3fs.S3FileSystem(
+         key=token_holder.access_key,
+         secret=token_holder.secret_key,
+         token=token_holder.token,
+         s3_additional_kwargs={
+             "ServerSideEncryption": "aws:kms",
+             # TODO: Get tagging from table properties
+             "ContentType": content_type.value,
+             "ContentEncoding": content_encoding.value,
+         },
+     )
+     return s3_file_system
+
+
+ def upload_table_with_retry(
+     table: Union[LocalTable, DistributedDataset],
+     s3_url_prefix: str,
+     s3_table_writer_kwargs: Optional[Dict[str, Any]],
+     content_type: ContentType = ContentType.PARQUET,
+     max_records_per_file: Optional[int] = 4000000,
+     s3_file_system=None,
+     **s3_client_kwargs,
+ ) -> List[str]:
+     """
+     Writes the given table to one or more S3 files and returns the list
+     of paths written.
+     """
+     retrying = Retrying(
+         wait=wait_random_exponential(multiplier=1, max=60),
+         stop=stop_after_delay(UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY),
+         retry=retry_if_exception_type(RetryableError),
+     )
+
+     if s3_table_writer_kwargs is None:
+         s3_table_writer_kwargs = {}
+
+     if not s3_file_system:
+         s3_file_system = get_s3_file_system(content_type=content_type)
+     capture_object = CapturedBlockWritePaths()
+     block_write_path_provider = UuidBlockWritePathProvider(
+         capture_object=capture_object
+     )
+     s3_table_writer_func = get_table_writer(table)
+     table_record_count = get_table_length(table)
+     if max_records_per_file is None or not table_record_count:
+         retrying(
+             fn=upload_table,
+             table_slices=table,
+             s3_base_url=f"{s3_url_prefix}",
+             s3_file_system=s3_file_system,
+             s3_table_writer_func=s3_table_writer_func,
+             s3_table_writer_kwargs=s3_table_writer_kwargs,
+             block_write_path_provider=block_write_path_provider,
+             content_type=content_type,
+             **s3_client_kwargs,
+         )
+     else:
+         table_slicer_func = TABLE_CLASS_TO_SLICER_FUNC.get(type(table))
+         table_slices = table_slicer_func(table, max_records_per_file)
+         for table_slice in table_slices:
+             retrying(
+                 fn=upload_table,
+                 table_slices=table_slice,
+                 s3_base_url=f"{s3_url_prefix}",
+                 s3_file_system=s3_file_system,
+                 s3_table_writer_func=s3_table_writer_func,
+                 s3_table_writer_kwargs=s3_table_writer_kwargs,
+                 block_write_path_provider=block_write_path_provider,
+                 content_type=content_type,
+                 **s3_client_kwargs,
+             )
+     del block_write_path_provider
+     write_paths = capture_object.write_paths()
+     return write_paths
+
+
+ def upload_table(
+     table_slices,
+     s3_base_url,
+     s3_file_system,
+     s3_table_writer_func,
+     block_write_path_provider,
+     content_type,
+     s3_table_writer_kwargs,
+ ):
+     s3_table_writer_func(
+         table_slices,
+         s3_base_url,
+         s3_file_system,
+         block_write_path_provider,
+         content_type.value,
+         **s3_table_writer_kwargs,
+     )
+     # TODO: Add a proper fix for block_refs and write_paths not persisting in Ray actors
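A minimal usage sketch (not part of the diff) for `upload_table_with_retry`; the bucket is hypothetical and ambient AWS credentials are assumed:

```python
import pyarrow as pa

from deltacat.compute.converter.utils.s3u import upload_table_with_retry

table = pa.table({"pk": ["a", "b", "c"], "value": [1, 2, 3]})
write_paths = upload_table_with_retry(
    table=table,
    s3_url_prefix="s3://my-bucket/ns/tbl/data",
    s3_table_writer_kwargs=None,
    max_records_per_file=2,  # three records sliced into two files
)
print(write_paths)  # e.g. two s3:// paths chosen by UuidBlockWritePathProvider
```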
deltacat/compute/resource_estimation/delta.py CHANGED
@@ -93,29 +93,11 @@ def _estimate_resources_required_to_process_delta_using_type_params(
              on_disk_size_bytes=delta.meta.content_length,
          ),
      )
-     file_reader_kwargs_provider = kwargs.get(
-         "file_reader_kwargs_provider"
-     ) or deltacat_storage_kwargs.get("file_reader_kwargs_provider")
-
-     """
-     NOTE: The file_reader_kwargs_provider parameter can be passed in two ways:
-     1. Nested within deltacat_storage_kwargs during resource estimation
-     2. As a top-level attribute of CompactPartitionsParams during compaction
-
-     This creates an inconsistent parameter path between resource estimation and compaction flows.
-     As a long-term solution, this should be unified to use a single consistent path (either always
-     nested in deltacat_storage_kwargs or always as a top-level parameter).
-
-     For now, this implementation handles the resource estimation case by:
-     1. First checking for file_reader_kwargs_provider as a direct kwarg
-     2. Falling back to deltacat_storage_kwargs if not found
-     This approach maintains backward compatibility by not modifying the DELTA_RESOURCE_ESTIMATION_FUNCTIONS signatures.
-     """
+
      appended = append_content_type_params(
          delta=delta,
          deltacat_storage=deltacat_storage,
          deltacat_storage_kwargs=deltacat_storage_kwargs,
-         file_reader_kwargs_provider=file_reader_kwargs_provider,
      )
 
      if not appended:
deltacat/constants.py CHANGED
@@ -1,4 +1,8 @@
- from deltacat.utils.common import env_string
+ from __future__ import annotations
+
+
+ from deltacat.utils.common import env_string, env_bool
+ import os
 
  # Environment variables
  DELTACAT_SYS_LOG_LEVEL = env_string("DELTACAT_SYS_LOG_LEVEL", "DEBUG")
@@ -30,6 +34,26 @@ DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME = env_string(
  )
  # A json context which will be logged along with other context args.
  DELTACAT_LOGGER_CONTEXT = env_string("DELTACAT_LOGGER_CONTEXT", None)
+ DELTACAT_LOGGER_USE_SINGLE_HANDLER = env_bool(
+     "DELTACAT_LOGGER_USE_SINGLE_HANDLER",
+     False,
+ )
+ DELTACAT_ROOT = env_string(
+     "DELTACAT_ROOT",
+     os.path.join(os.getcwd(), ".deltacat"),
+ )
+
+ # CLI Args
+ METAFILE_FORMAT_KEY = "METAFILE_FORMAT"
+ METAFILE_FORMAT_JSON = "json"
+ METAFILE_FORMAT_MSGPACK = "msgpack"
+ METAFILE_FORMAT = env_string(METAFILE_FORMAT_KEY, METAFILE_FORMAT_MSGPACK)
+ SUPPORTED_METAFILE_FORMATS = [METAFILE_FORMAT_JSON, METAFILE_FORMAT_MSGPACK]
+ METAFILE_EXT = {
+     "json": ".json",
+     "msgpack": ".mpk",
+ }[METAFILE_FORMAT]
+
 
  # Byte Units
  BYTES_PER_KIBIBYTE = 2**10
@@ -41,6 +65,11 @@ BYTES_PER_PEBIBYTE = 2**50
  SIGNED_INT64_MIN_VALUE = -(2**63)
  SIGNED_INT64_MAX_VALUE = 2**63 - 1
 
+ # Time Units
+ NANOS_PER_SEC = 1_000_000_000
+ MICROS_PER_SEC = 1_000_000
+ MILLIS_PER_SEC = 1000
+
  # Inflation multiplier from snappy-compressed parquet to pyarrow.
  # This should be kept larger than actual average inflation multipliers.
  # Note that this is a very rough guess since actual observed pyarrow
@@ -58,3 +87,20 @@ MEMORY_TO_HASH_BUCKET_COUNT_RATIO = 0.0512 * BYTES_PER_TEBIBYTE
 
  # The number of bytes allocated to null values in string physical type in parquet
  NULL_SIZE_BYTES = 4
+
+ # Metastore Constants
+ REVISION_DIR_NAME: str = "rev"
+ TXN_DIR_NAME: str = "txn"
+ RUNNING_TXN_DIR_NAME: str = "running"
+ FAILED_TXN_DIR_NAME: str = "failed"
+ SUCCESS_TXN_DIR_NAME: str = "success"
+ TXN_PART_SEPARATOR = "_"
+ # Storage interface defaults
+ # These defaults should be applied in catalog interface implementations.
+ # Storage interface implementations should be agnostic to defaults and
+ # require full information.
+ DEFAULT_CATALOG = "DEFAULT"
+ DEFAULT_NAMESPACE = "DEFAULT"
+ DEFAULT_TABLE_VERSION = "1"
+ DEFAULT_STREAM_ID = "stream"
+ DEFAULT_PARTITION_ID = "partition"
+ DEFAULT_PARTITION_VALUES = ["default"]
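Since `METAFILE_FORMAT` is resolved from the environment once at module import time, it must be set before `deltacat.constants` is first imported. A minimal sketch (not part of the diff):

```python
import os

os.environ["METAFILE_FORMAT"] = "json"  # default is "msgpack" (".mpk")

from deltacat.constants import METAFILE_FORMAT, METAFILE_EXT

assert METAFILE_FORMAT == "json"
assert METAFILE_EXT == ".json"
```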
deltacat/env.py ADDED
@@ -0,0 +1,51 @@
+ import os
+ import logging
+ from typing import Dict, Any
+
+ from deltacat import logs
+
+ from deltacat.constants import (
+     DELTACAT_APP_LOG_LEVEL,
+     DELTACAT_SYS_LOG_LEVEL,
+     DELTACAT_APP_LOG_DIR,
+     DELTACAT_SYS_LOG_DIR,
+     DELTACAT_APP_INFO_LOG_BASE_FILE_NAME,
+     DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME,
+     DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
+     DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
+     DELTACAT_LOGGER_USE_SINGLE_HANDLER,
+ )
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def create_ray_runtime_environment() -> Dict[str, Any]:
+     # log the system environment for debugging
+     logger.debug(f"System Environment: {os.environ}")
+
+     # read the stage (e.g. alpha, beta, dev, etc.) from system environment vars
+     stage = os.environ.get("STAGE")
+     logger.debug(f"Runtime Environment Stage: {stage}")
+     runtime_environment = None
+     if stage:
+         worker_env_vars = {
+             # forward the STAGE environment variable to workers
+             "STAGE": stage,
+             # forward deltacat logging environment variables to workers
+             "DELTACAT_APP_LOG_LEVEL": DELTACAT_APP_LOG_LEVEL,
+             "DELTACAT_SYS_LOG_LEVEL": DELTACAT_SYS_LOG_LEVEL,
+             "DELTACAT_APP_LOG_DIR": DELTACAT_APP_LOG_DIR,
+             "DELTACAT_SYS_LOG_DIR": DELTACAT_SYS_LOG_DIR,
+             "DELTACAT_APP_INFO_LOG_BASE_FILE_NAME": DELTACAT_APP_INFO_LOG_BASE_FILE_NAME,
+             "DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME": DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME,
+             "DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME": DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
+             "DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME": DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
+             "DELTACAT_LOGGER_USE_SINGLE_HANDLER": str(
+                 DELTACAT_LOGGER_USE_SINGLE_HANDLER
+             ),
+         }
+         # set up the runtime environment from system environment variables
+         runtime_environment = {
+             "env_vars": worker_env_vars,
+         }
+     return runtime_environment
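A minimal usage sketch (not part of the diff): forwarding the driver's `STAGE` and logging configuration to Ray workers through the runtime environment returned above:

```python
import os

import ray

from deltacat.env import create_ray_runtime_environment

os.environ["STAGE"] = "dev"
runtime_env = create_ray_runtime_environment()  # {"env_vars": {...}}
ray.init(runtime_env=runtime_env)
```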