deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.0b2.dist-info/METADATA +65 -0
- deltacat-2.0.0b2.dist-info/RECORD +349 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/utils/task_options.py:

@@ -1,16 +1,11 @@
 import logging
 from typing import Dict, Optional, List, Tuple, Any
 from deltacat import logs
-from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
-from deltacat.compute.compactor_v2.constants import (
-    AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
-)
 from deltacat.compute.compactor_v2.model.merge_file_group import (
     LocalMergeFileGroupsProvider,
 )
 from deltacat.storage import (
     Manifest,
-    ManifestEntry,
     interface as unimplemented_deltacat_storage,
 )
 from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
@@ -86,27 +81,16 @@ def _get_merge_task_options(
         and compacted_delta_manifest
         and round_completion_info.hb_index_to_entry_range
     ):
-
-
-
-
-        previous_inflation: float = (
-            (
-                round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
-                / round_completion_info.compacted_pyarrow_write_result.file_bytes
-            )
-            if round_completion_info.compacted_pyarrow_write_result.file_bytes
-            else PYARROW_INFLATION_MULTIPLIER
+
+        previous_inflation = (
+            round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+            / round_completion_info.compacted_pyarrow_write_result.file_bytes
         )
         debug_memory_params["previous_inflation"] = previous_inflation
 
-        average_record_size = (
-            (
-                round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
-                / round_completion_info.compacted_pyarrow_write_result.records
-            )
-            if round_completion_info.compacted_pyarrow_write_result.records
-            else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
+        average_record_size = (
+            round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+            / round_completion_info.compacted_pyarrow_write_result.records
         )
         debug_memory_params["average_record_size"] = average_record_size
 
@@ -122,36 +106,31 @@ def _get_merge_task_options(
             str(hb_idx)
         ]
         for entry_index in range(entry_start, entry_end):
-            entry: ManifestEntry = compacted_delta_manifest.entries[entry_index]
-            current_entry_size = (
-                estimate_manifest_entry_size_bytes(
-                    entry=entry,
-                    operation_type=OperationType.PYARROW_DOWNLOAD,
-                    estimate_resources_params=estimate_resources_params,
-                )
-                or 0.0
+            entry = compacted_delta_manifest.entries[entry_index]
+
+            current_entry_size = estimate_manifest_entry_size_bytes(
+                entry=entry,
+                operation_type=OperationType.PYARROW_DOWNLOAD,
+                estimate_resources_params=estimate_resources_params,
             )
-            current_entry_rows = (
-                estimate_manifest_entry_num_rows(
-                    entry=entry,
-                    operation_type=OperationType.PYARROW_DOWNLOAD,
-                    estimate_resources_params=estimate_resources_params,
-                )
-                or 0
+            current_entry_rows = estimate_manifest_entry_num_rows(
+                entry=entry,
+                operation_type=OperationType.PYARROW_DOWNLOAD,
+                estimate_resources_params=estimate_resources_params,
             )
-
+
             data_size += current_entry_size
            num_rows += current_entry_rows
+
         if primary_keys:
-            pk_size: Optional[
-                float
-            ] = estimate_manifest_entry_column_size_bytes(
+            pk_size = estimate_manifest_entry_column_size_bytes(
                 entry=entry,
                 columns=primary_keys,
                 operation_type=OperationType.PYARROW_DOWNLOAD,
                 estimate_resources_params=estimate_resources_params,
             )
-            if not pk_size:
+
+            if pk_size is None:
                 pk_size_bytes += current_entry_size
             else:
                 pk_size_bytes += pk_size
@@ -180,6 +159,7 @@ def _get_merge_task_options(
         f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
         memory_logs_enabled,
     )
+
     return _get_task_options(0.01, total_memory, ray_custom_resources)
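The net effect of this hunk is easier to see outside the diff. Below is a minimal sketch, not the deltacat API: `PyarrowWriteResult` here is a stand-in dataclass for the compacted write result referenced above, showing how 2.0.0b2 derives the two merge-memory inputs now that the `PYARROW_INFLATION_MULTIPLIER` and `AVERAGE_RECORD_SIZE_BYTES` fallbacks are removed.

```python
from dataclasses import dataclass


@dataclass
class PyarrowWriteResult:
    """Stand-in for the compacted PyArrow write result referenced above."""

    pyarrow_bytes: int  # in-memory (decompressed) size of the compacted output
    file_bytes: int  # on-disk size of the compacted output
    records: int  # number of records written


def merge_memory_inputs(write_result: PyarrowWriteResult) -> dict:
    # 2.0.0b2 divides directly; the 1.1.36 fallbacks for zero file_bytes or
    # zero records are gone, so callers now assume a non-empty prior round.
    return {
        "previous_inflation": write_result.pyarrow_bytes / write_result.file_bytes,
        "average_record_size": write_result.pyarrow_bytes / write_result.records,
    }


print(merge_memory_inputs(PyarrowWriteResult(4_000_000, 1_000_000, 10_000)))
# {'previous_inflation': 4.0, 'average_record_size': 400.0}
```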
deltacat/compute/converter/converter_session.py (new file):

@@ -0,0 +1,143 @@
+# from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
+from deltacat.utils.ray_utils.concurrency import (
+    invoke_parallel,
+    task_resource_options_provider,
+)
+import ray
+import functools
+from deltacat.compute.converter.utils.convert_task_options import (
+    convert_resource_options_provider,
+)
+import logging
+from deltacat import logs
+from deltacat.compute.converter.model.converter_session_params import (
+    ConverterSessionParams,
+)
+
+
+from deltacat.compute.converter.constants import DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
+from deltacat.compute.converter.steps.convert import convert
+from deltacat.compute.converter.model.convert_input import ConvertInput
+from deltacat.compute.converter.pyiceberg.overrides import (
+    fetch_all_bucket_files,
+    parquet_files_dict_to_iceberg_data_files,
+)
+from deltacat.compute.converter.utils.converter_session_utils import (
+    construct_iceberg_table_prefix,
+)
+from deltacat.compute.converter.pyiceberg.update_snapshot_overrides import (
+    commit_replace_snapshot,
+    commit_append_snapshot,
+)
+from deltacat.compute.converter.pyiceberg.catalog import load_table
+from deltacat.compute.converter.utils.converter_session_utils import (
+    group_all_files_to_each_bucket,
+)
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def converter_session(params: ConverterSessionParams, **kwargs):
+    """
+    Convert equality delete to position delete.
+    Compute and memory heavy work from downloading equality delete table and compute position deletes
+    will be executed on Ray remote tasks.
+    """
+
+    catalog = params.catalog
+    table_name = params.iceberg_table_name
+    iceberg_table = load_table(catalog, table_name)
+    enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
+    data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(
+        iceberg_table
+    )
+    convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
+        data_file_dict=data_file_dict,
+        equality_delete_dict=equality_delete_dict,
+        pos_delete_dict=pos_delete_dict,
+    )
+    iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
+    iceberg_namespace = params.iceberg_namespace
+    iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
+        iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
+        table_name=table_name,
+        iceberg_namespace=iceberg_namespace,
+    )
+    merge_keys = params.merge_keys
+    # Using table identifier fields as merge keys if merge keys not provided
+    if not merge_keys:
+        identifier_fields_set = iceberg_table.schema().identifier_field_names()
+        identifier_fields = list(identifier_fields_set)
+    else:
+        identifier_fields = merge_keys
+    if len(identifier_fields) > 1:
+        raise NotImplementedError(
+            f"Multiple identifier fields lookup not supported yet."
+        )
+    convert_options_provider = functools.partial(
+        task_resource_options_provider,
+        resource_amount_provider=convert_resource_options_provider,
+    )
+
+    # TODO (zyiqin): max_parallel_data_file_download should be determined by memory requirement for each bucket.
+    # Specifically, for case when files for one bucket memory requirement exceed one worker node's memory limit, WITHOUT rebasing with larger hash bucket count,
+    # 1. We can control parallel files to download by adjusting max_parallel_data_file_download.
+    # 2. Implement two-layer converter tasks, with convert tasks to spin up child convert tasks.
+    # Note that approach 2 will ideally require shared object store to avoid download equality delete files * number of child tasks times.
+    max_parallel_data_file_download = DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
+
+    compact_small_files = params.compact_small_files
+    position_delete_for_multiple_data_files = (
+        params.position_delete_for_multiple_data_files
+    )
+    task_max_parallelism = params.task_max_parallelism
+
+    def convert_input_provider(index, item):
+        return {
+            "convert_input": ConvertInput.of(
+                files_for_each_bucket=item,
+                convert_task_index=index,
+                iceberg_table_warehouse_prefix=iceberg_table_warehouse_prefix,
+                identifier_fields=identifier_fields,
+                compact_small_files=compact_small_files,
+                enforce_primary_key_uniqueness=enforce_primary_key_uniqueness,
+                position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
+                max_parallel_data_file_download=max_parallel_data_file_download,
+            )
+        }
+
+    # Ray remote task: convert
+    # Assuming that memory consume by each bucket doesn't exceed one node's memory limit.
+    # TODO: Add split mechanism to split large buckets
+    convert_tasks_pending = invoke_parallel(
+        items=convert_input_files_for_all_buckets.items(),
+        ray_task=convert,
+        max_parallelism=task_max_parallelism,
+        options_provider=convert_options_provider,
+        kwargs_provider=convert_input_provider,
+    )
+    to_be_deleted_files_list = []
+    to_be_added_files_dict_list = []
+    convert_results = ray.get(convert_tasks_pending)
+    for convert_result in convert_results:
+        to_be_deleted_files_list.extend(convert_result[0].values())
+        to_be_added_files_dict_list.append(convert_result[1])
+
+    new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
+        io=iceberg_table.io,
+        table_metadata=iceberg_table.metadata,
+        files_dict_list=to_be_added_files_dict_list,
+    )
+
+    if not to_be_deleted_files_list:
+        commit_append_snapshot(
+            iceberg_table=iceberg_table,
+            new_position_delete_files=new_position_delete_files,
+        )
+    else:
+        commit_replace_snapshot(
+            iceberg_table=iceberg_table,
+            # equality_delete_files + data file that all rows are deleted
+            to_be_deleted_files_list=to_be_deleted_files_list,
+            new_position_delete_files=new_position_delete_files,
+        )
deltacat/compute/converter/model/convert_input.py (new file):

@@ -0,0 +1,69 @@
+from __future__ import annotations
+from typing import Dict, List
+from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
+
+
+class ConvertInput(Dict):
+    @staticmethod
+    def of(
+        convert_input_files,
+        convert_task_index,
+        iceberg_table_warehouse_prefix,
+        identifier_fields,
+        compact_small_files,
+        enforce_primary_key_uniqueness,
+        position_delete_for_multiple_data_files,
+        max_parallel_data_file_download,
+        s3_file_system,
+    ) -> ConvertInput:
+
+        result = ConvertInput()
+        result["convert_input_files"] = convert_input_files
+        result["convert_task_index"] = convert_task_index
+        result["identifier_fields"] = identifier_fields
+        result["iceberg_table_warehouse_prefix"] = iceberg_table_warehouse_prefix
+        result["compact_small_files"] = compact_small_files
+        result["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
+        result[
+            "position_delete_for_multiple_data_files"
+        ] = position_delete_for_multiple_data_files
+        result["max_parallel_data_file_download"] = max_parallel_data_file_download
+        result["s3_file_system"] = s3_file_system
+
+        return result
+
+    @property
+    def convert_input_files(self) -> ConvertInputFiles:
+        return self["convert_input_files"]
+
+    @property
+    def identifier_fields(self) -> List[str]:
+        return self["identifier_fields"]
+
+    @property
+    def convert_task_index(self) -> int:
+        return self["convert_task_index"]
+
+    @property
+    def iceberg_table_warehouse_prefix(self) -> str:
+        return self["iceberg_table_warehouse_prefix"]
+
+    @property
+    def compact_small_files(self) -> bool:
+        return self["compact_small_files"]
+
+    @property
+    def enforce_primary_key_uniqueness(self) -> bool:
+        return self["enforce_primary_key_uniqueness"]
+
+    @property
+    def position_delete_for_multiple_data_files(self) -> bool:
+        return self["position_delete_for_multiple_data_files"]
+
+    @property
+    def max_parallel_data_file_download(self) -> int:
+        return self["max_parallel_data_file_download"]
+
+    @property
+    def s3_file_system(self):
+        return self["s3_file_system"]
deltacat/compute/converter/model/convert_input_files.py (new file):

@@ -0,0 +1,61 @@
+from __future__ import annotations
+from typing import Dict
+
+
+class ConvertInputFiles(Dict):
+    @staticmethod
+    def of(
+        partition_value,
+        all_data_files_for_dedupe=None,
+        applicable_data_files=None,
+        applicable_equality_delete_files=None,
+        existing_position_delete_files=None,
+    ) -> ConvertInputFiles:
+
+        result = ConvertInputFiles()
+        result["partition_value"] = partition_value
+        result["all_data_files_for_dedupe"] = all_data_files_for_dedupe
+        result["applicable_data_files"] = applicable_data_files
+        result["applicable_equality_delete_files"] = applicable_equality_delete_files
+        result["existing_position_delete_files"] = existing_position_delete_files
+        return result
+
+    @property
+    def partition_value(self):
+        return self["partition_value"]
+
+    @property
+    def all_data_files_for_dedupe(self):
+        return self["all_data_files_for_dedupe"]
+
+    @property
+    def applicable_data_files(self):
+        return self["applicable_data_files"]
+
+    @property
+    def applicable_equality_delete_files(self):
+        return self["applicable_equality_delete_files"]
+
+    @property
+    def existing_position_delete_files(self):
+        return self["existing_position_delete_files"]
+
+    @partition_value.setter
+    def partition_value(self, partition_value):
+        self["partition_value"] = partition_value
+
+    @all_data_files_for_dedupe.setter
+    def all_data_files_for_dedupe(self, all_data_files_for_dedupe):
+        self["all_data_files_for_dedupe"] = all_data_files_for_dedupe
+
+    @applicable_data_files.setter
+    def applicable_data_files(self, applicable_data_files):
+        self["applicable_data_files"] = applicable_data_files
+
+    @applicable_equality_delete_files.setter
+    def applicable_equality_delete_files(self, applicable_equality_delete_files):
+        self["applicable_equality_delete_files"] = applicable_equality_delete_files
+
+    @existing_position_delete_files.setter
+    def existing_position_delete_files(self, existing_position_delete_files):
+        self["existing_position_delete_files"] = existing_position_delete_files
deltacat/compute/converter/model/converter_session_params.py (new file):

@@ -0,0 +1,99 @@
+from __future__ import annotations
+from typing import Optional, Dict
+from deltacat.compute.converter.constants import DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
+
+
+class ConverterSessionParams(dict):
+    """
+    This class represents the parameters passed to convert_ (deltacat/compute/compactor/compaction_session.py)
+    """
+
+    @staticmethod
+    def of(params: Optional[Dict]) -> ConverterSessionParams:
+        params = {} if params is None else params
+        assert params.get("catalog") is not None, "catalog is a required arg"
+        assert (
+            params.get("iceberg_table_name") is not None
+        ), "iceberg_table_name is a required arg"
+        assert (
+            params.get("iceberg_warehouse_bucket_name") is not None
+        ), "iceberg_warehouse_bucket_name is a required arg"
+        assert (
+            params.get("iceberg_namespace") is not None
+        ), "iceberg_namespace is a required arg"
+        result = ConverterSessionParams(params)
+
+        result.enforce_primary_key_uniqueness = params.get(
+            "enforce_primary_key_uniqueness", False
+        )
+        result.compact_small_files = params.get("compact_small_files", False)
+
+        # For Iceberg v3 spec, option to produce delete vector that can establish 1:1 mapping with data files.
+        result.position_delete_for_multiple_data_files = params.get(
+            "position_delete_for_multiple_data_files", True
+        )
+        result.task_max_parallelism = params.get(
+            "task_max_parallelism", DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
+        )
+        result.merge_keys = params.get("merge_keys", None)
+        return result
+
+    @property
+    def catalog(self):
+        return self["catalog"]
+
+    @property
+    def iceberg_table_name(self) -> str:
+        return self["iceberg_table_name"]
+
+    @property
+    def iceberg_warehouse_bucket_name(self) -> str:
+        return self["iceberg_warehouse_bucket_name"]
+
+    @property
+    def iceberg_namespace(self) -> str:
+        return self["iceberg_namespace"]
+
+    @property
+    def enforce_primary_key_uniqueness(self) -> bool:
+        return self["enforce_primary_key_uniqueness"]
+
+    @enforce_primary_key_uniqueness.setter
+    def enforce_primary_key_uniqueness(self, enforce_primary_key_uniqueness) -> None:
+        self["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
+
+    @property
+    def compact_small_files(self) -> bool:
+        return self["compact_small_files"]
+
+    @compact_small_files.setter
+    def compact_small_files(self, compact_small_files) -> None:
+        self["compact_small_files"] = compact_small_files
+
+    @property
+    def position_delete_for_multiple_data_files(self) -> bool:
+        return self["position_delete_for_multiple_data_files"]
+
+    @position_delete_for_multiple_data_files.setter
+    def position_delete_for_multiple_data_files(
+        self, position_delete_for_multiple_data_files
+    ) -> None:
+        self[
+            "position_delete_for_multiple_data_files"
+        ] = position_delete_for_multiple_data_files
+
+    @property
+    def task_max_parallelism(self) -> str:
+        return self["task_max_parallelism"]
+
+    @task_max_parallelism.setter
+    def task_max_parallelism(self, task_max_parallelism) -> None:
+        self["task_max_parallelism"] = task_max_parallelism
+
+    @property
+    def merge_keys(self) -> str:
+        return self["merge_keys"]
+
+    @merge_keys.setter
+    def merge_keys(self, merge_keys) -> None:
+        self["merge_keys"] = merge_keys
deltacat/compute/converter/pyiceberg/__init__.py: file without changes.
deltacat/compute/converter/pyiceberg/catalog.py (new file):

@@ -0,0 +1,75 @@
+from typing import Optional
+
+
+def load_catalog(iceberg_catalog_name, iceberg_catalog_properties):
+    catalog = load_catalog(
+        name=iceberg_catalog_name,
+        **iceberg_catalog_properties,
+    )
+    return catalog
+
+
+def get_s3_path(
+    bucket_name: str,
+    database_name: Optional[str] = None,
+    table_name: Optional[str] = None,
+) -> str:
+    result_path = f"s3://{bucket_name}"
+    if database_name is not None:
+        result_path += f"/{database_name}.db"
+
+    if table_name is not None:
+        result_path += f"/{table_name}"
+    return result_path
+
+
+def get_bucket_name():
+    return "metadata-py4j-zyiqin1"
+
+
+def get_s3_prefix():
+    return get_s3_path(get_bucket_name())
+
+
+def get_credential():
+    import boto3
+
+    boto3_session = boto3.Session()
+    credentials = boto3_session.get_credentials()
+    return credentials
+
+
+def get_glue_catalog():
+    from pyiceberg.catalog import load_catalog
+
+    credential = get_credential()
+    # Credentials are refreshable, so accessing your access key / secret key
+    # separately can lead to a race condition. Use this to get an actual matched
+    # set.
+    credential = credential.get_frozen_credentials()
+    access_key_id = credential.access_key
+    secret_access_key = credential.secret_key
+    session_token = credential.token
+    s3_path = get_s3_prefix()
+    glue_catalog = load_catalog(
+        "glue",
+        **{
+            "warehouse": s3_path,
+            "type": "glue",
+            "aws_access_key_id": access_key_id,
+            "aws_secret_access_key": secret_access_key,
+            "aws_session_token": session_token,
+            "region_name": "us-east-1",
+            "s3.access-key-id": access_key_id,
+            "s3.secret-access-key": secret_access_key,
+            "s3.session-token": session_token,
+            "s3.region": "us-east-1",
+        },
+    )
+
+    return glue_catalog
+
+
+def load_table(catalog, table_name):
+    loaded_table = catalog.load_table(table_name)
+    return loaded_table