deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/compute/converter/steps/convert.py

@@ -8,24 +8,33 @@ import ray
 import logging
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.steps.dedupe import dedupe_data_files
-from deltacat.compute.converter.utils.
+from deltacat.compute.converter.utils.io import write_sliced_table
 from deltacat.compute.converter.utils.io import (
     download_data_table_and_append_iceberg_columns,
 )
 from deltacat.compute.converter.utils.converter_session_utils import (
     partition_value_record_to_partition_value_string,
+    sort_data_files_maintaining_order,
 )
 from deltacat.compute.converter.pyiceberg.overrides import (
     parquet_files_dict_to_iceberg_data_files,
 )
 from deltacat.compute.converter.model.convert_result import ConvertResult
+from pyiceberg.manifest import DataFileContent
 from deltacat import logs
+from fsspec import AbstractFileSystem
+from typing import List, Dict, Tuple, Optional, Any
+from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
+from deltacat.compute.converter.model.convert_input_files import (
+    DataFileList,
+    DataFileListGroup,
+)
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 @ray.remote
-def convert(convert_input: ConvertInput):
+def convert(convert_input: ConvertInput) -> ConvertResult:
     convert_input_files = convert_input.convert_input_files
     convert_task_index = convert_input.convert_task_index
     iceberg_table_warehouse_prefix = convert_input.iceberg_table_warehouse_prefix
@@ -39,8 +48,10 @@ def convert(convert_input: ConvertInput):
         convert_input.position_delete_for_multiple_data_files
     )
     max_parallel_data_file_download = convert_input.max_parallel_data_file_download
-
+    filesystem = convert_input.filesystem
     s3_client_kwargs = convert_input.s3_client_kwargs
+    task_memory = convert_input.task_memory
+
     if not position_delete_for_multiple_data_files:
         raise NotImplementedError(
             f"Distributed file level position delete compute is not supported yet"
@@ -54,6 +65,7 @@ def convert(convert_input: ConvertInput):
     applicable_equality_delete_files = (
         convert_input_files.applicable_equality_delete_files
     )
+
     all_data_files_for_this_bucket = convert_input_files.all_data_files_for_dedupe
 
     partition_value_str = partition_value_record_to_partition_value_string(
@@ -69,11 +81,14 @@ def convert(convert_input: ConvertInput):
     iceberg_table_warehouse_prefix_with_partition = (
         f"{iceberg_table_warehouse_prefix}"
     )
+
     enforce_primary_key_uniqueness = convert_input.enforce_primary_key_uniqueness
     total_pos_delete_table = []
+    data_table_after_converting_equality_delete = []
     if applicable_equality_delete_files:
         (
-            pos_delete_after_converting_equality_delete
+            pos_delete_after_converting_equality_delete,
+            data_table_after_converting_equality_delete,
         ) = compute_pos_delete_with_limited_parallelism(
             data_files_list=applicable_data_files,
             identifier_columns=identifier_fields,
@@ -81,20 +96,35 @@ def convert(convert_input: ConvertInput):
             iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
             convert_task_index=convert_task_index,
             max_parallel_data_file_download=max_parallel_data_file_download,
-            s3_file_system=
+            s3_file_system=filesystem,
             s3_client_kwargs=s3_client_kwargs,
         )
         if pos_delete_after_converting_equality_delete:
             total_pos_delete_table.append(pos_delete_after_converting_equality_delete)
 
     if enforce_primary_key_uniqueness:
+        data_files_downloaded_during_convert = []
+        if applicable_data_files:
+            for file_list in applicable_data_files:
+                for file in file_list:
+                    data_files_downloaded_during_convert.append(file)
+
         data_files_to_dedupe = get_additional_applicable_data_files(
             all_data_files=all_data_files_for_this_bucket,
-            data_files_downloaded=
+            data_files_downloaded=data_files_downloaded_during_convert,
+        )
+
+        dedupe_file_size_bytes = sum(
+            data_file.file_size_in_bytes for _, data_file in data_files_to_dedupe
+        )
+        logger.info(
+            f"Total on-disk size of files to dedupe: {dedupe_file_size_bytes} bytes"
         )
+
         logger.info(
             f"[Convert task {convert_task_index}]: Got {len(data_files_to_dedupe)} files to dedupe."
         )
+
         (
             pos_delete_after_dedupe,
             data_file_to_dedupe_record_count,
@@ -102,6 +132,7 @@ def convert(convert_input: ConvertInput):
         ) = dedupe_data_files(
             data_file_to_dedupe=data_files_to_dedupe,
             identifier_columns=identifier_fields,
+            remaining_data_table_after_convert=data_table_after_converting_equality_delete,
             merge_sort_column=sc._ORDERED_RECORD_IDX_COLUMN_NAME,
             s3_client_kwargs=s3_client_kwargs,
         )
@@ -118,11 +149,11 @@ def convert(convert_input: ConvertInput):
 
     to_be_added_files_list = []
     if total_pos_delete:
-        to_be_added_files_list_parquet =
+        to_be_added_files_list_parquet = write_sliced_table(
             table=total_pos_delete,
-
-
-
+            base_path=iceberg_table_warehouse_prefix_with_partition,
+            table_writer_kwargs={},
+            filesystem=filesystem,
         )
 
     to_be_added_files_dict = defaultdict()
@@ -131,19 +162,39 @@ def convert(convert_input: ConvertInput):
     logger.info(
         f"[Convert task {convert_task_index}]: Produced {len(to_be_added_files_list_parquet)} position delete files."
     )
+    file_content_type = DataFileContent.POSITION_DELETES
     to_be_added_files_list = parquet_files_dict_to_iceberg_data_files(
         io=table_io,
         table_metadata=table_metadata,
         files_dict=to_be_added_files_dict,
+        file_content_type=file_content_type,
     )
 
     to_be_delete_files_dict = defaultdict()
+
     if applicable_equality_delete_files:
         to_be_delete_files_dict[partition_value] = [
             equality_delete_file[1]
-            for
+            for equality_delete_list in applicable_equality_delete_files
+            for equality_delete_file in equality_delete_list
         ]
 
+    if not enforce_primary_key_uniqueness:
+        data_file_to_dedupe_record_count = 0
+        data_file_to_dedupe_size = 0
+
+    peak_memory_usage_bytes = (
+        get_current_process_peak_memory_usage_in_bytes()
+    )  # Convert KB to bytes
+    memory_usage_percentage = (peak_memory_usage_bytes / task_memory) * 100
+
+    logger.info(
+        f"[Convert task {convert_task_index}]: Memory usage stats - "
+        f"Peak memory usage: {peak_memory_usage_bytes} bytes, "
+        f"Allocated task memory: {convert_input.task_memory} bytes, "
+        f"Usage percentage: {memory_usage_percentage:.2f}%"
+    )
+
     convert_res = ConvertResult.of(
         convert_task_index=convert_task_index,
         to_be_added_files=to_be_added_files_list,
@@ -155,38 +206,73 @@ def convert(convert_input: ConvertInput):
         position_delete_on_disk_sizes=sum(
             file.file_size_in_bytes for file in to_be_added_files_list
         ),
+        input_data_files_on_disk_size=dedupe_file_size_bytes,
+        peak_memory_usage_bytes=peak_memory_usage_bytes,
+        memory_usage_percentage=memory_usage_percentage,
     )
     return convert_res
 
 
-def get_additional_applicable_data_files(
-
+def get_additional_applicable_data_files(
+    all_data_files: DataFileList,
+    data_files_downloaded: DataFileList,
+) -> DataFileList:
+    data_file_to_dedupe = []
+    assert len(set(all_data_files)) >= len(set(data_files_downloaded)), (
+        f"Length of all data files ({len(set(all_data_files))}) should never be less than "
+        f"the length of candidate equality delete data files ({len(set(data_files_downloaded))})"
+    )
     if data_files_downloaded:
-
+        # set1.difference(set2) returns elements in set1 but not in set2
+        data_file_to_dedupe.extend(
+            list(set(data_file_to_dedupe).difference(set(data_files_downloaded)))
+        )
+    else:
+        data_file_to_dedupe = all_data_files
     return data_file_to_dedupe
 
 
 def filter_rows_to_be_deleted(
-    equality_delete_table
-
-
+    equality_delete_table: Optional[pa.Table],
+    data_file_table: Optional[pa.Table],
+    identifier_columns: List[str],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    identifier_column = sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME
     if equality_delete_table and data_file_table:
         equality_deletes = pc.is_in(
             data_file_table[identifier_column],
             equality_delete_table[identifier_column],
         )
+        data_file_record_remaining = pc.invert(
+            pc.is_in(
+                data_file_table[identifier_column],
+                equality_delete_table[identifier_column],
+            )
+        )
         position_delete_table = data_file_table.filter(equality_deletes)
-
+        remaining_data_table = data_file_table.filter(data_file_record_remaining)
+
+        position_delete_table = position_delete_table.drop(
+            [sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME]
+        )
+        assert len(position_delete_table) + len(remaining_data_table) == len(
+            data_file_table
+        ), (
+            f"Expected undeleted data file record count plus length of pos deletes to match original data file record count of {len(data_file_table)}, "
+            f"but found {len(position_delete_table)} pos deletes + {len(remaining_data_table)} equality deletes."
+        )
+
+        return position_delete_table, remaining_data_table
 
 
 def compute_pos_delete_converting_equality_deletes(
-    equality_delete_table,
-    data_file_table,
-    identifier_columns,
-    iceberg_table_warehouse_prefix_with_partition,
-    s3_file_system,
-):
-    new_position_delete_table = filter_rows_to_be_deleted(
+    equality_delete_table: Optional[pa.Table],
+    data_file_table: Optional[pa.Table],
+    identifier_columns: List[str],
+    iceberg_table_warehouse_prefix_with_partition: str,
+    s3_file_system: Optional[AbstractFileSystem],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    new_position_delete_table, remaining_data_table = filter_rows_to_be_deleted(
         data_file_table=data_file_table,
        equality_delete_table=equality_delete_table,
        identifier_columns=identifier_columns,
@@ -195,34 +281,46 @@ def compute_pos_delete_converting_equality_deletes(
        logger.info(
            f"Length of position delete table after converting from equality deletes:{len(new_position_delete_table)}"
        )
+        return new_position_delete_table, remaining_data_table
+    elif not remaining_data_table:
+        return None, None
     else:
-        return None
-    return new_position_delete_table
+        return None, remaining_data_table
 
 
 def compute_pos_delete_with_limited_parallelism(
-    data_files_list,
-    identifier_columns,
-    equality_delete_files_list,
-    iceberg_table_warehouse_prefix_with_partition,
-    convert_task_index,
-    max_parallel_data_file_download,
-    s3_file_system,
-    s3_client_kwargs,
-):
+    data_files_list: DataFileListGroup,
+    identifier_columns: List[str],
+    equality_delete_files_list: DataFileListGroup,
+    iceberg_table_warehouse_prefix_with_partition: str,
+    convert_task_index: int,
+    max_parallel_data_file_download: int,
+    s3_file_system: Optional[AbstractFileSystem],
+    s3_client_kwargs: Optional[Dict[str, Any]],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    assert len(data_files_list) == len(equality_delete_files_list), (
+        f"Number of lists of data files should equal to number of list of equality delete files, "
+        f"But got {len(data_files_list)} data files lists vs {len(equality_delete_files_list)}."
+    )
+
+    new_pos_delete_table_total = []
     for data_files, equality_delete_files in zip(
         data_files_list, equality_delete_files_list
     ):
         data_table_total = []
+
+        # Sort data files by file sequence number first, then file path to
+        # make sure files having same sequence number are deterministically sorted
+        data_files = sort_data_files_maintaining_order(data_files=data_files)
+
         for data_file in data_files:
             data_table = download_data_table_and_append_iceberg_columns(
-
+                file=data_file[1],
                 columns_to_download=identifier_columns,
                 additional_columns_to_append=[
                     sc._FILE_PATH_COLUMN_NAME,
                     sc._ORDERED_RECORD_IDX_COLUMN_NAME,
                 ],
-                sequence_number=data_file[0],
                 s3_client_kwargs=s3_client_kwargs,
             )
             data_table_total.append(data_table)
@@ -231,29 +329,38 @@ def compute_pos_delete_with_limited_parallelism(
         equality_delete_table_total = []
         for equality_delete in equality_delete_files:
             equality_delete_table = download_data_table_and_append_iceberg_columns(
-
+                file=equality_delete[1],
                 columns_to_download=identifier_columns,
                 s3_client_kwargs=s3_client_kwargs,
             )
             equality_delete_table_total.append(equality_delete_table)
         equality_delete_table_total = pa.concat_tables(equality_delete_table_total)
 
-
-
-
-
-
-
-
-
+        (
+            new_pos_delete_table,
+            remaining_data_table,
+        ) = compute_pos_delete_converting_equality_deletes(
+            equality_delete_table=equality_delete_table_total,
+            data_file_table=data_table_total,
+            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
+            identifier_columns=identifier_columns,
+            s3_file_system=s3_file_system,
+        )
+        new_pos_delete_table_total.append(new_pos_delete_table)
+
+    if new_pos_delete_table_total:
+        new_pos_delete_table_total = pa.concat_tables(new_pos_delete_table_total)
 
     logger.info(
         f"[Convert task {convert_task_index}]: Find deletes got {len(data_table_total)} data table records, "
         f"{len(equality_delete_table_total)} equality deletes as input, "
-        f"Produced {len(
+        f"Produced {len(new_pos_delete_table_total)} position deletes based off find deletes input."
     )
 
-    if not
+    if not new_pos_delete_table_total:
         logger.info("No records deleted based on equality delete convertion")
 
-
+    if not remaining_data_table:
+        logger.info("No data table remaining after converting equality deletes")
+
+    return new_pos_delete_table_total, remaining_data_table
deltacat/compute/converter/steps/dedupe.py

@@ -4,25 +4,33 @@ import deltacat.compute.converter.utils.iceberg_columns as sc
 from deltacat.compute.converter.utils.io import (
     download_data_table_and_append_iceberg_columns,
 )
+from deltacat.compute.converter.utils.converter_session_utils import (
+    sort_data_files_maintaining_order,
+)
 import logging
 from deltacat import logs
+from typing import List, Dict, Tuple, Optional, Any
+from pyiceberg.manifest import DataFile
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def dedupe_data_files(
-    data_file_to_dedupe,
-    identifier_columns,
-
-
-
+    data_file_to_dedupe: List[Tuple[int, DataFile]],
+    identifier_columns: List[str],
+    remaining_data_table_after_convert: Optional[pa.Table],
+    merge_sort_column: str,
+    s3_client_kwargs: Optional[Dict[str, Any]],
+) -> Tuple[pa.Table, int, int]:
     data_file_table = []
+    if remaining_data_table_after_convert:
+        data_file_table.append(remaining_data_table_after_convert)
 
+    data_file_to_dedupe = sort_data_files_maintaining_order(
+        data_files=data_file_to_dedupe
+    )
     downloaded_data_file_record_count = 0
-    # Sort data files by file sequence number first
-    data_file_to_dedupe = sorted(data_file_to_dedupe, key=lambda f: f[0])
     for file_tuple in data_file_to_dedupe:
-        sequence_number = file_tuple[0]
         data_file = file_tuple[1]
         data_file_to_dedupe_table = download_data_table_and_append_iceberg_columns(
             file=data_file,
@@ -31,17 +39,22 @@ def dedupe_data_files(
                 sc._FILE_PATH_COLUMN_NAME,
                 sc._ORDERED_RECORD_IDX_COLUMN_NAME,
             ],
-            sequence_number=sequence_number,
             s3_client_kwargs=s3_client_kwargs,
         )
+        logger.info(
+            f"Length of downloaded data file table: {len(data_file_to_dedupe_table)}"
+        )
         downloaded_data_file_record_count += len(data_file_to_dedupe_table)
         data_file_table.append(data_file_to_dedupe_table)
 
     final_data_to_dedupe = pa.concat_tables(data_file_table)
 
-
+    dedupe_input_record_count = downloaded_data_file_record_count
+    if remaining_data_table_after_convert:
+        dedupe_input_record_count += len(remaining_data_table_after_convert)
+    assert len(final_data_to_dedupe) == dedupe_input_record_count, (
         f"Mismatch record count while performing table concat, Got {len(final_data_to_dedupe)} in final table, "
-        f"while input table length is: {
+        f"while input table length is: {dedupe_input_record_count}"
     )
 
     logger.info(f"Length of pyarrow table to dedupe:{len(final_data_to_dedupe)}")
deltacat/compute/converter/utils/convert_task_options.py

@@ -1,20 +1,27 @@
-from typing import Optional, Dict
+from typing import Optional, Dict, List, Tuple, Any
 from deltacat.exceptions import RetryableError
+from pyiceberg.manifest import DataFile
+from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
 
-AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES =
+AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 160
 AVERAGE_POS_COLUMN_SIZE_BYTES = 4
 XXHASH_BYTE_PER_RECORD = 8
 MEMORY_BUFFER_RATE = 2
-# TODO: Add audit info to check this number in practice
 # Worst case 2 as no duplicates exists across all pk
 PYARROW_AGGREGATE_MEMORY_MULTIPLIER = 2
+# Observed base memory usage at the beginning of each worker process
+BASE_MEMORY_BUFFER = 0.3 * 1024 * 1024 * 1024
 
 
-def estimate_fixed_hash_columns(
+def estimate_fixed_hash_columns(
+    hash_value_size_bytes_per_record: int, total_record_count: int
+) -> int:
     return hash_value_size_bytes_per_record * total_record_count
 
 
-def get_total_record_from_iceberg_files(
+def get_total_record_from_iceberg_files(
+    iceberg_files_list: List[Tuple[int, DataFile]]
+) -> int:
     total_record_count = 0
     # file are in form of tuple (sequence_number, DataFile)
     total_record_count += sum(file[1].record_count for file in iceberg_files_list)
@@ -22,8 +29,8 @@ def get_total_record_from_iceberg_files(iceberg_files_list):
 
 
 def estimate_iceberg_pos_delete_additional_columns(
-    include_columns, num_of_record_count
-):
+    include_columns: List[str], num_of_record_count: int
+) -> int:
     total_additional_columns_sizes = 0
     if "file_path" in include_columns:
         total_additional_columns_sizes += (
@@ -36,7 +43,10 @@ def estimate_iceberg_pos_delete_additional_columns(
     return total_additional_columns_sizes
 
 
-def estimate_convert_remote_option_resources(
+def estimate_convert_remote_option_resources(
+    data_files: List[Tuple[int, DataFile]],
+    equality_delete_files: List[Tuple[int, DataFile]],
+) -> float:
     data_file_record_count = get_total_record_from_iceberg_files(data_files)
     equality_delete_record_count = get_total_record_from_iceberg_files(
         equality_delete_files
@@ -53,9 +63,9 @@ def estimate_convert_remote_option_resources(data_files, equality_delete_files):
 
 def _get_task_options(
     memory: float,
-    ray_custom_resources: Optional[Dict] = None,
+    ray_custom_resources: Optional[Dict[str, Any]] = None,
     scheduling_strategy: str = "SPREAD",
-) -> Dict:
+) -> Dict[str, Any]:
 
     # NOTE: With DEFAULT scheduling strategy in Ray 2.20.0, autoscaler does
     # not spin up enough nodes fast and hence we see only approximately
@@ -80,7 +90,9 @@ def _get_task_options(
     return task_opts
 
 
-def estimate_dedupe_memory(
+def estimate_dedupe_memory(
+    all_data_files_for_dedupe: List[Tuple[int, DataFile]]
+) -> float:
     dedupe_record_count = get_total_record_from_iceberg_files(all_data_files_for_dedupe)
     produced_pos_memory_required = estimate_iceberg_pos_delete_additional_columns(
         ["file_path", "pos"], dedupe_record_count
@@ -95,13 +107,16 @@ def estimate_dedupe_memory(all_data_files_for_dedupe):
     return memory_with_buffer
 
 
-def convert_resource_options_provider(
+def convert_resource_options_provider(
+    index: int, convert_input_files: ConvertInputFiles
+) -> Dict[str, Any]:
     applicable_data_files = convert_input_files.applicable_data_files
     applicable_equality_delete_files = (
         convert_input_files.applicable_equality_delete_files
     )
     all_data_files_for_dedupe = convert_input_files.all_data_files_for_dedupe
     total_memory_required = 0
+    total_memory_required += BASE_MEMORY_BUFFER
     if applicable_data_files and applicable_equality_delete_files:
         memory_requirement_for_convert_equality_deletes = (
             estimate_convert_remote_option_resources(