deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/compute/converter/steps/convert.py

@@ -8,24 +8,33 @@ import ray
 import logging
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.steps.dedupe import dedupe_data_files
-from deltacat.compute.converter.utils.s3u import upload_table_with_retry
+from deltacat.compute.converter.utils.io import write_sliced_table
 from deltacat.compute.converter.utils.io import (
     download_data_table_and_append_iceberg_columns,
 )
 from deltacat.compute.converter.utils.converter_session_utils import (
     partition_value_record_to_partition_value_string,
+    sort_data_files_maintaining_order,
 )
 from deltacat.compute.converter.pyiceberg.overrides import (
     parquet_files_dict_to_iceberg_data_files,
 )
 from deltacat.compute.converter.model.convert_result import ConvertResult
+from pyiceberg.manifest import DataFileContent
 from deltacat import logs
+from fsspec import AbstractFileSystem
+from typing import List, Dict, Tuple, Optional, Any
+from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
+from deltacat.compute.converter.model.convert_input_files import (
+    DataFileList,
+    DataFileListGroup,
+)
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 @ray.remote
-def convert(convert_input: ConvertInput):
+def convert(convert_input: ConvertInput) -> ConvertResult:
     convert_input_files = convert_input.convert_input_files
     convert_task_index = convert_input.convert_task_index
     iceberg_table_warehouse_prefix = convert_input.iceberg_table_warehouse_prefix
@@ -39,8 +48,10 @@ def convert(convert_input: ConvertInput):
         convert_input.position_delete_for_multiple_data_files
     )
     max_parallel_data_file_download = convert_input.max_parallel_data_file_download
-    s3_file_system = convert_input.s3_file_system
+    filesystem = convert_input.filesystem
     s3_client_kwargs = convert_input.s3_client_kwargs
+    task_memory = convert_input.task_memory
+
     if not position_delete_for_multiple_data_files:
         raise NotImplementedError(
             f"Distributed file level position delete compute is not supported yet"
@@ -54,6 +65,7 @@ def convert(convert_input: ConvertInput):
     applicable_equality_delete_files = (
         convert_input_files.applicable_equality_delete_files
     )
+
     all_data_files_for_this_bucket = convert_input_files.all_data_files_for_dedupe
 
     partition_value_str = partition_value_record_to_partition_value_string(
@@ -69,11 +81,14 @@ def convert(convert_input: ConvertInput):
     iceberg_table_warehouse_prefix_with_partition = (
         f"{iceberg_table_warehouse_prefix}"
     )
+
     enforce_primary_key_uniqueness = convert_input.enforce_primary_key_uniqueness
     total_pos_delete_table = []
+    data_table_after_converting_equality_delete = []
     if applicable_equality_delete_files:
         (
-            pos_delete_after_converting_equality_delete
+            pos_delete_after_converting_equality_delete,
+            data_table_after_converting_equality_delete,
         ) = compute_pos_delete_with_limited_parallelism(
             data_files_list=applicable_data_files,
             identifier_columns=identifier_fields,
@@ -81,20 +96,35 @@ def convert(convert_input: ConvertInput):
             iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
             convert_task_index=convert_task_index,
             max_parallel_data_file_download=max_parallel_data_file_download,
-            s3_file_system=s3_file_system,
+            s3_file_system=filesystem,
             s3_client_kwargs=s3_client_kwargs,
         )
         if pos_delete_after_converting_equality_delete:
             total_pos_delete_table.append(pos_delete_after_converting_equality_delete)
 
     if enforce_primary_key_uniqueness:
+        data_files_downloaded_during_convert = []
+        if applicable_data_files:
+            for file_list in applicable_data_files:
+                for file in file_list:
+                    data_files_downloaded_during_convert.append(file)
+
         data_files_to_dedupe = get_additional_applicable_data_files(
             all_data_files=all_data_files_for_this_bucket,
-            data_files_downloaded=applicable_data_files,
+            data_files_downloaded=data_files_downloaded_during_convert,
+        )
+
+        dedupe_file_size_bytes = sum(
+            data_file.file_size_in_bytes for _, data_file in data_files_to_dedupe
+        )
+        logger.info(
+            f"Total on-disk size of files to dedupe: {dedupe_file_size_bytes} bytes"
        )
+
         logger.info(
             f"[Convert task {convert_task_index}]: Got {len(data_files_to_dedupe)} files to dedupe."
         )
+
         (
             pos_delete_after_dedupe,
             data_file_to_dedupe_record_count,
@@ -102,6 +132,7 @@ def convert(convert_input: ConvertInput):
         ) = dedupe_data_files(
             data_file_to_dedupe=data_files_to_dedupe,
             identifier_columns=identifier_fields,
+            remaining_data_table_after_convert=data_table_after_converting_equality_delete,
             merge_sort_column=sc._ORDERED_RECORD_IDX_COLUMN_NAME,
             s3_client_kwargs=s3_client_kwargs,
         )
@@ -118,11 +149,11 @@ def convert(convert_input: ConvertInput):
 
     to_be_added_files_list = []
     if total_pos_delete:
-        to_be_added_files_list_parquet = upload_table_with_retry(
+        to_be_added_files_list_parquet = write_sliced_table(
             table=total_pos_delete,
-            s3_url_prefix=iceberg_table_warehouse_prefix_with_partition,
-            s3_table_writer_kwargs={},
-            s3_file_system=s3_file_system,
+            base_path=iceberg_table_warehouse_prefix_with_partition,
+            table_writer_kwargs={},
+            filesystem=filesystem,
         )
 
         to_be_added_files_dict = defaultdict()
@@ -131,19 +162,39 @@ def convert(convert_input: ConvertInput):
         logger.info(
             f"[Convert task {convert_task_index}]: Produced {len(to_be_added_files_list_parquet)} position delete files."
         )
+        file_content_type = DataFileContent.POSITION_DELETES
         to_be_added_files_list = parquet_files_dict_to_iceberg_data_files(
             io=table_io,
             table_metadata=table_metadata,
             files_dict=to_be_added_files_dict,
+            file_content_type=file_content_type,
         )
 
     to_be_delete_files_dict = defaultdict()
+
     if applicable_equality_delete_files:
         to_be_delete_files_dict[partition_value] = [
             equality_delete_file[1]
-            for equality_delete_file in applicable_equality_delete_files
+            for equality_delete_list in applicable_equality_delete_files
+            for equality_delete_file in equality_delete_list
         ]
 
+    if not enforce_primary_key_uniqueness:
+        data_file_to_dedupe_record_count = 0
+        data_file_to_dedupe_size = 0
+
+    peak_memory_usage_bytes = (
+        get_current_process_peak_memory_usage_in_bytes()
+    )  # Convert KB to bytes
+    memory_usage_percentage = (peak_memory_usage_bytes / task_memory) * 100
+
+    logger.info(
+        f"[Convert task {convert_task_index}]: Memory usage stats - "
+        f"Peak memory usage: {peak_memory_usage_bytes} bytes, "
+        f"Allocated task memory: {convert_input.task_memory} bytes, "
+        f"Usage percentage: {memory_usage_percentage:.2f}%"
+    )
+
     convert_res = ConvertResult.of(
         convert_task_index=convert_task_index,
         to_be_added_files=to_be_added_files_list,
@@ -155,38 +206,73 @@ def convert(convert_input: ConvertInput):
         position_delete_on_disk_sizes=sum(
             file.file_size_in_bytes for file in to_be_added_files_list
         ),
+        input_data_files_on_disk_size=dedupe_file_size_bytes,
+        peak_memory_usage_bytes=peak_memory_usage_bytes,
+        memory_usage_percentage=memory_usage_percentage,
     )
     return convert_res
 
 
-def get_additional_applicable_data_files(all_data_files, data_files_downloaded):
-    data_file_to_dedupe = all_data_files
+def get_additional_applicable_data_files(
+    all_data_files: DataFileList,
+    data_files_downloaded: DataFileList,
+) -> DataFileList:
+    data_file_to_dedupe = []
+    assert len(set(all_data_files)) >= len(set(data_files_downloaded)), (
+        f"Length of all data files ({len(set(all_data_files))}) should never be less than "
+        f"the length of candidate equality delete data files ({len(set(data_files_downloaded))})"
+    )
    if data_files_downloaded:
-        data_file_to_dedupe = list(set(all_data_files) - set(data_files_downloaded))
+        # set1.difference(set2) returns elements in set1 but not in set2
+        data_file_to_dedupe.extend(
+            list(set(data_file_to_dedupe).difference(set(data_files_downloaded)))
+        )
+    else:
+        data_file_to_dedupe = all_data_files
     return data_file_to_dedupe
 
 
 def filter_rows_to_be_deleted(
-    equality_delete_table, data_file_table, identifier_columns
-):
-    identifier_column = identifier_columns[0]
+    equality_delete_table: Optional[pa.Table],
+    data_file_table: Optional[pa.Table],
+    identifier_columns: List[str],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    identifier_column = sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME
     if equality_delete_table and data_file_table:
         equality_deletes = pc.is_in(
             data_file_table[identifier_column],
             equality_delete_table[identifier_column],
         )
+        data_file_record_remaining = pc.invert(
+            pc.is_in(
+                data_file_table[identifier_column],
+                equality_delete_table[identifier_column],
+            )
+        )
         position_delete_table = data_file_table.filter(equality_deletes)
-    return position_delete_table
+        remaining_data_table = data_file_table.filter(data_file_record_remaining)
+
+        position_delete_table = position_delete_table.drop(
+            [sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME]
+        )
+        assert len(position_delete_table) + len(remaining_data_table) == len(
+            data_file_table
+        ), (
+            f"Expected undeleted data file record count plus length of pos deletes to match original data file record count of {len(data_file_table)}, "
+            f"but found {len(position_delete_table)} pos deletes + {len(remaining_data_table)} equality deletes."
+        )
+
+    return position_delete_table, remaining_data_table
 
 
 def compute_pos_delete_converting_equality_deletes(
-    equality_delete_table,
-    data_file_table,
-    identifier_columns,
-    iceberg_table_warehouse_prefix_with_partition,
-    s3_file_system,
-):
-    new_position_delete_table = filter_rows_to_be_deleted(
+    equality_delete_table: Optional[pa.Table],
+    data_file_table: Optional[pa.Table],
+    identifier_columns: List[str],
+    iceberg_table_warehouse_prefix_with_partition: str,
+    s3_file_system: Optional[AbstractFileSystem],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    new_position_delete_table, remaining_data_table = filter_rows_to_be_deleted(
         data_file_table=data_file_table,
         equality_delete_table=equality_delete_table,
         identifier_columns=identifier_columns,
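
The split that the updated `filter_rows_to_be_deleted` performs can be reproduced on a toy table with the same PyArrow compute calls. This is a minimal sketch for illustration only; the table contents and the `pk_hash` column name are made up, not taken from deltacat:

```python
# Minimal sketch of the is_in / invert split used above, on a hypothetical table.
import pyarrow as pa
import pyarrow.compute as pc

data_file_table = pa.table({"pk_hash": ["a", "b", "c", "d"], "pos": [0, 1, 2, 3]})
equality_delete_table = pa.table({"pk_hash": ["b", "d"]})

# Rows whose identifier hash appears in the equality deletes become position deletes;
# the inverted mask keeps the rows that survive.
deleted_mask = pc.is_in(data_file_table["pk_hash"], equality_delete_table["pk_hash"])
position_delete_table = data_file_table.filter(deleted_mask)             # rows b, d
remaining_data_table = data_file_table.filter(pc.invert(deleted_mask))   # rows a, c

# Every input row lands in exactly one of the two outputs.
assert len(position_delete_table) + len(remaining_data_table) == len(data_file_table)
```
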
@@ -195,34 +281,46 @@ def compute_pos_delete_converting_equality_deletes(
         logger.info(
             f"Length of position delete table after converting from equality deletes:{len(new_position_delete_table)}"
         )
+        return new_position_delete_table, remaining_data_table
+    elif not remaining_data_table:
+        return None, None
     else:
-        return None
-    return new_position_delete_table
+        return None, remaining_data_table
 
 
 def compute_pos_delete_with_limited_parallelism(
-    data_files_list,
-    identifier_columns,
-    equality_delete_files_list,
-    iceberg_table_warehouse_prefix_with_partition,
-    convert_task_index,
-    max_parallel_data_file_download,
-    s3_file_system,
-    s3_client_kwargs,
-):
+    data_files_list: DataFileListGroup,
+    identifier_columns: List[str],
+    equality_delete_files_list: DataFileListGroup,
+    iceberg_table_warehouse_prefix_with_partition: str,
+    convert_task_index: int,
+    max_parallel_data_file_download: int,
+    s3_file_system: Optional[AbstractFileSystem],
+    s3_client_kwargs: Optional[Dict[str, Any]],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    assert len(data_files_list) == len(equality_delete_files_list), (
+        f"Number of lists of data files should equal to number of list of equality delete files, "
+        f"But got {len(data_files_list)} data files lists vs {len(equality_delete_files_list)}."
+    )
+
+    new_pos_delete_table_total = []
     for data_files, equality_delete_files in zip(
         data_files_list, equality_delete_files_list
     ):
         data_table_total = []
+
+        # Sort data files by file sequence number first, then file path to
+        # make sure files having same sequence number are deterministically sorted
+        data_files = sort_data_files_maintaining_order(data_files=data_files)
+
         for data_file in data_files:
             data_table = download_data_table_and_append_iceberg_columns(
-                data_files=data_file[1],
+                file=data_file[1],
                 columns_to_download=identifier_columns,
                 additional_columns_to_append=[
                     sc._FILE_PATH_COLUMN_NAME,
                     sc._ORDERED_RECORD_IDX_COLUMN_NAME,
                 ],
-                sequence_number=data_file[0],
                 s3_client_kwargs=s3_client_kwargs,
             )
             data_table_total.append(data_table)
@@ -231,29 +329,38 @@ def compute_pos_delete_with_limited_parallelism(
         equality_delete_table_total = []
         for equality_delete in equality_delete_files:
             equality_delete_table = download_data_table_and_append_iceberg_columns(
-                data_files=equality_delete[1],
+                file=equality_delete[1],
                 columns_to_download=identifier_columns,
                 s3_client_kwargs=s3_client_kwargs,
             )
             equality_delete_table_total.append(equality_delete_table)
         equality_delete_table_total = pa.concat_tables(equality_delete_table_total)
 
-        new_pos_delete_table = compute_pos_delete_converting_equality_deletes(
-            equality_delete_table=equality_delete_table_total,
-            data_file_table=data_table_total,
-            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
-            identifier_columns=identifier_columns,
-            s3_file_system=s3_file_system,
-            s3_client_kwargs=s3_client_kwargs,
-        )
+        (
+            new_pos_delete_table,
+            remaining_data_table,
+        ) = compute_pos_delete_converting_equality_deletes(
+            equality_delete_table=equality_delete_table_total,
+            data_file_table=data_table_total,
+            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
+            identifier_columns=identifier_columns,
+            s3_file_system=s3_file_system,
+        )
+        new_pos_delete_table_total.append(new_pos_delete_table)
+
+    if new_pos_delete_table_total:
+        new_pos_delete_table_total = pa.concat_tables(new_pos_delete_table_total)
 
     logger.info(
         f"[Convert task {convert_task_index}]: Find deletes got {len(data_table_total)} data table records, "
         f"{len(equality_delete_table_total)} equality deletes as input, "
-        f"Produced {len(new_pos_delete_table)} position deletes based off find deletes input."
+        f"Produced {len(new_pos_delete_table_total)} position deletes based off find deletes input."
    )
 
-    if not new_pos_delete_table:
+    if not new_pos_delete_table_total:
         logger.info("No records deleted based on equality delete convertion")
 
-    return new_pos_delete_table
+    if not remaining_data_table:
+        logger.info("No data table remaining after converting equality deletes")
+
+    return new_pos_delete_table_total, remaining_data_table
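
The new memory accounting at the end of `convert()` compares the process's peak RSS against the memory the Ray task was scheduled with. The following is a rough standalone sketch of that pattern using only the standard library instead of `deltacat.utils.resources`; the 4 GiB task budget is a made-up figure and `resource` is POSIX-only:

```python
# Approximation of the peak-memory check added to convert(); ru_maxrss is reported
# in KB on Linux and in bytes on macOS, hence the platform branch below.
import resource
import sys

def peak_memory_usage_in_bytes() -> int:
    rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    return rss if sys.platform == "darwin" else rss * 1024

task_memory = 4 * 1024**3  # hypothetical memory allocated to the Ray task
peak = peak_memory_usage_in_bytes()
print(
    f"Peak memory usage: {peak} bytes, "
    f"Allocated task memory: {task_memory} bytes, "
    f"Usage percentage: {peak / task_memory * 100:.2f}%"
)
```
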
deltacat/compute/converter/steps/dedupe.py

@@ -4,25 +4,33 @@ import deltacat.compute.converter.utils.iceberg_columns as sc
 from deltacat.compute.converter.utils.io import (
     download_data_table_and_append_iceberg_columns,
 )
+from deltacat.compute.converter.utils.converter_session_utils import (
+    sort_data_files_maintaining_order,
+)
 import logging
 from deltacat import logs
+from typing import List, Dict, Tuple, Optional, Any
+from pyiceberg.manifest import DataFile
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def dedupe_data_files(
-    data_file_to_dedupe,
-    identifier_columns,
-    merge_sort_column,
-    s3_client_kwargs,
-):
+    data_file_to_dedupe: List[Tuple[int, DataFile]],
+    identifier_columns: List[str],
+    remaining_data_table_after_convert: Optional[pa.Table],
+    merge_sort_column: str,
+    s3_client_kwargs: Optional[Dict[str, Any]],
+) -> Tuple[pa.Table, int, int]:
     data_file_table = []
+    if remaining_data_table_after_convert:
+        data_file_table.append(remaining_data_table_after_convert)
 
+    data_file_to_dedupe = sort_data_files_maintaining_order(
+        data_files=data_file_to_dedupe
+    )
     downloaded_data_file_record_count = 0
-    # Sort data files by file sequence number first
-    data_file_to_dedupe = sorted(data_file_to_dedupe, key=lambda f: f[0])
     for file_tuple in data_file_to_dedupe:
-        sequence_number = file_tuple[0]
         data_file = file_tuple[1]
         data_file_to_dedupe_table = download_data_table_and_append_iceberg_columns(
             file=data_file,
@@ -31,17 +39,22 @@ def dedupe_data_files(
                 sc._FILE_PATH_COLUMN_NAME,
                 sc._ORDERED_RECORD_IDX_COLUMN_NAME,
             ],
-            sequence_number=sequence_number,
             s3_client_kwargs=s3_client_kwargs,
         )
+        logger.info(
+            f"Length of downloaded data file table: {len(data_file_to_dedupe_table)}"
+        )
         downloaded_data_file_record_count += len(data_file_to_dedupe_table)
         data_file_table.append(data_file_to_dedupe_table)
 
     final_data_to_dedupe = pa.concat_tables(data_file_table)
 
-    assert len(final_data_to_dedupe) == downloaded_data_file_record_count, (
+    dedupe_input_record_count = downloaded_data_file_record_count
+    if remaining_data_table_after_convert:
+        dedupe_input_record_count += len(remaining_data_table_after_convert)
+    assert len(final_data_to_dedupe) == dedupe_input_record_count, (
         f"Mismatch record count while performing table concat, Got {len(final_data_to_dedupe)} in final table, "
-        f"while input table length is: {downloaded_data_file_record_count}"
+        f"while input table length is: {dedupe_input_record_count}"
     )
 
     logger.info(f"Length of pyarrow table to dedupe:{len(final_data_to_dedupe)}")
deltacat/compute/converter/utils/convert_task_options.py

@@ -1,20 +1,27 @@
-from typing import Optional, Dict
+from typing import Optional, Dict, List, Tuple, Any
 from deltacat.exceptions import RetryableError
+from pyiceberg.manifest import DataFile
+from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
 
-AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 80
+AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 160
 AVERAGE_POS_COLUMN_SIZE_BYTES = 4
 XXHASH_BYTE_PER_RECORD = 8
 MEMORY_BUFFER_RATE = 2
-# TODO: Add audit info to check this number in practice
 # Worst case 2 as no duplicates exists across all pk
 PYARROW_AGGREGATE_MEMORY_MULTIPLIER = 2
+# Observed base memory usage at the beginning of each worker process
+BASE_MEMORY_BUFFER = 0.3 * 1024 * 1024 * 1024
 
 
-def estimate_fixed_hash_columns(hash_value_size_bytes_per_record, total_record_count):
+def estimate_fixed_hash_columns(
+    hash_value_size_bytes_per_record: int, total_record_count: int
+) -> int:
     return hash_value_size_bytes_per_record * total_record_count
 
 
-def get_total_record_from_iceberg_files(iceberg_files_list):
+def get_total_record_from_iceberg_files(
+    iceberg_files_list: List[Tuple[int, DataFile]]
+) -> int:
     total_record_count = 0
     # file are in form of tuple (sequence_number, DataFile)
     total_record_count += sum(file[1].record_count for file in iceberg_files_list)
@@ -22,8 +29,8 @@ def get_total_record_from_iceberg_files(iceberg_files_list):
 
 
 def estimate_iceberg_pos_delete_additional_columns(
-    include_columns, num_of_record_count
-):
+    include_columns: List[str], num_of_record_count: int
+) -> int:
     total_additional_columns_sizes = 0
     if "file_path" in include_columns:
         total_additional_columns_sizes += (
@@ -36,7 +43,10 @@
     return total_additional_columns_sizes
 
 
-def estimate_convert_remote_option_resources(data_files, equality_delete_files):
+def estimate_convert_remote_option_resources(
+    data_files: List[Tuple[int, DataFile]],
+    equality_delete_files: List[Tuple[int, DataFile]],
+) -> float:
     data_file_record_count = get_total_record_from_iceberg_files(data_files)
     equality_delete_record_count = get_total_record_from_iceberg_files(
         equality_delete_files
@@ -53,9 +63,9 @@ def estimate_convert_remote_option_resources(data_files, equality_delete_files):
 
 def _get_task_options(
     memory: float,
-    ray_custom_resources: Optional[Dict] = None,
+    ray_custom_resources: Optional[Dict[str, Any]] = None,
     scheduling_strategy: str = "SPREAD",
-) -> Dict:
+) -> Dict[str, Any]:
 
     # NOTE: With DEFAULT scheduling strategy in Ray 2.20.0, autoscaler does
     # not spin up enough nodes fast and hence we see only approximately
@@ -80,7 +90,9 @@ def _get_task_options(
     return task_opts
 
 
-def estimate_dedupe_memory(all_data_files_for_dedupe):
+def estimate_dedupe_memory(
+    all_data_files_for_dedupe: List[Tuple[int, DataFile]]
+) -> float:
     dedupe_record_count = get_total_record_from_iceberg_files(all_data_files_for_dedupe)
     produced_pos_memory_required = estimate_iceberg_pos_delete_additional_columns(
         ["file_path", "pos"], dedupe_record_count
@@ -95,13 +107,16 @@
     return memory_with_buffer
 
 
-def convert_resource_options_provider(index, convert_input_files):
+def convert_resource_options_provider(
+    index: int, convert_input_files: ConvertInputFiles
+) -> Dict[str, Any]:
     applicable_data_files = convert_input_files.applicable_data_files
     applicable_equality_delete_files = (
         convert_input_files.applicable_equality_delete_files
     )
     all_data_files_for_dedupe = convert_input_files.all_data_files_for_dedupe
     total_memory_required = 0
+    total_memory_required += BASE_MEMORY_BUFFER
     if applicable_data_files and applicable_equality_delete_files:
         memory_requirement_for_convert_equality_deletes = (
             estimate_convert_remote_option_resources(
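
For a sense of scale, the constants above translate into per-record byte costs that the estimators multiply out before adding the new 0.3 GiB worker baseline. The arithmetic below is illustrative only: the 10M-record input is hypothetical, and the real `estimate_dedupe_memory` combines terms that are not shown in this hunk:

```python
# Back-of-envelope use of the constants defined above; not the library's exact formula.
AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 160
AVERAGE_POS_COLUMN_SIZE_BYTES = 4
XXHASH_BYTE_PER_RECORD = 8
MEMORY_BUFFER_RATE = 2
BASE_MEMORY_BUFFER = 0.3 * 1024 * 1024 * 1024  # ~322 MiB observed worker baseline

records = 10_000_000  # hypothetical record count across the files to dedupe
pos_delete_bytes = records * (
    AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES + AVERAGE_POS_COLUMN_SIZE_BYTES
)
hash_bytes = records * XXHASH_BYTE_PER_RECORD
rough_estimate = BASE_MEMORY_BUFFER + (pos_delete_bytes + hash_bytes) * MEMORY_BUFFER_RATE
print(f"Rough task memory estimate: {rough_estimate / 1024**3:.2f} GiB")
```
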