deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (236)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/tests/_io/__init__.py +1 -0
  150. deltacat/tests/catalog/test_catalogs.py +324 -0
  151. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  152. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  153. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  154. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  155. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  156. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  157. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  158. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  159. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  160. deltacat/tests/compute/conftest.py +75 -0
  161. deltacat/tests/compute/converter/__init__.py +0 -0
  162. deltacat/tests/compute/converter/conftest.py +80 -0
  163. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  164. deltacat/tests/compute/converter/utils.py +123 -0
  165. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  166. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  167. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  168. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  169. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  170. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  171. deltacat/tests/compute/test_util_common.py +19 -12
  172. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  173. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  174. deltacat/tests/storage/__init__.py +0 -0
  175. deltacat/tests/storage/conftest.py +25 -0
  176. deltacat/tests/storage/main/__init__.py +0 -0
  177. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  178. deltacat/tests/storage/model/__init__.py +0 -0
  179. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  180. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  181. deltacat/tests/storage/model/test_schema.py +308 -0
  182. deltacat/tests/storage/model/test_shard.py +22 -0
  183. deltacat/tests/storage/model/test_table_version.py +110 -0
  184. deltacat/tests/storage/model/test_transaction.py +308 -0
  185. deltacat/tests/storage/rivulet/__init__.py +0 -0
  186. deltacat/tests/storage/rivulet/conftest.py +149 -0
  187. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  189. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  191. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  192. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  193. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  194. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  195. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  197. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  198. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  199. deltacat/tests/test_deltacat_api.py +39 -0
  200. deltacat/tests/test_utils/filesystem.py +14 -0
  201. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  202. deltacat/tests/test_utils/pyarrow.py +8 -15
  203. deltacat/tests/test_utils/storage.py +266 -3
  204. deltacat/tests/utils/test_daft.py +3 -3
  205. deltacat/tests/utils/test_pyarrow.py +0 -432
  206. deltacat/types/partial_download.py +1 -1
  207. deltacat/types/tables.py +1 -1
  208. deltacat/utils/export.py +59 -0
  209. deltacat/utils/filesystem.py +320 -0
  210. deltacat/utils/metafile_locator.py +73 -0
  211. deltacat/utils/pyarrow.py +36 -183
  212. deltacat-2.0.dist-info/METADATA +65 -0
  213. deltacat-2.0.dist-info/RECORD +347 -0
  214. deltacat/aws/redshift/__init__.py +0 -19
  215. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  216. deltacat/io/dataset.py +0 -73
  217. deltacat/io/read_api.py +0 -143
  218. deltacat/storage/model/delete_parameters.py +0 -40
  219. deltacat/storage/model/partition_spec.py +0 -71
  220. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  221. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  222. deltacat-1.1.36.dist-info/METADATA +0 -64
  223. deltacat-1.1.36.dist-info/RECORD +0 -219
  224. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  225. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  226. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  227. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  228. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  229. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  234. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  235. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/utils/task_options.py
@@ -1,16 +1,11 @@
  import logging
  from typing import Dict, Optional, List, Tuple, Any
  from deltacat import logs
- from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
- from deltacat.compute.compactor_v2.constants import (
-     AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
- )
  from deltacat.compute.compactor_v2.model.merge_file_group import (
      LocalMergeFileGroupsProvider,
  )
  from deltacat.storage import (
      Manifest,
-     ManifestEntry,
      interface as unimplemented_deltacat_storage,
  )
  from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
@@ -86,27 +81,16 @@ def _get_merge_task_options(
          and compacted_delta_manifest
          and round_completion_info.hb_index_to_entry_range
      ):
-         logger.debug_conditional(
-             f"[Merge task {index}]: Using previous compaction rounds to calculate merge memory: {round_completion_info.compacted_pyarrow_write_result}",
-             memory_logs_enabled,
-         )
-         previous_inflation: float = (
-             (
-                 round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
-                 / round_completion_info.compacted_pyarrow_write_result.file_bytes
-             )
-             if round_completion_info.compacted_pyarrow_write_result.file_bytes
-             else PYARROW_INFLATION_MULTIPLIER
+
+         previous_inflation = (
+             round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+             / round_completion_info.compacted_pyarrow_write_result.file_bytes
          )
          debug_memory_params["previous_inflation"] = previous_inflation

-         average_record_size: float = (
-             (
-                 round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
-                 / round_completion_info.compacted_pyarrow_write_result.records
-             )
-             if round_completion_info.compacted_pyarrow_write_result.records
-             else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
+         average_record_size = (
+             round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+             / round_completion_info.compacted_pyarrow_write_result.records
          )
          debug_memory_params["average_record_size"] = average_record_size

@@ -122,36 +106,31 @@ def _get_merge_task_options(
                  str(hb_idx)
              ]
              for entry_index in range(entry_start, entry_end):
-                 entry: ManifestEntry = compacted_delta_manifest.entries[entry_index]
-                 current_entry_size: float = (
-                     estimate_manifest_entry_size_bytes(
-                         entry=entry,
-                         operation_type=OperationType.PYARROW_DOWNLOAD,
-                         estimate_resources_params=estimate_resources_params,
-                     )
-                     or 0.0
+                 entry = compacted_delta_manifest.entries[entry_index]
+
+                 current_entry_size = estimate_manifest_entry_size_bytes(
+                     entry=entry,
+                     operation_type=OperationType.PYARROW_DOWNLOAD,
+                     estimate_resources_params=estimate_resources_params,
                  )
-                 current_entry_rows: int = (
-                     estimate_manifest_entry_num_rows(
-                         entry=entry,
-                         operation_type=OperationType.PYARROW_DOWNLOAD,
-                         estimate_resources_params=estimate_resources_params,
-                     )
-                     or 0
+                 current_entry_rows = estimate_manifest_entry_num_rows(
+                     entry=entry,
+                     operation_type=OperationType.PYARROW_DOWNLOAD,
+                     estimate_resources_params=estimate_resources_params,
                  )
-                 # NOTE: We can treat the current_entry_size and current_entry_rows as 0 as a None estimated entry size implies a 0 value
+
                  data_size += current_entry_size
                  num_rows += current_entry_rows
+
                  if primary_keys:
-                     pk_size: Optional[
-                         float
-                     ] = estimate_manifest_entry_column_size_bytes(
+                     pk_size = estimate_manifest_entry_column_size_bytes(
                          entry=entry,
                          columns=primary_keys,
                          operation_type=OperationType.PYARROW_DOWNLOAD,
                          estimate_resources_params=estimate_resources_params,
                      )
-                     if not pk_size:
+
+                     if pk_size is None:
                          pk_size_bytes += current_entry_size
                      else:
                          pk_size_bytes += pk_size
@@ -180,6 +159,7 @@ def _get_merge_task_options(
          f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
          memory_logs_enabled,
      )
+
      return _get_task_options(0.01, total_memory, ray_custom_resources)
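For context, the change above drops the 1.x fallbacks (PYARROW_INFLATION_MULTIPLIER and DEFAULT_AVERAGE_RECORD_SIZE_BYTES) and derives both ratios directly from the previous round's compacted PyArrow write result. A minimal sketch of that arithmetic, using made-up numbers that are not part of the diff:

    # Illustrative stand-in values; the real ones come from
    # round_completion_info.compacted_pyarrow_write_result.
    pyarrow_bytes = 2_000_000_000  # in-memory (PyArrow) size of the previous compacted output
    file_bytes = 500_000_000       # on-disk (Parquet) size of the previous compacted output
    records = 10_000_000           # record count of the previous compacted output

    previous_inflation = pyarrow_bytes / file_bytes   # 4.0: bytes in memory per byte on disk
    average_record_size = pyarrow_bytes / records     # 200.0: bytes in memory per record

    # Both values feed the per-merge-task memory estimate; note that the 2.0
    # code path assumes file_bytes and records are non-zero.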
deltacat/compute/converter/constants.py (new file)
@@ -0,0 +1,4 @@
+ DEFAULT_CONVERTER_TASK_MAX_PARALLELISM = 4096
+
+ # Safe limit ONLY considering CPU limit, typically 32 for a 8x-large worker
+ DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD = 30
deltacat/compute/converter/converter_session.py (new file)
@@ -0,0 +1,143 @@
+ # from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
+ from deltacat.utils.ray_utils.concurrency import (
+     invoke_parallel,
+     task_resource_options_provider,
+ )
+ import ray
+ import functools
+ from deltacat.compute.converter.utils.convert_task_options import (
+     convert_resource_options_provider,
+ )
+ import logging
+ from deltacat import logs
+ from deltacat.compute.converter.model.converter_session_params import (
+     ConverterSessionParams,
+ )
+
+
+ from deltacat.compute.converter.constants import DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
+ from deltacat.compute.converter.steps.convert import convert
+ from deltacat.compute.converter.model.convert_input import ConvertInput
+ from deltacat.compute.converter.pyiceberg.overrides import (
+     fetch_all_bucket_files,
+     parquet_files_dict_to_iceberg_data_files,
+ )
+ from deltacat.compute.converter.utils.converter_session_utils import (
+     construct_iceberg_table_prefix,
+ )
+ from deltacat.compute.converter.pyiceberg.update_snapshot_overrides import (
+     commit_replace_snapshot,
+     commit_append_snapshot,
+ )
+ from deltacat.compute.converter.pyiceberg.catalog import load_table
+ from deltacat.compute.converter.utils.converter_session_utils import (
+     group_all_files_to_each_bucket,
+ )
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def converter_session(params: ConverterSessionParams, **kwargs):
+     """
+     Convert equality delete to position delete.
+     Compute and memory heavy work from downloading equality delete table and compute position deletes
+     will be executed on Ray remote tasks.
+     """
+
+     catalog = params.catalog
+     table_name = params.iceberg_table_name
+     iceberg_table = load_table(catalog, table_name)
+     enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
+     data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(
+         iceberg_table
+     )
+     convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
+         data_file_dict=data_file_dict,
+         equality_delete_dict=equality_delete_dict,
+         pos_delete_dict=pos_delete_dict,
+     )
+     iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
+     iceberg_namespace = params.iceberg_namespace
+     iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
+         iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
+         table_name=table_name,
+         iceberg_namespace=iceberg_namespace,
+     )
+     merge_keys = params.merge_keys
+     # Using table identifier fields as merge keys if merge keys not provided
+     if not merge_keys:
+         identifier_fields_set = iceberg_table.schema().identifier_field_names()
+         identifier_fields = list(identifier_fields_set)
+     else:
+         identifier_fields = merge_keys
+     if len(identifier_fields) > 1:
+         raise NotImplementedError(
+             f"Multiple identifier fields lookup not supported yet."
+         )
+     convert_options_provider = functools.partial(
+         task_resource_options_provider,
+         resource_amount_provider=convert_resource_options_provider,
+     )
+
+     # TODO (zyiqin): max_parallel_data_file_download should be determined by memory requirement for each bucket.
+     # Specifically, for case when files for one bucket memory requirement exceed one worker node's memory limit, WITHOUT rebasing with larger hash bucket count,
+     # 1. We can control parallel files to download by adjusting max_parallel_data_file_download.
+     # 2. Implement two-layer converter tasks, with convert tasks to spin up child convert tasks.
+     # Note that approach 2 will ideally require shared object store to avoid download equality delete files * number of child tasks times.
+     max_parallel_data_file_download = DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
+
+     compact_small_files = params.compact_small_files
+     position_delete_for_multiple_data_files = (
+         params.position_delete_for_multiple_data_files
+     )
+     task_max_parallelism = params.task_max_parallelism
+
+     def convert_input_provider(index, item):
+         return {
+             "convert_input": ConvertInput.of(
+                 files_for_each_bucket=item,
+                 convert_task_index=index,
+                 iceberg_table_warehouse_prefix=iceberg_table_warehouse_prefix,
+                 identifier_fields=identifier_fields,
+                 compact_small_files=compact_small_files,
+                 enforce_primary_key_uniqueness=enforce_primary_key_uniqueness,
+                 position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
+                 max_parallel_data_file_download=max_parallel_data_file_download,
+             )
+         }
+
+     # Ray remote task: convert
+     # Assuming that memory consume by each bucket doesn't exceed one node's memory limit.
+     # TODO: Add split mechanism to split large buckets
+     convert_tasks_pending = invoke_parallel(
+         items=convert_input_files_for_all_buckets.items(),
+         ray_task=convert,
+         max_parallelism=task_max_parallelism,
+         options_provider=convert_options_provider,
+         kwargs_provider=convert_input_provider,
+     )
+     to_be_deleted_files_list = []
+     to_be_added_files_dict_list = []
+     convert_results = ray.get(convert_tasks_pending)
+     for convert_result in convert_results:
+         to_be_deleted_files_list.extend(convert_result[0].values())
+         to_be_added_files_dict_list.append(convert_result[1])
+
+     new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
+         io=iceberg_table.io,
+         table_metadata=iceberg_table.metadata,
+         files_dict_list=to_be_added_files_dict_list,
+     )
+
+     if not to_be_deleted_files_list:
+         commit_append_snapshot(
+             iceberg_table=iceberg_table,
+             new_position_delete_files=new_position_delete_files,
+         )
+     else:
+         commit_replace_snapshot(
+             iceberg_table=iceberg_table,
+             # equality_delete_files + data file that all rows are deleted
+             to_be_deleted_files_list=to_be_deleted_files_list,
+             new_position_delete_files=new_position_delete_files,
+         )
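A rough usage sketch for the new converter_session entry point, based only on the parameter names asserted by ConverterSessionParams below; the catalog helper is the Glue one added in deltacat/compute/converter/pyiceberg/catalog.py, and the table, namespace, and bucket names are hypothetical:

    from deltacat.compute.converter.converter_session import converter_session
    from deltacat.compute.converter.model.converter_session_params import (
        ConverterSessionParams,
    )
    from deltacat.compute.converter.pyiceberg.catalog import get_glue_catalog

    # Hypothetical identifiers; a real run needs an existing Iceberg table with
    # equality delete files to convert.
    params = ConverterSessionParams.of(
        {
            "catalog": get_glue_catalog(),
            "iceberg_table_name": "my_namespace.my_table",
            "iceberg_warehouse_bucket_name": "my-warehouse-bucket",
            "iceberg_namespace": "my_namespace",
            "merge_keys": ["order_id"],              # omitted -> table identifier fields are used
            "enforce_primary_key_uniqueness": True,  # defaults to False
        }
    )

    # Downloads equality deletes, computes position deletes on Ray tasks, then
    # commits either an append or a replace snapshot to the Iceberg table.
    converter_session(params)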
deltacat/compute/converter/model/convert_input.py (new file)
@@ -0,0 +1,69 @@
+ from __future__ import annotations
+ from typing import Dict, List
+ from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
+
+
+ class ConvertInput(Dict):
+     @staticmethod
+     def of(
+         convert_input_files,
+         convert_task_index,
+         iceberg_table_warehouse_prefix,
+         identifier_fields,
+         compact_small_files,
+         enforce_primary_key_uniqueness,
+         position_delete_for_multiple_data_files,
+         max_parallel_data_file_download,
+         s3_file_system,
+     ) -> ConvertInput:
+
+         result = ConvertInput()
+         result["convert_input_files"] = convert_input_files
+         result["convert_task_index"] = convert_task_index
+         result["identifier_fields"] = identifier_fields
+         result["iceberg_table_warehouse_prefix"] = iceberg_table_warehouse_prefix
+         result["compact_small_files"] = compact_small_files
+         result["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
+         result[
+             "position_delete_for_multiple_data_files"
+         ] = position_delete_for_multiple_data_files
+         result["max_parallel_data_file_download"] = max_parallel_data_file_download
+         result["s3_file_system"] = s3_file_system
+
+         return result
+
+     @property
+     def convert_input_files(self) -> ConvertInputFiles:
+         return self["convert_input_files"]
+
+     @property
+     def identifier_fields(self) -> List[str]:
+         return self["identifier_fields"]
+
+     @property
+     def convert_task_index(self) -> int:
+         return self["convert_task_index"]
+
+     @property
+     def iceberg_table_warehouse_prefix(self) -> str:
+         return self["iceberg_table_warehouse_prefix"]
+
+     @property
+     def compact_small_files(self) -> bool:
+         return self["compact_small_files"]
+
+     @property
+     def enforce_primary_key_uniqueness(self) -> bool:
+         return self["enforce_primary_key_uniqueness"]
+
+     @property
+     def position_delete_for_multiple_data_files(self) -> bool:
+         return self["position_delete_for_multiple_data_files"]
+
+     @property
+     def max_parallel_data_file_download(self) -> int:
+         return self["max_parallel_data_file_download"]
+
+     @property
+     def s3_file_system(self):
+         return self["s3_file_system"]
deltacat/compute/converter/model/convert_input_files.py (new file)
@@ -0,0 +1,61 @@
+ from __future__ import annotations
+ from typing import Dict
+
+
+ class ConvertInputFiles(Dict):
+     @staticmethod
+     def of(
+         partition_value,
+         all_data_files_for_dedupe=None,
+         applicable_data_files=None,
+         applicable_equality_delete_files=None,
+         existing_position_delete_files=None,
+     ) -> ConvertInputFiles:
+
+         result = ConvertInputFiles()
+         result["partition_value"] = partition_value
+         result["all_data_files_for_dedupe"] = all_data_files_for_dedupe
+         result["applicable_data_files"] = applicable_data_files
+         result["applicable_equality_delete_files"] = applicable_equality_delete_files
+         result["existing_position_delete_files"] = existing_position_delete_files
+         return result
+
+     @property
+     def partition_value(self):
+         return self["partition_value"]
+
+     @property
+     def all_data_files_for_dedupe(self):
+         return self["all_data_files_for_dedupe"]
+
+     @property
+     def applicable_data_files(self):
+         return self["applicable_data_files"]
+
+     @property
+     def applicable_equality_delete_files(self):
+         return self["applicable_equality_delete_files"]
+
+     @property
+     def existing_position_delete_files(self):
+         return self["existing_position_delete_files"]
+
+     @partition_value.setter
+     def partition_value(self, partition_value):
+         self["partition_value"] = partition_value
+
+     @all_data_files_for_dedupe.setter
+     def all_data_files_for_dedupe(self, all_data_files_for_dedupe):
+         self["all_data_files_for_dedupe"] = all_data_files_for_dedupe
+
+     @applicable_data_files.setter
+     def applicable_data_files(self, applicable_data_files):
+         self["applicable_data_files"] = applicable_data_files
+
+     @applicable_equality_delete_files.setter
+     def applicable_equality_delete_files(self, applicable_equality_delete_files):
+         self["applicable_equality_delete_files"] = applicable_equality_delete_files
+
+     @existing_position_delete_files.setter
+     def existing_position_delete_files(self, existing_position_delete_files):
+         self["existing_position_delete_files"] = existing_position_delete_files
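Both ConvertInput and ConvertInputFiles above follow DeltaCAT's dict-backed model pattern: of() packs keyword values into the dict, properties read them back, and setters write straight into the underlying dict. A small sketch with hypothetical placeholder values:

    from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles

    # Placeholder strings; in practice these entries are pyiceberg data/delete file
    # objects grouped per partition by group_all_files_to_each_bucket().
    bucket_files = ConvertInputFiles.of(
        partition_value="2024-01-01",
        applicable_data_files=["s3://example-bucket/data/part-0.parquet"],
        applicable_equality_delete_files=["s3://example-bucket/deletes/eq-0.parquet"],
    )

    assert bucket_files.partition_value == "2024-01-01"
    assert bucket_files.all_data_files_for_dedupe is None  # optional fields default to None

    # Setters mutate the dict in place.
    bucket_files.all_data_files_for_dedupe = ["s3://example-bucket/data/part-0.parquet"]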
deltacat/compute/converter/model/converter_session_params.py (new file)
@@ -0,0 +1,99 @@
+ from __future__ import annotations
+ from typing import Optional, Dict
+ from deltacat.compute.converter.constants import DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
+
+
+ class ConverterSessionParams(dict):
+     """
+     This class represents the parameters passed to convert_ (deltacat/compute/compactor/compaction_session.py)
+     """
+
+     @staticmethod
+     def of(params: Optional[Dict]) -> ConverterSessionParams:
+         params = {} if params is None else params
+         assert params.get("catalog") is not None, "catalog is a required arg"
+         assert (
+             params.get("iceberg_table_name") is not None
+         ), "iceberg_table_name is a required arg"
+         assert (
+             params.get("iceberg_warehouse_bucket_name") is not None
+         ), "iceberg_warehouse_bucket_name is a required arg"
+         assert (
+             params.get("iceberg_namespace") is not None
+         ), "iceberg_namespace is a required arg"
+         result = ConverterSessionParams(params)
+
+         result.enforce_primary_key_uniqueness = params.get(
+             "enforce_primary_key_uniqueness", False
+         )
+         result.compact_small_files = params.get("compact_small_files", False)
+
+         # For Iceberg v3 spec, option to produce delete vector that can establish 1:1 mapping with data files.
+         result.position_delete_for_multiple_data_files = params.get(
+             "position_delete_for_multiple_data_files", True
+         )
+         result.task_max_parallelism = params.get(
+             "task_max_parallelism", DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
+         )
+         result.merge_keys = params.get("merge_keys", None)
+         return result
+
+     @property
+     def catalog(self):
+         return self["catalog"]
+
+     @property
+     def iceberg_table_name(self) -> str:
+         return self["iceberg_table_name"]
+
+     @property
+     def iceberg_warehouse_bucket_name(self) -> str:
+         return self["iceberg_warehouse_bucket_name"]
+
+     @property
+     def iceberg_namespace(self) -> str:
+         return self["iceberg_namespace"]
+
+     @property
+     def enforce_primary_key_uniqueness(self) -> bool:
+         return self["enforce_primary_key_uniqueness"]
+
+     @enforce_primary_key_uniqueness.setter
+     def enforce_primary_key_uniqueness(self, enforce_primary_key_uniqueness) -> None:
+         self["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
+
+     @property
+     def compact_small_files(self) -> bool:
+         return self["compact_small_files"]
+
+     @compact_small_files.setter
+     def compact_small_files(self, compact_small_files) -> None:
+         self["compact_small_files"] = compact_small_files
+
+     @property
+     def position_delete_for_multiple_data_files(self) -> bool:
+         return self["position_delete_for_multiple_data_files"]
+
+     @position_delete_for_multiple_data_files.setter
+     def position_delete_for_multiple_data_files(
+         self, position_delete_for_multiple_data_files
+     ) -> None:
+         self[
+             "position_delete_for_multiple_data_files"
+         ] = position_delete_for_multiple_data_files
+
+     @property
+     def task_max_parallelism(self) -> str:
+         return self["task_max_parallelism"]
+
+     @task_max_parallelism.setter
+     def task_max_parallelism(self, task_max_parallelism) -> None:
+         self["task_max_parallelism"] = task_max_parallelism
+
+     @property
+     def merge_keys(self) -> str:
+         return self["merge_keys"]
+
+     @merge_keys.setter
+     def merge_keys(self, merge_keys) -> None:
+         self["merge_keys"] = merge_keys
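A short sketch of the validation and defaulting behavior implemented above; the catalog object and names are placeholders:

    from deltacat.compute.converter.model.converter_session_params import (
        ConverterSessionParams,
    )

    params = ConverterSessionParams.of(
        {
            "catalog": object(),  # placeholder; normally a pyiceberg catalog
            "iceberg_table_name": "analytics.orders",
            "iceberg_warehouse_bucket_name": "example-warehouse",
            "iceberg_namespace": "analytics",
        }
    )

    # Optional knobs fall back to their defaults.
    assert params.enforce_primary_key_uniqueness is False
    assert params.compact_small_files is False
    assert params.position_delete_for_multiple_data_files is True
    assert params.task_max_parallelism == 4096  # DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
    assert params.merge_keys is None

    # Missing required keys fail fast, e.g. omitting "iceberg_namespace" raises
    # AssertionError("iceberg_namespace is a required arg").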
deltacat/compute/converter/pyiceberg/__init__.py — file without changes
deltacat/compute/converter/pyiceberg/catalog.py (new file)
@@ -0,0 +1,75 @@
+ from typing import Optional
+
+
+ def load_catalog(iceberg_catalog_name, iceberg_catalog_properties):
+     catalog = load_catalog(
+         name=iceberg_catalog_name,
+         **iceberg_catalog_properties,
+     )
+     return catalog
+
+
+ def get_s3_path(
+     bucket_name: str,
+     database_name: Optional[str] = None,
+     table_name: Optional[str] = None,
+ ) -> str:
+     result_path = f"s3://{bucket_name}"
+     if database_name is not None:
+         result_path += f"/{database_name}.db"
+
+     if table_name is not None:
+         result_path += f"/{table_name}"
+     return result_path
+
+
+ def get_bucket_name():
+     return "metadata-py4j-zyiqin1"
+
+
+ def get_s3_prefix():
+     return get_s3_path(get_bucket_name())
+
+
+ def get_credential():
+     import boto3
+
+     boto3_session = boto3.Session()
+     credentials = boto3_session.get_credentials()
+     return credentials
+
+
+ def get_glue_catalog():
+     from pyiceberg.catalog import load_catalog
+
+     credential = get_credential()
+     # Credentials are refreshable, so accessing your access key / secret key
+     # separately can lead to a race condition. Use this to get an actual matched
+     # set.
+     credential = credential.get_frozen_credentials()
+     access_key_id = credential.access_key
+     secret_access_key = credential.secret_key
+     session_token = credential.token
+     s3_path = get_s3_prefix()
+     glue_catalog = load_catalog(
+         "glue",
+         **{
+             "warehouse": s3_path,
+             "type": "glue",
+             "aws_access_key_id": access_key_id,
+             "aws_secret_access_key": secret_access_key,
+             "aws_session_token": session_token,
+             "region_name": "us-east-1",
+             "s3.access-key-id": access_key_id,
+             "s3.secret-access-key": secret_access_key,
+             "s3.session-token": session_token,
+             "s3.region": "us-east-1",
+         },
+     )
+
+     return glue_catalog
+
+
+ def load_table(catalog, table_name):
+     loaded_table = catalog.load_table(table_name)
+     return loaded_table
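A brief usage sketch for the catalog helpers above; the table identifier is hypothetical, and get_glue_catalog assumes valid AWS credentials in the environment plus the hard-coded us-east-1 region and warehouse bucket shown above:

    from deltacat.compute.converter.pyiceberg.catalog import get_glue_catalog, load_table

    # Builds a pyiceberg Glue catalog from frozen boto3 credentials.
    catalog = get_glue_catalog()

    # Hypothetical "<namespace>.<table>" identifier registered in that Glue catalog.
    table = load_table(catalog, "my_namespace.my_table")
    print(table.schema())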