deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/compute/converter/utils/convert_task_options.py ADDED
@@ -0,0 +1,88 @@
+ from typing import Optional, Dict
+ from deltacat.exceptions import RetryableError
+
+ AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 80
+ AVERAGE_POS_COLUMN_SIZE_BYTES = 4
+ XXHASH_BYTE_PER_RECORD = 8
+ MEMORY_BUFFER_RATE = 1.2
+
+
+ def estimate_fixed_hash_columns(hash_value_size_bytes_per_record, total_record_count):
+     return hash_value_size_bytes_per_record * total_record_count
+
+
+ def get_total_record_from_iceberg_files(iceberg_files_list):
+     total_record_count = 0
+     for iceberg_files in iceberg_files_list:
+         total_record_count += sum(file.record_count for file in iceberg_files)
+     return total_record_count
+
+
+ def estimate_iceberg_pos_delete_additional_columns(
+     include_columns, num_of_record_count
+ ):
+     total_additional_columns_sizes = 0
+     if "file_path" in include_columns:
+         total_additional_columns_sizes += (
+             AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES * num_of_record_count
+         )
+     elif "pos" in include_columns:
+         total_additional_columns_sizes += (
+             AVERAGE_POS_COLUMN_SIZE_BYTES * num_of_record_count
+         )
+     return total_additional_columns_sizes
+
+
+ def estimate_convert_remote_option_resources(data_files, equality_delete_files):
+     data_file_record_count = get_total_record_from_iceberg_files(data_files)
+     equality_delete_record_count = get_total_record_from_iceberg_files(
+         equality_delete_files
+     )
+     hash_column_sizes = estimate_fixed_hash_columns(
+         XXHASH_BYTE_PER_RECORD, data_file_record_count + equality_delete_record_count
+     )
+     pos_delete_sizes = estimate_iceberg_pos_delete_additional_columns(
+         ["file_path", "pos"], data_file_record_count + equality_delete_record_count
+     )
+     total_memory_required = hash_column_sizes + pos_delete_sizes
+     return total_memory_required * MEMORY_BUFFER_RATE
+
+
+ def _get_task_options(
+     memory: float,
+     ray_custom_resources: Optional[Dict] = None,
+     scheduling_strategy: str = "SPREAD",
+ ) -> Dict:
+
+     # NOTE: With the DEFAULT scheduling strategy in Ray 2.20.0, the autoscaler
+     # does not spin up new nodes fast enough, so only approximately 20 out of
+     # 100 queued tasks get scheduled. Hence, we use SPREAD, which is also
+     # ideal for merge and hash bucket tasks.
+     # https://docs.ray.io/en/latest/ray-core/scheduling/index.html
+     task_opts = {
+         "memory": memory,
+         "scheduling_strategy": scheduling_strategy,
+     }
+
+     if ray_custom_resources:
+         task_opts["resources"] = ray_custom_resources
+
+     task_opts["max_retries"] = 3
+
+     # A list of possible botocore exceptions is available at
+     # https://github.com/boto/botocore/blob/develop/botocore/exceptions.py
+     task_opts["retry_exceptions"] = [RetryableError]
+
+     return task_opts
+
+
+ def convert_resource_options_provider(index, files_for_each_bucket):
+     (
+         data_files_list,
+         equality_delete_files_list,
+         position_delete_files_list,
+     ) = files_for_each_bucket[1]
+     memory_requirement = estimate_convert_remote_option_resources(
+         data_files_list, equality_delete_files_list
+     )
+     return _get_task_options(memory=memory_requirement)
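The estimator above prices a convert task as 8 bytes of xxhash per record plus 80 bytes of `file_path` per record across all data and equality delete records, padded by a 1.2x memory buffer. A minimal sketch of the arithmetic (not part of the diff), using hypothetical `SimpleNamespace` stubs since the estimator only reads `record_count`:

```python
from types import SimpleNamespace

from deltacat.compute.converter.utils.convert_task_options import (
    estimate_convert_remote_option_resources,
)

# Stub Iceberg file entries; record_count is the only attribute read.
data_files = [[SimpleNamespace(record_count=1_000_000)]]
equality_delete_files = [[SimpleNamespace(record_count=500_000)]]

# hash columns: 8 B * 1.5M records = 12 MB
# additional columns: 80 B * 1.5M records = 120 MB ("file_path" branch; note
# the elif means "pos" is not also counted when "file_path" is requested)
# total: (12 MB + 120 MB) * 1.2 buffer = 158.4 MB
memory = estimate_convert_remote_option_resources(data_files, equality_delete_files)
print(memory)  # 158400000.0
```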
deltacat/compute/converter/utils/converter_session_utils.py ADDED
@@ -0,0 +1,109 @@
+ from collections import defaultdict
+ import logging
+ from deltacat import logs
+ from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def check_data_files_sequence_number(data_files_list, equality_delete_files_list):
+     # Sort both lists by file sequence number
+     data_files_list.sort(key=lambda file_tuple: file_tuple[0])
+     equality_delete_files_list.sort(key=lambda file_tuple: file_tuple[0])
+
+     equality_delete_files = []
+     result_data_file = []
+
+     # Pointer into data_files_list
+     data_file_pointer = 0
+
+     # Loop through each equality delete file
+     for equality_file_tuple in equality_delete_files_list:
+         # Collect all data files with a smaller sequence number than the
+         # current equality delete file
+         valid_values = []
+
+         # Advance data_file_pointer past every data file whose sequence
+         # number is smaller than the current equality delete file's
+         while (
+             data_file_pointer < len(data_files_list)
+             and data_files_list[data_file_pointer][0] < equality_file_tuple[0]
+         ):
+             valid_values.append(data_files_list[data_file_pointer])
+             data_file_pointer += 1
+         equality_delete_files.append(equality_file_tuple)
+
+         # Record the data files that this equality delete file applies to
+         if valid_values:
+             result_data_file.append(valid_values)
+
+     result_equality_delete_file = append_larger_sequence_number_data_files(
+         equality_delete_files
+     )
+
+     return result_equality_delete_file, result_data_file
+
+
+ def append_larger_sequence_number_data_files(data_files_list):
+     result = []
+     # For each file, emit the suffix of the sorted list starting at that file,
+     # i.e., the file plus all files with a larger sequence number
+     for i in range(len(data_files_list)):
+         sublist = data_files_list[i:]
+         sublist_file_list = []
+         for file in sublist:
+             sublist_file_list.append(file)
+         result.append(sublist_file_list)
+     return result
+
+
+ def construct_iceberg_table_prefix(
+     iceberg_warehouse_bucket_name, table_name, iceberg_namespace
+ ):
+     return f"{iceberg_warehouse_bucket_name}/{iceberg_namespace}/{table_name}/data"
+
+
+ def partition_value_record_to_partition_value_string(partition):
+     # Get the string representation of the partition value out of
+     # Record[partition_value]
+     partition_value_str = partition.__repr__().split("[", 1)[1].split("]")[0]
+     return partition_value_str
+
+
+ def group_all_files_to_each_bucket(
+     data_file_dict, equality_delete_dict, pos_delete_dict
+ ):
+     convert_input_files_for_all_buckets = []
+     files_for_each_bucket_for_deletes = defaultdict(tuple)
+     if equality_delete_dict:
+         for partition_value, equality_delete_file_list in equality_delete_dict.items():
+             (
+                 result_equality_delete_file,
+                 result_data_file,
+             ) = check_data_files_sequence_number(
+                 data_files_list=data_file_dict[partition_value],
+                 equality_delete_files_list=equality_delete_dict[partition_value],
+             )
+             files_for_each_bucket_for_deletes[partition_value] = (
+                 result_data_file,
+                 result_equality_delete_file,
+                 [],
+             )
+             if partition_value not in data_file_dict:
+                 convert_input_file = ConvertInputFiles.of(
+                     partition_value=partition_value,
+                     applicable_data_files=result_data_file,
+                     applicable_equality_delete_files=result_equality_delete_file,
+                 )
+                 convert_input_files_for_all_buckets.append(convert_input_file)
+
+     for partition_value, all_data_files_for_each_bucket in data_file_dict.items():
+         convert_input_file = ConvertInputFiles.of(
+             partition_value=partition_value,
+             all_data_files_for_dedupe=all_data_files_for_each_bucket,
+         )
+         if partition_value in files_for_each_bucket_for_deletes:
+             convert_input_file.applicable_data_files = (
+                 files_for_each_bucket_for_deletes[partition_value][0]
+             )
+             convert_input_file.applicable_delete_files = (
+                 files_for_each_bucket_for_deletes[partition_value][1]
+             )
+         convert_input_files_for_all_buckets.append(convert_input_file)
+     return convert_input_files_for_all_buckets
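A minimal sketch (not part of the diff) of the sequence-number matching above, using hypothetical `(sequence_number, file_name)` tuples. An equality delete file applies only to data files committed with a smaller sequence number:

```python
from deltacat.compute.converter.utils.converter_session_utils import (
    check_data_files_sequence_number,
)

data = [(1, "data-a"), (3, "data-b"), (5, "data-c")]
deletes = [(2, "eq-del-x"), (4, "eq-del-y")]

applicable_deletes, applicable_data = check_data_files_sequence_number(
    data_files_list=data, equality_delete_files_list=deletes
)
print(applicable_data)
# [[(1, 'data-a')], [(3, 'data-b')]] -- the data files older than each delete
print(applicable_deletes)
# [[(2, 'eq-del-x'), (4, 'eq-del-y')], [(4, 'eq-del-y')]] -- each delete plus
# every delete with a larger sequence number
```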
deltacat/compute/converter/utils/iceberg_columns.py ADDED
@@ -0,0 +1,82 @@
+ import pyarrow as pa
+ from typing import Union
+ import numpy as np
+
+ # Refer to https://iceberg.apache.org/spec/#reserved-field-ids for the
+ # reserved field ids used below
+ ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN = 2147483546
+ ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN = 2147483545
+
+
+ def _get_iceberg_col_name(suffix):
+     return suffix
+
+
+ _ORDERED_RECORD_IDX_COLUMN_NAME = _get_iceberg_col_name("pos")
+ _ORDERED_RECORD_IDX_COLUMN_TYPE = pa.int64()
+ _ORDERED_RECORD_IDX_FIELD_METADATA = {
+     b"PARQUET:field_id": f"{ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN}"
+ }
+ _ORDERED_RECORD_IDX_COLUMN_FIELD = pa.field(
+     _ORDERED_RECORD_IDX_COLUMN_NAME,
+     _ORDERED_RECORD_IDX_COLUMN_TYPE,
+     metadata=_ORDERED_RECORD_IDX_FIELD_METADATA,
+     nullable=False,
+ )
+
+
+ def get_record_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
+     return pa.array(
+         obj,
+         _ORDERED_RECORD_IDX_COLUMN_TYPE,
+     )
+
+
+ def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
+     table = table.append_column(
+         _ORDERED_RECORD_IDX_COLUMN_FIELD,
+         get_record_index_column_array(ordered_record_indices),
+     )
+     return table
+
+
+ _FILE_PATH_COLUMN_NAME = _get_iceberg_col_name("file_path")
+ _FILE_PATH_COLUMN_TYPE = pa.string()
+ _FILE_PATH_FIELD_METADATA = {
+     b"PARQUET:field_id": f"{ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN}"
+ }
+ _FILE_PATH_COLUMN_FIELD = pa.field(
+     _FILE_PATH_COLUMN_NAME,
+     _FILE_PATH_COLUMN_TYPE,
+     metadata=_FILE_PATH_FIELD_METADATA,
+     nullable=False,
+ )
+
+
+ def append_file_path_column(table: pa.Table, file_path: str):
+     table = table.append_column(
+         _FILE_PATH_COLUMN_FIELD,
+         pa.array(np.repeat(file_path, len(table)), _FILE_PATH_COLUMN_TYPE),
+     )
+     return table
+
+
+ _GLOBAL_RECORD_IDX_COLUMN_NAME = _get_iceberg_col_name("global_record_index")
+ _GLOBAL_RECORD_IDX_COLUMN_TYPE = pa.int64()
+ _GLOBAL_RECORD_IDX_COLUMN_FIELD = pa.field(
+     _GLOBAL_RECORD_IDX_COLUMN_NAME,
+     _GLOBAL_RECORD_IDX_COLUMN_TYPE,
+ )
+
+
+ def append_global_record_idx_column(
+     table: pa.Table, ordered_record_indices
+ ) -> pa.Table:
+     table = table.append_column(
+         _GLOBAL_RECORD_IDX_COLUMN_NAME,
+         pa.array(ordered_record_indices, _GLOBAL_RECORD_IDX_COLUMN_TYPE),
+     )
+     return table
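A minimal usage sketch (not part of the diff) of the helpers above, tagging a table with the Iceberg positional-delete columns; the S3 path is hypothetical:

```python
import pyarrow as pa
import deltacat.compute.converter.utils.iceberg_columns as sc

table = pa.table({"pk": ["a", "b", "c"]})
# Stamp every row with the source file path, then its ordinal position.
table = sc.append_file_path_column(table, "s3://bucket/ns/tbl/data/00000.parquet")
table = sc.append_record_idx_col(table, range(len(table)))
print(table.column_names)  # ['pk', 'file_path', 'pos']
```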
deltacat/compute/converter/utils/io.py ADDED
@@ -0,0 +1,43 @@
+ import deltacat.compute.converter.utils.iceberg_columns as sc
+ import daft
+
+
+ def download_data_table_and_append_iceberg_columns(
+     file, columns_to_download, additional_columns_to_append, sequence_number
+ ):
+     # TODO: add S3 client kwargs
+     table = download_parquet_with_daft_hash_applied(
+         identify_columns=columns_to_download, file=file, s3_client_kwargs={}
+     )
+     if sc._FILE_PATH_COLUMN_NAME in additional_columns_to_append:
+         table = sc.append_file_path_column(table, file.file_path)
+     if sc._ORDERED_RECORD_IDX_COLUMN_NAME in additional_columns_to_append:
+         record_idx_iterator = iter(range(len(table)))
+         table = sc.append_record_idx_col(table, record_idx_iterator)
+     return table
+
+
+ def download_parquet_with_daft_hash_applied(
+     identify_columns, file, s3_client_kwargs, **kwargs
+ ):
+     from daft import TimeUnit
+
+     # TODO: Add correct read kwargs as in:
+     # https://github.com/ray-project/deltacat/blob/383855a4044e4dfe03cf36d7738359d512a517b4/deltacat/utils/daft.py#L97
+     coerce_int96_timestamp_unit = TimeUnit.from_str(
+         kwargs.get("coerce_int96_timestamp_unit", "ms")
+     )
+
+     from deltacat.utils.daft import _get_s3_io_config
+
+     # TODO: Use a Daft SHA1 hash instead to minimize the probability of data corruption
+     io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
+     df = daft.read_parquet(
+         path=file.file_path,
+         io_config=io_config,
+         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+     )
+     df = df.select(daft.col(identify_columns[0]).hash())
+     arrow_table = df.to_arrow()
+     return arrow_table
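A minimal sketch (not part of the diff) of the Daft read-and-hash core of `download_parquet_with_daft_hash_applied`, run against a hypothetical local file so the S3 `IOConfig` plumbing can be skipped; assumes a `pk` column exists:

```python
import daft

df = daft.read_parquet("sample.parquet")
df = df.select(daft.col("pk").hash())  # one 64-bit hash per record
arrow_table = df.to_arrow()
```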
deltacat/compute/converter/utils/s3u.py ADDED
@@ -0,0 +1,133 @@
+ from tenacity import (
+     Retrying,
+     retry_if_exception_type,
+     stop_after_delay,
+     wait_random_exponential,
+ )
+ from typing import Union
+ from deltacat.aws.s3u import CapturedBlockWritePaths, UuidBlockWritePathProvider
+ from deltacat.types.tables import (
+     get_table_writer,
+     get_table_length,
+     TABLE_CLASS_TO_SLICER_FUNC,
+ )
+ from typing import Optional, Dict, Any, List
+ from deltacat.exceptions import RetryableError
+ from deltacat.storage import (
+     DistributedDataset,
+     LocalTable,
+ )
+ from deltacat.types.media import (
+     ContentEncoding,
+     ContentType,
+ )
+ from deltacat.aws.s3u import UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY
+ import s3fs
+
+
+ def get_credential():
+     import boto3
+
+     boto3_session = boto3.Session()
+     credentials = boto3_session.get_credentials()
+     return credentials
+
+
+ def get_s3_file_system(content_type):
+     token_holder = get_credential()
+     content_encoding = ContentEncoding.IDENTITY
+
+     s3_file_system = s3fs.S3FileSystem(
+         key=token_holder.access_key,
+         secret=token_holder.secret_key,
+         token=token_holder.token,
+         s3_additional_kwargs={
+             "ServerSideEncryption": "aws:kms",
+             # TODO: Get tagging from table properties
+             "ContentType": content_type.value,
+             "ContentEncoding": content_encoding.value,
+         },
+     )
+     return s3_file_system
+
+
+ def upload_table_with_retry(
+     table: Union[LocalTable, DistributedDataset],
+     s3_url_prefix: str,
+     s3_table_writer_kwargs: Optional[Dict[str, Any]],
+     content_type: ContentType = ContentType.PARQUET,
+     max_records_per_file: Optional[int] = 4000000,
+     s3_file_system=None,
+     **s3_client_kwargs,
+ ) -> List[str]:
+     """
+     Writes the given table to one or more S3 files and returns the list
+     of paths written.
+     """
+     retrying = Retrying(
+         wait=wait_random_exponential(multiplier=1, max=60),
+         stop=stop_after_delay(UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY),
+         retry=retry_if_exception_type(RetryableError),
+     )
+
+     if s3_table_writer_kwargs is None:
+         s3_table_writer_kwargs = {}
+
+     if not s3_file_system:
+         s3_file_system = get_s3_file_system(content_type=content_type)
+     capture_object = CapturedBlockWritePaths()
+     block_write_path_provider = UuidBlockWritePathProvider(
+         capture_object=capture_object
+     )
+     s3_table_writer_func = get_table_writer(table)
+     table_record_count = get_table_length(table)
+     if max_records_per_file is None or not table_record_count:
+         retrying(
+             fn=upload_table,
+             table_slices=table,
+             s3_base_url=f"{s3_url_prefix}",
+             s3_file_system=s3_file_system,
+             s3_table_writer_func=s3_table_writer_func,
+             s3_table_writer_kwargs=s3_table_writer_kwargs,
+             block_write_path_provider=block_write_path_provider,
+             content_type=content_type,
+             **s3_client_kwargs,
+         )
+     else:
+         table_slicer_func = TABLE_CLASS_TO_SLICER_FUNC.get(type(table))
+         table_slices = table_slicer_func(table, max_records_per_file)
+         for table_slice in table_slices:
+             retrying(
+                 fn=upload_table,
+                 table_slices=table_slice,
+                 s3_base_url=f"{s3_url_prefix}",
+                 s3_file_system=s3_file_system,
+                 s3_table_writer_func=s3_table_writer_func,
+                 s3_table_writer_kwargs=s3_table_writer_kwargs,
+                 block_write_path_provider=block_write_path_provider,
+                 content_type=content_type,
+                 **s3_client_kwargs,
+             )
+     del block_write_path_provider
+     write_paths = capture_object.write_paths()
+     return write_paths
+
+
+ def upload_table(
+     table_slices,
+     s3_base_url,
+     s3_file_system,
+     s3_table_writer_func,
+     block_write_path_provider,
+     content_type,
+     s3_table_writer_kwargs,
+ ):
+     s3_table_writer_func(
+         table_slices,
+         s3_base_url,
+         s3_file_system,
+         block_write_path_provider,
+         content_type.value,
+         **s3_table_writer_kwargs,
+     )
+     # TODO: Add a proper fix for block_refs and write_paths not persisting in Ray actors
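A minimal usage sketch (not part of the diff) for `upload_table_with_retry`; the bucket is hypothetical and ambient AWS credentials are assumed:

```python
import pyarrow as pa

from deltacat.compute.converter.utils.s3u import upload_table_with_retry

table = pa.table({"pk": ["a", "b", "c"], "value": [1, 2, 3]})
write_paths = upload_table_with_retry(
    table=table,
    s3_url_prefix="s3://my-bucket/ns/tbl/data",
    s3_table_writer_kwargs=None,
    max_records_per_file=2,  # three records sliced into two files
)
print(write_paths)  # e.g. two s3:// paths chosen by UuidBlockWritePathProvider
```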
deltacat/compute/resource_estimation/delta.py CHANGED
@@ -93,29 +93,11 @@ def _estimate_resources_required_to_process_delta_using_type_params(
              on_disk_size_bytes=delta.meta.content_length,
          ),
      )
-     file_reader_kwargs_provider = kwargs.get(
-         "file_reader_kwargs_provider"
-     ) or deltacat_storage_kwargs.get("file_reader_kwargs_provider")
-
-     """
-     NOTE: The file_reader_kwargs_provider parameter can be passed in two ways:
-     1. Nested within deltacat_storage_kwargs during resource estimation
-     2. As a top-level attribute of CompactPartitionsParams during compaction
-
-     This creates an inconsistent parameter path between resource estimation and compaction flows.
-     As a long-term solution, this should be unified to use a single consistent path (either always
-     nested in deltacat_storage_kwargs or always as a top-level parameter).
-
-     For now, this implementation handles the resource estimation case by:
-     1. First checking for file_reader_kwargs_provider as a direct kwarg
-     2. Falling back to deltacat_storage_kwargs if not found
-     This approach maintains backward compatibility by not modifying the DELTA_RESOURCE_ESTIMATION_FUNCTIONS signatures.
-     """
+
      appended = append_content_type_params(
          delta=delta,
          deltacat_storage=deltacat_storage,
          deltacat_storage_kwargs=deltacat_storage_kwargs,
-         file_reader_kwargs_provider=file_reader_kwargs_provider,
      )
 
      if not appended:
deltacat/constants.py CHANGED
@@ -1,4 +1,8 @@
- from deltacat.utils.common import env_string
+ from __future__ import annotations
+
+
+ from deltacat.utils.common import env_string, env_bool
+ import os
 
  # Environment variables
  DELTACAT_SYS_LOG_LEVEL = env_string("DELTACAT_SYS_LOG_LEVEL", "DEBUG")
@@ -30,6 +34,26 @@ DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME = env_string(
  )
  # A json context which will be logged along with other context args.
  DELTACAT_LOGGER_CONTEXT = env_string("DELTACAT_LOGGER_CONTEXT", None)
+ DELTACAT_LOGGER_USE_SINGLE_HANDLER = env_bool(
+     "DELTACAT_LOGGER_USE_SINGLE_HANDLER",
+     False,
+ )
+ DELTACAT_ROOT = env_string(
+     "DELTACAT_ROOT",
+     os.path.join(os.getcwd(), ".deltacat"),
+ )
+
+ # CLI Args
+ METAFILE_FORMAT_KEY = "METAFILE_FORMAT"
+ METAFILE_FORMAT_JSON = "json"
+ METAFILE_FORMAT_MSGPACK = "msgpack"
+ METAFILE_FORMAT = env_string(METAFILE_FORMAT_KEY, METAFILE_FORMAT_MSGPACK)
+ SUPPORTED_METAFILE_FORMATS = [METAFILE_FORMAT_JSON, METAFILE_FORMAT_MSGPACK]
+ METAFILE_EXT = {
+     "json": ".json",
+     "msgpack": ".mpk",
+ }[METAFILE_FORMAT]
+
 
  # Byte Units
  BYTES_PER_KIBIBYTE = 2**10
@@ -41,6 +65,11 @@ BYTES_PER_PEBIBYTE = 2**50
  SIGNED_INT64_MIN_VALUE = -(2**63)
  SIGNED_INT64_MAX_VALUE = 2**63 - 1
 
+ # Time Units
+ NANOS_PER_SEC = 1_000_000_000
+ MICROS_PER_SEC = 1_000_000
+ MILLIS_PER_SEC = 1000
+
  # Inflation multiplier from snappy-compressed parquet to pyarrow.
  # This should be kept larger than actual average inflation multipliers.
  # Note that this is a very rough guess since actual observed pyarrow
@@ -58,3 +87,20 @@ MEMORY_TO_HASH_BUCKET_COUNT_RATIO = 0.0512 * BYTES_PER_TEBIBYTE
 
  # The number of bytes allocated to null values in string physical type in parquet
  NULL_SIZE_BYTES = 4
+
+ # Metastore Constants
+ REVISION_DIR_NAME: str = "rev"
+ TXN_DIR_NAME: str = "txn"
+ RUNNING_TXN_DIR_NAME: str = "running"
+ FAILED_TXN_DIR_NAME: str = "failed"
+ SUCCESS_TXN_DIR_NAME: str = "success"
+ TXN_PART_SEPARATOR = "_"
+ # Storage interface defaults
+ # These defaults should be applied in catalog interface implementations.
+ # Storage interface implementations should be agnostic to defaults and
+ # require full information.
+ DEFAULT_CATALOG = "DEFAULT"
+ DEFAULT_NAMESPACE = "DEFAULT"
+ DEFAULT_TABLE_VERSION = "1"
+ DEFAULT_STREAM_ID = "stream"
+ DEFAULT_PARTITION_ID = "partition"
+ DEFAULT_PARTITION_VALUES = ["default"]
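Since `METAFILE_FORMAT` is resolved from the environment once at module import time, it must be set before `deltacat.constants` is first imported. A minimal sketch (not part of the diff):

```python
import os

os.environ["METAFILE_FORMAT"] = "json"  # default is "msgpack" (".mpk")

from deltacat.constants import METAFILE_FORMAT, METAFILE_EXT

assert METAFILE_FORMAT == "json"
assert METAFILE_EXT == ".json"
```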
deltacat/env.py ADDED
@@ -0,0 +1,51 @@
+ import os
+ import logging
+ from typing import Dict, Any
+
+ from deltacat import logs
+
+ from deltacat.constants import (
+     DELTACAT_APP_LOG_LEVEL,
+     DELTACAT_SYS_LOG_LEVEL,
+     DELTACAT_APP_LOG_DIR,
+     DELTACAT_SYS_LOG_DIR,
+     DELTACAT_APP_INFO_LOG_BASE_FILE_NAME,
+     DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME,
+     DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
+     DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
+     DELTACAT_LOGGER_USE_SINGLE_HANDLER,
+ )
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def create_ray_runtime_environment() -> Dict[str, Any]:
+     # log the system environment for debugging
+     logger.debug(f"System Environment: {os.environ}")
+
+     # read the stage (e.g. alpha, beta, dev, etc.) from system environment vars
+     stage = os.environ.get("STAGE")
+     logger.debug(f"Runtime Environment Stage: {stage}")
+     runtime_environment = None
+     if stage:
+         worker_env_vars = {
+             # forward the STAGE environment variable to workers
+             "STAGE": stage,
+             # forward deltacat logging environment variables to workers
+             "DELTACAT_APP_LOG_LEVEL": DELTACAT_APP_LOG_LEVEL,
+             "DELTACAT_SYS_LOG_LEVEL": DELTACAT_SYS_LOG_LEVEL,
+             "DELTACAT_APP_LOG_DIR": DELTACAT_APP_LOG_DIR,
+             "DELTACAT_SYS_LOG_DIR": DELTACAT_SYS_LOG_DIR,
+             "DELTACAT_APP_INFO_LOG_BASE_FILE_NAME": DELTACAT_APP_INFO_LOG_BASE_FILE_NAME,
+             "DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME": DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME,
+             "DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME": DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
+             "DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME": DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
+             "DELTACAT_LOGGER_USE_SINGLE_HANDLER": str(
+                 DELTACAT_LOGGER_USE_SINGLE_HANDLER
+             ),
+         }
+         # set up the runtime environment from system environment variables
+         runtime_environment = {
+             "env_vars": worker_env_vars,
+         }
+     return runtime_environment
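A minimal usage sketch (not part of the diff): forwarding the driver's `STAGE` and logging configuration to Ray workers through the runtime environment returned above:

```python
import os

import ray

from deltacat.env import create_ray_runtime_environment

os.environ["STAGE"] = "dev"
runtime_env = create_ray_runtime_environment()  # {"env_vars": {...}}
ray.init(runtime_env=runtime_env)
```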