deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235):
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +2 -3
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -1
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
  40. deltacat/compute/compactor_v2/steps/merge.py +11 -80
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  45. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  46. deltacat/compute/converter/constants.py +4 -0
  47. deltacat/compute/converter/converter_session.py +143 -0
  48. deltacat/compute/converter/model/convert_input.py +69 -0
  49. deltacat/compute/converter/model/convert_input_files.py +61 -0
  50. deltacat/compute/converter/model/converter_session_params.py +99 -0
  51. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  52. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  53. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  54. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  55. deltacat/compute/converter/steps/__init__.py +0 -0
  56. deltacat/compute/converter/steps/convert.py +211 -0
  57. deltacat/compute/converter/steps/dedupe.py +60 -0
  58. deltacat/compute/converter/utils/__init__.py +0 -0
  59. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  60. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  61. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  62. deltacat/compute/converter/utils/io.py +43 -0
  63. deltacat/compute/converter/utils/s3u.py +133 -0
  64. deltacat/compute/resource_estimation/delta.py +1 -19
  65. deltacat/constants.py +47 -1
  66. deltacat/env.py +51 -0
  67. deltacat/examples/__init__.py +0 -0
  68. deltacat/examples/basic_logging.py +101 -0
  69. deltacat/examples/common/__init__.py +0 -0
  70. deltacat/examples/common/fixtures.py +15 -0
  71. deltacat/examples/hello_world.py +27 -0
  72. deltacat/examples/iceberg/__init__.py +0 -0
  73. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  74. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  75. deltacat/exceptions.py +51 -9
  76. deltacat/logs.py +4 -1
  77. deltacat/storage/__init__.py +118 -28
  78. deltacat/storage/iceberg/__init__.py +0 -0
  79. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  80. deltacat/storage/iceberg/impl.py +737 -0
  81. deltacat/storage/iceberg/model.py +709 -0
  82. deltacat/storage/interface.py +217 -134
  83. deltacat/storage/main/__init__.py +0 -0
  84. deltacat/storage/main/impl.py +2077 -0
  85. deltacat/storage/model/delta.py +118 -71
  86. deltacat/storage/model/interop.py +24 -0
  87. deltacat/storage/model/list_result.py +8 -0
  88. deltacat/storage/model/locator.py +93 -3
  89. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  90. deltacat/storage/model/metafile.py +1316 -0
  91. deltacat/storage/model/namespace.py +34 -18
  92. deltacat/storage/model/partition.py +362 -37
  93. deltacat/storage/model/scan/__init__.py +0 -0
  94. deltacat/storage/model/scan/push_down.py +19 -0
  95. deltacat/storage/model/scan/scan_plan.py +10 -0
  96. deltacat/storage/model/scan/scan_task.py +34 -0
  97. deltacat/storage/model/schema.py +892 -0
  98. deltacat/storage/model/shard.py +47 -0
  99. deltacat/storage/model/sort_key.py +170 -13
  100. deltacat/storage/model/stream.py +208 -80
  101. deltacat/storage/model/table.py +123 -29
  102. deltacat/storage/model/table_version.py +322 -46
  103. deltacat/storage/model/transaction.py +757 -0
  104. deltacat/storage/model/transform.py +198 -61
  105. deltacat/storage/model/types.py +111 -13
  106. deltacat/storage/rivulet/__init__.py +11 -0
  107. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  108. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  109. deltacat/storage/rivulet/dataset.py +744 -0
  110. deltacat/storage/rivulet/dataset_executor.py +87 -0
  111. deltacat/storage/rivulet/feather/__init__.py +5 -0
  112. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  113. deltacat/storage/rivulet/feather/serializer.py +35 -0
  114. deltacat/storage/rivulet/fs/__init__.py +0 -0
  115. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  116. deltacat/storage/rivulet/fs/file_store.py +130 -0
  117. deltacat/storage/rivulet/fs/input_file.py +76 -0
  118. deltacat/storage/rivulet/fs/output_file.py +86 -0
  119. deltacat/storage/rivulet/logical_plan.py +105 -0
  120. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  121. deltacat/storage/rivulet/metastore/delta.py +190 -0
  122. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  123. deltacat/storage/rivulet/metastore/sst.py +82 -0
  124. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  125. deltacat/storage/rivulet/mvp/Table.py +101 -0
  126. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  127. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  129. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  130. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  131. deltacat/storage/rivulet/reader/__init__.py +0 -0
  132. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  133. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  134. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  135. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  136. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  137. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  138. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  139. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  140. deltacat/storage/rivulet/schema/__init__.py +0 -0
  141. deltacat/storage/rivulet/schema/datatype.py +128 -0
  142. deltacat/storage/rivulet/schema/schema.py +251 -0
  143. deltacat/storage/rivulet/serializer.py +40 -0
  144. deltacat/storage/rivulet/serializer_factory.py +42 -0
  145. deltacat/storage/rivulet/writer/__init__.py +0 -0
  146. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  147. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  148. deltacat/tests/_io/__init__.py +1 -0
  149. deltacat/tests/catalog/test_catalogs.py +324 -0
  150. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  151. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  152. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  153. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  154. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  155. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  156. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  157. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  158. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  159. deltacat/tests/compute/conftest.py +75 -0
  160. deltacat/tests/compute/converter/__init__.py +0 -0
  161. deltacat/tests/compute/converter/conftest.py +80 -0
  162. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  163. deltacat/tests/compute/converter/utils.py +123 -0
  164. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  165. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  166. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  167. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  168. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  169. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  170. deltacat/tests/compute/test_util_common.py +19 -12
  171. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  172. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  173. deltacat/tests/storage/__init__.py +0 -0
  174. deltacat/tests/storage/conftest.py +25 -0
  175. deltacat/tests/storage/main/__init__.py +0 -0
  176. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  177. deltacat/tests/storage/model/__init__.py +0 -0
  178. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  179. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  180. deltacat/tests/storage/model/test_schema.py +308 -0
  181. deltacat/tests/storage/model/test_shard.py +22 -0
  182. deltacat/tests/storage/model/test_table_version.py +110 -0
  183. deltacat/tests/storage/model/test_transaction.py +308 -0
  184. deltacat/tests/storage/rivulet/__init__.py +0 -0
  185. deltacat/tests/storage/rivulet/conftest.py +149 -0
  186. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  187. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  188. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  189. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  190. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  191. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  192. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  193. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  194. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  195. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  197. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  198. deltacat/tests/test_deltacat_api.py +39 -0
  199. deltacat/tests/test_utils/filesystem.py +14 -0
  200. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  201. deltacat/tests/test_utils/pyarrow.py +8 -15
  202. deltacat/tests/test_utils/storage.py +266 -3
  203. deltacat/tests/utils/test_daft.py +3 -3
  204. deltacat/tests/utils/test_pyarrow.py +0 -432
  205. deltacat/types/partial_download.py +1 -1
  206. deltacat/types/tables.py +1 -1
  207. deltacat/utils/export.py +59 -0
  208. deltacat/utils/filesystem.py +320 -0
  209. deltacat/utils/metafile_locator.py +73 -0
  210. deltacat/utils/pyarrow.py +36 -183
  211. deltacat-2.0.dist-info/METADATA +65 -0
  212. deltacat-2.0.dist-info/RECORD +347 -0
  213. deltacat/aws/redshift/__init__.py +0 -19
  214. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  215. deltacat/io/dataset.py +0 -73
  216. deltacat/io/read_api.py +0 -143
  217. deltacat/storage/model/delete_parameters.py +0 -40
  218. deltacat/storage/model/partition_spec.py +0 -71
  219. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  220. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  221. deltacat-1.1.35.dist-info/METADATA +0 -64
  222. deltacat-1.1.35.dist-info/RECORD +0 -219
  223. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  224. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  225. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  226. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  227. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  228. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  229. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  233. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  234. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  235. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,109 @@
1
+ from collections import defaultdict
2
+ import logging
3
+ from deltacat import logs
4
+ from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
5
+
6
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
7
+
8
+
9
def check_data_files_sequence_number(data_files_list, equality_delete_files_list):
    """Match each equality delete file with the data files it applies to.

    Both lists hold ``(sequence_number, file)`` tuples and are sorted in
    place by sequence number. For each equality delete file, the remaining
    data files with a strictly smaller sequence number are grouped together.

    Returns:
        A tuple ``(result_equality_delete_file, result_data_file)`` where
        the first element holds, for each delete file, the suffix list of
        delete files at an equal-or-later position, and the second element
        holds the non-empty groups of older data files.
    """

    def _sequence_number(file_tuple):
        return file_tuple[0]

    # Order both lists by sequence number (first tuple element), in place.
    data_files_list.sort(key=_sequence_number)
    equality_delete_files_list.sort(key=_sequence_number)

    consumed_delete_files = []
    applicable_data_file_groups = []

    # Single forward pointer over the sorted data files; each data file is
    # consumed by at most one delete file.
    next_data_idx = 0
    total_data_files = len(data_files_list)

    for delete_file_tuple in equality_delete_files_list:
        older_data_files = []
        # Consume every remaining data file older than this delete file.
        while (
            next_data_idx < total_data_files
            and _sequence_number(data_files_list[next_data_idx])
            < _sequence_number(delete_file_tuple)
        ):
            older_data_files.append(data_files_list[next_data_idx])
            next_data_idx += 1
        consumed_delete_files.append(delete_file_tuple)

        if older_data_files:
            applicable_data_file_groups.append(older_data_files)

    # Expand each delete file into the suffix of delete files at or after
    # its own position (equivalent to append_larger_sequence_number_data_files).
    result_equality_delete_file = [
        consumed_delete_files[i:] for i in range(len(consumed_delete_files))
    ]

    return result_equality_delete_file, applicable_data_file_groups
43
+
44
+
45
def append_larger_sequence_number_data_files(data_files_list):
    """Return, for each index ``i``, the suffix ``data_files_list[i:]``.

    Given a list already sorted by sequence number, each result entry pairs
    a file with every file at an equal-or-later position (i.e. with an
    equal-or-larger sequence number).

    Args:
        data_files_list: File entries, assumed pre-sorted by sequence number.

    Returns:
        A list of suffix lists; an empty input yields an empty list.
    """
    # Slicing copies each suffix into a fresh list in one idiomatic
    # expression, replacing the original element-by-element copy loop.
    return [data_files_list[i:] for i in range(len(data_files_list))]
55
+
56
+
57
def construct_iceberg_table_prefix(
    iceberg_warehouse_bucket_name, table_name, iceberg_namespace
):
    """Build the warehouse data prefix ``<bucket>/<namespace>/<table>/data``."""
    namespace_prefix = f"{iceberg_warehouse_bucket_name}/{iceberg_namespace}"
    return f"{namespace_prefix}/{table_name}/data"
61
+
62
+
63
def partition_value_record_to_partition_value_string(partition):
    """Extract the partition value text out of a ``Record[...]``-style repr.

    For example, a partition whose repr is ``Record[foo]`` yields ``"foo"``.

    Args:
        partition: Object whose repr embeds the partition value between the
            first ``[`` and the following ``]`` (e.g. a pyiceberg Record).

    Returns:
        The text between the first ``[`` and the next ``]`` of the repr.

    Raises:
        IndexError: If the repr contains no ``[`` (unchanged from the
            original behavior).
    """
    # repr() is the idiomatic spelling of partition.__repr__().
    return repr(partition).split("[", 1)[1].split("]")[0]
67
+
68
+
69
def group_all_files_to_each_bucket(
    data_file_dict, equality_delete_dict, pos_delete_dict
):
    """Group data files and equality delete files into per-partition convert inputs.

    Args:
        data_file_dict: Mapping of partition value -> list of
            (sequence_number, file) tuples for data files.
        equality_delete_dict: Mapping of partition value -> list of
            (sequence_number, file) tuples for equality delete files.
        pos_delete_dict: Currently unused by this function.

    Returns:
        A list with one ConvertInputFiles entry per partition value found in
        data_file_dict (carrying all data files for dedupe, plus applicable
        data/delete files when equality deletes exist for that partition).
    """
    convert_input_files_for_all_buckets = []
    # partition value -> (applicable data files, applicable delete files, [])
    files_for_each_bucket_for_deletes = defaultdict(tuple)
    if equality_delete_dict:
        for partition_value, equality_delete_file_list in equality_delete_dict.items():
            (
                result_equality_delete_file,
                result_data_file,
            ) = check_data_files_sequence_number(
                data_files_list=data_file_dict[partition_value],
                equality_delete_files_list=equality_delete_dict[partition_value],
            )
            files_for_each_bucket_for_deletes[partition_value] = (
                result_data_file,
                result_equality_delete_file,
                [],
            )
            # NOTE(review): this branch appears unreachable — the
            # data_file_dict[partition_value] lookup above would already have
            # raised KeyError for a partition missing from data_file_dict.
            # Confirm the intended behavior for delete-only partitions.
            if partition_value not in data_file_dict:
                # NOTE(review): "applicable_equalitu_delete_files" looks like a
                # typo for "applicable_equality_delete_files" — confirm against
                # ConvertInputFiles.of before renaming.
                convert_input_file = ConvertInputFiles.of(
                    partition_value=partition_value,
                    applicable_data_files=result_data_file,
                    applicable_equalitu_delete_files=result_equality_delete_file,
                )
                convert_input_files_for_all_buckets.append(convert_input_file)

    # Every partition with data files gets a convert input carrying all of its
    # data files for dedupe; attach applicable delete info when present.
    for partition_value, all_data_files_for_each_bucket in data_file_dict.items():
        convert_input_file = ConvertInputFiles.of(
            partition_value=partition_value,
            all_data_files_for_dedupe=all_data_files_for_each_bucket,
        )
        if partition_value in files_for_each_bucket_for_deletes:
            convert_input_file.applicable_data_files = (
                files_for_each_bucket_for_deletes[partition_value][0]
            )
            convert_input_file.applicable_delete_files = (
                files_for_each_bucket_for_deletes[partition_value][1]
            )
        convert_input_files_for_all_buckets.append(convert_input_file)
    return convert_input_files_for_all_buckets
@@ -0,0 +1,82 @@
1
+ import pyarrow as pa
2
+ from typing import Union
3
+ import numpy as np
4
+
5
# Reserved Iceberg field id for the "file_path" metadata column.
# Refer to: https://iceberg.apache.org/spec/#reserved-field-ids for reserved field ids
ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN = 2147483546

# Reserved Iceberg field id for the "pos" (record position) metadata column.
# Refer to: https://iceberg.apache.org/spec/#reserved-field-ids for reserved field ids
ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN = 2147483545
10
+
11
+
12
+ def _get_iceberg_col_name(suffix):
13
+ return suffix
14
+
15
+
16
# Name, type, and Arrow field definition for the ordered record index
# ("pos") column appended to downloaded tables.
_ORDERED_RECORD_IDX_COLUMN_NAME = _get_iceberg_col_name("pos")
_ORDERED_RECORD_IDX_COLUMN_TYPE = pa.int64()
# Tag the field with the reserved Iceberg field id so Parquet writers emit
# the matching PARQUET:field_id metadata.
_ORDERED_RECORD_IDX_FIELD_METADATA = {
    b"PARQUET:field_id": f"{ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN}"
}
_ORDERED_RECORD_IDX_COLUMN_FIELD = pa.field(
    _ORDERED_RECORD_IDX_COLUMN_NAME,
    _ORDERED_RECORD_IDX_COLUMN_TYPE,
    metadata=_ORDERED_RECORD_IDX_FIELD_METADATA,
    nullable=False,
)
27
+
28
+
29
def get_record_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
    """Convert *obj* into an int64 Arrow array for the record index column."""
    return pa.array(obj, _ORDERED_RECORD_IDX_COLUMN_TYPE)
34
+
35
+
36
def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
    """Append the ordered record index ("pos") column to *table*.

    Args:
        table: Table to extend.
        ordered_record_indices: Iterable of int64 per-row record positions.

    Returns:
        A new table with the record index column appended.
    """
    index_array = get_record_index_column_array(ordered_record_indices)
    return table.append_column(_ORDERED_RECORD_IDX_COLUMN_FIELD, index_array)
43
+
44
+
45
# Name, type, and Arrow field definition for the "file_path" metadata column
# recording which file each row originated from.
_FILE_PATH_COLUMN_NAME = _get_iceberg_col_name("file_path")
_FILE_PATH_COLUMN_TYPE = pa.string()
# Tag the field with the reserved Iceberg field id so Parquet writers emit
# the matching PARQUET:field_id metadata.
_FILE_PATH_FIELD_METADATA = {
    b"PARQUET:field_id": f"{ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN}"
}
_FILE_PATH_COLUMN_FIELD = pa.field(
    _FILE_PATH_COLUMN_NAME,
    _FILE_PATH_COLUMN_TYPE,
    metadata=_FILE_PATH_FIELD_METADATA,
    nullable=False,
)
56
+
57
+
58
def append_file_path_column(table: pa.Table, file_path: str) -> pa.Table:
    """Append a non-nullable "file_path" column filled with *file_path*.

    Args:
        table: Table to extend.
        file_path: Source file path recorded on every row.

    Returns:
        A new table with the file path metadata column appended.
    """
    # A plain Python list repeat avoids the numpy round-trip of np.repeat
    # and yields an identical string column.
    file_path_array = pa.array([file_path] * len(table), _FILE_PATH_COLUMN_TYPE)
    return table.append_column(_FILE_PATH_COLUMN_FIELD, file_path_array)
64
+
65
+
66
# Name, type, and Arrow field definition for the "global_record_index"
# column (no reserved Iceberg field id — this is a DeltaCAT-internal column).
_GLOBAL_RECORD_IDX_COLUMN_NAME = _get_iceberg_col_name("global_record_index")
_GLOBAL_RECORD_IDX_COLUMN_TYPE = pa.int64()
_GLOBAL_RECORD_IDX_COLUMN_FIELD = pa.field(
    _GLOBAL_RECORD_IDX_COLUMN_NAME,
    _GLOBAL_RECORD_IDX_COLUMN_TYPE,
)
72
+
73
+
74
def append_global_record_idx_column(
    table: pa.Table, ordered_record_indices
) -> pa.Table:
    """Append the "global_record_index" int64 column to *table*.

    Args:
        table: Table to extend.
        ordered_record_indices: Iterable of global record index values.

    Returns:
        A new table with the global record index column appended.
    """
    global_index_array = pa.array(
        ordered_record_indices, _GLOBAL_RECORD_IDX_COLUMN_TYPE
    )
    return table.append_column(_GLOBAL_RECORD_IDX_COLUMN_NAME, global_index_array)
@@ -0,0 +1,43 @@
1
+ import deltacat.compute.converter.utils.iceberg_columns as sc
2
+ import daft
3
+
4
+
5
def download_data_table_and_append_iceberg_columns(
    file, columns_to_download, additional_columns_to_append, sequence_number
):
    """Download *file* as an Arrow table and append requested metadata columns.

    Args:
        file: File entry exposing a ``file_path`` attribute.
        columns_to_download: Column names passed to the daft parquet read.
        additional_columns_to_append: Metadata column names to append; only
            the "file_path" and "pos" columns are recognized here.
        sequence_number: Unused by this function.
            # NOTE(review): consider removing or wiring this parameter up.

    Returns:
        A pyarrow table with the downloaded data plus requested metadata columns.
    """
    # TODO: add S3 client kwargs
    table = download_parquet_with_daft_hash_applied(
        identify_columns=columns_to_download, file=file, s3_client_kwargs={}
    )
    if sc._FILE_PATH_COLUMN_NAME in additional_columns_to_append:
        table = sc.append_file_path_column(table, file.file_path)
    if sc._ORDERED_RECORD_IDX_COLUMN_NAME in additional_columns_to_append:
        # Row positions 0..len(table)-1 in file order.
        record_idx_iterator = iter(range(len(table)))
        table = sc.append_record_idx_col(table, record_idx_iterator)
    return table
18
+
19
+
20
def download_parquet_with_daft_hash_applied(
    identify_columns, file, s3_client_kwargs, **kwargs
):
    """Read a parquet file with daft and return hashes of its identity column.

    Args:
        identify_columns: Identity column names; only the FIRST entry is
            selected and hashed.
        file: File entry exposing a ``file_path`` attribute.
        s3_client_kwargs: Credentials/config forwarded to daft's S3 IO config.
        **kwargs: Supports ``coerce_int96_timestamp_unit`` (defaults to "ms").

    Returns:
        A single-column pyarrow table of hash values for the first
        identity column.
    """
    from daft import TimeUnit

    # TODO: Add correct read kwargs as in:
    # https://github.com/ray-project/deltacat/blob/383855a4044e4dfe03cf36d7738359d512a517b4/deltacat/utils/daft.py#L97

    coerce_int96_timestamp_unit = TimeUnit.from_str(
        kwargs.get("coerce_int96_timestamp_unit", "ms")
    )

    # Imported lazily to avoid a hard daft dependency at module import time.
    from deltacat.utils.daft import _get_s3_io_config

    # TODO: Use Daft SHA1 hash instead to minimize probability of data corruption
    io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
    df = daft.read_parquet(
        path=file.file_path,
        io_config=io_config,
        coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
    )
    df = df.select(daft.col(identify_columns[0]).hash())
    arrow_table = df.to_arrow()
    return arrow_table
@@ -0,0 +1,133 @@
1
+ from tenacity import (
2
+ Retrying,
3
+ retry_if_exception_type,
4
+ stop_after_delay,
5
+ wait_random_exponential,
6
+ )
7
+ from typing import Union
8
+ from deltacat.aws.s3u import CapturedBlockWritePaths, UuidBlockWritePathProvider
9
+ from deltacat.types.tables import (
10
+ get_table_writer,
11
+ get_table_length,
12
+ TABLE_CLASS_TO_SLICER_FUNC,
13
+ )
14
+ from typing import Optional, Dict, Any, List
15
+ from deltacat.exceptions import RetryableError
16
+ from deltacat.storage import (
17
+ DistributedDataset,
18
+ LocalTable,
19
+ )
20
+ from deltacat.types.media import (
21
+ ContentEncoding,
22
+ ContentType,
23
+ )
24
+ from deltacat.aws.s3u import UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY
25
+ import s3fs
26
+
27
+
28
def get_credential():
    """Resolve AWS credentials from the default boto3 session chain."""
    import boto3

    default_session = boto3.Session()
    return default_session.get_credentials()
34
+
35
+
36
def get_s3_file_system(content_type):
    """Build an s3fs filesystem authenticated with the caller's AWS credentials.

    Writes through this filesystem are KMS server-side encrypted and tagged
    with the given content type and identity content encoding.
    """
    credentials = get_credential()
    upload_kwargs = {
        "ServerSideEncryption": "aws:kms",
        # TODO: Get tagging from table properties
        "ContentType": content_type.value,
        "ContentEncoding": ContentEncoding.IDENTITY.value,
    }
    return s3fs.S3FileSystem(
        key=credentials.access_key,
        secret=credentials.secret_key,
        token=credentials.token,
        s3_additional_kwargs=upload_kwargs,
    )
52
+
53
+
54
def upload_table_with_retry(
    table: Union[LocalTable, DistributedDataset],
    s3_url_prefix: str,
    s3_table_writer_kwargs: Optional[Dict[str, Any]],
    content_type: ContentType = ContentType.PARQUET,
    max_records_per_file: Optional[int] = 4000000,
    s3_file_system=None,
    **s3_client_kwargs,
) -> List[str]:
    """
    Write the given table to one or more S3 files, retrying transient
    failures, and return the list of S3 write paths of the uploaded files.

    Args:
        table: Local table or distributed dataset to upload.
        s3_url_prefix: S3 URL prefix under which files are written.
        s3_table_writer_kwargs: Extra kwargs for the table writer
            (``None`` is treated as ``{}``).
        content_type: Content type of the uploaded files (default: parquet).
        max_records_per_file: Max records per output file; ``None`` (or an
            empty table) writes in a single call without slicing.
        s3_file_system: Optional pre-built filesystem; created on demand
            from the content type otherwise.
        **s3_client_kwargs: Forwarded to each ``upload_table`` call.

    Returns:
        S3 write paths captured by the block write path provider.
    """
    # Retry only RetryableError with capped exponential backoff, up to the
    # configured total delay.
    retrying = Retrying(
        wait=wait_random_exponential(multiplier=1, max=60),
        stop=stop_after_delay(UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY),
        retry=retry_if_exception_type(RetryableError),
    )

    if s3_table_writer_kwargs is None:
        s3_table_writer_kwargs = {}

    if not s3_file_system:
        s3_file_system = get_s3_file_system(content_type=content_type)
    # Captures each written S3 path as the writer requests new block paths.
    capture_object = CapturedBlockWritePaths()
    block_write_path_provider = UuidBlockWritePathProvider(
        capture_object=capture_object
    )
    s3_table_writer_func = get_table_writer(table)
    table_record_count = get_table_length(table)
    if max_records_per_file is None or not table_record_count:
        # No record cap (or empty table): single upload call.
        retrying(
            fn=upload_table,
            table_slices=table,
            s3_base_url=f"{s3_url_prefix}",
            s3_file_system=s3_file_system,
            s3_table_writer_func=s3_table_writer_func,
            s3_table_writer_kwargs=s3_table_writer_kwargs,
            block_write_path_provider=block_write_path_provider,
            content_type=content_type,
            **s3_client_kwargs,
        )
    else:
        # Slice the table so each output file holds at most
        # max_records_per_file records, uploading each slice with retry.
        table_slicer_func = TABLE_CLASS_TO_SLICER_FUNC.get(type(table))
        table_slices = table_slicer_func(table, max_records_per_file)
        for table_slice in table_slices:
            retrying(
                fn=upload_table,
                table_slices=table_slice,
                s3_base_url=f"{s3_url_prefix}",
                s3_file_system=s3_file_system,
                s3_table_writer_func=s3_table_writer_func,
                s3_table_writer_kwargs=s3_table_writer_kwargs,
                block_write_path_provider=block_write_path_provider,
                content_type=content_type,
                **s3_client_kwargs,
            )
    # NOTE(review): deleting the provider before reading the captured paths
    # appears intended to finalize/flush it — confirm against
    # UuidBlockWritePathProvider before reordering.
    del block_write_path_provider
    write_paths = capture_object.write_paths()
    return write_paths
114
+
115
+
116
def upload_table(
    table_slices,
    s3_base_url,
    s3_file_system,
    s3_table_writer_func,
    block_write_path_provider,
    content_type,
    s3_table_writer_kwargs,
):
    """Write the given table slices to S3 via the supplied writer callable.

    Thin adapter that forwards its arguments (with the content type resolved
    to its string value) to ``s3_table_writer_func``.
    """
    writer_args = (
        table_slices,
        s3_base_url,
        s3_file_system,
        block_write_path_provider,
        content_type.value,
    )
    s3_table_writer_func(*writer_args, **s3_table_writer_kwargs)
    # TODO: Add a proper fix for block_refs and write_paths not persisting in Ray actors
@@ -93,29 +93,11 @@ def _estimate_resources_required_to_process_delta_using_type_params(
93
93
  on_disk_size_bytes=delta.meta.content_length,
94
94
  ),
95
95
  )
96
- file_reader_kwargs_provider = kwargs.get(
97
- "file_reader_kwargs_provider"
98
- ) or deltacat_storage_kwargs.get("file_reader_kwargs_provider")
99
-
100
- """
101
- NOTE: The file_reader_kwargs_provider parameter can be passed in two ways:
102
- 1. Nested within deltacat_storage_kwargs during resource estimation
103
- 2. As a top-level attribute of CompactPartitionsParams during compaction
104
-
105
- This creates an inconsistent parameter path between resource estimation and compaction flows.
106
- As a long-term solution, this should be unified to use a single consistent path (either always
107
- nested in deltacat_storage_kwargs or always as a top-level parameter).
108
-
109
- For now, this implementation handles the resource estimation case by:
110
- 1. First checking for file_reader_kwargs_provider as a direct kwarg
111
- 2. Falling back to deltacat_storage_kwargs if not found
112
- This approach maintains backward compatibility by not modifying the DELTA_RESOURCE_ESTIMATION_FUNCTIONS signatures.
113
- """
96
+
114
97
  appended = append_content_type_params(
115
98
  delta=delta,
116
99
  deltacat_storage=deltacat_storage,
117
100
  deltacat_storage_kwargs=deltacat_storage_kwargs,
118
- file_reader_kwargs_provider=file_reader_kwargs_provider,
119
101
  )
120
102
 
121
103
  if not appended:
deltacat/constants.py CHANGED
@@ -1,4 +1,8 @@
1
- from deltacat.utils.common import env_string
1
+ from __future__ import annotations
2
+
3
+
4
+ from deltacat.utils.common import env_string, env_bool
5
+ import os
2
6
 
3
7
  # Environment variables
4
8
  DELTACAT_SYS_LOG_LEVEL = env_string("DELTACAT_SYS_LOG_LEVEL", "DEBUG")
@@ -30,6 +34,26 @@ DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME = env_string(
30
34
  )
31
35
  # A json context which will be logged along with other context args.
32
36
  DELTACAT_LOGGER_CONTEXT = env_string("DELTACAT_LOGGER_CONTEXT", None)
37
+ DELTACAT_LOGGER_USE_SINGLE_HANDLER = env_bool(
38
+ "DELTACAT_LOGGER_USE_SINGLE_HANDLER",
39
+ False,
40
+ )
41
+ DELTACAT_ROOT = env_string(
42
+ "DELTACAT_ROOT",
43
+ os.path.join(os.getcwd(), ".deltacat"),
44
+ )
45
+
46
+ # CLI Args
47
+ METAFILE_FORMAT_KEY = "METAFILE_FORMAT"
48
+ METAFILE_FORMAT_JSON = "json"
49
+ METAFILE_FORMAT_MSGPACK = "msgpack"
50
+ METAFILE_FORMAT = env_string(METAFILE_FORMAT_KEY, METAFILE_FORMAT_MSGPACK)
51
+ SUPPORTED_METAFILE_FORMATS = [METAFILE_FORMAT_JSON, METAFILE_FORMAT_MSGPACK]
52
+ METAFILE_EXT = {
53
+ "json": ".json",
54
+ "msgpack": ".mpk",
55
+ }[METAFILE_FORMAT]
56
+
33
57
 
34
58
  # Byte Units
35
59
  BYTES_PER_KIBIBYTE = 2**10
@@ -41,6 +65,11 @@ BYTES_PER_PEBIBYTE = 2**50
41
65
  SIGNED_INT64_MIN_VALUE = -(2**63)
42
66
  SIGNED_INT64_MAX_VALUE = 2**63 - 1
43
67
 
68
+ # Time Units
69
+ NANOS_PER_SEC = 1_000_000_000
70
+ MICROS_PER_SEC = 1_000_000
71
+ MILLIS_PER_SEC = 1000
72
+
44
73
  # Inflation multiplier from snappy-compressed parquet to pyarrow.
45
74
  # This should be kept larger than actual average inflation multipliers.
46
75
  # Note that this is a very rough guess since actual observed pyarrow
@@ -58,3 +87,20 @@ MEMORY_TO_HASH_BUCKET_COUNT_RATIO = 0.0512 * BYTES_PER_TEBIBYTE
58
87
 
59
88
  # The number of bytes allocated to null values in string physical type in parquet
60
89
  NULL_SIZE_BYTES = 4
90
+
91
+ # Metastore Constants
92
+ REVISION_DIR_NAME: str = "rev"
93
+ TXN_DIR_NAME: str = "txn"
94
+ RUNNING_TXN_DIR_NAME: str = "running"
95
+ FAILED_TXN_DIR_NAME: str = "failed"
96
+ SUCCESS_TXN_DIR_NAME: str = "success"
97
+ TXN_PART_SEPARATOR = "_"
98
+ # Storage interface defaults
99
+ # These defaults should be applied in catalog interface implementations
100
+ # Storage interface implementations should be agnostic to defaults and require full information
101
+ DEFAULT_CATALOG = "DEFAULT"
102
+ DEFAULT_NAMESPACE = "DEFAULT"
103
+ DEFAULT_TABLE_VERSION = "1"
104
+ DEFAULT_STREAM_ID = "stream"
105
+ DEFAULT_PARTITION_ID = "partition"
106
+ DEFAULT_PARTITION_VALUES = ["default"]
deltacat/env.py ADDED
@@ -0,0 +1,51 @@
1
import os
import logging
from typing import Any, Dict, Optional
4
+
5
+ from deltacat import logs
6
+
7
+ from deltacat.constants import (
8
+ DELTACAT_APP_LOG_LEVEL,
9
+ DELTACAT_SYS_LOG_LEVEL,
10
+ DELTACAT_APP_LOG_DIR,
11
+ DELTACAT_SYS_LOG_DIR,
12
+ DELTACAT_APP_INFO_LOG_BASE_FILE_NAME,
13
+ DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME,
14
+ DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
15
+ DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
16
+ DELTACAT_LOGGER_USE_SINGLE_HANDLER,
17
+ )
18
+
19
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
20
+
21
+
22
def create_ray_runtime_environment() -> Optional[Dict[str, Any]]:
    """Build a Ray runtime environment from the driver's system environment.

    Returns:
        A runtime environment dict forwarding the ``STAGE`` variable and the
        DeltaCAT logging configuration to Ray workers, or ``None`` when no
        ``STAGE`` environment variable is set.  (The original annotation
        claimed ``Dict[str, Any]``, but the no-stage path returns ``None``.)
    """
    # log the system environment for debugging
    logger.debug(f"System Environment: {os.environ}")

    # read the stage (e.g. alpha, beta, dev, etc.) from system environment vars
    stage = os.environ.get("STAGE")
    logger.debug(f"Runtime Environment Stage: {stage}")
    runtime_environment = None
    if stage:
        worker_env_vars = {
            # forward the STAGE environment variable to workers
            "STAGE": stage,
            # forward deltacat logging environment variables to workers
            "DELTACAT_APP_LOG_LEVEL": DELTACAT_APP_LOG_LEVEL,
            "DELTACAT_SYS_LOG_LEVEL": DELTACAT_SYS_LOG_LEVEL,
            "DELTACAT_APP_LOG_DIR": DELTACAT_APP_LOG_DIR,
            "DELTACAT_SYS_LOG_DIR": DELTACAT_SYS_LOG_DIR,
            "DELTACAT_APP_INFO_LOG_BASE_FILE_NAME": DELTACAT_APP_INFO_LOG_BASE_FILE_NAME,
            "DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME": DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME,
            "DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME": DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
            "DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME": DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
            "DELTACAT_LOGGER_USE_SINGLE_HANDLER": str(
                DELTACAT_LOGGER_USE_SINGLE_HANDLER
            ),
        }
        # setup runtime environment from system environment variables:
        runtime_environment = {
            "env_vars": worker_env_vars,
        }
    return runtime_environment
File without changes
@@ -0,0 +1,101 @@
1
+ import os
2
+ import ray
3
+ import logging
4
+
5
+ from deltacat import logs
6
+ from deltacat.constants import DELTACAT_APP_LOG_DIR, DELTACAT_SYS_LOG_DIR
7
+ from deltacat.examples.common.fixtures import (
8
+ store_cli_args_in_os_environ,
9
+ )
10
+ from deltacat.env import create_ray_runtime_environment
11
+
12
# initialize the driver logger; worker loggers are initialized separately
# inside each worker process (required for AWS Glue)
driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
14
+
15
+
16
@ray.remote
def logging_worker(var1, var2):
    """Ray task that writes example DEBUG/INFO log lines from a worker."""
    # for AWS Glue, worker loggers must be initialized within the worker process
    worker_logger = logs.configure_application_logger(logging.getLogger(__name__))

    def _emit(level_name, log_fn, line):
        # announce on stdout where the line will be written, then log it
        print(
            f"Writing {level_name} log line from Worker to {DELTACAT_APP_LOG_DIR}: '{line}'"
        )
        log_fn(line)

    _emit("DEBUG", worker_logger.debug, f"Worker System Environment: {os.environ}")
    _emit("INFO", worker_logger.info, f"Worker Variable 1: {var1}")
    _emit("INFO", worker_logger.info, f"Worker Variable 2: {var2}")
38
+
39
+
40
def run(var1="default1", var2="default2", **kwargs):
    """Log two example variables from the driver, then from a Ray worker."""
    # emit one INFO line per driver variable
    for position, value in enumerate((var1, var2), start=1):
        line = f"Driver Variable {position}: {value}"
        print(
            f"Writing INFO log line from Driver to {DELTACAT_APP_LOG_DIR}: '{line}'"
        )
        driver_logger.info(line)

    print("Starting worker...")
    # block until the worker task has finished logging
    ray.get(logging_worker.remote(var1, var2))
    print(
        f"The driver is shutting down. Additional DeltaCAT system logs have been written to {DELTACAT_SYS_LOG_DIR}"
    )
58
+
59
+
60
if __name__ == "__main__":
    # (CLI flag names, add_argument kwargs) pairs accepted by this example
    example_script_args = [
        (["--var1"], {"help": "First argument to log.", "type": str}),
        (["--var2"], {"help": "Second argument to log.", "type": str}),
        (
            ["--STAGE"],
            {
                "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
                "type": str,
            },
        ),
    ]

    # store any CLI args in the runtime environment
    store_cli_args_in_os_environ(example_script_args)

    # create any runtime environment required to run the example
    runtime_env = create_ray_runtime_environment()

    # initialize ray
    ray.init(runtime_env=runtime_env)

    # run the example using os.environ as kwargs
    run(**os.environ)
File without changes
@@ -0,0 +1,15 @@
1
+ import os
2
+ import logging
3
+ import argparse
4
+ from deltacat import logs
5
+
6
# module-level DeltaCAT system logger (not referenced elsewhere in this module)
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
7
+
8
+
9
def store_cli_args_in_os_environ(script_args_list=None):
    """Parse command-line arguments and store them in ``os.environ``.

    Args:
        script_args_list: List of ``(args, kwargs)`` tuples, each forwarded to
            ``argparse.ArgumentParser.add_argument``. Defaults to no arguments.

    Raises:
        SystemExit: If argument parsing fails (argparse default behavior).
    """
    # default to None rather than a mutable default list, which would be
    # shared (and mutable) across all calls to this function
    parser = argparse.ArgumentParser()
    for args, kwargs in script_args_list or []:
        parser.add_argument(*args, **kwargs)
    args = parser.parse_args()
    print(f"Command Line Arguments: {args}")
    # os.environ only accepts string values, so skip arguments that were not
    # provided on the command line (argparse stores None for them) and coerce
    # the rest to str
    os.environ.update(
        {key: str(value) for key, value in vars(args).items() if value is not None}
    )
@@ -0,0 +1,27 @@
1
+ import ray
2
+ import deltacat
3
+ import daft
4
+ import pyiceberg
5
+
6
+
7
def print_package_version_info():
    """Print the versions of DeltaCAT and its key dependencies."""
    # dicts preserve insertion order, so output order matches the listing below
    package_versions = {
        "DeltaCAT": deltacat.__version__,
        "PyIceberg": pyiceberg.__version__,
        "Ray": ray.__version__,
        "Daft": daft.__version__,
    }
    for package, version in package_versions.items():
        print(f"{package} Version: {version}")
12
+
13
+
14
# Ray task that prints a greeting and package version info from a worker process
@ray.remote
def hello_worker():
    print("Hello, Worker!")
    print_package_version_info()
18
+
19
+
20
def run():
    """Print a greeting and package versions on the driver, then on a worker."""
    print("Hello, Driver!")
    print_package_version_info()
    # block until the worker task completes: a bare .remote() call is
    # fire-and-forget, so the driver could exit before the worker runs
    # (the sibling logging example also wraps its task in ray.get)
    ray.get(hello_worker.remote())
24
+
25
+
26
# script entry point: run the example on the driver
if __name__ == "__main__":
    run()
File without changes