deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (236) hide show
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/tests/_io/__init__.py +1 -0
  150. deltacat/tests/catalog/test_catalogs.py +324 -0
  151. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  152. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  153. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  154. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  155. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  156. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  157. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  158. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  159. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  160. deltacat/tests/compute/conftest.py +75 -0
  161. deltacat/tests/compute/converter/__init__.py +0 -0
  162. deltacat/tests/compute/converter/conftest.py +80 -0
  163. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  164. deltacat/tests/compute/converter/utils.py +123 -0
  165. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  166. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  167. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  168. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  169. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  170. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  171. deltacat/tests/compute/test_util_common.py +19 -12
  172. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  173. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  174. deltacat/tests/storage/__init__.py +0 -0
  175. deltacat/tests/storage/conftest.py +25 -0
  176. deltacat/tests/storage/main/__init__.py +0 -0
  177. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  178. deltacat/tests/storage/model/__init__.py +0 -0
  179. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  180. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  181. deltacat/tests/storage/model/test_schema.py +308 -0
  182. deltacat/tests/storage/model/test_shard.py +22 -0
  183. deltacat/tests/storage/model/test_table_version.py +110 -0
  184. deltacat/tests/storage/model/test_transaction.py +308 -0
  185. deltacat/tests/storage/rivulet/__init__.py +0 -0
  186. deltacat/tests/storage/rivulet/conftest.py +149 -0
  187. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  189. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  191. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  192. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  193. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  194. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  195. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  197. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  198. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  199. deltacat/tests/test_deltacat_api.py +39 -0
  200. deltacat/tests/test_utils/filesystem.py +14 -0
  201. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  202. deltacat/tests/test_utils/pyarrow.py +8 -15
  203. deltacat/tests/test_utils/storage.py +266 -3
  204. deltacat/tests/utils/test_daft.py +3 -3
  205. deltacat/tests/utils/test_pyarrow.py +0 -432
  206. deltacat/types/partial_download.py +1 -1
  207. deltacat/types/tables.py +1 -1
  208. deltacat/utils/export.py +59 -0
  209. deltacat/utils/filesystem.py +320 -0
  210. deltacat/utils/metafile_locator.py +73 -0
  211. deltacat/utils/pyarrow.py +36 -183
  212. deltacat-2.0.dist-info/METADATA +65 -0
  213. deltacat-2.0.dist-info/RECORD +347 -0
  214. deltacat/aws/redshift/__init__.py +0 -19
  215. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  216. deltacat/io/dataset.py +0 -73
  217. deltacat/io/read_api.py +0 -143
  218. deltacat/storage/model/delete_parameters.py +0 -40
  219. deltacat/storage/model/partition_spec.py +0 -71
  220. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  221. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  222. deltacat-1.1.36.dist-info/METADATA +0 -64
  223. deltacat-1.1.36.dist-info/RECORD +0 -219
  224. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  225. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  226. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  227. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  228. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  229. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  234. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  235. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,757 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import copy
5
+ import time
6
+ import uuid
7
+ import posixpath
8
+ from pathlib import PosixPath
9
+ import threading
10
+ from collections import defaultdict
11
+
12
+ from itertools import chain
13
+ from typing import Optional, List, Union, Tuple
14
+
15
+ import msgpack
16
+ import pyarrow.fs
17
+
18
+ from deltacat.constants import (
19
+ TXN_DIR_NAME,
20
+ TXN_PART_SEPARATOR,
21
+ RUNNING_TXN_DIR_NAME,
22
+ FAILED_TXN_DIR_NAME,
23
+ SUCCESS_TXN_DIR_NAME,
24
+ NANOS_PER_SEC,
25
+ )
26
+ from deltacat.storage.model.list_result import ListResult
27
+ from deltacat.storage.model.types import (
28
+ TransactionOperationType,
29
+ TransactionType,
30
+ )
31
+ from deltacat.storage.model.metafile import (
32
+ Metafile,
33
+ MetafileRevisionInfo,
34
+ )
35
+ from deltacat.utils.filesystem import (
36
+ resolve_path_and_filesystem,
37
+ list_directory,
38
+ )
39
+
40
+
41
class TransactionTimeProvider:
    """
    Interface for objects that hand out transaction start and end times.

    The ideal implementation is externally consistent (e.g.,
    https://cloud.google.com/spanner/docs/true-time-external-consistency),
    which means that:
        1. A transaction never starts before a previously completed
           transaction's end time.
        2. A transaction never ends before an in-progress transaction's
           start time.
        3. Start and end times are unique for every transaction.
        4. Assigning a start/end time does not block.

    Both methods of this base class only raise NotImplementedError, so
    subclasses are expected to override them.
    """

    def start_time(self) -> int:
        """
        Produce the start time for a transaction.
        :return: Transaction start time.
        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("start_time not implemented")

    def end_time(self) -> int:
        """
        Produce the end time for a transaction.
        :return: Transaction end time.
        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("end_time not implemented")
60
+
61
+
62
class TransactionSystemTimeProvider(TransactionTimeProvider):
    """
    A local transaction time provider that returns the current system clock
    epoch time in nanoseconds. Ensures that every start time handed out is
    greater than all start/end times previously recorded by this provider,
    and that every end time handed out is no less than all start/end times
    previously recorded by this provider, across all local threads using it.

    Note that this time provider gives no external consistency guarantees due
    to potential clock skew between distributed nodes writing to the same
    catalog, and is only recommended for use with local catalogs.
    """

    # Class-level (shared by all instances) registries of the most recent
    # start/end time handed out, keyed by (process ID, thread ID).
    last_known_start_times = defaultdict(int)
    last_known_end_times = defaultdict(int)

    # don't wait more than 60 seconds for the system clock to catch up
    # between transactions (assumed to be indicative of a larger system
    # clock change made between transactions)
    max_sync_wait_time = 60 * NANOS_PER_SEC

    @staticmethod
    def _current_thread_time_key() -> Tuple[int, int]:
        """Returns the (process ID, thread ID) registry key of the caller."""
        return os.getpid(), threading.current_thread().ident

    def _latest_known_time(self) -> int:
        """
        Returns the largest start or end time recorded so far by any thread,
        or 0 if no time has been recorded yet.
        """
        return max(
            chain(
                self.last_known_start_times.values(),
                self.last_known_end_times.values(),
            ),
            default=0,
        )

    def _wait_for_system_time(self, floor: int, *, strictly_after: bool) -> int:
        """
        Polls the system clock until it reaches the given floor.
        :param floor: Epoch time in nanoseconds the clock must reach.
        :param strictly_after: If True the returned time must be greater than
            the floor, otherwise it may be equal to it.
        :return: Current epoch time in nanoseconds.
        :raises TimeoutError: If the clock has not caught up after more than
            max_sync_wait_time nanoseconds of monotonic time.
        """
        wait_started = time.monotonic_ns()
        current_time = time.time_ns()
        while current_time < floor or (strictly_after and current_time == floor):
            # measure the wait with the monotonic clock since the wall clock
            # is the very thing that is suspected of having moved
            if time.monotonic_ns() - wait_started > self.max_sync_wait_time:
                raise TimeoutError(
                    f"Failed to sync cross-transaction system clock time after "
                    f"{self.max_sync_wait_time / NANOS_PER_SEC} seconds, "
                    f"aborting."
                )
            time.sleep(0.000001)
            current_time = time.time_ns()
        return current_time

    def start_time(self) -> int:
        """
        Gets the current system time in nanoseconds since the epoch. Ensures
        that the start time returned is greater than the last known start and
        end times recorded at the time this method is invoked.
        :return: Current epoch time in nanoseconds.
        :raises TimeoutError: If the system clock fails to catch up in time.
        """
        current_time = self._wait_for_system_time(
            self._latest_known_time(),
            strictly_after=True,
        )
        # record the current thread's last known START time (previously this
        # was written to the end time registry, so end_time() never saw it)
        self.last_known_start_times[self._current_thread_time_key()] = current_time
        return current_time

    def end_time(self) -> int:
        """
        Gets the current system time in nanoseconds since the epoch. Ensures
        that the end time returned is no less than the last known start and
        end times recorded at the time this method is invoked.
        :return: Current epoch time in nanoseconds.
        :raises TimeoutError: If the system clock fails to catch up in time.
        """
        current_time = self._wait_for_system_time(
            self._latest_known_time(),
            strictly_after=False,
        )
        # record the current thread's last known END time (previously this
        # was written to the start time registry, so start_time() never saw it)
        self.last_known_end_times[self._current_thread_time_key()] = current_time
        return current_time
148
+
149
+
150
class TransactionOperation(dict):
    """
    Base class for DeltaCAT transaction operations against individual metafiles.

    State is kept in the underlying dict under the keys "type",
    "dest_metafile", "src_metafile", "read_limit", "metafile_write_paths"
    and "locator_write_paths".
    """

    @staticmethod
    def of(
        operation_type: Optional[TransactionOperationType],
        dest_metafile: Metafile,
        src_metafile: Optional[Metafile] = None,
        read_limit: Optional[int] = None,
    ) -> TransactionOperation:
        """
        Creates a validated transaction operation.
        :param operation_type: Type of the operation. Must not be None.
        :param dest_metafile: Metafile targeted by the operation.
        :param src_metafile: Metafile being replaced. Required for, and only
            allowed with, UPDATE operations.
        :param read_limit: Read limit. Only allowed for non-write operations.
        :return: The new transaction operation.
        :raises ValueError: If any of the above constraints is violated.
        """
        if not dest_metafile:
            raise ValueError("Transaction operations must have a destination metafile.")
        if operation_type == TransactionOperationType.UPDATE:
            if not src_metafile:
                raise ValueError(
                    "UPDATE transaction operations must have a source metafile."
                )
            elif type(dest_metafile) is not type(src_metafile):
                raise ValueError(
                    f"Source metafile type `{type(src_metafile)}` is not "
                    f"equal to dest metafile type `{type(dest_metafile)}`."
                )
        elif src_metafile:
            raise ValueError(
                "Only UPDATE transaction operations may have a source metafile."
            )
        if operation_type is None:
            # fail with a clear message instead of an AttributeError on the
            # is_write_operation() call below
            raise ValueError("Transaction operations must have an operation type.")
        if operation_type.is_write_operation() and read_limit:
            raise ValueError("Only READ transaction operations may have a read limit.")
        txn_op = TransactionOperation()
        txn_op.type = operation_type
        txn_op.dest_metafile = dest_metafile
        txn_op.src_metafile = src_metafile
        txn_op.read_limit = read_limit
        return txn_op

    @property
    def type(self) -> TransactionOperationType:
        """
        Returns the type of the transaction operation.
        """
        return TransactionOperationType(self["type"])

    @type.setter
    def type(self, txn_op_type: TransactionOperationType):
        self["type"] = txn_op_type

    @property
    def dest_metafile(self) -> Metafile:
        """
        Returns the metafile that is the target of this transaction operation.
        """
        return self["dest_metafile"]

    @dest_metafile.setter
    def dest_metafile(self, metafile: Metafile):
        self["dest_metafile"] = metafile

    @property
    def src_metafile(self) -> Optional[Metafile]:
        """
        Returns the metafile that is the source of this transaction operation,
        or None if no source metafile has been set.
        """
        # .get() so that an operation without a source metafile entry yields
        # None (as the return type promises) instead of raising KeyError
        return self.get("src_metafile")

    @src_metafile.setter
    def src_metafile(self, src_metafile: Optional[Metafile]):
        self["src_metafile"] = src_metafile

    @property
    def read_limit(self) -> Optional[int]:
        """
        Returns the read limit for this transaction operation.
        """
        return self.get("read_limit")

    @read_limit.setter
    def read_limit(self, read_limit: Optional[int]):
        self["read_limit"] = read_limit

    @property
    def metafile_write_paths(self) -> List[str]:
        """
        Returns the metafile write paths recorded for this operation, or an
        empty list if none have been recorded.
        """
        return self.get("metafile_write_paths") or []

    @metafile_write_paths.setter
    def metafile_write_paths(self, write_paths: List[str]) -> None:
        self["metafile_write_paths"] = write_paths

    @property
    def locator_write_paths(self) -> List[str]:
        """
        Returns the locator write paths recorded for this operation, or an
        empty list if none have been recorded.
        """
        return self.get("locator_write_paths") or []

    @locator_write_paths.setter
    def locator_write_paths(self, write_paths: List[str]):
        self["locator_write_paths"] = write_paths

    def append_metafile_write_path(self, write_path: str):
        """
        Records an additional metafile write path for this operation.
        """
        metafile_write_paths = self.get("metafile_write_paths")
        if not metafile_write_paths:
            # missing, None, or empty: start from a fresh stored list
            metafile_write_paths = self["metafile_write_paths"] = []
        metafile_write_paths.append(write_path)

    def append_locator_write_path(self, write_path: str):
        """
        Records an additional locator write path for this operation.
        """
        locator_write_paths = self.get("locator_write_paths")
        if not locator_write_paths:
            # missing, None, or empty: start from a fresh stored list
            locator_write_paths = self["locator_write_paths"] = []
        locator_write_paths.append(write_path)
258
+
259
+
260
class TransactionOperationList(List[TransactionOperation]):
    """
    List of transaction operations that lazily wraps plain (e.g. freshly
    deserialized) items in TransactionOperation when they are accessed.
    """

    @staticmethod
    def of(items: List[TransactionOperation]) -> TransactionOperationList:
        """
        Builds a typed list from the given items, wrapping every item that is
        neither None nor already a TransactionOperation.
        :param items: Items to copy into the new list.
        :return: The new transaction operation list.
        """
        typed_items = TransactionOperationList()
        for item in items:
            if item is not None and not isinstance(item, TransactionOperation):
                item = TransactionOperation(item)
            typed_items.append(item)
        return typed_items

    def __getitem__(self, item):
        """
        Returns the operation at the given index, wrapping it in place on
        first access, or a new TransactionOperationList for a slice.
        """
        val = super().__getitem__(item)
        if isinstance(item, slice):
            # a slice yields a list of items; wrapping that list itself in a
            # TransactionOperation and writing it back would fail or corrupt
            # this list, so convert the sliced items individually instead
            return TransactionOperationList.of(val)
        if val is not None and not isinstance(val, TransactionOperation):
            self[item] = val = TransactionOperation(val)
        return val

    def __iter__(self):
        """
        Iterates through __getitem__ so that items yielded by a for-loop get
        the same lazy conversion as items accessed by index.
        """
        index = 0
        while index < len(self):
            yield self[index]
            index += 1
275
+
276
+
277
+ class Transaction(dict):
278
+ """
279
+ Base class for DeltaCAT transactions.
280
+ """
281
+
282
@staticmethod
def of(
    txn_type: TransactionType,
    txn_operations: Optional[TransactionOperationList],
) -> Transaction:
    """
    Creates a transaction after validating that its operation types are
    compatible with the transaction type.
    :param txn_type: Type of the transaction.
    :param txn_operations: Operations of the transaction. None is treated
        as an empty operation list.
    :return: The new transaction.
    :raises ValueError: If the operation types are not allowed for the
        given transaction type.
    """
    if txn_operations is None:
        # the parameter is declared Optional; iterating None below would
        # otherwise fail with a TypeError
        txn_operations = TransactionOperationList()
    operation_types = {op.type for op in txn_operations}
    if txn_type == TransactionType.READ:
        if operation_types - TransactionOperationType.read_operations():
            raise ValueError(
                "Only READ transaction operation types may be specified as "
                "part of a READ transaction."
            )
    elif (
        len(operation_types) == 1
        and TransactionOperationType.CREATE in operation_types
    ):
        if txn_type != TransactionType.APPEND:
            raise ValueError(
                "Transactions with only CREATE operations must be "
                "specified as part of an APPEND transaction."
            )
    elif TransactionOperationType.DELETE in operation_types:
        if txn_type != TransactionType.DELETE:
            raise ValueError(
                "DELETE transaction operations must be specified as part "
                "of a DELETE transaction."
            )
    elif TransactionOperationType.UPDATE in operation_types and txn_type not in {
        TransactionType.ALTER,
        TransactionType.RESTATE,
        TransactionType.OVERWRITE,
    }:
        raise ValueError(
            "Transactions with UPDATE operations must be specified "
            "as part of an ALTER, RESTATE, or OVERWRITE transaction."
        )
    transaction = Transaction()
    transaction.type = txn_type
    transaction.operations = txn_operations
    return transaction
322
+
323
@staticmethod
def read_end_time(
    path: str,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
) -> Optional[int]:
    """
    Looks up a transaction's end time by listing the given transaction log
    path.
    :param path: Transaction log path to list.
    :param filesystem: File system to use for listing the path. Resolved
        from the path when not given.
    :return: The end time parsed from the single entry listed at the path,
        or None if nothing is listed there.
    :raises ValueError: If more than one entry is listed at the path.
    """
    # TODO(pdames): Validate that input file path is a valid txn log.
    if not filesystem:
        path, filesystem = resolve_path_and_filesystem(path, filesystem)
    listed = list_directory(
        path=path,
        filesystem=filesystem,
        ignore_missing_path=True,
    )
    if not listed:
        return None
    found = len(listed)
    if found > 1:
        raise ValueError(
            f"Expected to find only one transaction log at {path}, "
            f"but found {found}"
        )
    # NOTE(review): each listed entry is presumably a (path, size) pair whose
    # first element is the log file path - confirm against list_directory
    txn_log_path = listed[0][0]
    return Transaction._parse_end_time(txn_log_path)
352
+
353
+ @staticmethod
354
+ def _parse_end_time(txn_log_file_name_or_path: str) -> int:
355
+ return int(posixpath.basename(txn_log_file_name_or_path))
356
+
357
@classmethod
def read(
    cls,
    path: str,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
) -> Transaction:
    """
    Loads a msgpack-encoded Transaction file and rebuilds the object from
    its decoded fields.
    :param path: Transaction file path to read.
    :param filesystem: File system to use for reading the Transaction file.
        Resolved from the path when not given.
    :return: Deserialized object from the Transaction file.
    """
    if not filesystem:
        path, filesystem = resolve_path_and_filesystem(path, filesystem)
    with filesystem.open_input_stream(path) as stream:
        payload = stream.readall()
    decoded_fields = msgpack.loads(payload)
    return cls(**decoded_fields)
375
+
376
@property
def id(self) -> Optional[str]:
    """
    Returns this transaction's unique ID, or None if no ID has been
    assigned yet. The ID is created and stored on first access once the
    transaction has a start time; it is the start time and a random UUID
    joined by the transaction part separator.
    """
    txn_id = self.get("id")
    if txn_id:
        return txn_id
    started_at = self.start_time
    if not started_at:
        # not started yet, so there is nothing to derive an ID from
        return txn_id
    txn_id = f"{started_at}{TXN_PART_SEPARATOR}{uuid.uuid4()}"
    self["id"] = txn_id
    return txn_id
386
+
387
@property
def type(self) -> TransactionType:
    """
    Returns the type of the transaction, converted to a TransactionType
    from the value stored under the "type" key.
    """
    stored_type = self["type"]
    return TransactionType(stored_type)

@type.setter
def type(self, txn_type: TransactionType):
    """
    Stores the given transaction type under the "type" key.
    """
    self["type"] = txn_type
397
+
398
@property
def operations(self) -> TransactionOperationList:
    """
    Returns the list of transaction operations as a new
    TransactionOperationList built from the stored "operations" value.
    """
    stored_operations = self["operations"]
    return TransactionOperationList(stored_operations)

@operations.setter
def operations(self, operations: TransactionOperationList):
    """
    Stores the given operations under the "operations" key.
    """
    self["operations"] = operations
408
+
409
+ @property
410
+ def start_time(self) -> Optional[int]:
411
+ """
412
+ Returns the start time of the transaction.
413
+ """
414
+ return self.get("start_time")
415
+
416
+ @property
417
+ def end_time(self) -> Optional[int]:
418
+ """
419
+ Returns the end time of the transaction.
420
+ """
421
+ return self.get("end_time")
422
+
423
+ def _mark_start_time(self, time_provider: TransactionTimeProvider) -> int:
424
+ """
425
+ Sets the start time of the transaction using the given
426
+ TransactionTimeProvider. Raises a runtime error if the transaction
427
+ start time has already been set by a previous commit.
428
+ """
429
+ if self.get("start_time"):
430
+ raise RuntimeError("Cannot restart a previously started transaction.")
431
+ start_time = self["start_time"] = time_provider.start_time()
432
+ return start_time
433
+
434
+ def _mark_end_time(self, time_provider: TransactionTimeProvider) -> int:
435
+ """
436
+ Sets the end time of the transaction using the given
437
+ TransactionTimeProvider. Raises a runtime error if the transaction end
438
+ time has already been set by a previous commit, or if the transaction
439
+ start time has not been set.
440
+ """
441
+ if not self.get("start_time"):
442
+ raise RuntimeError("Cannot end an unstarted transaction.")
443
+ if self.get("end_time"):
444
+ raise RuntimeError("Cannot end a completed transaction.")
445
+ end_time = self["end_time"] = time_provider.end_time()
446
+ return end_time
447
+
448
+ @staticmethod
449
+ def _abs_txn_meta_path_to_relative(root: str, target: str) -> str:
450
+ """
451
+ Takes an absolute root directory path and target absolute path to
452
+ relativize with respect to the root directory. Returns the target
453
+ path relative to the root directory path. Raises an error if the
454
+ target path is not contained in the given root directory path, if
455
+ either path is not an absolute path, or if the target path is equal
456
+ to the root directory path.
457
+ """
458
+ root_path = PosixPath(root)
459
+ target_path = PosixPath(target)
460
+ # TODO (martinezdavid): Check why is_absolute() fails for certain Delta paths
461
+ # if not root_path.is_absolute() or not target_path.is_absolute():
462
+ # raise ValueError("Both root and target must be absolute paths.")
463
+ if root_path == target_path:
464
+ raise ValueError(
465
+ "Target and root are identical, but expected target to be a child of root."
466
+ )
467
+ try:
468
+ relative_path = target_path.relative_to(root_path)
469
+ except ValueError:
470
+ raise ValueError("Expected target to be a child of root.")
471
+ return str(relative_path)
472
+
473
+ def relativize_operation_paths(
474
+ self, operation: TransactionOperation, catalog_root: str
475
+ ) -> None:
476
+ """
477
+ Converts all absolute paths in an operation to relative paths
478
+ with respect to the catalog root directory.
479
+ """
480
+ # handle metafile paths
481
+ if operation.metafile_write_paths:
482
+ metafile_write_paths = [
483
+ Transaction._abs_txn_meta_path_to_relative(catalog_root, path)
484
+ for path in operation.metafile_write_paths
485
+ ]
486
+ operation.metafile_write_paths = metafile_write_paths
487
+ # handle locator paths
488
+ if operation.locator_write_paths:
489
+ locator_write_paths = [
490
+ Transaction._abs_txn_meta_path_to_relative(catalog_root, path)
491
+ for path in operation.locator_write_paths
492
+ ]
493
+ operation.locator_write_paths = locator_write_paths
494
+
495
def to_serializable(self, catalog_root) -> Transaction:
    """
    Prepare the object for serialization by converting any non-serializable
    types to serializable types. May also run any required pre-write
    validations on the serialized or deserialized object.
    :param catalog_root: Catalog root directory that operation write paths
        are made relative to.
    :return: a serializable version of the object
    :raises ValueError: If a source or destination metafile has no ID.
    """
    # work on a deep copy so that this transaction is left unmodified
    serializable = copy.deepcopy(self)
    # remove all src/dest metafile contents except IDs and locators to
    # reduce file size (they can be reconstructed from their corresponding
    # files as required).
    for operation in serializable.operations:
        # Sanity check that IDs exist on source and dest metafiles
        if operation.dest_metafile and operation.dest_metafile.id is None:
            raise ValueError(
                f"Transaction operation {operation} dest metafile does "
                f"not have ID: {operation.dest_metafile}"
            )
        if operation.src_metafile and operation.src_metafile.id is None:
            raise ValueError(
                f"Transaction operation {operation} src metafile does "
                f"not have ID: {operation.src_metafile}"
            )
        # relativize after checking that dest and src metafiles are valid
        self.relativize_operation_paths(operation, catalog_root)
        operation.dest_metafile = {
            "id": operation.dest_metafile.id,
            "locator": operation.dest_metafile.locator,
            "locator_alias": operation.dest_metafile.locator_alias,
        }
        if operation.src_metafile:
            operation.src_metafile = {
                "id": operation.src_metafile.id,
                "locator": operation.src_metafile.locator,
                "locator_alias": operation.src_metafile.locator_alias,
            }
    # TODO(pdames): Ensure that all file paths recorded are relative to the
    #  catalog root.
    return serializable
534
+
535
@staticmethod
def _validate_txn_log_file(success_txn_log_file: str) -> None:
    """
    Validate the naming of a completed transaction log file.

    The name of the file's parent directory is split on
    ``TXN_PART_SEPARATOR``; its first part must be an integer start time
    and its second part must be a valid UUID string. The end time is
    obtained from ``Transaction._parse_end_time`` and must not be earlier
    than the start time.

    No filesystem calls are made directly here; acting on a failed
    validation is left to the caller.

    :param success_txn_log_file: path of the completed transaction log
        file to validate.
    :raises ValueError: if the start time or the end time is not valid.
    :raises OSError: if the UUID part is missing or malformed, or if the
        end time is earlier than the start time.
    """
    txn_log_dir_name = posixpath.basename(posixpath.dirname(success_txn_log_file))
    txn_log_parts = txn_log_dir_name.split(TXN_PART_SEPARATOR)
    # ensure that the transaction start time is valid
    try:
        start_time = int(txn_log_parts[0])
    except ValueError as e:
        raise ValueError(
            f"Transaction log file `{success_txn_log_file}` does not "
            f"contain a valid start time."
        ) from e
    # ensure that the txn uuid is valid; a directory name without a
    # separator has no UUID part at all, which is reported the same way as
    # a malformed UUID instead of escaping as a bare IndexError
    try:
        uuid.UUID(txn_log_parts[1])
    except (IndexError, ValueError) as e:
        raise OSError(
            f"Transaction log file `{success_txn_log_file}` does not "
            f"contain a valid UUID string."
        ) from e
    # ensure that the transaction end time is valid
    try:
        end_time = Transaction._parse_end_time(success_txn_log_file)
    except ValueError as e:
        raise ValueError(
            f"Transaction log file `{success_txn_log_file}` does not "
            f"contain a valid end time."
        ) from e
    # ensure transaction end time was not recorded before start time
    if end_time < start_time:
        raise OSError(
            f"Transaction end time {end_time} is earlier than start "
            f"time {start_time}! To preserve catalog integrity, the "
            f"corresponding completed transaction log at "
            f"`{success_txn_log_file}` has been removed."
        )
572
+
573
def commit(
    self,
    catalog_root_dir: str,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
) -> Union[List[ListResult[Metafile]], Tuple[List[str], str]]:
    """
    Commit this transaction against the catalog rooted at the given
    directory.

    The running, failed, and success transaction log directories are
    created below the catalog's transaction directory before anything
    else happens. A read transaction then runs ``read_txn`` on the
    destination metafile of each operation and returns the collected
    results; any other transaction type is handed to ``_commit_write``.

    :param catalog_root_dir: catalog root directory.
    :param filesystem: optional filesystem to use; it is resolved together
        with the catalog root by ``resolve_path_and_filesystem``.
    :return: one list result per operation for read transactions,
        otherwise the result of ``_commit_write``.
    """
    # TODO(pdames): allow transactions to be durably staged and resumed
    #  across multiple sessions prior to commit

    # work on a private copy of this transaction to guard against external
    # modification and dirty state across retries
    txn = copy.deepcopy(self)

    catalog_root_normalized, filesystem = resolve_path_and_filesystem(
        catalog_root_dir,
        filesystem,
    )
    txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
    running_txn_log_dir, failed_txn_log_dir, success_txn_log_dir = (
        posixpath.join(txn_log_dir, dir_name)
        for dir_name in (
            RUNNING_TXN_DIR_NAME,
            FAILED_TXN_DIR_NAME,
            SUCCESS_TXN_DIR_NAME,
        )
    )
    # the transaction directory is created first (recursively, via the
    # running dir) to telegraph that at least 1 transaction at this root
    # has been attempted
    filesystem.create_dir(running_txn_log_dir, recursive=True)
    filesystem.create_dir(failed_txn_log_dir, recursive=False)
    filesystem.create_dir(success_txn_log_dir, recursive=False)

    # TODO(pdames): Support injection of other time providers, but ensure
    #  that ALL transactions in a catalog use the same time provider.
    time_provider = TransactionSystemTimeProvider()

    # record the transaction start time
    txn._mark_start_time(time_provider)

    if txn.type == TransactionType.READ:
        # note: reads iterate the operations of `self`, while start time
        # and ID come from the internal copy
        return [
            operation.dest_metafile.read_txn(
                catalog_root_dir=catalog_root_normalized,
                success_txn_log_dir=success_txn_log_dir,
                current_txn_op=operation,
                current_txn_start_time=txn.start_time,
                current_txn_id=txn.id,
                filesystem=filesystem,
            )
            for operation in self.operations
        ]

    return txn._commit_write(
        catalog_root_normalized=catalog_root_normalized,
        running_txn_log_dir=running_txn_log_dir,
        failed_txn_log_dir=failed_txn_log_dir,
        success_txn_log_dir=success_txn_log_dir,
        filesystem=filesystem,
        time_provider=time_provider,
    )
628
+
629
def _commit_write(
    self,
    catalog_root_normalized: str,
    running_txn_log_dir: str,
    failed_txn_log_dir: str,
    success_txn_log_dir: str,
    filesystem: pyarrow.fs.FileSystem,
    time_provider: TransactionTimeProvider,
) -> Tuple[List[str], str]:
    """
    Run the write path of a transaction commit.

    Steps, in order: write the in-progress transaction log, write every
    operation's destination metafile, check all written paths for
    conflicts with concurrent transactions, then record and validate the
    completed transaction log. The in-progress log file is deleted at the
    end, on both the failure and the success paths.

    :param catalog_root_normalized: normalized catalog root directory.
    :param running_txn_log_dir: directory of in-progress transaction logs.
    :param failed_txn_log_dir: directory of failed transaction logs.
    :param success_txn_log_dir: directory of completed transaction logs.
    :param filesystem: filesystem used for all reads and writes.
    :param time_provider: provider used to mark the transaction end time.
    :return: the metafile write paths and the completed transaction log
        file path.
    :raises RuntimeError: if validation of the completed transaction log
        fails.
    """

    def _write_txn_log(log_file_path: str) -> None:
        # the stream is opened before the transaction is serialized
        with filesystem.open_output_stream(log_file_path) as stream:
            stream.write(
                msgpack.dumps(self.to_serializable(catalog_root_normalized))
            )

    # write the in-progress transaction log file
    running_txn_log_file_path = posixpath.join(running_txn_log_dir, self.id)
    _write_txn_log(running_txn_log_file_path)

    # write each metafile associated with the transaction
    metafile_write_paths = []
    locator_write_paths = []
    try:
        for operation in self.operations:
            operation.dest_metafile.write_txn(
                catalog_root_dir=catalog_root_normalized,
                success_txn_log_dir=success_txn_log_dir,
                current_txn_op=operation,
                current_txn_start_time=self.start_time,
                current_txn_id=self.id,
                filesystem=filesystem,
            )
            metafile_write_paths.extend(operation.metafile_write_paths)
            locator_write_paths.extend(operation.locator_write_paths)
        # check for conflicts with concurrent transactions
        all_write_paths = metafile_write_paths + locator_write_paths
        for revision_file_path in all_write_paths:
            MetafileRevisionInfo.check_for_concurrent_txn_conflict(
                success_txn_log_dir=success_txn_log_dir,
                current_txn_revision_file_path=revision_file_path,
                filesystem=filesystem,
            )
    except Exception:
        # write a failed transaction log file entry
        _write_txn_log(posixpath.join(failed_txn_log_dir, self.id))

        ###################################################################
        ###################################################################
        # failure past here telegraphs a failed transaction cleanup attempt
        ###################################################################
        ###################################################################

        # delete all files written during the failed transaction; the
        # per-operation path lists are collected up front, before the
        # first delete is issued
        paths_per_operation = [
            operation.metafile_write_paths + operation.locator_write_paths
            for operation in self.operations
        ]
        # TODO(pdames): Add separate janitor job to cleanup files that we
        #  either failed to add to the known write paths, or fail to delete.
        for write_path in chain.from_iterable(paths_per_operation):
            filesystem.delete_file(write_path)

        # delete the in-progress transaction log file entry
        filesystem.delete_file(running_txn_log_file_path)
        # failed transaction cleanup is now complete
        raise

    # record the completed transaction
    success_txn_log_file_dir = posixpath.join(success_txn_log_dir, self.id)
    filesystem.create_dir(success_txn_log_file_dir, recursive=False)
    end_time = self._mark_end_time(time_provider)
    success_txn_log_file_path = posixpath.join(
        success_txn_log_file_dir,
        str(end_time),
    )
    _write_txn_log(success_txn_log_file_path)
    try:
        Transaction._validate_txn_log_file(
            success_txn_log_file=success_txn_log_file_path
        )
    except Exception as validation_error:
        try:
            # move the txn log from success dir to failed dir
            failed_txn_log_file_path = posixpath.join(
                failed_txn_log_dir,
                self.id,
            )
            filesystem.move(
                src=success_txn_log_file_path,
                dest=failed_txn_log_file_path,
            )
            # keep parent success txn log dir to telegraph failed validation

            ###############################################################
            ###############################################################
            # failure past here telegraphs a failed transaction validation
            # cleanup attempt
            ###############################################################
            ###############################################################
        except Exception as cleanup_error:
            raise OSError(
                f"Failed to cleanup bad transaction log file at "
                f"`{success_txn_log_file_path}`"
            ) from cleanup_error
        finally:
            # NOTE(review): raising inside `finally` supersedes the OSError
            # raised just above, so callers always see this RuntimeError,
            # even when the move to the failed dir did not succeed. Confirm
            # whether that is intended.
            raise RuntimeError(
                f"Transaction validation failed. To preserve "
                f"catalog integrity, the corresponding completed "
                f"transaction log at `{success_txn_log_file_path}` has "
                f"been removed."
            ) from validation_error
    finally:
        # delete the in-progress transaction log file entry
        filesystem.delete_file(running_txn_log_file_path)
    return metafile_write_paths, success_txn_log_file_path