deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1316 @@
1
+ # Allow classes to use self-referencing Type hints in Python 3.7.
2
+ from __future__ import annotations
3
+
4
+ import copy
5
+
6
+ from typing import Optional, Tuple, List
7
+
8
+ import base64
9
+ import json
10
+ import msgpack
11
+ import pyarrow.fs
12
+ import posixpath
13
+ import uuid
14
+ import deltacat
15
+
16
+ from deltacat.constants import (
17
+ METAFILE_FORMAT,
18
+ REVISION_DIR_NAME,
19
+ METAFILE_EXT,
20
+ SUPPORTED_METAFILE_FORMATS,
21
+ TXN_DIR_NAME,
22
+ TXN_PART_SEPARATOR,
23
+ SUCCESS_TXN_DIR_NAME,
24
+ )
25
+ from deltacat.storage.model.list_result import ListResult
26
+ from deltacat.storage.model.locator import Locator
27
+ from deltacat.storage.model.types import TransactionOperationType
28
+ from deltacat.utils.filesystem import (
29
+ resolve_path_and_filesystem,
30
+ list_directory,
31
+ get_file_info,
32
+ )
33
+
34
+
35
class MetafileRevisionInfo(dict):
    """
    Base class for DeltaCAT metafile revision info.

    Wraps the metadata encoded in a metafile revision file name (revision
    number, transaction operation type, and transaction ID) together with
    the revision directory path and file extension needed to reconstruct
    the revision file's full path.
    """

    @staticmethod
    def undefined() -> MetafileRevisionInfo:
        """
        Return a sentinel revision info representing "no revision"
        (revision number 0 with no associated transaction or directory).
        """
        mri = MetafileRevisionInfo()
        mri.revision = 0
        mri.txn_id = None
        mri.txn_op_type = None
        mri.dir_path = None
        return mri

    @staticmethod
    def parse(revision_file_path: str) -> MetafileRevisionInfo:
        """
        Parse revision info out of a metafile revision file path.

        The file name (sans extension) is expected to contain four
        TXN_PART_SEPARATOR-delimited parts: the revision number, the
        transaction operation type, and the two parts of the transaction ID.

        :param revision_file_path: Path to a metafile revision file.
        :return: MetafileRevisionInfo parsed from the path.
        """
        dir_path = posixpath.dirname(revision_file_path)
        metafile_name = posixpath.basename(revision_file_path)
        metafile_and_ext = posixpath.splitext(metafile_name)
        metafile_ext = metafile_and_ext[1] if len(metafile_and_ext) > 1 else None
        metafile_rev_and_txn_info = metafile_and_ext[0]
        txn_info_parts = metafile_rev_and_txn_info.split(TXN_PART_SEPARATOR)

        mri = MetafileRevisionInfo()
        mri.dir_path = dir_path
        mri.extension = metafile_ext
        mri.revision = int(txn_info_parts[0])
        mri.txn_op_type = txn_info_parts[1]
        # the transaction ID spans the last two separator-delimited parts
        mri.txn_id = f"{txn_info_parts[2]}{TXN_PART_SEPARATOR}{txn_info_parts[3]}"
        return mri

    @staticmethod
    def list_revisions(
        revision_dir_path: str,
        filesystem: pyarrow.fs.FileSystem,
        success_txn_log_dir: str,
        current_txn_start_time: Optional[int] = None,
        current_txn_id: Optional[str] = None,
        limit: Optional[int] = None,
    ) -> List[MetafileRevisionInfo]:
        """
        List committed metafile revisions, newest first.

        A revision is considered committed if it was written by the current
        transaction (if any), or by a transaction that completed before the
        current transaction started.

        :param revision_dir_path: Metafile revision directory to list.
        :param filesystem: Filesystem that can read the revision directory.
        :param success_txn_log_dir: Successful transaction log directory.
        :param current_txn_start_time: Start time of the current transaction.
        :param current_txn_id: ID of the current transaction.
        :param limit: Maximum number of revisions to return (None = no limit).
        :return: List of revision info, newest revision first.
        :raises ValueError: if no transaction log directory is given, or a
            transaction ID is given without a transaction start time.
        """
        if not success_txn_log_dir:
            err_msg = f"No transaction log found for: {revision_dir_path}."
            raise ValueError(err_msg)
        # find the latest committed revision of the target metafile
        sorted_metafile_paths = MetafileRevisionInfo._sorted_file_paths(
            revision_dir_path=revision_dir_path,
            filesystem=filesystem,
            ignore_missing_revision=True,
        )
        revisions = []
        while sorted_metafile_paths:
            # pop from the end of the sorted list to visit newest-first
            latest_metafile_path = sorted_metafile_paths.pop()
            mri = MetafileRevisionInfo.parse(latest_metafile_path)
            if not current_txn_id or mri.txn_id == current_txn_id:
                # consider the current transaction (if any) to be committed
                revisions.append(mri)
            elif current_txn_start_time is not None:
                # the current transaction can only build on top of the snapshot
                # of commits from transactions that completed before it started
                txn_end_time = (
                    deltacat.storage.model.transaction.Transaction.read_end_time(
                        path=posixpath.join(success_txn_log_dir, mri.txn_id),
                        filesystem=filesystem,
                    )
                )
                if txn_end_time is not None and txn_end_time < current_txn_start_time:
                    revisions.append(mri)
            else:
                # fix: added the missing closing backtick in the error message
                raise ValueError(
                    f"Current transaction ID `{current_txn_id}` provided "
                    f"without a transaction start time."
                )
            # fix: `limit` defaults to None, and `None <= int` raises
            # TypeError in Python 3, so only enforce an explicit limit
            if limit is not None and limit <= len(revisions):
                break
        return revisions

    @staticmethod
    def latest_revision(
        revision_dir_path: str,
        filesystem: pyarrow.fs.FileSystem,
        success_txn_log_dir: str,
        current_txn_start_time: Optional[int] = None,
        current_txn_id: Optional[str] = None,
        ignore_missing_revision: bool = False,
    ) -> MetafileRevisionInfo:
        """
        Fetch the latest committed revision of a metafile.
        :param revision_dir_path: root path of directory for metafile
        :param ignore_missing_revision: if True, will return
        MetafileRevisionInfo.undefined() on no revisions
        :raises ValueError if no revisions are found AND
        ignore_missing_revision=False
        """
        # list_revisions returns newest-first, so one item is enough
        revisions = MetafileRevisionInfo.list_revisions(
            revision_dir_path=revision_dir_path,
            filesystem=filesystem,
            success_txn_log_dir=success_txn_log_dir,
            current_txn_start_time=current_txn_start_time,
            current_txn_id=current_txn_id,
            limit=1,
        )
        if not revisions and not ignore_missing_revision:
            err_msg = f"No committed revision found at {revision_dir_path}."
            raise ValueError(err_msg)
        return revisions[0] if revisions else MetafileRevisionInfo.undefined()

    @staticmethod
    def new_revision(
        revision_dir_path: str,
        current_txn_op_type: deltacat.storage.model.transaction.TransactionOperationType,
        current_txn_start_time: int,
        current_txn_id: str,
        filesystem: pyarrow.fs.FileSystem,
        extension: Optional[str] = METAFILE_EXT,
        success_txn_log_dir: Optional[str] = None,
    ) -> MetafileRevisionInfo:
        """
        Creates and returns a new MetafileRevisionInfo object for the next
        revision of the metafile.

        This method determines the next revision information based on the
        latest existing revision in the specified directory path and the
        current transaction details.

        Args:
            revision_dir_path (str): Metafile revision directory path to
                generate the next metafile revision info for.
            current_txn_op_type (TransactionOperationType): The current
                transaction's operation type.
            current_txn_start_time (int): The current transaction's start time.
            current_txn_id (str): The current transaction's ID.
            filesystem (pyarrow.fs.FileSystem): The filesystem interface to
                use for file operations
            extension (str, optional): The file extension for metafiles.
                Defaults to METAFILE_EXT.
            success_txn_log_dir (Optional[str], optional): Directory path for
                successful transaction logs. Will be automatically discovered by
                traversing revision directory parent paths if not specified.

        Returns:
            MetafileRevisionInfo: A new revision info object containing
            metadata for the next revision

        Raises:
            ValueError: if the operation type is incompatible with the latest
            revision's state (create over an existing live revision, or
            update/delete over a deleted or missing revision).

        Notes:
            - For CREATE operations, the method will ignore missing previous
              revisions.
            - The method validates the transaction operation type before
              creating the new revision.
            - Uses the pyarrow filesystem interface for file operations.
        """
        is_create_txn = current_txn_op_type == TransactionOperationType.CREATE
        mri = MetafileRevisionInfo.latest_revision(
            revision_dir_path=revision_dir_path,
            filesystem=filesystem,
            success_txn_log_dir=success_txn_log_dir,
            current_txn_start_time=current_txn_start_time,
            current_txn_id=current_txn_id,
            ignore_missing_revision=is_create_txn,
        )
        # validate the transaction operation type
        if mri.exists():
            # update/delete fails if the last metafile was deleted
            if mri.txn_op_type == TransactionOperationType.DELETE:
                if current_txn_op_type != TransactionOperationType.CREATE:
                    # fix: removed duplicated "failed" from the message
                    raise ValueError(
                        f"Metafile {current_txn_op_type.value} failed "
                        f"for transaction ID {current_txn_id}. "
                        f"Metafile state at {mri.path} is deleted."
                    )
            # create fails unless the last metafile was deleted
            elif is_create_txn:
                raise ValueError(
                    f"Metafile creation for transaction ID {current_txn_id} "
                    f"failed. Metafile commit at {mri.path} already exists."
                )
        elif not is_create_txn:
            # update/delete fails if the last metafile doesn't exist
            # fix: removed duplicated "failed" from the message
            raise ValueError(
                f"Metafile {current_txn_op_type.value} failed for "
                f"transaction ID {current_txn_id}. Metafile at "
                f"{mri.path} does not exist."
            )
        mri.revision = mri.revision + 1
        mri.txn_id = current_txn_id
        mri.txn_op_type = current_txn_op_type
        mri.dir_path = revision_dir_path
        mri.extension = extension
        return mri

    @staticmethod
    def check_for_concurrent_txn_conflict(
        success_txn_log_dir: str,
        current_txn_revision_file_path: str,
        filesystem: pyarrow.fs.FileSystem,
    ) -> None:
        """
        Checks for a concurrent modification conflict between a file committed
        by the current transaction and another parallel transaction. Raises
        an exception if a concurrent modification conflict is found.

        :param success_txn_log_dir: Path to the log of successful transactions.
        :param current_txn_revision_file_path: Path to a metafile revision
        written by the current transaction to check for conflicts against.
        :param filesystem: Filesystem that can read the metafile revision.
        :raises RuntimeError: if a conflict is found with another transaction.
        """
        revision_dir_path = posixpath.dirname(current_txn_revision_file_path)
        cur_txn_mri = MetafileRevisionInfo.parse(current_txn_revision_file_path)

        sorted_metafile_paths = MetafileRevisionInfo._sorted_file_paths(
            revision_dir_path=revision_dir_path,
            filesystem=filesystem,
        )
        conflict_mris = []
        while sorted_metafile_paths:
            # visit revisions newest-first
            next_metafile_path = sorted_metafile_paths.pop()
            mri = MetafileRevisionInfo.parse(next_metafile_path)
            if mri.revision < cur_txn_mri.revision:
                # no conflict was found
                break
            elif (
                mri.revision == cur_txn_mri.revision
                and mri.txn_id != cur_txn_mri.txn_id
            ):
                # we've found a conflict between txn_id and current_txn_id
                # defer to the transaction with the higher lexicographic order
                # (i.e., the transaction that started most recently)
                # TODO(pdames): Ensure the conflicting transaction is alive
                #  (e.g., give each transaction a heartbeat timeout that gives
                #  it 1-2 seconds per operation, and record known failed
                #  transaction IDs)
                if mri.txn_id > cur_txn_mri.txn_id:
                    raise RuntimeError(
                        f"Aborting transaction {cur_txn_mri.txn_id} due to "
                        f"concurrent conflict at "
                        f"{current_txn_revision_file_path} with transaction "
                        f"{mri.txn_id} at {next_metafile_path}."
                    )
                conflict_mris.append(mri)
        if conflict_mris:
            # current txn wins the ordering challenge among all conflicts,
            # but we still need to ensure that no conflicting transactions
            # completed before seeing the conflict with this transaction
            for mri in conflict_mris:
                txn_end_time = (
                    deltacat.storage.model.transaction.Transaction.read_end_time(
                        path=posixpath.join(success_txn_log_dir, mri.txn_id),
                        filesystem=filesystem,
                    )
                )
                # TODO(pdames): Resolve risk of passing this check if it
                #  runs before the conflicting transaction marks itself as
                #  complete in the txn log. Some fixes include enforcing
                #  serializable isolation of the txn log, eventually
                #  consistent detection & repair, writing a mutex file
                #  that tells future transactions to only consider this txn
                #  complete if the conflicting txn is not complete, etc.
                if txn_end_time:
                    raise RuntimeError(
                        f"Aborting transaction {cur_txn_mri.txn_id} due to "
                        f"concurrent conflict at {revision_dir_path} with "
                        f"previously completed transaction {mri.txn_id} at "
                        f"{next_metafile_path}."
                    )

    @staticmethod
    def _sorted_file_paths(
        revision_dir_path: str,
        filesystem: pyarrow.fs.FileSystem,
        ignore_missing_revision: bool = False,
    ) -> List[str]:
        """
        List the file paths in a revision directory.

        NOTE(review): revision ordering relies on list_directory returning
        lexicographically sorted entries (zero-padded revision numbers make
        lexicographic order equal revision order) — confirm in
        deltacat.utils.filesystem.

        :param revision_dir_path: Revision directory to list.
        :param filesystem: Filesystem that can read the directory.
        :param ignore_missing_revision: if True, return an empty list instead
            of raising when the directory is empty or missing.
        :raises ValueError: if no files are found and
            ignore_missing_revision=False.
        """
        file_paths_and_sizes = list_directory(
            path=revision_dir_path,
            filesystem=filesystem,
            ignore_missing_path=True,
        )
        if not file_paths_and_sizes and not ignore_missing_revision:
            err_msg = (
                f"Expected to find at least 1 Metafile at "
                f"{revision_dir_path} but found none."
            )
            raise ValueError(err_msg)
        # keep only the path element of each (path, size) tuple
        return list(list(zip(*file_paths_and_sizes))[0]) if file_paths_and_sizes else []

    @property
    def revision(self) -> int:
        # revision number (0 indicates an undefined revision; see undefined())
        return self["revision"]

    @revision.setter
    def revision(self, revision: int):
        self["revision"] = revision

    @property
    def txn_id(self) -> Optional[str]:
        # ID of the transaction that wrote (or will write) this revision
        return self["txn_id"]

    @txn_id.setter
    def txn_id(self, txn_id: str):
        self["txn_id"] = txn_id

    @property
    def txn_op_type(self) -> Optional[TransactionOperationType]:
        # stored as a raw value; wrapped in the enum on read
        op_type = self.get("txn_op_type")
        return None if op_type is None else TransactionOperationType(op_type)

    @txn_op_type.setter
    def txn_op_type(self, txn_op_type: TransactionOperationType):
        self["txn_op_type"] = txn_op_type

    @property
    def dir_path(self) -> Optional[str]:
        # directory containing this metafile's revision files
        return self["dir_path"]

    @dir_path.setter
    def dir_path(self, dir_path: str):
        self["dir_path"] = dir_path

    @property
    def extension(self) -> str:
        # metafile extension; falls back to the catalog default
        return self.get("extension") or METAFILE_EXT

    @extension.setter
    def extension(self, extension: str):
        self["extension"] = extension

    @property
    def file_name(self) -> Optional[str]:
        """
        Revision file name of the form
        "{revision:020}{sep}{op_type}{sep}{txn_id}{extension}", or None if
        either the operation type or transaction ID is unset.
        NOTE(review): str.join requires string parts — assumes
        TransactionOperationType is a str-valued enum; confirm.
        """
        return (
            TXN_PART_SEPARATOR.join(
                [
                    f"{self.revision:020}",
                    self.txn_op_type,
                    f"{self.txn_id}{self.extension}",
                ]
            )
            if self.txn_op_type and self.txn_id
            else None
        )

    @property
    def path(self) -> Optional[str]:
        """Full path to this revision file, or None if not fully specified."""
        file_name = self.file_name
        return (
            posixpath.join(
                self.dir_path,
                file_name,
            )
            if self.dir_path and file_name
            else None
        )

    def exists(self) -> bool:
        """Return True if this info refers to a persisted revision."""
        return bool(self.revision)
390
+
391
+ class Metafile(dict):
392
+ """
393
+ Base class for DeltaCAT metadata files, with read and write methods
394
+ for dict-based DeltaCAT models. Uses msgpack (https://msgpack.org/) for
395
+ cross-language-compatible serialization and deserialization.
396
+ """
397
+
398
+ @staticmethod
399
+ def update_for(other: Optional[Metafile]) -> Optional[Metafile]:
400
+ """
401
+ Returns a new metafile that can be used as the destination metafile
402
+ in an update transaction operation against the input source metafile.
403
+ The returned metafile starts as an identical deep copy of the input
404
+ metafile such that, if the output is changed and committed as part of
405
+ an update transaction operation on the source metafile, then it will
406
+ update instead of replace the source metafile.
407
+ :param other: Source metafile for the copy.
408
+ :return: New copy of the source metafile.
409
+ """
410
+ return copy.deepcopy(other) if other is not None else None
411
+
412
+ @staticmethod
413
+ def based_on(
414
+ other: Optional[Metafile],
415
+ new_id: Optional[Locator] = None,
416
+ ) -> Optional[Metafile]:
417
+ """
418
+ Returns a new metafile equivalent to the input metafile, but with a new
419
+ ID assigned to distinguish it as a separate catalog object. This means
420
+ that, if the output is simply committed as part of an update transaction
421
+ operation on the source metafile, then it will replace instead of update
422
+ the source metafile.
423
+ :param other: Source metafile that is the basis for the new metafile.
424
+ :param new_id: New immutable ID to assign to the new metafile. Should
425
+ not be specified for metafiles with mutable names (e.g., namespaces and
426
+ tables).
427
+ :return: A new metafile based on the input metafile with a different ID.
428
+ """
429
+ metafile_copy = Metafile.update_for(other)
430
+ if metafile_copy:
431
+ # remove the source metafile ID so that this is treated as a
432
+ # different catalog object with otherwise identical properties
433
+ if not other.named_immutable_id:
434
+ metafile_copy.pop("id", None)
435
+ if new_id:
436
+ raise ValueError(
437
+ f"New ID cannot be specified for metafiles that "
438
+ f"don't have a named immutable ID."
439
+ )
440
+ else:
441
+ if not new_id:
442
+ raise ValueError(
443
+ f"New ID must be specified for metafiles that have a "
444
+ f"named immutable ID."
445
+ )
446
+ metafile_copy.named_immutable_id = new_id
447
+ # remove all ancestors of the original source metafile
448
+ metafile_copy.pop("ancestor_ids", None)
449
+ return metafile_copy
450
+
451
    @staticmethod
    def read_txn(
        catalog_root_dir: str,
        success_txn_log_dir: str,
        current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
        current_txn_start_time: int,
        current_txn_id: str,
        filesystem: Optional[pyarrow.fs.FileSystem] = None,
    ) -> ListResult[Metafile]:
        """
        Read one or more metadata files within the context of a transaction.

        Dispatches on the transaction operation type: READ_SIBLINGS and
        READ_CHILDREN delegate to the destination metafile's listing helpers,
        while READ_LATEST and READ_EXISTS fetch at most one revision of the
        destination metafile (READ_EXISTS without materializing it).

        :param catalog_root_dir: Catalog root dir to read the metafile from.
        :param success_txn_log_dir: Catalog root successful transaction log
        directory.
        :param current_txn_op: Transaction operation for this read.
        :param current_txn_start_time: Transaction start time for this read.
        :param current_txn_id: Transaction ID for this read.
        :param filesystem: File system to use for reading the metadata file. If
        not given, a default filesystem will be automatically selected based on
        the catalog root path.
        :return: ListResult of deserialized metadata files read.
        :raises ValueError: if the operation type is not a read operation.
        """
        # keyword arguments shared by all read helpers below
        kwargs = {
            "catalog_root": catalog_root_dir,
            "success_txn_log_dir": success_txn_log_dir,
            "current_txn_start_time": current_txn_start_time,
            "current_txn_id": current_txn_id,
            "filesystem": filesystem,
            "limit": current_txn_op.read_limit,
        }
        if current_txn_op.type == TransactionOperationType.READ_SIBLINGS:
            return current_txn_op.dest_metafile.siblings(**kwargs)
        elif current_txn_op.type == TransactionOperationType.READ_CHILDREN:
            return current_txn_op.dest_metafile.children(**kwargs)
        elif current_txn_op.type == TransactionOperationType.READ_LATEST:
            # only the single latest revision is needed
            kwargs["limit"] = 1
        elif current_txn_op.type == TransactionOperationType.READ_EXISTS:
            # existence check only: skip materializing revision contents
            kwargs["limit"] = 1
            kwargs["materialize_revisions"] = False
        else:
            raise ValueError(
                f"Unsupported transaction operation type: {current_txn_op.type}"
            )
        # return the latest metafile revision for READ_LATEST and READ_EXISTS
        list_result = current_txn_op.dest_metafile.revisions(**kwargs)
        revisions = list_result.all_items()
        metafiles = []
        if revisions:
            # each revision item is an (operation type, metafile) pair;
            # a DELETE as the latest revision means the metafile is gone
            op_type = revisions[0][0]
            if op_type != TransactionOperationType.DELETE:
                metafiles.append(revisions[0][1])
            # TODO(pdames): Add Optional[Metafile] to return type and just
            # return the latest metafile (if any) directly?
            return ListResult.of(
                items=metafiles,
                pagination_key=None,
                next_page_provider=None,
            )
        else:
            # Could not find any revisions in list operations - return no results
            return ListResult.empty()
512
+
513
+ @staticmethod
514
+ def get_class(serialized_dict: dict):
515
+ """
516
+ Given a serialized dictionary of Metafile data, gets the metafile child
517
+ class type to instantiate.
518
+ """
519
+ # TODO: more robust implementation. Right now this relies on the
520
+ # assumption that XLocator key will only be present in class X, and
521
+ # is brittle to renames. On the other hand, this implementation does
522
+ # not require any marker fields to be persisted, and a regression
523
+ # will be quickly detected by test_metafile.io or other unit tests
524
+ if serialized_dict.__contains__("tableLocator"):
525
+ return deltacat.storage.model.table.Table
526
+ elif serialized_dict.__contains__("namespaceLocator"):
527
+ return deltacat.storage.model.namespace.Namespace
528
+ elif serialized_dict.__contains__("tableVersionLocator"):
529
+ return deltacat.storage.model.table_version.TableVersion
530
+ elif serialized_dict.__contains__("partitionLocator"):
531
+ return deltacat.storage.model.partition.Partition
532
+ elif serialized_dict.__contains__("streamLocator"):
533
+ return deltacat.storage.model.stream.Stream
534
+ elif serialized_dict.__contains__("deltaLocator"):
535
+ return deltacat.storage.model.delta.Delta
536
+ else:
537
+ raise ValueError(
538
+ f"Could not find metafile class from serialized form: "
539
+ f"${serialized_dict}"
540
+ )
541
+
542
+ @classmethod
543
+ def read(
544
+ cls,
545
+ path: str,
546
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
547
+ format: Optional[str] = METAFILE_FORMAT,
548
+ ) -> Metafile:
549
+ """
550
+ Read a metadata file and return the deserialized object.
551
+ :param path: Metadata file path to read.
552
+ :param filesystem: File system to use for reading the metadata file.
553
+ :param format: Format to use for deserializing the metadata file.
554
+ :return: Deserialized object from the metadata file.
555
+ """
556
+ if format not in SUPPORTED_METAFILE_FORMATS:
557
+ raise ValueError(
558
+ f"Unsupported format '{format}'. Supported formats include: {SUPPORTED_METAFILE_FORMATS}."
559
+ )
560
+
561
+ if not filesystem:
562
+ path, filesystem = resolve_path_and_filesystem(path, filesystem)
563
+ with filesystem.open_input_stream(path) as file:
564
+ binary = file.readall()
565
+ reader = {
566
+ "json": lambda b: json.loads(
567
+ b.decode("utf-8"),
568
+ object_hook=lambda obj: {
569
+ k: base64.b64decode(v)
570
+ if isinstance(v, str) and v.startswith("b64:")
571
+ else v
572
+ for k, v in obj.items()
573
+ },
574
+ ),
575
+ "msgpack": msgpack.loads,
576
+ }[format]
577
+ data = reader(binary)
578
+ # cast this Metafile into the appropriate child class type
579
+ clazz = Metafile.get_class(data)
580
+ obj = clazz(**data).from_serializable(path, filesystem)
581
+ return obj
582
+
583
+ def write_txn(
584
+ self,
585
+ catalog_root_dir: str,
586
+ success_txn_log_dir: str,
587
+ current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
588
+ current_txn_start_time: int,
589
+ current_txn_id: str,
590
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
591
+ ) -> None:
592
+ """
593
+ Serialize and write this object to a metadata file within the context
594
+ of a transaction.
595
+ :param catalog_root_dir: Catalog root dir to write the metafile to.
596
+ :param success_txn_log_dir: Catalog root successful transaction log
597
+ directory.
598
+ :param current_txn_op: Transaction operation for this write.
599
+ :param current_txn_start_time: Transaction start time for this write.
600
+ :param current_txn_id: Transaction ID for this write.
601
+ :param filesystem: File system to use for writing the metadata file. If
602
+ not given, a default filesystem will be automatically selected based on
603
+ the catalog root path.
604
+ """
605
+ if not filesystem:
606
+ catalog_root_dir, filesystem = resolve_path_and_filesystem(
607
+ path=catalog_root_dir,
608
+ filesystem=filesystem,
609
+ )
610
+ self._write_metafile_revisions(
611
+ catalog_root=catalog_root_dir,
612
+ success_txn_log_dir=success_txn_log_dir,
613
+ current_txn_op=current_txn_op,
614
+ current_txn_start_time=current_txn_start_time,
615
+ current_txn_id=current_txn_id,
616
+ filesystem=filesystem,
617
+ )
618
+
619
+ def write(
620
+ self,
621
+ path: str,
622
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
623
+ format: Optional[str] = METAFILE_FORMAT,
624
+ ) -> None:
625
+ """
626
+ Serialize and write this object to a metadata file.
627
+ :param path: Metadata file path to write to.
628
+ :param filesystem: File system to use for writing the metadata file. If
629
+ not given, a default filesystem will be automatically selected based on
630
+ the catalog root path.
631
+ param: format: Format to use for serializing the metadata file.
632
+ """
633
+ if format not in SUPPORTED_METAFILE_FORMATS:
634
+ raise ValueError(
635
+ f"Unsupported format '{format}'. Supported formats include: {SUPPORTED_METAFILE_FORMATS}."
636
+ )
637
+
638
+ if not filesystem:
639
+ path, filesystem = resolve_path_and_filesystem(path, filesystem)
640
+ revision_dir_path = posixpath.dirname(path)
641
+ filesystem.create_dir(revision_dir_path, recursive=True)
642
+
643
+ writer = {
644
+ "json": lambda data: json.dumps(
645
+ data,
646
+ indent=4,
647
+ default=lambda b: base64.b64encode(b).decode("utf-8")
648
+ if isinstance(b, bytes)
649
+ else b,
650
+ ).encode("utf-8"),
651
+ "msgpack": msgpack.dumps,
652
+ }[format]
653
+
654
+ with filesystem.open_output_stream(path) as file:
655
+ file.write(writer(self.to_serializable()))
656
+
657
+ def equivalent_to(self, other: Metafile) -> bool:
658
+ """
659
+ True if this Metafile is equivalent to the other Metafile minus its
660
+ unique ID and ancestor IDs.
661
+
662
+ :param other: Metafile to compare to.
663
+ :return: True if the other metafile is equivalent, false if not.
664
+ """
665
+ identifiers = {"id", "ancestor_ids"}
666
+ for k, v in self.items():
667
+ if k not in identifiers and (k not in other or other[k] != v):
668
+ return False
669
+ for k in other.keys():
670
+ if k not in identifiers and k not in self:
671
+ return False
672
+ return True
673
+
674
+ @property
675
+ def named_immutable_id(self) -> Optional[str]:
676
+ """
677
+ If this metafile's locator name is immutable (i.e., if the object it
678
+ refers to can't be renamed) then returns an immutable ID suitable for
679
+ use in URLS or filesystem paths. Returns None if this locator name is
680
+ mutable (i.e., if the object it refers to can be renamed).
681
+ """
682
+ return self.locator.name.immutable_id
683
+
684
+ @named_immutable_id.setter
685
+ def named_immutable_id(self, immutable_id: Optional[str]) -> None:
686
+ """
687
+ If this metafile's locator name is immutable (i.e., if the object it
688
+ refers to can't be renamed), then sets an immutable ID for this
689
+ locator name suitable for use in URLS or filesystem paths. Note that
690
+ the ID is only considered immutable in durable catalog storage, and
691
+ remains mutable in transient memory (i.e., this setter remains
692
+ functional regardless of whether an ID is already assigned, but each
693
+ update will cause it to refer to a different, distinct object in
694
+ durable storage).
695
+ :raises NotImplementedError: If this metafile type does not have a
696
+ named immutable ID (i.e., its immutable ID is auto-generated).
697
+ """
698
+ self.locator.name.immutable_id = immutable_id
699
+
700
+ @property
701
+ def id(self) -> str:
702
+ """
703
+ Returns an existing immutable ID for this metafile or generates a new
704
+ one. This ID can be used for equality checks (i.e. 2 metafiles refer
705
+ to the same catalog object if they have the same ID) and deterministic
706
+ references (e.g. for generating a root namespace or table path that
707
+ remains the same regardless of renames).
708
+ """
709
+
710
+ # check if the locator name can be reused as an immutable ID
711
+ # or if we need to use a generated UUID as an immutable ID
712
+ _id = self.locator.name.immutable_id or self.get("id")
713
+ if not _id:
714
+ _id = self["id"] = str(uuid.uuid4())
715
+ return _id
716
+
717
+ @property
718
+ def locator(self) -> Optional[Locator]:
719
+ """
720
+ Returns the canonical locator for this metafile, which is typically used
721
+ to efficiently resolve internal system references to this object.
722
+ """
723
+ raise NotImplementedError()
724
+
725
+ @property
726
+ def locator_alias(self) -> Optional[Locator]:
727
+ """
728
+ Returns an optional locator alias for this metafile. This is
729
+ typically used to resolve a unique, human-readable reference to this
730
+ object (e.g., by using partition values instead of partition ID or
731
+ stream format name instead of stream ID). Locator aliases are
732
+ typically used during partition predicate pushdown (e.g., by
733
+ partition value + partition scheme ID) or to display unique
734
+ human-readable metafile names.
735
+ """
736
+ return None
737
+
738
+ def children(
739
+ self,
740
+ catalog_root: str,
741
+ success_txn_log_dir: str,
742
+ current_txn_start_time: Optional[int] = None,
743
+ current_txn_id: Optional[str] = None,
744
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
745
+ limit: Optional[int] = None,
746
+ ) -> ListResult[Metafile]:
747
+ """
748
+ Retrieve all children of this object.
749
+ :return: ListResult containing all children of this object.
750
+ """
751
+ catalog_root, filesystem = resolve_path_and_filesystem(
752
+ catalog_root,
753
+ filesystem,
754
+ )
755
+ metafile_root_dir_path = self.metafile_root_path(
756
+ catalog_root=catalog_root,
757
+ current_txn_start_time=current_txn_start_time,
758
+ current_txn_id=current_txn_id,
759
+ filesystem=filesystem,
760
+ )
761
+ # List metafiles with respect to this metafile's URI as root
762
+ return self._list_metafiles(
763
+ success_txn_log_dir=success_txn_log_dir,
764
+ metafile_root_dir_path=metafile_root_dir_path,
765
+ current_txn_start_time=current_txn_start_time,
766
+ current_txn_id=current_txn_id,
767
+ filesystem=filesystem,
768
+ limit=limit,
769
+ )
770
+
771
+ def siblings(
772
+ self,
773
+ catalog_root: str,
774
+ success_txn_log_dir: str,
775
+ current_txn_start_time: Optional[int] = None,
776
+ current_txn_id: Optional[str] = None,
777
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
778
+ limit: Optional[int] = None,
779
+ ) -> ListResult[Metafile]:
780
+ """
781
+ Retrieve all siblings of this object.
782
+ :return: ListResult containing all siblings of this object.
783
+ """
784
+ catalog_root, filesystem = resolve_path_and_filesystem(
785
+ catalog_root,
786
+ filesystem,
787
+ )
788
+ parent_obj_path = self.parent_root_path(
789
+ catalog_root=catalog_root,
790
+ current_txn_start_time=current_txn_start_time,
791
+ current_txn_id=current_txn_id,
792
+ filesystem=filesystem,
793
+ )
794
+ return self._list_metafiles(
795
+ success_txn_log_dir=success_txn_log_dir,
796
+ metafile_root_dir_path=parent_obj_path,
797
+ current_txn_start_time=current_txn_start_time,
798
+ current_txn_id=current_txn_id,
799
+ filesystem=filesystem,
800
+ limit=limit,
801
+ )
802
+
803
+ def revisions(
804
+ self,
805
+ catalog_root: str,
806
+ success_txn_log_dir: str,
807
+ current_txn_start_time: Optional[int] = None,
808
+ current_txn_id: Optional[str] = None,
809
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
810
+ limit: Optional[int] = None,
811
+ materialize_revisions: bool = True,
812
+ ) -> ListResult[Tuple[TransactionOperationType, Optional[Metafile]]]:
813
+ """
814
+ Retrieve all revisions of this object.
815
+ :return: ListResult containing all revisions of this object.
816
+ """
817
+ catalog_root, filesystem = resolve_path_and_filesystem(
818
+ catalog_root,
819
+ filesystem,
820
+ )
821
+ try:
822
+ parent_root = self.parent_root_path(
823
+ catalog_root=catalog_root,
824
+ current_txn_start_time=current_txn_start_time,
825
+ current_txn_id=current_txn_id,
826
+ filesystem=filesystem,
827
+ )
828
+ except ValueError:
829
+ # one or more ancestor's don't exist - return an empty list result
830
+ # TODO(pdames): Raise and catch a more explicit AncestorNotFound
831
+ # error type here.
832
+ return ListResult.empty()
833
+ try:
834
+ locator = (
835
+ self.locator
836
+ if self.locator.name.exists()
837
+ else self.locator_alias
838
+ if self.locator_alias and self.locator_alias.name.exists()
839
+ else None
840
+ )
841
+ immutable_id = (
842
+ # TODO(pdames): Refactor id lazy assignment into explicit getter/setter
843
+ self.get("id")
844
+ or Metafile._locator_to_id(
845
+ locator=locator,
846
+ catalog_root=catalog_root,
847
+ metafile_root=parent_root,
848
+ filesystem=filesystem,
849
+ txn_start_time=current_txn_start_time,
850
+ txn_id=current_txn_id,
851
+ )
852
+ if locator
853
+ else None
854
+ )
855
+ except ValueError:
856
+ # the metafile has been deleted
857
+ return ListResult.empty()
858
+ if not immutable_id:
859
+ # the metafile does not exist
860
+ return ListResult.empty()
861
+ revision_dir_path = posixpath.join(
862
+ parent_root,
863
+ immutable_id,
864
+ REVISION_DIR_NAME,
865
+ )
866
+ revisions = MetafileRevisionInfo.list_revisions(
867
+ revision_dir_path=revision_dir_path,
868
+ filesystem=filesystem,
869
+ success_txn_log_dir=success_txn_log_dir,
870
+ current_txn_start_time=current_txn_start_time,
871
+ current_txn_id=current_txn_id,
872
+ limit=limit,
873
+ )
874
+ items = []
875
+ for mri in revisions:
876
+ if mri.exists():
877
+ metafile = (
878
+ {}
879
+ if not materialize_revisions
880
+ else self.read(
881
+ path=mri.path,
882
+ filesystem=filesystem,
883
+ )
884
+ )
885
+ items.append((mri.txn_op_type, metafile))
886
+ # TODO(pdames): Add pagination.
887
+ return ListResult.of(
888
+ items=items,
889
+ pagination_key=None,
890
+ next_page_provider=None,
891
+ )
892
+
893
+ def to_serializable(self) -> Metafile:
894
+ """
895
+ Prepare the object for serialization by converting any non-serializable
896
+ types to serializable types. May also run any required pre-write
897
+ validations on the serialized or deserialized object.
898
+ :return: a serializable version of the object
899
+ """
900
+ return self
901
+
902
+ def from_serializable(
903
+ self,
904
+ path: str,
905
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
906
+ ) -> Metafile:
907
+ """
908
+ Restore any non-serializable types from a serializable version of this
909
+ object. May also run any required post-read validations on the
910
+ serialized or deserialized object.
911
+ :return: a deserialized version of the object
912
+ """
913
+ return self
914
+
915
+ def parent_root_path(
916
+ self,
917
+ catalog_root: str,
918
+ current_txn_start_time: Optional[int] = None,
919
+ current_txn_id: Optional[str] = None,
920
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
921
+ ) -> str:
922
+ ancestor_ids = self.ancestor_ids(
923
+ catalog_root=catalog_root,
924
+ current_txn_start_time=current_txn_start_time,
925
+ current_txn_id=current_txn_id,
926
+ filesystem=filesystem,
927
+ )
928
+ return posixpath.join(*[catalog_root] + ancestor_ids)
929
+
930
+ def metafile_root_path(
931
+ self,
932
+ catalog_root: str,
933
+ current_txn_start_time: Optional[int] = None,
934
+ current_txn_id: Optional[str] = None,
935
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
936
+ ) -> str:
937
+ parent_obj_path = self.parent_root_path(
938
+ catalog_root=catalog_root,
939
+ current_txn_start_time=current_txn_start_time,
940
+ current_txn_id=current_txn_id,
941
+ filesystem=filesystem,
942
+ )
943
+ return posixpath.join(
944
+ parent_obj_path,
945
+ self.id,
946
+ )
947
+
948
+ def ancestor_ids(
949
+ self,
950
+ catalog_root: str,
951
+ current_txn_start_time: Optional[int] = None,
952
+ current_txn_id: Optional[str] = None,
953
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
954
+ ) -> List[str]:
955
+ """
956
+ Returns the IDs for this metafile's ancestor metafiles. IDs are
957
+ listed in order from root to immediate parent.
958
+ """
959
+ ancestor_ids = self.get("ancestor_ids") or []
960
+ if not ancestor_ids:
961
+ ancestor_ids = Metafile._ancestor_ids(
962
+ locator=self.locator,
963
+ catalog_root=catalog_root,
964
+ current_txn_start_time=current_txn_start_time,
965
+ current_txn_id=current_txn_id,
966
+ filesystem=filesystem,
967
+ )
968
+ self["ancestor_ids"] = ancestor_ids
969
+ return ancestor_ids
970
+
971
+ @staticmethod
972
+ def _parent_metafile_rev_dir_path(
973
+ base_metafile_path: str,
974
+ parent_number,
975
+ ):
976
+ # TODO(pdames): Stop parent traversal at catalog root.
977
+ current_dir = posixpath.dirname( # base metafile root dir
978
+ posixpath.dirname( # base metafile revision dir
979
+ base_metafile_path,
980
+ )
981
+ )
982
+ while parent_number and current_dir != posixpath.sep:
983
+ current_dir = posixpath.dirname(current_dir)
984
+ parent_number -= 1
985
+ return posixpath.join(
986
+ current_dir,
987
+ REVISION_DIR_NAME,
988
+ )
989
+
990
+ @staticmethod
991
+ def _locator_to_id(
992
+ locator: Locator,
993
+ catalog_root: str,
994
+ metafile_root: str,
995
+ filesystem: pyarrow.fs.FileSystem,
996
+ txn_start_time: Optional[int] = None,
997
+ txn_id: Optional[str] = None,
998
+ ) -> Optional[str]:
999
+ """
1000
+ Resolves the immutable metafile ID for the given locator.
1001
+
1002
+ :return: Immutable ID read from mapping file. None if no mapping exists.
1003
+ :raises: ValueError if the id is found but has been deleted
1004
+ """
1005
+ metafile_id = locator.name.immutable_id
1006
+ if not metafile_id:
1007
+ # the locator name is mutable, so we need to resolve the mapping
1008
+ # from the locator back to its immutable metafile ID
1009
+ locator_path = locator.path(metafile_root)
1010
+ success_txn_log_dir = posixpath.join(
1011
+ catalog_root,
1012
+ TXN_DIR_NAME,
1013
+ SUCCESS_TXN_DIR_NAME,
1014
+ )
1015
+ mri = MetafileRevisionInfo.latest_revision(
1016
+ revision_dir_path=locator_path,
1017
+ filesystem=filesystem,
1018
+ success_txn_log_dir=success_txn_log_dir,
1019
+ current_txn_start_time=txn_start_time,
1020
+ current_txn_id=txn_id,
1021
+ ignore_missing_revision=True,
1022
+ )
1023
+ if not mri.exists():
1024
+ return None
1025
+ if mri.txn_op_type == TransactionOperationType.DELETE:
1026
+ err_msg = (
1027
+ f"Locator {locator} to metafile ID resolution failed "
1028
+ f"because its metafile ID mapping was deleted. You may "
1029
+ f"have an old reference to a renamed or deleted object."
1030
+ )
1031
+ raise ValueError(err_msg)
1032
+ metafile_id = posixpath.splitext(mri.path)[1][1:]
1033
+ return metafile_id
1034
+
1035
+ @staticmethod
1036
+ def _ancestor_ids(
1037
+ locator: Locator,
1038
+ catalog_root: str,
1039
+ current_txn_start_time: Optional[int] = None,
1040
+ current_txn_id: Optional[str] = None,
1041
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
1042
+ ) -> List[str]:
1043
+ ancestor_ids = []
1044
+ catalog_root, filesystem = resolve_path_and_filesystem(
1045
+ path=catalog_root,
1046
+ filesystem=filesystem,
1047
+ )
1048
+ parent_locators = []
1049
+ # TODO(pdames): Correctly resolve missing parents and K of N
1050
+ # specified ancestors by using placeholder IDs for missing
1051
+ # ancestors
1052
+ parent_locator = locator.parent
1053
+ while parent_locator:
1054
+ parent_locators.append(parent_locator)
1055
+ parent_locator = parent_locator.parent
1056
+ metafile_root = catalog_root
1057
+ while parent_locators:
1058
+ parent_locator = parent_locators.pop()
1059
+ ancestor_id = Metafile._locator_to_id(
1060
+ locator=parent_locator,
1061
+ catalog_root=catalog_root,
1062
+ metafile_root=metafile_root,
1063
+ filesystem=filesystem,
1064
+ txn_start_time=current_txn_start_time,
1065
+ txn_id=current_txn_id,
1066
+ )
1067
+ if not ancestor_id:
1068
+ err_msg = f"Ancestor does not exist: {parent_locator}."
1069
+ raise ValueError(err_msg)
1070
+ metafile_root = posixpath.join(
1071
+ metafile_root,
1072
+ ancestor_id,
1073
+ )
1074
+ try:
1075
+ get_file_info(
1076
+ path=metafile_root,
1077
+ filesystem=filesystem,
1078
+ )
1079
+ except FileNotFoundError:
1080
+ raise ValueError(
1081
+ f"Ancestor {parent_locator} does not exist at: " f"{metafile_root}"
1082
+ )
1083
+ ancestor_ids.append(ancestor_id)
1084
+ return ancestor_ids
1085
+
1086
+ def _write_locator_to_id_map_file(
1087
+ self,
1088
+ locator: Locator,
1089
+ success_txn_log_dir: str,
1090
+ parent_obj_path: str,
1091
+ current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
1092
+ current_txn_op_type: TransactionOperationType,
1093
+ current_txn_start_time: int,
1094
+ current_txn_id: str,
1095
+ filesystem: pyarrow.fs.FileSystem,
1096
+ ) -> None:
1097
+ name_resolution_dir_path = locator.path(parent_obj_path)
1098
+ # TODO(pdames): Don't write updated revisions with the same mapping as
1099
+ # the latest revision.
1100
+ mri = MetafileRevisionInfo.new_revision(
1101
+ revision_dir_path=name_resolution_dir_path,
1102
+ current_txn_op_type=current_txn_op_type,
1103
+ current_txn_start_time=current_txn_start_time,
1104
+ current_txn_id=current_txn_id,
1105
+ filesystem=filesystem,
1106
+ extension=f".{self.id}",
1107
+ success_txn_log_dir=success_txn_log_dir,
1108
+ )
1109
+ revision_file_path = mri.path
1110
+ filesystem.create_dir(posixpath.dirname(revision_file_path), recursive=True)
1111
+ with filesystem.open_output_stream(revision_file_path):
1112
+ pass # Just create an empty ID file to map to the locator
1113
+ current_txn_op.append_locator_write_path(revision_file_path)
1114
+
1115
+ def _write_metafile_revision(
1116
+ self,
1117
+ success_txn_log_dir: str,
1118
+ revision_dir_path: str,
1119
+ current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
1120
+ current_txn_op_type: TransactionOperationType,
1121
+ current_txn_start_time: int,
1122
+ current_txn_id: str,
1123
+ filesystem: pyarrow.fs.FileSystem,
1124
+ ) -> None:
1125
+ mri = MetafileRevisionInfo.new_revision(
1126
+ revision_dir_path=revision_dir_path,
1127
+ current_txn_op_type=current_txn_op_type,
1128
+ current_txn_start_time=current_txn_start_time,
1129
+ current_txn_id=current_txn_id,
1130
+ filesystem=filesystem,
1131
+ success_txn_log_dir=success_txn_log_dir,
1132
+ )
1133
+ self.write(
1134
+ path=mri.path,
1135
+ filesystem=filesystem,
1136
+ )
1137
+ current_txn_op.append_metafile_write_path(mri.path)
1138
+
1139
+ def _write_metafile_revisions(
1140
+ self,
1141
+ catalog_root: str,
1142
+ success_txn_log_dir: str,
1143
+ current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
1144
+ current_txn_start_time: int,
1145
+ current_txn_id: str,
1146
+ filesystem: pyarrow.fs.FileSystem,
1147
+ ) -> None:
1148
+ """
1149
+ Generates the fully qualified paths required to write this metafile as
1150
+ part of the given transaction. All paths returned will be based in the
1151
+ given root directory.
1152
+ """
1153
+ parent_obj_path = self.parent_root_path(
1154
+ catalog_root=catalog_root,
1155
+ current_txn_start_time=current_txn_start_time,
1156
+ current_txn_id=current_txn_id,
1157
+ filesystem=filesystem,
1158
+ )
1159
+ mutable_src_locator = None
1160
+ mutable_dest_locator = None
1161
+ # metafiles without named immutable IDs have mutable name mappings
1162
+ if not self.named_immutable_id:
1163
+ mutable_src_locator = (
1164
+ current_txn_op.src_metafile.locator
1165
+ if current_txn_op.src_metafile
1166
+ else None
1167
+ )
1168
+ mutable_dest_locator = current_txn_op.dest_metafile.locator
1169
+ # metafiles with named immutable IDs may have aliases
1170
+ elif self.locator_alias:
1171
+ mutable_src_locator = (
1172
+ current_txn_op.src_metafile.locator_alias
1173
+ if current_txn_op.src_metafile
1174
+ else None
1175
+ )
1176
+ mutable_dest_locator = current_txn_op.dest_metafile.locator_alias
1177
+ if mutable_dest_locator:
1178
+ # the locator name is mutable, so we need to persist a mapping
1179
+ # from the locator back to its immutable metafile ID
1180
+ if (
1181
+ current_txn_op.type == TransactionOperationType.UPDATE
1182
+ and mutable_src_locator is not None
1183
+ and mutable_src_locator != mutable_dest_locator
1184
+ ):
1185
+ # this update includes a rename
1186
+ # mark the source metafile mapping as deleted
1187
+ current_txn_op.src_metafile._write_locator_to_id_map_file(
1188
+ locator=mutable_src_locator,
1189
+ success_txn_log_dir=success_txn_log_dir,
1190
+ parent_obj_path=parent_obj_path,
1191
+ current_txn_op=current_txn_op,
1192
+ current_txn_op_type=TransactionOperationType.DELETE,
1193
+ current_txn_start_time=current_txn_start_time,
1194
+ current_txn_id=current_txn_id,
1195
+ filesystem=filesystem,
1196
+ )
1197
+ # mark the dest metafile mapping as created
1198
+ self._write_locator_to_id_map_file(
1199
+ locator=mutable_dest_locator,
1200
+ success_txn_log_dir=success_txn_log_dir,
1201
+ parent_obj_path=parent_obj_path,
1202
+ current_txn_op=current_txn_op,
1203
+ current_txn_op_type=TransactionOperationType.CREATE,
1204
+ current_txn_start_time=current_txn_start_time,
1205
+ current_txn_id=current_txn_id,
1206
+ filesystem=filesystem,
1207
+ )
1208
+ else:
1209
+ self._write_locator_to_id_map_file(
1210
+ locator=mutable_dest_locator,
1211
+ success_txn_log_dir=success_txn_log_dir,
1212
+ parent_obj_path=parent_obj_path,
1213
+ current_txn_op=current_txn_op,
1214
+ current_txn_op_type=current_txn_op.type,
1215
+ current_txn_start_time=current_txn_start_time,
1216
+ current_txn_id=current_txn_id,
1217
+ filesystem=filesystem,
1218
+ )
1219
+ metafile_revision_dir_path = posixpath.join(
1220
+ parent_obj_path,
1221
+ self.id,
1222
+ REVISION_DIR_NAME,
1223
+ )
1224
+ if (
1225
+ current_txn_op.type == TransactionOperationType.UPDATE
1226
+ and current_txn_op.src_metafile.id != current_txn_op.dest_metafile.id
1227
+ ):
1228
+ # TODO(pdames): block operations including both a rename & replace?
1229
+ # this update includes a replace
1230
+ # mark the source metafile as deleted
1231
+ src_metafile_revision_dir_path = posixpath.join(
1232
+ parent_obj_path,
1233
+ current_txn_op.src_metafile.id,
1234
+ REVISION_DIR_NAME,
1235
+ )
1236
+ self._write_metafile_revision(
1237
+ success_txn_log_dir=success_txn_log_dir,
1238
+ revision_dir_path=src_metafile_revision_dir_path,
1239
+ current_txn_op=current_txn_op,
1240
+ current_txn_op_type=TransactionOperationType.DELETE,
1241
+ current_txn_start_time=current_txn_start_time,
1242
+ current_txn_id=current_txn_id,
1243
+ filesystem=filesystem,
1244
+ )
1245
+ try:
1246
+ # mark the dest metafile as created
1247
+ self._write_metafile_revision(
1248
+ success_txn_log_dir=success_txn_log_dir,
1249
+ revision_dir_path=metafile_revision_dir_path,
1250
+ current_txn_op=current_txn_op,
1251
+ current_txn_op_type=TransactionOperationType.CREATE,
1252
+ current_txn_start_time=current_txn_start_time,
1253
+ current_txn_id=current_txn_id,
1254
+ filesystem=filesystem,
1255
+ )
1256
+ except ValueError as e:
1257
+ # TODO(pdames): raise/catch a DuplicateMetafileCreate exception.
1258
+ if "already exists" not in str(e):
1259
+ raise e
1260
+ # src metafile is being replaced by an existing dest metafile
1261
+
1262
+ else:
1263
+ self._write_metafile_revision(
1264
+ success_txn_log_dir=success_txn_log_dir,
1265
+ revision_dir_path=metafile_revision_dir_path,
1266
+ current_txn_op=current_txn_op,
1267
+ current_txn_op_type=current_txn_op.type,
1268
+ current_txn_start_time=current_txn_start_time,
1269
+ current_txn_id=current_txn_id,
1270
+ filesystem=filesystem,
1271
+ )
1272
+
1273
+ def _list_metafiles(
1274
+ self,
1275
+ success_txn_log_dir: str,
1276
+ metafile_root_dir_path: str,
1277
+ current_txn_start_time: Optional[int] = None,
1278
+ current_txn_id: Optional[str] = None,
1279
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
1280
+ limit: Optional[int] = None,
1281
+ ) -> ListResult[Metafile]:
1282
+ file_paths_and_sizes = list_directory(
1283
+ path=metafile_root_dir_path,
1284
+ filesystem=filesystem,
1285
+ ignore_missing_path=True,
1286
+ )
1287
+ # TODO(pdames): Exclude name resolution directories
1288
+ revision_dir_paths = [
1289
+ posixpath.join(file_path_and_size[0], REVISION_DIR_NAME)
1290
+ for file_path_and_size in file_paths_and_sizes
1291
+ if file_path_and_size[0] != success_txn_log_dir
1292
+ ]
1293
+ items = []
1294
+ for path in revision_dir_paths:
1295
+ mri = MetafileRevisionInfo.latest_revision(
1296
+ revision_dir_path=path,
1297
+ filesystem=filesystem,
1298
+ success_txn_log_dir=success_txn_log_dir,
1299
+ current_txn_start_time=current_txn_start_time,
1300
+ current_txn_id=current_txn_id,
1301
+ ignore_missing_revision=True,
1302
+ )
1303
+ if mri.exists():
1304
+ item = self.read(
1305
+ path=mri.path,
1306
+ filesystem=filesystem,
1307
+ )
1308
+ items.append(item)
1309
+ if limit and limit <= len(items):
1310
+ break
1311
+ # TODO(pdames): Add pagination.
1312
+ return ListResult.of(
1313
+ items=items,
1314
+ pagination_key=None,
1315
+ next_page_provider=None,
1316
+ )