deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/storage/main/impl.py (new file; the +2077 hunk below matches entry 85 in the list above)
@@ -0,0 +1,2077 @@
+ import uuid
+
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
+
+ from deltacat.catalog import get_catalog_properties
+ from deltacat.constants import DEFAULT_TABLE_VERSION
+ from deltacat.exceptions import TableNotFoundError
+ from deltacat.storage.model.manifest import (
+     EntryParams,
+     ManifestAuthor,
+ )
+ from deltacat.storage.model.delta import (
+     Delta,
+     DeltaLocator,
+     DeltaProperties,
+     DeltaType,
+ )
+ from deltacat.storage.model.types import (
+     CommitState,
+     DistributedDataset,
+     LifecycleState,
+     LocalDataset,
+     LocalTable,
+     TransactionType,
+     TransactionOperationType,
+     StreamFormat,
+ )
+ from deltacat.storage.model.list_result import ListResult
+ from deltacat.storage.model.namespace import (
+     Namespace,
+     NamespaceLocator,
+     NamespaceProperties,
+ )
+ from deltacat.storage.model.partition import (
+     Partition,
+     PartitionLocator,
+     PartitionScheme,
+     PartitionValues,
+     UNPARTITIONED_SCHEME_ID,
+     PartitionLocatorAlias,
+ )
+ from deltacat.storage.model.schema import (
+     Schema,
+ )
+ from deltacat.storage.model.sort_key import (
+     SortScheme,
+ )
+ from deltacat.storage.model.stream import (
+     Stream,
+     StreamLocator,
+ )
+ from deltacat.storage.model.table import (
+     Table,
+     TableProperties,
+     TableLocator,
+ )
+ from deltacat.storage.model.table_version import (
+     TableVersion,
+     TableVersionProperties,
+     TableVersionLocator,
+ )
+ from deltacat.storage.model.metafile import (
+     Metafile,
+ )
+ from deltacat.storage.model.transaction import (
+     TransactionOperation,
+     Transaction,
+     TransactionOperationList,
+ )
+ from deltacat.storage.model.manifest import Manifest
+ from deltacat.types.media import (
+     ContentType,
+     DistributedDatasetType,
+     StorageType,
+     TableType,
+ )
+ from deltacat.utils.common import ReadKwargsProvider
+
+
+ def _list(
+     metafile: Metafile,
+     txn_op_type: TransactionOperationType,
+     *args,
+     **kwargs,
+ ) -> ListResult[Metafile]:
+     catalog_properties = get_catalog_properties(**kwargs)
+     limit = kwargs.get("limit") or None
+     transaction = Transaction.of(
+         txn_type=TransactionType.READ,
+         txn_operations=[
+             TransactionOperation.of(
+                 operation_type=txn_op_type,
+                 dest_metafile=metafile,
+                 read_limit=limit,
+             )
+         ],
+     )
+     list_results_per_op = transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+     return list_results_per_op[0]
+
+
+ def _latest(
+     metafile: Metafile,
+     *args,
+     **kwargs,
+ ) -> Optional[Metafile]:
+     list_results = _list(
+         *args,
+         metafile=metafile,
+         txn_op_type=TransactionOperationType.READ_LATEST,
+         **kwargs,
+     )
+     results = list_results.all_items()
+     return results[0] if results else None
+
+
+ def _exists(
+     metafile: Metafile,
+     *args,
+     **kwargs,
+ ) -> bool:
+     list_results = _list(
+         *args,
+         metafile=metafile,
+         txn_op_type=TransactionOperationType.READ_EXISTS,
+         **kwargs,
+     )
+     results = list_results.all_items()
+     return True if results else False
+
+
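Note: every read in this module funnels through the three helpers above: a single-operation READ transaction is committed against the catalog root resolved by `get_catalog_properties`, and the lone per-operation list result is returned. A minimal sketch of that pattern, assuming `CatalogProperties` takes a `root` path and that storage kwargs carry it under a `catalog` key (both assumptions for illustration):

    from deltacat.catalog.model.properties import CatalogProperties
    from deltacat.storage.model.namespace import Namespace, NamespaceLocator

    # Hypothetical catalog rooted at a local directory.
    props = CatalogProperties(root="/tmp/deltacat")
    ns = Namespace.of(NamespaceLocator.of("demo"))

    # READ_LATEST returns the current metafile revision, or None.
    latest = _latest(metafile=ns, catalog=props)
    # READ_EXISTS only checks for existence.
    print(_exists(metafile=ns, catalog=props))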
+ def _resolve_partition_locator_alias(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     partition_values: Optional[PartitionValues] = None,
+     partition_scheme_id: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> PartitionLocatorAlias:
+     # TODO(pdames): A read shouldn't initiate N transactions that
+     #  read against different catalog snapshots. To resolve this, add
+     #  new "start", "step", and "end" methods to Transaction that
+     #  support starting a txn, defining and executing a txn op,
+     #  retrieving its results, then defining and executing the next txn
+     #  op. When stepping through a transaction, its txn heartbeat
+     #  timeout should be set manually.
+     partition_locator = None
+     if not partition_values:
+         partition_scheme_id = UNPARTITIONED_SCHEME_ID
+     elif not partition_scheme_id:
+         # resolve the latest partition scheme from the current
+         # revision of the table version's `deltacat` stream
+         stream = get_stream(
+             *args,
+             namespace=namespace,
+             table_name=table_name,
+             table_version=table_version,
+             **kwargs,
+         )
+         if not stream:
+             raise ValueError(
+                 f"Failed to resolve latest partition scheme for "
+                 f"`{namespace}.{table_name}` at table version "
+                 f"`{table_version or 'latest'}` (no stream found)."
+             )
+         partition_locator = PartitionLocator.of(
+             stream_locator=stream.locator,
+             partition_values=partition_values,
+             partition_id=None,
+         )
+         partition_scheme_id = stream.partition_scheme.id
+     if not partition_locator:
+         partition_locator = PartitionLocator.at(
+             namespace=namespace,
+             table_name=table_name,
+             table_version=table_version,
+             stream_id=None,
+             stream_format=StreamFormat.DELTACAT,
+             partition_values=partition_values,
+             partition_id=None,
+         )
+     partition = Partition.of(
+         locator=partition_locator,
+         schema=None,
+         content_types=None,
+         partition_scheme_id=partition_scheme_id,
+     )
+     return partition.locator_alias
+
+
+ def _resolve_latest_active_table_version_id(
+     namespace: str,
+     table_name: str,
+     fail_if_no_active_table_version: bool = True,
+     *args,
+     **kwargs,
+ ) -> Optional[str]:
+     table = get_table(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         **kwargs,
+     )
+     if not table:
+         raise ValueError(f"Table does not exist: {namespace}.{table_name}")
+     if fail_if_no_active_table_version and not table.latest_active_table_version:
+         raise ValueError(f"Table has no active table version: {namespace}.{table_name}")
+     return table.latest_active_table_version
+
+
+ def _resolve_latest_table_version_id(
+     namespace: str,
+     table_name: str,
+     fail_if_no_active_table_version: bool = True,
+     *args,
+     **kwargs,
+ ) -> Optional[str]:
+     table = get_table(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         **kwargs,
+     )
+     if not table:
+         raise ValueError(f"Table does not exist: {namespace}.{table_name}")
+     if fail_if_no_active_table_version and not table.latest_table_version:
+         raise ValueError(f"Table has no table version: {namespace}.{table_name}")
+     return table.latest_table_version
+
+
+ def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
+     """
+     Lists a page of table namespaces. Namespaces are returned as list result
+     items.
+     """
+     return _list(
+         *args,
+         metafile=Namespace.of(NamespaceLocator.of("placeholder")),
+         txn_op_type=TransactionOperationType.READ_SIBLINGS,
+         **kwargs,
+     )
+
+
+ def list_tables(namespace: str, *args, **kwargs) -> ListResult[Table]:
+     """
+     Lists a page of tables for the given table namespace. Tables are returned as
+     list result items. Raises an error if the given namespace does not exist.
+     """
+     locator = TableLocator.at(namespace=namespace, table_name="placeholder")
+     return _list(
+         *args,
+         metafile=Table.of(locator=locator),
+         txn_op_type=TransactionOperationType.READ_SIBLINGS,
+         **kwargs,
+     )
+
+
+ def list_table_versions(
+     namespace: str,
+     table_name: str,
+     *args,
+     **kwargs,
+ ) -> ListResult[TableVersion]:
+     """
+     Lists a page of table versions for the given table. Table versions are
+     returned as list result items. Raises an error if the given table does not
+     exist.
+     """
+     locator = TableVersionLocator.at(
+         namespace=namespace,
+         table_name=table_name,
+         table_version="placeholder.0",
+     )
+     table_version = TableVersion.of(
+         locator=locator,
+         schema=None,
+     )
+     return _list(
+         *args,
+         metafile=table_version,
+         txn_op_type=TransactionOperationType.READ_SIBLINGS,
+         **kwargs,
+     )
+
+
+ def list_streams(
+     namespace: str,
+     table_name: str,
+     table_version: str,
+     *args,
+     **kwargs,
+ ) -> ListResult[Stream]:
+     """
+     Lists a page of streams for the given table version.
+     Raises an error if the table version does not exist.
+     """
+     locator = StreamLocator.at(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         stream_id="placeholder",
+         stream_format=None,
+     )
+     stream = Stream.of(
+         locator=locator,
+         partition_scheme=None,
+     )
+     return _list(
+         stream,
+         TransactionOperationType.READ_SIBLINGS,
+         *args,
+         **kwargs,
+     )
+
+
+ def list_partitions(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> ListResult[Partition]:
+     """
+     Lists a page of partitions for the given table version. Partitions are
+     returned as list result items. Table version resolves to the latest active
+     table version if not specified. Raises an error if the table version does
+     not exist.
+     """
+     locator = PartitionLocator.at(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         stream_id=None,
+         stream_format=StreamFormat.DELTACAT,
+         partition_values=["placeholder"],
+         partition_id="placeholder",
+     )
+     partition = Partition.of(
+         locator=locator,
+         schema=None,
+         content_types=None,
+     )
+     return _list(
+         *args,
+         metafile=partition,
+         txn_op_type=TransactionOperationType.READ_SIBLINGS,
+         **kwargs,
+     )
+
+
+ def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partition]:
+     """
+     Lists all partitions committed to the given stream.
+     """
+     if stream.stream_format != StreamFormat.DELTACAT:
+         raise ValueError(
+             f"Unsupported stream format: {stream.stream_format}. "
+             f"Expected stream format: {StreamFormat.DELTACAT}."
+         )
+     locator = PartitionLocator.of(
+         stream_locator=stream.locator,
+         partition_values=["placeholder"],
+         partition_id="placeholder",
+     )
+     partition = Partition.of(
+         locator=locator,
+         schema=None,
+         content_types=None,
+     )
+     return _list(
+         *args,
+         metafile=partition,
+         txn_op_type=TransactionOperationType.READ_SIBLINGS,
+         **kwargs,
+     )
+
+
+ def list_deltas(
+     namespace: str,
+     table_name: str,
+     partition_values: Optional[PartitionValues] = None,
+     table_version: Optional[str] = None,
+     first_stream_position: Optional[int] = None,
+     last_stream_position: Optional[int] = None,
+     ascending_order: Optional[bool] = None,
+     include_manifest: bool = False,
+     partition_scheme_id: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> ListResult[Delta]:
+     """
+     Lists a page of deltas for the given table version and committed partition.
+     Deltas are returned as list result items, and can optionally be limited to
+     an inclusive range of first and last stream positions. Deltas are returned
+     in descending stream position order by default. Table version resolves to
+     the latest active table version if not specified. Partition values should
+     not be specified for unpartitioned tables. Partition scheme ID resolves to
+     the table version's current partition scheme by default. Raises an error
+     if the given table version or partition does not exist.
+
+     To conserve memory, the deltas returned do not include manifests by
+     default. The manifests can either be optionally retrieved as part of this
+     call or lazily loaded via subsequent calls to `get_delta_manifest`.
+     """
+     # TODO(pdames): Delta listing should ideally either use an efficient
+     #  range-limited dir listing of partition children between start and end
+     #  positions, or should traverse using Partition.stream_position (to
+     #  resolve the last stream position) and Delta.previous_stream_position
+     #  (down to the first stream position).
+     partition_locator_alias = _resolve_partition_locator_alias(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         partition_values=partition_values,
+         partition_scheme_id=partition_scheme_id,
+         **kwargs,
+     )
+     locator = DeltaLocator.of(locator=partition_locator_alias)
+     delta = Delta.of(
+         locator=locator,
+         delta_type=None,
+         meta=None,
+         properties=None,
+         manifest=None,
+     )
+     all_deltas_list_result: ListResult[Delta] = _list(
+         *args,
+         metafile=delta,
+         txn_op_type=TransactionOperationType.READ_SIBLINGS,
+         **kwargs,
+     )
+     all_deltas = all_deltas_list_result.all_items()
+     filtered_deltas = [
+         delta
+         for delta in all_deltas
+         if (first_stream_position is None or first_stream_position <= delta.stream_position)
+         and (last_stream_position is None or delta.stream_position <= last_stream_position)
+     ]
+     if ascending_order:
+         filtered_deltas.reverse()
+     return filtered_deltas
+
+
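Note: a short usage sketch for the delta listing above. Stream position bounds are optional and inclusive, and results arrive in descending position order unless `ascending_order` is set; the `catalog` kwarg below is the same illustrative assumption used in the earlier sketch:

    from deltacat.catalog.model.properties import CatalogProperties

    props = CatalogProperties(root="/tmp/deltacat")
    deltas = list_deltas(
        namespace="demo",
        table_name="events",
        partition_values=None,       # unpartitioned table
        first_stream_position=100,   # inclusive lower bound
        last_stream_position=200,    # inclusive upper bound
        ascending_order=True,
        catalog=props,
    )
    for delta in deltas:
        print(delta.stream_position, delta.delta_type)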
+ def list_partition_deltas(
+     partition_like: Union[Partition, PartitionLocator],
+     first_stream_position: Optional[int] = None,
+     last_stream_position: Optional[int] = None,
+     ascending_order: bool = False,
+     include_manifest: bool = False,
+     *args,
+     **kwargs,
+ ) -> ListResult[Delta]:
+     """
+     Lists a page of deltas committed to the given partition.
+
+     To conserve memory, the deltas returned do not include manifests by
+     default. The manifests can either be optionally retrieved as part of this
+     call or lazily loaded via subsequent calls to `get_delta_manifest`.
+     """
+     # TODO(pdames): Delta listing should ideally either use an efficient
+     #  range-limited dir listing of partition children between start and end
+     #  positions, or should traverse using Partition.stream_position (to
+     #  resolve the last stream position) and Delta.previous_stream_position
+     #  (down to the first stream position).
+     locator = DeltaLocator.of(
+         partition_locator=partition_like
+         if isinstance(partition_like, PartitionLocator)
+         else partition_like.locator,
+         stream_position=None,
+     )
+     delta = Delta.of(
+         locator=locator,
+         delta_type=None,
+         meta=None,
+         properties=None,
+         manifest=None,
+     )
+     all_deltas_list_result: ListResult[Delta] = _list(
+         *args,
+         metafile=delta,
+         txn_op_type=TransactionOperationType.READ_SIBLINGS,
+         **kwargs,
+     )
+     all_deltas = all_deltas_list_result.all_items()
+     filtered_deltas = [
+         delta
+         for delta in all_deltas
+         if (first_stream_position is None or first_stream_position <= delta.stream_position)
+         and (last_stream_position is None or delta.stream_position <= last_stream_position)
+     ]
+     if ascending_order:
+         filtered_deltas.reverse()
+     return filtered_deltas
+
+
+ def get_delta(
+     namespace: str,
+     table_name: str,
+     stream_position: int,
+     partition_values: Optional[PartitionValues] = None,
+     table_version: Optional[str] = None,
+     include_manifest: bool = False,
+     partition_scheme_id: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> Optional[Delta]:
+     """
+     Gets the delta for the given table version, partition, and stream position.
+     Table version resolves to the latest active table version if not specified.
+     Partition values should not be specified for unpartitioned tables. Partition
+     scheme ID resolves to the table version's current partition scheme by
+     default. Raises an error if the given table version or partition does not
+     exist.
+
+     To conserve memory, the delta returned does not include a manifest by
+     default. The manifest can either be optionally retrieved as part of this
+     call or lazily loaded via a subsequent call to `get_delta_manifest`.
+     """
+     # TODO(pdames): Honor the `include_manifest` param.
+     partition_locator_alias = _resolve_partition_locator_alias(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         partition_values=partition_values,
+         partition_scheme_id=partition_scheme_id,
+         **kwargs,
+     )
+     locator = DeltaLocator.of(
+         locator=partition_locator_alias,
+         stream_position=stream_position,
+     )
+     delta = Delta.of(
+         locator=locator,
+         delta_type=None,
+         meta=None,
+         properties=None,
+         manifest=None,
+     )
+     return _latest(
+         *args,
+         metafile=delta,
+         **kwargs,
+     )
+
+
+ def get_latest_delta(
+     namespace: str,
+     table_name: str,
+     partition_values: Optional[PartitionValues] = None,
+     table_version: Optional[str] = None,
+     include_manifest: bool = False,
+     partition_scheme_id: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> Optional[Delta]:
+     """
+     Gets the latest delta (i.e., the delta with the greatest stream position)
+     for the given table version and partition. Table version resolves to the
+     latest active table version if not specified. Partition values should not
+     be specified for unpartitioned tables. Partition scheme ID resolves to the
+     table version's current partition scheme by default. Raises an error if
+     the given table version or partition does not exist.
+
+     To conserve memory, the delta returned does not include a manifest by
+     default. The manifest can either be optionally retrieved as part of this
+     call or lazily loaded via a subsequent call to `get_delta_manifest`.
+     """
+     # TODO(pdames): Wrap this method in a single txn.
+     stream = get_stream(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         **kwargs,
+     )
+     partition = get_partition(
+         *args,
+         stream_locator=stream.locator,
+         partition_values=partition_values,
+         partition_scheme_id=partition_scheme_id,
+         **kwargs,
+     )
+     locator = DeltaLocator.of(
+         locator=partition.locator,
+         stream_position=partition.stream_position,
+     )
+     delta = Delta.of(
+         locator=locator,
+         delta_type=None,
+         meta=None,
+         properties=None,
+         manifest=None,
+     )
+     return _latest(
+         *args,
+         metafile=delta,
+         **kwargs,
+     )
+
+
+ def download_delta(
+     delta_like: Union[Delta, DeltaLocator],
+     table_type: TableType = TableType.PYARROW,
+     storage_type: StorageType = StorageType.DISTRIBUTED,
+     max_parallelism: Optional[int] = None,
+     columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     ray_options_provider: Optional[Callable[[int, Any], Dict[str, Any]]] = None,
+     distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
+     *args,
+     **kwargs,
+ ) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
+     """
+     Downloads the given delta or delta locator into either a list of tables
+     resident in the local node's memory, or into a dataset distributed across
+     this Ray cluster's object store memory. Ordered table N of a local table
+     list, or ordered block N of a distributed dataset, always contains the
+     contents of ordered delta manifest entry N.
+     """
+     raise NotImplementedError("download_delta not implemented")
+
+
+ def download_delta_manifest_entry(
+     delta_like: Union[Delta, DeltaLocator],
+     entry_index: int,
+     table_type: TableType = TableType.PYARROW,
+     columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     *args,
+     **kwargs,
+ ) -> LocalTable:
+     """
+     Downloads a single manifest entry into the specified table type for the
+     given delta or delta locator. If a delta is provided with a non-empty
+     manifest, then the entry is downloaded from this manifest. Otherwise, the
+     manifest is first retrieved, then the given entry index downloaded.
+
+     NOTE: The entry will be downloaded in the current node's memory.
+     """
+     raise NotImplementedError("download_delta_manifest_entry not implemented")
+
+
+ def get_delta_manifest(
+     delta_like: Union[Delta, DeltaLocator],
+     *args,
+     **kwargs,
+ ) -> Manifest:
+     """
+     Gets the manifest associated with the given delta or delta locator. This
+     always retrieves the authoritative durable copy of the delta manifest, and
+     never the local manifest defined for any input delta. Raises an error if
+     the delta can't be found, or if it doesn't contain a manifest.
+     """
+     if isinstance(delta_like, Delta):
+         delta_locator = delta_like.locator
+     elif isinstance(delta_like, DeltaLocator):
+         delta_locator = delta_like
+     else:
+         raise ValueError(
+             f"Expected delta or delta locator, but got: {type(delta_like)}"
+         )
+     delta = Delta.of(
+         locator=delta_locator,
+         delta_type=None,
+         meta=None,
+         properties=None,
+         manifest=None,
+     )
+     latest_delta = _latest(
+         *args,
+         metafile=delta,
+         **kwargs,
+     )
+     if not latest_delta or not latest_delta.manifest:
+         raise ValueError(f"No manifest found for delta: {delta_locator}")
+     return latest_delta.manifest
+
+
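Note: since list and get calls skip manifests by default, the intended access pattern is to fetch a lightweight delta first and hydrate its manifest lazily. A sketch under the same hypothetical `catalog` kwarg (and assuming `Manifest` exposes its entries as `entries`):

    from deltacat.catalog.model.properties import CatalogProperties

    props = CatalogProperties(root="/tmp/deltacat")
    delta = get_delta(
        namespace="demo",
        table_name="events",
        stream_position=42,
        catalog=props,
    )
    if delta is not None:
        manifest = get_delta_manifest(delta, catalog=props)
        print(len(manifest.entries))  # assumed Manifest accessor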
+ def create_namespace(
+     namespace: str,
+     properties: Optional[NamespaceProperties] = None,
+     *args,
+     **kwargs,
+ ) -> Namespace:
+     """
+     Creates a table namespace with the given name and properties. Returns
+     the created namespace.
+     """
+     namespace = Namespace.of(
+         locator=NamespaceLocator.of(namespace=namespace),
+         properties=properties,
+     )
+     transaction = Transaction.of(
+         txn_type=TransactionType.APPEND,
+         txn_operations=[
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.CREATE,
+                 dest_metafile=namespace,
+             )
+         ],
+     )
+     catalog_properties = get_catalog_properties(**kwargs)
+     transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+     return namespace
+
+
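Note: namespace creation above is a single APPEND transaction, so pairing it with `list_namespaces` gives a quick end-to-end check. A sketch (the plain dict standing in for `NamespaceProperties` and the `catalog` kwarg are both assumptions):

    from deltacat.catalog.model.properties import CatalogProperties

    props = CatalogProperties(root="/tmp/deltacat")
    create_namespace(
        namespace="demo",
        properties={"owner": "data-eng"},  # assumed dict-like properties
        catalog=props,
    )
    for ns in list_namespaces(catalog=props).all_items():
        print(ns.namespace)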
+ def update_namespace(
+     namespace: str,
+     properties: Optional[NamespaceProperties] = None,
+     new_namespace: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> None:
+     """
+     Updates a table namespace's name and/or properties. Raises an error if the
+     given namespace does not exist.
+     """
+     # TODO(pdames): Wrap get & update within a single txn.
+     old_namespace = get_namespace(
+         *args,
+         namespace=namespace,
+         **kwargs,
+     )
+     updated_namespace: Namespace = Metafile.update_for(old_namespace)
+     updated_namespace.namespace = new_namespace or namespace
+     updated_namespace.properties = properties
+     transaction = Transaction.of(
+         txn_type=TransactionType.ALTER,
+         txn_operations=[
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.UPDATE,
+                 dest_metafile=updated_namespace,
+                 src_metafile=old_namespace,
+             )
+         ],
+     )
+     catalog_properties = get_catalog_properties(**kwargs)
+     transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+
+
+ def create_table_version(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     schema: Optional[Schema] = None,
+     partition_scheme: Optional[PartitionScheme] = None,
+     sort_keys: Optional[SortScheme] = None,
+     table_version_description: Optional[str] = None,
+     table_version_properties: Optional[TableVersionProperties] = None,
+     table_description: Optional[str] = None,
+     table_properties: Optional[TableProperties] = None,
+     supported_content_types: Optional[List[ContentType]] = None,
+     *args,
+     **kwargs,
+ ) -> Tuple[Table, TableVersion, Stream]:
+     """
+     Create a table version with an unreleased lifecycle state and an empty delta
+     stream. Table versions may be schemaless and unpartitioned to improve write
+     performance, or have their writes governed by a schema and partition scheme
+     to improve data consistency and read performance.
+
+     Returns a tuple containing the created/updated table, table version, and
+     stream (respectively).
+
+     Raises an error if the given namespace does not exist.
+     """
+     if not namespace_exists(
+         *args,
+         namespace=namespace,
+         **kwargs,
+     ):
+         raise ValueError(f"Namespace {namespace} does not exist")
+     # check if a parent table and/or previous table version already exist
+     prev_table_version = None
+     prev_table = get_table(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         **kwargs,
+     )
+     if not prev_table:
+         # no parent table exists, so we'll create it in this transaction
+         txn_type = TransactionType.APPEND
+         table_txn_op_type = TransactionOperationType.CREATE
+         prev_table = None
+         new_table = Table.of(
+             locator=TableLocator.at(namespace=namespace, table_name=table_name),
+         )
+         table_version = table_version or DEFAULT_TABLE_VERSION
+     else:
+         # the parent table exists, so we'll update it in this transaction
+         txn_type = TransactionType.ALTER
+         table_txn_op_type = TransactionOperationType.UPDATE
+         new_table: Table = Metafile.update_for(prev_table)
+         prev_table_version = prev_table.latest_table_version
+         if not table_version:
+             # generate the next table version ID
+             table_version = TableVersion.next_version(prev_table_version)
+         else:
+             # ensure that the given table version number matches expectations
+             expected_table_version = TableVersion.next_version(prev_table_version)
+             _, version_number = TableVersion.parse_table_version(
+                 table_version,
+             )
+             _, expected_version_number = TableVersion.parse_table_version(
+                 expected_table_version,
+             )
+             if version_number != expected_version_number:
+                 raise ValueError(
+                     f"Expected to create table version "
+                     f"{expected_version_number} but found {version_number}.",
+                 )
+     new_table.description = table_description or table_version_description
+     new_table.properties = table_properties
+     new_table.latest_table_version = table_version
+     catalog_properties = get_catalog_properties(**kwargs)
+     locator = TableVersionLocator.at(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+     )
+     table_version = TableVersion.of(
+         locator=locator,
+         schema=schema,
+         partition_scheme=partition_scheme,
+         description=table_version_description,
+         properties=table_version_properties,
+         content_types=supported_content_types,
+         sort_scheme=sort_keys,
+         watermark=None,
+         lifecycle_state=LifecycleState.CREATED,
+         schemas=[schema] if schema else None,
+         partition_schemes=[partition_scheme] if partition_scheme else None,
+         sort_schemes=[sort_keys] if sort_keys else None,
+         previous_table_version=prev_table_version,
+     )
+     # create the table version's default deltacat stream in this transaction
+     stream_locator = StreamLocator.of(
+         table_version_locator=locator,
+         stream_id=str(uuid.uuid4()),
+         stream_format=StreamFormat.DELTACAT,
+     )
+     stream = Stream.of(
+         locator=stream_locator,
+         partition_scheme=partition_scheme,
+         state=CommitState.COMMITTED,
+         previous_stream_id=None,
+         watermark=None,
+     )
+     transaction = Transaction.of(
+         txn_type=txn_type,
+         txn_operations=[
+             TransactionOperation.of(
+                 operation_type=table_txn_op_type,
+                 dest_metafile=new_table,
+                 src_metafile=prev_table,
+             ),
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.CREATE,
+                 dest_metafile=table_version,
+             ),
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.CREATE,
+                 dest_metafile=stream,
+             ),
+         ],
+     )
+     transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+     return new_table, table_version, stream
+
+
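Note: the function above creates the parent table (if needed), the table version in a CREATED lifecycle state, and its default DeltaCAT stream in one transaction. A sketch of a first table version; `Schema.of` wrapping a PyArrow schema is an assumption suggested by the new pyarrow-based `deltacat/storage/model/schema.py`, not a confirmed signature:

    import pyarrow as pa

    from deltacat.catalog.model.properties import CatalogProperties
    from deltacat.storage.model.schema import Schema

    props = CatalogProperties(root="/tmp/deltacat")
    schema = Schema.of(pa.schema([("id", pa.int64()), ("value", pa.string())]))
    table, table_version, stream = create_table_version(
        namespace="demo",
        table_name="events",
        schema=schema,
        catalog=props,
    )
    print(table_version.table_version, stream.stream_id)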
+ def update_table(
+     namespace: str,
+     table_name: str,
+     description: Optional[str] = None,
+     properties: Optional[TableProperties] = None,
+     new_table_name: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> None:
+     """
+     Update table metadata describing the table versions it contains. By default,
+     a table's properties are empty, and its description is equal to that given
+     when its first table version was created. Raises an error if the given
+     table does not exist.
+     """
+     old_table = get_table(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         **kwargs,
+     )
+     if not old_table:
+         raise TableNotFoundError(f"Table `{namespace}.{table_name}` does not exist.")
+     new_table: Table = Metafile.update_for(old_table)
+     new_table.description = description or old_table.description
+     new_table.properties = properties or old_table.properties
+     new_table.table_name = new_table_name or old_table.table_name
+     transaction = Transaction.of(
+         txn_type=TransactionType.ALTER,
+         txn_operations=[
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.UPDATE,
+                 dest_metafile=new_table,
+                 src_metafile=old_table,
+             )
+         ],
+     )
+     catalog_properties = get_catalog_properties(**kwargs)
+     transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+
+
+ def update_table_version(
+     namespace: str,
+     table_name: str,
+     table_version: str,
+     lifecycle_state: Optional[LifecycleState] = None,
+     schema: Optional[Schema] = None,
+     description: Optional[str] = None,
+     properties: Optional[TableVersionProperties] = None,
+     partition_scheme: Optional[PartitionScheme] = None,
+     sort_keys: Optional[SortScheme] = None,
+     *args,
+     **kwargs,
+ ) -> None:
+     """
+     Update a table version. Notably, updating an unreleased table version's
+     lifecycle state to 'active' telegraphs that it is ready for external
+     consumption, and causes all calls made to consume/produce streams,
+     partitions, or deltas from/to its parent table to automatically resolve to
+     this table version by default (i.e., when the client does not explicitly
+     specify a different table version). Raises an error if the given table
+     version does not exist.
+     """
+     # TODO(pdames): Wrap get & update within a single txn.
+     old_table_version = get_table_version(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         **kwargs,
+     )
+     if not old_table_version:
+         raise ValueError(
+             f"Table version `{table_version}` does not exist for "
+             f"table `{namespace}.{table_name}`."
+         )
+     new_table_version: TableVersion = Metafile.update_for(old_table_version)
+     new_table_version.state = lifecycle_state or old_table_version.state
+     # TODO(pdames): Use a schema patch to check for backwards-incompatible
+     #  changes. By default, backwards-incompatible changes should be pushed
+     #  to a new table version unless the user explicitly forces the update to
+     #  this table version (i.e., at the cost of potentially breaking
+     #  consumers).
+     update_schema = schema and not schema.equivalent_to(
+         old_table_version.schema,
+         True,
+     )
+     if update_schema and schema.id in [s.id for s in old_table_version.schemas]:
+         raise ValueError(
+             f"Schema ID `{schema.id}` already exists in "
+             f"table version `{table_version}`."
+         )
+     new_table_version.schema = schema if update_schema else old_table_version.schema
+     new_table_version.schemas = (
+         old_table_version.schemas + [schema]
+         if update_schema
+         else old_table_version.schemas
+     )
+     new_table_version.description = (
+         description if description is not None else old_table_version.description
+     )
+     new_table_version.properties = (
+         properties if properties is not None else old_table_version.properties
+     )
+     new_table_version.partition_scheme = (
+         partition_scheme or old_table_version.partition_scheme
+     )
+     # TODO(pdames): Check for backwards-incompatible partition scheme changes.
+     update_partition_scheme = partition_scheme and not partition_scheme.equivalent_to(
+         old_table_version.partition_scheme,
+         True,
+     )
+     if update_partition_scheme and partition_scheme.id in [
+         ps.id for ps in old_table_version.partition_schemes
+     ]:
+         raise ValueError(
+             f"Partition scheme ID `{partition_scheme.id}` already exists in "
+             f"table version `{table_version}`."
+         )
+     new_table_version.partition_schemes = (
+         old_table_version.partition_schemes + [partition_scheme]
+         if update_partition_scheme
+         else old_table_version.partition_schemes
+     )
+     # TODO(pdames): Check for backwards-incompatible sort scheme changes.
+     update_sort_scheme = sort_keys and not sort_keys.equivalent_to(
+         old_table_version.sort_scheme,
+         True,
+     )
+     if update_sort_scheme and sort_keys.id in [
+         sk.id for sk in old_table_version.sort_schemes
+     ]:
+         raise ValueError(
+             f"Sort scheme ID `{sort_keys.id}` already exists in "
+             f"table version `{table_version}`."
+         )
+     new_table_version.sort_scheme = sort_keys or old_table_version.sort_scheme
+     new_table_version.sort_schemes = (
+         old_table_version.sort_schemes + [sort_keys]
+         if update_sort_scheme
+         else old_table_version.sort_schemes
+     )
+     old_table = get_table(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         **kwargs,
+     )
+     txn_operations = []
+     if (
+         lifecycle_state == LifecycleState.ACTIVE
+         and old_table_version.state != LifecycleState.ACTIVE
+     ):
+         _, old_version_number = (
+             TableVersion.parse_table_version(
+                 old_table.latest_active_table_version,
+             )
+             if old_table.latest_active_table_version
+             else (None, None)
+         )
+         _, new_version_number = TableVersion.parse_table_version(table_version)
+         if old_version_number is None or old_version_number < new_version_number:
+             # update the table's latest active table version
+             new_table: Table = Metafile.update_for(old_table)
+             new_table.latest_active_table_version = table_version
+             txn_operations.append(
+                 TransactionOperation.of(
+                     operation_type=TransactionOperationType.UPDATE,
+                     dest_metafile=new_table,
+                     src_metafile=old_table,
+                 )
+             )
+     txn_operations.append(
+         TransactionOperation.of(
+             operation_type=TransactionOperationType.UPDATE,
+             dest_metafile=new_table_version,
+             src_metafile=old_table_version,
+         ),
+     )
+     # TODO(pdames): Push changes down to non-deltacat streams via sync module.
+     #  Also copy sort scheme changes down to the deltacat child stream?
+     if partition_scheme:
+         old_stream = get_stream(
+             *args,
+             namespace=namespace,
+             table_name=table_name,
+             table_version=table_version,
+             **kwargs,
+         )
+         new_stream: Stream = Metafile.update_for(old_stream)
+         new_stream.partition_scheme = partition_scheme
+         txn_operations.append(
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.UPDATE,
+                 dest_metafile=new_stream,
+                 src_metafile=old_stream,
+             )
+         )
+     transaction = Transaction.of(
+         txn_type=TransactionType.ALTER,
+         txn_operations=txn_operations,
+     )
+     catalog_properties = get_catalog_properties(**kwargs)
+     transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+
+
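Note: the activation path above is the notable one. Flipping the lifecycle state to ACTIVE also advances the parent table's `latest_active_table_version` pointer when the activated version number is higher than the current one. A sketch, continuing the illustrative `catalog` kwarg assumption:

    from deltacat.catalog.model.properties import CatalogProperties
    from deltacat.storage.model.types import LifecycleState

    props = CatalogProperties(root="/tmp/deltacat")
    update_table_version(
        namespace="demo",
        table_name="events",
        table_version="1",  # hypothetical table version ID
        lifecycle_state=LifecycleState.ACTIVE,
        catalog=props,
    )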
+ def stage_stream(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     stream_format: StreamFormat = StreamFormat.DELTACAT,
+     *args,
+     **kwargs,
+ ) -> Stream:
+     """
+     Stages a new delta stream for the given table version. Resolves to the
+     latest active table version if no table version is given. Resolves to the
+     DeltaCAT stream format if no stream format is given. If this stream will
+     replace another stream with the same format and scheme, then it will have
+     its previous stream ID set to the ID of the stream being replaced.
+     Returns the staged stream. Raises an error if the table version does not
+     exist.
+     """
+     # TODO(pdames): Support retrieving previously staged streams by ID.
+     if not table_version:
+         table_version = _resolve_latest_active_table_version_id(
+             *args,
+             namespace=namespace,
+             table_name=table_name,
+             **kwargs,
+         )
+     table_version_meta = get_table_version(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         **kwargs,
+     )
+     locator = StreamLocator.at(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         stream_id=str(uuid.uuid4()),
+         stream_format=stream_format or StreamFormat.DELTACAT,
+     )
+     stream = Stream.of(
+         locator=locator,
+         partition_scheme=table_version_meta.partition_scheme,
+         state=CommitState.STAGED,
+         previous_stream_id=None,
+         watermark=None,
+     )
+     prev_stream = get_stream(
+         *args,
+         namespace=stream.namespace,
+         table_name=stream.table_name,
+         table_version=stream.table_version,
+         stream_format=stream.stream_format,
+         **kwargs,
+     )
+     if prev_stream:
+         if prev_stream.stream_id == stream.stream_id:
+             raise ValueError(
+                 f"Stream to stage has the same ID as existing stream: {prev_stream}."
+             )
+         stream.previous_stream_id = prev_stream.stream_id
+     transaction = Transaction.of(
+         txn_type=TransactionType.APPEND,
+         txn_operations=[
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.CREATE,
+                 dest_metafile=stream,
+             )
+         ],
+     )
+     catalog_properties = get_catalog_properties(**kwargs)
+     transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+     return stream
+
+
+ def commit_stream(
+     stream: Stream,
+     *args,
+     **kwargs,
+ ) -> Stream:
+     """
+     Registers a staged delta stream with a target table version, replacing any
+     previous stream registered for the same table version. Returns the
+     committed stream.
+     """
+     if not stream.stream_id:
+         raise ValueError("Stream ID to commit must be set to a staged stream ID.")
+     if not stream.table_version_locator:
+         raise ValueError(
+             "Stream to commit must have its table version locator "
+             "set to the parent of its staged stream ID."
+         )
+     prev_staged_stream = get_stream_by_id(
+         *args,
+         table_version_locator=stream.table_version_locator,
+         stream_id=stream.stream_id,
+         **kwargs,
+     )
+     if not prev_staged_stream:
+         raise ValueError(
+             f"Stream at table version {stream.table_version_locator} with ID "
+             f"{stream.stream_id} not found."
+         )
+     if prev_staged_stream.state != CommitState.STAGED:
+         raise ValueError(
+             f"Expected to find a `{CommitState.STAGED}` stream at table version "
+             f"{stream.table_version_locator} with ID {stream.stream_id}, "
+             f"but found a `{prev_staged_stream.state}` stream."
+         )
+     stream: Stream = Metafile.update_for(prev_staged_stream)
+     stream.state = CommitState.COMMITTED
+     prev_committed_stream = get_stream(
+         *args,
+         namespace=stream.namespace,
+         table_name=stream.table_name,
+         table_version=stream.table_version,
+         stream_format=stream.stream_format,
+         **kwargs,
+     )
+     # the first transaction operation updates the staged stream commit state
+     txn_type = TransactionType.ALTER
+     txn_ops = [
+         TransactionOperation.of(
+             operation_type=TransactionOperationType.UPDATE,
+             dest_metafile=stream,
+             src_metafile=prev_staged_stream,
+         )
+     ]
+     if prev_committed_stream:
+         if prev_committed_stream.stream_id != stream.previous_stream_id:
+             raise ValueError(
+                 f"Previous stream ID mismatch. Expected "
+                 f"{stream.previous_stream_id} but found "
+                 f"{prev_committed_stream.stream_id}."
+             )
+         if prev_committed_stream.stream_id == stream.stream_id:
+             raise ValueError(
+                 f"Stream to commit has the same ID as existing stream: {prev_committed_stream}."
+             )
+         # there's a previously committed stream, so update the transaction
+         # type to overwrite the previously committed stream, and add another
+         # transaction operation to replace it with the staged stream
+         txn_type = TransactionType.OVERWRITE
+         txn_ops.append(
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.UPDATE,
+                 dest_metafile=stream,
+                 src_metafile=prev_committed_stream,
+             )
+         )
+     transaction = Transaction.of(
+         txn_type=txn_type,
+         txn_operations=txn_ops,
+     )
+     catalog_properties = get_catalog_properties(**kwargs)
+     transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+     return stream
+
+
1268
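A minimal usage sketch of the stage-then-commit flow above (hedged: `demo.events`
is a hypothetical table, `stage_stream` is assumed to be the staging counterpart
defined earlier in this module, and `catalog_kwargs` stands in for whatever
keyword arguments `get_catalog_properties` resolves into a catalog root and
filesystem in your deployment):

    catalog_kwargs = {}  # assumption: deployment-specific catalog properties
    # stage a new stream against the latest active table version...
    staged = stage_stream(namespace="demo", table_name="events", **catalog_kwargs)
    # ...then promote it from STAGED to COMMITTED, replacing any prior stream
    committed = commit_stream(stream=staged, **catalog_kwargs)
    assert committed.state == CommitState.COMMITTED
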
+ def delete_stream(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     stream_format: StreamFormat = StreamFormat.DELTACAT,
+     *args,
+     **kwargs,
+ ) -> None:
+     """
+     Deletes the delta stream currently registered with the given table version.
+     Resolves to the latest active table version if no table version is given.
+     Resolves to the DeltaCAT stream format if no stream format is given.
+     Raises an error if the stream does not exist.
+     """
+     if not table_version:
+         table_version = _resolve_latest_active_table_version_id(
+             *args,
+             namespace=namespace,
+             table_name=table_name,
+             **kwargs,
+         )
+     stream_to_delete = get_stream(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         stream_format=stream_format,
+         **kwargs,
+     )
+     if not stream_to_delete:
+         raise ValueError(
+             f"Stream to delete not found: {namespace}.{table_name}"
+             f".{table_version}.{stream_format}."
+         )
+     stream_to_delete.state = CommitState.DEPRECATED
+     transaction = Transaction.of(
+         txn_type=TransactionType.DELETE,
+         txn_operations=[
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.DELETE,
+                 dest_metafile=stream_to_delete,
+             )
+         ],
+     )
+     catalog_properties = get_catalog_properties(**kwargs)
+     transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+
+
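And a sketch of the teardown path (same hypothetical names as above):
`delete_stream` deprecates the committed DeltaCAT-format stream on the latest
active table version and raises if no such stream exists:

    delete_stream(namespace="demo", table_name="events", **catalog_kwargs)
    assert not stream_exists(namespace="demo", table_name="events", **catalog_kwargs)
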
+ def delete_table(
+     namespace: str,
+     name: str,
+     purge: bool = False,
+     *args,
+     **kwargs,
+ ) -> None:
+     """
+     Drops the given table and all of its contents (table versions, streams,
+     partitions, and deltas). If purge is True, also removes all data files
+     associated with the table. Raises an error if the given table does not
+     exist.
+
+     TODO: Honor purge once garbage collection is implemented.
+     """
+     table: Optional[Table] = get_table(
+         *args,
+         namespace=namespace,
+         table_name=name,
+         **kwargs,
+     )
+     if not table:
+         raise TableNotFoundError(f"Table `{namespace}.{name}` does not exist.")
+     transaction = Transaction.of(
+         txn_type=TransactionType.DELETE,
+         txn_operations=TransactionOperationList.of(
+             [
+                 TransactionOperation.of(
+                     operation_type=TransactionOperationType.DELETE,
+                     dest_metafile=table,
+                 )
+             ]
+         ),
+     )
+     catalog_properties = get_catalog_properties(**kwargs)
+     transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+
+
+ def delete_namespace(
+     namespace: str,
+     purge: bool = False,
+     *args,
+     **kwargs,
+ ) -> None:
+     """
+     Drops the given table namespace and all of its contents. Raises an error
+     if the given namespace does not exist.
+     """
+     # bind the lookup result to a new name so the error message below can
+     # still report the requested namespace string
+     namespace_meta: Optional[Namespace] = get_namespace(
+         *args,
+         namespace=namespace,
+         **kwargs,
+     )
+     if not namespace_meta:
+         raise ValueError(f"Namespace `{namespace}` does not exist.")
+     transaction = Transaction.of(
+         txn_type=TransactionType.DELETE,
+         txn_operations=[
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.DELETE,
+                 dest_metafile=namespace_meta,
+             )
+         ],
+     )
+     catalog_properties = get_catalog_properties(**kwargs)
+     transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+
+
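A sketch of the two cascading deletes above (hypothetical names; note that
`purge` is accepted but not yet honored, per the TODO on `delete_table`):

    # dropping the table removes its versions, streams, partitions, and deltas
    delete_table(namespace="demo", name="events", **catalog_kwargs)
    # dropping the namespace then removes the (now empty) namespace itself
    delete_namespace(namespace="demo", **catalog_kwargs)
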
+ def get_stream_by_id(
+     table_version_locator: TableVersionLocator,
+     stream_id: str,
+     *args,
+     **kwargs,
+ ) -> Optional[Stream]:
+     """
+     Gets the stream for the given table version locator and stream ID.
+     Returns None if the stream does not exist. Raises an error if the given
+     table version locator does not exist.
+     """
+     locator = StreamLocator.of(
+         table_version_locator=table_version_locator,
+         stream_id=stream_id,
+         stream_format=None,
+     )
+     return _latest(
+         *args,
+         metafile=Stream.of(locator=locator, partition_scheme=None),
+         **kwargs,
+     )
+
+
+ def get_stream(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     stream_format: StreamFormat = StreamFormat.DELTACAT,
+     *args,
+     **kwargs,
+ ) -> Optional[Stream]:
+     """
+     Gets the most recently committed stream for the given table version.
+     Resolves to the latest active table version if no table version is given.
+     Resolves to the DeltaCAT stream format if no stream format is given.
+     Returns None if the table version or stream format does not exist.
+     """
+     if not table_version:
+         table_version = _resolve_latest_active_table_version_id(
+             *args,
+             namespace=namespace,
+             table_name=table_name,
+             fail_if_no_active_table_version=False,
+             **kwargs,
+         )
+     locator = StreamLocator.at(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         stream_id=None,
+         stream_format=stream_format,
+     )
+     return _latest(
+         *args,
+         metafile=Stream.of(
+             locator=locator,
+             partition_scheme=None,
+             state=CommitState.COMMITTED,
+         ),
+         **kwargs,
+     )
+
+
+ def stream_exists(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     stream_format: StreamFormat = StreamFormat.DELTACAT,
+     *args,
+     **kwargs,
+ ) -> bool:
+     """
+     Returns True if the given stream exists, False if not (including when
+     the resolved table version or stream format does not exist).
+     Resolves to the latest active table version if no table version is given.
+     Resolves to the DeltaCAT stream format if no stream format is given.
+     """
+     if not table_version:
+         table_version = _resolve_latest_active_table_version_id(
+             *args,
+             namespace=namespace,
+             table_name=table_name,
+             fail_if_no_active_table_version=False,
+             **kwargs,
+         )
+     locator = StreamLocator.at(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         stream_id=None,
+         stream_format=stream_format,
+     )
+     return _exists(
+         *args,
+         metafile=Stream.of(
+             locator=locator,
+             partition_scheme=None,
+             state=CommitState.COMMITTED,
+         ),
+         **kwargs,
+     )
+
+
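A sketch contrasting the two lookups (hypothetical names): `get_stream` returns
the committed stream metafile or None, while `stream_exists` answers a cheap
boolean when the metafile itself is not needed:

    stream = get_stream(namespace="demo", table_name="events", **catalog_kwargs)
    if stream is not None:
        print(stream.stream_id, stream.partition_scheme.id)
    if stream_exists(namespace="demo", table_name="events", **catalog_kwargs):
        pass  # a committed stream exists for the latest active table version
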
+ def stage_partition(
+     stream: Stream,
+     partition_values: Optional[PartitionValues] = None,
+     partition_scheme_id: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> Partition:
+     """
+     Stages a new partition for the given stream and partition values. Returns
+     the staged partition. If this partition will replace another partition
+     with the same partition values and scheme, then it will have its previous
+     partition ID set to the ID of the partition being replaced. Partition
+     values should not be specified for unpartitioned tables.
+
+     The partition_values must represent the results of transforms in a
+     partition spec specified in the stream.
+     """
+     # TODO(pdames): Cache last retrieved metafile revisions in memory to
+     #  resolve the potentially high cost of staging many partitions.
+     table_version = get_table_version(
+         *args,
+         namespace=stream.namespace,
+         table_name=stream.table_name,
+         table_version=stream.table_version,
+         **kwargs,
+     )
+     if not table_version:
+         raise ValueError(
+             f"Table version not found: {stream.namespace}.{stream.table_name}."
+             f"{stream.table_version}."
+         )
+     if not table_version.partition_schemes or partition_scheme_id not in [
+         ps.id for ps in table_version.partition_schemes
+     ]:
+         raise ValueError(
+             f"Invalid partition scheme ID `{partition_scheme_id}` (not found "
+             f"in parent table version `{stream.namespace}.{stream.table_name}"
+             f".{table_version.table_version}` partition scheme IDs)."
+         )
+     if stream.partition_scheme.id not in [
+         ps.id for ps in table_version.partition_schemes
+     ]:
+         # this should never happen, but just in case
+         raise ValueError(
+             f"Invalid stream partition scheme ID `{stream.partition_scheme.id}` "
+             f"(not found in parent table version `{stream.namespace}"
+             f".{stream.table_name}.{table_version.table_version}` partition "
+             f"scheme IDs)."
+         )
+     locator = PartitionLocator.of(
+         stream_locator=stream.locator,
+         partition_values=partition_values,
+         partition_id=str(uuid.uuid4()),
+     )
+     partition = Partition.of(
+         locator=locator,
+         schema=table_version.schema,
+         content_types=table_version.content_types,
+         state=CommitState.STAGED,
+         previous_stream_position=None,
+         partition_values=partition_values,
+         previous_partition_id=None,
+         stream_position=None,
+         partition_scheme_id=partition_scheme_id,
+     )
+     prev_partition = get_partition(
+         *args,
+         stream_locator=stream.locator,
+         partition_values=partition_values,
+         partition_scheme_id=partition_scheme_id,
+         **kwargs,
+     )
+     if prev_partition:
+         if prev_partition.partition_id == partition.partition_id:
+             raise ValueError(
+                 f"Partition to stage has the same ID as existing partition: "
+                 f"{prev_partition}."
+             )
+         partition.previous_partition_id = prev_partition.partition_id
+     transaction = Transaction.of(
+         txn_type=TransactionType.APPEND,
+         txn_operations=[
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.CREATE,
+                 dest_metafile=partition,
+             )
+         ],
+     )
+     catalog_properties = get_catalog_properties(**kwargs)
+     transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+     return partition
+
+
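A sketch of staging a partition under a committed stream (continuing the
hypothetical names above). For an unpartitioned table, `partition_values`
stays None; the scheme ID must be registered on the parent table version, so
reusing the stream's current scheme is assumed to be the safe default here:

    partition = stage_partition(
        stream=committed,                                   # a committed Stream
        partition_values=None,                              # unpartitioned table
        partition_scheme_id=committed.partition_scheme.id,  # assumption: reuse current scheme
        **catalog_kwargs,
    )
    assert partition.state == CommitState.STAGED
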
+ def commit_partition(
+     partition: Partition,
+     previous_partition: Optional[Partition] = None,
+     *args,
+     **kwargs,
+ ) -> Partition:
+     """
+     Commits the staged partition to its associated table version stream,
+     replacing any previous partition registered for the same stream and
+     partition values.
+
+     If a previous partition is given, then it will be replaced, with its
+     deltas prepended to the new partition being committed. Otherwise, the
+     latest committed partition with the same keys and partition scheme ID
+     will be retrieved.
+
+     Returns the registered partition. If the partition's previous delta
+     stream position is specified, then the commit will be rejected if it does
+     not match the actual previous stream position of the partition being
+     replaced. If the partition's previous partition ID is specified, then the
+     commit will be rejected if it does not match the actual ID of the
+     partition being replaced.
+     """
+     if previous_partition:
+         raise NotImplementedError(
+             f"delta prepending from previous partition {previous_partition} "
+             f"is not yet implemented"
+         )
+     if not partition.partition_id:
+         raise ValueError(
+             "Partition ID to commit must be set to a staged partition ID."
+         )
+     if not partition.stream_locator:
+         raise ValueError(
+             "Partition to commit must have its stream locator "
+             "set to the parent of its staged partition ID."
+         )
+     prev_staged_partition = get_partition_by_id(
+         *args,
+         stream_locator=partition.stream_locator,
+         partition_id=partition.partition_id,
+         **kwargs,
+     )
+     if not prev_staged_partition:
+         raise ValueError(
+             f"Partition at stream {partition.stream_locator} with ID "
+             f"{partition.partition_id} not found."
+         )
+     if prev_staged_partition.state != CommitState.STAGED:
+         raise ValueError(
+             f"Expected to find a `{CommitState.STAGED}` partition at stream "
+             f"{partition.stream_locator} with ID {partition.partition_id}, "
+             f"but found a `{prev_staged_partition.state}` partition."
+         )
+     partition: Partition = Metafile.update_for(prev_staged_partition)
+     partition.state = CommitState.COMMITTED
+     prev_committed_partition = get_partition(
+         *args,
+         stream_locator=partition.stream_locator,
+         partition_values=partition.partition_values,
+         partition_scheme_id=partition.partition_scheme_id,
+         **kwargs,
+     )
+     # the first transaction operation updates the staged partition commit state
+     txn_type = TransactionType.ALTER
+     txn_ops = [
+         TransactionOperation.of(
+             operation_type=TransactionOperationType.UPDATE,
+             dest_metafile=partition,
+             src_metafile=prev_staged_partition,
+         )
+     ]
+     if prev_committed_partition:
+         if prev_committed_partition.partition_id != partition.previous_partition_id:
+             raise ValueError(
+                 f"Previous partition ID mismatch. Expected "
+                 f"{partition.previous_partition_id} but found "
+                 f"{prev_committed_partition.partition_id}."
+             )
+         # TODO(pdames): Add previous partition stream position validation.
+         if prev_committed_partition.partition_id == partition.partition_id:
+             raise ValueError(
+                 f"Partition to commit has the same ID as existing partition: "
+                 f"{prev_committed_partition}."
+             )
+         # there's a previously committed partition, so update the transaction
+         # type to overwrite the previously committed partition, and add another
+         # transaction operation to replace it with the staged partition
+         txn_type = TransactionType.OVERWRITE
+         txn_ops.append(
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.UPDATE,
+                 dest_metafile=partition,
+                 src_metafile=prev_committed_partition,
+             )
+         )
+     transaction = Transaction.of(
+         txn_type=txn_type,
+         txn_operations=txn_ops,
+     )
+     catalog_properties = get_catalog_properties(**kwargs)
+     transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+     return partition
+
+
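Committing then flips the staged partition to COMMITTED and, if a previously
committed partition with the same values and scheme exists, upgrades the
transaction to an OVERWRITE of it (continuing the sketch above):

    committed_partition = commit_partition(partition=partition, **catalog_kwargs)
    assert committed_partition.state == CommitState.COMMITTED
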
+ def delete_partition(
+     stream_locator: StreamLocator,
+     partition_values: Optional[PartitionValues] = None,
+     partition_scheme_id: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> None:
+     """
+     Deletes the given partition from the specified stream. Partition values
+     should not be specified for unpartitioned tables. Raises an error if the
+     partition does not exist.
+     """
+     partition_to_delete = get_partition(
+         *args,
+         stream_locator=stream_locator,
+         partition_values=partition_values,
+         partition_scheme_id=partition_scheme_id,
+         **kwargs,
+     )
+     if not partition_to_delete:
+         raise ValueError(
+             f"Partition with values {partition_values} and scheme "
+             f"{partition_scheme_id} not found in stream: {stream_locator}"
+         )
+     partition_to_delete.state = CommitState.DEPRECATED
+     transaction = Transaction.of(
+         txn_type=TransactionType.DELETE,
+         txn_operations=[
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.DELETE,
+                 dest_metafile=partition_to_delete,
+             )
+         ],
+     )
+     catalog_properties = get_catalog_properties(**kwargs)
+     transaction.commit(
+         catalog_root_dir=catalog_properties.root,
+         filesystem=catalog_properties.filesystem,
+     )
+
+
+ def get_partition_by_id(
+     stream_locator: StreamLocator,
+     partition_id: str,
+     *args,
+     **kwargs,
+ ) -> Optional[Partition]:
+     """
+     Gets the partition for the given stream locator and partition ID.
+     Returns None if the partition does not exist. Raises an error if the
+     given stream locator does not exist.
+     """
+     locator = PartitionLocator.of(
+         stream_locator=stream_locator,
+         partition_values=None,
+         partition_id=partition_id,
+     )
+     return _latest(
+         *args,
+         metafile=Partition.of(
+             locator=locator,
+             schema=None,
+             content_types=None,
+         ),
+         **kwargs,
+     )
+
+
+ def get_partition(
+     stream_locator: StreamLocator,
+     partition_values: Optional[PartitionValues] = None,
+     partition_scheme_id: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> Optional[Partition]:
+     """
+     Gets the most recently committed partition for the given stream locator
+     and partition key values. Returns None if no partition has been committed
+     for the given table version and/or partition key values. Partition values
+     should not be specified for unpartitioned tables. Partition scheme ID
+     resolves to the table version's current partition scheme by default.
+     Raises an error if the given stream locator does not exist.
+     """
+     locator = PartitionLocator.of(
+         stream_locator=stream_locator,
+         partition_values=partition_values,
+         partition_id=None,
+     )
+     if not partition_scheme_id:
+         # resolve the latest partition scheme from the current
+         # revision of its `deltacat` stream
+         stream = get_stream(
+             *args,
+             namespace=stream_locator.namespace,
+             table_name=stream_locator.table_name,
+             table_version=stream_locator.table_version,
+             **kwargs,
+         )
+         if not stream:
+             raise ValueError(f"Stream {stream_locator} not found.")
+         partition_scheme_id = stream.partition_scheme.id
+     return _latest(
+         *args,
+         metafile=Partition.of(
+             locator=locator,
+             schema=None,
+             content_types=None,
+             state=CommitState.COMMITTED,
+             partition_scheme_id=partition_scheme_id,
+         ),
+         **kwargs,
+     )
+
+
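A lookup sketch (hypothetical names): `partition_scheme_id` defaults to the
stream's current scheme, so the stream locator and key values are usually all
that is needed, and the ID-based variant can then re-fetch the same partition:

    current = get_partition(
        stream_locator=committed.locator,
        partition_values=None,  # unpartitioned table
        **catalog_kwargs,
    )
    if current:
        same = get_partition_by_id(
            stream_locator=committed.locator,
            partition_id=current.partition_id,
            **catalog_kwargs,
        )
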
+ def stage_delta(
+     data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
+     partition: Partition,
+     delta_type: DeltaType = DeltaType.UPSERT,
+     max_records_per_entry: Optional[int] = None,
+     author: Optional[ManifestAuthor] = None,
+     properties: Optional[DeltaProperties] = None,
+     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+     content_type: ContentType = ContentType.PARQUET,
+     entry_params: Optional[EntryParams] = None,
+     *args,
+     **kwargs,
+ ) -> Delta:
+     """
+     Writes the given table to 1 or more S3 files. Returns an unregistered
+     delta whose manifest entries point to the uploaded files. Applies any
+     schema consistency policies configured for the parent table version.
+
+     The partition spec will be used to split the input table into multiple
+     files. Optionally, partition_values can be provided to avoid recomputing
+     them from the provided data.
+
+     Raises an error if the provided data does not conform to a unique
+     ordered list of partition_values.
+     """
+     raise NotImplementedError("stage_delta not implemented")
+
+
+ def commit_delta(delta: Delta, *args, **kwargs) -> Delta:
+     """
+     Registers a new delta with its associated target table version and
+     partition. Returns the registered delta. If the delta's previous stream
+     position is specified, then the commit will be rejected if it does not
+     match the target partition's actual previous stream position. If the
+     delta's stream position is specified, it must be greater than the latest
+     stream position in the target partition.
+     """
+     raise NotImplementedError("commit_delta not implemented")
+
+
+ def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
+     """
+     Gets table namespace metadata for the specified table namespace. Returns
+     None if the given namespace does not exist.
+     """
+     return _latest(
+         *args,
+         metafile=Namespace.of(NamespaceLocator.of(namespace)),
+         **kwargs,
+     )
+
+
+ def namespace_exists(namespace: str, *args, **kwargs) -> bool:
+     """
+     Returns True if the given table namespace exists, False if not.
+     """
+     return _exists(
+         *args,
+         metafile=Namespace.of(NamespaceLocator.of(namespace)),
+         **kwargs,
+     )
+
+
+ def get_table(namespace: str, table_name: str, *args, **kwargs) -> Optional[Table]:
+     """
+     Gets table metadata for the specified table. Returns None if the given
+     table does not exist.
+     """
+     locator = TableLocator.at(namespace=namespace, table_name=table_name)
+     return _latest(
+         *args,
+         metafile=Table.of(locator=locator),
+         **kwargs,
+     )
+
+
+ def table_exists(namespace: str, table_name: str, *args, **kwargs) -> bool:
+     """
+     Returns True if the given table exists, False if not.
+     """
+     locator = TableLocator.at(namespace=namespace, table_name=table_name)
+     return _exists(
+         *args,
+         metafile=Table.of(locator=locator),
+         **kwargs,
+     )
+
+
+ def get_table_version(
+     namespace: str,
+     table_name: str,
+     table_version: str,
+     *args,
+     **kwargs,
+ ) -> Optional[TableVersion]:
+     """
+     Gets table version metadata for the specified table version. Returns None
+     if the given table version does not exist.
+     """
+     locator = TableVersionLocator.at(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+     )
+     table_version = TableVersion.of(
+         locator=locator,
+         schema=None,
+     )
+     return _latest(
+         *args,
+         metafile=table_version,
+         **kwargs,
+     )
+
+
+ def get_latest_table_version(
+     namespace: str, table_name: str, *args, **kwargs
+ ) -> Optional[TableVersion]:
+     """
+     Gets table version metadata for the latest version of the specified
+     table. Returns None if no table version exists for the given table.
+     Raises an error if the given table doesn't exist.
+     """
+     table_version_id = _resolve_latest_table_version_id(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         fail_if_no_active_table_version=False,
+         **kwargs,
+     )
+     return (
+         get_table_version(
+             *args,
+             namespace=namespace,
+             table_name=table_name,
+             table_version=table_version_id,
+             **kwargs,
+         )
+         if table_version_id
+         else None
+     )
+
+
+ def get_latest_active_table_version(
+     namespace: str, table_name: str, *args, **kwargs
+ ) -> Optional[TableVersion]:
+     """
+     Gets table version metadata for the latest active version of the
+     specified table. Returns None if no active table version exists for the
+     given table. Raises an error if the given table doesn't exist.
+     """
+     table_version_id = _resolve_latest_active_table_version_id(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         fail_if_no_active_table_version=False,
+         **kwargs,
+     )
+     return (
+         get_table_version(
+             *args,
+             namespace=namespace,
+             table_name=table_name,
+             table_version=table_version_id,
+             **kwargs,
+         )
+         if table_version_id
+         else None
+     )
+
+
+ def get_table_version_column_names(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> Optional[List[str]]:
+     """
+     Gets a list of column names for the specified table version, or for the
+     latest active table version if none is specified. The index of each
+     column name returned represents its ordinal position in a delimited text
+     file or other row-oriented content type files appended to the table.
+     Returns None for schemaless tables. Raises an error if the table version
+     does not exist.
+     """
+     schema = get_table_version_schema(
+         *args,
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         **kwargs,
+     )
+     return schema.arrow.names if schema else None
+
+
+ def get_table_version_schema(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> Optional[Schema]:
+     """
+     Gets the schema for the specified table version, or for the latest active
+     table version if none is specified. Returns None if the table version is
+     schemaless. Raises an error if the table version does not exist.
+     """
+     table_version = (
+         get_table_version(
+             *args,
+             namespace=namespace,
+             table_name=table_name,
+             table_version=table_version,
+             **kwargs,
+         )
+         if table_version
+         else get_latest_active_table_version(
+             *args,
+             namespace=namespace,
+             table_name=table_name,
+             **kwargs,
+         )
+     )
+     if not table_version:
+         raise ValueError(f"Table version not found for: {namespace}.{table_name}.")
+     return table_version.schema
+
+
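Tying the table-version reads together (hypothetical names): resolve the
latest active version once, then reuse its ID for schema and column-name
lookups:

    tv = get_latest_active_table_version(
        namespace="demo", table_name="events", **catalog_kwargs
    )
    if tv:
        schema = get_table_version_schema(
            namespace="demo",
            table_name="events",
            table_version=tv.table_version,
            **catalog_kwargs,
        )
        # ordinal column names for row-oriented content types (None if schemaless)
        names = get_table_version_column_names(
            namespace="demo",
            table_name="events",
            table_version=tv.table_version,
            **catalog_kwargs,
        )
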
+ def table_version_exists(
+     namespace: str,
+     table_name: str,
+     table_version: str,
+     *args,
+     **kwargs,
+ ) -> bool:
+     """
+     Returns True if the given table version exists, False if not.
+     """
+     locator = TableVersionLocator.at(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+     )
+     table_version = TableVersion.of(
+         locator=locator,
+         schema=None,
+     )
+     return _exists(
+         *args,
+         metafile=table_version,
+         **kwargs,
+     )
+
+
+ def can_categorize(e: BaseException, *args, **kwargs) -> bool:
+     """
+     Returns True if the input error is from the storage implementation
+     layer, False if not.
+     """
+     raise NotImplementedError
+
+
+ def raise_categorized_error(e: BaseException, *args, **kwargs):
+     """
+     Raises and handles errors specific to the storage implementation layer.
+     """
+     raise NotImplementedError