deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (236) hide show
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/tests/_io/__init__.py +1 -0
  150. deltacat/tests/catalog/test_catalogs.py +324 -0
  151. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  152. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  153. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  154. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  155. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  156. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  157. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  158. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  159. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  160. deltacat/tests/compute/conftest.py +75 -0
  161. deltacat/tests/compute/converter/__init__.py +0 -0
  162. deltacat/tests/compute/converter/conftest.py +80 -0
  163. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  164. deltacat/tests/compute/converter/utils.py +123 -0
  165. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  166. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  167. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  168. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  169. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  170. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  171. deltacat/tests/compute/test_util_common.py +19 -12
  172. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  173. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  174. deltacat/tests/storage/__init__.py +0 -0
  175. deltacat/tests/storage/conftest.py +25 -0
  176. deltacat/tests/storage/main/__init__.py +0 -0
  177. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  178. deltacat/tests/storage/model/__init__.py +0 -0
  179. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  180. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  181. deltacat/tests/storage/model/test_schema.py +308 -0
  182. deltacat/tests/storage/model/test_shard.py +22 -0
  183. deltacat/tests/storage/model/test_table_version.py +110 -0
  184. deltacat/tests/storage/model/test_transaction.py +308 -0
  185. deltacat/tests/storage/rivulet/__init__.py +0 -0
  186. deltacat/tests/storage/rivulet/conftest.py +149 -0
  187. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  189. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  191. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  192. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  193. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  194. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  195. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  197. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  198. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  199. deltacat/tests/test_deltacat_api.py +39 -0
  200. deltacat/tests/test_utils/filesystem.py +14 -0
  201. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  202. deltacat/tests/test_utils/pyarrow.py +8 -15
  203. deltacat/tests/test_utils/storage.py +266 -3
  204. deltacat/tests/utils/test_daft.py +3 -3
  205. deltacat/tests/utils/test_pyarrow.py +0 -432
  206. deltacat/types/partial_download.py +1 -1
  207. deltacat/types/tables.py +1 -1
  208. deltacat/utils/export.py +59 -0
  209. deltacat/utils/filesystem.py +320 -0
  210. deltacat/utils/metafile_locator.py +73 -0
  211. deltacat/utils/pyarrow.py +36 -183
  212. deltacat-2.0.dist-info/METADATA +65 -0
  213. deltacat-2.0.dist-info/RECORD +347 -0
  214. deltacat/aws/redshift/__init__.py +0 -19
  215. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  216. deltacat/io/dataset.py +0 -73
  217. deltacat/io/read_api.py +0 -143
  218. deltacat/storage/model/delete_parameters.py +0 -40
  219. deltacat/storage/model/partition_spec.py +0 -71
  220. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  221. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  222. deltacat-1.1.36.dist-info/METADATA +0 -64
  223. deltacat-1.1.36.dist-info/RECORD +0 -219
  224. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  225. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  226. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  227. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  228. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  229. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  234. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  235. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,720 @@
1
+ from typing import Any, Dict, List, Optional, Union, Tuple
2
+ import logging
3
+
4
+ from deltacat.catalog import CatalogProperties
5
+ from deltacat.exceptions import (
6
+ NamespaceAlreadyExistsError,
7
+ StreamNotFoundError,
8
+ TableAlreadyExistsError,
9
+ TableVersionNotFoundError,
10
+ )
11
+ from deltacat.catalog.model.table_definition import TableDefinition
12
+ from deltacat.storage.model.sort_key import SortScheme
13
+ from deltacat.storage.model.list_result import ListResult
14
+ from deltacat.storage.model.namespace import Namespace, NamespaceProperties
15
+ from deltacat.storage.model.schema import Schema
16
+ from deltacat.storage.model.table import TableProperties, Table
17
+ from deltacat.storage.model.types import (
18
+ DistributedDataset,
19
+ LifecycleState,
20
+ LocalDataset,
21
+ LocalTable,
22
+ StreamFormat,
23
+ )
24
+ from deltacat.storage.model.partition import (
25
+ Partition,
26
+ PartitionLocator,
27
+ PartitionScheme,
28
+ )
29
+ from deltacat.storage.model.table_version import TableVersion
30
+ from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams
31
+ from deltacat.storage.model.delta import DeltaType
32
+ from deltacat.types.media import ContentType, TableType, DistributedDatasetType
33
+ from deltacat.types.tables import TableWriteMode
34
+ from deltacat.compute.merge_on_read import MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE
35
+ from deltacat import logs
36
+ from deltacat.constants import DEFAULT_NAMESPACE
37
+ from deltacat.storage import metastore as storage_impl
38
+
39
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
40
+
41
+ """
42
+ This is the default implementation for the Catalog interface, using DeltaCAT native storage
43
+
44
+ Note that, when this catalog implementation gets called through the normal pattern of `delegate.py`, all functions
45
+ will be called with the kwarg "inner" equal to the `CatalogProperties` this was initialized with.
46
+
47
+ `CatalogProperties` has all state required to implement catalog functions, such as metastore root URI
48
+ """
49
+
50
+
51
+ # catalog functions
52
def initialize(
    config: Optional[CatalogProperties] = None, *args, **kwargs
) -> CatalogProperties:
    """Initialize the data catalog and return its "inner" state value.

    Args:
        config: Pre-built catalog properties. When given, it is returned
            unchanged and ``*args`` / ``**kwargs`` are not used.
        *args: Positional arguments forwarded to ``CatalogProperties`` when
            ``config`` is None.
        **kwargs: Keyword arguments forwarded to ``CatalogProperties`` when
            ``config`` is None.

    Returns:
        The ``CatalogProperties`` used as the "inner" state value for a
        DeltaCAT native catalog.
    """
    if config is not None:
        return config
    # No explicit config: build the properties from the remaining arguments.
    return CatalogProperties(*args, **kwargs)
62
+
63
+
64
+ # table functions
65
def write_to_table(
    data: Union[LocalTable, LocalDataset, DistributedDataset],  # type: ignore
    table: str,
    *args,
    namespace: Optional[str] = None,
    mode: TableWriteMode = TableWriteMode.AUTO,
    content_type: ContentType = ContentType.PARQUET,
    **kwargs,
) -> None:
    """Write local or distributed data to a table (not yet implemented).

    Intended contract: an error is raised if the table does not exist and
    the write mode is neither CREATE nor AUTO. When a table is created, any
    `create_table` parameter may be supplied as an extra keyword argument;
    when appending to or replacing an existing table, any `alter_table`
    parameter may be supplied the same way.

    Args:
        data: Local table, local dataset, or distributed dataset to write.
        table: Name of the destination table.
        namespace: Optional namespace of the destination table.
        mode: Table write mode. Defaults to AUTO.
        content_type: Content type to write. Defaults to PARQUET.

    Raises:
        NotImplementedError: Always; this catalog does not implement writes.
    """
    raise NotImplementedError("write_to_table not implemented")
82
+
83
+
84
def read_table(
    table: str,
    *args,
    namespace: Optional[str] = None,
    table_version: Optional[str] = None,
    table_type: Optional[TableType] = TableType.PYARROW,
    distributed_dataset_type: Optional[
        DistributedDatasetType
    ] = DistributedDatasetType.RAY_DATASET,
    partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
    stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
    merge_on_read: Optional[bool] = False,
    reader_kwargs: Optional[Dict[Any, Any]] = None,
    **kwargs,
) -> DistributedDataset:  # type: ignore
    """Read a table into a distributed dataset.

    Steps: validate the arguments, resolve the latest (or given) table
    version, collect the qualifying deltas for the selected partitions, and
    hand them to the merge function registered for
    ``distributed_dataset_type``.

    Args:
        table: Name of the table to read.
        namespace: Namespace of the table; passed through to validation and
            storage calls as given.
        table_version: Table version to read. Resolved through
            ``_get_latest_or_given_table_version``.
        table_type: Table type; checked by ``_validate_read_table_args``.
        distributed_dataset_type: Selects the merge function via its
            ``value``.
        partition_filter: Partitions (or locators) to read. When None, all
            partitions listed by storage for the table version are read.
        stream_position_range_inclusive: Optional stream position range used
            when selecting deltas.
        merge_on_read: Checked by ``_validate_read_table_args``.
        reader_kwargs: Extra reader arguments; defaults to an empty dict.

    Returns:
        The dataset produced by the selected merge function.

    Raises:
        ValueError: If the table version does not declare exactly one
            content type.
    """
    reader_kwargs = {} if reader_kwargs is None else reader_kwargs

    _validate_read_table_args(
        namespace=namespace,
        table_type=table_type,
        distributed_dataset_type=distributed_dataset_type,
        merge_on_read=merge_on_read,
        **kwargs,
    )

    resolved_version = _get_latest_or_given_table_version(
        namespace=namespace,
        table_name=table,
        table_version=table_version,
        **kwargs,
    )
    table_version = resolved_version.table_version

    # Exactly one content type is required to read the table.
    has_single_content_type = (
        resolved_version.content_types is not None
        and len(resolved_version.content_types) == 1
    )
    if not has_single_content_type:
        raise ValueError(
            "Expected exactly one content type but "
            f"found {resolved_version.content_types}."
        )

    logger.info(
        f"Reading metadata for table={namespace}/{table}/{table_version} "
        f"with partition_filters={partition_filter} and stream position"
        f" range={stream_position_range_inclusive}"
    )

    if partition_filter is None:
        logger.info(
            f"Reading all partitions metadata in the table={table} "
            "as partition_filter was None."
        )
        partition_listing = _get_storage(**kwargs).list_partitions(
            table_name=table,
            namespace=namespace,
            table_version=table_version,
            **kwargs,
        )
        partition_filter = partition_listing.all_items()

    qualified_deltas = _get_deltas_from_partition_filter(
        stream_position_range_inclusive=stream_position_range_inclusive,
        partition_filter=partition_filter,
        **kwargs,
    )

    logger.info(
        f"Total qualified deltas={len(qualified_deltas)} "
        f"from {len(partition_filter)} partitions."
    )

    raw_params = {
        "deltas": qualified_deltas,
        "deltacat_storage": _get_storage(**kwargs),
        "deltacat_storage_kwargs": {**kwargs},
        "reader_kwargs": reader_kwargs,
    }
    merge_on_read_params = MergeOnReadParams.of(raw_params)

    merge_func = MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE[distributed_dataset_type.value]
    return merge_func(params=merge_on_read_params, **kwargs)
174
+
175
+
176
def alter_table(
    table: str,
    *args,
    namespace: Optional[str] = None,
    lifecycle_state: Optional[LifecycleState] = None,
    schema_updates: Optional[Dict[str, Any]] = None,
    partition_updates: Optional[Dict[str, Any]] = None,
    sort_keys: Optional[SortScheme] = None,
    description: Optional[str] = None,
    properties: Optional[TableProperties] = None,
    **kwargs,
) -> None:
    """Alter a DeltaCAT table and its latest table version.

    Table-level fields (description, properties, lifecycle state) are applied
    through ``update_table``; version-level fields (description, schema
    updates, partition updates, sort keys) are then applied to the latest
    table version through ``update_table_version``.

    Args:
        table: Name of the table to alter.
        namespace: Optional namespace of the table. Uses default namespace if not specified.
        lifecycle_state: New lifecycle state for the table.
        schema_updates: Map of schema updates to apply.
        partition_updates: Map of partition scheme updates to apply.
        sort_keys: New sort keys scheme.
        description: New description, applied to both the table and its
            latest table version.
        properties: New table properties.

    Returns:
        None

    Raises:
        TableVersionNotFoundError: If storage reports no latest table version
            for the table.
    """
    namespace = namespace or default_namespace()

    _get_storage(**kwargs).update_table(
        *args,
        namespace=namespace,
        table_name=table,
        description=description,
        properties=properties,
        lifecycle_state=lifecycle_state,
        **kwargs,
    )

    latest_version = _get_storage(**kwargs).get_latest_table_version(
        namespace, table, **kwargs
    )
    # Fail with a descriptive error instead of an AttributeError on `.id`
    # when there is no table version to update.
    if latest_version is None:
        raise TableVersionNotFoundError(
            f"No table version found for table {namespace}.{table}."
        )

    _get_storage(**kwargs).update_table_version(
        *args,
        namespace=namespace,
        table_name=table,
        table_version=latest_version.id,
        description=description,
        schema_updates=schema_updates,
        partition_updates=partition_updates,
        sort_keys=sort_keys,
        **kwargs,
    )
235
+
236
+
237
def create_table(
    name: str,
    *args,
    namespace: Optional[str] = None,
    version: Optional[str] = None,
    lifecycle_state: Optional[LifecycleState] = LifecycleState.ACTIVE,
    schema: Optional[Schema] = None,
    partition_scheme: Optional[PartitionScheme] = None,
    sort_keys: Optional[SortScheme] = None,
    description: Optional[str] = None,
    table_properties: Optional[TableProperties] = None,
    namespace_properties: Optional[NamespaceProperties] = None,
    content_types: Optional[List[ContentType]] = None,
    fail_if_exists: bool = True,
    **kwargs,
) -> TableDefinition:
    """Create an empty table in the catalog.

    If a namespace isn't provided, the table is created within the default
    deltacat namespace. If the resolved namespace does not exist, it is
    created first using ``namespace_properties``.

    Args:
        name: Name of the table to create.
        namespace: Optional namespace for the table. Uses default namespace if not specified.
        version: Optional version identifier for the table.
        lifecycle_state: Lifecycle state of the new table. Defaults to ACTIVE
            (also used when None is passed).
        schema: Schema definition for the table.
        partition_scheme: Optional partitioning scheme for the table.
        sort_keys: Optional sort keys for the table.
        description: Optional description, used for both the table and the
            table version.
        table_properties: Optional properties for the table.
        namespace_properties: Optional properties for the namespace if it needs to be created.
        content_types: Optional list of allowed content types for the table.
        fail_if_exists: If True, raises an error if table already exists. If False, returns existing table.

    Returns:
        TableDefinition object for the created or existing table.

    Raises:
        TableAlreadyExistsError: If the table already exists and fail_if_exists is True.
    """
    namespace = namespace or default_namespace()

    # Required positionals go first so extra *args cannot displace them.
    existing = get_table(
        name, *args, namespace=namespace, table_version=version, **kwargs
    )
    if existing is not None:
        if fail_if_exists:
            raise TableAlreadyExistsError(f"Table {namespace}.{name} already exists")
        return existing

    if not namespace_exists(namespace, *args, **kwargs):
        create_namespace(
            namespace, *args, properties=namespace_properties, **kwargs
        )

    (table, table_version, stream) = _get_storage(**kwargs).create_table_version(
        *args,
        namespace=namespace,
        table_name=name,
        table_version=version,
        schema=schema,
        partition_scheme=partition_scheme,
        sort_keys=sort_keys,
        table_version_description=description,
        table_description=description,
        table_properties=table_properties,
        lifecycle_state=lifecycle_state or LifecycleState.ACTIVE,
        supported_content_types=content_types,
        **kwargs,
    )

    return TableDefinition.of(
        table=table,
        table_version=table_version,
        stream=stream,
    )
314
+
315
+
316
def drop_table(
    name: str,
    *args,
    namespace: Optional[str] = None,
    table_version: Optional[str] = None,
    purge: bool = False,
    **kwargs,
) -> None:
    """Drop a table from the catalog.

    Args:
        name: Name of the table to drop.
        namespace: Optional namespace of the table. Uses default namespace if not specified.
        table_version: Accepted but currently unused (see TODO below).
        purge: Requests permanent deletion of the table data. Not supported
            yet; passing True raises.

    Returns:
        None

    Raises:
        NotImplementedError: If ``purge`` is True.

    TODO: Honor purge once garbage collection is implemented.
    TODO: Drop table version if specified, possibly create a delete_table_version api.
    """
    # Reject unsupported purge requests before touching storage.
    if purge:
        raise NotImplementedError("Purge flag is not currently supported.")

    resolved_namespace = namespace or default_namespace()
    storage = _get_storage(**kwargs)
    storage.delete_table(
        *args, namespace=resolved_namespace, name=name, purge=purge, **kwargs
    )
347
+
348
+
349
def refresh_table(
    table: str, *args, namespace: Optional[str] = None, **kwargs
) -> None:
    """Refresh cached metadata for the given table (not yet implemented).

    Args:
        table: Name of the table to refresh.
        namespace: Optional namespace of the table.

    Raises:
        NotImplementedError: Always; this catalog does not implement refresh.
    """
    raise NotImplementedError("refresh_table not implemented")
360
+
361
+
362
def list_tables(
    *args, namespace: Optional[str] = None, **kwargs
) -> ListResult[TableDefinition]:
    """List table definitions for every table in a namespace.

    Args:
        namespace: Optional namespace to list tables from. Uses default namespace if not specified.

    Returns:
        ListResult containing TableDefinition objects for tables in the namespace.
    """
    namespace = namespace or default_namespace()
    tables = _get_storage(**kwargs).list_tables(*args, namespace=namespace, **kwargs)
    # `namespace` is keyword-only on get_table; passing it positionally would
    # push it into *args and resolve every table in the default namespace.
    table_definitions = [
        get_table(table.table_name, *args, namespace=namespace, **kwargs)
        for table in tables.all_items()
    ]

    return ListResult(items=table_definitions)
381
+
382
+
383
def get_table(
    name: str,
    *args,
    namespace: Optional[str] = None,
    table_version: Optional[str] = None,
    stream_format: StreamFormat = StreamFormat.DELTACAT,
    **kwargs,
) -> Optional[TableDefinition]:
    """Get table definition metadata.

    Args:
        name: Name of the table to retrieve.
        namespace: Optional namespace of the table. Uses default namespace if not specified.
        table_version: Optional specific version of the table to retrieve.
            If not specified, the table's latest version is used.
        stream_format: Stream format to retrieve. Defaults to the DeltaCAT
            stream format.

    Returns:
        Deltacat TableDefinition if the table exists, None otherwise.

    Raises:
        TableVersionNotFoundError: If the table version does not exist.
        StreamNotFoundError: If the stream does not exist.
    """
    namespace = namespace or default_namespace()
    table: Optional[Table] = _get_storage(**kwargs).get_table(
        *args, table_name=name, namespace=namespace, **kwargs
    )

    if table is None:
        return None

    # Keep the requested version id separate from the lookup result so the
    # not-found message can still name it.
    requested_version = table_version or table.latest_table_version
    table_version_obj: Optional[TableVersion] = _get_storage(
        **kwargs
    ).get_table_version(namespace, name, requested_version, *args, **kwargs)

    if table_version_obj is None:
        raise TableVersionNotFoundError(
            f"TableVersion {namespace}.{name}.{requested_version} does not exist."
        )

    stream = _get_storage(**kwargs).get_stream(
        *args,
        namespace=namespace,
        table_name=name,
        table_version=table_version_obj.id,
        stream_format=stream_format,
        **kwargs,
    )

    if stream is None:
        # Name the stream by its identifiers, not by the metadata objects.
        raise StreamNotFoundError(
            f"Stream {namespace}.{name}.{table_version_obj.id}.{stream_format} "
            "does not exist."
        )

    return TableDefinition.of(
        table=table,
        table_version=table_version_obj,
        stream=stream,
    )
444
+
445
+
446
def truncate_table(
    table: str,
    *args,
    namespace: Optional[str] = None,
    **kwargs,
) -> None:
    """Truncate table data (not yet implemented).

    Args:
        table: Name of the table to truncate.
        namespace: Optional namespace of the table.

    Raises:
        NotImplementedError: Always; this catalog does not implement truncation.
    """
    raise NotImplementedError("truncate_table not implemented")
459
+
460
+
461
def rename_table(
    table: str,
    new_name: str,
    *args,
    namespace: Optional[str] = None,
    **kwargs,
) -> None:
    """Rename an existing table.

    The rename is delegated to the storage layer's ``update_table`` with
    ``new_table_name`` set.

    Args:
        table: Current name of the table.
        new_name: New name for the table.
        namespace: Optional namespace of the table. Uses default namespace if not specified.

    Returns:
        None
    """
    resolved_namespace = namespace or default_namespace()
    storage = _get_storage(**kwargs)
    storage.update_table(
        *args,
        table_name=table,
        new_table_name=new_name,
        namespace=resolved_namespace,
        **kwargs,
    )
481
+
482
+
483
def table_exists(
    table: str, *args, namespace: Optional[str] = None, **kwargs
) -> bool:
    """Report whether a table exists in the catalog.

    Args:
        table: Name of the table to check.
        namespace: Optional namespace of the table. Uses default namespace if not specified.

    Returns:
        The storage layer's ``table_exists`` result for the table.
    """
    resolved_namespace = namespace or default_namespace()
    storage = _get_storage(**kwargs)
    return storage.table_exists(
        *args, table_name=table, namespace=resolved_namespace, **kwargs
    )
497
+
498
+
499
def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
    """List table namespaces.

    All positional and keyword arguments are forwarded to the storage
    layer's ``list_namespaces``.

    Returns:
        ListResult containing Namespace objects.
    """
    storage = _get_storage(**kwargs)
    return storage.list_namespaces(*args, **kwargs)
509
+
510
+
511
def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
    """Fetch metadata for a single table namespace.

    Args:
        namespace: Name of the namespace to retrieve.

    Returns:
        The storage layer's ``get_namespace`` result: a Namespace object, or
        None when no such namespace exists.
    """
    storage = _get_storage(**kwargs)
    return storage.get_namespace(*args, namespace=namespace, **kwargs)
521
+
522
+
523
def namespace_exists(namespace: str, *args, **kwargs) -> bool:
    """Report whether a namespace is present.

    The check is delegated to the ``namespace_exists`` call of the storage
    implementation selected by ``_get_storage(**kwargs)``.

    Args:
        namespace: Name of the namespace to look for.

    Returns:
        True if the namespace exists, False otherwise.
    """
    storage = _get_storage(**kwargs)
    present = storage.namespace_exists(*args, namespace=namespace, **kwargs)
    return present
533
+
534
+
535
def create_namespace(
    namespace: str, *args, properties: Optional[NamespaceProperties] = None, **kwargs
) -> Namespace:
    """Create a namespace that does not exist yet.

    Existence is checked first via ``namespace_exists``; only the keyword
    arguments (not ``*args``) are forwarded to that check. Creation itself
    is delegated to the storage implementation selected by
    ``_get_storage(**kwargs)``.

    Args:
        namespace: Name of the namespace to create.
        properties: Optional properties to attach to the namespace.

    Returns:
        The Namespace object returned by the storage layer.

    Raises:
        NamespaceAlreadyExistsError: If the namespace already exists.
    """
    already_present = namespace_exists(namespace, **kwargs)
    if already_present:
        raise NamespaceAlreadyExistsError(f"Namespace {namespace} already exists")

    storage = _get_storage(**kwargs)
    created = storage.create_namespace(
        *args,
        namespace=namespace,
        properties=properties,
        **kwargs,
    )
    return created
556
+
557
+
558
def alter_namespace(
    namespace: str,
    *args,
    properties: Optional[NamespaceProperties] = None,
    new_namespace: Optional[str] = None,
    **kwargs,
) -> None:
    """Change the definition of an existing namespace.

    The change is delegated to the ``update_namespace`` call of the storage
    implementation selected by ``_get_storage(**kwargs)``.

    Args:
        namespace: Name of the namespace to alter.
        properties: Optional replacement properties for the namespace.
        new_namespace: Optional new name for the namespace.

    Returns:
        None
    """
    storage = _get_storage(**kwargs)
    # Positional pass-through arguments are listed first, matching the other
    # delegates in this module; binding is the same as listing them last.
    storage.update_namespace(
        *args,
        namespace=namespace,
        properties=properties,
        new_namespace=new_namespace,
        **kwargs,
    )
582
+
583
+
584
def drop_namespace(namespace: str, *args, purge: bool = False, **kwargs) -> None:
    """Drop a namespace from the catalog.

    The removal is delegated to the ``delete_namespace`` call of the storage
    implementation selected by ``_get_storage(**kwargs)``.

    Args:
        namespace: Name of the namespace to drop.
        purge: Intended to request permanent deletion of the namespace's
            tables. Not supported yet: any truthy value is rejected before
            the storage layer is contacted.

    Returns:
        None

    Raises:
        NotImplementedError: If ``purge`` is truthy.

    TODO: Honor purge once garbage collection is implemented.
    """
    if purge:
        raise NotImplementedError("Purge flag is not currently supported.")

    storage = _get_storage(**kwargs)
    storage.delete_namespace(
        *args,
        namespace=namespace,
        purge=purge,
        **kwargs,
    )
603
+
604
+
605
def default_namespace(*args, **kwargs) -> str:
    """Return the namespace used when a caller does not name one.

    Any positional or keyword arguments are accepted and ignored.

    Returns:
        The module-level ``DEFAULT_NAMESPACE`` value.
    """
    return DEFAULT_NAMESPACE
612
+
613
+
614
def _validate_read_table_args(
    namespace: Optional[str] = None,
    table_type: Optional[TableType] = None,
    distributed_dataset_type: Optional[DistributedDatasetType] = None,
    merge_on_read: Optional[bool] = None,
    **kwargs,
):
    """Reject read-table argument combinations that are not supported.

    The checks run in a fixed order and only the first failing one is
    reported: storage availability, merge-on-read, table type, distributed
    dataset type, and finally presence of a namespace.

    Args:
        namespace: Namespace identifying the table; must not be None.
        table_type: Must be ``TableType.PYARROW``.
        distributed_dataset_type: Must be ``DistributedDatasetType.DAFT``.
        merge_on_read: Must be falsy.
        **kwargs: Forwarded to ``_get_storage`` to resolve the storage
            implementation.

    Raises:
        ValueError: If any of the checks above fails.
    """
    problem = None
    if _get_storage(**kwargs) is None:
        problem = (
            "Catalog not initialized. Did you miss calling "
            "initialize(ds=<deltacat_storage>)?"
        )
    elif merge_on_read:
        problem = "Merge on read not supported currently."
    elif table_type is not TableType.PYARROW:
        problem = "Only PYARROW table type is supported as of now"
    elif distributed_dataset_type is not DistributedDatasetType.DAFT:
        problem = "Only DAFT dataset type is supported as of now"
    elif namespace is None:
        problem = "namespace must be passed to uniquely identify a table in the catalog."

    # Single raise site; the elif chain above keeps first-failure ordering.
    if problem is not None:
        raise ValueError(problem)
641
+
642
+
643
def _get_latest_or_given_table_version(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> TableVersion:
    """Resolve a table version object, defaulting to the latest one.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        table_version: Explicit version to fetch. When None, the storage
            layer is asked for the latest table version instead.
        *args: Extra positional arguments forwarded to the storage call.
        **kwargs: Extra keyword arguments forwarded to the storage call and
            used by ``_get_storage`` to pick the storage implementation.

    Returns:
        The object returned by the storage layer's ``get_table_version``
        (explicit version) or ``get_latest_table_version`` (no version).
    """
    storage = _get_storage(**kwargs)

    if table_version is not None:
        return storage.get_table_version(
            *args,
            namespace=namespace,
            table_name=table_name,
            table_version=table_version,
            **kwargs,
        )

    latest = storage.get_latest_table_version(
        *args,
        namespace=namespace,
        table_name=table_name,
        **kwargs,
    )
    # NOTE(review): the value read here is never used afterwards. The read is
    # kept because it raises AttributeError if the storage layer returned
    # None for the latest version - confirm whether callers rely on that.
    table_version = latest.table_version
    return latest
666
+
667
+
668
def _get_deltas_from_partition_filter(
    partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
    stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
    *args,
    **kwargs,
):
    """Collect the deltas of the given partitions within a stream position range.

    For every entry of ``partition_filter`` the storage layer's
    ``list_partition_deltas`` is queried (ascending order, manifests
    included) and the returned deltas are appended to one flat list.

    Args:
        partition_filter: Partitions (or partition locators) whose deltas
            should be listed. None or an empty list yields an empty result
            without contacting the storage layer.
        stream_position_range_inclusive: Optional ``(start, end)`` pair of
            stream positions, both inclusive. Either bound may be None to
            leave that side open; passing None leaves both sides open.
        *args: Extra positional arguments forwarded to
            ``list_partition_deltas``.
        **kwargs: Extra keyword arguments forwarded to
            ``list_partition_deltas`` and used by ``_get_storage`` to pick
            the storage implementation.

    Returns:
        List of deltas in the order the partitions were given and, within a
        partition, in the order the storage layer returned them.

    Raises:
        ValueError: If an in-range delta has type ``DeltaType.DELETE``.
    """
    # Unpack before the early return so a malformed range is still reported
    # even when there is nothing to list.
    start_stream_position, end_stream_position = stream_position_range_inclusive or (
        None,
        None,
    )

    # The parameter defaults to None; iterating it directly raised TypeError.
    if not partition_filter:
        return []

    # The storage implementation depends only on kwargs, so resolve it once.
    storage = _get_storage(**kwargs)

    result_deltas = []
    for partition_like in partition_filter:
        deltas = storage.list_partition_deltas(
            *args,
            partition_like=partition_like,
            ascending_order=True,
            include_manifest=True,
            start_stream_position=start_stream_position,
            last_stream_position=end_stream_position,
            **kwargs,
        ).all_items()

        for delta in deltas:
            # Re-check the bounds locally instead of relying solely on the
            # storage layer having applied them.
            if (
                start_stream_position is not None
                and delta.stream_position < start_stream_position
            ):
                continue
            if (
                end_stream_position is not None
                and delta.stream_position > end_stream_position
            ):
                continue
            if delta.type == DeltaType.DELETE:
                raise ValueError("DELETE type deltas are not supported")
            result_deltas.append(delta)

    return result_deltas
708
+
709
+
710
def _get_storage(**kwargs):
    """
    Pick the `deltacat.storage.interface` implementation this catalog should use.

    The `inner` keyword argument, when present, is expected to be the
    `CatalogProperties` stored during initialization and passed through
    `delegate.py`. Its `storage` attribute is returned when set; otherwise the
    module-level `storage_impl` is used as the fallback.
    """
    catalog_properties: Optional[CatalogProperties] = kwargs.get("inner")
    if catalog_properties is None or catalog_properties.storage is None:
        return storage_impl
    return catalog_properties.storage