deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,39 +1,79 @@
1
- from typing import Any, Dict, List, Optional, Union, Tuple
1
+ from typing import Any, Dict, List, Optional, Union, Tuple, Set
2
2
  import logging
3
+ from collections import defaultdict
3
4
 
5
+ import numpy as np
6
+ import pyarrow as pa
7
+ import pandas as pd
8
+ import daft
4
9
  import deltacat as dc
5
10
 
11
+ from deltacat.storage.model.manifest import ManifestAuthor
6
12
  from deltacat.catalog.model.properties import CatalogProperties
7
13
  from deltacat.exceptions import (
8
14
  NamespaceAlreadyExistsError,
9
- StreamNotFoundError,
10
15
  TableAlreadyExistsError,
11
16
  TableVersionNotFoundError,
17
+ TableNotFoundError,
18
+ TableVersionAlreadyExistsError,
19
+ TableValidationError,
20
+ SchemaValidationError,
12
21
  )
13
22
  from deltacat.catalog.model.table_definition import TableDefinition
14
23
  from deltacat.storage.model.sort_key import SortScheme
15
24
  from deltacat.storage.model.list_result import ListResult
16
25
  from deltacat.storage.model.namespace import Namespace, NamespaceProperties
17
- from deltacat.storage.model.schema import Schema
26
+ from deltacat.storage.model.schema import (
27
+ Schema,
28
+ SchemaUpdate,
29
+ )
18
30
  from deltacat.storage.model.table import TableProperties, Table
19
31
  from deltacat.storage.model.types import (
20
- DistributedDataset,
32
+ Dataset,
21
33
  LifecycleState,
22
- LocalDataset,
23
- LocalTable,
24
34
  StreamFormat,
35
+ SchemaConsistencyType,
25
36
  )
26
37
  from deltacat.storage.model.partition import (
27
38
  Partition,
28
39
  PartitionLocator,
29
40
  PartitionScheme,
30
41
  )
31
- from deltacat.storage.model.table_version import TableVersion
32
- from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams
33
- from deltacat.storage.model.delta import DeltaType
34
- from deltacat.types.media import ContentType, TableType, DistributedDatasetType
35
- from deltacat.types.tables import TableWriteMode
36
- from deltacat.compute.merge_on_read import MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE
42
+ from deltacat.storage.model.table_version import (
43
+ TableVersion,
44
+ TableVersionProperties,
45
+ )
46
+ from deltacat.storage.model.types import DeltaType
47
+ from deltacat.storage import Delta
48
+ from deltacat.storage.model.types import CommitState
49
+ from deltacat.storage.model.transaction import (
50
+ Transaction,
51
+ setup_transaction,
52
+ )
53
+ from deltacat.types.media import (
54
+ ContentType,
55
+ DatasetType,
56
+ StorageType,
57
+ SCHEMA_CONTENT_TYPES,
58
+ )
59
+ from deltacat.types.tables import (
60
+ SchemaEvolutionMode,
61
+ TableProperty,
62
+ TablePropertyDefaultValues,
63
+ TableReadOptimizationLevel,
64
+ TableWriteMode,
65
+ get_dataset_type,
66
+ get_table_schema,
67
+ get_table_column_names,
68
+ from_pyarrow,
69
+ concat_tables,
70
+ empty_table,
71
+ infer_table_schema,
72
+ to_pandas,
73
+ )
74
+ from deltacat.utils import pyarrow as pa_utils
75
+ from deltacat.utils.reader_compatibility_mapping import get_compatible_readers
76
+ from deltacat.utils.pyarrow import get_base_arrow_type_name
37
77
  from deltacat import logs
38
78
  from deltacat.constants import DEFAULT_NAMESPACE
39
79
 
@@ -78,13 +118,146 @@ def initialize(
78
118
 
79
119
 
80
120
  # table functions
121
+ def _validate_write_mode_and_table_existence(
122
+ table: str,
123
+ namespace: str,
124
+ mode: TableWriteMode,
125
+ **kwargs,
126
+ ) -> bool:
127
+ """Validate write mode against table existence and return whether table exists."""
128
+ table_exists_flag = table_exists(
129
+ table,
130
+ namespace=namespace,
131
+ **kwargs,
132
+ )
133
+ logger.info(f"Table to write to ({namespace}.{table}) exists: {table_exists_flag}")
134
+
135
+ if mode == TableWriteMode.CREATE and table_exists_flag:
136
+ raise ValueError(
137
+ f"Table {namespace}.{table} already exists and mode is CREATE."
138
+ )
139
+ elif (
140
+ mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
141
+ and not table_exists_flag
142
+ ):
143
+ raise TableNotFoundError(
144
+ f"Table {namespace}.{table} does not exist and mode is {mode.value.upper() if hasattr(mode, 'value') else str(mode).upper()}. Use CREATE or AUTO mode to create a new table."
145
+ )
146
+
147
+ return table_exists_flag
148
+
149
+
150
+ def _get_table_and_validate_write_mode(
151
+ table: str,
152
+ namespace: str,
153
+ table_version: Optional[str],
154
+ mode: TableWriteMode,
155
+ **kwargs,
156
+ ) -> Tuple[bool, TableDefinition]:
157
+ """Validate write mode against table and table version existence.
158
+
159
+ Returns:
160
+ Tuple of (table_exists_flag, table_definition)
161
+ """
162
+ # First validate table, table version, and stream existence
163
+ existing_table_def = get_table(
164
+ table,
165
+ namespace=namespace,
166
+ table_version=table_version,
167
+ **kwargs,
168
+ )
169
+ table_exists_flag = (
170
+ existing_table_def is not None
171
+ and existing_table_def.table_version
172
+ and existing_table_def.stream
173
+ )
174
+ logger.info(f"Table to write to ({namespace}.{table}) exists: {table_exists_flag}")
175
+
176
+ # Then validate table existence constraints
177
+ if mode == TableWriteMode.CREATE and table_exists_flag and table_version is None:
178
+ raise TableAlreadyExistsError(
179
+ f"Table {namespace}.{table} already exists and mode is CREATE."
180
+ )
181
+ elif (
182
+ mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
183
+ and existing_table_def is None
184
+ ):
185
+ raise TableNotFoundError(
186
+ f"Table {namespace}.{table} does not exist and write mode is {mode}. Use CREATE or AUTO mode to create a new table."
187
+ )
188
+
189
+ # Then validate table version existence constraints
190
+ if table_version is not None and table_exists_flag:
191
+ if mode == TableWriteMode.CREATE:
192
+ raise TableVersionAlreadyExistsError(
193
+ f"Table version {namespace}.{table}.{table_version} already exists and mode is CREATE."
194
+ )
195
+ logger.info(f"Table version ({namespace}.{table}.{table_version}) exists.")
196
+ elif (
197
+ mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
198
+ and table_version is not None
199
+ and not table_exists_flag
200
+ ):
201
+ raise TableVersionNotFoundError(
202
+ f"Table version {namespace}.{table}.{table_version} does not exist and write mode is {mode}. "
203
+ f"Use CREATE or AUTO mode to create a new table version, or omit table_version "
204
+ f"to use the latest version."
205
+ )
206
+ return table_exists_flag, existing_table_def
207
+
208
+
209
+ def _validate_content_type_against_supported_content_types(
210
+ namespace: str,
211
+ table: str,
212
+ content_type: ContentType,
213
+ supported_content_types: Optional[List[ContentType]],
214
+ ) -> None:
215
+ if supported_content_types and content_type not in supported_content_types:
216
+ raise ValueError(
217
+ f"Content type proposed for write to table {namespace}.{table} ({content_type}) "
218
+ f"conflicts with the proposed list of new supported content types: {supported_content_types}"
219
+ )
220
+
221
+
222
+ def _create_table_for_write(
223
+ data: Dataset,
224
+ table: str,
225
+ namespace: str,
226
+ table_version: Optional[str],
227
+ content_type: ContentType,
228
+ existing_table_definition: Optional[TableDefinition],
229
+ *args,
230
+ **kwargs,
231
+ ) -> TableDefinition:
232
+ """Creates a new table, table version, and/or stream in preparation for a write operation."""
233
+ if "schema" not in kwargs:
234
+ kwargs["schema"] = infer_table_schema(data)
235
+
236
+ _validate_content_type_against_supported_content_types(
237
+ namespace,
238
+ table,
239
+ content_type,
240
+ kwargs.get("content_types"),
241
+ )
242
+ return create_table(
243
+ table,
244
+ namespace=namespace,
245
+ table_version=table_version,
246
+ existing_table_definition=existing_table_definition,
247
+ *args,
248
+ **kwargs,
249
+ )
250
+
251
+
81
252
  def write_to_table(
82
- data: Union[LocalTable, LocalDataset, DistributedDataset], # type: ignore
253
+ data: Dataset,
83
254
  table: str,
84
255
  *args,
85
256
  namespace: Optional[str] = None,
257
+ table_version: Optional[str] = None,
86
258
  mode: TableWriteMode = TableWriteMode.AUTO,
87
259
  content_type: ContentType = ContentType.PARQUET,
260
+ transaction: Optional[Transaction] = None,
88
261
  **kwargs,
89
262
  ) -> None:
90
263
  """Write local or distributed data to a table. Raises an error if the
@@ -93,79 +266,1137 @@ def write_to_table(
93
266
  When creating a table, all `create_table` parameters may be optionally
94
267
  specified as additional keyword arguments. When appending to, or replacing,
95
268
  an existing table, all `alter_table` parameters may be optionally specified
96
- as additional keyword arguments."""
97
- raise NotImplementedError("write_to_table not implemented")
269
+ as additional keyword arguments.
98
270
 
271
+ Args:
272
+ data: Local or distributed data to write to the table.
273
+ table: Name of the table to write to.
274
+ namespace: Optional namespace for the table. Uses default if not specified.
275
+ table_version: Optional version of the table to write to. If specified,
276
+ will create this version if it doesn't exist (in CREATE mode) or
277
+ get this version if it exists (in other modes). If not specified,
278
+ uses the latest version.
279
+ mode: Write mode (AUTO, CREATE, APPEND, REPLACE, MERGE, DELETE).
280
+ content_type: Content type used to write the data files. Defaults to PARQUET.
281
+ transaction: Optional transaction to append write operations to instead of
282
+ creating and committing a new transaction.
283
+ **kwargs: Additional keyword arguments.
284
+ """
285
+ namespace = namespace or default_namespace()
99
286
 
100
- def read_table(
287
+ # Set up transaction handling
288
+ write_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
289
+ kwargs["transaction"] = write_transaction
290
+
291
+ try:
292
+ # Validate write mode and table/table version/stream existence
293
+ (table_exists_flag, table_definition,) = _get_table_and_validate_write_mode(
294
+ table,
295
+ namespace,
296
+ table_version,
297
+ mode,
298
+ **kwargs,
299
+ )
300
+
301
+ # Get or create table, table version, and/or stream
302
+ if not table_exists_flag:
303
+ table_definition = _create_table_for_write(
304
+ data,
305
+ table,
306
+ namespace,
307
+ table_version,
308
+ content_type,
309
+ table_definition,
310
+ *args,
311
+ **kwargs,
312
+ )
313
+ else:
314
+ # call alter_table if there are any alter_table kwargs provided
315
+ if (
316
+ "lifecycle_state" in kwargs
317
+ or "schema_updates" in kwargs
318
+ or "partition_updates" in kwargs
319
+ or "sort_scheme" in kwargs
320
+ or "table_description" in kwargs
321
+ or "table_version_description" in kwargs
322
+ or "table_properties" in kwargs
323
+ or "table_version_properties" in kwargs
324
+ ):
325
+ alter_table(
326
+ table,
327
+ namespace=namespace,
328
+ table_version=table_version,
329
+ *args,
330
+ **kwargs,
331
+ )
332
+
333
+ # Get the active table version and stream
334
+ table_version_obj = _get_latest_active_or_given_table_version(
335
+ namespace=table_definition.table.namespace,
336
+ table_name=table_definition.table.table_name,
337
+ table_version=table_version or table_definition.table_version.table_version,
338
+ **kwargs,
339
+ )
340
+
341
+ # Validate schema compatibility for schemaless content types with schema tables
342
+ if (
343
+ content_type.value not in SCHEMA_CONTENT_TYPES
344
+ and table_version_obj.schema is not None
345
+ ):
346
+ schemaless_types = {
347
+ ct for ct in ContentType if ct.value not in SCHEMA_CONTENT_TYPES
348
+ }
349
+ raise TableValidationError(
350
+ f"Content type '{content_type.value}' cannot be written to a table with a schema. "
351
+ f"Table '{namespace}.{table}' has a schema, but content type '{content_type.value}' "
352
+ f"is schemaless. Schemaless content types ({', '.join(sorted([ct.value for ct in schemaless_types]))}) "
353
+ f"can only be written to schemaless tables."
354
+ )
355
+
356
+ # Handle different write modes and get stream and delta type
357
+ stream, delta_type = _handle_write_mode(
358
+ mode,
359
+ table_definition,
360
+ table_version_obj,
361
+ namespace,
362
+ table,
363
+ **kwargs,
364
+ )
365
+
366
+ if not stream:
367
+ raise ValueError(f"No default stream found for table {namespace}.{table}")
368
+
369
+ # Automatically set entry_params for DELETE/MERGE modes if not provided
370
+ _set_entry_params_if_needed(
371
+ mode,
372
+ table_version_obj,
373
+ kwargs,
374
+ )
375
+
376
+ # Validate table configuration
377
+ _validate_table_configuration(
378
+ stream,
379
+ table_version_obj,
380
+ namespace,
381
+ table,
382
+ )
383
+
384
+ # Handle partition creation/retrieval
385
+ partition, commit_staged_partition = _handle_partition_creation(
386
+ mode,
387
+ table_exists_flag,
388
+ delta_type,
389
+ stream,
390
+ **kwargs,
391
+ )
392
+
393
+ # Get table properties for schema evolution
394
+ schema_evolution_mode = table_version_obj.read_table_property(
395
+ TableProperty.SCHEMA_EVOLUTION_MODE
396
+ )
397
+ default_schema_consistency_type = table_version_obj.read_table_property(
398
+ TableProperty.DEFAULT_SCHEMA_CONSISTENCY_TYPE
399
+ )
400
+
401
+ # Convert unsupported dataset types and NumPy arrays that need schema validation
402
+ if isinstance(data, np.ndarray) and table_version_obj.schema is not None:
403
+ # NumPy arrays need conversion to Pandas for proper column naming in schema validation
404
+ converted_data = _convert_numpy_for_schema_validation(
405
+ data, table_version_obj.schema
406
+ )
407
+ else:
408
+ # Convert other unsupported dataset types (e.g., Daft) or keep NumPy as-is for schemaless tables
409
+ converted_data = _convert_data_if_needed(data)
410
+
411
+ # Capture original field set before schema coercion for partial UPSERT support
412
+ original_fields = set(get_table_column_names(converted_data))
413
+
414
+ # Validate and coerce data against schema
415
+ # This ensures proper schema evolution and type handling
416
+ (
417
+ validated_data,
418
+ schema_modified,
419
+ updated_schema,
420
+ ) = _validate_and_coerce_data_against_schema(
421
+ converted_data, # Use converted data for NumPy, original for others
422
+ table_version_obj.schema,
423
+ schema_evolution_mode=schema_evolution_mode,
424
+ default_schema_consistency_type=default_schema_consistency_type,
425
+ )
426
+
427
+ # Convert validated data to supported format for storage if needed
428
+ converted_data = _convert_data_if_needed(validated_data)
429
+
430
+ # Validate reader compatibility against supported reader types
431
+ supported_reader_types = table_version_obj.read_table_property(
432
+ TableProperty.SUPPORTED_READER_TYPES
433
+ )
434
+ _validate_reader_compatibility(
435
+ converted_data,
436
+ content_type,
437
+ supported_reader_types,
438
+ )
439
+
440
+ # Update table version if schema was modified during evolution
441
+ if schema_modified:
442
+ # Extract catalog properties and filter kwargs
443
+ catalog_kwargs = {
444
+ "catalog": kwargs.get("catalog"),
445
+ "inner": kwargs.get("inner"),
446
+ "transaction": write_transaction, # Pass transaction to update_table_version
447
+ }
448
+
449
+ _get_storage(**catalog_kwargs).update_table_version(
450
+ namespace=namespace,
451
+ table_name=table,
452
+ table_version=table_version_obj.table_version,
453
+ schema=updated_schema,
454
+ **catalog_kwargs,
455
+ )
456
+
457
+ # Stage and commit delta, handle compaction
458
+ # Remove schema from kwargs to avoid duplicate parameter conflict
459
+ filtered_kwargs = {k: v for k, v in kwargs.items() if k != "schema"}
460
+ # Use updated schema if schema evolution occurred, otherwise use original schema
461
+ _stage_commit_and_compact(
462
+ converted_data,
463
+ partition,
464
+ delta_type,
465
+ content_type,
466
+ commit_staged_partition,
467
+ table_version_obj,
468
+ namespace,
469
+ table,
470
+ schema=updated_schema if schema_modified else table_version_obj.schema,
471
+ original_fields=original_fields,
472
+ **filtered_kwargs,
473
+ )
474
+ except Exception as e:
475
+ # If any error occurs, the transaction remains uncommitted
476
+ commit_transaction = False
477
+ logger.error(f"Error during write_to_table: {e}")
478
+ raise
479
+ finally:
480
+ if commit_transaction:
481
+ # Seal the interactive transaction to commit all operations atomically
482
+ write_transaction.seal()
483
+
484
+
485
+ def _handle_write_mode(
486
+ mode: TableWriteMode,
487
+ table_definition: TableDefinition,
488
+ table_version_obj: TableVersion,
489
+ namespace: str,
101
490
  table: str,
102
- *args,
103
- namespace: Optional[str] = None,
104
- table_version: Optional[str] = None,
105
- table_type: Optional[TableType] = TableType.PYARROW,
106
- distributed_dataset_type: Optional[
107
- DistributedDatasetType
108
- ] = DistributedDatasetType.RAY_DATASET,
109
- partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
110
- stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
111
- merge_on_read: Optional[bool] = False,
112
- reader_kwargs: Optional[Dict[Any, Any]] = None,
113
491
  **kwargs,
114
- ) -> DistributedDataset: # type: ignore
115
- """Read a table into a distributed dataset."""
492
+ ) -> Tuple[Any, DeltaType]: # Using Any for stream type to avoid complex imports
493
+ """Handle different write modes and return appropriate stream and delta type."""
494
+ table_schema = table_definition.table_version.schema
495
+
496
+ if mode == TableWriteMode.REPLACE:
497
+ return _handle_replace_mode(
498
+ table_schema,
499
+ namespace,
500
+ table,
501
+ table_version_obj,
502
+ **kwargs,
503
+ )
504
+ elif mode == TableWriteMode.APPEND:
505
+ return _handle_append_mode(
506
+ table_schema,
507
+ namespace,
508
+ table,
509
+ table_version_obj,
510
+ **kwargs,
511
+ )
512
+ elif mode in (TableWriteMode.MERGE, TableWriteMode.DELETE):
513
+ return _handle_merge_delete_mode(
514
+ mode,
515
+ table_schema,
516
+ namespace,
517
+ table,
518
+ table_version_obj,
519
+ **kwargs,
520
+ )
521
+ else:
522
+ # AUTO and CREATE modes
523
+ return _handle_auto_create_mode(
524
+ table_schema,
525
+ namespace,
526
+ table,
527
+ table_version_obj,
528
+ **kwargs,
529
+ )
116
530
 
117
- if reader_kwargs is None:
118
- reader_kwargs = {}
119
531
 
120
- _validate_read_table_args(
532
+ def _handle_replace_mode(
533
+ table_schema,
534
+ namespace: str,
535
+ table: str,
536
+ table_version_obj: TableVersion,
537
+ **kwargs,
538
+ ) -> Tuple[Any, DeltaType]:
539
+ """Handle REPLACE mode by staging and committing a new stream."""
540
+ stream = _get_storage(**kwargs).stage_stream(
121
541
  namespace=namespace,
122
- table_type=table_type,
123
- distributed_dataset_type=distributed_dataset_type,
124
- merge_on_read=merge_on_read,
542
+ table_name=table,
543
+ table_version=table_version_obj.table_version,
544
+ **kwargs,
545
+ )
546
+
547
+ stream = _get_storage(**kwargs).commit_stream(stream=stream, **kwargs)
548
+ delta_type = (
549
+ DeltaType.UPSERT
550
+ if table_schema and table_schema.merge_keys
551
+ else DeltaType.APPEND
552
+ )
553
+ return stream, delta_type
554
+
555
+
556
+ def _handle_append_mode(
557
+ table_schema,
558
+ namespace: str,
559
+ table: str,
560
+ table_version_obj: TableVersion,
561
+ **kwargs,
562
+ ) -> Tuple[Any, DeltaType]:
563
+ """Handle APPEND mode by validating no merge keys and getting existing stream."""
564
+ if table_schema and table_schema.merge_keys:
565
+ raise SchemaValidationError(
566
+ f"APPEND mode cannot be used with tables that have merge keys. "
567
+ f"Table {namespace}.{table} has merge keys: {table_schema.merge_keys}. "
568
+ f"Use MERGE mode instead."
569
+ )
570
+
571
+ stream = _get_table_stream(
572
+ namespace,
573
+ table,
574
+ table_version_obj.table_version,
575
+ **kwargs,
576
+ )
577
+ return stream, DeltaType.APPEND
578
+
579
+
580
+ def _handle_merge_delete_mode(
581
+ mode: TableWriteMode,
582
+ table_schema,
583
+ namespace: str,
584
+ table: str,
585
+ table_version_obj: TableVersion,
586
+ **kwargs,
587
+ ) -> Tuple[Any, DeltaType]:
588
+ """Handle MERGE/DELETE modes by validating merge keys and getting existing stream."""
589
+ if not table_schema or not table_schema.merge_keys:
590
+ raise TableValidationError(
591
+ f"{mode.value.upper() if hasattr(mode, 'value') else str(mode).upper()} mode requires tables to have at least one merge key. "
592
+ f"Table {namespace}.{table}.{table_version_obj.table_version} has no merge keys. "
593
+ f"Use APPEND, AUTO, or REPLACE mode instead."
594
+ )
595
+
596
+ stream = _get_table_stream(
597
+ namespace,
598
+ table,
599
+ table_version_obj.table_version,
125
600
  **kwargs,
126
601
  )
602
+ delta_type = DeltaType.UPSERT if mode == TableWriteMode.MERGE else DeltaType.DELETE
603
+ return stream, delta_type
604
+
605
+
606
+ def _handle_auto_create_mode(
607
+ table_schema,
608
+ namespace: str,
609
+ table: str,
610
+ table_version_obj: TableVersion,
611
+ **kwargs,
612
+ ) -> Tuple[Any, DeltaType]:
613
+ """Handle AUTO and CREATE modes by getting existing stream."""
614
+ stream = _get_table_stream(
615
+ namespace,
616
+ table,
617
+ table_version_obj.table_version,
618
+ **kwargs,
619
+ )
620
+ delta_type = (
621
+ DeltaType.UPSERT
622
+ if table_schema and table_schema.merge_keys
623
+ else DeltaType.APPEND
624
+ )
625
+ return stream, delta_type
626
+
627
+
628
+ def _validate_table_configuration(
629
+ stream,
630
+ table_version_obj: TableVersion,
631
+ namespace: str,
632
+ table: str,
633
+ ) -> None:
634
+ """Validate table configuration for unsupported features."""
635
+ # Check if table is partitioned
636
+ if (
637
+ stream.partition_scheme
638
+ and stream.partition_scheme.keys is not None
639
+ and len(stream.partition_scheme.keys) > 0
640
+ ):
641
+ raise NotImplementedError(
642
+ f"write_to_table does not yet support partitioned tables. "
643
+ f"Table {namespace}.{table} has partition scheme with "
644
+ f"{len(stream.partition_scheme.keys)} partition key(s): "
645
+ f"{[key.name or key.key[0] for key in stream.partition_scheme.keys]}. "
646
+ f"Please use the lower-level metastore API for partitioned tables."
647
+ )
648
+
649
+ # Check if table has sort keys
650
+ if (
651
+ table_version_obj.sort_scheme
652
+ and table_version_obj.sort_scheme.keys is not None
653
+ and len(table_version_obj.sort_scheme.keys) > 0
654
+ ):
655
+ raise NotImplementedError(
656
+ f"write_to_table does not yet support tables with sort keys. "
657
+ f"Table {namespace}.{table} has sort scheme with "
658
+ f"{len(table_version_obj.sort_scheme.keys)} sort key(s): "
659
+ f"{[key.key[0] for key in table_version_obj.sort_scheme.keys]}. "
660
+ f"Please use the lower-level metastore API for sorted tables."
661
+ )
662
+
663
+
664
+ def _handle_partition_creation(
665
+ mode: TableWriteMode,
666
+ table_exists_flag: bool,
667
+ delta_type: DeltaType,
668
+ stream,
669
+ **kwargs,
670
+ ) -> Tuple[Any, bool]: # partition, commit_staged_partition
671
+ """Handle partition creation/retrieval based on write mode."""
672
+ if mode == TableWriteMode.REPLACE or not table_exists_flag:
673
+ # REPLACE mode or new table: Stage a new partition
674
+ partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
675
+ # If we're doing UPSERT/DELETE operations, let compaction handle the commit
676
+ commit_staged_partition = delta_type not in (DeltaType.UPSERT, DeltaType.DELETE)
677
+ return partition, commit_staged_partition
678
+ elif delta_type in (DeltaType.UPSERT, DeltaType.DELETE):
679
+ # UPSERT/DELETE operations: Try to use existing committed partition first
680
+ partition = _get_storage(**kwargs).get_partition(
681
+ stream_locator=stream.locator,
682
+ partition_values=None,
683
+ **kwargs,
684
+ )
685
+ commit_staged_partition = False
686
+
687
+ if not partition:
688
+ # No existing committed partition found, stage a new one
689
+ partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
690
+ commit_staged_partition = False # Let compaction handle the commit
691
+
692
+ return partition, commit_staged_partition
693
+ else:
694
+ # APPEND mode on existing table: Get existing partition
695
+ partition = _get_storage(**kwargs).get_partition(
696
+ stream_locator=stream.locator,
697
+ partition_values=None,
698
+ **kwargs,
699
+ )
700
+ commit_staged_partition = False
701
+
702
+ if not partition:
703
+ # No existing partition found, create a new one
704
+ partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
705
+ commit_staged_partition = True
706
+
707
+ return partition, commit_staged_partition
708
+
709
+
710
+ def _convert_numpy_for_schema_validation(
711
+ data: np.ndarray, schema: Optional[Schema]
712
+ ) -> Dataset:
713
+ """Convert NumPy array to Pandas DataFrame with proper column names for schema validation.
714
+
715
+ Args:
716
+ data: NumPy array to convert
717
+ schema: DeltaCAT Schema object for column naming
718
+
719
+ Returns:
720
+ Pandas DataFrame with proper column names matching schema
721
+
722
+ Raises:
723
+ ValueError: If array has more columns than schema or schema is invalid
724
+ """
725
+ if not isinstance(schema, Schema) or not schema.arrow:
726
+ raise ValueError(
727
+ f"Expected DeltaCAT schema for Numpy schema validation, but found: {schema}"
728
+ )
729
+
730
+ # Use schema subset matching NumPy array dimensions
731
+ arrow_schema = schema.arrow
732
+ num_cols = data.shape[1] if data.ndim > 1 else 1
733
+
734
+ if len(arrow_schema) >= num_cols:
735
+ # Use the first N columns from the schema to match data dimensions
736
+ subset_fields = [arrow_schema.field(i) for i in range(num_cols)]
737
+ subset_schema = pa.schema(subset_fields)
738
+ return to_pandas(data, schema=subset_schema)
739
+ else:
740
+ raise ValueError(
741
+ f"NumPy array has {num_cols} columns but table schema only has {len(arrow_schema)} columns. "
742
+ f"Cannot write NumPy data with more columns than the table schema supports."
743
+ )
744
+
745
+
746
+ def _build_entry_index_to_schema_mapping(
747
+ qualified_deltas: List[Delta], table_version_obj, **kwargs
748
+ ) -> List[Schema]:
749
+ """Build a mapping from manifest entry index to schema for reading operations.
750
+
751
+ Args:
752
+ qualified_deltas: List of deltas to process
753
+ table_version_obj: Table version containing schemas
754
+ **kwargs: Additional arguments passed to storage operations
755
+
756
+ Returns:
757
+ List mapping each manifest entry index to its corresponding schema
758
+
759
+ Raises:
760
+ ValueError: If a manifest's schema ID is not found in table version schemas
761
+ """
762
+ entry_index_to_schema = []
763
+ for delta in qualified_deltas:
764
+ if delta.manifest:
765
+ manifest = delta.manifest
766
+ else:
767
+ # Fetch manifest from storage
768
+ manifest = _get_storage(**kwargs).get_delta_manifest(
769
+ delta.locator,
770
+ **kwargs,
771
+ )
772
+ # Map manifest entry index to schema ID
773
+ schema_id = manifest.meta.schema_id
774
+
775
+ # Find the schema that matches this manifest's schema_id
776
+ matching_schema = None
777
+ if table_version_obj.schemas:
778
+ for schema in table_version_obj.schemas:
779
+ if schema.id == schema_id:
780
+ matching_schema = schema
781
+ break
782
+
783
+ if matching_schema is None:
784
+ available_schema_ids = (
785
+ [s.id for s in table_version_obj.schemas]
786
+ if table_version_obj.schemas
787
+ else []
788
+ )
789
+ raise ValueError(
790
+ f"Manifest schema ID {schema_id} not found in table version schemas. "
791
+ f"Available schema IDs: {available_schema_ids}. "
792
+ )
793
+
794
+ # Add the matching schema for each entry in this manifest
795
+ for _ in range(len(manifest.entries)):
796
+ entry_index_to_schema.append(matching_schema)
797
+
798
+ return entry_index_to_schema
799
+
800
+
801
+ def _convert_data_if_needed(data: Dataset) -> Dataset:
802
+ """Convert unsupported data types to supported ones."""
803
+ if isinstance(data, daft.DataFrame):
804
+ # Daft DataFrame - convert based on execution mode
805
+ ctx = daft.context.get_context()
806
+ runner = ctx.get_or_create_runner()
807
+ runner_type = runner.name
808
+
809
+ if runner_type == "ray":
810
+ # Running with Ray backend - convert to Ray Dataset
811
+ return data.to_ray_dataset()
812
+ else:
813
+ # Running with local backend - convert to PyArrow Table
814
+ return data.to_arrow()
815
+
816
+ return data
817
+
818
+
819
+ def _validate_and_coerce_data_against_schema(
820
+ data: Dataset,
821
+ schema: Optional[Schema],
822
+ schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
823
+ default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
824
+ ) -> Tuple[Dataset, bool, Optional[Schema]]:
825
+ """Validate and coerce data against the table schema if schema consistency types are set.
826
+
827
+ Args:
828
+ data: The dataset to validate/coerce
829
+ schema: The DeltaCAT schema to validate against (optional)
830
+ schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
831
+ default_schema_consistency_type: Default consistency type for new fields in AUTO mode
832
+
833
+ Returns:
834
+ Tuple[Dataset, bool, Optional[Schema]]: Validated/coerced data, flag indicating if schema was modified, and updated schema
835
+
836
+ Raises:
837
+ ValueError: If validation fails or coercion is not possible
838
+ """
839
+ if not schema:
840
+ return data, False, None
841
+
842
+ validated_data, updated_schema = schema.validate_and_coerce_dataset(
843
+ data,
844
+ schema_evolution_mode=schema_evolution_mode,
845
+ default_schema_consistency_type=default_schema_consistency_type,
846
+ )
847
+
848
+ # Check if schema was modified by comparing with original
849
+ schema_modified = not updated_schema.equivalent_to(schema, True)
850
+ # Return updated schema only if it was modified
851
+ updated_schema = updated_schema if schema_modified else None
852
+
853
+ return validated_data, schema_modified, updated_schema
854
+
855
+
856
+ def _validate_reader_compatibility(
857
+ data: Dataset,
858
+ content_type: ContentType,
859
+ supported_reader_types: Optional[List[DatasetType]],
860
+ ) -> None:
861
+ """Validate that the data types being written are compatible with all supported reader types.
862
+
863
+ Args:
864
+ data: The dataset to validate
865
+ content_type: Content type being written
866
+ supported_reader_types: List of DatasetTypes that must be able to read this data
127
867
 
128
- table_version_obj = _get_latest_or_given_table_version(
868
+ Raises:
869
+ TableValidationError: If any data types would break supported reader compatibility
870
+ """
871
+ if not supported_reader_types:
872
+ return
873
+
874
+ # Get the schema from the data
875
+ schema = get_table_schema(data)
876
+
877
+ # Get the dataset type of the current data
878
+ writer_dataset_type = get_dataset_type(data)
879
+
880
+ # PYARROW_PARQUET is equivalent to PYARROW for compatibility
881
+ writer_type_str = (
882
+ writer_dataset_type.value
883
+ if writer_dataset_type != DatasetType.PYARROW_PARQUET
884
+ else "pyarrow"
885
+ )
886
+
887
+ content_type_str = content_type.value
888
+
889
+ # Check each field type for compatibility
890
+ incompatible_fields = []
891
+
892
+ for field in schema:
893
+ field_name = field.name
894
+ arrow_type_str = str(field.type)
895
+
896
+ # Get the base type name from PyArrow field type
897
+ base_type_name = get_base_arrow_type_name(field.type)
898
+
899
+ # Get compatible readers for this (arrow_type, writer_dataset_type, content_type) combination
900
+ compatible_readers = get_compatible_readers(
901
+ base_type_name,
902
+ writer_type_str,
903
+ content_type_str,
904
+ )
905
+
906
+ # Check if all supported reader types are compatible
907
+ for required_reader in supported_reader_types:
908
+ reader_is_compatible = required_reader in compatible_readers
909
+
910
+ # Special case: PYARROW_PARQUET is equivalent to PYARROW for compatibility if we're writing parquet
911
+ if (
912
+ not reader_is_compatible
913
+ and content_type == ContentType.PARQUET
914
+ and required_reader == DatasetType.PYARROW_PARQUET
915
+ ):
916
+ reader_is_compatible = DatasetType.PYARROW in compatible_readers
917
+
918
+ if not reader_is_compatible:
919
+ incompatible_fields.append(
920
+ {
921
+ "field_name": field_name,
922
+ "arrow_type": arrow_type_str,
923
+ "incompatible_reader": required_reader,
924
+ "writer_type": writer_dataset_type,
925
+ "content_type": content_type,
926
+ }
927
+ )
928
+
929
+ # Raise error if any incompatibilities found
930
+ if incompatible_fields:
931
+ error_details = []
932
+ for incompatible in incompatible_fields:
933
+ error_details.append(
934
+ f"Field '{incompatible['field_name']}' with type '{incompatible['arrow_type']}' "
935
+ f"written by {incompatible['writer_type']} to {incompatible['content_type']} "
936
+ f"cannot be read by required reader type {incompatible['incompatible_reader']}. "
937
+ f"If you expect this write to succeed and this reader is not required, then it "
938
+ f"can be removed from the table's supported reader types property."
939
+ )
940
+
941
+ raise TableValidationError(
942
+ f"Reader compatibility validation failed. The following fields would break "
943
+ f"supported reader types:\n" + "\n".join(error_details)
944
+ )
945
+
946
+
947
+ def _stage_commit_and_compact(
948
+ converted_data: Dataset,
949
+ partition,
950
+ delta_type: DeltaType,
951
+ content_type: ContentType,
952
+ commit_staged_partition: bool,
953
+ table_version_obj: TableVersion,
954
+ namespace: str,
955
+ table: str,
956
+ schema: Schema,
957
+ original_fields: Set[str],
958
+ **kwargs,
959
+ ) -> None:
960
+ """Stage and commit delta, then handle compaction if needed."""
961
+ # Remove schema from kwargs to avoid duplicate parameter conflict
962
+ # We explicitly pass the correct schema parameter
963
+ kwargs.pop("schema", None)
964
+
965
+ # Stage a delta with the data
966
+ delta = _get_storage(**kwargs).stage_delta(
967
+ data=converted_data,
968
+ partition=partition,
969
+ delta_type=delta_type,
970
+ content_type=content_type,
971
+ author=ManifestAuthor.of(
972
+ name="deltacat.write_to_table", version=dc.__version__
973
+ ),
974
+ schema=schema,
975
+ **kwargs,
976
+ )
977
+
978
+ delta = _get_storage(**kwargs).commit_delta(delta=delta, **kwargs)
979
+
980
+ if commit_staged_partition:
981
+ _get_storage(**kwargs).commit_partition(partition=partition, **kwargs)
982
+
983
+ # Check compaction trigger decision
984
+ should_compact = _trigger_compaction(
985
+ table_version_obj,
986
+ delta,
987
+ TableReadOptimizationLevel.MAX,
988
+ **kwargs,
989
+ )
990
+ if should_compact:
991
+ # Run V2 compaction session to merge or delete data
992
+ if table_version_obj.schema:
993
+ all_column_names = table_version_obj.schema.arrow.names
994
+ else:
995
+ raise RuntimeError("Table version schema is required to run compaction.")
996
+ _run_compaction_session(
997
+ table_version_obj=table_version_obj,
998
+ partition=partition,
999
+ latest_delta_stream_position=delta.stream_position,
1000
+ namespace=namespace,
1001
+ table=table,
1002
+ original_fields=original_fields,
1003
+ all_column_names=all_column_names,
1004
+ **kwargs,
1005
+ )
1006
+
1007
+
1008
+ def _trigger_compaction(
1009
+ table_version_obj: TableVersion,
1010
+ latest_delta: Optional[Delta],
1011
+ target_read_optimization_level: TableReadOptimizationLevel,
1012
+ **kwargs,
1013
+ ) -> bool:
1014
+ # Import inside function to avoid circular imports
1015
+ from deltacat.compute.compactor.utils import round_completion_reader as rci
1016
+
1017
+ # Extract delta type from latest_delta if available, otherwise default to no compaction
1018
+ if latest_delta is not None:
1019
+ delta_type = latest_delta.type
1020
+ partition_values = latest_delta.partition_locator.partition_values
1021
+ logger.info(
1022
+ f"Using delta type {delta_type} from latest delta {latest_delta.locator}"
1023
+ )
1024
+ else:
1025
+ logger.info(f"No latest delta discovered, defaulting to no compaction.")
1026
+ return False
1027
+
1028
+ if (
1029
+ table_version_obj.read_table_property(TableProperty.READ_OPTIMIZATION_LEVEL)
1030
+ == target_read_optimization_level
1031
+ ):
1032
+ if delta_type == DeltaType.DELETE or delta_type == DeltaType.UPSERT:
1033
+ return True
1034
+ elif delta_type == DeltaType.APPEND:
1035
+ # Get default stream to determine partition locator
1036
+ stream = _get_table_stream(
1037
+ table_version_obj.locator.namespace,
1038
+ table_version_obj.locator.table_name,
1039
+ table_version_obj.locator.table_version,
1040
+ **kwargs,
1041
+ )
1042
+
1043
+ if not stream:
1044
+ return False
1045
+
1046
+ # Use provided partition_values or None for unpartitioned tables
1047
+ partition_locator = PartitionLocator.of(
1048
+ stream_locator=stream.locator,
1049
+ partition_values=partition_values,
1050
+ partition_id=None,
1051
+ )
1052
+
1053
+ # Get round completion info to determine high watermark
1054
+ round_completion_info = rci.read_round_completion_info(
1055
+ source_partition_locator=partition_locator,
1056
+ destination_partition_locator=partition_locator,
1057
+ deltacat_storage=_get_storage(**kwargs),
1058
+ deltacat_storage_kwargs=kwargs,
1059
+ )
1060
+
1061
+ high_watermark = (
1062
+ round_completion_info.high_watermark
1063
+ if round_completion_info
1064
+ and isinstance(round_completion_info.high_watermark, int)
1065
+ else 0
1066
+ )
1067
+
1068
+ # Get all deltas appended since last compaction
1069
+ deltas = _get_storage(**kwargs).list_deltas(
1070
+ namespace=table_version_obj.locator.namespace,
1071
+ table_name=table_version_obj.locator.table_name,
1072
+ table_version=table_version_obj.locator.table_version,
1073
+ partition_values=partition_values,
1074
+ start_stream_position=high_watermark + 1,
1075
+ **kwargs,
1076
+ )
1077
+
1078
+ if not deltas:
1079
+ return False
1080
+
1081
+ # Count deltas appended since last compaction
1082
+ appended_deltas_since_last_compaction = len(deltas)
1083
+ delta_trigger = table_version_obj.read_table_property(
1084
+ TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER
1085
+ )
1086
+ if delta_trigger and appended_deltas_since_last_compaction >= delta_trigger:
1087
+ return True
1088
+
1089
+ # Count files appended since last compaction
1090
+ appended_files_since_last_compaction = 0
1091
+ for delta in deltas:
1092
+ if delta.manifest and delta.manifest.entries:
1093
+ appended_files_since_last_compaction += len(delta.manifest.entries)
1094
+
1095
+ file_trigger = table_version_obj.read_table_property(
1096
+ TableProperty.APPENDED_FILE_COUNT_COMPACTION_TRIGGER
1097
+ )
1098
+ if file_trigger and appended_files_since_last_compaction >= file_trigger:
1099
+ return True
1100
+
1101
+ # Count records appended since last compaction
1102
+ appended_records_since_last_compaction = 0
1103
+ for delta in deltas:
1104
+ if delta.meta and delta.meta.record_count:
1105
+ appended_records_since_last_compaction += delta.meta.record_count
1106
+
1107
+ record_trigger = table_version_obj.read_table_property(
1108
+ TableProperty.APPENDED_RECORD_COUNT_COMPACTION_TRIGGER
1109
+ )
1110
+ if (
1111
+ record_trigger
1112
+ and appended_records_since_last_compaction >= record_trigger
1113
+ ):
1114
+ return True
1115
+ return False
1116
+
1117
+
1118
+ def _get_compaction_primary_keys(table_version_obj: TableVersion) -> set:
1119
+ """Extract primary keys from table schema for compaction."""
1120
+ table_schema = table_version_obj.schema
1121
+ return (
1122
+ set(table_schema.merge_keys)
1123
+ if table_schema and table_schema.merge_keys
1124
+ else set()
1125
+ )
1126
+
1127
+
1128
+ def _get_compaction_hash_bucket_count(
1129
+ partition: Partition, table_version_obj: TableVersion
1130
+ ) -> int:
1131
+ """Determine hash bucket count from previous compaction, table property, or default."""
1132
+ # First check if we have a hash bucket count from previous compaction
1133
+ if (
1134
+ partition.compaction_round_completion_info
1135
+ and partition.compaction_round_completion_info.hash_bucket_count
1136
+ ):
1137
+ hash_bucket_count = partition.compaction_round_completion_info.hash_bucket_count
1138
+ logger.info(
1139
+ f"Using hash bucket count {hash_bucket_count} from previous compaction"
1140
+ )
1141
+ return hash_bucket_count
1142
+
1143
+ # Otherwise use the table property for default compaction hash bucket count
1144
+ hash_bucket_count = table_version_obj.read_table_property(
1145
+ TableProperty.DEFAULT_COMPACTION_HASH_BUCKET_COUNT
1146
+ )
1147
+ logger.info(f"Using hash bucket count {hash_bucket_count} from table property")
1148
+ return hash_bucket_count
1149
+
1150
+
1151
+ def _get_merge_order_sort_keys(table_version_obj: TableVersion):
1152
+ """Extract sort keys from merge_order fields in schema for compaction.
1153
+
1154
+ Args:
1155
+ table_version_obj: The table version containing schema
1156
+
1157
+ Returns:
1158
+ List of SortKey objects from merge_order fields, or None if no merge_order fields are defined
1159
+ """
1160
+ if table_version_obj.schema:
1161
+ return table_version_obj.schema.merge_order_sort_keys()
1162
+ return None
1163
+
1164
+
1165
+ def _create_compaction_params(
1166
+ table_version_obj: TableVersion,
1167
+ partition: Partition,
1168
+ latest_stream_position: int,
1169
+ primary_keys: set,
1170
+ hash_bucket_count: int,
1171
+ original_fields: Set[str],
1172
+ all_column_names: Optional[List[str]],
1173
+ **kwargs,
1174
+ ):
1175
+ """Create compaction parameters for the compaction session."""
1176
+ from deltacat.compute.compactor.model.compact_partition_params import (
1177
+ CompactPartitionParams,
1178
+ )
1179
+
1180
+ # Remove create_table/alter_table kwargs not needed for compaction
1181
+ kwargs.pop("lifecycle_state", None)
1182
+ kwargs.pop("schema", None)
1183
+ kwargs.pop("partition_scheme", None)
1184
+ kwargs.pop("sort_keys", None)
1185
+ kwargs.pop("table_description", None)
1186
+ kwargs.pop("table_version_description", None)
1187
+ kwargs.pop("table_properties", None)
1188
+ kwargs.pop("table_version_properties", None)
1189
+ kwargs.pop("namespace_properties", None)
1190
+ kwargs.pop("content_types", None)
1191
+ kwargs.pop("fail_if_exists", None)
1192
+ kwargs.pop("schema_updates", None)
1193
+ kwargs.pop("partition_updates", None)
1194
+ kwargs.pop("sort_scheme", None)
1195
+
1196
+ table_writer_kwargs = kwargs.pop("table_writer_kwargs", {})
1197
+ table_writer_kwargs["schema"] = table_version_obj.schema
1198
+ table_writer_kwargs["sort_scheme_id"] = table_version_obj.sort_scheme.id
1199
+ deltacat_storage_kwargs = kwargs.pop("deltacat_storage_kwargs", {})
1200
+ deltacat_storage_kwargs["transaction"] = kwargs.get("transaction", None)
1201
+ list_deltas_kwargs = kwargs.pop("list_deltas_kwargs", {})
1202
+ list_deltas_kwargs["transaction"] = kwargs.get("transaction", None)
1203
+
1204
+ return CompactPartitionParams.of(
1205
+ {
1206
+ "catalog": kwargs.get("inner", kwargs.get("catalog")),
1207
+ "source_partition_locator": partition.locator,
1208
+ "destination_partition_locator": partition.locator, # In-place compaction
1209
+ "primary_keys": primary_keys,
1210
+ "last_stream_position_to_compact": latest_stream_position,
1211
+ "deltacat_storage": _get_storage(**kwargs),
1212
+ "deltacat_storage_kwargs": deltacat_storage_kwargs,
1213
+ "list_deltas_kwargs": list_deltas_kwargs,
1214
+ "table_writer_kwargs": table_writer_kwargs,
1215
+ "hash_bucket_count": hash_bucket_count,
1216
+ "records_per_compacted_file": table_version_obj.read_table_property(
1217
+ TableProperty.RECORDS_PER_COMPACTED_FILE,
1218
+ ),
1219
+ "compacted_file_content_type": ContentType.PARQUET,
1220
+ "drop_duplicates": True,
1221
+ "sort_keys": _get_merge_order_sort_keys(table_version_obj),
1222
+ "original_fields": original_fields,
1223
+ "all_column_names": all_column_names,
1224
+ }
1225
+ )
1226
+
1227
+
1228
+ def _run_compaction_session(
1229
+ table_version_obj: TableVersion,
1230
+ partition: Partition,
1231
+ latest_delta_stream_position: int,
1232
+ namespace: str,
1233
+ table: str,
1234
+ original_fields: Set[str],
1235
+ all_column_names: List[str],
1236
+ **kwargs,
1237
+ ) -> None:
1238
+ """
1239
+ Run a V2 compaction session for the given table and partition.
1240
+
1241
+ Args:
1242
+ table_version_obj: The table version object
1243
+ partition: The partition to compact
1244
+ latest_delta_stream_position: Stream position of the latest delta
1245
+ namespace: The table namespace
1246
+ table: The table name
1247
+ original_fields: The original field set for partial UPSERT support
1248
+ **kwargs: Additional arguments including catalog and storage parameters
1249
+ """
1250
+ # Import inside function to avoid circular imports
1251
+ from deltacat.compute.compactor_v2.compaction_session import compact_partition
1252
+
1253
+ try:
1254
+ # Extract compaction configuration
1255
+ primary_keys = _get_compaction_primary_keys(table_version_obj)
1256
+ hash_bucket_count = _get_compaction_hash_bucket_count(
1257
+ partition, table_version_obj
1258
+ )
1259
+
1260
+ # Create compaction parameters
1261
+ compact_partition_params = _create_compaction_params(
1262
+ table_version_obj,
1263
+ partition,
1264
+ latest_delta_stream_position,
1265
+ primary_keys,
1266
+ hash_bucket_count,
1267
+ original_fields=original_fields,
1268
+ all_column_names=all_column_names,
1269
+ **kwargs,
1270
+ )
1271
+
1272
+ # Run V2 compaction session
1273
+ compact_partition(params=compact_partition_params)
1274
+ except Exception as e:
1275
+ logger.error(
1276
+ f"Error during compaction session for {namespace}.{table}, "
1277
+ f"partition {partition.locator}: {e}"
1278
+ )
1279
+ raise
1280
+
1281
+
1282
+ def _get_merge_key_field_names_from_schema(schema) -> List[str]:
1283
+ """Extract merge key field names from a DeltaCAT Schema object.
1284
+
1285
+ Args:
1286
+ schema: DeltaCAT Schema object
1287
+
1288
+ Returns:
1289
+ List of field names that are marked as merge keys
1290
+ """
1291
+ if not schema or not schema.merge_keys:
1292
+ return []
1293
+
1294
+ merge_key_field_names = []
1295
+ field_ids_to_fields = schema.field_ids_to_fields
1296
+
1297
+ for merge_key_id in schema.merge_keys:
1298
+ if merge_key_id in field_ids_to_fields:
1299
+ field = field_ids_to_fields[merge_key_id]
1300
+ merge_key_field_names.append(field.arrow.name)
1301
+
1302
+ return merge_key_field_names
1303
+
1304
+
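A small sketch of the ID-to-name resolution performed by the helper above, using hypothetical stand-in objects rather than a real DeltaCAT Schema:

    from types import SimpleNamespace

    # Stand-ins that only model the attributes the helper reads.
    order_id = SimpleNamespace(arrow=SimpleNamespace(name="order_id"))
    updated_at = SimpleNamespace(arrow=SimpleNamespace(name="updated_at"))
    fake_schema = SimpleNamespace(
        merge_keys=[1],  # field IDs flagged as merge keys
        field_ids_to_fields={1: order_id, 2: updated_at},
    )
    assert _get_merge_key_field_names_from_schema(fake_schema) == ["order_id"]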
1305
+ def _set_entry_params_if_needed(
1306
+ mode: TableWriteMode, table_version_obj, kwargs: dict
1307
+ ) -> None:
1308
+ """Automatically set entry_params to merge keys if not already set by user.
1309
+
1310
+ Args:
1311
+ mode: The table write mode
1312
+ table_version_obj: The table version object containing schema
1313
+ kwargs: Keyword arguments dictionary that may contain entry_params
1314
+ """
1315
+ # Only set entry_params for DELETE and MERGE modes
1316
+ if mode not in [TableWriteMode.DELETE, TableWriteMode.MERGE]:
1317
+ return
1318
+
1319
+ # Don't override if user already provided entry_params
1320
+ if "entry_params" in kwargs and kwargs["entry_params"] is not None:
1321
+ return
1322
+
1323
+ # Get schema from table version
1324
+ if not table_version_obj or not table_version_obj.schema:
1325
+ return
1326
+
1327
+ # Extract merge key field names
1328
+ merge_key_field_names = _get_merge_key_field_names_from_schema(
1329
+ table_version_obj.schema
1330
+ )
1331
+
1332
+ if merge_key_field_names:
1333
+ from deltacat.storage import EntryParams
1334
+
1335
+ kwargs["entry_params"] = EntryParams.of(merge_key_field_names)
1336
+
1337
+
1338
+ def _get_table_stream(namespace: str, table: str, table_version: str, **kwargs):
1339
+ """Helper function to get a stream for a table version."""
1340
+ return _get_storage(**kwargs).get_stream(
129
1341
  namespace=namespace,
130
1342
  table_name=table,
131
1343
  table_version=table_version,
132
1344
  **kwargs,
133
1345
  )
134
- table_version = table_version_obj.table_version
135
1346
 
1347
+
1348
+ def _validate_read_table_input(
1349
+ namespace: str,
1350
+ table: str,
1351
+ table_schema: Optional[Schema],
1352
+ table_type: Optional[DatasetType],
1353
+ distributed_dataset_type: Optional[DatasetType],
1354
+ ) -> None:
1355
+ """Validate input parameters for read_table operation."""
136
1356
  if (
137
- table_version_obj.content_types is None
138
- or len(table_version_obj.content_types) != 1
1357
+ distributed_dataset_type
1358
+ and distributed_dataset_type not in DatasetType.distributed()
139
1359
  ):
140
1360
  raise ValueError(
141
- "Expected exactly one content type but "
142
- f"found {table_version_obj.content_types}."
1361
+ f"{distributed_dataset_type} is not a valid distributed dataset type. "
1362
+ f"Valid distributed dataset types are: {DatasetType.distributed()}."
1363
+ )
1364
+ if table_type and table_type not in DatasetType.local():
1365
+ raise ValueError(
1366
+ f"{table_type} is not a valid local table type. "
1367
+ f"Valid table types are: {DatasetType.local()}."
143
1368
  )
144
1369
 
1370
+ # For schemaless tables, distributed datasets are not yet supported
1371
+ if table_schema is None and distributed_dataset_type:
1372
+ raise NotImplementedError(
1373
+ f"Distributed dataset reading is not yet supported for schemaless tables. "
1374
+ f"Table '{namespace}.{table}' has no schema, but distributed_dataset_type={distributed_dataset_type} was specified. "
1375
+ f"Please use local storage by setting distributed_dataset_type=None."
1376
+ )
1377
+
1378
+
1379
+ def _get_qualified_deltas_for_read(
1380
+ table: str,
1381
+ namespace: str,
1382
+ table_version: str,
1383
+ partition_filter: Optional[List[Union[Partition, PartitionLocator]]],
1384
+ **kwargs,
1385
+ ) -> List[Delta]:
1386
+ """Get qualified deltas for reading based on partition filter."""
145
1387
  logger.info(
146
1388
  f"Reading metadata for table={namespace}/{table}/{table_version} "
147
- f"with partition_filters={partition_filter} and stream position"
148
- f" range={stream_position_range_inclusive}"
1389
+ f"with partition_filters={partition_filter}."
149
1390
  )
150
1391
 
1392
+ # Get partition filter if not provided
151
1393
  if partition_filter is None:
152
- logger.info(
153
- f"Reading all partitions metadata in the table={table} "
154
- "as partition_filter was None."
155
- )
156
- partition_filter = (
157
- _get_storage(**kwargs)
158
- .list_partitions(
159
- table_name=table,
160
- namespace=namespace,
161
- table_version=table_version,
162
- **kwargs,
163
- )
164
- .all_items()
1394
+ partition_filter = _get_all_committed_partitions(
1395
+ table, namespace, table_version, **kwargs
165
1396
  )
166
1397
 
1398
+ # Get deltas from partitions
167
1399
  qualified_deltas = _get_deltas_from_partition_filter(
168
- stream_position_range_inclusive=stream_position_range_inclusive,
169
1400
  partition_filter=partition_filter,
170
1401
  **kwargs,
171
1402
  )
@@ -175,30 +1406,390 @@ def read_table(
175
1406
  f"from {len(partition_filter)} partitions."
176
1407
  )
177
1408
 
178
- merge_on_read_params = MergeOnReadParams.of(
179
- {
180
- "deltas": qualified_deltas,
181
- "deltacat_storage": _get_storage(**kwargs),
182
- "deltacat_storage_kwargs": {**kwargs},
183
- "reader_kwargs": reader_kwargs,
184
- }
1409
+ return qualified_deltas
1410
+
1411
+
1412
+ def _get_max_parallelism(
1413
+ max_parallelism: Optional[int],
1414
+ distributed_dataset_type: Optional[DatasetType],
1415
+ ) -> int:
1416
+ """Get the max parallelism for a read operation."""
1417
+ if distributed_dataset_type:
1418
+ max_parallelism = max_parallelism or 100
1419
+ else:
1420
+ # TODO(pdames): Set max parallelism using available resources and dataset size
1421
+ max_parallelism = 1
1422
+ if max_parallelism < 1:
1423
+ raise ValueError(
1424
+ f"max_parallelism must be greater than 0, but got {max_parallelism}"
1425
+ )
1426
+ logger.info(f"Using max_parallelism={max_parallelism} for read operation")
1427
+
1428
+ return max_parallelism
1429
+
1430
+
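Illustrative expectations for the helper above (DatasetType.DAFT is used here only because it is the default distributed read type for read_table below):

    assert _get_max_parallelism(None, None) == 1                 # local reads are currently pinned to 1
    assert _get_max_parallelism(None, DatasetType.DAFT) == 100   # distributed default
    assert _get_max_parallelism(8, DatasetType.DAFT) == 8        # caller-supplied value wins for distributed reads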
1431
+ def _handle_schemaless_table_read(
1432
+ qualified_deltas: List[Delta],
1433
+ read_as: DatasetType,
1434
+ **kwargs,
1435
+ ) -> Dataset:
1436
+ """Handle reading schemaless tables by flattening manifest entries."""
1437
+ # Create a PyArrow table for each delta
1438
+ # TODO(pdames): More efficient implementation for tables with millions/billions of entries
1439
+ tables = []
1440
+ for delta in qualified_deltas:
1441
+ # Get the manifest for this delta
1442
+ if delta.manifest:
1443
+ manifest = delta.manifest
1444
+ else:
1445
+ # Fetch manifest from storage
1446
+ manifest = _get_storage(**kwargs).get_delta_manifest(
1447
+ delta.locator,
1448
+ transaction=kwargs.get("transaction"),
1449
+ **kwargs,
1450
+ )
1451
+ # Create flattened table from this delta's manifest
1452
+ table = pa_utils.delta_manifest_to_table(
1453
+ manifest,
1454
+ delta,
1455
+ )
1456
+ tables.append(table)
1457
+
1458
+ # Concatenate all PyArrow tables
1459
+ final_table = pa_utils.concat_tables(tables)
1460
+
1461
+ # Convert from PyArrow to the requested dataset type
1462
+ return from_pyarrow(final_table, read_as)
1463
+
1464
+
1465
+ def _download_and_process_table_data(
1466
+ namespace: str,
1467
+ table: str,
1468
+ qualified_deltas: List[Delta],
1469
+ read_as: DatasetType,
1470
+ max_parallelism: Optional[int],
1471
+ columns: Optional[List[str]],
1472
+ file_path_column: Optional[str],
1473
+ table_version_obj: Optional[TableVersion],
1474
+ **kwargs,
1475
+ ) -> Dataset:
1476
+ """Download delta data and process result based on storage type."""
1477
+
1478
+ # Handle NUMPY read requests by translating to PANDAS internally
1479
+ original_read_as = read_as
1480
+ effective_read_as = read_as
1481
+ if read_as == DatasetType.NUMPY:
1482
+ effective_read_as = DatasetType.PANDAS
1483
+ logger.debug("Translating NUMPY read request to PANDAS for internal processing")
1484
+
1485
+ # Merge deltas and download data
1486
+ if not qualified_deltas:
1487
+ # Return empty table with original read_as type
1488
+ return empty_table(original_read_as)
1489
+
1490
+ # Special handling for non-empty schemaless tables
1491
+ if table_version_obj.schema is None:
1492
+ result = _handle_schemaless_table_read(
1493
+ qualified_deltas,
1494
+ effective_read_as,
1495
+ **kwargs,
1496
+ )
1497
+ # Convert to numpy if original request was for numpy
1498
+ if original_read_as == DatasetType.NUMPY:
1499
+ return _convert_pandas_to_numpy(result)
1500
+ return result
1501
+
1502
+ # Get schemas for each manifest entry
1503
+ entry_index_to_schema = _build_entry_index_to_schema_mapping(
1504
+ qualified_deltas, table_version_obj, **kwargs
1505
+ )
1506
+ # Standard non-empty schema table read path - merge deltas and download data
1507
+ merged_delta = Delta.merge_deltas(qualified_deltas)
1508
+
1509
+ # Convert read parameters to download parameters
1510
+ table_type = (
1511
+ effective_read_as
1512
+ if effective_read_as in DatasetType.local()
1513
+ else (kwargs.pop("table_type", None) or DatasetType.PYARROW)
1514
+ )
1515
+ distributed_dataset_type = (
1516
+ effective_read_as if effective_read_as in DatasetType.distributed() else None
185
1517
  )
186
1518
 
187
- return MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE[distributed_dataset_type.value](
188
- params=merge_on_read_params, **kwargs
1519
+ # Validate input parameters
1520
+ _validate_read_table_input(
1521
+ namespace,
1522
+ table,
1523
+ table_version_obj.schema,
1524
+ table_type,
1525
+ distributed_dataset_type,
189
1526
  )
190
1527
 
1528
+ # Determine max parallelism
1529
+ max_parallelism = _get_max_parallelism(
1530
+ max_parallelism,
1531
+ distributed_dataset_type,
1532
+ )
1533
+ # Filter out parameters that are already passed as keyword arguments
1534
+ # to avoid "multiple values for argument" errors
1535
+ filtered_kwargs = {
1536
+ k: v
1537
+ for k, v in kwargs.items()
1538
+ if k
1539
+ not in [
1540
+ "delta_like",
1541
+ "table_type",
1542
+ "storage_type",
1543
+ "max_parallelism",
1544
+ "columns",
1545
+ "distributed_dataset_type",
1546
+ "file_path_column",
1547
+ ]
1548
+ }
1549
+ result = _get_storage(**kwargs).download_delta(
1550
+ merged_delta,
1551
+ table_type=effective_read_as,
1552
+ storage_type=StorageType.DISTRIBUTED
1553
+ if distributed_dataset_type
1554
+ else StorageType.LOCAL,
1555
+ max_parallelism=max_parallelism,
1556
+ columns=columns,
1557
+ distributed_dataset_type=distributed_dataset_type,
1558
+ file_path_column=file_path_column,
1559
+ **filtered_kwargs,
1560
+ )
1561
+
1562
+ # Handle local storage table concatenation and PYARROW_PARQUET lazy materialization
1563
+ if not distributed_dataset_type and table_type and isinstance(result, list):
1564
+ if table_type == DatasetType.PYARROW_PARQUET:
1565
+ # For PYARROW_PARQUET, preserve lazy materialization:
1566
+ return result[0] if len(result) == 1 else result
1567
+ else:
1568
+ # For other types, perform normal concatenation
1569
+ result = _handle_local_table_concatenation(
1570
+ result,
1571
+ table_type,
1572
+ table_version_obj.schema,
1573
+ entry_index_to_schema,
1574
+ file_path_column,
1575
+ columns,
1576
+ )
1577
+ # Convert to numpy if original request was for numpy
1578
+ if original_read_as == DatasetType.NUMPY:
1579
+ return _convert_pandas_to_numpy(result)
1580
+
1581
+ return result
1582
+
1583
+
1584
+ def _convert_pandas_to_numpy(dataset: Dataset):
1585
+ """Convert pandas DataFrame to numpy ndarray."""
1586
+ if not isinstance(dataset, pd.DataFrame):
1587
+ raise ValueError(f"Expected pandas DataFrame but found {type(dataset)}")
1588
+ return dataset.to_numpy()
1589
+
1590
+
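A quick illustration of the conversion helper above:

    import pandas as pd

    arr = _convert_pandas_to_numpy(pd.DataFrame({"x": [1, 2], "y": [3.0, 4.0]}))
    assert arr.shape == (2, 2)  # rows x columns, promoted to a common dtype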
1591
+ def _coerce_dataset_to_schema(
1592
+ dataset: Dataset, target_schema: pa.Schema, manifest_entry_schema: Schema
1593
+ ) -> Dataset:
1594
+ """Coerce a dataset to match the target PyArrow schema using DeltaCAT Schema.coerce method."""
1595
+ # Convert target PyArrow schema to DeltaCAT schema and use its coerce method
1596
+ deltacat_schema = Schema.of(schema=target_schema)
1597
+ return deltacat_schema.coerce(dataset, manifest_entry_schema)
1598
+
1599
+
1600
+ def _coerce_results_to_schema(
1601
+ results: Dataset, target_schema: pa.Schema, entry_index_to_schema: List[Schema]
1602
+ ) -> List[Dataset]:
1603
+ """Coerce all table results to match the target schema."""
1604
+ coerced_results = []
1605
+ for i, table_result in enumerate(results):
1606
+ coerced_result = _coerce_dataset_to_schema(
1607
+ table_result, target_schema, entry_index_to_schema[i]
1608
+ )
1609
+ coerced_results.append(coerced_result)
1610
+ logger.debug(f"Coerced table {i} to unified schema")
1611
+ return coerced_results
1612
+
1613
+
1614
+ def _create_target_schema(
1615
+ arrow_schema: pa.Schema,
1616
+ columns: Optional[List[str]] = None,
1617
+ file_path_column: Optional[str] = None,
1618
+ ) -> pa.Schema:
1619
+ """Create target schema for concatenation with optional column selection and file_path_column."""
1620
+ if columns is not None:
1621
+ # Column selection - use only specified columns
1622
+ field_map = {field.name: field for field in arrow_schema}
1623
+ selected_fields = []
1624
+
1625
+ for col_name in columns:
1626
+ if col_name in field_map:
1627
+ selected_fields.append(field_map[col_name])
1628
+ arrow_schema = pa.schema(selected_fields)
1629
+ if file_path_column and file_path_column not in arrow_schema.names:
1630
+ arrow_schema = arrow_schema.append(pa.field(file_path_column, pa.string()))
1631
+ return arrow_schema
1632
+
1633
+
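A self-contained PyArrow example of the column selection and file-path-column handling above:

    import pyarrow as pa

    base = pa.schema([("id", pa.int64()), ("name", pa.string()), ("ts", pa.timestamp("us"))])
    target = _create_target_schema(base, columns=["id", "ts"], file_path_column="source_file")
    assert target.names == ["id", "ts", "source_file"]  # selected columns plus the appended path column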
1634
+ def _create_entry_schemas_for_concatenation(
1635
+ entry_index_to_schema: List[Schema],
1636
+ columns: Optional[List[str]] = None,
1637
+ file_path_column: Optional[str] = None,
1638
+ ) -> List[Schema]:
1639
+ """Create entry schemas for concatenation, optionally filtered by column selection."""
1640
+ if columns is None:
1641
+ # No column selection - return original schemas as-is
1642
+ return entry_index_to_schema
1643
+
1644
+ # Column selection - filter each entry schema
1645
+ modified_schemas = []
1646
+ for entry_schema in entry_index_to_schema:
1647
+ if entry_schema and entry_schema.arrow:
1648
+ filtered_schema = _create_target_schema(
1649
+ entry_schema.arrow, columns, file_path_column
1650
+ )
1651
+ modified_schemas.append(Schema.of(schema=filtered_schema))
1652
+ else:
1653
+ modified_schemas.append(entry_schema)
1654
+
1655
+ return modified_schemas
1656
+
1657
+
1658
+ def _handle_local_table_concatenation(
1659
+ results: Dataset,
1660
+ table_type: DatasetType,
1661
+ table_schema: Optional[Schema],
1662
+ entry_index_to_schema: List[Schema],
1663
+ file_path_column: Optional[str] = None,
1664
+ columns: Optional[List[str]] = None,
1665
+ ) -> Dataset:
1666
+ """Handle concatenation of local table results with schema coercion."""
1667
+ logger.debug(f"Target table schema for concatenation: {table_schema}")
1668
+
1669
+ # Create target schema for coercion, respecting column selection
1670
+ target_schema = _create_target_schema(table_schema.arrow, columns, file_path_column)
1671
+ logger.debug(f"Created target schema: {target_schema.names}")
1672
+
1673
+ # Filter entry schemas to match column selection and file_path_column
1674
+ modified_entry_schemas = _create_entry_schemas_for_concatenation(
1675
+ entry_index_to_schema, columns, file_path_column
1676
+ )
1677
+
1678
+ # Coerce results to unified schema
1679
+ coerced_results = _coerce_results_to_schema(
1680
+ results, target_schema, modified_entry_schemas
1681
+ )
1682
+
1683
+ # Second step: concatenate the coerced results
1684
+ logger.debug(
1685
+ f"Concatenating {len(coerced_results)} local tables of type {table_type} with unified schemas"
1686
+ )
1687
+ concatenated_result = concat_tables(coerced_results, table_type)
1688
+ logger.debug(f"Concatenation complete, result type: {type(concatenated_result)}")
1689
+ return concatenated_result
1690
+
1691
+
1692
+ def read_table(
1693
+ table: str,
1694
+ *args,
1695
+ namespace: Optional[str] = None,
1696
+ table_version: Optional[str] = None,
1697
+ read_as: DatasetType = DatasetType.DAFT,
1698
+ partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
1699
+ max_parallelism: Optional[int] = None,
1700
+ columns: Optional[List[str]] = None,
1701
+ file_path_column: Optional[str] = None,
1702
+ transaction: Optional[Transaction] = None,
1703
+ **kwargs,
1704
+ ) -> Dataset:
1705
+ """Read a table into a dataset.
1706
+
1707
+ Args:
1708
+ table: Name of the table to read.
1709
+ namespace: Optional namespace of the table. Uses default if not specified.
1710
+ table_version: Optional specific version of the table to read.
1711
+ read_as: Dataset type to use for reading table files. Defaults to DatasetType.DAFT.
1712
+ partition_filter: Optional list of partitions to read from.
1713
+ max_parallelism: Optional maximum parallelism for data download. Currently limited to 1
1714
+ for local dataset type reads (i.e., members of DatasetType.local()) and defaults to 100
1715
+ for distributed dataset type reads (i.e., members of DatasetType.distributed()).
1716
+ columns: Optional list of columns to include in the result.
1717
+ file_path_column: Optional column name to add file paths to the result.
1718
+ transaction: Optional transaction to chain this read operation to. If provided, uncommitted
1719
+ changes from the transaction will be visible to this read operation.
1720
+ **kwargs: Additional keyword arguments.
1721
+
1722
+ Returns:
1723
+ Dataset containing the table data.
1724
+ """
1725
+ # Set up transaction handling
1726
+ read_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1727
+ kwargs["transaction"] = read_transaction
1728
+
1729
+ try:
1730
+ # Resolve namespace and get table metadata
1731
+ namespace = namespace or default_namespace()
1732
+
1733
+ table_version_obj = _get_latest_active_or_given_table_version(
1734
+ namespace=namespace,
1735
+ table_name=table,
1736
+ table_version=table_version,
1737
+ **kwargs,
1738
+ )
1739
+
1740
+ # Get partitions and deltas to read
1741
+ qualified_deltas = _get_qualified_deltas_for_read(
1742
+ table,
1743
+ namespace,
1744
+ table_version_obj.table_version,
1745
+ partition_filter,
1746
+ **kwargs,
1747
+ )
1748
+
1749
+ # Download and process the data
1750
+ # TODO(pdames): Remove once we implement a custom SerDe for pa.ParquetFile
1751
+ if read_as == DatasetType.PYARROW_PARQUET:
1752
+ max_parallelism = 1
1753
+ logger.warning(
1754
+ f"Forcing max_parallelism to 1 for PyArrow Parquet reads to avoid serialization errors."
1755
+ )
1756
+ result = _download_and_process_table_data(
1757
+ namespace,
1758
+ table,
1759
+ qualified_deltas,
1760
+ read_as,
1761
+ max_parallelism,
1762
+ columns,
1763
+ file_path_column,
1764
+ table_version_obj,
1765
+ **kwargs,
1766
+ )
1767
+ return result
1768
+ except Exception as e:
1769
+ # If any error occurs, the transaction remains uncommitted
1770
+ commit_transaction = False
1771
+ logger.error(f"Error during read_table: {e}")
1772
+ raise
1773
+ finally:
1774
+ if commit_transaction:
1775
+ # Seal the interactive transaction to commit all operations atomically
1776
+ read_transaction.seal()
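A usage sketch of the new read path, assuming an initialized catalog and that read_table and DatasetType are re-exported from the top-level deltacat package (both assumptions):

    import deltacat as dc

    df = dc.read_table(
        "events",                          # hypothetical table name
        namespace="analytics",             # hypothetical namespace
        read_as=dc.DatasetType.PANDAS,     # local read; returns a pandas DataFrame
        columns=["event_id", "ts"],        # optional column pruning
        file_path_column="source_file",    # append the originating file path to each row
    )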
1777
+
191
1778
 
192
1779
  def alter_table(
193
1780
  table: str,
194
1781
  *args,
195
1782
  namespace: Optional[str] = None,
1783
+ table_version: Optional[str] = None,
196
1784
  lifecycle_state: Optional[LifecycleState] = None,
197
- schema_updates: Optional[Dict[str, Any]] = None,
1785
+ schema_updates: Optional[SchemaUpdate] = None,
198
1786
  partition_updates: Optional[Dict[str, Any]] = None,
199
- sort_keys: Optional[SortScheme] = None,
200
- description: Optional[str] = None,
201
- properties: Optional[TableProperties] = None,
1787
+ sort_scheme: Optional[SortScheme] = None,
1788
+ table_description: Optional[str] = None,
1789
+ table_version_description: Optional[str] = None,
1790
+ table_properties: Optional[TableProperties] = None,
1791
+ table_version_properties: Optional[TableVersionProperties] = None,
1792
+ transaction: Optional[Transaction] = None,
202
1793
  **kwargs,
203
1794
  ) -> None:
204
1795
  """Alter deltacat table/table_version definition.
@@ -209,61 +1800,169 @@ def alter_table(
209
1800
  Args:
210
1801
  table: Name of the table to alter.
211
1802
  namespace: Optional namespace of the table. Uses default namespace if not specified.
1803
+ table_version: Optional specific version of the table to alter. Defaults to the latest active version.
212
1804
  lifecycle_state: New lifecycle state for the table.
213
- schema_updates: Map of schema updates to apply.
214
- partition_updates: Map of partition scheme updates to apply.
215
- sort_keys: New sort keys scheme.
216
- description: New description for the table.
217
- properties: New table properties.
1805
+ schema_updates: Schema updates to apply.
1806
+ partition_updates: Partition scheme updates to apply.
1807
+ sort_scheme: New sort scheme.
1808
+ table_description: New description for the table.
1809
+ table_version_description: New description for the table version. Defaults to `table_description` if not specified.
1810
+ table_properties: New table properties.
1811
+ table_version_properties: New table version properties. Defaults to the current parent table properties if not specified.
1812
+ transaction: Optional transaction to use. If None, creates a new transaction.
218
1813
 
219
1814
  Returns:
220
1815
  None
221
1816
 
222
1817
  Raises:
223
1818
  TableNotFoundError: If the table does not already exist.
1819
+ TableVersionNotFoundError: If the specified table version or active table version does not exist.
224
1820
  """
1821
+ resolved_table_properties = None
1822
+ if table_properties is not None:
1823
+ resolved_table_properties = _add_default_table_properties(table_properties)
1824
+ _validate_table_properties(resolved_table_properties)
1825
+
225
1826
  namespace = namespace or default_namespace()
226
1827
 
227
- _get_storage(**kwargs).update_table(
228
- *args,
229
- namespace=namespace,
230
- table_name=table,
231
- description=description,
232
- properties=properties,
233
- lifecycle_state=lifecycle_state,
234
- **kwargs,
235
- )
1828
+ # Set up transaction handling
1829
+ alter_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1830
+ kwargs["transaction"] = alter_transaction
236
1831
 
237
- table_version = _get_storage(**kwargs).get_latest_table_version(
238
- namespace, table, **kwargs
239
- )
240
- _get_storage(**kwargs).update_table_version(
241
- *args,
242
- namespace=namespace,
243
- table_name=table,
244
- table_version=table_version.id,
245
- description=description,
246
- schema_updates=schema_updates,
247
- partition_updates=partition_updates,
248
- sort_keys=sort_keys,
249
- **kwargs,
1832
+ try:
1833
+ if partition_updates:
1834
+ raise NotImplementedError("Partition updates are not yet supported.")
1835
+ if sort_scheme:
1836
+ raise NotImplementedError("Sort scheme updates are not yet supported.")
1837
+
1838
+ new_table: Table = _get_storage(**kwargs).update_table(
1839
+ *args,
1840
+ namespace=namespace,
1841
+ table_name=table,
1842
+ description=table_description,
1843
+ properties=resolved_table_properties,
1844
+ **kwargs,
1845
+ )
1846
+
1847
+ if table_version is None:
1848
+ table_version: Optional[TableVersion] = _get_storage(
1849
+ **kwargs
1850
+ ).get_latest_active_table_version(namespace, table, **kwargs)
1851
+ if table_version is None:
1852
+ raise TableVersionNotFoundError(
1853
+ f"No active table version found for table {namespace}.{table}. "
1854
+ "Please specify a table_version parameter."
1855
+ )
1856
+ else:
1857
+ table_version = _get_storage(**kwargs).get_table_version(
1858
+ namespace, table, table_version, **kwargs
1859
+ )
1860
+ if table_version is None:
1861
+ raise TableVersionNotFoundError(
1862
+ f"Table version '{table_version}' not found for table {namespace}.{table}"
1863
+ )
1864
+
1865
+ # Get table properties for schema evolution
1866
+ schema_evolution_mode = table_version.read_table_property(
1867
+ TableProperty.SCHEMA_EVOLUTION_MODE
1868
+ )
1869
+ if schema_updates and schema_evolution_mode == SchemaEvolutionMode.DISABLED:
1870
+ raise TableValidationError(
1871
+ "Schema evolution is disabled for this table. Please enable schema evolution or remove schema updates."
1872
+ )
1873
+
1874
+ # Only update table version properties if they are explicitly provided
1875
+ resolved_tv_properties = None
1876
+ if table_version_properties is not None:
1877
+ # inherit properties from the parent table if not specified
1878
+ default_tv_properties = new_table.properties
1879
+ if table_version.schema is None:
1880
+ # schemaless tables don't validate reader compatibility by default
1881
+ default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
1882
+ resolved_tv_properties = _add_default_table_properties(
1883
+ table_version_properties,
1884
+ default_tv_properties,
1885
+ )
1886
+ _validate_table_properties(resolved_tv_properties)
1887
+
1888
+ # Apply schema updates if provided
1889
+ updated_schema = None
1890
+ if schema_updates is not None:
1891
+ # Get the current schema from the table version
1892
+ current_schema = table_version.schema
1893
+ if current_schema != schema_updates.base_schema:
1894
+ raise ValueError(
1895
+ f"Schema updates are not compatible with the current schema for table `{namespace}.{table}`. Current schema: {current_schema}, Schema update base schema: {schema_updates.base_schema}"
1896
+ )
1897
+
1898
+ # Apply all the updates to get the final schema
1899
+ updated_schema = schema_updates.apply()
1900
+
1901
+ _get_storage(**kwargs).update_table_version(
1902
+ *args,
1903
+ namespace=namespace,
1904
+ table_name=table,
1905
+ table_version=table_version.id,
1906
+ lifecycle_state=lifecycle_state,
1907
+ description=table_version_description or table_description,
1908
+ schema=updated_schema,
1909
+ properties=resolved_tv_properties, # This will be None if table_version_properties was not provided
1910
+ **kwargs,
1911
+ )
1912
+
1913
+ except Exception as e:
1914
+ # If any error occurs, the transaction remains uncommitted
1915
+ commit_transaction = False
1916
+ logger.error(f"Error during alter_table: {e}")
1917
+ raise
1918
+ finally:
1919
+ if commit_transaction:
1920
+ # Seal the interactive transaction to commit all operations atomically
1921
+ alter_transaction.seal()
1922
+
1923
+
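A usage sketch for the reworked alter_table, assuming a top-level re-export and an initialized catalog; schema_updates would take a SchemaUpdate built against the current schema, per the validation above, and is omitted here:

    import deltacat as dc

    dc.alter_table(
        "events",                                              # hypothetical table
        namespace="analytics",
        table_description="Clickstream events (v2 ingestion)",
        table_version_description="Adds the device_type column",
    )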
1924
+ def _add_default_table_properties(
1925
+ table_properties: Optional[TableProperties],
1926
+ default_table_properties: TableProperties = TablePropertyDefaultValues,
1927
+ ) -> TableProperties:
1928
+ if table_properties is None:
1929
+ table_properties = {}
1930
+ for k, v in default_table_properties.items():
1931
+ if k not in table_properties:
1932
+ table_properties[k] = v
1933
+ return table_properties
1934
+
1935
+
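The default-merging behavior above, illustrated with plain dicts standing in for TableProperty keys:

    defaults = {"read_optimization_level": "MAX", "schema_evolution_mode": "AUTO"}
    merged = _add_default_table_properties({"schema_evolution_mode": "DISABLED"}, defaults)
    # Caller-supplied values win; missing keys fall back to the defaults.
    assert merged == {"schema_evolution_mode": "DISABLED", "read_optimization_level": "MAX"}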
1936
+ def _validate_table_properties(
1937
+ table_properties: TableProperties,
1938
+ ) -> None:
1939
+ read_optimization_level = table_properties.get(
1940
+ TableProperty.READ_OPTIMIZATION_LEVEL,
1941
+ TablePropertyDefaultValues[TableProperty.READ_OPTIMIZATION_LEVEL],
250
1942
  )
1943
+ if read_optimization_level != TableReadOptimizationLevel.MAX:
1944
+ raise NotImplementedError(
1945
+ f"Table read optimization level `{read_optimization_level} is not yet supported. Please use {TableReadOptimizationLevel.MAX}"
1946
+ )
251
1947
 
252
1948
 
253
1949
  def create_table(
254
- name: str,
1950
+ table: str,
255
1951
  *args,
256
1952
  namespace: Optional[str] = None,
257
- version: Optional[str] = None,
1953
+ table_version: Optional[str] = None,
258
1954
  lifecycle_state: Optional[LifecycleState] = LifecycleState.ACTIVE,
259
1955
  schema: Optional[Schema] = None,
260
1956
  partition_scheme: Optional[PartitionScheme] = None,
261
1957
  sort_keys: Optional[SortScheme] = None,
262
- description: Optional[str] = None,
1958
+ table_description: Optional[str] = None,
1959
+ table_version_description: Optional[str] = None,
263
1960
  table_properties: Optional[TableProperties] = None,
1961
+ table_version_properties: Optional[TableVersionProperties] = None,
264
1962
  namespace_properties: Optional[NamespaceProperties] = None,
265
1963
  content_types: Optional[List[ContentType]] = None,
266
1964
  fail_if_exists: bool = True,
1965
+ transaction: Optional[Transaction] = None,
267
1966
  **kwargs,
268
1967
  ) -> TableDefinition:
269
1968
  """Create an empty table in the catalog.
@@ -271,20 +1970,22 @@ def create_table(
271
1970
  If a namespace isn't provided, the table will be created within the default deltacat namespace.
272
1971
  Additionally if the provided namespace does not exist, it will be created for you.
273
1972
 
274
-
275
1973
  Args:
276
- name: Name of the table to create.
1974
+ table: Name of the table to create.
277
1975
  namespace: Optional namespace for the table. Uses default namespace if not specified.
278
1976
  version: Optional version identifier for the table.
279
1977
  lifecycle_state: Lifecycle state of the new table. Defaults to ACTIVE.
280
1978
  schema: Schema definition for the table.
281
1979
  partition_scheme: Optional partitioning scheme for the table.
282
1980
  sort_keys: Optional sort keys for the table.
283
- description: Optional description of the table.
1981
+ table_description: Optional description of the table.
1982
+ table_version_description: Optional description for the table version.
284
1983
  table_properties: Optional properties for the table.
1984
+ table_version_properties: Optional properties for the table version. Defaults to the current parent table properties if not specified.
285
1985
  namespace_properties: Optional properties for the namespace if it needs to be created.
286
1986
  content_types: Optional list of allowed content types for the table.
287
1987
  fail_if_exists: If True, raises an error if table already exists. If False, returns existing table.
1988
+ transaction: Optional transaction to use. If None, creates a new transaction.
288
1989
 
289
1990
  Returns:
290
1991
  TableDefinition object for the created or existing table.
@@ -293,56 +1994,133 @@ def create_table(
293
1994
  TableAlreadyExistsError: If the table already exists and fail_if_exists is True.
294
1995
  NamespaceNotFoundError: If the provided namespace does not exist.
295
1996
  """
1997
+ resolved_table_properties = _add_default_table_properties(table_properties)
1998
+ # Note: resolved_tv_properties will be set after checking existing table
1999
+
296
2000
  namespace = namespace or default_namespace()
297
2001
 
298
- table = get_table(*args, name, namespace=namespace, table_version=version, **kwargs)
299
- if table is not None:
300
- if fail_if_exists:
301
- raise TableAlreadyExistsError(f"Table {namespace}.{name} already exists")
302
- return table
2002
+ # Set up transaction handling
2003
+ create_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2004
+ kwargs["transaction"] = create_transaction
303
2005
 
304
- if not namespace_exists(*args, namespace, **kwargs):
305
- create_namespace(
306
- *args, namespace=namespace, properties=namespace_properties, **kwargs
2006
+ try:
2007
+ existing_table = (
2008
+ get_table(
2009
+ table,
2010
+ namespace=namespace,
2011
+ table_version=table_version,
2012
+ *args,
2013
+ **kwargs,
2014
+ )
2015
+ if "existing_table_definition" not in kwargs
2016
+ else kwargs["existing_table_definition"]
307
2017
  )
2018
+ if existing_table is not None:
2019
+ if existing_table.table_version and existing_table.stream:
2020
+ if fail_if_exists:
2021
+ table_identifier = (
2022
+ f"{namespace}.{table}"
2023
+ if not table_version
2024
+ else f"{namespace}.{table}.{table_version}"
2025
+ )
2026
+ raise TableAlreadyExistsError(
2027
+ f"Table {table_identifier} already exists"
2028
+ )
2029
+ return existing_table
2030
+ # the table exists but the table version doesn't - inherit the existing table properties
2031
+ # Also ensure table properties are inherited when not explicitly provided
2032
+ if table_properties is None:
2033
+ resolved_table_properties = existing_table.table.properties
2034
+
2035
+ # Set up table version properties based on existing table or explicit properties
2036
+ default_tv_properties = resolved_table_properties
2037
+ if schema is None:
2038
+ default_tv_properties = dict(
2039
+ default_tv_properties
2040
+ ) # Make a copy to avoid modifying original
2041
+ default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
2042
+ resolved_tv_properties = _add_default_table_properties(
2043
+ table_version_properties, default_tv_properties
2044
+ )
2045
+ else:
2046
+ # create the namespace if it doesn't exist
2047
+ if not namespace_exists(namespace, **kwargs):
2048
+ create_namespace(
2049
+ namespace=namespace,
2050
+ properties=namespace_properties,
2051
+ *args,
2052
+ **kwargs,
2053
+ )
2054
+
2055
+ # Set up table version properties for new table
2056
+ default_tv_properties = resolved_table_properties
2057
+ if schema is None:
2058
+ default_tv_properties = dict(
2059
+ default_tv_properties
2060
+ ) # Make a copy to avoid modifying original
2061
+ default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
2062
+ resolved_tv_properties = _add_default_table_properties(
2063
+ table_version_properties, default_tv_properties
2064
+ )
308
2065
 
309
- (table, table_version, stream) = _get_storage(**kwargs).create_table_version(
310
- *args,
311
- namespace=namespace,
312
- table_name=name,
313
- table_version=version,
314
- schema=schema,
315
- partition_scheme=partition_scheme,
316
- sort_keys=sort_keys,
317
- table_version_description=description,
318
- table_description=description,
319
- table_properties=table_properties,
320
- lifecycle_state=lifecycle_state or LifecycleState.ACTIVE,
321
- supported_content_types=content_types,
322
- **kwargs,
323
- )
2066
+ _validate_table_properties(resolved_tv_properties)
324
2067
 
325
- return TableDefinition.of(
326
- table=table,
327
- table_version=table_version,
328
- stream=stream,
329
- )
2068
+ (table, table_version, stream) = _get_storage(**kwargs).create_table_version(
2069
+ namespace=namespace,
2070
+ table_name=table,
2071
+ table_version=table_version,
2072
+ schema=schema,
2073
+ partition_scheme=partition_scheme,
2074
+ sort_keys=sort_keys,
2075
+ table_version_description=table_version_description
2076
+ if table_version_description is not None
2077
+ else table_description,
2078
+ table_description=table_description,
2079
+ table_properties=resolved_table_properties,
2080
+ table_version_properties=resolved_tv_properties,
2081
+ lifecycle_state=lifecycle_state or LifecycleState.ACTIVE,
2082
+ supported_content_types=content_types,
2083
+ *args,
2084
+ **kwargs,
2085
+ )
2086
+
2087
+ result = TableDefinition.of(
2088
+ table=table,
2089
+ table_version=table_version,
2090
+ stream=stream,
2091
+ )
2092
+
2093
+ return result
2094
+
2095
+ except Exception as e:
2096
+ # If any error occurs, the transaction remains uncommitted
2097
+ commit_transaction = False
2098
+ logger.error(f"Error during create_table: {e}")
2099
+ raise
2100
+ finally:
2101
+ if commit_transaction:
2102
+ # Seal the interactive transaction to commit all operations atomically
2103
+ create_transaction.seal()
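A usage sketch for create_table, assuming a top-level re-export and an initialized catalog; the schema argument is omitted here, which creates a schemaless table per the property handling above:

    import deltacat as dc

    table_def = dc.create_table(
        "raw_uploads",                     # hypothetical table name
        namespace="ingest",
        table_description="Unvalidated upload manifests",
        fail_if_exists=False,              # return the existing table instead of raising
    )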
330
2104
 
331
2105
 
332
2106
  def drop_table(
333
- name: str,
2107
+ table: str,
334
2108
  *args,
335
2109
  namespace: Optional[str] = None,
336
2110
  table_version: Optional[str] = None,
337
2111
  purge: bool = False,
2112
+ transaction: Optional[Transaction] = None,
338
2113
  **kwargs,
339
2114
  ) -> None:
340
2115
  """Drop a table from the catalog and optionally purges underlying data.
341
2116
 
342
2117
  Args:
343
- name: Name of the table to drop.
2118
+ table: Name of the table to drop.
344
2119
  namespace: Optional namespace of the table. Uses default namespace if not specified.
2120
+ table_version: Optional table version of the table to drop. If not specified, the parent table
2121
+ and all of its table versions will be dropped.
345
2122
  purge: If True, permanently delete the table data. If False, only remove from catalog.
2123
+ transaction: Optional transaction to use. If None, creates a new transaction.
346
2124
 
347
2125
  Returns:
348
2126
  None
@@ -357,17 +2135,56 @@ def drop_table(
357
2135
  raise NotImplementedError("Purge flag is not currently supported.")
358
2136
 
359
2137
  namespace = namespace or default_namespace()
360
- _get_storage(**kwargs).delete_table(
361
- *args, namespace=namespace, name=name, purge=purge, **kwargs
362
- )
2138
+
2139
+ # Set up transaction handling
2140
+ drop_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2141
+ kwargs["transaction"] = drop_transaction
2142
+
2143
+ try:
2144
+ if not table_version:
2145
+ _get_storage(**kwargs).delete_table(
2146
+ namespace=namespace,
2147
+ table_name=table,
2148
+ purge=purge,
2149
+ *args,
2150
+ **kwargs,
2151
+ )
2152
+ else:
2153
+ _get_storage(**kwargs).update_table_version(
2154
+ namespace=namespace,
2155
+ table_name=table,
2156
+ table_version=table_version,
2157
+ lifecycle_state=LifecycleState.DELETED,
2158
+ *args,
2159
+ **kwargs,
2160
+ )
2161
+
2162
+ except Exception as e:
2163
+ # If any error occurs, the transaction remains uncommitted
2164
+ commit_transaction = False
2165
+ logger.error(f"Error during drop_table: {e}")
2166
+ raise
2167
+ finally:
2168
+ if commit_transaction:
2169
+ # Seal the interactive transaction to commit all operations atomically
2170
+ drop_transaction.seal()
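A usage sketch for drop_table under the same top-level re-export assumption; dropping a single version marks it DELETED, while omitting table_version drops the parent table:

    import deltacat as dc

    # Retire a single version (the version identifier shown is hypothetical).
    dc.drop_table("raw_uploads", namespace="ingest", table_version="1")
    # Or drop the parent table and all of its versions.
    dc.drop_table("raw_uploads", namespace="ingest")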
363
2171
 
364
2172
 
365
- def refresh_table(table: str, *args, namespace: Optional[str] = None, **kwargs) -> None:
2173
+ def refresh_table(
2174
+ table: str,
2175
+ *args,
2176
+ namespace: Optional[str] = None,
2177
+ table_version: Optional[str] = None,
2178
+ transaction: Optional[Transaction] = None,
2179
+ **kwargs,
2180
+ ) -> None:
366
2181
  """Refresh metadata cached on the Ray cluster for the given table.
367
2182
 
368
2183
  Args:
369
2184
  table: Name of the table to refresh.
370
2185
  namespace: Optional namespace of the table. Uses default namespace if not specified.
2186
+ table_version: Optional specific version of the table to refresh.
2187
+ transaction: Optional transaction to use. If None, creates a new transaction.
371
2188
 
372
2189
  Returns:
373
2190
  None
@@ -376,32 +2193,79 @@ def refresh_table(table: str, *args, namespace: Optional[str] = None, **kwargs)
376
2193
 
377
2194
 
378
2195
  def list_tables(
379
- *args, namespace: Optional[str] = None, **kwargs
2196
+ *args,
2197
+ namespace: Optional[str] = None,
2198
+ table: Optional[str] = None,
2199
+ transaction: Optional[Transaction] = None,
2200
+ **kwargs,
380
2201
  ) -> ListResult[TableDefinition]:
381
2202
  """List a page of table definitions.
382
2203
 
383
2204
  Args:
384
2205
  namespace: Optional namespace to list tables from. Uses default namespace if not specified.
2206
+ table: Optional table name whose table versions should be listed. If not specified, lists the latest active version of each table in the namespace.
2207
+ transaction: Optional transaction to use for reading. If provided, will see uncommitted changes.
385
2208
 
386
2209
  Returns:
387
2210
  ListResult containing TableDefinition objects for tables in the namespace.
388
2211
  """
389
2212
  namespace = namespace or default_namespace()
390
- tables = _get_storage(**kwargs).list_tables(*args, namespace=namespace, **kwargs)
391
- table_definitions = [
392
- get_table(*args, table.table_name, namespace, **kwargs)
393
- for table in tables.all_items()
394
- ]
395
2213
 
396
- return ListResult(items=table_definitions)
2214
+ # Set up transaction handling
2215
+ list_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2216
+ kwargs["transaction"] = list_transaction
2217
+
2218
+ try:
2219
+ if not table:
2220
+ tables = _get_storage(**kwargs).list_tables(
2221
+ namespace=namespace,
2222
+ *args,
2223
+ **kwargs,
2224
+ )
2225
+ table_definitions = [
2226
+ get_table(table.table_name, namespace=namespace, *args, **kwargs)
2227
+ for table in tables.all_items()
2228
+ ]
2229
+ else:
2230
+ table_versions = _get_storage(**kwargs).list_table_versions(
2231
+ namespace=namespace,
2232
+ table_name=table,
2233
+ *args,
2234
+ **kwargs,
2235
+ )
2236
+ table_definitions = [
2237
+ get_table(
2238
+ table,
2239
+ namespace=namespace,
2240
+ table_version=table_version.id,
2241
+ *args,
2242
+ **kwargs,
2243
+ )
2244
+ for table_version in table_versions.all_items()
2245
+ ]
2246
+
2247
+ result = ListResult(items=table_definitions)
2248
+
2249
+ return result
2250
+
2251
+ except Exception as e:
2252
+ # If any error occurs, the transaction remains uncommitted
2253
+ commit_transaction = False
2254
+ logger.error(f"Error during list_tables: {e}")
2255
+ raise
2256
+ finally:
2257
+ if commit_transaction:
2258
+ # Seal the interactive transaction to commit all operations atomically
2259
+ list_transaction.seal()
397
2260
 
398
2261
 
399
2262
  def get_table(
400
- name: str,
2263
+ table: str,
401
2264
  *args,
402
2265
  namespace: Optional[str] = None,
403
2266
  table_version: Optional[str] = None,
404
2267
  stream_format: StreamFormat = StreamFormat.DELTACAT,
2268
+ transaction: Optional[Transaction] = None,
405
2269
  **kwargs,
406
2270
  ) -> Optional[TableDefinition]:
407
2271
  """Get table definition metadata.
@@ -409,64 +2273,84 @@ def get_table(
409
2273
  Args:
410
2274
  name: Name of the table to retrieve.
411
2275
  namespace: Optional namespace of the table. Uses default namespace if not specified.
412
- table_version: Optional specific version of the table to retrieve.
413
- If not specified, the latest version is used.
414
- stream_format: Optional stream format to retrieve. Uses the default Deltacat stream
415
- format if not specified.
2276
+ table_version: Optional specific version of the table to retrieve. Defaults to the latest active version.
2277
+ stream_format: Optional stream format to retrieve. Defaults to DELTACAT.
2278
+ transaction: Optional transaction to use. If None, creates a new transaction.
416
2279
 
417
2280
  Returns:
418
- Deltacat TableDefinition if the table exists, None otherwise.
419
-
420
- Raises:
421
- TableVersionNotFoundError: If the table version does not exist.
422
- StreamNotFoundError: If the stream does not exist.
2281
+ Deltacat TableDefinition if the table exists, None otherwise. The table definition's table version will be
2282
+ None if the requested version is not found. The table definition's stream will be None if the requested stream
2283
+ format is not found.
423
2284
  """
424
2285
  namespace = namespace or default_namespace()
425
- table: Optional[Table] = _get_storage(**kwargs).get_table(
426
- *args, table_name=name, namespace=namespace, **kwargs
427
- )
428
2286
 
429
- if table is None:
430
- return None
431
-
432
- table_version: Optional[TableVersion] = _get_storage(**kwargs).get_table_version(
433
- *args, namespace, name, table_version or table.latest_table_version, **kwargs
434
- )
2287
+ # Set up transaction handling
2288
+ get_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2289
+ kwargs["transaction"] = get_transaction
435
2290
 
436
- if table_version is None:
437
- raise TableVersionNotFoundError(
438
- f"TableVersion {namespace}.{name}.{table_version} does not exist."
2291
+ try:
2292
+ table_obj: Optional[Table] = _get_storage(**kwargs).get_table(
2293
+ table_name=table,
2294
+ namespace=namespace,
2295
+ *args,
2296
+ **kwargs,
439
2297
  )
440
2298
 
441
- stream = _get_storage(**kwargs).get_stream(
442
- *args,
443
- namespace=namespace,
444
- table_name=name,
445
- table_version=table_version.id,
446
- stream_format=stream_format,
447
- **kwargs,
448
- )
2299
+ if table_obj is None:
2300
+ return None
449
2301
 
450
- if stream is None:
451
- raise StreamNotFoundError(
452
- f"Stream {namespace}.{table}.{table_version}.{stream} does not exist."
2302
+ table_version_obj: Optional[TableVersion] = _get_storage(
2303
+ **kwargs
2304
+ ).get_table_version(
2305
+ namespace,
2306
+ table,
2307
+ table_version or table_obj.latest_active_table_version,
2308
+ *args,
2309
+ **kwargs,
453
2310
  )
454
2311
 
455
- return TableDefinition.of(
456
- table=table,
457
- table_version=table_version,
458
- stream=stream,
459
- )
2312
+ stream = None
2313
+ if table_version_obj:
2314
+ stream = _get_storage(**kwargs).get_stream(
2315
+ namespace=namespace,
2316
+ table_name=table,
2317
+ table_version=table_version_obj.id,
2318
+ stream_format=stream_format,
2319
+ *args,
2320
+ **kwargs,
2321
+ )
2322
+
2323
+ return TableDefinition.of(
2324
+ table=table_obj,
2325
+ table_version=table_version_obj,
2326
+ stream=stream,
2327
+ )
2328
+ except Exception as e:
2329
+ # If any error occurs, the transaction remains uncommitted
2330
+ commit_transaction = False
2331
+ logger.error(f"Error during get_table: {e}")
2332
+ raise
2333
+ finally:
2334
+ if commit_transaction:
2335
+ # Seal the interactive transaction to commit all operations atomically
2336
+ get_transaction.seal()
460
2337
 
461
2338
 
462
2339
  def truncate_table(
463
- table: str, *args, namespace: Optional[str] = None, **kwargs
2340
+ table: str,
2341
+ *args,
2342
+ namespace: Optional[str] = None,
2343
+ table_version: Optional[str] = None,
2344
+ transaction: Optional[Transaction] = None,
2345
+ **kwargs,
464
2346
  ) -> None:
465
2347
  """Truncate table data.
466
2348
 
467
2349
  Args:
468
2350
  table: Name of the table to truncate.
469
2351
  namespace: Optional namespace of the table. Uses default namespace if not specified.
2352
+ table_version: Optional specific version of the table to truncate. Defaults to the latest active version.
2353
+ transaction: Optional transaction to use. If None, creates a new transaction.
470
2354
 
471
2355
  Returns:
472
2356
  None
@@ -475,7 +2359,12 @@ def truncate_table(
475
2359
 
476
2360
 
477
2361
  def rename_table(
478
- table: str, new_name: str, *args, namespace: Optional[str] = None, **kwargs
2362
+ table: str,
2363
+ new_name: str,
2364
+ *args,
2365
+ namespace: Optional[str] = None,
2366
+ transaction: Optional[Transaction] = None,
2367
+ **kwargs,
479
2368
  ) -> None:
480
2369
  """Rename an existing table.
481
2370
 
@@ -483,6 +2372,7 @@ def rename_table(
483
2372
  table: Current name of the table.
484
2373
  new_name: New name for the table.
485
2374
  namespace: Optional namespace of the table. Uses default namespace if not specified.
2375
+ transaction: Optional transaction to use. If None, creates a new transaction.
486
2376
 
487
2377
  Returns:
488
2378
  None
@@ -491,71 +2381,219 @@ def rename_table(
491
2381
  TableNotFoundError: If the table does not exist.
492
2382
  """
493
2383
  namespace = namespace or default_namespace()
494
- _get_storage(**kwargs).update_table(
495
- *args, table_name=table, new_table_name=new_name, namespace=namespace, **kwargs
496
- )
497
2384
 
2385
+ # Set up transaction handling
2386
+ rename_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2387
+ kwargs["transaction"] = rename_transaction
2388
+
2389
+ try:
2390
+ _get_storage(**kwargs).update_table(
2391
+ table_name=table,
2392
+ new_table_name=new_name,
2393
+ namespace=namespace,
2394
+ *args,
2395
+ **kwargs,
2396
+ )
2397
+
2398
+ except Exception as e:
2399
+ # If any error occurs, the transaction remains uncommitted
2400
+ commit_transaction = False
2401
+ logger.error(f"Error during rename_table: {e}")
2402
+ raise
2403
+ finally:
2404
+ if commit_transaction:
2405
+ # Seal the interactive transaction to commit all operations atomically
2406
+ rename_transaction.seal()
498
2407
 
499
- def table_exists(table: str, *args, namespace: Optional[str] = None, **kwargs) -> bool:
2408
+
2409
+ def table_exists(
2410
+ table: str,
2411
+ *args,
2412
+ namespace: Optional[str] = None,
2413
+ table_version: Optional[str] = None,
2414
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
2415
+ transaction: Optional[Transaction] = None,
2416
+ **kwargs,
2417
+ ) -> bool:
500
2418
  """Check if a table exists in the catalog.
501
2419
 
502
2420
  Args:
503
2421
  table: Name of the table to check.
504
2422
  namespace: Optional namespace of the table. Uses default namespace if not specified.
2423
+ table_version: Optional specific version of the table to check. Defaults to the latest active version.
2424
+ stream_format: Optional stream format to check. Defaults to DELTACAT.
2425
+ transaction: Optional transaction to use. If None, creates a new transaction.
505
2426
 
506
2427
  Returns:
507
2428
  True if the table exists, False otherwise.
508
2429
  """
509
2430
  namespace = namespace or default_namespace()
510
- return _get_storage(**kwargs).table_exists(
511
- *args, table_name=table, namespace=namespace, **kwargs
512
- )
513
2431
 
2432
+ # Set up transaction handling
2433
+ exists_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2434
+ kwargs["transaction"] = exists_transaction
514
2435
 
515
- def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
2436
+ try:
2437
+ table_obj = _get_storage(**kwargs).get_table(
2438
+ namespace=namespace,
2439
+ table_name=table,
2440
+ *args,
2441
+ **kwargs,
2442
+ )
2443
+ if table_obj is None:
2444
+ return False
2445
+ table_version = table_version or table_obj.latest_active_table_version
2446
+ if not table_version:
2447
+ return False
2448
+ table_version_exists = _get_storage(**kwargs).table_version_exists(
2449
+ namespace,
2450
+ table,
2451
+ table_version,
2452
+ *args,
2453
+ **kwargs,
2454
+ )
2455
+ if not table_version_exists:
2456
+ return False
2457
+ stream_exists = _get_storage(**kwargs).stream_exists(
2458
+ namespace=namespace,
2459
+ table_name=table,
2460
+ table_version=table_version,
2461
+ stream_format=stream_format,
2462
+ *args,
2463
+ **kwargs,
2464
+ )
2465
+ return stream_exists
2466
+ except Exception as e:
2467
+ # If any error occurs, the transaction remains uncommitted
2468
+ commit_transaction = False
2469
+ logger.error(f"Error during table_exists: {e}")
2470
+ raise
2471
+ finally:
2472
+ if commit_transaction:
2473
+ # Seal the interactive transaction to commit all operations atomically
2474
+ exists_transaction.seal()
2475
+
2476
+
2477
+ def list_namespaces(
2478
+ *args,
2479
+ transaction: Optional[Transaction] = None,
2480
+ **kwargs,
2481
+ ) -> ListResult[Namespace]:
516
2482
  """List a page of table namespaces.
517
2483
 
518
2484
  Args:
519
- catalog: Catalog properties instance.
2485
+ transaction: Optional transaction to use. If None, creates a new transaction.
520
2486
 
521
2487
  Returns:
522
2488
  ListResult containing Namespace objects.
523
2489
  """
524
- return _get_storage(**kwargs).list_namespaces(*args, **kwargs)
2490
+ # Set up transaction handling
2491
+ list_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2492
+ kwargs["transaction"] = list_transaction
2493
+
2494
+ try:
2495
+ result = _get_storage(**kwargs).list_namespaces(*args, **kwargs)
2496
+
2497
+ return result
525
2498
 
2499
+ except Exception as e:
2500
+ # If any error occurs, the transaction remains uncommitted
2501
+ commit_transaction = False
2502
+ logger.error(f"Error during list_namespaces: {e}")
2503
+ raise
2504
+ finally:
2505
+ if commit_transaction:
2506
+ # Seal the interactive transaction to commit all operations atomically
2507
+ list_transaction.seal()
526
2508
 
527
- def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
2509
+
2510
+ def get_namespace(
2511
+ namespace: str,
2512
+ *args,
2513
+ transaction: Optional[Transaction] = None,
2514
+ **kwargs,
2515
+ ) -> Optional[Namespace]:
528
2516
  """Get metadata for a specific table namespace.
529
2517
 
530
2518
  Args:
531
2519
  namespace: Name of the namespace to retrieve.
2520
+ transaction: Optional transaction to use. If None, creates a new transaction.
532
2521
 
533
2522
  Returns:
534
2523
  Namespace object if the namespace exists, None otherwise.
535
2524
  """
536
- return _get_storage(**kwargs).get_namespace(*args, namespace=namespace, **kwargs)
2525
+ # Set up transaction handling
2526
+ get_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2527
+ kwargs["transaction"] = get_ns_transaction
2528
+
2529
+ try:
2530
+ result = _get_storage(**kwargs).get_namespace(
2531
+ *args, namespace=namespace, **kwargs
2532
+ )
2533
+
2534
+ return result
2535
+
2536
+ except Exception as e:
2537
+ # If any error occurs, the transaction remains uncommitted
2538
+ commit_transaction = False
2539
+ logger.error(f"Error during get_namespace: {e}")
2540
+ raise
2541
+ finally:
2542
+ if commit_transaction:
2543
+ # Seal the interactive transaction to commit all operations atomically
2544
+ get_ns_transaction.seal()
537
2545
 
538
2546
 
539
- def namespace_exists(namespace: str, *args, **kwargs) -> bool:
2547
+ def namespace_exists(
2548
+ namespace: str,
2549
+ *args,
2550
+ transaction: Optional[Transaction] = None,
2551
+ **kwargs,
2552
+ ) -> bool:
540
2553
  """Check if a namespace exists.
541
2554
 
542
2555
  Args:
543
2556
  namespace: Name of the namespace to check.
2557
+ transaction: Optional transaction to use for reading. If provided, will see uncommitted changes.
544
2558
 
545
2559
  Returns:
546
2560
  True if the namespace exists, False otherwise.
547
2561
  """
548
- return _get_storage(**kwargs).namespace_exists(*args, namespace=namespace, **kwargs)
2562
+ # Set up transaction handling
2563
+ exists_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2564
+ kwargs["transaction"] = exists_transaction
2565
+
2566
+ try:
2567
+ result = _get_storage(**kwargs).namespace_exists(
2568
+ *args, namespace=namespace, **kwargs
2569
+ )
2570
+
2571
+ return result
2572
+
2573
+ except Exception as e:
2574
+ # If any error occurs, the transaction remains uncommitted
2575
+ commit_transaction = False
2576
+ logger.error(f"Error during namespace_exists: {e}")
2577
+ raise
2578
+ finally:
2579
+ if commit_transaction:
2580
+ # Seal the interactive transaction to commit all operations atomically
2581
+ exists_transaction.seal()
549
2582
 
550
2583
 
551
2584
 def create_namespace(
-    namespace: str, *args, properties: Optional[NamespaceProperties] = None, **kwargs
+    namespace: str,
+    *args,
+    properties: Optional[NamespaceProperties] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> Namespace:
     """Create a new namespace.

     Args:
         namespace: Name of the namespace to create.
         properties: Optional properties for the namespace.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         Created Namespace object.
@@ -563,12 +2601,29 @@ def create_namespace(
     Raises:
         NamespaceAlreadyExistsError: If the namespace already exists.
     """
-    if namespace_exists(namespace, **kwargs):
-        raise NamespaceAlreadyExistsError(f"Namespace {namespace} already exists")
+    # Set up transaction handling
+    namespace_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = namespace_transaction

-    return _get_storage(**kwargs).create_namespace(
-        *args, namespace=namespace, properties=properties, **kwargs
-    )
+    try:
+        if namespace_exists(namespace, **kwargs):
+            raise NamespaceAlreadyExistsError(f"Namespace {namespace} already exists")
+
+        result = _get_storage(**kwargs).create_namespace(
+            *args, namespace=namespace, properties=properties, **kwargs
+        )
+
+        return result
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during create_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            namespace_transaction.seal()


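Because create_namespace injects its transaction into kwargs before re-checking namespace_exists, the existence check and the create run against the same in-flight transaction, so the check can see writes that are staged but not yet sealed. A toy, in-memory illustration of that read-your-writes behavior follows; ToyTransaction and ToyCatalog are hypothetical stand-ins, not deltacat types.

# Toy illustration of check-then-create running inside one transaction.
# ToyTransaction / ToyCatalog are hypothetical, in-memory stand-ins.
from typing import Set


class ToyTransaction:
    def __init__(self, catalog: "ToyCatalog") -> None:
        self.catalog = catalog
        self.staged: Set[str] = set()

    def namespace_exists(self, name: str) -> bool:
        # Reads through the transaction see both committed and staged namespaces.
        return name in self.catalog.committed or name in self.staged

    def create_namespace(self, name: str) -> None:
        if self.namespace_exists(name):
            raise ValueError(f"Namespace {name} already exists")
        self.staged.add(name)

    def seal(self) -> None:
        # Commit all staged operations atomically.
        self.catalog.committed |= self.staged
        self.staged.clear()


class ToyCatalog:
    def __init__(self) -> None:
        self.committed: Set[str] = set()


catalog = ToyCatalog()
txn = ToyTransaction(catalog)
txn.create_namespace("analytics")
assert txn.namespace_exists("analytics")      # read-your-writes inside the txn
assert "analytics" not in catalog.committed   # nothing visible until seal()
txn.seal()
assert "analytics" in catalog.committed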
 def alter_namespace(
@@ -576,6 +2631,7 @@ def alter_namespace(
     *args,
     properties: Optional[NamespaceProperties] = None,
     new_namespace: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """Alter a namespace definition.
@@ -584,26 +2640,49 @@ def alter_namespace(
         namespace: Name of the namespace to alter.
         properties: Optional new properties for the namespace.
         new_namespace: Optional new name for the namespace.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None
     """
-    _get_storage(**kwargs).update_namespace(
-        namespace=namespace,
-        properties=properties,
-        new_namespace=new_namespace,
-        *args,
-        **kwargs,
-    )
+    # Set up transaction handling
+    alter_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = alter_ns_transaction
+
+    try:
+        _get_storage(**kwargs).update_namespace(
+            namespace=namespace,
+            properties=properties,
+            new_namespace=new_namespace,
+            *args,
+            **kwargs,
+        )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during alter_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            alter_ns_transaction.seal()


-def drop_namespace(namespace: str, *args, purge: bool = False, **kwargs) -> None:
+def drop_namespace(
+    namespace: str,
+    *args,
+    purge: bool = False,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> None:
     """Drop a namespace and all of its tables from the catalog.

     Args:
         namespace: Name of the namespace to drop.
-        purge: If True, permanently delete all tables in the namespace.
-            If False, only remove from catalog.
+        purge: If True, permanently delete all table data in the namespace.
+            If False, only removes the namespace from the catalog.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None
@@ -613,50 +2692,39 @@ def drop_namespace(namespace: str, *args, purge: bool = False, **kwargs) -> None
     if purge:
         raise NotImplementedError("Purge flag is not currently supported.")

-    _get_storage(**kwargs).delete_namespace(
-        *args, namespace=namespace, purge=purge, **kwargs
-    )
+    # Set up transaction handling
+    drop_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = drop_ns_transaction
+
+    try:
+        _get_storage(**kwargs).delete_namespace(
+            *args,
+            namespace=namespace,
+            purge=purge,
+            **kwargs,
+        )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during drop_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            drop_ns_transaction.seal()


 def default_namespace(*args, **kwargs) -> str:
     """Return the default namespace for the catalog.

     Returns:
-        String name of the default namespace.
+        Name of the default namespace.
     """
-    return DEFAULT_NAMESPACE  # table functions
-
-
-def _validate_read_table_args(
-    namespace: Optional[str] = None,
-    table_type: Optional[TableType] = None,
-    distributed_dataset_type: Optional[DistributedDatasetType] = None,
-    merge_on_read: Optional[bool] = None,
-    **kwargs,
-):
-    storage = _get_storage(**kwargs)
-    if storage is None:
-        raise ValueError(
-            "Catalog not initialized. Did you miss calling "
-            "initialize(ds=<deltacat_storage>)?"
-        )
-
-    if merge_on_read:
-        raise ValueError("Merge on read not supported currently.")
-
-    if table_type is not TableType.PYARROW:
-        raise ValueError("Only PYARROW table type is supported as of now")
-
-    if distributed_dataset_type is not DistributedDatasetType.DAFT:
-        raise ValueError("Only DAFT dataset type is supported as of now")
+    return DEFAULT_NAMESPACE

-    if namespace is None:
-        raise ValueError(
-            "namespace must be passed to uniquely identify a table in the catalog."
-        )

-
-def _get_latest_or_given_table_version(
+def _get_latest_active_or_given_table_version(
     namespace: str,
     table_name: str,
     table_version: Optional[str] = None,
@@ -665,9 +2733,16 @@ def _get_latest_or_given_table_version(
 ) -> TableVersion:
     table_version_obj = None
     if table_version is None:
-        table_version_obj = _get_storage(**kwargs).get_latest_table_version(
-            namespace=namespace, table_name=table_name, *args, **kwargs
+        table_version_obj = _get_storage(**kwargs).get_latest_active_table_version(
+            namespace=namespace,
+            table_name=table_name,
+            *args,
+            **kwargs,
         )
+        if table_version_obj is None:
+            raise TableVersionNotFoundError(
+                f"No active table version found for table {namespace}.{table_name}"
+            )
         table_version = table_version_obj.table_version
     else:
         table_version_obj = _get_storage(**kwargs).get_table_version(
@@ -681,18 +2756,82 @@ def _get_latest_or_given_table_version(
     return table_version_obj


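The renamed helper above now prefers the latest active table version when no explicit version is requested, and raises TableVersionNotFoundError when the table has no active version. A condensed, standalone sketch of that resolution order follows; StubVersion and StubStorage are hypothetical stand-ins for deltacat's storage objects, and the not-found handling of the explicit-version branch is outside the lines shown in this hunk.

# Condensed sketch of "latest active or explicitly requested" resolution.
# StubVersion and StubStorage are hypothetical stand-ins.
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class StubVersion:
    name: str
    active: bool


class StubStorage:
    def __init__(self, versions: List[StubVersion]) -> None:
        self.versions: Dict[str, StubVersion] = {v.name: v for v in versions}

    def latest_active(self) -> Optional[StubVersion]:
        active = [v for v in self.versions.values() if v.active]
        return max(active, key=lambda v: v.name) if active else None

    def get(self, name: str) -> Optional[StubVersion]:
        return self.versions.get(name)


def resolve(storage: StubStorage, requested: Optional[str] = None) -> Optional[StubVersion]:
    if requested is None:
        version = storage.latest_active()
        if version is None:
            # Mirrors the TableVersionNotFoundError raised above.
            raise LookupError("no active table version")
        return version
    # The explicit-version branch simply fetches the requested version here.
    return storage.get(requested)


storage = StubStorage([StubVersion("1", active=False), StubVersion("2", active=True)])
assert resolve(storage).name == "2"
assert resolve(storage, "1").name == "1"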
+def _get_all_committed_partitions(
+    table: str,
+    namespace: str,
+    table_version: str,
+    **kwargs,
+) -> List[Union[Partition, PartitionLocator]]:
+    """Get all committed partitions for a table and validate uniqueness."""
+    logger.info(
+        f"Reading all partitions metadata in the table={table} "
+        "as partition_filter was None."
+    )
+
+    all_partitions = (
+        _get_storage(**kwargs)
+        .list_partitions(
+            table_name=table,
+            namespace=namespace,
+            table_version=table_version,
+            **kwargs,
+        )
+        .all_items()
+    )
+
+    committed_partitions = [
+        partition
+        for partition in all_partitions
+        if partition.state == CommitState.COMMITTED
+    ]
+
+    logger.info(
+        f"Found {len(committed_partitions)} committed partitions for "
+        f"table={namespace}/{table}/{table_version}"
+    )
+
+    _validate_partition_uniqueness(
+        committed_partitions, namespace, table, table_version
+    )
+    return committed_partitions
+
+
+def _validate_partition_uniqueness(
+    partitions: List[Partition], namespace: str, table: str, table_version: str
+) -> None:
+    """Validate that there are no duplicate committed partitions for the same partition values."""
+    commit_count_per_partition_value = defaultdict(int)
+    for partition in partitions:
+        # Normalize partition values: both None and [] represent unpartitioned data
+        normalized_values = (
+            None
+            if (
+                partition.partition_values is None
+                or (
+                    isinstance(partition.partition_values, list)
+                    and len(partition.partition_values) == 0
+                )
+            )
+            else partition.partition_values
+        )
+        commit_count_per_partition_value[normalized_values] += 1
+
+    # Check for multiple committed partitions for the same partition values
+    for partition_values, commit_count in commit_count_per_partition_value.items():
+        if commit_count > 1:
+            raise RuntimeError(
+                f"Multiple committed partitions found for table={namespace}/{table}/{table_version}. "
+                f"Partition values: {partition_values}. Commit count: {commit_count}. "
+                f"This should not happen."
+            )
+
+
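The uniqueness check above treats None and an empty list of partition values as the same (unpartitioned) key before counting commits per key. The small self-contained sketch below shows that normalization and counting; note that list-valued partition values must be converted to a hashable form before they can serve as dictionary keys, which the sketch does explicitly with tuples.

# Sketch of normalizing partition values and counting committed partitions per key.
from collections import defaultdict
from typing import Any, List, Optional, Tuple


def normalize(partition_values: Optional[List[Any]]) -> Optional[Tuple[Any, ...]]:
    # None and [] both mean "unpartitioned"; lists become tuples so they can be dict keys.
    if partition_values is None or len(partition_values) == 0:
        return None
    return tuple(partition_values)


def assert_unique_committed(partition_values_per_commit: List[Optional[List[Any]]]) -> None:
    counts = defaultdict(int)
    for values in partition_values_per_commit:
        counts[normalize(values)] += 1
    duplicates = {key: count for key, count in counts.items() if count > 1}
    if duplicates:
        raise RuntimeError(f"Multiple committed partitions per key: {duplicates}")


assert_unique_committed([None, ["2024", "us"], ["2024", "eu"]])   # distinct keys: ok
try:
    assert_unique_committed([None, []])  # None and [] collide: both unpartitioned
except RuntimeError as error:
    print(error)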
 def _get_deltas_from_partition_filter(
     partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
-    stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
     *args,
     **kwargs,
 ):
-
     result_deltas = []
-    start_stream_position, end_stream_position = stream_position_range_inclusive or (
-        None,
-        None,
-    )
     for partition_like in partition_filter:
         deltas = (
             _get_storage(**kwargs)
@@ -700,26 +2839,33 @@ def _get_deltas_from_partition_filter(
                 partition_like=partition_like,
                 ascending_order=True,
                 include_manifest=True,
-                start_stream_position=start_stream_position,
-                last_stream_position=end_stream_position,
                 *args,
                 **kwargs,
             )
             .all_items()
         )

-        for delta in deltas:
-            if (
-                start_stream_position is None
-                or delta.stream_position >= start_stream_position
-            ) and (
-                end_stream_position is None
-                or delta.stream_position <= end_stream_position
-            ):
-                if delta.type == DeltaType.DELETE:
-                    raise ValueError("DELETE type deltas are not supported")
-                result_deltas.append(delta)
-
+        # Validate that all qualified deltas are append type - merge-on-read not yet implemented
+        # TODO(pdames): Run compaction minus materialize for MoR of each partition.
+        if deltas:
+            non_append_deltas = []
+            for delta in deltas:
+                if delta.type != DeltaType.APPEND:
+                    non_append_deltas.append(delta)
+                else:
+                    result_deltas.append(delta)
+            if non_append_deltas:
+                delta_types = {delta.type for delta in non_append_deltas}
+                delta_info = [
+                    (str(delta.locator), delta.type) for delta in non_append_deltas[:5]
+                ]  # Show first 5
+                raise NotImplementedError(
+                    f"Merge-on-read is not yet implemented. Found {len(non_append_deltas)} non-append deltas "
+                    f"with types {delta_types}. All deltas must be APPEND type for read operations. "
+                    f"Examples: {delta_info}. Please run compaction first to merge non-append deltas."
+                )
+
+            logger.info(f"Validated {len(deltas)} qualified deltas are all APPEND type")
     return result_deltas

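The rewritten loop above replaces the old stream-position filter with an append-only guard: every delta for a partition must be an APPEND, and anything else aborts the read with a pointer to run compaction first. A standalone sketch of that split into append vs. non-append deltas follows; DeltaKind and ToyDelta are hypothetical stand-ins for deltacat's DeltaType and Delta.

# Sketch of the append-only guard applied to each partition's deltas.
from dataclasses import dataclass
from enum import Enum
from typing import List


class DeltaKind(Enum):
    APPEND = "append"
    UPSERT = "upsert"
    DELETE = "delete"


@dataclass
class ToyDelta:
    locator: str
    kind: DeltaKind


def collect_append_only(deltas: List[ToyDelta]) -> List[ToyDelta]:
    appends = [d for d in deltas if d.kind == DeltaKind.APPEND]
    non_appends = [d for d in deltas if d.kind != DeltaKind.APPEND]
    if non_appends:
        kinds = {d.kind for d in non_appends}
        raise NotImplementedError(
            f"Merge-on-read not implemented: {len(non_appends)} non-append deltas "
            f"with kinds {kinds}; run compaction first."
        )
    return appends


print(collect_append_only([ToyDelta("d1", DeltaKind.APPEND)]))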