deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,39 @@
+ import logging
  import uuid
+ import posixpath
+ import pyarrow

  from typing import Any, Callable, Dict, List, Optional, Union, Tuple

- from deltacat.catalog import get_catalog_properties
- from deltacat.constants import DEFAULT_TABLE_VERSION
- from deltacat.exceptions import TableNotFoundError
+ from deltacat.catalog.model.properties import get_catalog_properties
+ from deltacat.constants import (
+ DEFAULT_TABLE_VERSION,
+ DATA_FILE_DIR_NAME,
+ )
+ from deltacat.exceptions import (
+ TableNotFoundError,
+ TableVersionNotFoundError,
+ DeltaCatError,
+ UnclassifiedDeltaCatError,
+ SchemaValidationError,
+ StreamNotFoundError,
+ PartitionNotFoundError,
+ DeltaNotFoundError,
+ NamespaceNotFoundError,
+ TableValidationError,
+ ConcurrentModificationError,
+ ObjectAlreadyExistsError,
+ NamespaceAlreadyExistsError,
+ TableAlreadyExistsError,
+ TableVersionAlreadyExistsError,
+ ObjectNotFoundError,
+ )
  from deltacat.storage.model.manifest import (
  EntryParams,
+ EntryType,
  ManifestAuthor,
+ ManifestEntryList,
+ ManifestEntry,
  )
  from deltacat.storage.model.delta import (
  Delta,
@@ -15,13 +41,13 @@ from deltacat.storage.model.delta import (
  DeltaProperties,
  DeltaType,
  )
+ from deltacat.storage.model.transaction import setup_transaction
  from deltacat.storage.model.types import (
  CommitState,
  DistributedDataset,
  LifecycleState,
  LocalDataset,
  LocalTable,
- TransactionType,
  TransactionOperationType,
  StreamFormat,
  )
@@ -36,14 +62,13 @@ from deltacat.storage.model.partition import (
  PartitionLocator,
  PartitionScheme,
  PartitionValues,
+ UNPARTITIONED_SCHEME,
  UNPARTITIONED_SCHEME_ID,
- PartitionLocatorAlias,
- )
- from deltacat.storage.model.schema import (
- Schema,
  )
+ from deltacat.storage.model.schema import Schema
  from deltacat.storage.model.sort_key import (
  SortScheme,
+ UNSORTED_SCHEME,
  )
  from deltacat.storage.model.stream import (
  Stream,
@@ -65,52 +90,95 @@ from deltacat.storage.model.metafile import (
  from deltacat.storage.model.transaction import (
  TransactionOperation,
  Transaction,
- TransactionOperationList,
  )
  from deltacat.storage.model.manifest import Manifest
  from deltacat.types.media import (
  ContentType,
+ DatasetType,
  DistributedDatasetType,
  StorageType,
- TableType,
+ ContentEncoding,
  )
  from deltacat.utils.common import ReadKwargsProvider
+ import pyarrow as pa
+
+ from deltacat.types.tables import (
+ TableProperty,
+ get_table_writer,
+ get_table_slicer,
+ write_sliced_table,
+ download_manifest_entries,
+ download_manifest_entries_distributed,
+ download_manifest_entry,
+ )
+ from deltacat import logs
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def _normalize_partition_values(
+ partition_values: Optional[PartitionValues],
+ ) -> Optional[PartitionValues]:
+ """
+ Normalize partition values to ensure consistent representation of unpartitioned data.
+
+ Both None and empty list [] represent unpartitioned data, but they should be
+ normalized to None for consistent lookup and validation.
+
+ Args:
+ partition_values: The partition values to normalize
+
+ Returns:
+ None for unpartitioned data (both None and [] inputs),
+ original value for partitioned data
+ """
+ if partition_values is None or (
+ isinstance(partition_values, list) and len(partition_values) == 0
+ ):
+ return None
+ return partition_values


  def _list(
  metafile: Metafile,
  txn_op_type: TransactionOperationType,
  *args,
+ transaction: Optional[Transaction] = None,
  **kwargs,
  ) -> ListResult[Metafile]:
  catalog_properties = get_catalog_properties(**kwargs)
  limit = kwargs.get("limit") or None
- transaction = Transaction.of(
- txn_type=TransactionType.READ,
- txn_operations=[
- TransactionOperation.of(
- operation_type=txn_op_type,
- dest_metafile=metafile,
- read_limit=limit,
- )
- ],
- )
- list_results_per_op = transaction.commit(
- catalog_root_dir=catalog_properties.root,
- filesystem=catalog_properties.filesystem,
+
+ operation = TransactionOperation.of(
+ operation_type=txn_op_type,
+ dest_metafile=metafile,
+ read_limit=limit,
  )
- return list_results_per_op[0]
+
+ if transaction is not None:
+ # Add the read operation to the existing transaction and return the result
+ return transaction.step(operation)
+ else:
+ # Create and commit a new transaction (legacy behavior)
+ new_transaction = Transaction.of([operation])
+ list_results_per_op = new_transaction.commit(
+ catalog_root_dir=catalog_properties.root,
+ filesystem=catalog_properties.filesystem,
+ )
+ return list_results_per_op[0]


  def _latest(
  metafile: Metafile,
  *args,
+ transaction: Optional[Transaction] = None,
  **kwargs,
  ) -> Optional[Metafile]:
  list_results = _list(
- *args,
  metafile=metafile,
  txn_op_type=TransactionOperationType.READ_LATEST,
+ transaction=transaction,
+ *args,
  **kwargs,
  )
  results = list_results.all_items()
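The `_normalize_partition_values` helper added in this hunk folds the two spellings of "unpartitioned" into one. A minimal sketch of the documented behavior (hypothetical usage, not part of the diff):

    # Both None and [] mean "unpartitioned" and normalize to None.
    assert _normalize_partition_values(None) is None
    assert _normalize_partition_values([]) is None
    # Non-empty partition values pass through unchanged.
    assert _normalize_partition_values(["2024-01-01"]) == ["2024-01-01"]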
@@ -121,94 +189,38 @@ def _exists(
  metafile: Metafile,
  *args,
  **kwargs,
- ) -> Optional[Metafile]:
+ ) -> Optional[bool]:
  list_results = _list(
- *args,
  metafile=metafile,
  txn_op_type=TransactionOperationType.READ_EXISTS,
+ *args,
  **kwargs,
  )
  results = list_results.all_items()
  return True if results else False


- def _resolve_partition_locator_alias(
- namespace: str,
- table_name: str,
- table_version: Optional[str] = None,
- partition_values: Optional[PartitionValues] = None,
- partition_scheme_id: Optional[str] = None,
- *args,
- **kwargs,
- ) -> PartitionLocatorAlias:
- # TODO(pdames): A read shouldn't initiate N transactions that
- # read against different catalog snapshots. To resolve this, add
- # new "start", "step", and "end" methods to Transaction that
- # support starting a txn, defining and executing a txn op, retrieve
- # its results, then define and execute the next txn op. When
- # stepping through a transaction its txn heartbeat timeout should
- # be set manually.
- partition_locator = None
- if not partition_values:
- partition_scheme_id = UNPARTITIONED_SCHEME_ID
- elif not partition_scheme_id:
- # resolve latest partition scheme from the current
- # revision of its `deltacat` stream
- stream = get_stream(
- *args,
- namespace=namespace,
- table_name=table_name,
- table_version=table_version,
- **kwargs,
- )
- if not stream:
- raise ValueError(
- f"Failed to resolve latest partition scheme for "
- f"`{namespace}.{table_name}` at table version "
- f"`{table_version or 'latest'}` (no stream found)."
- )
- partition_locator = PartitionLocator.of(
- stream_locator=stream.locator,
- partition_values=partition_values,
- partition_id=None,
- )
- partition_scheme_id = stream.partition_scheme.id
- if not partition_locator:
- partition_locator = PartitionLocator.at(
- namespace=namespace,
- table_name=table_name,
- table_version=table_version,
- stream_id=None,
- stream_format=StreamFormat.DELTACAT,
- partition_values=partition_values,
- partition_id=None,
- )
- partition = Partition.of(
- locator=partition_locator,
- schema=None,
- content_types=None,
- partition_scheme_id=partition_scheme_id,
- )
- return partition.locator_alias
-
-
  def _resolve_latest_active_table_version_id(
  namespace: str,
  table_name: str,
- fail_if_no_active_table_version: True,
  *args,
+ fail_if_no_active_table_version: bool = True,
+ transaction: Optional[Transaction] = None,
  **kwargs,
  ) -> Optional[str]:
  table = get_table(
- *args,
  namespace=namespace,
  table_name=table_name,
+ transaction=transaction,
+ *args,
  **kwargs,
  )
  if not table:
- raise ValueError(f"Table does not exist: {namespace}.{table_name}")
+ raise TableNotFoundError(f"Table does not exist: {namespace}.{table_name}")
  if fail_if_no_active_table_version and not table.latest_active_table_version:
- raise ValueError(f"Table has no active table version: {namespace}.{table_name}")
+ raise TableVersionNotFoundError(
+ f"Table has no active table version: {namespace}.{table_name}"
+ )
  return table.latest_active_table_version


@@ -217,30 +229,114 @@ def _resolve_latest_table_version_id(
  table_name: str,
  fail_if_no_active_table_version: True,
  *args,
+ transaction: Optional[Transaction] = None,
  **kwargs,
  ) -> Optional[str]:
  table = get_table(
- *args,
  namespace=namespace,
  table_name=table_name,
+ transaction=transaction,
+ *args,
  **kwargs,
  )
  if not table:
- raise ValueError(f"Table does not exist: {namespace}.{table_name}")
+ raise TableNotFoundError(f"Table does not exist: {namespace}.{table_name}")
  if fail_if_no_active_table_version and not table.latest_table_version:
- raise ValueError(f"Table has no table version: {namespace}.{table_name}")
+ raise TableVersionNotFoundError(
+ f"Table has no table version: {namespace}.{table_name}"
+ )
  return table.latest_table_version


+ def _validate_schemes_against_schema(
+ schema: Optional[Schema],
+ partition_scheme: Optional[PartitionScheme],
+ sort_scheme: Optional[SortScheme],
+ ) -> None:
+ """
+ Validates partition and sort schemes against a schema, ensuring all referenced fields exist.
+ If schema is None, validation is skipped.
+ """
+ if schema is None:
+ return
+
+ schema_fields = set(field.name for field in schema.arrow)
+
+ # Validate partition scheme
+ if partition_scheme is not None and partition_scheme.keys is not None:
+ for key in partition_scheme.keys:
+ if key.key[0] not in schema_fields:
+ raise SchemaValidationError(
+ f"Partition key field '{key.key[0]}' not found in schema"
+ )
+
+ # Validate sort scheme
+ if sort_scheme is not None and sort_scheme.keys is not None:
+ for key in sort_scheme.keys:
+ if key.key[0] not in schema_fields:
+ raise SchemaValidationError(
+ f"Sort key field '{key.key[0]}' not found in schema"
+ )
+
+
+ def _validate_partition_values_against_scheme(
+ partition_values: Optional[PartitionValues],
+ partition_scheme: PartitionScheme,
+ schema: Optional[Schema],
+ ) -> None:
+ """
+ Validates that partition values match the data types of the partition key fields in the schema.
+
+ Args:
+ partition_values: List of partition values to validate
+ partition_scheme: The partition scheme containing the keys to validate against
+ schema: The schema containing the field types to validate against
+
+ Raises:
+ TableValidationError: If validation fails
+ """
+ if not partition_values:
+ raise TableValidationError("Partition values cannot be empty")
+
+ if not schema:
+ raise TableValidationError(
+ "Table version must have a schema to validate partition values"
+ )
+
+ if len(partition_values) != len(partition_scheme.keys):
+ raise TableValidationError(
+ f"Number of partition values ({len(partition_values)}) does not match "
+ f"number of partition keys ({len(partition_scheme.keys)})"
+ )
+
+ # Validate each partition value against its corresponding field type
+ for i in range(len(partition_scheme.keys)):
+ field_type = partition_scheme.keys[i].transform.return_type
+ partition_value = partition_values[i]
+ if field_type is None:
+ # the transform returns the same type as the source schema type
+ # (which also implies that it is a single-key transform)
+ field_type = schema.field(partition_scheme.keys[i].key[0]).arrow.type
+ try:
+ # Try to convert the value to PyArrow to validate its type
+ pa.array([partition_value], type=field_type)
+ # If successful, the type is valid
+ except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError) as e:
+ raise TableValidationError(
+ f"Partition value {partition_value} (type {type(partition_value)}) "
+ f"incompatible with partition transform return type {field_type}"
+ ) from e
+
+
  def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
  """
  Lists a page of table namespaces. Namespaces are returned as list result
  items.
  """
  return _list(
- *args,
  metafile=Namespace.of(NamespaceLocator.of("placeholder")),
  txn_op_type=TransactionOperationType.READ_SIBLINGS,
+ *args,
  **kwargs,
  )
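The new `_validate_partition_values_against_scheme` helper above leans on PyArrow for type checking: a partition value is considered compatible with its partition key when `pa.array([value], type=field_type)` succeeds. A self-contained sketch of that check, with illustrative values and types only:

    import pyarrow as pa

    def _value_fits_type(value, arrow_type: pa.DataType) -> bool:
        # Mirrors the validation above: pa.array raises ArrowInvalid or
        # ArrowTypeError when the value cannot be represented as the target type.
        try:
            pa.array([value], type=arrow_type)
            return True
        except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError):
            return False

    assert _value_fits_type(42, pa.int64())
    assert not _value_fits_type("not-a-number", pa.int64())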
 
@@ -251,12 +347,15 @@ def list_tables(namespace: str, *args, **kwargs) -> ListResult[Table]:
  list result items. Raises an error if the given namespace does not exist.
  """
  locator = TableLocator.at(namespace=namespace, table_name="placeholder")
- return _list(
- *args,
- metafile=Table.of(locator=locator),
- txn_op_type=TransactionOperationType.READ_SIBLINGS,
- **kwargs,
- )
+ try:
+ return _list(
+ metafile=Table.of(locator=locator),
+ txn_op_type=TransactionOperationType.READ_SIBLINGS,
+ *args,
+ **kwargs,
+ )
+ except ObjectNotFoundError as e:
+ raise NamespaceNotFoundError(f"Namespace {namespace} not found") from e


  def list_table_versions(
@@ -279,12 +378,15 @@ def list_table_versions(
  locator=locator,
  schema=None,
  )
- return _list(
- *args,
- metafile=table_version,
- txn_op_type=TransactionOperationType.READ_SIBLINGS,
- **kwargs,
- )
+ try:
+ return _list(
+ metafile=table_version,
+ txn_op_type=TransactionOperationType.READ_SIBLINGS,
+ *args,
+ **kwargs,
+ )
+ except ObjectNotFoundError as e:
+ raise TableNotFoundError(f"Table {namespace}.{table_name} not found") from e


  def list_streams(
@@ -298,6 +400,7 @@ def list_streams(
  Lists a page of streams for the given table version.
  Raises an error if the table version does not exist.
  """
+ # TODO(pdames): Support listing uncommitted streams.
  locator = StreamLocator.at(
  namespace=namespace,
  table_name=table_name,
@@ -309,12 +412,17 @@ def list_streams(
  locator=locator,
  partition_scheme=None,
  )
- return _list(
- stream,
- TransactionOperationType.READ_SIBLINGS,
- *args,
- **kwargs,
- )
+ try:
+ return _list(
+ stream,
+ TransactionOperationType.READ_SIBLINGS,
+ *args,
+ **kwargs,
+ )
+ except ObjectNotFoundError as e:
+ raise TableVersionNotFoundError(
+ f"Table version {namespace}.{table_name}.{table_version} not found"
+ ) from e


  def list_partitions(
@@ -322,6 +430,7 @@ def list_partitions(
  table_name: str,
  table_version: Optional[str] = None,
  *args,
+ transaction: Optional[Transaction] = None,
  **kwargs,
  ) -> ListResult[Partition]:
  """
@@ -330,32 +439,58 @@ def list_partitions(
  table version if not specified. Raises an error if the table version does
  not exist.
  """
- locator = PartitionLocator.at(
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
+ if not namespace:
+ raise ValueError("Namespace cannot be empty.")
+ if not table_name:
+ raise ValueError("Table name cannot be empty.")
+ # resolve default deltacat stream for the given namespace, table name, and table version
+ # TODO(pdames): debug why this doesn't work when only the table_version is provided
+ # and PartitionLocator.stream_format is hard-coded to deltacat (we should be able
+ # to resolve the default deltacat stream automatically)
+ stream = get_stream(
  namespace=namespace,
  table_name=table_name,
  table_version=table_version,
- stream_id=None,
- stream_format=StreamFormat.DELTACAT,
+ transaction=transaction,
+ *args,
+ **kwargs,
+ )
+ if not stream:
+ raise StreamNotFoundError(
+ f"Default stream for {namespace}.{table_name}.{table_version} not found."
+ )
+ locator = PartitionLocator.of(
+ stream_locator=stream.locator,
  partition_values=["placeholder"],
  partition_id="placeholder",
  )
  partition = Partition.of(
  locator=locator,
- schema=None,
  content_types=None,
  )
- return _list(
- *args,
- metafile=partition,
- txn_op_type=TransactionOperationType.READ_SIBLINGS,
- **kwargs,
- )
+ try:
+ result = _list(
+ metafile=partition,
+ txn_op_type=TransactionOperationType.READ_SIBLINGS,
+ transaction=transaction,
+ *args,
+ **kwargs,
+ )
+ except ObjectNotFoundError as e:
+ raise StreamNotFoundError(f"Stream {stream.locator} not found") from e
+
+ if commit_transaction:
+ transaction.seal()
+ return result


  def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partition]:
  """
  Lists all partitions committed to the given stream.
  """
+ # TODO(pdames): Support listing uncommitted partitions.
  if stream.stream_format != StreamFormat.DELTACAT:
  raise ValueError(
  f"Unsupported stream format: {stream.stream_format}"
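The rewritten `list_partitions` above shows the transaction-threading pattern that recurs throughout this file: reuse a caller-supplied transaction when one is passed, otherwise open one locally and seal it before returning. A simplified sketch of that flow; `setup_transaction`'s return shape is inferred from how it is called in this diff, and `make_placeholder_metafile` is a hypothetical stand-in for the metafile each read API builds:

    def read_example(transaction=None, **kwargs):
        # setup_transaction is assumed to return (transaction, commit_transaction),
        # where commit_transaction is True only when a new transaction was created here.
        transaction, commit_transaction = setup_transaction(transaction, **kwargs)
        result = _list(
            metafile=make_placeholder_metafile(),  # hypothetical helper
            txn_op_type=TransactionOperationType.READ_SIBLINGS,
            transaction=transaction,
            **kwargs,
        )
        if commit_transaction:
            # Only the function that opened the transaction seals it.
            transaction.seal()
        return result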
@@ -368,15 +503,17 @@ def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partit
368
503
  )
369
504
  partition = Partition.of(
370
505
  locator=locator,
371
- schema=None,
372
506
  content_types=None,
373
507
  )
374
- return _list(
375
- *args,
376
- metafile=partition,
377
- txn_op_type=TransactionOperationType.READ_SIBLINGS,
378
- **kwargs,
379
- )
508
+ try:
509
+ return _list(
510
+ metafile=partition,
511
+ txn_op_type=TransactionOperationType.READ_SIBLINGS,
512
+ *args,
513
+ **kwargs,
514
+ )
515
+ except ObjectNotFoundError as e:
516
+ raise StreamNotFoundError(f"Stream {stream.locator} not found") from e
380
517
 
381
518
 
382
519
  def list_deltas(
@@ -390,6 +527,7 @@ def list_deltas(
390
527
  include_manifest: bool = False,
391
528
  partition_scheme_id: Optional[str] = None,
392
529
  *args,
530
+ transaction: Optional[Transaction] = None,
393
531
  **kwargs,
394
532
  ) -> ListResult[Delta]:
395
533
  """
@@ -406,21 +544,48 @@ def list_deltas(
406
544
  default. The manifests can either be optionally retrieved as part of this
407
545
  call or lazily loaded via subsequent calls to `get_delta_manifest`.
408
546
  """
547
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
548
+
409
549
  # TODO(pdames): Delta listing should ideally either use an efficient
410
550
  # range-limited dir listing of partition children between start and end
411
551
  # positions, or should traverse using Partition.stream_position (to
412
552
  # resolve last stream position) and Delta.previous_stream_position
413
553
  # (down to first stream position).
414
- partition_locator_alias = _resolve_partition_locator_alias(
415
- *args,
554
+
555
+ # First get the stream to resolve proper table version and stream locator
556
+ stream = get_stream(
416
557
  namespace=namespace,
417
558
  table_name=table_name,
418
559
  table_version=table_version,
560
+ transaction=transaction,
561
+ *args,
562
+ **kwargs,
563
+ )
564
+ if not stream:
565
+ raise StreamNotFoundError(
566
+ f"Failed to resolve stream for "
567
+ f"`{namespace}.{table_name}` at table version "
568
+ f"`{table_version or 'latest'}` (no stream found)."
569
+ )
570
+
571
+ # Then get the actual partition to ensure we have the real partition locator with ID
572
+ partition = get_partition(
573
+ stream_locator=stream.locator,
419
574
  partition_values=partition_values,
420
575
  partition_scheme_id=partition_scheme_id,
576
+ transaction=transaction,
577
+ *args,
421
578
  **kwargs,
422
579
  )
423
- locator = DeltaLocator.of(locator=partition_locator_alias)
580
+ if not partition:
581
+ raise PartitionNotFoundError(
582
+ f"Failed to find partition for stream {stream.locator} "
583
+ f"with partition_values={partition_values} and "
584
+ f"partition_scheme_id={partition_scheme_id}"
585
+ )
586
+
587
+ # Use the actual partition locator (with partition ID) for listing deltas
588
+ locator = DeltaLocator.of(partition_locator=partition.locator)
424
589
  delta = Delta.of(
425
590
  locator=locator,
426
591
  delta_type=None,
@@ -428,20 +593,34 @@ def list_deltas(
428
593
  properties=None,
429
594
  manifest=None,
430
595
  )
431
- all_deltas_list_result: ListResult[Delta] = _list(
432
- *args,
433
- metafile=delta,
434
- txn_op_type=TransactionOperationType.READ_SIBLINGS,
435
- **kwargs,
436
- )
596
+ try:
597
+ all_deltas_list_result: ListResult[Delta] = _list(
598
+ metafile=delta,
599
+ txn_op_type=TransactionOperationType.READ_SIBLINGS,
600
+ transaction=transaction,
601
+ *args,
602
+ **kwargs,
603
+ )
604
+ except ObjectNotFoundError as e:
605
+ raise PartitionNotFoundError(f"Partition {partition.locator} not found") from e
437
606
  all_deltas = all_deltas_list_result.all_items()
438
607
  filtered_deltas = [
439
608
  delta
440
609
  for delta in all_deltas
441
- if first_stream_position <= delta.stream_position <= last_stream_position
610
+ if (
611
+ first_stream_position is None
612
+ or first_stream_position <= delta.stream_position
613
+ )
614
+ and (
615
+ last_stream_position is None
616
+ or delta.stream_position <= last_stream_position
617
+ )
442
618
  ]
443
- if ascending_order:
444
- filtered_deltas.reverse()
619
+ # Sort deltas by stream position in the requested order
620
+ filtered_deltas.sort(reverse=(not ascending_order), key=lambda d: d.stream_position)
621
+
622
+ if commit_transaction:
623
+ transaction.seal()
445
624
  return filtered_deltas
446
625
 
447
626
 
@@ -479,21 +658,37 @@ def list_partition_deltas(
479
658
  properties=None,
480
659
  manifest=None,
481
660
  )
482
- all_deltas_list_result: ListResult[Delta] = _list(
483
- *args,
484
- metafile=delta,
485
- txn_op_type=TransactionOperationType.READ_SIBLINGS,
486
- **kwargs,
487
- )
661
+ try:
662
+ all_deltas_list_result: ListResult[Delta] = _list(
663
+ metafile=delta,
664
+ txn_op_type=TransactionOperationType.READ_SIBLINGS,
665
+ *args,
666
+ **kwargs,
667
+ )
668
+ except ObjectNotFoundError as e:
669
+ raise PartitionNotFoundError(
670
+ f"Partition {partition_like.locator} not found"
671
+ ) from e
488
672
  all_deltas = all_deltas_list_result.all_items()
489
673
  filtered_deltas = [
490
674
  delta
491
675
  for delta in all_deltas
492
- if first_stream_position <= delta.stream_position <= last_stream_position
676
+ if (
677
+ first_stream_position is None
678
+ or first_stream_position <= delta.stream_position
679
+ )
680
+ and (
681
+ last_stream_position is None
682
+ or delta.stream_position <= last_stream_position
683
+ )
493
684
  ]
494
- if ascending_order:
495
- filtered_deltas.reverse()
496
- return filtered_deltas
685
+ # Sort deltas by stream position in the requested order
686
+ filtered_deltas.sort(reverse=(not ascending_order), key=lambda d: d.stream_position)
687
+ return ListResult.of(
688
+ items=filtered_deltas,
689
+ pagination_key=None,
690
+ next_page_provider=None,
691
+ )
497
692
 
498
693
 
499
694
  def get_delta(
@@ -505,6 +700,7 @@ def get_delta(
505
700
  include_manifest: bool = False,
506
701
  partition_scheme_id: Optional[str] = None,
507
702
  *args,
703
+ transaction: Optional[Transaction] = None,
508
704
  **kwargs,
509
705
  ) -> Optional[Delta]:
510
706
  """
@@ -519,18 +715,45 @@ def get_delta(
519
715
  default. The manifest can either be optionally retrieved as part of this
520
716
  call or lazily loaded via a subsequent call to `get_delta_manifest`.
521
717
  """
718
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
719
+
522
720
  # TODO(pdames): Honor `include_manifest` param.
523
- partition_locator_alias = _resolve_partition_locator_alias(
524
- *args,
721
+
722
+ # First get the stream to resolve proper table version and stream locator
723
+ stream = get_stream(
525
724
  namespace=namespace,
526
725
  table_name=table_name,
527
726
  table_version=table_version,
727
+ transaction=transaction,
728
+ *args,
729
+ **kwargs,
730
+ )
731
+ if not stream:
732
+ raise StreamNotFoundError(
733
+ f"Failed to resolve stream for "
734
+ f"`{namespace}.{table_name}` at table version "
735
+ f"`{table_version or 'latest'}` (no stream found)."
736
+ )
737
+
738
+ # Then get the actual partition to ensure we have the real partition locator with ID
739
+ partition = get_partition(
740
+ stream_locator=stream.locator,
528
741
  partition_values=partition_values,
529
742
  partition_scheme_id=partition_scheme_id,
743
+ transaction=transaction,
744
+ *args,
530
745
  **kwargs,
531
746
  )
747
+ if not partition:
748
+ raise PartitionNotFoundError(
749
+ f"Failed to find partition for stream {stream.locator} "
750
+ f"with partition_values={partition_values} and "
751
+ f"partition_scheme_id={partition_scheme_id}"
752
+ )
753
+
754
+ # Use the actual partition locator (with partition ID) for getting the delta
532
755
  locator = DeltaLocator.of(
533
- locator=partition_locator_alias,
756
+ partition_locator=partition.locator,
534
757
  stream_position=stream_position,
535
758
  )
536
759
  delta = Delta.of(
@@ -540,12 +763,22 @@ def get_delta(
540
763
  properties=None,
541
764
  manifest=None,
542
765
  )
543
- return _latest(
544
- *args,
766
+ result = _latest(
545
767
  metafile=delta,
768
+ transaction=transaction,
769
+ *args,
546
770
  **kwargs,
547
771
  )
548
772
 
773
+ # TODO(pdames): Honor the include_manifest parameter during retrieval from _latest, since
774
+ # the point is to avoid loading the manifest into memory if it's not needed.
775
+ if result and not include_manifest:
776
+ result.manifest = None
777
+
778
+ if commit_transaction:
779
+ transaction.seal()
780
+ return result
781
+
549
782
 
550
783
  def get_latest_delta(
551
784
  namespace: str,
@@ -555,6 +788,7 @@ def get_latest_delta(
555
788
  include_manifest: bool = False,
556
789
  partition_scheme_id: Optional[str] = None,
557
790
  *args,
791
+ transaction: Optional[Transaction] = None,
558
792
  **kwargs,
559
793
  ) -> Optional[Delta]:
560
794
  """
@@ -569,19 +803,26 @@ def get_latest_delta(
569
803
  default. The manifest can either be optionally retrieved as part of this
570
804
  call or lazily loaded via a subsequent call to `get_delta_manifest`.
571
805
  """
572
- # TODO(pdames): Wrap this method in 1 single txn.
806
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
807
+
573
808
  stream = get_stream(
574
809
  namespace=namespace,
575
810
  table_name=table_name,
576
811
  table_version=table_version,
812
+ transaction=transaction,
813
+ *args,
814
+ **kwargs,
577
815
  )
578
816
  partition = get_partition(
579
817
  stream_locator=stream.locator,
580
818
  partition_values=partition_values,
581
819
  partition_scheme_id=partition_scheme_id,
820
+ transaction=transaction,
821
+ *args,
822
+ **kwargs,
582
823
  )
583
824
  locator = DeltaLocator.of(
584
- locator=partition.locator,
825
+ partition_locator=partition.locator,
585
826
  stream_position=partition.stream_position,
586
827
  )
587
828
  delta = Delta.of(
@@ -591,53 +832,327 @@ def get_latest_delta(
591
832
  properties=None,
592
833
  manifest=None,
593
834
  )
594
- return _latest(
595
- *args,
835
+ result = _latest(
596
836
  metafile=delta,
837
+ transaction=transaction,
838
+ *args,
839
+ **kwargs,
840
+ )
841
+
842
+ # TODO(pdames): Honor the include_manifest parameter during retrieval from _latest, since
843
+ # the point is to avoid loading the manifest into memory if it's not needed.
844
+ if result and not include_manifest:
845
+ result.manifest = None
846
+
847
+ if commit_transaction:
848
+ transaction.seal()
849
+ return result
850
+
851
+
852
+ def _download_delta_distributed(
853
+ manifest: Manifest,
854
+ table_type: DatasetType = DatasetType.PYARROW,
855
+ max_parallelism: Optional[int] = None,
856
+ column_names: Optional[List[str]] = None,
857
+ include_columns: Optional[List[str]] = None,
858
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
859
+ *args,
860
+ ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
861
+ distributed_dataset_type: Optional[
862
+ DistributedDatasetType
863
+ ] = DistributedDatasetType.RAY_DATASET,
864
+ **kwargs,
865
+ ) -> DistributedDataset:
866
+
867
+ distributed_dataset: DistributedDataset = download_manifest_entries_distributed(
868
+ manifest=manifest,
869
+ table_type=table_type,
870
+ max_parallelism=max_parallelism,
871
+ column_names=column_names,
872
+ include_columns=include_columns,
873
+ file_reader_kwargs_provider=file_reader_kwargs_provider,
874
+ ray_options_provider=ray_options_provider,
875
+ distributed_dataset_type=distributed_dataset_type,
876
+ *args,
877
+ **kwargs,
878
+ )
879
+
880
+ return distributed_dataset
881
+
882
+
883
+ def _download_delta_local(
884
+ manifest: Manifest,
885
+ table_type: DatasetType = DatasetType.PYARROW,
886
+ max_parallelism: Optional[int] = None,
887
+ column_names: Optional[List[str]] = None,
888
+ include_columns: Optional[List[str]] = None,
889
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
890
+ *args,
891
+ **kwargs,
892
+ ) -> LocalDataset:
893
+ tables: LocalDataset = download_manifest_entries(
894
+ manifest,
895
+ table_type,
896
+ max_parallelism if max_parallelism else 1,
897
+ column_names,
898
+ include_columns,
899
+ file_reader_kwargs_provider,
597
900
  **kwargs,
598
901
  )
902
+ return tables
599
903
 
600
904
 
601
905
  def download_delta(
602
906
  delta_like: Union[Delta, DeltaLocator],
603
- table_type: TableType = TableType.PYARROW,
907
+ table_type: DatasetType = DatasetType.PYARROW,
604
908
  storage_type: StorageType = StorageType.DISTRIBUTED,
605
909
  max_parallelism: Optional[int] = None,
606
910
  columns: Optional[List[str]] = None,
607
911
  file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
608
912
  ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
609
913
  distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
914
+ file_path_column: Optional[str] = None,
610
915
  *args,
916
+ transaction: Optional[Transaction] = None,
917
+ all_column_names: Optional[List[str]] = None,
611
918
  **kwargs,
612
919
  ) -> Union[LocalDataset, DistributedDataset]: # type: ignore
613
920
  """
614
- Download the given delta or delta locator into either a list of
921
+ Read the given delta or delta locator into either a list of
615
922
  tables resident in the local node's memory, or into a dataset distributed
616
923
  across this Ray cluster's object store memory. Ordered table N of a local
617
924
  table list, or ordered block N of a distributed dataset, always contain
618
925
  the contents of ordered delta manifest entry N.
619
926
  """
620
- raise NotImplementedError("download_delta not implemented")
927
+ # TODO (pdames): Cast delimited text types to the table's schema types
928
+ # TODO (pdames): Deprecate this method and replace with `read_delta`
929
+ # TODO (pdames): Replace dependence on TableType, StorageType, and DistributedDatasetType
930
+ # with DatasetType
931
+
932
+ # if all column names are provided, then this is a pure manifest entry download (no transaction needed)
933
+ commit_transaction = False
934
+ if not all_column_names:
935
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
936
+
937
+ storage_type_to_download_func = {
938
+ StorageType.LOCAL: _download_delta_local,
939
+ StorageType.DISTRIBUTED: _download_delta_distributed,
940
+ }
941
+
942
+ is_delta = isinstance(delta_like, Delta)
943
+ is_delta_locator = isinstance(delta_like, DeltaLocator)
944
+
945
+ delta_locator: Optional[DeltaLocator] = None
946
+ if is_delta_locator:
947
+ delta_locator = delta_like
948
+ elif is_delta:
949
+ delta_locator = Delta(delta_like).locator
950
+ if not delta_locator:
951
+ raise ValueError(
952
+ f"Expected delta_like to be a Delta or DeltaLocator, but found "
953
+ f"{type(delta_like)}."
954
+ )
955
+
956
+ # Get manifest - if delta_like is a Delta with a manifest, use it, otherwise fetch from storage
957
+ if is_delta and delta_like.manifest:
958
+ manifest = delta_like.manifest
959
+ elif all_column_names:
960
+ raise ValueError(
961
+ "All column names can only be specified with a delta with an inline manifest."
962
+ )
963
+ else:
964
+ manifest = get_delta_manifest(
965
+ delta_locator,
966
+ transaction=transaction,
967
+ *args,
968
+ **kwargs,
969
+ )
970
+ all_column_names = all_column_names or None
971
+ if not all_column_names:
972
+ table_version_schema = get_table_version_schema(
973
+ delta_locator.namespace,
974
+ delta_locator.table_name,
975
+ delta_locator.table_version,
976
+ transaction=transaction,
977
+ *args,
978
+ **kwargs,
979
+ )
980
+ if table_version_schema and table_version_schema.arrow:
981
+ all_column_names = [field.name for field in table_version_schema.arrow]
982
+ if distributed_dataset_type == DatasetType.DAFT:
983
+ # Daft needs the latest table version schema to properly handle schema evolution
984
+ kwargs["table_version_schema"] = table_version_schema.arrow
985
+ elif distributed_dataset_type == DatasetType.DAFT:
986
+ raise ValueError("All column names canot be specified with Daft.")
987
+ if columns:
988
+ # Extract file_path_column since it's appended after reading each file
989
+ columns_to_validate = (
990
+ [col for col in columns if col != file_path_column]
991
+ if file_path_column
992
+ else columns
993
+ )
994
+
995
+ # Only validate columns if we have schema information (all_column_names is not None)
996
+ if all_column_names is not None:
997
+ if not all(
998
+ col in [col_name.lower() for col_name in all_column_names]
999
+ for col in columns_to_validate
1000
+ ):
1001
+ raise SchemaValidationError(
1002
+ f"One or more columns in {columns_to_validate} are not present in table "
1003
+ f"version columns {all_column_names}"
1004
+ )
1005
+ columns = [column.lower() for column in columns]
1006
+ logger.debug(
1007
+ f"Reading {columns or 'all'} columns from table version column "
1008
+ f"names: {all_column_names}. "
1009
+ )
1010
+
1011
+ # Filter out parameters that are already passed as positional/keyword arguments
1012
+ # to avoid "multiple values for argument" errors
1013
+ filtered_kwargs = {
1014
+ k: v
1015
+ for k, v in kwargs.items()
1016
+ if k
1017
+ not in [
1018
+ "manifest",
1019
+ "table_type",
1020
+ "max_parallelism",
1021
+ "column_names",
1022
+ "include_columns",
1023
+ "file_reader_kwargs_provider",
1024
+ "ray_options_provider",
1025
+ "distributed_dataset_type",
1026
+ ]
1027
+ }
1028
+
1029
+ dataset = storage_type_to_download_func[storage_type](
1030
+ manifest,
1031
+ table_type,
1032
+ max_parallelism,
1033
+ all_column_names,
1034
+ columns,
1035
+ file_reader_kwargs_provider,
1036
+ ray_options_provider=ray_options_provider,
1037
+ distributed_dataset_type=distributed_dataset_type,
1038
+ file_path_column=file_path_column,
1039
+ **filtered_kwargs,
1040
+ )
1041
+ if commit_transaction:
1042
+ transaction.seal()
1043
+ return dataset
1044
+
1045
+
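
For orientation, here is a hypothetical usage sketch of the read path above. The `metastore` import alias, the import paths, and the example column names are illustrative assumptions and are not taken from this diff.

```python
# Usage sketch only; import paths and the `metastore` alias are assumed.
from deltacat.storage import metastore
from deltacat.types.media import DatasetType, StorageType


def read_delta_example(delta, **catalog_kwargs):
    # Read every manifest entry of the delta into local PyArrow tables;
    # table N holds the contents of manifest entry N.
    local_tables = metastore.download_delta(
        delta,
        table_type=DatasetType.PYARROW,
        storage_type=StorageType.LOCAL,
        columns=["id", "value"],  # optional lower-cased column projection
        **catalog_kwargs,
    )
    # Or read the same delta as a dataset distributed across the Ray cluster.
    distributed_dataset = metastore.download_delta(
        delta,
        storage_type=StorageType.DISTRIBUTED,
        distributed_dataset_type=DatasetType.DAFT,
        **catalog_kwargs,
    )
    return local_tables, distributed_dataset
```
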
1046
+ def _download_manifest_entry(
1047
+ manifest_entry: ManifestEntry,
1048
+ table_type: DatasetType = DatasetType.PYARROW,
1049
+ column_names: Optional[List[str]] = None,
1050
+ include_columns: Optional[List[str]] = None,
1051
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
1052
+ content_type: Optional[ContentType] = None,
1053
+ content_encoding: Optional[ContentEncoding] = None,
1054
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
1055
+ ) -> LocalTable:
1056
+
1057
+ return download_manifest_entry(
1058
+ manifest_entry,
1059
+ table_type,
1060
+ column_names,
1061
+ include_columns,
1062
+ file_reader_kwargs_provider,
1063
+ content_type,
1064
+ content_encoding,
1065
+ filesystem,
1066
+ )
621
1067
 
622
1068
 
623
1069
  def download_delta_manifest_entry(
624
1070
  delta_like: Union[Delta, DeltaLocator],
625
1071
  entry_index: int,
626
- table_type: TableType = TableType.PYARROW,
1072
+ table_type: DatasetType = DatasetType.PYARROW,
627
1073
  columns: Optional[List[str]] = None,
628
1074
  file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
629
1075
  *args,
1076
+ transaction: Optional[Transaction] = None,
1077
+ all_column_names: Optional[List[str]] = None,
630
1078
  **kwargs,
631
1079
  ) -> LocalTable:
632
1080
  """
633
- Downloads a single manifest entry into the specified table type for the
1081
+ Reads a single manifest entry into the specified table type for the
634
1082
  given delta or delta locator. If a delta is provided with a non-empty
635
- manifest, then the entry is downloaded from this manifest. Otherwise, the
636
- manifest is first retrieved then the given entry index downloaded.
1083
+ manifest, then the entry is read from this manifest. Otherwise, the
1084
+ manifest is first retrieved and then the given entry index is read.
637
1085
 
638
- NOTE: The entry will be downloaded in the current node's memory.
1086
+ NOTE: The entry will be read into the current node's memory.
639
1087
  """
640
- raise NotImplementedError("download_delta_manifest_entry not implemented")
1088
+ # if all column names are provided, then this is a pure manifest entry download (no transaction needed)
1089
+ commit_transaction = False
1090
+ if not all_column_names:
1091
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1092
+
1093
+ is_delta = isinstance(delta_like, Delta)
1094
+ is_delta_locator = isinstance(delta_like, DeltaLocator)
1095
+
1096
+ delta_locator: Optional[DeltaLocator] = None
1097
+ if is_delta_locator:
1098
+ delta_locator = delta_like
1099
+ elif is_delta:
1100
+ delta_locator = Delta(delta_like).locator
1101
+ if not delta_locator:
1102
+ raise ValueError(
1103
+ f"Expected delta_like to be a Delta or DeltaLocator, but found "
1104
+ f"{type(delta_like)}."
1105
+ )
1106
+
1107
+ if is_delta and delta_like.manifest:
1108
+ manifest = delta_like.manifest
1109
+ elif all_column_names:
1110
+ raise ValueError(
1111
+ "All column names can only be specified with a delta with an inline manifest."
1112
+ )
1113
+ else:
1114
+ manifest = get_delta_manifest(
1115
+ delta_locator,
1116
+ transaction=transaction,
1117
+ *args,
1118
+ **kwargs,
1119
+ )
1120
+ # TODO(pdames): Cache table version column names and only invoke when
1121
+ # needed.
1122
+ all_column_names = all_column_names or get_table_version_column_names(
1123
+ delta_locator.namespace,
1124
+ delta_locator.table_name,
1125
+ delta_locator.table_version,
1126
+ transaction=transaction,
1127
+ *args,
1128
+ **kwargs,
1129
+ )
1130
+ if columns:
1131
+ if not all(
1132
+ col in [col_name.lower() for col_name in all_column_names]
1133
+ for col in columns
1134
+ ):
1135
+ raise SchemaValidationError(
1136
+ f"One or more columns in {columns} are not present in table "
1137
+ f"version columns {all_column_names}"
1138
+ )
1139
+ columns = [column.lower() for column in columns]
1140
+ logger.debug(
1141
+ f"Reading {columns or 'all'} columns from table version column "
1142
+ f"names: {all_column_names}. "
1143
+ )
1144
+ catalog_properties = get_catalog_properties(**kwargs)
1145
+ manifest_entry = _download_manifest_entry(
1146
+ manifest.entries[entry_index],
1147
+ table_type,
1148
+ all_column_names,
1149
+ columns,
1150
+ file_reader_kwargs_provider,
1151
+ filesystem=catalog_properties.filesystem,
1152
+ )
1153
+ if commit_transaction:
1154
+ transaction.seal()
1155
+ return manifest_entry
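
A similarly hedged sketch of the single-entry read path defined above; the `metastore` alias, import paths, and example columns are assumptions.

```python
# Usage sketch only; import paths and the `metastore` alias are assumed.
from deltacat.storage import metastore
from deltacat.types.media import DatasetType


def read_first_manifest_entry(delta, **catalog_kwargs):
    # Materialize only manifest entry 0 of the delta as a local PyArrow
    # table on the current node, projecting two columns.
    return metastore.download_delta_manifest_entry(
        delta,
        entry_index=0,
        table_type=DatasetType.PYARROW,
        columns=["id", "value"],
        **catalog_kwargs,
    )
```
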
641
1156
 
642
1157
 
643
1158
  def get_delta_manifest(
@@ -666,13 +1181,15 @@ def get_delta_manifest(
666
1181
  properties=None,
667
1182
  manifest=None,
668
1183
  )
669
- latest_delta = _latest(
1184
+ latest_delta: Delta = _latest(
670
1185
  metafile=delta,
671
1186
  *args,
672
1187
  **kwargs,
673
1188
  )
674
- if not latest_delta or not latest_delta.manifest:
675
- raise ValueError(f"No manifest found for delta: {delta_locator}")
1189
+ if not latest_delta:
1190
+ raise DeltaNotFoundError(f"No delta found for locator: {delta_locator}")
1191
+ elif not latest_delta.manifest:
1192
+ raise DeltaNotFoundError(f"No manifest found for delta: {latest_delta}")
676
1193
  return latest_delta.manifest
677
1194
 
678
1195
 
@@ -680,30 +1197,30 @@ def create_namespace(
680
1197
  namespace: str,
681
1198
  properties: Optional[NamespaceProperties] = None,
682
1199
  *args,
1200
+ transaction: Optional[Transaction] = None,
683
1201
  **kwargs,
684
1202
  ) -> Namespace:
685
1203
  """
686
1204
  Creates a table namespace with the given name and properties. Returns
687
1205
  the created namespace.
688
1206
  """
1207
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1208
+
689
1209
  namespace = Namespace.of(
690
1210
  locator=NamespaceLocator.of(namespace=namespace),
691
1211
  properties=properties,
692
1212
  )
693
- transaction = Transaction.of(
694
- txn_type=TransactionType.APPEND,
695
- txn_operations=[
696
- TransactionOperation.of(
697
- operation_type=TransactionOperationType.CREATE,
698
- dest_metafile=namespace,
699
- )
700
- ],
701
- )
702
- catalog_properties = get_catalog_properties(**kwargs)
703
- transaction.commit(
704
- catalog_root_dir=catalog_properties.root,
705
- filesystem=catalog_properties.filesystem,
1213
+
1214
+ # Add the operation to the transaction
1215
+ transaction.step(
1216
+ TransactionOperation.of(
1217
+ operation_type=TransactionOperationType.CREATE,
1218
+ dest_metafile=namespace,
1219
+ ),
706
1220
  )
1221
+
1222
+ if commit_transaction:
1223
+ transaction.seal()
707
1224
  return namespace
708
1225
 
709
1226
 
@@ -712,43 +1229,55 @@ def update_namespace(
712
1229
  properties: Optional[NamespaceProperties] = None,
713
1230
  new_namespace: Optional[str] = None,
714
1231
  *args,
1232
+ transaction: Optional[Transaction] = None,
715
1233
  **kwargs,
716
1234
  ) -> None:
717
1235
  """
718
1236
  Updates a table namespace's name and/or properties. Raises an error if the
719
1237
  given namespace does not exist.
720
1238
  """
721
- # TODO(pdames): Wrap get & update within a single txn.
722
- old_namespace = get_namespace(
723
- *args,
1239
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1240
+
1241
+ # Check if the namespace exists
1242
+ old_namespace_meta = get_namespace(
724
1243
  namespace=namespace,
1244
+ transaction=transaction,
1245
+ *args,
725
1246
  **kwargs,
726
1247
  )
727
- new_namespace: Namespace = Metafile.update_for(old_namespace)
728
- new_namespace.namespace = namespace
729
- new_namespace.properties = properties
730
- transaction = Transaction.of(
731
- txn_type=TransactionType.ALTER,
732
- txn_operations=[
1248
+ if not old_namespace_meta:
1249
+ raise NamespaceNotFoundError(f"Namespace {namespace} does not exist")
1250
+
1251
+ # Create new namespace metadata
1252
+ new_namespace_meta: Namespace = Metafile.update_for(old_namespace_meta)
1253
+ if new_namespace:
1254
+ new_namespace_meta.locator.namespace = new_namespace
1255
+ if properties is not None:
1256
+ new_namespace_meta.properties = properties
1257
+
1258
+ # Add the update operation to the transaction
1259
+ try:
1260
+ transaction.step(
733
1261
  TransactionOperation.of(
734
1262
  operation_type=TransactionOperationType.UPDATE,
735
- dest_metafile=new_namespace,
736
- src_metafile=old_namespace,
737
- )
738
- ],
739
- )
740
- catalog_properties = get_catalog_properties(**kwargs)
741
- transaction.commit(
742
- catalog_root_dir=catalog_properties.root,
743
- filesystem=catalog_properties.filesystem,
744
- )
745
- return namespace
1263
+ dest_metafile=new_namespace_meta,
1264
+ src_metafile=old_namespace_meta,
1265
+ ),
1266
+ )
1267
+ except ObjectAlreadyExistsError as e:
1268
+ raise NamespaceAlreadyExistsError(
1269
+ f"Namespace {namespace} already exists"
1270
+ ) from e
1271
+
1272
+ if commit_transaction:
1273
+ transaction.seal()
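
To illustrate how the two namespace APIs above compose, here is a hedged usage sketch; the `metastore` alias, import path, plain-dict properties, and example names are assumptions rather than part of this change.

```python
# Usage sketch only; import path and the `metastore` alias are assumed.
from deltacat.storage import metastore


def bootstrap_namespace(**catalog_kwargs):
    # Without an explicit `transaction` argument, each call opens, steps,
    # and seals its own transaction via setup_transaction().
    metastore.create_namespace(
        namespace="docs_example",
        properties={"owner": "data-eng"},  # plain dict stands in for NamespaceProperties
        **catalog_kwargs,
    )
    # Rename the namespace and replace its properties in a second
    # transaction; a missing namespace raises NamespaceNotFoundError.
    metastore.update_namespace(
        namespace="docs_example",
        new_namespace="docs_example_v2",
        properties={"owner": "platform-eng"},
        **catalog_kwargs,
    )
```
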
746
1274
 
747
1275
 
748
1276
  def create_table_version(
749
1277
  namespace: str,
750
1278
  table_name: str,
751
1279
  table_version: Optional[str] = None,
1280
+ lifecycle_state: Optional[LifecycleState] = LifecycleState.CREATED,
752
1281
  schema: Optional[Schema] = None,
753
1282
  partition_scheme: Optional[PartitionScheme] = None,
754
1283
  sort_keys: Optional[SortScheme] = None,
@@ -758,10 +1287,11 @@ def create_table_version(
758
1287
  table_properties: Optional[TableProperties] = None,
759
1288
  supported_content_types: Optional[List[ContentType]] = None,
760
1289
  *args,
1290
+ transaction: Optional[Transaction] = None,
761
1291
  **kwargs,
762
1292
  ) -> Tuple[Table, TableVersion, Stream]:
763
1293
  """
764
- Create a table version with an unreleased lifecycle state and an empty delta
1294
+ Create a table version with the given lifecycle state (CREATED by default) and an empty delta
765
1295
  stream. Table versions may be schemaless and unpartitioned to improve write
766
1296
  performance, or have their writes governed by a schema and partition scheme
767
1297
  to improve data consistency and read performance.
@@ -771,23 +1301,34 @@ def create_table_version(
771
1301
 
772
1302
  Raises an error if the given namespace does not exist.
773
1303
  """
1304
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1305
+
774
1306
  if not namespace_exists(
775
- *args,
776
1307
  namespace=namespace,
1308
+ transaction=transaction,
1309
+ *args,
777
1310
  **kwargs,
778
1311
  ):
779
- raise ValueError(f"Namespace {namespace} does not exist")
1312
+ raise NamespaceNotFoundError(f"Namespace {namespace} does not exist")
1313
+
1314
+ # Validate schemes against schema
1315
+ _validate_schemes_against_schema(schema, partition_scheme, sort_keys)
1316
+
1317
+ # coerce unspecified partition schemes to the unpartitioned scheme
1318
+ partition_scheme = partition_scheme or UNPARTITIONED_SCHEME
1319
+ # coerce unspecified sort schemes to the unsorted scheme
1320
+ sort_keys = sort_keys or UNSORTED_SCHEME
780
1321
  # check if a parent table and/or previous table version already exist
781
1322
  prev_table_version = None
782
1323
  prev_table = get_table(
783
- *args,
784
1324
  namespace=namespace,
785
1325
  table_name=table_name,
1326
+ transaction=transaction,
1327
+ *args,
786
1328
  **kwargs,
787
1329
  )
788
1330
  if not prev_table:
789
1331
  # no parent table exists, so we'll create it in this transaction
790
- txn_type = TransactionType.APPEND
791
1332
  table_txn_op_type = TransactionOperationType.CREATE
792
1333
  prev_table = None
793
1334
  new_table = Table.of(
@@ -796,7 +1337,6 @@ def create_table_version(
796
1337
  table_version = table_version or DEFAULT_TABLE_VERSION
797
1338
  else:
798
1339
  # the parent table exists, so we'll update it in this transaction
799
- txn_type = TransactionType.ALTER
800
1340
  table_txn_op_type = TransactionOperationType.UPDATE
801
1341
  new_table: Table = Metafile.update_for(prev_table)
802
1342
  prev_table_version = prev_table.latest_table_version
@@ -813,14 +1353,18 @@ def create_table_version(
813
1353
  expected_table_version,
814
1354
  )
815
1355
  if version_number != expected_version_number:
816
- raise ValueError(
1356
+ raise TableValidationError(
817
1357
  f"Expected to create table version "
818
1358
  f"{expected_version_number} but found {version_number}.",
819
1359
  )
820
- new_table.description = table_description or table_version_description
821
- new_table.properties = table_properties
1360
+ if table_description is not None:
1361
+ new_table.description = table_description
1362
+ if table_properties is not None:
1363
+ new_table.properties = table_properties
822
1364
  new_table.latest_table_version = table_version
823
- catalog_properties = get_catalog_properties(**kwargs)
1365
+ new_table.latest_active_table_version = (
1366
+ table_version if lifecycle_state == LifecycleState.ACTIVE else None
1367
+ )
824
1368
  locator = TableVersionLocator.at(
825
1369
  namespace=namespace,
826
1370
  table_name=table_name,
@@ -835,10 +1379,10 @@ def create_table_version(
835
1379
  content_types=supported_content_types,
836
1380
  sort_scheme=sort_keys,
837
1381
  watermark=None,
838
- lifecycle_state=LifecycleState.CREATED,
1382
+ lifecycle_state=lifecycle_state,
839
1383
  schemas=[schema] if schema else None,
840
- partition_schemes=[partition_scheme] if partition_scheme else None,
841
- sort_schemes=[sort_keys] if sort_keys else None,
1384
+ partition_schemes=[partition_scheme],
1385
+ sort_schemes=[sort_keys],
842
1386
  previous_table_version=prev_table_version,
843
1387
  )
844
1388
  # create the table version's default deltacat stream in this transaction
@@ -854,31 +1398,68 @@ def create_table_version(
854
1398
  previous_stream_id=None,
855
1399
  watermark=None,
856
1400
  )
857
- transaction = Transaction.of(
858
- txn_type=txn_type,
859
- txn_operations=[
860
- TransactionOperation.of(
861
- operation_type=table_txn_op_type,
862
- dest_metafile=new_table,
863
- src_metafile=prev_table,
864
- ),
865
- TransactionOperation.of(
866
- operation_type=TransactionOperationType.CREATE,
867
- dest_metafile=table_version,
868
- ),
869
- TransactionOperation.of(
870
- operation_type=TransactionOperationType.CREATE,
871
- dest_metafile=stream,
872
- ),
873
- ],
1401
+ # Add operations to the transaction
1402
+ transaction.step(
1403
+ TransactionOperation.of(
1404
+ operation_type=table_txn_op_type,
1405
+ dest_metafile=new_table,
1406
+ src_metafile=prev_table,
1407
+ ),
874
1408
  )
875
- transaction.commit(
876
- catalog_root_dir=catalog_properties.root,
877
- filesystem=catalog_properties.filesystem,
1409
+ transaction.step(
1410
+ TransactionOperation.of(
1411
+ operation_type=TransactionOperationType.CREATE,
1412
+ dest_metafile=table_version,
1413
+ ),
878
1414
  )
1415
+ transaction.step(
1416
+ TransactionOperation.of(
1417
+ operation_type=TransactionOperationType.CREATE,
1418
+ dest_metafile=stream,
1419
+ ),
1420
+ )
1421
+
1422
+ if commit_transaction:
1423
+ transaction.seal()
879
1424
  return new_table, table_version, stream
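
A hedged sketch of creating a first table version with the function above; the `metastore` alias, the LifecycleState import location, and the example names are assumptions.

```python
# Usage sketch only; import paths and the `metastore` alias are assumed.
from deltacat.storage import metastore
from deltacat.storage import LifecycleState  # assumed import location


def create_first_table_version(**catalog_kwargs):
    # Creates the parent table if needed, a schemaless and unpartitioned
    # table version, and that version's default DeltaCAT stream, all as
    # steps of a single transaction.
    table, table_version, stream = metastore.create_table_version(
        namespace="docs_example",
        table_name="events",
        lifecycle_state=LifecycleState.ACTIVE,  # readable by default right away
        **catalog_kwargs,
    )
    return table, table_version, stream
```
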
880
1425
 
881
1426
 
1427
+ def create_table(
1428
+ namespace: str,
1429
+ table_name: str,
1430
+ description: Optional[str] = None,
1431
+ properties: Optional[TableProperties] = None,
1432
+ *args,
1433
+ transaction: Optional[Transaction] = None,
1434
+ **kwargs,
1435
+ ) -> Table:
1436
+ """
1437
+ Create a new table. Raises an error if the given table already exists.
1438
+ """
1439
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1440
+
1441
+ new_table: Table = Table.of(
1442
+ locator=TableLocator.at(namespace=namespace, table_name=table_name),
1443
+ description=description,
1444
+ properties=properties,
1445
+ )
1446
+ try:
1447
+ transaction.step(
1448
+ TransactionOperation.of(
1449
+ operation_type=TransactionOperationType.CREATE,
1450
+ dest_metafile=new_table,
1451
+ ),
1452
+ )
1453
+ except ObjectAlreadyExistsError as e:
1454
+ raise TableAlreadyExistsError(
1455
+ f"Table {namespace}.{table_name} already exists"
1456
+ ) from e
1457
+
1458
+ if commit_transaction:
1459
+ transaction.seal()
1460
+ return new_table
1461
+
1462
+
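
The error mapping above (ObjectAlreadyExistsError to TableAlreadyExistsError) suggests a create-or-get pattern; a hedged sketch follows, with the exception's import location and the `metastore` alias assumed.

```python
# Usage sketch only; import paths and the `metastore` alias are assumed.
from deltacat.storage import metastore
from deltacat.exceptions import TableAlreadyExistsError  # assumed import location


def ensure_table(**catalog_kwargs):
    try:
        return metastore.create_table(
            namespace="docs_example",
            table_name="events",
            description="append-only event log",
            **catalog_kwargs,
        )
    except TableAlreadyExistsError:
        # The table already exists; fall back to its current metadata.
        return metastore.get_table(
            namespace="docs_example",
            table_name="events",
            **catalog_kwargs,
        )
```
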
882
1463
  def update_table(
883
1464
  namespace: str,
884
1465
  table_name: str,
@@ -886,18 +1467,22 @@ def update_table(
886
1467
  properties: Optional[TableProperties] = None,
887
1468
  new_table_name: Optional[str] = None,
888
1469
  *args,
1470
+ transaction: Optional[Transaction] = None,
889
1471
  **kwargs,
890
- ) -> None:
1472
+ ) -> Table:
891
1473
  """
892
1474
  Update table metadata describing the table versions it contains. By default,
893
1475
  a table's properties are empty, and its description is equal to that given
894
1476
  when its first table version was created. Raises an error if the given
895
1477
  table does not exist.
896
1478
  """
1479
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1480
+
897
1481
  old_table = get_table(
898
- *args,
899
1482
  namespace=namespace,
900
1483
  table_name=table_name,
1484
+ transaction=transaction,
1485
+ *args,
901
1486
  **kwargs,
902
1487
  )
903
1488
  if not old_table:
@@ -906,21 +1491,23 @@ def update_table(
906
1491
  new_table.description = description or old_table.description
907
1492
  new_table.properties = properties or old_table.properties
908
1493
  new_table.table_name = new_table_name or old_table.table_name
909
- transaction = Transaction.of(
910
- txn_type=TransactionType.ALTER,
911
- txn_operations=[
1494
+
1495
+ try:
1496
+ transaction.step(
912
1497
  TransactionOperation.of(
913
1498
  operation_type=TransactionOperationType.UPDATE,
914
1499
  dest_metafile=new_table,
915
1500
  src_metafile=old_table,
916
- )
917
- ],
918
- )
919
- catalog_properties = get_catalog_properties(**kwargs)
920
- transaction.commit(
921
- catalog_root_dir=catalog_properties.root,
922
- filesystem=catalog_properties.filesystem,
923
- )
1501
+ ),
1502
+ )
1503
+ except ObjectAlreadyExistsError as e:
1504
+ raise TableAlreadyExistsError(
1505
+ f"Table {namespace}.{table_name} already exists"
1506
+ ) from e
1507
+
1508
+ if commit_transaction:
1509
+ transaction.seal()
1510
+ return new_table
924
1511
 
925
1512
 
926
1513
  def update_table_version(
@@ -934,42 +1521,53 @@ def update_table_version(
934
1521
  partition_scheme: Optional[PartitionScheme] = None,
935
1522
  sort_keys: Optional[SortScheme] = None,
936
1523
  *args,
1524
+ transaction: Optional[Transaction] = None,
937
1525
  **kwargs,
938
- ) -> None:
1526
+ ) -> Tuple[Optional[Table], TableVersion, Optional[Stream]]:
939
1527
  """
940
1528
  Update a table version. Notably, updating an unreleased table version's
941
- lifecycle state to 'active' telegraphs that it is ready for external
1529
+ lifecycle state to 'ACTIVE' telegraphs that it is ready for external
942
1530
  consumption, and causes all calls made to consume/produce streams,
943
1531
  partitions, or deltas from/to its parent table to automatically resolve to
944
1532
  this table version by default (i.e., when the client does not explicitly
945
1533
  specify a different table version). Raises an error if the given table
946
1534
  version does not exist.
1535
+
1536
+ Note that, to transition a table version from partitioned to unpartitioned,
1537
+ partition_scheme must be explicitly set to UNPARTITIONED_SCHEME. Similarly,
1538
+ to transition a table version from sorted to unsorted, sort_keys must be
1539
+ explicitly set to UNSORTED_SCHEME.
947
1540
  """
948
- # TODO(pdames): Wrap get & update within a single txn.
1541
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
949
1542
  old_table_version = get_table_version(
950
- *args,
951
1543
  namespace=namespace,
952
1544
  table_name=table_name,
953
1545
  table_version=table_version,
1546
+ transaction=transaction,
1547
+ *args,
954
1548
  **kwargs,
955
1549
  )
956
1550
  if not old_table_version:
957
- raise ValueError(
1551
+ raise TableVersionNotFoundError(
958
1552
  f"Table version `{table_version}` does not exist for "
959
1553
  f"table `{namespace}.{table_name}`."
960
1554
  )
1555
+
1556
+ # If schema is not provided but partition_scheme or sort_keys are,
1557
+ # validate against the existing schema
1558
+ schema_to_validate = schema or old_table_version.schema
1559
+ _validate_schemes_against_schema(schema_to_validate, partition_scheme, sort_keys)
1560
+
961
1561
  new_table_version: TableVersion = Metafile.update_for(old_table_version)
962
1562
  new_table_version.state = lifecycle_state or old_table_version.state
963
- # TODO(pdames): Use schema patch to check for backwards incompatible changes.
964
- # By default, backwards incompatible changes should be pushed to a new
965
- # table version unless the user explicitly forces the update to this
966
- # table version (i.e., at the cost of potentially breaking consumers).
1563
+
1564
+ # Caller is expected to do all necessary backwards compatibility schema checks
967
1565
  update_schema = schema and not schema.equivalent_to(
968
1566
  old_table_version.schema,
969
1567
  True,
970
1568
  )
971
1569
  if update_schema and schema.id in [s.id for s in old_table_version.schemas]:
972
- raise ValueError(
1570
+ raise TableValidationError(
973
1571
  f"Schema ID `{schema.id}` already exists in "
974
1572
  f"table version `{table_version}`."
975
1573
  )
@@ -985,6 +1583,21 @@ def update_table_version(
985
1583
  new_table_version.properties = (
986
1584
  properties if properties is not None else old_table_version.properties
987
1585
  )
1586
+ new_supported_reader_types = new_table_version.read_table_property(
1587
+ TableProperty.SUPPORTED_READER_TYPES
1588
+ )
1589
+ if new_supported_reader_types:
1590
+ old_supported_reader_types = (
1591
+ old_table_version.read_table_property(TableProperty.SUPPORTED_READER_TYPES)
1592
+ or {}
1593
+ )
1594
+ added_supported_reader_types = set(new_supported_reader_types) - set(
1595
+ old_supported_reader_types
1596
+ )
1597
+ if added_supported_reader_types:
1598
+ raise TableValidationError(
1599
+ f"Cannot add new supported reader types: {added_supported_reader_types}"
1600
+ )
988
1601
  new_table_version.partition_scheme = (
989
1602
  partition_scheme or old_table_version.partition_scheme
990
1603
  )
@@ -996,7 +1609,7 @@ def update_table_version(
996
1609
  if update_partition_scheme and partition_scheme.id in [
997
1610
  ps.id for ps in old_table_version.partition_schemes
998
1611
  ]:
999
- raise ValueError(
1612
+ raise TableValidationError(
1000
1613
  f"Partition scheme ID `{partition_scheme.id}` already exists in "
1001
1614
  f"table version `{table_version}`."
1002
1615
  )
@@ -1013,7 +1626,7 @@ def update_table_version(
1013
1626
  if update_sort_scheme and sort_keys.id in [
1014
1627
  sk.id for sk in old_table_version.sort_schemes
1015
1628
  ]:
1016
- raise ValueError(
1629
+ raise TableValidationError(
1017
1630
  f"Sort scheme ID `{sort_keys.id}` already exists in "
1018
1631
  f"table version `{table_version}`."
1019
1632
  )
@@ -1024,12 +1637,13 @@ def update_table_version(
1024
1637
  else old_table_version.sort_schemes
1025
1638
  )
1026
1639
  old_table = get_table(
1027
- *args,
1028
1640
  namespace=namespace,
1029
1641
  table_name=table_name,
1642
+ transaction=transaction,
1643
+ *args,
1030
1644
  **kwargs,
1031
1645
  )
1032
- txn_operations = []
1646
+ new_table: Table = None
1033
1647
  if (
1034
1648
  lifecycle_state == LifecycleState.ACTIVE
1035
1649
  and old_table_version.state != LifecycleState.ACTIVE
@@ -1044,50 +1658,52 @@ def update_table_version(
1044
1658
  _, new_version_number = TableVersion.parse_table_version(table_version)
1045
1659
  if old_version_number is None or old_version_number < new_version_number:
1046
1660
  # update the table's latest table version
1047
- new_table: Table = Metafile.update_for(old_table)
1661
+ new_table = Metafile.update_for(old_table)
1048
1662
  new_table.latest_active_table_version = table_version
1049
- txn_operations.append(
1663
+ transaction.step(
1050
1664
  TransactionOperation.of(
1051
1665
  operation_type=TransactionOperationType.UPDATE,
1052
1666
  dest_metafile=new_table,
1053
1667
  src_metafile=old_table,
1054
- )
1668
+ ),
1055
1669
  )
1056
- txn_operations.append(
1057
- TransactionOperation.of(
1058
- operation_type=TransactionOperationType.UPDATE,
1059
- dest_metafile=new_table_version,
1060
- src_metafile=old_table_version,
1061
- ),
1062
- )
1670
+ try:
1671
+ transaction.step(
1672
+ TransactionOperation.of(
1673
+ operation_type=TransactionOperationType.UPDATE,
1674
+ dest_metafile=new_table_version,
1675
+ src_metafile=old_table_version,
1676
+ ),
1677
+ )
1678
+ except ObjectAlreadyExistsError as e:
1679
+ raise TableVersionAlreadyExistsError(
1680
+ f"Table version {namespace}.{table_name}.{table_version} already exists"
1681
+ ) from e
1682
+
1063
1683
  # TODO(pdames): Push changes down to non-deltacat streams via sync module.
1064
1684
  # Also copy sort scheme changes down to deltacat child stream?
1685
+ new_stream: Stream = None
1065
1686
  if partition_scheme:
1066
1687
  old_stream = get_stream(
1067
- *args,
1068
1688
  namespace=namespace,
1069
1689
  table_name=table_name,
1070
1690
  table_version=table_version,
1691
+ transaction=transaction,
1692
+ *args,
1071
1693
  **kwargs,
1072
1694
  )
1073
- new_stream: Stream = Metafile.update_for(old_stream)
1695
+ new_stream = Metafile.update_for(old_stream)
1074
1696
  new_stream.partition_scheme = partition_scheme
1075
- txn_operations.append(
1697
+ transaction.step(
1076
1698
  TransactionOperation.of(
1077
1699
  operation_type=TransactionOperationType.UPDATE,
1078
1700
  dest_metafile=new_stream,
1079
1701
  src_metafile=old_stream,
1080
- )
1702
+ ),
1081
1703
  )
1082
- transaction = Transaction.of(
1083
- txn_type=TransactionType.ALTER,
1084
- txn_operations=txn_operations,
1085
- )
1086
- catalog_properties = get_catalog_properties(**kwargs)
1087
- transaction.commit(
1088
- catalog_root_dir=catalog_properties.root,
1089
- filesystem=catalog_properties.filesystem,
1090
- )
1704
+ if commit_transaction:
1705
+ transaction.seal()
1706
+ return new_table, new_table_version, new_stream
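
A hedged sketch of the most common use of update_table_version, activating a staged version so that default reads and writes resolve to it; the `metastore` alias, import paths, and example names are assumptions.

```python
# Usage sketch only; import paths and the `metastore` alias are assumed.
from deltacat.storage import metastore
from deltacat.storage import LifecycleState  # assumed import location


def activate_table_version(table_version_id, **catalog_kwargs):
    # Flip an existing table version to ACTIVE; the updated parent table,
    # table version, and stream (if a partition scheme change was also
    # requested) are returned.
    new_table, new_table_version, new_stream = metastore.update_table_version(
        namespace="docs_example",
        table_name="events",
        table_version=table_version_id,
        lifecycle_state=LifecycleState.ACTIVE,
        **catalog_kwargs,
    )
    return new_table_version
```
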
1091
1707
 
1092
1708
 
1093
1709
  def stage_stream(
@@ -1096,6 +1712,7 @@ def stage_stream(
1096
1712
  table_version: Optional[str] = None,
1097
1713
  stream_format: StreamFormat = StreamFormat.DELTACAT,
1098
1714
  *args,
1715
+ transaction: Optional[Transaction] = None,
1099
1716
  **kwargs,
1100
1717
  ) -> Stream:
1101
1718
  """
@@ -1107,21 +1724,28 @@ def stage_stream(
1107
1724
  Returns the staged stream. Raises an error if the table version does not
1108
1725
  exist.
1109
1726
  """
1110
- # TODO(pdames): Support retrieving previously staged streams by ID.
1727
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1728
+
1111
1729
  if not table_version:
1112
1730
  table_version = _resolve_latest_active_table_version_id(
1113
- *args,
1114
1731
  namespace=namespace,
1115
1732
  table_name=table_name,
1733
+ transaction=transaction,
1734
+ *args,
1116
1735
  **kwargs,
1117
1736
  )
1118
1737
  table_version_meta = get_table_version(
1119
- *args,
1120
1738
  namespace=namespace,
1121
1739
  table_name=table_name,
1122
1740
  table_version=table_version,
1741
+ transaction=transaction,
1742
+ *args,
1123
1743
  **kwargs,
1124
1744
  )
1745
+ if not table_version_meta:
1746
+ raise TableVersionNotFoundError(
1747
+ f"Table version not found: {namespace}.{table_name}.{table_version}."
1748
+ )
1125
1749
  locator = StreamLocator.at(
1126
1750
  namespace=namespace,
1127
1751
  table_name=table_name,
@@ -1137,39 +1761,37 @@ def stage_stream(
1137
1761
  watermark=None,
1138
1762
  )
1139
1763
  prev_stream = get_stream(
1140
- *args,
1141
1764
  namespace=stream.namespace,
1142
1765
  table_name=stream.table_name,
1143
1766
  table_version=stream.table_version,
1144
1767
  stream_format=stream.stream_format,
1768
+ transaction=transaction,
1769
+ *args,
1145
1770
  **kwargs,
1146
1771
  )
1147
1772
  if prev_stream:
1148
1773
  if prev_stream.stream_id == stream.stream_id:
1149
- raise ValueError(
1774
+ raise TableValidationError(
1150
1775
  f"Stream to stage has the same ID as existing stream: {prev_stream}."
1151
1776
  )
1152
1777
  stream.previous_stream_id = prev_stream.stream_id
1153
- transaction = Transaction.of(
1154
- txn_type=TransactionType.APPEND,
1155
- txn_operations=[
1156
- TransactionOperation.of(
1157
- operation_type=TransactionOperationType.CREATE,
1158
- dest_metafile=stream,
1159
- )
1160
- ],
1161
- )
1162
- catalog_properties = get_catalog_properties(**kwargs)
1163
- transaction.commit(
1164
- catalog_root_dir=catalog_properties.root,
1165
- filesystem=catalog_properties.filesystem,
1778
+ # Add the operation to the transaction
1779
+ transaction.step(
1780
+ TransactionOperation.of(
1781
+ operation_type=TransactionOperationType.CREATE,
1782
+ dest_metafile=stream,
1783
+ ),
1166
1784
  )
1785
+
1786
+ if commit_transaction:
1787
+ transaction.seal()
1167
1788
  return stream
1168
1789
 
1169
1790
 
1170
1791
  def commit_stream(
1171
1792
  stream: Stream,
1172
1793
  *args,
1794
+ transaction: Optional[Transaction] = None,
1173
1795
  **kwargs,
1174
1796
  ) -> Stream:
1175
1797
  """
@@ -1177,6 +1799,8 @@ def commit_stream(
1177
1799
  previous stream registered for the same table version. Returns the
1178
1800
  committed stream.
1179
1801
  """
1802
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1803
+
1180
1804
  if not stream.stream_id:
1181
1805
  raise ValueError("Stream ID to commit must be set to a staged stream ID.")
1182
1806
  if not stream.table_version_locator:
@@ -1185,83 +1809,71 @@ def commit_stream(
1185
1809
  "set to the parent of its staged stream ID."
1186
1810
  )
1187
1811
  prev_staged_stream = get_stream_by_id(
1188
- *args,
1189
1812
  table_version_locator=stream.table_version_locator,
1190
1813
  stream_id=stream.stream_id,
1814
+ transaction=transaction,
1815
+ *args,
1191
1816
  **kwargs,
1192
1817
  )
1193
1818
  if not prev_staged_stream:
1194
- raise ValueError(
1819
+ raise StreamNotFoundError(
1195
1820
  f"Stream at table version {stream.table_version_locator} with ID "
1196
1821
  f"{stream.stream_id} not found."
1197
1822
  )
1198
1823
  if prev_staged_stream.state != CommitState.STAGED:
1199
- raise ValueError(
1824
+ raise TableValidationError(
1200
1825
  f"Expected to find a `{CommitState.STAGED}` stream at table version "
1201
1826
  f"{stream.table_version_locator} with ID {stream.stream_id},"
1202
1827
  f"but found a `{prev_staged_stream.state}` partition."
1203
1828
  )
1204
- if not prev_staged_stream:
1205
- raise ValueError(
1206
- f"Stream at table_version {stream.table_version_locator} with ID "
1207
- f"{stream.stream_id} not found."
1208
- )
1209
- if prev_staged_stream.state != CommitState.STAGED:
1210
- raise ValueError(
1211
- f"Expected to find a `{CommitState.STAGED}` stream at table version "
1212
- f"{stream.table_version_locator} with ID {stream.stream_id},"
1213
- f"but found a `{prev_staged_stream.state}` stream."
1214
- )
1215
1829
  stream: Stream = Metafile.update_for(prev_staged_stream)
1216
1830
  stream.state = CommitState.COMMITTED
1217
1831
  prev_committed_stream = get_stream(
1218
- *args,
1219
1832
  namespace=stream.namespace,
1220
1833
  table_name=stream.table_name,
1221
1834
  table_version=stream.table_version,
1222
1835
  stream_format=stream.stream_format,
1836
+ transaction=transaction,
1837
+ *args,
1223
1838
  **kwargs,
1224
1839
  )
1840
+ if prev_committed_stream:
1841
+ # there's a previously committed stream, so update the transaction
1842
+ # type to overwrite the previously committed stream
1843
+ txn_op_type = TransactionOperationType.REPLACE
1844
+ else:
1845
+ txn_op_type = TransactionOperationType.UPDATE
1846
+
1225
1847
  # the first transaction operation updates the staged stream commit state
1226
- txn_type = TransactionType.ALTER
1227
- txn_ops = [
1848
+ transaction.step(
1228
1849
  TransactionOperation.of(
1229
- operation_type=TransactionOperationType.UPDATE,
1850
+ operation_type=txn_op_type,
1230
1851
  dest_metafile=stream,
1231
1852
  src_metafile=prev_staged_stream,
1232
- )
1233
- ]
1853
+ ),
1854
+ )
1234
1855
  if prev_committed_stream:
1235
1856
  if prev_committed_stream.stream_id != stream.previous_stream_id:
1236
- raise ValueError(
1857
+ raise ConcurrentModificationError(
1237
1858
  f"Previous stream ID mismatch Expected "
1238
1859
  f"{stream.previous_stream_id} but found "
1239
1860
  f"{prev_committed_stream.stream_id}."
1240
1861
  )
1241
1862
  if prev_committed_stream.stream_id == stream.stream_id:
1242
- raise ValueError(
1863
+ raise TableValidationError(
1243
1864
  f"Stream to commit has the same ID as existing stream: {prev_committed_stream}."
1244
1865
  )
1245
- # there's a previously committed stream, so update the transaction
1246
- # type to overwrite the previously committed stream, and add another
1247
- # transaction operation to replace it with the staged stream
1248
- txn_type = TransactionType.OVERWRITE
1249
- txn_ops.append(
1866
+ # add another transaction operation to replace the previously committed stream
1867
+ # with the staged stream
1868
+ transaction.step(
1250
1869
  TransactionOperation.of(
1251
- operation_type=TransactionOperationType.UPDATE,
1870
+ operation_type=txn_op_type,
1252
1871
  dest_metafile=stream,
1253
1872
  src_metafile=prev_committed_stream,
1254
- )
1873
+ ),
1255
1874
  )
1256
- transaction = Transaction.of(
1257
- txn_type=txn_type,
1258
- txn_operations=txn_ops,
1259
- )
1260
- catalog_properties = get_catalog_properties(**kwargs)
1261
- transaction.commit(
1262
- catalog_root_dir=catalog_properties.root,
1263
- filesystem=catalog_properties.filesystem,
1264
- )
1875
+ if commit_transaction:
1876
+ transaction.seal()
1265
1877
  return stream
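
The stage/commit pair above is typically used together; a hedged sketch, with the `metastore` alias, import path, and example names assumed.

```python
# Usage sketch only; import path and the `metastore` alias are assumed.
from deltacat.storage import metastore


def replace_stream(**catalog_kwargs):
    # Stage a new stream against the latest active table version, then
    # commit it so it replaces any previously committed stream.
    staged = metastore.stage_stream(
        namespace="docs_example",
        table_name="events",
        **catalog_kwargs,
    )
    return metastore.commit_stream(staged, **catalog_kwargs)
```
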
1266
1878
 
1267
1879
 
@@ -1271,6 +1883,7 @@ def delete_stream(
1271
1883
  table_version: Optional[str] = None,
1272
1884
  stream_format: StreamFormat = StreamFormat.DELTACAT,
1273
1885
  *args,
1886
+ transaction: Optional[Transaction] = None,
1274
1887
  **kwargs,
1275
1888
  ) -> None:
1276
1889
  """
@@ -1279,121 +1892,120 @@ def delete_stream(
1279
1892
  Resolves to the deltacat stream format if no stream format is given.
1280
1893
  Raises an error if the stream does not exist.
1281
1894
  """
1895
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1896
+
1282
1897
  if not table_version:
1283
1898
  table_version = _resolve_latest_active_table_version_id(
1284
- *args,
1285
1899
  namespace=namespace,
1286
1900
  table_name=table_name,
1901
+ transaction=transaction,
1902
+ *args,
1287
1903
  **kwargs,
1288
1904
  )
1289
1905
  stream_to_delete = get_stream(
1290
- *args,
1291
1906
  namespace=namespace,
1292
1907
  table_name=table_name,
1293
1908
  table_version=table_version,
1294
1909
  stream_format=stream_format,
1910
+ transaction=transaction,
1911
+ *args,
1295
1912
  **kwargs,
1296
1913
  )
1297
1914
  if not stream_to_delete:
1298
- raise ValueError(
1915
+ raise StreamNotFoundError(
1299
1916
  f"Stream to delete not found: {namespace}.{table_name}"
1300
1917
  f".{table_version}.{stream_format}."
1301
1918
  )
1302
1919
  else:
1303
1920
  stream_to_delete.state = CommitState.DEPRECATED
1304
- transaction = Transaction.of(
1305
- txn_type=TransactionType.DELETE,
1306
- txn_operations=[
1307
- TransactionOperation.of(
1308
- operation_type=TransactionOperationType.DELETE,
1309
- dest_metafile=stream_to_delete,
1310
- )
1311
- ],
1312
- )
1313
- catalog_properties = get_catalog_properties(**kwargs)
1314
- transaction.commit(
1315
- catalog_root_dir=catalog_properties.root,
1316
- filesystem=catalog_properties.filesystem,
1921
+
1922
+ transaction.step(
1923
+ TransactionOperation.of(
1924
+ operation_type=TransactionOperationType.DELETE,
1925
+ dest_metafile=stream_to_delete,
1926
+ ),
1317
1927
  )
1318
1928
 
1929
+ if commit_transaction:
1930
+ transaction.seal()
1931
+
1319
1932
 
1320
1933
  def delete_table(
1321
1934
  namespace: str,
1322
- name: str,
1935
+ table_name: str,
1323
1936
  purge: bool = False,
1324
1937
  *args,
1938
+ transaction: Optional[Transaction] = None,
1325
1939
  **kwargs,
1326
1940
  ) -> None:
1327
1941
  """
1328
- Drops the given table and all its contents (table versions, streams, partitions,
1329
- and deltas). If purge is True, also removes all data files associated with the table.
1330
- Raises an error if the given table does not exist.
1331
-
1332
- TODO: Honor purge once garbage collection is implemented.
1942
+ Drops the given table from the catalog. If purge is True, also removes
1943
+ all data files associated with the table. Raises an error if the given table
1944
+ does not exist.
1333
1945
  """
1946
+ if purge:
1947
+ raise NotImplementedError("Purge flag is not currently supported.")
1948
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1949
+
1334
1950
  table: Optional[Table] = get_table(
1335
- *args,
1336
1951
  namespace=namespace,
1337
- table_name=name,
1952
+ table_name=table_name,
1953
+ transaction=transaction,
1954
+ *args,
1338
1955
  **kwargs,
1339
1956
  )
1340
1957
 
1341
1958
  if not table:
1342
- raise TableNotFoundError(f"Table `{namespace}.{name}` does not exist.")
1959
+ # TODO(pdames): Refactor this so that it doesn't initialize Ray
1960
+ raise TableNotFoundError(f"Table `{namespace}.{table_name}` does not exist.")
1343
1961
 
1344
- transaction = Transaction.of(
1345
- txn_type=TransactionType.DELETE,
1346
- txn_operations=TransactionOperationList.of(
1347
- [
1348
- TransactionOperation.of(
1349
- operation_type=TransactionOperationType.DELETE,
1350
- dest_metafile=table,
1351
- )
1352
- ]
1962
+ transaction.step(
1963
+ TransactionOperation.of(
1964
+ operation_type=TransactionOperationType.DELETE,
1965
+ dest_metafile=table,
1353
1966
  ),
1354
1967
  )
1355
1968
 
1356
- catalog_properties = get_catalog_properties(**kwargs)
1357
- transaction.commit(
1358
- catalog_root_dir=catalog_properties.root,
1359
- filesystem=catalog_properties.filesystem,
1360
- )
1969
+ if commit_transaction:
1970
+ transaction.seal()
1361
1971
 
1362
1972
 
1363
1973
  def delete_namespace(
1364
1974
  namespace: str,
1365
1975
  purge: bool = False,
1366
1976
  *args,
1977
+ transaction: Optional[Transaction] = None,
1367
1978
  **kwargs,
1368
1979
  ) -> None:
1369
1980
  """
1370
- Drops the given table namespace and all its contents. Raises an error if the
1371
- given namespace does not exist.
1981
+ Drops the given namespace from the catalog. If purge is True, also removes
1982
+ all data files associated with the namespace. Raises an error if the given
1983
+ namespace does not exist.
1372
1984
  """
1373
- namespace: Optional[Namespace] = get_namespace(
1374
- *args,
1985
+ if purge:
1986
+ raise NotImplementedError("Purge flag is not currently supported.")
1987
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1988
+
1989
+ namespace_obj: Optional[Namespace] = get_namespace(
1375
1990
  namespace=namespace,
1991
+ transaction=transaction,
1992
+ *args,
1376
1993
  **kwargs,
1377
1994
  )
1378
1995
 
1379
- if not namespace:
1380
- raise ValueError(f"Namespace `{namespace}` does not exist.")
1996
+ if not namespace_obj:
1997
+ raise NamespaceNotFoundError(f"Namespace `{namespace}` does not exist.")
1381
1998
 
1382
- transaction = Transaction.of(
1383
- txn_type=TransactionType.DELETE,
1384
- txn_operations=[
1385
- TransactionOperation.of(
1386
- operation_type=TransactionOperationType.DELETE,
1387
- dest_metafile=namespace,
1388
- )
1389
- ],
1390
- )
1391
- catalog_properties = get_catalog_properties(**kwargs)
1392
- transaction.commit(
1393
- catalog_root_dir=catalog_properties.root,
1394
- filesystem=catalog_properties.filesystem,
1999
+ transaction.step(
2000
+ TransactionOperation.of(
2001
+ operation_type=TransactionOperationType.DELETE,
2002
+ dest_metafile=namespace_obj,
2003
+ ),
1395
2004
  )
1396
2005
 
2006
+ if commit_transaction:
2007
+ transaction.seal()
2008
+
1397
2009
 
1398
2010
  def get_stream_by_id(
1399
2011
  table_version_locator: TableVersionLocator,
@@ -1412,8 +2024,8 @@ def get_stream_by_id(
1412
2024
  stream_format=None,
1413
2025
  )
1414
2026
  return _latest(
1415
- *args,
1416
2027
  metafile=Stream.of(locator=locator, partition_scheme=None),
2028
+ *args,
1417
2029
  **kwargs,
1418
2030
  )
1419
2031
 
@@ -1424,6 +2036,7 @@ def get_stream(
1424
2036
  table_version: Optional[str] = None,
1425
2037
  stream_format: StreamFormat = StreamFormat.DELTACAT,
1426
2038
  *args,
2039
+ transaction: Optional[Transaction] = None,
1427
2040
  **kwargs,
1428
2041
  ) -> Optional[Stream]:
1429
2042
  """
@@ -1432,12 +2045,14 @@ def get_stream(
1432
2045
  Resolves to the DeltaCAT stream format if no stream format is given.
1433
2046
  Returns None if the table version or stream format does not exist.
1434
2047
  """
2048
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1435
2049
  if not table_version:
1436
2050
  table_version = _resolve_latest_active_table_version_id(
1437
- *args,
1438
2051
  namespace=namespace,
1439
2052
  table_name=table_name,
1440
2053
  fail_if_no_active_table_version=False,
2054
+ transaction=transaction,
2055
+ *args,
1441
2056
  **kwargs,
1442
2057
  )
1443
2058
  locator = StreamLocator.at(
@@ -1447,15 +2062,19 @@ def get_stream(
1447
2062
  stream_id=None,
1448
2063
  stream_format=stream_format,
1449
2064
  )
1450
- return _latest(
1451
- *args,
2065
+ stream = _latest(
1452
2066
  metafile=Stream.of(
1453
2067
  locator=locator,
1454
2068
  partition_scheme=None,
1455
2069
  state=CommitState.COMMITTED,
1456
2070
  ),
2071
+ transaction=transaction,
2072
+ *args,
1457
2073
  **kwargs,
1458
2074
  )
2075
+ if commit_transaction:
2076
+ transaction.seal()
2077
+ return stream
1459
2078
 
1460
2079
 
1461
2080
  def stream_exists(
@@ -1464,6 +2083,7 @@ def stream_exists(
1464
2083
  table_version: Optional[str] = None,
1465
2084
  stream_format: StreamFormat = StreamFormat.DELTACAT,
1466
2085
  *args,
2086
+ transaction: Optional[Transaction] = None,
1467
2087
  **kwargs,
1468
2088
  ) -> Optional[Stream]:
1469
2089
  """
@@ -1472,14 +2092,18 @@ def stream_exists(
1472
2092
  Resolves to the DeltaCAT stream format if no stream format is given.
1473
2093
  Returns None if the table version or stream format does not exist.
1474
2094
  """
2095
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1475
2096
  if not table_version:
1476
2097
  table_version = _resolve_latest_active_table_version_id(
1477
- *args,
1478
2098
  namespace=namespace,
1479
2099
  table_name=table_name,
1480
2100
  fail_if_no_active_table_version=False,
2101
+ transaction=transaction,
2102
+ *args,
1481
2103
  **kwargs,
1482
2104
  )
2105
+
2106
+ # Try with the provided table name first
1483
2107
  locator = StreamLocator.at(
1484
2108
  namespace=namespace,
1485
2109
  table_name=table_name,
@@ -1487,15 +2111,19 @@ def stream_exists(
1487
2111
  stream_id=None,
1488
2112
  stream_format=stream_format,
1489
2113
  )
1490
- return _exists(
1491
- *args,
2114
+ exists = _exists(
1492
2115
  metafile=Stream.of(
1493
2116
  locator=locator,
1494
2117
  partition_scheme=None,
1495
2118
  state=CommitState.COMMITTED,
1496
2119
  ),
2120
+ transaction=transaction,
2121
+ *args,
1497
2122
  **kwargs,
1498
2123
  )
2124
+ if commit_transaction:
2125
+ transaction.seal()
2126
+ return exists
1499
2127
 
1500
2128
 
1501
2129
  def stage_partition(
@@ -1503,6 +2131,7 @@ def stage_partition(
1503
2131
  partition_values: Optional[PartitionValues] = None,
1504
2132
  partition_scheme_id: Optional[str] = None,
1505
2133
  *args,
2134
+ transaction: Optional[Transaction] = None,
1506
2135
  **kwargs,
1507
2136
  ) -> Partition:
1508
2137
  """
@@ -1515,35 +2144,65 @@ def stage_partition(
1515
2144
  The partition_values must represent the results of transforms in a partition
1516
2145
  spec specified in the stream.
1517
2146
  """
2147
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2148
+
1518
2149
  # TODO(pdames): Cache last retrieved metafile revisions in memory to resolve
1519
2150
  # potentially high cost of staging many partitions.
1520
2151
  table_version = get_table_version(
1521
- *args,
1522
2152
  namespace=stream.namespace,
1523
2153
  table_name=stream.table_name,
1524
2154
  table_version=stream.table_version,
2155
+ transaction=transaction,
2156
+ *args,
1525
2157
  **kwargs,
1526
2158
  )
1527
2159
  if not table_version:
1528
- raise ValueError(
2160
+ raise TableVersionNotFoundError(
1529
2161
  f"Table version not found: {stream.namespace}.{stream.table_name}."
1530
2162
  f"{stream.table_version}."
1531
2163
  )
2164
+ # Set partition_scheme_id to UNPARTITIONED_SCHEME_ID when partition_values
2165
+ # is None or empty
2166
+ if not partition_values:
2167
+ partition_scheme_id = UNPARTITIONED_SCHEME_ID
2168
+ # Use stream's partition scheme ID if none provided and partition_values
2169
+ # are specified
2170
+ elif partition_scheme_id is None:
2171
+ partition_scheme_id = stream.partition_scheme.id
1532
2172
  if not table_version.partition_schemes or partition_scheme_id not in [
1533
2173
  ps.id for ps in table_version.partition_schemes
1534
2174
  ]:
1535
- raise ValueError(
2175
+ raise TableValidationError(
1536
2176
  f"Invalid partition scheme ID `{partition_scheme_id}` (not found "
1537
2177
  f"in parent table version `{stream.namespace}.{stream.table_name}"
1538
2178
  f".{table_version.table_version}` partition scheme IDs)."
1539
2179
  )
1540
- if stream.partition_scheme.id not in table_version.partition_schemes:
2180
+ if stream.partition_scheme.id not in [
2181
+ ps.id for ps in table_version.partition_schemes
2182
+ ]:
1541
2183
  # this should never happen, but just in case
1542
- raise ValueError(
2184
+ raise TableValidationError(
1543
2185
  f"Invalid stream partition scheme ID `{stream.partition_scheme.id}`"
1544
- f"in parent table version `{stream.namespace}.{stream.table_name}"
2186
+ f" (not found in parent table version "
2187
+ f"`{stream.namespace}.{stream.table_name}"
1545
2188
  f".{table_version.table_version}` partition scheme IDs)."
1546
2189
  )
2190
+
2191
+ if partition_values:
2192
+ if partition_scheme_id == UNPARTITIONED_SCHEME_ID:
2193
+ raise TableValidationError(
2194
+ "Partition values cannot be specified for unpartitioned tables"
2195
+ )
2196
+ # Validate partition values against partition scheme
2197
+ partition_scheme = next(
2198
+ ps for ps in table_version.partition_schemes if ps.id == partition_scheme_id
2199
+ )
2200
+ _validate_partition_values_against_scheme(
2201
+ partition_values=partition_values,
2202
+ partition_scheme=partition_scheme,
2203
+ schema=table_version.schema,
2204
+ )
2205
+
1547
2206
  locator = PartitionLocator.of(
1548
2207
  stream_locator=stream.locator,
1549
2208
  partition_values=partition_values,
@@ -1551,42 +2210,40 @@ def stage_partition(
1551
2210
  )
1552
2211
  partition = Partition.of(
1553
2212
  locator=locator,
1554
- schema=table_version.schema,
1555
2213
  content_types=table_version.content_types,
1556
2214
  state=CommitState.STAGED,
1557
2215
  previous_stream_position=None,
1558
- partition_values=partition_values,
1559
2216
  previous_partition_id=None,
1560
2217
  stream_position=None,
1561
2218
  partition_scheme_id=partition_scheme_id,
1562
2219
  )
1563
2220
  prev_partition = get_partition(
1564
- *args,
1565
2221
  stream_locator=stream.locator,
1566
2222
  partition_values=partition_values,
1567
2223
  partition_scheme_id=partition_scheme_id,
2224
+ transaction=transaction,
2225
+ *args,
1568
2226
  **kwargs,
1569
2227
  )
1570
- if prev_partition:
1571
- if prev_partition.partition_id == partition.partition_id:
1572
- raise ValueError(
1573
- f"Partition to stage has the same ID as existing partition: {prev_partition}."
1574
- )
1575
- partition.previous_partition_id = prev_partition.partition_id
1576
- transaction = Transaction.of(
1577
- txn_type=TransactionType.APPEND,
1578
- txn_operations=[
1579
- TransactionOperation.of(
1580
- operation_type=TransactionOperationType.CREATE,
1581
- dest_metafile=partition,
1582
- )
1583
- ],
1584
- )
1585
- catalog_properties = get_catalog_properties(**kwargs)
1586
- transaction.commit(
1587
- catalog_root_dir=catalog_properties.root,
1588
- filesystem=catalog_properties.filesystem,
2228
+ prev_partition_id = prev_partition.partition_id if prev_partition else None
2229
+
2230
+ # TODO(pdames): Check all historic partitions for the same partition ID
2231
+ if prev_partition_id == partition.partition_id:
2232
+ raise TableValidationError(
2233
+ f"Partition to stage has the same ID as previous partition: {prev_partition_id}."
2234
+ )
2235
+ partition.previous_partition_id = prev_partition_id
2236
+
2237
+ # Add the operation to the transaction
2238
+ transaction.step(
2239
+ TransactionOperation.of(
2240
+ operation_type=TransactionOperationType.CREATE,
2241
+ dest_metafile=partition,
2242
+ ),
1589
2243
  )
2244
+
2245
+ if commit_transaction:
2246
+ transaction.seal()
1590
2247
  return partition
1591
2248
 
1592
2249
 
@@ -1594,12 +2251,15 @@ def commit_partition(
1594
2251
  partition: Partition,
1595
2252
  previous_partition: Optional[Partition] = None,
1596
2253
  *args,
2254
+ transaction: Optional[Transaction] = None,
1597
2255
  **kwargs,
1598
2256
  ) -> Partition:
1599
2257
  """
1600
2258
  Commits the staged partition to its associated table version stream,
1601
2259
  replacing any previous partition registered for the same stream and
1602
- partition values.
2260
+ partition values. All values set on the input partition except compaction
2261
+ round completion info will be overwritten with the values stored in the
2262
+ staged partition.
1603
2263
 
1604
2264
  If previous partition is given then it will be replaced with its deltas
1605
2265
  prepended to the new partition being committed. Otherwise the latest
@@ -1613,6 +2273,8 @@ def commit_partition(
1613
2273
  specified, then the commit will be rejected if it does not match the actual
1614
2274
  ID of the partition being replaced.
1615
2275
  """
2276
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2277
+
1616
2278
  if previous_partition:
1617
2279
  raise NotImplementedError(
1618
2280
  f"delta prepending from previous partition {previous_partition} "
@@ -1625,74 +2287,98 @@ def commit_partition(
1625
2287
  "Partition to commit must have its stream locator "
1626
2288
  "set to the parent of its staged partition ID."
1627
2289
  )
2290
+
2291
+ # Start a single multi-step transaction for all operations (both read and write)
2292
+ # Step 1: Get the staged partition using transaction
1628
2293
  prev_staged_partition = get_partition_by_id(
1629
- *args,
1630
2294
  stream_locator=partition.stream_locator,
1631
2295
  partition_id=partition.partition_id,
2296
+ transaction=transaction,
2297
+ *args,
1632
2298
  **kwargs,
1633
2299
  )
2300
+
2301
+ # Validate staged partition
1634
2302
  if not prev_staged_partition:
1635
- raise ValueError(
2303
+ raise PartitionNotFoundError(
1636
2304
  f"Partition at stream {partition.stream_locator} with ID "
1637
2305
  f"{partition.partition_id} not found."
1638
2306
  )
1639
2307
  if prev_staged_partition.state != CommitState.STAGED:
1640
- raise ValueError(
2308
+ raise TableValidationError(
1641
2309
  f"Expected to find a `{CommitState.STAGED}` partition at stream "
1642
2310
  f"{partition.stream_locator} with ID {partition.partition_id},"
1643
2311
  f"but found a `{prev_staged_partition.state}` partition."
1644
2312
  )
1645
- partition: Partition = Metafile.update_for(prev_staged_partition)
1646
- partition.state = CommitState.COMMITTED
1647
-    prev_committed_partition = get_partition(
-        *args,
-        stream_locator=partition.stream_locator,
-        partition_value=partition.partition_values,
-        partition_scheme_id=partition.partition_scheme_id,
-        **kwargs,
-    )
-    # the first transaction operation updates the staged partition commit state
-    txn_type = TransactionType.ALTER
-    txn_ops = [
-        TransactionOperation.of(
-            operation_type=TransactionOperationType.UPDATE,
-            dest_metafile=partition,
-            src_metafile=prev_staged_partition,
+
+    # Step 2: Check for existing committed partition
+    prev_committed_partition = None
+    if partition.previous_partition_id is not None:
+        prev_committed_partition = get_partition(
+            stream_locator=partition.stream_locator,
+            partition_values=partition.partition_values,
+            partition_scheme_id=partition.partition_scheme_id,
+            transaction=transaction,
+            *args,
+            **kwargs,
         )
-    ]
+
+    # Validate expected previous partition ID for race condition detection
     if prev_committed_partition:
+        logger.info(
+            f"Checking previous committed partition for conflicts: {prev_committed_partition}"
+        )
         if prev_committed_partition.partition_id != partition.previous_partition_id:
-            raise ValueError(
-                f"Previous partition ID mismatch Expected "
+            raise ConcurrentModificationError(
+                f"Concurrent modification detected: Expected committed partition "
                 f"{partition.previous_partition_id} but found "
                 f"{prev_committed_partition.partition_id}."
             )
-        # TODO(pdames): Add previous partition stream position validation.
+
+    if prev_committed_partition:
+        # Update transaction type based on what we found
+        txn_op_type = TransactionOperationType.REPLACE
         if prev_committed_partition.partition_id == partition.partition_id:
-            raise ValueError(
+            raise TableValidationError(
                 f"Partition to commit has the same ID as existing partition: "
                 f"{prev_committed_partition}."
             )
-        # there's a previously committed partition, so update the transaction
-        # type to overwrite the previously committed partition, and add another
-        # transaction operation to replace it with the staged partition
-        txn_type = TransactionType.OVERWRITE
-        txn_ops.append(
+    else:
+        txn_op_type = TransactionOperationType.UPDATE
+
+    # Prepare the committed partition based on the staged partition
+    # Compaction round completion info (if any) is not set on the staged partition,
+    # so we need to save it from the input partition to commit.
+    input_partition_rci = partition.compaction_round_completion_info
+    partition: Partition = Metafile.update_for(prev_staged_partition)
+    partition.state = CommitState.COMMITTED
+    # Restore compaction round completion info (if any) from the input partition.
+    if input_partition_rci is not None:
+        partition.compaction_round_completion_info = input_partition_rci
+
+    # Step 4: Add write operations to the same transaction
+    # Always UPDATE the staged partition to committed state
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=txn_op_type,
+            dest_metafile=partition,
+            src_metafile=prev_staged_partition,
+        ),
+    )
+
+    # If there's a previously committed partition, we need to replace it too
+    if prev_committed_partition:
+        transaction.step(
             TransactionOperation.of(
-                operation_type=TransactionOperationType.UPDATE,
+                operation_type=txn_op_type,
                 dest_metafile=partition,
                 src_metafile=prev_committed_partition,
-            )
+            ),
         )
-    transaction = Transaction.of(
-        txn_type=txn_type,
-        txn_operations=txn_ops,
-    )
-    catalog_properties = get_catalog_properties(**kwargs)
-    transaction.commit(
-        catalog_root_dir=catalog_properties.root,
-        filesystem=catalog_properties.filesystem,
-    )
+
+    if commit_transaction:
+        transaction.seal()
+
     return partition
 
 
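The commit path above no longer builds a one-shot Transaction.of(...).commit(...); it steps its operations into a transaction returned by setup_transaction and seals that transaction only when it created it itself. A minimal runnable sketch of that ownership contract, using a toy stand-in for DeltaCAT's Transaction (setup_transaction's real body is not shown in this diff, so the details below are assumptions):

from typing import Any, List, Optional, Tuple


class ToyTransaction:
    """Stand-in for DeltaCAT's Transaction, only to make this sketch runnable."""

    def __init__(self) -> None:
        self.ops: List[Any] = []
        self.sealed = False

    def step(self, op: Any) -> None:
        self.ops.append(op)

    def seal(self) -> None:
        self.sealed = True


def setup_transaction_sketch(
    transaction: Optional[ToyTransaction],
) -> Tuple[ToyTransaction, bool]:
    # Reuse a caller-provided transaction and report that this call must NOT
    # seal it; otherwise create a fresh one that this call is expected to seal.
    if transaction is not None:
        return transaction, False
    return ToyTransaction(), True


# Usage mirroring commit_partition above: the seal happens only when the
# storage API owns the transaction (i.e., the caller passed transaction=None).
txn, commit_transaction = setup_transaction_sketch(None)
txn.step("UPDATE: staged partition -> COMMITTED")
txn.step("REPLACE: previously committed partition")
if commit_transaction:
    txn.seal()
assert txn.sealed and len(txn.ops) == 2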
@@ -1701,6 +2387,7 @@ def delete_partition(
     partition_values: Optional[PartitionValues] = None,
     partition_scheme_id: Optional[str] = None,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """
@@ -1708,35 +2395,34 @@ def delete_partition(
     values should not be specified for unpartitioned tables. Raises an error
     if the partition does not exist.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
     partition_to_delete = get_partition(
-        *args,
         stream_locator=stream_locator,
         partition_values=partition_values,
         partition_scheme_id=partition_scheme_id,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
     if not partition_to_delete:
-        raise ValueError(
+        raise PartitionNotFoundError(
             f"Partition with values {partition_values} and scheme "
             f"{partition_scheme_id} not found in stream: {stream_locator}"
         )
     else:
         partition_to_delete.state = CommitState.DEPRECATED
-        transaction = Transaction.of(
-            txn_type=TransactionType.DELETE,
-            txn_operations=[
-                TransactionOperation.of(
-                    operation_type=TransactionOperationType.DELETE,
-                    src_metafile=partition_to_delete,
-                )
-            ],
-        )
-        catalog_properties = get_catalog_properties(**kwargs)
-        transaction.commit(
-            catalog_root_dir=catalog_properties.root,
-            filesystem=catalog_properties.filesystem,
+
+        transaction.step(
+            TransactionOperation.of(
+                operation_type=TransactionOperationType.DELETE,
+                dest_metafile=partition_to_delete,
+            ),
         )
 
+    if commit_transaction:
+        transaction.seal()
+
 
 def get_partition_by_id(
     stream_locator: StreamLocator,
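delete_partition now performs a soft delete (the partition is marked DEPRECATED and removed via a DELETE transaction operation) and raises PartitionNotFoundError instead of ValueError when nothing matches. A rough caller-side sketch; the exception import path and the stream_locator variable are assumptions, not shown in this hunk:

# Hypothetical import path; DeltaCAT 2.0 groups these errors in an exceptions module.
from deltacat.exceptions import PartitionNotFoundError

try:
    delete_partition(
        stream_locator,                   # a StreamLocator for the target stream
        partition_values=["2024-01-01"],  # omit for unpartitioned tables
    )
except PartitionNotFoundError:
    # Raised in place of the old ValueError when no committed partition matches.
    pass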
@@ -1755,12 +2441,11 @@ def get_partition_by_id(
         partition_id=partition_id,
     )
     return _latest(
-        *args,
         metafile=Partition.of(
             locator=locator,
-            schema=None,
             content_types=None,
         ),
+        *args,
         **kwargs,
     )
 
@@ -1770,6 +2455,7 @@ def get_partition(
     partition_values: Optional[PartitionValues] = None,
     partition_scheme_id: Optional[str] = None,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> Optional[Partition]:
     """
@@ -1780,35 +2466,124 @@ def get_partition(
     resolves to the table version's current partition scheme by default.
     Raises an error if the given stream locator does not exist.
     """
-    locator = PartitionLocator.of(
-        stream_locator=stream_locator,
-        partition_values=partition_values,
-        partition_id=None,
-    )
-    if not partition_scheme_id:
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    if not partition_scheme_id or not stream_locator.stream_id:
         # resolve latest partition scheme from the current
         # revision of its `deltacat` stream
         stream = get_stream(
-            *args,
             namespace=stream_locator.namespace,
             table_name=stream_locator.table_name,
             table_version=stream_locator.table_version,
+            transaction=transaction,
+            *args,
             **kwargs,
         )
         if not stream:
-            raise ValueError(f"Stream {stream_locator} not found.")
+            raise StreamNotFoundError(f"Stream {stream_locator} not found.")
         partition_scheme_id = stream.partition_scheme.id
-    return _latest(
-        *args,
+        # ensure that we always use a fully qualified stream locator
+        stream_locator = stream.locator
+    locator = PartitionLocator.of(
+        stream_locator=stream_locator,
+        partition_values=partition_values,
+        partition_id=None,
+    )
+    partition = _latest(
         metafile=Partition.of(
             locator=locator,
-            schema=None,
             content_types=None,
             state=CommitState.COMMITTED,
             partition_scheme_id=partition_scheme_id,
         ),
+        transaction=transaction,
+        *args,
+        **kwargs,
+    )
+    if commit_transaction:
+        transaction.seal()
+    return partition
+
+
+def _write_table_slices(
+    table: Union[LocalTable, LocalDataset, DistributedDataset],
+    partition_id: str,
+    max_records_per_entry: Optional[int],
+    table_writer_fn: Callable,
+    table_slicer_fn: Callable,
+    content_type: ContentType = ContentType.PARQUET,
+    entry_params: Optional[EntryParams] = None,
+    entry_type: Optional[EntryType] = EntryType.DATA,
+    table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
+) -> ManifestEntryList:
+    catalog_properties = get_catalog_properties(**kwargs)
+    manifest_entries = ManifestEntryList()
+    # LocalDataset is a special case to upload iteratively
+    tables = [t for t in table] if isinstance(table, list) else [table]
+    filesystem = catalog_properties.filesystem
+    data_dir_path = posixpath.join(
+        catalog_properties.root,
+        DATA_FILE_DIR_NAME,
+        partition_id,
+    )
+    filesystem.create_dir(data_dir_path, recursive=True)
+    for t in tables:
+        manifest_entries.extend(
+            write_sliced_table(
+                t,
+                data_dir_path,
+                filesystem,
+                max_records_per_entry,
+                table_writer_fn,
+                table_slicer_fn,
+                table_writer_kwargs,
+                content_type,
+                entry_params,
+                entry_type,
+            )
+        )
+    return manifest_entries
+
+
+def _write_table(
+    partition_id: str,
+    table: Union[LocalTable, LocalDataset, DistributedDataset],
+    max_records_per_entry: Optional[int] = None,
+    author: Optional[ManifestAuthor] = None,
+    content_type: ContentType = ContentType.PARQUET,
+    entry_params: Optional[EntryParams] = None,
+    entry_type: Optional[EntryType] = EntryType.DATA,
+    write_table_slices_fn: Optional[Callable] = _write_table_slices,
+    table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
+) -> Manifest:
+    """
+    Writes the given table to 1 or more files and returns a
+    Redshift manifest pointing to the uploaded files.
+    """
+    table_writer_fn = get_table_writer(table)
+    table_slicer_fn = get_table_slicer(table)
+
+    manifest_entries = write_table_slices_fn(
+        table,
+        partition_id,
+        max_records_per_entry,
+        table_writer_fn,
+        table_slicer_fn,
+        content_type,
+        entry_params,
+        entry_type,
+        table_writer_kwargs,
         **kwargs,
     )
+    manifest = Manifest.of(
+        entries=manifest_entries,
+        author=author,
+        uuid=str(uuid.uuid4()),
+        entry_type=entry_type,
+        entry_params=entry_params,
+    )
+    return manifest
 
 
 def stage_delta(
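The new _write_table_slices helper writes data files into a per-partition directory under the catalog root and collects one manifest entry per written file; _write_table then wraps those entries in a Manifest. A small runnable illustration of just the directory layout (the root and directory-name literals below are placeholders, not DeltaCAT's actual constants):

import posixpath

catalog_root = "/tmp/deltacat-catalog"   # placeholder for catalog_properties.root
DATA_FILE_DIR_NAME = "data"              # assumed value of the imported constant
partition_id = "a1b2c3d4"                # placeholder partition ID

# Mirrors the posixpath.join(...) call in _write_table_slices above.
data_dir_path = posixpath.join(catalog_root, DATA_FILE_DIR_NAME, partition_id)
print(data_dir_path)  # /tmp/deltacat-catalog/data/a1b2c3d4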
@@ -1818,28 +2593,82 @@ def stage_delta(
     max_records_per_entry: Optional[int] = None,
     author: Optional[ManifestAuthor] = None,
     properties: Optional[DeltaProperties] = None,
-    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    table_writer_kwargs: Optional[Dict[str, Any]] = None,
     content_type: ContentType = ContentType.PARQUET,
     entry_params: Optional[EntryParams] = None,
+    entry_type: Optional[EntryType] = EntryType.DATA,
+    write_table_slices_fn: Optional[Callable] = _write_table_slices,
+    schema: Optional[Schema] = None,
+    sort_scheme_id: Optional[str] = None,
     *args,
     **kwargs,
 ) -> Delta:
     """
-    Writes the given table to 1 or more S3 files. Returns an unregistered
+    Writes the given dataset to 1 or more files. Returns an unregistered
     delta whose manifest entries point to the uploaded files. Applies any
     schema consistency policies configured for the parent table version.
-
-    The partition spec will be used to split the input table into
-    multiple files. Optionally, partition_values can be provided to avoid
-    this method to recompute partition_values from the provided data.
-
-    Raises an error if the provided data does not conform to a unique ordered
-    list of partition_values
     """
-    raise NotImplementedError("stage_delta not implemented")
+    # TODO(pdames): Validate that equality delete entry types either have
+    # entry params specified, or are being added to a table with merge keys.
+    if not partition.is_supported_content_type(content_type):
+        raise TableValidationError(
+            f"Content type {content_type} is not supported by "
+            f"partition: {partition}"
+        )
+    if partition.state == CommitState.DEPRECATED:
+        raise TableValidationError(
+            f"Cannot stage delta to {partition.state} partition: {partition}",
+        )
+    previous_stream_position: Optional[int] = partition.stream_position
+
+    # Handle schema parameter and add to table_writer_kwargs if available
+    table_writer_kwargs = table_writer_kwargs or {}
+
+    # Extract schema_id from the schema if it's a DeltaCAT Schema
+    schema_id = None
+    if isinstance(schema, Schema):
+        schema_id = schema.id
+        table_writer_kwargs["schema_id"] = schema_id
+        # Add PyArrow schema to table_writer_kwargs if not already present
+        if "schema" not in table_writer_kwargs:
+            table_writer_kwargs["schema"] = schema.arrow
+    elif schema is not None and "schema" not in table_writer_kwargs:
+        # For PyArrow schemas or other types, add directly
+        table_writer_kwargs["schema"] = schema
+
+    # Add sort_scheme_id to table_writer_kwargs for manifest entry creation
+    if sort_scheme_id is not None:
+        table_writer_kwargs["sort_scheme_id"] = sort_scheme_id
+
+    manifest: Manifest = _write_table(
+        partition.partition_id,
+        data,
+        max_records_per_entry,
+        author,
+        content_type,
+        entry_params,
+        entry_type,
+        write_table_slices_fn,
+        table_writer_kwargs,
+        **kwargs,
+    )
+    staged_delta: Delta = Delta.of(
+        locator=DeltaLocator.of(partition.locator, None),
+        delta_type=delta_type,
+        meta=manifest.meta,
+        properties=properties,
+        manifest=manifest,
+        previous_stream_position=previous_stream_position,
+    )
+    return staged_delta
 
 
-def commit_delta(delta: Delta, *args, **kwargs) -> Delta:
+def commit_delta(
+    delta: Delta,
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> Delta:
     """
     Registers a new delta with its associated target table version and
     partition. Returns the registered delta. If the delta's previous stream
@@ -1848,7 +2677,72 @@ def commit_delta(delta: Delta, *args, **kwargs) -> Delta:
     stream position is specified, it must be greater than the latest stream
     position in the target partition.
     """
-    raise NotImplementedError("commit_delta not implemented")
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
+    delta: Delta = Metafile.update_for(delta)
+    delta_type: Optional[DeltaType] = delta.type
+    resolved_delta_type = delta_type if delta_type is not None else DeltaType.UPSERT
+    delta.type = resolved_delta_type
+    delta.properties = kwargs.get("properties") or delta.properties
+
+    if delta.partition_id:
+        parent_partition = get_partition_by_id(
+            stream_locator=delta.stream_locator,
+            partition_id=delta.partition_id,
+            transaction=transaction,
+            *args,
+            **kwargs,
+        )
+    else:
+        parent_partition = get_partition(
+            stream_locator=delta.stream_locator,
+            partition_values=delta.partition_values,
+            transaction=transaction,
+            *args,
+            **kwargs,
+        )
+    if not parent_partition:
+        raise PartitionNotFoundError(f"Partition not found: {delta.locator}")
+    # ensure that we always use a fully qualified partition locator
+    delta.locator.partition_locator = parent_partition.locator
+    # resolve the delta's stream position
+    delta.previous_stream_position = parent_partition.stream_position or 0
+    if delta.stream_position is not None:
+        if delta.stream_position <= delta.previous_stream_position:
+            # manually specified delta stream positions must be greater than the
+            # previous stream position
+            raise TableValidationError(
+                f"Delta stream position {delta.stream_position} must be "
+                f"greater than previous stream position "
+                f"{delta.previous_stream_position}"
+            )
+    else:
+        delta.locator.stream_position = delta.previous_stream_position + 1
+
+    # update the parent partition's stream position
+    new_parent_partition: Partition = Metafile.update_for(parent_partition)
+    new_parent_partition.stream_position = delta.locator.stream_position
+
+    # Add operations to the transaction
+    # the 1st operation creates the delta
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=TransactionOperationType.CREATE,
+            dest_metafile=delta,
+        ),
+    )
+    # the 2nd operation alters the stream position of the partition
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=TransactionOperationType.UPDATE,
+            dest_metafile=new_parent_partition,
+            src_metafile=parent_partition,
+        ),
+    )
+
+    if commit_transaction:
+        transaction.seal()
+    return delta
 
 
 def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
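Together, stage_delta and commit_delta now implement the full write path: staging writes the files and returns an unregistered Delta, and committing registers it and advances the parent partition's stream position (previous position + 1 when the caller does not set one explicitly). A rough usage sketch, assuming the module is imported as `storage`, that `partition` is a committed Partition, and that the first two stage_delta parameters are the data and the target partition (their declarations fall outside this hunk):

import pyarrow as pa

# Placeholder data; any local table type supported by get_table_writer works.
table = pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]})

staged_delta = storage.stage_delta(table, partition)
committed_delta = storage.commit_delta(staged_delta)

# With no explicit stream position on the staged delta, commit_delta assigns
# parent_partition.stream_position + 1 (or 1 for a never-written partition).
print(committed_delta.stream_position)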
@@ -1857,8 +2751,8 @@ def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
     None if the given namespace does not exist.
     """
     return _latest(
-        *args,
         metafile=Namespace.of(NamespaceLocator.of(namespace)),
+        *args,
         **kwargs,
     )
 
@@ -1868,33 +2762,43 @@ def namespace_exists(namespace: str, *args, **kwargs) -> bool:
     Returns True if the given table namespace exists, False if not.
     """
     return _exists(
-        *args,
         metafile=Namespace.of(NamespaceLocator.of(namespace)),
+        *args,
         **kwargs,
     )
 
 
-def get_table(namespace: str, table_name: str, *args, **kwargs) -> Optional[Table]:
+def get_table(
+    namespace: str,
+    table_name: str,
+    *args,
+    **kwargs,
+) -> Optional[Table]:
     """
     Gets table metadata for the specified table. Returns None if the given
     table does not exist.
     """
     locator = TableLocator.at(namespace=namespace, table_name=table_name)
     return _latest(
-        *args,
         metafile=Table.of(locator=locator),
+        *args,
         **kwargs,
     )
 
 
-def table_exists(namespace: str, table_name: str, *args, **kwargs) -> bool:
+def table_exists(
+    namespace: str,
+    table_name: str,
+    *args,
+    **kwargs,
+) -> bool:
     """
     Returns True if the given table exists, False if not.
     """
     locator = TableLocator.at(namespace=namespace, table_name=table_name)
     return _exists(
-        *args,
         metafile=Table.of(locator=locator),
+        *args,
         **kwargs,
     )
 
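These signature changes are cosmetic (parameters split across lines, *args forwarded after the keyword arguments), so existing callers are unaffected. A trivial read-path sketch with placeholder names:

# Placeholder namespace and table names.
if table_exists(namespace="analytics", table_name="events"):
    events_table = get_table(namespace="analytics", table_name="events")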
@@ -1920,67 +2824,87 @@ def get_table_version(
         schema=None,
     )
     return _latest(
-        *args,
         metafile=table_version,
+        *args,
         **kwargs,
     )
 
 
 def get_latest_table_version(
-    namespace: str, table_name: str, *args, **kwargs
+    namespace: str,
+    table_name: str,
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> Optional[TableVersion]:
     """
     Gets table version metadata for the latest version of the specified table.
     Returns None if no table version exists for the given table. Raises
     an error if the given table doesn't exist.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
     table_version_id = _resolve_latest_table_version_id(
-        *args,
         namespace=namespace,
         table_name=table_name,
         fail_if_no_active_table_version=False,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
 
-    return (
+    table_version = (
         get_table_version(
-            *args,
             namespace=namespace,
             table_name=table_name,
             table_version=table_version_id,
+            transaction=transaction,
+            *args,
             **kwargs,
         )
         if table_version_id
         else None
    )
+    if commit_transaction:
+        transaction.seal()
+    return table_version
 
 
 def get_latest_active_table_version(
-    namespace: str, table_name: str, *args, **kwargs
+    namespace: str,
+    table_name: str,
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> Optional[TableVersion]:
     """
     Gets table version metadata for the latest active version of the specified
     table. Returns None if no active table version exists for the given table.
     Raises an error if the given table doesn't exist.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
     table_version_id = _resolve_latest_active_table_version_id(
-        *args,
         namespace=namespace,
         table_name=table_name,
         fail_if_no_active_table_version=False,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
-    return (
+    table_version = (
         get_table_version(
-            *args,
             namespace=namespace,
             table_name=table_name,
             table_version=table_version_id,
+            transaction=transaction,
+            *args,
             **kwargs,
         )
         if table_version_id
         else None
    )
+    if commit_transaction:
+        transaction.seal()
+    return table_version
 
 
 def get_table_version_column_names(
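Both lookups can now run inside a caller-managed transaction, which keeps the version-id resolution and the follow-up metadata read consistent. A rough sketch, assuming `txn` is an already-open interactive Transaction (how one is created falls outside this hunk):

latest_tv = get_latest_table_version(
    namespace="my_namespace",
    table_name="my_table",
    transaction=txn,
)
active_tv = get_latest_active_table_version(
    namespace="my_namespace",
    table_name="my_table",
    transaction=txn,
)
# Neither call seals txn (commit_transaction is presumably False when a
# transaction is passed in); the caller seals it once all reads are done.
txn.seal()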
@@ -2002,6 +2926,8 @@ def get_table_version_column_names(
         namespace=namespace,
         table_name=table_name,
         table_version=table_version,
+        *args,
+        **kwargs,
     )
     return schema.arrow.names if schema else None
 
@@ -2018,7 +2944,7 @@ def get_table_version_schema(
     table version if none is specified. Returns None if the table version is
     schemaless. Raises an error if the table version does not exist.
     """
-    table_version = (
+    table_version_meta = (
         get_table_version(
             *args,
             namespace=namespace,
@@ -2034,7 +2960,7 @@ def get_table_version_schema(
             **kwargs,
         )
     )
-    return table_version.schema
+    return table_version_meta.schema
 
 
 def table_version_exists(
@@ -2065,13 +2991,40 @@ def table_version_exists(
 
 def can_categorize(e: BaseException, *args, **kwargs) -> bool:
     """
-    Return whether input error is from storage implementation layer.
+    True if the input error originated from the storage
+    implementation layer and can be categorized under an
+    existing DeltaCatError. The "categorize_errors" decorator
+    uses this to determine if an unknown error from the storage
+    implementation can be categorized prior to casting it to
+    the equivalent DeltaCatError via `raise_categorized_error`
     """
-    raise NotImplementedError
+
+    # DeltaCAT native storage can only categorize DeltaCatError
+    # (i.e., this is effectively a no-op for native storage)
+    if isinstance(e, DeltaCatError):
+        return True
+    else:
+        return False
 
 
 def raise_categorized_error(e: BaseException, *args, **kwargs):
     """
-    Raise and handle storage implementation layer specific errors.
-    """
-    raise NotImplementedError
+    Casts a categorizable error that originated from the storage
+    implementation layer to its equivalent DeltaCatError
+    for uniform handling (e.g., determining whether an error
+    is retryable or not) via the "categorize_errors" decorator.
+    Raises an UnclassifiedDeltaCatError from the input exception
+    if the error cannot be categorized.
+    """
+
+    # DeltaCAT native storage can only categorize DeltaCatError
+    # (i.e., this is effectively a no-op for native storage)
+    logger.info(f"Categorizing exception: {e}")
+    categorized = None
+    if isinstance(categorized, DeltaCatError):
+        raise categorized from e
+
+    logger.warning(f"Could not classify {type(e).__name__}: {e}")
+    raise UnclassifiedDeltaCatError(
+        f"Failed to classify error {type(e).__name__}: {e}"
+    ) from e
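can_categorize and raise_categorized_error are the two hooks the categorize_errors decorator consults when a storage call fails. A hypothetical sketch of how such a decorator could wire them together; the real decorator lives elsewhere in DeltaCAT and may differ:

import functools


def categorize_errors_sketch(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except BaseException as e:
            if can_categorize(e):
                # Re-raise as the equivalent DeltaCatError (or as an
                # UnclassifiedDeltaCatError if it cannot be mapped).
                raise_categorized_error(e)
            raise  # not a storage-layer error; propagate unchanged
    return wrapper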