deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
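The detailed hunks below appear to come from deltacat/catalog/main/impl.py (entry 11 above). Note also that the rivulet and Iceberg modules moved under deltacat.experimental in this release, so downstream imports need to be updated. A minimal sketch of the path migration, assuming only the package paths (not the module contents) changed:

import deltacat.experimental.storage.rivulet.dataset as rivulet_dataset  # was deltacat.storage.rivulet.dataset in 2.0
import deltacat.experimental.catalog.iceberg.impl as iceberg_catalog  # was deltacat.catalog.iceberg.impl in 2.0
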
@@ -1,74 +1,263 @@
- from typing import Any, Dict, List, Optional, Union, Tuple
+ from typing import Any, Dict, List, Optional, Union, Tuple, Set
  import logging
+ from collections import defaultdict

- from deltacat.catalog import CatalogProperties
+ import numpy as np
+ import pyarrow as pa
+ import pandas as pd
+ import daft
+ import deltacat as dc
+
+ from deltacat.storage.model.manifest import ManifestAuthor
+ from deltacat.catalog.model.properties import CatalogProperties
  from deltacat.exceptions import (
  NamespaceAlreadyExistsError,
- StreamNotFoundError,
  TableAlreadyExistsError,
  TableVersionNotFoundError,
+ TableNotFoundError,
+ TableVersionAlreadyExistsError,
+ TableValidationError,
+ SchemaValidationError,
  )
  from deltacat.catalog.model.table_definition import TableDefinition
  from deltacat.storage.model.sort_key import SortScheme
  from deltacat.storage.model.list_result import ListResult
  from deltacat.storage.model.namespace import Namespace, NamespaceProperties
- from deltacat.storage.model.schema import Schema
+ from deltacat.storage.model.schema import (
+ Schema,
+ SchemaUpdate,
+ )
  from deltacat.storage.model.table import TableProperties, Table
  from deltacat.storage.model.types import (
- DistributedDataset,
+ Dataset,
  LifecycleState,
- LocalDataset,
- LocalTable,
  StreamFormat,
+ SchemaConsistencyType,
  )
  from deltacat.storage.model.partition import (
  Partition,
  PartitionLocator,
  PartitionScheme,
  )
- from deltacat.storage.model.table_version import TableVersion
- from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams
- from deltacat.storage.model.delta import DeltaType
- from deltacat.types.media import ContentType, TableType, DistributedDatasetType
- from deltacat.types.tables import TableWriteMode
- from deltacat.compute.merge_on_read import MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE
+ from deltacat.storage.model.table_version import (
+ TableVersion,
+ TableVersionProperties,
+ )
+ from deltacat.storage.model.types import DeltaType
+ from deltacat.storage import Delta
+ from deltacat.storage.model.types import CommitState
+ from deltacat.storage.model.transaction import (
+ Transaction,
+ setup_transaction,
+ )
+ from deltacat.types.media import (
+ ContentType,
+ DatasetType,
+ StorageType,
+ SCHEMA_CONTENT_TYPES,
+ )
+ from deltacat.types.tables import (
+ SchemaEvolutionMode,
+ TableProperty,
+ TablePropertyDefaultValues,
+ TableReadOptimizationLevel,
+ TableWriteMode,
+ get_dataset_type,
+ get_table_schema,
+ get_table_column_names,
+ from_pyarrow,
+ concat_tables,
+ empty_table,
+ infer_table_schema,
+ to_pandas,
+ )
+ from deltacat.utils import pyarrow as pa_utils
+ from deltacat.utils.reader_compatibility_mapping import get_compatible_readers
+ from deltacat.utils.pyarrow import get_base_arrow_type_name
  from deltacat import logs
  from deltacat.constants import DEFAULT_NAMESPACE
- from deltacat.storage import metastore as storage_impl

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

  """
- This is the default implementation for the Catalog interface, using DeltaCAT native storage
+ Default Catalog interface implementation using DeltaCAT native storage.

- Note that, when this catalog implementation gets called through the normal pattern of `delegate.py`, all functions
- will be called the kwarg "inner" equal to the `CatalogProperties` this was initialized with.
+ The functions here should not be invoked directly, but should instead be
+ invoked through `delegate.py` (e.g., to support passing catalog's by name, and
+ to ensure that each initialized `Catalog` implementation has its `inner`
+ property set to the `CatalogProperties` returned from `initialize()`).

- `CatalogProperties` has all state required to implement catalog functions, such as metastore root URI
+ The `CatalogProperties` instance returned by `initialize()` contains all
+ durable state required to deterministically reconstruct the associated DeltaCAT
+ native `Catalog` implementation (e.g., the root URI for the catalog metastore).
  """


  # catalog functions
- def initialize(config: CatalogProperties = None, *args, **kwargs) -> CatalogProperties:
+ def initialize(
+ config: Optional[CatalogProperties] = None,
+ *args,
+ **kwargs,
+ ) -> CatalogProperties:
  """
- Initializes the data catalog with the given arguments.
+ Performs any required one-time initialization and validation of this
+ catalog implementation based on the input configuration. If no config
+ instance is given, a new `CatalogProperties` instance is constructed
+ using the given keyword arguments.

- returns CatalogProperties as the "inner" state value for a DC native catalog
+ Returns the input config if given, and the newly created config otherwise.
  """
  if config is not None:
+ if not isinstance(config, CatalogProperties):
+ raise ValueError(
+ f"Expected `CatalogProperties` but found `{type(config)}`."
+ )
  return config
  else:
  return CatalogProperties(*args, **kwargs)
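
For reference, a minimal sketch of the initialize() contract shown above (normally exercised through delegate.py rather than called directly; the "root" keyword below is only an assumed, illustrative CatalogProperties constructor argument):

from deltacat.catalog.model.properties import CatalogProperties

props = CatalogProperties(root="/tmp/deltacat")  # "root" is an assumed/illustrative kwarg
assert initialize(config=props) is props  # an existing config is returned unchanged
new_props = initialize(root="/tmp/deltacat")  # kwargs are forwarded to CatalogProperties(...)
# initialize(config="not-a-config")  # raises ValueError: not a CatalogProperties instance
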


  # table functions
+ def _validate_write_mode_and_table_existence(
+ table: str,
+ namespace: str,
+ mode: TableWriteMode,
+ **kwargs,
+ ) -> bool:
+ """Validate write mode against table existence and return whether table exists."""
+ table_exists_flag = table_exists(
+ table,
+ namespace=namespace,
+ **kwargs,
+ )
+ logger.info(f"Table to write to ({namespace}.{table}) exists: {table_exists_flag}")
+
+ if mode == TableWriteMode.CREATE and table_exists_flag:
+ raise ValueError(
+ f"Table {namespace}.{table} already exists and mode is CREATE."
+ )
+ elif (
+ mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
+ and not table_exists_flag
+ ):
+ raise TableNotFoundError(
+ f"Table {namespace}.{table} does not exist and mode is {mode.value.upper() if hasattr(mode, 'value') else str(mode).upper()}. Use CREATE or AUTO mode to create a new table."
+ )
+
+ return table_exists_flag
+
+
+ def _get_table_and_validate_write_mode(
+ table: str,
+ namespace: str,
+ table_version: Optional[str],
+ mode: TableWriteMode,
+ **kwargs,
+ ) -> Tuple[bool, TableDefinition]:
+ """Validate write mode against table and table version existence.
+
+ Returns:
+ Tuple of (table_exists_flag, table_definition)
+ """
+ # First validate table, table version, and stream existence
+ existing_table_def = get_table(
+ table,
+ namespace=namespace,
+ table_version=table_version,
+ **kwargs,
+ )
+ table_exists_flag = (
+ existing_table_def is not None
+ and existing_table_def.table_version
+ and existing_table_def.stream
+ )
+ logger.info(f"Table to write to ({namespace}.{table}) exists: {table_exists_flag}")
+
+ # Then validate table existence constraints
+ if mode == TableWriteMode.CREATE and table_exists_flag and table_version is None:
+ raise TableAlreadyExistsError(
+ f"Table {namespace}.{table} already exists and mode is CREATE."
+ )
+ elif (
+ mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
+ and existing_table_def is None
+ ):
+ raise TableNotFoundError(
+ f"Table {namespace}.{table} does not exist and write mode is {mode}. Use CREATE or AUTO mode to create a new table."
+ )
+
+ # Then validate table version existence constraints
+ if table_version is not None and table_exists_flag:
+ if mode == TableWriteMode.CREATE:
+ raise TableVersionAlreadyExistsError(
+ f"Table version {namespace}.{table}.{table_version} already exists and mode is CREATE."
+ )
+ logger.info(f"Table version ({namespace}.{table}.{table_version}) exists.")
+ elif (
+ mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
+ and table_version is not None
+ and not table_exists_flag
+ ):
+ raise TableVersionNotFoundError(
+ f"Table version {namespace}.{table}.{table_version} does not exist and write mode is {mode}. "
+ f"Use CREATE or AUTO mode to create a new table version, or omit table_version "
+ f"to use the latest version."
+ )
+ return table_exists_flag, existing_table_def
+
+
+ def _validate_content_type_against_supported_content_types(
+ namespace: str,
+ table: str,
+ content_type: ContentType,
+ supported_content_types: Optional[List[ContentType]],
+ ) -> None:
+ if supported_content_types and content_type not in supported_content_types:
+ raise ValueError(
+ f"Content type proposed for write to table {namespace}.{table} ({content_type}) "
+ f"conflicts with the proposed list of new supported content types: {supported_content_types}"
+ )
+
+
+ def _create_table_for_write(
+ data: Dataset,
+ table: str,
+ namespace: str,
+ table_version: Optional[str],
+ content_type: ContentType,
+ existing_table_definition: Optional[TableDefinition],
+ *args,
+ **kwargs,
+ ) -> TableDefinition:
+ """Creates a new table, table version, and/or stream in preparation for a write operation."""
+ if "schema" not in kwargs:
+ kwargs["schema"] = infer_table_schema(data)
+
+ _validate_content_type_against_supported_content_types(
+ namespace,
+ table,
+ content_type,
+ kwargs.get("content_types"),
+ )
+ return create_table(
+ table,
+ namespace=namespace,
+ table_version=table_version,
+ existing_table_definition=existing_table_definition,
+ *args,
+ **kwargs,
+ )
+
+
  def write_to_table(
- data: Union[LocalTable, LocalDataset, DistributedDataset], # type: ignore
+ data: Dataset,
  table: str,
  *args,
  namespace: Optional[str] = None,
+ table_version: Optional[str] = None,
  mode: TableWriteMode = TableWriteMode.AUTO,
  content_type: ContentType = ContentType.PARQUET,
+ transaction: Optional[Transaction] = None,
  **kwargs,
  ) -> None:
  """Write local or distributed data to a table. Raises an error if the
@@ -77,79 +266,1137 @@ def write_to_table(
  When creating a table, all `create_table` parameters may be optionally
  specified as additional keyword arguments. When appending to, or replacing,
  an existing table, all `alter_table` parameters may be optionally specified
- as additional keyword arguments."""
- raise NotImplementedError("write_to_table not implemented")
+ as additional keyword arguments.

+ Args:
+ data: Local or distributed data to write to the table.
+ table: Name of the table to write to.
+ namespace: Optional namespace for the table. Uses default if not specified.
+ table_version: Optional version of the table to write to. If specified,
+ will create this version if it doesn't exist (in CREATE mode) or
+ get this version if it exists (in other modes). If not specified,
+ uses the latest version.
+ mode: Write mode (AUTO, CREATE, APPEND, REPLACE, MERGE, DELETE).
+ content_type: Content type used to write the data files. Defaults to PARQUET.
+ transaction: Optional transaction to append write operations to instead of
+ creating and committing a new transaction.
+ **kwargs: Additional keyword arguments.
+ """
+ namespace = namespace or default_namespace()

- def read_table(
+ # Set up transaction handling
+ write_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+ kwargs["transaction"] = write_transaction
+
+ try:
+ # Validate write mode and table/table version/stream existence
+ (table_exists_flag, table_definition,) = _get_table_and_validate_write_mode(
+ table,
+ namespace,
+ table_version,
+ mode,
+ **kwargs,
+ )
+
+ # Get or create table, table version, and/or stream
+ if not table_exists_flag:
+ table_definition = _create_table_for_write(
+ data,
+ table,
+ namespace,
+ table_version,
+ content_type,
+ table_definition,
+ *args,
+ **kwargs,
+ )
+ else:
+ # call alter_table if there are any alter_table kwargs provided
+ if (
+ "lifecycle_state" in kwargs
+ or "schema_updates" in kwargs
+ or "partition_updates" in kwargs
+ or "sort_scheme" in kwargs
+ or "table_description" in kwargs
+ or "table_version_description" in kwargs
+ or "table_properties" in kwargs
+ or "table_version_properties" in kwargs
+ ):
+ alter_table(
+ table,
+ namespace=namespace,
+ table_version=table_version,
+ *args,
+ **kwargs,
+ )
+
+ # Get the active table version and stream
+ table_version_obj = _get_latest_active_or_given_table_version(
+ namespace=table_definition.table.namespace,
+ table_name=table_definition.table.table_name,
+ table_version=table_version or table_definition.table_version.table_version,
+ **kwargs,
+ )
+
+ # Validate schema compatibility for schemaless content types with schema tables
+ if (
+ content_type.value not in SCHEMA_CONTENT_TYPES
+ and table_version_obj.schema is not None
+ ):
+ schemaless_types = {
+ ct for ct in ContentType if ct.value not in SCHEMA_CONTENT_TYPES
+ }
+ raise TableValidationError(
+ f"Content type '{content_type.value}' cannot be written to a table with a schema. "
+ f"Table '{namespace}.{table}' has a schema, but content type '{content_type.value}' "
+ f"is schemaless. Schemaless content types ({', '.join(sorted([ct.value for ct in schemaless_types]))}) "
+ f"can only be written to schemaless tables."
+ )
+
+ # Handle different write modes and get stream and delta type
+ stream, delta_type = _handle_write_mode(
+ mode,
+ table_definition,
+ table_version_obj,
+ namespace,
+ table,
+ **kwargs,
+ )
+
+ if not stream:
+ raise ValueError(f"No default stream found for table {namespace}.{table}")
+
+ # Automatically set entry_params for DELETE/MERGE modes if not provided
+ _set_entry_params_if_needed(
+ mode,
+ table_version_obj,
+ kwargs,
+ )
+
+ # Validate table configuration
+ _validate_table_configuration(
+ stream,
+ table_version_obj,
+ namespace,
+ table,
+ )
+
+ # Handle partition creation/retrieval
+ partition, commit_staged_partition = _handle_partition_creation(
+ mode,
+ table_exists_flag,
+ delta_type,
+ stream,
+ **kwargs,
+ )
+
+ # Get table properties for schema evolution
+ schema_evolution_mode = table_version_obj.read_table_property(
+ TableProperty.SCHEMA_EVOLUTION_MODE
+ )
+ default_schema_consistency_type = table_version_obj.read_table_property(
+ TableProperty.DEFAULT_SCHEMA_CONSISTENCY_TYPE
+ )
+
+ # Convert unsupported dataset types and NumPy arrays that need schema validation
+ if isinstance(data, np.ndarray) and table_version_obj.schema is not None:
+ # NumPy arrays need conversion to Pandas for proper column naming in schema validation
+ converted_data = _convert_numpy_for_schema_validation(
+ data, table_version_obj.schema
+ )
+ else:
+ # Convert other unsupported dataset types (e.g., Daft) or keep NumPy as-is for schemaless tables
+ converted_data = _convert_data_if_needed(data)
+
+ # Capture original field set before schema coercion for partial UPSERT support
+ original_fields = set(get_table_column_names(converted_data))
+
+ # Validate and coerce data against schema
+ # This ensures proper schema evolution and type handling
+ (
+ validated_data,
+ schema_modified,
+ updated_schema,
+ ) = _validate_and_coerce_data_against_schema(
+ converted_data, # Use converted data for NumPy, original for others
+ table_version_obj.schema,
+ schema_evolution_mode=schema_evolution_mode,
+ default_schema_consistency_type=default_schema_consistency_type,
+ )
+
+ # Convert validated data to supported format for storage if needed
+ converted_data = _convert_data_if_needed(validated_data)
+
+ # Validate reader compatibility against supported reader types
+ supported_reader_types = table_version_obj.read_table_property(
+ TableProperty.SUPPORTED_READER_TYPES
+ )
+ _validate_reader_compatibility(
+ converted_data,
+ content_type,
+ supported_reader_types,
+ )
+
+ # Update table version if schema was modified during evolution
+ if schema_modified:
+ # Extract catalog properties and filter kwargs
+ catalog_kwargs = {
+ "catalog": kwargs.get("catalog"),
+ "inner": kwargs.get("inner"),
+ "transaction": write_transaction, # Pass transaction to update_table_version
+ }
+
+ _get_storage(**catalog_kwargs).update_table_version(
+ namespace=namespace,
+ table_name=table,
+ table_version=table_version_obj.table_version,
+ schema=updated_schema,
+ **catalog_kwargs,
+ )
+
+ # Stage and commit delta, handle compaction
+ # Remove schema from kwargs to avoid duplicate parameter conflict
+ filtered_kwargs = {k: v for k, v in kwargs.items() if k != "schema"}
+ # Use updated schema if schema evolution occurred, otherwise use original schema
+ _stage_commit_and_compact(
+ converted_data,
+ partition,
+ delta_type,
+ content_type,
+ commit_staged_partition,
+ table_version_obj,
+ namespace,
+ table,
+ schema=updated_schema if schema_modified else table_version_obj.schema,
+ original_fields=original_fields,
+ **filtered_kwargs,
+ )
+ except Exception as e:
+ # If any error occurs, the transaction remains uncommitted
+ commit_transaction = False
+ logger.error(f"Error during write_to_table: {e}")
+ raise
+ finally:
+ if commit_transaction:
+ # Seal the interactive transaction to commit all operations atomically
+ write_transaction.seal()
+
+
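
A minimal usage sketch of the new write path above, assuming the top-level deltacat API forwards to this implementation through delegate.py (the table and namespace names are illustrative):

import pandas as pd
import deltacat as dc
from deltacat.types.tables import TableWriteMode
from deltacat.types.media import ContentType

df = pd.DataFrame({"id": [1, 2], "value": ["a", "b"]})

# AUTO mode creates the table on the first write; later writes append or merge
# depending on whether the table schema declares merge keys.
dc.write_to_table(
    df,
    "example_table",
    namespace="example_namespace",
    mode=TableWriteMode.AUTO,
    content_type=ContentType.PARQUET,
)
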
+ def _handle_write_mode(
+ mode: TableWriteMode,
+ table_definition: TableDefinition,
+ table_version_obj: TableVersion,
+ namespace: str,
  table: str,
- *args,
- namespace: Optional[str] = None,
- table_version: Optional[str] = None,
- table_type: Optional[TableType] = TableType.PYARROW,
- distributed_dataset_type: Optional[
- DistributedDatasetType
- ] = DistributedDatasetType.RAY_DATASET,
- partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
- stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
- merge_on_read: Optional[bool] = False,
- reader_kwargs: Optional[Dict[Any, Any]] = None,
  **kwargs,
- ) -> DistributedDataset: # type: ignore
- """Read a table into a distributed dataset."""
+ ) -> Tuple[Any, DeltaType]: # Using Any for stream type to avoid complex imports
+ """Handle different write modes and return appropriate stream and delta type."""
+ table_schema = table_definition.table_version.schema
+
+ if mode == TableWriteMode.REPLACE:
+ return _handle_replace_mode(
+ table_schema,
+ namespace,
+ table,
+ table_version_obj,
+ **kwargs,
+ )
+ elif mode == TableWriteMode.APPEND:
+ return _handle_append_mode(
+ table_schema,
+ namespace,
+ table,
+ table_version_obj,
+ **kwargs,
+ )
+ elif mode in (TableWriteMode.MERGE, TableWriteMode.DELETE):
+ return _handle_merge_delete_mode(
+ mode,
+ table_schema,
+ namespace,
+ table,
+ table_version_obj,
+ **kwargs,
+ )
+ else:
+ # AUTO and CREATE modes
+ return _handle_auto_create_mode(
+ table_schema,
+ namespace,
+ table,
+ table_version_obj,
+ **kwargs,
+ )

- if reader_kwargs is None:
- reader_kwargs = {}

- _validate_read_table_args(
+ def _handle_replace_mode(
+ table_schema,
+ namespace: str,
+ table: str,
+ table_version_obj: TableVersion,
+ **kwargs,
+ ) -> Tuple[Any, DeltaType]:
+ """Handle REPLACE mode by staging and committing a new stream."""
+ stream = _get_storage(**kwargs).stage_stream(
  namespace=namespace,
- table_type=table_type,
- distributed_dataset_type=distributed_dataset_type,
- merge_on_read=merge_on_read,
+ table_name=table,
+ table_version=table_version_obj.table_version,
+ **kwargs,
+ )
+
+ stream = _get_storage(**kwargs).commit_stream(stream=stream, **kwargs)
+ delta_type = (
+ DeltaType.UPSERT
+ if table_schema and table_schema.merge_keys
+ else DeltaType.APPEND
+ )
+ return stream, delta_type
+
+
+ def _handle_append_mode(
+ table_schema,
+ namespace: str,
+ table: str,
+ table_version_obj: TableVersion,
+ **kwargs,
+ ) -> Tuple[Any, DeltaType]:
+ """Handle APPEND mode by validating no merge keys and getting existing stream."""
+ if table_schema and table_schema.merge_keys:
+ raise SchemaValidationError(
+ f"APPEND mode cannot be used with tables that have merge keys. "
+ f"Table {namespace}.{table} has merge keys: {table_schema.merge_keys}. "
+ f"Use MERGE mode instead."
+ )
+
+ stream = _get_table_stream(
+ namespace,
+ table,
+ table_version_obj.table_version,
+ **kwargs,
+ )
+ return stream, DeltaType.APPEND
+
+
+ def _handle_merge_delete_mode(
+ mode: TableWriteMode,
+ table_schema,
+ namespace: str,
+ table: str,
+ table_version_obj: TableVersion,
+ **kwargs,
+ ) -> Tuple[Any, DeltaType]:
+ """Handle MERGE/DELETE modes by validating merge keys and getting existing stream."""
+ if not table_schema or not table_schema.merge_keys:
+ raise TableValidationError(
+ f"{mode.value.upper() if hasattr(mode, 'value') else str(mode).upper()} mode requires tables to have at least one merge key. "
+ f"Table {namespace}.{table}.{table_version_obj.table_version} has no merge keys. "
+ f"Use APPEND, AUTO, or REPLACE mode instead."
+ )
+
+ stream = _get_table_stream(
+ namespace,
+ table,
+ table_version_obj.table_version,
+ **kwargs,
+ )
+ delta_type = DeltaType.UPSERT if mode == TableWriteMode.MERGE else DeltaType.DELETE
+ return stream, delta_type
+
+
+ def _handle_auto_create_mode(
+ table_schema,
+ namespace: str,
+ table: str,
+ table_version_obj: TableVersion,
+ **kwargs,
+ ) -> Tuple[Any, DeltaType]:
+ """Handle AUTO and CREATE modes by getting existing stream."""
+ stream = _get_table_stream(
+ namespace,
+ table,
+ table_version_obj.table_version,
+ **kwargs,
+ )
+ delta_type = (
+ DeltaType.UPSERT
+ if table_schema and table_schema.merge_keys
+ else DeltaType.APPEND
+ )
+ return stream, delta_type
+
+
+ def _validate_table_configuration(
+ stream,
+ table_version_obj: TableVersion,
+ namespace: str,
+ table: str,
+ ) -> None:
+ """Validate table configuration for unsupported features."""
+ # Check if table is partitioned
+ if (
+ stream.partition_scheme
+ and stream.partition_scheme.keys is not None
+ and len(stream.partition_scheme.keys) > 0
+ ):
+ raise NotImplementedError(
+ f"write_to_table does not yet support partitioned tables. "
+ f"Table {namespace}.{table} has partition scheme with "
+ f"{len(stream.partition_scheme.keys)} partition key(s): "
+ f"{[key.name or key.key[0] for key in stream.partition_scheme.keys]}. "
+ f"Please use the lower-level metastore API for partitioned tables."
+ )
+
+ # Check if table has sort keys
+ if (
+ table_version_obj.sort_scheme
+ and table_version_obj.sort_scheme.keys is not None
+ and len(table_version_obj.sort_scheme.keys) > 0
+ ):
+ raise NotImplementedError(
+ f"write_to_table does not yet support tables with sort keys. "
+ f"Table {namespace}.{table} has sort scheme with "
+ f"{len(table_version_obj.sort_scheme.keys)} sort key(s): "
+ f"{[key.key[0] for key in table_version_obj.sort_scheme.keys]}. "
+ f"Please use the lower-level metastore API for sorted tables."
+ )
+
+
+ def _handle_partition_creation(
+ mode: TableWriteMode,
+ table_exists_flag: bool,
+ delta_type: DeltaType,
+ stream,
+ **kwargs,
+ ) -> Tuple[Any, bool]: # partition, commit_staged_partition
+ """Handle partition creation/retrieval based on write mode."""
+ if mode == TableWriteMode.REPLACE or not table_exists_flag:
+ # REPLACE mode or new table: Stage a new partition
+ partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
+ # If we're doing UPSERT/DELETE operations, let compaction handle the commit
+ commit_staged_partition = delta_type not in (DeltaType.UPSERT, DeltaType.DELETE)
+ return partition, commit_staged_partition
+ elif delta_type in (DeltaType.UPSERT, DeltaType.DELETE):
+ # UPSERT/DELETE operations: Try to use existing committed partition first
+ partition = _get_storage(**kwargs).get_partition(
+ stream_locator=stream.locator,
+ partition_values=None,
+ **kwargs,
+ )
+ commit_staged_partition = False
+
+ if not partition:
+ # No existing committed partition found, stage a new one
+ partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
+ commit_staged_partition = False # Let compaction handle the commit
+
+ return partition, commit_staged_partition
+ else:
+ # APPEND mode on existing table: Get existing partition
+ partition = _get_storage(**kwargs).get_partition(
+ stream_locator=stream.locator,
+ partition_values=None,
+ **kwargs,
+ )
+ commit_staged_partition = False
+
+ if not partition:
+ # No existing partition found, create a new one
+ partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
+ commit_staged_partition = True
+
+ return partition, commit_staged_partition
+
+
+ def _convert_numpy_for_schema_validation(
+ data: np.ndarray, schema: Optional[Schema]
+ ) -> Dataset:
+ """Convert NumPy array to Pandas DataFrame with proper column names for schema validation.
+
+ Args:
+ data: NumPy array to convert
+ schema: DeltaCAT Schema object for column naming
+
+ Returns:
+ Pandas DataFrame with proper column names matching schema
+
+ Raises:
+ ValueError: If array has more columns than schema or schema is invalid
+ """
+ if not isinstance(schema, Schema) or not schema.arrow:
+ raise ValueError(
+ f"Expected DeltaCAT schema for Numpy schema validation, but found: {schema}"
+ )
+
+ # Use schema subset matching NumPy array dimensions
+ arrow_schema = schema.arrow
+ num_cols = data.shape[1] if data.ndim > 1 else 1
+
+ if len(arrow_schema) >= num_cols:
+ # Use the first N columns from the schema to match data dimensions
+ subset_fields = [arrow_schema.field(i) for i in range(num_cols)]
+ subset_schema = pa.schema(subset_fields)
+ return to_pandas(data, schema=subset_schema)
+ else:
+ raise ValueError(
+ f"NumPy array has {num_cols} columns but table schema only has {len(arrow_schema)} columns. "
+ f"Cannot write NumPy data with more columns than the table schema supports."
+ )
+
+
+ def _build_entry_index_to_schema_mapping(
+ qualified_deltas: List[Delta], table_version_obj, **kwargs
+ ) -> List[Schema]:
+ """Build a mapping from manifest entry index to schema for reading operations.
+
+ Args:
+ qualified_deltas: List of deltas to process
+ table_version_obj: Table version containing schemas
+ **kwargs: Additional arguments passed to storage operations
+
+ Returns:
+ List mapping each manifest entry index to its corresponding schema
+
+ Raises:
+ ValueError: If a manifest's schema ID is not found in table version schemas
+ """
+ entry_index_to_schema = []
+ for delta in qualified_deltas:
+ if delta.manifest:
+ manifest = delta.manifest
+ else:
+ # Fetch manifest from storage
+ manifest = _get_storage(**kwargs).get_delta_manifest(
+ delta.locator,
+ **kwargs,
+ )
+ # Map manifest entry index to schema ID
+ schema_id = manifest.meta.schema_id
+
+ # Find the schema that matches this manifest's schema_id
+ matching_schema = None
+ if table_version_obj.schemas:
+ for schema in table_version_obj.schemas:
+ if schema.id == schema_id:
+ matching_schema = schema
+ break
+
+ if matching_schema is None:
+ available_schema_ids = (
+ [s.id for s in table_version_obj.schemas]
+ if table_version_obj.schemas
+ else []
+ )
+ raise ValueError(
+ f"Manifest schema ID {schema_id} not found in table version schemas. "
+ f"Available schema IDs: {available_schema_ids}. "
+ )
+
+ # Add the matching schema for each entry in this manifest
+ for _ in range(len(manifest.entries)):
+ entry_index_to_schema.append(matching_schema)
+
+ return entry_index_to_schema
+
+
+ def _convert_data_if_needed(data: Dataset) -> Dataset:
+ """Convert unsupported data types to supported ones."""
+ if isinstance(data, daft.DataFrame):
+ # Daft DataFrame - convert based on execution mode
+ ctx = daft.context.get_context()
+ runner = ctx.get_or_create_runner()
+ runner_type = runner.name
+
+ if runner_type == "ray":
+ # Running with Ray backend - convert to Ray Dataset
+ return data.to_ray_dataset()
+ else:
+ # Running with local backend - convert to PyArrow Table
+ return data.to_arrow()
+
+ return data
+
+
+ def _validate_and_coerce_data_against_schema(
+ data: Dataset,
+ schema: Optional[Schema],
+ schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
+ default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
+ ) -> Tuple[Dataset, bool, Optional[Schema]]:
+ """Validate and coerce data against the table schema if schema consistency types are set.
+
+ Args:
+ data: The dataset to validate/coerce
+ schema: The DeltaCAT schema to validate against (optional)
+ schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
+ default_schema_consistency_type: Default consistency type for new fields in AUTO mode
+
+ Returns:
+ Tuple[Dataset, bool, Optional[Schema]]: Validated/coerced data, flag indicating if schema was modified, and updated schema
+
+ Raises:
+ ValueError: If validation fails or coercion is not possible
+ """
+ if not schema:
+ return data, False, None
+
+ validated_data, updated_schema = schema.validate_and_coerce_dataset(
+ data,
+ schema_evolution_mode=schema_evolution_mode,
+ default_schema_consistency_type=default_schema_consistency_type,
+ )
+
+ # Check if schema was modified by comparing with original
+ schema_modified = not updated_schema.equivalent_to(schema, True)
+ # Return updated schema only if it was modified
+ updated_schema = updated_schema if schema_modified else None
+
+ return validated_data, schema_modified, updated_schema
+
+
+ def _validate_reader_compatibility(
857
+ data: Dataset,
858
+ content_type: ContentType,
859
+ supported_reader_types: Optional[List[DatasetType]],
860
+ ) -> None:
861
+ """Validate that the data types being written are compatible with all supported reader types.
862
+
863
+ Args:
864
+ data: The dataset to validate
865
+ content_type: Content type being written
866
+ supported_reader_types: List of DatasetTypes that must be able to read this data
867
+
868
+ Raises:
869
+ TableValidationError: If any data types would break supported reader compatibility
870
+ """
871
+ if not supported_reader_types:
872
+ return
873
+
874
+ # Get the schema from the data
875
+ schema = get_table_schema(data)
876
+
877
+ # Get the dataset type of the current data
878
+ writer_dataset_type = get_dataset_type(data)
879
+
880
+ # PYARROW_PARQUET is equivalent to PYARROW for compatibility
881
+ writer_type_str = (
882
+ writer_dataset_type.value
883
+ if writer_dataset_type != DatasetType.PYARROW_PARQUET
884
+ else "pyarrow"
885
+ )
886
+
887
+ content_type_str = content_type.value
888
+
889
+ # Check each field type for compatibility
890
+ incompatible_fields = []
891
+
892
+ for field in schema:
893
+ field_name = field.name
894
+ arrow_type_str = str(field.type)
895
+
896
+ # Get the base type name from PyArrow field type
897
+ base_type_name = get_base_arrow_type_name(field.type)
898
+
899
+ # Get compatible readers for this (arrow_type, writer_dataset_type, content_type) combination
900
+ compatible_readers = get_compatible_readers(
901
+ base_type_name,
902
+ writer_type_str,
903
+ content_type_str,
904
+ )
905
+
906
+ # Check if all supported reader types are compatible
907
+ for required_reader in supported_reader_types:
908
+ reader_is_compatible = required_reader in compatible_readers
909
+
910
+ # Special case: PYARROW_PARQUET is equivalent to PYARROW for compatibility if we're writing parquet
911
+ if (
912
+ not reader_is_compatible
913
+ and content_type == ContentType.PARQUET
914
+ and required_reader == DatasetType.PYARROW_PARQUET
915
+ ):
916
+ reader_is_compatible = DatasetType.PYARROW in compatible_readers
917
+
918
+ if not reader_is_compatible:
919
+ incompatible_fields.append(
920
+ {
921
+ "field_name": field_name,
922
+ "arrow_type": arrow_type_str,
923
+ "incompatible_reader": required_reader,
924
+ "writer_type": writer_dataset_type,
925
+ "content_type": content_type,
926
+ }
927
+ )
928
+
929
+ # Raise error if any incompatibilities found
930
+ if incompatible_fields:
931
+ error_details = []
932
+ for incompatible in incompatible_fields:
933
+ error_details.append(
934
+ f"Field '{incompatible['field_name']}' with type '{incompatible['arrow_type']}' "
935
+ f"written by {incompatible['writer_type']} to {incompatible['content_type']} "
936
+ f"cannot be read by required reader type {incompatible['incompatible_reader']}. "
937
+ f"If you expect this write to succeed and this reader is not required, then it "
938
+ f"can be removed from the table's supported reader types property."
939
+ )
940
+
941
+ raise TableValidationError(
942
+ f"Reader compatibility validation failed. The following fields would break "
943
+ f"supported reader types:\n" + "\n".join(error_details)
944
+ )
945
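A minimal usage sketch of reacting to the reader-compatibility check above (illustrative only; `TableProperty.SUPPORTED_READER_TYPES`, `alter_table`, and `TableValidationError` appear in this diff, while the table name, dataframe, the `write_to_table` call shape, and import wiring are assumptions):

# Illustrative sketch; imports and catalog initialization are omitted and paths may vary.
try:
    write_to_table(df, "events", mode=TableWriteMode.APPEND)
except TableValidationError:
    # Per the error message above, drop the incompatible reader type from the
    # table version's supported reader types, then retry the write.
    alter_table(
        "events",
        table_version_properties={
            TableProperty.SUPPORTED_READER_TYPES: [DatasetType.PYARROW],
        },
    )
    write_to_table(df, "events", mode=TableWriteMode.APPEND)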
+
+
+ def _stage_commit_and_compact(
+ converted_data: Dataset,
+ partition,
+ delta_type: DeltaType,
+ content_type: ContentType,
+ commit_staged_partition: bool,
+ table_version_obj: TableVersion,
+ namespace: str,
+ table: str,
+ schema: Schema,
+ original_fields: Set[str],
+ **kwargs,
+ ) -> None:
+ """Stage and commit delta, then handle compaction if needed."""
+ # Remove schema from kwargs to avoid duplicate parameter conflict
+ # We explicitly pass the correct schema parameter
+ kwargs.pop("schema", None)
+
+ # Stage a delta with the data
+ delta = _get_storage(**kwargs).stage_delta(
+ data=converted_data,
+ partition=partition,
+ delta_type=delta_type,
+ content_type=content_type,
+ author=ManifestAuthor.of(
+ name="deltacat.write_to_table", version=dc.__version__
+ ),
+ schema=schema,
+ **kwargs,
+ )
+
+ delta = _get_storage(**kwargs).commit_delta(delta=delta, **kwargs)
+
+ if commit_staged_partition:
+ _get_storage(**kwargs).commit_partition(partition=partition, **kwargs)
+
+ # Check compaction trigger decision
+ should_compact = _trigger_compaction(
+ table_version_obj,
+ delta,
+ TableReadOptimizationLevel.MAX,
  **kwargs,
  )
+ if should_compact:
+ # Run V2 compaction session to merge or delete data
+ if table_version_obj.schema:
+ all_column_names = table_version_obj.schema.arrow.names
+ else:
+ raise RuntimeError("Table version schema is required to run compaction.")
+ _run_compaction_session(
+ table_version_obj=table_version_obj,
+ partition=partition,
+ latest_delta_stream_position=delta.stream_position,
+ namespace=namespace,
+ table=table,
+ original_fields=original_fields,
+ all_column_names=all_column_names,
+ **kwargs,
+ )
+
+
+ def _trigger_compaction(
+ table_version_obj: TableVersion,
+ latest_delta: Optional[Delta],
+ target_read_optimization_level: TableReadOptimizationLevel,
+ **kwargs,
+ ) -> bool:
+ # Import inside function to avoid circular imports
+ from deltacat.compute.compactor.utils import round_completion_reader as rci
+
+ # Extract delta type from latest_delta if available, otherwise default to no compaction
+ if latest_delta is not None:
+ delta_type = latest_delta.type
+ partition_values = latest_delta.partition_locator.partition_values
+ logger.info(
+ f"Using delta type {delta_type} from latest delta {latest_delta.locator}"
+ )
+ else:
+ logger.info(f"No latest delta discovered, defaulting to no compaction.")
+ return False
+
+ if (
+ table_version_obj.read_table_property(TableProperty.READ_OPTIMIZATION_LEVEL)
+ == target_read_optimization_level
+ ):
+ if delta_type == DeltaType.DELETE or delta_type == DeltaType.UPSERT:
+ return True
+ elif delta_type == DeltaType.APPEND:
+ # Get default stream to determine partition locator
+ stream = _get_table_stream(
+ table_version_obj.locator.namespace,
+ table_version_obj.locator.table_name,
+ table_version_obj.locator.table_version,
+ **kwargs,
+ )
+
+ if not stream:
+ return False
+
+ # Use provided partition_values or None for unpartitioned tables
+ partition_locator = PartitionLocator.of(
+ stream_locator=stream.locator,
+ partition_values=partition_values,
+ partition_id=None,
+ )
+
+ # Get round completion info to determine high watermark
+ round_completion_info = rci.read_round_completion_info(
+ source_partition_locator=partition_locator,
+ destination_partition_locator=partition_locator,
+ deltacat_storage=_get_storage(**kwargs),
+ deltacat_storage_kwargs=kwargs,
+ )
+
+ high_watermark = (
+ round_completion_info.high_watermark
+ if round_completion_info
+ and isinstance(round_completion_info.high_watermark, int)
+ else 0
+ )
+
+ # Get all deltas appended since last compaction
+ deltas = _get_storage(**kwargs).list_deltas(
+ namespace=table_version_obj.locator.namespace,
+ table_name=table_version_obj.locator.table_name,
+ table_version=table_version_obj.locator.table_version,
+ partition_values=partition_values,
+ start_stream_position=high_watermark + 1,
+ **kwargs,
+ )
+
+ if not deltas:
+ return False
+
+ # Count deltas appended since last compaction
+ appended_deltas_since_last_compaction = len(deltas)
+ delta_trigger = table_version_obj.read_table_property(
+ TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER
+ )
+ if delta_trigger and appended_deltas_since_last_compaction >= delta_trigger:
+ return True
+
+ # Count files appended since last compaction
+ appended_files_since_last_compaction = 0
+ for delta in deltas:
+ if delta.manifest and delta.manifest.entries:
+ appended_files_since_last_compaction += len(delta.manifest.entries)
+
+ file_trigger = table_version_obj.read_table_property(
+ TableProperty.APPENDED_FILE_COUNT_COMPACTION_TRIGGER
+ )
+ if file_trigger and appended_files_since_last_compaction >= file_trigger:
+ return True
+
+ # Count records appended since last compaction
+ appended_records_since_last_compaction = 0
+ for delta in deltas:
+ if delta.meta and delta.meta.record_count:
+ appended_records_since_last_compaction += delta.meta.record_count
+
+ record_trigger = table_version_obj.read_table_property(
+ TableProperty.APPENDED_RECORD_COUNT_COMPACTION_TRIGGER
+ )
+ if (
+ record_trigger
+ and appended_records_since_last_compaction >= record_trigger
+ ):
+ return True
+ return False
+
+
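The APPEND branch above only triggers compaction once one of three appended-volume thresholds is crossed. A short illustrative sketch of configuring those thresholds at table creation time (the property names are taken from this diff; the import path and threshold values are assumptions):

# Illustrative sketch; assumes TableProperty is importable and a catalog is initialized.
create_table(
    "events",
    table_properties={
        TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER: 10,          # compact after 10 appended deltas
        TableProperty.APPENDED_FILE_COUNT_COMPACTION_TRIGGER: 100,          # ...or 100 appended files
        TableProperty.APPENDED_RECORD_COUNT_COMPACTION_TRIGGER: 1_000_000,  # ...or 1M appended records
    },
)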
+ def _get_compaction_primary_keys(table_version_obj: TableVersion) -> set:
+ """Extract primary keys from table schema for compaction."""
+ table_schema = table_version_obj.schema
+ return (
+ set(table_schema.merge_keys)
+ if table_schema and table_schema.merge_keys
+ else set()
+ )
+
+
+ def _get_compaction_hash_bucket_count(
+ partition: Partition, table_version_obj: TableVersion
+ ) -> int:
+ """Determine hash bucket count from previous compaction, table property, or default."""
+ # First check if we have a hash bucket count from previous compaction
+ if (
+ partition.compaction_round_completion_info
+ and partition.compaction_round_completion_info.hash_bucket_count
+ ):
+ hash_bucket_count = partition.compaction_round_completion_info.hash_bucket_count
+ logger.info(
+ f"Using hash bucket count {hash_bucket_count} from previous compaction"
+ )
+ return hash_bucket_count
+
+ # Otherwise use the table property for default compaction hash bucket count
+ hash_bucket_count = table_version_obj.read_table_property(
+ TableProperty.DEFAULT_COMPACTION_HASH_BUCKET_COUNT
+ )
+ logger.info(f"Using hash bucket count {hash_bucket_count} from table property")
+ return hash_bucket_count
+
+
+ def _get_merge_order_sort_keys(table_version_obj: TableVersion):
+ """Extract sort keys from merge_order fields in schema for compaction.
+
+ Args:
+ table_version_obj: The table version containing schema
+
+ Returns:
+ List of SortKey objects from merge_order fields, or None if no merge_order fields are defined
+ """
+ if table_version_obj.schema:
+ return table_version_obj.schema.merge_order_sort_keys()
+ return None
+
+
+ def _create_compaction_params(
+ table_version_obj: TableVersion,
+ partition: Partition,
+ latest_stream_position: int,
+ primary_keys: set,
+ hash_bucket_count: int,
+ original_fields: Set[str],
+ all_column_names: Optional[List[str]],
+ **kwargs,
+ ):
+ """Create compaction parameters for the compaction session."""
+ from deltacat.compute.compactor.model.compact_partition_params import (
+ CompactPartitionParams,
+ )
+
+ # Remove create_table/alter_table kwargs not needed for compaction
+ kwargs.pop("lifecycle_state", None)
+ kwargs.pop("schema", None)
+ kwargs.pop("partition_scheme", None)
+ kwargs.pop("sort_keys", None)
+ kwargs.pop("table_description", None)
+ kwargs.pop("table_version_description", None)
+ kwargs.pop("table_properties", None)
+ kwargs.pop("table_version_properties", None)
+ kwargs.pop("namespace_properties", None)
+ kwargs.pop("content_types", None)
+ kwargs.pop("fail_if_exists", None)
+ kwargs.pop("schema_updates", None)
+ kwargs.pop("partition_updates", None)
+ kwargs.pop("sort_scheme", None)
+
+ table_writer_kwargs = kwargs.pop("table_writer_kwargs", {})
+ table_writer_kwargs["schema"] = table_version_obj.schema
+ table_writer_kwargs["sort_scheme_id"] = table_version_obj.sort_scheme.id
+ deltacat_storage_kwargs = kwargs.pop("deltacat_storage_kwargs", {})
+ deltacat_storage_kwargs["transaction"] = kwargs.get("transaction", None)
+ list_deltas_kwargs = kwargs.pop("list_deltas_kwargs", {})
+ list_deltas_kwargs["transaction"] = kwargs.get("transaction", None)
+
+ return CompactPartitionParams.of(
+ {
+ "catalog": kwargs.get("inner", kwargs.get("catalog")),
+ "source_partition_locator": partition.locator,
+ "destination_partition_locator": partition.locator, # In-place compaction
+ "primary_keys": primary_keys,
+ "last_stream_position_to_compact": latest_stream_position,
+ "deltacat_storage": _get_storage(**kwargs),
+ "deltacat_storage_kwargs": deltacat_storage_kwargs,
+ "list_deltas_kwargs": list_deltas_kwargs,
+ "table_writer_kwargs": table_writer_kwargs,
+ "hash_bucket_count": hash_bucket_count,
+ "records_per_compacted_file": table_version_obj.read_table_property(
+ TableProperty.RECORDS_PER_COMPACTED_FILE,
+ ),
+ "compacted_file_content_type": ContentType.PARQUET,
+ "drop_duplicates": True,
+ "sort_keys": _get_merge_order_sort_keys(table_version_obj),
+ "original_fields": original_fields,
+ "all_column_names": all_column_names,
+ }
+ )
+
+
+ def _run_compaction_session(
+ table_version_obj: TableVersion,
+ partition: Partition,
+ latest_delta_stream_position: int,
+ namespace: str,
+ table: str,
+ original_fields: Set[str],
+ all_column_names: List[str],
+ **kwargs,
+ ) -> None:
+ """
+ Run a V2 compaction session for the given table and partition.
+
+ Args:
+ table_version_obj: The table version object
+ partition: The partition to compact
+ latest_delta_stream_position: Stream position of the latest delta
+ namespace: The table namespace
+ table: The table name
+ original_fields: The original field set for partial UPSERT support
+ **kwargs: Additional arguments including catalog and storage parameters
+ """
+ # Import inside function to avoid circular imports
+ from deltacat.compute.compactor_v2.compaction_session import compact_partition
+
+ try:
+ # Extract compaction configuration
+ primary_keys = _get_compaction_primary_keys(table_version_obj)
+ hash_bucket_count = _get_compaction_hash_bucket_count(
+ partition, table_version_obj
+ )
+
+ # Create compaction parameters
+ compact_partition_params = _create_compaction_params(
+ table_version_obj,
+ partition,
+ latest_delta_stream_position,
+ primary_keys,
+ hash_bucket_count,
+ original_fields=original_fields,
+ all_column_names=all_column_names,
+ **kwargs,
+ )
+
+ # Run V2 compaction session
+ compact_partition(params=compact_partition_params)
+ except Exception as e:
+ logger.error(
+ f"Error during compaction session for {namespace}.{table}, "
+ f"partition {partition.locator}: {e}"
+ )
+ raise
+
+
+ def _get_merge_key_field_names_from_schema(schema) -> List[str]:
+ """Extract merge key field names from a DeltaCAT Schema object.
+
+ Args:
+ schema: DeltaCAT Schema object
+
+ Returns:
+ List of field names that are marked as merge keys
+ """
+ if not schema or not schema.merge_keys:
+ return []
+
+ merge_key_field_names = []
+ field_ids_to_fields = schema.field_ids_to_fields
+
+ for merge_key_id in schema.merge_keys:
+ if merge_key_id in field_ids_to_fields:
+ field = field_ids_to_fields[merge_key_id]
+ merge_key_field_names.append(field.arrow.name)
+
+ return merge_key_field_names
+
+
+ def _set_entry_params_if_needed(
+ mode: TableWriteMode, table_version_obj, kwargs: dict
+ ) -> None:
+ """Automatically set entry_params to merge keys if not already set by user.
+
+ Args:
+ mode: The table write mode
+ table_version_obj: The table version object containing schema
+ kwargs: Keyword arguments dictionary that may contain entry_params
+ """
+ # Only set entry_params for DELETE and MERGE modes
+ if mode not in [TableWriteMode.DELETE, TableWriteMode.MERGE]:
+ return
+
+ # Don't override if user already provided entry_params
+ if "entry_params" in kwargs and kwargs["entry_params"] is not None:
+ return
+
+ # Get schema from table version
+ if not table_version_obj or not table_version_obj.schema:
+ return
+
+ # Extract merge key field names
+ merge_key_field_names = _get_merge_key_field_names_from_schema(
+ table_version_obj.schema
+ )
+
+ if merge_key_field_names:
+ from deltacat.storage import EntryParams
+
+ kwargs["entry_params"] = EntryParams.of(merge_key_field_names)

- table_version_obj = _get_latest_or_given_table_version(
+
+ def _get_table_stream(namespace: str, table: str, table_version: str, **kwargs):
+ """Helper function to get a stream for a table version."""
+ return _get_storage(**kwargs).get_stream(
  namespace=namespace,
  table_name=table,
  table_version=table_version,
  **kwargs,
  )
- table_version = table_version_obj.table_version

+
+ def _validate_read_table_input(
+ namespace: str,
+ table: str,
+ table_schema: Optional[Schema],
+ table_type: Optional[DatasetType],
+ distributed_dataset_type: Optional[DatasetType],
+ ) -> None:
+ """Validate input parameters for read_table operation."""
  if (
- table_version_obj.content_types is None
- or len(table_version_obj.content_types) != 1
+ distributed_dataset_type
+ and distributed_dataset_type not in DatasetType.distributed()
  ):
  raise ValueError(
- "Expected exactly one content type but "
- f"found {table_version_obj.content_types}."
+ f"{distributed_dataset_type} is not a valid distributed dataset type. "
+ f"Valid distributed dataset types are: {DatasetType.distributed()}."
+ )
+ if table_type and table_type not in DatasetType.local():
+ raise ValueError(
+ f"{table_type} is not a valid local table type. "
+ f"Valid table types are: {DatasetType.local()}."
  )

+ # For schemaless tables, distributed datasets are not yet supported
+ if table_schema is None and distributed_dataset_type:
+ raise NotImplementedError(
+ f"Distributed dataset reading is not yet supported for schemaless tables. "
+ f"Table '{namespace}.{table}' has no schema, but distributed_dataset_type={distributed_dataset_type} was specified. "
+ f"Please use local storage by setting distributed_dataset_type=None."
+ )
+
+
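`_set_entry_params_if_needed` above defaults delete/merge entry parameters to the schema's merge keys. A brief illustrative sketch of the effect (the merge-key name and the table version object are assumptions):

# Illustrative sketch; assumes a table version whose schema declares "user_id" as its merge key.
from deltacat.storage import EntryParams  # import path as used in this diff

kwargs = {}
_set_entry_params_if_needed(TableWriteMode.MERGE, table_version_obj, kwargs)
# kwargs["entry_params"] now equals EntryParams.of(["user_id"]) unless the caller
# already supplied entry_params, or the mode is not DELETE/MERGE.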
+ def _get_qualified_deltas_for_read(
+ table: str,
+ namespace: str,
+ table_version: str,
+ partition_filter: Optional[List[Union[Partition, PartitionLocator]]],
+ **kwargs,
+ ) -> List[Delta]:
+ """Get qualified deltas for reading based on partition filter."""
  logger.info(
  f"Reading metadata for table={namespace}/{table}/{table_version} "
- f"with partition_filters={partition_filter} and stream position"
- f" range={stream_position_range_inclusive}"
+ f"with partition_filters={partition_filter}."
  )

+ # Get partition filter if not provided
  if partition_filter is None:
- logger.info(
- f"Reading all partitions metadata in the table={table} "
- "as partition_filter was None."
- )
- partition_filter = (
- _get_storage(**kwargs)
- .list_partitions(
- table_name=table,
- namespace=namespace,
- table_version=table_version,
- **kwargs,
- )
- .all_items()
+ partition_filter = _get_all_committed_partitions(
+ table, namespace, table_version, **kwargs
  )

+ # Get deltas from partitions
  qualified_deltas = _get_deltas_from_partition_filter(
- stream_position_range_inclusive=stream_position_range_inclusive,
  partition_filter=partition_filter,
  **kwargs,
  )
@@ -159,30 +1406,390 @@ def read_table(
  f"from {len(partition_filter)} partitions."
  )

- merge_on_read_params = MergeOnReadParams.of(
- {
- "deltas": qualified_deltas,
- "deltacat_storage": _get_storage(**kwargs),
- "deltacat_storage_kwargs": {**kwargs},
- "reader_kwargs": reader_kwargs,
- }
+ return qualified_deltas
+
+
+ def _get_max_parallelism(
+ max_parallelism: Optional[int],
+ distributed_dataset_type: Optional[DatasetType],
+ ) -> int:
+ """Get the max parallelism for a read operation."""
+ if distributed_dataset_type:
+ max_parallelism = max_parallelism or 100
+ else:
+ # TODO(pdames): Set max parallelism using available resources and dataset size
+ max_parallelism = 1
+ if max_parallelism < 1:
+ raise ValueError(
+ f"max_parallelism must be greater than 0, but got {max_parallelism}"
+ )
+ logger.info(f"Using max_parallelism={max_parallelism} for read operation")
+
+ return max_parallelism
+
+
+ def _handle_schemaless_table_read(
+ qualified_deltas: List[Delta],
+ read_as: DatasetType,
+ **kwargs,
+ ) -> Dataset:
+ """Handle reading schemaless tables by flattening manifest entries."""
+ # Create a PyArrow table for each delta
+ # TODO(pdames): More efficient implementation for tables with millions/billions of entries
+ tables = []
+ for delta in qualified_deltas:
+ # Get the manifest for this delta
+ if delta.manifest:
+ manifest = delta.manifest
+ else:
+ # Fetch manifest from storage
+ manifest = _get_storage(**kwargs).get_delta_manifest(
+ delta.locator,
+ transaction=kwargs.get("transaction"),
+ **kwargs,
+ )
+ # Create flattened table from this delta's manifest
+ table = pa_utils.delta_manifest_to_table(
+ manifest,
+ delta,
+ )
+ tables.append(table)
+
+ # Concatenate all PyArrow tables
+ final_table = pa_utils.concat_tables(tables)
+
+ # Convert from PyArrow to the requested dataset type
+ return from_pyarrow(final_table, read_as)
+
+
+ def _download_and_process_table_data(
+ namespace: str,
+ table: str,
+ qualified_deltas: List[Delta],
+ read_as: DatasetType,
+ max_parallelism: Optional[int],
+ columns: Optional[List[str]],
+ file_path_column: Optional[str],
+ table_version_obj: Optional[TableVersion],
+ **kwargs,
+ ) -> Dataset:
+ """Download delta data and process result based on storage type."""
+
+ # Handle NUMPY read requests by translating to PANDAS internally
+ original_read_as = read_as
+ effective_read_as = read_as
+ if read_as == DatasetType.NUMPY:
+ effective_read_as = DatasetType.PANDAS
+ logger.debug("Translating NUMPY read request to PANDAS for internal processing")
+
+ # Merge deltas and download data
+ if not qualified_deltas:
+ # Return empty table with original read_as type
+ return empty_table(original_read_as)
+
+ # Special handling for non-empty schemaless tables
+ if table_version_obj.schema is None:
+ result = _handle_schemaless_table_read(
+ qualified_deltas,
+ effective_read_as,
+ **kwargs,
+ )
+ # Convert to numpy if original request was for numpy
+ if original_read_as == DatasetType.NUMPY:
+ return _convert_pandas_to_numpy(result)
+ return result
+
+ # Get schemas for each manifest entry
+ entry_index_to_schema = _build_entry_index_to_schema_mapping(
+ qualified_deltas, table_version_obj, **kwargs
+ )
+ # Standard non-empty schema table read path - merge deltas and download data
+ merged_delta = Delta.merge_deltas(qualified_deltas)
+
+ # Convert read parameters to download parameters
+ table_type = (
+ effective_read_as
+ if effective_read_as in DatasetType.local()
+ else (kwargs.pop("table_type", None) or DatasetType.PYARROW)
+ )
+ distributed_dataset_type = (
+ effective_read_as if effective_read_as in DatasetType.distributed() else None
+ )
+
+ # Validate input parameters
+ _validate_read_table_input(
+ namespace,
+ table,
+ table_version_obj.schema,
+ table_type,
+ distributed_dataset_type,
+ )
+
+ # Determine max parallelism
+ max_parallelism = _get_max_parallelism(
+ max_parallelism,
+ distributed_dataset_type,
+ )
+ # Filter out parameters that are already passed as keyword arguments
+ # to avoid "multiple values for argument" errors
+ filtered_kwargs = {
+ k: v
+ for k, v in kwargs.items()
+ if k
+ not in [
+ "delta_like",
+ "table_type",
+ "storage_type",
+ "max_parallelism",
+ "columns",
+ "distributed_dataset_type",
+ "file_path_column",
+ ]
+ }
+ result = _get_storage(**kwargs).download_delta(
+ merged_delta,
+ table_type=effective_read_as,
+ storage_type=StorageType.DISTRIBUTED
+ if distributed_dataset_type
+ else StorageType.LOCAL,
+ max_parallelism=max_parallelism,
+ columns=columns,
+ distributed_dataset_type=distributed_dataset_type,
+ file_path_column=file_path_column,
+ **filtered_kwargs,
+ )
+
+ # Handle local storage table concatenation and PYARROW_PARQUET lazy materialization
+ if not distributed_dataset_type and table_type and isinstance(result, list):
+ if table_type == DatasetType.PYARROW_PARQUET:
+ # For PYARROW_PARQUET, preserve lazy materialization:
+ return result[0] if len(result) == 1 else result
+ else:
+ # For other types, perform normal concatenation
+ result = _handle_local_table_concatenation(
+ result,
+ table_type,
+ table_version_obj.schema,
+ entry_index_to_schema,
+ file_path_column,
+ columns,
+ )
+ # Convert to numpy if original request was for numpy
+ if original_read_as == DatasetType.NUMPY:
+ return _convert_pandas_to_numpy(result)
+
+ return result
+
+
+ def _convert_pandas_to_numpy(dataset: Dataset):
+ """Convert pandas DataFrame to numpy ndarray."""
+ if not isinstance(dataset, pd.DataFrame):
+ raise ValueError(f"Expected pandas DataFrame but found {type(dataset)}")
+ return dataset.to_numpy()
+
+
+ def _coerce_dataset_to_schema(
+ dataset: Dataset, target_schema: pa.Schema, manifest_entry_schema: Schema
+ ) -> Dataset:
+ """Coerce a dataset to match the target PyArrow schema using DeltaCAT Schema.coerce method."""
+ # Convert target PyArrow schema to DeltaCAT schema and use its coerce method
+ deltacat_schema = Schema.of(schema=target_schema)
+ return deltacat_schema.coerce(dataset, manifest_entry_schema)
+
+
+ def _coerce_results_to_schema(
+ results: Dataset, target_schema: pa.Schema, entry_index_to_schema: List[Schema]
+ ) -> List[Dataset]:
+ """Coerce all table results to match the target schema."""
+ coerced_results = []
+ for i, table_result in enumerate(results):
+ coerced_result = _coerce_dataset_to_schema(
+ table_result, target_schema, entry_index_to_schema[i]
+ )
+ coerced_results.append(coerced_result)
+ logger.debug(f"Coerced table {i} to unified schema")
+ return coerced_results
+
+
+ def _create_target_schema(
+ arrow_schema: pa.Schema,
+ columns: Optional[List[str]] = None,
+ file_path_column: Optional[str] = None,
+ ) -> pa.Schema:
+ """Create target schema for concatenation with optional column selection and file_path_column."""
+ if columns is not None:
+ # Column selection - use only specified columns
+ field_map = {field.name: field for field in arrow_schema}
+ selected_fields = []
+
+ for col_name in columns:
+ if col_name in field_map:
+ selected_fields.append(field_map[col_name])
+ arrow_schema = pa.schema(selected_fields)
+ if file_path_column and file_path_column not in arrow_schema.names:
+ arrow_schema = arrow_schema.append(pa.field(file_path_column, pa.string()))
+ return arrow_schema
+
+
+ def _create_entry_schemas_for_concatenation(
+ entry_index_to_schema: List[Schema],
+ columns: Optional[List[str]] = None,
+ file_path_column: Optional[str] = None,
+ ) -> List[Schema]:
+ """Create entry schemas for concatenation, optionally filtered by column selection."""
+ if columns is None:
+ # No column selection - return original schemas as-is
+ return entry_index_to_schema
+
+ # Column selection - filter each entry schema
+ modified_schemas = []
+ for entry_schema in entry_index_to_schema:
+ if entry_schema and entry_schema.arrow:
+ filtered_schema = _create_target_schema(
+ entry_schema.arrow, columns, file_path_column
+ )
+ modified_schemas.append(Schema.of(schema=filtered_schema))
+ else:
+ modified_schemas.append(entry_schema)
+
+ return modified_schemas
+
+
+ def _handle_local_table_concatenation(
+ results: Dataset,
+ table_type: DatasetType,
+ table_schema: Optional[Schema],
+ entry_index_to_schema: List[Schema],
+ file_path_column: Optional[str] = None,
+ columns: Optional[List[str]] = None,
+ ) -> Dataset:
+ """Handle concatenation of local table results with schema coercion."""
+ logger.debug(f"Target table schema for concatenation: {table_schema}")
+
+ # Create target schema for coercion, respecting column selection
+ target_schema = _create_target_schema(table_schema.arrow, columns, file_path_column)
+ logger.debug(f"Created target schema: {target_schema.names}")
+
+ # Filter entry schemas to match column selection and file_path_column
+ modified_entry_schemas = _create_entry_schemas_for_concatenation(
+ entry_index_to_schema, columns, file_path_column
+ )
+
+ # Coerce results to unified schema
+ coerced_results = _coerce_results_to_schema(
+ results, target_schema, modified_entry_schemas
  )

- return MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE[distributed_dataset_type.value](
- params=merge_on_read_params, **kwargs
+ # Second step: concatenate the coerced results
+ logger.debug(
+ f"Concatenating {len(coerced_results)} local tables of type {table_type} with unified schemas"
  )
+ concatenated_result = concat_tables(coerced_results, table_type)
+ logger.debug(f"Concatenation complete, result type: {type(concatenated_result)}")
+ return concatenated_result
+
+
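`_create_target_schema` above only keeps the requested columns and appends a string-typed file path field. A small standalone PyArrow sketch of the same idea (field names are illustrative):

import pyarrow as pa

base = pa.schema([("id", pa.int64()), ("value", pa.float64()), ("note", pa.string())])
# Keep only the selected columns, in the requested order, skipping unknown names.
selected = pa.schema([base.field(name) for name in ["id", "value"] if name in base.names])
# Append the file path column when it is not already present.
target = selected.append(pa.field("source_file_path", pa.string()))
# target should mirror _create_target_schema(base, ["id", "value"], "source_file_path").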
+ def read_table(
+ table: str,
+ *args,
+ namespace: Optional[str] = None,
+ table_version: Optional[str] = None,
+ read_as: DatasetType = DatasetType.DAFT,
+ partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
+ max_parallelism: Optional[int] = None,
+ columns: Optional[List[str]] = None,
+ file_path_column: Optional[str] = None,
+ transaction: Optional[Transaction] = None,
+ **kwargs,
+ ) -> Dataset:
+ """Read a table into a dataset.
+
+ Args:
+ table: Name of the table to read.
+ namespace: Optional namespace of the table. Uses default if not specified.
+ table_version: Optional specific version of the table to read.
+ read_as: Dataset type to use for reading table files. Defaults to DatasetType.DAFT.
+ partition_filter: Optional list of partitions to read from.
+ max_parallelism: Optional maximum parallelism for data download. Defaults to the number of
+ available CPU cores for local dataset type reads (i.e., members of DatasetType.local())
+ and 100 for distributed dataset type reads (i.e., members of DatasetType.distributed()).
+ columns: Optional list of columns to include in the result.
+ file_path_column: Optional column name to add file paths to the result.
+ transaction: Optional transaction to chain this read operation to. If provided, uncommitted
+ changes from the transaction will be visible to this read operation.
+ **kwargs: Additional keyword arguments.
+
+ Returns:
+ Dataset containing the table data.
+ """
+ # Set up transaction handling
+ read_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+ kwargs["transaction"] = read_transaction
+
+ try:
+ # Resolve namespace and get table metadata
+ namespace = namespace or default_namespace()
+
+ table_version_obj = _get_latest_active_or_given_table_version(
+ namespace=namespace,
+ table_name=table,
+ table_version=table_version,
+ **kwargs,
+ )
+
+ # Get partitions and deltas to read
+ qualified_deltas = _get_qualified_deltas_for_read(
+ table,
+ namespace,
+ table_version_obj.table_version,
+ partition_filter,
+ **kwargs,
+ )
+
+ # Download and process the data
+ # TODO(pdames): Remove once we implement a custom SerDe for pa.ParquetFile
+ if read_as == DatasetType.PYARROW_PARQUET:
+ max_parallelism = 1
+ logger.warning(
+ f"Forcing max_parallelism to 1 for PyArrow Parquet reads to avoid serialization errors."
+ )
+ result = _download_and_process_table_data(
+ namespace,
+ table,
+ qualified_deltas,
+ read_as,
+ max_parallelism,
+ columns,
+ file_path_column,
+ table_version_obj,
+ **kwargs,
+ )
+ return result
+ except Exception as e:
+ # If any error occurs, the transaction remains uncommitted
+ commit_transaction = False
+ logger.error(f"Error during read_table: {e}")
+ raise
+ finally:
+ if commit_transaction:
+ # Seal the interactive transaction to commit all operations atomically
+ read_transaction.seal()


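A hedged usage sketch of the read_table API added above (catalog initialization is omitted; whether read_table is re-exported at the deltacat top level is an assumption):

# Illustrative sketch; DatasetType import path may vary.
import deltacat as dc

df = dc.read_table(
    "events",
    namespace="analytics",
    read_as=DatasetType.PANDAS,      # local read; defaults to DatasetType.DAFT
    columns=["user_id", "ts"],       # optional column projection
    file_path_column="source_file",  # optional provenance column
)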
  def alter_table(
  table: str,
  *args,
  namespace: Optional[str] = None,
+ table_version: Optional[str] = None,
  lifecycle_state: Optional[LifecycleState] = None,
- schema_updates: Optional[Dict[str, Any]] = None,
+ schema_updates: Optional[SchemaUpdate] = None,
  partition_updates: Optional[Dict[str, Any]] = None,
- sort_keys: Optional[SortScheme] = None,
- description: Optional[str] = None,
- properties: Optional[TableProperties] = None,
+ sort_scheme: Optional[SortScheme] = None,
+ table_description: Optional[str] = None,
+ table_version_description: Optional[str] = None,
+ table_properties: Optional[TableProperties] = None,
+ table_version_properties: Optional[TableVersionProperties] = None,
+ transaction: Optional[Transaction] = None,
  **kwargs,
  ) -> None:
  """Alter deltacat table/table_version definition.
@@ -193,61 +1800,169 @@ def alter_table(
  Args:
  table: Name of the table to alter.
  namespace: Optional namespace of the table. Uses default namespace if not specified.
+ table_version: Optional specific version of the table to alter. Defaults to the latest active version.
  lifecycle_state: New lifecycle state for the table.
- schema_updates: Map of schema updates to apply.
- partition_updates: Map of partition scheme updates to apply.
- sort_keys: New sort keys scheme.
- description: New description for the table.
- properties: New table properties.
+ schema_updates: Schema updates to apply.
+ partition_updates: Partition scheme updates to apply.
+ sort_scheme: New sort scheme.
+ table_description: New description for the table.
+ table_version_description: New description for the table version. Defaults to `table_description` if not specified.
+ table_properties: New table properties.
+ table_version_properties: New table version properties. Defaults to the current parent table properties if not specified.
+ transaction: Optional transaction to use. If None, creates a new transaction.

  Returns:
  None

  Raises:
  TableNotFoundError: If the table does not already exist.
+ TableVersionNotFoundError: If the specified table version or active table version does not exist.
  """
+ resolved_table_properties = None
+ if table_properties is not None:
+ resolved_table_properties = _add_default_table_properties(table_properties)
+ _validate_table_properties(resolved_table_properties)
+
  namespace = namespace or default_namespace()

- _get_storage(**kwargs).update_table(
- *args,
- namespace=namespace,
- table_name=table,
- description=description,
- properties=properties,
- lifecycle_state=lifecycle_state,
- **kwargs,
- )
+ # Set up transaction handling
+ alter_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+ kwargs["transaction"] = alter_transaction

- table_version = _get_storage(**kwargs).get_latest_table_version(
- namespace, table, **kwargs
- )
- _get_storage(**kwargs).update_table_version(
- *args,
- namespace=namespace,
- table_name=table,
- table_version=table_version.id,
- description=description,
- schema_updates=schema_updates,
- partition_updates=partition_updates,
- sort_keys=sort_keys,
- **kwargs,
+ try:
+ if partition_updates:
+ raise NotImplementedError("Partition updates are not yet supported.")
+ if sort_scheme:
+ raise NotImplementedError("Sort scheme updates are not yet supported.")
+
+ new_table: Table = _get_storage(**kwargs).update_table(
+ *args,
+ namespace=namespace,
+ table_name=table,
+ description=table_description,
+ properties=resolved_table_properties,
+ **kwargs,
+ )
+
+ if table_version is None:
+ table_version: Optional[TableVersion] = _get_storage(
+ **kwargs
+ ).get_latest_active_table_version(namespace, table, **kwargs)
+ if table_version is None:
+ raise TableVersionNotFoundError(
+ f"No active table version found for table {namespace}.{table}. "
+ "Please specify a table_version parameter."
+ )
+ else:
+ table_version = _get_storage(**kwargs).get_table_version(
+ namespace, table, table_version, **kwargs
+ )
+ if table_version is None:
+ raise TableVersionNotFoundError(
+ f"Table version '{table_version}' not found for table {namespace}.{table}"
+ )
+
+ # Get table properties for schema evolution
+ schema_evolution_mode = table_version.read_table_property(
+ TableProperty.SCHEMA_EVOLUTION_MODE
+ )
+ if schema_updates and schema_evolution_mode == SchemaEvolutionMode.DISABLED:
+ raise TableValidationError(
+ "Schema evolution is disabled for this table. Please enable schema evolution or remove schema updates."
+ )
+
+ # Only update table version properties if they are explicitly provided
+ resolved_tv_properties = None
+ if table_version_properties is not None:
+ # inherit properties from the parent table if not specified
+ default_tv_properties = new_table.properties
+ if table_version.schema is None:
+ # schemaless tables don't validate reader compatibility by default
+ default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
+ resolved_tv_properties = _add_default_table_properties(
+ table_version_properties,
+ default_tv_properties,
+ )
+ _validate_table_properties(resolved_tv_properties)
+
+ # Apply schema updates if provided
+ updated_schema = None
+ if schema_updates is not None:
+ # Get the current schema from the table version
+ current_schema = table_version.schema
+ if current_schema != schema_updates.base_schema:
+ raise ValueError(
+ f"Schema updates are not compatible with the current schema for table `{namespace}.{table}`. Current schema: {current_schema}, Schema update base schema: {schema_updates.base_schema}"
+ )
+
+ # Apply all the updates to get the final schema
+ updated_schema = schema_updates.apply()
+
+ _get_storage(**kwargs).update_table_version(
+ *args,
+ namespace=namespace,
+ table_name=table,
+ table_version=table_version.id,
+ lifecycle_state=lifecycle_state,
+ description=table_version_description or table_description,
+ schema=updated_schema,
+ properties=resolved_tv_properties, # This will be None if table_version_properties was not provided
+ **kwargs,
+ )
+
+ except Exception as e:
+ # If any error occurs, the transaction remains uncommitted
+ commit_transaction = False
+ logger.error(f"Error during alter_table: {e}")
+ raise
+ finally:
+ if commit_transaction:
+ # Seal the interactive transaction to commit all operations atomically
+ alter_transaction.seal()
+
+
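A hedged usage sketch of the widened alter_table signature (the table name and property values are assumptions; only the parameter and property names come from this diff):

# Illustrative sketch; assumes alter_table is re-exported by deltacat and TableProperty is imported.
import deltacat as dc

dc.alter_table(
    "events",
    namespace="analytics",
    table_description="click-stream events, compacted on upsert",
    table_version_properties={
        TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER: 5,
    },
)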
+ def _add_default_table_properties(
+ table_properties: Optional[TableProperties],
+ default_table_properties: TableProperties = TablePropertyDefaultValues,
+ ) -> TableProperties:
+ if table_properties is None:
+ table_properties = {}
+ for k, v in default_table_properties.items():
+ if k not in table_properties:
+ table_properties[k] = v
+ return table_properties
+
+
+ def _validate_table_properties(
+ table_properties: TableProperties,
+ ) -> None:
+ read_optimization_level = table_properties.get(
+ TableProperty.READ_OPTIMIZATION_LEVEL,
+ TablePropertyDefaultValues[TableProperty.READ_OPTIMIZATION_LEVEL],
  )
+ if read_optimization_level != TableReadOptimizationLevel.MAX:
+ raise NotImplementedError(
+ f"Table read optimization level `{read_optimization_level}` is not yet supported. Please use {TableReadOptimizationLevel.MAX}"
+ )


  def create_table(
- name: str,
+ table: str,
  *args,
  namespace: Optional[str] = None,
- version: Optional[str] = None,
+ table_version: Optional[str] = None,
  lifecycle_state: Optional[LifecycleState] = LifecycleState.ACTIVE,
  schema: Optional[Schema] = None,
  partition_scheme: Optional[PartitionScheme] = None,
  sort_keys: Optional[SortScheme] = None,
- description: Optional[str] = None,
+ table_description: Optional[str] = None,
+ table_version_description: Optional[str] = None,
  table_properties: Optional[TableProperties] = None,
+ table_version_properties: Optional[TableVersionProperties] = None,
  namespace_properties: Optional[NamespaceProperties] = None,
  content_types: Optional[List[ContentType]] = None,
  fail_if_exists: bool = True,
+ transaction: Optional[Transaction] = None,
  **kwargs,
  ) -> TableDefinition:
  """Create an empty table in the catalog.
@@ -255,20 +1970,22 @@ def create_table(
  If a namespace isn't provided, the table will be created within the default deltacat namespace.
  Additionally if the provided namespace does not exist, it will be created for you.

-
  Args:
- name: Name of the table to create.
+ table: Name of the table to create.
  namespace: Optional namespace for the table. Uses default namespace if not specified.
  version: Optional version identifier for the table.
  lifecycle_state: Lifecycle state of the new table. Defaults to ACTIVE.
  schema: Schema definition for the table.
  partition_scheme: Optional partitioning scheme for the table.
  sort_keys: Optional sort keys for the table.
- description: Optional description of the table.
+ table_description: Optional description of the table.
+ table_version_description: Optional description for the table version.
  table_properties: Optional properties for the table.
+ table_version_properties: Optional properties for the table version. Defaults to the current parent table properties if not specified.
  namespace_properties: Optional properties for the namespace if it needs to be created.
  content_types: Optional list of allowed content types for the table.
  fail_if_exists: If True, raises an error if table already exists. If False, returns existing table.
+ transaction: Optional transaction to use. If None, creates a new transaction.

  Returns:
  TableDefinition object for the created or existing table.
@@ -277,56 +1994,133 @@ def create_table(
  TableAlreadyExistsError: If the table already exists and fail_if_exists is True.
  NamespaceNotFoundError: If the provided namespace does not exist.
  """
+ resolved_table_properties = _add_default_table_properties(table_properties)
+ # Note: resolved_tv_properties will be set after checking existing table
+
  namespace = namespace or default_namespace()

- table = get_table(*args, name, namespace=namespace, table_version=version, **kwargs)
- if table is not None:
- if fail_if_exists:
- raise TableAlreadyExistsError(f"Table {namespace}.{name} already exists")
- return table
+ # Set up transaction handling
+ create_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+ kwargs["transaction"] = create_transaction

- if not namespace_exists(*args, namespace, **kwargs):
- create_namespace(
- *args, namespace=namespace, properties=namespace_properties, **kwargs
+ try:
+ existing_table = (
+ get_table(
+ table,
+ namespace=namespace,
+ table_version=table_version,
+ *args,
+ **kwargs,
+ )
+ if "existing_table_definition" not in kwargs
+ else kwargs["existing_table_definition"]
  )
+ if existing_table is not None:
+ if existing_table.table_version and existing_table.stream:
+ if fail_if_exists:
+ table_identifier = (
+ f"{namespace}.{table}"
+ if not table_version
+ else f"{namespace}.{table}.{table_version}"
+ )
+ raise TableAlreadyExistsError(
+ f"Table {table_identifier} already exists"
+ )
+ return existing_table
+ # the table exists but the table version doesn't - inherit the existing table properties
+ # Also ensure table properties are inherited when not explicitly provided
+ if table_properties is None:
+ resolved_table_properties = existing_table.table.properties
+
+ # Set up table version properties based on existing table or explicit properties
+ default_tv_properties = resolved_table_properties
+ if schema is None:
+ default_tv_properties = dict(
+ default_tv_properties
+ ) # Make a copy to avoid modifying original
+ default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
+ resolved_tv_properties = _add_default_table_properties(
+ table_version_properties, default_tv_properties
+ )
+ else:
+ # create the namespace if it doesn't exist
+ if not namespace_exists(namespace, **kwargs):
+ create_namespace(
+ namespace=namespace,
+ properties=namespace_properties,
+ *args,
+ **kwargs,
+ )
+
+ # Set up table version properties for new table
+ default_tv_properties = resolved_table_properties
+ if schema is None:
+ default_tv_properties = dict(
+ default_tv_properties
+ ) # Make a copy to avoid modifying original
+ default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
+ resolved_tv_properties = _add_default_table_properties(
+ table_version_properties, default_tv_properties
+ )

- (table, table_version, stream) = _get_storage(**kwargs).create_table_version(
- *args,
- namespace=namespace,
- table_name=name,
- table_version=version,
- schema=schema,
- partition_scheme=partition_scheme,
- sort_keys=sort_keys,
- table_version_description=description,
- table_description=description,
- table_properties=table_properties,
- lifecycle_state=lifecycle_state or LifecycleState.ACTIVE,
- supported_content_types=content_types,
- **kwargs,
- )
+ _validate_table_properties(resolved_tv_properties)

- return TableDefinition.of(
- table=table,
- table_version=table_version,
- stream=stream,
- )
+ (table, table_version, stream) = _get_storage(**kwargs).create_table_version(
+ namespace=namespace,
+ table_name=table,
+ table_version=table_version,
+ schema=schema,
+ partition_scheme=partition_scheme,
+ sort_keys=sort_keys,
+ table_version_description=table_version_description
+ if table_version_description is not None
+ else table_description,
+ table_description=table_description,
+ table_properties=resolved_table_properties,
+ table_version_properties=resolved_tv_properties,
+ lifecycle_state=lifecycle_state or LifecycleState.ACTIVE,
+ supported_content_types=content_types,
+ *args,
+ **kwargs,
+ )
+
+ result = TableDefinition.of(
+ table=table,
+ table_version=table_version,
+ stream=stream,
+ )
+
+ return result
+
+ except Exception as e:
+ # If any error occurs, the transaction remains uncommitted
+ commit_transaction = False
+ logger.error(f"Error during create_table: {e}")
+ raise
+ finally:
+ if commit_transaction:
+ # Seal the interactive transaction to commit all operations atomically
+ create_transaction.seal()


2106
  def drop_table(
317
- name: str,
2107
+ table: str,
318
2108
  *args,
319
2109
  namespace: Optional[str] = None,
320
2110
  table_version: Optional[str] = None,
321
2111
  purge: bool = False,
2112
+ transaction: Optional[Transaction] = None,
322
2113
  **kwargs,
323
2114
  ) -> None:
324
2115
  """Drop a table from the catalog and optionally purges underlying data.
325
2116
 
326
2117
  Args:
327
- name: Name of the table to drop.
2118
+ table: Name of the table to drop.
328
2119
  namespace: Optional namespace of the table. Uses default namespace if not specified.
2120
+ table_version: Optional table version of the table to drop. If not specified, the parent table of all
2121
+ table versions will be dropped.
329
2122
  purge: If True, permanently delete the table data. If False, only remove from catalog.
2123
+ transaction: Optional transaction to use. If None, creates a new transaction.
330
2124
 
331
2125
  Returns:
332
2126
  None
@@ -341,17 +2135,56 @@ def drop_table(
341
2135
  raise NotImplementedError("Purge flag is not currently supported.")
342
2136
 
343
2137
  namespace = namespace or default_namespace()
344
- _get_storage(**kwargs).delete_table(
345
- *args, namespace=namespace, name=name, purge=purge, **kwargs
346
- )
2138
+
2139
+ # Set up transaction handling
2140
+ drop_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2141
+ kwargs["transaction"] = drop_transaction
2142
+
2143
+ try:
2144
+ if not table_version:
2145
+ _get_storage(**kwargs).delete_table(
2146
+ namespace=namespace,
2147
+ table_name=table,
2148
+ purge=purge,
2149
+ *args,
2150
+ **kwargs,
2151
+ )
2152
+ else:
2153
+ _get_storage(**kwargs).update_table_version(
2154
+ namespace=namespace,
2155
+ table_name=table,
2156
+ table_version=table_version,
2157
+ lifecycle_state=LifecycleState.DELETED,
2158
+ *args,
2159
+ **kwargs,
2160
+ )
2161
+
2162
+ except Exception as e:
2163
+ # If any error occurs, the transaction remains uncommitted
2164
+ commit_transaction = False
2165
+ logger.error(f"Error during drop_table: {e}")
2166
+ raise
2167
+ finally:
2168
+ if commit_transaction:
2169
+ # Seal the interactive transaction to commit all operations atomically
2170
+ drop_transaction.seal()
347
2171
 
348
2172
 
349
- def refresh_table(table: str, *args, namespace: Optional[str] = None, **kwargs) -> None:
2173
+ def refresh_table(
2174
+ table: str,
2175
+ *args,
2176
+ namespace: Optional[str] = None,
2177
+ table_version: Optional[str] = None,
2178
+ transaction: Optional[Transaction] = None,
2179
+ **kwargs,
2180
+ ) -> None:
350
2181
  """Refresh metadata cached on the Ray cluster for the given table.
351
2182
 
352
2183
  Args:
353
2184
  table: Name of the table to refresh.
354
2185
  namespace: Optional namespace of the table. Uses default namespace if not specified.
2186
+ table_version: Optional specific version of the table to refresh.
2187
+ transaction: Optional transaction to use. If None, creates a new transaction.
355
2188
 
356
2189
  Returns:
357
2190
  None
@@ -360,32 +2193,79 @@ def refresh_table(table: str, *args, namespace: Optional[str] = None, **kwargs)
360
2193
 
361
2194
 
362
2195
  def list_tables(
363
- *args, namespace: Optional[str] = None, **kwargs
2196
+ *args,
2197
+ namespace: Optional[str] = None,
2198
+ table: Optional[str] = None,
2199
+ transaction: Optional[Transaction] = None,
2200
+ **kwargs,
364
2201
  ) -> ListResult[TableDefinition]:
365
2202
  """List a page of table definitions.
366
2203
 
367
2204
  Args:
368
2205
  namespace: Optional namespace to list tables from. Uses default namespace if not specified.
2206
+ table: Optional table to list its table versions. If not specified, lists the latest active version of each table in the namespace.
2207
+ transaction: Optional transaction to use for reading. If provided, will see uncommitted changes.
369
2208
 
370
2209
  Returns:
371
2210
  ListResult containing TableDefinition objects for tables in the namespace.
372
2211
  """
373
2212
  namespace = namespace or default_namespace()
374
- tables = _get_storage(**kwargs).list_tables(*args, namespace=namespace, **kwargs)
375
- table_definitions = [
376
- get_table(*args, table.table_name, namespace, **kwargs)
377
- for table in tables.all_items()
378
- ]
379
2213
 
380
- return ListResult(items=table_definitions)
2214
+ # Set up transaction handling
2215
+ list_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2216
+ kwargs["transaction"] = list_transaction
2217
+
2218
+ try:
2219
+ if not table:
2220
+ tables = _get_storage(**kwargs).list_tables(
2221
+ namespace=namespace,
2222
+ *args,
2223
+ **kwargs,
2224
+ )
2225
+ table_definitions = [
2226
+ get_table(table.table_name, namespace=namespace, *args, **kwargs)
2227
+ for table in tables.all_items()
2228
+ ]
2229
+ else:
2230
+ table_versions = _get_storage(**kwargs).list_table_versions(
2231
+ namespace=namespace,
2232
+ table_name=table,
2233
+ *args,
2234
+ **kwargs,
2235
+ )
2236
+ table_definitions = [
2237
+ get_table(
2238
+ table,
2239
+ namespace=namespace,
2240
+ table_version=table_version.id,
2241
+ *args,
2242
+ **kwargs,
2243
+ )
2244
+ for table_version in table_versions.all_items()
2245
+ ]
2246
+
2247
+ result = ListResult(items=table_definitions)
2248
+
2249
+ return result
2250
+
2251
+ except Exception as e:
2252
+ # If any error occurs, the transaction remains uncommitted
2253
+ commit_transaction = False
2254
+ logger.error(f"Error during list_tables: {e}")
2255
+ raise
2256
+ finally:
2257
+ if commit_transaction:
2258
+ # Seal the interactive transaction to commit all operations atomically
2259
+ list_transaction.seal()
381
2260
 
382
2261
 
383
2262
  def get_table(
384
- name: str,
2263
+ table: str,
385
2264
  *args,
386
2265
  namespace: Optional[str] = None,
387
2266
  table_version: Optional[str] = None,
388
2267
  stream_format: StreamFormat = StreamFormat.DELTACAT,
2268
+ transaction: Optional[Transaction] = None,
389
2269
  **kwargs,
390
2270
  ) -> Optional[TableDefinition]:
391
2271
  """Get table definition metadata.
@@ -393,64 +2273,84 @@ def get_table(
393
2273
  Args:
394
2274
  name: Name of the table to retrieve.
395
2275
  namespace: Optional namespace of the table. Uses default namespace if not specified.
396
- table_version: Optional specific version of the table to retrieve.
397
- If not specified, the latest version is used.
398
- stream_format: Optional stream format to retrieve. Uses the default Deltacat stream
399
- format if not specified.
2276
+ table_version: Optional specific version of the table to retrieve. Defaults to the latest active version.
2277
+ stream_format: Optional stream format to retrieve. Defaults to DELTACAT.
2278
+ transaction: Optional transaction to use. If None, creates a new transaction.
400
2279
 
401
2280
  Returns:
402
- Deltacat TableDefinition if the table exists, None otherwise.
403
-
404
- Raises:
405
- TableVersionNotFoundError: If the table version does not exist.
406
- StreamNotFoundError: If the stream does not exist.
2281
+ Deltacat TableDefinition if the table exists, None otherwise. The table definition's table version will be
2282
+ None if the requested version is not found. The table definition's stream will be None if the requested stream
2283
+ format is not found.
407
2284
  """
408
2285
  namespace = namespace or default_namespace()
409
- table: Optional[Table] = _get_storage(**kwargs).get_table(
410
- *args, table_name=name, namespace=namespace, **kwargs
411
- )
412
2286
 
413
- if table is None:
414
- return None
415
-
416
- table_version: Optional[TableVersion] = _get_storage(**kwargs).get_table_version(
417
- *args, namespace, name, table_version or table.latest_table_version, **kwargs
418
- )
2287
+ # Set up transaction handling
2288
+ get_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2289
+ kwargs["transaction"] = get_transaction
419
2290
 
420
- if table_version is None:
421
- raise TableVersionNotFoundError(
422
- f"TableVersion {namespace}.{name}.{table_version} does not exist."
2291
+ try:
2292
+ table_obj: Optional[Table] = _get_storage(**kwargs).get_table(
2293
+ table_name=table,
2294
+ namespace=namespace,
2295
+ *args,
2296
+ **kwargs,
423
2297
  )
424
2298
 
425
- stream = _get_storage(**kwargs).get_stream(
426
- *args,
427
- namespace=namespace,
428
- table_name=name,
429
- table_version=table_version.id,
430
- stream_format=stream_format,
431
- **kwargs,
432
- )
2299
+ if table_obj is None:
2300
+ return None
433
2301
 
434
- if stream is None:
435
- raise StreamNotFoundError(
436
- f"Stream {namespace}.{table}.{table_version}.{stream} does not exist."
2302
+ table_version_obj: Optional[TableVersion] = _get_storage(
2303
+ **kwargs
2304
+ ).get_table_version(
2305
+ namespace,
2306
+ table,
2307
+ table_version or table_obj.latest_active_table_version,
2308
+ *args,
2309
+ **kwargs,
437
2310
  )
438
2311
 
439
- return TableDefinition.of(
440
- table=table,
441
- table_version=table_version,
442
- stream=stream,
443
- )
2312
+ stream = None
2313
+ if table_version_obj:
2314
+ stream = _get_storage(**kwargs).get_stream(
2315
+ namespace=namespace,
2316
+ table_name=table,
2317
+ table_version=table_version_obj.id,
2318
+ stream_format=stream_format,
2319
+ *args,
2320
+ **kwargs,
2321
+ )
2322
+
2323
+ return TableDefinition.of(
2324
+ table=table_obj,
2325
+ table_version=table_version_obj,
2326
+ stream=stream,
2327
+ )
2328
+ except Exception as e:
2329
+ # If any error occurs, the transaction remains uncommitted
2330
+ commit_transaction = False
2331
+ logger.error(f"Error during get_table: {e}")
2332
+ raise
2333
+ finally:
2334
+ if commit_transaction:
2335
+ # Seal the interactive transaction to commit all operations atomically
2336
+ get_transaction.seal()
444
2337
 
445
2338
 
446
2339
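A hedged usage sketch of the new `get_table` behavior follows. The import path `deltacat.catalog.main.impl` is taken from the files-changed list, the table and namespace names are placeholders, and depending on how the catalog was initialized additional kwargs (e.g. the catalog properties consumed by `_get_storage`) may be required.

```python
from deltacat.catalog.main import impl as catalog

# Per the updated docstring, missing tables return None instead of raising, and
# a missing version or stream surfaces as a None field on the TableDefinition.
table_def = catalog.get_table(
    "events",               # hypothetical table name
    namespace="analytics",  # hypothetical namespace
)
if table_def is None:
    print("table does not exist")
elif table_def.table_version is None:
    print("requested table version not found")
```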
 def truncate_table(
-    table: str, *args, namespace: Optional[str] = None, **kwargs
+    table: str,
+    *args,
+    namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> None:
     """Truncate table data.
 
     Args:
         table: Name of the table to truncate.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        table_version: Optional specific version of the table to truncate. Defaults to the latest active version.
+        transaction: Optional transaction to use. If None, creates a new transaction.
 
     Returns:
         None
@@ -459,7 +2359,12 @@ def truncate_table(
 
 
 def rename_table(
-    table: str, new_name: str, *args, namespace: Optional[str] = None, **kwargs
+    table: str,
+    new_name: str,
+    *args,
+    namespace: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> None:
     """Rename an existing table.
 
@@ -467,6 +2372,7 @@ def rename_table(
         table: Current name of the table.
         new_name: New name for the table.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        transaction: Optional transaction to use. If None, creates a new transaction.
 
     Returns:
         None
@@ -475,71 +2381,219 @@ def rename_table(
         TableNotFoundError: If the table does not exist.
     """
     namespace = namespace or default_namespace()
-    _get_storage(**kwargs).update_table(
-        *args, table_name=table, new_table_name=new_name, namespace=namespace, **kwargs
-    )
 
+    # Set up transaction handling
+    rename_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = rename_transaction
+
+    try:
+        _get_storage(**kwargs).update_table(
+            table_name=table,
+            new_table_name=new_name,
+            namespace=namespace,
+            *args,
+            **kwargs,
+        )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during rename_table: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            rename_transaction.seal()
 
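Because every function now accepts an optional `transaction`, a caller can in principle group several operations and commit them together. The sketch below assumes, without verification, that a caller-supplied transaction is not sealed by the individual functions and must be sealed by its owner; `rename_and_verify` and `txn` are hypothetical names.

```python
# Hypothetical composition: rename a table and verify the new name inside the
# same interactive transaction, then commit everything at once.
# Assumes `txn` was obtained from whatever API creates a deltacat Transaction.
def rename_and_verify(txn, old_name, new_name, namespace="analytics", **kwargs):
    rename_table(old_name, new_name, namespace=namespace, transaction=txn, **kwargs)
    # Reads routed through the same transaction see the uncommitted rename.
    assert table_exists(new_name, namespace=namespace, transaction=txn, **kwargs)
    txn.seal()  # the owner of the transaction commits it
```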
-def table_exists(table: str, *args, namespace: Optional[str] = None, **kwargs) -> bool:
+
+def table_exists(
+    table: str,
+    *args,
+    namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
+    stream_format: StreamFormat = StreamFormat.DELTACAT,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> bool:
     """Check if a table exists in the catalog.
 
     Args:
         table: Name of the table to check.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        table_version: Optional specific version of the table to check. Defaults to the latest active version.
+        stream_format: Optional stream format to check. Defaults to DELTACAT.
+        transaction: Optional transaction to use. If None, creates a new transaction.
 
     Returns:
         True if the table exists, False otherwise.
     """
     namespace = namespace or default_namespace()
-    return _get_storage(**kwargs).table_exists(
-        *args, table_name=table, namespace=namespace, **kwargs
-    )
 
+    # Set up transaction handling
+    exists_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = exists_transaction
 
-def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
+    try:
+        table_obj = _get_storage(**kwargs).get_table(
+            namespace=namespace,
+            table_name=table,
+            *args,
+            **kwargs,
+        )
+        if table_obj is None:
+            return False
+        table_version = table_version or table_obj.latest_active_table_version
+        if not table_version:
+            return False
+        table_version_exists = _get_storage(**kwargs).table_version_exists(
+            namespace,
+            table,
+            table_version,
+            *args,
+            **kwargs,
+        )
+        if not table_version_exists:
+            return False
+        stream_exists = _get_storage(**kwargs).stream_exists(
+            namespace=namespace,
+            table_name=table,
+            table_version=table_version,
+            stream_format=stream_format,
+            *args,
+            **kwargs,
+        )
+        return stream_exists
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during table_exists: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            exists_transaction.seal()
+
+
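Note that `table_exists` now checks more than table metadata: the table must resolve to a given or latest active table version, and that version must expose a stream in the requested format. A condensed restatement of the same decision chain is sketched below; `table_exists_sketch` is a hypothetical name and transaction handling is omitted.

```python
# Sketch only: same chain as the new table_exists, minus transaction handling.
def table_exists_sketch(
    table, namespace, table_version=None,
    stream_format=StreamFormat.DELTACAT, **kwargs,
):
    storage = _get_storage(**kwargs)
    table_obj = storage.get_table(namespace=namespace, table_name=table, **kwargs)
    version = table_version or (table_obj and table_obj.latest_active_table_version)
    return bool(
        table_obj
        and version
        and storage.table_version_exists(namespace, table, version, **kwargs)
        and storage.stream_exists(
            namespace=namespace,
            table_name=table,
            table_version=version,
            stream_format=stream_format,
            **kwargs,
        )
    )
```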
+def list_namespaces(
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> ListResult[Namespace]:
     """List a page of table namespaces.
 
     Args:
-        catalog: Catalog properties instance.
+        transaction: Optional transaction to use. If None, creates a new transaction.
 
     Returns:
         ListResult containing Namespace objects.
     """
-    return _get_storage(**kwargs).list_namespaces(*args, **kwargs)
+    # Set up transaction handling
+    list_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = list_transaction
+
+    try:
+        result = _get_storage(**kwargs).list_namespaces(*args, **kwargs)
+
+        return result
 
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during list_namespaces: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            list_transaction.seal()
 
-def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
+
+def get_namespace(
+    namespace: str,
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> Optional[Namespace]:
     """Get metadata for a specific table namespace.
 
     Args:
         namespace: Name of the namespace to retrieve.
+        transaction: Optional transaction to use. If None, creates a new transaction.
 
     Returns:
         Namespace object if the namespace exists, None otherwise.
     """
-    return _get_storage(**kwargs).get_namespace(*args, namespace=namespace, **kwargs)
+    # Set up transaction handling
+    get_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = get_ns_transaction
+
+    try:
+        result = _get_storage(**kwargs).get_namespace(
+            *args, namespace=namespace, **kwargs
+        )
+
+        return result
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during get_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            get_ns_transaction.seal()
 
 
-def namespace_exists(namespace: str, *args, **kwargs) -> bool:
+def namespace_exists(
+    namespace: str,
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> bool:
     """Check if a namespace exists.
 
     Args:
         namespace: Name of the namespace to check.
+        transaction: Optional transaction to use for reading. If provided, will see uncommitted changes.
 
     Returns:
         True if the namespace exists, False otherwise.
     """
-    return _get_storage(**kwargs).namespace_exists(*args, namespace=namespace, **kwargs)
+    # Set up transaction handling
+    exists_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = exists_transaction
+
+    try:
+        result = _get_storage(**kwargs).namespace_exists(
+            *args, namespace=namespace, **kwargs
+        )
+
+        return result
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during namespace_exists: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            exists_transaction.seal()
 
 
536
- namespace: str, *args, properties: Optional[NamespaceProperties] = None, **kwargs
2585
+ namespace: str,
2586
+ *args,
2587
+ properties: Optional[NamespaceProperties] = None,
2588
+ transaction: Optional[Transaction] = None,
2589
+ **kwargs,
537
2590
  ) -> Namespace:
538
2591
  """Create a new namespace.
539
2592
 
540
2593
  Args:
541
2594
  namespace: Name of the namespace to create.
542
2595
  properties: Optional properties for the namespace.
2596
+ transaction: Optional transaction to use. If None, creates a new transaction.
543
2597
 
544
2598
  Returns:
545
2599
  Created Namespace object.
@@ -547,12 +2601,29 @@ def create_namespace(
547
2601
  Raises:
548
2602
  NamespaceAlreadyExistsError: If the namespace already exists.
549
2603
  """
550
- if namespace_exists(namespace, **kwargs):
551
- raise NamespaceAlreadyExistsError(f"Namespace {namespace} already exists")
2604
+ # Set up transaction handling
2605
+ namespace_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2606
+ kwargs["transaction"] = namespace_transaction
552
2607
 
553
- return _get_storage(**kwargs).create_namespace(
554
- *args, namespace=namespace, properties=properties, **kwargs
555
- )
2608
+ try:
2609
+ if namespace_exists(namespace, **kwargs):
2610
+ raise NamespaceAlreadyExistsError(f"Namespace {namespace} already exists")
2611
+
2612
+ result = _get_storage(**kwargs).create_namespace(
2613
+ *args, namespace=namespace, properties=properties, **kwargs
2614
+ )
2615
+
2616
+ return result
2617
+
2618
+ except Exception as e:
2619
+ # If any error occurs, the transaction remains uncommitted
2620
+ commit_transaction = False
2621
+ logger.error(f"Error during create_namespace: {e}")
2622
+ raise
2623
+ finally:
2624
+ if commit_transaction:
2625
+ # Seal the interactive transaction to commit all operations atomically
2626
+ namespace_transaction.seal()
556
2627
 
557
2628
 
558
2629
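`create_namespace` still performs its own existence check, now inside the transaction, so a pre-existing namespace surfaces as `NamespaceAlreadyExistsError`. A hedged usage sketch follows; names are placeholders and extra kwargs (such as catalog properties) may be required depending on initialization.

```python
# Idempotent namespace creation: reuse the namespace if it already exists.
if namespace_exists("analytics"):
    ns = get_namespace("analytics")
else:
    try:
        ns = create_namespace("analytics", properties=None)
    except NamespaceAlreadyExistsError:
        # Another writer created it between the check and the create.
        ns = get_namespace("analytics")
```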
 def alter_namespace(
@@ -560,6 +2631,7 @@ def alter_namespace(
     *args,
     properties: Optional[NamespaceProperties] = None,
     new_namespace: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """Alter a namespace definition.
@@ -568,26 +2640,49 @@ def alter_namespace(
         namespace: Name of the namespace to alter.
         properties: Optional new properties for the namespace.
         new_namespace: Optional new name for the namespace.
+        transaction: Optional transaction to use. If None, creates a new transaction.
 
     Returns:
         None
     """
-    _get_storage(**kwargs).update_namespace(
-        namespace=namespace,
-        properties=properties,
-        new_namespace=new_namespace,
-        *args,
-        **kwargs,
-    )
+    # Set up transaction handling
+    alter_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = alter_ns_transaction
+
+    try:
+        _get_storage(**kwargs).update_namespace(
+            namespace=namespace,
+            properties=properties,
+            new_namespace=new_namespace,
+            *args,
+            **kwargs,
+        )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during alter_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            alter_ns_transaction.seal()
 
 
-def drop_namespace(namespace: str, *args, purge: bool = False, **kwargs) -> None:
+def drop_namespace(
+    namespace: str,
+    *args,
+    purge: bool = False,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> None:
     """Drop a namespace and all of its tables from the catalog.
 
     Args:
         namespace: Name of the namespace to drop.
-        purge: If True, permanently delete all tables in the namespace.
-            If False, only remove from catalog.
+        purge: If True, permanently delete all table data in the namespace.
+            If False, only removes the namespace from the catalog.
+        transaction: Optional transaction to use. If None, creates a new transaction.
 
     Returns:
         None
@@ -597,50 +2692,39 @@ def drop_namespace(namespace: str, *args, purge: bool = False, **kwargs) -> None
     if purge:
         raise NotImplementedError("Purge flag is not currently supported.")
 
-    _get_storage(**kwargs).delete_namespace(
-        *args, namespace=namespace, purge=purge, **kwargs
-    )
+    # Set up transaction handling
+    drop_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = drop_ns_transaction
+
+    try:
+        _get_storage(**kwargs).delete_namespace(
+            *args,
+            namespace=namespace,
+            purge=purge,
+            **kwargs,
+        )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during drop_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            drop_ns_transaction.seal()
 
 
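As the body above shows, `purge=True` is rejected before any transaction work starts, so only a catalog-level (metadata) drop is currently possible. A short sketch of the observable behavior, with a placeholder namespace name:

```python
try:
    drop_namespace("analytics", purge=True)   # data purge is not implemented yet
except NotImplementedError:
    drop_namespace("analytics")               # metadata-only drop from the catalog
```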
 def default_namespace(*args, **kwargs) -> str:
     """Return the default namespace for the catalog.
 
     Returns:
-        String name of the default namespace.
+        Name of the default namespace.
     """
-    return DEFAULT_NAMESPACE  # table functions
-
-
-def _validate_read_table_args(
-    namespace: Optional[str] = None,
-    table_type: Optional[TableType] = None,
-    distributed_dataset_type: Optional[DistributedDatasetType] = None,
-    merge_on_read: Optional[bool] = None,
-    **kwargs,
-):
-    storage = _get_storage(**kwargs)
-    if storage is None:
-        raise ValueError(
-            "Catalog not initialized. Did you miss calling "
-            "initialize(ds=<deltacat_storage>)?"
-        )
-
-    if merge_on_read:
-        raise ValueError("Merge on read not supported currently.")
-
-    if table_type is not TableType.PYARROW:
-        raise ValueError("Only PYARROW table type is supported as of now")
-
-    if distributed_dataset_type is not DistributedDatasetType.DAFT:
-        raise ValueError("Only DAFT dataset type is supported as of now")
+    return DEFAULT_NAMESPACE
 
-    if namespace is None:
-        raise ValueError(
-            "namespace must be passed to uniquely identify a table in the catalog."
-        )
 
-
-def _get_latest_or_given_table_version(
+def _get_latest_active_or_given_table_version(
     namespace: str,
     table_name: str,
     table_version: Optional[str] = None,
@@ -649,9 +2733,16 @@ def _get_latest_or_given_table_version(
 ) -> TableVersion:
     table_version_obj = None
     if table_version is None:
-        table_version_obj = _get_storage(**kwargs).get_latest_table_version(
-            namespace=namespace, table_name=table_name, *args, **kwargs
+        table_version_obj = _get_storage(**kwargs).get_latest_active_table_version(
+            namespace=namespace,
+            table_name=table_name,
+            *args,
+            **kwargs,
         )
+        if table_version_obj is None:
+            raise TableVersionNotFoundError(
+                f"No active table version found for table {namespace}.{table_name}"
+            )
         table_version = table_version_obj.table_version
     else:
         table_version_obj = _get_storage(**kwargs).get_table_version(
@@ -665,18 +2756,82 @@ def _get_latest_or_given_table_version(
     return table_version_obj
 
 
+def _get_all_committed_partitions(
+    table: str,
+    namespace: str,
+    table_version: str,
+    **kwargs,
+) -> List[Union[Partition, PartitionLocator]]:
+    """Get all committed partitions for a table and validate uniqueness."""
+    logger.info(
+        f"Reading all partitions metadata in the table={table} "
+        "as partition_filter was None."
+    )
+
+    all_partitions = (
+        _get_storage(**kwargs)
+        .list_partitions(
+            table_name=table,
+            namespace=namespace,
+            table_version=table_version,
+            **kwargs,
+        )
+        .all_items()
+    )
+
+    committed_partitions = [
+        partition
+        for partition in all_partitions
+        if partition.state == CommitState.COMMITTED
+    ]
+
+    logger.info(
+        f"Found {len(committed_partitions)} committed partitions for "
+        f"table={namespace}/{table}/{table_version}"
+    )
+
+    _validate_partition_uniqueness(
+        committed_partitions, namespace, table, table_version
+    )
+    return committed_partitions
+
+
+def _validate_partition_uniqueness(
+    partitions: List[Partition], namespace: str, table: str, table_version: str
+) -> None:
+    """Validate that there are no duplicate committed partitions for the same partition values."""
+    commit_count_per_partition_value = defaultdict(int)
+    for partition in partitions:
+        # Normalize partition values: both None and [] represent unpartitioned data
+        normalized_values = (
+            None
+            if (
+                partition.partition_values is None
+                or (
+                    isinstance(partition.partition_values, list)
+                    and len(partition.partition_values) == 0
+                )
+            )
+            else partition.partition_values
+        )
+        commit_count_per_partition_value[normalized_values] += 1
+
+    # Check for multiple committed partitions for the same partition values
+    for partition_values, commit_count in commit_count_per_partition_value.items():
+        if commit_count > 1:
+            raise RuntimeError(
+                f"Multiple committed partitions found for table={namespace}/{table}/{table_version}. "
+                f"Partition values: {partition_values}. Commit count: {commit_count}. "
+                f"This should not happen."
+            )
+
+
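The uniqueness check treats `None` and an empty list of partition values as the same (unpartitioned) key before counting commits. A self-contained illustration of that normalization, using plain stand-in objects rather than real `Partition` instances:

```python
from collections import defaultdict

# Stand-in records: only the attribute used by the validator is modeled here.
class FakePartition:
    def __init__(self, partition_values):
        self.partition_values = partition_values

partitions = [FakePartition(None), FakePartition([]), FakePartition(("2024-01-01",))]

commit_count = defaultdict(int)
for p in partitions:
    values = p.partition_values
    # None and [] both normalize to None (unpartitioned data).
    normalized = None if values is None or (isinstance(values, list) and len(values) == 0) else values
    commit_count[normalized] += 1

print(dict(commit_count))  # {None: 2, ('2024-01-01',): 1} -> duplicate unpartitioned commits
```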
 def _get_deltas_from_partition_filter(
     partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
-    stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
     *args,
     **kwargs,
 ):
-
     result_deltas = []
-    start_stream_position, end_stream_position = stream_position_range_inclusive or (
-        None,
-        None,
-    )
     for partition_like in partition_filter:
         deltas = (
             _get_storage(**kwargs)
@@ -684,32 +2839,39 @@ def _get_deltas_from_partition_filter(
                 partition_like=partition_like,
                 ascending_order=True,
                 include_manifest=True,
-                start_stream_position=start_stream_position,
-                last_stream_position=end_stream_position,
                 *args,
                 **kwargs,
             )
             .all_items()
         )
 
-        for delta in deltas:
-            if (
-                start_stream_position is None
-                or delta.stream_position >= start_stream_position
-            ) and (
-                end_stream_position is None
-                or delta.stream_position <= end_stream_position
-            ):
-                if delta.type == DeltaType.DELETE:
-                    raise ValueError("DELETE type deltas are not supported")
-                result_deltas.append(delta)
-
+        # Validate that all qualified deltas are append type - merge-on-read not yet implemented
+        # TODO(pdames): Run compaction minus materialize for MoR of each partition.
+        if deltas:
+            non_append_deltas = []
+            for delta in deltas:
+                if delta.type != DeltaType.APPEND:
+                    non_append_deltas.append(delta)
+                else:
+                    result_deltas.append(delta)
+            if non_append_deltas:
+                delta_types = {delta.type for delta in non_append_deltas}
+                delta_info = [
+                    (str(delta.locator), delta.type) for delta in non_append_deltas[:5]
+                ]  # Show first 5
+                raise NotImplementedError(
+                    f"Merge-on-read is not yet implemented. Found {len(non_append_deltas)} non-append deltas "
+                    f"with types {delta_types}. All deltas must be APPEND type for read operations. "
+                    f"Examples: {delta_info}. Please run compaction first to merge non-append deltas."
+                )
+
+        logger.info(f"Validated {len(deltas)} qualified deltas are all APPEND type")
     return result_deltas
 
 
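Reads now fail fast whenever a qualified delta is not an `APPEND`: instead of rejecting only `DELETE` deltas (the removed behavior), any non-append delta raises `NotImplementedError` until merge-on-read lands, and the suggested remedy is to compact first. The sketch below mirrors that partitioning of deltas with lightweight stand-ins; the real `Delta` and `DeltaType` live in deltacat's storage model, not here.

```python
from enum import Enum

# Stand-ins only, for illustration of the append-only read check.
class DeltaType(Enum):
    APPEND = "append"
    UPSERT = "upsert"
    DELETE = "delete"

deltas = [DeltaType.APPEND, DeltaType.UPSERT, DeltaType.APPEND]

appendable = [d for d in deltas if d is DeltaType.APPEND]
non_append = [d for d in deltas if d is not DeltaType.APPEND]
try:
    if non_append:
        raise NotImplementedError(
            f"Merge-on-read is not yet implemented; found {len(non_append)} "
            f"non-append delta(s): {sorted(d.value for d in non_append)}. "
            "Run compaction first."
        )
except NotImplementedError as e:
    print(e)
```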
 def _get_storage(**kwargs):
     """
-    Returns the implementation of `deltacat.storage.interface` to use with this catalog.
+    Returns the implementation of `deltacat.storage.interface` to use with this catalog
 
     This is configured in the `CatalogProperties` stored during initialization and passed through `delegate.py`
     """
@@ -717,4 +2879,4 @@ def _get_storage(**kwargs):
     if properties is not None and properties.storage is not None:
         return properties.storage
     else:
-        return storage_impl
+        return dc.storage.metastore