deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/catalog/main/impl.py
@@ -0,0 +1,2882 @@
+ from typing import Any, Dict, List, Optional, Union, Tuple, Set
+ import logging
+ from collections import defaultdict
+
+ import numpy as np
+ import pyarrow as pa
+ import pandas as pd
+ import daft
+ import deltacat as dc
+
+ from deltacat.storage.model.manifest import ManifestAuthor
+ from deltacat.catalog.model.properties import CatalogProperties
+ from deltacat.exceptions import (
+     NamespaceAlreadyExistsError,
+     TableAlreadyExistsError,
+     TableVersionNotFoundError,
+     TableNotFoundError,
+     TableVersionAlreadyExistsError,
+     TableValidationError,
+     SchemaValidationError,
+ )
+ from deltacat.catalog.model.table_definition import TableDefinition
+ from deltacat.storage.model.sort_key import SortScheme
+ from deltacat.storage.model.list_result import ListResult
+ from deltacat.storage.model.namespace import Namespace, NamespaceProperties
+ from deltacat.storage.model.schema import (
+     Schema,
+     SchemaUpdate,
+ )
+ from deltacat.storage.model.table import TableProperties, Table
+ from deltacat.storage.model.types import (
+     Dataset,
+     LifecycleState,
+     StreamFormat,
+     SchemaConsistencyType,
+ )
+ from deltacat.storage.model.partition import (
+     Partition,
+     PartitionLocator,
+     PartitionScheme,
+ )
+ from deltacat.storage.model.table_version import (
+     TableVersion,
+     TableVersionProperties,
+ )
+ from deltacat.storage.model.types import DeltaType
+ from deltacat.storage import Delta
+ from deltacat.storage.model.types import CommitState
+ from deltacat.storage.model.transaction import (
+     Transaction,
+     setup_transaction,
+ )
+ from deltacat.types.media import (
+     ContentType,
+     DatasetType,
+     StorageType,
+     SCHEMA_CONTENT_TYPES,
+ )
+ from deltacat.types.tables import (
+     SchemaEvolutionMode,
+     TableProperty,
+     TablePropertyDefaultValues,
+     TableReadOptimizationLevel,
+     TableWriteMode,
+     get_dataset_type,
+     get_table_schema,
+     get_table_column_names,
+     from_pyarrow,
+     concat_tables,
+     empty_table,
+     infer_table_schema,
+     to_pandas,
+ )
+ from deltacat.utils import pyarrow as pa_utils
+ from deltacat.utils.reader_compatibility_mapping import get_compatible_readers
+ from deltacat.utils.pyarrow import get_base_arrow_type_name
+ from deltacat import logs
+ from deltacat.constants import DEFAULT_NAMESPACE
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+ """
+ Default Catalog interface implementation using DeltaCAT native storage.
+
+ The functions here should not be invoked directly, but should instead be
+ invoked through `delegate.py` (e.g., to support passing catalogs by name, and
+ to ensure that each initialized `Catalog` implementation has its `inner`
+ property set to the `CatalogProperties` returned from `initialize()`).
+
+ The `CatalogProperties` instance returned by `initialize()` contains all
+ durable state required to deterministically reconstruct the associated DeltaCAT
+ native `Catalog` implementation (e.g., the root URI for the catalog metastore).
+ """
+
+
+ # catalog functions
+ def initialize(
+     config: Optional[CatalogProperties] = None,
+     *args,
+     **kwargs,
+ ) -> CatalogProperties:
+     """
+     Performs any required one-time initialization and validation of this
+     catalog implementation based on the input configuration. If no config
+     instance is given, a new `CatalogProperties` instance is constructed
+     using the given keyword arguments.
+
+     Returns the input config if given, and the newly created config otherwise.
+     """
+     if config is not None:
+         if not isinstance(config, CatalogProperties):
+             raise ValueError(
+                 f"Expected `CatalogProperties` but found `{type(config)}`."
+             )
+         return config
+     else:
+         return CatalogProperties(*args, **kwargs)
+
+
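For illustration, a minimal usage sketch of the `initialize` entry point above. It assumes `CatalogProperties` can be constructed with no arguments (its constructor parameters, such as the catalog root URI, are defined elsewhere in this release), and it calls the implementation module directly even though the module docstring notes that callers normally go through `delegate.py`:

    # Illustrative sketch only; CatalogProperties constructor arguments are assumed.
    from deltacat.catalog.model.properties import CatalogProperties
    from deltacat.catalog.main import impl as catalog_impl

    props = catalog_impl.initialize(CatalogProperties())
    assert isinstance(props, CatalogProperties)  # initialize() returns the config it was given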
+ # table functions
+ def _validate_write_mode_and_table_existence(
+     table: str,
+     namespace: str,
+     mode: TableWriteMode,
+     **kwargs,
+ ) -> bool:
+     """Validate write mode against table existence and return whether table exists."""
+     table_exists_flag = table_exists(
+         table,
+         namespace=namespace,
+         **kwargs,
+     )
+     logger.info(f"Table to write to ({namespace}.{table}) exists: {table_exists_flag}")
+
+     if mode == TableWriteMode.CREATE and table_exists_flag:
+         raise ValueError(
+             f"Table {namespace}.{table} already exists and mode is CREATE."
+         )
+     elif (
+         mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
+         and not table_exists_flag
+     ):
+         raise TableNotFoundError(
+             f"Table {namespace}.{table} does not exist and mode is {mode.value.upper() if hasattr(mode, 'value') else str(mode).upper()}. Use CREATE or AUTO mode to create a new table."
+         )
+
+     return table_exists_flag
+
+
+ def _get_table_and_validate_write_mode(
+     table: str,
+     namespace: str,
+     table_version: Optional[str],
+     mode: TableWriteMode,
+     **kwargs,
+ ) -> Tuple[bool, TableDefinition]:
+     """Validate write mode against table and table version existence.
+
+     Returns:
+         Tuple of (table_exists_flag, table_definition)
+     """
+     # First validate table, table version, and stream existence
+     existing_table_def = get_table(
+         table,
+         namespace=namespace,
+         table_version=table_version,
+         **kwargs,
+     )
+     table_exists_flag = (
+         existing_table_def is not None
+         and existing_table_def.table_version
+         and existing_table_def.stream
+     )
+     logger.info(f"Table to write to ({namespace}.{table}) exists: {table_exists_flag}")
+
+     # Then validate table existence constraints
+     if mode == TableWriteMode.CREATE and table_exists_flag and table_version is None:
+         raise TableAlreadyExistsError(
+             f"Table {namespace}.{table} already exists and mode is CREATE."
+         )
+     elif (
+         mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
+         and existing_table_def is None
+     ):
+         raise TableNotFoundError(
+             f"Table {namespace}.{table} does not exist and write mode is {mode}. Use CREATE or AUTO mode to create a new table."
+         )
+
+     # Then validate table version existence constraints
+     if table_version is not None and table_exists_flag:
+         if mode == TableWriteMode.CREATE:
+             raise TableVersionAlreadyExistsError(
+                 f"Table version {namespace}.{table}.{table_version} already exists and mode is CREATE."
+             )
+         logger.info(f"Table version ({namespace}.{table}.{table_version}) exists.")
+     elif (
+         mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
+         and table_version is not None
+         and not table_exists_flag
+     ):
+         raise TableVersionNotFoundError(
+             f"Table version {namespace}.{table}.{table_version} does not exist and write mode is {mode}. "
+             f"Use CREATE or AUTO mode to create a new table version, or omit table_version "
+             f"to use the latest version."
+         )
+     return table_exists_flag, existing_table_def
+
+
+ def _validate_content_type_against_supported_content_types(
+     namespace: str,
+     table: str,
+     content_type: ContentType,
+     supported_content_types: Optional[List[ContentType]],
+ ) -> None:
+     if supported_content_types and content_type not in supported_content_types:
+         raise ValueError(
+             f"Content type proposed for write to table {namespace}.{table} ({content_type}) "
+             f"conflicts with the proposed list of new supported content types: {supported_content_types}"
+         )
+
+
+ def _create_table_for_write(
+     data: Dataset,
+     table: str,
+     namespace: str,
+     table_version: Optional[str],
+     content_type: ContentType,
+     existing_table_definition: Optional[TableDefinition],
+     *args,
+     **kwargs,
+ ) -> TableDefinition:
+     """Creates a new table, table version, and/or stream in preparation for a write operation."""
+     if "schema" not in kwargs:
+         kwargs["schema"] = infer_table_schema(data)
+
+     _validate_content_type_against_supported_content_types(
+         namespace,
+         table,
+         content_type,
+         kwargs.get("content_types"),
+     )
+     return create_table(
+         table,
+         namespace=namespace,
+         table_version=table_version,
+         existing_table_definition=existing_table_definition,
+         *args,
+         **kwargs,
+     )
+
+
+ def write_to_table(
+     data: Dataset,
+     table: str,
+     *args,
+     namespace: Optional[str] = None,
+     table_version: Optional[str] = None,
+     mode: TableWriteMode = TableWriteMode.AUTO,
+     content_type: ContentType = ContentType.PARQUET,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> None:
+     """Write local or distributed data to a table. Raises an error if the
+     table does not exist and the table write mode is not CREATE or AUTO.
+
+     When creating a table, all `create_table` parameters may be optionally
+     specified as additional keyword arguments. When appending to, or replacing,
+     an existing table, all `alter_table` parameters may be optionally specified
+     as additional keyword arguments.
+
+     Args:
+         data: Local or distributed data to write to the table.
+         table: Name of the table to write to.
+         namespace: Optional namespace for the table. Uses default if not specified.
+         table_version: Optional version of the table to write to. If specified,
+             will create this version if it doesn't exist (in CREATE mode) or
+             get this version if it exists (in other modes). If not specified,
+             uses the latest version.
+         mode: Write mode (AUTO, CREATE, APPEND, REPLACE, MERGE, DELETE).
+         content_type: Content type used to write the data files. Defaults to PARQUET.
+         transaction: Optional transaction to append write operations to instead of
+             creating and committing a new transaction.
+         **kwargs: Additional keyword arguments.
+     """
+     namespace = namespace or default_namespace()
+
+     # Set up transaction handling
+     write_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     kwargs["transaction"] = write_transaction
+
+     try:
+         # Validate write mode and table/table version/stream existence
+         (table_exists_flag, table_definition,) = _get_table_and_validate_write_mode(
+             table,
+             namespace,
+             table_version,
+             mode,
+             **kwargs,
+         )
+
+         # Get or create table, table version, and/or stream
+         if not table_exists_flag:
+             table_definition = _create_table_for_write(
+                 data,
+                 table,
+                 namespace,
+                 table_version,
+                 content_type,
+                 table_definition,
+                 *args,
+                 **kwargs,
+             )
+         else:
+             # call alter_table if there are any alter_table kwargs provided
+             if (
+                 "lifecycle_state" in kwargs
+                 or "schema_updates" in kwargs
+                 or "partition_updates" in kwargs
+                 or "sort_scheme" in kwargs
+                 or "table_description" in kwargs
+                 or "table_version_description" in kwargs
+                 or "table_properties" in kwargs
+                 or "table_version_properties" in kwargs
+             ):
+                 alter_table(
+                     table,
+                     namespace=namespace,
+                     table_version=table_version,
+                     *args,
+                     **kwargs,
+                 )
+
+         # Get the active table version and stream
+         table_version_obj = _get_latest_active_or_given_table_version(
+             namespace=table_definition.table.namespace,
+             table_name=table_definition.table.table_name,
+             table_version=table_version or table_definition.table_version.table_version,
+             **kwargs,
+         )
+
+         # Validate schema compatibility for schemaless content types with schema tables
+         if (
+             content_type.value not in SCHEMA_CONTENT_TYPES
+             and table_version_obj.schema is not None
+         ):
+             schemaless_types = {
+                 ct for ct in ContentType if ct.value not in SCHEMA_CONTENT_TYPES
+             }
+             raise TableValidationError(
+                 f"Content type '{content_type.value}' cannot be written to a table with a schema. "
+                 f"Table '{namespace}.{table}' has a schema, but content type '{content_type.value}' "
+                 f"is schemaless. Schemaless content types ({', '.join(sorted([ct.value for ct in schemaless_types]))}) "
+                 f"can only be written to schemaless tables."
+             )
+
+         # Handle different write modes and get stream and delta type
+         stream, delta_type = _handle_write_mode(
+             mode,
+             table_definition,
+             table_version_obj,
+             namespace,
+             table,
+             **kwargs,
+         )
+
+         if not stream:
+             raise ValueError(f"No default stream found for table {namespace}.{table}")
+
+         # Automatically set entry_params for DELETE/MERGE modes if not provided
+         _set_entry_params_if_needed(
+             mode,
+             table_version_obj,
+             kwargs,
+         )
+
+         # Validate table configuration
+         _validate_table_configuration(
+             stream,
+             table_version_obj,
+             namespace,
+             table,
+         )
+
+         # Handle partition creation/retrieval
+         partition, commit_staged_partition = _handle_partition_creation(
+             mode,
+             table_exists_flag,
+             delta_type,
+             stream,
+             **kwargs,
+         )
+
+         # Get table properties for schema evolution
+         schema_evolution_mode = table_version_obj.read_table_property(
+             TableProperty.SCHEMA_EVOLUTION_MODE
+         )
+         default_schema_consistency_type = table_version_obj.read_table_property(
+             TableProperty.DEFAULT_SCHEMA_CONSISTENCY_TYPE
+         )
+
+         # Convert unsupported dataset types and NumPy arrays that need schema validation
+         if isinstance(data, np.ndarray) and table_version_obj.schema is not None:
+             # NumPy arrays need conversion to Pandas for proper column naming in schema validation
+             converted_data = _convert_numpy_for_schema_validation(
+                 data, table_version_obj.schema
+             )
+         else:
+             # Convert other unsupported dataset types (e.g., Daft) or keep NumPy as-is for schemaless tables
+             converted_data = _convert_data_if_needed(data)
+
+         # Capture original field set before schema coercion for partial UPSERT support
+         original_fields = set(get_table_column_names(converted_data))
+
+         # Validate and coerce data against schema
+         # This ensures proper schema evolution and type handling
+         (
+             validated_data,
+             schema_modified,
+             updated_schema,
+         ) = _validate_and_coerce_data_against_schema(
+             converted_data,  # Use converted data for NumPy, original for others
+             table_version_obj.schema,
+             schema_evolution_mode=schema_evolution_mode,
+             default_schema_consistency_type=default_schema_consistency_type,
+         )
+
+         # Convert validated data to supported format for storage if needed
+         converted_data = _convert_data_if_needed(validated_data)
+
+         # Validate reader compatibility against supported reader types
+         supported_reader_types = table_version_obj.read_table_property(
+             TableProperty.SUPPORTED_READER_TYPES
+         )
+         _validate_reader_compatibility(
+             converted_data,
+             content_type,
+             supported_reader_types,
+         )
+
+         # Update table version if schema was modified during evolution
+         if schema_modified:
+             # Extract catalog properties and filter kwargs
+             catalog_kwargs = {
+                 "catalog": kwargs.get("catalog"),
+                 "inner": kwargs.get("inner"),
+                 "transaction": write_transaction,  # Pass transaction to update_table_version
+             }
+
+             _get_storage(**catalog_kwargs).update_table_version(
+                 namespace=namespace,
+                 table_name=table,
+                 table_version=table_version_obj.table_version,
+                 schema=updated_schema,
+                 **catalog_kwargs,
+             )
+
+         # Stage and commit delta, handle compaction
+         # Remove schema from kwargs to avoid duplicate parameter conflict
+         filtered_kwargs = {k: v for k, v in kwargs.items() if k != "schema"}
+         # Use updated schema if schema evolution occurred, otherwise use original schema
+         _stage_commit_and_compact(
+             converted_data,
+             partition,
+             delta_type,
+             content_type,
+             commit_staged_partition,
+             table_version_obj,
+             namespace,
+             table,
+             schema=updated_schema if schema_modified else table_version_obj.schema,
+             original_fields=original_fields,
+             **filtered_kwargs,
+         )
+     except Exception as e:
+         # If any error occurs, the transaction remains uncommitted
+         commit_transaction = False
+         logger.error(f"Error during write_to_table: {e}")
+         raise
+     finally:
+         if commit_transaction:
+             # Seal the interactive transaction to commit all operations atomically
+             write_transaction.seal()
+
+
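A hedged usage sketch of the `write_to_table` entry point defined above, writing a small pandas DataFrame in the default AUTO mode. The catalog wiring that `delegate.py` normally injects (e.g., the `inner` catalog properties passed through `**kwargs`) is assumed and omitted here:

    # Illustrative sketch only; catalog/storage kwargs normally supplied by delegate.py are omitted.
    import pandas as pd
    from deltacat.catalog.main import impl as catalog_impl
    from deltacat.types.media import ContentType
    from deltacat.types.tables import TableWriteMode

    catalog_impl.write_to_table(
        data=pd.DataFrame({"id": [1, 2], "value": ["a", "b"]}),
        table="events",                    # hypothetical table name
        namespace="analytics",             # hypothetical namespace
        mode=TableWriteMode.AUTO,          # creates the table if it does not exist
        content_type=ContentType.PARQUET,  # default data file format
    )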
+ def _handle_write_mode(
+     mode: TableWriteMode,
+     table_definition: TableDefinition,
+     table_version_obj: TableVersion,
+     namespace: str,
+     table: str,
+     **kwargs,
+ ) -> Tuple[Any, DeltaType]:  # Using Any for stream type to avoid complex imports
+     """Handle different write modes and return appropriate stream and delta type."""
+     table_schema = table_definition.table_version.schema
+
+     if mode == TableWriteMode.REPLACE:
+         return _handle_replace_mode(
+             table_schema,
+             namespace,
+             table,
+             table_version_obj,
+             **kwargs,
+         )
+     elif mode == TableWriteMode.APPEND:
+         return _handle_append_mode(
+             table_schema,
+             namespace,
+             table,
+             table_version_obj,
+             **kwargs,
+         )
+     elif mode in (TableWriteMode.MERGE, TableWriteMode.DELETE):
+         return _handle_merge_delete_mode(
+             mode,
+             table_schema,
+             namespace,
+             table,
+             table_version_obj,
+             **kwargs,
+         )
+     else:
+         # AUTO and CREATE modes
+         return _handle_auto_create_mode(
+             table_schema,
+             namespace,
+             table,
+             table_version_obj,
+             **kwargs,
+         )
+
+
+ def _handle_replace_mode(
+     table_schema,
+     namespace: str,
+     table: str,
+     table_version_obj: TableVersion,
+     **kwargs,
+ ) -> Tuple[Any, DeltaType]:
+     """Handle REPLACE mode by staging and committing a new stream."""
+     stream = _get_storage(**kwargs).stage_stream(
+         namespace=namespace,
+         table_name=table,
+         table_version=table_version_obj.table_version,
+         **kwargs,
+     )
+
+     stream = _get_storage(**kwargs).commit_stream(stream=stream, **kwargs)
+     delta_type = (
+         DeltaType.UPSERT
+         if table_schema and table_schema.merge_keys
+         else DeltaType.APPEND
+     )
+     return stream, delta_type
+
+
+ def _handle_append_mode(
+     table_schema,
+     namespace: str,
+     table: str,
+     table_version_obj: TableVersion,
+     **kwargs,
+ ) -> Tuple[Any, DeltaType]:
+     """Handle APPEND mode by validating no merge keys and getting existing stream."""
+     if table_schema and table_schema.merge_keys:
+         raise SchemaValidationError(
+             f"APPEND mode cannot be used with tables that have merge keys. "
+             f"Table {namespace}.{table} has merge keys: {table_schema.merge_keys}. "
+             f"Use MERGE mode instead."
+         )
+
+     stream = _get_table_stream(
+         namespace,
+         table,
+         table_version_obj.table_version,
+         **kwargs,
+     )
+     return stream, DeltaType.APPEND
+
+
+ def _handle_merge_delete_mode(
+     mode: TableWriteMode,
+     table_schema,
+     namespace: str,
+     table: str,
+     table_version_obj: TableVersion,
+     **kwargs,
+ ) -> Tuple[Any, DeltaType]:
+     """Handle MERGE/DELETE modes by validating merge keys and getting existing stream."""
+     if not table_schema or not table_schema.merge_keys:
+         raise TableValidationError(
+             f"{mode.value.upper() if hasattr(mode, 'value') else str(mode).upper()} mode requires tables to have at least one merge key. "
+             f"Table {namespace}.{table}.{table_version_obj.table_version} has no merge keys. "
+             f"Use APPEND, AUTO, or REPLACE mode instead."
+         )
+
+     stream = _get_table_stream(
+         namespace,
+         table,
+         table_version_obj.table_version,
+         **kwargs,
+     )
+     delta_type = DeltaType.UPSERT if mode == TableWriteMode.MERGE else DeltaType.DELETE
+     return stream, delta_type
+
+
+ def _handle_auto_create_mode(
+     table_schema,
+     namespace: str,
+     table: str,
+     table_version_obj: TableVersion,
+     **kwargs,
+ ) -> Tuple[Any, DeltaType]:
+     """Handle AUTO and CREATE modes by getting existing stream."""
+     stream = _get_table_stream(
+         namespace,
+         table,
+         table_version_obj.table_version,
+         **kwargs,
+     )
+     delta_type = (
+         DeltaType.UPSERT
+         if table_schema and table_schema.merge_keys
+         else DeltaType.APPEND
+     )
+     return stream, delta_type
+
+
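The four handlers above reduce each write mode to a delta type. A short sketch of the resulting mapping for a table whose schema declares at least one merge key (without merge keys, REPLACE/AUTO/CREATE fall back to APPEND, and MERGE/DELETE are rejected):

    from deltacat.storage.model.types import DeltaType
    from deltacat.types.tables import TableWriteMode

    # Mapping implemented by the _handle_*_mode helpers for a merge-keyed schema.
    mode_to_delta_type = {
        TableWriteMode.REPLACE: DeltaType.UPSERT,  # staged on a freshly committed stream
        TableWriteMode.MERGE: DeltaType.UPSERT,
        TableWriteMode.DELETE: DeltaType.DELETE,
        TableWriteMode.AUTO: DeltaType.UPSERT,
        TableWriteMode.CREATE: DeltaType.UPSERT,
        # TableWriteMode.APPEND raises SchemaValidationError when merge keys exist.
    }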
+ def _validate_table_configuration(
+     stream,
+     table_version_obj: TableVersion,
+     namespace: str,
+     table: str,
+ ) -> None:
+     """Validate table configuration for unsupported features."""
+     # Check if table is partitioned
+     if (
+         stream.partition_scheme
+         and stream.partition_scheme.keys is not None
+         and len(stream.partition_scheme.keys) > 0
+     ):
+         raise NotImplementedError(
+             f"write_to_table does not yet support partitioned tables. "
+             f"Table {namespace}.{table} has partition scheme with "
+             f"{len(stream.partition_scheme.keys)} partition key(s): "
+             f"{[key.name or key.key[0] for key in stream.partition_scheme.keys]}. "
+             f"Please use the lower-level metastore API for partitioned tables."
+         )
+
+     # Check if table has sort keys
+     if (
+         table_version_obj.sort_scheme
+         and table_version_obj.sort_scheme.keys is not None
+         and len(table_version_obj.sort_scheme.keys) > 0
+     ):
+         raise NotImplementedError(
+             f"write_to_table does not yet support tables with sort keys. "
+             f"Table {namespace}.{table} has sort scheme with "
+             f"{len(table_version_obj.sort_scheme.keys)} sort key(s): "
+             f"{[key.key[0] for key in table_version_obj.sort_scheme.keys]}. "
+             f"Please use the lower-level metastore API for sorted tables."
+         )
+
+
+ def _handle_partition_creation(
+     mode: TableWriteMode,
+     table_exists_flag: bool,
+     delta_type: DeltaType,
+     stream,
+     **kwargs,
+ ) -> Tuple[Any, bool]:  # partition, commit_staged_partition
+     """Handle partition creation/retrieval based on write mode."""
+     if mode == TableWriteMode.REPLACE or not table_exists_flag:
+         # REPLACE mode or new table: Stage a new partition
+         partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
+         # If we're doing UPSERT/DELETE operations, let compaction handle the commit
+         commit_staged_partition = delta_type not in (DeltaType.UPSERT, DeltaType.DELETE)
+         return partition, commit_staged_partition
+     elif delta_type in (DeltaType.UPSERT, DeltaType.DELETE):
+         # UPSERT/DELETE operations: Try to use existing committed partition first
+         partition = _get_storage(**kwargs).get_partition(
+             stream_locator=stream.locator,
+             partition_values=None,
+             **kwargs,
+         )
+         commit_staged_partition = False
+
+         if not partition:
+             # No existing committed partition found, stage a new one
+             partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
+             commit_staged_partition = False  # Let compaction handle the commit
+
+         return partition, commit_staged_partition
+     else:
+         # APPEND mode on existing table: Get existing partition
+         partition = _get_storage(**kwargs).get_partition(
+             stream_locator=stream.locator,
+             partition_values=None,
+             **kwargs,
+         )
+         commit_staged_partition = False
+
+         if not partition:
+             # No existing partition found, create a new one
+             partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
+             commit_staged_partition = True
+
+         return partition, commit_staged_partition
+
+
+ def _convert_numpy_for_schema_validation(
+     data: np.ndarray, schema: Optional[Schema]
+ ) -> Dataset:
+     """Convert NumPy array to Pandas DataFrame with proper column names for schema validation.
+
+     Args:
+         data: NumPy array to convert
+         schema: DeltaCAT Schema object for column naming
+
+     Returns:
+         Pandas DataFrame with proper column names matching schema
+
+     Raises:
+         ValueError: If array has more columns than schema or schema is invalid
+     """
+     if not isinstance(schema, Schema) or not schema.arrow:
+         raise ValueError(
+             f"Expected DeltaCAT schema for Numpy schema validation, but found: {schema}"
+         )
+
+     # Use schema subset matching NumPy array dimensions
+     arrow_schema = schema.arrow
+     num_cols = data.shape[1] if data.ndim > 1 else 1
+
+     if len(arrow_schema) >= num_cols:
+         # Use the first N columns from the schema to match data dimensions
+         subset_fields = [arrow_schema.field(i) for i in range(num_cols)]
+         subset_schema = pa.schema(subset_fields)
+         return to_pandas(data, schema=subset_schema)
+     else:
+         raise ValueError(
+             f"NumPy array has {num_cols} columns but table schema only has {len(arrow_schema)} columns. "
+             f"Cannot write NumPy data with more columns than the table schema supports."
+         )
+
+
+ def _build_entry_index_to_schema_mapping(
+     qualified_deltas: List[Delta], table_version_obj, **kwargs
+ ) -> List[Schema]:
+     """Build a mapping from manifest entry index to schema for reading operations.
+
+     Args:
+         qualified_deltas: List of deltas to process
+         table_version_obj: Table version containing schemas
+         **kwargs: Additional arguments passed to storage operations
+
+     Returns:
+         List mapping each manifest entry index to its corresponding schema
+
+     Raises:
+         ValueError: If a manifest's schema ID is not found in table version schemas
+     """
+     entry_index_to_schema = []
+     for delta in qualified_deltas:
+         if delta.manifest:
+             manifest = delta.manifest
+         else:
+             # Fetch manifest from storage
+             manifest = _get_storage(**kwargs).get_delta_manifest(
+                 delta.locator,
+                 **kwargs,
+             )
+         # Map manifest entry index to schema ID
+         schema_id = manifest.meta.schema_id
+
+         # Find the schema that matches this manifest's schema_id
+         matching_schema = None
+         if table_version_obj.schemas:
+             for schema in table_version_obj.schemas:
+                 if schema.id == schema_id:
+                     matching_schema = schema
+                     break
+
+         if matching_schema is None:
+             available_schema_ids = (
+                 [s.id for s in table_version_obj.schemas]
+                 if table_version_obj.schemas
+                 else []
+             )
+             raise ValueError(
+                 f"Manifest schema ID {schema_id} not found in table version schemas. "
+                 f"Available schema IDs: {available_schema_ids}. "
+             )
+
+         # Add the matching schema for each entry in this manifest
+         for _ in range(len(manifest.entries)):
+             entry_index_to_schema.append(matching_schema)
+
+     return entry_index_to_schema
+
+
+ def _convert_data_if_needed(data: Dataset) -> Dataset:
+     """Convert unsupported data types to supported ones."""
+     if isinstance(data, daft.DataFrame):
+         # Daft DataFrame - convert based on execution mode
+         ctx = daft.context.get_context()
+         runner = ctx.get_or_create_runner()
+         runner_type = runner.name
+
+         if runner_type == "ray":
+             # Running with Ray backend - convert to Ray Dataset
+             return data.to_ray_dataset()
+         else:
+             # Running with local backend - convert to PyArrow Table
+             return data.to_arrow()
+
+     return data
+
+
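A small sketch of the conversion rule in `_convert_data_if_needed` above: under Daft's local runner a DataFrame is materialized to a PyArrow Table, while under the Ray runner it becomes a Ray Dataset. The example below assumes the local runner:

    import daft

    df = daft.from_pydict({"id": [1, 2, 3]})
    # Equivalent to what _convert_data_if_needed returns on the local (non-Ray) runner.
    arrow_table = df.to_arrow()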
+ def _validate_and_coerce_data_against_schema(
+     data: Dataset,
+     schema: Optional[Schema],
+     schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
+     default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
+ ) -> Tuple[Dataset, bool, Optional[Schema]]:
+     """Validate and coerce data against the table schema if schema consistency types are set.
+
+     Args:
+         data: The dataset to validate/coerce
+         schema: The DeltaCAT schema to validate against (optional)
+         schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
+         default_schema_consistency_type: Default consistency type for new fields in AUTO mode
+
+     Returns:
+         Tuple[Dataset, bool, Optional[Schema]]: Validated/coerced data, flag indicating if schema was modified, and updated schema
+
+     Raises:
+         ValueError: If validation fails or coercion is not possible
+     """
+     if not schema:
+         return data, False, None
+
+     validated_data, updated_schema = schema.validate_and_coerce_dataset(
+         data,
+         schema_evolution_mode=schema_evolution_mode,
+         default_schema_consistency_type=default_schema_consistency_type,
+     )
+
+     # Check if schema was modified by comparing with original
+     schema_modified = not updated_schema.equivalent_to(schema, True)
+     # Return updated schema only if it was modified
+     updated_schema = updated_schema if schema_modified else None
+
+     return validated_data, schema_modified, updated_schema
+
+
+ def _validate_reader_compatibility(
+     data: Dataset,
+     content_type: ContentType,
+     supported_reader_types: Optional[List[DatasetType]],
+ ) -> None:
+     """Validate that the data types being written are compatible with all supported reader types.
+
+     Args:
+         data: The dataset to validate
+         content_type: Content type being written
+         supported_reader_types: List of DatasetTypes that must be able to read this data
+
+     Raises:
+         TableValidationError: If any data types would break supported reader compatibility
+     """
+     if not supported_reader_types:
+         return
+
+     # Get the schema from the data
+     schema = get_table_schema(data)
+
+     # Get the dataset type of the current data
+     writer_dataset_type = get_dataset_type(data)
+
+     # PYARROW_PARQUET is equivalent to PYARROW for compatibility
+     writer_type_str = (
+         writer_dataset_type.value
+         if writer_dataset_type != DatasetType.PYARROW_PARQUET
+         else "pyarrow"
+     )
+
+     content_type_str = content_type.value
+
+     # Check each field type for compatibility
+     incompatible_fields = []
+
+     for field in schema:
+         field_name = field.name
+         arrow_type_str = str(field.type)
+
+         # Get the base type name from PyArrow field type
+         base_type_name = get_base_arrow_type_name(field.type)
+
+         # Get compatible readers for this (arrow_type, writer_dataset_type, content_type) combination
+         compatible_readers = get_compatible_readers(
+             base_type_name,
+             writer_type_str,
+             content_type_str,
+         )
+
+         # Check if all supported reader types are compatible
+         for required_reader in supported_reader_types:
+             reader_is_compatible = required_reader in compatible_readers
+
+             # Special case: PYARROW_PARQUET is equivalent to PYARROW for compatibility if we're writing parquet
+             if (
+                 not reader_is_compatible
+                 and content_type == ContentType.PARQUET
+                 and required_reader == DatasetType.PYARROW_PARQUET
+             ):
+                 reader_is_compatible = DatasetType.PYARROW in compatible_readers
+
+             if not reader_is_compatible:
+                 incompatible_fields.append(
+                     {
+                         "field_name": field_name,
+                         "arrow_type": arrow_type_str,
+                         "incompatible_reader": required_reader,
+                         "writer_type": writer_dataset_type,
+                         "content_type": content_type,
+                     }
+                 )
+
+     # Raise error if any incompatibilities found
+     if incompatible_fields:
+         error_details = []
+         for incompatible in incompatible_fields:
+             error_details.append(
+                 f"Field '{incompatible['field_name']}' with type '{incompatible['arrow_type']}' "
+                 f"written by {incompatible['writer_type']} to {incompatible['content_type']} "
+                 f"cannot be read by required reader type {incompatible['incompatible_reader']}. "
+                 f"If you expect this write to succeed and this reader is not required, then it "
+                 f"can be removed from the table's supported reader types property."
+             )
+
+         raise TableValidationError(
+             f"Reader compatibility validation failed. The following fields would break "
+             f"supported reader types:\n" + "\n".join(error_details)
+         )
+
+
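For reference, a sketch of the table property that drives the reader-compatibility check above: `TableProperty.SUPPORTED_READER_TYPES` lists the `DatasetType` readers that every written field type must remain readable by. The property value shown is an assumed example, not a documented default:

    from deltacat.types.media import DatasetType
    from deltacat.types.tables import TableProperty

    # Assumed example value; the real value is read from the table version's properties.
    supported_readers = {
        TableProperty.SUPPORTED_READER_TYPES: [
            DatasetType.PYARROW,
            DatasetType.PYARROW_PARQUET,  # treated as PYARROW when writing parquet
        ]
    }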
947
+ def _stage_commit_and_compact(
948
+ converted_data: Dataset,
949
+ partition,
950
+ delta_type: DeltaType,
951
+ content_type: ContentType,
952
+ commit_staged_partition: bool,
953
+ table_version_obj: TableVersion,
954
+ namespace: str,
955
+ table: str,
956
+ schema: Schema,
957
+ original_fields: Set[str],
958
+ **kwargs,
959
+ ) -> None:
960
+ """Stage and commit delta, then handle compaction if needed."""
961
+ # Remove schema from kwargs to avoid duplicate parameter conflict
962
+ # We explicitly pass the correct schema parameter
963
+ kwargs.pop("schema", None)
964
+
965
+ # Stage a delta with the data
966
+ delta = _get_storage(**kwargs).stage_delta(
967
+ data=converted_data,
968
+ partition=partition,
969
+ delta_type=delta_type,
970
+ content_type=content_type,
971
+ author=ManifestAuthor.of(
972
+ name="deltacat.write_to_table", version=dc.__version__
973
+ ),
974
+ schema=schema,
975
+ **kwargs,
976
+ )
977
+
978
+ delta = _get_storage(**kwargs).commit_delta(delta=delta, **kwargs)
979
+
980
+ if commit_staged_partition:
981
+ _get_storage(**kwargs).commit_partition(partition=partition, **kwargs)
982
+
983
+ # Check compaction trigger decision
984
+ should_compact = _trigger_compaction(
985
+ table_version_obj,
986
+ delta,
987
+ TableReadOptimizationLevel.MAX,
988
+ **kwargs,
989
+ )
990
+ if should_compact:
991
+ # Run V2 compaction session to merge or delete data
992
+ if table_version_obj.schema:
993
+ all_column_names = table_version_obj.schema.arrow.names
994
+ else:
995
+ raise RuntimeError("Table version schema is required to run compaction.")
996
+ _run_compaction_session(
997
+ table_version_obj=table_version_obj,
998
+ partition=partition,
999
+ latest_delta_stream_position=delta.stream_position,
1000
+ namespace=namespace,
1001
+ table=table,
1002
+ original_fields=original_fields,
1003
+ all_column_names=all_column_names,
1004
+ **kwargs,
1005
+ )
1006
+
1007
+
1008
+ def _trigger_compaction(
1009
+ table_version_obj: TableVersion,
1010
+ latest_delta: Optional[Delta],
1011
+ target_read_optimization_level: TableReadOptimizationLevel,
1012
+ **kwargs,
1013
+ ) -> bool:
1014
+ # Import inside function to avoid circular imports
1015
+ from deltacat.compute.compactor.utils import round_completion_reader as rci
1016
+
1017
+ # Extract delta type from latest_delta if available, otherwise default to no compaction
1018
+ if latest_delta is not None:
1019
+ delta_type = latest_delta.type
1020
+ partition_values = latest_delta.partition_locator.partition_values
1021
+ logger.info(
1022
+ f"Using delta type {delta_type} from latest delta {latest_delta.locator}"
1023
+ )
1024
+ else:
1025
+ logger.info(f"No latest delta discovered, defaulting to no compaction.")
1026
+ return False
1027
+
1028
+ if (
1029
+ table_version_obj.read_table_property(TableProperty.READ_OPTIMIZATION_LEVEL)
1030
+ == target_read_optimization_level
1031
+ ):
1032
+ if delta_type == DeltaType.DELETE or delta_type == DeltaType.UPSERT:
1033
+ return True
1034
+ elif delta_type == DeltaType.APPEND:
1035
+ # Get default stream to determine partition locator
1036
+ stream = _get_table_stream(
1037
+ table_version_obj.locator.namespace,
1038
+ table_version_obj.locator.table_name,
1039
+ table_version_obj.locator.table_version,
1040
+ **kwargs,
1041
+ )
1042
+
1043
+ if not stream:
1044
+ return False
1045
+
1046
+ # Use provided partition_values or None for unpartitioned tables
1047
+ partition_locator = PartitionLocator.of(
1048
+ stream_locator=stream.locator,
1049
+ partition_values=partition_values,
1050
+ partition_id=None,
1051
+ )
1052
+
1053
+ # Get round completion info to determine high watermark
1054
+ round_completion_info = rci.read_round_completion_info(
1055
+ source_partition_locator=partition_locator,
1056
+ destination_partition_locator=partition_locator,
1057
+ deltacat_storage=_get_storage(**kwargs),
1058
+ deltacat_storage_kwargs=kwargs,
1059
+ )
1060
+
1061
+ high_watermark = (
1062
+ round_completion_info.high_watermark
1063
+ if round_completion_info
1064
+ and isinstance(round_completion_info.high_watermark, int)
1065
+ else 0
1066
+ )
1067
+
1068
+ # Get all deltas appended since last compaction
1069
+ deltas = _get_storage(**kwargs).list_deltas(
1070
+ namespace=table_version_obj.locator.namespace,
1071
+ table_name=table_version_obj.locator.table_name,
1072
+ table_version=table_version_obj.locator.table_version,
1073
+ partition_values=partition_values,
1074
+ start_stream_position=high_watermark + 1,
1075
+ **kwargs,
1076
+ )
1077
+
1078
+ if not deltas:
1079
+ return False
1080
+
1081
+ # Count deltas appended since last compaction
1082
+ appended_deltas_since_last_compaction = len(deltas)
1083
+ delta_trigger = table_version_obj.read_table_property(
1084
+ TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER
1085
+ )
1086
+ if delta_trigger and appended_deltas_since_last_compaction >= delta_trigger:
1087
+ return True
1088
+
1089
+ # Count files appended since last compaction
1090
+ appended_files_since_last_compaction = 0
1091
+ for delta in deltas:
1092
+ if delta.manifest and delta.manifest.entries:
1093
+ appended_files_since_last_compaction += len(delta.manifest.entries)
1094
+
1095
+ file_trigger = table_version_obj.read_table_property(
1096
+ TableProperty.APPENDED_FILE_COUNT_COMPACTION_TRIGGER
1097
+ )
1098
+ if file_trigger and appended_files_since_last_compaction >= file_trigger:
1099
+ return True
1100
+
1101
+ # Count records appended since last compaction
1102
+ appended_records_since_last_compaction = 0
1103
+ for delta in deltas:
1104
+ if delta.meta and delta.meta.record_count:
1105
+ appended_records_since_last_compaction += delta.meta.record_count
1106
+
1107
+ record_trigger = table_version_obj.read_table_property(
1108
+ TableProperty.APPENDED_RECORD_COUNT_COMPACTION_TRIGGER
1109
+ )
1110
+ if (
1111
+ record_trigger
1112
+ and appended_records_since_last_compaction >= record_trigger
1113
+ ):
1114
+ return True
1115
+ return False
1116
+
1117
+
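# A minimal sketch of the append-path trigger thresholds evaluated above, assuming a
# table created with explicit trigger properties (the table name, namespace, schema,
# and numeric thresholds are all hypothetical):
create_table(
    "events",
    namespace="analytics",
    schema=Schema.of(schema=pa.schema([("id", pa.int64()), ("payload", pa.string())])),
    table_properties={
        TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER: 10,
        TableProperty.APPENDED_FILE_COUNT_COMPACTION_TRIGGER: 100,
        TableProperty.APPENDED_RECORD_COUNT_COMPACTION_TRIGGER: 1_000_000,
    },
)
# With these values, an APPEND delta triggers compaction once 10 deltas, 100 manifest
# entries, or 1,000,000 records accumulate past the last compaction high watermark;
# DELETE and UPSERT deltas trigger compaction immediately at MAX read optimization.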
1118
+ def _get_compaction_primary_keys(table_version_obj: TableVersion) -> set:
1119
+ """Extract primary keys from table schema for compaction."""
1120
+ table_schema = table_version_obj.schema
1121
+ return (
1122
+ set(table_schema.merge_keys)
1123
+ if table_schema and table_schema.merge_keys
1124
+ else set()
1125
+ )
1126
+
1127
+
1128
+ def _get_compaction_hash_bucket_count(
1129
+ partition: Partition, table_version_obj: TableVersion
1130
+ ) -> int:
1131
+ """Determine hash bucket count from previous compaction, table property, or default."""
1132
+ # First check if we have a hash bucket count from previous compaction
1133
+ if (
1134
+ partition.compaction_round_completion_info
1135
+ and partition.compaction_round_completion_info.hash_bucket_count
1136
+ ):
1137
+ hash_bucket_count = partition.compaction_round_completion_info.hash_bucket_count
1138
+ logger.info(
1139
+ f"Using hash bucket count {hash_bucket_count} from previous compaction"
1140
+ )
1141
+ return hash_bucket_count
1142
+
1143
+ # Otherwise use the table property for default compaction hash bucket count
1144
+ hash_bucket_count = table_version_obj.read_table_property(
1145
+ TableProperty.DEFAULT_COMPACTION_HASH_BUCKET_COUNT
1146
+ )
1147
+ logger.info(f"Using hash bucket count {hash_bucket_count} from table property")
1148
+ return hash_bucket_count
1149
+
1150
+
1151
+ def _get_merge_order_sort_keys(table_version_obj: TableVersion):
1152
+ """Extract sort keys from merge_order fields in schema for compaction.
1153
+
1154
+ Args:
1155
+ table_version_obj: The table version containing schema
1156
+
1157
+ Returns:
1158
+ List of SortKey objects from merge_order fields, or None if no merge_order fields are defined
1159
+ """
1160
+ if table_version_obj.schema:
1161
+ return table_version_obj.schema.merge_order_sort_keys()
1162
+ return None
1163
+
1164
+
1165
+ def _create_compaction_params(
1166
+ table_version_obj: TableVersion,
1167
+ partition: Partition,
1168
+ latest_stream_position: int,
1169
+ primary_keys: set,
1170
+ hash_bucket_count: int,
1171
+ original_fields: Set[str],
1172
+ all_column_names: Optional[List[str]],
1173
+ **kwargs,
1174
+ ):
1175
+ """Create compaction parameters for the compaction session."""
1176
+ from deltacat.compute.compactor.model.compact_partition_params import (
1177
+ CompactPartitionParams,
1178
+ )
1179
+
1180
+ # Remove create_table/alter_table kwargs not needed for compaction
1181
+ kwargs.pop("lifecycle_state", None)
1182
+ kwargs.pop("schema", None)
1183
+ kwargs.pop("partition_scheme", None)
1184
+ kwargs.pop("sort_keys", None)
1185
+ kwargs.pop("table_description", None)
1186
+ kwargs.pop("table_version_description", None)
1187
+ kwargs.pop("table_properties", None)
1188
+ kwargs.pop("table_version_properties", None)
1189
+ kwargs.pop("namespace_properties", None)
1190
+ kwargs.pop("content_types", None)
1191
+ kwargs.pop("fail_if_exists", None)
1192
+ kwargs.pop("schema_updates", None)
1193
+ kwargs.pop("partition_updates", None)
1194
+ kwargs.pop("sort_scheme", None)
1195
+
1196
+ table_writer_kwargs = kwargs.pop("table_writer_kwargs", {})
1197
+ table_writer_kwargs["schema"] = table_version_obj.schema
1198
+ table_writer_kwargs["sort_scheme_id"] = table_version_obj.sort_scheme.id
1199
+ deltacat_storage_kwargs = kwargs.pop("deltacat_storage_kwargs", {})
1200
+ deltacat_storage_kwargs["transaction"] = kwargs.get("transaction", None)
1201
+ list_deltas_kwargs = kwargs.pop("list_deltas_kwargs", {})
1202
+ list_deltas_kwargs["transaction"] = kwargs.get("transaction", None)
1203
+
1204
+ return CompactPartitionParams.of(
1205
+ {
1206
+ "catalog": kwargs.get("inner", kwargs.get("catalog")),
1207
+ "source_partition_locator": partition.locator,
1208
+ "destination_partition_locator": partition.locator, # In-place compaction
1209
+ "primary_keys": primary_keys,
1210
+ "last_stream_position_to_compact": latest_stream_position,
1211
+ "deltacat_storage": _get_storage(**kwargs),
1212
+ "deltacat_storage_kwargs": deltacat_storage_kwargs,
1213
+ "list_deltas_kwargs": list_deltas_kwargs,
1214
+ "table_writer_kwargs": table_writer_kwargs,
1215
+ "hash_bucket_count": hash_bucket_count,
1216
+ "records_per_compacted_file": table_version_obj.read_table_property(
1217
+ TableProperty.RECORDS_PER_COMPACTED_FILE,
1218
+ ),
1219
+ "compacted_file_content_type": ContentType.PARQUET,
1220
+ "drop_duplicates": True,
1221
+ "sort_keys": _get_merge_order_sort_keys(table_version_obj),
1222
+ "original_fields": original_fields,
1223
+ "all_column_names": all_column_names,
1224
+ }
1225
+ )
1226
+
1227
+
1228
+ def _run_compaction_session(
1229
+ table_version_obj: TableVersion,
1230
+ partition: Partition,
1231
+ latest_delta_stream_position: int,
1232
+ namespace: str,
1233
+ table: str,
1234
+ original_fields: Set[str],
1235
+ all_column_names: List[str],
1236
+ **kwargs,
1237
+ ) -> None:
1238
+ """
1239
+ Run a V2 compaction session for the given table and partition.
1240
+
1241
+ Args:
1242
+ table_version_obj: The table version object
1243
+ partition: The partition to compact
1244
+ latest_delta_stream_position: Stream position of the latest delta
1245
+ namespace: The table namespace
1246
+ table: The table name
1247
+ original_fields: The original field set for partial UPSERT support
+ all_column_names: All column names in the table version schema
1248
+ **kwargs: Additional arguments including catalog and storage parameters
1249
+ """
1250
+ # Import inside function to avoid circular imports
1251
+ from deltacat.compute.compactor_v2.compaction_session import compact_partition
1252
+
1253
+ try:
1254
+ # Extract compaction configuration
1255
+ primary_keys = _get_compaction_primary_keys(table_version_obj)
1256
+ hash_bucket_count = _get_compaction_hash_bucket_count(
1257
+ partition, table_version_obj
1258
+ )
1259
+
1260
+ # Create compaction parameters
1261
+ compact_partition_params = _create_compaction_params(
1262
+ table_version_obj,
1263
+ partition,
1264
+ latest_delta_stream_position,
1265
+ primary_keys,
1266
+ hash_bucket_count,
1267
+ original_fields=original_fields,
1268
+ all_column_names=all_column_names,
1269
+ **kwargs,
1270
+ )
1271
+
1272
+ # Run V2 compaction session
1273
+ compact_partition(params=compact_partition_params)
1274
+ except Exception as e:
1275
+ logger.error(
1276
+ f"Error during compaction session for {namespace}.{table}, "
1277
+ f"partition {partition.locator}: {e}"
1278
+ )
1279
+ raise
1280
+
1281
+
1282
+ def _get_merge_key_field_names_from_schema(schema) -> List[str]:
1283
+ """Extract merge key field names from a DeltaCAT Schema object.
1284
+
1285
+ Args:
1286
+ schema: DeltaCAT Schema object
1287
+
1288
+ Returns:
1289
+ List of field names that are marked as merge keys
1290
+ """
1291
+ if not schema or not schema.merge_keys:
1292
+ return []
1293
+
1294
+ merge_key_field_names = []
1295
+ field_ids_to_fields = schema.field_ids_to_fields
1296
+
1297
+ for merge_key_id in schema.merge_keys:
1298
+ if merge_key_id in field_ids_to_fields:
1299
+ field = field_ids_to_fields[merge_key_id]
1300
+ merge_key_field_names.append(field.arrow.name)
1301
+
1302
+ return merge_key_field_names
1303
+
1304
+
1305
+ def _set_entry_params_if_needed(
1306
+ mode: TableWriteMode, table_version_obj, kwargs: dict
1307
+ ) -> None:
1308
+ """Automatically set entry_params to merge keys if not already set by user.
1309
+
1310
+ Args:
1311
+ mode: The table write mode
1312
+ table_version_obj: The table version object containing schema
1313
+ kwargs: Keyword arguments dictionary that may contain entry_params
1314
+ """
1315
+ # Only set entry_params for DELETE and MERGE modes
1316
+ if mode not in [TableWriteMode.DELETE, TableWriteMode.MERGE]:
1317
+ return
1318
+
1319
+ # Don't override if user already provided entry_params
1320
+ if "entry_params" in kwargs and kwargs["entry_params"] is not None:
1321
+ return
1322
+
1323
+ # Get schema from table version
1324
+ if not table_version_obj or not table_version_obj.schema:
1325
+ return
1326
+
1327
+ # Extract merge key field names
1328
+ merge_key_field_names = _get_merge_key_field_names_from_schema(
1329
+ table_version_obj.schema
1330
+ )
1331
+
1332
+ if merge_key_field_names:
1333
+ from deltacat.storage import EntryParams
1334
+
1335
+ kwargs["entry_params"] = EntryParams.of(merge_key_field_names)
1336
+
1337
+
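# A small sketch of the helper above: for MERGE (or DELETE) writes against a table
# version whose schema declares merge keys, the kwargs dict gains an EntryParams built
# from those key names. `tv` is assumed to be a TableVersion whose schema defines merge
# keys; the call is a no-op when the caller already supplied entry_params.
write_kwargs = {}
_set_entry_params_if_needed(TableWriteMode.MERGE, tv, write_kwargs)
# write_kwargs["entry_params"] now wraps the merge key field names, if any were found.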
1338
+ def _get_table_stream(namespace: str, table: str, table_version: str, **kwargs):
1339
+ """Helper function to get a stream for a table version."""
1340
+ return _get_storage(**kwargs).get_stream(
1341
+ namespace=namespace,
1342
+ table_name=table,
1343
+ table_version=table_version,
1344
+ **kwargs,
1345
+ )
1346
+
1347
+
1348
+ def _validate_read_table_input(
1349
+ namespace: str,
1350
+ table: str,
1351
+ table_schema: Optional[Schema],
1352
+ table_type: Optional[DatasetType],
1353
+ distributed_dataset_type: Optional[DatasetType],
1354
+ ) -> None:
1355
+ """Validate input parameters for read_table operation."""
1356
+ if (
1357
+ distributed_dataset_type
1358
+ and distributed_dataset_type not in DatasetType.distributed()
1359
+ ):
1360
+ raise ValueError(
1361
+ f"{distributed_dataset_type} is not a valid distributed dataset type. "
1362
+ f"Valid distributed dataset types are: {DatasetType.distributed()}."
1363
+ )
1364
+ if table_type and table_type not in DatasetType.local():
1365
+ raise ValueError(
1366
+ f"{table_type} is not a valid local table type. "
1367
+ f"Valid table types are: {DatasetType.local()}."
1368
+ )
1369
+
1370
+ # For schemaless tables, distributed datasets are not yet supported
1371
+ if table_schema is None and distributed_dataset_type:
1372
+ raise NotImplementedError(
1373
+ f"Distributed dataset reading is not yet supported for schemaless tables. "
1374
+ f"Table '{namespace}.{table}' has no schema, but distributed_dataset_type={distributed_dataset_type} was specified. "
1375
+ f"Please use local storage by setting distributed_dataset_type=None."
1376
+ )
1377
+
1378
+
1379
+ def _get_qualified_deltas_for_read(
1380
+ table: str,
1381
+ namespace: str,
1382
+ table_version: str,
1383
+ partition_filter: Optional[List[Union[Partition, PartitionLocator]]],
1384
+ **kwargs,
1385
+ ) -> List[Delta]:
1386
+ """Get qualified deltas for reading based on partition filter."""
1387
+ logger.info(
1388
+ f"Reading metadata for table={namespace}/{table}/{table_version} "
1389
+ f"with partition_filters={partition_filter}."
1390
+ )
1391
+
1392
+ # Get partition filter if not provided
1393
+ if partition_filter is None:
1394
+ partition_filter = _get_all_committed_partitions(
1395
+ table, namespace, table_version, **kwargs
1396
+ )
1397
+
1398
+ # Get deltas from partitions
1399
+ qualified_deltas = _get_deltas_from_partition_filter(
1400
+ partition_filter=partition_filter,
1401
+ **kwargs,
1402
+ )
1403
+
1404
+ logger.info(
1405
+ f"Total qualified deltas={len(qualified_deltas)} "
1406
+ f"from {len(partition_filter)} partitions."
1407
+ )
1408
+
1409
+ return qualified_deltas
1410
+
1411
+
1412
+ def _get_max_parallelism(
1413
+ max_parallelism: Optional[int],
1414
+ distributed_dataset_type: Optional[DatasetType],
1415
+ ) -> int:
1416
+ """Get the max parallelism for a read operation."""
1417
+ if distributed_dataset_type:
1418
+ max_parallelism = max_parallelism or 100
1419
+ else:
1420
+ # TODO(pdames): Set max parallelism using available resources and dataset size
1421
+ max_parallelism = 1
1422
+ if max_parallelism < 1:
1423
+ raise ValueError(
1424
+ f"max_parallelism must be greater than 0, but got {max_parallelism}"
1425
+ )
1426
+ logger.info(f"Using max_parallelism={max_parallelism} for read operation")
1427
+
1428
+ return max_parallelism
1429
+
1430
+
1431
+ def _handle_schemaless_table_read(
1432
+ qualified_deltas: List[Delta],
1433
+ read_as: DatasetType,
1434
+ **kwargs,
1435
+ ) -> Dataset:
1436
+ """Handle reading schemaless tables by flattening manifest entries."""
1437
+ # Create a PyArrow table for each delta
1438
+ # TODO(pdames): More efficient implementation for tables with millions/billions of entries
1439
+ tables = []
1440
+ for delta in qualified_deltas:
1441
+ # Get the manifest for this delta
1442
+ if delta.manifest:
1443
+ manifest = delta.manifest
1444
+ else:
1445
+ # Fetch manifest from storage
1446
+ manifest = _get_storage(**kwargs).get_delta_manifest(
1447
+ delta.locator,
1448
+ transaction=kwargs.get("transaction"),
1449
+ **kwargs,
1450
+ )
1451
+ # Create flattened table from this delta's manifest
1452
+ table = pa_utils.delta_manifest_to_table(
1453
+ manifest,
1454
+ delta,
1455
+ )
1456
+ tables.append(table)
1457
+
1458
+ # Concatenate all PyArrow tables
1459
+ final_table = pa_utils.concat_tables(tables)
1460
+
1461
+ # Convert from PyArrow to the requested dataset type
1462
+ return from_pyarrow(final_table, read_as)
1463
+
1464
+
1465
+ def _download_and_process_table_data(
1466
+ namespace: str,
1467
+ table: str,
1468
+ qualified_deltas: List[Delta],
1469
+ read_as: DatasetType,
1470
+ max_parallelism: Optional[int],
1471
+ columns: Optional[List[str]],
1472
+ file_path_column: Optional[str],
1473
+ table_version_obj: Optional[TableVersion],
1474
+ **kwargs,
1475
+ ) -> Dataset:
1476
+ """Download delta data and process result based on storage type."""
1477
+
1478
+ # Handle NUMPY read requests by translating to PANDAS internally
1479
+ original_read_as = read_as
1480
+ effective_read_as = read_as
1481
+ if read_as == DatasetType.NUMPY:
1482
+ effective_read_as = DatasetType.PANDAS
1483
+ logger.debug("Translating NUMPY read request to PANDAS for internal processing")
1484
+
1485
+ # Merge deltas and download data
1486
+ if not qualified_deltas:
1487
+ # Return empty table with original read_as type
1488
+ return empty_table(original_read_as)
1489
+
1490
+ # Special handling for non-empty schemaless tables
1491
+ if table_version_obj.schema is None:
1492
+ result = _handle_schemaless_table_read(
1493
+ qualified_deltas,
1494
+ effective_read_as,
1495
+ **kwargs,
1496
+ )
1497
+ # Convert to numpy if original request was for numpy
1498
+ if original_read_as == DatasetType.NUMPY:
1499
+ return _convert_pandas_to_numpy(result)
1500
+ return result
1501
+
1502
+ # Get schemas for each manifest entry
1503
+ entry_index_to_schema = _build_entry_index_to_schema_mapping(
1504
+ qualified_deltas, table_version_obj, **kwargs
1505
+ )
1506
+ # Standard non-empty schema table read path - merge deltas and download data
1507
+ merged_delta = Delta.merge_deltas(qualified_deltas)
1508
+
1509
+ # Convert read parameters to download parameters
1510
+ table_type = (
1511
+ effective_read_as
1512
+ if effective_read_as in DatasetType.local()
1513
+ else (kwargs.pop("table_type", None) or DatasetType.PYARROW)
1514
+ )
1515
+ distributed_dataset_type = (
1516
+ effective_read_as if effective_read_as in DatasetType.distributed() else None
1517
+ )
1518
+
1519
+ # Validate input parameters
1520
+ _validate_read_table_input(
1521
+ namespace,
1522
+ table,
1523
+ table_version_obj.schema,
1524
+ table_type,
1525
+ distributed_dataset_type,
1526
+ )
1527
+
1528
+ # Determine max parallelism
1529
+ max_parallelism = _get_max_parallelism(
1530
+ max_parallelism,
1531
+ distributed_dataset_type,
1532
+ )
1533
+ # Filter out parameters that are already passed as keyword arguments
1534
+ # to avoid "multiple values for argument" errors
1535
+ filtered_kwargs = {
1536
+ k: v
1537
+ for k, v in kwargs.items()
1538
+ if k
1539
+ not in [
1540
+ "delta_like",
1541
+ "table_type",
1542
+ "storage_type",
1543
+ "max_parallelism",
1544
+ "columns",
1545
+ "distributed_dataset_type",
1546
+ "file_path_column",
1547
+ ]
1548
+ }
1549
+ result = _get_storage(**kwargs).download_delta(
1550
+ merged_delta,
1551
+ table_type=effective_read_as,
1552
+ storage_type=StorageType.DISTRIBUTED
1553
+ if distributed_dataset_type
1554
+ else StorageType.LOCAL,
1555
+ max_parallelism=max_parallelism,
1556
+ columns=columns,
1557
+ distributed_dataset_type=distributed_dataset_type,
1558
+ file_path_column=file_path_column,
1559
+ **filtered_kwargs,
1560
+ )
1561
+
1562
+ # Handle local storage table concatenation and PYARROW_PARQUET lazy materialization
1563
+ if not distributed_dataset_type and table_type and isinstance(result, list):
1564
+ if table_type == DatasetType.PYARROW_PARQUET:
1565
+ # For PYARROW_PARQUET, preserve lazy materialization:
1566
+ return result[0] if len(result) == 1 else result
1567
+ else:
1568
+ # For other types, perform normal concatenation
1569
+ result = _handle_local_table_concatenation(
1570
+ result,
1571
+ table_type,
1572
+ table_version_obj.schema,
1573
+ entry_index_to_schema,
1574
+ file_path_column,
1575
+ columns,
1576
+ )
1577
+ # Convert to numpy if original request was for numpy
1578
+ if original_read_as == DatasetType.NUMPY:
1579
+ return _convert_pandas_to_numpy(result)
1580
+
1581
+ return result
1582
+
1583
+
1584
+ def _convert_pandas_to_numpy(dataset: Dataset):
1585
+ """Convert pandas DataFrame to numpy ndarray."""
1586
+ if not isinstance(dataset, pd.DataFrame):
1587
+ raise ValueError(f"Expected pandas DataFrame but found {type(dataset)}")
1588
+ return dataset.to_numpy()
1589
+
1590
+
1591
+ def _coerce_dataset_to_schema(
1592
+ dataset: Dataset, target_schema: pa.Schema, manifest_entry_schema: Schema
1593
+ ) -> Dataset:
1594
+ """Coerce a dataset to match the target PyArrow schema using DeltaCAT Schema.coerce method."""
1595
+ # Convert target PyArrow schema to DeltaCAT schema and use its coerce method
1596
+ deltacat_schema = Schema.of(schema=target_schema)
1597
+ return deltacat_schema.coerce(dataset, manifest_entry_schema)
1598
+
1599
+
1600
+ def _coerce_results_to_schema(
1601
+ results: Dataset, target_schema: pa.Schema, entry_index_to_schema: List[Schema]
1602
+ ) -> List[Dataset]:
1603
+ """Coerce all table results to match the target schema."""
1604
+ coerced_results = []
1605
+ for i, table_result in enumerate(results):
1606
+ coerced_result = _coerce_dataset_to_schema(
1607
+ table_result, target_schema, entry_index_to_schema[i]
1608
+ )
1609
+ coerced_results.append(coerced_result)
1610
+ logger.debug(f"Coerced table {i} to unified schema")
1611
+ return coerced_results
1612
+
1613
+
1614
+ def _create_target_schema(
1615
+ arrow_schema: pa.Schema,
1616
+ columns: Optional[List[str]] = None,
1617
+ file_path_column: Optional[str] = None,
1618
+ ) -> pa.Schema:
1619
+ """Create target schema for concatenation with optional column selection and file_path_column."""
1620
+ if columns is not None:
1621
+ # Column selection - use only specified columns
1622
+ field_map = {field.name: field for field in arrow_schema}
1623
+ selected_fields = []
1624
+
1625
+ for col_name in columns:
1626
+ if col_name in field_map:
1627
+ selected_fields.append(field_map[col_name])
1628
+ arrow_schema = pa.schema(selected_fields)
1629
+ if file_path_column and file_path_column not in arrow_schema.names:
1630
+ arrow_schema = arrow_schema.append(pa.field(file_path_column, pa.string()))
1631
+ return arrow_schema
1632
+
1633
+
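# A small illustration of _create_target_schema: column selection keeps only the
# requested fields (in the requested order), and file_path_column is appended as a
# string field when not already present. The field names here are hypothetical.
_base_schema = pa.schema(
    [("id", pa.int64()), ("name", pa.string()), ("ts", pa.timestamp("us"))]
)
_target = _create_target_schema(_base_schema, columns=["name", "id"], file_path_column="path")
# _target.names == ["name", "id", "path"]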
1634
+ def _create_entry_schemas_for_concatenation(
1635
+ entry_index_to_schema: List[Schema],
1636
+ columns: Optional[List[str]] = None,
1637
+ file_path_column: Optional[str] = None,
1638
+ ) -> List[Schema]:
1639
+ """Create entry schemas for concatenation, optionally filtered by column selection."""
1640
+ if columns is None:
1641
+ # No column selection - return original schemas as-is
1642
+ return entry_index_to_schema
1643
+
1644
+ # Column selection - filter each entry schema
1645
+ modified_schemas = []
1646
+ for entry_schema in entry_index_to_schema:
1647
+ if entry_schema and entry_schema.arrow:
1648
+ filtered_schema = _create_target_schema(
1649
+ entry_schema.arrow, columns, file_path_column
1650
+ )
1651
+ modified_schemas.append(Schema.of(schema=filtered_schema))
1652
+ else:
1653
+ modified_schemas.append(entry_schema)
1654
+
1655
+ return modified_schemas
1656
+
1657
+
1658
+ def _handle_local_table_concatenation(
1659
+ results: Dataset,
1660
+ table_type: DatasetType,
1661
+ table_schema: Optional[Schema],
1662
+ entry_index_to_schema: List[Schema],
1663
+ file_path_column: Optional[str] = None,
1664
+ columns: Optional[List[str]] = None,
1665
+ ) -> Dataset:
1666
+ """Handle concatenation of local table results with schema coercion."""
1667
+ logger.debug(f"Target table schema for concatenation: {table_schema}")
1668
+
1669
+ # Create target schema for coercion, respecting column selection
1670
+ target_schema = _create_target_schema(table_schema.arrow, columns, file_path_column)
1671
+ logger.debug(f"Created target schema: {target_schema.names}")
1672
+
1673
+ # Filter entry schemas to match column selection and file_path_column
1674
+ modified_entry_schemas = _create_entry_schemas_for_concatenation(
1675
+ entry_index_to_schema, columns, file_path_column
1676
+ )
1677
+
1678
+ # Coerce results to unified schema
1679
+ coerced_results = _coerce_results_to_schema(
1680
+ results, target_schema, modified_entry_schemas
1681
+ )
1682
+
1683
+ # Second step: concatenate the coerced results
1684
+ logger.debug(
1685
+ f"Concatenating {len(coerced_results)} local tables of type {table_type} with unified schemas"
1686
+ )
1687
+ concatenated_result = concat_tables(coerced_results, table_type)
1688
+ logger.debug(f"Concatenation complete, result type: {type(concatenated_result)}")
1689
+ return concatenated_result
1690
+
1691
+
1692
+ def read_table(
1693
+ table: str,
1694
+ *args,
1695
+ namespace: Optional[str] = None,
1696
+ table_version: Optional[str] = None,
1697
+ read_as: DatasetType = DatasetType.DAFT,
1698
+ partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
1699
+ max_parallelism: Optional[int] = None,
1700
+ columns: Optional[List[str]] = None,
1701
+ file_path_column: Optional[str] = None,
1702
+ transaction: Optional[Transaction] = None,
1703
+ **kwargs,
1704
+ ) -> Dataset:
1705
+ """Read a table into a dataset.
1706
+
1707
+ Args:
1708
+ table: Name of the table to read.
1709
+ namespace: Optional namespace of the table. Uses default if not specified.
1710
+ table_version: Optional specific version of the table to read.
1711
+ read_as: Dataset type to use for reading table files. Defaults to DatasetType.DAFT.
1712
+ partition_filter: Optional list of partitions to read from.
1713
+ max_parallelism: Optional maximum parallelism for data download. Currently fixed at
1714
+ 1 for local dataset type reads (i.e., members of DatasetType.local()) and defaults
1715
+ to 100 for distributed dataset type reads (i.e., members of DatasetType.distributed()).
1716
+ columns: Optional list of columns to include in the result.
1717
+ file_path_column: Optional column name to add file paths to the result.
1718
+ transaction: Optional transaction to chain this read operation to. If provided, uncommitted
1719
+ changes from the transaction will be visible to this read operation.
1720
+ **kwargs: Additional keyword arguments.
1721
+
1722
+ Returns:
1723
+ Dataset containing the table data.
1724
+ """
1725
+ # Set up transaction handling
1726
+ read_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1727
+ kwargs["transaction"] = read_transaction
1728
+
1729
+ try:
1730
+ # Resolve namespace and get table metadata
1731
+ namespace = namespace or default_namespace()
1732
+
1733
+ table_version_obj = _get_latest_active_or_given_table_version(
1734
+ namespace=namespace,
1735
+ table_name=table,
1736
+ table_version=table_version,
1737
+ **kwargs,
1738
+ )
1739
+
1740
+ # Get partitions and deltas to read
1741
+ qualified_deltas = _get_qualified_deltas_for_read(
1742
+ table,
1743
+ namespace,
1744
+ table_version_obj.table_version,
1745
+ partition_filter,
1746
+ **kwargs,
1747
+ )
1748
+
1749
+ # Download and process the data
1750
+ # TODO(pdames): Remove once we implement a custom SerDe for pa.ParquetFile
1751
+ if read_as == DatasetType.PYARROW_PARQUET:
1752
+ max_parallelism = 1
1753
+ logger.warning(
1754
+ f"Forcing max_parallelism to 1 for PyArrow Parquet reads to avoid serialization errors."
1755
+ )
1756
+ result = _download_and_process_table_data(
1757
+ namespace,
1758
+ table,
1759
+ qualified_deltas,
1760
+ read_as,
1761
+ max_parallelism,
1762
+ columns,
1763
+ file_path_column,
1764
+ table_version_obj,
1765
+ **kwargs,
1766
+ )
1767
+ return result
1768
+ except Exception as e:
1769
+ # If any error occurs, the transaction remains uncommitted
1770
+ commit_transaction = False
1771
+ logger.error(f"Error during read_table: {e}")
1772
+ raise
1773
+ finally:
1774
+ if commit_transaction:
1775
+ # Seal the interactive transaction to commit all operations atomically
1776
+ read_transaction.seal()
1777
+
1778
+
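# A minimal read_table usage sketch, assuming an initialized catalog that already
# contains the (hypothetical) table "events" in namespace "analytics":
df = read_table(
    "events",
    namespace="analytics",
    read_as=DatasetType.PANDAS,
    columns=["user_id", "event_time"],
    file_path_column="source_file",
)
# `df` is a pandas DataFrame limited to the requested columns plus "source_file".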
1779
+ def alter_table(
1780
+ table: str,
1781
+ *args,
1782
+ namespace: Optional[str] = None,
1783
+ table_version: Optional[str] = None,
1784
+ lifecycle_state: Optional[LifecycleState] = None,
1785
+ schema_updates: Optional[SchemaUpdate] = None,
1786
+ partition_updates: Optional[Dict[str, Any]] = None,
1787
+ sort_scheme: Optional[SortScheme] = None,
1788
+ table_description: Optional[str] = None,
1789
+ table_version_description: Optional[str] = None,
1790
+ table_properties: Optional[TableProperties] = None,
1791
+ table_version_properties: Optional[TableVersionProperties] = None,
1792
+ transaction: Optional[Transaction] = None,
1793
+ **kwargs,
1794
+ ) -> None:
1795
+ """Alter deltacat table/table_version definition.
1796
+
1797
+ Modifies various aspects of a table's metadata including lifecycle state,
1798
+ schema, partitioning, sort keys, description, and properties.
1799
+
1800
+ Args:
1801
+ table: Name of the table to alter.
1802
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
1803
+ table_version: Optional specific version of the table to alter. Defaults to the latest active version.
1804
+ lifecycle_state: New lifecycle state for the table.
1805
+ schema_updates: Schema updates to apply.
1806
+ partition_updates: Partition scheme updates to apply.
1807
+ sort_scheme: New sort scheme.
1808
+ table_description: New description for the table.
1809
+ table_version_description: New description for the table version. Defaults to `table_description` if not specified.
1810
+ table_properties: New table properties.
1811
+ table_version_properties: New table version properties. Defaults to the current parent table properties if not specified.
1812
+ transaction: Optional transaction to use. If None, creates a new transaction.
1813
+
1814
+ Returns:
1815
+ None
1816
+
1817
+ Raises:
1818
+ TableNotFoundError: If the table does not already exist.
1819
+ TableVersionNotFoundError: If the specified table version or active table version does not exist.
1820
+ """
1821
+ resolved_table_properties = None
1822
+ if table_properties is not None:
1823
+ resolved_table_properties = _add_default_table_properties(table_properties)
1824
+ _validate_table_properties(resolved_table_properties)
1825
+
1826
+ namespace = namespace or default_namespace()
1827
+
1828
+ # Set up transaction handling
1829
+ alter_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1830
+ kwargs["transaction"] = alter_transaction
1831
+
1832
+ try:
1833
+ if partition_updates:
1834
+ raise NotImplementedError("Partition updates are not yet supported.")
1835
+ if sort_scheme:
1836
+ raise NotImplementedError("Sort scheme updates are not yet supported.")
1837
+
1838
+ new_table: Table = _get_storage(**kwargs).update_table(
1839
+ *args,
1840
+ namespace=namespace,
1841
+ table_name=table,
1842
+ description=table_description,
1843
+ properties=resolved_table_properties,
1844
+ **kwargs,
1845
+ )
1846
+
1847
+ if table_version is None:
1848
+ table_version: Optional[TableVersion] = _get_storage(
1849
+ **kwargs
1850
+ ).get_latest_active_table_version(namespace, table, **kwargs)
1851
+ if table_version is None:
1852
+ raise TableVersionNotFoundError(
1853
+ f"No active table version found for table {namespace}.{table}. "
1854
+ "Please specify a table_version parameter."
1855
+ )
1856
+ else:
1857
+ requested_table_version = table_version
+ table_version = _get_storage(**kwargs).get_table_version(
1858
+ namespace, table, requested_table_version, **kwargs
1859
+ )
1860
+ if table_version is None:
1861
+ raise TableVersionNotFoundError(
1862
+ f"Table version '{requested_table_version}' not found for table {namespace}.{table}"
1863
+ )
1864
+
1865
+ # Get table properties for schema evolution
1866
+ schema_evolution_mode = table_version.read_table_property(
1867
+ TableProperty.SCHEMA_EVOLUTION_MODE
1868
+ )
1869
+ if schema_updates and schema_evolution_mode == SchemaEvolutionMode.DISABLED:
1870
+ raise TableValidationError(
1871
+ "Schema evolution is disabled for this table. Please enable schema evolution or remove schema updates."
1872
+ )
1873
+
1874
+ # Only update table version properties if they are explicitly provided
1875
+ resolved_tv_properties = None
1876
+ if table_version_properties is not None:
1877
+ # inherit properties from the parent table if not specified
1878
+ default_tv_properties = new_table.properties
1879
+ if table_version.schema is None:
1880
+ # schemaless tables don't validate reader compatibility by default
1881
+ default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
1882
+ resolved_tv_properties = _add_default_table_properties(
1883
+ table_version_properties,
1884
+ default_tv_properties,
1885
+ )
1886
+ _validate_table_properties(resolved_tv_properties)
1887
+
1888
+ # Apply schema updates if provided
1889
+ updated_schema = None
1890
+ if schema_updates is not None:
1891
+ # Get the current schema from the table version
1892
+ current_schema = table_version.schema
1893
+ if current_schema != schema_updates.base_schema:
1894
+ raise ValueError(
1895
+ f"Schema updates are not compatible with the current schema for table `{namespace}.{table}`. Current schema: {current_schema}, Schema update base schema: {schema_updates.base_schema}"
1896
+ )
1897
+
1898
+ # Apply all the updates to get the final schema
1899
+ updated_schema = schema_updates.apply()
1900
+
1901
+ _get_storage(**kwargs).update_table_version(
1902
+ *args,
1903
+ namespace=namespace,
1904
+ table_name=table,
1905
+ table_version=table_version.id,
1906
+ lifecycle_state=lifecycle_state,
1907
+ description=table_version_description or table_description,
1908
+ schema=updated_schema,
1909
+ properties=resolved_tv_properties, # This will be None if table_version_properties was not provided
1910
+ **kwargs,
1911
+ )
1912
+
1913
+ except Exception as e:
1914
+ # If any error occurs, the transaction remains uncommitted
1915
+ commit_transaction = False
1916
+ logger.error(f"Error during alter_table: {e}")
1917
+ raise
1918
+ finally:
1919
+ if commit_transaction:
1920
+ # Seal the interactive transaction to commit all operations atomically
1921
+ alter_transaction.seal()
1922
+
1923
+
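# A minimal alter_table usage sketch (hypothetical names), updating only the table
# (and, by default, table version) description while leaving schema, partitioning,
# sort order, and properties untouched:
alter_table(
    "events",
    namespace="analytics",
    table_description="Clickstream events, compacted on append thresholds",
)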
1924
+ def _add_default_table_properties(
1925
+ table_properties: Optional[TableProperties],
1926
+ default_table_properties: TableProperties = TablePropertyDefaultValues,
1927
+ ) -> TableProperties:
1928
+ if table_properties is None:
1929
+ table_properties = {}
1930
+ for k, v in default_table_properties.items():
1931
+ if k not in table_properties:
1932
+ table_properties[k] = v
1933
+ return table_properties
1934
+
1935
+
1936
+ def _validate_table_properties(
1937
+ table_properties: TableProperties,
1938
+ ) -> None:
1939
+ read_optimization_level = table_properties.get(
1940
+ TableProperty.READ_OPTIMIZATION_LEVEL,
1941
+ TablePropertyDefaultValues[TableProperty.READ_OPTIMIZATION_LEVEL],
1942
+ )
1943
+ if read_optimization_level != TableReadOptimizationLevel.MAX:
1944
+ raise NotImplementedError(
1945
+ f"Table read optimization level `{read_optimization_level} is not yet supported. Please use {TableReadOptimizationLevel.MAX}"
1946
+ )
1947
+
1948
+
1949
+ def create_table(
1950
+ table: str,
1951
+ *args,
1952
+ namespace: Optional[str] = None,
1953
+ table_version: Optional[str] = None,
1954
+ lifecycle_state: Optional[LifecycleState] = LifecycleState.ACTIVE,
1955
+ schema: Optional[Schema] = None,
1956
+ partition_scheme: Optional[PartitionScheme] = None,
1957
+ sort_keys: Optional[SortScheme] = None,
1958
+ table_description: Optional[str] = None,
1959
+ table_version_description: Optional[str] = None,
1960
+ table_properties: Optional[TableProperties] = None,
1961
+ table_version_properties: Optional[TableVersionProperties] = None,
1962
+ namespace_properties: Optional[NamespaceProperties] = None,
1963
+ content_types: Optional[List[ContentType]] = None,
1964
+ fail_if_exists: bool = True,
1965
+ transaction: Optional[Transaction] = None,
1966
+ **kwargs,
1967
+ ) -> TableDefinition:
1968
+ """Create an empty table in the catalog.
1969
+
1970
+ If a namespace isn't provided, the table will be created within the default deltacat namespace.
1971
+ Additionally if the provided namespace does not exist, it will be created for you.
1972
+
1973
+ Args:
1974
+ table: Name of the table to create.
1975
+ namespace: Optional namespace for the table. Uses default namespace if not specified.
1976
+ table_version: Optional version identifier for the table.
1977
+ lifecycle_state: Lifecycle state of the new table. Defaults to ACTIVE.
1978
+ schema: Schema definition for the table.
1979
+ partition_scheme: Optional partitioning scheme for the table.
1980
+ sort_keys: Optional sort keys for the table.
1981
+ table_description: Optional description of the table.
1982
+ table_version_description: Optional description for the table version.
1983
+ table_properties: Optional properties for the table.
1984
+ table_version_properties: Optional properties for the table version. Defaults to the current parent table properties if not specified.
1985
+ namespace_properties: Optional properties for the namespace if it needs to be created.
1986
+ content_types: Optional list of allowed content types for the table.
1987
+ fail_if_exists: If True, raises an error if table already exists. If False, returns existing table.
1988
+ transaction: Optional transaction to use. If None, creates a new transaction.
1989
+
1990
+ Returns:
1991
+ TableDefinition object for the created or existing table.
1992
+
1993
+ Raises:
1994
+ TableAlreadyExistsError: If the table already exists and fail_if_exists is True.
1995
+ NamespaceNotFoundError: If the provided namespace does not exist.
1996
+ """
1997
+ resolved_table_properties = _add_default_table_properties(table_properties)
1998
+ # Note: resolved_tv_properties will be set after checking existing table
1999
+
2000
+ namespace = namespace or default_namespace()
2001
+
2002
+ # Set up transaction handling
2003
+ create_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2004
+ kwargs["transaction"] = create_transaction
2005
+
2006
+ try:
2007
+ existing_table = (
2008
+ get_table(
2009
+ table,
2010
+ namespace=namespace,
2011
+ table_version=table_version,
2012
+ *args,
2013
+ **kwargs,
2014
+ )
2015
+ if "existing_table_definition" not in kwargs
2016
+ else kwargs["existing_table_definition"]
2017
+ )
2018
+ if existing_table is not None:
2019
+ if existing_table.table_version and existing_table.stream:
2020
+ if fail_if_exists:
2021
+ table_identifier = (
2022
+ f"{namespace}.{table}"
2023
+ if not table_version
2024
+ else f"{namespace}.{table}.{table_version}"
2025
+ )
2026
+ raise TableAlreadyExistsError(
2027
+ f"Table {table_identifier} already exists"
2028
+ )
2029
+ return existing_table
2030
+ # the table exists but the table version doesn't - inherit the existing table properties
2031
+ # Also ensure table properties are inherited when not explicitly provided
2032
+ if table_properties is None:
2033
+ resolved_table_properties = existing_table.table.properties
2034
+
2035
+ # Set up table version properties based on existing table or explicit properties
2036
+ default_tv_properties = resolved_table_properties
2037
+ if schema is None:
2038
+ default_tv_properties = dict(
2039
+ default_tv_properties
2040
+ ) # Make a copy to avoid modifying original
2041
+ default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
2042
+ resolved_tv_properties = _add_default_table_properties(
2043
+ table_version_properties, default_tv_properties
2044
+ )
2045
+ else:
2046
+ # create the namespace if it doesn't exist
2047
+ if not namespace_exists(namespace, **kwargs):
2048
+ create_namespace(
2049
+ namespace=namespace,
2050
+ properties=namespace_properties,
2051
+ *args,
2052
+ **kwargs,
2053
+ )
2054
+
2055
+ # Set up table version properties for new table
2056
+ default_tv_properties = resolved_table_properties
2057
+ if schema is None:
2058
+ default_tv_properties = dict(
2059
+ default_tv_properties
2060
+ ) # Make a copy to avoid modifying original
2061
+ default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
2062
+ resolved_tv_properties = _add_default_table_properties(
2063
+ table_version_properties, default_tv_properties
2064
+ )
2065
+
2066
+ _validate_table_properties(resolved_tv_properties)
2067
+
2068
+ (table, table_version, stream) = _get_storage(**kwargs).create_table_version(
2069
+ namespace=namespace,
2070
+ table_name=table,
2071
+ table_version=table_version,
2072
+ schema=schema,
2073
+ partition_scheme=partition_scheme,
2074
+ sort_keys=sort_keys,
2075
+ table_version_description=table_version_description
2076
+ if table_version_description is not None
2077
+ else table_description,
2078
+ table_description=table_description,
2079
+ table_properties=resolved_table_properties,
2080
+ table_version_properties=resolved_tv_properties,
2081
+ lifecycle_state=lifecycle_state or LifecycleState.ACTIVE,
2082
+ supported_content_types=content_types,
2083
+ *args,
2084
+ **kwargs,
2085
+ )
2086
+
2087
+ result = TableDefinition.of(
2088
+ table=table,
2089
+ table_version=table_version,
2090
+ stream=stream,
2091
+ )
2092
+
2093
+ return result
2094
+
2095
+ except Exception as e:
2096
+ # If any error occurs, the transaction remains uncommitted
2097
+ commit_transaction = False
2098
+ logger.error(f"Error during create_table: {e}")
2099
+ raise
2100
+ finally:
2101
+ if commit_transaction:
2102
+ # Seal the interactive transaction to commit all operations atomically
2103
+ create_transaction.seal()
2104
+
2105
+
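# A minimal create_table usage sketch (hypothetical names), assuming Schema.of accepts
# a plain pyarrow schema as it is used elsewhere in this module; fail_if_exists=False
# returns the existing definition instead of raising TableAlreadyExistsError:
definition = create_table(
    "events",
    namespace="analytics",
    schema=Schema.of(schema=pa.schema([("id", pa.int64()), ("payload", pa.string())])),
    fail_if_exists=False,
)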
2106
+ def drop_table(
2107
+ table: str,
2108
+ *args,
2109
+ namespace: Optional[str] = None,
2110
+ table_version: Optional[str] = None,
2111
+ purge: bool = False,
2112
+ transaction: Optional[Transaction] = None,
2113
+ **kwargs,
2114
+ ) -> None:
2115
+ """Drop a table from the catalog and optionally purges underlying data.
2116
+
2117
+ Args:
2118
+ table: Name of the table to drop.
2119
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
2120
+ table_version: Optional table version of the table to drop. If not specified, the parent table of all
2121
+ table versions will be dropped.
2122
+ purge: If True, permanently delete the table data. If False, only remove from catalog.
2123
+ transaction: Optional transaction to use. If None, creates a new transaction.
2124
+
2125
+ Returns:
2126
+ None
2127
+
2128
+ Raises:
2129
+ TableNotFoundError: If the table does not exist.
2130
+
2131
+ TODO: Honor purge once garbage collection is implemented.
2132
+ TODO: Drop table version if specified, possibly create a delete_table_version api.
2133
+ """
2134
+ if purge:
2135
+ raise NotImplementedError("Purge flag is not currently supported.")
2136
+
2137
+ namespace = namespace or default_namespace()
2138
+
2139
+ # Set up transaction handling
2140
+ drop_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2141
+ kwargs["transaction"] = drop_transaction
2142
+
2143
+ try:
2144
+ if not table_version:
2145
+ _get_storage(**kwargs).delete_table(
2146
+ namespace=namespace,
2147
+ table_name=table,
2148
+ purge=purge,
2149
+ *args,
2150
+ **kwargs,
2151
+ )
2152
+ else:
2153
+ _get_storage(**kwargs).update_table_version(
2154
+ namespace=namespace,
2155
+ table_name=table,
2156
+ table_version=table_version,
2157
+ lifecycle_state=LifecycleState.DELETED,
2158
+ *args,
2159
+ **kwargs,
2160
+ )
2161
+
2162
+ except Exception as e:
2163
+ # If any error occurs, the transaction remains uncommitted
2164
+ commit_transaction = False
2165
+ logger.error(f"Error during drop_table: {e}")
2166
+ raise
2167
+ finally:
2168
+ if commit_transaction:
2169
+ # Seal the interactive transaction to commit all operations atomically
2170
+ drop_transaction.seal()
2171
+
2172
+
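# A minimal drop_table usage sketch (hypothetical names). Omitting table_version drops
# the parent table from the catalog; passing one only marks that version DELETED.
# purge=True is rejected until garbage collection is implemented.
drop_table("events", namespace="analytics")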
2173
+ def refresh_table(
2174
+ table: str,
2175
+ *args,
2176
+ namespace: Optional[str] = None,
2177
+ table_version: Optional[str] = None,
2178
+ transaction: Optional[Transaction] = None,
2179
+ **kwargs,
2180
+ ) -> None:
2181
+ """Refresh metadata cached on the Ray cluster for the given table.
2182
+
2183
+ Args:
2184
+ table: Name of the table to refresh.
2185
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
2186
+ table_version: Optional specific version of the table to refresh.
2187
+ transaction: Optional transaction to use. If None, creates a new transaction.
2188
+
2189
+ Returns:
2190
+ None
2191
+ """
2192
+ raise NotImplementedError("refresh_table not implemented")
2193
+
2194
+
2195
+ def list_tables(
2196
+ *args,
2197
+ namespace: Optional[str] = None,
2198
+ table: Optional[str] = None,
2199
+ transaction: Optional[Transaction] = None,
2200
+ **kwargs,
2201
+ ) -> ListResult[TableDefinition]:
2202
+ """List a page of table definitions.
2203
+
2204
+ Args:
2205
+ namespace: Optional namespace to list tables from. Uses default namespace if not specified.
2206
+ table: Optional table to list its table versions. If not specified, lists the latest active version of each table in the namespace.
2207
+ transaction: Optional transaction to use for reading. If provided, will see uncommitted changes.
2208
+
2209
+ Returns:
2210
+ ListResult containing TableDefinition objects for tables in the namespace.
2211
+ """
2212
+ namespace = namespace or default_namespace()
2213
+
2214
+ # Set up transaction handling
2215
+ list_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2216
+ kwargs["transaction"] = list_transaction
2217
+
2218
+ try:
2219
+ if not table:
2220
+ tables = _get_storage(**kwargs).list_tables(
2221
+ namespace=namespace,
2222
+ *args,
2223
+ **kwargs,
2224
+ )
2225
+ table_definitions = [
2226
+ get_table(table.table_name, namespace=namespace, *args, **kwargs)
2227
+ for table in tables.all_items()
2228
+ ]
2229
+ else:
2230
+ table_versions = _get_storage(**kwargs).list_table_versions(
2231
+ namespace=namespace,
2232
+ table_name=table,
2233
+ *args,
2234
+ **kwargs,
2235
+ )
2236
+ table_definitions = [
2237
+ get_table(
2238
+ table,
2239
+ namespace=namespace,
2240
+ table_version=table_version.id,
2241
+ *args,
2242
+ **kwargs,
2243
+ )
2244
+ for table_version in table_versions.all_items()
2245
+ ]
2246
+
2247
+ result = ListResult(items=table_definitions)
2248
+
2249
+ return result
2250
+
2251
+ except Exception as e:
2252
+ # If any error occurs, the transaction remains uncommitted
2253
+ commit_transaction = False
2254
+ logger.error(f"Error during list_tables: {e}")
2255
+ raise
2256
+ finally:
2257
+ if commit_transaction:
2258
+ # Seal the interactive transaction to commit all operations atomically
2259
+ list_transaction.seal()
2260
+
2261
+
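# A minimal list_tables usage sketch: iterate the latest active version of every table
# in a (hypothetical) namespace via the returned ListResult.
for table_definition in list_tables(namespace="analytics").all_items():
    print(table_definition.table.table_name)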
2262
+ def get_table(
2263
+ table: str,
2264
+ *args,
2265
+ namespace: Optional[str] = None,
2266
+ table_version: Optional[str] = None,
2267
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
2268
+ transaction: Optional[Transaction] = None,
2269
+ **kwargs,
2270
+ ) -> Optional[TableDefinition]:
2271
+ """Get table definition metadata.
2272
+
2273
+ Args:
2274
+ table: Name of the table to retrieve.
2275
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
2276
+ table_version: Optional specific version of the table to retrieve. Defaults to the latest active version.
2277
+ stream_format: Optional stream format to retrieve. Defaults to DELTACAT.
2278
+ transaction: Optional transaction to use. If None, creates a new transaction.
2279
+
2280
+ Returns:
2281
+ Deltacat TableDefinition if the table exists, None otherwise. The table definition's table version will be
2282
+ None if the requested version is not found. The table definition's stream will be None if the requested stream
2283
+ format is not found.
2284
+ """
2285
+ namespace = namespace or default_namespace()
2286
+
2287
+ # Set up transaction handling
2288
+ get_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2289
+ kwargs["transaction"] = get_transaction
2290
+
2291
+ try:
2292
+ table_obj: Optional[Table] = _get_storage(**kwargs).get_table(
2293
+ table_name=table,
2294
+ namespace=namespace,
2295
+ *args,
2296
+ **kwargs,
2297
+ )
2298
+
2299
+ if table_obj is None:
2300
+ return None
2301
+
2302
+ table_version_obj: Optional[TableVersion] = _get_storage(
2303
+ **kwargs
2304
+ ).get_table_version(
2305
+ namespace,
2306
+ table,
2307
+ table_version or table_obj.latest_active_table_version,
2308
+ *args,
2309
+ **kwargs,
2310
+ )
2311
+
2312
+ stream = None
2313
+ if table_version_obj:
2314
+ stream = _get_storage(**kwargs).get_stream(
2315
+ namespace=namespace,
2316
+ table_name=table,
2317
+ table_version=table_version_obj.id,
2318
+ stream_format=stream_format,
2319
+ *args,
2320
+ **kwargs,
2321
+ )
2322
+
2323
+ return TableDefinition.of(
2324
+ table=table_obj,
2325
+ table_version=table_version_obj,
2326
+ stream=stream,
2327
+ )
2328
+ except Exception as e:
2329
+ # If any error occurs, the transaction remains uncommitted
2330
+ commit_transaction = False
2331
+ logger.error(f"Error during get_table: {e}")
2332
+ raise
2333
+ finally:
2334
+ if commit_transaction:
2335
+ # Seal the interactive transaction to commit all operations atomically
2336
+ get_transaction.seal()
2337
+
2338
+
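# A minimal get_table usage sketch (hypothetical names): fetch the latest active
# version's definition and inspect its schema, treating None as "table not found".
definition = get_table("events", namespace="analytics")
if definition is not None and definition.table_version is not None:
    print(definition.table_version.schema)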
2339
+ def truncate_table(
2340
+ table: str,
2341
+ *args,
2342
+ namespace: Optional[str] = None,
2343
+ table_version: Optional[str] = None,
2344
+ transaction: Optional[Transaction] = None,
2345
+ **kwargs,
2346
+ ) -> None:
2347
+ """Truncate table data.
2348
+
2349
+ Args:
2350
+ table: Name of the table to truncate.
2351
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
2352
+ table_version: Optional specific version of the table to truncate. Defaults to the latest active version.
2353
+ transaction: Optional transaction to use. If None, creates a new transaction.
2354
+
2355
+ Returns:
2356
+ None
2357
+ """
2358
+ raise NotImplementedError("truncate_table not implemented")
2359
+
2360
+
2361
+ def rename_table(
2362
+ table: str,
2363
+ new_name: str,
2364
+ *args,
2365
+ namespace: Optional[str] = None,
2366
+ transaction: Optional[Transaction] = None,
2367
+ **kwargs,
2368
+ ) -> None:
2369
+ """Rename an existing table.
2370
+
2371
+ Args:
2372
+ table: Current name of the table.
2373
+ new_name: New name for the table.
2374
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
2375
+ transaction: Optional transaction to use. If None, creates a new transaction.
2376
+
2377
+ Returns:
2378
+ None
2379
+
2380
+ Raises:
2381
+ TableNotFoundError: If the table does not exist.
2382
+ """
2383
+ namespace = namespace or default_namespace()
2384
+
2385
+ # Set up transaction handling
2386
+ rename_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2387
+ kwargs["transaction"] = rename_transaction
2388
+
2389
+ try:
2390
+ _get_storage(**kwargs).update_table(
2391
+ table_name=table,
2392
+ new_table_name=new_name,
2393
+ namespace=namespace,
2394
+ *args,
2395
+ **kwargs,
2396
+ )
2397
+
2398
+ except Exception as e:
2399
+ # If any error occurs, the transaction remains uncommitted
2400
+ commit_transaction = False
2401
+ logger.error(f"Error during rename_table: {e}")
2402
+ raise
2403
+ finally:
2404
+ if commit_transaction:
2405
+ # Seal the interactive transaction to commit all operations atomically
2406
+ rename_transaction.seal()
2407
+
2408
+
2409
+ def table_exists(
2410
+ table: str,
2411
+ *args,
2412
+ namespace: Optional[str] = None,
2413
+ table_version: Optional[str] = None,
2414
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
2415
+ transaction: Optional[Transaction] = None,
2416
+ **kwargs,
2417
+ ) -> bool:
2418
+ """Check if a table exists in the catalog.
2419
+
2420
+ Args:
2421
+ table: Name of the table to check.
2422
+ namespace: Optional namespace of the table. Uses default namespace if not specified.
2423
+ table_version: Optional specific version of the table to check. Defaults to the latest active version.
2424
+ stream_format: Optional stream format to check. Defaults to DELTACAT.
2425
+ transaction: Optional transaction to use. If None, creates a new transaction.
2426
+
2427
+ Returns:
2428
+ True if the table exists, False otherwise.
2429
+ """
2430
+ namespace = namespace or default_namespace()
2431
+
2432
+ # Set up transaction handling
2433
+ exists_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2434
+ kwargs["transaction"] = exists_transaction
2435
+
2436
+ try:
2437
+ table_obj = _get_storage(**kwargs).get_table(
2438
+ namespace=namespace,
2439
+ table_name=table,
2440
+ *args,
2441
+ **kwargs,
2442
+ )
2443
+ if table_obj is None:
2444
+ return False
2445
+ table_version = table_version or table_obj.latest_active_table_version
2446
+ if not table_version:
2447
+ return False
2448
+ table_version_exists = _get_storage(**kwargs).table_version_exists(
2449
+ namespace,
2450
+ table,
2451
+ table_version,
2452
+ *args,
2453
+ **kwargs,
2454
+ )
2455
+ if not table_version_exists:
2456
+ return False
2457
+ stream_exists = _get_storage(**kwargs).stream_exists(
2458
+ namespace=namespace,
2459
+ table_name=table,
2460
+ table_version=table_version,
2461
+ stream_format=stream_format,
2462
+ *args,
2463
+ **kwargs,
2464
+ )
2465
+ return stream_exists
2466
+ except Exception as e:
2467
+ # If any error occurs, the transaction remains uncommitted
2468
+ commit_transaction = False
2469
+ logger.error(f"Error during table_exists: {e}")
2470
+ raise
2471
+ finally:
2472
+ if commit_transaction:
2473
+ # Seal the interactive transaction to commit all operations atomically
2474
+ exists_transaction.seal()
2475
+
2476
+
2477
+ def list_namespaces(
2478
+ *args,
2479
+ transaction: Optional[Transaction] = None,
2480
+ **kwargs,
2481
+ ) -> ListResult[Namespace]:
2482
+ """List a page of table namespaces.
2483
+
2484
+ Args:
2485
+ transaction: Optional transaction to use. If None, creates a new transaction.
2486
+
2487
+ Returns:
2488
+ ListResult containing Namespace objects.
2489
+ """
2490
+ # Set up transaction handling
2491
+ list_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2492
+ kwargs["transaction"] = list_transaction
2493
+
2494
+ try:
2495
+ result = _get_storage(**kwargs).list_namespaces(*args, **kwargs)
2496
+
2497
+ return result
2498
+
2499
+ except Exception as e:
2500
+ # If any error occurs, the transaction remains uncommitted
2501
+ commit_transaction = False
2502
+ logger.error(f"Error during list_namespaces: {e}")
2503
+ raise
2504
+ finally:
2505
+ if commit_transaction:
2506
+ # Seal the interactive transaction to commit all operations atomically
2507
+ list_transaction.seal()
2508
+
2509
+
2510
+ def get_namespace(
+     namespace: str,
+     *args,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> Optional[Namespace]:
+     """Get metadata for a specific table namespace.
+
+     Args:
+         namespace: Name of the namespace to retrieve.
+         transaction: Optional transaction to use. If None, creates a new transaction.
+
+     Returns:
+         Namespace object if the namespace exists, None otherwise.
+     """
+     # Set up transaction handling
+     get_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     kwargs["transaction"] = get_ns_transaction
+
+     try:
+         result = _get_storage(**kwargs).get_namespace(
+             *args, namespace=namespace, **kwargs
+         )
+
+         return result
+
+     except Exception as e:
+         # If any error occurs, the transaction remains uncommitted
+         commit_transaction = False
+         logger.error(f"Error during get_namespace: {e}")
+         raise
+     finally:
+         if commit_transaction:
+             # Seal the interactive transaction to commit all operations atomically
+             get_ns_transaction.seal()
+
+
+ def namespace_exists(
+     namespace: str,
+     *args,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> bool:
+     """Check if a namespace exists.
+
+     Args:
+         namespace: Name of the namespace to check.
+         transaction: Optional transaction to use for reading. If provided, will see uncommitted changes.
+
+     Returns:
+         True if the namespace exists, False otherwise.
+     """
+     # Set up transaction handling
+     exists_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     kwargs["transaction"] = exists_transaction
+
+     try:
+         result = _get_storage(**kwargs).namespace_exists(
+             *args, namespace=namespace, **kwargs
+         )
+
+         return result
+
+     except Exception as e:
+         # If any error occurs, the transaction remains uncommitted
+         commit_transaction = False
+         logger.error(f"Error during namespace_exists: {e}")
+         raise
+     finally:
+         if commit_transaction:
+             # Seal the interactive transaction to commit all operations atomically
+             exists_transaction.seal()
+
+
+ def create_namespace(
+     namespace: str,
+     *args,
+     properties: Optional[NamespaceProperties] = None,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> Namespace:
+     """Create a new namespace.
+
+     Args:
+         namespace: Name of the namespace to create.
+         properties: Optional properties for the namespace.
+         transaction: Optional transaction to use. If None, creates a new transaction.
+
+     Returns:
+         Created Namespace object.
+
+     Raises:
+         NamespaceAlreadyExistsError: If the namespace already exists.
+     """
+     # Set up transaction handling
+     namespace_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     kwargs["transaction"] = namespace_transaction
+
+     try:
+         if namespace_exists(namespace, **kwargs):
+             raise NamespaceAlreadyExistsError(f"Namespace {namespace} already exists")
+
+         result = _get_storage(**kwargs).create_namespace(
+             *args, namespace=namespace, properties=properties, **kwargs
+         )
+
+         return result
+
+     except Exception as e:
+         # If any error occurs, the transaction remains uncommitted
+         commit_transaction = False
+         logger.error(f"Error during create_namespace: {e}")
+         raise
+     finally:
+         if commit_transaction:
+             # Seal the interactive transaction to commit all operations atomically
+             namespace_transaction.seal()
+
+
+ def alter_namespace(
+     namespace: str,
+     *args,
+     properties: Optional[NamespaceProperties] = None,
+     new_namespace: Optional[str] = None,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> None:
+     """Alter a namespace definition.
+
+     Args:
+         namespace: Name of the namespace to alter.
+         properties: Optional new properties for the namespace.
+         new_namespace: Optional new name for the namespace.
+         transaction: Optional transaction to use. If None, creates a new transaction.
+
+     Returns:
+         None
+     """
+     # Set up transaction handling
+     alter_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     kwargs["transaction"] = alter_ns_transaction
+
+     try:
+         _get_storage(**kwargs).update_namespace(
+             namespace=namespace,
+             properties=properties,
+             new_namespace=new_namespace,
+             *args,
+             **kwargs,
+         )
+
+     except Exception as e:
+         # If any error occurs, the transaction remains uncommitted
+         commit_transaction = False
+         logger.error(f"Error during alter_namespace: {e}")
+         raise
+     finally:
+         if commit_transaction:
+             # Seal the interactive transaction to commit all operations atomically
+             alter_ns_transaction.seal()
+
+
+ def drop_namespace(
+     namespace: str,
+     *args,
+     purge: bool = False,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> None:
+     """Drop a namespace and all of its tables from the catalog.
+
+     Args:
+         namespace: Name of the namespace to drop.
+         purge: If True, permanently delete all table data in the namespace.
+             If False, only removes the namespace from the catalog.
+         transaction: Optional transaction to use. If None, creates a new transaction.
+
+     Returns:
+         None
+
+     TODO: Honor purge once garbage collection is implemented.
+     """
+     if purge:
+         raise NotImplementedError("Purge flag is not currently supported.")
+
+     # Set up transaction handling
+     drop_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     kwargs["transaction"] = drop_ns_transaction
+
+     try:
+         _get_storage(**kwargs).delete_namespace(
+             *args,
+             namespace=namespace,
+             purge=purge,
+             **kwargs,
+         )
+
+     except Exception as e:
+         # If any error occurs, the transaction remains uncommitted
+         commit_transaction = False
+         logger.error(f"Error during drop_namespace: {e}")
+         raise
+     finally:
+         if commit_transaction:
+             # Seal the interactive transaction to commit all operations atomically
+             drop_ns_transaction.seal()
+
+
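Because purge=True currently raises NotImplementedError, only a metadata-only drop is available. Sketch; the namespace name is hypothetical:

# Illustrative sketch only; purge=True is rejected until garbage collection lands.
from deltacat.catalog.main import impl as catalog

catalog.drop_namespace("analytics_v2")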
+ def default_namespace(*args, **kwargs) -> str:
+     """Return the default namespace for the catalog.
+
+     Returns:
+         Name of the default namespace.
+     """
+     return DEFAULT_NAMESPACE
+
+
+ def _get_latest_active_or_given_table_version(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> TableVersion:
+     table_version_obj = None
+     if table_version is None:
+         table_version_obj = _get_storage(**kwargs).get_latest_active_table_version(
+             namespace=namespace,
+             table_name=table_name,
+             *args,
+             **kwargs,
+         )
+         if table_version_obj is None:
+             raise TableVersionNotFoundError(
+                 f"No active table version found for table {namespace}.{table_name}"
+             )
+         table_version = table_version_obj.table_version
+     else:
+         table_version_obj = _get_storage(**kwargs).get_table_version(
+             namespace=namespace,
+             table_name=table_name,
+             table_version=table_version,
+             *args,
+             **kwargs,
+         )
+
+     return table_version_obj
+
+
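The helper above resolves either the caller-supplied version or the latest active one, raising TableVersionNotFoundError when nothing is active. A sketch of the two call paths as they might be exercised from inside this module; the table and version identifiers are hypothetical:

# Illustrative sketch only (module-internal usage).
# 1) No version given: falls back to the latest active table version.
tv_latest = _get_latest_active_or_given_table_version("retail", "orders")
# 2) Explicit version: fetched directly from storage.
tv_pinned = _get_latest_active_or_given_table_version(
    "retail", "orders", table_version="2"
)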
+ def _get_all_committed_partitions(
+     table: str,
+     namespace: str,
+     table_version: str,
+     **kwargs,
+ ) -> List[Union[Partition, PartitionLocator]]:
+     """Get all committed partitions for a table and validate uniqueness."""
+     logger.info(
+         f"Reading all partitions metadata in the table={table} "
+         "as partition_filter was None."
+     )
+
+     all_partitions = (
+         _get_storage(**kwargs)
+         .list_partitions(
+             table_name=table,
+             namespace=namespace,
+             table_version=table_version,
+             **kwargs,
+         )
+         .all_items()
+     )
+
+     committed_partitions = [
+         partition
+         for partition in all_partitions
+         if partition.state == CommitState.COMMITTED
+     ]
+
+     logger.info(
+         f"Found {len(committed_partitions)} committed partitions for "
+         f"table={namespace}/{table}/{table_version}"
+     )
+
+     _validate_partition_uniqueness(
+         committed_partitions, namespace, table, table_version
+     )
+     return committed_partitions
+
+
+ def _validate_partition_uniqueness(
+     partitions: List[Partition], namespace: str, table: str, table_version: str
+ ) -> None:
+     """Validate that there are no duplicate committed partitions for the same partition values."""
+     commit_count_per_partition_value = defaultdict(int)
+     for partition in partitions:
+         # Normalize partition values: both None and [] represent unpartitioned data
+         normalized_values = (
+             None
+             if (
+                 partition.partition_values is None
+                 or (
+                     isinstance(partition.partition_values, list)
+                     and len(partition.partition_values) == 0
+                 )
+             )
+             else partition.partition_values
+         )
+         commit_count_per_partition_value[normalized_values] += 1
+
+     # Check for multiple committed partitions for the same partition values
+     for partition_values, commit_count in commit_count_per_partition_value.items():
+         if commit_count > 1:
+             raise RuntimeError(
+                 f"Multiple committed partitions found for table={namespace}/{table}/{table_version}. "
+                 f"Partition values: {partition_values}. Commit count: {commit_count}. "
+                 f"This should not happen."
+             )
+
+
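The normalization step treats None and an empty list of partition values as the same unpartitioned key, so two committed unpartitioned partitions would trip the RuntimeError. A self-contained illustration of just that normalization rule, using a hypothetical stand-in for Partition:

# Illustrative sketch only; FakePartition is a stand-in, not a deltacat type.
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class FakePartition:
    partition_values: Optional[List[str]]


counts = defaultdict(int)
for p in [FakePartition(None), FakePartition([])]:
    values = p.partition_values
    normalized = (
        None
        if values is None or (isinstance(values, list) and len(values) == 0)
        else values
    )
    counts[normalized] += 1

assert counts[None] == 2  # both forms count against the same unpartitioned key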
+ def _get_deltas_from_partition_filter(
+     partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
+     *args,
+     **kwargs,
+ ):
+     result_deltas = []
+     for partition_like in partition_filter:
+         deltas = (
+             _get_storage(**kwargs)
+             .list_partition_deltas(
+                 partition_like=partition_like,
+                 ascending_order=True,
+                 include_manifest=True,
+                 *args,
+                 **kwargs,
+             )
+             .all_items()
+         )
+
+         # Validate that all qualified deltas are append type - merge-on-read not yet implemented
+         # TODO(pdames): Run compaction minus materialize for MoR of each partition.
+         if deltas:
+             non_append_deltas = []
+             for delta in deltas:
+                 if delta.type != DeltaType.APPEND:
+                     non_append_deltas.append(delta)
+                 else:
+                     result_deltas.append(delta)
+             if non_append_deltas:
+                 delta_types = {delta.type for delta in non_append_deltas}
+                 delta_info = [
+                     (str(delta.locator), delta.type) for delta in non_append_deltas[:5]
+                 ]  # Show first 5
+                 raise NotImplementedError(
+                     f"Merge-on-read is not yet implemented. Found {len(non_append_deltas)} non-append deltas "
+                     f"with types {delta_types}. All deltas must be APPEND type for read operations. "
+                     f"Examples: {delta_info}. Please run compaction first to merge non-append deltas."
+                 )
+
+             logger.info(f"Validated {len(deltas)} qualified deltas are all APPEND type")
+     return result_deltas
+
+
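Until merge-on-read is implemented, reads over partitions containing non-append deltas fail fast with NotImplementedError. A module-internal sketch composing the two helpers above; the table identifiers are hypothetical:

# Illustrative sketch only (module-internal usage).
partitions = _get_all_committed_partitions("orders", "retail", "1")
try:
    deltas = _get_deltas_from_partition_filter(partition_filter=partitions)
except NotImplementedError:
    # Non-append deltas present: run compaction first, then retry the read.
    raise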
+ def _get_storage(**kwargs):
+     """
+     Returns the implementation of `deltacat.storage.interface` to use with this catalog.
+
+     This is configured in the `CatalogProperties` stored during initialization and passed
+     through `delegate.py`.
+     """
+     properties: Optional[CatalogProperties] = kwargs.get("inner")
+     if properties is not None and properties.storage is not None:
+         return properties.storage
+     else:
+         return dc.storage.metastore
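Storage resolution therefore has two tiers: a CatalogProperties object passed as the `inner` kwarg wins when it carries a storage implementation; otherwise the default metastore module is used. A sketch; the CatalogProperties constructor arguments shown are assumptions:

# Illustrative sketch only; constructor arguments may differ in practice.
from deltacat.catalog.model.properties import CatalogProperties

props = CatalogProperties(root="/tmp/deltacat")  # 'root' kwarg is an assumption
storage_impl = _get_storage(inner=props)  # props.storage, if one is configured
default_impl = _get_storage()  # falls back to dc.storage.metastore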