deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
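
The remainder of this diff excerpts the largest single addition, `deltacat/storage/main/impl.py` (+3030 lines): the new filesystem-backed storage implementation whose reads and writes run as metafile transactions. For orientation, here is a minimal sketch of how its listing functions might be driven, assuming a `CatalogProperties` object passed through a `catalog` keyword; the constructor, the keyword name, and the attribute names are assumptions inferred from `get_catalog_properties(**kwargs)` and the `catalog_properties.root`/`.filesystem` usage visible in the diff, not a verified deltacat 2.0.0 contract:

    # Hedged sketch, not verified API: CatalogProperties(root=...), the
    # `catalog` keyword, and the .namespace/.table_name attributes are all
    # assumptions inferred from the diff below.
    from deltacat.catalog.model.properties import CatalogProperties
    from deltacat.storage.main import impl as storage

    props = CatalogProperties(root="/tmp/deltacat")  # hypothetical local catalog root

    # Each list_* call returns a ListResult whose items are metafiles.
    for namespace in storage.list_namespaces(catalog=props).all_items():
        for table in storage.list_tables(namespace.namespace, catalog=props).all_items():
            print(namespace.namespace, table.table_name)
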
@@ -0,0 +1,3030 @@
+ import logging
+ import uuid
+ import posixpath
+ import pyarrow
+
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
+
+ from deltacat.catalog.model.properties import get_catalog_properties
+ from deltacat.constants import (
+     DEFAULT_TABLE_VERSION,
+     DATA_FILE_DIR_NAME,
+ )
+ from deltacat.exceptions import (
+     TableNotFoundError,
+     TableVersionNotFoundError,
+     DeltaCatError,
+     UnclassifiedDeltaCatError,
+     SchemaValidationError,
+     StreamNotFoundError,
+     PartitionNotFoundError,
+     DeltaNotFoundError,
+     NamespaceNotFoundError,
+     TableValidationError,
+     ConcurrentModificationError,
+     ObjectAlreadyExistsError,
+     NamespaceAlreadyExistsError,
+     TableAlreadyExistsError,
+     TableVersionAlreadyExistsError,
+     ObjectNotFoundError,
+ )
+ from deltacat.storage.model.manifest import (
+     EntryParams,
+     EntryType,
+     ManifestAuthor,
+     ManifestEntryList,
+     ManifestEntry,
+ )
+ from deltacat.storage.model.delta import (
+     Delta,
+     DeltaLocator,
+     DeltaProperties,
+     DeltaType,
+ )
+ from deltacat.storage.model.transaction import setup_transaction
+ from deltacat.storage.model.types import (
+     CommitState,
+     DistributedDataset,
+     LifecycleState,
+     LocalDataset,
+     LocalTable,
+     TransactionOperationType,
+     StreamFormat,
+ )
+ from deltacat.storage.model.list_result import ListResult
+ from deltacat.storage.model.namespace import (
+     Namespace,
+     NamespaceLocator,
+     NamespaceProperties,
+ )
+ from deltacat.storage.model.partition import (
+     Partition,
+     PartitionLocator,
+     PartitionScheme,
+     PartitionValues,
+     UNPARTITIONED_SCHEME,
+     UNPARTITIONED_SCHEME_ID,
+ )
+ from deltacat.storage.model.schema import Schema
+ from deltacat.storage.model.sort_key import (
+     SortScheme,
+     UNSORTED_SCHEME,
+ )
+ from deltacat.storage.model.stream import (
+     Stream,
+     StreamLocator,
+ )
+ from deltacat.storage.model.table import (
+     Table,
+     TableProperties,
+     TableLocator,
+ )
+ from deltacat.storage.model.table_version import (
+     TableVersion,
+     TableVersionProperties,
+     TableVersionLocator,
+ )
+ from deltacat.storage.model.metafile import (
+     Metafile,
+ )
+ from deltacat.storage.model.transaction import (
+     TransactionOperation,
+     Transaction,
+ )
+ from deltacat.storage.model.manifest import Manifest
+ from deltacat.types.media import (
+     ContentType,
+     DatasetType,
+     DistributedDatasetType,
+     StorageType,
+     ContentEncoding,
+ )
+ from deltacat.utils.common import ReadKwargsProvider
+ import pyarrow as pa
+
+ from deltacat.types.tables import (
+     TableProperty,
+     get_table_writer,
+     get_table_slicer,
+     write_sliced_table,
+     download_manifest_entries,
+     download_manifest_entries_distributed,
+     download_manifest_entry,
+ )
+ from deltacat import logs
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def _normalize_partition_values(
+     partition_values: Optional[PartitionValues],
+ ) -> Optional[PartitionValues]:
+     """
+     Normalize partition values to ensure consistent representation of unpartitioned data.
+
+     Both None and empty list [] represent unpartitioned data, but they should be
+     normalized to None for consistent lookup and validation.
+
+     Args:
+         partition_values: The partition values to normalize
+
+     Returns:
+         None for unpartitioned data (both None and [] inputs),
+         original value for partitioned data
+     """
+     if partition_values is None or (
+         isinstance(partition_values, list) and len(partition_values) == 0
+     ):
+         return None
+     return partition_values
+
+
+ def _list(
+     metafile: Metafile,
+     txn_op_type: TransactionOperationType,
+     *args,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> ListResult[Metafile]:
+     catalog_properties = get_catalog_properties(**kwargs)
+     limit = kwargs.get("limit") or None
+
+     operation = TransactionOperation.of(
+         operation_type=txn_op_type,
+         dest_metafile=metafile,
+         read_limit=limit,
+     )
+
+     if transaction is not None:
+         # Add the read operation to the existing transaction and return the result
+         return transaction.step(operation)
+     else:
+         # Create and commit a new transaction (legacy behavior)
+         new_transaction = Transaction.of([operation])
+         list_results_per_op = new_transaction.commit(
+             catalog_root_dir=catalog_properties.root,
+             filesystem=catalog_properties.filesystem,
+         )
+         return list_results_per_op[0]
+
+
+ def _latest(
+     metafile: Metafile,
+     *args,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> Optional[Metafile]:
+     list_results = _list(
+         metafile=metafile,
+         txn_op_type=TransactionOperationType.READ_LATEST,
+         transaction=transaction,
+         *args,
+         **kwargs,
+     )
+     results = list_results.all_items()
+     return results[0] if results else None
+
+
+ def _exists(
+     metafile: Metafile,
+     *args,
+     **kwargs,
+ ) -> Optional[bool]:
+     list_results = _list(
+         metafile=metafile,
+         txn_op_type=TransactionOperationType.READ_EXISTS,
+         *args,
+         **kwargs,
+     )
+     results = list_results.all_items()
+     return True if results else False
+
+
+ def _resolve_latest_active_table_version_id(
+     namespace: str,
+     table_name: str,
+     *args,
+     fail_if_no_active_table_version: bool = True,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> Optional[str]:
+     table = get_table(
+         namespace=namespace,
+         table_name=table_name,
+         transaction=transaction,
+         *args,
+         **kwargs,
+     )
+     if not table:
+         raise TableNotFoundError(f"Table does not exist: {namespace}.{table_name}")
+     if fail_if_no_active_table_version and not table.latest_active_table_version:
+         raise TableVersionNotFoundError(
+             f"Table has no active table version: {namespace}.{table_name}"
+         )
+     return table.latest_active_table_version
+
+
+ def _resolve_latest_table_version_id(
+     namespace: str,
+     table_name: str,
+     fail_if_no_active_table_version: bool = True,
+     *args,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> Optional[str]:
+     table = get_table(
+         namespace=namespace,
+         table_name=table_name,
+         transaction=transaction,
+         *args,
+         **kwargs,
+     )
+     if not table:
+         raise TableNotFoundError(f"Table does not exist: {namespace}.{table_name}")
+     if fail_if_no_active_table_version and not table.latest_table_version:
+         raise TableVersionNotFoundError(
+             f"Table has no table version: {namespace}.{table_name}"
+         )
+     return table.latest_table_version
+
+
+ def _validate_schemes_against_schema(
+     schema: Optional[Schema],
+     partition_scheme: Optional[PartitionScheme],
+     sort_scheme: Optional[SortScheme],
+ ) -> None:
+     """
+     Validates partition and sort schemes against a schema, ensuring all referenced fields exist.
+     If schema is None, validation is skipped.
+     """
+     if schema is None:
+         return
+
+     schema_fields = set(field.name for field in schema.arrow)
+
+     # Validate partition scheme
+     if partition_scheme is not None and partition_scheme.keys is not None:
+         for key in partition_scheme.keys:
+             if key.key[0] not in schema_fields:
+                 raise SchemaValidationError(
+                     f"Partition key field '{key.key[0]}' not found in schema"
+                 )
+
+     # Validate sort scheme
+     if sort_scheme is not None and sort_scheme.keys is not None:
+         for key in sort_scheme.keys:
+             if key.key[0] not in schema_fields:
+                 raise SchemaValidationError(
+                     f"Sort key field '{key.key[0]}' not found in schema"
+                 )
+
+
+ def _validate_partition_values_against_scheme(
+     partition_values: Optional[PartitionValues],
+     partition_scheme: PartitionScheme,
+     schema: Optional[Schema],
+ ) -> None:
+     """
+     Validates that partition values match the data types of the partition key fields in the schema.
+
+     Args:
+         partition_values: List of partition values to validate
+         partition_scheme: The partition scheme containing the keys to validate against
+         schema: The schema containing the field types to validate against
+
+     Raises:
+         TableValidationError: If validation fails
+     """
+     if not partition_values:
+         raise TableValidationError("Partition values cannot be empty")
+
+     if not schema:
+         raise TableValidationError(
+             "Table version must have a schema to validate partition values"
+         )
+
+     if len(partition_values) != len(partition_scheme.keys):
+         raise TableValidationError(
+             f"Number of partition values ({len(partition_values)}) does not match "
+             f"number of partition keys ({len(partition_scheme.keys)})"
+         )
+
+     # Validate each partition value against its corresponding field type
+     for i in range(len(partition_scheme.keys)):
+         field_type = partition_scheme.keys[i].transform.return_type
+         partition_value = partition_values[i]
+         if field_type is None:
+             # the transform returns the same type as the source schema type
+             # (which also implies that it is a single-key transform)
+             field_type = schema.field(partition_scheme.keys[i].key[0]).arrow.type
+         try:
+             # Try to convert the value to PyArrow to validate its type
+             pa.array([partition_value], type=field_type)
+             # If successful, the type is valid
+         except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError) as e:
+             raise TableValidationError(
+                 f"Partition value {partition_value} (type {type(partition_value)}) "
+                 f"incompatible with partition transform return type {field_type}"
+             ) from e
+
+
+ def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
+     """
+     Lists a page of table namespaces. Namespaces are returned as list result
+     items.
+     """
+     return _list(
+         metafile=Namespace.of(NamespaceLocator.of("placeholder")),
+         txn_op_type=TransactionOperationType.READ_SIBLINGS,
+         *args,
+         **kwargs,
+     )
+
+
+ def list_tables(namespace: str, *args, **kwargs) -> ListResult[Table]:
+     """
+     Lists a page of tables for the given table namespace. Tables are returned as
+     list result items. Raises an error if the given namespace does not exist.
+     """
+     locator = TableLocator.at(namespace=namespace, table_name="placeholder")
+     try:
+         return _list(
+             metafile=Table.of(locator=locator),
+             txn_op_type=TransactionOperationType.READ_SIBLINGS,
+             *args,
+             **kwargs,
+         )
+     except ObjectNotFoundError as e:
+         raise NamespaceNotFoundError(f"Namespace {namespace} not found") from e
+
+
+ def list_table_versions(
+     namespace: str,
+     table_name: str,
+     *args,
+     **kwargs,
+ ) -> ListResult[TableVersion]:
+     """
+     Lists a page of table versions for the given table. Table versions are
+     returned as list result items. Raises an error if the given table does not
+     exist.
+     """
+     locator = TableVersionLocator.at(
+         namespace=namespace,
+         table_name=table_name,
+         table_version="placeholder.0",
+     )
+     table_version = TableVersion.of(
+         locator=locator,
+         schema=None,
+     )
+     try:
+         return _list(
+             metafile=table_version,
+             txn_op_type=TransactionOperationType.READ_SIBLINGS,
+             *args,
+             **kwargs,
+         )
+     except ObjectNotFoundError as e:
+         raise TableNotFoundError(f"Table {namespace}.{table_name} not found") from e
+
+
+ def list_streams(
+     namespace: str,
+     table_name: str,
+     table_version: str,
+     *args,
+     **kwargs,
+ ) -> ListResult[Stream]:
+     """
+     Lists a page of streams for the given table version.
+     Raises an error if the table version does not exist.
+     """
+     # TODO(pdames): Support listing uncommitted streams.
+     locator = StreamLocator.at(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         stream_id="placeholder",
+         stream_format=None,
+     )
+     stream = Stream.of(
+         locator=locator,
+         partition_scheme=None,
+     )
+     try:
+         return _list(
+             stream,
+             TransactionOperationType.READ_SIBLINGS,
+             *args,
+             **kwargs,
+         )
+     except ObjectNotFoundError as e:
+         raise TableVersionNotFoundError(
+             f"Table version {namespace}.{table_name}.{table_version} not found"
+         ) from e
+
+
+ def list_partitions(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     *args,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> ListResult[Partition]:
+     """
+     Lists a page of partitions for the given table version. Partitions are
+     returned as list result items. Table version resolves to the latest active
+     table version if not specified. Raises an error if the table version does
+     not exist.
+     """
+     transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
+     if not namespace:
+         raise ValueError("Namespace cannot be empty.")
+     if not table_name:
+         raise ValueError("Table name cannot be empty.")
+     # resolve default deltacat stream for the given namespace, table name, and table version
+     # TODO(pdames): debug why this doesn't work when only the table_version is provided
+     #  and PartitionLocator.stream_format is hard-coded to deltacat (we should be able
+     #  to resolve the default deltacat stream automatically)
+     stream = get_stream(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         transaction=transaction,
+         *args,
+         **kwargs,
+     )
+     if not stream:
+         raise StreamNotFoundError(
+             f"Default stream for {namespace}.{table_name}.{table_version} not found."
+         )
+     locator = PartitionLocator.of(
+         stream_locator=stream.locator,
+         partition_values=["placeholder"],
+         partition_id="placeholder",
+     )
+     partition = Partition.of(
+         locator=locator,
+         content_types=None,
+     )
+     try:
+         result = _list(
+             metafile=partition,
+             txn_op_type=TransactionOperationType.READ_SIBLINGS,
+             transaction=transaction,
+             *args,
+             **kwargs,
+         )
+     except ObjectNotFoundError as e:
+         raise StreamNotFoundError(f"Stream {stream.locator} not found") from e
+
+     if commit_transaction:
+         transaction.seal()
+     return result
+
+
+ def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partition]:
+     """
+     Lists all partitions committed to the given stream.
+     """
+     # TODO(pdames): Support listing uncommitted partitions.
+     if stream.stream_format != StreamFormat.DELTACAT:
+         raise ValueError(
+ raise ValueError(
496
+ f"Unsupported stream format: {stream.stream_format}"
497
+ f"Expected stream format: {StreamFormat.DELTACAT}"
498
+ )
499
+ locator = PartitionLocator.of(
500
+ stream_locator=stream.locator,
501
+ partition_values=["placeholder"],
502
+ partition_id="placeholder",
503
+ )
504
+ partition = Partition.of(
505
+ locator=locator,
506
+ content_types=None,
507
+ )
508
+ try:
509
+ return _list(
510
+ metafile=partition,
511
+ txn_op_type=TransactionOperationType.READ_SIBLINGS,
512
+ *args,
513
+ **kwargs,
514
+ )
515
+ except ObjectNotFoundError as e:
516
+ raise StreamNotFoundError(f"Stream {stream.locator} not found") from e
517
+
518
+
519
+ def list_deltas(
520
+ namespace: str,
521
+ table_name: str,
522
+ partition_values: Optional[PartitionValues] = None,
523
+ table_version: Optional[str] = None,
524
+ first_stream_position: Optional[int] = None,
525
+ last_stream_position: Optional[int] = None,
526
+ ascending_order: Optional[bool] = None,
527
+ include_manifest: bool = False,
528
+ partition_scheme_id: Optional[str] = None,
529
+ *args,
530
+ transaction: Optional[Transaction] = None,
531
+ **kwargs,
532
+ ) -> ListResult[Delta]:
533
+ """
534
+ Lists a page of deltas for the given table version and committed partition.
535
+ Deltas are returned as list result items. Deltas returned can optionally be
536
+ limited to inclusive first and last stream positions. Deltas are returned by
537
+ descending stream position by default. Table version resolves to the latest
538
+ active table version if not specified. Partition values should not be
539
+ specified for unpartitioned tables. Partition scheme ID resolves to the
540
+ table version's current partition scheme by default. Raises an error if the
541
+ given table version or partition does not exist.
542
+
543
+ To conserve memory, the deltas returned do not include manifests by
544
+ default. The manifests can either be optionally retrieved as part of this
545
+ call or lazily loaded via subsequent calls to `get_delta_manifest`.
546
+ """
547
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
548
+
549
+ # TODO(pdames): Delta listing should ideally either use an efficient
550
+ # range-limited dir listing of partition children between start and end
551
+ # positions, or should traverse using Partition.stream_position (to
552
+ # resolve last stream position) and Delta.previous_stream_position
553
+ # (down to first stream position).
554
+
555
+ # First get the stream to resolve proper table version and stream locator
556
+ stream = get_stream(
557
+ namespace=namespace,
558
+ table_name=table_name,
559
+ table_version=table_version,
560
+ transaction=transaction,
561
+ *args,
562
+ **kwargs,
563
+ )
564
+ if not stream:
565
+ raise StreamNotFoundError(
566
+ f"Failed to resolve stream for "
567
+ f"`{namespace}.{table_name}` at table version "
568
+ f"`{table_version or 'latest'}` (no stream found)."
569
+ )
570
+
571
+ # Then get the actual partition to ensure we have the real partition locator with ID
572
+ partition = get_partition(
573
+ stream_locator=stream.locator,
574
+ partition_values=partition_values,
575
+ partition_scheme_id=partition_scheme_id,
576
+ transaction=transaction,
577
+ *args,
578
+ **kwargs,
579
+ )
580
+ if not partition:
581
+ raise PartitionNotFoundError(
582
+ f"Failed to find partition for stream {stream.locator} "
583
+ f"with partition_values={partition_values} and "
584
+ f"partition_scheme_id={partition_scheme_id}"
585
+ )
586
+
587
+ # Use the actual partition locator (with partition ID) for listing deltas
588
+ locator = DeltaLocator.of(partition_locator=partition.locator)
589
+ delta = Delta.of(
590
+ locator=locator,
591
+ delta_type=None,
592
+ meta=None,
593
+ properties=None,
594
+ manifest=None,
595
+ )
596
+ try:
597
+ all_deltas_list_result: ListResult[Delta] = _list(
598
+ metafile=delta,
599
+ txn_op_type=TransactionOperationType.READ_SIBLINGS,
600
+ transaction=transaction,
601
+ *args,
602
+ **kwargs,
603
+ )
604
+ except ObjectNotFoundError as e:
605
+ raise PartitionNotFoundError(f"Partition {partition.locator} not found") from e
606
+ all_deltas = all_deltas_list_result.all_items()
607
+ filtered_deltas = [
608
+ delta
609
+ for delta in all_deltas
610
+ if (
611
+ first_stream_position is None
612
+ or first_stream_position <= delta.stream_position
613
+ )
614
+ and (
615
+ last_stream_position is None
616
+ or delta.stream_position <= last_stream_position
617
+ )
618
+ ]
619
+ # Sort deltas by stream position in the requested order
620
+ filtered_deltas.sort(reverse=(not ascending_order), key=lambda d: d.stream_position)
621
+
622
+ if commit_transaction:
623
+ transaction.seal()
624
+     return ListResult.of(
+         items=filtered_deltas,
+         pagination_key=None,
+         next_page_provider=None,
+     )
+
+
+ def list_partition_deltas(
+     partition_like: Union[Partition, PartitionLocator],
+     first_stream_position: Optional[int] = None,
+     last_stream_position: Optional[int] = None,
+     ascending_order: bool = False,
+     include_manifest: bool = False,
+     *args,
+     **kwargs,
+ ) -> ListResult[Delta]:
+     """
+     Lists a page of deltas committed to the given partition.
+
+     To conserve memory, the deltas returned do not include manifests by
+     default. The manifests can either be optionally retrieved as part of this
+     call or lazily loaded via subsequent calls to `get_delta_manifest`.
+     """
+     # TODO(pdames): Delta listing should ideally either use an efficient
+     #  range-limited dir listing of partition children between start and end
+     #  positions, or should traverse using Partition.stream_position (to
+     #  resolve last stream position) and Delta.previous_stream_position
+     #  (down to first stream position).
+     locator = DeltaLocator.of(
+         partition_locator=partition_like
+         if isinstance(partition_like, PartitionLocator)
+         else partition_like.locator,
+         stream_position=None,
+     )
+     delta = Delta.of(
+         locator=locator,
+         delta_type=None,
+         meta=None,
+         properties=None,
+         manifest=None,
+     )
+     try:
+         all_deltas_list_result: ListResult[Delta] = _list(
+             metafile=delta,
+             txn_op_type=TransactionOperationType.READ_SIBLINGS,
+             *args,
+             **kwargs,
+         )
+     except ObjectNotFoundError as e:
+         raise PartitionNotFoundError(
+             f"Partition {partition_like.locator} not found"
+         ) from e
+     all_deltas = all_deltas_list_result.all_items()
+     filtered_deltas = [
+         delta
+         for delta in all_deltas
+         if (
+             first_stream_position is None
+             or first_stream_position <= delta.stream_position
+         )
+         and (
+             last_stream_position is None
+             or delta.stream_position <= last_stream_position
+         )
+     ]
+     # Sort deltas by stream position in the requested order
+     filtered_deltas.sort(reverse=(not ascending_order), key=lambda d: d.stream_position)
+     return ListResult.of(
+         items=filtered_deltas,
+         pagination_key=None,
+         next_page_provider=None,
+     )
+
+
+ def get_delta(
+     namespace: str,
+     table_name: str,
+     stream_position: int,
+     partition_values: Optional[PartitionValues] = None,
+     table_version: Optional[str] = None,
+     include_manifest: bool = False,
+     partition_scheme_id: Optional[str] = None,
+     *args,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> Optional[Delta]:
+     """
+     Gets the delta for the given table version, partition, and stream position.
+     Table version resolves to the latest active table version if not specified.
+     Partition values should not be specified for unpartitioned tables. Partition
+     scheme ID resolves to the table version's current partition scheme by
+     default. Raises an error if the given table version or partition does not
+     exist.
+
+     To conserve memory, the delta returned does not include a manifest by
+     default. The manifest can either be optionally retrieved as part of this
+     call or lazily loaded via a subsequent call to `get_delta_manifest`.
+     """
+     transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
+     # TODO(pdames): Honor `include_manifest` param.
+
+     # First get the stream to resolve proper table version and stream locator
+     stream = get_stream(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         transaction=transaction,
+         *args,
+         **kwargs,
+     )
+     if not stream:
+         raise StreamNotFoundError(
+             f"Failed to resolve stream for "
+             f"`{namespace}.{table_name}` at table version "
+             f"`{table_version or 'latest'}` (no stream found)."
+         )
+
+     # Then get the actual partition to ensure we have the real partition locator with ID
+     partition = get_partition(
+         stream_locator=stream.locator,
+         partition_values=partition_values,
+         partition_scheme_id=partition_scheme_id,
+         transaction=transaction,
+         *args,
+         **kwargs,
+     )
+     if not partition:
+         raise PartitionNotFoundError(
+             f"Failed to find partition for stream {stream.locator} "
+             f"with partition_values={partition_values} and "
+             f"partition_scheme_id={partition_scheme_id}"
+         )
+
+     # Use the actual partition locator (with partition ID) for getting the delta
+     locator = DeltaLocator.of(
+         partition_locator=partition.locator,
+         stream_position=stream_position,
+     )
+     delta = Delta.of(
+         locator=locator,
+         delta_type=None,
+         meta=None,
+         properties=None,
+         manifest=None,
+     )
+     result = _latest(
+         metafile=delta,
+         transaction=transaction,
+         *args,
+         **kwargs,
+     )
+
+     # TODO(pdames): Honor the include_manifest parameter during retrieval from _latest, since
+     #  the point is to avoid loading the manifest into memory if it's not needed.
+     if result and not include_manifest:
+         result.manifest = None
+
+     if commit_transaction:
+         transaction.seal()
+     return result
+
+
+ def get_latest_delta(
+     namespace: str,
+     table_name: str,
+     partition_values: Optional[PartitionValues] = None,
+     table_version: Optional[str] = None,
+     include_manifest: bool = False,
+     partition_scheme_id: Optional[str] = None,
+     *args,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> Optional[Delta]:
+     """
+     Gets the latest delta (i.e. the delta with the greatest stream position) for
+     the given table version and partition. Table version resolves to the latest
+     active table version if not specified. Partition values should not be
+     specified for unpartitioned tables. Partition scheme ID resolves to the
+     table version's current partition scheme by default. Raises an error if the
+     given table version or partition does not exist.
+
+     To conserve memory, the delta returned does not include a manifest by
+     default. The manifest can either be optionally retrieved as part of this
+     call or lazily loaded via a subsequent call to `get_delta_manifest`.
+     """
+     transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
+     stream = get_stream(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         transaction=transaction,
+         *args,
+         **kwargs,
+     )
+     partition = get_partition(
+         stream_locator=stream.locator,
+         partition_values=partition_values,
+         partition_scheme_id=partition_scheme_id,
+         transaction=transaction,
+         *args,
+         **kwargs,
+     )
+     locator = DeltaLocator.of(
+         partition_locator=partition.locator,
+         stream_position=partition.stream_position,
+     )
+     delta = Delta.of(
+         locator=locator,
+         delta_type=None,
+         meta=None,
+         properties=None,
+         manifest=None,
+     )
+     result = _latest(
+         metafile=delta,
+         transaction=transaction,
+         *args,
+         **kwargs,
+     )
+
+     # TODO(pdames): Honor the include_manifest parameter during retrieval from _latest, since
+     #  the point is to avoid loading the manifest into memory if it's not needed.
+     if result and not include_manifest:
+         result.manifest = None
+
+     if commit_transaction:
+         transaction.seal()
+     return result
+
+
+ def _download_delta_distributed(
+     manifest: Manifest,
+     table_type: DatasetType = DatasetType.PYARROW,
+     max_parallelism: Optional[int] = None,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     *args,
+     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+     distributed_dataset_type: Optional[
+         DistributedDatasetType
+     ] = DistributedDatasetType.RAY_DATASET,
+     **kwargs,
+ ) -> DistributedDataset:
+
+     distributed_dataset: DistributedDataset = download_manifest_entries_distributed(
+         manifest=manifest,
+         table_type=table_type,
+         max_parallelism=max_parallelism,
+         column_names=column_names,
+         include_columns=include_columns,
+         file_reader_kwargs_provider=file_reader_kwargs_provider,
+         ray_options_provider=ray_options_provider,
+         distributed_dataset_type=distributed_dataset_type,
+         *args,
+         **kwargs,
+     )
+
+     return distributed_dataset
+
+
+ def _download_delta_local(
+     manifest: Manifest,
+     table_type: DatasetType = DatasetType.PYARROW,
+     max_parallelism: Optional[int] = None,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     *args,
+     **kwargs,
+ ) -> LocalDataset:
+     tables: LocalDataset = download_manifest_entries(
+         manifest,
+         table_type,
+         max_parallelism if max_parallelism else 1,
+         column_names,
+         include_columns,
+         file_reader_kwargs_provider,
+         **kwargs,
+     )
+     return tables
+
+
+ def download_delta(
+     delta_like: Union[Delta, DeltaLocator],
+     table_type: DatasetType = DatasetType.PYARROW,
+     storage_type: StorageType = StorageType.DISTRIBUTED,
+     max_parallelism: Optional[int] = None,
+     columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+     distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
+     file_path_column: Optional[str] = None,
+     *args,
+     transaction: Optional[Transaction] = None,
+     all_column_names: Optional[List[str]] = None,
+     **kwargs,
+ ) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
+     """
+     Read the given delta or delta locator into either a list of
+     tables resident in the local node's memory, or into a dataset distributed
+     across this Ray cluster's object store memory. Ordered table N of a local
+     table list, or ordered block N of a distributed dataset, always contain
+     the contents of ordered delta manifest entry N.
+     """
+     # TODO (pdames): Cast delimited text types to the table's schema types
+     # TODO (pdames): Deprecate this method and replace with `read_delta`
+     # TODO (pdames): Replace dependence on TableType, StorageType, and DistributedDatasetType
+     #  with DatasetType
+
+     # if all column names are provided, then this is a pure manifest entry download (no transaction needed)
+     commit_transaction = False
+     if not all_column_names:
+         transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
+     storage_type_to_download_func = {
+         StorageType.LOCAL: _download_delta_local,
+         StorageType.DISTRIBUTED: _download_delta_distributed,
+     }
+
+     is_delta = isinstance(delta_like, Delta)
+     is_delta_locator = isinstance(delta_like, DeltaLocator)
+
+     delta_locator: Optional[DeltaLocator] = None
+     if is_delta_locator:
+         delta_locator = delta_like
+     elif is_delta:
+         delta_locator = Delta(delta_like).locator
+     if not delta_locator:
+         raise ValueError(
+             f"Expected delta_like to be a Delta or DeltaLocator, but found "
+             f"{type(delta_like)}."
+         )
+
+     # Get manifest - if delta_like is a Delta with a manifest, use it, otherwise fetch from storage
+     if is_delta and delta_like.manifest:
+         manifest = delta_like.manifest
+     elif all_column_names:
+         raise ValueError(
+             "All column names can only be specified with a delta with an inline manifest."
+         )
+     else:
+         manifest = get_delta_manifest(
+             delta_locator,
+             transaction=transaction,
+             *args,
+             **kwargs,
+         )
+     all_column_names = all_column_names or None
+     if not all_column_names:
+         table_version_schema = get_table_version_schema(
+             delta_locator.namespace,
+             delta_locator.table_name,
+             delta_locator.table_version,
+             transaction=transaction,
+             *args,
+             **kwargs,
+         )
+         if table_version_schema and table_version_schema.arrow:
+             all_column_names = [field.name for field in table_version_schema.arrow]
+             if distributed_dataset_type == DatasetType.DAFT:
+                 # Daft needs the latest table version schema to properly handle schema evolution
+                 kwargs["table_version_schema"] = table_version_schema.arrow
+     elif distributed_dataset_type == DatasetType.DAFT:
+ raise ValueError("All column names canot be specified with Daft.")
987
+ if columns:
988
+ # Extract file_path_column since it's appended after reading each file
989
+ columns_to_validate = (
990
+ [col for col in columns if col != file_path_column]
991
+ if file_path_column
992
+ else columns
993
+ )
994
+
995
+ # Only validate columns if we have schema information (all_column_names is not None)
996
+ if all_column_names is not None:
997
+ if not all(
998
+ col in [col_name.lower() for col_name in all_column_names]
999
+ for col in columns_to_validate
1000
+ ):
1001
+ raise SchemaValidationError(
1002
+ f"One or more columns in {columns_to_validate} are not present in table "
1003
+ f"version columns {all_column_names}"
1004
+ )
1005
+ columns = [column.lower() for column in columns]
1006
+ logger.debug(
1007
+ f"Reading {columns or 'all'} columns from table version column "
1008
+ f"names: {all_column_names}. "
1009
+ )
1010
+
1011
+ # Filter out parameters that are already passed as positional/keyword arguments
1012
+ # to avoid "multiple values for argument" errors
1013
+ filtered_kwargs = {
1014
+ k: v
1015
+ for k, v in kwargs.items()
1016
+ if k
1017
+ not in [
1018
+ "manifest",
1019
+ "table_type",
1020
+ "max_parallelism",
1021
+ "column_names",
1022
+ "include_columns",
1023
+ "file_reader_kwargs_provider",
1024
+ "ray_options_provider",
1025
+ "distributed_dataset_type",
1026
+ ]
1027
+ }
1028
+
1029
+ dataset = storage_type_to_download_func[storage_type](
1030
+ manifest,
1031
+ table_type,
1032
+ max_parallelism,
1033
+ all_column_names,
1034
+ columns,
1035
+ file_reader_kwargs_provider,
1036
+ ray_options_provider=ray_options_provider,
1037
+ distributed_dataset_type=distributed_dataset_type,
1038
+ file_path_column=file_path_column,
1039
+ **filtered_kwargs,
1040
+ )
1041
+ if commit_transaction:
1042
+ transaction.seal()
1043
+ return dataset
1044
+
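+ # Editor's note: a minimal usage sketch, not part of the original module. It
+ # assumes the reader above is exposed as `download_delta` (mirroring the
+ # sibling `download_delta_manifest_entry` below), and that `delta` is a
+ # committed Delta with an inline manifest; the enum arguments follow the
+ # signatures visible here and may differ in the actual public signature.
+ def _example_download_delta_locally(delta: Delta) -> LocalDataset:
+     # read each manifest entry into a PyArrow table in this node's memory;
+     # local table N holds the contents of ordered manifest entry N
+     return download_delta(
+         delta,
+         table_type=DatasetType.PYARROW,
+         storage_type=StorageType.LOCAL,
+     )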
1045
+
1046
+ def _download_manifest_entry(
1047
+ manifest_entry: ManifestEntry,
1048
+ table_type: DatasetType = DatasetType.PYARROW,
1049
+ column_names: Optional[List[str]] = None,
1050
+ include_columns: Optional[List[str]] = None,
1051
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
1052
+ content_type: Optional[ContentType] = None,
1053
+ content_encoding: Optional[ContentEncoding] = None,
1054
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
1055
+ ) -> LocalTable:
1056
+
1057
+ return download_manifest_entry(
1058
+ manifest_entry,
1059
+ table_type,
1060
+ column_names,
1061
+ include_columns,
1062
+ file_reader_kwargs_provider,
1063
+ content_type,
1064
+ content_encoding,
1065
+ filesystem,
1066
+ )
1067
+
1068
+
1069
+ def download_delta_manifest_entry(
1070
+ delta_like: Union[Delta, DeltaLocator],
1071
+ entry_index: int,
1072
+ table_type: DatasetType = DatasetType.PYARROW,
1073
+ columns: Optional[List[str]] = None,
1074
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
1075
+ *args,
1076
+ transaction: Optional[Transaction] = None,
1077
+ all_column_names: Optional[List[str]] = None,
1078
+ **kwargs,
1079
+ ) -> LocalTable:
1080
+ """
1081
+ Reads a single manifest entry into the specified table type for the
1082
+ given delta or delta locator. If a delta is provided with a non-empty
1083
+ manifest, then the entry is read from this manifest. Otherwise, the
1084
+     manifest is first retrieved, and then the entry at the given index is read.
1085
+
1086
+     NOTE: The entry will be read into the current node's memory.
1087
+ """
1088
+ # if all column names are provided, then this is a pure manifest entry download (no transaction needed)
1089
+ commit_transaction = False
1090
+ if not all_column_names:
1091
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1092
+
1093
+ is_delta = isinstance(delta_like, Delta)
1094
+ is_delta_locator = isinstance(delta_like, DeltaLocator)
1095
+
1096
+ delta_locator: Optional[DeltaLocator] = None
1097
+ if is_delta_locator:
1098
+ delta_locator = delta_like
1099
+ elif is_delta:
1100
+ delta_locator = Delta(delta_like).locator
1101
+ if not delta_locator:
1102
+ raise ValueError(
1103
+ f"Expected delta_like to be a Delta or DeltaLocator, but found "
1104
+ f"{type(delta_like)}."
1105
+ )
1106
+
1107
+ if is_delta and delta_like.manifest:
1108
+ manifest = delta_like.manifest
1109
+ elif all_column_names:
1110
+ raise ValueError(
1111
+ "All column names can only be specified with a delta with an inline manifest."
1112
+ )
1113
+ else:
1114
+ manifest = get_delta_manifest(
1115
+ delta_locator,
1116
+ transaction=transaction,
1117
+ *args,
1118
+ **kwargs,
1119
+ )
1120
+ # TODO(pdames): Cache table version column names and only invoke when
1121
+ # needed.
1122
+ all_column_names = all_column_names or get_table_version_column_names(
1123
+ delta_locator.namespace,
1124
+ delta_locator.table_name,
1125
+ delta_locator.table_version,
1126
+ transaction=transaction,
1127
+ *args,
1128
+ **kwargs,
1129
+ )
1130
+ if columns:
1131
+ if not all(
1132
+ col in [col_name.lower() for col_name in all_column_names]
1133
+ for col in columns
1134
+ ):
1135
+ raise SchemaValidationError(
1136
+ f"One or more columns in {columns} are not present in table "
1137
+ f"version columns {all_column_names}"
1138
+ )
1139
+ columns = [column.lower() for column in columns]
1140
+ logger.debug(
1141
+ f"Reading {columns or 'all'} columns from table version column "
1142
+ f"names: {all_column_names}. "
1143
+ )
1144
+ catalog_properties = get_catalog_properties(**kwargs)
1145
+ manifest_entry = _download_manifest_entry(
1146
+ manifest.entries[entry_index],
1147
+ table_type,
1148
+ all_column_names,
1149
+ columns,
1150
+ file_reader_kwargs_provider,
1151
+ filesystem=catalog_properties.filesystem,
1152
+ )
1153
+ if commit_transaction:
1154
+ transaction.seal()
1155
+ return manifest_entry
1156
+
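+ # Editor's note: an illustrative sketch, not part of the original module.
+ # Reads only the first manifest entry of a committed delta into a PyArrow
+ # table on the current node; the column names are hypothetical.
+ def _example_read_first_entry(delta: Delta) -> LocalTable:
+     return download_delta_manifest_entry(
+         delta,
+         entry_index=0,
+         table_type=DatasetType.PYARROW,
+         columns=["id", "value"],  # hypothetical columns
+     )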
1157
+
1158
+ def get_delta_manifest(
1159
+ delta_like: Union[Delta, DeltaLocator],
1160
+ *args,
1161
+ **kwargs,
1162
+ ) -> Manifest:
1163
+ """
1164
+ Get the manifest associated with the given delta or delta locator. This
1165
+ always retrieves the authoritative durable copy of the delta manifest, and
1166
+ never the local manifest defined for any input delta. Raises an error if
1167
+ the delta can't be found, or if it doesn't contain a manifest.
1168
+ """
1169
+ if isinstance(delta_like, Delta):
1170
+ delta_locator = delta_like.locator
1171
+ elif isinstance(delta_like, DeltaLocator):
1172
+ delta_locator = delta_like
1173
+ else:
1174
+ raise ValueError(
1175
+ f"Expected delta or delta locator, but got: {type(delta_like)}"
1176
+ )
1177
+ delta = Delta.of(
1178
+ locator=delta_locator,
1179
+ delta_type=None,
1180
+ meta=None,
1181
+ properties=None,
1182
+ manifest=None,
1183
+ )
1184
+ latest_delta: Delta = _latest(
1185
+ metafile=delta,
1186
+ *args,
1187
+ **kwargs,
1188
+ )
1189
+ if not latest_delta:
1190
+ raise DeltaNotFoundError(f"No delta found for locator: {delta_locator}")
1191
+ elif not latest_delta.manifest:
1192
+ raise DeltaNotFoundError(f"No manifest found for delta: {latest_delta}")
1193
+ return latest_delta.manifest
1194
+
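+ # Editor's note: an illustrative sketch, not part of the original module.
+ # Fetches the durable manifest for a delta locator and sums its per-entry
+ # record counts, assuming each entry's `meta.record_count` is populated.
+ def _example_count_delta_records(delta_locator: DeltaLocator) -> int:
+     manifest = get_delta_manifest(delta_locator)
+     return sum(entry.meta.record_count or 0 for entry in manifest.entries)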
1195
+
1196
+ def create_namespace(
1197
+ namespace: str,
1198
+ properties: Optional[NamespaceProperties] = None,
1199
+ *args,
1200
+ transaction: Optional[Transaction] = None,
1201
+ **kwargs,
1202
+ ) -> Namespace:
1203
+ """
1204
+ Creates a table namespace with the given name and properties. Returns
1205
+ the created namespace.
1206
+ """
1207
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1208
+
1209
+ namespace = Namespace.of(
1210
+ locator=NamespaceLocator.of(namespace=namespace),
1211
+ properties=properties,
1212
+ )
1213
+
1214
+ # Add the operation to the transaction
1215
+ transaction.step(
1216
+ TransactionOperation.of(
1217
+ operation_type=TransactionOperationType.CREATE,
1218
+ dest_metafile=namespace,
1219
+ ),
1220
+ )
1221
+
1222
+ if commit_transaction:
1223
+ transaction.seal()
1224
+ return namespace
1225
+
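+ # Editor's note: an illustrative sketch, not part of the original module.
+ # Shows the recurring interactive-transaction pattern: when a transaction is
+ # passed in, each call adds steps without sealing, and the caller commits
+ # once all steps are staged. The identifiers below are hypothetical.
+ def _example_create_namespace_and_table(transaction: Transaction) -> None:
+     create_namespace("demo_namespace", transaction=transaction)
+     create_table("demo_namespace", "events", transaction=transaction)
+     # the caller seals `transaction` to commit both steps atomically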
1226
+
1227
+ def update_namespace(
1228
+ namespace: str,
1229
+ properties: Optional[NamespaceProperties] = None,
1230
+ new_namespace: Optional[str] = None,
1231
+ *args,
1232
+ transaction: Optional[Transaction] = None,
1233
+ **kwargs,
1234
+ ) -> None:
1235
+ """
1236
+ Updates a table namespace's name and/or properties. Raises an error if the
1237
+ given namespace does not exist.
1238
+ """
1239
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1240
+
1241
+ # Check if the namespace exists
1242
+ old_namespace_meta = get_namespace(
1243
+ namespace=namespace,
1244
+ transaction=transaction,
1245
+ *args,
1246
+ **kwargs,
1247
+ )
1248
+ if not old_namespace_meta:
1249
+ raise NamespaceNotFoundError(f"Namespace {namespace} does not exist")
1250
+
1251
+ # Create new namespace metadata
1252
+ new_namespace_meta: Namespace = Metafile.update_for(old_namespace_meta)
1253
+ if new_namespace:
1254
+ new_namespace_meta.locator.namespace = new_namespace
1255
+ if properties is not None:
1256
+ new_namespace_meta.properties = properties
1257
+
1258
+ # Add the update operation to the transaction
1259
+ try:
1260
+ transaction.step(
1261
+ TransactionOperation.of(
1262
+ operation_type=TransactionOperationType.UPDATE,
1263
+ dest_metafile=new_namespace_meta,
1264
+ src_metafile=old_namespace_meta,
1265
+ ),
1266
+ )
1267
+ except ObjectAlreadyExistsError as e:
1268
+ raise NamespaceAlreadyExistsError(
1269
+ f"Namespace {namespace} already exists"
1270
+ ) from e
1271
+
1272
+ if commit_transaction:
1273
+ transaction.seal()
1274
+
1275
+
1276
+ def create_table_version(
1277
+ namespace: str,
1278
+ table_name: str,
1279
+ table_version: Optional[str] = None,
1280
+ lifecycle_state: Optional[LifecycleState] = LifecycleState.CREATED,
1281
+ schema: Optional[Schema] = None,
1282
+ partition_scheme: Optional[PartitionScheme] = None,
1283
+ sort_keys: Optional[SortScheme] = None,
1284
+ table_version_description: Optional[str] = None,
1285
+ table_version_properties: Optional[TableVersionProperties] = None,
1286
+ table_description: Optional[str] = None,
1287
+ table_properties: Optional[TableProperties] = None,
1288
+ supported_content_types: Optional[List[ContentType]] = None,
1289
+ *args,
1290
+ transaction: Optional[Transaction] = None,
1291
+ **kwargs,
1292
+ ) -> Tuple[Table, TableVersion, Stream]:
1293
+ """
1294
+     Create a table version with the given lifecycle state (CREATED by default) and an empty delta
1295
+ stream. Table versions may be schemaless and unpartitioned to improve write
1296
+ performance, or have their writes governed by a schema and partition scheme
1297
+ to improve data consistency and read performance.
1298
+
1299
+ Returns a tuple containing the created/updated table, table version, and
1300
+ stream (respectively).
1301
+
1302
+ Raises an error if the given namespace does not exist.
1303
+ """
1304
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1305
+
1306
+ if not namespace_exists(
1307
+ namespace=namespace,
1308
+ transaction=transaction,
1309
+ *args,
1310
+ **kwargs,
1311
+ ):
1312
+ raise NamespaceNotFoundError(f"Namespace {namespace} does not exist")
1313
+
1314
+ # Validate schemes against schema
1315
+ _validate_schemes_against_schema(schema, partition_scheme, sort_keys)
1316
+
1317
+ # coerce unspecified partition schemes to the unpartitioned scheme
1318
+ partition_scheme = partition_scheme or UNPARTITIONED_SCHEME
1319
+ # coerce unspecified sort schemes to the unsorted scheme
1320
+ sort_keys = sort_keys or UNSORTED_SCHEME
1321
+ # check if a parent table and/or previous table version already exist
1322
+ prev_table_version = None
1323
+ prev_table = get_table(
1324
+ namespace=namespace,
1325
+ table_name=table_name,
1326
+ transaction=transaction,
1327
+ *args,
1328
+ **kwargs,
1329
+ )
1330
+ if not prev_table:
1331
+ # no parent table exists, so we'll create it in this transaction
1332
+ table_txn_op_type = TransactionOperationType.CREATE
1333
+ prev_table = None
1334
+ new_table = Table.of(
1335
+ locator=TableLocator.at(namespace=namespace, table_name=table_name),
1336
+ )
1337
+ table_version = table_version or DEFAULT_TABLE_VERSION
1338
+ else:
1339
+ # the parent table exists, so we'll update it in this transaction
1340
+ table_txn_op_type = TransactionOperationType.UPDATE
1341
+ new_table: Table = Metafile.update_for(prev_table)
1342
+ prev_table_version = prev_table.latest_table_version
1343
+ if not table_version:
1344
+ # generate the next table version ID
1345
+ table_version = TableVersion.next_version(prev_table_version)
1346
+ else:
1347
+ # ensure that the given table version number matches expectations
1348
+ expected_table_version = TableVersion.next_version(prev_table_version)
1349
+ _, version_number = TableVersion.parse_table_version(
1350
+ table_version,
1351
+ )
1352
+ _, expected_version_number = TableVersion.parse_table_version(
1353
+ expected_table_version,
1354
+ )
1355
+ if version_number != expected_version_number:
1356
+ raise TableValidationError(
1357
+ f"Expected to create table version "
1358
+ f"{expected_version_number} but found {version_number}.",
1359
+ )
1360
+ if table_description is not None:
1361
+ new_table.description = table_description
1362
+ if table_properties is not None:
1363
+ new_table.properties = table_properties
1364
+ new_table.latest_table_version = table_version
1365
+ new_table.latest_active_table_version = (
1366
+ table_version if lifecycle_state == LifecycleState.ACTIVE else None
1367
+ )
1368
+ locator = TableVersionLocator.at(
1369
+ namespace=namespace,
1370
+ table_name=table_name,
1371
+ table_version=table_version,
1372
+ )
1373
+ table_version = TableVersion.of(
1374
+ locator=locator,
1375
+ schema=schema,
1376
+ partition_scheme=partition_scheme,
1377
+ description=table_version_description,
1378
+ properties=table_version_properties,
1379
+ content_types=supported_content_types,
1380
+ sort_scheme=sort_keys,
1381
+ watermark=None,
1382
+ lifecycle_state=lifecycle_state,
1383
+ schemas=[schema] if schema else None,
1384
+ partition_schemes=[partition_scheme],
1385
+ sort_schemes=[sort_keys],
1386
+ previous_table_version=prev_table_version,
1387
+ )
1388
+ # create the table version's default deltacat stream in this transaction
1389
+ stream_locator = StreamLocator.of(
1390
+ table_version_locator=locator,
1391
+ stream_id=str(uuid.uuid4()),
1392
+ stream_format=StreamFormat.DELTACAT,
1393
+ )
1394
+ stream = Stream.of(
1395
+ locator=stream_locator,
1396
+ partition_scheme=partition_scheme,
1397
+ state=CommitState.COMMITTED,
1398
+ previous_stream_id=None,
1399
+ watermark=None,
1400
+ )
1401
+ # Add operations to the transaction
1402
+ transaction.step(
1403
+ TransactionOperation.of(
1404
+ operation_type=table_txn_op_type,
1405
+ dest_metafile=new_table,
1406
+ src_metafile=prev_table,
1407
+ ),
1408
+ )
1409
+ transaction.step(
1410
+ TransactionOperation.of(
1411
+ operation_type=TransactionOperationType.CREATE,
1412
+ dest_metafile=table_version,
1413
+ ),
1414
+ )
1415
+ transaction.step(
1416
+ TransactionOperation.of(
1417
+ operation_type=TransactionOperationType.CREATE,
1418
+ dest_metafile=stream,
1419
+ ),
1420
+ )
1421
+
1422
+ if commit_transaction:
1423
+ transaction.seal()
1424
+ return new_table, table_version, stream
1425
+
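+ # Editor's note: an illustrative sketch, not part of the original module.
+ # Creates the first version of a table with a two-field schema; the
+ # identifiers are hypothetical, and `Schema.of` wrapping a PyArrow schema is
+ # an assumption about this module's Schema model.
+ def _example_create_first_table_version() -> Tuple[Table, TableVersion, Stream]:
+     import pyarrow as pa
+     schema = Schema.of(pa.schema([("id", pa.int64()), ("value", pa.string())]))
+     return create_table_version(
+         namespace="demo_namespace",
+         table_name="events",
+         schema=schema,
+         lifecycle_state=LifecycleState.ACTIVE,
+     )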
1426
+
1427
+ def create_table(
1428
+ namespace: str,
1429
+ table_name: str,
1430
+ description: Optional[str] = None,
1431
+ properties: Optional[TableProperties] = None,
1432
+ *args,
1433
+ transaction: Optional[Transaction] = None,
1434
+ **kwargs,
1435
+ ) -> Table:
1436
+ """
1437
+ Create a new table. Raises an error if the given table already exists.
1438
+ """
1439
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1440
+
1441
+ new_table: Table = Table.of(
1442
+ locator=TableLocator.at(namespace=namespace, table_name=table_name),
1443
+ description=description,
1444
+ properties=properties,
1445
+ )
1446
+ try:
1447
+ transaction.step(
1448
+ TransactionOperation.of(
1449
+ operation_type=TransactionOperationType.CREATE,
1450
+ dest_metafile=new_table,
1451
+ ),
1452
+ )
1453
+ except ObjectAlreadyExistsError as e:
1454
+ raise TableAlreadyExistsError(
1455
+ f"Table {namespace}.{table_name} already exists"
1456
+ ) from e
1457
+
1458
+ if commit_transaction:
1459
+ transaction.seal()
1460
+ return new_table
1461
+
1462
+
1463
+ def update_table(
1464
+ namespace: str,
1465
+ table_name: str,
1466
+ description: Optional[str] = None,
1467
+ properties: Optional[TableProperties] = None,
1468
+ new_table_name: Optional[str] = None,
1469
+ *args,
1470
+ transaction: Optional[Transaction] = None,
1471
+ **kwargs,
1472
+ ) -> Table:
1473
+ """
1474
+ Update table metadata describing the table versions it contains. By default,
1475
+ a table's properties are empty, and its description is equal to that given
1476
+ when its first table version was created. Raises an error if the given
1477
+ table does not exist.
1478
+ """
1479
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1480
+
1481
+ old_table = get_table(
1482
+ namespace=namespace,
1483
+ table_name=table_name,
1484
+ transaction=transaction,
1485
+ *args,
1486
+ **kwargs,
1487
+ )
1488
+ if not old_table:
1489
+ raise TableNotFoundError(f"Table `{namespace}.{table_name}` does not exist.")
1490
+ new_table: Table = Metafile.update_for(old_table)
1491
+ new_table.description = description or old_table.description
1492
+ new_table.properties = properties or old_table.properties
1493
+ new_table.table_name = new_table_name or old_table.table_name
1494
+
1495
+ try:
1496
+ transaction.step(
1497
+ TransactionOperation.of(
1498
+ operation_type=TransactionOperationType.UPDATE,
1499
+ dest_metafile=new_table,
1500
+ src_metafile=old_table,
1501
+ ),
1502
+ )
1503
+ except ObjectAlreadyExistsError as e:
1504
+ raise TableAlreadyExistsError(
1505
+ f"Table {namespace}.{table_name} already exists"
1506
+ ) from e
1507
+
1508
+ if commit_transaction:
1509
+ transaction.seal()
1510
+ return new_table
1511
+
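+ # Editor's note: an illustrative sketch, not part of the original module.
+ # Renames a table while leaving its description and properties untouched
+ # (both fall back to the old values above); the identifiers are hypothetical.
+ def _example_rename_table() -> Table:
+     return update_table(
+         namespace="demo_namespace",
+         table_name="events",
+         new_table_name="events_v2",
+     )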
1512
+
1513
+ def update_table_version(
1514
+ namespace: str,
1515
+ table_name: str,
1516
+ table_version: str,
1517
+ lifecycle_state: Optional[LifecycleState] = None,
1518
+ schema: Optional[Schema] = None,
1519
+ description: Optional[str] = None,
1520
+ properties: Optional[TableVersionProperties] = None,
1521
+ partition_scheme: Optional[PartitionScheme] = None,
1522
+ sort_keys: Optional[SortScheme] = None,
1523
+ *args,
1524
+ transaction: Optional[Transaction] = None,
1525
+ **kwargs,
1526
+ ) -> Tuple[Optional[Table], TableVersion, Optional[Stream]]:
1527
+ """
1528
+ Update a table version. Notably, updating an unreleased table version's
1529
+ lifecycle state to 'ACTIVE' telegraphs that it is ready for external
1530
+ consumption, and causes all calls made to consume/produce streams,
1531
+ partitions, or deltas from/to its parent table to automatically resolve to
1532
+ this table version by default (i.e., when the client does not explicitly
1533
+ specify a different table version). Raises an error if the given table
1534
+ version does not exist.
1535
+
1536
+ Note that, to transition a table version from partitioned to unpartitioned,
1537
+     partition_scheme must be explicitly set to UNPARTITIONED_SCHEME. Similarly,
1538
+ to transition a table version from sorted to unsorted, sort_keys must be
1539
+ explicitly set to UNSORTED_SCHEME.
1540
+ """
1541
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1542
+ old_table_version = get_table_version(
1543
+ namespace=namespace,
1544
+ table_name=table_name,
1545
+ table_version=table_version,
1546
+ transaction=transaction,
1547
+ *args,
1548
+ **kwargs,
1549
+ )
1550
+ if not old_table_version:
1551
+ raise TableVersionNotFoundError(
1552
+ f"Table version `{table_version}` does not exist for "
1553
+ f"table `{namespace}.{table_name}`."
1554
+ )
1555
+
1556
+ # If schema is not provided but partition_scheme or sort_keys are,
1557
+ # validate against the existing schema
1558
+ schema_to_validate = schema or old_table_version.schema
1559
+ _validate_schemes_against_schema(schema_to_validate, partition_scheme, sort_keys)
1560
+
1561
+ new_table_version: TableVersion = Metafile.update_for(old_table_version)
1562
+ new_table_version.state = lifecycle_state or old_table_version.state
1563
+
1564
+ # Caller is expected to do all necessary backwards compatibility schema checks
1565
+ update_schema = schema and not schema.equivalent_to(
1566
+ old_table_version.schema,
1567
+ True,
1568
+ )
1569
+ if update_schema and schema.id in [s.id for s in old_table_version.schemas]:
1570
+ raise TableValidationError(
1571
+ f"Schema ID `{schema.id}` already exists in "
1572
+ f"table version `{table_version}`."
1573
+ )
1574
+ new_table_version.schema = schema if update_schema else old_table_version.schema
1575
+ new_table_version.schemas = (
1576
+ old_table_version.schemas + [schema]
1577
+ if update_schema
1578
+ else old_table_version.schemas
1579
+ )
1580
+ new_table_version.description = (
1581
+ description if description is not None else old_table_version.description
1582
+ )
1583
+ new_table_version.properties = (
1584
+ properties if properties is not None else old_table_version.properties
1585
+ )
1586
+ new_supported_reader_types = new_table_version.read_table_property(
1587
+ TableProperty.SUPPORTED_READER_TYPES
1588
+ )
1589
+ if new_supported_reader_types:
1590
+ old_supported_reader_types = (
1591
+ old_table_version.read_table_property(TableProperty.SUPPORTED_READER_TYPES)
1592
+ or {}
1593
+ )
1594
+ added_supported_reader_types = set(new_supported_reader_types) - set(
1595
+ old_supported_reader_types
1596
+ )
1597
+ if added_supported_reader_types:
1598
+ raise TableValidationError(
1599
+ f"Cannot add new supported reader types: {added_supported_reader_types}"
1600
+ )
1601
+ new_table_version.partition_scheme = (
1602
+ partition_scheme or old_table_version.partition_scheme
1603
+ )
1604
+ # TODO(pdames): Check for backwards incompatible partition scheme changes.
1605
+ update_partition_scheme = partition_scheme and not partition_scheme.equivalent_to(
1606
+ old_table_version.partition_scheme,
1607
+ True,
1608
+ )
1609
+ if update_partition_scheme and partition_scheme.id in [
1610
+ ps.id for ps in old_table_version.partition_schemes
1611
+ ]:
1612
+ raise TableValidationError(
1613
+ f"Partition scheme ID `{partition_scheme.id}` already exists in "
1614
+ f"table version `{table_version}`."
1615
+ )
1616
+ new_table_version.partition_schemes = (
1617
+ old_table_version.partition_schemes + [partition_scheme]
1618
+ if update_partition_scheme
1619
+ else old_table_version.partition_schemes
1620
+ )
1621
+ # TODO(pdames): Check for backwards incompatible sort scheme changes.
1622
+ update_sort_scheme = sort_keys and not sort_keys.equivalent_to(
1623
+ old_table_version.sort_scheme,
1624
+ True,
1625
+ )
1626
+ if update_sort_scheme and sort_keys.id in [
1627
+ sk.id for sk in old_table_version.sort_schemes
1628
+ ]:
1629
+ raise TableValidationError(
1630
+ f"Sort scheme ID `{sort_keys.id}` already exists in "
1631
+ f"table version `{table_version}`."
1632
+ )
1633
+ new_table_version.sort_scheme = sort_keys or old_table_version.sort_scheme
1634
+ new_table_version.sort_schemes = (
1635
+ old_table_version.sort_schemes + [sort_keys]
1636
+ if update_sort_scheme
1637
+ else old_table_version.sort_schemes
1638
+ )
1639
+ old_table = get_table(
1640
+ namespace=namespace,
1641
+ table_name=table_name,
1642
+ transaction=transaction,
1643
+ *args,
1644
+ **kwargs,
1645
+ )
1646
+     new_table: Optional[Table] = None
1647
+ if (
1648
+ lifecycle_state == LifecycleState.ACTIVE
1649
+ and old_table_version.state != LifecycleState.ACTIVE
1650
+ ):
1651
+ _, old_version_number = (
1652
+ TableVersion.parse_table_version(
1653
+ old_table.latest_active_table_version,
1654
+ )
1655
+ if old_table.latest_active_table_version
1656
+ else (None, None)
1657
+ )
1658
+ _, new_version_number = TableVersion.parse_table_version(table_version)
1659
+ if old_version_number is None or old_version_number < new_version_number:
1660
+ # update the table's latest table version
1661
+ new_table = Metafile.update_for(old_table)
1662
+ new_table.latest_active_table_version = table_version
1663
+ transaction.step(
1664
+ TransactionOperation.of(
1665
+ operation_type=TransactionOperationType.UPDATE,
1666
+ dest_metafile=new_table,
1667
+ src_metafile=old_table,
1668
+ ),
1669
+ )
1670
+ try:
1671
+ transaction.step(
1672
+ TransactionOperation.of(
1673
+ operation_type=TransactionOperationType.UPDATE,
1674
+ dest_metafile=new_table_version,
1675
+ src_metafile=old_table_version,
1676
+ ),
1677
+ )
1678
+ except ObjectAlreadyExistsError as e:
1679
+ raise TableVersionAlreadyExistsError(
1680
+ f"Table version {namespace}.{table_name}.{table_version} already exists"
1681
+ ) from e
1682
+
1683
+ # TODO(pdames): Push changes down to non-deltacat streams via sync module.
1684
+ # Also copy sort scheme changes down to deltacat child stream?
1685
+     new_stream: Optional[Stream] = None
1686
+ if partition_scheme:
1687
+ old_stream = get_stream(
1688
+ namespace=namespace,
1689
+ table_name=table_name,
1690
+ table_version=table_version,
1691
+ transaction=transaction,
1692
+ *args,
1693
+ **kwargs,
1694
+ )
1695
+ new_stream = Metafile.update_for(old_stream)
1696
+ new_stream.partition_scheme = partition_scheme
1697
+ transaction.step(
1698
+ TransactionOperation.of(
1699
+ operation_type=TransactionOperationType.UPDATE,
1700
+ dest_metafile=new_stream,
1701
+ src_metafile=old_stream,
1702
+ ),
1703
+ )
1704
+ if commit_transaction:
1705
+ transaction.seal()
1706
+ return new_table, new_table_version, new_stream
1707
+
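+ # Editor's note: an illustrative sketch, not part of the original module.
+ # Promotes a staged table version to ACTIVE so that unqualified reads and
+ # writes against the parent table resolve to it by default; the version
+ # identifier format shown is hypothetical.
+ def _example_activate_table_version() -> Tuple[Optional[Table], TableVersion, Optional[Stream]]:
+     return update_table_version(
+         namespace="demo_namespace",
+         table_name="events",
+         table_version="2",
+         lifecycle_state=LifecycleState.ACTIVE,
+     )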
1708
+
1709
+ def stage_stream(
1710
+ namespace: str,
1711
+ table_name: str,
1712
+ table_version: Optional[str] = None,
1713
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
1714
+ *args,
1715
+ transaction: Optional[Transaction] = None,
1716
+ **kwargs,
1717
+ ) -> Stream:
1718
+ """
1719
+ Stages a new delta stream for the given table version. Resolves to the
1720
+ latest active table version if no table version is given. Resolves to the
1721
+ DeltaCAT stream format if no stream format is given. If this stream
1722
+ will replace another stream with the same format and scheme, then it will
1723
+ have its previous stream ID set to the ID of the stream being replaced.
1724
+ Returns the staged stream. Raises an error if the table version does not
1725
+ exist.
1726
+ """
1727
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1728
+
1729
+ if not table_version:
1730
+ table_version = _resolve_latest_active_table_version_id(
1731
+ namespace=namespace,
1732
+ table_name=table_name,
1733
+ transaction=transaction,
1734
+ *args,
1735
+ **kwargs,
1736
+ )
1737
+ table_version_meta = get_table_version(
1738
+ namespace=namespace,
1739
+ table_name=table_name,
1740
+ table_version=table_version,
1741
+ transaction=transaction,
1742
+ *args,
1743
+ **kwargs,
1744
+ )
1745
+ if not table_version_meta:
1746
+ raise TableVersionNotFoundError(
1747
+ f"Table version not found: {namespace}.{table_name}.{table_version}."
1748
+ )
1749
+ locator = StreamLocator.at(
1750
+ namespace=namespace,
1751
+ table_name=table_name,
1752
+ table_version=table_version,
1753
+ stream_id=str(uuid.uuid4()),
1754
+ stream_format=stream_format or StreamFormat.DELTACAT,
1755
+ )
1756
+ stream = Stream.of(
1757
+ locator=locator,
1758
+ partition_scheme=table_version_meta.partition_scheme,
1759
+ state=CommitState.STAGED,
1760
+ previous_stream_id=None,
1761
+ watermark=None,
1762
+ )
1763
+ prev_stream = get_stream(
1764
+ namespace=stream.namespace,
1765
+ table_name=stream.table_name,
1766
+ table_version=stream.table_version,
1767
+ stream_format=stream.stream_format,
1768
+ transaction=transaction,
1769
+ *args,
1770
+ **kwargs,
1771
+ )
1772
+ if prev_stream:
1773
+ if prev_stream.stream_id == stream.stream_id:
1774
+ raise TableValidationError(
1775
+ f"Stream to stage has the same ID as existing stream: {prev_stream}."
1776
+ )
1777
+ stream.previous_stream_id = prev_stream.stream_id
1778
+ # Add the operation to the transaction
1779
+ transaction.step(
1780
+ TransactionOperation.of(
1781
+ operation_type=TransactionOperationType.CREATE,
1782
+ dest_metafile=stream,
1783
+ ),
1784
+ )
1785
+
1786
+ if commit_transaction:
1787
+ transaction.seal()
1788
+ return stream
1789
+
1790
+
1791
+ def commit_stream(
1792
+ stream: Stream,
1793
+ *args,
1794
+ transaction: Optional[Transaction] = None,
1795
+ **kwargs,
1796
+ ) -> Stream:
1797
+ """
1798
+ Registers a staged delta stream with a target table version, replacing any
1799
+ previous stream registered for the same table version. Returns the
1800
+ committed stream.
1801
+ """
1802
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1803
+
1804
+ if not stream.stream_id:
1805
+ raise ValueError("Stream ID to commit must be set to a staged stream ID.")
1806
+ if not stream.table_version_locator:
1807
+ raise ValueError(
1808
+ "Stream to commit must have its table version locator "
1809
+ "set to the parent of its staged stream ID."
1810
+ )
1811
+ prev_staged_stream = get_stream_by_id(
1812
+ table_version_locator=stream.table_version_locator,
1813
+ stream_id=stream.stream_id,
1814
+ transaction=transaction,
1815
+ *args,
1816
+ **kwargs,
1817
+ )
1818
+ if not prev_staged_stream:
1819
+ raise StreamNotFoundError(
1820
+ f"Stream at table version {stream.table_version_locator} with ID "
1821
+ f"{stream.stream_id} not found."
1822
+ )
1823
+ if prev_staged_stream.state != CommitState.STAGED:
1824
+ raise TableValidationError(
1825
+ f"Expected to find a `{CommitState.STAGED}` stream at table version "
1826
+ f"{stream.table_version_locator} with ID {stream.stream_id},"
1827
+ f"but found a `{prev_staged_stream.state}` partition."
1828
+ )
1829
+ stream: Stream = Metafile.update_for(prev_staged_stream)
1830
+ stream.state = CommitState.COMMITTED
1831
+ prev_committed_stream = get_stream(
1832
+ namespace=stream.namespace,
1833
+ table_name=stream.table_name,
1834
+ table_version=stream.table_version,
1835
+ stream_format=stream.stream_format,
1836
+ transaction=transaction,
1837
+ *args,
1838
+ **kwargs,
1839
+ )
1840
+ if prev_committed_stream:
1841
+ # there's a previously committed stream, so update the transaction
1842
+ # type to overwrite the previously committed stream
1843
+ txn_op_type = TransactionOperationType.REPLACE
1844
+ else:
1845
+ txn_op_type = TransactionOperationType.UPDATE
1846
+
1847
+ # the first transaction operation updates the staged stream commit state
1848
+ transaction.step(
1849
+ TransactionOperation.of(
1850
+ operation_type=txn_op_type,
1851
+ dest_metafile=stream,
1852
+ src_metafile=prev_staged_stream,
1853
+ ),
1854
+ )
1855
+ if prev_committed_stream:
1856
+ if prev_committed_stream.stream_id != stream.previous_stream_id:
1857
+ raise ConcurrentModificationError(
1858
+ f"Previous stream ID mismatch Expected "
1859
+ f"{stream.previous_stream_id} but found "
1860
+ f"{prev_committed_stream.stream_id}."
1861
+ )
1862
+ if prev_committed_stream.stream_id == stream.stream_id:
1863
+ raise TableValidationError(
1864
+ f"Stream to commit has the same ID as existing stream: {prev_committed_stream}."
1865
+ )
1866
+ # add another transaction operation to replace the previously committed stream
1867
+ # with the staged stream
1868
+ transaction.step(
1869
+ TransactionOperation.of(
1870
+ operation_type=txn_op_type,
1871
+ dest_metafile=stream,
1872
+ src_metafile=prev_committed_stream,
1873
+ ),
1874
+ )
1875
+ if commit_transaction:
1876
+ transaction.seal()
1877
+ return stream
1878
+
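+ # Editor's note: an illustrative sketch, not part of the original module.
+ # Walks the stage -> commit stream lifecycle enforced above; it assumes an
+ # active table version already exists for the hypothetical table below.
+ def _example_replace_stream() -> Stream:
+     staged = stage_stream(
+         namespace="demo_namespace",
+         table_name="events",
+     )
+     # `staged.previous_stream_id` now references any stream being replaced
+     return commit_stream(staged)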
1879
+
1880
+ def delete_stream(
1881
+ namespace: str,
1882
+ table_name: str,
1883
+ table_version: Optional[str] = None,
1884
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
1885
+ *args,
1886
+ transaction: Optional[Transaction] = None,
1887
+ **kwargs,
1888
+ ) -> None:
1889
+ """
1890
+ Deletes the delta stream currently registered with the given table version.
1891
+ Resolves to the latest active table version if no table version is given.
1892
+ Resolves to the deltacat stream format if no stream format is given.
1893
+ Raises an error if the stream does not exist.
1894
+ """
1895
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1896
+
1897
+ if not table_version:
1898
+ table_version = _resolve_latest_active_table_version_id(
1899
+ namespace=namespace,
1900
+ table_name=table_name,
1901
+ transaction=transaction,
1902
+ *args,
1903
+ **kwargs,
1904
+ )
1905
+ stream_to_delete = get_stream(
1906
+ namespace=namespace,
1907
+ table_name=table_name,
1908
+ table_version=table_version,
1909
+ stream_format=stream_format,
1910
+ transaction=transaction,
1911
+ *args,
1912
+ **kwargs,
1913
+ )
1914
+ if not stream_to_delete:
1915
+ raise StreamNotFoundError(
1916
+ f"Stream to delete not found: {namespace}.{table_name}"
1917
+ f".{table_version}.{stream_format}."
1918
+ )
1919
+ else:
1920
+ stream_to_delete.state = CommitState.DEPRECATED
1921
+
1922
+ transaction.step(
1923
+ TransactionOperation.of(
1924
+ operation_type=TransactionOperationType.DELETE,
1925
+ dest_metafile=stream_to_delete,
1926
+ ),
1927
+ )
1928
+
1929
+ if commit_transaction:
1930
+ transaction.seal()
1931
+
1932
+
1933
+ def delete_table(
1934
+ namespace: str,
1935
+ table_name: str,
1936
+ purge: bool = False,
1937
+ *args,
1938
+ transaction: Optional[Transaction] = None,
1939
+ **kwargs,
1940
+ ) -> None:
1941
+ """
1942
+ Drops the given table from the catalog. If purge is True, also removes
1943
+ all data files associated with the table. Raises an error if the given table
1944
+ does not exist.
1945
+ """
1946
+ if purge:
1947
+ raise NotImplementedError("Purge flag is not currently supported.")
1948
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1949
+
1950
+ table: Optional[Table] = get_table(
1951
+ namespace=namespace,
1952
+ table_name=table_name,
1953
+ transaction=transaction,
1954
+ *args,
1955
+ **kwargs,
1956
+ )
1957
+
1958
+ if not table:
1959
+ # TODO(pdames): Refactor this so that it doesn't initialize Ray
1960
+ raise TableNotFoundError(f"Table `{namespace}.{table_name}` does not exist.")
1961
+
1962
+ transaction.step(
1963
+ TransactionOperation.of(
1964
+ operation_type=TransactionOperationType.DELETE,
1965
+ dest_metafile=table,
1966
+ ),
1967
+ )
1968
+
1969
+ if commit_transaction:
1970
+ transaction.seal()
1971
+
1972
+
1973
+ def delete_namespace(
1974
+ namespace: str,
1975
+ purge: bool = False,
1976
+ *args,
1977
+ transaction: Optional[Transaction] = None,
1978
+ **kwargs,
1979
+ ) -> None:
1980
+ """
1981
+ Drops the given namespace from the catalog. If purge is True, also removes
1982
+ all data files associated with the namespace. Raises an error if the given
1983
+ namespace does not exist.
1984
+ """
1985
+ if purge:
1986
+ raise NotImplementedError("Purge flag is not currently supported.")
1987
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
1988
+
1989
+ namespace_obj: Optional[Namespace] = get_namespace(
1990
+ namespace=namespace,
1991
+ transaction=transaction,
1992
+ *args,
1993
+ **kwargs,
1994
+ )
1995
+
1996
+ if not namespace_obj:
1997
+ raise NamespaceNotFoundError(f"Namespace `{namespace}` does not exist.")
1998
+
1999
+ transaction.step(
2000
+ TransactionOperation.of(
2001
+ operation_type=TransactionOperationType.DELETE,
2002
+ dest_metafile=namespace_obj,
2003
+ ),
2004
+ )
2005
+
2006
+ if commit_transaction:
2007
+ transaction.seal()
2008
+
2009
+
2010
+ def get_stream_by_id(
2011
+ table_version_locator: TableVersionLocator,
2012
+ stream_id: str,
2013
+ *args,
2014
+ **kwargs,
2015
+ ) -> Optional[Stream]:
2016
+ """
2017
+ Gets the stream for the given table version locator and stream ID.
2018
+ Returns None if the stream does not exist. Raises an error if the given
2019
+ table version locator does not exist.
2020
+ """
2021
+ locator = StreamLocator.of(
2022
+ table_version_locator=table_version_locator,
2023
+ stream_id=stream_id,
2024
+ stream_format=None,
2025
+ )
2026
+ return _latest(
2027
+ metafile=Stream.of(locator=locator, partition_scheme=None),
2028
+ *args,
2029
+ **kwargs,
2030
+ )
2031
+
2032
+
2033
+ def get_stream(
2034
+ namespace: str,
2035
+ table_name: str,
2036
+ table_version: Optional[str] = None,
2037
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
2038
+ *args,
2039
+ transaction: Optional[Transaction] = None,
2040
+ **kwargs,
2041
+ ) -> Optional[Stream]:
2042
+ """
2043
+ Gets the most recently committed stream for the given table version.
2044
+ Resolves to the latest active table version if no table version is given.
2045
+ Resolves to the DeltaCAT stream format if no stream format is given.
2046
+ Returns None if the table version or stream format does not exist.
2047
+ """
2048
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2049
+ if not table_version:
2050
+ table_version = _resolve_latest_active_table_version_id(
2051
+ namespace=namespace,
2052
+ table_name=table_name,
2053
+ fail_if_no_active_table_version=False,
2054
+ transaction=transaction,
2055
+ *args,
2056
+ **kwargs,
2057
+ )
2058
+ locator = StreamLocator.at(
2059
+ namespace=namespace,
2060
+ table_name=table_name,
2061
+ table_version=table_version,
2062
+ stream_id=None,
2063
+ stream_format=stream_format,
2064
+ )
2065
+ stream = _latest(
2066
+ metafile=Stream.of(
2067
+ locator=locator,
2068
+ partition_scheme=None,
2069
+ state=CommitState.COMMITTED,
2070
+ ),
2071
+ transaction=transaction,
2072
+ *args,
2073
+ **kwargs,
2074
+ )
2075
+ if commit_transaction:
2076
+ transaction.seal()
2077
+ return stream
2078
+
2079
+
2080
+ def stream_exists(
2081
+ namespace: str,
2082
+ table_name: str,
2083
+ table_version: Optional[str] = None,
2084
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
2085
+ *args,
2086
+ transaction: Optional[Transaction] = None,
2087
+ **kwargs,
2088
+ ) -> bool:
2089
+ """
2090
+ Returns True if the given Stream exists, False if not.
2091
+ Resolves to the latest active table version if no table version is given.
2092
+ Resolves to the DeltaCAT stream format if no stream format is given.
2093
+     Returns False if the table version or stream format does not exist.
2094
+ """
2095
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2096
+ if not table_version:
2097
+ table_version = _resolve_latest_active_table_version_id(
2098
+ namespace=namespace,
2099
+ table_name=table_name,
2100
+ fail_if_no_active_table_version=False,
2101
+ transaction=transaction,
2102
+ *args,
2103
+ **kwargs,
2104
+ )
2105
+
2106
+     # Check stream existence at the resolved table version
2107
+ locator = StreamLocator.at(
2108
+ namespace=namespace,
2109
+ table_name=table_name,
2110
+ table_version=table_version,
2111
+ stream_id=None,
2112
+ stream_format=stream_format,
2113
+ )
2114
+ exists = _exists(
2115
+ metafile=Stream.of(
2116
+ locator=locator,
2117
+ partition_scheme=None,
2118
+ state=CommitState.COMMITTED,
2119
+ ),
2120
+ transaction=transaction,
2121
+ *args,
2122
+ **kwargs,
2123
+ )
2124
+ if commit_transaction:
2125
+ transaction.seal()
2126
+ return exists
2127
+
2128
+
2129
+ def stage_partition(
2130
+ stream: Stream,
2131
+ partition_values: Optional[PartitionValues] = None,
2132
+ partition_scheme_id: Optional[str] = None,
2133
+ *args,
2134
+ transaction: Optional[Transaction] = None,
2135
+ **kwargs,
2136
+ ) -> Partition:
2137
+ """
2138
+ Stages a new partition for the given stream and partition values. Returns
2139
+ the staged partition. If this partition will replace another partition
2140
+ with the same partition values and scheme, then it will have its previous
2141
+ partition ID set to the ID of the partition being replaced. Partition values
2142
+ should not be specified for unpartitioned tables.
2143
+
2144
+ The partition_values must represent the results of transforms in a partition
2145
+ spec specified in the stream.
2146
+ """
2147
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2148
+
2149
+ # TODO(pdames): Cache last retrieved metafile revisions in memory to resolve
2150
+ # potentially high cost of staging many partitions.
2151
+ table_version = get_table_version(
2152
+ namespace=stream.namespace,
2153
+ table_name=stream.table_name,
2154
+ table_version=stream.table_version,
2155
+ transaction=transaction,
2156
+ *args,
2157
+ **kwargs,
2158
+ )
2159
+ if not table_version:
2160
+ raise TableVersionNotFoundError(
2161
+ f"Table version not found: {stream.namespace}.{stream.table_name}."
2162
+ f"{stream.table_version}."
2163
+ )
2164
+ # Set partition_scheme_id to UNPARTITIONED_SCHEME_ID when partition_values
2165
+ # is None or empty
2166
+ if not partition_values:
2167
+ partition_scheme_id = UNPARTITIONED_SCHEME_ID
2168
+ # Use stream's partition scheme ID if none provided and partition_values
2169
+ # are specified
2170
+ elif partition_scheme_id is None:
2171
+ partition_scheme_id = stream.partition_scheme.id
2172
+ if not table_version.partition_schemes or partition_scheme_id not in [
2173
+ ps.id for ps in table_version.partition_schemes
2174
+ ]:
2175
+ raise TableValidationError(
2176
+ f"Invalid partition scheme ID `{partition_scheme_id}` (not found "
2177
+ f"in parent table version `{stream.namespace}.{stream.table_name}"
2178
+ f".{table_version.table_version}` partition scheme IDs)."
2179
+ )
2180
+ if stream.partition_scheme.id not in [
2181
+ ps.id for ps in table_version.partition_schemes
2182
+ ]:
2183
+ # this should never happen, but just in case
2184
+ raise TableValidationError(
2185
+ f"Invalid stream partition scheme ID `{stream.partition_scheme.id}`"
2186
+ f" (not found in parent table version "
2187
+ f"`{stream.namespace}.{stream.table_name}"
2188
+ f".{table_version.table_version}` partition scheme IDs)."
2189
+ )
2190
+
2191
+ if partition_values:
2192
+ if partition_scheme_id == UNPARTITIONED_SCHEME_ID:
2193
+ raise TableValidationError(
2194
+ "Partition values cannot be specified for unpartitioned tables"
2195
+ )
2196
+ # Validate partition values against partition scheme
2197
+ partition_scheme = next(
2198
+ ps for ps in table_version.partition_schemes if ps.id == partition_scheme_id
2199
+ )
2200
+ _validate_partition_values_against_scheme(
2201
+ partition_values=partition_values,
2202
+ partition_scheme=partition_scheme,
2203
+ schema=table_version.schema,
2204
+ )
2205
+
2206
+ locator = PartitionLocator.of(
2207
+ stream_locator=stream.locator,
2208
+ partition_values=partition_values,
2209
+ partition_id=str(uuid.uuid4()),
2210
+ )
2211
+ partition = Partition.of(
2212
+ locator=locator,
2213
+ content_types=table_version.content_types,
2214
+ state=CommitState.STAGED,
2215
+ previous_stream_position=None,
2216
+ previous_partition_id=None,
2217
+ stream_position=None,
2218
+ partition_scheme_id=partition_scheme_id,
2219
+ )
2220
+ prev_partition = get_partition(
2221
+ stream_locator=stream.locator,
2222
+ partition_values=partition_values,
2223
+ partition_scheme_id=partition_scheme_id,
2224
+ transaction=transaction,
2225
+ *args,
2226
+ **kwargs,
2227
+ )
2228
+ prev_partition_id = prev_partition.partition_id if prev_partition else None
2229
+
2230
+ # TODO(pdames): Check all historic partitions for the same partition ID
2231
+ if prev_partition_id == partition.partition_id:
2232
+ raise TableValidationError(
2233
+ f"Partition to stage has the same ID as previous partition: {prev_partition_id}."
2234
+ )
2235
+ partition.previous_partition_id = prev_partition_id
2236
+
2237
+ # Add the operation to the transaction
2238
+ transaction.step(
2239
+ TransactionOperation.of(
2240
+ operation_type=TransactionOperationType.CREATE,
2241
+ dest_metafile=partition,
2242
+ ),
2243
+ )
2244
+
2245
+ if commit_transaction:
2246
+ transaction.seal()
2247
+ return partition
2248
+
2249
+
2250
+ def commit_partition(
2251
+ partition: Partition,
2252
+ previous_partition: Optional[Partition] = None,
2253
+ *args,
2254
+ transaction: Optional[Transaction] = None,
2255
+ **kwargs,
2256
+ ) -> Partition:
2257
+ """
2258
+ Commits the staged partition to its associated table version stream,
2259
+ replacing any previous partition registered for the same stream and
2260
+ partition values. All values set on the input partition except compaction
2261
+ round completion info will be overwritten with the values stored in the
2262
+ staged partition.
2263
+
2264
+ If previous partition is given then it will be replaced with its deltas
2265
+ prepended to the new partition being committed. Otherwise the latest
2266
+ committed partition with the same keys and partition scheme ID will be
2267
+ retrieved.
2268
+
2269
+ Returns the registered partition. If the partition's
2270
+ previous delta stream position is specified, then the commit will
2271
+ be rejected if it does not match the actual previous stream position of
2272
+ the partition being replaced. If the partition's previous partition ID is
2273
+ specified, then the commit will be rejected if it does not match the actual
2274
+ ID of the partition being replaced.
2275
+ """
2276
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2277
+
2278
+ if previous_partition:
2279
+ raise NotImplementedError(
2280
+ f"delta prepending from previous partition {previous_partition} "
2281
+ f"is not yet implemented"
2282
+ )
2283
+ if not partition.partition_id:
2284
+ raise ValueError("Partition ID to commit must be set to a staged partition ID.")
2285
+ if not partition.stream_locator:
2286
+ raise ValueError(
2287
+ "Partition to commit must have its stream locator "
2288
+ "set to the parent of its staged partition ID."
2289
+ )
2290
+
2291
+ # Start a single multi-step transaction for all operations (both read and write)
2292
+ # Step 1: Get the staged partition using transaction
2293
+ prev_staged_partition = get_partition_by_id(
2294
+ stream_locator=partition.stream_locator,
2295
+ partition_id=partition.partition_id,
2296
+ transaction=transaction,
2297
+ *args,
2298
+ **kwargs,
2299
+ )
2300
+
2301
+ # Validate staged partition
2302
+ if not prev_staged_partition:
2303
+ raise PartitionNotFoundError(
2304
+ f"Partition at stream {partition.stream_locator} with ID "
2305
+ f"{partition.partition_id} not found."
2306
+ )
2307
+ if prev_staged_partition.state != CommitState.STAGED:
2308
+ raise TableValidationError(
2309
+ f"Expected to find a `{CommitState.STAGED}` partition at stream "
2310
+ f"{partition.stream_locator} with ID {partition.partition_id},"
2311
+ f"but found a `{prev_staged_partition.state}` partition."
2312
+ )
2313
+
2314
+ # Step 2: Check for existing committed partition
2315
+ prev_committed_partition = None
2316
+ if partition.previous_partition_id is not None:
2317
+ prev_committed_partition = get_partition(
2318
+ stream_locator=partition.stream_locator,
2319
+ partition_values=partition.partition_values,
2320
+ partition_scheme_id=partition.partition_scheme_id,
2321
+ transaction=transaction,
2322
+ *args,
2323
+ **kwargs,
2324
+ )
2325
+
2326
+ # Validate expected previous partition ID for race condition detection
2327
+ if prev_committed_partition:
2328
+ logger.info(
2329
+ f"Checking previous committed partition for conflicts: {prev_committed_partition}"
2330
+ )
2331
+ if prev_committed_partition.partition_id != partition.previous_partition_id:
2332
+ raise ConcurrentModificationError(
2333
+ f"Concurrent modification detected: Expected committed partition "
2334
+ f"{partition.previous_partition_id} but found "
2335
+ f"{prev_committed_partition.partition_id}."
2336
+ )
2337
+
2338
+ if prev_committed_partition:
2339
+ # Update transaction type based on what we found
2340
+ txn_op_type = TransactionOperationType.REPLACE
2341
+ if prev_committed_partition.partition_id == partition.partition_id:
2342
+ raise TableValidationError(
2343
+ f"Partition to commit has the same ID as existing partition: "
2344
+ f"{prev_committed_partition}."
2345
+ )
2346
+ else:
2347
+ txn_op_type = TransactionOperationType.UPDATE
2348
+
2349
+ # Prepare the committed partition based on the staged partition
2350
+ # Compaction round completion info (if any) is not set on the staged partition,
2351
+ # so we need to save it from the input partition to commit.
2352
+ input_partition_rci = partition.compaction_round_completion_info
2353
+ partition: Partition = Metafile.update_for(prev_staged_partition)
2354
+ partition.state = CommitState.COMMITTED
2355
+ # Restore compaction round completion info (if any) from the input partition.
2356
+ if input_partition_rci is not None:
2357
+ partition.compaction_round_completion_info = input_partition_rci
2358
+
2359
+ # Step 4: Add write operations to the same transaction
2360
+ # Always UPDATE the staged partition to committed state
2361
+ transaction.step(
2362
+ TransactionOperation.of(
2363
+ operation_type=txn_op_type,
2364
+ dest_metafile=partition,
2365
+ src_metafile=prev_staged_partition,
2366
+ ),
2367
+ )
2368
+
2369
+ # If there's a previously committed partition, we need to replace it too
2370
+ if prev_committed_partition:
2371
+ transaction.step(
2372
+ TransactionOperation.of(
2373
+ operation_type=txn_op_type,
2374
+ dest_metafile=partition,
2375
+ src_metafile=prev_committed_partition,
2376
+ ),
2377
+ )
2378
+
2379
+ if commit_transaction:
2380
+ transaction.seal()
2381
+
2382
+ return partition
2383
+
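+ # Editor's note: an illustrative sketch, not part of the original module.
+ # Stages and commits a partition against an unpartitioned stream; for a
+ # partitioned table, `partition_values` matching the stream's partition
+ # scheme transforms would be passed to `stage_partition` instead.
+ def _example_commit_unpartitioned_partition(stream: Stream) -> Partition:
+     staged = stage_partition(stream)
+     return commit_partition(staged)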
2384
+
2385
+ def delete_partition(
2386
+ stream_locator: StreamLocator,
2387
+ partition_values: Optional[PartitionValues] = None,
2388
+ partition_scheme_id: Optional[str] = None,
2389
+ *args,
2390
+ transaction: Optional[Transaction] = None,
2391
+ **kwargs,
2392
+ ) -> None:
2393
+ """
2394
+ Deletes the given partition from the specified stream. Partition
2395
+ values should not be specified for unpartitioned tables. Raises an error
2396
+ if the partition does not exist.
2397
+ """
2398
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2399
+
2400
+ partition_to_delete = get_partition(
2401
+ stream_locator=stream_locator,
2402
+ partition_values=partition_values,
2403
+ partition_scheme_id=partition_scheme_id,
2404
+ transaction=transaction,
2405
+ *args,
2406
+ **kwargs,
2407
+ )
2408
+ if not partition_to_delete:
2409
+ raise PartitionNotFoundError(
2410
+ f"Partition with values {partition_values} and scheme "
2411
+ f"{partition_scheme_id} not found in stream: {stream_locator}"
2412
+ )
2413
+ else:
2414
+ partition_to_delete.state = CommitState.DEPRECATED
2415
+
2416
+ transaction.step(
2417
+ TransactionOperation.of(
2418
+ operation_type=TransactionOperationType.DELETE,
2419
+ dest_metafile=partition_to_delete,
2420
+ ),
2421
+ )
2422
+
2423
+ if commit_transaction:
2424
+ transaction.seal()
2425
+
2426
+
2427
+ def get_partition_by_id(
2428
+ stream_locator: StreamLocator,
2429
+ partition_id: str,
2430
+ *args,
2431
+ **kwargs,
2432
+ ) -> Optional[Partition]:
2433
+ """
2434
+ Gets the partition for the given stream locator and partition ID.
2435
+ Returns None if the partition does not exist. Raises an error if the
2436
+ given stream locator does not exist.
2437
+ """
2438
+ locator = PartitionLocator.of(
2439
+ stream_locator=stream_locator,
2440
+ partition_values=None,
2441
+ partition_id=partition_id,
2442
+ )
2443
+ return _latest(
2444
+ metafile=Partition.of(
2445
+ locator=locator,
2446
+ content_types=None,
2447
+ ),
2448
+ *args,
2449
+ **kwargs,
2450
+ )
2451
+
2452
+
2453
+ def get_partition(
2454
+ stream_locator: StreamLocator,
2455
+ partition_values: Optional[PartitionValues] = None,
2456
+ partition_scheme_id: Optional[str] = None,
2457
+ *args,
2458
+ transaction: Optional[Transaction] = None,
2459
+ **kwargs,
2460
+ ) -> Optional[Partition]:
2461
+ """
2462
+ Gets the most recently committed partition for the given stream locator and
2463
+ partition key values. Returns None if no partition has been committed for
2464
+ the given table version and/or partition key values. Partition values
2465
+ should not be specified for unpartitioned tables. Partition scheme ID
2466
+ resolves to the table version's current partition scheme by default.
2467
+ Raises an error if the given stream locator does not exist.
2468
+ """
2469
+ transaction, commit_transaction = setup_transaction(transaction, **kwargs)
2470
+ if not partition_scheme_id or not stream_locator.stream_id:
2471
+ # resolve latest partition scheme from the current
2472
+ # revision of its `deltacat` stream
2473
+ stream = get_stream(
2474
+ namespace=stream_locator.namespace,
2475
+ table_name=stream_locator.table_name,
2476
+ table_version=stream_locator.table_version,
2477
+ transaction=transaction,
2478
+ *args,
2479
+ **kwargs,
2480
+ )
2481
+ if not stream:
2482
+ raise StreamNotFoundError(f"Stream {stream_locator} not found.")
2483
+ partition_scheme_id = stream.partition_scheme.id
2484
+ # ensure that we always use a fully qualified stream locator
2485
+ stream_locator = stream.locator
2486
+ locator = PartitionLocator.of(
2487
+ stream_locator=stream_locator,
2488
+ partition_values=partition_values,
2489
+ partition_id=None,
2490
+ )
2491
+ partition = _latest(
2492
+ metafile=Partition.of(
2493
+ locator=locator,
2494
+ content_types=None,
2495
+ state=CommitState.COMMITTED,
2496
+ partition_scheme_id=partition_scheme_id,
2497
+ ),
2498
+ transaction=transaction,
2499
+ *args,
2500
+ **kwargs,
2501
+ )
2502
+ if commit_transaction:
2503
+ transaction.seal()
2504
+ return partition
2505
+
2506
+
2507
+ def _write_table_slices(
2508
+ table: Union[LocalTable, LocalDataset, DistributedDataset],
2509
+ partition_id: str,
2510
+ max_records_per_entry: Optional[int],
2511
+ table_writer_fn: Callable,
2512
+ table_slicer_fn: Callable,
2513
+ content_type: ContentType = ContentType.PARQUET,
2514
+ entry_params: Optional[EntryParams] = None,
2515
+ entry_type: Optional[EntryType] = EntryType.DATA,
2516
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
2517
+ **kwargs,
2518
+ ) -> ManifestEntryList:
2519
+ catalog_properties = get_catalog_properties(**kwargs)
2520
+ manifest_entries = ManifestEntryList()
2521
+ # LocalDataset is a special case to upload iteratively
2522
+ tables = [t for t in table] if isinstance(table, list) else [table]
2523
+ filesystem = catalog_properties.filesystem
2524
+ data_dir_path = posixpath.join(
2525
+ catalog_properties.root,
2526
+ DATA_FILE_DIR_NAME,
2527
+ partition_id,
2528
+ )
2529
+ filesystem.create_dir(data_dir_path, recursive=True)
2530
+ for t in tables:
2531
+ manifest_entries.extend(
2532
+ write_sliced_table(
2533
+ t,
2534
+ data_dir_path,
2535
+ filesystem,
2536
+ max_records_per_entry,
2537
+ table_writer_fn,
2538
+ table_slicer_fn,
2539
+ table_writer_kwargs,
2540
+ content_type,
2541
+ entry_params,
2542
+ entry_type,
2543
+ )
2544
+ )
2545
+ return manifest_entries
2546
+
2547
+
2548
+ def _write_table(
2549
+ partition_id: str,
2550
+ table: Union[LocalTable, LocalDataset, DistributedDataset],
2551
+ max_records_per_entry: Optional[int] = None,
2552
+ author: Optional[ManifestAuthor] = None,
2553
+ content_type: ContentType = ContentType.PARQUET,
2554
+ entry_params: Optional[EntryParams] = None,
2555
+ entry_type: Optional[EntryType] = EntryType.DATA,
2556
+ write_table_slices_fn: Optional[Callable] = _write_table_slices,
2557
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
2558
+ **kwargs,
2559
+ ) -> Manifest:
2560
+ """
2561
+ Writes the given table to 1 or more files and returns a
2562
+     DeltaCAT manifest pointing to the written files.
2563
+ """
2564
+ table_writer_fn = get_table_writer(table)
2565
+ table_slicer_fn = get_table_slicer(table)
2566
+
2567
+ manifest_entries = write_table_slices_fn(
2568
+ table,
2569
+ partition_id,
2570
+ max_records_per_entry,
2571
+ table_writer_fn,
2572
+ table_slicer_fn,
2573
+ content_type,
2574
+ entry_params,
2575
+ entry_type,
2576
+ table_writer_kwargs,
2577
+ **kwargs,
2578
+ )
2579
+ manifest = Manifest.of(
2580
+ entries=manifest_entries,
2581
+ author=author,
2582
+ uuid=str(uuid.uuid4()),
2583
+ entry_type=entry_type,
2584
+ entry_params=entry_params,
2585
+ )
2586
+ return manifest
2587
+
2588
+
2589
+ def stage_delta(
+     data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
+     partition: Partition,
+     delta_type: DeltaType = DeltaType.UPSERT,
+     max_records_per_entry: Optional[int] = None,
+     author: Optional[ManifestAuthor] = None,
+     properties: Optional[DeltaProperties] = None,
+     table_writer_kwargs: Optional[Dict[str, Any]] = None,
+     content_type: ContentType = ContentType.PARQUET,
+     entry_params: Optional[EntryParams] = None,
+     entry_type: Optional[EntryType] = EntryType.DATA,
+     write_table_slices_fn: Optional[Callable] = _write_table_slices,
+     schema: Optional[Schema] = None,
+     sort_scheme_id: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> Delta:
+     """
+     Writes the given dataset to 1 or more files. Returns an unregistered
+     delta whose manifest entries point to the written files. Applies any
+     schema consistency policies configured for the parent table version.
+     """
+     # TODO(pdames): Validate that equality delete entry types either have
+     #  entry params specified, or are being added to a table with merge keys.
+     if not partition.is_supported_content_type(content_type):
+         raise TableValidationError(
+             f"Content type {content_type} is not supported by "
+             f"partition: {partition}"
+         )
+     if partition.state == CommitState.DEPRECATED:
+         raise TableValidationError(
+             f"Cannot stage delta to {partition.state} partition: {partition}",
+         )
+     previous_stream_position: Optional[int] = partition.stream_position
+ 
+     # Handle schema parameter and add to table_writer_kwargs if available
+     table_writer_kwargs = table_writer_kwargs or {}
+ 
+     # Extract schema_id from the schema if it's a DeltaCAT Schema
+     schema_id = None
+     if isinstance(schema, Schema):
+         schema_id = schema.id
+         table_writer_kwargs["schema_id"] = schema_id
+         # Add PyArrow schema to table_writer_kwargs if not already present
+         if "schema" not in table_writer_kwargs:
+             table_writer_kwargs["schema"] = schema.arrow
+     elif schema is not None and "schema" not in table_writer_kwargs:
+         # For PyArrow schemas or other types, add directly
+         table_writer_kwargs["schema"] = schema
+ 
+     # Add sort_scheme_id to table_writer_kwargs for manifest entry creation
+     if sort_scheme_id is not None:
+         table_writer_kwargs["sort_scheme_id"] = sort_scheme_id
+ 
+     manifest: Manifest = _write_table(
+         partition.partition_id,
+         data,
+         max_records_per_entry,
+         author,
+         content_type,
+         entry_params,
+         entry_type,
+         write_table_slices_fn,
+         table_writer_kwargs,
+         **kwargs,
+     )
+     staged_delta: Delta = Delta.of(
+         locator=DeltaLocator.of(partition.locator, None),
+         delta_type=delta_type,
+         meta=manifest.meta,
+         properties=properties,
+         manifest=manifest,
+         previous_stream_position=previous_stream_position,
+     )
+     return staged_delta
+ 
+ 
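To make the staging contract above concrete, here is a minimal usage sketch. It assumes an existing `partition` that accepts PARQUET content; the PyArrow table and all values are hypothetical:

    import pyarrow as pa

    # hypothetical in-memory data; any supported LocalTable type works
    table = pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]})

    # returns an *unregistered* delta; nothing is visible to readers
    # until commit_delta() registers it
    delta = stage_delta(
        data=table,
        partition=partition,  # assumed staged or committed elsewhere
        delta_type=DeltaType.UPSERT,
        max_records_per_entry=2,  # split rows across multiple manifest entries
    )
    assert delta.manifest is not None
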
+ def commit_delta(
+     delta: Delta,
+     *args,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> Delta:
+     """
+     Registers a new delta with its associated target table version and
+     partition. Returns the registered delta. If the delta's previous stream
+     position is specified, then the commit will be rejected if it does not
+     match the target partition's actual previous stream position. If the
+     delta's stream position is specified, it must be greater than the latest
+     stream position in the target partition.
+     """
+     transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+ 
+     delta: Delta = Metafile.update_for(delta)
+     delta_type: Optional[DeltaType] = delta.type
+     resolved_delta_type = delta_type if delta_type is not None else DeltaType.UPSERT
+     delta.type = resolved_delta_type
+     delta.properties = kwargs.get("properties") or delta.properties
+ 
+     if delta.partition_id:
+         parent_partition = get_partition_by_id(
+             stream_locator=delta.stream_locator,
+             partition_id=delta.partition_id,
+             transaction=transaction,
+             *args,
+             **kwargs,
+         )
+     else:
+         parent_partition = get_partition(
+             stream_locator=delta.stream_locator,
+             partition_values=delta.partition_values,
+             transaction=transaction,
+             *args,
+             **kwargs,
+         )
+     if not parent_partition:
+         raise PartitionNotFoundError(f"Partition not found: {delta.locator}")
+     # ensure that we always use a fully qualified partition locator
+     delta.locator.partition_locator = parent_partition.locator
+     # resolve the delta's stream position
+     delta.previous_stream_position = parent_partition.stream_position or 0
+     if delta.stream_position is not None:
+         if delta.stream_position <= delta.previous_stream_position:
+             # manually specified delta stream positions must be greater than
+             # the previous stream position
+             raise TableValidationError(
+                 f"Delta stream position {delta.stream_position} must be "
+                 f"greater than previous stream position "
+                 f"{delta.previous_stream_position}"
+             )
+     else:
+         delta.locator.stream_position = delta.previous_stream_position + 1
+ 
+     # update the parent partition's stream position
+     new_parent_partition: Partition = Metafile.update_for(parent_partition)
+     new_parent_partition.stream_position = delta.locator.stream_position
+ 
+     # add operations to the transaction:
+     # the 1st operation creates the delta
+     transaction.step(
+         TransactionOperation.of(
+             operation_type=TransactionOperationType.CREATE,
+             dest_metafile=delta,
+         ),
+     )
+     # the 2nd operation alters the stream position of the partition
+     transaction.step(
+         TransactionOperation.of(
+             operation_type=TransactionOperationType.UPDATE,
+             dest_metafile=new_parent_partition,
+             src_metafile=parent_partition,
+         ),
+     )
+ 
+     if commit_transaction:
+         transaction.seal()
+     return delta
+ 
+ 
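Continuing the sketch from `stage_delta`, committing resolves the staged delta's stream position against its parent partition. This is illustrative only; `partition` and `table` are the hypothetical names from the previous example:

    # auto-assigns stream position = partition's previous position + 1
    committed = commit_delta(delta)
    assert committed.stream_position == committed.previous_stream_position + 1

    # a manually assigned stream position must exceed the partition's
    # latest position, or the commit is rejected
    stale = stage_delta(data=table, partition=partition)
    stale.locator.stream_position = committed.previous_stream_position  # too low
    try:
        commit_delta(stale)
    except TableValidationError:
        pass  # expected: position must be greater than the previous one
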
+ def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
+     """
+     Gets table namespace metadata for the specified table namespace. Returns
+     None if the given namespace does not exist.
+     """
+     return _latest(
+         metafile=Namespace.of(NamespaceLocator.of(namespace)),
+         *args,
+         **kwargs,
+     )
+ 
+ 
+ def namespace_exists(namespace: str, *args, **kwargs) -> bool:
+     """
+     Returns True if the given table namespace exists, False if not.
+     """
+     return _exists(
+         metafile=Namespace.of(NamespaceLocator.of(namespace)),
+         *args,
+         **kwargs,
+     )
+ 
+ 
+ def get_table(
+     namespace: str,
+     table_name: str,
+     *args,
+     **kwargs,
+ ) -> Optional[Table]:
+     """
+     Gets table metadata for the specified table. Returns None if the given
+     table does not exist.
+     """
+     locator = TableLocator.at(namespace=namespace, table_name=table_name)
+     return _latest(
+         metafile=Table.of(locator=locator),
+         *args,
+         **kwargs,
+     )
+ 
+ 
+ def table_exists(
+     namespace: str,
+     table_name: str,
+     *args,
+     **kwargs,
+ ) -> bool:
+     """
+     Returns True if the given table exists, False if not.
+     """
+     locator = TableLocator.at(namespace=namespace, table_name=table_name)
+     return _exists(
+         metafile=Table.of(locator=locator),
+         *args,
+         **kwargs,
+     )
+ 
+ 
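The four lookups above follow one pattern: build a locator, wrap it in a bare metafile, and ask the store for the latest revision (`_latest`) or mere existence (`_exists`). A brief hedged sketch; the namespace and table names are made up:

    if namespace_exists("analytics"):
        ns = get_namespace("analytics")  # Namespace metadata

    # get_table returns None instead of raising for a missing table
    tbl = get_table("analytics", "events")
    if tbl is None:
        assert not table_exists("analytics", "events")
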
+ def get_table_version(
+     namespace: str,
+     table_name: str,
+     table_version: str,
+     *args,
+     **kwargs,
+ ) -> Optional[TableVersion]:
+     """
+     Gets table version metadata for the specified table version. Returns None
+     if the given table version does not exist.
+     """
+     locator = TableVersionLocator.at(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+     )
+     table_version = TableVersion.of(
+         locator=locator,
+         schema=None,
+     )
+     return _latest(
+         metafile=table_version,
+         *args,
+         **kwargs,
+     )
+ 
+ 
+ def get_latest_table_version(
+     namespace: str,
+     table_name: str,
+     *args,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> Optional[TableVersion]:
+     """
+     Gets table version metadata for the latest version of the specified
+     table. Returns None if no table version exists for the given table.
+     Raises an error if the given table doesn't exist.
+     """
+     transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     table_version_id = _resolve_latest_table_version_id(
+         namespace=namespace,
+         table_name=table_name,
+         fail_if_no_active_table_version=False,
+         transaction=transaction,
+         *args,
+         **kwargs,
+     )
+ 
+     table_version = (
+         get_table_version(
+             namespace=namespace,
+             table_name=table_name,
+             table_version=table_version_id,
+             transaction=transaction,
+             *args,
+             **kwargs,
+         )
+         if table_version_id
+         else None
+     )
+     if commit_transaction:
+         transaction.seal()
+     return table_version
+ 
+ 
+ def get_latest_active_table_version(
+     namespace: str,
+     table_name: str,
+     *args,
+     transaction: Optional[Transaction] = None,
+     **kwargs,
+ ) -> Optional[TableVersion]:
+     """
+     Gets table version metadata for the latest active version of the
+     specified table. Returns None if no active table version exists for the
+     given table. Raises an error if the given table doesn't exist.
+     """
+     transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+     table_version_id = _resolve_latest_active_table_version_id(
+         namespace=namespace,
+         table_name=table_name,
+         fail_if_no_active_table_version=False,
+         transaction=transaction,
+         *args,
+         **kwargs,
+     )
+     table_version = (
+         get_table_version(
+             namespace=namespace,
+             table_name=table_name,
+             table_version=table_version_id,
+             transaction=transaction,
+             *args,
+             **kwargs,
+         )
+         if table_version_id
+         else None
+     )
+     if commit_transaction:
+         transaction.seal()
+     return table_version
+ 
+ 
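The two helpers above differ only in how the version ID is resolved: the highest version overall versus the highest version whose commit state is active. A sketch of how that distinction might be used (names hypothetical; the attribute access and equality check are illustrative):

    latest = get_latest_table_version("analytics", "events")
    active = get_latest_active_table_version("analytics", "events")
    if latest is not None and latest != active:
        # a newer table version exists but has not been activated yet,
        # so readers should keep using the active version
        print(f"pending table version: {latest.locator.table_version}")
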
+ def get_table_version_column_names(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> Optional[List[str]]:
+     """
+     Gets a list of column names for the specified table version, or for the
+     latest active table version if none is specified. The index of each
+     column name returned is its ordinal position in delimited text files or
+     other row-oriented content types appended to the table. Returns None for
+     schemaless tables. Raises an error if the table version does not exist.
+     """
+     schema = get_table_version_schema(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+         *args,
+         **kwargs,
+     )
+     return schema.arrow.names if schema else None
+ 
+ 
+ def get_table_version_schema(
+     namespace: str,
+     table_name: str,
+     table_version: Optional[str] = None,
+     *args,
+     **kwargs,
+ ) -> Optional[Schema]:
+     """
+     Gets the schema for the specified table version, or for the latest active
+     table version if none is specified. Returns None if the table version is
+     schemaless. Raises an error if the table version does not exist.
+     """
+     table_version_meta = (
+         get_table_version(
+             *args,
+             namespace=namespace,
+             table_name=table_name,
+             table_version=table_version,
+             **kwargs,
+         )
+         if table_version
+         else get_latest_active_table_version(
+             *args,
+             namespace=namespace,
+             table_name=table_name,
+             **kwargs,
+         )
+     )
+     return table_version_meta.schema
+ 
+ 
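Since `get_table_version_column_names` simply projects the schema returned by `get_table_version_schema`, the two stay consistent for any given version. A small sketch (names hypothetical):

    schema = get_table_version_schema("analytics", "events")  # latest active
    if schema is not None:  # None means the table version is schemaless
        names = get_table_version_column_names("analytics", "events")
        # ordinal positions match the column order of row-oriented files
        assert names == schema.arrow.names
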
+ def table_version_exists(
+     namespace: str,
+     table_name: str,
+     table_version: str,
+     *args,
+     **kwargs,
+ ) -> bool:
+     """
+     Returns True if the given table version exists, False if not.
+     """
+     locator = TableVersionLocator.at(
+         namespace=namespace,
+         table_name=table_name,
+         table_version=table_version,
+     )
+     table_version = TableVersion.of(
+         locator=locator,
+         schema=None,
+     )
+     return _exists(
+         *args,
+         metafile=table_version,
+         **kwargs,
+     )
+ 
+ 
+ def can_categorize(e: BaseException, *args, **kwargs) -> bool:
+     """
+     Returns True if the input error originated from the storage
+     implementation layer and can be categorized under an existing
+     DeltaCatError. The "categorize_errors" decorator uses this to determine
+     whether an unknown error from the storage implementation can be
+     categorized prior to casting it to the equivalent DeltaCatError via
+     `raise_categorized_error`.
+     """
+     # DeltaCAT native storage can only categorize DeltaCatError
+     # (i.e., this is effectively a no-op for native storage)
+     return isinstance(e, DeltaCatError)
+ 
+ 
+ def raise_categorized_error(e: BaseException, *args, **kwargs):
+     """
+     Casts a categorizable error that originated from the storage
+     implementation layer to its equivalent DeltaCatError for uniform
+     handling (e.g., determining whether an error is retryable) via the
+     "categorize_errors" decorator. Raises an UnclassifiedDeltaCatError from
+     the input exception if the error cannot be categorized.
+     """
+     # DeltaCAT native storage can only categorize DeltaCatError, which is
+     # already categorized, so this is effectively a no-op for native
+     # storage: re-raise the error unchanged
+     logger.info(f"Categorizing exception: {e}")
+     if isinstance(e, DeltaCatError):
+         raise e
+ 
+     logger.warning(f"Could not classify {type(e).__name__}: {e}")
+     raise UnclassifiedDeltaCatError(
+         f"Failed to classify error {type(e).__name__}: {e}"
+     ) from e
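
To show how these two hooks compose, here is a hedged sketch of a `categorize_errors`-style decorator. This is not the actual DeltaCAT decorator; its name, signature, and control flow are assumptions for illustration:

    import functools

    def categorize_errors_sketch(fn):
        # illustrative wrapper: route storage-layer errors through the
        # categorization hooks defined above
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            try:
                return fn(*args, **kwargs)
            except BaseException as e:
                if can_categorize(e):
                    # raises the DeltaCatError equivalent of e
                    raise_categorized_error(e)
                raise  # propagate uncategorizable errors unchanged
        return wrapper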