deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/storage/model/transaction.py (new file)
@@ -0,0 +1,1733 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import copy
5
+ import time
6
+ import uuid
7
+ import logging
8
+ import posixpath
9
+ from pathlib import PosixPath
10
+ import threading
11
+ import contextvars
12
+ from collections import defaultdict
13
+
14
+ from types import TracebackType
15
+ from typing import Optional, List, Union, Tuple, Type, TYPE_CHECKING, Iterable
16
+
17
+ if TYPE_CHECKING:
18
+ from deltacat.types.tables import Dataset
19
+
20
+ import msgpack
21
+ import pyarrow as pa
22
+ import pyarrow.fs
23
+
24
+ from deltacat.constants import (
25
+ TXN_DIR_NAME,
26
+ TXN_PART_SEPARATOR,
27
+ RUNNING_TXN_DIR_NAME,
28
+ FAILED_TXN_DIR_NAME,
29
+ PAUSED_TXN_DIR_NAME,
30
+ SUCCESS_TXN_DIR_NAME,
31
+ NANOS_PER_SEC,
32
+ )
33
+ from deltacat.storage.model.list_result import ListResult
34
+ from deltacat.storage.model.types import (
35
+ TransactionOperationType,
36
+ TransactionState,
37
+ TransactionStatus,
38
+ )
39
+ from deltacat.storage.model.namespace import NamespaceLocator
40
+ from deltacat.storage.model.table import TableLocator
41
+ from deltacat.storage.model.table_version import TableVersionLocator
42
+ from deltacat.storage.model.stream import StreamLocator
43
+ from deltacat.storage.model.partition import PartitionLocator
44
+ from deltacat.storage.model.delta import DeltaLocator
45
+ from deltacat.storage.model.metafile import (
46
+ Metafile,
47
+ MetafileRevisionInfo,
48
+ )
49
+ from deltacat.types.tables import (
50
+ DatasetType,
51
+ from_pyarrow,
52
+ )
53
+ from deltacat.utils.filesystem import (
54
+ resolve_path_and_filesystem,
55
+ list_directory,
56
+ get_file_info,
57
+ )
58
+ from deltacat import logs
59
+
60
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
61
+
62
+
63
+ # Context variable to store the current transaction
64
+ _current_transaction: contextvars.ContextVar[
65
+ Optional[Transaction]
66
+ ] = contextvars.ContextVar("current_transaction", default=None)
67
+
68
+
69
+ def get_current_transaction() -> Optional[Transaction]:
70
+ """Get the currently active transaction from context."""
71
+ return _current_transaction.get()
72
+
73
+
74
+ def set_current_transaction(transaction: Optional[Transaction]) -> contextvars.Token:
75
+ """Set the current transaction in context, returns token for restoration."""
76
+ return _current_transaction.set(transaction)
77
+
78
+
79
+ def setup_transaction(
80
+ transaction: Optional[Transaction] = None,
81
+ **kwargs,
82
+ ) -> Tuple[Transaction, bool]:
83
+ """
84
+ Utility method to ensure a transaction exists and determine if it should be committed
85
+ within the caller's context. Creates a new transaction if none is provided.
86
+
87
+ Args:
88
+ transaction: Optional existing transaction to use
89
+ **kwargs: Additional arguments for catalog properties
90
+
91
+ Returns:
92
+ Tuple[Transaction, bool]: The transaction to use and whether to commit it
93
+ """
94
+ # Check for active transaction in context first
95
+ if transaction is None:
96
+ transaction = get_current_transaction()
97
+
98
+ commit_transaction = transaction is None
99
+ if commit_transaction:
100
+ from deltacat.catalog.model.properties import get_catalog_properties
101
+
102
+ catalog_properties = get_catalog_properties(**kwargs)
103
+ transaction = Transaction.of().start(
104
+ catalog_properties.root,
105
+ catalog_properties.filesystem,
106
+ )
107
+ return transaction, commit_transaction
108
+
109
+
110
+ def transaction_log_dir_and_filesystem(
111
+ catalog_name: Optional[str] = None,
112
+ ) -> Tuple[str, pyarrow.fs.FileSystem]:
113
+ """
114
+ Get the transaction log directory and filesystem for the given catalog.
115
+
116
+ Args:
117
+ catalog_name: Name of the catalog to get the transaction log directory and filesystem for.
118
+ If None, uses the default catalog.
119
+
120
+ Returns:
121
+ Tuple[str, pyarrow.fs.FileSystem]: The transaction log directory and filesystem for the given catalog.
122
+ """
123
+ # Get the catalog and its properties
124
+ from deltacat.catalog.model.catalog import get_catalog
125
+
126
+ catalog = get_catalog(catalog_name)
127
+ catalog_properties = catalog.inner
128
+
129
+ # Get transaction directory paths
130
+ catalog_root_normalized, filesystem = resolve_path_and_filesystem(
131
+ catalog_properties.root,
132
+ catalog_properties.filesystem,
133
+ )
134
+
135
+ return posixpath.join(catalog_root_normalized, TXN_DIR_NAME), filesystem
136
+
137
+
138
+ def transaction(
139
+ catalog_name: Optional[str] = None,
140
+ as_of: Optional[int] = None,
141
+ commit_message: Optional[str] = None,
142
+ ) -> Transaction:
143
+ """
144
+ Start a new interactive transaction for the given catalog.
145
+
146
+ Args:
147
+ catalog_name: Optional name of the catalog to run the transaction against.
148
+ If None, uses the default catalog.
149
+ as_of: Optional historic timestamp in nanoseconds since epoch.
150
+ If provided, creates a read-only transaction that reads only transactions
151
+ with end times strictly less than the specified timestamp.
152
+ commit_message: Optional commit message to describe the transaction purpose.
153
+ Helps with time travel functionality by providing context
154
+ for each transaction when browsing transaction history.
155
+
156
+ Returns:
157
+ Transaction: A started interactive transaction ready for use with the given catalog.
158
+
159
+ Example:
160
+ # Read-write transaction with commit message
161
+ with dc.transaction(commit_message="Initial data load for Q4 analytics") as txn:
162
+ dc.write_to_table(data, "my_table")
163
+ dc.write_to_table(more_data, "my_other_table")
164
+
165
+ # Read-only historic transaction
166
+ import time
167
+ historic_time = time.time_ns() - 3600 * 1000000000 # 1 hour ago
168
+ with dc.transaction(as_of=historic_time) as txn:
169
+ # Only read operations allowed - provides snapshot as of historic_time
170
+ data = dc.read_table("my_table")
171
+ """
172
+ from deltacat.catalog.model.catalog import get_catalog
173
+
174
+ # Get the catalog and its properties
175
+ catalog = get_catalog(catalog_name)
176
+ catalog_properties = catalog.inner
177
+
178
+ # Create interactive transaction
179
+ if as_of is not None:
180
+ # Create read-only historic transaction
181
+ txn = Transaction.of(commit_message=commit_message).start(
182
+ catalog_properties.root,
183
+ catalog_properties.filesystem,
184
+ historic_timestamp=as_of,
185
+ )
186
+ else:
187
+ # Create regular read-write transaction
188
+ txn = Transaction.of(commit_message=commit_message).start(
189
+ catalog_properties.root, catalog_properties.filesystem
190
+ )
191
+ # Initialize the lazy transaction ID
192
+ logger.info(f"Created transaction with ID: {txn.id}")
193
+ return txn
194
+
195
+
196
+ def _read_txn(
197
+ txn_log_dir: str,
198
+ txn_status: TransactionStatus,
199
+ transaction_id: str,
200
+ filesystem: pyarrow.fs.FileSystem,
201
+ ) -> Transaction:
202
+ """
203
+ Read a transaction ID with the expected status from the given root transaction log directory.
204
+
205
+ Args:
206
+ txn_log_dir: The directory containing the transaction log.
207
+ txn_status: The expected status of the transaction.
208
+ transaction_id: The ID of the transaction.
209
+ filesystem: The filesystem to use for reading the transaction.
210
+
211
+ Returns:
212
+ Transaction: The transaction.
213
+ """
214
+ # Transaction directories contain the actual transaction file
215
+ txn_dir_path = posixpath.join(
216
+ txn_log_dir, txn_status.dir_name(), posixpath.basename(transaction_id)
217
+ )
218
+
219
+ try:
220
+ file_info = get_file_info(txn_dir_path, filesystem)
221
+ except FileNotFoundError:
222
+ raise FileNotFoundError(
223
+ f"Transaction with ID '{transaction_id}' and status '{txn_status}' not found."
224
+ )
225
+
226
+ # Only read transaction directories (skip any stray files)
227
+ if file_info.type != pyarrow.fs.FileType.Directory:
228
+ raise FileNotFoundError(
229
+ f"Transaction directory for transaction ID '{transaction_id}' with status '{txn_status}' not found."
230
+ )
231
+
232
+ # List files in the transaction directory
233
+ txn_files = list_directory(
234
+ path=txn_dir_path,
235
+ filesystem=filesystem,
236
+ ignore_missing_path=True,
237
+ )
238
+
239
+ if not txn_files:
240
+ raise FileNotFoundError(
241
+ f"No transaction file found for transaction ID '{transaction_id}' and status '{txn_status}'."
242
+ )
243
+
244
+ if len(txn_files) > 1:
245
+ raise RuntimeError(
246
+ f"Expected 1 transaction file in '{txn_dir_path}', but found {len(txn_files)}"
247
+ )
248
+
249
+ # Get the transaction file path
250
+ txn_file_path, _ = txn_files[0]
251
+
252
+ # Read the transaction from the file
253
+ return Transaction.read(txn_file_path, filesystem)
254
+
255
+
256
+ def read_transaction(
257
+ transaction_id: str,
258
+ catalog_name: Optional[str] = None,
259
+ status: TransactionStatus = TransactionStatus.SUCCESS,
260
+ ) -> Transaction:
261
+ """
262
+ Read a transaction from the given catalog and transaction ID.
263
+ """
264
+ txn_log_dir, filesystem = transaction_log_dir_and_filesystem(catalog_name)
265
+ return _read_txn(txn_log_dir, status, transaction_id, filesystem)
266
+
267
+
268
+ def transactions(
269
+ catalog_name: Optional[str] = None,
270
+ read_as: "DatasetType" = None,
271
+ start_time: Optional[int] = None,
272
+ end_time: Optional[int] = None,
273
+ limit: Optional[int] = None,
274
+ status_in: Iterable[TransactionStatus] = [TransactionStatus.SUCCESS],
275
+ ) -> Dataset:
276
+ """
277
+ Query transaction history for a catalog.
278
+
279
+ Args:
280
+ catalog_name: Optional name of the catalog to query. If None, uses the default catalog.
281
+ read_as: Dataset type to return results as. If None, defaults to DatasetType.PYARROW.
282
+ start_time: Optional start timestamp in nanoseconds since epoch to filter transactions.
283
+ end_time: Optional end timestamp in nanoseconds since epoch to filter transactions.
284
+ limit: Optional maximum number of transactions to return (most recent first).
285
+ status_in: Optional iterable of transaction status types to include. Defaults to [TransactionStatus.SUCCESS].
286
+
287
+ Returns:
288
+ Dataset: Transaction history as the specified dataset type with columns:
289
+ - transaction_id: Unique transaction identifier
290
+ - commit_message: Optional user-provided commit message
291
+ - start_time: Transaction start timestamp (nanoseconds since epoch)
292
+ - end_time: Transaction end timestamp (nanoseconds since epoch, None for running)
293
+ - status: Transaction status (SUCCESS, RUNNING, FAILED, PAUSED)
294
+ - operation_count: Number of operations in the transaction
295
+ - operation_types: Comma-separated list of distinct operation types in the transaction
296
+ - namespace_count: Number of distinct namespaces affected by the transaction
297
+ - table_count: Number of distinct tables affected by the transaction
298
+ - table_version_count: Number of distinct table versions affected by the transaction
299
+ - stream_count: Number of distinct streams affected by the transaction
300
+ - partition_count: Number of distinct partitions affected by the transaction
301
+ - delta_count: Number of distinct deltas affected by the transaction
302
+
303
+ Example:
304
+ # Get recent successful transactions
305
+ recent = dc.transactions(limit=10)
306
+
307
+ # Get transactions for a specific time range
308
+ import time
309
+ hour_ago = time.time_ns() - 3600 * 1000000000
310
+ recent_hour = dc.transactions(start_time=hour_ago)
311
+
312
+ # Get transaction history as pandas DataFrame
313
+ df = dc.transactions(read_as=dc.DatasetType.PANDAS)
314
+ """
315
+ # Validate inputs
316
+ if limit is not None and limit <= 0:
317
+ raise ValueError("limit must be greater than 0")
318
+
319
+ # Set default read_as if not provided
320
+ if read_as is None:
321
+ read_as = DatasetType.PYARROW
322
+
323
+ if not status_in:
324
+ status_in = [TransactionStatus.SUCCESS]
325
+
326
+ # Get transaction directory path and filesystem
327
+ txn_log_dir, filesystem = transaction_log_dir_and_filesystem(catalog_name)
328
+
329
+ # Collect transaction data
330
+ transaction_records = {
331
+ "transaction_id": [],
332
+ "commit_message": [],
333
+ "start_time": [],
334
+ "end_time": [],
335
+ "status": [],
336
+ "operation_count": [],
337
+ "operation_types": [],
338
+ "namespace_count": [],
339
+ "table_count": [],
340
+ "table_version_count": [],
341
+ "stream_count": [],
342
+ "partition_count": [],
343
+ "delta_count": [],
344
+ }
345
+
346
+ # Helper function to process transactions in a directory
347
+ def process_transactions_in_directory(
348
+ directory: str, expected_status: TransactionStatus
349
+ ):
350
+ # TODO(pdames): Do a recursive listing to get the transaction files returned directly.
351
+ file_info_and_sizes = list_directory(
352
+ path=directory,
353
+ filesystem=filesystem,
354
+ ignore_missing_path=True,
355
+ )
356
+
357
+ for file_path, _ in file_info_and_sizes:
358
+ # Read the transaction from the file
359
+ # TODO(pdames): Do a recursive listing to get the transaction files returned directly.
360
+ try:
361
+ txn = _read_txn(
362
+ txn_log_dir,
363
+ expected_status,
364
+ posixpath.basename(file_path),
365
+ filesystem,
366
+ )
367
+ except FileNotFoundError:
368
+ # this may be a stray file or the transaction is being created - skip it
369
+ continue
370
+
371
+ # Apply time filters
372
+ # TODO(pdames): Parse start and end times from the transaction file path.
373
+ if (
374
+ start_time is not None
375
+ and txn.start_time
376
+ and txn.start_time < start_time
377
+ ):
378
+ continue
379
+ if end_time is not None and txn.end_time and txn.end_time > end_time:
380
+ continue
381
+
382
+ # Count operations and affected metadata objects by type.
383
+ operation_count = len(txn.operations)
384
+ operation_types = set()
385
+ affected_namespaces = set()
386
+ affected_tables = set()
387
+ affected_table_versions = set()
388
+ affected_streams = set()
389
+ affected_partitions = set()
390
+ affected_deltas = set()
391
+
392
+ for op in txn.operations:
393
+ operation_types.add(op.type)
394
+
395
+ # Determine locator type and cast to appropriate locator class
396
+ locator_dict = op.dest_metafile.get("locator", {})
397
+ if "tableName" in locator_dict and "namespaceLocator" in locator_dict:
398
+ locator = TableLocator(locator_dict)
399
+ elif "namespace" in locator_dict:
400
+ locator = NamespaceLocator(locator_dict)
401
+ elif "tableVersion" in locator_dict:
402
+ locator = TableVersionLocator(locator_dict)
403
+ elif "streamId" in locator_dict:
404
+ locator = StreamLocator(locator_dict)
405
+ elif "partitionId" in locator_dict:
406
+ locator = PartitionLocator(locator_dict)
407
+ elif "streamPosition" in locator_dict:
408
+ locator = DeltaLocator(locator_dict)
409
+ else:
410
+ raise ValueError(
411
+ f"Unknown locator type from structure: {locator_dict}"
412
+ )
413
+
414
+ # Extract distinct metafiles updated by common/alias name (e.g., a table rename impacts 2 tables instead of 1)
415
+ if op.type in TransactionOperationType.write_operations():
416
+ if locator.namespace is not None:
417
+ affected_namespaces.add(locator.namespace)
418
+ if isinstance(locator, TableLocator):
419
+ affected_tables.add((locator.namespace, locator.table_name))
420
+ elif isinstance(locator, TableVersionLocator):
421
+ affected_table_versions.add(
422
+ (
423
+ locator.namespace,
424
+ locator.table_name,
425
+ locator.table_version,
426
+ )
427
+ )
428
+ elif isinstance(locator, StreamLocator):
429
+ affected_tables.add((locator.namespace, locator.table_name))
430
+ affected_table_versions.add(
431
+ (
432
+ locator.namespace,
433
+ locator.table_name,
434
+ locator.table_version,
435
+ )
436
+ )
437
+ affected_streams.add(
438
+ (
439
+ locator.namespace,
440
+ locator.table_name,
441
+ locator.table_version,
442
+ locator.stream_id,
443
+ )
444
+ )
445
+ elif isinstance(locator, PartitionLocator):
446
+ affected_tables.add((locator.namespace, locator.table_name))
447
+ affected_table_versions.add(
448
+ (
449
+ locator.namespace,
450
+ locator.table_name,
451
+ locator.table_version,
452
+ )
453
+ )
454
+ affected_streams.add(
455
+ (
456
+ locator.namespace,
457
+ locator.table_name,
458
+ locator.table_version,
459
+ locator.stream_id,
460
+ )
461
+ )
462
+ affected_partitions.add(
463
+ (
464
+ locator.namespace,
465
+ locator.table_name,
466
+ locator.table_version,
467
+ locator.stream_id,
468
+ locator.partition_id,
469
+ )
470
+ )
471
+ elif isinstance(locator, DeltaLocator):
472
+ affected_tables.add((locator.namespace, locator.table_name))
473
+ affected_table_versions.add(
474
+ (
475
+ locator.namespace,
476
+ locator.table_name,
477
+ locator.table_version,
478
+ )
479
+ )
480
+ affected_streams.add(
481
+ (
482
+ locator.namespace,
483
+ locator.table_name,
484
+ locator.table_version,
485
+ locator.stream_id,
486
+ )
487
+ )
488
+ affected_partitions.add(
489
+ (
490
+ locator.namespace,
491
+ locator.table_name,
492
+ locator.table_version,
493
+ locator.stream_id,
494
+ locator.partition_id,
495
+ )
496
+ )
497
+ affected_deltas.add(
498
+ (
499
+ locator.namespace,
500
+ locator.table_name,
501
+ locator.table_version,
502
+ locator.stream_id,
503
+ locator.partition_id,
504
+ locator.stream_position,
505
+ )
506
+ )
507
+
508
+ # Create transaction record
509
+ transaction_records["transaction_id"].append(txn.id)
510
+ transaction_records["commit_message"].append(txn.commit_message)
511
+ transaction_records["start_time"].append(txn.start_time)
512
+ transaction_records["end_time"].append(txn.end_time)
513
+ transaction_records["status"].append(expected_status)
514
+ transaction_records["operation_count"].append(operation_count)
515
+ transaction_records["operation_types"].append(operation_types)
516
+ transaction_records["namespace_count"].append(len(affected_namespaces))
517
+ transaction_records["table_count"].append(len(affected_tables))
518
+ transaction_records["table_version_count"].append(
519
+ len(affected_table_versions)
520
+ )
521
+ transaction_records["stream_count"].append(len(affected_streams))
522
+ transaction_records["partition_count"].append(len(affected_partitions))
523
+ transaction_records["delta_count"].append(len(affected_deltas))
524
+
525
+ for status in status_in:
526
+ dir_path = posixpath.join(txn_log_dir, status.dir_name())
527
+ process_transactions_in_directory(dir_path, status)
528
+
529
+ # Sort by start_time descending (most recent first)
530
+ # Convert to list of records, sort, then convert back
531
+ if transaction_records["transaction_id"]: # Only sort if we have records
532
+ # Create list of tuples (start_time, record_index)
533
+ sorted_indices = sorted(
534
+ range(len(transaction_records["start_time"])),
535
+ key=lambda i: transaction_records["start_time"][i] or 0,
536
+ reverse=True,
537
+ )
538
+
539
+ # Reorder all columns based on sorted indices
540
+ for key in transaction_records:
541
+ transaction_records[key] = [
542
+ transaction_records[key][i] for i in sorted_indices
543
+ ]
544
+
545
+ # Apply limit
546
+ # TODO(pdames): Apply limit during listing (pyarrow fs doesn't support limits natively).
547
+ if limit is not None and limit > 0:
548
+ for key in transaction_records:
549
+ transaction_records[key] = transaction_records[key][:limit]
550
+
551
+ # Convert to requested dataset type
552
+ return from_pyarrow(pa.Table.from_pydict(transaction_records), read_as)
553
+
554
+
555
+ class TransactionTimeProvider:
556
+ """
557
+ Provider interface for transaction start and end times. An ideal
558
+ transaction time provider is externally consistent (e.g.,
559
+ https://cloud.google.com/spanner/docs/true-time-external-consistency),
560
+ such that:
561
+ 1. A transaction start time is never less than a previously completed
562
+ transaction's end time.
563
+ 2. A transaction end time is never less than an in-progress
564
+ transaction's start time.
565
+ 3. Every transaction has a unique start and end time.
566
+ 4. Start/end time assignment is non-blocking.
567
+ """
568
+
569
+ def start_time(self) -> int:
570
+ raise NotImplementedError("start_time not implemented")
571
+
572
+ def end_time(self) -> int:
573
+ raise NotImplementedError("end_time not implemented")
574
+
575
+
576
+ class TransactionSystemTimeProvider(TransactionTimeProvider):
577
+ """
578
+ A local transaction time provider that returns the current system clock
579
+ epoch time in nanoseconds. Ensures that all local transaction start
580
+ times are greater than all last known end times, and that all known end
581
+ times are no less than all last known start time across all local threads
582
+ using this time provider.
583
+
584
+ Note that this time provider gives no external consistency guarantees due
585
+ to potential clock skew between distributed nodes writing to the same
586
+ catalog, and is only recommended for use with local catalogs.
587
+ """
588
+
589
+ last_known_start_times = defaultdict(int)
590
+ last_known_end_times = defaultdict(int)
591
+
592
+ # don't wait more than 60 seconds for the system clock to catch up
593
+ # between transactions (assumed to be indicative of a larger system
594
+ # clock change made between transactions)
595
+ max_sync_wait_time = 60 * NANOS_PER_SEC
596
+
597
+ def start_time(self) -> int:
598
+ """
599
+ Gets the current system time in nanoseconds since the epoch. Ensures
600
+ that the start time returned is greater than the last known end time
601
+ recorded at the time this method is invoked.
602
+ :return: Current epoch time in nanoseconds.
603
+ """
604
+ # ensure serial transactions in a single process have start times after
605
+ # the last known end time
606
+ last_known_end_times = self.last_known_end_times.values() or [0]
607
+ max_known_end_time = max(last_known_end_times)
608
+
609
+ elapsed_start_time = time.monotonic_ns()
610
+ current_time = time.time_ns()
611
+ while current_time <= max_known_end_time:
612
+ elapsed_time = time.monotonic_ns() - elapsed_start_time
613
+ if elapsed_time > self.max_sync_wait_time:
614
+ raise TimeoutError(
615
+ f"Failed to sync cross-transaction system clock time after "
616
+ f"{self.max_sync_wait_time / NANOS_PER_SEC} seconds, "
617
+ f"aborting."
618
+ )
619
+ time.sleep(0.000001)
620
+ current_time = time.time_ns()
621
+
622
+ # update the current thread's last known end time
623
+ pid = os.getpid()
624
+ tid = threading.current_thread().ident
625
+ current_thread_time_key = (pid, tid)
626
+ self.last_known_end_times[current_thread_time_key] = current_time
627
+
628
+ return current_time
629
+
630
+ def end_time(self) -> int:
631
+ """
632
+ Gets the current system time in nanoseconds since the epoch. Ensures
633
+ that the end time returned is no less than the last known start time
634
+ recorded at the time this method is invoked.
635
+ :return: Current epoch time in nanoseconds.
636
+ """
637
+ # ensure serial transactions in a single process have end times no less
638
+ # than the last known start time
639
+ last_known_start_times = self.last_known_start_times.values() or [0]
640
+ last_start_time = max(last_known_start_times)
641
+
642
+ elapsed_start_time = time.monotonic_ns()
643
+ current_time = time.time_ns()
644
+ while current_time < last_start_time:
645
+ elapsed_time = time.monotonic_ns() - elapsed_start_time
646
+ if elapsed_time > self.max_sync_wait_time:
647
+ raise TimeoutError(
648
+ f"Failed to sync cross-transaction system clock time after "
649
+ f"{self.max_sync_wait_time / NANOS_PER_SEC} seconds, "
650
+ f"aborting."
651
+ )
652
+ time.sleep(0.000001)
653
+ current_time = time.time_ns()
654
+
655
+ # update the current thread's last known end time
656
+ pid = os.getpid()
657
+ tid = threading.current_thread().ident
658
+ current_thread_time_key = (pid, tid)
659
+ self.last_known_start_times[current_thread_time_key] = current_time
660
+
661
+ return current_time
662
+
663
+
664
+ class TransactionHistoricTimeProvider(TransactionTimeProvider):
665
+ """
666
+ A transaction time provider that returns a fixed historic timestamp
667
+ for read-only transactions. This enables MVCC snapshot isolation
668
+ as-of the specified timestamp.
669
+ """
670
+
671
+ def __init__(
672
+ self,
673
+ historic_timestamp: int,
674
+ base_time_provider: TransactionTimeProvider,
675
+ ):
676
+ """
677
+ Initialize with a fixed historic timestamp and a base time provider.
678
+
679
+ Args:
680
+ historic_timestamp: Timestamp in nanoseconds since epoch to use
681
+ for both start and end times.
682
+ base_time_provider: Time provider to use for the end time.
683
+ """
684
+ # Validate that historic timestamp is not in the future
685
+ if historic_timestamp > base_time_provider.start_time():
686
+ raise ValueError(
687
+ f"Historic timestamp {historic_timestamp} cannot be set in the future."
688
+ )
689
+ self.base_time_provider = base_time_provider
690
+ self.historic_timestamp = historic_timestamp
691
+
692
+ def start_time(self) -> int:
693
+ """
694
+ Returns the fixed historic timestamp.
695
+ """
696
+ return self.historic_timestamp
697
+
698
+ def end_time(self) -> int:
699
+ """
700
+ Returns the end time of the base time provider.
701
+ """
702
+ return self.base_time_provider.end_time()
703
+
704
+
705
+ class TransactionOperation(dict):
706
+ """
707
+ Base class for DeltaCAT transaction operations against individual metafiles.
708
+ """
709
+
710
+ @staticmethod
711
+ def of(
712
+ operation_type: Optional[TransactionOperationType],
713
+ dest_metafile: Metafile,
714
+ src_metafile: Optional[Metafile] = None,
715
+ read_limit: Optional[int] = None,
716
+ ) -> TransactionOperation:
717
+ if not dest_metafile:
718
+ raise ValueError("Transaction operations must have a destination metafile.")
719
+ if operation_type in [
720
+ TransactionOperationType.UPDATE,
721
+ TransactionOperationType.REPLACE,
722
+ ]:
723
+ if not src_metafile:
724
+ raise ValueError(
725
+ f"{operation_type.value} transaction operations must have a source metafile."
726
+ )
727
+ elif type(dest_metafile) is not type(src_metafile):
728
+ raise ValueError(
729
+ f"Source metafile type `{type(src_metafile)}` is not "
730
+ f"equal to dest metafile type `{type(dest_metafile)}`."
731
+ )
732
+ elif src_metafile:
733
+ raise ValueError(
734
+ f"Only {TransactionOperationType.UPDATE.value} and {TransactionOperationType.REPLACE.value} transaction operations may have a source metafile."
735
+ )
736
+ if operation_type.is_write_operation() and read_limit:
737
+ raise ValueError(
738
+ f"Only {TransactionOperationType.READ.value} transaction operations may have a read limit."
739
+ )
740
+ txn_op = TransactionOperation()
741
+ txn_op.type = operation_type
742
+ txn_op.dest_metafile = dest_metafile
743
+ txn_op.src_metafile = src_metafile
744
+ txn_op.read_limit = read_limit
745
+ return txn_op
746
+
747
+ @property
748
+ def type(self) -> TransactionOperationType:
749
+ """
750
+ Returns the type of the transaction operation.
751
+ """
752
+ val = self["type"]
753
+ if val is not None and not isinstance(val, TransactionOperationType):
754
+ self["type"] = val = TransactionOperationType(val)
755
+ return val
756
+
757
+ @type.setter
758
+ def type(self, txn_op_type: TransactionOperationType):
759
+ self["type"] = txn_op_type
760
+
761
+ @property
762
+ def dest_metafile(self) -> Metafile:
763
+ """
764
+ Returns the metafile that is the target of this transaction operation.
765
+ """
766
+ val = self["dest_metafile"]
767
+ if val is not None and not isinstance(val, Metafile):
768
+ self["dest_metafile"] = val = Metafile(val)
769
+ return val
770
+
771
+ @dest_metafile.setter
772
+ def dest_metafile(self, metafile: Metafile):
773
+ self["dest_metafile"] = metafile
774
+
775
+ @property
776
+ def src_metafile(self) -> Optional[Metafile]:
777
+ """
778
+ Returns the metafile that is the source of this transaction operation.
779
+ """
780
+ val = self.get("src_metafile")
781
+ if val is not None and not isinstance(val, Metafile):
782
+ self["src_metafile"] = val = Metafile(val)
783
+ return val
784
+
785
+ @src_metafile.setter
786
+ def src_metafile(self, src_metafile: Optional[Metafile]):
787
+ self["src_metafile"] = src_metafile
788
+
789
+ @property
790
+ def read_limit(self) -> Optional[int]:
791
+ """
792
+ Returns the read limit for this transaction operation.
793
+ """
794
+ return self.get("read_limit")
795
+
796
+ @read_limit.setter
797
+ def read_limit(self, read_limit: Optional[int]):
798
+ self["read_limit"] = read_limit
799
+
800
+ @property
801
+ def metafile_write_paths(self) -> List[str]:
802
+ return self.get("metafile_write_paths") or []
803
+
804
+ @property
805
+ def locator_write_paths(self) -> List[str]:
806
+ return self.get("locator_write_paths") or []
807
+
808
+ def append_metafile_write_path(self, write_path: str):
809
+ metafile_write_paths = self.get("metafile_write_paths")
810
+ if not metafile_write_paths:
811
+ metafile_write_paths = self["metafile_write_paths"] = []
812
+ metafile_write_paths.append(write_path)
813
+
814
+ def append_locator_write_path(self, write_path: str):
815
+ locator_write_paths = self.get("locator_write_paths")
816
+ if not locator_write_paths:
817
+ locator_write_paths = self["locator_write_paths"] = []
818
+ locator_write_paths.append(write_path)
819
+
820
+ @metafile_write_paths.setter
821
+ def metafile_write_paths(self, write_paths: List[str]) -> None:
822
+ self["metafile_write_paths"] = write_paths
823
+
824
+ @locator_write_paths.setter
825
+ def locator_write_paths(self, write_paths: List[str]):
826
+ self["locator_write_paths"] = write_paths
827
+
828
+
829
+ class TransactionOperationList(List[TransactionOperation]):
830
+ @staticmethod
831
+ def of(items: List[TransactionOperation]) -> TransactionOperationList:
832
+ typed_items = TransactionOperationList()
833
+ for item in items:
834
+ if item is not None and not isinstance(item, TransactionOperation):
835
+ item = TransactionOperation(item)
836
+ typed_items.append(item)
837
+ return typed_items
838
+
839
+ def __getitem__(self, item):
840
+ val = super().__getitem__(item)
841
+ if val is not None and not isinstance(val, TransactionOperation):
842
+ self[item] = val = TransactionOperation(val)
843
+ return val
844
+
845
+ def __iter__(self):
846
+ """Support enumeration by returning TransactionOperation objects."""
847
+ for i in range(len(self)):
848
+ yield self[i] # This triggers __getitem__ conversion
849
+
850
+
851
+ class Transaction(dict):
852
+ """
853
+ Base class for DeltaCAT transactions.
854
+ """
855
+
856
+ @staticmethod
857
+ def of(
858
+ txn_operations: Optional[TransactionOperationList] = None,
859
+ commit_message: Optional[str] = None,
860
+ ) -> Transaction:
861
+ if txn_operations is None:
862
+ txn_operations = []
863
+ transaction = Transaction()
864
+ transaction.operations = txn_operations
865
+ transaction.interactive = len(txn_operations) == 0
866
+ if commit_message:
867
+ transaction.commit_message = commit_message
868
+ return transaction
869
+
870
+ @staticmethod
871
+ def read_end_time(
872
+ path: str,
873
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
874
+ ) -> Optional[int]:
875
+ """
876
+ Returns the end time of the transaction, or None if the transaction
877
+ log file does not exist.
878
+ :param path: Transaction log path to read.
879
+ :param filesystem: File system to use for reading the Transaction file.
880
+ :return: Deserialized object from the Transaction file.
881
+ """
882
+ # TODO(pdames): Validate that input file path is a valid txn log.
883
+ if not filesystem:
884
+ path, filesystem = resolve_path_and_filesystem(path, filesystem)
885
+ file_info_and_sizes = list_directory(
886
+ path=path,
887
+ filesystem=filesystem,
888
+ ignore_missing_path=True,
889
+ )
890
+ end_time = None
891
+ if file_info_and_sizes:
892
+ if len(file_info_and_sizes) > 1:
893
+ raise ValueError(
894
+ f"Expected to find only one transaction log at {path}, "
895
+ f"but found {len(file_info_and_sizes)}"
896
+ )
897
+ end_time = Transaction._parse_end_time(file_info_and_sizes[0][0])
898
+ return end_time
899
+
900
+ @staticmethod
901
+ def _parse_end_time(txn_log_file_name_or_path: str) -> int:
902
+ return int(posixpath.basename(txn_log_file_name_or_path))
903
+
904
+ @classmethod
905
+ def read(
906
+ cls,
907
+ path: str,
908
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
909
+ ) -> Transaction:
910
+ """
911
+ Read a Transaction file and return the deserialized object.
912
+ :param path: Transaction file path to read.
913
+ :param filesystem: File system to use for reading the Transaction file.
914
+ :return: Deserialized object from the Transaction file.
915
+ """
916
+
917
+ if not filesystem:
918
+ path, filesystem = resolve_path_and_filesystem(path, filesystem)
919
+ with filesystem.open_input_stream(path) as file:
920
+ binary = file.readall()
921
+ obj = cls(**msgpack.loads(binary))
922
+ return obj
923
+
924
+ @staticmethod
925
+ def read_time_provider(provider_name: str):
926
+ """
927
+ Given the string name of a time provider class, return a new instance of it.
928
+ Raises ValueError if the provider name is unknown.
929
+ """
930
+ TIME_PROVIDER_CLASSES = {
931
+ "TransactionSystemTimeProvider": TransactionSystemTimeProvider,
932
+ # Add additional mappings as needed
933
+ }
934
+
935
+ provider_cls = TIME_PROVIDER_CLASSES.get(provider_name)
936
+ if provider_cls is None:
937
+ raise ValueError(f"Unknown time provider: {provider_name}")
938
+
939
+ return provider_cls()
940
+
+    @property
+    def id(self) -> Optional[str]:
+        """
+        Returns this transaction's unique ID assigned at commit start time, or
+        None if the unique ID has not yet been assigned.
+        """
+        _id = self.get("id")
+        if not _id and self.start_time:
+            _id = self["id"] = f"{self.start_time}{TXN_PART_SEPARATOR}{uuid.uuid4()}"
+        return _id
+
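A sketch of the lazy ID assignment (the separator is whatever TXN_PART_SEPARATOR is defined as):

    txn = Transaction.of()
    assert txn.id is None            # no start time yet, so no ID
    txn["start_time"] = 1700000000   # normally set by _mark_start_time()
    first_id = txn.id                # "1700000000" + TXN_PART_SEPARATOR + uuid4
    assert txn.id == first_id        # memoized once assigned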
+    def state(
+        self,
+        catalog_root_dir: str,
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+    ) -> Optional[TransactionState]:
+        """
+        Infer the transaction state from the transaction log directory that
+        currently holds this transaction's log file.
+        """
+        txn_name = self.id
+        catalog_root_normalized, filesystem = resolve_path_and_filesystem(
+            catalog_root_dir,
+            filesystem,
+        )
+        txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
+        filesystem.create_dir(
+            posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME),
+            recursive=True,
+        )
+        for subdir in (FAILED_TXN_DIR_NAME, SUCCESS_TXN_DIR_NAME, PAUSED_TXN_DIR_NAME):
+            try:
+                filesystem.create_dir(
+                    posixpath.join(txn_log_dir, subdir),
+                    recursive=False,
+                )
+            except FileExistsError:
+                pass  # allowed when the catalog is already initialized
+
+        def _txn_log_exists(txn_dir_name: str) -> bool:
+            # use the resolved filesystem (not os.path) so that non-local
+            # filesystems are also supported
+            file_info = filesystem.get_file_info(
+                posixpath.join(txn_log_dir, txn_dir_name, txn_name)
+            )
+            return file_info.type != pyarrow.fs.FileType.NotFound
+
+        in_failed = _txn_log_exists(FAILED_TXN_DIR_NAME)
+        in_running = _txn_log_exists(RUNNING_TXN_DIR_NAME)
+        in_success = _txn_log_exists(SUCCESS_TXN_DIR_NAME)
+        in_paused = _txn_log_exists(PAUSED_TXN_DIR_NAME)
+
+        if in_failed and in_running:
+            return TransactionState.FAILED
+        elif in_failed:
+            return TransactionState.PURGED
+        elif in_success:
+            return TransactionState.SUCCESS
+        elif in_running:
+            return TransactionState.RUNNING
+        elif in_paused:
+            return TransactionState.PAUSED
+        return None
+
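State inference is thus purely a matter of which txn/ subdirectory holds the transaction's log. For a started transaction:

    state = active.state("/path/to/catalog")  # 'active' from txn.start(...)
    if state == TransactionState.RUNNING:
        print(f"transaction {active.id} is still in flight")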
+    @property
+    def operations(self) -> TransactionOperationList:
+        """
+        Returns the list of transaction operations.
+        """
+        return TransactionOperationList(self["operations"])
+
+    @operations.setter
+    def operations(self, operations: TransactionOperationList):
+        self["operations"] = operations
+
+    @property
+    def metafile_write_paths(self) -> List[str]:
+        return [path for op in self.operations for path in op.metafile_write_paths]
+
+    @property
+    def locator_write_paths(self) -> List[str]:
+        return [path for op in self.operations for path in op.locator_write_paths]
+
+    @property
+    def catalog_root_normalized(self) -> str:
+        """
+        Returns the normalized catalog root directory for this transaction.
+        """
+        return self.get("catalog_root_normalized")
+
+    @catalog_root_normalized.setter
+    def catalog_root_normalized(self, path: str):
+        self["catalog_root_normalized"] = path
+
+    @property
+    def _time_provider(self) -> TransactionSystemTimeProvider:
+        """
+        Returns the time provider of the transaction.
+        """
+        return self.get("_time_provider")
+
+    @_time_provider.setter
+    def _time_provider(self, tp: TransactionSystemTimeProvider) -> None:
+        self["_time_provider"] = tp
+
+    @property
+    def start_time(self) -> Optional[int]:
+        """
+        Returns the start time of the transaction.
+        """
+        return self.get("start_time")
+
+    @property
+    def pause_time(self) -> Optional[int]:
+        """
+        Returns the last pause time of the transaction.
+        """
+        return self.get("pause_time")
+
+    @property
+    def end_time(self) -> Optional[int]:
+        """
+        Returns the end time of the transaction.
+        """
+        return self.get("end_time")
+
+    @property
+    def commit_message(self) -> Optional[str]:
+        """
+        Returns the commit message for the transaction.
+        """
+        return self.get("commit_message")
+
+    @commit_message.setter
+    def commit_message(self, message: str):
+        """
+        Sets the commit message for the transaction.
+        """
+        self["commit_message"] = message
+
+    @property
+    def historic_timestamp(self) -> Optional[int]:
+        """
+        Returns the historic timestamp for the transaction.
+        """
+        return self.get("historic_timestamp")
+
+    @historic_timestamp.setter
+    def historic_timestamp(self, timestamp: int):
+        """
+        Sets the historic timestamp for the transaction.
+        """
+        self["historic_timestamp"] = timestamp
+
+    def _mark_start_time(self, time_provider: TransactionTimeProvider) -> int:
+        """
+        Sets the start time of the transaction using the given
+        TransactionTimeProvider. Raises a runtime error if the transaction
+        start time has already been set by a previous commit.
+        """
+        if self.get("start_time"):
+            raise RuntimeError("Cannot restart a previously started transaction.")
+        start_time = self["start_time"] = time_provider.start_time()
+        return start_time
+
+    def _mark_end_time(self, time_provider: TransactionTimeProvider) -> int:
+        """
+        Sets the end time of the transaction using the given
+        TransactionTimeProvider. Raises a runtime error if the transaction end
+        time has already been set by a previous commit, or if the transaction
+        start time has not been set.
+        """
+        if not self.get("start_time"):
+            raise RuntimeError("Cannot end an unstarted transaction.")
+        if self.get("end_time"):
+            raise RuntimeError("Cannot end a completed transaction.")
+        end_time = self["end_time"] = time_provider.end_time()
+        return end_time
+
+    def _mark_pause_time(self, time_provider: TransactionTimeProvider) -> int:
+        """
+        Sets the pause time of the transaction using the given
+        TransactionTimeProvider. Raises a runtime error if the transaction
+        has already ended, or if the transaction start time has not been set.
+        """
+        if not self.get("start_time"):
+            raise RuntimeError("Cannot pause an unstarted transaction.")
+        if self.get("end_time"):
+            raise RuntimeError("Cannot pause a completed transaction.")
+        pause_time = self["pause_time"] = time_provider.end_time()
+        return pause_time
+
+    @staticmethod
+    def _abs_txn_meta_path_to_relative(root: str, target: str) -> str:
+        """
+        Takes an absolute root directory path and an absolute target path to
+        relativize with respect to the root directory. Returns the target
+        path relative to the root directory path. Raises an error if the
+        target path is not contained in the given root directory path, if
+        either path is not an absolute path, or if the target path is equal
+        to the root directory path.
+        """
+        root_path = PosixPath(root)
+        target_path = PosixPath(target)
+        # TODO (martinezdavid): Check why is_absolute() fails for certain Delta paths
+        # if not root_path.is_absolute() or not target_path.is_absolute():
+        #     raise ValueError("Both root and target must be absolute paths.")
+        if root_path == target_path:
+            raise ValueError(
+                "Target and root are identical, but expected target to be "
+                "a child of root."
+            )
+        try:
+            relative_path = target_path.relative_to(root_path)
+        except ValueError:
+            raise ValueError("Expected target to be a child of root.")
+        return str(relative_path)
+
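A behavior sketch with hypothetical paths:

    rel = Transaction._abs_txn_meta_path_to_relative(
        "/cat", "/cat/ns/table/rev/00001.mpk"
    )
    assert rel == "ns/table/rev/00001.mpk"

    # both of these raise ValueError:
    # Transaction._abs_txn_meta_path_to_relative("/cat", "/cat")
    # Transaction._abs_txn_meta_path_to_relative("/cat", "/other")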
+    def relativize_operation_paths(
+        self, operation: TransactionOperation, catalog_root: str
+    ) -> None:
+        """
+        Converts all absolute paths in an operation to relative paths
+        with respect to the catalog root directory.
+        """
+        # handle metafile paths
+        if operation.metafile_write_paths:
+            metafile_write_paths = [
+                Transaction._abs_txn_meta_path_to_relative(catalog_root, path)
+                for path in operation.metafile_write_paths
+            ]
+            operation.metafile_write_paths = metafile_write_paths
+        # handle locator paths
+        if operation.locator_write_paths:
+            locator_write_paths = [
+                Transaction._abs_txn_meta_path_to_relative(catalog_root, path)
+                for path in operation.locator_write_paths
+            ]
+            operation.locator_write_paths = locator_write_paths
+
+    def to_serializable(self, catalog_root: str) -> Transaction:
+        """
+        Prepare the object for serialization by converting any non-serializable
+        types to serializable types. May also run any required pre-write
+        validations on the serialized or deserialized object.
+        :return: a serializable version of the object
+        """
+        # only copy dictionary keys - all other members should not be serialized
+        serializable = Transaction({})
+        for key, value in self.items():
+            serializable[key] = copy.deepcopy(value)
+
+        # remove all src/dest metafile contents except IDs and locators to
+        # reduce file size (they can be reconstructed from their corresponding
+        # files as required)
+        for operation in serializable.operations:
+            # sanity check that IDs exist on source and dest metafiles
+            if operation.dest_metafile and operation.dest_metafile.id is None:
+                raise ValueError(
+                    f"Transaction operation {operation} dest metafile does "
+                    f"not have an ID: {operation.dest_metafile}"
+                )
+            if operation.src_metafile and operation.src_metafile.id is None:
+                raise ValueError(
+                    f"Transaction operation {operation} src metafile does "
+                    f"not have an ID: {operation.src_metafile}"
+                )
+            # relativize after checking that dest and src metafiles are valid
+            self.relativize_operation_paths(operation, catalog_root)
+            operation.dest_metafile = {
+                "id": operation.dest_metafile.id,
+                "locator": operation.dest_metafile.locator,
+                "locator_alias": operation.dest_metafile.locator_alias,
+            }
+            if operation.src_metafile:
+                operation.src_metafile = {
+                    "id": operation.src_metafile.id,
+                    "locator": operation.src_metafile.locator,
+                    "locator_alias": operation.src_metafile.locator_alias,
+                }
+        # TODO(pdames): Ensure that all file paths recorded are relative to
+        #   the catalog root.
+
+        # TODO: check if we care about order or exact timestamps -> pickle
+        #   the time provider?
+        # serializable.pop("_time_provider", None)
+
+        # time providers are currently stateless, so only the type name is
+        # persisted and a fresh instance is created on deserialization
+        serializable["_time_provider"] = {
+            "type": type(self._time_provider).__name__,
+            "params": {},
+        }
+
+        serializable.catalog_root_normalized = self.catalog_root_normalized
+
+        return serializable
+
+    @staticmethod
+    def _validate_txn_log_file(success_txn_log_file: str) -> None:
+        txn_log_dir_name = posixpath.basename(posixpath.dirname(success_txn_log_file))
+        txn_log_parts = txn_log_dir_name.split(TXN_PART_SEPARATOR)
+        # ensure that the transaction start time is valid
+        try:
+            start_time = int(txn_log_parts[0])
+        except ValueError as e:
+            raise ValueError(
+                f"Transaction log file `{success_txn_log_file}` does not "
+                f"contain a valid start time."
+            ) from e
+        # ensure that the txn uuid is valid
+        txn_uuid_str = txn_log_parts[1]
+        try:
+            uuid.UUID(txn_uuid_str)
+        except ValueError as e:
+            raise OSError(
+                f"Transaction log file `{success_txn_log_file}` does not "
+                f"contain a valid UUID string."
+            ) from e
+        # ensure that the transaction end time is valid
+        try:
+            end_time = Transaction._parse_end_time(success_txn_log_file)
+        except ValueError as e:
+            raise ValueError(
+                f"Transaction log file `{success_txn_log_file}` does not "
+                f"contain a valid end time."
+            ) from e
+        # ensure transaction end time was not recorded before start time
+        if end_time < start_time:
+            raise OSError(
+                f"Transaction end time {end_time} is earlier than start "
+                f"time {start_time}! To preserve catalog integrity, the "
+                f"corresponding completed transaction log at "
+                f"`{success_txn_log_file}` has been removed."
+            )
+
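The validator encodes the expected success log layout, `<catalog>/txn/success/<start-time><sep><uuid>/<end-time>`, which mirrors the ID format produced by the `id` property. A passing example built from the module's own separator constant:

    log_path = (
        f"/cat/txn/success/1700000000{TXN_PART_SEPARATOR}{uuid.uuid4()}/1700000042"
    )
    Transaction._validate_txn_log_file(log_path)  # start, UUID, and end all valid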
+    def commit(
+        self,
+        catalog_root_dir: str,
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+    ) -> Union[
+        List[ListResult[Metafile]],
+        Tuple[List[str], str],
+        Tuple[List["ListResult[Metafile]"], List[str], str],
+    ]:
+        """
+        Legacy wrapper that preserves the original `commit()` contract while
+        delegating the heavy lifting to the incremental helpers.
+
+        Returns
+        -------
+        - For READ transactions: List[ListResult[Metafile]]
+        - For WRITE transactions: Tuple[List[str], str]
+          (list of successful write paths, path to the success txn log file)
+        - For mixed READ/WRITE transactions:
+          Tuple[List["ListResult[Metafile]"], List[str], str]
+        """
+        if hasattr(self, "interactive") and self.interactive:
+            raise RuntimeError(
+                "Cannot commit an interactive transaction. Use "
+                "transaction.start(), transaction.step(), and "
+                "transaction.seal() instead."
+            )
+
+        if self.operations:
+            # start a working copy (deep copy, directory scaffolding,
+            # start time, running/failed/success/paused dirs, etc.)
+            txn_active = self.start(catalog_root_dir, filesystem)
+            # sequentially execute every TransactionOperation
+            for op in txn_active.operations:
+                txn_active.step(op)
+            return txn_active._seal_steps()
+
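Batch usage is therefore a one-liner once the operation list is built. For a pure write transaction (operations shown are placeholders):

    txn = Transaction.of(txn_operations=[...])  # placeholder operations
    write_paths, success_log_path = txn.commit("/path/to/catalog")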
+    def start(
+        self,
+        catalog_root_dir: str,
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+        historic_timestamp: Optional[int] = None,
+    ) -> "Transaction":
+        """
+        Create directory scaffolding, timestamp the txn, and return a DEEP COPY
+        that the caller should use for all subsequent calls to step(), pause(),
+        and seal(). The original object remains read-only.
+
+        Args:
+            catalog_root_dir: Root directory for the catalog.
+            filesystem: Optional filesystem to use.
+            historic_timestamp: Optional timestamp in nanoseconds since epoch
+                for snapshot isolation.
+        """
+        # create a deep copy
+        txn: "Transaction" = copy.deepcopy(self)
+
+        # set up the time provider based on transaction type
+        if historic_timestamp is not None:
+            # use a historic time provider for snapshot isolation
+            # TODO(pdames): Set the base time provider to the catalog's
+            #   configured time provider when more than one is supported.
+            txn._time_provider = TransactionHistoricTimeProvider(
+                historic_timestamp,
+                TransactionSystemTimeProvider(),
+            )
+            txn.historic_timestamp = historic_timestamp
+        else:
+            # use the system time provider for regular transactions
+            txn._time_provider = TransactionSystemTimeProvider()
+
+        txn._mark_start_time(txn._time_provider)  # start time on the deep copy
+
+        # set up the filesystem and directories
+        catalog_root_normalized, filesystem = resolve_path_and_filesystem(
+            catalog_root_dir,
+            filesystem,
+        )
+        txn.catalog_root_normalized = catalog_root_normalized
+        txn._filesystem = filesystem  # keep for pause/resume
+        txn.running_log_written = False  # internal flags
+        txn._list_results = []
+
+        # make sure the txn/ directories exist (idempotent)
+        txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
+        filesystem.create_dir(
+            posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME),
+            recursive=True,
+        )
+        for subdir in (FAILED_TXN_DIR_NAME, SUCCESS_TXN_DIR_NAME, PAUSED_TXN_DIR_NAME):
+            try:
+                filesystem.create_dir(
+                    posixpath.join(txn_log_dir, subdir),
+                    recursive=False,
+                )
+            except FileExistsError:
+                pass  # allowed when the catalog is already initialized
+        return txn
+
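The interactive lifecycle that start() enables looks roughly like this (read_op and write_op are hypothetical TransactionOperation values):

    txn = Transaction.of()                   # interactive: no ops yet
    active = txn.start("/path/to/catalog")   # work on the returned deep copy
    active.step(read_op)    # READ op -> ListResult[Metafile]
    active.step(write_op)   # WRITE op -> (metafile paths, locator paths)
    results = active.seal() # validates, then promotes running/ -> success/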
+    def step(
+        self,
+        operation: "TransactionOperation",
+    ) -> Union[ListResult[Metafile], Tuple[List[str], List[str]]]:
+        """
+        Executes a single transaction operation.
+
+        Parameters
+        ----------
+        operation: TransactionOperation
+            The transaction operation to execute.
+
+        Returns
+        -------
+        - For a READ transaction operation: ListResult[Metafile]
+        - For a WRITE transaction operation: Tuple[List[str], List[str]]
+          (list of successful metafile write paths, list of successful
+          locator write paths)
+        """
+        catalog_root_normalized = self.catalog_root_normalized
+        filesystem = self._filesystem
+        txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
+
+        running_txn_log_file_path = posixpath.join(
+            txn_log_dir, RUNNING_TXN_DIR_NAME, self.id
+        )
+
+        # validate read-only transaction constraints
+        if self.historic_timestamp is not None:
+            if not operation.type.is_read_operation():
+                raise RuntimeError(
+                    f"Cannot perform {operation.type.value} operation in a "
+                    f"read-only historic transaction."
+                )
+
+        # add the new operation to the transaction's list of operations
+        if self.interactive:
+            self.operations = self.operations + [operation]
+
+        # (a) READ txn op
+        if operation.type.is_read_operation():
+            list_result = operation.dest_metafile.read_txn(
+                catalog_root_dir=catalog_root_normalized,
+                success_txn_log_dir=posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME),
+                current_txn_op=operation,
+                current_txn_start_time=self.start_time,
+                current_txn_id=self.id,
+                filesystem=filesystem,
+            )
+            self._list_results.append(list_result)
+            return list_result
+
+        # (b) WRITE txn op
+        # First operation? Create the running log so an external janitor can
+        # see that a txn is in flight.
+        if not self.running_log_written:
+            self._write_running_log(running_txn_log_file_path)
+
+        try:
+            (
+                metafile_write_paths,
+                locator_write_paths,
+            ) = operation.dest_metafile.write_txn(
+                catalog_root_dir=catalog_root_normalized,
+                success_txn_log_dir=posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME),
+                current_txn_op=operation,
+                current_txn_start_time=self.start_time,
+                current_txn_id=self.id,
+                filesystem=filesystem,
+            )
+            # Check for concurrent txn conflicts on the metafile and locator
+            # write paths just written.
+            # TODO(pdames): Remove the fast-fail check here if it grows too
+            #   expensive?
+            for path in metafile_write_paths + locator_write_paths:
+                MetafileRevisionInfo.check_for_concurrent_txn_conflict(
+                    success_txn_log_dir=posixpath.join(
+                        txn_log_dir,
+                        SUCCESS_TXN_DIR_NAME,
+                    ),
+                    current_txn_revision_file_path=path,
+                    filesystem=filesystem,
+                )
+            return metafile_write_paths, locator_write_paths
+        except Exception:
+            # convert the in-flight txn to FAILED and clean up partial files
+            self._fail_and_cleanup(
+                failed_txn_log_dir=posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME),
+                running_log_path=running_txn_log_file_path,
+            )
+            raise  # surface the original error
+
+    def pause(self) -> None:
+        fs = self._filesystem
+        root = self.catalog_root_normalized
+        txn_log_dir = posixpath.join(root, TXN_DIR_NAME)
+
+        running_path = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME, self.id)
+        paused_path = posixpath.join(txn_log_dir, PAUSED_TXN_DIR_NAME, self.id)
+
+        fs.create_dir(posixpath.dirname(paused_path), recursive=True)
+
+        # record the pause time (e.g., for time consistency guarantees)
+        self._mark_pause_time(self._time_provider)
+
+        # serialize the current transaction state into paused/<txn-id>
+        with fs.open_output_stream(paused_path) as f:
+            f.write(msgpack.dumps(self.to_serializable(root)))
+
+        # clean up the original running log
+        fs.delete_file(running_path)
+
+    def resume(self) -> None:
+        fs = self._filesystem
+        root = self.catalog_root_normalized
+        txn_log_dir = posixpath.join(root, TXN_DIR_NAME)
+
+        running_path = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME, self.id)
+        paused_path = posixpath.join(txn_log_dir, PAUSED_TXN_DIR_NAME, self.id)
+
+        # load the serialized transaction state
+        with fs.open_input_stream(paused_path) as f:
+            loaded_txn_data = msgpack.loads(f.readall())
+
+        # Restore the persisted fields. Transaction state lives in the
+        # mapping itself, so update the dict rather than __dict__.
+        restored_txn = Transaction(**loaded_txn_data)
+        self.update(restored_txn)
+
+        # supports restoring time provider state if we ever add
+        # non-ephemeral time providers
+        new_provider = Transaction.read_time_provider(
+            restored_txn["_time_provider"]["type"]
+        )
+
+        # evaluate the system clock
+        now = new_provider.start_time()
+        self._time_provider = new_provider  # start time should be preserved
+        if now < self.pause_time:
+            raise RuntimeError(
+                f"System clock {now} is behind paused transaction time "
+                f"{self.pause_time}"
+            )
+        # TODO: set a new start time or keep the error if the clock is off?
+
+        # move back to the running state
+        fs.create_dir(posixpath.dirname(running_path), recursive=True)
+        with fs.open_output_stream(running_path) as f:
+            f.write(msgpack.dumps(self.to_serializable(root)))
+        fs.delete_file(paused_path)
+
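A pause/resume round trip under these helpers (operations hypothetical):

    active = txn.start("/path/to/catalog")
    active.step(write_op)   # hypothetical WRITE op
    active.pause()          # state serialized under txn/paused/<txn-id>

    # ... later, on the same transaction object ...
    active.resume()         # log moves back under txn/running/<txn-id>
    active.step(another_op)
    active.seal()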
+    def seal(
+        self,
+    ) -> Union[
+        List["ListResult[Metafile]"],
+        Tuple[List[str], str],
+        Tuple[List["ListResult[Metafile]"], List[str], str],
+    ]:
+        """
+        For READ transactions, returns the list results collected during
+        step(). For WRITE transactions, returns (written paths, success log
+        path).
+        """
+        if not self.interactive:
+            raise RuntimeError(
+                "Cannot seal a non-interactive transaction. Call "
+                "transaction.commit() instead."
+            )
+
+        # read-only transactions can only perform read operations
+        if self.historic_timestamp is not None:
+            if self._has_write_operations():
+                raise RuntimeError(
+                    "Cannot seal a read-only historic transaction that "
+                    "contains write operations."
+                )
+
+        return self._seal_steps()
+
+    def _has_write_operations(self) -> bool:
+        """
+        Check if the transaction contains any write operations.
+        Read-only transactions should only contain READ operations.
+        """
+        for operation in self.operations:
+            if not operation.type.is_read_operation():
+                return True
+        return False
+
+    def _seal_steps(
+        self,
+    ) -> Union[
+        List["ListResult[Metafile]"],
+        Tuple[List[str], str],
+        Tuple[List["ListResult[Metafile]"], List[str], str],
+    ]:
+        fs = self._filesystem
+        root = self.catalog_root_normalized
+        txn_log_dir = posixpath.join(root, TXN_DIR_NAME)
+        end_time = self._mark_end_time(self._time_provider)
+
+        # READ path: nothing persisted, so we are done
+        if all(op.type.is_read_operation() for op in self.operations):
+            return self._list_results
+
+        running_path = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME, self.id)
+        failed_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)
+        success_dir = posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME)
+
+        # if no operations ever succeeded, we still need a running log
+        if not self.running_log_written:
+            self._write_running_log(running_path)
+        try:
+            # check for concurrent txn conflicts on metafile and locator
+            # write paths
+            for path in self.metafile_write_paths + self.locator_write_paths:
+                MetafileRevisionInfo.check_for_concurrent_txn_conflict(
+                    success_txn_log_dir=posixpath.join(
+                        txn_log_dir, SUCCESS_TXN_DIR_NAME
+                    ),
+                    current_txn_revision_file_path=path,
+                    filesystem=fs,
+                )
+        except Exception:
+            self._fail_and_cleanup(
+                failed_txn_log_dir=failed_dir,
+                running_log_path=running_path,
+            )
+            # raise the original error
+            raise
+        success_log_path = None
+        try:
+            # write the transaction log
+            success_txn_dir = posixpath.join(success_dir, self.id)
+            fs.create_dir(success_txn_dir, recursive=False)
+
+            success_log_path = posixpath.join(success_txn_dir, str(end_time))
+            with fs.open_output_stream(success_log_path) as f:
+                f.write(msgpack.dumps(self.to_serializable(root)))
+
+            Transaction._validate_txn_log_file(success_txn_log_file=success_log_path)
+        except Exception as e1:
+            self._fail_and_cleanup(
+                failed_txn_log_dir=failed_dir,
+                running_log_path=running_path,
+                success_log_path=success_log_path,
+            )
+            raise RuntimeError(
+                f"Transaction validation failed. To preserve catalog integrity, "
+                f"the corresponding completed transaction log at "
+                f"`{success_log_path}` has been removed."
+            ) from e1
+        else:
+            fs.delete_file(running_path)
+            if all(op.type.is_write_operation() for op in self.operations):
+                # pure write transaction: just return the write paths and
+                # success log path
+                return self.metafile_write_paths, success_log_path
+            else:
+                # mixed read/write transaction: return the read results,
+                # write paths, and success log path
+                return self._list_results, self.metafile_write_paths, success_log_path
+
+    # Helper: write or overwrite the running/<txn-id> file exactly once
+    def _write_running_log(self, running_log_path: str) -> None:
+        with self._filesystem.open_output_stream(running_log_path) as f:
+            f.write(msgpack.dumps(self.to_serializable(self.catalog_root_normalized)))
+        self.running_log_written = True
+
+    # Helper: mark the txn FAILED and clean up partial output
+    def _fail_and_cleanup(
+        self,
+        failed_txn_log_dir: str,
+        running_log_path: str,
+        success_log_path: Optional[str] = None,
+    ) -> None:
+        fs = self._filesystem
+
+        # 1. write failed/<txn-id>
+        failed_log_path = posixpath.join(failed_txn_log_dir, self.id)
+        with fs.open_output_stream(failed_log_path) as f:
+            f.write(msgpack.dumps(self.to_serializable(self.catalog_root_normalized)))
+
+        # 2. delete all provisional metafile and locator files
+        for path in self.metafile_write_paths + self.locator_write_paths:
+            try:
+                fs.delete_file(path)
+            except Exception:
+                pass  # best-effort; a janitor job will catch leftovers
+
+        # 3. tidy up bookkeeping logs
+        try:
+            fs.delete_file(running_log_path)
+        except Exception:
+            pass
+        if success_log_path:
+            try:
+                fs.delete_file(success_log_path)
+            except Exception:
+                pass
+
+    def __enter__(self) -> "Transaction":
+        """
+        Context manager entry point. Sets this transaction as the current
+        context. Supports nested transactions by preserving the context stack.
+        """
+        if not hasattr(self, "interactive") or not self.interactive:
+            raise RuntimeError(
+                "Transaction must be interactive to use it as a context "
+                "manager. Use dc.transaction() to create an interactive "
+                "transaction."
+            )
+        if self.start_time is None:
+            raise RuntimeError(
+                "Transaction has not been started. Use dc.transaction() to "
+                "create a properly initialized transaction."
+            )
+
+        # store the context token for restoration in __exit__
+        self._context_token = set_current_transaction(self)
+        return self
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> None:
+        """
+        Context manager exit point. Restores the previous transaction context
+        and automatically seals the transaction on successful completion, or
+        fails it if an exception occurred.
+
+        Args:
+            exc_type: Exception type if an exception occurred, None otherwise.
+            exc_value: Exception value if an exception occurred, None otherwise.
+            traceback: Exception traceback if an exception occurred, None otherwise.
+        """
+        try:
+            if exc_type is None and exc_value is None and traceback is None:
+                # no exception occurred, so seal the transaction
+                self.seal()
+            else:
+                # an exception occurred during the transaction; fail and clean up
+                try:
+                    catalog_root_normalized = self.catalog_root_normalized
+                    txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
+                    running_txn_log_file_path = posixpath.join(
+                        txn_log_dir, RUNNING_TXN_DIR_NAME, self.id
+                    )
+                    self._fail_and_cleanup(
+                        failed_txn_log_dir=posixpath.join(
+                            txn_log_dir, FAILED_TXN_DIR_NAME
+                        ),
+                        running_log_path=running_txn_log_file_path,
+                    )
+                except Exception:
+                    # if cleanup fails, still let the original exception propagate
+                    pass
+        finally:
+            # always restore the previous transaction context using the token
+            if hasattr(self, "_context_token"):
+                try:
+                    # get the previous value from the token
+                    old_value = self._context_token.old_value
+                    # only set if the old value is a valid transaction or None
+                    if old_value is None or isinstance(old_value, Transaction):
+                        _current_transaction.set(old_value)
+                    else:
+                        # if old_value is not valid (e.g., Token.MISSING),
+                        # clear the context
+                        _current_transaction.set(None)
+                except (AttributeError, LookupError):
+                    # if the token lacks old_value or the context is
+                    # corrupted, clear it
+                    try:
+                        _current_transaction.set(None)
+                    except LookupError:
+                        pass
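Putting it together, the context-manager form seals on success and fails/cleans up on error. A sketch using the dc.transaction() factory referenced by the error messages above (operations are hypothetical):

    with dc.transaction() as txn:
        txn.step(read_op)    # hypothetical READ op
        txn.step(write_op)   # hypothetical WRITE op
    # __exit__ called txn.seal(); on an exception it would instead have moved
    # the running log to txn/failed/<txn-id> and deleted provisional files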