deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298)
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/storage/model/transaction.py
@@ -4,15 +4,21 @@ import os
 import copy
 import time
 import uuid
+import logging
 import posixpath
 from pathlib import PosixPath
 import threading
+import contextvars
 from collections import defaultdict
 
-from itertools import chain
-from typing import Optional, List, Union, Tuple
+from types import TracebackType
+from typing import Optional, List, Union, Tuple, Type, TYPE_CHECKING, Iterable
+
+if TYPE_CHECKING:
+    from deltacat.types.tables import Dataset
 
 import msgpack
+import pyarrow as pa
 import pyarrow.fs
 
 from deltacat.constants import (
@@ -20,22 +26,530 @@ from deltacat.constants import (
     TXN_PART_SEPARATOR,
     RUNNING_TXN_DIR_NAME,
     FAILED_TXN_DIR_NAME,
+    PAUSED_TXN_DIR_NAME,
     SUCCESS_TXN_DIR_NAME,
     NANOS_PER_SEC,
 )
 from deltacat.storage.model.list_result import ListResult
 from deltacat.storage.model.types import (
     TransactionOperationType,
-    TransactionType,
+    TransactionState,
+    TransactionStatus,
 )
+from deltacat.storage.model.namespace import NamespaceLocator
+from deltacat.storage.model.table import TableLocator
+from deltacat.storage.model.table_version import TableVersionLocator
+from deltacat.storage.model.stream import StreamLocator
+from deltacat.storage.model.partition import PartitionLocator
+from deltacat.storage.model.delta import DeltaLocator
 from deltacat.storage.model.metafile import (
     Metafile,
     MetafileRevisionInfo,
 )
+from deltacat.types.tables import (
+    DatasetType,
+    from_pyarrow,
+)
 from deltacat.utils.filesystem import (
     resolve_path_and_filesystem,
     list_directory,
+    get_file_info,
 )
+from deltacat import logs
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+# Context variable to store the current transaction
+_current_transaction: contextvars.ContextVar[
+    Optional[Transaction]
+] = contextvars.ContextVar("current_transaction", default=None)
+
+
+def get_current_transaction() -> Optional[Transaction]:
+    """Get the currently active transaction from context."""
+    return _current_transaction.get()
+
+
+def set_current_transaction(transaction: Optional[Transaction]) -> contextvars.Token:
+    """Set the current transaction in context; returns a token for restoration."""
+    return _current_transaction.set(transaction)
+
+
+def setup_transaction(
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> Tuple[Transaction, bool]:
+    """
+    Utility method to ensure a transaction exists and determine whether it
+    should be committed within the caller's context. Creates a new
+    transaction if none is provided.
+
+    Args:
+        transaction: Optional existing transaction to use.
+        **kwargs: Additional arguments for catalog properties.
+
+    Returns:
+        Tuple[Transaction, bool]: The transaction to use and whether to commit it.
+    """
+    # Check for an active transaction in context first
+    if transaction is None:
+        transaction = get_current_transaction()
+
+    commit_transaction = transaction is None
+    if commit_transaction:
+        from deltacat.catalog.model.properties import get_catalog_properties
+
+        catalog_properties = get_catalog_properties(**kwargs)
+        transaction = Transaction.of().start(
+            catalog_properties.root,
+            catalog_properties.filesystem,
+        )
+    return transaction, commit_transaction
+
+
+def transaction_log_dir_and_filesystem(
+    catalog_name: Optional[str] = None,
+) -> Tuple[str, pyarrow.fs.FileSystem]:
+    """
+    Get the transaction log directory and filesystem for the given catalog.
+
+    Args:
+        catalog_name: Name of the catalog to get the transaction log directory
+            and filesystem for. If None, uses the default catalog.
+
+    Returns:
+        Tuple[str, pyarrow.fs.FileSystem]: The transaction log directory and
+        filesystem for the given catalog.
+    """
+    # Get the catalog and its properties
+    from deltacat.catalog.model.catalog import get_catalog
+
+    catalog = get_catalog(catalog_name)
+    catalog_properties = catalog.inner
+
+    # Get transaction directory paths
+    catalog_root_normalized, filesystem = resolve_path_and_filesystem(
+        catalog_properties.root,
+        catalog_properties.filesystem,
+    )
+
+    return posixpath.join(catalog_root_normalized, TXN_DIR_NAME), filesystem
+
+
+def transaction(
+    catalog_name: Optional[str] = None,
+    as_of: Optional[int] = None,
+    commit_message: Optional[str] = None,
+) -> Transaction:
+    """
+    Start a new interactive transaction for the given catalog.
+
+    Args:
+        catalog_name: Optional name of the catalog to run the transaction
+            against. If None, uses the default catalog.
+        as_of: Optional historic timestamp in nanoseconds since epoch. If
+            provided, creates a read-only transaction that reads only
+            transactions with end times strictly less than the specified
+            timestamp.
+        commit_message: Optional commit message describing the transaction's
+            purpose. Helps with time travel by providing context for each
+            transaction when browsing transaction history.
+
+    Returns:
+        Transaction: A started interactive transaction ready for use with
+        the given catalog.
+
+    Example:
+        # Read-write transaction with a commit message
+        with dc.transaction(commit_message="Initial data load for Q4 analytics") as txn:
+            dc.write_to_table(data, "my_table")
+            dc.write_to_table(more_data, "my_other_table")
+
+        # Read-only historic transaction
+        import time
+        historic_time = time.time_ns() - 3600 * 1000000000  # 1 hour ago
+        with dc.transaction(as_of=historic_time) as txn:
+            # Only read operations allowed - provides a snapshot as of historic_time
+            data = dc.read_table("my_table")
+    """
+    from deltacat.catalog.model.catalog import get_catalog
+
+    # Get the catalog and its properties
+    catalog = get_catalog(catalog_name)
+    catalog_properties = catalog.inner
+
+    # Create an interactive transaction
+    if as_of is not None:
+        # Create a read-only historic transaction
+        txn = Transaction.of(commit_message=commit_message).start(
+            catalog_properties.root,
+            catalog_properties.filesystem,
+            historic_timestamp=as_of,
+        )
+    else:
+        # Create a regular read-write transaction
+        txn = Transaction.of(commit_message=commit_message).start(
+            catalog_properties.root, catalog_properties.filesystem
+        )
+    # Initialize the lazy transaction ID
+    logger.info(f"Created transaction with ID: {txn.id}")
+    return txn
+
+
+def _read_txn(
+    txn_log_dir: str,
+    txn_status: TransactionStatus,
+    transaction_id: str,
+    filesystem: pyarrow.fs.FileSystem,
+) -> Transaction:
+    """
+    Read a transaction with the expected status from the given root
+    transaction log directory.
+
+    Args:
+        txn_log_dir: The directory containing the transaction log.
+        txn_status: The expected status of the transaction.
+        transaction_id: The ID of the transaction.
+        filesystem: The filesystem to use for reading the transaction.
+
+    Returns:
+        Transaction: The transaction.
+    """
+    # Transaction directories contain the actual transaction file
+    txn_dir_path = posixpath.join(
+        txn_log_dir, txn_status.dir_name(), posixpath.basename(transaction_id)
+    )
+
+    try:
+        file_info = get_file_info(txn_dir_path, filesystem)
+    except FileNotFoundError:
+        raise FileNotFoundError(
+            f"Transaction with ID '{transaction_id}' and status '{txn_status}' not found."
+        )
+
+    # Only read transaction directories (skip any stray files)
+    if file_info.type != pyarrow.fs.FileType.Directory:
+        raise FileNotFoundError(
+            f"Transaction directory for transaction ID '{transaction_id}' with status '{txn_status}' not found."
+        )
+
+    # List files in the transaction directory
+    txn_files = list_directory(
+        path=txn_dir_path,
+        filesystem=filesystem,
+        ignore_missing_path=True,
+    )
+
+    if not txn_files:
+        raise FileNotFoundError(
+            f"No transaction file found for transaction ID '{transaction_id}' and status '{txn_status}'."
+        )
+
+    if len(txn_files) > 1:
+        raise RuntimeError(
+            f"Expected 1 transaction file in '{txn_dir_path}', but found {len(txn_files)}"
+        )
+
+    # Get the transaction file path
+    txn_file_path, _ = txn_files[0]
+
+    # Read the transaction from the file
+    return Transaction.read(txn_file_path, filesystem)
+
+
+def read_transaction(
+    transaction_id: str,
+    catalog_name: Optional[str] = None,
+    status: TransactionStatus = TransactionStatus.SUCCESS,
+) -> Transaction:
+    """
+    Read a transaction from the given catalog and transaction ID.
+    """
+    txn_log_dir, filesystem = transaction_log_dir_and_filesystem(catalog_name)
+    return _read_txn(txn_log_dir, status, transaction_id, filesystem)
+
+
+def transactions(
+    catalog_name: Optional[str] = None,
+    read_as: "DatasetType" = None,
+    start_time: Optional[int] = None,
+    end_time: Optional[int] = None,
+    limit: Optional[int] = None,
+    status_in: Iterable[TransactionStatus] = [TransactionStatus.SUCCESS],
+) -> Dataset:
+    """
+    Query transaction history for a catalog.
+
+    Args:
+        catalog_name: Optional name of the catalog to query. If None, uses
+            the default catalog.
+        read_as: Dataset type to return results as. If None, defaults to
+            DatasetType.PYARROW.
+        start_time: Optional start timestamp in nanoseconds since epoch to
+            filter transactions.
+        end_time: Optional end timestamp in nanoseconds since epoch to
+            filter transactions.
+        limit: Optional maximum number of transactions to return (most
+            recent first).
+        status_in: Optional iterable of transaction status types to include.
+            Defaults to [TransactionStatus.SUCCESS].
+
+    Returns:
+        Dataset: Transaction history as the specified dataset type with columns:
+            - transaction_id: Unique transaction identifier
+            - commit_message: Optional user-provided commit message
+            - start_time: Transaction start timestamp (nanoseconds since epoch)
+            - end_time: Transaction end timestamp (nanoseconds since epoch, None for running)
+            - status: Transaction status (SUCCESS, RUNNING, FAILED, PAUSED)
+            - operation_count: Number of operations in the transaction
+            - operation_types: Distinct operation types in the transaction
+            - namespace_count: Number of distinct namespaces affected by the transaction
+            - table_count: Number of distinct tables affected by the transaction
+            - table_version_count: Number of distinct table versions affected by the transaction
+            - stream_count: Number of distinct streams affected by the transaction
+            - partition_count: Number of distinct partitions affected by the transaction
+            - delta_count: Number of distinct deltas affected by the transaction
+
+    Example:
+        # Get recent successful transactions
+        recent = dc.transactions(limit=10)
+
+        # Get transactions for a specific time range
+        import time
+        hour_ago = time.time_ns() - 3600 * 1000000000
+        recent_hour = dc.transactions(start_time=hour_ago)
+
+        # Get transaction history as a pandas DataFrame
+        df = dc.transactions(read_as=dc.DatasetType.PANDAS)
+    """
+    # Validate inputs
+    if limit is not None and limit <= 0:
+        raise ValueError("limit must be greater than 0")
+
+    # Set default read_as if not provided
+    if read_as is None:
+        read_as = DatasetType.PYARROW
+
+    if not status_in:
+        status_in = [TransactionStatus.SUCCESS]
+
+    # Get the transaction directory path and filesystem
+    txn_log_dir, filesystem = transaction_log_dir_and_filesystem(catalog_name)
+
+    # Collect transaction data
+    transaction_records = {
+        "transaction_id": [],
+        "commit_message": [],
+        "start_time": [],
+        "end_time": [],
+        "status": [],
+        "operation_count": [],
+        "operation_types": [],
+        "namespace_count": [],
+        "table_count": [],
+        "table_version_count": [],
+        "stream_count": [],
+        "partition_count": [],
+        "delta_count": [],
+    }
+
+    # Helper function to process transactions in a directory
+    def process_transactions_in_directory(
+        directory: str, expected_status: TransactionStatus
+    ):
+        # TODO(pdames): Do a recursive listing to get the transaction files returned directly.
+        file_info_and_sizes = list_directory(
+            path=directory,
+            filesystem=filesystem,
+            ignore_missing_path=True,
+        )
+
+        for file_path, _ in file_info_and_sizes:
+            # Read the transaction from the file
+            try:
+                txn = _read_txn(
+                    txn_log_dir,
+                    expected_status,
+                    posixpath.basename(file_path),
+                    filesystem,
+                )
+            except FileNotFoundError:
+                # this may be a stray file or the transaction is being created - skip it
+                continue
+
+            # Apply time filters
+            # TODO(pdames): Parse start and end times from the transaction file path.
+            if (
+                start_time is not None
+                and txn.start_time
+                and txn.start_time < start_time
+            ):
+                continue
+            if end_time is not None and txn.end_time and txn.end_time > end_time:
+                continue
+
+            # Count operations and affected metadata objects by type.
+            operation_count = len(txn.operations)
+            operation_types = set()
+            affected_namespaces = set()
+            affected_tables = set()
+            affected_table_versions = set()
+            affected_streams = set()
+            affected_partitions = set()
+            affected_deltas = set()
+
+            for op in txn.operations:
+                operation_types.add(op.type)
+
+                # Determine the locator type and cast to the appropriate locator class
+                locator_dict = op.dest_metafile.get("locator", {})
+                if "tableName" in locator_dict and "namespaceLocator" in locator_dict:
+                    locator = TableLocator(locator_dict)
+                elif "namespace" in locator_dict:
+                    locator = NamespaceLocator(locator_dict)
+                elif "tableVersion" in locator_dict:
+                    locator = TableVersionLocator(locator_dict)
+                elif "streamId" in locator_dict:
+                    locator = StreamLocator(locator_dict)
+                elif "partitionId" in locator_dict:
+                    locator = PartitionLocator(locator_dict)
+                elif "streamPosition" in locator_dict:
+                    locator = DeltaLocator(locator_dict)
+                else:
+                    raise ValueError(
+                        f"Unknown locator type from structure: {locator_dict}"
+                    )
+
+                # Extract distinct metafiles updated by common/alias name
+                # (e.g., a table rename impacts 2 tables instead of 1)
+                if op.type in TransactionOperationType.write_operations():
+                    if locator.namespace is not None:
+                        affected_namespaces.add(locator.namespace)
+                    if isinstance(locator, TableLocator):
+                        affected_tables.add((locator.namespace, locator.table_name))
+                    elif isinstance(locator, TableVersionLocator):
+                        affected_table_versions.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                            )
+                        )
+                    elif isinstance(locator, StreamLocator):
+                        affected_tables.add((locator.namespace, locator.table_name))
+                        affected_table_versions.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                            )
+                        )
+                        affected_streams.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                                locator.stream_id,
+                            )
+                        )
+                    elif isinstance(locator, PartitionLocator):
+                        affected_tables.add((locator.namespace, locator.table_name))
+                        affected_table_versions.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                            )
+                        )
+                        affected_streams.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                                locator.stream_id,
+                            )
+                        )
+                        affected_partitions.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                                locator.stream_id,
+                                locator.partition_id,
+                            )
+                        )
+                    elif isinstance(locator, DeltaLocator):
+                        affected_tables.add((locator.namespace, locator.table_name))
+                        affected_table_versions.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                            )
+                        )
+                        affected_streams.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                                locator.stream_id,
+                            )
+                        )
+                        affected_partitions.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                                locator.stream_id,
+                                locator.partition_id,
+                            )
+                        )
+                        affected_deltas.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                                locator.stream_id,
+                                locator.partition_id,
+                                locator.stream_position,
+                            )
+                        )
+
+            # Create the transaction record
+            transaction_records["transaction_id"].append(txn.id)
+            transaction_records["commit_message"].append(txn.commit_message)
+            transaction_records["start_time"].append(txn.start_time)
+            transaction_records["end_time"].append(txn.end_time)
+            transaction_records["status"].append(expected_status)
+            transaction_records["operation_count"].append(operation_count)
+            transaction_records["operation_types"].append(operation_types)
+            transaction_records["namespace_count"].append(len(affected_namespaces))
+            transaction_records["table_count"].append(len(affected_tables))
+            transaction_records["table_version_count"].append(
+                len(affected_table_versions)
+            )
+            transaction_records["stream_count"].append(len(affected_streams))
+            transaction_records["partition_count"].append(len(affected_partitions))
+            transaction_records["delta_count"].append(len(affected_deltas))
+
+    for status in status_in:
+        dir_path = posixpath.join(txn_log_dir, status.dir_name())
+        process_transactions_in_directory(dir_path, status)
+
+    # Sort by start_time descending (most recent first)
+    if transaction_records["transaction_id"]:  # Only sort if we have records
+        # Sort record indices by start_time
+        sorted_indices = sorted(
+            range(len(transaction_records["start_time"])),
+            key=lambda i: transaction_records["start_time"][i] or 0,
+            reverse=True,
+        )
+
+        # Reorder all columns based on the sorted indices
+        for key in transaction_records:
+            transaction_records[key] = [
+                transaction_records[key][i] for i in sorted_indices
+            ]
+
+    # Apply limit
+    # TODO(pdames): Apply limit during listing (pyarrow fs doesn't support limits natively).
+    if limit is not None and limit > 0:
+        for key in transaction_records:
+            transaction_records[key] = transaction_records[key][:limit]
+
+    # Convert to the requested dataset type
+    return from_pyarrow(pa.Table.from_pydict(transaction_records), read_as)
 
 
 class TransactionTimeProvider:
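
The helpers above make the transaction log itself queryable. A minimal usage sketch, assuming an initialized default catalog and that `read_transaction` is imported from this module (the limit value and printed fields are illustrative only):

    import deltacat as dc
    from deltacat.storage.model.transaction import read_transaction
    from deltacat.storage.model.types import TransactionStatus

    # Ten most recent successful transactions, returned as a pyarrow Table.
    history = dc.transactions(limit=10)

    # Re-read the full log record for the newest transaction.
    newest_id = history["transaction_id"][0].as_py()
    txn = read_transaction(newest_id, status=TransactionStatus.SUCCESS)
    print(txn.commit_message, txn.start_time, txn.end_time)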
@@ -147,6 +661,47 @@ class TransactionSystemTimeProvider(TransactionTimeProvider):
         return current_time
 
 
+class TransactionHistoricTimeProvider(TransactionTimeProvider):
+    """
+    A transaction time provider that returns a fixed historic timestamp
+    for read-only transactions. This enables MVCC snapshot isolation
+    as of the specified timestamp.
+    """
+
+    def __init__(
+        self,
+        historic_timestamp: int,
+        base_time_provider: TransactionTimeProvider,
+    ):
+        """
+        Initialize with a fixed historic timestamp and a base time provider.
+
+        Args:
+            historic_timestamp: Timestamp in nanoseconds since epoch to use
+                for the transaction start time.
+            base_time_provider: Time provider to use for the end time.
+        """
+        # Validate that the historic timestamp is not in the future
+        if historic_timestamp > base_time_provider.start_time():
+            raise ValueError(
+                f"Historic timestamp {historic_timestamp} cannot be set in the future."
+            )
+        self.base_time_provider = base_time_provider
+        self.historic_timestamp = historic_timestamp
+
+    def start_time(self) -> int:
+        """
+        Returns the fixed historic timestamp.
+        """
+        return self.historic_timestamp
+
+    def end_time(self) -> int:
+        """
+        Returns the end time of the base time provider.
+        """
+        return self.base_time_provider.end_time()
+
+
 class TransactionOperation(dict):
     """
     Base class for DeltaCAT transaction operations against individual metafiles.
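
A quick sketch of the new provider's contract, assuming `NANOS_PER_SEC` from deltacat.constants as imported above: reads are pinned to the fixed historic timestamp, the end time still comes from the live base provider, and a future-dated timestamp is rejected up front.

    import time

    # Pin reads to one minute in the past (illustrative offset).
    provider = TransactionHistoricTimeProvider(
        historic_timestamp=time.time_ns() - 60 * NANOS_PER_SEC,
        base_time_provider=TransactionSystemTimeProvider(),
    )
    assert provider.start_time() <= provider.end_time()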
@@ -161,10 +716,13 @@ class TransactionOperation(dict):
     ) -> TransactionOperation:
         if not dest_metafile:
             raise ValueError("Transaction operations must have a destination metafile.")
-        if operation_type == TransactionOperationType.UPDATE:
+        if operation_type in [
+            TransactionOperationType.UPDATE,
+            TransactionOperationType.REPLACE,
+        ]:
             if not src_metafile:
                 raise ValueError(
-                    "UPDATE transaction operations must have a source metafile."
+                    f"{operation_type.value} transaction operations must have a source metafile."
                 )
             elif type(dest_metafile) is not type(src_metafile):
                 raise ValueError(
@@ -173,10 +731,12 @@ class TransactionOperation(dict):
                 )
         elif src_metafile:
             raise ValueError(
-                "Only UPDATE transaction operations may have a source metafile."
+                f"Only {TransactionOperationType.UPDATE.value} and {TransactionOperationType.REPLACE.value} transaction operations may have a source metafile."
             )
         if operation_type.is_write_operation() and read_limit:
-            raise ValueError("Only READ transaction operations may have a read limit.")
+            raise ValueError(
+                f"Only {TransactionOperationType.READ.value} transaction operations may have a read limit."
+            )
         txn_op = TransactionOperation()
         txn_op.type = operation_type
         txn_op.dest_metafile = dest_metafile
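
Since REPLACE is now validated exactly like UPDATE, constructing one looks like the following sketch, where `old_table` and `new_table` are hypothetical metafiles of the same type:

    # REPLACE, like UPDATE, must carry both a source and a destination
    # metafile, and both must be of the same metafile type.
    op = TransactionOperation.of(
        TransactionOperationType.REPLACE,
        dest_metafile=new_table,
        src_metafile=old_table,
    )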
@@ -189,7 +749,10 @@ class TransactionOperation(dict):
         """
         Returns the type of the transaction operation.
         """
-        return TransactionOperationType(self["type"])
+        val = self["type"]
+        if val is not None and not isinstance(val, TransactionOperationType):
+            self["type"] = val = TransactionOperationType(val)
+        return val
 
     @type.setter
     def type(self, txn_op_type: TransactionOperationType):
@@ -200,7 +763,10 @@ class TransactionOperation(dict):
         """
         Returns the metafile that is the target of this transaction operation.
         """
-        return self["dest_metafile"]
+        val = self["dest_metafile"]
+        if val is not None and not isinstance(val, Metafile):
+            self["dest_metafile"] = val = Metafile(val)
+        return val
 
     @dest_metafile.setter
     def dest_metafile(self, metafile: Metafile):
@@ -211,7 +777,10 @@ class TransactionOperation(dict):
         """
         Returns the metafile that is the source of this transaction operation.
         """
-        return self["src_metafile"]
+        val = self.get("src_metafile")
+        if val is not None and not isinstance(val, Metafile):
+            self["src_metafile"] = val = Metafile(val)
+        return val
 
     @src_metafile.setter
     def src_metafile(self, src_metafile: Optional[Metafile]):
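
These getters now normalize raw values lazily, which matters after a msgpack round trip: `Transaction.read()` rebuilds operations from plain dicts and strings, and the typed wrappers are only restored on first access. A small sketch, assuming the enum's own `.value` strings are what gets persisted:

    raw = TransactionOperation(
        {
            "type": TransactionOperationType.CREATE.value,  # plain string on disk
            "dest_metafile": {"id": "123"},  # plain dict on disk
        }
    )
    assert isinstance(raw.type, TransactionOperationType)  # cast on first read
    assert isinstance(raw.dest_metafile, Metafile)  # cast on first read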
@@ -273,6 +842,11 @@ class TransactionOperationList(List[TransactionOperation]):
             self[item] = val = TransactionOperation(val)
         return val
 
+    def __iter__(self):
+        """Support enumeration by returning TransactionOperation objects."""
+        for i in range(len(self)):
+            yield self[i]  # This triggers __getitem__ conversion
+
 
 class Transaction(dict):
     """
@@ -281,43 +855,16 @@ class Transaction(dict):
 
     @staticmethod
     def of(
-        txn_type: TransactionType,
-        txn_operations: Optional[TransactionOperationList],
+        txn_operations: Optional[TransactionOperationList] = None,
+        commit_message: Optional[str] = None,
     ) -> Transaction:
-        operation_types = set([op.type for op in txn_operations])
-        if txn_type == TransactionType.READ:
-            if operation_types - TransactionOperationType.read_operations():
-                raise ValueError(
-                    "Only READ transaction operation types may be specified as "
-                    "part of a READ transaction."
-                )
-        elif (
-            len(operation_types) == 1
-            and TransactionOperationType.CREATE in operation_types
-        ):
-            if txn_type != TransactionType.APPEND:
-                raise ValueError(
-                    "Transactions with only CREATE operations must be "
-                    "specified as part of an APPEND transaction."
-                )
-        elif TransactionOperationType.DELETE in operation_types:
-            if txn_type != TransactionType.DELETE:
-                raise ValueError(
-                    "DELETE transaction operations must be specified as part "
-                    "of a DELETE transaction."
-                )
-        elif TransactionOperationType.UPDATE in operation_types and txn_type not in {
-            TransactionType.ALTER,
-            TransactionType.RESTATE,
-            TransactionType.OVERWRITE,
-        }:
-            raise ValueError(
-                "Transactions with UPDATE operations must be specified "
-                "as part of an ALTER, RESTATE, or OVERWRITE transaction."
-            )
+        if txn_operations is None:
+            txn_operations = []
         transaction = Transaction()
-        transaction.type = txn_type
         transaction.operations = txn_operations
+        transaction.interactive = len(txn_operations) == 0
+        if commit_message:
+            transaction.commit_message = commit_message
         return transaction
 
     @staticmethod
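
With the per-type validation gone, `Transaction.of()` now only distinguishes batch from interactive use: a transaction created without operations is flagged as interactive and must go through start()/step()/seal() rather than commit(). A sketch, where `ops` is a placeholder TransactionOperationList:

    # Batch: operations supplied up front, committed via commit().
    batch_txn = Transaction.of(ops)

    # Interactive: no operations yet; flagged for start()/step()/seal().
    interactive_txn = Transaction.of(commit_message="nightly compaction")
    assert interactive_txn.interactive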
@@ -366,6 +913,7 @@ class Transaction(dict):
         :param filesystem: File system to use for reading the Transaction file.
         :return: Deserialized object from the Transaction file.
         """
+
         if not filesystem:
             path, filesystem = resolve_path_and_filesystem(path, filesystem)
         with filesystem.open_input_stream(path) as file:
@@ -373,6 +921,23 @@ class Transaction(dict):
             binary = file.read()
         obj = cls(**msgpack.loads(binary))
         return obj
+    @staticmethod
+    def read_time_provider(provider_name: str):
+        """
+        Given the string name of a time provider class, return a new instance
+        of it. Raises ValueError if the provider name is unknown.
+        """
+        TIME_PROVIDER_CLASSES = {
+            "TransactionSystemTimeProvider": TransactionSystemTimeProvider,
+            # Add additional mappings as needed
+        }
+
+        provider_cls = TIME_PROVIDER_CLASSES.get(provider_name)
+        if provider_cls is None:
+            raise ValueError(f"Unknown time provider: {provider_name}")
+
+        return provider_cls()
+
     @property
     def id(self) -> Optional[str]:
         """
@@ -384,16 +949,49 @@ class Transaction(dict):
             _id = self["id"] = f"{self.start_time}{TXN_PART_SEPARATOR}{uuid.uuid4()}"
         return _id
 
-    @property
-    def type(self) -> TransactionType:
+    def state(self, catalog_root_dir: str, filesystem: pyarrow.fs.FileSystem = None):
         """
-        Returns the type of the transaction.
+        Infer the transaction state based on its presence in different directories.
         """
-        return TransactionType(self["type"])
 
-    @type.setter
-    def type(self, txn_type: TransactionType):
-        self["type"] = txn_type
+        txn_name = self.id
+
+        catalog_root_normalized, filesystem = resolve_path_and_filesystem(
+            catalog_root_dir
+        )
+
+        txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
+        running_txn_log_dir = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME)
+        filesystem.create_dir(running_txn_log_dir, recursive=True)
+        failed_txn_log_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)
+        filesystem.create_dir(failed_txn_log_dir, recursive=False)
+        success_txn_log_dir = posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME)
+        filesystem.create_dir(success_txn_log_dir, recursive=False)
+        paused_txn_log_dir = posixpath.join(txn_log_dir, PAUSED_TXN_DIR_NAME)
+        filesystem.create_dir(paused_txn_log_dir, recursive=False)
+
+        # Check if the transaction file exists in the failed directory
+        in_failed = os.path.exists(os.path.join(failed_txn_log_dir, txn_name))
+
+        # Check if the transaction file exists in the running directory
+        in_running = os.path.exists(os.path.join(running_txn_log_dir, txn_name))
+
+        # Check if the transaction file exists in the success directory
+        in_success = os.path.exists(os.path.join(success_txn_log_dir, txn_name))
+
+        # Check if the transaction file exists in the paused directory
+        in_paused = os.path.exists(os.path.join(paused_txn_log_dir, txn_name))
+
+        if in_failed and in_running:
+            return TransactionState.FAILED
+        elif in_failed and not in_running:
+            return TransactionState.PURGED
+        elif in_success:
+            return TransactionState.SUCCESS
+        elif in_running:
+            return TransactionState.RUNNING
+        elif in_paused:
+            return TransactionState.PAUSED
 
     @property
     def operations(self) -> TransactionOperationList:
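
The new `state()` helper replaces the old `type` property: it infers where a transaction stands from which log directories contain its file (failed and running means FAILED, failed alone means PURGED, and so on, per the branches above). A short sketch, given a transaction `txn` and a hypothetical local catalog root:

    state = txn.state("/tmp/deltacat-catalog")
    if state in (TransactionState.FAILED, TransactionState.PURGED):
        ...  # e.g., eligible for janitor cleanup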
@@ -406,6 +1004,38 @@ class Transaction(dict):
     def operations(self, operations: TransactionOperationList):
         self["operations"] = operations
 
+    @property
+    def metafile_write_paths(self) -> List[str]:
+        return [path for op in self.operations for path in op.metafile_write_paths]
+
+    @property
+    def locator_write_paths(self) -> List[str]:
+        return [path for op in self.operations for path in op.locator_write_paths]
+
+    @property
+    def catalog_root_normalized(self) -> str:
+        """
+        Returns the normalized catalog root path for this transaction.
+        """
+        return self.get("catalog_root_normalized")
+
+    @catalog_root_normalized.setter
+    def catalog_root_normalized(self, path: str):
+        self["catalog_root_normalized"] = path
+
+    @property
+    def _time_provider(self) -> TransactionSystemTimeProvider:
+        """
+        Returns the time provider of the transaction.
+        """
+        return self.get("_time_provider")
+
+    @_time_provider.setter
+    def _time_provider(
+        self, tp: TransactionSystemTimeProvider
+    ) -> TransactionSystemTimeProvider:
+        self["_time_provider"] = tp
+
     @property
     def start_time(self) -> Optional[int]:
         """
@@ -413,6 +1043,13 @@ class Transaction(dict):
         """
         return self.get("start_time")
 
+    @property
+    def pause_time(self) -> Optional[int]:
+        """
+        Returns the last pause time of the transaction.
+        """
+        return self.get("pause_time")
+
     @property
     def end_time(self) -> Optional[int]:
         """
@@ -420,6 +1057,34 @@ class Transaction(dict):
         """
         return self.get("end_time")
 
+    @property
+    def commit_message(self) -> Optional[str]:
+        """
+        Returns the commit message for the transaction.
+        """
+        return self.get("commit_message")
+
+    @commit_message.setter
+    def commit_message(self, message: str):
+        """
+        Sets the commit message for the transaction.
+        """
+        self["commit_message"] = message
+
+    @property
+    def historic_timestamp(self) -> Optional[int]:
+        """
+        Returns the historic timestamp for the transaction.
+        """
+        return self.get("historic_timestamp")
+
+    @historic_timestamp.setter
+    def historic_timestamp(self, timestamp: int):
+        """
+        Sets the historic timestamp for the transaction.
+        """
+        self["historic_timestamp"] = timestamp
+
     def _mark_start_time(self, time_provider: TransactionTimeProvider) -> int:
         """
         Sets the start time of the transaction using the given
@@ -445,6 +1110,20 @@ class Transaction(dict):
         end_time = self["end_time"] = time_provider.end_time()
         return end_time
 
+    def _mark_pause_time(self, time_provider: TransactionTimeProvider) -> int:
+        """
+        Sets the pause time of the transaction using the given
+        TransactionTimeProvider. Raises a runtime error if the transaction
+        has already been completed by a previous commit, or if the
+        transaction start time has not been set.
+        """
+        if not self.get("start_time"):
+            raise RuntimeError("Cannot pause an unstarted transaction.")
+        if self.get("end_time"):
+            raise RuntimeError("Cannot pause a completed transaction.")
+        pause_time = self["pause_time"] = time_provider.end_time()
+        return pause_time
+
     @staticmethod
     def _abs_txn_meta_path_to_relative(root: str, target: str) -> str:
         """
@@ -499,7 +1178,11 @@ class Transaction(dict):
         validations on the serialized or deserialized object.
         :return: a serializable version of the object
         """
-        serializable = copy.deepcopy(self)
+        # Only copy dictionary keys - all other members should not be serialized
+        serializable = Transaction({})
+        for key, value in self.items():
+            serializable[key] = copy.deepcopy(value)
+
         # remove all src/dest metafile contents except IDs and locators to
         # reduce file size (they can be reconstructed from their corresponding
         # files as required).
@@ -530,6 +1213,17 @@ class Transaction(dict):
         }
         # TODO(pdames): Ensure that all file paths recorded are relative to the
         # catalog root.
+
+        # TODO: check if we care about order or exact time stamps --> pickling time_provider?
+        # serializable.pop("_time_provider", None)
+
+        serializable["_time_provider"] = {
+            "type": type(self._time_provider).__name__,
+            "params": {},
+        }
+
+        serializable.catalog_root_normalized = self.catalog_root_normalized
+
         return serializable
 
     @staticmethod
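
The final hunk below rewrites `commit()` as a thin wrapper over the incremental protocol. A sketch of that protocol under the contract described there, assuming `step()` executes a single operation and `seal()` finalizes the log entry as the wrapper's error message indicates:

    # Equivalent to commit(), spelled out incrementally; `ops` is a
    # placeholder TransactionOperationList, and `root`/`fs` are a catalog
    # root path and pyarrow filesystem.
    active = Transaction.of().start(root, fs)
    for op in ops:
        active.step(op)
    active.seal()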
@@ -574,184 +1268,466 @@ class Transaction(dict):
         self,
         catalog_root_dir: str,
         filesystem: Optional[pyarrow.fs.FileSystem] = None,
-    ) -> Union[List[ListResult[Metafile]], Tuple[List[str], str]]:
-        # TODO(pdames): allow transactions to be durably staged and resumed
-        # across multiple sessions prior to commit
+    ) -> Union[
+        List[ListResult[Metafile]],
+        Tuple[List[str], str],
+        Tuple[List["ListResult[Metafile]"], List[str], str],
+    ]:
+        """
+        Legacy wrapper that preserves the original `commit()` contract while
+        delegating the heavy lifting to the incremental helpers.
+
+        Returns
+        -------
+        - For READ transactions: List[ListResult[Metafile]]
+        - For WRITE transactions: Tuple[List[str], str]
+          (list of successful write paths, path to the success txn log file)
+        - For mixed READ/WRITE transactions: Tuple[List["ListResult[Metafile]"], List[str], str]
+        """

-        # create a new internal copy of this transaction to guard against
-        # external modification and dirty state across retries
-        txn = copy.deepcopy(self)
+        if hasattr(self, "interactive") and self.interactive:
+            raise RuntimeError(
+                "Cannot commit an interactive transaction. Use transaction.start(), "
+                "transaction.step(), and transaction.seal() instead."
+            )

-        # create the transaction directory first to telegraph that at least 1
-        # transaction at this root has been attempted
+        if self.operations and len(self.operations) > 0:
+            # Start a working copy (deep copy, directory scaffolding, start
+            # time, and the running/failed/success/paused dirs).
+            txn_active = self.start(catalog_root_dir, filesystem)  # deep copy
+            # Sequentially execute every TransactionOperation
+            for op in txn_active.operations:
+                txn_active.step(op)
+            return txn_active._seal_steps()
+
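
For orientation, a hedged sketch of the preserved one-shot contract for a pure write transaction; the constructor keyword names and the `my_metafile` placeholder are illustrative rather than verbatim deltacat API:

    # Hypothetical single-operation write transaction committed in one shot.
    txn = Transaction.of(
        txn_type=TransactionType.APPEND,  # assumed transaction type
        txn_operations=[
            TransactionOperation.of(
                operation_type=TransactionOperationType.CREATE,
                dest_metafile=my_metafile,  # placeholder metafile
            )
        ],
    )
    write_paths, success_log_path = txn.commit("/tmp/catalog-root")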
+    def start(
+        self,
+        catalog_root_dir: str,
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+        historic_timestamp: Optional[int] = None,
+    ) -> "Transaction":
+        """
+        Create directory scaffolding, timestamp the txn, and return a DEEP COPY
+        that the caller should use for all subsequent calls to step(), pause(),
+        and seal(). The original object remains read-only.
+
+        Args:
+            catalog_root_dir: Root directory for the catalog
+            filesystem: Optional filesystem to use
+            historic_timestamp: Optional timestamp in nanoseconds since epoch for snapshot isolation
+        """
+        # Create a deep copy
+        txn: "Transaction" = copy.deepcopy(self)
+
+        # Set up the time provider based on the transaction type
+        if historic_timestamp is not None:
+            # Use the historic time provider for snapshot isolation
+            # TODO(pdames): Set the base time provider to the catalog's configured time provider when more than one is supported.
+            txn._time_provider = TransactionHistoricTimeProvider(
+                historic_timestamp,
+                TransactionSystemTimeProvider(),
+            )
+            txn.historic_timestamp = historic_timestamp
+        else:
+            # Use the system time provider for regular transactions
+            txn._time_provider = TransactionSystemTimeProvider()
+
+        txn._mark_start_time(txn._time_provider)  # record the start time on the deep copy
+
+        # Set up the filesystem and directories
         catalog_root_normalized, filesystem = resolve_path_and_filesystem(
             catalog_root_dir,
             filesystem,
         )
-        txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
-        running_txn_log_dir = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME)
-        filesystem.create_dir(running_txn_log_dir, recursive=True)
-        failed_txn_log_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)
-        filesystem.create_dir(failed_txn_log_dir, recursive=False)
-        success_txn_log_dir = posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME)
-        filesystem.create_dir(success_txn_log_dir, recursive=False)
+        txn.catalog_root_normalized = catalog_root_normalized
+        txn._filesystem = filesystem  # keep for pause/resume
+        txn.running_log_written = False  # internal flags
+        txn._list_results = []

-        # TODO(pdames): Support injection of other time providers, but ensure
-        # that ALL transactions in a catalog use the same time provider.
-        time_provider = TransactionSystemTimeProvider()
-
-        # record the transaction start time
-        txn._mark_start_time(time_provider)
-
-        if txn.type == TransactionType.READ:
-            list_results = []
-            for operation in self.operations:
-                list_result = operation.dest_metafile.read_txn(
-                    catalog_root_dir=catalog_root_normalized,
-                    success_txn_log_dir=success_txn_log_dir,
-                    current_txn_op=operation,
-                    current_txn_start_time=txn.start_time,
-                    current_txn_id=txn.id,
-                    filesystem=filesystem,
+        # Make sure txn/ directories exist (idempotent)
+        txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
+        filesystem.create_dir(
+            posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME),
+            recursive=True,
+        )
+        for subdir in (FAILED_TXN_DIR_NAME, SUCCESS_TXN_DIR_NAME, PAUSED_TXN_DIR_NAME):
+            try:
+                filesystem.create_dir(
+                    posixpath.join(txn_log_dir, subdir),
+                    recursive=False,
                 )
-                list_results.append(list_result)
-            return list_results
-        else:
-            return txn._commit_write(
-                catalog_root_normalized=catalog_root_normalized,
-                running_txn_log_dir=running_txn_log_dir,
-                failed_txn_log_dir=failed_txn_log_dir,
-                success_txn_log_dir=success_txn_log_dir,
-                filesystem=filesystem,
-                time_provider=time_provider,
-            )
+            except FileExistsError:
+                pass  # allowed when the catalog is already initialized
+        return txn
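
A short sketch of the two start() modes described in the docstring above; the catalog path and timestamp arithmetic are illustrative:

    import time

    # Regular transaction: backed by the system time provider.
    active = txn.start("/tmp/catalog-root")

    # Read-only snapshot at a historic point in time (nanoseconds since epoch).
    one_hour_ns = 60 * 60 * 10**9
    historic = txn.start(
        "/tmp/catalog-root",
        historic_timestamp=time.time_ns() - one_hour_ns,
    )
    # `historic` may only run read operations; step() enforces this.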

-    def _commit_write(
+    def step(
         self,
-        catalog_root_normalized: str,
-        running_txn_log_dir: str,
-        failed_txn_log_dir: str,
-        success_txn_log_dir: str,
-        filesystem: pyarrow.fs.FileSystem,
-        time_provider: TransactionTimeProvider,
-    ) -> Tuple[List[str], str]:
-        # write the in-progress transaction log file
+        operation: "TransactionOperation",
+    ) -> Union[ListResult[Metafile], Tuple[List[str], List[str]]]:
+        """
+        Executes a single transaction operation.
+
+        Parameters
+        ----------
+        operation: TransactionOperation
+            The transaction operation to execute.
+
+        Returns
+        -------
+        - For a READ transaction operation: ListResult[Metafile]
+        - For a WRITE transaction operation: Tuple[List[str], List[str]]
+          (list of successful metafile write paths, list of successful locator write paths)
+        """
+
+        catalog_root_normalized = self.catalog_root_normalized
+        filesystem = self._filesystem
+        txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
+
         running_txn_log_file_path = posixpath.join(
-            running_txn_log_dir,
-            self.id,
+            txn_log_dir, RUNNING_TXN_DIR_NAME, self.id
         )
-        with filesystem.open_output_stream(running_txn_log_file_path) as file:
-            packed = msgpack.dumps(self.to_serializable(catalog_root_normalized))
-            file.write(packed)

-        # write each metafile associated with the transaction
-        metafile_write_paths = []
-        locator_write_paths = []
+        # Validate read-only transaction constraints
+        if self.historic_timestamp is not None:
+            if not operation.type.is_read_operation():
+                raise RuntimeError(
+                    f"Cannot perform {operation.type.value} operation in a read-only historic transaction."
+                )
+
+        # Add the new operation to the transaction's list of operations
+        if self.interactive:
+            self.operations = self.operations + [operation]
+
+        # (a) READ txn op
+        if operation.type.is_read_operation():
+            list_result = operation.dest_metafile.read_txn(
+                catalog_root_dir=catalog_root_normalized,
+                success_txn_log_dir=posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME),
+                current_txn_op=operation,
+                current_txn_start_time=self.start_time,
+                current_txn_id=self.id,
+                filesystem=filesystem,
+            )
+            self._list_results.append(list_result)
+            return list_result
+
+        # (b) WRITE txn op
+        # First operation? -> create a running log so an external janitor can
+        # see that a txn is in-flight.
+        if not self.running_log_written:
+            self._write_running_log(running_txn_log_file_path)
+
         try:
-            for operation in self.operations:
-                operation.dest_metafile.write_txn(
-                    catalog_root_dir=catalog_root_normalized,
-                    success_txn_log_dir=success_txn_log_dir,
-                    current_txn_op=operation,
-                    current_txn_start_time=self.start_time,
-                    current_txn_id=self.id,
+            (
+                metafile_write_paths,
+                locator_write_paths,
+            ) = operation.dest_metafile.write_txn(
+                catalog_root_dir=catalog_root_normalized,
+                success_txn_log_dir=posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME),
+                current_txn_op=operation,
+                current_txn_start_time=self.start_time,
+                current_txn_id=self.id,
+                filesystem=filesystem,
+            )
+            # Check for concurrent txn conflicts on the metafile and locator write paths just written
+            # TODO(pdames): Remove the fast-fail check here if it grows too expensive?
+            for path in metafile_write_paths + locator_write_paths:
+                MetafileRevisionInfo.check_for_concurrent_txn_conflict(
+                    success_txn_log_dir=posixpath.join(
+                        txn_log_dir,
+                        SUCCESS_TXN_DIR_NAME,
+                    ),
+                    current_txn_revision_file_path=path,
                     filesystem=filesystem,
                 )
-                metafile_write_paths.extend(operation.metafile_write_paths)
-                locator_write_paths.extend(operation.locator_write_paths)
-            # check for conflicts with concurrent transactions
-            for path in metafile_write_paths + locator_write_paths:
-                MetafileRevisionInfo.check_for_concurrent_txn_conflict(
-                    success_txn_log_dir=success_txn_log_dir,
-                    current_txn_revision_file_path=path,
-                    filesystem=filesystem,
-                )
+            return metafile_write_paths, locator_write_paths
         except Exception:
-            # write a failed transaction log file entry
-            failed_txn_log_file_path = posixpath.join(
-                failed_txn_log_dir,
-                self.id,
-            )
-            with filesystem.open_output_stream(failed_txn_log_file_path) as file:
-                packed = msgpack.dumps(self.to_serializable(catalog_root_normalized))
-                file.write(packed)
-
-            ###################################################################
-            ###################################################################
-            # failure past here telegraphs a failed transaction cleanup attempt
-            ###################################################################
-            ###################################################################
-
-            # delete all files written during the failed transaction
-            known_write_paths = chain.from_iterable(
-                [
-                    operation.metafile_write_paths + operation.locator_write_paths
-                    for operation in self.operations
-                ]
+            # mark the in-flight txn FAILED and clean up partial files
+            self._fail_and_cleanup(
+                failed_txn_log_dir=posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME),
+                running_log_path=running_txn_log_file_path,
             )
-            # TODO(pdames): Add separate janitor job to cleanup files that we
-            # either failed to add to the known write paths, or fail to delete.
-            for write_path in known_write_paths:
-                filesystem.delete_file(write_path)
-
-            # delete the in-progress transaction log file entry
-            filesystem.delete_file(running_txn_log_file_path)
-            # failed transaction cleanup is now complete
-            raise
+            raise  # surface the original error
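
A sketch of consuming step()'s two return shapes on the active copy returned by start(); `read_op` and `write_op` are illustrative TransactionOperation instances:

    list_result = active.step(read_op)  # READ op -> ListResult[Metafile]

    metafile_paths, locator_paths = active.step(write_op)  # WRITE op -> two path lists
    # On failure, step() writes a failed/<txn_id> log, deletes partial files
    # via _fail_and_cleanup(), and re-raises the original error.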

-        # record the completed transaction
-        success_txn_log_file_dir = posixpath.join(
-            success_txn_log_dir,
-            self.id,
-        )
-        filesystem.create_dir(
-            success_txn_log_file_dir,
-            recursive=False,
-        )
-        end_time = self._mark_end_time(time_provider)
-        success_txn_log_file_path = posixpath.join(
-            success_txn_log_file_dir,
-            str(end_time),
+    def pause(self) -> None:
+        fs = self._filesystem
+        root = self.catalog_root_normalized
+        txn_log_dir = posixpath.join(root, TXN_DIR_NAME)
+
+        running_path = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME, self.id)
+        paused_path = posixpath.join(txn_log_dir, PAUSED_TXN_DIR_NAME, self.id)
+
+        fs.create_dir(posixpath.dirname(paused_path), recursive=True)
+
+        # Record the pause time (e.g., for time consistency guarantees)
+        self._mark_pause_time(self._time_provider)
+
+        # Serialize the current transaction state into paused/<txn_id>
+        with fs.open_output_stream(paused_path) as f:
+            f.write(msgpack.dumps(self.to_serializable(root)))
+
+        # Clean up the original running log
+        fs.delete_file(running_path)
+
+    def resume(self) -> None:
+        fs = self._filesystem
+        root = self.catalog_root_normalized
+        txn_log_dir = posixpath.join(root, TXN_DIR_NAME)
+
+        running_path = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME, self.id)
+        paused_path = posixpath.join(txn_log_dir, PAUSED_TXN_DIR_NAME, self.id)
+
+        # Load the serialized transaction state
+        with fs.open_input_stream(paused_path) as f:
+            loaded_txn_data = msgpack.loads(f.readall())
+
+        # Restore instance state from the deserialized transaction
+        restored_txn = Transaction(**loaded_txn_data)
+        self.__dict__.update(restored_txn.__dict__)
+
+        # Reconstruct the time provider; this also supports restoring provider
+        # state if non-ephemeral providers are ever added.
+        new_provider = Transaction.read_time_provider(
+            restored_txn["_time_provider"]["type"]
         )
-        with filesystem.open_output_stream(success_txn_log_file_path) as file:
-            packed = msgpack.dumps(self.to_serializable(catalog_root_normalized))
-            file.write(packed)
+
+        # Verify that the system clock did not move backwards across the pause
+        now = new_provider.start_time()
+        self._time_provider = new_provider  # the start time should be preserved
+        if now < self.pause_time:
+            raise RuntimeError(
+                f"System clock {now} is behind paused transaction time {self.pause_time}"
+            )
+        # TODO: set a new start time, or keep raising when the clock is off?
+
+        # Move back to the running state
+        fs.create_dir(posixpath.dirname(running_path), recursive=True)
+        with fs.open_output_stream(running_path) as f:
+            f.write(msgpack.dumps(self.to_serializable(root)))
+        fs.delete_file(paused_path)
+
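
A sketch of the pause/resume round-trip implemented above; the transaction log moves running/<txn_id> → paused/<txn_id> → running/<txn_id> (the operations shown are illustrative):

    active = txn.start("/tmp/catalog-root")
    active.step(first_op)
    active.pause()          # state serialized to txn/paused/<txn_id>

    # ... later, in the same process ...
    active.resume()         # clock check, then back to txn/running/<txn_id>
    active.step(second_op)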
+    def seal(
+        self,
+    ) -> Union[
+        List["ListResult[Metafile]"],
+        Tuple[List[str], str],
+        Tuple[List["ListResult[Metafile]"], List[str], str],
+    ]:
+        """
+        For READ → returns the list_results collected during step().
+        For WRITE → returns (written_paths, success_log_path).
+        For mixed READ/WRITE → returns (list_results, written_paths, success_log_path).
+        """
+        if not self.interactive:
+            raise RuntimeError(
+                "Cannot seal a non-interactive transaction. Call transaction.commit() instead."
+            )
+
+        # Read-only transactions can only perform read operations
+        if self.historic_timestamp is not None:
+            if self._has_write_operations():
+                raise RuntimeError(
+                    "Cannot seal a read-only historic transaction that contains write operations."
+                )
+
+        return self._seal_steps()
+
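
A sketch of unpacking seal()'s three possible return shapes, mirroring the branches in _seal_steps() below:

    outcome = active.seal()
    if all(op.type.is_read_operation() for op in active.operations):
        list_results = outcome                                 # pure READ
    elif all(op.type.is_write_operation() for op in active.operations):
        write_paths, success_log_path = outcome                # pure WRITE
    else:
        list_results, write_paths, success_log_path = outcome  # mixed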
+    def _has_write_operations(self) -> bool:
+        """
+        Check if the transaction contains any write operations.
+        Read-only transactions should only contain READ operations.
+        """
+        for operation in self.operations:
+            if not operation.type.is_read_operation():
+                return True
+        return False
+
+    def _seal_steps(
+        self,
+    ) -> Union[
+        List["ListResult[Metafile]"],
+        Tuple[List[str], str],
+        Tuple[List["ListResult[Metafile]"], List[str], str],
+    ]:
+        fs = self._filesystem
+        root = self.catalog_root_normalized
+        txn_log_dir = posixpath.join(root, TXN_DIR_NAME)
+        end_time = self._mark_end_time(self._time_provider)
+
+        # READ path: nothing was persisted, so we are done
+        if all(op.type.is_read_operation() for op in self.operations):
+            return self._list_results
+
+        running_path = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME, self.id)
+        failed_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)
+        success_dir = posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME)
+
+        # If no operations ever succeeded, we still need a running log.
+        if not self.running_log_written:
+            self._write_running_log(running_path)
         try:
-            Transaction._validate_txn_log_file(
-                success_txn_log_file=success_txn_log_file_path
+            # Check for concurrent txn conflicts on metafile and locator write paths
+            for path in self.metafile_write_paths + self.locator_write_paths:
+                MetafileRevisionInfo.check_for_concurrent_txn_conflict(
+                    success_txn_log_dir=posixpath.join(
+                        txn_log_dir, SUCCESS_TXN_DIR_NAME
+                    ),
+                    current_txn_revision_file_path=path,
+                    filesystem=fs,
+                )
+        except Exception:
+            self._fail_and_cleanup(
+                failed_txn_log_dir=failed_dir,
+                running_log_path=running_path,
             )
+            # re-raise the original error
+            raise
+        success_log_path = None
+        try:
+            # write the transaction log
+            success_txn_dir = posixpath.join(success_dir, self.id)
+            fs.create_dir(success_txn_dir, recursive=False)
+
+            success_log_path = posixpath.join(success_txn_dir, str(end_time))
+            with fs.open_output_stream(success_log_path) as f:
+                f.write(msgpack.dumps(self.to_serializable(root)))
+
+            Transaction._validate_txn_log_file(success_txn_log_file=success_log_path)
+
         except Exception as e1:
+            self._fail_and_cleanup(
+                failed_txn_log_dir=failed_dir,
+                running_log_path=running_path,
+                success_log_path=success_log_path,
+            )
+            raise RuntimeError(
+                f"Transaction validation failed. To preserve catalog integrity, "
+                f"the corresponding completed transaction log at "
+                f"`{success_log_path}` has been removed."
+            ) from e1
+
+        else:
+            fs.delete_file(running_path)
+            if all(op.type.is_write_operation() for op in self.operations):
+                # pure write transaction - just return write paths and the success log path
+                return self.metafile_write_paths, success_log_path
+            else:
+                # mixed read/write transaction - return read results, write paths, and the success log path
+                return self._list_results, self.metafile_write_paths, success_log_path
+
+    # Helper: write or overwrite the running/<txn_id> file exactly once
+    def _write_running_log(self, running_log_path: str) -> None:
+        with self._filesystem.open_output_stream(running_log_path) as f:
+            f.write(msgpack.dumps(self.to_serializable(self.catalog_root_normalized)))
+        self.running_log_written = True
+
+    # Helper: mark the txn FAILED and clean up partial output
+    def _fail_and_cleanup(
+        self,
+        failed_txn_log_dir: str,
+        running_log_path: str,
+        success_log_path: Optional[str] = None,
+    ) -> None:
+        fs = self._filesystem
+
+        # 1. write failed/<txn_id>
+        failed_log_path = posixpath.join(failed_txn_log_dir, self.id)
+        with fs.open_output_stream(failed_log_path) as f:
+            f.write(msgpack.dumps(self.to_serializable(self.catalog_root_normalized)))
+
+        # 2. delete all provisional files
+        for path in self.metafile_write_paths:
             try:
-                # move the txn log from success dir to failed dir
-                failed_txn_log_file_path = posixpath.join(
-                    failed_txn_log_dir,
-                    self.id,
-                )
-                filesystem.move(
-                    src=success_txn_log_file_path,
-                    dest=failed_txn_log_file_path,
-                )
-                # keep parent success txn log dir to telegraph failed validation
-
-                ###############################################################
-                ###############################################################
-                # failure past here telegraphs a failed transaction validation
-                # cleanup attempt
-                ###############################################################
-                ###############################################################
-            except Exception as e2:
-                raise OSError(
-                    f"Failed to cleanup bad transaction log file at "
-                    f"`{success_txn_log_file_path}`"
-                ) from e2
-            finally:
-                raise RuntimeError(
-                    f"Transaction validation failed. To preserve "
-                    f"catalog integrity, the corresponding completed "
-                    f"transaction log at `{success_txn_log_file_path}` has "
-                    f"been removed."
-                ) from e1
+                fs.delete_file(path)
+            except Exception:
+                pass  # best-effort; a janitor job will catch leftovers
+        for path in self.locator_write_paths:
+            try:
+                fs.delete_file(path)
+            except Exception:
+                pass  # best-effort; a janitor job will catch leftovers
+
+        # 3. tidy up bookkeeping logs
+        try:
+            fs.delete_file(running_log_path)
+        except Exception:
+            pass
+        if success_log_path:
+            try:
+                fs.delete_file(success_log_path)
+            except Exception:
+                pass
+
+    def __enter__(self) -> "Transaction":
+        """
+        Context manager entry point. Sets this transaction as the current context.
+        Supports nested transactions by preserving the context stack.
+        """
+        if not hasattr(self, "interactive") or not self.interactive:
+            raise RuntimeError(
+                "Transaction must be interactive to use with a context manager. "
+                "Use dc.transaction() to create an interactive transaction."
+            )
+        if self.start_time is None:
+            raise RuntimeError(
+                "Transaction has not been started. "
+                "Use dc.transaction() to create a properly initialized transaction."
+            )
+
+        # Store the context token for restoration in __exit__
+        self._context_token = set_current_transaction(self)
+        return self
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> None:
+        """
+        Context manager exit point. Restores the previous transaction context
+        and automatically seals the transaction on successful completion, or
+        fails it if an exception occurred.
+
+        Args:
+            exc_type: Exception type if an exception occurred, None otherwise
+            exc_value: Exception value if an exception occurred, None otherwise
+            traceback: Exception traceback if an exception occurred, None otherwise
+        """
+        try:
+            if exc_type is None and exc_value is None and traceback is None:
+                # No exception occurred - seal the transaction
+                self.seal()
+            else:
+                # An exception occurred during the transaction - fail and clean up
+                try:
+                    catalog_root_normalized = self.catalog_root_normalized
+                    txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
+                    running_txn_log_file_path = posixpath.join(
+                        txn_log_dir, RUNNING_TXN_DIR_NAME, self.id
+                    )
+                    self._fail_and_cleanup(
+                        failed_txn_log_dir=posixpath.join(
+                            txn_log_dir, FAILED_TXN_DIR_NAME
+                        ),
+                        running_log_path=running_txn_log_file_path,
+                    )
+                except Exception:
+                    # If cleanup fails, still let the original exception propagate
+                    pass
         finally:
-            # delete the in-progress transaction log file entry
-            filesystem.delete_file(running_txn_log_file_path)
-            return metafile_write_paths, success_txn_log_file_path
+            # Always restore the previous transaction context using the token
+            if hasattr(self, "_context_token"):
+                try:
+                    # Get the previous value from the token
+                    old_value = self._context_token.old_value
+                    # Only set if the old value is a valid transaction or None
+                    if old_value is None or isinstance(old_value, Transaction):
+                        _current_transaction.set(old_value)
+                    else:
+                        # If old_value is invalid (e.g., Token.MISSING), set to None
+                        _current_transaction.set(None)
+                except (AttributeError, LookupError):
+                    # If the token lacks old_value or the context is corrupted, clear it
+                    try:
+                        _current_transaction.set(None)
+                    except LookupError:
+                        pass
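
A sketch of the context-manager flow these hooks enable, assuming dc.transaction() returns a started interactive transaction as the error messages above suggest (`op` is an illustrative TransactionOperation):

    import deltacat as dc

    with dc.transaction() as txn:
        txn.step(op)  # operations execute incrementally inside the context
    # On clean exit, __exit__ calls seal(); on exception it runs
    # _fail_and_cleanup() and lets the original error propagate.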