deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl
This diff shows the content changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +2 -3
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -1
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
- deltacat/compute/compactor_v2/steps/merge.py +11 -80
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.35.dist-info/METADATA +0 -64
- deltacat-1.1.35.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
```diff
--- /dev/null
+++ b/deltacat/storage/model/transaction.py
@@ -0,0 +1,757 @@
+from __future__ import annotations
+
+import os
+import copy
+import time
+import uuid
+import posixpath
+from pathlib import PosixPath
+import threading
+from collections import defaultdict
+
+from itertools import chain
+from typing import Optional, List, Union, Tuple
+
+import msgpack
+import pyarrow.fs
+
+from deltacat.constants import (
+    TXN_DIR_NAME,
+    TXN_PART_SEPARATOR,
+    RUNNING_TXN_DIR_NAME,
+    FAILED_TXN_DIR_NAME,
+    SUCCESS_TXN_DIR_NAME,
+    NANOS_PER_SEC,
+)
+from deltacat.storage.model.list_result import ListResult
+from deltacat.storage.model.types import (
+    TransactionOperationType,
+    TransactionType,
+)
+from deltacat.storage.model.metafile import (
+    Metafile,
+    MetafileRevisionInfo,
+)
+from deltacat.utils.filesystem import (
+    resolve_path_and_filesystem,
+    list_directory,
+)
+
+
+class TransactionTimeProvider:
+    """
+    Provider interface for transaction start and end times. An ideal
+    transaction time provider is externally consistent (e.g.,
+    https://cloud.google.com/spanner/docs/true-time-external-consistency),
+    such that:
+      1. A transaction start time is never less than a previously completed
+      transaction's end time.
+      2. A transaction end time is never less than an in-progress
+      transaction's start time.
+      3. Every transaction has a unique start and end time.
+      4. Start/end time assignment is non-blocking.
+    """
+
+    def start_time(self) -> int:
+        raise NotImplementedError("start_time not implemented")
+
+    def end_time(self) -> int:
+        raise NotImplementedError("end_time not implemented")
+
+
+class TransactionSystemTimeProvider(TransactionTimeProvider):
+    """
+    A local transaction time provider that returns the current system clock
+    epoch time in nanoseconds. Ensures that all local transaction start
+    times are greater than all last known end times, and that all known end
+    times are no less than all last known start time across all local threads
+    using this time provider.
+
+    Note that this time provider gives no external consistency guarantees due
+    to potential clock skew between distributed nodes writing to the same
+    catalog, and is only recommended for use with local catalogs.
+    """
+
+    last_known_start_times = defaultdict(int)
+    last_known_end_times = defaultdict(int)
+
+    # don't wait more than 60 seconds for the system clock to catch up
+    # between transactions (assumed to be indicative of a larger system
+    # clock change made between transactions)
+    max_sync_wait_time = 60 * NANOS_PER_SEC
+
+    def start_time(self) -> int:
+        """
+        Gets the current system time in nanoseconds since the epoch. Ensures
+        that the start time returned is greater than the last known end time
+        recorded at the time this method is invoked.
+        :return: Current epoch time in nanoseconds.
+        """
+        # ensure serial transactions in a single process have start times after
+        # the last known end time
+        last_known_end_times = self.last_known_end_times.values() or [0]
+        max_known_end_time = max(last_known_end_times)
+
+        elapsed_start_time = time.monotonic_ns()
+        current_time = time.time_ns()
+        while current_time <= max_known_end_time:
+            elapsed_time = time.monotonic_ns() - elapsed_start_time
+            if elapsed_time > self.max_sync_wait_time:
+                raise TimeoutError(
+                    f"Failed to sync cross-transaction system clock time after "
+                    f"{self.max_sync_wait_time / NANOS_PER_SEC} seconds, "
+                    f"aborting."
+                )
+            time.sleep(0.000001)
+            current_time = time.time_ns()
+
+        # update the current thread's last known end time
+        pid = os.getpid()
+        tid = threading.current_thread().ident
+        current_thread_time_key = (pid, tid)
+        self.last_known_end_times[current_thread_time_key] = current_time
+
+        return current_time
+
+    def end_time(self) -> int:
+        """
+        Gets the current system time in nanoseconds since the epoch. Ensures
+        that the end time returned is no less than the last known start time
+        recorded at the time this method is invoked.
+        :return: Current epoch time in nanoseconds.
+        """
+        # ensure serial transactions in a single process have end times no less
+        # than the last known start time
+        last_known_start_times = self.last_known_start_times.values() or [0]
+        last_start_time = max(last_known_start_times)
+
+        elapsed_start_time = time.monotonic_ns()
+        current_time = time.time_ns()
+        while current_time < last_start_time:
+            elapsed_time = time.monotonic_ns() - elapsed_start_time
+            if elapsed_time > self.max_sync_wait_time:
+                raise TimeoutError(
+                    f"Failed to sync cross-transaction system clock time after "
+                    f"{self.max_sync_wait_time / NANOS_PER_SEC} seconds, "
+                    f"aborting."
+                )
+            time.sleep(0.000001)
+            current_time = time.time_ns()
+
+        # update the current thread's last known end time
+        pid = os.getpid()
+        tid = threading.current_thread().ident
+        current_thread_time_key = (pid, tid)
+        self.last_known_start_times[current_thread_time_key] = current_time
+
+        return current_time
+
+
+class TransactionOperation(dict):
+    """
+    Base class for DeltaCAT transaction operations against individual metafiles.
+    """
+
+    @staticmethod
+    def of(
+        operation_type: Optional[TransactionOperationType],
+        dest_metafile: Metafile,
+        src_metafile: Optional[Metafile] = None,
+        read_limit: Optional[int] = None,
+    ) -> TransactionOperation:
+        if not dest_metafile:
+            raise ValueError("Transaction operations must have a destination metafile.")
+        if operation_type == TransactionOperationType.UPDATE:
+            if not src_metafile:
+                raise ValueError(
+                    "UPDATE transaction operations must have a source metafile."
+                )
+            elif type(dest_metafile) is not type(src_metafile):
+                raise ValueError(
+                    f"Source metafile type `{type(src_metafile)}` is not "
+                    f"equal to dest metafile type `{type(dest_metafile)}`."
+                )
+        elif src_metafile:
+            raise ValueError(
+                "Only UPDATE transaction operations may have a source metafile."
+            )
+        if operation_type.is_write_operation() and read_limit:
+            raise ValueError("Only READ transaction operations may have a read limit.")
+        txn_op = TransactionOperation()
+        txn_op.type = operation_type
+        txn_op.dest_metafile = dest_metafile
+        txn_op.src_metafile = src_metafile
+        txn_op.read_limit = read_limit
+        return txn_op
+
+    @property
+    def type(self) -> TransactionOperationType:
+        """
+        Returns the type of the transaction operation.
+        """
+        return TransactionOperationType(self["type"])
+
+    @type.setter
+    def type(self, txn_op_type: TransactionOperationType):
+        self["type"] = txn_op_type
+
+    @property
+    def dest_metafile(self) -> Metafile:
+        """
+        Returns the metafile that is the target of this transaction operation.
+        """
+        return self["dest_metafile"]
+
+    @dest_metafile.setter
+    def dest_metafile(self, metafile: Metafile):
+        self["dest_metafile"] = metafile
+
+    @property
+    def src_metafile(self) -> Optional[Metafile]:
+        """
+        Returns the metafile that is the source of this transaction operation.
+        """
+        return self["src_metafile"]
+
+    @src_metafile.setter
+    def src_metafile(self, src_metafile: Optional[Metafile]):
+        self["src_metafile"] = src_metafile
+
+    @property
+    def read_limit(self) -> Optional[int]:
+        """
+        Returns the read limit for this transaction operation.
+        """
+        return self.get("read_limit")
+
+    @read_limit.setter
+    def read_limit(self, read_limit: Optional[int]):
+        self["read_limit"] = read_limit
+
+    @property
+    def metafile_write_paths(self) -> List[str]:
+        return self.get("metafile_write_paths") or []
+
+    @property
+    def locator_write_paths(self) -> List[str]:
+        return self.get("locator_write_paths") or []
+
+    def append_metafile_write_path(self, write_path: str):
+        metafile_write_paths = self.get("metafile_write_paths")
+        if not metafile_write_paths:
+            metafile_write_paths = self["metafile_write_paths"] = []
+        metafile_write_paths.append(write_path)
+
+    def append_locator_write_path(self, write_path: str):
+        locator_write_paths = self.get("locator_write_paths")
+        if not locator_write_paths:
+            locator_write_paths = self["locator_write_paths"] = []
+        locator_write_paths.append(write_path)
+
+    @metafile_write_paths.setter
+    def metafile_write_paths(self, write_paths: List[str]) -> None:
+        self["metafile_write_paths"] = write_paths
+
+    @locator_write_paths.setter
+    def locator_write_paths(self, write_paths: List[str]):
+        self["locator_write_paths"] = write_paths
+
+
+class TransactionOperationList(List[TransactionOperation]):
+    @staticmethod
+    def of(items: List[TransactionOperation]) -> TransactionOperationList:
+        typed_items = TransactionOperationList()
+        for item in items:
+            if item is not None and not isinstance(item, TransactionOperation):
+                item = TransactionOperation(item)
+            typed_items.append(item)
+        return typed_items
+
+    def __getitem__(self, item):
+        val = super().__getitem__(item)
+        if val is not None and not isinstance(val, TransactionOperation):
+            self[item] = val = TransactionOperation(val)
+        return val
+
+
+class Transaction(dict):
+    """
+    Base class for DeltaCAT transactions.
+    """
+
+    @staticmethod
+    def of(
+        txn_type: TransactionType,
+        txn_operations: Optional[TransactionOperationList],
+    ) -> Transaction:
+        operation_types = set([op.type for op in txn_operations])
+        if txn_type == TransactionType.READ:
+            if operation_types - TransactionOperationType.read_operations():
+                raise ValueError(
+                    "Only READ transaction operation types may be specified as "
+                    "part of a READ transaction."
+                )
+        elif (
+            len(operation_types) == 1
+            and TransactionOperationType.CREATE in operation_types
+        ):
+            if txn_type != TransactionType.APPEND:
+                raise ValueError(
+                    "Transactions with only CREATE operations must be "
+                    "specified as part of an APPEND transaction."
+                )
+        elif TransactionOperationType.DELETE in operation_types:
+            if txn_type != TransactionType.DELETE:
+                raise ValueError(
+                    "DELETE transaction operations must be specified as part "
+                    "of a DELETE transaction."
+                )
+        elif TransactionOperationType.UPDATE in operation_types and txn_type not in {
+            TransactionType.ALTER,
+            TransactionType.RESTATE,
+            TransactionType.OVERWRITE,
+        }:
+            raise ValueError(
+                "Transactions with UPDATE operations must be specified "
+                "as part of an ALTER, RESTATE, or OVERWRITE transaction."
+            )
+        transaction = Transaction()
+        transaction.type = txn_type
+        transaction.operations = txn_operations
+        return transaction
+
+    @staticmethod
+    def read_end_time(
+        path: str,
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+    ) -> Optional[int]:
+        """
+        Returns the end time of the transaction, or None if the transaction
+        log file does not exist.
+        :param path: Transaction log path to read.
+        :param filesystem: File system to use for reading the Transaction file.
+        :return: Deserialized object from the Transaction file.
+        """
+        # TODO(pdames): Validate that input file path is a valid txn log.
+        if not filesystem:
+            path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        file_info_and_sizes = list_directory(
+            path=path,
+            filesystem=filesystem,
+            ignore_missing_path=True,
+        )
+        end_time = None
+        if file_info_and_sizes:
+            if len(file_info_and_sizes) > 1:
+                raise ValueError(
+                    f"Expected to find only one transaction log at {path}, "
+                    f"but found {len(file_info_and_sizes)}"
+                )
+            end_time = Transaction._parse_end_time(file_info_and_sizes[0][0])
+        return end_time
+
+    @staticmethod
+    def _parse_end_time(txn_log_file_name_or_path: str) -> int:
+        return int(posixpath.basename(txn_log_file_name_or_path))
+
+    @classmethod
+    def read(
+        cls,
+        path: str,
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+    ) -> Transaction:
+        """
+        Read a Transaction file and return the deserialized object.
+        :param path: Transaction file path to read.
+        :param filesystem: File system to use for reading the Transaction file.
+        :return: Deserialized object from the Transaction file.
+        """
+        if not filesystem:
+            path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_input_stream(path) as file:
+            binary = file.readall()
+        obj = cls(**msgpack.loads(binary))
+        return obj
+
+    @property
+    def id(self) -> Optional[str]:
+        """
+        Returns this transaction's unique ID assigned at commit start time, or
+        None if the unique ID has not yet been assigned.
+        """
+        _id = self.get("id")
+        if not _id and self.start_time:
+            _id = self["id"] = f"{self.start_time}{TXN_PART_SEPARATOR}{uuid.uuid4()}"
+        return _id
+
+    @property
+    def type(self) -> TransactionType:
+        """
+        Returns the type of the transaction.
+        """
+        return TransactionType(self["type"])
+
+    @type.setter
+    def type(self, txn_type: TransactionType):
+        self["type"] = txn_type
+
+    @property
+    def operations(self) -> TransactionOperationList:
+        """
+        Returns the list of transaction operations.
+        """
+        return TransactionOperationList(self["operations"])
+
+    @operations.setter
+    def operations(self, operations: TransactionOperationList):
+        self["operations"] = operations
+
+    @property
+    def start_time(self) -> Optional[int]:
+        """
+        Returns the start time of the transaction.
+        """
+        return self.get("start_time")
+
+    @property
+    def end_time(self) -> Optional[int]:
+        """
+        Returns the end time of the transaction.
+        """
+        return self.get("end_time")
+
+    def _mark_start_time(self, time_provider: TransactionTimeProvider) -> int:
+        """
+        Sets the start time of the transaction using the given
+        TransactionTimeProvider. Raises a runtime error if the transaction
+        start time has already been set by a previous commit.
+        """
+        if self.get("start_time"):
+            raise RuntimeError("Cannot restart a previously started transaction.")
+        start_time = self["start_time"] = time_provider.start_time()
+        return start_time
+
+    def _mark_end_time(self, time_provider: TransactionTimeProvider) -> int:
+        """
+        Sets the end time of the transaction using the given
+        TransactionTimeProvider. Raises a runtime error if the transaction end
+        time has already been set by a previous commit, or if the transaction
+        start time has not been set.
+        """
+        if not self.get("start_time"):
+            raise RuntimeError("Cannot end an unstarted transaction.")
+        if self.get("end_time"):
+            raise RuntimeError("Cannot end a completed transaction.")
+        end_time = self["end_time"] = time_provider.end_time()
+        return end_time
+
+    @staticmethod
+    def _abs_txn_meta_path_to_relative(root: str, target: str) -> str:
+        """
+        Takes an absolute root directory path and target absolute path to
+        relativize with respect to the root directory. Returns the target
+        path relative to the root directory path. Raises an error if the
+        target path is not contained in the given root directory path, if
+        either path is not an absolute path, or if the target path is equal
+        to the root directory path.
+        """
+        root_path = PosixPath(root)
+        target_path = PosixPath(target)
+        # TODO (martinezdavid): Check why is_absolute() fails for certain Delta paths
+        # if not root_path.is_absolute() or not target_path.is_absolute():
+        #     raise ValueError("Both root and target must be absolute paths.")
+        if root_path == target_path:
+            raise ValueError(
+                "Target and root are identical, but expected target to be a child of root."
+            )
+        try:
+            relative_path = target_path.relative_to(root_path)
+        except ValueError:
+            raise ValueError("Expected target to be a child of root.")
+        return str(relative_path)
+
+    def relativize_operation_paths(
+        self, operation: TransactionOperation, catalog_root: str
+    ) -> None:
+        """
+        Converts all absolute paths in an operation to relative paths
+        with respect to the catalog root directory.
+        """
+        # handle metafile paths
+        if operation.metafile_write_paths:
+            metafile_write_paths = [
+                Transaction._abs_txn_meta_path_to_relative(catalog_root, path)
+                for path in operation.metafile_write_paths
+            ]
+            operation.metafile_write_paths = metafile_write_paths
+        # handle locator paths
+        if operation.locator_write_paths:
+            locator_write_paths = [
+                Transaction._abs_txn_meta_path_to_relative(catalog_root, path)
+                for path in operation.locator_write_paths
+            ]
+            operation.locator_write_paths = locator_write_paths
+
+    def to_serializable(self, catalog_root) -> Transaction:
+        """
+        Prepare the object for serialization by converting any non-serializable
+        types to serializable types. May also run any required pre-write
+        validations on the serialized or deserialized object.
+        :return: a serializable version of the object
+        """
+        serializable = copy.deepcopy(self)
+        # remove all src/dest metafile contents except IDs and locators to
+        # reduce file size (they can be reconstructed from their corresponding
+        # files as required).
+        for operation in serializable.operations:
+            # Sanity check that IDs exist on source and dest metafiles
+            if operation.dest_metafile and operation.dest_metafile.id is None:
+                raise ValueError(
+                    f"Transaction operation ${operation} dest metafile does "
+                    f"not have ID: ${operation.dest_metafile}"
+                )
+            if operation.src_metafile and operation.src_metafile.id is None:
+                raise ValueError(
+                    f"Transaction operation ${operation} src metafile does "
+                    f"not have ID: ${operation.src_metafile}"
+                )
+            # relativize after checking that dest and src metafiles are valid
+            self.relativize_operation_paths(operation, catalog_root)
+            operation.dest_metafile = {
+                "id": operation.dest_metafile.id,
+                "locator": operation.dest_metafile.locator,
+                "locator_alias": operation.dest_metafile.locator_alias,
+            }
+            if operation.src_metafile:
+                operation.src_metafile = {
+                    "id": operation.src_metafile.id,
+                    "locator": operation.src_metafile.locator,
+                    "locator_alias": operation.src_metafile.locator_alias,
+                }
+        # TODO(pdames): Ensure that all file paths recorded are relative to the
+        # catalog root.
+        return serializable
+
+    @staticmethod
+    def _validate_txn_log_file(success_txn_log_file: str) -> None:
+        txn_log_dir_name = posixpath.basename(posixpath.dirname(success_txn_log_file))
+        txn_log_parts = txn_log_dir_name.split(TXN_PART_SEPARATOR)
+        # ensure that the transaction start time is valid
+        try:
+            start_time = int(txn_log_parts[0])
+        except ValueError as e:
+            raise ValueError(
+                f"Transaction log file `{success_txn_log_file}` does not "
+                f"contain a valid start time."
+            ) from e
+        # ensure that the txn uuid is valid
+        txn_uuid_str = txn_log_parts[1]
+        try:
+            uuid.UUID(txn_uuid_str)
+        except ValueError as e:
+            raise OSError(
+                f"Transaction log file `{success_txn_log_file}` does not "
+                f"contain a valid UUID string."
+            ) from e
+        # ensure that the transaction end time is valid
+        try:
+            end_time = Transaction._parse_end_time(success_txn_log_file)
+        except ValueError as e:
+            raise ValueError(
+                f"Transaction log file `{success_txn_log_file}` does not "
+                f"contain a valid end time."
+            ) from e
+        # ensure transaction end time was not recorded before start time
+        if end_time < start_time:
+            raise OSError(
+                f"Transaction end time {end_time} is earlier than start "
+                f"time {start_time}! To preserve catalog integrity, the "
+                f"corresponding completed transaction log at "
+                f"`{success_txn_log_file}` has been removed."
+            )
+
+    def commit(
+        self,
+        catalog_root_dir: str,
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+    ) -> Union[List[ListResult[Metafile]], Tuple[List[str], str]]:
+        # TODO(pdames): allow transactions to be durably staged and resumed
+        # across multiple sessions prior to commit
+
+        # create a new internal copy of this transaction to guard against
+        # external modification and dirty state across retries
+        txn = copy.deepcopy(self)
+
+        # create the transaction directory first to telegraph that at least 1
+        # transaction at this root has been attempted
+        catalog_root_normalized, filesystem = resolve_path_and_filesystem(
+            catalog_root_dir,
+            filesystem,
+        )
+        txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
+        running_txn_log_dir = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME)
+        filesystem.create_dir(running_txn_log_dir, recursive=True)
+        failed_txn_log_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)
+        filesystem.create_dir(failed_txn_log_dir, recursive=False)
+        success_txn_log_dir = posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME)
+        filesystem.create_dir(success_txn_log_dir, recursive=False)
+
+        # TODO(pdames): Support injection of other time providers, but ensure
+        # that ALL transactions in a catalog use the same time provider.
+        time_provider = TransactionSystemTimeProvider()
+
+        # record the transaction start time
+        txn._mark_start_time(time_provider)
+
+        if txn.type == TransactionType.READ:
+            list_results = []
+            for operation in self.operations:
+                list_result = operation.dest_metafile.read_txn(
+                    catalog_root_dir=catalog_root_normalized,
+                    success_txn_log_dir=success_txn_log_dir,
+                    current_txn_op=operation,
+                    current_txn_start_time=txn.start_time,
+                    current_txn_id=txn.id,
+                    filesystem=filesystem,
+                )
+                list_results.append(list_result)
+            return list_results
+        else:
+            return txn._commit_write(
+                catalog_root_normalized=catalog_root_normalized,
+                running_txn_log_dir=running_txn_log_dir,
+                failed_txn_log_dir=failed_txn_log_dir,
+                success_txn_log_dir=success_txn_log_dir,
+                filesystem=filesystem,
+                time_provider=time_provider,
+            )
+
+    def _commit_write(
+        self,
+        catalog_root_normalized: str,
+        running_txn_log_dir: str,
+        failed_txn_log_dir: str,
+        success_txn_log_dir: str,
+        filesystem: pyarrow.fs.FileSystem,
+        time_provider: TransactionTimeProvider,
+    ) -> Tuple[List[str], str]:
+        # write the in-progress transaction log file
+        running_txn_log_file_path = posixpath.join(
+            running_txn_log_dir,
+            self.id,
+        )
+        with filesystem.open_output_stream(running_txn_log_file_path) as file:
+            packed = msgpack.dumps(self.to_serializable(catalog_root_normalized))
+            file.write(packed)
+
+        # write each metafile associated with the transaction
+        metafile_write_paths = []
+        locator_write_paths = []
+        try:
+            for operation in self.operations:
+                operation.dest_metafile.write_txn(
+                    catalog_root_dir=catalog_root_normalized,
+                    success_txn_log_dir=success_txn_log_dir,
+                    current_txn_op=operation,
+                    current_txn_start_time=self.start_time,
+                    current_txn_id=self.id,
+                    filesystem=filesystem,
+                )
+                metafile_write_paths.extend(operation.metafile_write_paths)
+                locator_write_paths.extend(operation.locator_write_paths)
+                # check for conflicts with concurrent transactions
+                for path in metafile_write_paths + locator_write_paths:
+                    MetafileRevisionInfo.check_for_concurrent_txn_conflict(
+                        success_txn_log_dir=success_txn_log_dir,
+                        current_txn_revision_file_path=path,
+                        filesystem=filesystem,
+                    )
+        except Exception:
+            # write a failed transaction log file entry
+            failed_txn_log_file_path = posixpath.join(
+                failed_txn_log_dir,
+                self.id,
+            )
+            with filesystem.open_output_stream(failed_txn_log_file_path) as file:
+                packed = msgpack.dumps(self.to_serializable(catalog_root_normalized))
+                file.write(packed)
+
+            ###################################################################
+            ###################################################################
+            # failure past here telegraphs a failed transaction cleanup attempt
+            ###################################################################
+            ###################################################################
+
+            # delete all files written during the failed transaction
+            known_write_paths = chain.from_iterable(
+                [
+                    operation.metafile_write_paths + operation.locator_write_paths
+                    for operation in self.operations
+                ]
+            )
+            # TODO(pdames): Add separate janitor job to cleanup files that we
+            # either failed to add to the known write paths, or fail to delete.
+            for write_path in known_write_paths:
+                filesystem.delete_file(write_path)
+
+            # delete the in-progress transaction log file entry
+            filesystem.delete_file(running_txn_log_file_path)
+            # failed transaction cleanup is now complete
+            raise
+
+        # record the completed transaction
+        success_txn_log_file_dir = posixpath.join(
+            success_txn_log_dir,
+            self.id,
+        )
+        filesystem.create_dir(
+            success_txn_log_file_dir,
+            recursive=False,
+        )
+        end_time = self._mark_end_time(time_provider)
+        success_txn_log_file_path = posixpath.join(
+            success_txn_log_file_dir,
+            str(end_time),
+        )
+        with filesystem.open_output_stream(success_txn_log_file_path) as file:
+            packed = msgpack.dumps(self.to_serializable(catalog_root_normalized))
+            file.write(packed)
+        try:
+            Transaction._validate_txn_log_file(
+                success_txn_log_file=success_txn_log_file_path
+            )
+        except Exception as e1:
+            try:
+                # move the txn log from success dir to failed dir
+                failed_txn_log_file_path = posixpath.join(
+                    failed_txn_log_dir,
+                    self.id,
+                )
+                filesystem.move(
+                    src=success_txn_log_file_path,
+                    dest=failed_txn_log_file_path,
+                )
+                # keep parent success txn log dir to telegraph failed validation
+
+                ###############################################################
+                ###############################################################
+                # failure past here telegraphs a failed transaction validation
+                # cleanup attempt
+                ###############################################################
+                ###############################################################
+            except Exception as e2:
+                raise OSError(
+                    f"Failed to cleanup bad transaction log file at "
+                    f"`{success_txn_log_file_path}`"
+                ) from e2
+            finally:
+                raise RuntimeError(
+                    f"Transaction validation failed. To preserve "
+                    f"catalog integrity, the corresponding completed "
+                    f"transaction log at `{success_txn_log_file_path}` has "
+                    f"been removed."
+                ) from e1
+        finally:
+            # delete the in-progress transaction log file entry
+            filesystem.delete_file(running_txn_log_file_path)
+        return metafile_write_paths, success_txn_log_file_path
```
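
The new `Transaction` model above drives catalog metadata writes in 2.0: a caller wraps one `TransactionOperation` per affected metafile in a `Transaction` of a compatible type, then calls `commit()` against a catalog root. The sketch below illustrates that flow for a single CREATE operation. It is illustrative only, not code from the package: it assumes a local catalog root and that `Namespace`/`NamespaceLocator` (listed among the changed `deltacat/storage` model files above) expose `of()` factories taking a namespace name; exact metafile construction signatures live in `deltacat/storage/model/metafile.py` and `namespace.py`, not in this file.

```python
# Hedged usage sketch of the 2.0 transaction commit flow (assumed APIs noted below).
from deltacat.storage import Namespace, NamespaceLocator  # assumed re-exports
from deltacat.storage.model.transaction import (
    Transaction,
    TransactionOperation,
    TransactionOperationList,
)
from deltacat.storage.model.types import (
    TransactionOperationType,
    TransactionType,
)

# assumed factory signatures for a namespace metafile; any Metafile works here
namespace = Namespace.of(NamespaceLocator.of("demo_namespace"))

# one operation per metafile; Transaction.of enforces that CREATE-only
# transactions are committed as APPEND transactions
txn = Transaction.of(
    txn_type=TransactionType.APPEND,
    txn_operations=TransactionOperationList.of(
        [
            TransactionOperation.of(
                operation_type=TransactionOperationType.CREATE,
                dest_metafile=namespace,
            )
        ]
    ),
)

# commit() writes a running txn log, writes each metafile revision, checks for
# concurrent transaction conflicts, then records the success log under
# <root>/<TXN_DIR_NAME>/<SUCCESS_TXN_DIR_NAME>/<start_time><sep><uuid>/<end_time>,
# where <sep> is TXN_PART_SEPARATOR from deltacat.constants
write_paths, success_log_path = txn.commit("/tmp/deltacat-demo-catalog")
```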
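Once committed, the success log can be read back to audit the transaction: `Transaction.read()` deserializes the msgpack log file, and `read_end_time()` recovers the end time from the name of the single log file in the success log directory. Continuing the sketch above, under the same assumptions:

```python
import posixpath

from deltacat.storage.model.transaction import Transaction

# deserialize the committed transaction from its success log file
committed = Transaction.read(success_log_path)
print(committed.type, committed.start_time, committed.end_time)

# the success log directory is named `<start_time><TXN_PART_SEPARATOR><uuid>`
# and holds one file named after the end time, which read_end_time() parses
end_time = Transaction.read_end_time(posixpath.dirname(success_log_path))
assert end_time == committed.end_time
```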