deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +2 -3
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -1
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
- deltacat/compute/compactor_v2/steps/merge.py +11 -80
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.35.dist-info/METADATA +0 -64
- deltacat-1.1.35.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1316 @@
|
|
1
|
+
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import copy
|
5
|
+
|
6
|
+
from typing import Optional, Tuple, List
|
7
|
+
|
8
|
+
import base64
|
9
|
+
import json
|
10
|
+
import msgpack
|
11
|
+
import pyarrow.fs
|
12
|
+
import posixpath
|
13
|
+
import uuid
|
14
|
+
import deltacat
|
15
|
+
|
16
|
+
from deltacat.constants import (
|
17
|
+
METAFILE_FORMAT,
|
18
|
+
REVISION_DIR_NAME,
|
19
|
+
METAFILE_EXT,
|
20
|
+
SUPPORTED_METAFILE_FORMATS,
|
21
|
+
TXN_DIR_NAME,
|
22
|
+
TXN_PART_SEPARATOR,
|
23
|
+
SUCCESS_TXN_DIR_NAME,
|
24
|
+
)
|
25
|
+
from deltacat.storage.model.list_result import ListResult
|
26
|
+
from deltacat.storage.model.locator import Locator
|
27
|
+
from deltacat.storage.model.types import TransactionOperationType
|
28
|
+
from deltacat.utils.filesystem import (
|
29
|
+
resolve_path_and_filesystem,
|
30
|
+
list_directory,
|
31
|
+
get_file_info,
|
32
|
+
)
|
33
|
+
|
34
|
+
|
35
|
+
class MetafileRevisionInfo(dict):
|
36
|
+
"""
|
37
|
+
Base class for DeltaCAT metafile revision info.
|
38
|
+
"""
|
39
|
+
|
40
|
+
@staticmethod
|
41
|
+
def undefined() -> MetafileRevisionInfo:
|
42
|
+
mri = MetafileRevisionInfo()
|
43
|
+
mri.revision = 0
|
44
|
+
mri.txn_id = None
|
45
|
+
mri.txn_op_type = None
|
46
|
+
mri.dir_path = None
|
47
|
+
return mri
|
48
|
+
|
49
|
+
@staticmethod
|
50
|
+
def parse(revision_file_path: str) -> MetafileRevisionInfo:
|
51
|
+
dir_path = posixpath.dirname(revision_file_path)
|
52
|
+
metafile_name = posixpath.basename(revision_file_path)
|
53
|
+
metafile_and_ext = posixpath.splitext(metafile_name)
|
54
|
+
metafile_ext = metafile_and_ext[1] if len(metafile_and_ext) > 1 else None
|
55
|
+
metafile_rev_and_txn_info = metafile_and_ext[0]
|
56
|
+
txn_info_parts = metafile_rev_and_txn_info.split(TXN_PART_SEPARATOR)
|
57
|
+
|
58
|
+
mri = MetafileRevisionInfo()
|
59
|
+
mri.dir_path = dir_path
|
60
|
+
mri.extension = metafile_ext
|
61
|
+
mri.revision = int(txn_info_parts[0])
|
62
|
+
mri.txn_op_type = txn_info_parts[1]
|
63
|
+
mri.txn_id = f"{txn_info_parts[2]}{TXN_PART_SEPARATOR}{txn_info_parts[3]}"
|
64
|
+
return mri
|
65
|
+
|
66
|
+
@staticmethod
|
67
|
+
def list_revisions(
|
68
|
+
revision_dir_path: str,
|
69
|
+
filesystem: pyarrow.fs.FileSystem,
|
70
|
+
success_txn_log_dir: str,
|
71
|
+
current_txn_start_time: Optional[int] = None,
|
72
|
+
current_txn_id: Optional[str] = None,
|
73
|
+
limit: Optional[int] = None,
|
74
|
+
) -> List[MetafileRevisionInfo]:
|
75
|
+
if not success_txn_log_dir:
|
76
|
+
err_msg = f"No transaction log found for: {revision_dir_path}."
|
77
|
+
raise ValueError(err_msg)
|
78
|
+
# find the latest committed revision of the target metafile
|
79
|
+
sorted_metafile_paths = MetafileRevisionInfo._sorted_file_paths(
|
80
|
+
revision_dir_path=revision_dir_path,
|
81
|
+
filesystem=filesystem,
|
82
|
+
ignore_missing_revision=True,
|
83
|
+
)
|
84
|
+
revisions = []
|
85
|
+
while sorted_metafile_paths:
|
86
|
+
latest_metafile_path = sorted_metafile_paths.pop()
|
87
|
+
mri = MetafileRevisionInfo.parse(latest_metafile_path)
|
88
|
+
if not current_txn_id or mri.txn_id == current_txn_id:
|
89
|
+
# consider the current transaction (if any) to be committed
|
90
|
+
revisions.append(mri)
|
91
|
+
elif current_txn_start_time is not None:
|
92
|
+
# the current transaction can only build on top of the snapshot
|
93
|
+
# of commits from transactions that completed before it started
|
94
|
+
txn_end_time = (
|
95
|
+
deltacat.storage.model.transaction.Transaction.read_end_time(
|
96
|
+
path=posixpath.join(success_txn_log_dir, mri.txn_id),
|
97
|
+
filesystem=filesystem,
|
98
|
+
)
|
99
|
+
)
|
100
|
+
if txn_end_time is not None and txn_end_time < current_txn_start_time:
|
101
|
+
revisions.append(mri)
|
102
|
+
else:
|
103
|
+
raise ValueError(
|
104
|
+
f"Current transaction ID `{current_txn_id} provided "
|
105
|
+
f"without a transaction start time."
|
106
|
+
)
|
107
|
+
if limit <= len(revisions):
|
108
|
+
break
|
109
|
+
return revisions
|
110
|
+
|
111
|
+
@staticmethod
|
112
|
+
def latest_revision(
|
113
|
+
revision_dir_path: str,
|
114
|
+
filesystem: pyarrow.fs.FileSystem,
|
115
|
+
success_txn_log_dir: str,
|
116
|
+
current_txn_start_time: Optional[int] = None,
|
117
|
+
current_txn_id: Optional[str] = None,
|
118
|
+
ignore_missing_revision: bool = False,
|
119
|
+
) -> MetafileRevisionInfo:
|
120
|
+
"""
|
121
|
+
Fetch latest revision of a metafile, or return None if no
|
122
|
+
revisions exist.
|
123
|
+
:param revision_dir_path: root path of directory for metafile
|
124
|
+
:param ignore_missing_revision: if True, will return
|
125
|
+
MetafileRevisionInfo.undefined() on no revisions
|
126
|
+
:raises ValueError if no revisions are found AND
|
127
|
+
ignore_missing_revision=False
|
128
|
+
"""
|
129
|
+
revisions = MetafileRevisionInfo.list_revisions(
|
130
|
+
revision_dir_path=revision_dir_path,
|
131
|
+
filesystem=filesystem,
|
132
|
+
success_txn_log_dir=success_txn_log_dir,
|
133
|
+
current_txn_start_time=current_txn_start_time,
|
134
|
+
current_txn_id=current_txn_id,
|
135
|
+
limit=1,
|
136
|
+
)
|
137
|
+
if not revisions and not ignore_missing_revision:
|
138
|
+
err_msg = f"No committed revision found at {revision_dir_path}."
|
139
|
+
raise ValueError(err_msg)
|
140
|
+
return revisions[0] if revisions else MetafileRevisionInfo.undefined()
|
141
|
+
|
142
|
+
@staticmethod
|
143
|
+
def new_revision(
|
144
|
+
revision_dir_path: str,
|
145
|
+
current_txn_op_type: deltacat.storage.model.transaction.TransactionOperationType,
|
146
|
+
current_txn_start_time: int,
|
147
|
+
current_txn_id: str,
|
148
|
+
filesystem: pyarrow.fs.FileSystem,
|
149
|
+
extension: Optional[str] = METAFILE_EXT,
|
150
|
+
success_txn_log_dir: Optional[str] = None,
|
151
|
+
) -> MetafileRevisionInfo:
|
152
|
+
"""
|
153
|
+
Creates and returns a new MetafileRevisionInfo object for the next
|
154
|
+
revision of the metafile.
|
155
|
+
|
156
|
+
This method determines the next revision information based on the
|
157
|
+
latest existing revision in the specified directory path and the
|
158
|
+
current transaction details.
|
159
|
+
|
160
|
+
Args:
|
161
|
+
revision_dir_path (str): Metafile revision directory path to
|
162
|
+
generate the next metafile revision info for.
|
163
|
+
current_txn_op_type (TransactionOperationType): The current
|
164
|
+
transaction's operation type.
|
165
|
+
current_txn_start_time (int): The current transaction's start time.
|
166
|
+
current_txn_id (str): The current transaction's ID.
|
167
|
+
filesystem (pyarrow.fs.FileSystem): The filesystem interface to
|
168
|
+
use for file operations
|
169
|
+
extension (str, optional): The file extension for metafiles.
|
170
|
+
Defaults to METAFILE_EXT.
|
171
|
+
success_txn_log_dir (Optional[str], optional): Directory path for
|
172
|
+
successful transaction logs. Will be automatically discovered by
|
173
|
+
traversing revision directory parent paths if not specified.
|
174
|
+
|
175
|
+
Returns:
|
176
|
+
MetafileRevisionInfo: A new revision info object containing
|
177
|
+
metadata for the next revision
|
178
|
+
|
179
|
+
Notes:
|
180
|
+
- For CREATE operations, the method will ignore missing previous
|
181
|
+
revisions.
|
182
|
+
- The method validates the transaction operation type before
|
183
|
+
creating the new revision.
|
184
|
+
- Uses the pyarrow filesystem interface for file operations.
|
185
|
+
"""
|
186
|
+
is_create_txn = current_txn_op_type == TransactionOperationType.CREATE
|
187
|
+
mri = MetafileRevisionInfo.latest_revision(
|
188
|
+
revision_dir_path=revision_dir_path,
|
189
|
+
filesystem=filesystem,
|
190
|
+
success_txn_log_dir=success_txn_log_dir,
|
191
|
+
current_txn_start_time=current_txn_start_time,
|
192
|
+
current_txn_id=current_txn_id,
|
193
|
+
ignore_missing_revision=is_create_txn,
|
194
|
+
)
|
195
|
+
# validate the transaction operation type
|
196
|
+
if mri.exists():
|
197
|
+
# update/delete fails if the last metafile was deleted
|
198
|
+
if mri.txn_op_type == TransactionOperationType.DELETE:
|
199
|
+
if current_txn_op_type != TransactionOperationType.CREATE:
|
200
|
+
raise ValueError(
|
201
|
+
f"Metafile {current_txn_op_type.value} failed "
|
202
|
+
f"for transaction ID {current_txn_id} failed. "
|
203
|
+
f"Metafile state at {mri.path} is deleted."
|
204
|
+
)
|
205
|
+
# create fails unless the last metafile was deleted
|
206
|
+
elif is_create_txn:
|
207
|
+
raise ValueError(
|
208
|
+
f"Metafile creation for transaction ID {current_txn_id} "
|
209
|
+
f"failed. Metafile commit at {mri.path} already exists."
|
210
|
+
)
|
211
|
+
elif not is_create_txn:
|
212
|
+
# update/delete fails if the last metafile doesn't exist
|
213
|
+
raise ValueError(
|
214
|
+
f"Metafile {current_txn_op_type.value} failed for "
|
215
|
+
f"transaction ID {current_txn_id} failed. Metafile at "
|
216
|
+
f"{mri.path} does not exist."
|
217
|
+
)
|
218
|
+
mri.revision = mri.revision + 1
|
219
|
+
mri.txn_id = current_txn_id
|
220
|
+
mri.txn_op_type = current_txn_op_type
|
221
|
+
mri.dir_path = revision_dir_path
|
222
|
+
mri.extension = extension
|
223
|
+
return mri
|
224
|
+
|
225
|
+
@staticmethod
|
226
|
+
def check_for_concurrent_txn_conflict(
|
227
|
+
success_txn_log_dir: str,
|
228
|
+
current_txn_revision_file_path: str,
|
229
|
+
filesystem: pyarrow.fs.FileSystem,
|
230
|
+
) -> None:
|
231
|
+
"""
|
232
|
+
Checks for a concurrent modification conflict between a file commited
|
233
|
+
by the current transaction and another parallel transaction. Raises
|
234
|
+
an exception if a concurrent modification conflict is found.
|
235
|
+
|
236
|
+
:param success_txn_log_dir: Path to the log of successful transactions.
|
237
|
+
:param current_txn_revision_file_path: Path to a metafile revision
|
238
|
+
written by the current transaction to check for conflicts against.
|
239
|
+
:param filesystem: Filesystem that can read the metafile revision.
|
240
|
+
:raises RuntimeError: if a conflict is found with another transaction.
|
241
|
+
"""
|
242
|
+
revision_dir_path = posixpath.dirname(current_txn_revision_file_path)
|
243
|
+
cur_txn_mri = MetafileRevisionInfo.parse(current_txn_revision_file_path)
|
244
|
+
|
245
|
+
sorted_metafile_paths = MetafileRevisionInfo._sorted_file_paths(
|
246
|
+
revision_dir_path=revision_dir_path,
|
247
|
+
filesystem=filesystem,
|
248
|
+
)
|
249
|
+
conflict_mris = []
|
250
|
+
while sorted_metafile_paths:
|
251
|
+
next_metafile_path = sorted_metafile_paths.pop()
|
252
|
+
mri = MetafileRevisionInfo.parse(next_metafile_path)
|
253
|
+
if mri.revision < cur_txn_mri.revision:
|
254
|
+
# no conflict was found
|
255
|
+
break
|
256
|
+
elif (
|
257
|
+
mri.revision == cur_txn_mri.revision
|
258
|
+
and mri.txn_id != cur_txn_mri.txn_id
|
259
|
+
):
|
260
|
+
# we've found a conflict between txn_id and current_txn_id
|
261
|
+
# defer to the transaction with the higher lexicographic order
|
262
|
+
# (i.e., the transaction that started most recently)
|
263
|
+
# TODO(pdames): Ensure the conflicting transaction is alive
|
264
|
+
# (e.g., give each transaction a heartbeat timeout that gives
|
265
|
+
# it 1-2 seconds per operation, and record known failed
|
266
|
+
# transaction IDs)
|
267
|
+
if mri.txn_id > cur_txn_mri.txn_id:
|
268
|
+
raise RuntimeError(
|
269
|
+
f"Aborting transaction {cur_txn_mri.txn_id} due to "
|
270
|
+
f"concurrent conflict at "
|
271
|
+
f"{current_txn_revision_file_path} with transaction "
|
272
|
+
f"{mri.txn_id} at {next_metafile_path}."
|
273
|
+
)
|
274
|
+
conflict_mris.append(mri)
|
275
|
+
if conflict_mris:
|
276
|
+
# current txn wins the ordering challenge among all conflicts,
|
277
|
+
# but we still need to ensure that no conflicting transactions
|
278
|
+
# completed before seeing the conflict with this transaction
|
279
|
+
for mri in conflict_mris:
|
280
|
+
txn_end_time = (
|
281
|
+
deltacat.storage.model.transaction.Transaction.read_end_time(
|
282
|
+
path=posixpath.join(success_txn_log_dir, mri.txn_id),
|
283
|
+
filesystem=filesystem,
|
284
|
+
)
|
285
|
+
)
|
286
|
+
# TODO(pdames): Resolve risk of passing this check if it
|
287
|
+
# runs before the conflicting transaction marks itself as
|
288
|
+
# complete in the txn log. Some fixes include enforcing
|
289
|
+
# serializable isolation of the txn log, eventually
|
290
|
+
# consistent detection & repair, writing a mutex file
|
291
|
+
# that tells future transactions to only consider this txn
|
292
|
+
# complete if the conflicting txn is not complete, etc.
|
293
|
+
if txn_end_time:
|
294
|
+
raise RuntimeError(
|
295
|
+
f"Aborting transaction {cur_txn_mri.txn_id} due to "
|
296
|
+
f"concurrent conflict at {revision_dir_path} with "
|
297
|
+
f"previously completed transaction {mri.txn_id} at "
|
298
|
+
f"{next_metafile_path}."
|
299
|
+
)
|
300
|
+
|
301
|
+
@staticmethod
|
302
|
+
def _sorted_file_paths(
|
303
|
+
revision_dir_path: str,
|
304
|
+
filesystem: pyarrow.fs.FileSystem,
|
305
|
+
ignore_missing_revision: bool = False,
|
306
|
+
) -> List[str]:
|
307
|
+
file_paths_and_sizes = list_directory(
|
308
|
+
path=revision_dir_path,
|
309
|
+
filesystem=filesystem,
|
310
|
+
ignore_missing_path=True,
|
311
|
+
)
|
312
|
+
if not file_paths_and_sizes and not ignore_missing_revision:
|
313
|
+
err_msg = (
|
314
|
+
f"Expected to find at least 1 Metafile at "
|
315
|
+
f"{revision_dir_path} but found none."
|
316
|
+
)
|
317
|
+
raise ValueError(err_msg)
|
318
|
+
return list(list(zip(*file_paths_and_sizes))[0]) if file_paths_and_sizes else []
|
319
|
+
|
320
|
+
@property
|
321
|
+
def revision(self) -> int:
|
322
|
+
return self["revision"]
|
323
|
+
|
324
|
+
@revision.setter
|
325
|
+
def revision(self, revision: int):
|
326
|
+
self["revision"] = revision
|
327
|
+
|
328
|
+
@property
|
329
|
+
def txn_id(self) -> Optional[str]:
|
330
|
+
return self["txn_id"]
|
331
|
+
|
332
|
+
@txn_id.setter
|
333
|
+
def txn_id(self, txn_id: str):
|
334
|
+
self["txn_id"] = txn_id
|
335
|
+
|
336
|
+
@property
|
337
|
+
def txn_op_type(self) -> Optional[TransactionOperationType]:
|
338
|
+
op_type = self.get("txn_op_type")
|
339
|
+
return None if op_type is None else TransactionOperationType(op_type)
|
340
|
+
|
341
|
+
@txn_op_type.setter
|
342
|
+
def txn_op_type(self, txn_op_type: TransactionOperationType):
|
343
|
+
self["txn_op_type"] = txn_op_type
|
344
|
+
|
345
|
+
@property
|
346
|
+
def dir_path(self) -> Optional[str]:
|
347
|
+
return self["dir_path"]
|
348
|
+
|
349
|
+
@dir_path.setter
|
350
|
+
def dir_path(self, dir_path: str):
|
351
|
+
self["dir_path"] = dir_path
|
352
|
+
|
353
|
+
@property
|
354
|
+
def extension(self) -> str:
|
355
|
+
return self.get("extension") or METAFILE_EXT
|
356
|
+
|
357
|
+
@extension.setter
|
358
|
+
def extension(self, extension: str):
|
359
|
+
self["extension"] = extension
|
360
|
+
|
361
|
+
@property
|
362
|
+
def file_name(self) -> Optional[str]:
|
363
|
+
return (
|
364
|
+
TXN_PART_SEPARATOR.join(
|
365
|
+
[
|
366
|
+
f"{self.revision:020}",
|
367
|
+
self.txn_op_type,
|
368
|
+
f"{self.txn_id}{self.extension}",
|
369
|
+
]
|
370
|
+
)
|
371
|
+
if self.txn_op_type and self.txn_id
|
372
|
+
else None
|
373
|
+
)
|
374
|
+
|
375
|
+
@property
|
376
|
+
def path(self) -> Optional[str]:
|
377
|
+
file_name = self.file_name
|
378
|
+
return (
|
379
|
+
posixpath.join(
|
380
|
+
self.dir_path,
|
381
|
+
file_name,
|
382
|
+
)
|
383
|
+
if self.dir_path and file_name
|
384
|
+
else None
|
385
|
+
)
|
386
|
+
|
387
|
+
def exists(self) -> bool:
|
388
|
+
return bool(self.revision)
|
389
|
+
|
390
|
+
|
391
|
+
class Metafile(dict):
|
392
|
+
"""
|
393
|
+
Base class for DeltaCAT metadata files, with read and write methods
|
394
|
+
for dict-based DeltaCAT models. Uses msgpack (https://msgpack.org/) for
|
395
|
+
cross-language-compatible serialization and deserialization.
|
396
|
+
"""
|
397
|
+
|
398
|
+
@staticmethod
|
399
|
+
def update_for(other: Optional[Metafile]) -> Optional[Metafile]:
|
400
|
+
"""
|
401
|
+
Returns a new metafile that can be used as the destination metafile
|
402
|
+
in an update transaction operation against the input source metafile.
|
403
|
+
The returned metafile starts as an identical deep copy of the input
|
404
|
+
metafile such that, if the output is changed and committed as part of
|
405
|
+
an update transaction operation on the source metafile, then it will
|
406
|
+
update instead of replace the source metafile.
|
407
|
+
:param other: Source metafile for the copy.
|
408
|
+
:return: New copy of the source metafile.
|
409
|
+
"""
|
410
|
+
return copy.deepcopy(other) if other is not None else None
|
411
|
+
|
412
|
+
@staticmethod
def based_on(
    other: Optional[Metafile],
    new_id: Optional[Locator] = None,
) -> Optional[Metafile]:
    """
    Returns a new metafile equivalent to the input metafile, but with a new
    ID assigned to distinguish it as a separate catalog object. This means
    that, if the output is simply committed as part of an update transaction
    operation on the source metafile, then it will replace instead of update
    the source metafile.
    :param other: Source metafile that is the basis for the new metafile.
    :param new_id: New immutable ID to assign to the new metafile. Should
    not be specified for metafiles with mutable names (e.g., namespaces and
    tables).
    :return: A new metafile based on the input metafile with a different ID.
    """
    metafile_copy = Metafile.update_for(other)
    if not metafile_copy:
        # nothing to re-identify; propagate the empty/None copy unchanged
        return metafile_copy
    if other.named_immutable_id:
        # mutable-ID metafiles require an explicit replacement ID
        if not new_id:
            raise ValueError(
                f"New ID must be specified for metafiles that have a "
                f"named immutable ID."
            )
        metafile_copy.named_immutable_id = new_id
    else:
        if new_id:
            raise ValueError(
                f"New ID cannot be specified for metafiles that "
                f"don't have a named immutable ID."
            )
        # drop the source metafile ID so that this is treated as a
        # different catalog object with otherwise identical properties
        metafile_copy.pop("id", None)
    # remove all ancestors of the original source metafile
    metafile_copy.pop("ancestor_ids", None)
    return metafile_copy
|
450
|
+
|
451
|
+
@staticmethod
def read_txn(
    catalog_root_dir: str,
    success_txn_log_dir: str,
    current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
    current_txn_start_time: int,
    current_txn_id: str,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
) -> ListResult[Metafile]:
    """
    Read one or more metadata files within the context of a transaction.
    :param catalog_root_dir: Catalog root dir to read the metafile from.
    :param success_txn_log_dir: Catalog root successful transaction log
    directory.
    :param current_txn_op: Transaction operation for this read.
    :param current_txn_start_time: Transaction start time for this read.
    :param current_txn_id: Transaction ID for this read.
    :param filesystem: File system to use for reading the metadata file. If
    not given, a default filesystem will be automatically selected based on
    the catalog root path.
    :return: ListResult of deserialized metadata files read.
    :raises ValueError: If the transaction operation type is not a
    supported read type.
    """
    # keyword arguments shared by every read operation type below
    kwargs = {
        "catalog_root": catalog_root_dir,
        "success_txn_log_dir": success_txn_log_dir,
        "current_txn_start_time": current_txn_start_time,
        "current_txn_id": current_txn_id,
        "filesystem": filesystem,
        "limit": current_txn_op.read_limit,
    }
    # sibling/children reads return early; latest/exists reads fall
    # through to the shared revision-listing path at the bottom
    if current_txn_op.type == TransactionOperationType.READ_SIBLINGS:
        return current_txn_op.dest_metafile.siblings(**kwargs)
    elif current_txn_op.type == TransactionOperationType.READ_CHILDREN:
        return current_txn_op.dest_metafile.children(**kwargs)
    elif current_txn_op.type == TransactionOperationType.READ_LATEST:
        kwargs["limit"] = 1
    elif current_txn_op.type == TransactionOperationType.READ_EXISTS:
        # existence checks don't need the revision's deserialized contents
        kwargs["limit"] = 1
        kwargs["materialize_revisions"] = False
    else:
        raise ValueError(
            f"Unsupported transaction operation type: {current_txn_op.type}"
        )
    # return the latest metafile revision for READ_LATEST and READ_EXISTS
    list_result = current_txn_op.dest_metafile.revisions(**kwargs)
    revisions = list_result.all_items()
    metafiles = []
    if revisions:
        # each revision is a (TransactionOperationType, Metafile) pair;
        # a latest revision of type DELETE means the object is gone
        op_type = revisions[0][0]
        if op_type != TransactionOperationType.DELETE:
            metafiles.append(revisions[0][1])
        # TODO(pdames): Add Optional[Metafile] to return type and just
        # return the latest metafile (if any) directly?
        return ListResult.of(
            items=metafiles,
            pagination_key=None,
            next_page_provider=None,
        )
    else:
        # Could not find any revisions in list operations - return no results
        return ListResult.empty()
|
513
|
+
@staticmethod
|
514
|
+
def get_class(serialized_dict: dict):
|
515
|
+
"""
|
516
|
+
Given a serialized dictionary of Metafile data, gets the metafile child
|
517
|
+
class type to instantiate.
|
518
|
+
"""
|
519
|
+
# TODO: more robust implementation. Right now this relies on the
|
520
|
+
# assumption that XLocator key will only be present in class X, and
|
521
|
+
# is brittle to renames. On the other hand, this implementation does
|
522
|
+
# not require any marker fields to be persisted, and a regression
|
523
|
+
# will be quickly detected by test_metafile.io or other unit tests
|
524
|
+
if serialized_dict.__contains__("tableLocator"):
|
525
|
+
return deltacat.storage.model.table.Table
|
526
|
+
elif serialized_dict.__contains__("namespaceLocator"):
|
527
|
+
return deltacat.storage.model.namespace.Namespace
|
528
|
+
elif serialized_dict.__contains__("tableVersionLocator"):
|
529
|
+
return deltacat.storage.model.table_version.TableVersion
|
530
|
+
elif serialized_dict.__contains__("partitionLocator"):
|
531
|
+
return deltacat.storage.model.partition.Partition
|
532
|
+
elif serialized_dict.__contains__("streamLocator"):
|
533
|
+
return deltacat.storage.model.stream.Stream
|
534
|
+
elif serialized_dict.__contains__("deltaLocator"):
|
535
|
+
return deltacat.storage.model.delta.Delta
|
536
|
+
else:
|
537
|
+
raise ValueError(
|
538
|
+
f"Could not find metafile class from serialized form: "
|
539
|
+
f"${serialized_dict}"
|
540
|
+
)
|
541
|
+
|
542
|
+
@classmethod
def read(
    cls,
    path: str,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
    format: Optional[str] = METAFILE_FORMAT,
) -> Metafile:
    """
    Read a metadata file and return the deserialized object.
    :param path: Metadata file path to read.
    :param filesystem: File system to use for reading the metadata file.
    :param format: Format to use for deserializing the metadata file.
    :return: Deserialized object from the metadata file.
    :raises ValueError: If `format` is not a supported metafile format.
    """
    if format not in SUPPORTED_METAFILE_FORMATS:
        raise ValueError(
            f"Unsupported format '{format}'. Supported formats include: {SUPPORTED_METAFILE_FORMATS}."
        )

    if not filesystem:
        path, filesystem = resolve_path_and_filesystem(path, filesystem)
    with filesystem.open_input_stream(path) as file:
        binary = file.readall()
    # NOTE(review): the JSON reader decodes any string value prefixed with
    # "b64:" back to bytes, but write() base64-encodes bytes WITHOUT
    # emitting that prefix, and b64decode() here is applied to the full
    # prefixed string rather than the suffix after "b64:" — JSON
    # round-tripping of bytes values looks inconsistent; confirm against
    # write() before relying on it (msgpack handles bytes natively).
    reader = {
        "json": lambda b: json.loads(
            b.decode("utf-8"),
            object_hook=lambda obj: {
                k: base64.b64decode(v)
                if isinstance(v, str) and v.startswith("b64:")
                else v
                for k, v in obj.items()
            },
        ),
        "msgpack": msgpack.loads,
    }[format]
    data = reader(binary)
    # cast this Metafile into the appropriate child class type
    clazz = Metafile.get_class(data)
    obj = clazz(**data).from_serializable(path, filesystem)
    return obj
|
+
|
583
|
+
def write_txn(
    self,
    catalog_root_dir: str,
    success_txn_log_dir: str,
    current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
    current_txn_start_time: int,
    current_txn_id: str,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
) -> None:
    """
    Serialize and write this object to a metadata file within the context
    of a transaction.
    :param catalog_root_dir: Catalog root dir to write the metafile to.
    :param success_txn_log_dir: Catalog root successful transaction log
    directory.
    :param current_txn_op: Transaction operation for this write.
    :param current_txn_start_time: Transaction start time for this write.
    :param current_txn_id: Transaction ID for this write.
    :param filesystem: File system to use for writing the metadata file. If
    not given, a default filesystem will be automatically selected based on
    the catalog root path.
    """
    # resolve a concrete filesystem (and normalized root path) when the
    # caller didn't supply one
    if not filesystem:
        catalog_root_dir, filesystem = resolve_path_and_filesystem(
            catalog_root_dir,
            filesystem,
        )
    # delegate the actual revision (and locator mapping) writes
    self._write_metafile_revisions(
        catalog_root=catalog_root_dir,
        success_txn_log_dir=success_txn_log_dir,
        current_txn_op=current_txn_op,
        current_txn_start_time=current_txn_start_time,
        current_txn_id=current_txn_id,
        filesystem=filesystem,
    )
+
|
619
|
+
def write(
    self,
    path: str,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
    format: Optional[str] = METAFILE_FORMAT,
) -> None:
    """
    Serialize and write this object to a metadata file.
    :param path: Metadata file path to write to.
    :param filesystem: File system to use for writing the metadata file. If
    not given, a default filesystem will be automatically selected based on
    the catalog root path.
    :param format: Format to use for serializing the metadata file.
    :raises ValueError: If `format` is not a supported metafile format.
    """
    if format not in SUPPORTED_METAFILE_FORMATS:
        raise ValueError(
            f"Unsupported format '{format}'. Supported formats include: {SUPPORTED_METAFILE_FORMATS}."
        )

    if not filesystem:
        path, filesystem = resolve_path_and_filesystem(path, filesystem)
    # ensure the parent (revision) directory exists before writing
    revision_dir_path = posixpath.dirname(path)
    filesystem.create_dir(revision_dir_path, recursive=True)

    # NOTE(review): the JSON writer base64-encodes bytes values but does
    # NOT emit the "b64:" prefix that read()'s object_hook looks for —
    # JSON round-tripping of bytes values looks inconsistent; confirm
    # against read() before relying on it.
    writer = {
        "json": lambda data: json.dumps(
            data,
            indent=4,
            default=lambda b: base64.b64encode(b).decode("utf-8")
            if isinstance(b, bytes)
            else b,
        ).encode("utf-8"),
        "msgpack": msgpack.dumps,
    }[format]

    with filesystem.open_output_stream(path) as file:
        file.write(writer(self.to_serializable()))
|
+
|
657
|
+
def equivalent_to(self, other: Metafile) -> bool:
    """
    True if this Metafile is equivalent to the other Metafile minus its
    unique ID and ancestor IDs.

    :param other: Metafile to compare to.
    :return: True if the other metafile is equivalent, false if not.
    """
    # keys that identify a catalog object rather than describe it
    skip_keys = {"id", "ancestor_ids"}
    # every non-identifier entry here must exist and match in `other`
    for key, value in self.items():
        if key in skip_keys:
            continue
        if key not in other or other[key] != value:
            return False
    # and `other` must not carry extra non-identifier entries
    return all(key in skip_keys or key in self for key in other)
|
674
|
+
@property
def named_immutable_id(self) -> Optional[str]:
    """
    If this metafile's locator name is immutable (i.e., if the object it
    refers to can't be renamed) then returns an immutable ID suitable for
    use in URLS or filesystem paths. Returns None if this locator name is
    mutable (i.e., if the object it refers to can be renamed).
    """
    locator_name = self.locator.name
    return locator_name.immutable_id
|
684
|
+
@named_immutable_id.setter
def named_immutable_id(self, immutable_id: Optional[str]) -> None:
    """
    If this metafile's locator name is immutable (i.e., if the object it
    refers to can't be renamed), then sets an immutable ID for this
    locator name suitable for use in URLS or filesystem paths. Note that
    the ID is only considered immutable in durable catalog storage, and
    remains mutable in transient memory (i.e., this setter remains
    functional regardless of whether an ID is already assigned, but each
    update will cause it to refer to a different, distinct object in
    durable storage).
    :raises NotImplementedError: If this metafile type does not have a
    named immutable ID (i.e., its immutable ID is auto-generated).
    """
    locator_name = self.locator.name
    locator_name.immutable_id = immutable_id
+
|
700
|
+
@property
def id(self) -> str:
    """
    Returns an existing immutable ID for this metafile or generates a new
    one. This ID can be used for equality checks (i.e. 2 metafiles refer
    to the same catalog object if they have the same ID) and deterministic
    references (e.g. for generating a root namespace or table path that
    remains the same regardless of renames).
    """

    # prefer the locator name as an immutable ID, then any previously
    # generated UUID; otherwise mint and memoize a fresh UUID
    existing = self.locator.name.immutable_id or self.get("id")
    if existing:
        return existing
    generated = str(uuid.uuid4())
    self["id"] = generated
    return generated
+
|
717
|
+
@property
def locator(self) -> Optional[Locator]:
    """
    Returns the canonical locator for this metafile, which is typically used
    to efficiently resolve internal system references to this object.

    Abstract in this base class: every concrete Metafile subclass must
    override this property.
    """
    raise NotImplementedError()
|
725
|
+
@property
def locator_alias(self) -> Optional[Locator]:
    """
    Returns an optional locator alias for this metafile. This is
    typically used to resolve a unique, human-readable reference to this
    object (e.g., by using partition values instead of partition ID or
    stream format name instead of stream ID). Locator aliases are
    typically used during partition predicate pushdown (e.g., by
    partition value + partition scheme ID) or to display unique
    human-readable metafile names.
    """
    # base metafiles have no alias; subclasses override as needed
    return None
+
|
738
|
+
def children(
    self,
    catalog_root: str,
    success_txn_log_dir: str,
    current_txn_start_time: Optional[int] = None,
    current_txn_id: Optional[str] = None,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
    limit: Optional[int] = None,
) -> ListResult[Metafile]:
    """
    Retrieve all children of this object.
    :return: ListResult containing all children of this object.
    """
    resolved_root, resolved_fs = resolve_path_and_filesystem(
        catalog_root,
        filesystem,
    )
    # children are rooted under this metafile's own directory
    child_search_root = self.metafile_root_path(
        catalog_root=resolved_root,
        current_txn_start_time=current_txn_start_time,
        current_txn_id=current_txn_id,
        filesystem=resolved_fs,
    )
    # List metafiles with respect to this metafile's URI as root
    return self._list_metafiles(
        success_txn_log_dir=success_txn_log_dir,
        metafile_root_dir_path=child_search_root,
        current_txn_start_time=current_txn_start_time,
        current_txn_id=current_txn_id,
        filesystem=resolved_fs,
        limit=limit,
    )
+
|
771
|
+
def siblings(
    self,
    catalog_root: str,
    success_txn_log_dir: str,
    current_txn_start_time: Optional[int] = None,
    current_txn_id: Optional[str] = None,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
    limit: Optional[int] = None,
) -> ListResult[Metafile]:
    """
    Retrieve all siblings of this object.
    :return: ListResult containing all siblings of this object.
    """
    resolved_root, resolved_fs = resolve_path_and_filesystem(
        catalog_root,
        filesystem,
    )
    # siblings share this metafile's parent directory as their root
    sibling_search_root = self.parent_root_path(
        catalog_root=resolved_root,
        current_txn_start_time=current_txn_start_time,
        current_txn_id=current_txn_id,
        filesystem=resolved_fs,
    )
    return self._list_metafiles(
        success_txn_log_dir=success_txn_log_dir,
        metafile_root_dir_path=sibling_search_root,
        current_txn_start_time=current_txn_start_time,
        current_txn_id=current_txn_id,
        filesystem=resolved_fs,
        limit=limit,
    )
|
803
|
+
def revisions(
    self,
    catalog_root: str,
    success_txn_log_dir: str,
    current_txn_start_time: Optional[int] = None,
    current_txn_id: Optional[str] = None,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
    limit: Optional[int] = None,
    materialize_revisions: bool = True,
) -> ListResult[Tuple[TransactionOperationType, Optional[Metafile]]]:
    """
    Retrieve all revisions of this object.
    :param materialize_revisions: If False, revision metafile contents are
    not read from disk and an empty dict is returned in their place (used
    for existence checks).
    :return: ListResult containing all revisions of this object as
    (transaction operation type, metafile) pairs. Empty if any ancestor is
    missing, the metafile was deleted, or no ID mapping exists.
    """
    catalog_root, filesystem = resolve_path_and_filesystem(
        catalog_root,
        filesystem,
    )
    try:
        parent_root = self.parent_root_path(
            catalog_root=catalog_root,
            current_txn_start_time=current_txn_start_time,
            current_txn_id=current_txn_id,
            filesystem=filesystem,
        )
    except ValueError:
        # one or more ancestor's don't exist - return an empty list result
        # TODO(pdames): Raise and catch a more explicit AncestorNotFound
        # error type here.
        return ListResult.empty()
    try:
        # prefer the canonical locator if its name resolves, then fall
        # back to the locator alias, then give up
        locator = (
            self.locator
            if self.locator.name.exists()
            else self.locator_alias
            if self.locator_alias and self.locator_alias.name.exists()
            else None
        )
        immutable_id = (
            # TODO(pdames): Refactor id lazy assignment into explicit getter/setter
            self.get("id")
            or Metafile._locator_to_id(
                locator=locator,
                catalog_root=catalog_root,
                metafile_root=parent_root,
                filesystem=filesystem,
                txn_start_time=current_txn_start_time,
                txn_id=current_txn_id,
            )
            if locator
            else None
        )
    except ValueError:
        # the metafile has been deleted
        return ListResult.empty()
    if not immutable_id:
        # the metafile does not exist
        return ListResult.empty()
    revision_dir_path = posixpath.join(
        parent_root,
        immutable_id,
        REVISION_DIR_NAME,
    )
    revisions = MetafileRevisionInfo.list_revisions(
        revision_dir_path=revision_dir_path,
        filesystem=filesystem,
        success_txn_log_dir=success_txn_log_dir,
        current_txn_start_time=current_txn_start_time,
        current_txn_id=current_txn_id,
        limit=limit,
    )
    items = []
    for mri in revisions:
        if mri.exists():
            # skip deserialization entirely for existence-only reads
            metafile = (
                {}
                if not materialize_revisions
                else self.read(
                    path=mri.path,
                    filesystem=filesystem,
                )
            )
            items.append((mri.txn_op_type, metafile))
    # TODO(pdames): Add pagination.
    return ListResult.of(
        items=items,
        pagination_key=None,
        next_page_provider=None,
    )
+
|
893
|
+
def to_serializable(self) -> Metafile:
    """
    Prepare the object for serialization by converting any non-serializable
    types to serializable types. May also run any required pre-write
    validations on the serialized or deserialized object.
    :return: a serializable version of the object
    """
    # base metafiles are plain serializable mappings; subclasses with
    # non-serializable members override this
    return self
+
|
902
|
+
def from_serializable(
    self,
    path: str,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
) -> Metafile:
    """
    Restore any non-serializable types from a serializable version of this
    object. May also run any required post-read validations on the
    serialized or deserialized object.
    :param path: Metadata file path the object was read from (unused in
    the base implementation; available to subclass overrides).
    :param filesystem: File system the object was read from (unused in
    the base implementation; available to subclass overrides).
    :return: a deserialized version of the object
    """
    return self
+
|
915
|
+
def parent_root_path(
    self,
    catalog_root: str,
    current_txn_start_time: Optional[int] = None,
    current_txn_id: Optional[str] = None,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
) -> str:
    """
    Returns the root directory path of this metafile's immediate parent:
    the catalog root joined with every ancestor ID from root to parent.
    """
    ids = self.ancestor_ids(
        catalog_root=catalog_root,
        current_txn_start_time=current_txn_start_time,
        current_txn_id=current_txn_id,
        filesystem=filesystem,
    )
    return posixpath.join(catalog_root, *ids)
+
|
930
|
+
def metafile_root_path(
    self,
    catalog_root: str,
    current_txn_start_time: Optional[int] = None,
    current_txn_id: Optional[str] = None,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
) -> str:
    """
    Returns this metafile's own root directory path: its parent's root
    path joined with this metafile's immutable ID.
    """
    parent_dir = self.parent_root_path(
        catalog_root=catalog_root,
        current_txn_start_time=current_txn_start_time,
        current_txn_id=current_txn_id,
        filesystem=filesystem,
    )
    return posixpath.join(parent_dir, self.id)
|
+
|
948
|
+
def ancestor_ids(
    self,
    catalog_root: str,
    current_txn_start_time: Optional[int] = None,
    current_txn_id: Optional[str] = None,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
) -> List[str]:
    """
    Returns the IDs for this metafile's ancestor metafiles. IDs are
    listed in order from root to immediate parent.

    Results are memoized under this metafile's "ancestor_ids" key, so the
    (potentially expensive) resolution only runs on the first call.
    """
    cached = self.get("ancestor_ids") or []
    if cached:
        return cached
    resolved = Metafile._ancestor_ids(
        locator=self.locator,
        catalog_root=catalog_root,
        current_txn_start_time=current_txn_start_time,
        current_txn_id=current_txn_id,
        filesystem=filesystem,
    )
    self["ancestor_ids"] = resolved
    return resolved
+
|
971
|
+
@staticmethod
def _parent_metafile_rev_dir_path(
    base_metafile_path: str,
    parent_number,
):
    """
    Returns the revision directory path of the ancestor `parent_number`
    levels above the given base metafile revision file path.
    """
    # TODO(pdames): Stop parent traversal at catalog root.
    # strip the revision file name and revision dir to get the base
    # metafile's root dir, then walk up `parent_number` directories
    current_dir = posixpath.dirname(posixpath.dirname(base_metafile_path))
    hops_remaining = parent_number
    while hops_remaining and current_dir != posixpath.sep:
        current_dir = posixpath.dirname(current_dir)
        hops_remaining -= 1
    return posixpath.join(current_dir, REVISION_DIR_NAME)
+
|
990
|
+
@staticmethod
def _locator_to_id(
    locator: Locator,
    catalog_root: str,
    metafile_root: str,
    filesystem: pyarrow.fs.FileSystem,
    txn_start_time: Optional[int] = None,
    txn_id: Optional[str] = None,
) -> Optional[str]:
    """
    Resolves the immutable metafile ID for the given locator.

    :return: Immutable ID read from mapping file. None if no mapping exists.
    :raises: ValueError if the id is found but has been deleted
    """
    metafile_id = locator.name.immutable_id
    if not metafile_id:
        # the locator name is mutable, so we need to resolve the mapping
        # from the locator back to its immutable metafile ID
        locator_path = locator.path(metafile_root)
        success_txn_log_dir = posixpath.join(
            catalog_root,
            TXN_DIR_NAME,
            SUCCESS_TXN_DIR_NAME,
        )
        mri = MetafileRevisionInfo.latest_revision(
            revision_dir_path=locator_path,
            filesystem=filesystem,
            success_txn_log_dir=success_txn_log_dir,
            current_txn_start_time=txn_start_time,
            current_txn_id=txn_id,
            ignore_missing_revision=True,
        )
        if not mri.exists():
            return None
        if mri.txn_op_type == TransactionOperationType.DELETE:
            err_msg = (
                f"Locator {locator} to metafile ID resolution failed "
                f"because its metafile ID mapping was deleted. You may "
                f"have an old reference to a renamed or deleted object."
            )
            raise ValueError(err_msg)
        # the mapping file's extension (minus the leading dot) carries the
        # immutable metafile ID (see _write_locator_to_id_map_file)
        metafile_id = posixpath.splitext(mri.path)[1][1:]
    return metafile_id
+
|
1035
|
+
@staticmethod
def _ancestor_ids(
    locator: Locator,
    catalog_root: str,
    current_txn_start_time: Optional[int] = None,
    current_txn_id: Optional[str] = None,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
) -> List[str]:
    """
    Resolves the immutable IDs of all ancestors of the given locator by
    walking its parent locator chain from the catalog root down, verifying
    each ancestor's directory exists along the way.
    :return: Ancestor IDs in order from root to immediate parent.
    :raises ValueError: If any ancestor's ID mapping or directory is
    missing.
    """
    ancestor_ids = []
    catalog_root, filesystem = resolve_path_and_filesystem(
        path=catalog_root,
        filesystem=filesystem,
    )
    parent_locators = []
    # TODO(pdames): Correctly resolve missing parents and K of N
    # specified ancestors by using placeholder IDs for missing
    # ancestors
    # collect parents leaf-to-root, then pop() to process root-to-leaf
    parent_locator = locator.parent
    while parent_locator:
        parent_locators.append(parent_locator)
        parent_locator = parent_locator.parent
    metafile_root = catalog_root
    while parent_locators:
        parent_locator = parent_locators.pop()
        ancestor_id = Metafile._locator_to_id(
            locator=parent_locator,
            catalog_root=catalog_root,
            metafile_root=metafile_root,
            filesystem=filesystem,
            txn_start_time=current_txn_start_time,
            txn_id=current_txn_id,
        )
        if not ancestor_id:
            err_msg = f"Ancestor does not exist: {parent_locator}."
            raise ValueError(err_msg)
        # descend into the resolved ancestor's directory for the next hop
        metafile_root = posixpath.join(
            metafile_root,
            ancestor_id,
        )
        try:
            get_file_info(
                path=metafile_root,
                filesystem=filesystem,
            )
        except FileNotFoundError:
            raise ValueError(
                f"Ancestor {parent_locator} does not exist at: " f"{metafile_root}"
            )
        ancestor_ids.append(ancestor_id)
    return ancestor_ids
+
|
1086
|
+
def _write_locator_to_id_map_file(
    self,
    locator: Locator,
    success_txn_log_dir: str,
    parent_obj_path: str,
    current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
    current_txn_op_type: TransactionOperationType,
    current_txn_start_time: int,
    current_txn_id: str,
    filesystem: pyarrow.fs.FileSystem,
) -> None:
    """
    Persists a mapping from a (mutable) locator to this metafile's
    immutable ID by writing an empty revision file whose file extension is
    the metafile ID (see _locator_to_id for the reverse lookup).
    """
    name_resolution_dir_path = locator.path(parent_obj_path)
    # TODO(pdames): Don't write updated revisions with the same mapping as
    # the latest revision.
    mri = MetafileRevisionInfo.new_revision(
        revision_dir_path=name_resolution_dir_path,
        current_txn_op_type=current_txn_op_type,
        current_txn_start_time=current_txn_start_time,
        current_txn_id=current_txn_id,
        filesystem=filesystem,
        extension=f".{self.id}",
        success_txn_log_dir=success_txn_log_dir,
    )
    revision_file_path = mri.path
    filesystem.create_dir(posixpath.dirname(revision_file_path), recursive=True)
    with filesystem.open_output_stream(revision_file_path):
        pass  # Just create an empty ID file to map to the locator
    # record the write path on the transaction operation — presumably so
    # the transaction can track/undo this write; confirm in Transaction
    current_txn_op.append_locator_write_path(revision_file_path)
+
|
1115
|
+
def _write_metafile_revision(
    self,
    success_txn_log_dir: str,
    revision_dir_path: str,
    current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
    current_txn_op_type: TransactionOperationType,
    current_txn_start_time: int,
    current_txn_id: str,
    filesystem: pyarrow.fs.FileSystem,
) -> None:
    """
    Writes one new revision of this metafile into the given revision
    directory and records the written path on the transaction operation.
    """
    revision = MetafileRevisionInfo.new_revision(
        revision_dir_path=revision_dir_path,
        current_txn_op_type=current_txn_op_type,
        current_txn_start_time=current_txn_start_time,
        current_txn_id=current_txn_id,
        filesystem=filesystem,
        success_txn_log_dir=success_txn_log_dir,
    )
    revision_path = revision.path
    self.write(
        path=revision_path,
        filesystem=filesystem,
    )
    current_txn_op.append_metafile_write_path(revision_path)
+
|
1139
|
+
def _write_metafile_revisions(
    self,
    catalog_root: str,
    success_txn_log_dir: str,
    current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
    current_txn_start_time: int,
    current_txn_id: str,
    filesystem: pyarrow.fs.FileSystem,
) -> None:
    """
    Generates the fully qualified paths required to write this metafile as
    part of the given transaction. All paths returned will be based in the
    given root directory.

    Writes a locator-to-ID mapping revision when this metafile has a
    mutable name (or an alias), then writes the metafile revision itself —
    handling renames (locator changed) and replaces (ID changed) for
    UPDATE operations.
    """
    parent_obj_path = self.parent_root_path(
        catalog_root=catalog_root,
        current_txn_start_time=current_txn_start_time,
        current_txn_id=current_txn_id,
        filesystem=filesystem,
    )
    mutable_src_locator = None
    mutable_dest_locator = None
    # metafiles without named immutable IDs have mutable name mappings
    if not self.named_immutable_id:
        mutable_src_locator = (
            current_txn_op.src_metafile.locator
            if current_txn_op.src_metafile
            else None
        )
        mutable_dest_locator = current_txn_op.dest_metafile.locator
    # metafiles with named immutable IDs may have aliases
    elif self.locator_alias:
        mutable_src_locator = (
            current_txn_op.src_metafile.locator_alias
            if current_txn_op.src_metafile
            else None
        )
        mutable_dest_locator = current_txn_op.dest_metafile.locator_alias
    if mutable_dest_locator:
        # the locator name is mutable, so we need to persist a mapping
        # from the locator back to its immutable metafile ID
        if (
            current_txn_op.type == TransactionOperationType.UPDATE
            and mutable_src_locator is not None
            and mutable_src_locator != mutable_dest_locator
        ):
            # this update includes a rename
            # mark the source metafile mapping as deleted
            current_txn_op.src_metafile._write_locator_to_id_map_file(
                locator=mutable_src_locator,
                success_txn_log_dir=success_txn_log_dir,
                parent_obj_path=parent_obj_path,
                current_txn_op=current_txn_op,
                current_txn_op_type=TransactionOperationType.DELETE,
                current_txn_start_time=current_txn_start_time,
                current_txn_id=current_txn_id,
                filesystem=filesystem,
            )
            # mark the dest metafile mapping as created
            self._write_locator_to_id_map_file(
                locator=mutable_dest_locator,
                success_txn_log_dir=success_txn_log_dir,
                parent_obj_path=parent_obj_path,
                current_txn_op=current_txn_op,
                current_txn_op_type=TransactionOperationType.CREATE,
                current_txn_start_time=current_txn_start_time,
                current_txn_id=current_txn_id,
                filesystem=filesystem,
            )
        else:
            # no rename: record the mapping under the op's own type
            self._write_locator_to_id_map_file(
                locator=mutable_dest_locator,
                success_txn_log_dir=success_txn_log_dir,
                parent_obj_path=parent_obj_path,
                current_txn_op=current_txn_op,
                current_txn_op_type=current_txn_op.type,
                current_txn_start_time=current_txn_start_time,
                current_txn_id=current_txn_id,
                filesystem=filesystem,
            )
    metafile_revision_dir_path = posixpath.join(
        parent_obj_path,
        self.id,
        REVISION_DIR_NAME,
    )
    if (
        current_txn_op.type == TransactionOperationType.UPDATE
        and current_txn_op.src_metafile.id != current_txn_op.dest_metafile.id
    ):
        # TODO(pdames): block operations including both a rename & replace?
        # this update includes a replace
        # mark the source metafile as deleted
        src_metafile_revision_dir_path = posixpath.join(
            parent_obj_path,
            current_txn_op.src_metafile.id,
            REVISION_DIR_NAME,
        )
        self._write_metafile_revision(
            success_txn_log_dir=success_txn_log_dir,
            revision_dir_path=src_metafile_revision_dir_path,
            current_txn_op=current_txn_op,
            current_txn_op_type=TransactionOperationType.DELETE,
            current_txn_start_time=current_txn_start_time,
            current_txn_id=current_txn_id,
            filesystem=filesystem,
        )
        try:
            # mark the dest metafile as created
            self._write_metafile_revision(
                success_txn_log_dir=success_txn_log_dir,
                revision_dir_path=metafile_revision_dir_path,
                current_txn_op=current_txn_op,
                current_txn_op_type=TransactionOperationType.CREATE,
                current_txn_start_time=current_txn_start_time,
                current_txn_id=current_txn_id,
                filesystem=filesystem,
            )
        except ValueError as e:
            # TODO(pdames): raise/catch a DuplicateMetafileCreate exception.
            if "already exists" not in str(e):
                raise e
            # src metafile is being replaced by an existing dest metafile
    else:
        # plain create/update/delete of this metafile's own revision
        self._write_metafile_revision(
            success_txn_log_dir=success_txn_log_dir,
            revision_dir_path=metafile_revision_dir_path,
            current_txn_op=current_txn_op,
            current_txn_op_type=current_txn_op.type,
            current_txn_start_time=current_txn_start_time,
            current_txn_id=current_txn_id,
            filesystem=filesystem,
        )
|
1273
|
+
def _list_metafiles(
    self,
    success_txn_log_dir: str,
    metafile_root_dir_path: str,
    current_txn_start_time: Optional[int] = None,
    current_txn_id: Optional[str] = None,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
    limit: Optional[int] = None,
) -> ListResult[Metafile]:
    """
    Lists the latest visible revision of every metafile directly under the
    given root directory (skipping the transaction log directory),
    reading at most `limit` metafiles when a limit is given.
    """
    file_paths_and_sizes = list_directory(
        path=metafile_root_dir_path,
        filesystem=filesystem,
        ignore_missing_path=True,
    )
    # TODO(pdames): Exclude name resolution directories
    # each child entry's revision subdirectory holds its revision files
    revision_dir_paths = [
        posixpath.join(file_path_and_size[0], REVISION_DIR_NAME)
        for file_path_and_size in file_paths_and_sizes
        if file_path_and_size[0] != success_txn_log_dir
    ]
    items = []
    for path in revision_dir_paths:
        mri = MetafileRevisionInfo.latest_revision(
            revision_dir_path=path,
            filesystem=filesystem,
            success_txn_log_dir=success_txn_log_dir,
            current_txn_start_time=current_txn_start_time,
            current_txn_id=current_txn_id,
            ignore_missing_revision=True,
        )
        if mri.exists():
            item = self.read(
                path=mri.path,
                filesystem=filesystem,
            )
            items.append(item)
        if limit and limit <= len(items):
            break
    # TODO(pdames): Add pagination.
    return ListResult.of(
        items=items,
        pagination_key=None,
        next_page_provider=None,
    )