deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their public registry.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
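
Most of the rename entries above move the Rivulet and Iceberg modules under the new `deltacat.experimental` namespace. A minimal before/after import sketch follows; the dotted module paths mirror the file moves listed above, while the imported symbol names are illustrative assumptions not confirmed by this diff:

```python
# Import paths implied by the {storage -> experimental/storage} and
# {catalog -> experimental/catalog} moves listed above. The symbol names
# (Dataset, IcebergCatalogConfig) are assumptions for illustration only.

# deltacat 2.0.0b10:
# from deltacat.storage.rivulet.dataset import Dataset
# from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig

# deltacat 2.0.0b12:
from deltacat.experimental.storage.rivulet.dataset import Dataset
from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig
```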
deltacat/storage/model/transaction.py

```diff
@@ -4,15 +4,21 @@ import os
 import copy
 import time
 import uuid
+import logging
 import posixpath
 from pathlib import PosixPath
 import threading
+import contextvars
 from collections import defaultdict
 
-from
-from typing import Optional, List, Union, Tuple
+from types import TracebackType
+from typing import Optional, List, Union, Tuple, Type, TYPE_CHECKING, Iterable
+
+if TYPE_CHECKING:
+    from deltacat.types.tables import Dataset
 
 import msgpack
+import pyarrow as pa
 import pyarrow.fs
 
 from deltacat.constants import (
@@ -20,22 +26,530 @@ from deltacat.constants import (
     TXN_PART_SEPARATOR,
     RUNNING_TXN_DIR_NAME,
     FAILED_TXN_DIR_NAME,
+    PAUSED_TXN_DIR_NAME,
     SUCCESS_TXN_DIR_NAME,
     NANOS_PER_SEC,
 )
 from deltacat.storage.model.list_result import ListResult
 from deltacat.storage.model.types import (
     TransactionOperationType,
-
+    TransactionState,
+    TransactionStatus,
 )
+from deltacat.storage.model.namespace import NamespaceLocator
+from deltacat.storage.model.table import TableLocator
+from deltacat.storage.model.table_version import TableVersionLocator
+from deltacat.storage.model.stream import StreamLocator
+from deltacat.storage.model.partition import PartitionLocator
+from deltacat.storage.model.delta import DeltaLocator
 from deltacat.storage.model.metafile import (
     Metafile,
     MetafileRevisionInfo,
 )
+from deltacat.types.tables import (
+    DatasetType,
+    from_pyarrow,
+)
 from deltacat.utils.filesystem import (
     resolve_path_and_filesystem,
     list_directory,
+    get_file_info,
 )
+from deltacat import logs
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+# Context variable to store the current transaction
+_current_transaction: contextvars.ContextVar[
+    Optional[Transaction]
+] = contextvars.ContextVar("current_transaction", default=None)
+
+
+def get_current_transaction() -> Optional[Transaction]:
+    """Get the currently active transaction from context."""
+    return _current_transaction.get()
+
+
+def set_current_transaction(transaction: Optional[Transaction]) -> contextvars.Token:
+    """Set the current transaction in context, returns token for restoration."""
+    return _current_transaction.set(transaction)
+
+
+def setup_transaction(
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> Tuple[Transaction, bool]:
+    """
+    Utility method to ensure a transaction exists and determine if it should be committed
+    within the caller's context. Creates a new transaction if none is provided.
+
+    Args:
+        transaction: Optional existing transaction to use
+        **kwargs: Additional arguments for catalog properties
+
+    Returns:
+        Tuple[Transaction, bool]: The transaction to use and whether to commit it
+    """
+    # Check for active transaction in context first
+    if transaction is None:
+        transaction = get_current_transaction()
+
+    commit_transaction = transaction is None
+    if commit_transaction:
+        from deltacat.catalog.model.properties import get_catalog_properties
+
+        catalog_properties = get_catalog_properties(**kwargs)
+        transaction = Transaction.of().start(
+            catalog_properties.root,
+            catalog_properties.filesystem,
+        )
+    return transaction, commit_transaction
+
+
+def transaction_log_dir_and_filesystem(
+    catalog_name: Optional[str] = None,
+) -> Tuple[str, pyarrow.fs.FileSystem]:
+    """
+    Get the transaction log directory and filesystem for the given catalog.
+
+    Args:
+        catalog_name: Name of the catalog to get the transaction log directory and filesystem for.
+            If None, uses the default catalog.
+
+    Returns:
+        Tuple[str, pyarrow.fs.FileSystem]: The transaction log directory and filesystem for the given catalog.
+    """
+    # Get the catalog and its properties
+    from deltacat.catalog.model.catalog import get_catalog
+
+    catalog = get_catalog(catalog_name)
+    catalog_properties = catalog.inner
+
+    # Get transaction directory paths
+    catalog_root_normalized, filesystem = resolve_path_and_filesystem(
+        catalog_properties.root,
+        catalog_properties.filesystem,
+    )
+
+    return posixpath.join(catalog_root_normalized, TXN_DIR_NAME), filesystem
+
+
+def transaction(
+    catalog_name: Optional[str] = None,
+    as_of: Optional[int] = None,
+    commit_message: Optional[str] = None,
+) -> Transaction:
+    """
+    Start a new interactive transaction for the given catalog.
+
+    Args:
+        catalog_name: Optional name of the catalog to run the transaction against.
+            If None, uses the default catalog.
+        as_of: Optional historic timestamp in nanoseconds since epoch.
+            If provided, creates a read-only transaction that reads only transactions
+            with end times strictly less than the specified timestamp.
+        commit_message: Optional commit message to describe the transaction purpose.
+            Helps with time travel functionality by providing context
+            for each transaction when browsing transaction history.
+
+    Returns:
+        Transaction: A started interactive transaction ready for use with the given catalog.
+
+    Example:
+        # Read-write transaction with commit message
+        with dc.transaction(commit_message="Initial data load for Q4 analytics") as txn:
+            dc.write_to_table(data, "my_table")
+            dc.write_to_table(more_data, "my_other_table")
+
+        # Read-only historic transaction
+        import time
+        historic_time = time.time_ns() - 3600 * 1000000000  # 1 hour ago
+        with dc.transaction(as_of=historic_time) as txn:
+            # Only read operations allowed - provides snapshot as of historic_time
+            data = dc.read_table("my_table")
+    """
+    from deltacat.catalog.model.catalog import get_catalog
+
+    # Get the catalog and its properties
+    catalog = get_catalog(catalog_name)
+    catalog_properties = catalog.inner
+
+    # Create interactive transaction
+    if as_of is not None:
+        # Create read-only historic transaction
+        txn = Transaction.of(commit_message=commit_message).start(
+            catalog_properties.root,
+            catalog_properties.filesystem,
+            historic_timestamp=as_of,
+        )
+    else:
+        # Create regular read-write transaction
+        txn = Transaction.of(commit_message=commit_message).start(
+            catalog_properties.root, catalog_properties.filesystem
+        )
+    # Initialize the lazy transaction ID
+    logger.info(f"Created transaction with ID: {txn.id}")
+    return txn
+
+
+def _read_txn(
+    txn_log_dir: str,
+    txn_status: TransactionStatus,
+    transaction_id: str,
+    filesystem: pyarrow.fs.FileSystem,
+) -> Transaction:
+    """
+    Read a transaction ID with the expected status from the given root transaction log directory.
+
+    Args:
+        txn_log_dir: The directory containing the transaction log.
+        txn_status: The expected status of the transaction.
+        transaction_id: The ID of the transaction.
+        filesystem: The filesystem to use for reading the transaction.
+
+    Returns:
+        Transaction: The transaction.
+    """
+    # Transaction directories contain the actual transaction file
+    txn_dir_path = posixpath.join(
+        txn_log_dir, txn_status.dir_name(), posixpath.basename(transaction_id)
+    )
+
+    try:
+        file_info = get_file_info(txn_dir_path, filesystem)
+    except FileNotFoundError:
+        raise FileNotFoundError(
+            f"Transaction with ID '{transaction_id}' and status '{txn_status}' not found."
+        )
+
+    # Only read transaction directories (skip any stray files)
+    if file_info.type != pyarrow.fs.FileType.Directory:
+        raise FileNotFoundError(
+            f"Transaction directory for transaction ID '{transaction_id}' with status '{txn_status}' not found."
+        )
+
+    # List files in the transaction directory
+    txn_files = list_directory(
+        path=txn_dir_path,
+        filesystem=filesystem,
+        ignore_missing_path=True,
+    )
+
+    if not txn_files:
+        raise FileNotFoundError(
+            f"No transaction file found for transaction ID '{transaction_id}' and status '{txn_status}'."
+        )
+
+    if len(txn_files) > 1:
+        raise RuntimeError(
+            f"Expected 1 transaction file in '{txn_dir_path}', but found {len(txn_files)}"
+        )
+
+    # Get the transaction file path
+    txn_file_path, _ = txn_files[0]
+
+    # Read the transaction from the file
+    return Transaction.read(txn_file_path, filesystem)
+
+
+def read_transaction(
+    transaction_id: str,
+    catalog_name: Optional[str] = None,
+    status: TransactionStatus = TransactionStatus.SUCCESS,
+) -> Transaction:
+    """
+    Read a transaction from the given catalog and transaction ID.
+    """
+    txn_log_dir, filesystem = transaction_log_dir_and_filesystem(catalog_name)
+    return _read_txn(txn_log_dir, status, transaction_id, filesystem)
+
+
+def transactions(
+    catalog_name: Optional[str] = None,
+    read_as: "DatasetType" = None,
+    start_time: Optional[int] = None,
+    end_time: Optional[int] = None,
+    limit: Optional[int] = None,
+    status_in: Iterable[TransactionStatus] = [TransactionStatus.SUCCESS],
+) -> Dataset:
+    """
+    Query transaction history for a catalog.
+
+    Args:
+        catalog_name: Optional name of the catalog to query. If None, uses the default catalog.
+        read_as: Dataset type to return results as. If None, defaults to DatasetType.PYARROW.
+        start_time: Optional start timestamp in nanoseconds since epoch to filter transactions.
+        end_time: Optional end timestamp in nanoseconds since epoch to filter transactions.
+        limit: Optional maximum number of transactions to return (most recent first).
+        status_in: Optional iterable of transaction status types to include. Defaults to [TransactionStatus.SUCCESS].
+
+    Returns:
+        Dataset: Transaction history as the specified dataset type with columns:
+            - transaction_id: Unique transaction identifier
+            - commit_message: Optional user-provided commit message
+            - start_time: Transaction start timestamp (nanoseconds since epoch)
+            - end_time: Transaction end timestamp (nanoseconds since epoch, None for running)
+            - status: Transaction status (SUCCESS, RUNNING, FAILED, PAUSED)
+            - operation_count: Number of operations in the transaction
+            - operation_types: Comma-separated list of distinct operation types in the transaction
+            - namespace_count: Number of distinct namespaces affected by the transaction
+            - table_count: Number of distinct tables affected by the transaction
+            - table_version_count: Number of distinct table versions affected by the transaction
+            - stream_count: Number of distinct streams affected by the transaction
+            - partition_count: Number of distinct partitions affected by the transaction
+            - delta_count: Number of distinct deltas affected by the transaction
+
+    Example:
+        # Get recent successful transactions
+        recent = dc.transactions(limit=10)
+
+        # Get transactions for a specific time range
+        import time
+        hour_ago = time.time_ns() - 3600 * 1000000000
+        recent_hour = dc.transactions(start_time=hour_ago)
+
+        # Get transaction history as pandas DataFrame
+        df = dc.transactions(read_as=dc.DatasetType.PANDAS)
+    """
+    # Validate inputs
+    if limit is not None and limit <= 0:
+        raise ValueError("limit must be greater than 0")
+
+    # Set default read_as if not provided
+    if read_as is None:
+        read_as = DatasetType.PYARROW
+
+    if not status_in:
+        status_in = [TransactionStatus.SUCCESS]
+
+    # Get transaction directory path and filesystem
+    txn_log_dir, filesystem = transaction_log_dir_and_filesystem(catalog_name)
+
+    # Collect transaction data
+    transaction_records = {
+        "transaction_id": [],
+        "commit_message": [],
+        "start_time": [],
+        "end_time": [],
+        "status": [],
+        "operation_count": [],
+        "operation_types": [],
+        "namespace_count": [],
+        "table_count": [],
+        "table_version_count": [],
+        "stream_count": [],
+        "partition_count": [],
+        "delta_count": [],
+    }
+
+    # Helper function to process transactions in a directory
+    def process_transactions_in_directory(
+        directory: str, expected_status: TransactionStatus
+    ):
+        # TODO(pdames): Do a recursive listing to get the transaction files returned directly.
+        file_info_and_sizes = list_directory(
+            path=directory,
+            filesystem=filesystem,
+            ignore_missing_path=True,
+        )
+
+        for file_path, _ in file_info_and_sizes:
+            # Read the transaction from the file
+            # TODO(pdames): Do a recursive listing to get the transaction files returned directly.
+            try:
+                txn = _read_txn(
+                    txn_log_dir,
+                    expected_status,
+                    posixpath.basename(file_path),
+                    filesystem,
+                )
+            except FileNotFoundError:
+                # this may be a stray file or the transaction is being created - skip it
+                continue
+
+            # Apply time filters
+            # TODO(pdames): Parse start and end times from the transaction file path.
+            if (
+                start_time is not None
+                and txn.start_time
+                and txn.start_time < start_time
+            ):
+                continue
+            if end_time is not None and txn.end_time and txn.end_time > end_time:
+                continue
+
+            # Count operations and affected metadata objects by type.
+            operation_count = len(txn.operations)
+            operation_types = set()
+            affected_namespaces = set()
+            affected_tables = set()
+            affected_table_versions = set()
+            affected_streams = set()
+            affected_partitions = set()
+            affected_deltas = set()
+
+            for op in txn.operations:
+                operation_types.add(op.type)
+
+                # Determine locator type and cast to appropriate locator class
+                locator_dict = op.dest_metafile.get("locator", {})
+                if "tableName" in locator_dict and "namespaceLocator" in locator_dict:
+                    locator = TableLocator(locator_dict)
+                elif "namespace" in locator_dict:
+                    locator = NamespaceLocator(locator_dict)
+                elif "tableVersion" in locator_dict:
+                    locator = TableVersionLocator(locator_dict)
+                elif "streamId" in locator_dict:
+                    locator = StreamLocator(locator_dict)
+                elif "partitionId" in locator_dict:
+                    locator = PartitionLocator(locator_dict)
+                elif "streamPosition" in locator_dict:
+                    locator = DeltaLocator(locator_dict)
+                else:
+                    raise ValueError(
+                        f"Unknown locator type from structure: {locator_dict}"
+                    )
+
+                # Extract distinct metafiles updated by common/alias name (e.g., a table rename impacts 2 tables instead of 1)
+                if op.type in TransactionOperationType.write_operations():
+                    if locator.namespace is not None:
+                        affected_namespaces.add(locator.namespace)
+                    if isinstance(locator, TableLocator):
+                        affected_tables.add((locator.namespace, locator.table_name))
+                    elif isinstance(locator, TableVersionLocator):
+                        affected_table_versions.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                            )
+                        )
+                    elif isinstance(locator, StreamLocator):
+                        affected_tables.add((locator.namespace, locator.table_name))
+                        affected_table_versions.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                            )
+                        )
+                        affected_streams.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                                locator.stream_id,
+                            )
+                        )
+                    elif isinstance(locator, PartitionLocator):
+                        affected_tables.add((locator.namespace, locator.table_name))
+                        affected_table_versions.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                            )
+                        )
+                        affected_streams.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                                locator.stream_id,
+                            )
+                        )
+                        affected_partitions.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                                locator.stream_id,
+                                locator.partition_id,
+                            )
+                        )
+                    elif isinstance(locator, DeltaLocator):
+                        affected_tables.add((locator.namespace, locator.table_name))
+                        affected_table_versions.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                            )
+                        )
+                        affected_streams.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                                locator.stream_id,
+                            )
+                        )
+                        affected_partitions.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                                locator.stream_id,
+                                locator.partition_id,
+                            )
+                        )
+                        affected_deltas.add(
+                            (
+                                locator.namespace,
+                                locator.table_name,
+                                locator.table_version,
+                                locator.stream_id,
+                                locator.partition_id,
+                                locator.stream_position,
+                            )
+                        )
+
+            # Create transaction record
+            transaction_records["transaction_id"].append(txn.id)
+            transaction_records["commit_message"].append(txn.commit_message)
+            transaction_records["start_time"].append(txn.start_time)
+            transaction_records["end_time"].append(txn.end_time)
+            transaction_records["status"].append(expected_status)
+            transaction_records["operation_count"].append(operation_count)
+            transaction_records["operation_types"].append(operation_types)
+            transaction_records["namespace_count"].append(len(affected_namespaces))
+            transaction_records["table_count"].append(len(affected_tables))
+            transaction_records["table_version_count"].append(
+                len(affected_table_versions)
+            )
+            transaction_records["stream_count"].append(len(affected_streams))
+            transaction_records["partition_count"].append(len(affected_partitions))
+            transaction_records["delta_count"].append(len(affected_deltas))
+
+    for status in status_in:
+        dir_path = posixpath.join(txn_log_dir, status.dir_name())
+        process_transactions_in_directory(dir_path, status)
+
+    # Sort by start_time descending (most recent first)
+    # Convert to list of records, sort, then convert back
+    if transaction_records["transaction_id"]:  # Only sort if we have records
+        # Create list of tuples (start_time, record_index)
+        sorted_indices = sorted(
+            range(len(transaction_records["start_time"])),
+            key=lambda i: transaction_records["start_time"][i] or 0,
+            reverse=True,
+        )
+
+        # Reorder all columns based on sorted indices
+        for key in transaction_records:
+            transaction_records[key] = [
+                transaction_records[key][i] for i in sorted_indices
+            ]
+
+    # Apply limit
+    # TODO(pdames): Apply limit during listing (pyarrow fs doesn't support limits natively).
+    if limit is not None and limit > 0:
+        for key in transaction_records:
+            transaction_records[key] = transaction_records[key][:limit]
+
+    # Convert to requested dataset type
+    return from_pyarrow(pa.Table.from_pydict(transaction_records), read_as)
 
 
 class TransactionTimeProvider:
@@ -147,6 +661,47 @@ class TransactionSystemTimeProvider(TransactionTimeProvider):
         return current_time
 
 
+class TransactionHistoricTimeProvider(TransactionTimeProvider):
+    """
+    A transaction time provider that returns a fixed historic timestamp
+    for read-only transactions. This enables MVCC snapshot isolation
+    as-of the specified timestamp.
+    """
+
+    def __init__(
+        self,
+        historic_timestamp: int,
+        base_time_provider: TransactionTimeProvider,
+    ):
+        """
+        Initialize with a fixed historic timestamp and a base time provider.
+
+        Args:
+            historic_timestamp: Timestamp in nanoseconds since epoch to use
+                for both start and end times.
+            base_time_provider: Time provider to use for the end time.
+        """
+        # Validate that historic timestamp is not in the future
+        if historic_timestamp > base_time_provider.start_time():
+            raise ValueError(
+                f"Historic timestamp {historic_timestamp} cannot be set in the future."
+            )
+        self.base_time_provider = base_time_provider
+        self.historic_timestamp = historic_timestamp
+
+    def start_time(self) -> int:
+        """
+        Returns the fixed historic timestamp.
+        """
+        return self.historic_timestamp
+
+    def end_time(self) -> int:
+        """
+        Returns the end time of the base time provider.
+        """
+        return self.base_time_provider.end_time()
+
+
 class TransactionOperation(dict):
     """
     Base class for DeltaCAT transaction operations against individual metafiles.
@@ -161,10 +716,13 @@ class TransactionOperation(dict):
     ) -> TransactionOperation:
         if not dest_metafile:
             raise ValueError("Transaction operations must have a destination metafile.")
-        if operation_type
+        if operation_type in [
+            TransactionOperationType.UPDATE,
+            TransactionOperationType.REPLACE,
+        ]:
             if not src_metafile:
                 raise ValueError(
-                    "
+                    f"{operation_type.value} transaction operations must have a source metafile."
                 )
             elif type(dest_metafile) is not type(src_metafile):
                 raise ValueError(
@@ -173,10 +731,12 @@ class TransactionOperation(dict):
                 )
         elif src_metafile:
             raise ValueError(
-                "Only UPDATE transaction operations may have a source metafile."
+                f"Only {TransactionOperationType.UPDATE.value} and {TransactionOperationType.REPLACE.value} transaction operations may have a source metafile."
             )
         if operation_type.is_write_operation() and read_limit:
-            raise ValueError(
+            raise ValueError(
+                f"Only {TransactionOperationType.READ.value} transaction operations may have a read limit."
+            )
         txn_op = TransactionOperation()
         txn_op.type = operation_type
         txn_op.dest_metafile = dest_metafile
@@ -189,7 +749,10 @@ class TransactionOperation(dict):
         """
         Returns the type of the transaction operation.
         """
-
+        val = self["type"]
+        if val is not None and not isinstance(val, TransactionOperationType):
+            self["type"] = val = TransactionOperationType(val)
+        return val
 
     @type.setter
     def type(self, txn_op_type: TransactionOperationType):
@@ -200,7 +763,10 @@ class TransactionOperation(dict):
        """
        Returns the metafile that is the target of this transaction operation.
        """
-
+        val = self["dest_metafile"]
+        if val is not None and not isinstance(val, Metafile):
+            self["dest_metafile"] = val = Metafile(val)
+        return val

    @dest_metafile.setter
    def dest_metafile(self, metafile: Metafile):
@@ -211,7 +777,10 @@ class TransactionOperation(dict):
        """
        Returns the metafile that is the source of this transaction operation.
        """
-
+        val = self.get("src_metafile")
+        if val is not None and not isinstance(val, Metafile):
+            self["src_metafile"] = val = Metafile(val)
+        return val

    @src_metafile.setter
    def src_metafile(self, src_metafile: Optional[Metafile]):
@@ -273,6 +842,11 @@ class TransactionOperationList(List[TransactionOperation]):
             self[item] = val = TransactionOperation(val)
         return val
 
+    def __iter__(self):
+        """Support enumeration by returning TransactionOperation objects."""
+        for i in range(len(self)):
+            yield self[i]  # This triggers __getitem__ conversion
+
 
 class Transaction(dict):
     """
@@ -281,43 +855,16 @@ class Transaction(dict):
 
     @staticmethod
     def of(
-
-
+        txn_operations: Optional[TransactionOperationList] = None,
+        commit_message: Optional[str] = None,
     ) -> Transaction:
-
-
-        if operation_types - TransactionOperationType.read_operations():
-            raise ValueError(
-                "Only READ transaction operation types may be specified as "
-                "part of a READ transaction."
-            )
-        elif (
-            len(operation_types) == 1
-            and TransactionOperationType.CREATE in operation_types
-        ):
-            if txn_type != TransactionType.APPEND:
-                raise ValueError(
-                    "Transactions with only CREATE operations must be "
-                    "specified as part of an APPEND transaction."
-                )
-        elif TransactionOperationType.DELETE in operation_types:
-            if txn_type != TransactionType.DELETE:
-                raise ValueError(
-                    "DELETE transaction operations must be specified as part "
-                    "of a DELETE transaction."
-                )
-        elif TransactionOperationType.UPDATE in operation_types and txn_type not in {
-            TransactionType.ALTER,
-            TransactionType.RESTATE,
-            TransactionType.OVERWRITE,
-        }:
-            raise ValueError(
-                "Transactions with UPDATE operations must be specified "
-                "as part of an ALTER, RESTATE, or OVERWRITE transaction."
-            )
+        if txn_operations is None:
+            txn_operations = []
         transaction = Transaction()
-        transaction.type = txn_type
         transaction.operations = txn_operations
+        transaction.interactive = len(txn_operations) == 0
+        if commit_message:
+            transaction.commit_message = commit_message
         return transaction
 
     @staticmethod
@@ -366,6 +913,7 @@ class Transaction(dict):
         :param filesystem: File system to use for reading the Transaction file.
         :return: Deserialized object from the Transaction file.
         """
+
         if not filesystem:
             path, filesystem = resolve_path_and_filesystem(path, filesystem)
         with filesystem.open_input_stream(path) as file:
@@ -373,6 +921,23 @@ class Transaction(dict):
             obj = cls(**msgpack.loads(binary))
         return obj
 
+    @staticmethod
+    def read_time_provider(provider_name: str):
+        """
+        Given the string name of a time provider class, return a new instance of it.
+        Raises ValueError if the provider name is unknown.
+        """
+        TIME_PROVIDER_CLASSES = {
+            "TransactionSystemTimeProvider": TransactionSystemTimeProvider,
+            # Add additional mappings as needed
+        }
+
+        provider_cls = TIME_PROVIDER_CLASSES.get(provider_name)
+        if provider_cls is None:
+            raise ValueError(f"Unknown time provider: {provider_name}")
+
+        return provider_cls()
+
     @property
     def id(self) -> Optional[str]:
         """
@@ -384,16 +949,49 @@ class Transaction(dict):
         _id = self["id"] = f"{self.start_time}{TXN_PART_SEPARATOR}{uuid.uuid4()}"
         return _id
 
-
-    def type(self) -> TransactionType:
+    def state(self, catalog_root_dir: str, filesystem: pyarrow.fs.FileSystem = None):
         """
-
+        Infer the transaction state based on its presence in different directories.
         """
-        return TransactionType(self["type"])
 
-
-
-
+        txn_name = self.id
+
+        catalog_root_normalized, filesystem = resolve_path_and_filesystem(
+            catalog_root_dir
+        )
+
+        txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
+        running_txn_log_dir = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME)
+        filesystem.create_dir(running_txn_log_dir, recursive=True)
+        failed_txn_log_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)
+        filesystem.create_dir(failed_txn_log_dir, recursive=False)
+        success_txn_log_dir = posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME)
+        filesystem.create_dir(success_txn_log_dir, recursive=False)
+        paused_txn_log_dir = posixpath.join(txn_log_dir, PAUSED_TXN_DIR_NAME)
+        filesystem.create_dir(paused_txn_log_dir, recursive=False)
+
+        # Check if the transaction file exists in the failed directory
+        in_failed = os.path.exists(os.path.join(failed_txn_log_dir, txn_name))
+
+        # Check if the transaction file exists in the running directory
+        in_running = os.path.exists(os.path.join(running_txn_log_dir, txn_name))
+
+        # Check if the transaction file exists in the success directory
+        in_success = os.path.exists(os.path.join(success_txn_log_dir, txn_name))
+
+        # Check if the transaction file exists in the paused directory
+        in_paused = os.path.exists(os.path.join(paused_txn_log_dir, txn_name))
+
+        if in_failed and in_running:
+            return TransactionState.FAILED
+        elif in_failed and not in_running:
+            return TransactionState.PURGED
+        elif in_success:
+            return TransactionState.SUCCESS
+        elif in_running:
+            return TransactionState.RUNNING
+        elif in_paused:
+            return TransactionState.PAUSED
 
     @property
     def operations(self) -> TransactionOperationList:
@@ -406,6 +1004,38 @@ class Transaction(dict):
     def operations(self, operations: TransactionOperationList):
         self["operations"] = operations
 
+    @property
+    def metafile_write_paths(self) -> List[str]:
+        return [path for op in self.operations for path in op.metafile_write_paths]
+
+    @property
+    def locator_write_paths(self) -> List[str]:
+        return [path for op in self.operations for path in op.locator_write_paths]
+
+    @property
+    def catalog_root_normalized(self) -> str:
+        """
+        Returns the catalog_root_normalized for this transaction.
+        """
+        return self.get("catalog_root_normalized")
+
+    @catalog_root_normalized.setter
+    def catalog_root_normalized(self, path: str):
+        self["catalog_root_normalized"] = path
+
+    @property
+    def _time_provider(self) -> TransactionSystemTimeProvider:
+        """
+        Returns the time_provider of the transaction.
+        """
+        return self.get("_time_provider")
+
+    @_time_provider.setter
+    def _time_provider(
+        self, tp: TransactionSystemTimeProvider
+    ) -> TransactionSystemTimeProvider:
+        self["_time_provider"] = tp
+
     @property
     def start_time(self) -> Optional[int]:
         """
@@ -413,6 +1043,13 @@ class Transaction(dict):
         """
         return self.get("start_time")
 
+    @property
+    def pause_time(self) -> Optional[int]:
+        """
+        Returns the last pause time of the transaction.
+        """
+        return self.get("pause_time")
+
     @property
     def end_time(self) -> Optional[int]:
         """
@@ -420,6 +1057,34 @@ class Transaction(dict):
         """
         return self.get("end_time")
 
+    @property
+    def commit_message(self) -> Optional[str]:
+        """
+        Returns the commit message for the transaction.
+        """
+        return self.get("commit_message")
+
+    @commit_message.setter
+    def commit_message(self, message: str):
+        """
+        Sets the commit message for the transaction.
+        """
+        self["commit_message"] = message
+
+    @property
+    def historic_timestamp(self) -> Optional[int]:
+        """
+        Returns the historic timestamp for the transaction.
+        """
+        return self.get("historic_timestamp")
+
+    @historic_timestamp.setter
+    def historic_timestamp(self, timestamp: int):
+        """
+        Sets the historic timestamp for the transaction.
+        """
+        self["historic_timestamp"] = timestamp
+
     def _mark_start_time(self, time_provider: TransactionTimeProvider) -> int:
         """
         Sets the start time of the transaction using the given
@@ -445,6 +1110,20 @@ class Transaction(dict):
         end_time = self["end_time"] = time_provider.end_time()
         return end_time
 
+    def _mark_pause_time(self, time_provider: TransactionTimeProvider) -> int:
+        """
+        Sets the pause time of the transaction using the given
+        TransactionTimeProvider. Raises a runtime error if the transaction pause
+        time has already been set by a previous commit, or if the transaction
+        start time has not been set.
+        """
+        if not self.get("start_time"):
+            raise RuntimeError("Cannot pause an unstarted transaction.")
+        if self.get("end_time"):
+            raise RuntimeError("Cannot pause a completed transaction.")
+        pause_time = self["pause_time"] = time_provider.end_time()
+        return pause_time
+
     @staticmethod
     def _abs_txn_meta_path_to_relative(root: str, target: str) -> str:
         """
@@ -499,7 +1178,11 @@ class Transaction(dict):
         validations on the serialized or deserialized object.
         :return: a serializable version of the object
         """
-
+        # Only copy dictionary keys - all other members should not be serialized
+        serializable = Transaction({})
+        for key, value in self.items():
+            serializable[key] = copy.deepcopy(value)
+
         # remove all src/dest metafile contents except IDs and locators to
         # reduce file size (they can be reconstructed from their corresponding
         # files as required).
@@ -530,6 +1213,17 @@ class Transaction(dict):
         }
         # TODO(pdames): Ensure that all file paths recorded are relative to the
         # catalog root.
+
+        # TODO: check if we care about order or exact time stamps --> pickling time_provider?
+        # serializable.pop("_time_provider", None)
+
+        serializable["_time_provider"] = {
+            "type": type(self._time_provider).__name__,
+            "params": {},
+        }
+
+        serializable.catalog_root_normalized = self.catalog_root_normalized
+
         return serializable
 
     @staticmethod
@@ -574,184 +1268,466 @@ class Transaction(dict):
         self,
         catalog_root_dir: str,
         filesystem: Optional[pyarrow.fs.FileSystem] = None,
-    ) -> Union[
-
-
+    ) -> Union[
+        List[ListResult[Metafile]],
+        Tuple[List[str], str],
+        Tuple[List["ListResult[Metafile]"], List[str], str],
+    ]:
+        """
+        Legacy wrapper that preserves the original `commit()` contract while
+        delegating the heavy lifting to the incremental helpers.
+
+        Returns
+        -------
+        - For READ transactions: List[ListResult[Metafile]]
+        - For WRITE transactions: Tuple[List[str], str]
+          (list of successful write-paths, path to success-txn log file)
+        - For mixed READ/WRITE transactions: Tuple[List["ListResult[Metafile]"], List[str], str]
+        """
 
-
-
+        if hasattr(self, "interactive") and self.interactive:
+            raise RuntimeError(
+                "Cannot commit an interactive transaction. Use transaction.start(),transaction.step(), and transaction.seal() instead."
+            )
 
-
-
+        if self.operations and len(self.operations) > 0:
+            # Start a working copy (deep-copy, directory scaffolding, start-time, running/failed/success/paused dirs …)
+            txn_active = self.start(catalog_root_dir, filesystem)  # deep copy
+            # Sequentially execute every TransactionOperation
+            for op in txn_active.operations:
+                txn_active.step(op)
+            return txn_active._seal_steps()
+
+    def start(
+        self,
+        catalog_root_dir: str,
+        filesystem: Optional[pyarrow.fs.FileSystem] = None,
+        historic_timestamp: Optional[int] = None,
+    ) -> "Transaction":
+        """
+        Create directory scaffolding, timestamp the txn, and return a DEEP COPY
+        that the caller should use for all subsequent calls to step(), pause(),
+        and seal(). The original object remains read-only.
+
+        Args:
+            catalog_root_dir: Root directory for the catalog
+            filesystem: Optional filesystem to use
+            historic_timestamp: Optional timestamp in nanoseconds since epoch for snapshot isolation
+        """
+        # Create a deep copy
+        txn: "Transaction" = copy.deepcopy(self)
+
+        # Set up time provider based on transaction type
+        if historic_timestamp is not None:
+            # Use historic time provider for snapshot isolation
+            # TODO(pdames): Set base time provider to the catalog's configured time provider when more than one is supported.
+            txn._time_provider = TransactionHistoricTimeProvider(
+                historic_timestamp,
+                TransactionSystemTimeProvider(),
+            )
+            txn.historic_timestamp = historic_timestamp
+        else:
+            # Use system time provider for regular transactions
+            txn._time_provider = TransactionSystemTimeProvider()
+
+        txn._mark_start_time(txn._time_provider)  # start time on deep_copy
+
+        # Set up filesystem and directories
         catalog_root_normalized, filesystem = resolve_path_and_filesystem(
             catalog_root_dir,
             filesystem,
         )
-
-
-
-
-        filesystem.create_dir(failed_txn_log_dir, recursive=False)
-        success_txn_log_dir = posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME)
-        filesystem.create_dir(success_txn_log_dir, recursive=False)
```
|
1340
|
+
txn.catalog_root_normalized = catalog_root_normalized
|
1341
|
+
txn._filesystem = filesystem # keep for pause/resume
|
1342
|
+
txn.running_log_written = False # internal flags
|
1343
|
+
txn._list_results = []
|
598
1344
|
|
599
|
-
#
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
catalog_root_dir=catalog_root_normalized,
|
611
|
-
success_txn_log_dir=success_txn_log_dir,
|
612
|
-
current_txn_op=operation,
|
613
|
-
current_txn_start_time=txn.start_time,
|
614
|
-
current_txn_id=txn.id,
|
615
|
-
filesystem=filesystem,
|
1345
|
+
# Make sure txn/ directories exist (idempotent)
|
1346
|
+
txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
|
1347
|
+
filesystem.create_dir(
|
1348
|
+
posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME),
|
1349
|
+
recursive=True,
|
1350
|
+
)
|
1351
|
+
for subdir in (FAILED_TXN_DIR_NAME, SUCCESS_TXN_DIR_NAME, PAUSED_TXN_DIR_NAME):
|
1352
|
+
try:
|
1353
|
+
filesystem.create_dir(
|
1354
|
+
posixpath.join(txn_log_dir, subdir),
|
1355
|
+
recursive=False,
|
616
1356
|
)
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
return txn._commit_write(
|
621
|
-
catalog_root_normalized=catalog_root_normalized,
|
622
|
-
running_txn_log_dir=running_txn_log_dir,
|
623
|
-
failed_txn_log_dir=failed_txn_log_dir,
|
624
|
-
success_txn_log_dir=success_txn_log_dir,
|
625
|
-
filesystem=filesystem,
|
626
|
-
time_provider=time_provider,
|
627
|
-
)
|
1357
|
+
except FileExistsError:
|
1358
|
+
pass # allowed when catalog already initialised
|
1359
|
+
return txn
|
628
1360
|
|
629
|
-
def
|
1361
|
+
def step(
|
630
1362
|
self,
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
1363
|
+
operation: "TransactionOperation",
|
1364
|
+
) -> Union[ListResult[Metafile], Tuple[List[str], List[str]]]:
|
1365
|
+
"""
|
1366
|
+
Executes a single transaction operation.
|
1367
|
+
|
1368
|
+
Parameters
|
1369
|
+
----------
|
1370
|
+
operation: TransactionOperation
|
1371
|
+
The transaction operation to execute.
|
1372
|
+
|
1373
|
+
Returns
|
1374
|
+
-------
|
1375
|
+
- For READ transaction operation: ListResult[Metafile]
|
1376
|
+
- For WRITE transaction operation: Tuple[List[str], List[str]]
|
1377
|
+
(list of successful write-paths, list of successful locator write-paths)
|
1378
|
+
"""
|
1379
|
+
|
1380
|
+
catalog_root_normalized = self.catalog_root_normalized
|
1381
|
+
filesystem = self._filesystem
|
1382
|
+
txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
|
1383
|
+
|
639
1384
|
running_txn_log_file_path = posixpath.join(
|
640
|
-
|
641
|
-
self.id,
|
1385
|
+
txn_log_dir, RUNNING_TXN_DIR_NAME, self.id
|
642
1386
|
)
|
643
|
-
with filesystem.open_output_stream(running_txn_log_file_path) as file:
|
644
|
-
packed = msgpack.dumps(self.to_serializable(catalog_root_normalized))
|
645
|
-
file.write(packed)
|
646
1387
|
|
647
|
-
#
|
648
|
-
|
649
|
-
|
1388
|
+
# Validate read-only transaction constraints
|
1389
|
+
if self.historic_timestamp is not None:
|
1390
|
+
if not operation.type.is_read_operation():
|
1391
|
+
raise RuntimeError(
|
1392
|
+
f"Cannot perform {operation.type.value} operation in a read-only historic transaction."
|
1393
|
+
)
|
1394
|
+
|
1395
|
+
# Add new operation to the transaction's list of operations
|
1396
|
+
if self.interactive:
|
1397
|
+
self.operations = self.operations + [operation]
|
1398
|
+
|
1399
|
+
# (a) READ txn op
|
1400
|
+
if operation.type.is_read_operation():
|
1401
|
+
list_result = operation.dest_metafile.read_txn(
|
1402
|
+
catalog_root_dir=catalog_root_normalized,
|
1403
|
+
success_txn_log_dir=posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME),
|
1404
|
+
current_txn_op=operation,
|
1405
|
+
current_txn_start_time=self.start_time,
|
1406
|
+
current_txn_id=self.id,
|
1407
|
+
filesystem=filesystem,
|
1408
|
+
)
|
1409
|
+
self._list_results.append(list_result)
|
1410
|
+
return list_result
|
1411
|
+
|
1412
|
+
# (b) WRITE txn op
|
1413
|
+
# First operation? -> create running log so an external janitor can
|
1414
|
+
# see that a txn is in-flight.
|
1415
|
+
if not self.running_log_written:
|
1416
|
+
self._write_running_log(running_txn_log_file_path)
|
1417
|
+
|
650
1418
|
try:
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
1419
|
+
(
|
1420
|
+
metafile_write_paths,
|
1421
|
+
locator_write_paths,
|
1422
|
+
) = operation.dest_metafile.write_txn(
|
1423
|
+
catalog_root_dir=catalog_root_normalized,
|
1424
|
+
success_txn_log_dir=posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME),
|
1425
|
+
current_txn_op=operation,
|
1426
|
+
current_txn_start_time=self.start_time,
|
1427
|
+
current_txn_id=self.id,
|
1428
|
+
filesystem=filesystem,
|
1429
|
+
)
|
1430
|
+
# Check for concurrent txn conflicts on the metafile and locator write paths just written
|
1431
|
+
# TODO(pdames): Remove the fast-fail check here if it grows too expensive?
|
1432
|
+
for path in metafile_write_paths + locator_write_paths:
|
1433
|
+
MetafileRevisionInfo.check_for_concurrent_txn_conflict(
|
1434
|
+
success_txn_log_dir=posixpath.join(
|
1435
|
+
txn_log_dir,
|
1436
|
+
SUCCESS_TXN_DIR_NAME,
|
1437
|
+
),
|
1438
|
+
current_txn_revision_file_path=path,
|
658
1439
|
filesystem=filesystem,
|
659
1440
|
)
|
660
|
-
|
661
|
-
locator_write_paths.extend(operation.locator_write_paths)
|
662
|
-
# check for conflicts with concurrent transactions
|
663
|
-
for path in metafile_write_paths + locator_write_paths:
|
664
|
-
MetafileRevisionInfo.check_for_concurrent_txn_conflict(
|
665
|
-
success_txn_log_dir=success_txn_log_dir,
|
666
|
-
current_txn_revision_file_path=path,
|
667
|
-
filesystem=filesystem,
|
668
|
-
)
|
1441
|
+
return metafile_write_paths, locator_write_paths
|
669
1442
|
except Exception:
|
670
|
-
#
|
671
|
-
|
672
|
-
failed_txn_log_dir,
|
673
|
-
|
674
|
-
)
|
675
|
-
with filesystem.open_output_stream(failed_txn_log_file_path) as file:
|
676
|
-
packed = msgpack.dumps(self.to_serializable(catalog_root_normalized))
|
677
|
-
file.write(packed)
|
678
|
-
|
679
|
-
###################################################################
|
680
|
-
###################################################################
|
681
|
-
# failure past here telegraphs a failed transaction cleanup attempt
|
682
|
-
###################################################################
|
683
|
-
###################################################################
|
684
|
-
|
685
|
-
# delete all files written during the failed transaction
|
686
|
-
known_write_paths = chain.from_iterable(
|
687
|
-
[
|
688
|
-
operation.metafile_write_paths + operation.locator_write_paths
|
689
|
-
for operation in self.operations
|
690
|
-
]
|
1443
|
+
# convert in-flight txn → FAILED and clean up partial files
|
1444
|
+
self._fail_and_cleanup(
|
1445
|
+
failed_txn_log_dir=posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME),
|
1446
|
+
running_log_path=running_txn_log_file_path,
|
691
1447
|
)
|
692
|
-
#
|
693
|
-
# either failed to add to the known write paths, or fail to delete.
|
694
|
-
for write_path in known_write_paths:
|
695
|
-
filesystem.delete_file(write_path)
|
696
|
-
|
697
|
-
# delete the in-progress transaction log file entry
|
698
|
-
filesystem.delete_file(running_txn_log_file_path)
|
699
|
-
# failed transaction cleanup is now complete
|
700
|
-
raise
|
1448
|
+
raise # surface original error
|
701
1449
|
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
)
|
711
|
-
|
712
|
-
|
713
|
-
|
714
|
-
|
1450
|
+
def pause(self) -> None:
|
1451
|
+
fs = self._filesystem
|
1452
|
+
root = self.catalog_root_normalized
|
1453
|
+
txn_log_dir = posixpath.join(root, TXN_DIR_NAME)
|
1454
|
+
|
1455
|
+
running_path = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME, self.id)
|
1456
|
+
paused_path = posixpath.join(txn_log_dir, PAUSED_TXN_DIR_NAME, self.id)
|
1457
|
+
|
1458
|
+
fs.create_dir(posixpath.dirname(paused_path), recursive=True)
|
1459
|
+
|
1460
|
+
# Record pause time (e.g., for time consistency guarantees)
|
1461
|
+
self._mark_pause_time(self._time_provider)
|
1462
|
+
|
1463
|
+
# Serialize current transaction state into paused/txn_id
|
1464
|
+
with fs.open_output_stream(paused_path) as f:
|
1465
|
+
f.write(msgpack.dumps(self.to_serializable(root)))
|
1466
|
+
|
1467
|
+
# Clean up original running log
|
1468
|
+
fs.delete_file(running_path)
|
1469
|
+
|
1470
|
+
def resume(self) -> None:
|
1471
|
+
fs = self._filesystem
|
1472
|
+
root = self.catalog_root_normalized
|
1473
|
+
txn_log_dir = posixpath.join(root, TXN_DIR_NAME)
|
1474
|
+
|
1475
|
+
running_path = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME, self.id)
|
1476
|
+
paused_path = posixpath.join(txn_log_dir, PAUSED_TXN_DIR_NAME, self.id)
|
1477
|
+
|
1478
|
+
# Load serialized transaction state
|
1479
|
+
with fs.open_input_stream(paused_path) as f:
|
1480
|
+
loaded_txn_data = msgpack.loads(f.readall())
|
1481
|
+
|
1482
|
+
# Restore relevant fields
|
1483
|
+
restored_txn = Transaction(**loaded_txn_data)
|
1484
|
+
self.__dict__.update(
|
1485
|
+
restored_txn.__dict__
|
1486
|
+
) # make curr txn the same as restored (fill vars and stuff)
|
1487
|
+
|
1488
|
+
# To support restoring time provider state if we ever add non-ephemeral ones.
|
1489
|
+
new_provider = Transaction.read_time_provider(
|
1490
|
+
restored_txn["_time_provider"]["type"]
|
715
1491
|
)
|
716
|
-
|
717
|
-
|
718
|
-
|
1492
|
+
|
1493
|
+
# evaluate system clock
|
1494
|
+
now = new_provider.start_time()
|
1495
|
+
self._time_provider = new_provider # start time should be preserved
|
1496
|
+
if now < self.pause_time:
|
1497
|
+
raise RuntimeError(
|
1498
|
+
f"System clock {now} is behind paused transaction time {self._pause_time}"
|
1499
|
+
)
|
1500
|
+
# TODO: set new start time or keep error if clock is off?
|
1501
|
+
|
1502
|
+
# Move back to running state
|
1503
|
+
fs.create_dir(posixpath.dirname(running_path), recursive=True)
|
1504
|
+
with fs.open_output_stream(running_path) as f:
|
1505
|
+
f.write(msgpack.dumps(self.to_serializable(root)))
|
1506
|
+
fs.delete_file(paused_path)
|
1507
|
+
|
1508
|
+
def seal(
|
1509
|
+
self,
|
1510
|
+
) -> Union[
|
1511
|
+
List["ListResult[Metafile]"],
|
1512
|
+
Tuple[List[str], str],
|
1513
|
+
Tuple[List["ListResult[Metafile]"], List[str], str],
|
1514
|
+
]:
|
1515
|
+
"""
|
1516
|
+
For READ → returns list_results collected during step().
|
1517
|
+
For WRITE → returns (written_paths, success_log_path).
|
1518
|
+
"""
|
1519
|
+
if not self.interactive:
|
1520
|
+
raise RuntimeError(
|
1521
|
+
"Cannot seal a non-interactive transaction. Call transaction.commit() instead."
|
1522
|
+
)
|
1523
|
+
|
1524
|
+
# Read-only transactions can only perform read operations
|
1525
|
+
if self.historic_timestamp is not None:
|
1526
|
+
if self._has_write_operations():
|
1527
|
+
raise RuntimeError(
|
1528
|
+
"Cannot seal a read-only historic transaction that contains write operations."
|
1529
|
+
)
|
1530
|
+
|
1531
|
+
return self._seal_steps()
|
1532
|
+
|
1533
|
+
def _has_write_operations(self) -> bool:
|
1534
|
+
"""
|
1535
|
+
Check if the transaction contains any write operations.
|
1536
|
+
Read-only transactions should only contain READ operations.
|
1537
|
+
"""
|
1538
|
+
for operation in self.operations:
|
1539
|
+
if not operation.type.is_read_operation():
|
1540
|
+
return True
|
1541
|
+
return False
|
1542
|
+
|
1543
|
+
def _seal_steps(
|
1544
|
+
self,
|
1545
|
+
) -> Union[
|
1546
|
+
List["ListResult[Metafile]"],
|
1547
|
+
Tuple[List[str], str],
|
1548
|
+
Tuple[List["ListResult[Metafile]"], List[str], str],
|
1549
|
+
]:
|
1550
|
+
fs = self._filesystem
|
1551
|
+
root = self.catalog_root_normalized
|
1552
|
+
txn_log_dir = posixpath.join(root, TXN_DIR_NAME)
|
1553
|
+
end_time = self._mark_end_time(self._time_provider)
|
1554
|
+
|
1555
|
+
# READ path: nothing persisted, so we are done
|
1556
|
+
if all(op.type.is_read_operation() for op in self.operations):
|
1557
|
+
return self._list_results
|
1558
|
+
|
1559
|
+
running_path = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME, self.id)
|
1560
|
+
failed_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)
|
1561
|
+
success_dir = posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME)
|
1562
|
+
|
1563
|
+
# If no operations ever succeeded we still need a running log.
|
1564
|
+
if not self.running_log_written:
|
1565
|
+
self._write_running_log(running_path)
|
719
1566
|
try:
|
720
|
-
|
721
|
-
|
1567
|
+
# Check for concurrent txn conflicts on metafile and locator write paths
|
1568
|
+
for path in self.metafile_write_paths + self.locator_write_paths:
|
1569
|
+
MetafileRevisionInfo.check_for_concurrent_txn_conflict(
|
1570
|
+
success_txn_log_dir=posixpath.join(
|
1571
|
+
txn_log_dir, SUCCESS_TXN_DIR_NAME
|
1572
|
+
),
|
1573
|
+
current_txn_revision_file_path=path,
|
1574
|
+
filesystem=fs,
|
1575
|
+
)
|
1576
|
+
except Exception:
|
1577
|
+
self._fail_and_cleanup(
|
1578
|
+
failed_txn_log_dir=failed_dir,
|
1579
|
+
running_log_path=running_path,
|
722
1580
|
)
|
1581
|
+
# raise the original error
|
1582
|
+
raise
|
1583
|
+
success_log_path = None
|
1584
|
+
try:
|
1585
|
+
# write transaction log
|
1586
|
+
success_txn_dir = posixpath.join(success_dir, self.id)
|
1587
|
+
fs.create_dir(success_txn_dir, recursive=False)
|
1588
|
+
|
1589
|
+
success_log_path = posixpath.join(success_txn_dir, str(end_time))
|
1590
|
+
with fs.open_output_stream(success_log_path) as f:
|
1591
|
+
f.write(msgpack.dumps(self.to_serializable(root)))
|
1592
|
+
|
1593
|
+
Transaction._validate_txn_log_file(success_txn_log_file=success_log_path)
|
1594
|
+
|
723
1595
|
except Exception as e1:
|
1596
|
+
self._fail_and_cleanup(
|
1597
|
+
failed_txn_log_dir=failed_dir,
|
1598
|
+
running_log_path=running_path,
|
1599
|
+
success_log_path=success_log_path,
|
1600
|
+
)
|
1601
|
+
raise RuntimeError(
|
1602
|
+
f"Transaction validation failed. To preserve catalog integrity, "
|
1603
|
+
f"the corresponding completed transaction log at "
|
1604
|
+
f"`{success_log_path}` has been removed."
|
1605
|
+
) from e1
|
1606
|
+
|
1607
|
+
else:
|
1608
|
+
fs.delete_file(running_path)
|
1609
|
+
if all(op.type.is_write_operation() for op in self.operations):
|
1610
|
+
# pure write transaction - just return write paths and success log path
|
1611
|
+
return self.metafile_write_paths, success_log_path
|
1612
|
+
else:
|
1613
|
+
# mixed read/write transaction - return read results, write paths, and success log path
|
1614
|
+
return self._list_results, self.metafile_write_paths, success_log_path
|
1615
|
+
|
1616
|
+
# Helper: write or overwrite the running/ID file exactly once
|
1617
|
+
def _write_running_log(self, running_log_path: str) -> None:
|
1618
|
+
with self._filesystem.open_output_stream(running_log_path) as f:
|
1619
|
+
f.write(msgpack.dumps(self.to_serializable(self.catalog_root_normalized)))
|
1620
|
+
self.running_log_written = True
|
1621
|
+
|
1622
|
+
# Helper: mark txn FAILED and clean partial output
|
1623
|
+
def _fail_and_cleanup(
|
1624
|
+
self,
|
1625
|
+
failed_txn_log_dir: str,
|
1626
|
+
running_log_path: str,
|
1627
|
+
success_log_path: Optional[str] = None,
|
1628
|
+
) -> None:
|
1629
|
+
fs = self._filesystem
|
1630
|
+
|
1631
|
+
# 1. write failed/ID
|
1632
|
+
failed_log_path = posixpath.join(failed_txn_log_dir, self.id)
|
1633
|
+
with fs.open_output_stream(failed_log_path) as f:
|
1634
|
+
f.write(msgpack.dumps(self.to_serializable(self.catalog_root_normalized)))
|
1635
|
+
|
1636
|
+
# 2. delete all provisional files
|
1637
|
+
for path in self.metafile_write_paths:
|
724
1638
|
try:
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
except Exception
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
)
|
1639
|
+
fs.delete_file(path)
|
1640
|
+
except Exception:
|
1641
|
+
pass # best-effort; janitor job will catch leftovers
|
1642
|
+
for path in self.locator_write_paths:
|
1643
|
+
try:
|
1644
|
+
fs.delete_file(path)
|
1645
|
+
except Exception:
|
1646
|
+
pass # best-effort; janitor job will catch leftovers
|
1647
|
+
|
1648
|
+
# 3. tidy up bookkeeping logs
|
1649
|
+
try:
|
1650
|
+
fs.delete_file(running_log_path)
|
1651
|
+
except Exception:
|
1652
|
+
pass
|
1653
|
+
if success_log_path:
|
1654
|
+
try:
|
1655
|
+
fs.delete_file(success_log_path)
|
1656
|
+
except Exception:
|
1657
|
+
pass
|
1658
|
+
|
1659
|
+
def __enter__(self) -> "Transaction":
|
1660
|
+
"""
|
1661
|
+
Context manager entry point. Sets this transaction as the current context.
|
1662
|
+
Supports nested transactions by preserving the context stack.
|
1663
|
+
"""
|
1664
|
+
if not hasattr(self, "interactive") or not self.interactive:
|
1665
|
+
raise RuntimeError(
|
1666
|
+
"Transaction must be interactive to use with context manager. "
|
1667
|
+
"Use dc.transaction() to create an interactive transaction."
|
1668
|
+
)
|
1669
|
+
if self.start_time is None:
|
1670
|
+
raise RuntimeError(
|
1671
|
+
"Transaction has not been started. "
|
1672
|
+
"Use dc.transaction() to create a properly initialized transaction."
|
1673
|
+
)
|
1674
|
+
|
1675
|
+
# Store the context token for restoration in __exit__
|
1676
|
+
self._context_token = set_current_transaction(self)
|
1677
|
+
return self
|
1678
|
+
|
1679
|
+
def __exit__(
|
1680
|
+
self,
|
1681
|
+
exc_type: Optional[Type[BaseException]],
|
1682
|
+
exc_value: Optional[BaseException],
|
1683
|
+
traceback: Optional[TracebackType],
|
1684
|
+
) -> None:
|
1685
|
+
"""
|
1686
|
+
Context manager exit point. Restores previous transaction context and
|
1687
|
+
automatically seals the transaction on successful completion or fails it
|
1688
|
+
if an exception occurred.
|
1689
|
+
|
1690
|
+
Args:
|
1691
|
+
exc_type: Exception type if an exception occurred, None otherwise
|
1692
|
+
exc_value: Exception value if an exception occurred, None otherwise
|
1693
|
+
traceback: Exception traceback if an exception occurred, None otherwise
|
1694
|
+
"""
|
1695
|
+
try:
|
1696
|
+
if exc_type is None and exc_value is None and traceback is None:
|
1697
|
+
# No exception occurred - seal the transaction
|
1698
|
+
self.seal()
|
1699
|
+
else:
|
1700
|
+
# Exception occurred during transaction - fail and cleanup
|
1701
|
+
try:
|
1702
|
+
catalog_root_normalized = self.catalog_root_normalized
|
1703
|
+
txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
|
1704
|
+
running_txn_log_file_path = posixpath.join(
|
1705
|
+
txn_log_dir, RUNNING_TXN_DIR_NAME, self.id
|
1706
|
+
)
|
1707
|
+
self._fail_and_cleanup(
|
1708
|
+
failed_txn_log_dir=posixpath.join(
|
1709
|
+
txn_log_dir, FAILED_TXN_DIR_NAME
|
1710
|
+
),
|
1711
|
+
running_log_path=running_txn_log_file_path,
|
1712
|
+
)
|
1713
|
+
except Exception:
|
1714
|
+
# If cleanup fails, still let the original exception propagate
|
1715
|
+
pass
|
754
1716
|
finally:
|
755
|
-
#
|
756
|
-
|
757
|
-
|
1717
|
+
# Always restore the previous transaction context using the token
|
1718
|
+
if hasattr(self, "_context_token"):
|
1719
|
+
try:
|
1720
|
+
# Get the previous value from the token
|
1721
|
+
old_value = self._context_token.old_value
|
1722
|
+
# Only set if the old value is a valid transaction or None
|
1723
|
+
if old_value is None or isinstance(old_value, Transaction):
|
1724
|
+
_current_transaction.set(old_value)
|
1725
|
+
else:
|
1726
|
+
# If old_value is not valid (e.g., Token.MISSING), set to None
|
1727
|
+
_current_transaction.set(None)
|
1728
|
+
except (AttributeError, LookupError):
|
1729
|
+
# If token doesn't have old_value or context is corrupted, clear it
|
1730
|
+
try:
|
1731
|
+
_current_transaction.set(None)
|
1732
|
+
except LookupError:
|
1733
|
+
pass
|