deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1733 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
import copy
|
5
|
+
import time
|
6
|
+
import uuid
|
7
|
+
import logging
|
8
|
+
import posixpath
|
9
|
+
from pathlib import PosixPath
|
10
|
+
import threading
|
11
|
+
import contextvars
|
12
|
+
from collections import defaultdict
|
13
|
+
|
14
|
+
from types import TracebackType
|
15
|
+
from typing import Optional, List, Union, Tuple, Type, TYPE_CHECKING, Iterable
|
16
|
+
|
17
|
+
if TYPE_CHECKING:
|
18
|
+
from deltacat.types.tables import Dataset
|
19
|
+
|
20
|
+
import msgpack
|
21
|
+
import pyarrow as pa
|
22
|
+
import pyarrow.fs
|
23
|
+
|
24
|
+
from deltacat.constants import (
|
25
|
+
TXN_DIR_NAME,
|
26
|
+
TXN_PART_SEPARATOR,
|
27
|
+
RUNNING_TXN_DIR_NAME,
|
28
|
+
FAILED_TXN_DIR_NAME,
|
29
|
+
PAUSED_TXN_DIR_NAME,
|
30
|
+
SUCCESS_TXN_DIR_NAME,
|
31
|
+
NANOS_PER_SEC,
|
32
|
+
)
|
33
|
+
from deltacat.storage.model.list_result import ListResult
|
34
|
+
from deltacat.storage.model.types import (
|
35
|
+
TransactionOperationType,
|
36
|
+
TransactionState,
|
37
|
+
TransactionStatus,
|
38
|
+
)
|
39
|
+
from deltacat.storage.model.namespace import NamespaceLocator
|
40
|
+
from deltacat.storage.model.table import TableLocator
|
41
|
+
from deltacat.storage.model.table_version import TableVersionLocator
|
42
|
+
from deltacat.storage.model.stream import StreamLocator
|
43
|
+
from deltacat.storage.model.partition import PartitionLocator
|
44
|
+
from deltacat.storage.model.delta import DeltaLocator
|
45
|
+
from deltacat.storage.model.metafile import (
|
46
|
+
Metafile,
|
47
|
+
MetafileRevisionInfo,
|
48
|
+
)
|
49
|
+
from deltacat.types.tables import (
|
50
|
+
DatasetType,
|
51
|
+
from_pyarrow,
|
52
|
+
)
|
53
|
+
from deltacat.utils.filesystem import (
|
54
|
+
resolve_path_and_filesystem,
|
55
|
+
list_directory,
|
56
|
+
get_file_info,
|
57
|
+
)
|
58
|
+
from deltacat import logs
|
59
|
+
|
60
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
61
|
+
|
62
|
+
|
63
|
+
# Context variable to store the current transaction
|
64
|
+
_current_transaction: contextvars.ContextVar[
|
65
|
+
Optional[Transaction]
|
66
|
+
] = contextvars.ContextVar("current_transaction", default=None)
|
67
|
+
|
68
|
+
|
69
|
+
def get_current_transaction() -> Optional[Transaction]:
|
70
|
+
"""Get the currently active transaction from context."""
|
71
|
+
return _current_transaction.get()
|
72
|
+
|
73
|
+
|
74
|
+
def set_current_transaction(transaction: Optional[Transaction]) -> contextvars.Token:
|
75
|
+
"""Set the current transaction in context, returns token for restoration."""
|
76
|
+
return _current_transaction.set(transaction)
|
77
|
+
|
78
|
+
|
79
|
+
def setup_transaction(
|
80
|
+
transaction: Optional[Transaction] = None,
|
81
|
+
**kwargs,
|
82
|
+
) -> Tuple[Transaction, bool]:
|
83
|
+
"""
|
84
|
+
Utility method to ensure a transaction exists and determine if it should be committed
|
85
|
+
within the caller's context. Creates a new transaction if none is provided.
|
86
|
+
|
87
|
+
Args:
|
88
|
+
transaction: Optional existing transaction to use
|
89
|
+
**kwargs: Additional arguments for catalog properties
|
90
|
+
|
91
|
+
Returns:
|
92
|
+
Tuple[Transaction, bool]: The transaction to use and whether to commit it
|
93
|
+
"""
|
94
|
+
# Check for active transaction in context first
|
95
|
+
if transaction is None:
|
96
|
+
transaction = get_current_transaction()
|
97
|
+
|
98
|
+
commit_transaction = transaction is None
|
99
|
+
if commit_transaction:
|
100
|
+
from deltacat.catalog.model.properties import get_catalog_properties
|
101
|
+
|
102
|
+
catalog_properties = get_catalog_properties(**kwargs)
|
103
|
+
transaction = Transaction.of().start(
|
104
|
+
catalog_properties.root,
|
105
|
+
catalog_properties.filesystem,
|
106
|
+
)
|
107
|
+
return transaction, commit_transaction
|
108
|
+
|
109
|
+
|
110
|
+
def transaction_log_dir_and_filesystem(
|
111
|
+
catalog_name: Optional[str] = None,
|
112
|
+
) -> Tuple[str, pyarrow.fs.FileSystem]:
|
113
|
+
"""
|
114
|
+
Get the transaction log directory and filesystem for the given catalog.
|
115
|
+
|
116
|
+
Args:
|
117
|
+
catalog_name: Name of the catalog to get the transaction log directory and filesystem for.
|
118
|
+
If None, uses the default catalog.
|
119
|
+
|
120
|
+
Returns:
|
121
|
+
Tuple[str, pyarrow.fs.FileSystem]: The transaction log directory and filesystem for the given catalog.
|
122
|
+
"""
|
123
|
+
# Get the catalog and its properties
|
124
|
+
from deltacat.catalog.model.catalog import get_catalog
|
125
|
+
|
126
|
+
catalog = get_catalog(catalog_name)
|
127
|
+
catalog_properties = catalog.inner
|
128
|
+
|
129
|
+
# Get transaction directory paths
|
130
|
+
catalog_root_normalized, filesystem = resolve_path_and_filesystem(
|
131
|
+
catalog_properties.root,
|
132
|
+
catalog_properties.filesystem,
|
133
|
+
)
|
134
|
+
|
135
|
+
return posixpath.join(catalog_root_normalized, TXN_DIR_NAME), filesystem
|
136
|
+
|
137
|
+
|
138
|
+
def transaction(
|
139
|
+
catalog_name: Optional[str] = None,
|
140
|
+
as_of: Optional[int] = None,
|
141
|
+
commit_message: Optional[str] = None,
|
142
|
+
) -> Transaction:
|
143
|
+
"""
|
144
|
+
Start a new interactive transaction for the given catalog.
|
145
|
+
|
146
|
+
Args:
|
147
|
+
catalog_name: Optional name of the catalog to run the transaction against.
|
148
|
+
If None, uses the default catalog.
|
149
|
+
as_of: Optional historic timestamp in nanoseconds since epoch.
|
150
|
+
If provided, creates a read-only transaction that reads only transactions
|
151
|
+
with end times strictly less than the specified timestamp.
|
152
|
+
commit_message: Optional commit message to describe the transaction purpose.
|
153
|
+
Helps with time travel functionality by providing context
|
154
|
+
for each transaction when browsing transaction history.
|
155
|
+
|
156
|
+
Returns:
|
157
|
+
Transaction: A started interactive transaction ready for use with the given catalog.
|
158
|
+
|
159
|
+
Example:
|
160
|
+
# Read-write transaction with commit message
|
161
|
+
with dc.transaction(commit_message="Initial data load for Q4 analytics") as txn:
|
162
|
+
dc.write_to_table(data, "my_table")
|
163
|
+
dc.write_to_table(more_data, "my_other_table")
|
164
|
+
|
165
|
+
# Read-only historic transaction
|
166
|
+
import time
|
167
|
+
historic_time = time.time_ns() - 3600 * 1000000000 # 1 hour ago
|
168
|
+
with dc.transaction(as_of=historic_time) as txn:
|
169
|
+
# Only read operations allowed - provides snapshot as of historic_time
|
170
|
+
data = dc.read_table("my_table")
|
171
|
+
"""
|
172
|
+
from deltacat.catalog.model.catalog import get_catalog
|
173
|
+
|
174
|
+
# Get the catalog and its properties
|
175
|
+
catalog = get_catalog(catalog_name)
|
176
|
+
catalog_properties = catalog.inner
|
177
|
+
|
178
|
+
# Create interactive transaction
|
179
|
+
if as_of is not None:
|
180
|
+
# Create read-only historic transaction
|
181
|
+
txn = Transaction.of(commit_message=commit_message).start(
|
182
|
+
catalog_properties.root,
|
183
|
+
catalog_properties.filesystem,
|
184
|
+
historic_timestamp=as_of,
|
185
|
+
)
|
186
|
+
else:
|
187
|
+
# Create regular read-write transaction
|
188
|
+
txn = Transaction.of(commit_message=commit_message).start(
|
189
|
+
catalog_properties.root, catalog_properties.filesystem
|
190
|
+
)
|
191
|
+
# Initialize the lazy transaction ID
|
192
|
+
logger.info(f"Created transaction with ID: {txn.id}")
|
193
|
+
return txn
|
194
|
+
|
195
|
+
|
196
|
+
def _read_txn(
|
197
|
+
txn_log_dir: str,
|
198
|
+
txn_status: TransactionStatus,
|
199
|
+
transaction_id: str,
|
200
|
+
filesystem: pyarrow.fs.FileSystem,
|
201
|
+
) -> Transaction:
|
202
|
+
"""
|
203
|
+
Read a transaction ID with the expected status from the given root transaction log directory.
|
204
|
+
|
205
|
+
Args:
|
206
|
+
txn_log_dir: The directory containing the transaction log.
|
207
|
+
txn_status: The expected status of the transaction.
|
208
|
+
transaction_id: The ID of the transaction.
|
209
|
+
filesystem: The filesystem to use for reading the transaction.
|
210
|
+
|
211
|
+
Returns:
|
212
|
+
Transaction: The transaction.
|
213
|
+
"""
|
214
|
+
# Transaction directories contain the actual transaction file
|
215
|
+
txn_dir_path = posixpath.join(
|
216
|
+
txn_log_dir, txn_status.dir_name(), posixpath.basename(transaction_id)
|
217
|
+
)
|
218
|
+
|
219
|
+
try:
|
220
|
+
file_info = get_file_info(txn_dir_path, filesystem)
|
221
|
+
except FileNotFoundError:
|
222
|
+
raise FileNotFoundError(
|
223
|
+
f"Transaction with ID '{transaction_id}' and status '{txn_status}' not found."
|
224
|
+
)
|
225
|
+
|
226
|
+
# Only read transaction directories (skip any stray files)
|
227
|
+
if file_info.type != pyarrow.fs.FileType.Directory:
|
228
|
+
raise FileNotFoundError(
|
229
|
+
f"Transaction directory for transaction ID '{transaction_id}' with status '{txn_status}' not found."
|
230
|
+
)
|
231
|
+
|
232
|
+
# List files in the transaction directory
|
233
|
+
txn_files = list_directory(
|
234
|
+
path=txn_dir_path,
|
235
|
+
filesystem=filesystem,
|
236
|
+
ignore_missing_path=True,
|
237
|
+
)
|
238
|
+
|
239
|
+
if not txn_files:
|
240
|
+
raise FileNotFoundError(
|
241
|
+
f"No transaction file found for transaction ID '{transaction_id}' and status '{txn_status}'."
|
242
|
+
)
|
243
|
+
|
244
|
+
if len(txn_files) > 1:
|
245
|
+
raise RuntimeError(
|
246
|
+
f"Expected 1 transaction file in '{txn_dir_path}', but found {len(txn_files)}"
|
247
|
+
)
|
248
|
+
|
249
|
+
# Get the transaction file path
|
250
|
+
txn_file_path, _ = txn_files[0]
|
251
|
+
|
252
|
+
# Read the transaction from the file
|
253
|
+
return Transaction.read(txn_file_path, filesystem)
|
254
|
+
|
255
|
+
|
256
|
+
def read_transaction(
|
257
|
+
transaction_id: str,
|
258
|
+
catalog_name: Optional[str] = None,
|
259
|
+
status: TransactionStatus = TransactionStatus.SUCCESS,
|
260
|
+
) -> Transaction:
|
261
|
+
"""
|
262
|
+
Read a transaction from the given catalog and transaction ID.
|
263
|
+
"""
|
264
|
+
txn_log_dir, filesystem = transaction_log_dir_and_filesystem(catalog_name)
|
265
|
+
return _read_txn(txn_log_dir, status, transaction_id, filesystem)
|
266
|
+
|
267
|
+
|
268
|
+
def transactions(
|
269
|
+
catalog_name: Optional[str] = None,
|
270
|
+
read_as: "DatasetType" = None,
|
271
|
+
start_time: Optional[int] = None,
|
272
|
+
end_time: Optional[int] = None,
|
273
|
+
limit: Optional[int] = None,
|
274
|
+
status_in: Iterable[TransactionStatus] = [TransactionStatus.SUCCESS],
|
275
|
+
) -> Dataset:
|
276
|
+
"""
|
277
|
+
Query transaction history for a catalog.
|
278
|
+
|
279
|
+
Args:
|
280
|
+
catalog_name: Optional name of the catalog to query. If None, uses the default catalog.
|
281
|
+
read_as: Dataset type to return results as. If None, defaults to DatasetType.PYARROW.
|
282
|
+
start_time: Optional start timestamp in nanoseconds since epoch to filter transactions.
|
283
|
+
end_time: Optional end timestamp in nanoseconds since epoch to filter transactions.
|
284
|
+
limit: Optional maximum number of transactions to return (most recent first).
|
285
|
+
status_in: Optional iterable of transaction status types to include. Defaults to [TransactionStatus.SUCCESS].
|
286
|
+
|
287
|
+
Returns:
|
288
|
+
Dataset: Transaction history as the specified dataset type with columns:
|
289
|
+
- transaction_id: Unique transaction identifier
|
290
|
+
- commit_message: Optional user-provided commit message
|
291
|
+
- start_time: Transaction start timestamp (nanoseconds since epoch)
|
292
|
+
- end_time: Transaction end timestamp (nanoseconds since epoch, None for running)
|
293
|
+
- status: Transaction status (SUCCESS, RUNNING, FAILED, PAUSED)
|
294
|
+
- operation_count: Number of operations in the transaction
|
295
|
+
- operation_types: Comma-separated list of distinct operation types in the transaction
|
296
|
+
- namespace_count: Number of distinct namespaces affected by the transaction
|
297
|
+
- table_count: Number of distinct tables affected by the transaction
|
298
|
+
- table_version_count: Number of distinct table versions affected by the transaction
|
299
|
+
- stream_count: Number of distinct streams affected by the transaction
|
300
|
+
- partition_count: Number of distinct partitions affected by the transaction
|
301
|
+
- delta_count: Number of distinct deltas affected by the transaction
|
302
|
+
|
303
|
+
Example:
|
304
|
+
# Get recent successful transactions
|
305
|
+
recent = dc.transactions(limit=10)
|
306
|
+
|
307
|
+
# Get transactions for a specific time range
|
308
|
+
import time
|
309
|
+
hour_ago = time.time_ns() - 3600 * 1000000000
|
310
|
+
recent_hour = dc.transactions(start_time=hour_ago)
|
311
|
+
|
312
|
+
# Get transaction history as pandas DataFrame
|
313
|
+
df = dc.transactions(read_as=dc.DatasetType.PANDAS)
|
314
|
+
"""
|
315
|
+
# Validate inputs
|
316
|
+
if limit is not None and limit <= 0:
|
317
|
+
raise ValueError("limit must be greater than 0")
|
318
|
+
|
319
|
+
# Set default read_as if not provided
|
320
|
+
if read_as is None:
|
321
|
+
read_as = DatasetType.PYARROW
|
322
|
+
|
323
|
+
if not status_in:
|
324
|
+
status_in = [TransactionStatus.SUCCESS]
|
325
|
+
|
326
|
+
# Get transaction directory path and filesystem
|
327
|
+
txn_log_dir, filesystem = transaction_log_dir_and_filesystem(catalog_name)
|
328
|
+
|
329
|
+
# Collect transaction data
|
330
|
+
transaction_records = {
|
331
|
+
"transaction_id": [],
|
332
|
+
"commit_message": [],
|
333
|
+
"start_time": [],
|
334
|
+
"end_time": [],
|
335
|
+
"status": [],
|
336
|
+
"operation_count": [],
|
337
|
+
"operation_types": [],
|
338
|
+
"namespace_count": [],
|
339
|
+
"table_count": [],
|
340
|
+
"table_version_count": [],
|
341
|
+
"stream_count": [],
|
342
|
+
"partition_count": [],
|
343
|
+
"delta_count": [],
|
344
|
+
}
|
345
|
+
|
346
|
+
# Helper function to process transactions in a directory
|
347
|
+
def process_transactions_in_directory(
|
348
|
+
directory: str, expected_status: TransactionStatus
|
349
|
+
):
|
350
|
+
# TODO(pdames): Do a recursive listing to get the transaction files returned directly.
|
351
|
+
file_info_and_sizes = list_directory(
|
352
|
+
path=directory,
|
353
|
+
filesystem=filesystem,
|
354
|
+
ignore_missing_path=True,
|
355
|
+
)
|
356
|
+
|
357
|
+
for file_path, _ in file_info_and_sizes:
|
358
|
+
# Read the transaction from the file
|
359
|
+
# TODO(pdames): Do a recursive listing to get the transaction files returned directly.
|
360
|
+
try:
|
361
|
+
txn = _read_txn(
|
362
|
+
txn_log_dir,
|
363
|
+
expected_status,
|
364
|
+
posixpath.basename(file_path),
|
365
|
+
filesystem,
|
366
|
+
)
|
367
|
+
except FileNotFoundError:
|
368
|
+
# this may be a stray file or the transaction is being created - skip it
|
369
|
+
continue
|
370
|
+
|
371
|
+
# Apply time filters
|
372
|
+
# TODO(pdames): Parse start and end times from the transaction file path.
|
373
|
+
if (
|
374
|
+
start_time is not None
|
375
|
+
and txn.start_time
|
376
|
+
and txn.start_time < start_time
|
377
|
+
):
|
378
|
+
continue
|
379
|
+
if end_time is not None and txn.end_time and txn.end_time > end_time:
|
380
|
+
continue
|
381
|
+
|
382
|
+
# Count operations and affected metadata objects by type.
|
383
|
+
operation_count = len(txn.operations)
|
384
|
+
operation_types = set()
|
385
|
+
affected_namespaces = set()
|
386
|
+
affected_tables = set()
|
387
|
+
affected_table_versions = set()
|
388
|
+
affected_streams = set()
|
389
|
+
affected_partitions = set()
|
390
|
+
affected_deltas = set()
|
391
|
+
|
392
|
+
for op in txn.operations:
|
393
|
+
operation_types.add(op.type)
|
394
|
+
|
395
|
+
# Determine locator type and cast to appropriate locator class
|
396
|
+
locator_dict = op.dest_metafile.get("locator", {})
|
397
|
+
if "tableName" in locator_dict and "namespaceLocator" in locator_dict:
|
398
|
+
locator = TableLocator(locator_dict)
|
399
|
+
elif "namespace" in locator_dict:
|
400
|
+
locator = NamespaceLocator(locator_dict)
|
401
|
+
elif "tableVersion" in locator_dict:
|
402
|
+
locator = TableVersionLocator(locator_dict)
|
403
|
+
elif "streamId" in locator_dict:
|
404
|
+
locator = StreamLocator(locator_dict)
|
405
|
+
elif "partitionId" in locator_dict:
|
406
|
+
locator = PartitionLocator(locator_dict)
|
407
|
+
elif "streamPosition" in locator_dict:
|
408
|
+
locator = DeltaLocator(locator_dict)
|
409
|
+
else:
|
410
|
+
raise ValueError(
|
411
|
+
f"Unknown locator type from structure: {locator_dict}"
|
412
|
+
)
|
413
|
+
|
414
|
+
# Extract distinct metafiles updated by common/alias name (e.g., a table rename impacts 2 tables instead of 1)
|
415
|
+
if op.type in TransactionOperationType.write_operations():
|
416
|
+
if locator.namespace is not None:
|
417
|
+
affected_namespaces.add(locator.namespace)
|
418
|
+
if isinstance(locator, TableLocator):
|
419
|
+
affected_tables.add((locator.namespace, locator.table_name))
|
420
|
+
elif isinstance(locator, TableVersionLocator):
|
421
|
+
affected_table_versions.add(
|
422
|
+
(
|
423
|
+
locator.namespace,
|
424
|
+
locator.table_name,
|
425
|
+
locator.table_version,
|
426
|
+
)
|
427
|
+
)
|
428
|
+
elif isinstance(locator, StreamLocator):
|
429
|
+
affected_tables.add((locator.namespace, locator.table_name))
|
430
|
+
affected_table_versions.add(
|
431
|
+
(
|
432
|
+
locator.namespace,
|
433
|
+
locator.table_name,
|
434
|
+
locator.table_version,
|
435
|
+
)
|
436
|
+
)
|
437
|
+
affected_streams.add(
|
438
|
+
(
|
439
|
+
locator.namespace,
|
440
|
+
locator.table_name,
|
441
|
+
locator.table_version,
|
442
|
+
locator.stream_id,
|
443
|
+
)
|
444
|
+
)
|
445
|
+
elif isinstance(locator, PartitionLocator):
|
446
|
+
affected_tables.add((locator.namespace, locator.table_name))
|
447
|
+
affected_table_versions.add(
|
448
|
+
(
|
449
|
+
locator.namespace,
|
450
|
+
locator.table_name,
|
451
|
+
locator.table_version,
|
452
|
+
)
|
453
|
+
)
|
454
|
+
affected_streams.add(
|
455
|
+
(
|
456
|
+
locator.namespace,
|
457
|
+
locator.table_name,
|
458
|
+
locator.table_version,
|
459
|
+
locator.stream_id,
|
460
|
+
)
|
461
|
+
)
|
462
|
+
affected_partitions.add(
|
463
|
+
(
|
464
|
+
locator.namespace,
|
465
|
+
locator.table_name,
|
466
|
+
locator.table_version,
|
467
|
+
locator.stream_id,
|
468
|
+
locator.partition_id,
|
469
|
+
)
|
470
|
+
)
|
471
|
+
elif isinstance(locator, DeltaLocator):
|
472
|
+
affected_tables.add((locator.namespace, locator.table_name))
|
473
|
+
affected_table_versions.add(
|
474
|
+
(
|
475
|
+
locator.namespace,
|
476
|
+
locator.table_name,
|
477
|
+
locator.table_version,
|
478
|
+
)
|
479
|
+
)
|
480
|
+
affected_streams.add(
|
481
|
+
(
|
482
|
+
locator.namespace,
|
483
|
+
locator.table_name,
|
484
|
+
locator.table_version,
|
485
|
+
locator.stream_id,
|
486
|
+
)
|
487
|
+
)
|
488
|
+
affected_partitions.add(
|
489
|
+
(
|
490
|
+
locator.namespace,
|
491
|
+
locator.table_name,
|
492
|
+
locator.table_version,
|
493
|
+
locator.stream_id,
|
494
|
+
locator.partition_id,
|
495
|
+
)
|
496
|
+
)
|
497
|
+
affected_deltas.add(
|
498
|
+
(
|
499
|
+
locator.namespace,
|
500
|
+
locator.table_name,
|
501
|
+
locator.table_version,
|
502
|
+
locator.stream_id,
|
503
|
+
locator.partition_id,
|
504
|
+
locator.stream_position,
|
505
|
+
)
|
506
|
+
)
|
507
|
+
|
508
|
+
# Create transaction record
|
509
|
+
transaction_records["transaction_id"].append(txn.id)
|
510
|
+
transaction_records["commit_message"].append(txn.commit_message)
|
511
|
+
transaction_records["start_time"].append(txn.start_time)
|
512
|
+
transaction_records["end_time"].append(txn.end_time)
|
513
|
+
transaction_records["status"].append(expected_status)
|
514
|
+
transaction_records["operation_count"].append(operation_count)
|
515
|
+
transaction_records["operation_types"].append(operation_types)
|
516
|
+
transaction_records["namespace_count"].append(len(affected_namespaces))
|
517
|
+
transaction_records["table_count"].append(len(affected_tables))
|
518
|
+
transaction_records["table_version_count"].append(
|
519
|
+
len(affected_table_versions)
|
520
|
+
)
|
521
|
+
transaction_records["stream_count"].append(len(affected_streams))
|
522
|
+
transaction_records["partition_count"].append(len(affected_partitions))
|
523
|
+
transaction_records["delta_count"].append(len(affected_deltas))
|
524
|
+
|
525
|
+
for status in status_in:
|
526
|
+
dir_path = posixpath.join(txn_log_dir, status.dir_name())
|
527
|
+
process_transactions_in_directory(dir_path, status)
|
528
|
+
|
529
|
+
# Sort by start_time descending (most recent first)
|
530
|
+
# Convert to list of records, sort, then convert back
|
531
|
+
if transaction_records["transaction_id"]: # Only sort if we have records
|
532
|
+
# Create list of tuples (start_time, record_index)
|
533
|
+
sorted_indices = sorted(
|
534
|
+
range(len(transaction_records["start_time"])),
|
535
|
+
key=lambda i: transaction_records["start_time"][i] or 0,
|
536
|
+
reverse=True,
|
537
|
+
)
|
538
|
+
|
539
|
+
# Reorder all columns based on sorted indices
|
540
|
+
for key in transaction_records:
|
541
|
+
transaction_records[key] = [
|
542
|
+
transaction_records[key][i] for i in sorted_indices
|
543
|
+
]
|
544
|
+
|
545
|
+
# Apply limit
|
546
|
+
# TODO(pdames): Apply limit during listing (pyarrow fs doesn't support limits natively).
|
547
|
+
if limit is not None and limit > 0:
|
548
|
+
for key in transaction_records:
|
549
|
+
transaction_records[key] = transaction_records[key][:limit]
|
550
|
+
|
551
|
+
# Convert to requested dataset type
|
552
|
+
return from_pyarrow(pa.Table.from_pydict(transaction_records), read_as)
|
553
|
+
|
554
|
+
|
555
|
+
class TransactionTimeProvider:
|
556
|
+
"""
|
557
|
+
Provider interface for transaction start and end times. An ideal
|
558
|
+
transaction time provider is externally consistent (e.g.,
|
559
|
+
https://cloud.google.com/spanner/docs/true-time-external-consistency),
|
560
|
+
such that:
|
561
|
+
1. A transaction start time is never less than a previously completed
|
562
|
+
transaction's end time.
|
563
|
+
2. A transaction end time is never less than an in-progress
|
564
|
+
transaction's start time.
|
565
|
+
3. Every transaction has a unique start and end time.
|
566
|
+
4. Start/end time assignment is non-blocking.
|
567
|
+
"""
|
568
|
+
|
569
|
+
def start_time(self) -> int:
|
570
|
+
raise NotImplementedError("start_time not implemented")
|
571
|
+
|
572
|
+
def end_time(self) -> int:
|
573
|
+
raise NotImplementedError("end_time not implemented")
|
574
|
+
|
575
|
+
|
576
|
+
class TransactionSystemTimeProvider(TransactionTimeProvider):
|
577
|
+
"""
|
578
|
+
A local transaction time provider that returns the current system clock
|
579
|
+
epoch time in nanoseconds. Ensures that all local transaction start
|
580
|
+
times are greater than all last known end times, and that all known end
|
581
|
+
times are no less than all last known start time across all local threads
|
582
|
+
using this time provider.
|
583
|
+
|
584
|
+
Note that this time provider gives no external consistency guarantees due
|
585
|
+
to potential clock skew between distributed nodes writing to the same
|
586
|
+
catalog, and is only recommended for use with local catalogs.
|
587
|
+
"""
|
588
|
+
|
589
|
+
last_known_start_times = defaultdict(int)
|
590
|
+
last_known_end_times = defaultdict(int)
|
591
|
+
|
592
|
+
# don't wait more than 60 seconds for the system clock to catch up
|
593
|
+
# between transactions (assumed to be indicative of a larger system
|
594
|
+
# clock change made between transactions)
|
595
|
+
max_sync_wait_time = 60 * NANOS_PER_SEC
|
596
|
+
|
597
|
+
def start_time(self) -> int:
|
598
|
+
"""
|
599
|
+
Gets the current system time in nanoseconds since the epoch. Ensures
|
600
|
+
that the start time returned is greater than the last known end time
|
601
|
+
recorded at the time this method is invoked.
|
602
|
+
:return: Current epoch time in nanoseconds.
|
603
|
+
"""
|
604
|
+
# ensure serial transactions in a single process have start times after
|
605
|
+
# the last known end time
|
606
|
+
last_known_end_times = self.last_known_end_times.values() or [0]
|
607
|
+
max_known_end_time = max(last_known_end_times)
|
608
|
+
|
609
|
+
elapsed_start_time = time.monotonic_ns()
|
610
|
+
current_time = time.time_ns()
|
611
|
+
while current_time <= max_known_end_time:
|
612
|
+
elapsed_time = time.monotonic_ns() - elapsed_start_time
|
613
|
+
if elapsed_time > self.max_sync_wait_time:
|
614
|
+
raise TimeoutError(
|
615
|
+
f"Failed to sync cross-transaction system clock time after "
|
616
|
+
f"{self.max_sync_wait_time / NANOS_PER_SEC} seconds, "
|
617
|
+
f"aborting."
|
618
|
+
)
|
619
|
+
time.sleep(0.000001)
|
620
|
+
current_time = time.time_ns()
|
621
|
+
|
622
|
+
# update the current thread's last known end time
|
623
|
+
pid = os.getpid()
|
624
|
+
tid = threading.current_thread().ident
|
625
|
+
current_thread_time_key = (pid, tid)
|
626
|
+
self.last_known_end_times[current_thread_time_key] = current_time
|
627
|
+
|
628
|
+
return current_time
|
629
|
+
|
630
|
+
def end_time(self) -> int:
|
631
|
+
"""
|
632
|
+
Gets the current system time in nanoseconds since the epoch. Ensures
|
633
|
+
that the end time returned is no less than the last known start time
|
634
|
+
recorded at the time this method is invoked.
|
635
|
+
:return: Current epoch time in nanoseconds.
|
636
|
+
"""
|
637
|
+
# ensure serial transactions in a single process have end times no less
|
638
|
+
# than the last known start time
|
639
|
+
last_known_start_times = self.last_known_start_times.values() or [0]
|
640
|
+
last_start_time = max(last_known_start_times)
|
641
|
+
|
642
|
+
elapsed_start_time = time.monotonic_ns()
|
643
|
+
current_time = time.time_ns()
|
644
|
+
while current_time < last_start_time:
|
645
|
+
elapsed_time = time.monotonic_ns() - elapsed_start_time
|
646
|
+
if elapsed_time > self.max_sync_wait_time:
|
647
|
+
raise TimeoutError(
|
648
|
+
f"Failed to sync cross-transaction system clock time after "
|
649
|
+
f"{self.max_sync_wait_time / NANOS_PER_SEC} seconds, "
|
650
|
+
f"aborting."
|
651
|
+
)
|
652
|
+
time.sleep(0.000001)
|
653
|
+
current_time = time.time_ns()
|
654
|
+
|
655
|
+
# update the current thread's last known end time
|
656
|
+
pid = os.getpid()
|
657
|
+
tid = threading.current_thread().ident
|
658
|
+
current_thread_time_key = (pid, tid)
|
659
|
+
self.last_known_start_times[current_thread_time_key] = current_time
|
660
|
+
|
661
|
+
return current_time
|
662
|
+
|
663
|
+
|
664
|
+
class TransactionHistoricTimeProvider(TransactionTimeProvider):
|
665
|
+
"""
|
666
|
+
A transaction time provider that returns a fixed historic timestamp
|
667
|
+
for read-only transactions. This enables MVCC snapshot isolation
|
668
|
+
as-of the specified timestamp.
|
669
|
+
"""
|
670
|
+
|
671
|
+
def __init__(
|
672
|
+
self,
|
673
|
+
historic_timestamp: int,
|
674
|
+
base_time_provider: TransactionTimeProvider,
|
675
|
+
):
|
676
|
+
"""
|
677
|
+
Initialize with a fixed historic timestamp and a base time provider.
|
678
|
+
|
679
|
+
Args:
|
680
|
+
historic_timestamp: Timestamp in nanoseconds since epoch to use
|
681
|
+
for both start and end times.
|
682
|
+
base_time_provider: Time provider to use for the end time.
|
683
|
+
"""
|
684
|
+
# Validate that historic timestamp is not in the future
|
685
|
+
if historic_timestamp > base_time_provider.start_time():
|
686
|
+
raise ValueError(
|
687
|
+
f"Historic timestamp {historic_timestamp} cannot be set in the future."
|
688
|
+
)
|
689
|
+
self.base_time_provider = base_time_provider
|
690
|
+
self.historic_timestamp = historic_timestamp
|
691
|
+
|
692
|
+
def start_time(self) -> int:
|
693
|
+
"""
|
694
|
+
Returns the fixed historic timestamp.
|
695
|
+
"""
|
696
|
+
return self.historic_timestamp
|
697
|
+
|
698
|
+
def end_time(self) -> int:
|
699
|
+
"""
|
700
|
+
Returns the end time of the base time provider.
|
701
|
+
"""
|
702
|
+
return self.base_time_provider.end_time()
|
703
|
+
|
704
|
+
|
705
|
+
class TransactionOperation(dict):
|
706
|
+
"""
|
707
|
+
Base class for DeltaCAT transaction operations against individual metafiles.
|
708
|
+
"""
|
709
|
+
|
710
|
+
@staticmethod
|
711
|
+
def of(
|
712
|
+
operation_type: Optional[TransactionOperationType],
|
713
|
+
dest_metafile: Metafile,
|
714
|
+
src_metafile: Optional[Metafile] = None,
|
715
|
+
read_limit: Optional[int] = None,
|
716
|
+
) -> TransactionOperation:
|
717
|
+
if not dest_metafile:
|
718
|
+
raise ValueError("Transaction operations must have a destination metafile.")
|
719
|
+
if operation_type in [
|
720
|
+
TransactionOperationType.UPDATE,
|
721
|
+
TransactionOperationType.REPLACE,
|
722
|
+
]:
|
723
|
+
if not src_metafile:
|
724
|
+
raise ValueError(
|
725
|
+
f"{operation_type.value} transaction operations must have a source metafile."
|
726
|
+
)
|
727
|
+
elif type(dest_metafile) is not type(src_metafile):
|
728
|
+
raise ValueError(
|
729
|
+
f"Source metafile type `{type(src_metafile)}` is not "
|
730
|
+
f"equal to dest metafile type `{type(dest_metafile)}`."
|
731
|
+
)
|
732
|
+
elif src_metafile:
|
733
|
+
raise ValueError(
|
734
|
+
f"Only {TransactionOperationType.UPDATE.value} and {TransactionOperationType.REPLACE.value} transaction operations may have a source metafile."
|
735
|
+
)
|
736
|
+
if operation_type.is_write_operation() and read_limit:
|
737
|
+
raise ValueError(
|
738
|
+
f"Only {TransactionOperationType.READ.value} transaction operations may have a read limit."
|
739
|
+
)
|
740
|
+
txn_op = TransactionOperation()
|
741
|
+
txn_op.type = operation_type
|
742
|
+
txn_op.dest_metafile = dest_metafile
|
743
|
+
txn_op.src_metafile = src_metafile
|
744
|
+
txn_op.read_limit = read_limit
|
745
|
+
return txn_op
|
746
|
+
|
747
|
+
@property
|
748
|
+
def type(self) -> TransactionOperationType:
|
749
|
+
"""
|
750
|
+
Returns the type of the transaction operation.
|
751
|
+
"""
|
752
|
+
val = self["type"]
|
753
|
+
if val is not None and not isinstance(val, TransactionOperationType):
|
754
|
+
self["type"] = val = TransactionOperationType(val)
|
755
|
+
return val
|
756
|
+
|
757
|
+
@type.setter
|
758
|
+
def type(self, txn_op_type: TransactionOperationType):
|
759
|
+
self["type"] = txn_op_type
|
760
|
+
|
761
|
+
@property
|
762
|
+
def dest_metafile(self) -> Metafile:
|
763
|
+
"""
|
764
|
+
Returns the metafile that is the target of this transaction operation.
|
765
|
+
"""
|
766
|
+
val = self["dest_metafile"]
|
767
|
+
if val is not None and not isinstance(val, Metafile):
|
768
|
+
self["dest_metafile"] = val = Metafile(val)
|
769
|
+
return val
|
770
|
+
|
771
|
+
@dest_metafile.setter
|
772
|
+
def dest_metafile(self, metafile: Metafile):
|
773
|
+
self["dest_metafile"] = metafile
|
774
|
+
|
775
|
+
@property
|
776
|
+
def src_metafile(self) -> Optional[Metafile]:
|
777
|
+
"""
|
778
|
+
Returns the metafile that is the source of this transaction operation.
|
779
|
+
"""
|
780
|
+
val = self.get("src_metafile")
|
781
|
+
if val is not None and not isinstance(val, Metafile):
|
782
|
+
self["src_metafile"] = val = Metafile(val)
|
783
|
+
return val
|
784
|
+
|
785
|
+
@src_metafile.setter
|
786
|
+
def src_metafile(self, src_metafile: Optional[Metafile]):
|
787
|
+
self["src_metafile"] = src_metafile
|
788
|
+
|
789
|
+
@property
|
790
|
+
def read_limit(self) -> Optional[int]:
|
791
|
+
"""
|
792
|
+
Returns the read limit for this transaction operation.
|
793
|
+
"""
|
794
|
+
return self.get("read_limit")
|
795
|
+
|
796
|
+
@read_limit.setter
|
797
|
+
def read_limit(self, read_limit: Optional[int]):
|
798
|
+
self["read_limit"] = read_limit
|
799
|
+
|
800
|
+
@property
|
801
|
+
def metafile_write_paths(self) -> List[str]:
|
802
|
+
return self.get("metafile_write_paths") or []
|
803
|
+
|
804
|
+
@property
|
805
|
+
def locator_write_paths(self) -> List[str]:
|
806
|
+
return self.get("locator_write_paths") or []
|
807
|
+
|
808
|
+
def append_metafile_write_path(self, write_path: str):
|
809
|
+
metafile_write_paths = self.get("metafile_write_paths")
|
810
|
+
if not metafile_write_paths:
|
811
|
+
metafile_write_paths = self["metafile_write_paths"] = []
|
812
|
+
metafile_write_paths.append(write_path)
|
813
|
+
|
814
|
+
def append_locator_write_path(self, write_path: str):
|
815
|
+
locator_write_paths = self.get("locator_write_paths")
|
816
|
+
if not locator_write_paths:
|
817
|
+
locator_write_paths = self["locator_write_paths"] = []
|
818
|
+
locator_write_paths.append(write_path)
|
819
|
+
|
820
|
+
@metafile_write_paths.setter
|
821
|
+
def metafile_write_paths(self, write_paths: List[str]) -> None:
|
822
|
+
self["metafile_write_paths"] = write_paths
|
823
|
+
|
824
|
+
@locator_write_paths.setter
|
825
|
+
def locator_write_paths(self, write_paths: List[str]):
|
826
|
+
self["locator_write_paths"] = write_paths
|
827
|
+
|
828
|
+
|
829
|
+
class TransactionOperationList(List[TransactionOperation]):
|
830
|
+
@staticmethod
|
831
|
+
def of(items: List[TransactionOperation]) -> TransactionOperationList:
|
832
|
+
typed_items = TransactionOperationList()
|
833
|
+
for item in items:
|
834
|
+
if item is not None and not isinstance(item, TransactionOperation):
|
835
|
+
item = TransactionOperation(item)
|
836
|
+
typed_items.append(item)
|
837
|
+
return typed_items
|
838
|
+
|
839
|
+
def __getitem__(self, item):
|
840
|
+
val = super().__getitem__(item)
|
841
|
+
if val is not None and not isinstance(val, TransactionOperation):
|
842
|
+
self[item] = val = TransactionOperation(val)
|
843
|
+
return val
|
844
|
+
|
845
|
+
def __iter__(self):
|
846
|
+
"""Support enumeration by returning TransactionOperation objects."""
|
847
|
+
for i in range(len(self)):
|
848
|
+
yield self[i] # This triggers __getitem__ conversion
|
849
|
+
|
850
|
+
|
851
|
+
class Transaction(dict):
|
852
|
+
"""
|
853
|
+
Base class for DeltaCAT transactions.
|
854
|
+
"""
|
855
|
+
|
856
|
+
@staticmethod
|
857
|
+
def of(
|
858
|
+
txn_operations: Optional[TransactionOperationList] = None,
|
859
|
+
commit_message: Optional[str] = None,
|
860
|
+
) -> Transaction:
|
861
|
+
if txn_operations is None:
|
862
|
+
txn_operations = []
|
863
|
+
transaction = Transaction()
|
864
|
+
transaction.operations = txn_operations
|
865
|
+
transaction.interactive = len(txn_operations) == 0
|
866
|
+
if commit_message:
|
867
|
+
transaction.commit_message = commit_message
|
868
|
+
return transaction
|
869
|
+
|
870
|
+
```python
    @staticmethod
    def read_end_time(
        path: str,
        filesystem: Optional[pyarrow.fs.FileSystem] = None,
    ) -> Optional[int]:
        """
        Returns the end time of the transaction, or None if the transaction
        log file does not exist.
        :param path: Transaction log path to read.
        :param filesystem: File system to use for reading the transaction log.
        :return: End time of the transaction, or None if no log file is found.
        """
        # TODO(pdames): Validate that input file path is a valid txn log.
        if not filesystem:
            path, filesystem = resolve_path_and_filesystem(path, filesystem)
        file_info_and_sizes = list_directory(
            path=path,
            filesystem=filesystem,
            ignore_missing_path=True,
        )
        end_time = None
        if file_info_and_sizes:
            if len(file_info_and_sizes) > 1:
                raise ValueError(
                    f"Expected to find only one transaction log at {path}, "
                    f"but found {len(file_info_and_sizes)}"
                )
            end_time = Transaction._parse_end_time(file_info_and_sizes[0][0])
        return end_time

    @staticmethod
    def _parse_end_time(txn_log_file_name_or_path: str) -> int:
        return int(posixpath.basename(txn_log_file_name_or_path))
```
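
End times are encoded directly in the transaction log's file name, so `_parse_end_time` only needs the basename. An illustrative call (the timestamp value is made up):

```python
# Illustrative: the log file's basename is the commit end time in epoch ns.
assert Transaction._parse_end_time("/any/dir/1700000000000005000") == (
    1700000000000005000
)
```
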
```python
    @classmethod
    def read(
        cls,
        path: str,
        filesystem: Optional[pyarrow.fs.FileSystem] = None,
    ) -> Transaction:
        """
        Read a Transaction file and return the deserialized object.
        :param path: Transaction file path to read.
        :param filesystem: File system to use for reading the Transaction file.
        :return: Deserialized object from the Transaction file.
        """
        if not filesystem:
            path, filesystem = resolve_path_and_filesystem(path, filesystem)
        with filesystem.open_input_stream(path) as file:
            binary = file.readall()
        obj = cls(**msgpack.loads(binary))
        return obj

    @staticmethod
    def read_time_provider(provider_name: str):
        """
        Given the string name of a time provider class, return a new instance
        of it. Raises ValueError if the provider name is unknown.
        """
        TIME_PROVIDER_CLASSES = {
            "TransactionSystemTimeProvider": TransactionSystemTimeProvider,
            # Add additional mappings as needed
        }

        provider_cls = TIME_PROVIDER_CLASSES.get(provider_name)
        if provider_cls is None:
            raise ValueError(f"Unknown time provider: {provider_name}")

        return provider_cls()

    @property
    def id(self) -> Optional[str]:
        """
        Returns this transaction's unique ID assigned at commit start time, or
        None if the unique ID has not yet been assigned.
        """
        _id = self.get("id")
        if not _id and self.start_time:
            _id = self["id"] = f"{self.start_time}{TXN_PART_SEPARATOR}{uuid.uuid4()}"
        return _id

    def state(
        self,
        catalog_root_dir: str,
        filesystem: Optional[pyarrow.fs.FileSystem] = None,
    ) -> Optional[TransactionState]:
        """
        Infer the transaction state based on its presence in different
        lifecycle directories under the catalog's txn log directory.
        """
        txn_name = self.id

        catalog_root_normalized, filesystem = resolve_path_and_filesystem(
            catalog_root_dir,
            filesystem,
        )

        txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
        running_txn_log_dir = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME)
        filesystem.create_dir(running_txn_log_dir, recursive=True)
        failed_txn_log_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)
        filesystem.create_dir(failed_txn_log_dir, recursive=False)
        success_txn_log_dir = posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME)
        filesystem.create_dir(success_txn_log_dir, recursive=False)
        paused_txn_log_dir = posixpath.join(txn_log_dir, PAUSED_TXN_DIR_NAME)
        filesystem.create_dir(paused_txn_log_dir, recursive=False)

        # Check if the transaction file exists in the failed directory
        in_failed = os.path.exists(os.path.join(failed_txn_log_dir, txn_name))

        # Check if the transaction file exists in the running directory
        in_running = os.path.exists(os.path.join(running_txn_log_dir, txn_name))

        # Check if the transaction file exists in the success directory
        in_success = os.path.exists(os.path.join(success_txn_log_dir, txn_name))

        # Check if the transaction file exists in the paused directory
        in_paused = os.path.exists(os.path.join(paused_txn_log_dir, txn_name))

        if in_failed and in_running:
            return TransactionState.FAILED
        elif in_failed and not in_running:
            return TransactionState.PURGED
        elif in_success:
            return TransactionState.SUCCESS
        elif in_running:
            return TransactionState.RUNNING
        elif in_paused:
            return TransactionState.PAUSED
        return None
```
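
Because each lifecycle directory holds at most one entry per transaction ID, state inference reduces to existence checks; note the FAILED vs. PURGED distinction (a failed log with no running log means cleanup already ran). A usage sketch, assuming `txn` previously ran against the same catalog root:

```python
state = txn.state("/tmp/catalog")  # illustrative catalog root
if state == TransactionState.RUNNING:
    print(f"transaction {txn.id} is still in flight")
elif state == TransactionState.FAILED:
    print(f"transaction {txn.id} failed; partial files may remain")
```
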
```python
    @property
    def operations(self) -> TransactionOperationList:
        """
        Returns the list of transaction operations.
        """
        return TransactionOperationList(self["operations"])

    @operations.setter
    def operations(self, operations: TransactionOperationList):
        self["operations"] = operations

    @property
    def metafile_write_paths(self) -> List[str]:
        return [path for op in self.operations for path in op.metafile_write_paths]

    @property
    def locator_write_paths(self) -> List[str]:
        return [path for op in self.operations for path in op.locator_write_paths]

    @property
    def catalog_root_normalized(self) -> Optional[str]:
        """
        Returns the normalized catalog root directory for this transaction.
        """
        return self.get("catalog_root_normalized")

    @catalog_root_normalized.setter
    def catalog_root_normalized(self, path: str):
        self["catalog_root_normalized"] = path

    @property
    def _time_provider(self) -> Optional[TransactionSystemTimeProvider]:
        """
        Returns the time provider of the transaction.
        """
        return self.get("_time_provider")

    @_time_provider.setter
    def _time_provider(self, tp: TransactionSystemTimeProvider) -> None:
        self["_time_provider"] = tp

    @property
    def start_time(self) -> Optional[int]:
        """
        Returns the start time of the transaction.
        """
        return self.get("start_time")

    @property
    def pause_time(self) -> Optional[int]:
        """
        Returns the last pause time of the transaction.
        """
        return self.get("pause_time")

    @property
    def end_time(self) -> Optional[int]:
        """
        Returns the end time of the transaction.
        """
        return self.get("end_time")

    @property
    def commit_message(self) -> Optional[str]:
        """
        Returns the commit message for the transaction.
        """
        return self.get("commit_message")

    @commit_message.setter
    def commit_message(self, message: str):
        """
        Sets the commit message for the transaction.
        """
        self["commit_message"] = message

    @property
    def historic_timestamp(self) -> Optional[int]:
        """
        Returns the historic timestamp for the transaction.
        """
        return self.get("historic_timestamp")

    @historic_timestamp.setter
    def historic_timestamp(self, timestamp: int):
        """
        Sets the historic timestamp for the transaction.
        """
        self["historic_timestamp"] = timestamp

    def _mark_start_time(self, time_provider: TransactionTimeProvider) -> int:
        """
        Sets the start time of the transaction using the given
        TransactionTimeProvider. Raises a runtime error if the transaction
        start time has already been set by a previous commit.
        """
        if self.get("start_time"):
            raise RuntimeError("Cannot restart a previously started transaction.")
        start_time = self["start_time"] = time_provider.start_time()
        return start_time

    def _mark_end_time(self, time_provider: TransactionTimeProvider) -> int:
        """
        Sets the end time of the transaction using the given
        TransactionTimeProvider. Raises a runtime error if the transaction end
        time has already been set by a previous commit, or if the transaction
        start time has not been set.
        """
        if not self.get("start_time"):
            raise RuntimeError("Cannot end an unstarted transaction.")
        if self.get("end_time"):
            raise RuntimeError("Cannot end a completed transaction.")
        end_time = self["end_time"] = time_provider.end_time()
        return end_time

    def _mark_pause_time(self, time_provider: TransactionTimeProvider) -> int:
        """
        Sets the pause time of the transaction using the given
        TransactionTimeProvider. Raises a runtime error if the transaction has
        already completed, or if the transaction start time has not been set.
        """
        if not self.get("start_time"):
            raise RuntimeError("Cannot pause an unstarted transaction.")
        if self.get("end_time"):
            raise RuntimeError("Cannot pause a completed transaction.")
        # Pause timestamps are drawn from the same clock as end times.
        pause_time = self["pause_time"] = time_provider.end_time()
        return pause_time

    @staticmethod
    def _abs_txn_meta_path_to_relative(root: str, target: str) -> str:
        """
        Takes an absolute root directory path and a target absolute path to
        relativize with respect to the root directory. Returns the target
        path relative to the root directory path. Raises an error if the
        target path is not contained in the given root directory path, if
        either path is not an absolute path, or if the target path is equal
        to the root directory path.
        """
        root_path = PosixPath(root)
        target_path = PosixPath(target)
        # TODO (martinezdavid): Check why is_absolute() fails for certain Delta paths
        # if not root_path.is_absolute() or not target_path.is_absolute():
        #     raise ValueError("Both root and target must be absolute paths.")
        if root_path == target_path:
            raise ValueError(
                "Target and root are identical, but expected target to be a child of root."
            )
        try:
            relative_path = target_path.relative_to(root_path)
        except ValueError:
            raise ValueError("Expected target to be a child of root.")
        return str(relative_path)
```
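
A quick worked example of the relativization rule, with illustrative paths:

```python
rel = Transaction._abs_txn_meta_path_to_relative(
    "/tmp/catalog",                                       # illustrative root
    "/tmp/catalog/ns/my_table/00000000000000000001.mpk",  # illustrative target
)
assert rel == "ns/my_table/00000000000000000001.mpk"
```
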
```python
    def relativize_operation_paths(
        self, operation: TransactionOperation, catalog_root: str
    ) -> None:
        """
        Converts all absolute paths in an operation to relative paths
        with respect to the catalog root directory.
        """
        # handle metafile paths
        if operation.metafile_write_paths:
            metafile_write_paths = [
                Transaction._abs_txn_meta_path_to_relative(catalog_root, path)
                for path in operation.metafile_write_paths
            ]
            operation.metafile_write_paths = metafile_write_paths
        # handle locator paths
        if operation.locator_write_paths:
            locator_write_paths = [
                Transaction._abs_txn_meta_path_to_relative(catalog_root, path)
                for path in operation.locator_write_paths
            ]
            operation.locator_write_paths = locator_write_paths

    def to_serializable(self, catalog_root: str) -> Transaction:
        """
        Prepare the object for serialization by converting any non-serializable
        types to serializable types. May also run any required pre-write
        validations on the serialized or deserialized object.
        :return: a serializable version of the object
        """
        # Only copy dictionary keys - all other members should not be serialized
        serializable = Transaction({})
        for key, value in self.items():
            serializable[key] = copy.deepcopy(value)

        # remove all src/dest metafile contents except IDs and locators to
        # reduce file size (they can be reconstructed from their corresponding
        # files as required).
        for operation in serializable.operations:
            # Sanity check that IDs exist on source and dest metafiles
            if operation.dest_metafile and operation.dest_metafile.id is None:
                raise ValueError(
                    f"Transaction operation {operation} dest metafile does "
                    f"not have ID: {operation.dest_metafile}"
                )
            if operation.src_metafile and operation.src_metafile.id is None:
                raise ValueError(
                    f"Transaction operation {operation} src metafile does "
                    f"not have ID: {operation.src_metafile}"
                )
            # relativize after checking that dest and src metafiles are valid
            self.relativize_operation_paths(operation, catalog_root)
            operation.dest_metafile = {
                "id": operation.dest_metafile.id,
                "locator": operation.dest_metafile.locator,
                "locator_alias": operation.dest_metafile.locator_alias,
            }
            if operation.src_metafile:
                operation.src_metafile = {
                    "id": operation.src_metafile.id,
                    "locator": operation.src_metafile.locator,
                    "locator_alias": operation.src_metafile.locator_alias,
                }
        # TODO(pdames): Ensure that all file paths recorded are relative to the
        #  catalog root.

        # TODO: check if we care about order or exact time stamps --> pickling time_provider?
        # serializable.pop("_time_provider", None)

        serializable["_time_provider"] = {
            "type": type(self._time_provider).__name__,
            "params": {},
        }

        serializable.catalog_root_normalized = self.catalog_root_normalized

        return serializable
```
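
The serializable form is what `pause()`, `_write_running_log()`, and `_seal_steps()` persist via msgpack, and what `Transaction.read()` later rehydrates. A minimal round-trip sketch, assuming `txn` is a started transaction and the root is illustrative:

```python
root = "/tmp/catalog"
payload = msgpack.dumps(txn.to_serializable(root))
restored = Transaction(**msgpack.loads(payload))
assert restored.start_time == txn.start_time
assert restored.commit_message == txn.commit_message
```
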
```python
    @staticmethod
    def _validate_txn_log_file(success_txn_log_file: str) -> None:
        txn_log_dir_name = posixpath.basename(posixpath.dirname(success_txn_log_file))
        txn_log_parts = txn_log_dir_name.split(TXN_PART_SEPARATOR)
        # ensure that the transaction start time is valid
        try:
            start_time = int(txn_log_parts[0])
        except ValueError as e:
            raise ValueError(
                f"Transaction log file `{success_txn_log_file}` does not "
                f"contain a valid start time."
            ) from e
        # ensure that the txn uuid is valid
        txn_uuid_str = txn_log_parts[1]
        try:
            uuid.UUID(txn_uuid_str)
        except ValueError as e:
            raise OSError(
                f"Transaction log file `{success_txn_log_file}` does not "
                f"contain a valid UUID string."
            ) from e
        # ensure that the transaction end time is valid
        try:
            end_time = Transaction._parse_end_time(success_txn_log_file)
        except ValueError as e:
            raise ValueError(
                f"Transaction log file `{success_txn_log_file}` does not "
                f"contain a valid end time."
            ) from e
        # ensure transaction end time was not recorded before start time
        if end_time < start_time:
            raise OSError(
                f"Transaction end time {end_time} is earlier than start "
                f"time {start_time}! To preserve catalog integrity, the "
                f"corresponding completed transaction log at "
                f"`{success_txn_log_file}` has been removed."
            )
```
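
Validation leans entirely on the success log layout: the parent directory name is the transaction ID (`<start_time><separator><uuid>`) and the file name is the end time. An illustrative layout that would pass validation, assuming the part separator is `_`:

```python
# Illustrative only; assumes TXN_PART_SEPARATOR == "_".
success_log = (
    "/tmp/catalog/txn/success/"
    "1700000000000000000_3f2d9b2e-8c5a-4f5e-9d1a-2b7c8d9e0f11/"
    "1700000000000005000"
)
Transaction._validate_txn_log_file(success_txn_log_file=success_log)  # no error
```
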
```python
    def commit(
        self,
        catalog_root_dir: str,
        filesystem: Optional[pyarrow.fs.FileSystem] = None,
    ) -> Union[
        List[ListResult[Metafile]],
        Tuple[List[str], str],
        Tuple[List["ListResult[Metafile]"], List[str], str],
    ]:
        """
        Legacy wrapper that preserves the original `commit()` contract while
        delegating the heavy lifting to the incremental helpers.

        Returns
        -------
        - For READ transactions: List[ListResult[Metafile]]
        - For WRITE transactions: Tuple[List[str], str]
          (list of successful write paths, path to the success txn log file)
        - For mixed READ/WRITE transactions:
          Tuple[List["ListResult[Metafile]"], List[str], str]
        """
        if hasattr(self, "interactive") and self.interactive:
            raise RuntimeError(
                "Cannot commit an interactive transaction. Use "
                "transaction.start(), transaction.step(), and "
                "transaction.seal() instead."
            )

        if self.operations and len(self.operations) > 0:
            # Start a working copy (deep copy, directory scaffolding, start
            # time, running/failed/success/paused dirs …)
            txn_active = self.start(catalog_root_dir, filesystem)
            # Sequentially execute every TransactionOperation
            for op in txn_active.operations:
                txn_active.step(op)
            return txn_active._seal_steps()
        # An empty non-interactive transaction is a no-op and returns None.
```
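
A batch usage sketch; `ops` stands in for a previously built `TransactionOperationList` of write operations, and the catalog root and message are illustrative:

```python
txn = Transaction.of(txn_operations=ops, commit_message="register new delta")
write_paths, success_log_path = txn.commit("/tmp/catalog")
print(f"committed {len(write_paths)} metafiles; log at {success_log_path}")
```
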
```python
    def start(
        self,
        catalog_root_dir: str,
        filesystem: Optional[pyarrow.fs.FileSystem] = None,
        historic_timestamp: Optional[int] = None,
    ) -> "Transaction":
        """
        Create directory scaffolding, timestamp the txn, and return a DEEP COPY
        that the caller should use for all subsequent calls to step(), pause(),
        and seal(). The original object remains read-only.

        Args:
            catalog_root_dir: Root directory for the catalog
            filesystem: Optional filesystem to use
            historic_timestamp: Optional timestamp in nanoseconds since epoch
                for snapshot isolation
        """
        # Create a deep copy
        txn: "Transaction" = copy.deepcopy(self)

        # Set up time provider based on transaction type
        if historic_timestamp is not None:
            # Use historic time provider for snapshot isolation
            # TODO(pdames): Set base time provider to the catalog's configured
            #  time provider when more than one is supported.
            txn._time_provider = TransactionHistoricTimeProvider(
                historic_timestamp,
                TransactionSystemTimeProvider(),
            )
            txn.historic_timestamp = historic_timestamp
        else:
            # Use system time provider for regular transactions
            txn._time_provider = TransactionSystemTimeProvider()

        txn._mark_start_time(txn._time_provider)  # start time on the deep copy

        # Set up filesystem and directories
        catalog_root_normalized, filesystem = resolve_path_and_filesystem(
            catalog_root_dir,
            filesystem,
        )
        txn.catalog_root_normalized = catalog_root_normalized
        txn._filesystem = filesystem  # keep for pause/resume
        txn.running_log_written = False  # internal flags
        txn._list_results = []

        # Make sure txn/ directories exist (idempotent)
        txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
        filesystem.create_dir(
            posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME),
            recursive=True,
        )
        for subdir in (FAILED_TXN_DIR_NAME, SUCCESS_TXN_DIR_NAME, PAUSED_TXN_DIR_NAME):
            try:
                filesystem.create_dir(
                    posixpath.join(txn_log_dir, subdir),
                    recursive=False,
                )
            except FileExistsError:
                pass  # allowed when catalog already initialised
        return txn

    def step(
        self,
        operation: "TransactionOperation",
    ) -> Union[ListResult[Metafile], Tuple[List[str], List[str]]]:
        """
        Executes a single transaction operation.

        Parameters
        ----------
        operation: TransactionOperation
            The transaction operation to execute.

        Returns
        -------
        - For a READ transaction operation: ListResult[Metafile]
        - For a WRITE transaction operation: Tuple[List[str], List[str]]
          (list of successful metafile write paths, list of successful
          locator write paths)
        """
        catalog_root_normalized = self.catalog_root_normalized
        filesystem = self._filesystem
        txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)

        running_txn_log_file_path = posixpath.join(
            txn_log_dir, RUNNING_TXN_DIR_NAME, self.id
        )

        # Validate read-only transaction constraints
        if self.historic_timestamp is not None:
            if not operation.type.is_read_operation():
                raise RuntimeError(
                    f"Cannot perform {operation.type.value} operation in a "
                    f"read-only historic transaction."
                )

        # Add new operation to the transaction's list of operations
        if self.interactive:
            self.operations = self.operations + [operation]

        # (a) READ txn op
        if operation.type.is_read_operation():
            list_result = operation.dest_metafile.read_txn(
                catalog_root_dir=catalog_root_normalized,
                success_txn_log_dir=posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME),
                current_txn_op=operation,
                current_txn_start_time=self.start_time,
                current_txn_id=self.id,
                filesystem=filesystem,
            )
            self._list_results.append(list_result)
            return list_result

        # (b) WRITE txn op
        # First operation? -> create the running log so an external janitor
        # can see that a txn is in-flight.
        if not self.running_log_written:
            self._write_running_log(running_txn_log_file_path)

        try:
            (
                metafile_write_paths,
                locator_write_paths,
            ) = operation.dest_metafile.write_txn(
                catalog_root_dir=catalog_root_normalized,
                success_txn_log_dir=posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME),
                current_txn_op=operation,
                current_txn_start_time=self.start_time,
                current_txn_id=self.id,
                filesystem=filesystem,
            )
            # Check for concurrent txn conflicts on the metafile and locator
            # write paths just written.
            # TODO(pdames): Remove the fast-fail check here if it grows too
            #  expensive?
            for path in metafile_write_paths + locator_write_paths:
                MetafileRevisionInfo.check_for_concurrent_txn_conflict(
                    success_txn_log_dir=posixpath.join(
                        txn_log_dir,
                        SUCCESS_TXN_DIR_NAME,
                    ),
                    current_txn_revision_file_path=path,
                    filesystem=filesystem,
                )
            return metafile_write_paths, locator_write_paths
        except Exception:
            # convert in-flight txn → FAILED and clean up partial files
            self._fail_and_cleanup(
                failed_txn_log_dir=posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME),
                running_log_path=running_txn_log_file_path,
            )
            raise  # surface the original error

    def pause(self) -> None:
        fs = self._filesystem
        root = self.catalog_root_normalized
        txn_log_dir = posixpath.join(root, TXN_DIR_NAME)

        running_path = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME, self.id)
        paused_path = posixpath.join(txn_log_dir, PAUSED_TXN_DIR_NAME, self.id)

        fs.create_dir(posixpath.dirname(paused_path), recursive=True)

        # Record pause time (e.g., for time consistency guarantees)
        self._mark_pause_time(self._time_provider)

        # Serialize current transaction state into paused/<txn_id>
        with fs.open_output_stream(paused_path) as f:
            f.write(msgpack.dumps(self.to_serializable(root)))

        # Clean up original running log
        fs.delete_file(running_path)

    def resume(self) -> None:
        fs = self._filesystem
        root = self.catalog_root_normalized
        txn_log_dir = posixpath.join(root, TXN_DIR_NAME)

        running_path = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME, self.id)
        paused_path = posixpath.join(txn_log_dir, PAUSED_TXN_DIR_NAME, self.id)

        # Load serialized transaction state
        with fs.open_input_stream(paused_path) as f:
            loaded_txn_data = msgpack.loads(f.readall())

        # Restore relevant fields by making the current txn mirror the
        # restored one.
        restored_txn = Transaction(**loaded_txn_data)
        self.__dict__.update(restored_txn.__dict__)

        # Supports restoring time provider state if we ever add
        # non-ephemeral time providers.
        new_provider = Transaction.read_time_provider(
            restored_txn["_time_provider"]["type"]
        )

        # evaluate the system clock; the original start time is preserved
        now = new_provider.start_time()
        self._time_provider = new_provider
        if now < self.pause_time:
            raise RuntimeError(
                f"System clock {now} is behind paused transaction pause "
                f"time {self.pause_time}"
            )
        # TODO: set a new start time or keep raising if the clock is off?

        # Move back to running state
        fs.create_dir(posixpath.dirname(running_path), recursive=True)
        with fs.open_output_stream(running_path) as f:
            f.write(msgpack.dumps(self.to_serializable(root)))
        fs.delete_file(paused_path)
```
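
A pause/resume lifecycle sketch, assuming `txn` is an interactive transaction previously returned by `start()` and `read_op` is a pre-built read operation:

```python
txn.step(read_op)   # execute one operation
txn.pause()         # running/<id> moves to paused/<id>, pause time recorded
# ... later, potentially from another worker sharing the catalog root ...
txn.resume()        # paused/<id> moves back to running/<id>
results = txn.seal()
```
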
```python
    def seal(
        self,
    ) -> Union[
        List["ListResult[Metafile]"],
        Tuple[List[str], str],
        Tuple[List["ListResult[Metafile]"], List[str], str],
    ]:
        """
        For READ transactions, returns the list results collected during
        step(). For WRITE transactions, returns (written paths, success log
        path).
        """
        if not self.interactive:
            raise RuntimeError(
                "Cannot seal a non-interactive transaction. Call "
                "transaction.commit() instead."
            )

        # Read-only transactions can only perform read operations
        if self.historic_timestamp is not None:
            if self._has_write_operations():
                raise RuntimeError(
                    "Cannot seal a read-only historic transaction that "
                    "contains write operations."
                )

        return self._seal_steps()
```
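
Putting the interactive pieces together; `write_op` and `read_op` stand in for previously constructed `TransactionOperation` instances, and the catalog root is illustrative:

```python
txn = Transaction.of().start("/tmp/catalog")         # deep copy + scaffolding
metafile_paths, locator_paths = txn.step(write_op)   # write op -> write paths
listing = txn.step(read_op)                          # read op -> ListResult
list_results, write_paths, success_log = txn.seal()  # mixed read/write triple
```
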
```python
    def _has_write_operations(self) -> bool:
        """
        Check if the transaction contains any write operations.
        Read-only transactions should only contain READ operations.
        """
        for operation in self.operations:
            if not operation.type.is_read_operation():
                return True
        return False

    def _seal_steps(
        self,
    ) -> Union[
        List["ListResult[Metafile]"],
        Tuple[List[str], str],
        Tuple[List["ListResult[Metafile]"], List[str], str],
    ]:
        fs = self._filesystem
        root = self.catalog_root_normalized
        txn_log_dir = posixpath.join(root, TXN_DIR_NAME)
        end_time = self._mark_end_time(self._time_provider)

        # READ path: nothing persisted, so we are done
        if all(op.type.is_read_operation() for op in self.operations):
            return self._list_results

        running_path = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME, self.id)
        failed_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)
        success_dir = posixpath.join(txn_log_dir, SUCCESS_TXN_DIR_NAME)

        # If no operations ever succeeded we still need a running log.
        if not self.running_log_written:
            self._write_running_log(running_path)
        try:
            # Check for concurrent txn conflicts on metafile and locator
            # write paths.
            for path in self.metafile_write_paths + self.locator_write_paths:
                MetafileRevisionInfo.check_for_concurrent_txn_conflict(
                    success_txn_log_dir=posixpath.join(
                        txn_log_dir, SUCCESS_TXN_DIR_NAME
                    ),
                    current_txn_revision_file_path=path,
                    filesystem=fs,
                )
        except Exception:
            self._fail_and_cleanup(
                failed_txn_log_dir=failed_dir,
                running_log_path=running_path,
            )
            # raise the original error
            raise
        success_log_path = None
        try:
            # write transaction log
            success_txn_dir = posixpath.join(success_dir, self.id)
            fs.create_dir(success_txn_dir, recursive=False)

            success_log_path = posixpath.join(success_txn_dir, str(end_time))
            with fs.open_output_stream(success_log_path) as f:
                f.write(msgpack.dumps(self.to_serializable(root)))

            Transaction._validate_txn_log_file(success_txn_log_file=success_log_path)
        except Exception as e1:
            self._fail_and_cleanup(
                failed_txn_log_dir=failed_dir,
                running_log_path=running_path,
                success_log_path=success_log_path,
            )
            raise RuntimeError(
                f"Transaction validation failed. To preserve catalog integrity, "
                f"the corresponding completed transaction log at "
                f"`{success_log_path}` has been removed."
            ) from e1
        else:
            fs.delete_file(running_path)
            if all(op.type.is_write_operation() for op in self.operations):
                # pure write transaction - just return write paths and success log path
                return self.metafile_write_paths, success_log_path
            else:
                # mixed read/write transaction - return read results, write paths, and success log path
                return self._list_results, self.metafile_write_paths, success_log_path

    # Helper: write or overwrite the running/<ID> file exactly once
    def _write_running_log(self, running_log_path: str) -> None:
        with self._filesystem.open_output_stream(running_log_path) as f:
            f.write(msgpack.dumps(self.to_serializable(self.catalog_root_normalized)))
        self.running_log_written = True

    # Helper: mark txn FAILED and clean partial output
    def _fail_and_cleanup(
        self,
        failed_txn_log_dir: str,
        running_log_path: str,
        success_log_path: Optional[str] = None,
    ) -> None:
        fs = self._filesystem

        # 1. write failed/<ID>
        failed_log_path = posixpath.join(failed_txn_log_dir, self.id)
        with fs.open_output_stream(failed_log_path) as f:
            f.write(msgpack.dumps(self.to_serializable(self.catalog_root_normalized)))

        # 2. delete all provisional files
        for path in self.metafile_write_paths:
            try:
                fs.delete_file(path)
            except Exception:
                pass  # best-effort; janitor job will catch leftovers
        for path in self.locator_write_paths:
            try:
                fs.delete_file(path)
            except Exception:
                pass  # best-effort; janitor job will catch leftovers

        # 3. tidy up bookkeeping logs
        try:
            fs.delete_file(running_log_path)
        except Exception:
            pass
        if success_log_path:
            try:
                fs.delete_file(success_log_path)
            except Exception:
                pass

    def __enter__(self) -> "Transaction":
        """
        Context manager entry point. Sets this transaction as the current
        context. Supports nested transactions by preserving the context stack.
        """
        if not hasattr(self, "interactive") or not self.interactive:
            raise RuntimeError(
                "Transaction must be interactive to use with context manager. "
                "Use dc.transaction() to create an interactive transaction."
            )
        if self.start_time is None:
            raise RuntimeError(
                "Transaction has not been started. "
                "Use dc.transaction() to create a properly initialized transaction."
            )

        # Store the context token for restoration in __exit__
        self._context_token = set_current_transaction(self)
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None:
        """
        Context manager exit point. Restores the previous transaction context
        and automatically seals the transaction on successful completion, or
        fails it if an exception occurred.

        Args:
            exc_type: Exception type if an exception occurred, None otherwise
            exc_value: Exception value if an exception occurred, None otherwise
            traceback: Exception traceback if an exception occurred, None otherwise
        """
        try:
            if exc_type is None and exc_value is None and traceback is None:
                # No exception occurred - seal the transaction
                self.seal()
            else:
                # Exception occurred during transaction - fail and cleanup
                try:
                    catalog_root_normalized = self.catalog_root_normalized
                    txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
                    running_txn_log_file_path = posixpath.join(
                        txn_log_dir, RUNNING_TXN_DIR_NAME, self.id
                    )
                    self._fail_and_cleanup(
                        failed_txn_log_dir=posixpath.join(
                            txn_log_dir, FAILED_TXN_DIR_NAME
                        ),
                        running_log_path=running_txn_log_file_path,
                    )
                except Exception:
                    # If cleanup fails, still let the original exception propagate
                    pass
        finally:
            # Always restore the previous transaction context using the token
            if hasattr(self, "_context_token"):
                try:
                    # Get the previous value from the token
                    old_value = self._context_token.old_value
                    # Only set if the old value is a valid transaction or None
                    if old_value is None or isinstance(old_value, Transaction):
                        _current_transaction.set(old_value)
                    else:
                        # If old_value is not valid (e.g., Token.MISSING), set to None
                        _current_transaction.set(None)
                except (AttributeError, LookupError):
                    # If token doesn't have old_value or context is corrupted, clear it
                    try:
                        _current_transaction.set(None)
                    except LookupError:
                        pass
```
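
The dunder pair above is what enables `with`-style usage: entry publishes the transaction to the `_current_transaction` context variable, and exit either seals it or fails it before restoring the previous context. A usage sketch, assuming `dc.transaction()` is the helper referenced in the error messages above and that it returns a started interactive transaction:

```python
import deltacat as dc

with dc.transaction() as txn:
    ...  # catalog reads/writes issued here run as steps of `txn`
# normal exit seals the transaction; an exception fails it and cleans up
# its provisional metafile/locator writes
```
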