deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/storage/main/impl.py
CHANGED
@@ -1,13 +1,39 @@
|
|
1
|
+
import logging
|
1
2
|
import uuid
|
3
|
+
import posixpath
|
4
|
+
import pyarrow
|
2
5
|
|
3
6
|
from typing import Any, Callable, Dict, List, Optional, Union, Tuple
|
4
7
|
|
5
|
-
from deltacat.catalog import get_catalog_properties
|
6
|
-
from deltacat.constants import
|
7
|
-
|
8
|
+
from deltacat.catalog.model.properties import get_catalog_properties
|
9
|
+
from deltacat.constants import (
|
10
|
+
DEFAULT_TABLE_VERSION,
|
11
|
+
DATA_FILE_DIR_NAME,
|
12
|
+
)
|
13
|
+
from deltacat.exceptions import (
|
14
|
+
TableNotFoundError,
|
15
|
+
TableVersionNotFoundError,
|
16
|
+
DeltaCatError,
|
17
|
+
UnclassifiedDeltaCatError,
|
18
|
+
SchemaValidationError,
|
19
|
+
StreamNotFoundError,
|
20
|
+
PartitionNotFoundError,
|
21
|
+
DeltaNotFoundError,
|
22
|
+
NamespaceNotFoundError,
|
23
|
+
TableValidationError,
|
24
|
+
ConcurrentModificationError,
|
25
|
+
ObjectAlreadyExistsError,
|
26
|
+
NamespaceAlreadyExistsError,
|
27
|
+
TableAlreadyExistsError,
|
28
|
+
TableVersionAlreadyExistsError,
|
29
|
+
ObjectNotFoundError,
|
30
|
+
)
|
8
31
|
from deltacat.storage.model.manifest import (
|
9
32
|
EntryParams,
|
33
|
+
EntryType,
|
10
34
|
ManifestAuthor,
|
35
|
+
ManifestEntryList,
|
36
|
+
ManifestEntry,
|
11
37
|
)
|
12
38
|
from deltacat.storage.model.delta import (
|
13
39
|
Delta,
|
@@ -15,13 +41,13 @@ from deltacat.storage.model.delta import (
|
|
15
41
|
DeltaProperties,
|
16
42
|
DeltaType,
|
17
43
|
)
|
44
|
+
from deltacat.storage.model.transaction import setup_transaction
|
18
45
|
from deltacat.storage.model.types import (
|
19
46
|
CommitState,
|
20
47
|
DistributedDataset,
|
21
48
|
LifecycleState,
|
22
49
|
LocalDataset,
|
23
50
|
LocalTable,
|
24
|
-
TransactionType,
|
25
51
|
TransactionOperationType,
|
26
52
|
StreamFormat,
|
27
53
|
)
|
@@ -36,14 +62,13 @@ from deltacat.storage.model.partition import (
|
|
36
62
|
PartitionLocator,
|
37
63
|
PartitionScheme,
|
38
64
|
PartitionValues,
|
65
|
+
UNPARTITIONED_SCHEME,
|
39
66
|
UNPARTITIONED_SCHEME_ID,
|
40
|
-
PartitionLocatorAlias,
|
41
|
-
)
|
42
|
-
from deltacat.storage.model.schema import (
|
43
|
-
Schema,
|
44
67
|
)
|
68
|
+
from deltacat.storage.model.schema import Schema
|
45
69
|
from deltacat.storage.model.sort_key import (
|
46
70
|
SortScheme,
|
71
|
+
UNSORTED_SCHEME,
|
47
72
|
)
|
48
73
|
from deltacat.storage.model.stream import (
|
49
74
|
Stream,
|
@@ -65,52 +90,95 @@ from deltacat.storage.model.metafile import (
|
|
65
90
|
from deltacat.storage.model.transaction import (
|
66
91
|
TransactionOperation,
|
67
92
|
Transaction,
|
68
|
-
TransactionOperationList,
|
69
93
|
)
|
70
94
|
from deltacat.storage.model.manifest import Manifest
|
71
95
|
from deltacat.types.media import (
|
72
96
|
ContentType,
|
97
|
+
DatasetType,
|
73
98
|
DistributedDatasetType,
|
74
99
|
StorageType,
|
75
|
-
|
100
|
+
ContentEncoding,
|
76
101
|
)
|
77
102
|
from deltacat.utils.common import ReadKwargsProvider
|
103
|
+
import pyarrow as pa
|
104
|
+
|
105
|
+
from deltacat.types.tables import (
|
106
|
+
TableProperty,
|
107
|
+
get_table_writer,
|
108
|
+
get_table_slicer,
|
109
|
+
write_sliced_table,
|
110
|
+
download_manifest_entries,
|
111
|
+
download_manifest_entries_distributed,
|
112
|
+
download_manifest_entry,
|
113
|
+
)
|
114
|
+
from deltacat import logs
|
115
|
+
|
116
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
117
|
+
|
118
|
+
|
119
|
+
def _normalize_partition_values(
|
120
|
+
partition_values: Optional[PartitionValues],
|
121
|
+
) -> Optional[PartitionValues]:
|
122
|
+
"""
|
123
|
+
Normalize partition values to ensure consistent representation of unpartitioned data.
|
124
|
+
|
125
|
+
Both None and empty list [] represent unpartitioned data, but they should be
|
126
|
+
normalized to None for consistent lookup and validation.
|
127
|
+
|
128
|
+
Args:
|
129
|
+
partition_values: The partition values to normalize
|
130
|
+
|
131
|
+
Returns:
|
132
|
+
None for unpartitioned data (both None and [] inputs),
|
133
|
+
original value for partitioned data
|
134
|
+
"""
|
135
|
+
if partition_values is None or (
|
136
|
+
isinstance(partition_values, list) and len(partition_values) == 0
|
137
|
+
):
|
138
|
+
return None
|
139
|
+
return partition_values
|
78
140
|
|
79
141
|
|
80
142
|
def _list(
|
81
143
|
metafile: Metafile,
|
82
144
|
txn_op_type: TransactionOperationType,
|
83
145
|
*args,
|
146
|
+
transaction: Optional[Transaction] = None,
|
84
147
|
**kwargs,
|
85
148
|
) -> ListResult[Metafile]:
|
86
149
|
catalog_properties = get_catalog_properties(**kwargs)
|
87
150
|
limit = kwargs.get("limit") or None
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
dest_metafile=metafile,
|
94
|
-
read_limit=limit,
|
95
|
-
)
|
96
|
-
],
|
97
|
-
)
|
98
|
-
list_results_per_op = transaction.commit(
|
99
|
-
catalog_root_dir=catalog_properties.root,
|
100
|
-
filesystem=catalog_properties.filesystem,
|
151
|
+
|
152
|
+
operation = TransactionOperation.of(
|
153
|
+
operation_type=txn_op_type,
|
154
|
+
dest_metafile=metafile,
|
155
|
+
read_limit=limit,
|
101
156
|
)
|
102
|
-
|
157
|
+
|
158
|
+
if transaction is not None:
|
159
|
+
# Add the read operation to the existing transaction and return the result
|
160
|
+
return transaction.step(operation)
|
161
|
+
else:
|
162
|
+
# Create and commit a new transaction (legacy behavior)
|
163
|
+
new_transaction = Transaction.of([operation])
|
164
|
+
list_results_per_op = new_transaction.commit(
|
165
|
+
catalog_root_dir=catalog_properties.root,
|
166
|
+
filesystem=catalog_properties.filesystem,
|
167
|
+
)
|
168
|
+
return list_results_per_op[0]
|
103
169
|
|
104
170
|
|
105
171
|
def _latest(
|
106
172
|
metafile: Metafile,
|
107
173
|
*args,
|
174
|
+
transaction: Optional[Transaction] = None,
|
108
175
|
**kwargs,
|
109
176
|
) -> Optional[Metafile]:
|
110
177
|
list_results = _list(
|
111
|
-
*args,
|
112
178
|
metafile=metafile,
|
113
179
|
txn_op_type=TransactionOperationType.READ_LATEST,
|
180
|
+
transaction=transaction,
|
181
|
+
*args,
|
114
182
|
**kwargs,
|
115
183
|
)
|
116
184
|
results = list_results.all_items()
|
@@ -121,94 +189,38 @@ def _exists(
|
|
121
189
|
metafile: Metafile,
|
122
190
|
*args,
|
123
191
|
**kwargs,
|
124
|
-
) -> Optional[
|
192
|
+
) -> Optional[bool]:
|
125
193
|
list_results = _list(
|
126
|
-
*args,
|
127
194
|
metafile=metafile,
|
128
195
|
txn_op_type=TransactionOperationType.READ_EXISTS,
|
196
|
+
*args,
|
129
197
|
**kwargs,
|
130
198
|
)
|
131
199
|
results = list_results.all_items()
|
132
200
|
return True if results else False
|
133
201
|
|
134
202
|
|
135
|
-
def _resolve_partition_locator_alias(
|
136
|
-
namespace: str,
|
137
|
-
table_name: str,
|
138
|
-
table_version: Optional[str] = None,
|
139
|
-
partition_values: Optional[PartitionValues] = None,
|
140
|
-
partition_scheme_id: Optional[str] = None,
|
141
|
-
*args,
|
142
|
-
**kwargs,
|
143
|
-
) -> PartitionLocatorAlias:
|
144
|
-
# TODO(pdames): A read shouldn't initiate N transactions that
|
145
|
-
# read against different catalog snapshots. To resolve this, add
|
146
|
-
# new "start", "step", and "end" methods to Transaction that
|
147
|
-
# support starting a txn, defining and executing a txn op, retrieve
|
148
|
-
# its results, then define and execute the next txn op. When
|
149
|
-
# stepping through a transaction its txn heartbeat timeout should
|
150
|
-
# be set manually.
|
151
|
-
partition_locator = None
|
152
|
-
if not partition_values:
|
153
|
-
partition_scheme_id = UNPARTITIONED_SCHEME_ID
|
154
|
-
elif not partition_scheme_id:
|
155
|
-
# resolve latest partition scheme from the current
|
156
|
-
# revision of its `deltacat` stream
|
157
|
-
stream = get_stream(
|
158
|
-
*args,
|
159
|
-
namespace=namespace,
|
160
|
-
table_name=table_name,
|
161
|
-
table_version=table_version,
|
162
|
-
**kwargs,
|
163
|
-
)
|
164
|
-
if not stream:
|
165
|
-
raise ValueError(
|
166
|
-
f"Failed to resolve latest partition scheme for "
|
167
|
-
f"`{namespace}.{table_name}` at table version "
|
168
|
-
f"`{table_version or 'latest'}` (no stream found)."
|
169
|
-
)
|
170
|
-
partition_locator = PartitionLocator.of(
|
171
|
-
stream_locator=stream.locator,
|
172
|
-
partition_values=partition_values,
|
173
|
-
partition_id=None,
|
174
|
-
)
|
175
|
-
partition_scheme_id = stream.partition_scheme.id
|
176
|
-
if not partition_locator:
|
177
|
-
partition_locator = PartitionLocator.at(
|
178
|
-
namespace=namespace,
|
179
|
-
table_name=table_name,
|
180
|
-
table_version=table_version,
|
181
|
-
stream_id=None,
|
182
|
-
stream_format=StreamFormat.DELTACAT,
|
183
|
-
partition_values=partition_values,
|
184
|
-
partition_id=None,
|
185
|
-
)
|
186
|
-
partition = Partition.of(
|
187
|
-
locator=partition_locator,
|
188
|
-
schema=None,
|
189
|
-
content_types=None,
|
190
|
-
partition_scheme_id=partition_scheme_id,
|
191
|
-
)
|
192
|
-
return partition.locator_alias
|
193
|
-
|
194
|
-
|
195
203
|
def _resolve_latest_active_table_version_id(
|
196
204
|
namespace: str,
|
197
205
|
table_name: str,
|
198
|
-
fail_if_no_active_table_version: True,
|
199
206
|
*args,
|
207
|
+
fail_if_no_active_table_version: bool = True,
|
208
|
+
transaction: Optional[Transaction] = None,
|
200
209
|
**kwargs,
|
201
210
|
) -> Optional[str]:
|
202
211
|
table = get_table(
|
203
|
-
*args,
|
204
212
|
namespace=namespace,
|
205
213
|
table_name=table_name,
|
214
|
+
transaction=transaction,
|
215
|
+
*args,
|
206
216
|
**kwargs,
|
207
217
|
)
|
208
218
|
if not table:
|
209
|
-
raise
|
219
|
+
raise TableNotFoundError(f"Table does not exist: {namespace}.{table_name}")
|
210
220
|
if fail_if_no_active_table_version and not table.latest_active_table_version:
|
211
|
-
raise
|
221
|
+
raise TableVersionNotFoundError(
|
222
|
+
f"Table has no active table version: {namespace}.{table_name}"
|
223
|
+
)
|
212
224
|
return table.latest_active_table_version
|
213
225
|
|
214
226
|
|
@@ -217,30 +229,114 @@ def _resolve_latest_table_version_id(
|
|
217
229
|
table_name: str,
|
218
230
|
fail_if_no_active_table_version: True,
|
219
231
|
*args,
|
232
|
+
transaction: Optional[Transaction] = None,
|
220
233
|
**kwargs,
|
221
234
|
) -> Optional[str]:
|
222
235
|
table = get_table(
|
223
|
-
*args,
|
224
236
|
namespace=namespace,
|
225
237
|
table_name=table_name,
|
238
|
+
transaction=transaction,
|
239
|
+
*args,
|
226
240
|
**kwargs,
|
227
241
|
)
|
228
242
|
if not table:
|
229
|
-
raise
|
243
|
+
raise TableNotFoundError(f"Table does not exist: {namespace}.{table_name}")
|
230
244
|
if fail_if_no_active_table_version and not table.latest_table_version:
|
231
|
-
raise
|
245
|
+
raise TableVersionNotFoundError(
|
246
|
+
f"Table has no table version: {namespace}.{table_name}"
|
247
|
+
)
|
232
248
|
return table.latest_table_version
|
233
249
|
|
234
250
|
|
251
|
+
def _validate_schemes_against_schema(
|
252
|
+
schema: Optional[Schema],
|
253
|
+
partition_scheme: Optional[PartitionScheme],
|
254
|
+
sort_scheme: Optional[SortScheme],
|
255
|
+
) -> None:
|
256
|
+
"""
|
257
|
+
Validates partition and sort schemes against a schema, ensuring all referenced fields exist.
|
258
|
+
If schema is None, validation is skipped.
|
259
|
+
"""
|
260
|
+
if schema is None:
|
261
|
+
return
|
262
|
+
|
263
|
+
schema_fields = set(field.name for field in schema.arrow)
|
264
|
+
|
265
|
+
# Validate partition scheme
|
266
|
+
if partition_scheme is not None and partition_scheme.keys is not None:
|
267
|
+
for key in partition_scheme.keys:
|
268
|
+
if key.key[0] not in schema_fields:
|
269
|
+
raise SchemaValidationError(
|
270
|
+
f"Partition key field '{key.key[0]}' not found in schema"
|
271
|
+
)
|
272
|
+
|
273
|
+
# Validate sort scheme
|
274
|
+
if sort_scheme is not None and sort_scheme.keys is not None:
|
275
|
+
for key in sort_scheme.keys:
|
276
|
+
if key.key[0] not in schema_fields:
|
277
|
+
raise SchemaValidationError(
|
278
|
+
f"Sort key field '{key.key[0]}' not found in schema"
|
279
|
+
)
|
280
|
+
|
281
|
+
|
282
|
+
def _validate_partition_values_against_scheme(
|
283
|
+
partition_values: Optional[PartitionValues],
|
284
|
+
partition_scheme: PartitionScheme,
|
285
|
+
schema: Optional[Schema],
|
286
|
+
) -> None:
|
287
|
+
"""
|
288
|
+
Validates that partition values match the data types of the partition key fields in the schema.
|
289
|
+
|
290
|
+
Args:
|
291
|
+
partition_values: List of partition values to validate
|
292
|
+
partition_scheme: The partition scheme containing the keys to validate against
|
293
|
+
schema: The schema containing the field types to validate against
|
294
|
+
|
295
|
+
Raises:
|
296
|
+
TableValidationError: If validation fails
|
297
|
+
"""
|
298
|
+
if not partition_values:
|
299
|
+
raise TableValidationError("Partition values cannot be empty")
|
300
|
+
|
301
|
+
if not schema:
|
302
|
+
raise TableValidationError(
|
303
|
+
"Table version must have a schema to validate partition values"
|
304
|
+
)
|
305
|
+
|
306
|
+
if len(partition_values) != len(partition_scheme.keys):
|
307
|
+
raise TableValidationError(
|
308
|
+
f"Number of partition values ({len(partition_values)}) does not match "
|
309
|
+
f"number of partition keys ({len(partition_scheme.keys)})"
|
310
|
+
)
|
311
|
+
|
312
|
+
# Validate each partition value against its corresponding field type
|
313
|
+
for i in range(len(partition_scheme.keys)):
|
314
|
+
field_type = partition_scheme.keys[i].transform.return_type
|
315
|
+
partition_value = partition_values[i]
|
316
|
+
if field_type is None:
|
317
|
+
# the transform returns the same type as the source schema type
|
318
|
+
# (which also implies that it is a single-key transform)
|
319
|
+
field_type = schema.field(partition_scheme.keys[i].key[0]).arrow.type
|
320
|
+
try:
|
321
|
+
# Try to convert the value to PyArrow to validate its type
|
322
|
+
pa.array([partition_value], type=field_type)
|
323
|
+
# If successful, the type is valid
|
324
|
+
except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError) as e:
|
325
|
+
raise TableValidationError(
|
326
|
+
f"Partition value {partition_value} (type {type(partition_value)}) "
|
327
|
+
f"incompatible with partition transform return type {field_type}"
|
328
|
+
) from e
|
329
|
+
|
330
|
+
|
235
331
|
def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
|
236
332
|
"""
|
237
333
|
Lists a page of table namespaces. Namespaces are returned as list result
|
238
334
|
items.
|
239
335
|
"""
|
240
336
|
return _list(
|
241
|
-
*args,
|
242
337
|
metafile=Namespace.of(NamespaceLocator.of("placeholder")),
|
243
338
|
txn_op_type=TransactionOperationType.READ_SIBLINGS,
|
339
|
+
*args,
|
244
340
|
**kwargs,
|
245
341
|
)
|
246
342
|
|
@@ -251,12 +347,15 @@ def list_tables(namespace: str, *args, **kwargs) -> ListResult[Table]:
|
|
251
347
|
list result items. Raises an error if the given namespace does not exist.
|
252
348
|
"""
|
253
349
|
locator = TableLocator.at(namespace=namespace, table_name="placeholder")
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
350
|
+
try:
|
351
|
+
return _list(
|
352
|
+
metafile=Table.of(locator=locator),
|
353
|
+
txn_op_type=TransactionOperationType.READ_SIBLINGS,
|
354
|
+
*args,
|
355
|
+
**kwargs,
|
356
|
+
)
|
357
|
+
except ObjectNotFoundError as e:
|
358
|
+
raise NamespaceNotFoundError(f"Namespace {namespace} not found") from e
|
260
359
|
|
261
360
|
|
262
361
|
def list_table_versions(
|
@@ -279,12 +378,15 @@ def list_table_versions(
|
|
279
378
|
locator=locator,
|
280
379
|
schema=None,
|
281
380
|
)
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
381
|
+
try:
|
382
|
+
return _list(
|
383
|
+
metafile=table_version,
|
384
|
+
txn_op_type=TransactionOperationType.READ_SIBLINGS,
|
385
|
+
*args,
|
386
|
+
**kwargs,
|
387
|
+
)
|
388
|
+
except ObjectNotFoundError as e:
|
389
|
+
raise TableNotFoundError(f"Table {namespace}.{table_name} not found") from e
|
288
390
|
|
289
391
|
|
290
392
|
def list_streams(
|
@@ -298,6 +400,7 @@ def list_streams(
|
|
298
400
|
Lists a page of streams for the given table version.
|
299
401
|
Raises an error if the table version does not exist.
|
300
402
|
"""
|
403
|
+
# TODO(pdames): Support listing uncommitted streams.
|
301
404
|
locator = StreamLocator.at(
|
302
405
|
namespace=namespace,
|
303
406
|
table_name=table_name,
|
@@ -309,12 +412,17 @@ def list_streams(
|
|
309
412
|
locator=locator,
|
310
413
|
partition_scheme=None,
|
311
414
|
)
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
415
|
+
try:
|
416
|
+
return _list(
|
417
|
+
stream,
|
418
|
+
TransactionOperationType.READ_SIBLINGS,
|
419
|
+
*args,
|
420
|
+
**kwargs,
|
421
|
+
)
|
422
|
+
except ObjectNotFoundError as e:
|
423
|
+
raise TableVersionNotFoundError(
|
424
|
+
f"Table version {namespace}.{table_name}.{table_version} not found"
|
425
|
+
) from e
|
318
426
|
|
319
427
|
|
320
428
|
def list_partitions(
|
@@ -322,6 +430,7 @@ def list_partitions(
|
|
322
430
|
table_name: str,
|
323
431
|
table_version: Optional[str] = None,
|
324
432
|
*args,
|
433
|
+
transaction: Optional[Transaction] = None,
|
325
434
|
**kwargs,
|
326
435
|
) -> ListResult[Partition]:
|
327
436
|
"""
|
@@ -330,32 +439,58 @@ def list_partitions(
|
|
330
439
|
table version if not specified. Raises an error if the table version does
|
331
440
|
not exist.
|
332
441
|
"""
|
333
|
-
|
442
|
+
transaction, commit_transaction = setup_transaction(transaction, **kwargs)
|
443
|
+
|
444
|
+
if not namespace:
|
445
|
+
raise ValueError("Namespace cannot be empty.")
|
446
|
+
if not table_name:
|
447
|
+
raise ValueError("Table name cannot be empty.")
|
448
|
+
# resolve default deltacat stream for the given namespace, table name, and table version
|
449
|
+
# TODO(pdames): debug why this doesn't work when only the table_version is provided
|
450
|
+
# and PartitionLocator.stream_format is hard-coded to deltacat (we should be able
|
451
|
+
# to resolve the default deltacat stream automatically)
|
452
|
+
stream = get_stream(
|
334
453
|
namespace=namespace,
|
335
454
|
table_name=table_name,
|
336
455
|
table_version=table_version,
|
337
|
-
|
338
|
-
|
456
|
+
transaction=transaction,
|
457
|
+
*args,
|
458
|
+
**kwargs,
|
459
|
+
)
|
460
|
+
if not stream:
|
461
|
+
raise StreamNotFoundError(
|
462
|
+
f"Default stream for {namespace}.{table_name}.{table_version} not found."
|
463
|
+
)
|
464
|
+
locator = PartitionLocator.of(
|
465
|
+
stream_locator=stream.locator,
|
339
466
|
partition_values=["placeholder"],
|
340
467
|
partition_id="placeholder",
|
341
468
|
)
|
342
469
|
partition = Partition.of(
|
343
470
|
locator=locator,
|
344
|
-
schema=None,
|
345
471
|
content_types=None,
|
346
472
|
)
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
473
|
+
try:
|
474
|
+
result = _list(
|
475
|
+
metafile=partition,
|
476
|
+
txn_op_type=TransactionOperationType.READ_SIBLINGS,
|
477
|
+
transaction=transaction,
|
478
|
+
*args,
|
479
|
+
**kwargs,
|
480
|
+
)
|
481
|
+
except ObjectNotFoundError as e:
|
482
|
+
raise StreamNotFoundError(f"Stream {stream.locator} not found") from e
|
483
|
+
|
484
|
+
if commit_transaction:
|
485
|
+
transaction.seal()
|
486
|
+
return result
|
353
487
|
|
354
488
|
|
355
489
|
def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partition]:
|
356
490
|
"""
|
357
491
|
Lists all partitions committed to the given stream.
|
358
492
|
"""
|
493
|
+
# TODO(pdames): Support listing uncommitted partitions.
|
359
494
|
if stream.stream_format != StreamFormat.DELTACAT:
|
360
495
|
raise ValueError(
|
361
496
|
f"Unsupported stream format: {stream.stream_format}"
|
@@ -368,15 +503,17 @@ def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partit
|
|
368
503
|
)
|
369
504
|
partition = Partition.of(
|
370
505
|
locator=locator,
|
371
|
-
schema=None,
|
372
506
|
content_types=None,
|
373
507
|
)
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
508
|
+
try:
|
509
|
+
return _list(
|
510
|
+
metafile=partition,
|
511
|
+
txn_op_type=TransactionOperationType.READ_SIBLINGS,
|
512
|
+
*args,
|
513
|
+
**kwargs,
|
514
|
+
)
|
515
|
+
except ObjectNotFoundError as e:
|
516
|
+
raise StreamNotFoundError(f"Stream {stream.locator} not found") from e
|
380
517
|
|
381
518
|
|
382
519
|
def list_deltas(
|
@@ -390,6 +527,7 @@ def list_deltas(
|
|
390
527
|
include_manifest: bool = False,
|
391
528
|
partition_scheme_id: Optional[str] = None,
|
392
529
|
*args,
|
530
|
+
transaction: Optional[Transaction] = None,
|
393
531
|
**kwargs,
|
394
532
|
) -> ListResult[Delta]:
|
395
533
|
"""
|
@@ -406,21 +544,48 @@ def list_deltas(
|
|
406
544
|
default. The manifests can either be optionally retrieved as part of this
|
407
545
|
call or lazily loaded via subsequent calls to `get_delta_manifest`.
|
408
546
|
"""
|
547
|
+
transaction, commit_transaction = setup_transaction(transaction, **kwargs)
|
548
|
+
|
409
549
|
# TODO(pdames): Delta listing should ideally either use an efficient
|
410
550
|
# range-limited dir listing of partition children between start and end
|
411
551
|
# positions, or should traverse using Partition.stream_position (to
|
412
552
|
# resolve last stream position) and Delta.previous_stream_position
|
413
553
|
# (down to first stream position).
|
414
|
-
|
415
|
-
|
554
|
+
|
555
|
+
# First get the stream to resolve proper table version and stream locator
|
556
|
+
stream = get_stream(
|
416
557
|
namespace=namespace,
|
417
558
|
table_name=table_name,
|
418
559
|
table_version=table_version,
|
560
|
+
transaction=transaction,
|
561
|
+
*args,
|
562
|
+
**kwargs,
|
563
|
+
)
|
564
|
+
if not stream:
|
565
|
+
raise StreamNotFoundError(
|
566
|
+
f"Failed to resolve stream for "
|
567
|
+
f"`{namespace}.{table_name}` at table version "
|
568
|
+
f"`{table_version or 'latest'}` (no stream found)."
|
569
|
+
)
|
570
|
+
|
571
|
+
# Then get the actual partition to ensure we have the real partition locator with ID
|
572
|
+
partition = get_partition(
|
573
|
+
stream_locator=stream.locator,
|
419
574
|
partition_values=partition_values,
|
420
575
|
partition_scheme_id=partition_scheme_id,
|
576
|
+
transaction=transaction,
|
577
|
+
*args,
|
421
578
|
**kwargs,
|
422
579
|
)
|
423
|
-
|
580
|
+
if not partition:
|
581
|
+
raise PartitionNotFoundError(
|
582
|
+
f"Failed to find partition for stream {stream.locator} "
|
583
|
+
f"with partition_values={partition_values} and "
|
584
|
+
f"partition_scheme_id={partition_scheme_id}"
|
585
|
+
)
|
586
|
+
|
587
|
+
# Use the actual partition locator (with partition ID) for listing deltas
|
588
|
+
locator = DeltaLocator.of(partition_locator=partition.locator)
|
424
589
|
delta = Delta.of(
|
425
590
|
locator=locator,
|
426
591
|
delta_type=None,
|
@@ -428,20 +593,34 @@ def list_deltas(
|
|
428
593
|
properties=None,
|
429
594
|
manifest=None,
|
430
595
|
)
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
596
|
+
try:
|
597
|
+
all_deltas_list_result: ListResult[Delta] = _list(
|
598
|
+
metafile=delta,
|
599
|
+
txn_op_type=TransactionOperationType.READ_SIBLINGS,
|
600
|
+
transaction=transaction,
|
601
|
+
*args,
|
602
|
+
**kwargs,
|
603
|
+
)
|
604
|
+
except ObjectNotFoundError as e:
|
605
|
+
raise PartitionNotFoundError(f"Partition {partition.locator} not found") from e
|
437
606
|
all_deltas = all_deltas_list_result.all_items()
|
438
607
|
filtered_deltas = [
|
439
608
|
delta
|
440
609
|
for delta in all_deltas
|
441
|
-
if
|
610
|
+
if (
|
611
|
+
first_stream_position is None
|
612
|
+
or first_stream_position <= delta.stream_position
|
613
|
+
)
|
614
|
+
and (
|
615
|
+
last_stream_position is None
|
616
|
+
or delta.stream_position <= last_stream_position
|
617
|
+
)
|
442
618
|
]
|
443
|
-
|
444
|
-
|
619
|
+
# Sort deltas by stream position in the requested order
|
620
|
+
filtered_deltas.sort(reverse=(not ascending_order), key=lambda d: d.stream_position)
|
621
|
+
|
622
|
+
if commit_transaction:
|
623
|
+
transaction.seal()
|
445
624
|
return filtered_deltas
|
446
625
|
|
447
626
|
|
@@ -479,21 +658,37 @@ def list_partition_deltas(
|
|
479
658
|
properties=None,
|
480
659
|
manifest=None,
|
481
660
|
)
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
661
|
+
try:
|
662
|
+
all_deltas_list_result: ListResult[Delta] = _list(
|
663
|
+
metafile=delta,
|
664
|
+
txn_op_type=TransactionOperationType.READ_SIBLINGS,
|
665
|
+
*args,
|
666
|
+
**kwargs,
|
667
|
+
)
|
668
|
+
except ObjectNotFoundError as e:
|
669
|
+
raise PartitionNotFoundError(
|
670
|
+
f"Partition {partition_like.locator} not found"
|
671
|
+
) from e
|
488
672
|
all_deltas = all_deltas_list_result.all_items()
|
489
673
|
filtered_deltas = [
|
490
674
|
delta
|
491
675
|
for delta in all_deltas
|
492
|
-
if
|
676
|
+
if (
|
677
|
+
first_stream_position is None
|
678
|
+
or first_stream_position <= delta.stream_position
|
679
|
+
)
|
680
|
+
and (
|
681
|
+
last_stream_position is None
|
682
|
+
or delta.stream_position <= last_stream_position
|
683
|
+
)
|
493
684
|
]
|
494
|
-
|
495
|
-
|
496
|
-
return
|
685
|
+
# Sort deltas by stream position in the requested order
|
686
|
+
filtered_deltas.sort(reverse=(not ascending_order), key=lambda d: d.stream_position)
|
687
|
+
return ListResult.of(
|
688
|
+
items=filtered_deltas,
|
689
|
+
pagination_key=None,
|
690
|
+
next_page_provider=None,
|
691
|
+
)
|
497
692
|
|
498
693
|
|
499
694
|
def get_delta(
|
@@ -505,6 +700,7 @@ def get_delta(
|
|
505
700
|
include_manifest: bool = False,
|
506
701
|
partition_scheme_id: Optional[str] = None,
|
507
702
|
*args,
|
703
|
+
transaction: Optional[Transaction] = None,
|
508
704
|
**kwargs,
|
509
705
|
) -> Optional[Delta]:
|
510
706
|
"""
|
@@ -519,18 +715,45 @@ def get_delta(
|
|
519
715
|
default. The manifest can either be optionally retrieved as part of this
|
520
716
|
call or lazily loaded via a subsequent call to `get_delta_manifest`.
|
521
717
|
"""
|
718
|
+
transaction, commit_transaction = setup_transaction(transaction, **kwargs)
|
719
|
+
|
522
720
|
# TODO(pdames): Honor `include_manifest` param.
|
523
|
-
|
524
|
-
|
721
|
+
|
722
|
+
# First get the stream to resolve proper table version and stream locator
|
723
|
+
stream = get_stream(
|
525
724
|
namespace=namespace,
|
526
725
|
table_name=table_name,
|
527
726
|
table_version=table_version,
|
727
|
+
transaction=transaction,
|
728
|
+
*args,
|
729
|
+
**kwargs,
|
730
|
+
)
|
731
|
+
if not stream:
|
732
|
+
raise StreamNotFoundError(
|
733
|
+
f"Failed to resolve stream for "
|
734
|
+
f"`{namespace}.{table_name}` at table version "
|
735
|
+
f"`{table_version or 'latest'}` (no stream found)."
|
736
|
+
)
|
737
|
+
|
738
|
+
# Then get the actual partition to ensure we have the real partition locator with ID
|
739
|
+
partition = get_partition(
|
740
|
+
stream_locator=stream.locator,
|
528
741
|
partition_values=partition_values,
|
529
742
|
partition_scheme_id=partition_scheme_id,
|
743
|
+
transaction=transaction,
|
744
|
+
*args,
|
530
745
|
**kwargs,
|
531
746
|
)
|
747
|
+
if not partition:
|
748
|
+
raise PartitionNotFoundError(
|
749
|
+
f"Failed to find partition for stream {stream.locator} "
|
750
|
+
f"with partition_values={partition_values} and "
|
751
|
+
f"partition_scheme_id={partition_scheme_id}"
|
752
|
+
)
|
753
|
+
|
754
|
+
# Use the actual partition locator (with partition ID) for getting the delta
|
532
755
|
locator = DeltaLocator.of(
|
533
|
-
locator
|
756
|
+
partition_locator=partition.locator,
|
534
757
|
stream_position=stream_position,
|
535
758
|
)
|
536
759
|
delta = Delta.of(
|
@@ -540,12 +763,22 @@ def get_delta(
|
|
540
763
|
properties=None,
|
541
764
|
manifest=None,
|
542
765
|
)
|
543
|
-
|
544
|
-
*args,
|
766
|
+
result = _latest(
|
545
767
|
metafile=delta,
|
768
|
+
transaction=transaction,
|
769
|
+
*args,
|
546
770
|
**kwargs,
|
547
771
|
)
|
548
772
|
|
773
|
+
# TODO(pdames): Honor the include_manifest parameter during retrieval from _latest, since
|
774
|
+
# the point is to avoid loading the manifest into memory if it's not needed.
|
775
|
+
if result and not include_manifest:
|
776
|
+
result.manifest = None
|
777
|
+
|
778
|
+
if commit_transaction:
|
779
|
+
transaction.seal()
|
780
|
+
return result
|
781
|
+
|
549
782
|
|
550
783
|
def get_latest_delta(
|
551
784
|
namespace: str,
|
@@ -555,6 +788,7 @@ def get_latest_delta(
|
|
555
788
|
include_manifest: bool = False,
|
556
789
|
partition_scheme_id: Optional[str] = None,
|
557
790
|
*args,
|
791
|
+
transaction: Optional[Transaction] = None,
|
558
792
|
**kwargs,
|
559
793
|
) -> Optional[Delta]:
|
560
794
|
"""
|
@@ -569,19 +803,26 @@ def get_latest_delta(
|
|
569
803
|
default. The manifest can either be optionally retrieved as part of this
|
570
804
|
call or lazily loaded via a subsequent call to `get_delta_manifest`.
|
571
805
|
"""
|
572
|
-
|
806
|
+
transaction, commit_transaction = setup_transaction(transaction, **kwargs)
|
807
|
+
|
573
808
|
stream = get_stream(
|
574
809
|
namespace=namespace,
|
575
810
|
table_name=table_name,
|
576
811
|
table_version=table_version,
|
812
|
+
transaction=transaction,
|
813
|
+
*args,
|
814
|
+
**kwargs,
|
577
815
|
)
|
578
816
|
partition = get_partition(
|
579
817
|
stream_locator=stream.locator,
|
580
818
|
partition_values=partition_values,
|
581
819
|
partition_scheme_id=partition_scheme_id,
|
820
|
+
transaction=transaction,
|
821
|
+
*args,
|
822
|
+
**kwargs,
|
582
823
|
)
|
583
824
|
locator = DeltaLocator.of(
|
584
|
-
|
825
|
+
partition_locator=partition.locator,
|
585
826
|
stream_position=partition.stream_position,
|
586
827
|
)
|
587
828
|
delta = Delta.of(
|
@@ -591,53 +832,327 @@ def get_latest_delta(
|
|
591
832
|
properties=None,
|
592
833
|
manifest=None,
|
593
834
|
)
|
594
|
-
|
595
|
-
*args,
|
835
|
+
result = _latest(
|
596
836
|
metafile=delta,
|
837
|
+
transaction=transaction,
|
838
|
+
*args,
|
839
|
+
**kwargs,
|
840
|
+
)
|
841
|
+
|
842
|
+
# TODO(pdames): Honor the include_manifest parameter during retrieval from _latest, since
|
843
|
+
# the point is to avoid loading the manifest into memory if it's not needed.
|
844
|
+
if result and not include_manifest:
|
845
|
+
result.manifest = None
|
846
|
+
|
847
|
+
if commit_transaction:
|
848
|
+
transaction.seal()
|
849
|
+
return result
|
850
|
+
|
851
|
+
|
852
|
+
def _download_delta_distributed(
|
853
|
+
manifest: Manifest,
|
854
|
+
table_type: DatasetType = DatasetType.PYARROW,
|
855
|
+
max_parallelism: Optional[int] = None,
|
856
|
+
column_names: Optional[List[str]] = None,
|
857
|
+
include_columns: Optional[List[str]] = None,
|
858
|
+
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
859
|
+
*args,
|
860
|
+
ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
|
861
|
+
distributed_dataset_type: Optional[
|
862
|
+
DistributedDatasetType
|
863
|
+
] = DistributedDatasetType.RAY_DATASET,
|
864
|
+
**kwargs,
|
865
|
+
) -> DistributedDataset:
|
866
|
+
|
867
|
+
distributed_dataset: DistributedDataset = download_manifest_entries_distributed(
|
868
|
+
manifest=manifest,
|
869
|
+
table_type=table_type,
|
870
|
+
max_parallelism=max_parallelism,
|
871
|
+
column_names=column_names,
|
872
|
+
include_columns=include_columns,
|
873
|
+
file_reader_kwargs_provider=file_reader_kwargs_provider,
|
874
|
+
ray_options_provider=ray_options_provider,
|
875
|
+
distributed_dataset_type=distributed_dataset_type,
|
876
|
+
*args,
|
877
|
+
**kwargs,
|
878
|
+
)
|
879
|
+
|
880
|
+
return distributed_dataset
|
881
|
+
|
882
|
+
|
883
|
+
def _download_delta_local(
|
884
|
+
manifest: Manifest,
|
885
|
+
table_type: DatasetType = DatasetType.PYARROW,
|
886
|
+
max_parallelism: Optional[int] = None,
|
887
|
+
column_names: Optional[List[str]] = None,
|
888
|
+
include_columns: Optional[List[str]] = None,
|
889
|
+
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
890
|
+
*args,
|
891
|
+
**kwargs,
|
892
|
+
) -> LocalDataset:
|
893
|
+
tables: LocalDataset = download_manifest_entries(
|
894
|
+
manifest,
|
895
|
+
table_type,
|
896
|
+
max_parallelism if max_parallelism else 1,
|
897
|
+
column_names,
|
898
|
+
include_columns,
|
899
|
+
file_reader_kwargs_provider,
|
597
900
|
**kwargs,
|
598
901
|
)
|
902
|
+
return tables
|
599
903
|
|
600
904
|
|
601
905
|
def download_delta(
|
602
906
|
delta_like: Union[Delta, DeltaLocator],
|
603
|
-
table_type:
|
907
|
+
table_type: DatasetType = DatasetType.PYARROW,
|
604
908
|
storage_type: StorageType = StorageType.DISTRIBUTED,
|
605
909
|
max_parallelism: Optional[int] = None,
|
606
910
|
columns: Optional[List[str]] = None,
|
607
911
|
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
608
912
|
ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
|
609
913
|
distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
|
914
|
+
file_path_column: Optional[str] = None,
|
610
915
|
*args,
|
916
|
+
transaction: Optional[Transaction] = None,
|
917
|
+
all_column_names: Optional[List[str]] = None,
|
611
918
|
**kwargs,
|
612
919
|
) -> Union[LocalDataset, DistributedDataset]: # type: ignore
|
613
920
|
"""
|
614
|
-
|
921
|
+
Read the given delta or delta locator into either a list of
|
615
922
|
tables resident in the local node's memory, or into a dataset distributed
|
616
923
|
across this Ray cluster's object store memory. Ordered table N of a local
|
617
924
|
table list, or ordered block N of a distributed dataset, always contain
|
618
925
|
the contents of ordered delta manifest entry N.
|
619
926
|
"""
|
620
|
-
|
927
|
+
# TODO (pdames): Cast delimited text types to the table's schema types
|
928
|
+
# TODO (pdames): Deprecate this method and replace with `read_delta`
|
929
|
+
# TODO (pdames): Replace dependence on TableType, StorageType, and DistributedDatasetType
|
930
|
+
# with DatasetType
|
931
|
+
|
932
|
+
# if all column names are provided, then this is a pure manifest entry download (no transaction needed)
|
933
|
+
commit_transaction = False
|
934
|
+
if not all_column_names:
|
935
|
+
transaction, commit_transaction = setup_transaction(transaction, **kwargs)
|
936
|
+
|
937
|
+
storage_type_to_download_func = {
|
938
|
+
StorageType.LOCAL: _download_delta_local,
|
939
|
+
StorageType.DISTRIBUTED: _download_delta_distributed,
|
940
|
+
}
|
941
|
+
|
942
|
+
is_delta = isinstance(delta_like, Delta)
|
943
|
+
is_delta_locator = isinstance(delta_like, DeltaLocator)
|
944
|
+
|
945
|
+
delta_locator: Optional[DeltaLocator] = None
|
946
|
+
if is_delta_locator:
|
947
|
+
delta_locator = delta_like
|
948
|
+
elif is_delta:
|
949
|
+
delta_locator = Delta(delta_like).locator
|
950
|
+
if not delta_locator:
|
951
|
+
raise ValueError(
|
952
|
+
f"Expected delta_like to be a Delta or DeltaLocator, but found "
|
953
|
+
f"{type(delta_like)}."
|
954
|
+
)
|
955
|
+
|
956
|
+
# Get manifest - if delta_like is a Delta with a manifest, use it, otherwise fetch from storage
|
957
|
+
if is_delta and delta_like.manifest:
|
958
|
+
manifest = delta_like.manifest
|
959
|
+
elif all_column_names:
|
960
|
+
raise ValueError(
|
961
|
+
"All column names can only be specified with a delta with an inline manifest."
|
962
|
+
)
|
963
|
+
else:
|
964
|
+
manifest = get_delta_manifest(
|
965
|
+
delta_locator,
|
966
|
+
transaction=transaction,
|
967
|
+
*args,
|
968
|
+
**kwargs,
|
969
|
+
)
|
970
|
+
all_column_names = all_column_names or None
|
971
|
+
if not all_column_names:
|
972
|
+
table_version_schema = get_table_version_schema(
|
973
|
+
delta_locator.namespace,
|
974
|
+
delta_locator.table_name,
|
975
|
+
delta_locator.table_version,
|
976
|
+
transaction=transaction,
|
977
|
+
*args,
|
978
|
+
**kwargs,
|
979
|
+
)
|
980
|
+
if table_version_schema and table_version_schema.arrow:
|
981
|
+
all_column_names = [field.name for field in table_version_schema.arrow]
|
982
|
+
if distributed_dataset_type == DatasetType.DAFT:
|
983
|
+
# Daft needs the latest table version schema to properly handle schema evolution
|
984
|
+
kwargs["table_version_schema"] = table_version_schema.arrow
|
985
|
+
elif distributed_dataset_type == DatasetType.DAFT:
|
986
|
+
raise ValueError("All column names canot be specified with Daft.")
|
987
|
+
if columns:
|
988
|
+
# Extract file_path_column since it's appended after reading each file
|
989
|
+
columns_to_validate = (
|
990
|
+
[col for col in columns if col != file_path_column]
|
991
|
+
if file_path_column
|
992
|
+
else columns
|
993
|
+
)
|
994
|
+
|
995
|
+
# Only validate columns if we have schema information (all_column_names is not None)
|
996
|
+
if all_column_names is not None:
|
997
|
+
if not all(
|
998
|
+
col in [col_name.lower() for col_name in all_column_names]
|
999
|
+
for col in columns_to_validate
|
1000
|
+
):
|
1001
|
+
raise SchemaValidationError(
|
1002
|
+
f"One or more columns in {columns_to_validate} are not present in table "
|
1003
|
+
f"version columns {all_column_names}"
|
1004
|
+
)
|
1005
|
+
columns = [column.lower() for column in columns]
|
1006
|
+
logger.debug(
|
1007
|
+
f"Reading {columns or 'all'} columns from table version column "
|
1008
|
+
f"names: {all_column_names}. "
|
1009
|
+
)
|
1010
|
+
|
1011
|
+
# Filter out parameters that are already passed as positional/keyword arguments
|
1012
|
+
# to avoid "multiple values for argument" errors
|
1013
|
+
filtered_kwargs = {
|
1014
|
+
k: v
|
1015
|
+
for k, v in kwargs.items()
|
1016
|
+
if k
|
1017
|
+
not in [
|
1018
|
+
"manifest",
|
1019
|
+
"table_type",
|
1020
|
+
"max_parallelism",
|
1021
|
+
"column_names",
|
1022
|
+
"include_columns",
|
1023
|
+
"file_reader_kwargs_provider",
|
1024
|
+
"ray_options_provider",
|
1025
|
+
"distributed_dataset_type",
|
1026
|
+
]
|
1027
|
+
}
|
1028
|
+
|
1029
|
+
dataset = storage_type_to_download_func[storage_type](
|
1030
|
+
manifest,
|
1031
|
+
table_type,
|
1032
|
+
max_parallelism,
|
1033
|
+
all_column_names,
|
1034
|
+
columns,
|
1035
|
+
file_reader_kwargs_provider,
|
1036
|
+
ray_options_provider=ray_options_provider,
|
1037
|
+
distributed_dataset_type=distributed_dataset_type,
|
1038
|
+
file_path_column=file_path_column,
|
1039
|
+
**filtered_kwargs,
|
1040
|
+
)
|
1041
|
+
if commit_transaction:
|
1042
|
+
transaction.seal()
|
1043
|
+
return dataset
|
1044
|
+
|
1045
|
+
|
1046
|
+
def _download_manifest_entry(
|
1047
|
+
manifest_entry: ManifestEntry,
|
1048
|
+
table_type: DatasetType = DatasetType.PYARROW,
|
1049
|
+
column_names: Optional[List[str]] = None,
|
1050
|
+
include_columns: Optional[List[str]] = None,
|
1051
|
+
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
1052
|
+
content_type: Optional[ContentType] = None,
|
1053
|
+
content_encoding: Optional[ContentEncoding] = None,
|
1054
|
+
filesystem: Optional[pyarrow.fs.FileSystem] = None,
|
1055
|
+
) -> LocalTable:
|
1056
|
+
|
1057
|
+
return download_manifest_entry(
|
1058
|
+
manifest_entry,
|
1059
|
+
table_type,
|
1060
|
+
column_names,
|
1061
|
+
include_columns,
|
1062
|
+
file_reader_kwargs_provider,
|
1063
|
+
content_type,
|
1064
|
+
content_encoding,
|
1065
|
+
filesystem,
|
1066
|
+
)
|
621
1067
|
|
622
1068
|
|
623
1069
|
def download_delta_manifest_entry(
|
624
1070
|
delta_like: Union[Delta, DeltaLocator],
|
625
1071
|
entry_index: int,
|
626
|
-
table_type:
|
1072
|
+
table_type: DatasetType = DatasetType.PYARROW,
|
627
1073
|
columns: Optional[List[str]] = None,
|
628
1074
|
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
629
1075
|
*args,
|
1076
|
+
transaction: Optional[Transaction] = None,
|
1077
|
+
all_column_names: Optional[List[str]] = None,
|
630
1078
|
**kwargs,
|
631
1079
|
) -> LocalTable:
|
632
1080
|
"""
|
633
|
-
|
1081
|
+
Reads a single manifest entry into the specified table type for the
|
634
1082
|
given delta or delta locator. If a delta is provided with a non-empty
|
635
|
-
manifest, then the entry is
|
636
|
-
manifest is first retrieved then the given entry index
|
1083
|
+
manifest, then the entry is read from this manifest. Otherwise, the
|
1084
|
+
manifest is first retrieved then the given entry index read.
|
637
1085
|
|
638
|
-
NOTE: The entry will be
|
1086
|
+
NOTE: The entry will be read in the current node's memory.
|
639
1087
|
"""
|
640
|
-
|
1088
|
+
# if all column names are provided, then this is a pure manifest entry download (no transaction needed)
|
1089
|
+
commit_transaction = False
|
1090
|
+
if not all_column_names:
|
1091
|
+
transaction, commit_transaction = setup_transaction(transaction, **kwargs)
|
1092
|
+
|
1093
|
+
is_delta = isinstance(delta_like, Delta)
|
1094
|
+
is_delta_locator = isinstance(delta_like, DeltaLocator)
|
1095
|
+
|
1096
|
+
delta_locator: Optional[DeltaLocator] = None
|
1097
|
+
if is_delta_locator:
|
1098
|
+
delta_locator = delta_like
|
1099
|
+
elif is_delta:
|
1100
|
+
delta_locator = Delta(delta_like).locator
|
1101
|
+
if not delta_locator:
|
1102
|
+
raise ValueError(
|
1103
|
+
f"Expected delta_like to be a Delta or DeltaLocator, but found "
|
1104
|
+
f"{type(delta_like)}."
|
1105
|
+
)
|
1106
|
+
|
1107
|
+
if is_delta and delta_like.manifest:
|
1108
|
+
manifest = delta_like.manifest
|
1109
|
+
elif all_column_names:
|
1110
|
+
raise ValueError(
|
1111
|
+
"All column names can only be specified with a delta with an inline manifest."
|
1112
|
+
)
|
1113
|
+
else:
|
1114
|
+
manifest = get_delta_manifest(
|
1115
|
+
delta_locator,
|
1116
|
+
transaction=transaction,
|
1117
|
+
*args,
|
1118
|
+
**kwargs,
|
1119
|
+
)
|
1120
|
+
# TODO(pdames): Cache table version column names and only invoke when
|
1121
|
+
# needed.
|
1122
|
+
all_column_names = all_column_names or get_table_version_column_names(
|
1123
|
+
delta_locator.namespace,
|
1124
|
+
delta_locator.table_name,
|
1125
|
+
delta_locator.table_version,
|
1126
|
+
transaction=transaction,
|
1127
|
+
*args,
|
1128
|
+
**kwargs,
|
1129
|
+
)
|
1130
|
+
if columns:
|
1131
|
+
if not all(
|
1132
|
+
col in [col_name.lower() for col_name in all_column_names]
|
1133
|
+
for col in columns
|
1134
|
+
):
|
1135
|
+
raise SchemaValidationError(
|
1136
|
+
f"One or more columns in {columns} are not present in table "
|
1137
|
+
f"version columns {all_column_names}"
|
1138
|
+
)
|
1139
|
+
columns = [column.lower() for column in columns]
|
1140
|
+
logger.debug(
|
1141
|
+
f"Reading {columns or 'all'} columns from table version column "
|
1142
|
+
f"names: {all_column_names}. "
|
1143
|
+
)
|
1144
|
+
catalog_properties = get_catalog_properties(**kwargs)
|
1145
|
+
manifest_entry = _download_manifest_entry(
|
1146
|
+
manifest.entries[entry_index],
|
1147
|
+
table_type,
|
1148
|
+
all_column_names,
|
1149
|
+
columns,
|
1150
|
+
file_reader_kwargs_provider,
|
1151
|
+
filesystem=catalog_properties.filesystem,
|
1152
|
+
)
|
1153
|
+
if commit_transaction:
|
1154
|
+
transaction.seal()
|
1155
|
+
return manifest_entry
|
641
1156
|
|
642
1157
|
|
643
1158
|
def get_delta_manifest(
|
@@ -666,13 +1181,15 @@ def get_delta_manifest(
|
|
666
1181
|
properties=None,
|
667
1182
|
manifest=None,
|
668
1183
|
)
|
669
|
-
latest_delta = _latest(
|
1184
|
+
latest_delta: Delta = _latest(
|
670
1185
|
metafile=delta,
|
671
1186
|
*args,
|
672
1187
|
**kwargs,
|
673
1188
|
)
|
674
|
-
if not latest_delta
|
675
|
-
raise
|
1189
|
+
if not latest_delta:
|
1190
|
+
raise DeltaNotFoundError(f"No delta found for locator: {delta_locator}")
|
1191
|
+
elif not latest_delta.manifest:
|
1192
|
+
raise DeltaNotFoundError(f"No manifest found for delta: {latest_delta}")
|
676
1193
|
return latest_delta.manifest
|
677
1194
|
|
678
1195
|
|
@@ -680,30 +1197,30 @@ def create_namespace(
|
|
680
1197
|
namespace: str,
|
681
1198
|
properties: Optional[NamespaceProperties] = None,
|
682
1199
|
*args,
|
1200
|
+
transaction: Optional[Transaction] = None,
|
683
1201
|
**kwargs,
|
684
1202
|
) -> Namespace:
|
685
1203
|
"""
|
686
1204
|
Creates a table namespace with the given name and properties. Returns
|
687
1205
|
the created namespace.
|
688
1206
|
"""
|
1207
|
+
transaction, commit_transaction = setup_transaction(transaction, **kwargs)
|
1208
|
+
|
689
1209
|
namespace = Namespace.of(
|
690
1210
|
locator=NamespaceLocator.of(namespace=namespace),
|
691
1211
|
properties=properties,
|
692
1212
|
)
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
],
|
701
|
-
)
|
702
|
-
catalog_properties = get_catalog_properties(**kwargs)
|
703
|
-
transaction.commit(
|
704
|
-
catalog_root_dir=catalog_properties.root,
|
705
|
-
filesystem=catalog_properties.filesystem,
|
1213
|
+
|
1214
|
+
# Add the operation to the transaction
|
1215
|
+
transaction.step(
|
1216
|
+
TransactionOperation.of(
|
1217
|
+
operation_type=TransactionOperationType.CREATE,
|
1218
|
+
dest_metafile=namespace,
|
1219
|
+
),
|
706
1220
|
)
|
1221
|
+
|
1222
|
+
if commit_transaction:
|
1223
|
+
transaction.seal()
|
707
1224
|
return namespace
|
708
1225
|
|
709
1226
|
|
@@ -712,43 +1229,55 @@ def update_namespace(
     properties: Optional[NamespaceProperties] = None,
     new_namespace: Optional[str] = None,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """
     Updates a table namespace's name and/or properties. Raises an error if the
     given namespace does not exist.
     """
-
-
-
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
+    # Check if the namespace exists
+    old_namespace_meta = get_namespace(
         namespace=namespace,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
-
-
-
-
-
-
+    if not old_namespace_meta:
+        raise NamespaceNotFoundError(f"Namespace {namespace} does not exist")
+
+    # Create new namespace metadata
+    new_namespace_meta: Namespace = Metafile.update_for(old_namespace_meta)
+    if new_namespace:
+        new_namespace_meta.locator.namespace = new_namespace
+    if properties is not None:
+        new_namespace_meta.properties = properties
+
+    # Add the update operation to the transaction
+    try:
+        transaction.step(
             TransactionOperation.of(
                 operation_type=TransactionOperationType.UPDATE,
-                dest_metafile=
-                src_metafile=
-            )
-
-
-
-
-
-
-
-
+                dest_metafile=new_namespace_meta,
+                src_metafile=old_namespace_meta,
+            ),
+        )
+    except ObjectAlreadyExistsError as e:
+        raise NamespaceAlreadyExistsError(
+            f"Namespace {namespace} already exists"
+        ) from e
+
+    if commit_transaction:
+        transaction.seal()


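Because `update_namespace` (like the rest of the API) now accepts the shared `transaction` argument, a caller can batch several catalog mutations into one atomic commit. A hypothetical composition sketch; the bare `Transaction()` constructor is a stand-in, and only the keyword-threading pattern is taken from the diff:

```python
# Both operations land in a single atomic commit because the caller owns
# and seals the transaction; neither callee seals it, since
# commit_transaction is False inside each call.
txn = Transaction()  # stand-in for however deltacat opens a transaction
create_namespace(namespace="analytics", transaction=txn)
update_namespace(
    namespace="analytics",
    properties={"owner": "data-eng"},  # illustrative properties
    transaction=txn,
)
txn.seal()  # one commit covering both steps
```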
 def create_table_version(
     namespace: str,
     table_name: str,
     table_version: Optional[str] = None,
+    lifecycle_state: Optional[LifecycleState] = LifecycleState.CREATED,
     schema: Optional[Schema] = None,
     partition_scheme: Optional[PartitionScheme] = None,
     sort_keys: Optional[SortScheme] = None,
@@ -758,10 +1287,11 @@ def create_table_version(
     table_properties: Optional[TableProperties] = None,
     supported_content_types: Optional[List[ContentType]] = None,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> Tuple[Table, TableVersion, Stream]:
     """
-    Create a table version with
+    Create a table version with the given or CREATED lifecycle state and an empty delta
     stream. Table versions may be schemaless and unpartitioned to improve write
     performance, or have their writes governed by a schema and partition scheme
     to improve data consistency and read performance.
@@ -771,23 +1301,34 @@ def create_table_version(

     Raises an error if the given namespace does not exist.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
     if not namespace_exists(
-        *args,
         namespace=namespace,
+        transaction=transaction,
+        *args,
         **kwargs,
     ):
-        raise
+        raise NamespaceNotFoundError(f"Namespace {namespace} does not exist")
+
+    # Validate schemes against schema
+    _validate_schemes_against_schema(schema, partition_scheme, sort_keys)
+
+    # coerce unspecified partition schemes to the unpartitioned scheme
+    partition_scheme = partition_scheme or UNPARTITIONED_SCHEME
+    # coerce unspecified sort schemes to the unsorted scheme
+    sort_keys = sort_keys or UNSORTED_SCHEME
     # check if a parent table and/or previous table version already exist
     prev_table_version = None
     prev_table = get_table(
-        *args,
         namespace=namespace,
         table_name=table_name,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
     if not prev_table:
         # no parent table exists, so we'll create it in this transaction
-        txn_type = TransactionType.APPEND
         table_txn_op_type = TransactionOperationType.CREATE
         prev_table = None
         new_table = Table.of(
@@ -796,7 +1337,6 @@ def create_table_version(
         table_version = table_version or DEFAULT_TABLE_VERSION
     else:
         # the parent table exists, so we'll update it in this transaction
-        txn_type = TransactionType.ALTER
         table_txn_op_type = TransactionOperationType.UPDATE
         new_table: Table = Metafile.update_for(prev_table)
         prev_table_version = prev_table.latest_table_version
@@ -813,14 +1353,18 @@ def create_table_version(
             expected_table_version,
         )
         if version_number != expected_version_number:
-            raise
+            raise TableValidationError(
                 f"Expected to create table version "
                 f"{expected_version_number} but found {version_number}.",
             )
-
-
+    if table_description is not None:
+        new_table.description = table_description
+    if table_properties is not None:
+        new_table.properties = table_properties
     new_table.latest_table_version = table_version
-
+    new_table.latest_active_table_version = (
+        table_version if lifecycle_state == LifecycleState.ACTIVE else None
+    )
     locator = TableVersionLocator.at(
         namespace=namespace,
         table_name=table_name,
@@ -835,10 +1379,10 @@ def create_table_version(
         content_types=supported_content_types,
         sort_scheme=sort_keys,
         watermark=None,
-        lifecycle_state=
+        lifecycle_state=lifecycle_state,
         schemas=[schema] if schema else None,
-        partition_schemes=[partition_scheme]
-        sort_schemes=[sort_keys]
+        partition_schemes=[partition_scheme],
+        sort_schemes=[sort_keys],
         previous_table_version=prev_table_version,
     )
     # create the table version's default deltacat stream in this transaction
@@ -854,31 +1398,68 @@ def create_table_version(
         previous_stream_id=None,
         watermark=None,
     )
-
-
-
-
-
-
-
-            ),
-            TransactionOperation.of(
-                operation_type=TransactionOperationType.CREATE,
-                dest_metafile=table_version,
-            ),
-            TransactionOperation.of(
-                operation_type=TransactionOperationType.CREATE,
-                dest_metafile=stream,
-            ),
-        ],
+    # Add operations to the transaction
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=table_txn_op_type,
+            dest_metafile=new_table,
+            src_metafile=prev_table,
+        ),
     )
-    transaction.
-
-
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=TransactionOperationType.CREATE,
+            dest_metafile=table_version,
+        ),
     )
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=TransactionOperationType.CREATE,
+            dest_metafile=stream,
+        ),
+    )
+
+    if commit_transaction:
+        transaction.seal()
     return new_table, table_version, stream


+def create_table(
+    namespace: str,
+    table_name: str,
+    description: Optional[str] = None,
+    properties: Optional[TableProperties] = None,
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> Table:
+    """
+    Create a new table. Raises an error if the given table already exists.
+    """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
+    new_table: Table = Table.of(
+        locator=TableLocator.at(namespace=namespace, table_name=table_name),
+        description=description,
+        properties=properties,
+    )
+    try:
+        transaction.step(
+            TransactionOperation.of(
+                operation_type=TransactionOperationType.CREATE,
+                dest_metafile=new_table,
+            ),
+        )
+    except ObjectAlreadyExistsError as e:
+        raise TableAlreadyExistsError(
+            f"Table {namespace}.{table_name} already exists"
+        ) from e
+
+    if commit_transaction:
+        transaction.seal()
+    return new_table
+
+
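The new `create_table` registers a bare table metafile and maps the transaction-level `ObjectAlreadyExistsError` onto the catalog-level `TableAlreadyExistsError`. A small create-if-absent usage sketch under that contract; names and values are placeholders:

```python
try:
    table = create_table(
        namespace="analytics",
        table_name="events",
        description="clickstream events",  # illustrative metadata
    )
except TableAlreadyExistsError:
    # Someone else created it first; fall back to reading it.
    table = get_table(namespace="analytics", table_name="events")
```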
 def update_table(
     namespace: str,
     table_name: str,
@@ -886,18 +1467,22 @@ def update_table(
     properties: Optional[TableProperties] = None,
     new_table_name: Optional[str] = None,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
-) ->
+) -> Table:
     """
     Update table metadata describing the table versions it contains. By default,
     a table's properties are empty, and its description is equal to that given
     when its first table version was created. Raises an error if the given
     table does not exist.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
     old_table = get_table(
-        *args,
         namespace=namespace,
         table_name=table_name,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
     if not old_table:
@@ -906,21 +1491,23 @@ def update_table(
     new_table.description = description or old_table.description
     new_table.properties = properties or old_table.properties
     new_table.table_name = new_table_name or old_table.table_name
-
-
-
+
+    try:
+        transaction.step(
             TransactionOperation.of(
                 operation_type=TransactionOperationType.UPDATE,
                 dest_metafile=new_table,
                 src_metafile=old_table,
-            )
-
-
-
-
-
-
-
+            ),
+        )
+    except ObjectAlreadyExistsError as e:
+        raise TableAlreadyExistsError(
+            f"Table {namespace}.{table_name} already exists"
+        ) from e
+
+    if commit_transaction:
+        transaction.seal()
+    return new_table


 def update_table_version(
@@ -934,42 +1521,53 @@ def update_table_version(
     partition_scheme: Optional[PartitionScheme] = None,
     sort_keys: Optional[SortScheme] = None,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
-) ->
+) -> Tuple[Optional[Table], TableVersion, Optional[Stream]]:
     """
     Update a table version. Notably, updating an unreleased table version's
-    lifecycle state to '
+    lifecycle state to 'ACTIVE' telegraphs that it is ready for external
     consumption, and causes all calls made to consume/produce streams,
     partitions, or deltas from/to its parent table to automatically resolve to
     this table version by default (i.e., when the client does not explicitly
     specify a different table version). Raises an error if the given table
     version does not exist.
+
+    Note that, to transition a table version from partitioned to unpartitioned,
+    partition_scheme must be explicitly set to UNPARTITIONED_SCHEME. Similarly
+    to transition a table version from sorted to unsorted, sort_keys must be
+    explicitly set to UNSORTED_SCHEME.
     """
-
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
     old_table_version = get_table_version(
-        *args,
         namespace=namespace,
         table_name=table_name,
         table_version=table_version,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
     if not old_table_version:
-        raise
+        raise TableVersionNotFoundError(
            f"Table version `{table_version}` does not exist for "
            f"table `{namespace}.{table_name}`."
        )
+
+    # If schema is not provided but partition_scheme or sort_keys are,
+    # validate against the existing schema
+    schema_to_validate = schema or old_table_version.schema
+    _validate_schemes_against_schema(schema_to_validate, partition_scheme, sort_keys)
+
     new_table_version: TableVersion = Metafile.update_for(old_table_version)
     new_table_version.state = lifecycle_state or old_table_version.state
-
-    #
-    # table version unless the user explicitly forces the update to this
-    # table version (i.e., at the cost of potentially breaking consumers).
+
+    # Caller is expected to do all necessary backwards compatibility schema checks
     update_schema = schema and not schema.equivalent_to(
         old_table_version.schema,
         True,
     )
     if update_schema and schema.id in [s.id for s in old_table_version.schemas]:
-        raise
+        raise TableValidationError(
            f"Schema ID `{schema.id}` already exists in "
            f"table version `{table_version}`."
        )
@@ -985,6 +1583,21 @@ def update_table_version(
     new_table_version.properties = (
         properties if properties is not None else old_table_version.properties
     )
+    new_supported_reader_types = new_table_version.read_table_property(
+        TableProperty.SUPPORTED_READER_TYPES
+    )
+    if new_supported_reader_types:
+        old_supported_reader_types = (
+            old_table_version.read_table_property(TableProperty.SUPPORTED_READER_TYPES)
+            or {}
+        )
+        added_supported_reader_types = set(new_supported_reader_types) - set(
+            old_supported_reader_types
+        )
+        if added_supported_reader_types:
+            raise TableValidationError(
+                f"Cannot add new supported reader types: {added_supported_reader_types}"
+            )
     new_table_version.partition_scheme = (
         partition_scheme or old_table_version.partition_scheme
     )
@@ -996,7 +1609,7 @@ def update_table_version(
     if update_partition_scheme and partition_scheme.id in [
         ps.id for ps in old_table_version.partition_schemes
     ]:
-        raise
+        raise TableValidationError(
            f"Partition scheme ID `{partition_scheme.id}` already exists in "
            f"table version `{table_version}`."
        )
@@ -1013,7 +1626,7 @@ def update_table_version(
     if update_sort_scheme and sort_keys.id in [
         sk.id for sk in old_table_version.sort_schemes
     ]:
-        raise
+        raise TableValidationError(
            f"Sort scheme ID `{sort_keys.id}` already exists in "
            f"table version `{table_version}`."
        )
@@ -1024,12 +1637,13 @@ def update_table_version(
         else old_table_version.sort_schemes
     )
     old_table = get_table(
-        *args,
         namespace=namespace,
         table_name=table_name,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
-
+    new_table: Table = None
     if (
         lifecycle_state == LifecycleState.ACTIVE
         and old_table_version.state != LifecycleState.ACTIVE
@@ -1044,50 +1658,52 @@ def update_table_version(
         _, new_version_number = TableVersion.parse_table_version(table_version)
         if old_version_number is None or old_version_number < new_version_number:
             # update the table's latest table version
-            new_table
+            new_table = Metafile.update_for(old_table)
             new_table.latest_active_table_version = table_version
-
+            transaction.step(
                 TransactionOperation.of(
                     operation_type=TransactionOperationType.UPDATE,
                     dest_metafile=new_table,
                     src_metafile=old_table,
-                )
+                ),
             )
-
-
-
-
-
-
-
+    try:
+        transaction.step(
+            TransactionOperation.of(
+                operation_type=TransactionOperationType.UPDATE,
+                dest_metafile=new_table_version,
+                src_metafile=old_table_version,
+            ),
+        )
+    except ObjectAlreadyExistsError as e:
+        raise TableVersionAlreadyExistsError(
+            f"Table version {namespace}.{table_name}.{table_version} already exists"
+        ) from e
+
     # TODO(pdames): Push changes down to non-deltacat streams via sync module.
     # Also copy sort scheme changes down to deltacat child stream?
+    new_stream: Stream = None
     if partition_scheme:
         old_stream = get_stream(
-            *args,
             namespace=namespace,
             table_name=table_name,
             table_version=table_version,
+            transaction=transaction,
+            *args,
             **kwargs,
         )
-        new_stream
+        new_stream = Metafile.update_for(old_stream)
         new_stream.partition_scheme = partition_scheme
-
+        transaction.step(
             TransactionOperation.of(
                 operation_type=TransactionOperationType.UPDATE,
                 dest_metafile=new_stream,
                 src_metafile=old_stream,
-            )
+            ),
         )
-
-
-
-    )
-    catalog_properties = get_catalog_properties(**kwargs)
-    transaction.commit(
-        catalog_root_dir=catalog_properties.root,
-        filesystem=catalog_properties.filesystem,
-    )
+    if commit_transaction:
+        transaction.seal()
+    return new_table, new_table_version, new_stream


 def stage_stream(
@@ -1096,6 +1712,7 @@ def stage_stream(
     table_version: Optional[str] = None,
     stream_format: StreamFormat = StreamFormat.DELTACAT,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> Stream:
     """
@@ -1107,21 +1724,28 @@ def stage_stream(
     Returns the staged stream. Raises an error if the table version does not
     exist.
     """
-
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
     if not table_version:
         table_version = _resolve_latest_active_table_version_id(
-            *args,
             namespace=namespace,
             table_name=table_name,
+            transaction=transaction,
+            *args,
             **kwargs,
         )
     table_version_meta = get_table_version(
-        *args,
         namespace=namespace,
         table_name=table_name,
         table_version=table_version,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
+    if not table_version_meta:
+        raise TableVersionNotFoundError(
+            f"Table version not found: {namespace}.{table_name}.{table_version}."
+        )
     locator = StreamLocator.at(
         namespace=namespace,
         table_name=table_name,
@@ -1137,39 +1761,37 @@ def stage_stream(
         watermark=None,
     )
     prev_stream = get_stream(
-        *args,
         namespace=stream.namespace,
         table_name=stream.table_name,
         table_version=stream.table_version,
         stream_format=stream.stream_format,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
     if prev_stream:
         if prev_stream.stream_id == stream.stream_id:
-            raise
+            raise TableValidationError(
                f"Stream to stage has the same ID as existing stream: {prev_stream}."
            )
         stream.previous_stream_id = prev_stream.stream_id
-
-
-
-
-
-
-            )
-        ],
-    )
-    catalog_properties = get_catalog_properties(**kwargs)
-    transaction.commit(
-        catalog_root_dir=catalog_properties.root,
-        filesystem=catalog_properties.filesystem,
+    # Add the operation to the transaction
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=TransactionOperationType.CREATE,
+            dest_metafile=stream,
+        ),
     )
+
+    if commit_transaction:
+        transaction.seal()
     return stream


 def commit_stream(
     stream: Stream,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> Stream:
     """
@@ -1177,6 +1799,8 @@ def commit_stream(
     previous stream registered for the same table version. Returns the
     committed stream.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
     if not stream.stream_id:
         raise ValueError("Stream ID to commit must be set to a staged stream ID.")
     if not stream.table_version_locator:
@@ -1185,83 +1809,71 @@ def commit_stream(
            "set to the parent of its staged stream ID."
        )
     prev_staged_stream = get_stream_by_id(
-        *args,
         table_version_locator=stream.table_version_locator,
         stream_id=stream.stream_id,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
     if not prev_staged_stream:
-        raise
+        raise StreamNotFoundError(
            f"Stream at table version {stream.table_version_locator} with ID "
            f"{stream.stream_id} not found."
        )
     if prev_staged_stream.state != CommitState.STAGED:
-        raise
+        raise TableValidationError(
            f"Expected to find a `{CommitState.STAGED}` stream at table version "
            f"{stream.table_version_locator} with ID {stream.stream_id},"
            f"but found a `{prev_staged_stream.state}` partition."
        )
-    if not prev_staged_stream:
-        raise ValueError(
-            f"Stream at table_version {stream.table_version_locator} with ID "
-            f"{stream.stream_id} not found."
-        )
-    if prev_staged_stream.state != CommitState.STAGED:
-        raise ValueError(
-            f"Expected to find a `{CommitState.STAGED}` stream at table version "
-            f"{stream.table_version_locator} with ID {stream.stream_id},"
-            f"but found a `{prev_staged_stream.state}` stream."
-        )
     stream: Stream = Metafile.update_for(prev_staged_stream)
     stream.state = CommitState.COMMITTED
     prev_committed_stream = get_stream(
-        *args,
         namespace=stream.namespace,
         table_name=stream.table_name,
         table_version=stream.table_version,
         stream_format=stream.stream_format,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
+    if prev_committed_stream:
+        # there's a previously committed stream, so update the transaction
+        # type to overwrite the previously committed stream
+        txn_op_type = TransactionOperationType.REPLACE
+    else:
+        txn_op_type = TransactionOperationType.UPDATE
+
     # the first transaction operation updates the staged stream commit state
-
-    txn_ops = [
+    transaction.step(
         TransactionOperation.of(
-            operation_type=
+            operation_type=txn_op_type,
            dest_metafile=stream,
            src_metafile=prev_staged_stream,
-        )
-
+        ),
+    )
     if prev_committed_stream:
         if prev_committed_stream.stream_id != stream.previous_stream_id:
-            raise
+            raise ConcurrentModificationError(
                f"Previous stream ID mismatch Expected "
                f"{stream.previous_stream_id} but found "
                f"{prev_committed_stream.stream_id}."
            )
         if prev_committed_stream.stream_id == stream.stream_id:
-            raise
+            raise TableValidationError(
                f"Stream to commit has the same ID as existing stream: {prev_committed_stream}."
            )
-        #
-        #
-
-        txn_type = TransactionType.OVERWRITE
-        txn_ops.append(
+        # add another transaction operation to replace the previously committed stream
+        # with the staged stream
+        transaction.step(
             TransactionOperation.of(
-                operation_type=
+                operation_type=txn_op_type,
                dest_metafile=stream,
                src_metafile=prev_committed_stream,
-            )
+            ),
         )
-
-
-        txn_operations=txn_ops,
-    )
-    catalog_properties = get_catalog_properties(**kwargs)
-    transaction.commit(
-        catalog_root_dir=catalog_properties.root,
-        filesystem=catalog_properties.filesystem,
-    )
+    if commit_transaction:
+        transaction.seal()
     return stream


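`commit_stream` now picks its operation type from catalog state read inside the same transaction: `REPLACE` when a committed stream already exists, `UPDATE` otherwise, with `ConcurrentModificationError` guarding the previous-stream-ID check. The stage-then-commit flow it supports looks roughly like the sketch below; namespace and table names are placeholders:

```python
# Stage a new stream revision, then promote it to the committed revision.
staged = stage_stream(
    namespace="analytics",
    table_name="events",
)
committed = commit_stream(stream=staged)
assert committed.state == CommitState.COMMITTED
```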
@@ -1271,6 +1883,7 @@ def delete_stream(
     table_version: Optional[str] = None,
     stream_format: StreamFormat = StreamFormat.DELTACAT,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """
@@ -1279,121 +1892,120 @@ def delete_stream(
     Resolves to the deltacat stream format if no stream format is given.
     Raises an error if the stream does not exist.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
     if not table_version:
         table_version = _resolve_latest_active_table_version_id(
-            *args,
             namespace=namespace,
             table_name=table_name,
+            transaction=transaction,
+            *args,
             **kwargs,
         )
     stream_to_delete = get_stream(
-        *args,
         namespace=namespace,
         table_name=table_name,
         table_version=table_version,
         stream_format=stream_format,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
     if not stream_to_delete:
-        raise
+        raise StreamNotFoundError(
            f"Stream to delete not found: {namespace}.{table_name}"
            f".{table_version}.{stream_format}."
        )
     else:
         stream_to_delete.state = CommitState.DEPRECATED
-
-
-
-
-
-
-            )
-        ],
-    )
-    catalog_properties = get_catalog_properties(**kwargs)
-    transaction.commit(
-        catalog_root_dir=catalog_properties.root,
-        filesystem=catalog_properties.filesystem,
+
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=TransactionOperationType.DELETE,
+            dest_metafile=stream_to_delete,
+        ),
     )

+    if commit_transaction:
+        transaction.seal()
+

 def delete_table(
     namespace: str,
-
+    table_name: str,
     purge: bool = False,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """
-    Drops the given table
-
-
-
-    TODO: Honor purge once garbage collection is implemented.
+    Drops the given table from the catalog. If purge is True, also removes
+    all data files associated with the table. Raises an error if the given table
+    does not exist.
     """
+    if purge:
+        raise NotImplementedError("Purge flag is not currently supported.")
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
     table: Optional[Table] = get_table(
-        *args,
         namespace=namespace,
-        table_name=
+        table_name=table_name,
+        transaction=transaction,
+        *args,
         **kwargs,
     )

     if not table:
-
+        # TODO(pdames): Refactor this so that it doesn't initialize Ray
+        raise TableNotFoundError(f"Table `{namespace}.{table_name}` does not exist.")

-    transaction
-
-
-
-        TransactionOperation.of(
-            operation_type=TransactionOperationType.DELETE,
-            dest_metafile=table,
-        )
-    ]
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=TransactionOperationType.DELETE,
+            dest_metafile=table,
         ),
     )

-
-
-        catalog_root_dir=catalog_properties.root,
-        filesystem=catalog_properties.filesystem,
-    )
+    if commit_transaction:
+        transaction.seal()


 def delete_namespace(
     namespace: str,
     purge: bool = False,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """
-    Drops the given
-
+    Drops the given namespace from the catalog. If purge is True, also removes
+    all data files associated with the namespace. Raises an error if the given
+    namespace does not exist.
     """
-
-
+    if purge:
+        raise NotImplementedError("Purge flag is not currently supported.")
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
+    namespace_obj: Optional[Namespace] = get_namespace(
         namespace=namespace,
+        transaction=transaction,
+        *args,
         **kwargs,
     )

-    if not
-        raise
+    if not namespace_obj:
+        raise NamespaceNotFoundError(f"Namespace `{namespace}` does not exist.")

-    transaction
-
-
-
-
-            dest_metafile=namespace,
-        )
-    ],
-    )
-    catalog_properties = get_catalog_properties(**kwargs)
-    transaction.commit(
-        catalog_root_dir=catalog_properties.root,
-        filesystem=catalog_properties.filesystem,
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=TransactionOperationType.DELETE,
+            dest_metafile=namespace_obj,
+        ),
     )

+    if commit_transaction:
+        transaction.seal()
+

 def get_stream_by_id(
     table_version_locator: TableVersionLocator,
@@ -1412,8 +2024,8 @@ def get_stream_by_id(
         stream_format=None,
     )
     return _latest(
-        *args,
         metafile=Stream.of(locator=locator, partition_scheme=None),
+        *args,
         **kwargs,
     )

@@ -1424,6 +2036,7 @@ def get_stream(
     table_version: Optional[str] = None,
     stream_format: StreamFormat = StreamFormat.DELTACAT,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> Optional[Stream]:
     """
@@ -1432,12 +2045,14 @@ def get_stream(
     Resolves to the DeltaCAT stream format if no stream format is given.
     Returns None if the table version or stream format does not exist.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
     if not table_version:
         table_version = _resolve_latest_active_table_version_id(
-            *args,
             namespace=namespace,
             table_name=table_name,
             fail_if_no_active_table_version=False,
+            transaction=transaction,
+            *args,
             **kwargs,
         )
     locator = StreamLocator.at(
@@ -1447,15 +2062,19 @@ def get_stream(
         stream_id=None,
         stream_format=stream_format,
     )
-
-        *args,
+    stream = _latest(
         metafile=Stream.of(
             locator=locator,
             partition_scheme=None,
             state=CommitState.COMMITTED,
         ),
+        transaction=transaction,
+        *args,
         **kwargs,
     )
+    if commit_transaction:
+        transaction.seal()
+    return stream


 def stream_exists(
@@ -1464,6 +2083,7 @@ def stream_exists(
     table_version: Optional[str] = None,
     stream_format: StreamFormat = StreamFormat.DELTACAT,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> Optional[Stream]:
     """
@@ -1472,14 +2092,18 @@ def stream_exists(
     Resolves to the DeltaCAT stream format if no stream format is given.
     Returns None if the table version or stream format does not exist.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
     if not table_version:
         table_version = _resolve_latest_active_table_version_id(
-            *args,
             namespace=namespace,
             table_name=table_name,
             fail_if_no_active_table_version=False,
+            transaction=transaction,
+            *args,
             **kwargs,
         )
+
+    # Try with the provided table name first
     locator = StreamLocator.at(
         namespace=namespace,
         table_name=table_name,
@@ -1487,15 +2111,19 @@ def stream_exists(
         stream_id=None,
         stream_format=stream_format,
     )
-
-        *args,
+    exists = _exists(
         metafile=Stream.of(
             locator=locator,
             partition_scheme=None,
             state=CommitState.COMMITTED,
         ),
+        transaction=transaction,
+        *args,
         **kwargs,
     )
+    if commit_transaction:
+        transaction.seal()
+    return exists


 def stage_partition(
@@ -1503,6 +2131,7 @@ def stage_partition(
     partition_values: Optional[PartitionValues] = None,
     partition_scheme_id: Optional[str] = None,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> Partition:
     """
@@ -1515,35 +2144,65 @@ def stage_partition(
     The partition_values must represent the results of transforms in a partition
     spec specified in the stream.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
     # TODO(pdames): Cache last retrieved metafile revisions in memory to resolve
     # potentially high cost of staging many partitions.
     table_version = get_table_version(
-        *args,
         namespace=stream.namespace,
         table_name=stream.table_name,
         table_version=stream.table_version,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
     if not table_version:
-        raise
+        raise TableVersionNotFoundError(
            f"Table version not found: {stream.namespace}.{stream.table_name}."
            f"{stream.table_version}."
        )
+    # Set partition_scheme_id to UNPARTITIONED_SCHEME_ID when partition_values
+    # is None or empty
+    if not partition_values:
+        partition_scheme_id = UNPARTITIONED_SCHEME_ID
+    # Use stream's partition scheme ID if none provided and partition_values
+    # are specified
+    elif partition_scheme_id is None:
+        partition_scheme_id = stream.partition_scheme.id
     if not table_version.partition_schemes or partition_scheme_id not in [
         ps.id for ps in table_version.partition_schemes
     ]:
-        raise
+        raise TableValidationError(
            f"Invalid partition scheme ID `{partition_scheme_id}` (not found "
            f"in parent table version `{stream.namespace}.{stream.table_name}"
            f".{table_version.table_version}` partition scheme IDs)."
        )
-    if stream.partition_scheme.id not in
+    if stream.partition_scheme.id not in [
+        ps.id for ps in table_version.partition_schemes
+    ]:
         # this should never happen, but just in case
-        raise
+        raise TableValidationError(
            f"Invalid stream partition scheme ID `{stream.partition_scheme.id}`"
-            f"in parent table version
+            f" (not found in parent table version "
+            f"`{stream.namespace}.{stream.table_name}"
            f".{table_version.table_version}` partition scheme IDs)."
        )
+
+    if partition_values:
+        if partition_scheme_id == UNPARTITIONED_SCHEME_ID:
+            raise TableValidationError(
+                "Partition values cannot be specified for unpartitioned tables"
+            )
+        # Validate partition values against partition scheme
+        partition_scheme = next(
+            ps for ps in table_version.partition_schemes if ps.id == partition_scheme_id
+        )
+        _validate_partition_values_against_scheme(
+            partition_values=partition_values,
+            partition_scheme=partition_scheme,
+            schema=table_version.schema,
+        )
+
     locator = PartitionLocator.of(
         stream_locator=stream.locator,
         partition_values=partition_values,
@@ -1551,42 +2210,40 @@ def stage_partition(
     )
     partition = Partition.of(
         locator=locator,
-        schema=table_version.schema,
         content_types=table_version.content_types,
         state=CommitState.STAGED,
         previous_stream_position=None,
-        partition_values=partition_values,
         previous_partition_id=None,
         stream_position=None,
         partition_scheme_id=partition_scheme_id,
     )
     prev_partition = get_partition(
-        *args,
         stream_locator=stream.locator,
         partition_values=partition_values,
         partition_scheme_id=partition_scheme_id,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
-    if prev_partition
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    catalog_properties = get_catalog_properties(**kwargs)
-    transaction.commit(
-        catalog_root_dir=catalog_properties.root,
-        filesystem=catalog_properties.filesystem,
+    prev_partition_id = prev_partition.partition_id if prev_partition else None
+
+    # TODO(pdames): Check all historic partitions for the same partition ID
+    if prev_partition_id == partition.partition_id:
+        raise TableValidationError(
+            f"Partition to stage has the same ID as previous partition: {prev_partition_id}."
+        )
+    partition.previous_partition_id = prev_partition_id
+
+    # Add the operation to the transaction
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=TransactionOperationType.CREATE,
+            dest_metafile=partition,
+        ),
    )
+
+    if commit_transaction:
+        transaction.seal()
     return partition


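`stage_partition` now resolves the partition scheme itself: empty `partition_values` force `UNPARTITIONED_SCHEME_ID`, a missing `partition_scheme_id` falls back to the stream's current scheme, and non-empty values are validated against the resolved scheme. The corresponding stage-then-commit flow, assuming `stream` is a committed stream of an unpartitioned table version:

```python
# No partition_values are given, so partition_scheme_id resolves to
# UNPARTITIONED_SCHEME_ID per the logic above.
staged = stage_partition(stream=stream)
committed = commit_partition(partition=staged)
assert committed.state == CommitState.COMMITTED
```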
@@ -1594,12 +2251,15 @@ def commit_partition(
     partition: Partition,
     previous_partition: Optional[Partition] = None,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> Partition:
     """
     Commits the staged partition to its associated table version stream,
     replacing any previous partition registered for the same stream and
-    partition values.
+    partition values. All values set on the input partition except compaction
+    round completion info will be overwritten with the values stored in the
+    staged partition.

     If previous partition is given then it will be replaced with its deltas
     prepended to the new partition being committed. Otherwise the latest
@@ -1613,6 +2273,8 @@ def commit_partition(
     specified, then the commit will be rejected if it does not match the actual
     ID of the partition being replaced.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
     if previous_partition:
         raise NotImplementedError(
             f"delta prepending from previous partition {previous_partition} "
@@ -1625,74 +2287,98 @@ def commit_partition(
        "Partition to commit must have its stream locator "
        "set to the parent of its staged partition ID."
     )
+
+    # Start a single multi-step transaction for all operations (both read and write)
+    # Step 1: Get the staged partition using transaction
     prev_staged_partition = get_partition_by_id(
-        *args,
         stream_locator=partition.stream_locator,
         partition_id=partition.partition_id,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
+
+    # Validate staged partition
     if not prev_staged_partition:
-        raise
+        raise PartitionNotFoundError(
            f"Partition at stream {partition.stream_locator} with ID "
            f"{partition.partition_id} not found."
        )
     if prev_staged_partition.state != CommitState.STAGED:
-        raise
+        raise TableValidationError(
            f"Expected to find a `{CommitState.STAGED}` partition at stream "
            f"{partition.stream_locator} with ID {partition.partition_id},"
            f"but found a `{prev_staged_partition.state}` partition."
        )
-
-
-    prev_committed_partition =
-
-
-
-
-
-
-
-
-    txn_ops = [
-        TransactionOperation.of(
-            operation_type=TransactionOperationType.UPDATE,
-            dest_metafile=partition,
-            src_metafile=prev_staged_partition,
+
+    # Step 2: Check for existing committed partition
+    prev_committed_partition = None
+    if partition.previous_partition_id is not None:
+        prev_committed_partition = get_partition(
+            stream_locator=partition.stream_locator,
+            partition_values=partition.partition_values,
+            partition_scheme_id=partition.partition_scheme_id,
+            transaction=transaction,
+            *args,
+            **kwargs,
         )
-
+
+    # Validate expected previous partition ID for race condition detection
     if prev_committed_partition:
+        logger.info(
+            f"Checking previous committed partition for conflicts: {prev_committed_partition}"
+        )
         if prev_committed_partition.partition_id != partition.previous_partition_id:
-            raise
-                f"
+            raise ConcurrentModificationError(
+                f"Concurrent modification detected: Expected committed partition "
                f"{partition.previous_partition_id} but found "
                f"{prev_committed_partition.partition_id}."
            )
-
+
+    if prev_committed_partition:
+        # Update transaction type based on what we found
+        txn_op_type = TransactionOperationType.REPLACE
         if prev_committed_partition.partition_id == partition.partition_id:
-            raise
+            raise TableValidationError(
                f"Partition to commit has the same ID as existing partition: "
                f"{prev_committed_partition}."
            )
-
-
-
-
-
+    else:
+        txn_op_type = TransactionOperationType.UPDATE
+
+    # Prepare the committed partition based on the staged partition
+    # Compaction round completion info (if any) is not set on the staged partition,
+    # so we need to save it from the input partition to commit.
+    input_partition_rci = partition.compaction_round_completion_info
+    partition: Partition = Metafile.update_for(prev_staged_partition)
+    partition.state = CommitState.COMMITTED
+    # Restore compaction round completion info (if any) from the input partition.
+    if input_partition_rci is not None:
+        partition.compaction_round_completion_info = input_partition_rci
+
+    # Step 4: Add write operations to the same transaction
+    # Always UPDATE the staged partition to committed state
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=txn_op_type,
+            dest_metafile=partition,
+            src_metafile=prev_staged_partition,
+        ),
+    )
+
+    # If there's a previously committed partition, we need to replace it too
+    if prev_committed_partition:
+        transaction.step(
             TransactionOperation.of(
-                operation_type=
+                operation_type=txn_op_type,
                dest_metafile=partition,
                src_metafile=prev_committed_partition,
-            )
+            ),
         )
-
-
-
-
-    catalog_properties = get_catalog_properties(**kwargs)
-    transaction.commit(
-        catalog_root_dir=catalog_properties.root,
-        filesystem=catalog_properties.filesystem,
-    )
+
+    if commit_transaction:
+        transaction.seal()
+
     return partition


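One subtlety worth calling out: `commit_partition` rebuilds the partition from the staged revision, so any field set by the caller would normally be discarded; compaction round completion info is explicitly saved and restored across that rebuild. A sketch of the behavior this guarantees, where `rci` is a placeholder for a round completion info object produced by compaction:

```python
staged = stage_partition(stream=stream)
staged.compaction_round_completion_info = rci  # placeholder RCI object
committed = commit_partition(partition=staged)
# Every other field comes from the stored staged revision, but the RCI
# set on the input partition survives the commit.
assert committed.compaction_round_completion_info == rci
```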
@@ -1701,6 +2387,7 @@ def delete_partition(
     partition_values: Optional[PartitionValues] = None,
     partition_scheme_id: Optional[str] = None,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """
@@ -1708,35 +2395,34 @@ def delete_partition(
     values should not be specified for unpartitioned tables. Raises an error
     if the partition does not exist.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
     partition_to_delete = get_partition(
-        *args,
         stream_locator=stream_locator,
         partition_values=partition_values,
         partition_scheme_id=partition_scheme_id,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
     if not partition_to_delete:
-        raise
+        raise PartitionNotFoundError(
            f"Partition with values {partition_values} and scheme "
            f"{partition_scheme_id} not found in stream: {stream_locator}"
        )
     else:
         partition_to_delete.state = CommitState.DEPRECATED
-
-
-
-
-
-
-            )
-        ],
-    )
-    catalog_properties = get_catalog_properties(**kwargs)
-    transaction.commit(
-        catalog_root_dir=catalog_properties.root,
-        filesystem=catalog_properties.filesystem,
+
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=TransactionOperationType.DELETE,
+            dest_metafile=partition_to_delete,
+        ),
     )

+    if commit_transaction:
+        transaction.seal()
+

 def get_partition_by_id(
     stream_locator: StreamLocator,
@@ -1755,12 +2441,11 @@ def get_partition_by_id(
         partition_id=partition_id,
     )
     return _latest(
-        *args,
         metafile=Partition.of(
             locator=locator,
-            schema=None,
             content_types=None,
         ),
+        *args,
         **kwargs,
     )

@@ -1770,6 +2455,7 @@ def get_partition(
     partition_values: Optional[PartitionValues] = None,
     partition_scheme_id: Optional[str] = None,
     *args,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> Optional[Partition]:
     """
@@ -1780,35 +2466,124 @@ def get_partition(
     resolves to the table version's current partition scheme by default.
     Raises an error if the given stream locator does not exist.
     """
-
-
-        partition_values=partition_values,
-        partition_id=None,
-    )
-    if not partition_scheme_id:
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    if not partition_scheme_id or not stream_locator.stream_id:
         # resolve latest partition scheme from the current
         # revision of its `deltacat` stream
         stream = get_stream(
-            *args,
             namespace=stream_locator.namespace,
             table_name=stream_locator.table_name,
             table_version=stream_locator.table_version,
+            transaction=transaction,
+            *args,
             **kwargs,
         )
         if not stream:
-            raise
+            raise StreamNotFoundError(f"Stream {stream_locator} not found.")
         partition_scheme_id = stream.partition_scheme.id
-
-
+        # ensure that we always use a fully qualified stream locator
+        stream_locator = stream.locator
+    locator = PartitionLocator.of(
+        stream_locator=stream_locator,
+        partition_values=partition_values,
+        partition_id=None,
+    )
+    partition = _latest(
         metafile=Partition.of(
             locator=locator,
-            schema=None,
             content_types=None,
             state=CommitState.COMMITTED,
             partition_scheme_id=partition_scheme_id,
         ),
+        transaction=transaction,
+        *args,
+        **kwargs,
+    )
+    if commit_transaction:
+        transaction.seal()
+    return partition
+
+
+def _write_table_slices(
+    table: Union[LocalTable, LocalDataset, DistributedDataset],
+    partition_id: str,
+    max_records_per_entry: Optional[int],
+    table_writer_fn: Callable,
+    table_slicer_fn: Callable,
+    content_type: ContentType = ContentType.PARQUET,
+    entry_params: Optional[EntryParams] = None,
+    entry_type: Optional[EntryType] = EntryType.DATA,
+    table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
+) -> ManifestEntryList:
+    catalog_properties = get_catalog_properties(**kwargs)
+    manifest_entries = ManifestEntryList()
+    # LocalDataset is a special case to upload iteratively
+    tables = [t for t in table] if isinstance(table, list) else [table]
+    filesystem = catalog_properties.filesystem
+    data_dir_path = posixpath.join(
+        catalog_properties.root,
+        DATA_FILE_DIR_NAME,
+        partition_id,
+    )
+    filesystem.create_dir(data_dir_path, recursive=True)
+    for t in tables:
+        manifest_entries.extend(
+            write_sliced_table(
+                t,
+                data_dir_path,
+                filesystem,
+                max_records_per_entry,
+                table_writer_fn,
+                table_slicer_fn,
+                table_writer_kwargs,
+                content_type,
+                entry_params,
+                entry_type,
+            )
+        )
+    return manifest_entries
+
+
+def _write_table(
+    partition_id: str,
+    table: Union[LocalTable, LocalDataset, DistributedDataset],
+    max_records_per_entry: Optional[int] = None,
+    author: Optional[ManifestAuthor] = None,
+    content_type: ContentType = ContentType.PARQUET,
+    entry_params: Optional[EntryParams] = None,
+    entry_type: Optional[EntryType] = EntryType.DATA,
+    write_table_slices_fn: Optional[Callable] = _write_table_slices,
+    table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
+) -> Manifest:
+    """
+    Writes the given table to 1 or more files and returns a
+    Redshift manifest pointing to the uploaded files.
+    """
+    table_writer_fn = get_table_writer(table)
+    table_slicer_fn = get_table_slicer(table)
+
+    manifest_entries = write_table_slices_fn(
+        table,
+        partition_id,
+        max_records_per_entry,
+        table_writer_fn,
+        table_slicer_fn,
+        content_type,
+        entry_params,
+        entry_type,
+        table_writer_kwargs,
         **kwargs,
     )
+    manifest = Manifest.of(
+        entries=manifest_entries,
+        author=author,
+        uuid=str(uuid.uuid4()),
+        entry_type=entry_type,
+        entry_params=entry_params,
+    )
+    return manifest
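`_write_table` composes the slice-writing helper with a writer and slicer resolved from the input table's type, then wraps the resulting entries in a manifest. Assuming a PyArrow table and an already-staged partition, its use looks roughly like the sketch below; all values are placeholders, and `_write_table` is a private helper normally reached through `stage_delta`:

```python
import pyarrow as pa

data = pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]})
# max_records_per_entry=2 splits the 3 rows across 2 manifest entries.
manifest = _write_table(
    partition_id=staged_partition.partition_id,  # a staged Partition
    table=data,
    max_records_per_entry=2,
)
```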
 def stage_delta(
@@ -1818,28 +2593,82 @@ def stage_delta(
     max_records_per_entry: Optional[int] = None,
     author: Optional[ManifestAuthor] = None,
     properties: Optional[DeltaProperties] = None,
-
+    table_writer_kwargs: Optional[Dict[str, Any]] = None,
     content_type: ContentType = ContentType.PARQUET,
     entry_params: Optional[EntryParams] = None,
+    entry_type: Optional[EntryType] = EntryType.DATA,
+    write_table_slices_fn: Optional[Callable] = _write_table_slices,
+    schema: Optional[Schema] = None,
+    sort_scheme_id: Optional[str] = None,
     *args,
     **kwargs,
 ) -> Delta:
     """
-    Writes the given
+    Writes the given dataset to 1 or more files. Returns an unregistered
     delta whose manifest entries point to the uploaded files. Applies any
     schema consistency policies configured for the parent table version.
-
-    The partition spec will be used to split the input table into
-    multiple files. Optionally, partition_values can be provided to avoid
-    this method to recompute partition_values from the provided data.
-
-    Raises an error if the provided data does not conform to a unique ordered
-    list of partition_values
     """
-
+    # TODO(pdames): Validate that equality delete entry types either have
+    # entry params specified, or are being added to a table with merge keys.
+    if not partition.is_supported_content_type(content_type):
+        raise TableValidationError(
+            f"Content type {content_type} is not supported by "
+            f"partition: {partition}"
+        )
+    if partition.state == CommitState.DEPRECATED:
+        raise TableValidationError(
+            f"Cannot stage delta to {partition.state} partition: {partition}",
+        )
+    previous_stream_position: Optional[int] = partition.stream_position
+
+    # Handle the schema parameter and add it to table_writer_kwargs if available
+    table_writer_kwargs = table_writer_kwargs or {}
+
+    # Extract schema_id from the schema if it's a DeltaCAT Schema
+    schema_id = None
+    if isinstance(schema, Schema):
+        schema_id = schema.id
+        table_writer_kwargs["schema_id"] = schema_id
+        # Add the PyArrow schema to table_writer_kwargs if not already present
+        if "schema" not in table_writer_kwargs:
+            table_writer_kwargs["schema"] = schema.arrow
+    elif schema is not None and "schema" not in table_writer_kwargs:
+        # For PyArrow schemas or other types, add directly
+        table_writer_kwargs["schema"] = schema
+
+    # Add sort_scheme_id to table_writer_kwargs for manifest entry creation
+    if sort_scheme_id is not None:
+        table_writer_kwargs["sort_scheme_id"] = sort_scheme_id
+
+    manifest: Manifest = _write_table(
+        partition.partition_id,
+        data,
+        max_records_per_entry,
+        author,
+        content_type,
+        entry_params,
+        entry_type,
+        write_table_slices_fn,
+        table_writer_kwargs,
+        **kwargs,
+    )
+    staged_delta: Delta = Delta.of(
+        locator=DeltaLocator.of(partition.locator, None),
+        delta_type=delta_type,
+        meta=manifest.meta,
+        properties=properties,
+        manifest=manifest,
+        previous_stream_position=previous_stream_position,
+    )
+    return staged_delta
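For context, a hedged usage sketch of the staging path above. The `partition` object, catalog wiring, and column names are assumptions rather than taken from the source, and we assume the function's leading positional parameters are the dataset and target partition, as elsewhere in DeltaCAT's storage interface:

```python
import pyarrow as pa

# `partition` is assumed to be an already-staged Partition from the catalog.
data = pa.table({"pk": [1, 2], "value": ["x", "y"]})
staged = stage_delta(
    data,
    partition,
    content_type=ContentType.PARQUET,
    max_records_per_entry=1_000_000,
)
# `staged` remains unregistered until commit_delta() is called on it.
```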


-def commit_delta(
+def commit_delta(
+    delta: Delta,
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> Delta:
     """
     Registers a new delta with its associated target table version and
     partition. Returns the registered delta. If the delta's previous stream
@@ -1848,7 +2677,72 @@ def commit_delta(delta: Delta, *args, **kwargs) -> Delta:
     stream position is specified, it must be greater than the latest stream
     position in the target partition.
     """
-
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+
+    delta: Delta = Metafile.update_for(delta)
+    delta_type: Optional[DeltaType] = delta.type
+    resolved_delta_type = delta_type if delta_type is not None else DeltaType.UPSERT
+    delta.type = resolved_delta_type
+    delta.properties = kwargs.get("properties") or delta.properties
+
+    if delta.partition_id:
+        parent_partition = get_partition_by_id(
+            stream_locator=delta.stream_locator,
+            partition_id=delta.partition_id,
+            transaction=transaction,
+            *args,
+            **kwargs,
+        )
+    else:
+        parent_partition = get_partition(
+            stream_locator=delta.stream_locator,
+            partition_values=delta.partition_values,
+            transaction=transaction,
+            *args,
+            **kwargs,
+        )
+    if not parent_partition:
+        raise PartitionNotFoundError(f"Partition not found: {delta.locator}")
+    # ensure that we always use a fully qualified partition locator
+    delta.locator.partition_locator = parent_partition.locator
+    # resolve the delta's stream position
+    delta.previous_stream_position = parent_partition.stream_position or 0
+    if delta.stream_position is not None:
+        if delta.stream_position <= delta.previous_stream_position:
+            # manually specified delta stream positions must be greater than
+            # the previous stream position
+            raise TableValidationError(
+                f"Delta stream position {delta.stream_position} must be "
+                f"greater than previous stream position "
+                f"{delta.previous_stream_position}"
+            )
+    else:
+        delta.locator.stream_position = delta.previous_stream_position + 1
+
+    # update the parent partition's stream position
+    new_parent_partition: Partition = Metafile.update_for(parent_partition)
+    new_parent_partition.stream_position = delta.locator.stream_position
+
+    # add operations to the transaction:
+    # the 1st operation creates the delta
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=TransactionOperationType.CREATE,
+            dest_metafile=delta,
+        ),
+    )
+    # the 2nd operation alters the stream position of the partition
+    transaction.step(
+        TransactionOperation.of(
+            operation_type=TransactionOperationType.UPDATE,
+            dest_metafile=new_parent_partition,
+            src_metafile=parent_partition,
+        ),
+    )
+
+    if commit_transaction:
+        transaction.seal()
+    return delta
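The stream-position bookkeeping above follows a simple rule: an explicitly supplied position must exceed the parent partition's latest position, and an omitted one is auto-assigned to the next position. A self-contained restatement of just that rule (illustrative arithmetic, not DeltaCAT code):

```python
from typing import Optional


def resolve_stream_position(previous: int, requested: Optional[int]) -> int:
    """Mirror of the validation logic in commit_delta above."""
    if requested is not None:
        if requested <= previous:
            raise ValueError(f"stream position {requested} must be > {previous}")
        return requested
    return previous + 1


assert resolve_stream_position(5, None) == 6  # auto-assigned
assert resolve_stream_position(5, 9) == 9     # explicit, valid
```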


 def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
@@ -1857,8 +2751,8 @@ def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
     None if the given namespace does not exist.
     """
     return _latest(
-        *args,
         metafile=Namespace.of(NamespaceLocator.of(namespace)),
+        *args,
         **kwargs,
     )

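The recurring change in these hunks moves `*args` after the `metafile=` keyword argument. This is purely stylistic at the call site: Python binds unpacked positional arguments before keyword arguments regardless of where `*args` appears in the call, as this minimal check shows:

```python
def f(a, b=None, metafile=None):
    return (a, b, metafile)


args = (1, 2)
# Both call orders bind identically: positionals first, then keywords.
assert f(metafile="m", *args) == f(*args, metafile="m") == (1, 2, "m")
```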
@@ -1868,33 +2762,43 @@ def namespace_exists(namespace: str, *args, **kwargs) -> bool:
     Returns True if the given table namespace exists, False if not.
     """
     return _exists(
-        *args,
         metafile=Namespace.of(NamespaceLocator.of(namespace)),
+        *args,
         **kwargs,
     )


-def get_table(
+def get_table(
+    namespace: str,
+    table_name: str,
+    *args,
+    **kwargs,
+) -> Optional[Table]:
     """
     Gets table metadata for the specified table. Returns None if the given
     table does not exist.
     """
     locator = TableLocator.at(namespace=namespace, table_name=table_name)
     return _latest(
-        *args,
         metafile=Table.of(locator=locator),
+        *args,
         **kwargs,
     )


-def table_exists(
+def table_exists(
+    namespace: str,
+    table_name: str,
+    *args,
+    **kwargs,
+) -> bool:
     """
     Returns True if the given table exists, False if not.
     """
     locator = TableLocator.at(namespace=namespace, table_name=table_name)
     return _exists(
-        *args,
         metafile=Table.of(locator=locator),
+        *args,
         **kwargs,
     )

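A hedged sketch of how the namespace and table accessors above compose; the names `my_ns` and `events` are placeholders, and any catalog context passed through `**kwargs` is assumed to be configured by the caller:

```python
if namespace_exists("my_ns"):
    table = get_table("my_ns", "events")
    if table is None:
        # get_table returned None, so the table should not exist either
        assert not table_exists("my_ns", "events")
```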
@@ -1920,67 +2824,87 @@ def get_table_version(
         schema=None,
     )
     return _latest(
-        *args,
         metafile=table_version,
+        *args,
         **kwargs,
     )


 def get_latest_table_version(
-    namespace: str,
+    namespace: str,
+    table_name: str,
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> Optional[TableVersion]:
     """
     Gets table version metadata for the latest version of the specified table.
     Returns None if no table version exists for the given table. Raises
     an error if the given table doesn't exist.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
     table_version_id = _resolve_latest_table_version_id(
-        *args,
         namespace=namespace,
         table_name=table_name,
         fail_if_no_active_table_version=False,
+        transaction=transaction,
+        *args,
         **kwargs,
     )

-
+    table_version = (
         get_table_version(
-            *args,
             namespace=namespace,
             table_name=table_name,
             table_version=table_version_id,
+            transaction=transaction,
+            *args,
             **kwargs,
         )
         if table_version_id
         else None
     )
+    if commit_transaction:
+        transaction.seal()
+    return table_version
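`setup_transaction`, as used above, appears to return the transaction to use plus a flag indicating whether this call created it and must therefore seal it; a caller-supplied transaction is reused and left open for the caller to commit. A standalone sketch of that assumed contract, with a stub transaction class invented for illustration:

```python
from typing import Optional, Tuple


class StubTransaction:
    """Illustrative stand-in for a DeltaCAT Transaction."""

    def __init__(self) -> None:
        self.sealed = False

    def seal(self) -> None:
        self.sealed = True


def setup_transaction_sketch(
    transaction: Optional[StubTransaction],
) -> Tuple[StubTransaction, bool]:
    if transaction is not None:
        return transaction, False  # reuse; the caller seals it
    return StubTransaction(), True  # created here; seal before returning


txn, commit = setup_transaction_sketch(None)
if commit:
    txn.seal()
assert txn.sealed
```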


 def get_latest_active_table_version(
-    namespace: str,
+    namespace: str,
+    table_name: str,
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> Optional[TableVersion]:
     """
     Gets table version metadata for the latest active version of the specified
     table. Returns None if no active table version exists for the given table.
     Raises an error if the given table doesn't exist.
     """
+    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
     table_version_id = _resolve_latest_active_table_version_id(
-        *args,
         namespace=namespace,
         table_name=table_name,
         fail_if_no_active_table_version=False,
+        transaction=transaction,
+        *args,
         **kwargs,
     )
-
+    table_version = (
         get_table_version(
-            *args,
             namespace=namespace,
             table_name=table_name,
             table_version=table_version_id,
+            transaction=transaction,
+            *args,
             **kwargs,
         )
         if table_version_id
         else None
     )
+    if commit_transaction:
+        transaction.seal()
+    return table_version


 def get_table_version_column_names(
@@ -2002,6 +2926,8 @@ def get_table_version_column_names(
         namespace=namespace,
         table_name=table_name,
         table_version=table_version,
+        *args,
+        **kwargs,
     )
     return schema.arrow.names if schema else None

@@ -2018,7 +2944,7 @@ def get_table_version_schema(
     table version if none is specified. Returns None if the table version is
     schemaless. Raises an error if the table version does not exist.
     """
-
+    table_version_meta = (
         get_table_version(
             *args,
             namespace=namespace,
@@ -2034,7 +2960,7 @@ def get_table_version_schema(
             **kwargs,
         )
     )
-    return
+    return table_version_meta.schema


 def table_version_exists(
@@ -2065,13 +2991,40 @@ def table_version_exists(

 def can_categorize(e: BaseException, *args, **kwargs) -> bool:
     """
-
+    Returns True if the input error originated from the storage
+    implementation layer and can be categorized under an
+    existing DeltaCatError. The "categorize_errors" decorator
+    uses this to determine if an unknown error from the storage
+    implementation can be categorized prior to casting it to
+    the equivalent DeltaCatError via `raise_categorized_error`.
     """
-
+
+    # DeltaCAT native storage can only categorize DeltaCatError
+    # (i.e., this is effectively a no-op for native storage)
+    if isinstance(e, DeltaCatError):
+        return True
+    else:
+        return False


 def raise_categorized_error(e: BaseException, *args, **kwargs):
     """
-
-
-
+    Casts a categorizable error that originated from the storage
+    implementation layer to its equivalent DeltaCatError
+    for uniform handling (e.g., determining whether an error
+    is retryable or not) via the "categorize_errors" decorator.
+    Raises an UnclassifiedDeltaCatError from the input exception
+    if the error cannot be categorized.
+    """
+
+    # DeltaCAT native storage can only categorize DeltaCatError
+    # (i.e., this is effectively a no-op for native storage)
+    logger.info(f"Categorizing exception: {e}")
+    categorized = e if isinstance(e, DeltaCatError) else None
+    if categorized is not None:
+        # already categorized; re-raise the DeltaCatError as-is
+        raise categorized
+
+    logger.warning(f"Could not classify {type(e).__name__}: {e}")
+    raise UnclassifiedDeltaCatError(
+        f"Failed to classify error {type(e).__name__}: {e}"
+    ) from e
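Taken together, `can_categorize` and `raise_categorized_error` suggest how a `categorize_errors` decorator could route storage-layer failures. The decorator below is an assumed sketch of that wiring, not the library's implementation; it only relies on the two functions defined above:

```python
import functools


def categorize_errors_sketch(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except BaseException as e:
            if can_categorize(e):
                # always raises: either the categorized DeltaCatError or
                # an UnclassifiedDeltaCatError wrapping `e`
                raise_categorized_error(e)
            raise  # not categorizable; propagate the original error
    return wrapper
```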