deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,2077 @@
|
|
1
|
+
import uuid
|
2
|
+
|
3
|
+
from typing import Any, Callable, Dict, List, Optional, Union, Tuple
|
4
|
+
|
5
|
+
from deltacat.catalog import get_catalog_properties
|
6
|
+
from deltacat.constants import DEFAULT_TABLE_VERSION
|
7
|
+
from deltacat.exceptions import TableNotFoundError
|
8
|
+
from deltacat.storage.model.manifest import (
|
9
|
+
EntryParams,
|
10
|
+
ManifestAuthor,
|
11
|
+
)
|
12
|
+
from deltacat.storage.model.delta import (
|
13
|
+
Delta,
|
14
|
+
DeltaLocator,
|
15
|
+
DeltaProperties,
|
16
|
+
DeltaType,
|
17
|
+
)
|
18
|
+
from deltacat.storage.model.types import (
|
19
|
+
CommitState,
|
20
|
+
DistributedDataset,
|
21
|
+
LifecycleState,
|
22
|
+
LocalDataset,
|
23
|
+
LocalTable,
|
24
|
+
TransactionType,
|
25
|
+
TransactionOperationType,
|
26
|
+
StreamFormat,
|
27
|
+
)
|
28
|
+
from deltacat.storage.model.list_result import ListResult
|
29
|
+
from deltacat.storage.model.namespace import (
|
30
|
+
Namespace,
|
31
|
+
NamespaceLocator,
|
32
|
+
NamespaceProperties,
|
33
|
+
)
|
34
|
+
from deltacat.storage.model.partition import (
|
35
|
+
Partition,
|
36
|
+
PartitionLocator,
|
37
|
+
PartitionScheme,
|
38
|
+
PartitionValues,
|
39
|
+
UNPARTITIONED_SCHEME_ID,
|
40
|
+
PartitionLocatorAlias,
|
41
|
+
)
|
42
|
+
from deltacat.storage.model.schema import (
|
43
|
+
Schema,
|
44
|
+
)
|
45
|
+
from deltacat.storage.model.sort_key import (
|
46
|
+
SortScheme,
|
47
|
+
)
|
48
|
+
from deltacat.storage.model.stream import (
|
49
|
+
Stream,
|
50
|
+
StreamLocator,
|
51
|
+
)
|
52
|
+
from deltacat.storage.model.table import (
|
53
|
+
Table,
|
54
|
+
TableProperties,
|
55
|
+
TableLocator,
|
56
|
+
)
|
57
|
+
from deltacat.storage.model.table_version import (
|
58
|
+
TableVersion,
|
59
|
+
TableVersionProperties,
|
60
|
+
TableVersionLocator,
|
61
|
+
)
|
62
|
+
from deltacat.storage.model.metafile import (
|
63
|
+
Metafile,
|
64
|
+
)
|
65
|
+
from deltacat.storage.model.transaction import (
|
66
|
+
TransactionOperation,
|
67
|
+
Transaction,
|
68
|
+
TransactionOperationList,
|
69
|
+
)
|
70
|
+
from deltacat.storage.model.manifest import Manifest
|
71
|
+
from deltacat.types.media import (
|
72
|
+
ContentType,
|
73
|
+
DistributedDatasetType,
|
74
|
+
StorageType,
|
75
|
+
TableType,
|
76
|
+
)
|
77
|
+
from deltacat.utils.common import ReadKwargsProvider
|
78
|
+
|
79
|
+
|
80
|
+
def _list(
    metafile: Metafile,
    txn_op_type: TransactionOperationType,
    *args,
    **kwargs,
) -> ListResult[Metafile]:
    """
    Runs a read transaction holding exactly one operation of the given type
    against the given metafile, and returns that operation's list result.

    The catalog root directory and filesystem are taken from the catalog
    properties resolved from `kwargs`. An optional `limit` keyword argument is
    forwarded to the operation as its read limit. Positional `args` are
    accepted but not used here.
    """
    props = get_catalog_properties(**kwargs)
    # A missing or falsy "limit" (e.g. None or 0) is normalized to None.
    max_items = kwargs.get("limit") or None
    read_op = TransactionOperation.of(
        operation_type=txn_op_type,
        dest_metafile=metafile,
        read_limit=max_items,
    )
    read_txn = Transaction.of(
        txn_type=TransactionType.READ,
        txn_operations=[read_op],
    )
    results_by_op = read_txn.commit(
        catalog_root_dir=props.root,
        filesystem=props.filesystem,
    )
    # Only one operation was submitted, so only its result is of interest.
    return results_by_op[0]
|
103
|
+
|
104
|
+
|
105
|
+
def _latest(
    metafile: Metafile,
    *args,
    **kwargs,
) -> Optional[Metafile]:
    """
    Issues a READ_LATEST read for the given metafile and returns the first
    item of the result, or None if the read produced no items.
    """
    found = _list(
        *args,
        metafile=metafile,
        txn_op_type=TransactionOperationType.READ_LATEST,
        **kwargs,
    ).all_items()
    if not found:
        return None
    return found[0]
|
118
|
+
|
119
|
+
|
120
|
+
def _exists(
    metafile: Metafile,
    *args,
    **kwargs,
) -> bool:
    """
    Issues a READ_EXISTS read for the given metafile.

    Returns True if the read produced at least one item and False otherwise.
    (The return annotation previously claimed `Optional[Metafile]`, but a
    boolean has always been returned.)
    """
    list_results = _list(
        *args,
        metafile=metafile,
        txn_op_type=TransactionOperationType.READ_EXISTS,
        **kwargs,
    )
    return bool(list_results.all_items())
|
133
|
+
|
134
|
+
|
135
|
+
def _resolve_partition_locator_alias(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    partition_values: Optional[PartitionValues] = None,
    partition_scheme_id: Optional[str] = None,
    *args,
    **kwargs,
) -> PartitionLocatorAlias:
    """
    Builds the locator alias of the partition identified by the given table
    coordinates, partition values, and partition scheme ID.

    Without partition values, the unpartitioned scheme ID is used. With
    partition values but no scheme ID, the stream is looked up via
    `get_stream` and its partition scheme ID and locator are used. Raises a
    ValueError if that stream lookup returns nothing.
    """
    # TODO(pdames): A read shouldn't initiate N transactions that
    #  read against different catalog snapshots. To resolve this, add
    #  new "start", "step", and "end" methods to Transaction that
    #  support starting a txn, defining and executing a txn op, retrieve
    #  its results, then define and execute the next txn op. When
    #  stepping through a transaction its txn heartbeat timeout should
    #  be set manually.
    resolved_locator = None
    scheme_id = partition_scheme_id
    if partition_values:
        if not scheme_id:
            # resolve latest partition scheme from the current
            # revision of its `deltacat` stream
            stream = get_stream(
                *args,
                namespace=namespace,
                table_name=table_name,
                table_version=table_version,
                **kwargs,
            )
            if not stream:
                raise ValueError(
                    f"Failed to resolve latest partition scheme for "
                    f"`{namespace}.{table_name}` at table version "
                    f"`{table_version or 'latest'}` (no stream found)."
                )
            resolved_locator = PartitionLocator.of(
                stream_locator=stream.locator,
                partition_values=partition_values,
                partition_id=None,
            )
            scheme_id = stream.partition_scheme.id
    else:
        scheme_id = UNPARTITIONED_SCHEME_ID
    if not resolved_locator:
        # No stream lookup happened, so address the partition by name only.
        resolved_locator = PartitionLocator.at(
            namespace=namespace,
            table_name=table_name,
            table_version=table_version,
            stream_id=None,
            stream_format=StreamFormat.DELTACAT,
            partition_values=partition_values,
            partition_id=None,
        )
    return Partition.of(
        locator=resolved_locator,
        schema=None,
        content_types=None,
        partition_scheme_id=scheme_id,
    ).locator_alias
|
193
|
+
|
194
|
+
|
195
|
+
def _resolve_latest_active_table_version_id(
    namespace: str,
    table_name: str,
    fail_if_no_active_table_version: bool = True,
    *args,
    **kwargs,
) -> Optional[str]:
    """
    Returns the `latest_active_table_version` recorded on the given table.

    Raises a ValueError if the table is not found. Also raises a ValueError
    if the table has no latest active table version and
    `fail_if_no_active_table_version` is truthy; otherwise the (possibly
    empty) value is returned as-is.

    The flag was previously declared as `fail_if_no_active_table_version:
    True`, which used `True` as an annotation and left the parameter without
    a default. It is now a `bool` defaulting to True, so existing positional
    and keyword callers keep working.
    """
    table = get_table(
        *args,
        namespace=namespace,
        table_name=table_name,
        **kwargs,
    )
    if not table:
        raise ValueError(f"Table does not exist: {namespace}.{table_name}")
    if fail_if_no_active_table_version and not table.latest_active_table_version:
        raise ValueError(f"Table has no active table version: {namespace}.{table_name}")
    return table.latest_active_table_version
|
213
|
+
|
214
|
+
|
215
|
+
def _resolve_latest_table_version_id(
    namespace: str,
    table_name: str,
    fail_if_no_active_table_version: bool = True,
    *args,
    **kwargs,
) -> Optional[str]:
    """
    Returns the `latest_table_version` recorded on the given table.

    Raises a ValueError if the table is not found. Also raises a ValueError
    if the table has no latest table version and
    `fail_if_no_active_table_version` is truthy; otherwise the (possibly
    empty) value is returned as-is.

    The flag keeps its original name for caller compatibility, even though
    this function checks the latest table version rather than the latest
    active one. It was previously declared with `True` as its annotation and
    no default; it is now a `bool` defaulting to True.
    """
    table = get_table(
        *args,
        namespace=namespace,
        table_name=table_name,
        **kwargs,
    )
    if not table:
        raise ValueError(f"Table does not exist: {namespace}.{table_name}")
    if fail_if_no_active_table_version and not table.latest_table_version:
        raise ValueError(f"Table has no table version: {namespace}.{table_name}")
    return table.latest_table_version
|
233
|
+
|
234
|
+
|
235
|
+
def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
    """
    Lists a page of table namespaces, returned as list result items.
    """
    # The namespace name is a placeholder; presumably a sibling read only
    # uses the locator to find the parent level (confirm in Transaction).
    probe = Namespace.of(NamespaceLocator.of("placeholder"))
    return _list(
        *args,
        metafile=probe,
        txn_op_type=TransactionOperationType.READ_SIBLINGS,
        **kwargs,
    )
|
246
|
+
|
247
|
+
|
248
|
+
def list_tables(namespace: str, *args, **kwargs) -> ListResult[Table]:
    """
    Lists a page of tables in the given namespace, returned as list result
    items. Per the original contract, an error is raised if the namespace
    does not exist.
    """
    # The table name is a placeholder for the sibling read.
    probe = Table.of(
        locator=TableLocator.at(namespace=namespace, table_name="placeholder"),
    )
    return _list(
        *args,
        metafile=probe,
        txn_op_type=TransactionOperationType.READ_SIBLINGS,
        **kwargs,
    )
|
260
|
+
|
261
|
+
|
262
|
+
def list_table_versions(
    namespace: str,
    table_name: str,
    *args,
    **kwargs,
) -> ListResult[TableVersion]:
    """
    Lists a page of table versions of the given table, returned as list
    result items. Per the original contract, an error is raised if the table
    does not exist.
    """
    # "placeholder.0" stands in for a real table version ID in the
    # sibling read.
    probe = TableVersion.of(
        locator=TableVersionLocator.at(
            namespace=namespace,
            table_name=table_name,
            table_version="placeholder.0",
        ),
        schema=None,
    )
    return _list(
        *args,
        metafile=probe,
        txn_op_type=TransactionOperationType.READ_SIBLINGS,
        **kwargs,
    )
|
288
|
+
|
289
|
+
|
290
|
+
def list_streams(
    namespace: str,
    table_name: str,
    table_version: str,
    *args,
    **kwargs,
) -> ListResult[Stream]:
    """
    Lists a page of streams of the given table version. Per the original
    contract, an error is raised if the table version does not exist.
    """
    # The stream ID is a placeholder and no stream format is set on the
    # locator used for the sibling read.
    probe = Stream.of(
        locator=StreamLocator.at(
            namespace=namespace,
            table_name=table_name,
            table_version=table_version,
            stream_id="placeholder",
            stream_format=None,
        ),
        partition_scheme=None,
    )
    # The metafile and operation type are passed positionally here, so any
    # extra positional args land in `_list`'s `*args`.
    return _list(
        probe,
        TransactionOperationType.READ_SIBLINGS,
        *args,
        **kwargs,
    )
|
318
|
+
|
319
|
+
|
320
|
+
def list_partitions(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> ListResult[Partition]:
    """
    Lists a page of partitions of the given table version's `deltacat`
    stream, returned as list result items.

    NOTE(review): the original contract says that an unspecified table
    version resolves to the latest active table version, and that an error
    is raised if the table version does not exist. This function passes
    `table_version` to the locator unchanged, so any such resolution must
    happen downstream. Confirm.
    """
    # Partition values and partition ID are placeholders for the sibling
    # read.
    probe = Partition.of(
        locator=PartitionLocator.at(
            namespace=namespace,
            table_name=table_name,
            table_version=table_version,
            stream_id=None,
            stream_format=StreamFormat.DELTACAT,
            partition_values=["placeholder"],
            partition_id="placeholder",
        ),
        schema=None,
        content_types=None,
    )
    return _list(
        *args,
        metafile=probe,
        txn_op_type=TransactionOperationType.READ_SIBLINGS,
        **kwargs,
    )
|
353
|
+
|
354
|
+
|
355
|
+
def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partition]:
    """
    Lists all partitions committed to the given stream.

    Raises a ValueError if the stream's format is not
    `StreamFormat.DELTACAT`.
    """
    if stream.stream_format != StreamFormat.DELTACAT:
        # The two message parts were previously concatenated with no
        # separator, running the actual and expected formats together.
        raise ValueError(
            f"Unsupported stream format: {stream.stream_format}. "
            f"Expected stream format: {StreamFormat.DELTACAT}"
        )
    # Partition values and partition ID are placeholders for the sibling
    # read.
    locator = PartitionLocator.of(
        stream_locator=stream.locator,
        partition_values=["placeholder"],
        partition_id="placeholder",
    )
    partition = Partition.of(
        locator=locator,
        schema=None,
        content_types=None,
    )
    return _list(
        *args,
        metafile=partition,
        txn_op_type=TransactionOperationType.READ_SIBLINGS,
        **kwargs,
    )
|
380
|
+
|
381
|
+
|
382
|
+
def list_deltas(
    namespace: str,
    table_name: str,
    partition_values: Optional[PartitionValues] = None,
    table_version: Optional[str] = None,
    first_stream_position: Optional[int] = None,
    last_stream_position: Optional[int] = None,
    ascending_order: Optional[bool] = None,
    include_manifest: bool = False,
    partition_scheme_id: Optional[str] = None,
    *args,
    **kwargs,
) -> ListResult[Delta]:
    """
    Lists a page of deltas for the given table version and committed partition.
    Deltas returned can optionally be limited to inclusive first and last
    stream positions; a bound left as None is treated as unbounded. Deltas
    are returned by descending stream position by default. Table version
    resolves to the latest active table version if not specified. Partition
    values should not be specified for unpartitioned tables. Partition scheme
    ID resolves to the table version's current partition scheme by default.
    Raises an error if the given table version or partition does not exist.

    To conserve memory, the deltas returned do not include manifests by
    default. The manifests can either be optionally retrieved as part of this
    call or lazily loaded via subsequent calls to `get_delta_manifest`.

    NOTE(review): `include_manifest` is not read by this function, and the
    value returned is a plain list of deltas rather than a `ListResult`
    despite the annotation. Confirm what callers expect before changing it.
    """
    # TODO(pdames): Delta listing should ideally either use an efficient
    #  range-limited dir listing of partition children between start and end
    #  positions, or should traverse using Partition.stream_position (to
    #  resolve last stream position) and Delta.previous_stream_position
    #  (down to first stream position).
    partition_locator_alias = _resolve_partition_locator_alias(
        *args,
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        partition_values=partition_values,
        partition_scheme_id=partition_scheme_id,
        **kwargs,
    )
    # NOTE(review): `list_partition_deltas` calls `DeltaLocator.of` with
    # `partition_locator=` whereas this call uses `locator=`; verify the
    # keyword against `DeltaLocator.of`.
    locator = DeltaLocator.of(locator=partition_locator_alias)
    delta = Delta.of(
        locator=locator,
        delta_type=None,
        meta=None,
        properties=None,
        manifest=None,
    )
    all_deltas_list_result: ListResult[Delta] = _list(
        *args,
        metafile=delta,
        txn_op_type=TransactionOperationType.READ_SIBLINGS,
        **kwargs,
    )
    all_deltas = all_deltas_list_result.all_items()
    # Both bounds default to None; comparing None with an int raises
    # TypeError, so an absent bound must be skipped rather than compared.
    filtered_deltas = [
        candidate
        for candidate in all_deltas
        if (
            first_stream_position is None
            or first_stream_position <= candidate.stream_position
        )
        and (
            last_stream_position is None
            or candidate.stream_position <= last_stream_position
        )
    ]
    if ascending_order:
        filtered_deltas.reverse()
    return filtered_deltas
|
446
|
+
|
447
|
+
|
448
|
+
def list_partition_deltas(
    partition_like: Union[Partition, PartitionLocator],
    first_stream_position: Optional[int] = None,
    last_stream_position: Optional[int] = None,
    ascending_order: bool = False,
    include_manifest: bool = False,
    *args,
    **kwargs,
) -> ListResult[Delta]:
    """
    Lists a page of deltas committed to the given partition. Deltas returned
    can optionally be limited to inclusive first and last stream positions;
    a bound left as None is treated as unbounded. The listing order is
    reversed when `ascending_order` is truthy.

    To conserve memory, the deltas returned do not include manifests by
    default. The manifests can either be optionally retrieved as part of this
    call or lazily loaded via subsequent calls to `get_delta_manifest`.

    NOTE(review): `include_manifest` is not read by this function, and the
    value returned is a plain list of deltas rather than a `ListResult`
    despite the annotation. Confirm what callers expect before changing it.
    """
    # TODO(pdames): Delta listing should ideally either use an efficient
    #  range-limited dir listing of partition children between start and end
    #  positions, or should traverse using Partition.stream_position (to
    #  resolve last stream position) and Delta.previous_stream_position
    #  (down to first stream position).
    locator = DeltaLocator.of(
        partition_locator=partition_like
        if isinstance(partition_like, PartitionLocator)
        else partition_like.locator,
        stream_position=None,
    )
    delta = Delta.of(
        locator=locator,
        delta_type=None,
        meta=None,
        properties=None,
        manifest=None,
    )
    all_deltas_list_result: ListResult[Delta] = _list(
        *args,
        metafile=delta,
        txn_op_type=TransactionOperationType.READ_SIBLINGS,
        **kwargs,
    )
    all_deltas = all_deltas_list_result.all_items()
    # Both bounds default to None; comparing None with an int raises
    # TypeError, so an absent bound must be skipped rather than compared.
    filtered_deltas = [
        candidate
        for candidate in all_deltas
        if (
            first_stream_position is None
            or first_stream_position <= candidate.stream_position
        )
        and (
            last_stream_position is None
            or candidate.stream_position <= last_stream_position
        )
    ]
    if ascending_order:
        filtered_deltas.reverse()
    return filtered_deltas
|
497
|
+
|
498
|
+
|
499
|
+
def get_delta(
    namespace: str,
    table_name: str,
    stream_position: int,
    partition_values: Optional[PartitionValues] = None,
    table_version: Optional[str] = None,
    include_manifest: bool = False,
    partition_scheme_id: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Delta]:
    """
    Gets the delta at the given stream position of the given table version
    and partition, or None if the lookup produces no result.

    Table version resolves to the latest active table version if not
    specified. Partition values should not be specified for unpartitioned
    tables. Partition scheme ID resolves to the table version's current
    partition scheme by default. Raises an error if the given table version
    or partition does not exist.

    To conserve memory, the delta returned does not include a manifest by
    default. The manifest can either be optionally retrieved as part of this
    call or lazily loaded via a subsequent call to `get_delta_manifest`.
    """
    # TODO(pdames): Honor `include_manifest` param.
    alias = _resolve_partition_locator_alias(
        *args,
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        partition_values=partition_values,
        partition_scheme_id=partition_scheme_id,
        **kwargs,
    )
    # Only the locator is populated; every other delta field is left unset.
    probe = Delta.of(
        locator=DeltaLocator.of(
            locator=alias,
            stream_position=stream_position,
        ),
        delta_type=None,
        meta=None,
        properties=None,
        manifest=None,
    )
    return _latest(
        *args,
        metafile=probe,
        **kwargs,
    )
|
548
|
+
|
549
|
+
|
550
|
+
def get_latest_delta(
    namespace: str,
    table_name: str,
    partition_values: Optional[PartitionValues] = None,
    table_version: Optional[str] = None,
    include_manifest: bool = False,
    partition_scheme_id: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Delta]:
    """
    Gets the latest delta (i.e. the delta with the greatest stream position) for
    the given table version and partition. Table version resolves to the latest
    active table version if not specified. Partition values should not be
    specified for unpartitioned tables. Partition scheme ID resolves to the
    table version's current partition scheme by default.

    The stream and partition are resolved first; the partition's recorded
    stream position is then used to build the locator of the delta to look up
    via `_latest`.

    Raises a ValueError if no stream can be found for the given table version
    or no partition can be found for the given partition values.

    To conserve memory, the delta returned is not meant to include a manifest
    by default; it can be lazily loaded via a subsequent call to
    `get_delta_manifest`. `include_manifest` is accepted but not consulted by
    this function.
    """
    # TODO(pdames): Wrap this method in 1 single txn.
    # Forward **kwargs so the stream/partition lookups run against the same
    # catalog configuration as the final `_latest` call.
    stream = get_stream(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        **kwargs,
    )
    if not stream:
        raise ValueError(
            f"Stream not found for table `{namespace}.{table_name}` "
            f"at table version `{table_version}`."
        )
    partition = get_partition(
        stream_locator=stream.locator,
        partition_values=partition_values,
        partition_scheme_id=partition_scheme_id,
        **kwargs,
    )
    if not partition:
        raise ValueError(
            f"Partition not found for table `{namespace}.{table_name}` "
            f"with partition values `{partition_values}`."
        )
    locator = DeltaLocator.of(
        locator=partition.locator,
        stream_position=partition.stream_position,
    )
    # Only the locator is populated: this delta acts as a lookup key.
    delta = Delta.of(
        locator=locator,
        delta_type=None,
        meta=None,
        properties=None,
        manifest=None,
    )
    return _latest(
        *args,
        metafile=delta,
        **kwargs,
    )
|
599
|
+
|
600
|
+
|
601
|
+
def download_delta(
    delta_like: Union[Delta, DeltaLocator],
    table_type: TableType = TableType.PYARROW,
    storage_type: StorageType = StorageType.DISTRIBUTED,
    max_parallelism: Optional[int] = None,
    columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
    distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
    *args,
    **kwargs,
) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
    """
    Intended contract: download the given delta or delta locator either into
    a list of tables held in the local node's memory, or into a dataset
    distributed across this Ray cluster's object store memory, such that
    ordered table N (or ordered block N) always holds the contents of ordered
    delta manifest entry N.

    This implementation is a placeholder: it ignores every argument and always
    raises NotImplementedError.
    """
    not_implemented_message = "download_delta not implemented"
    raise NotImplementedError(not_implemented_message)
|
621
|
+
|
622
|
+
|
623
|
+
def download_delta_manifest_entry(
    delta_like: Union[Delta, DeltaLocator],
    entry_index: int,
    table_type: TableType = TableType.PYARROW,
    columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    *args,
    **kwargs,
) -> LocalTable:
    """
    Intended contract: download a single manifest entry of the given delta or
    delta locator into the requested table type, within the current node's
    memory. When a delta with a non-empty manifest is supplied, the entry
    comes from that manifest; otherwise the manifest is retrieved first and
    the entry at `entry_index` is then downloaded.

    This implementation is a placeholder: it ignores every argument and always
    raises NotImplementedError.
    """
    not_implemented_message = "download_delta_manifest_entry not implemented"
    raise NotImplementedError(not_implemented_message)
|
641
|
+
|
642
|
+
|
643
|
+
def get_delta_manifest(
    delta_like: Union[Delta, DeltaLocator],
    *args,
    **kwargs,
) -> Manifest:
    """
    Returns the manifest of the delta identified by `delta_like`.

    Only the locator of the input is used: a fresh, locator-only delta is
    looked up through `_latest`, so any manifest attached to an input delta is
    never returned directly.

    Raises a ValueError if `delta_like` is neither a delta nor a delta
    locator, if the lookup yields no delta, or if the delta found has no
    manifest.
    """
    if not isinstance(delta_like, (Delta, DeltaLocator)):
        raise ValueError(
            f"Expected delta or delta locator, but got: {type(delta_like)}"
        )
    # A delta contributes its locator; a locator is used as given.
    delta_locator = (
        delta_like.locator if isinstance(delta_like, Delta) else delta_like
    )
    lookup_delta = Delta.of(
        locator=delta_locator,
        delta_type=None,
        meta=None,
        properties=None,
        manifest=None,
    )
    found_delta = _latest(*args, metafile=lookup_delta, **kwargs)
    if found_delta and found_delta.manifest:
        return found_delta.manifest
    raise ValueError(f"No manifest found for delta: {delta_locator}")
|
677
|
+
|
678
|
+
|
679
|
+
def create_namespace(
    namespace: str,
    properties: Optional[NamespaceProperties] = None,
    *args,
    **kwargs,
) -> Namespace:
    """
    Creates a table namespace named `namespace` with the given properties and
    returns it.

    The namespace is written by committing an APPEND transaction holding a
    single CREATE operation against the catalog root directory and filesystem
    obtained from `get_catalog_properties(**kwargs)`.
    """
    created_namespace = Namespace.of(
        locator=NamespaceLocator.of(namespace=namespace),
        properties=properties,
    )
    create_operation = TransactionOperation.of(
        operation_type=TransactionOperationType.CREATE,
        dest_metafile=created_namespace,
    )
    transaction = Transaction.of(
        txn_type=TransactionType.APPEND,
        txn_operations=[create_operation],
    )
    catalog_properties = get_catalog_properties(**kwargs)
    transaction.commit(
        catalog_root_dir=catalog_properties.root,
        filesystem=catalog_properties.filesystem,
    )
    return created_namespace
|
708
|
+
|
709
|
+
|
710
|
+
def update_namespace(
    namespace: str,
    properties: Optional[NamespaceProperties] = None,
    new_namespace: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """
    Updates a table namespace's name and/or properties.

    The namespace is renamed to `new_namespace` when one is given, and keeps
    its current name otherwise. Its properties are replaced by `properties`
    when given, and are left unchanged when `properties` is None.

    The change is committed as an ALTER transaction holding a single UPDATE
    operation. Raises a ValueError if the given namespace does not exist.
    """
    # TODO(pdames): Wrap get & update within a single txn.
    old_namespace = get_namespace(
        *args,
        namespace=namespace,
        **kwargs,
    )
    if not old_namespace:
        raise ValueError(f"Namespace `{namespace}` does not exist.")
    # Use a distinct local name so the `new_namespace` argument (the requested
    # new name) is not clobbered by the metafile being built.
    updated_namespace: Namespace = Metafile.update_for(old_namespace)
    updated_namespace.namespace = new_namespace or namespace
    updated_namespace.properties = (
        properties if properties is not None else old_namespace.properties
    )
    transaction = Transaction.of(
        txn_type=TransactionType.ALTER,
        txn_operations=[
            TransactionOperation.of(
                operation_type=TransactionOperationType.UPDATE,
                dest_metafile=updated_namespace,
                src_metafile=old_namespace,
            )
        ],
    )
    catalog_properties = get_catalog_properties(**kwargs)
    transaction.commit(
        catalog_root_dir=catalog_properties.root,
        filesystem=catalog_properties.filesystem,
    )
|
746
|
+
|
747
|
+
|
748
|
+
def create_table_version(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    schema: Optional[Schema] = None,
    partition_scheme: Optional[PartitionScheme] = None,
    sort_keys: Optional[SortScheme] = None,
    table_version_description: Optional[str] = None,
    table_version_properties: Optional[TableVersionProperties] = None,
    table_description: Optional[str] = None,
    table_properties: Optional[TableProperties] = None,
    supported_content_types: Optional[List[ContentType]] = None,
    *args,
    **kwargs,
) -> Tuple[Table, TableVersion, Stream]:
    """
    Create a table version with a `CREATED` lifecycle state and a committed,
    empty DeltaCAT stream. Table versions may be schemaless and unpartitioned
    to improve write performance, or have their writes governed by a schema
    and partition scheme to improve data consistency and read performance.

    If the parent table does not exist yet, it is created in the same
    transaction, and the table version defaults to `DEFAULT_TABLE_VERSION`.
    Its description defaults to the table version description.

    If the parent table already exists, it is updated in the same transaction.
    The table version defaults to the next version after the table's latest
    table version; an explicitly given table version must carry that same
    version number. The existing table description and properties are kept
    unless `table_description` / `table_properties` are given.

    Returns a tuple containing the created/updated table, table version, and
    stream (respectively).

    Raises a ValueError if the given namespace does not exist, or if an
    explicit table version does not match the expected next version number.
    """
    if not namespace_exists(
        *args,
        namespace=namespace,
        **kwargs,
    ):
        raise ValueError(f"Namespace {namespace} does not exist")
    # check if a parent table and/or previous table version already exist
    prev_table_version = None
    prev_table = get_table(
        *args,
        namespace=namespace,
        table_name=table_name,
        **kwargs,
    )
    if not prev_table:
        # no parent table exists, so we'll create it in this transaction
        txn_type = TransactionType.APPEND
        table_txn_op_type = TransactionOperationType.CREATE
        prev_table = None
        new_table = Table.of(
            locator=TableLocator.at(namespace=namespace, table_name=table_name),
        )
        table_version = table_version or DEFAULT_TABLE_VERSION
        new_table.description = table_description or table_version_description
        new_table.properties = table_properties
    else:
        # the parent table exists, so we'll update it in this transaction
        txn_type = TransactionType.ALTER
        table_txn_op_type = TransactionOperationType.UPDATE
        new_table: Table = Metafile.update_for(prev_table)
        prev_table_version = prev_table.latest_table_version
        if not table_version:
            # generate the next table version ID
            table_version = TableVersion.next_version(prev_table_version)
        else:
            # ensure that the given table version number matches expectations
            expected_table_version = TableVersion.next_version(prev_table_version)
            _, version_number = TableVersion.parse_table_version(
                table_version,
            )
            _, expected_version_number = TableVersion.parse_table_version(
                expected_table_version,
            )
            if version_number != expected_version_number:
                raise ValueError(
                    f"Expected to create table version "
                    f"{expected_version_number} but found {version_number}.",
                )
        # Don't wipe table-level metadata just because the caller only
        # supplied table-version-level arguments for the new version.
        new_table.description = (
            table_description
            or prev_table.description
            or table_version_description
        )
        new_table.properties = (
            table_properties
            if table_properties is not None
            else prev_table.properties
        )
    new_table.latest_table_version = table_version
    catalog_properties = get_catalog_properties(**kwargs)
    locator = TableVersionLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
    )
    # NOTE: `table_version` is rebound from the version ID string to the
    # table version metafile from here on.
    table_version = TableVersion.of(
        locator=locator,
        schema=schema,
        partition_scheme=partition_scheme,
        description=table_version_description,
        properties=table_version_properties,
        content_types=supported_content_types,
        sort_scheme=sort_keys,
        watermark=None,
        lifecycle_state=LifecycleState.CREATED,
        schemas=[schema] if schema else None,
        partition_schemes=[partition_scheme] if partition_scheme else None,
        sort_schemes=[sort_keys] if sort_keys else None,
        previous_table_version=prev_table_version,
    )
    # create the table version's default deltacat stream in this transaction
    stream_locator = StreamLocator.of(
        table_version_locator=locator,
        stream_id=str(uuid.uuid4()),
        stream_format=StreamFormat.DELTACAT,
    )
    stream = Stream.of(
        locator=stream_locator,
        partition_scheme=partition_scheme,
        state=CommitState.COMMITTED,
        previous_stream_id=None,
        watermark=None,
    )
    transaction = Transaction.of(
        txn_type=txn_type,
        txn_operations=[
            TransactionOperation.of(
                operation_type=table_txn_op_type,
                dest_metafile=new_table,
                src_metafile=prev_table,
            ),
            TransactionOperation.of(
                operation_type=TransactionOperationType.CREATE,
                dest_metafile=table_version,
            ),
            TransactionOperation.of(
                operation_type=TransactionOperationType.CREATE,
                dest_metafile=stream,
            ),
        ],
    )
    transaction.commit(
        catalog_root_dir=catalog_properties.root,
        filesystem=catalog_properties.filesystem,
    )
    return new_table, table_version, stream
|
880
|
+
|
881
|
+
|
882
|
+
def update_table(
    namespace: str,
    table_name: str,
    description: Optional[str] = None,
    properties: Optional[TableProperties] = None,
    new_table_name: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """
    Updates the description, properties, and/or name of an existing table.

    Each of `description`, `properties`, and `new_table_name` replaces the
    table's current value only when it is truthy; otherwise the current value
    is carried over. The change is committed as an ALTER transaction holding a
    single UPDATE operation.

    Raises a TableNotFoundError if the given table does not exist.
    """
    current_table = get_table(
        *args,
        namespace=namespace,
        table_name=table_name,
        **kwargs,
    )
    if not current_table:
        raise TableNotFoundError(f"Table `{namespace}.{table_name}` does not exist.")
    revised_table: Table = Metafile.update_for(current_table)
    revised_table.description = description or current_table.description
    revised_table.properties = properties or current_table.properties
    revised_table.table_name = new_table_name or current_table.table_name
    update_operation = TransactionOperation.of(
        operation_type=TransactionOperationType.UPDATE,
        dest_metafile=revised_table,
        src_metafile=current_table,
    )
    transaction = Transaction.of(
        txn_type=TransactionType.ALTER,
        txn_operations=[update_operation],
    )
    catalog_properties = get_catalog_properties(**kwargs)
    transaction.commit(
        catalog_root_dir=catalog_properties.root,
        filesystem=catalog_properties.filesystem,
    )
|
924
|
+
|
925
|
+
|
926
|
+
def update_table_version(
    namespace: str,
    table_name: str,
    table_version: str,
    lifecycle_state: Optional[LifecycleState] = None,
    schema: Optional[Schema] = None,
    description: Optional[str] = None,
    properties: Optional[TableVersionProperties] = None,
    partition_scheme: Optional[PartitionScheme] = None,
    sort_keys: Optional[SortScheme] = None,
    *args,
    **kwargs,
) -> None:
    """
    Update a table version. Notably, updating an unreleased table version's
    lifecycle state to 'active' telegraphs that it is ready for external
    consumption, and causes all calls made to consume/produce streams,
    partitions, or deltas from/to its parent table to automatically resolve to
    this table version by default (i.e., when the client does not explicitly
    specify a different table version).

    Arguments left as None keep the table version's current value. A schema,
    partition scheme, or sort scheme that is not equivalent to the current one
    becomes the current one and is appended to the table version's history of
    schemas / partition schemes / sort schemes. When the table version becomes
    active and its version number is greater than the parent table's latest
    active table version (or the table has none), the parent table is updated
    in the same transaction. When a partition scheme is given, the table
    version's stream is updated with it in the same transaction.

    Raises a ValueError if the given table version does not exist, or if the
    ID of a new schema, partition scheme, or sort scheme is already present in
    the table version's history.
    """
    # TODO(pdames): Wrap get & update within a single txn.
    old_table_version = get_table_version(
        *args,
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        **kwargs,
    )
    if not old_table_version:
        raise ValueError(
            f"Table version `{table_version}` does not exist for "
            f"table `{namespace}.{table_name}`."
        )
    new_table_version: TableVersion = Metafile.update_for(old_table_version)
    new_table_version.state = lifecycle_state or old_table_version.state
    # The scheme histories are None for table versions created without a
    # schema / partition scheme / sort scheme, so normalize them to lists
    # before iterating or appending.
    old_schemas = old_table_version.schemas or []
    old_partition_schemes = old_table_version.partition_schemes or []
    old_sort_schemes = old_table_version.sort_schemes or []
    # TODO(pdames): Use schema patch to check for backwards incompatible changes.
    #  By default, backwards incompatible changes should be pushed to a new
    #  table version unless the user explicitly forces the update to this
    #  table version (i.e., at the cost of potentially breaking consumers).
    update_schema = schema and not schema.equivalent_to(
        old_table_version.schema,
        True,
    )
    if update_schema and schema.id in [s.id for s in old_schemas]:
        raise ValueError(
            f"Schema ID `{schema.id}` already exists in "
            f"table version `{table_version}`."
        )
    new_table_version.schema = schema if update_schema else old_table_version.schema
    new_table_version.schemas = (
        old_schemas + [schema] if update_schema else old_table_version.schemas
    )
    new_table_version.description = (
        description if description is not None else old_table_version.description
    )
    new_table_version.properties = (
        properties if properties is not None else old_table_version.properties
    )
    new_table_version.partition_scheme = (
        partition_scheme or old_table_version.partition_scheme
    )
    # TODO(pdames): Check for backwards incompatible partition scheme changes.
    update_partition_scheme = partition_scheme and not partition_scheme.equivalent_to(
        old_table_version.partition_scheme,
        True,
    )
    if update_partition_scheme and partition_scheme.id in [
        ps.id for ps in old_partition_schemes
    ]:
        raise ValueError(
            f"Partition scheme ID `{partition_scheme.id}` already exists in "
            f"table version `{table_version}`."
        )
    new_table_version.partition_schemes = (
        old_partition_schemes + [partition_scheme]
        if update_partition_scheme
        else old_table_version.partition_schemes
    )
    # TODO(pdames): Check for backwards incompatible sort scheme changes.
    update_sort_scheme = sort_keys and not sort_keys.equivalent_to(
        old_table_version.sort_scheme,
        True,
    )
    if update_sort_scheme and sort_keys.id in [sk.id for sk in old_sort_schemes]:
        raise ValueError(
            f"Sort scheme ID `{sort_keys.id}` already exists in "
            f"table version `{table_version}`."
        )
    new_table_version.sort_scheme = sort_keys or old_table_version.sort_scheme
    new_table_version.sort_schemes = (
        old_sort_schemes + [sort_keys]
        if update_sort_scheme
        else old_table_version.sort_schemes
    )
    old_table = get_table(
        *args,
        namespace=namespace,
        table_name=table_name,
        **kwargs,
    )
    txn_operations = []
    if (
        lifecycle_state == LifecycleState.ACTIVE
        and old_table_version.state != LifecycleState.ACTIVE
    ):
        _, old_version_number = (
            TableVersion.parse_table_version(
                old_table.latest_active_table_version,
            )
            if old_table.latest_active_table_version
            else (None, None)
        )
        _, new_version_number = TableVersion.parse_table_version(table_version)
        if old_version_number is None or old_version_number < new_version_number:
            # update the table's latest active table version
            new_table: Table = Metafile.update_for(old_table)
            new_table.latest_active_table_version = table_version
            txn_operations.append(
                TransactionOperation.of(
                    operation_type=TransactionOperationType.UPDATE,
                    dest_metafile=new_table,
                    src_metafile=old_table,
                )
            )
    txn_operations.append(
        TransactionOperation.of(
            operation_type=TransactionOperationType.UPDATE,
            dest_metafile=new_table_version,
            src_metafile=old_table_version,
        ),
    )
    # TODO(pdames): Push changes down to non-deltacat streams via sync module.
    #   Also copy sort scheme changes down to deltacat child stream?
    if partition_scheme:
        old_stream = get_stream(
            *args,
            namespace=namespace,
            table_name=table_name,
            table_version=table_version,
            **kwargs,
        )
        new_stream: Stream = Metafile.update_for(old_stream)
        new_stream.partition_scheme = partition_scheme
        txn_operations.append(
            TransactionOperation.of(
                operation_type=TransactionOperationType.UPDATE,
                dest_metafile=new_stream,
                src_metafile=old_stream,
            )
        )
    transaction = Transaction.of(
        txn_type=TransactionType.ALTER,
        txn_operations=txn_operations,
    )
    catalog_properties = get_catalog_properties(**kwargs)
    transaction.commit(
        catalog_root_dir=catalog_properties.root,
        filesystem=catalog_properties.filesystem,
    )
|
1091
|
+
|
1092
|
+
|
1093
|
+
def stage_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    stream_format: StreamFormat = StreamFormat.DELTACAT,
    *args,
    **kwargs,
) -> Stream:
    """
    Stages a new delta stream for the given table version. Resolves to the
    latest active table version if no table version is given. Resolves to the
    DeltaCAT stream format if no stream format is given.

    The staged stream gets a freshly generated stream ID and the table
    version's partition scheme. If `get_stream` finds an existing stream for
    the same table version and stream format, the staged stream's previous
    stream ID is set to that stream's ID.

    Returns the staged stream. Raises a ValueError if the table version does
    not exist.
    """
    # TODO(pdames): Support retrieving previously staged streams by ID.
    if not table_version:
        table_version = _resolve_latest_active_table_version_id(
            *args,
            namespace=namespace,
            table_name=table_name,
            **kwargs,
        )
    table_version_meta = get_table_version(
        *args,
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        **kwargs,
    )
    if not table_version_meta:
        # Fail with an explicit error instead of an AttributeError on None
        # when the partition scheme is read below.
        raise ValueError(
            f"Table version `{table_version}` does not exist for "
            f"table `{namespace}.{table_name}`."
        )
    locator = StreamLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        stream_id=str(uuid.uuid4()),
        stream_format=stream_format or StreamFormat.DELTACAT,
    )
    stream = Stream.of(
        locator=locator,
        partition_scheme=table_version_meta.partition_scheme,
        state=CommitState.STAGED,
        previous_stream_id=None,
        watermark=None,
    )
    prev_stream = get_stream(
        *args,
        namespace=stream.namespace,
        table_name=stream.table_name,
        table_version=stream.table_version,
        stream_format=stream.stream_format,
        **kwargs,
    )
    if prev_stream:
        if prev_stream.stream_id == stream.stream_id:
            raise ValueError(
                f"Stream to stage has the same ID as existing stream: {prev_stream}."
            )
        stream.previous_stream_id = prev_stream.stream_id
    transaction = Transaction.of(
        txn_type=TransactionType.APPEND,
        txn_operations=[
            TransactionOperation.of(
                operation_type=TransactionOperationType.CREATE,
                dest_metafile=stream,
            )
        ],
    )
    catalog_properties = get_catalog_properties(**kwargs)
    transaction.commit(
        catalog_root_dir=catalog_properties.root,
        filesystem=catalog_properties.filesystem,
    )
    return stream
|
1168
|
+
|
1169
|
+
|
1170
|
+
def commit_stream(
    stream: Stream,
    *args,
    **kwargs,
) -> Stream:
    """
    Registers a staged delta stream with a target table version, replacing any
    previous stream registered for the same table version. Returns the
    committed stream.

    The stored copy of the staged stream (looked up by table version locator
    and stream ID) is what gets committed, not the `stream` argument itself.
    If `get_stream` finds a stream already registered for the same table
    version and stream format, the transaction becomes an OVERWRITE and a
    second UPDATE operation replaces that stream with the newly committed one.

    Raises a ValueError if:
      - the stream has no stream ID or no table version locator,
      - no stream with that ID exists at the table version,
      - the stored stream is not in the `STAGED` state,
      - the already registered stream's ID differs from the staged stream's
        previous stream ID, or equals the staged stream's own ID.
    """
    if not stream.stream_id:
        raise ValueError("Stream ID to commit must be set to a staged stream ID.")
    if not stream.table_version_locator:
        raise ValueError(
            "Stream to commit must have its table version locator "
            "set to the parent of its staged stream ID."
        )
    prev_staged_stream = get_stream_by_id(
        *args,
        table_version_locator=stream.table_version_locator,
        stream_id=stream.stream_id,
        **kwargs,
    )
    if not prev_staged_stream:
        raise ValueError(
            f"Stream at table version {stream.table_version_locator} with ID "
            f"{stream.stream_id} not found."
        )
    if prev_staged_stream.state != CommitState.STAGED:
        raise ValueError(
            f"Expected to find a `{CommitState.STAGED}` stream at table version "
            f"{stream.table_version_locator} with ID {stream.stream_id}, "
            f"but found a `{prev_staged_stream.state}` stream."
        )
    # From here on `stream` is the updated copy of the stored staged stream.
    stream: Stream = Metafile.update_for(prev_staged_stream)
    stream.state = CommitState.COMMITTED
    prev_committed_stream = get_stream(
        *args,
        namespace=stream.namespace,
        table_name=stream.table_name,
        table_version=stream.table_version,
        stream_format=stream.stream_format,
        **kwargs,
    )
    # the first transaction operation updates the staged stream commit state
    txn_type = TransactionType.ALTER
    txn_ops = [
        TransactionOperation.of(
            operation_type=TransactionOperationType.UPDATE,
            dest_metafile=stream,
            src_metafile=prev_staged_stream,
        )
    ]
    if prev_committed_stream:
        if prev_committed_stream.stream_id != stream.previous_stream_id:
            raise ValueError(
                f"Previous stream ID mismatch. Expected "
                f"{stream.previous_stream_id} but found "
                f"{prev_committed_stream.stream_id}."
            )
        if prev_committed_stream.stream_id == stream.stream_id:
            raise ValueError(
                f"Stream to commit has the same ID as existing stream: {prev_committed_stream}."
            )
        # there's a previously committed stream, so update the transaction
        # type to overwrite the previously committed stream, and add another
        # transaction operation to replace it with the staged stream
        txn_type = TransactionType.OVERWRITE
        txn_ops.append(
            TransactionOperation.of(
                operation_type=TransactionOperationType.UPDATE,
                dest_metafile=stream,
                src_metafile=prev_committed_stream,
            )
        )
    transaction = Transaction.of(
        txn_type=txn_type,
        txn_operations=txn_ops,
    )
    catalog_properties = get_catalog_properties(**kwargs)
    transaction.commit(
        catalog_root_dir=catalog_properties.root,
        filesystem=catalog_properties.filesystem,
    )
    return stream
|
1266
|
+
|
1267
|
+
|
1268
|
+
def delete_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    stream_format: StreamFormat = StreamFormat.DELTACAT,
    *args,
    **kwargs,
) -> None:
    """
    Deletes the stream that `get_stream` returns for the given table version
    and stream format. Resolves to the latest active table version if no table
    version is given.

    The stream is marked `DEPRECATED` and then removed by committing a DELETE
    transaction holding a single DELETE operation.

    Raises a ValueError if the stream does not exist.
    """
    resolved_table_version = table_version
    if not resolved_table_version:
        resolved_table_version = _resolve_latest_active_table_version_id(
            *args,
            namespace=namespace,
            table_name=table_name,
            **kwargs,
        )
    doomed_stream = get_stream(
        *args,
        namespace=namespace,
        table_name=table_name,
        table_version=resolved_table_version,
        stream_format=stream_format,
        **kwargs,
    )
    if not doomed_stream:
        raise ValueError(
            f"Stream to delete not found: {namespace}.{table_name}"
            f".{resolved_table_version}.{stream_format}."
        )
    doomed_stream.state = CommitState.DEPRECATED
    delete_operation = TransactionOperation.of(
        operation_type=TransactionOperationType.DELETE,
        dest_metafile=doomed_stream,
    )
    transaction = Transaction.of(
        txn_type=TransactionType.DELETE,
        txn_operations=[delete_operation],
    )
    catalog_properties = get_catalog_properties(**kwargs)
    transaction.commit(
        catalog_root_dir=catalog_properties.root,
        filesystem=catalog_properties.filesystem,
    )
|
1318
|
+
|
1319
|
+
|
1320
|
+
def delete_table(
    namespace: str,
    name: str,
    purge: bool = False,
    *args,
    **kwargs,
) -> None:
    """
    Drops the table `name` in `namespace` by committing a DELETE transaction
    holding a single DELETE operation for the table metafile. The contract is
    that the table is dropped together with all of its contents (table
    versions, streams, partitions, and deltas).

    `purge` is meant to additionally remove all data files associated with the
    table, but it is currently ignored.

    Raises a TableNotFoundError if the given table does not exist.

    TODO: Honor purge once garbage collection is implemented.
    """
    doomed_table: Optional[Table] = get_table(
        *args,
        namespace=namespace,
        table_name=name,
        **kwargs,
    )
    if not doomed_table:
        raise TableNotFoundError(f"Table `{namespace}.{name}` does not exist.")

    delete_operation = TransactionOperation.of(
        operation_type=TransactionOperationType.DELETE,
        dest_metafile=doomed_table,
    )
    transaction = Transaction.of(
        txn_type=TransactionType.DELETE,
        txn_operations=TransactionOperationList.of([delete_operation]),
    )

    catalog_properties = get_catalog_properties(**kwargs)
    transaction.commit(
        catalog_root_dir=catalog_properties.root,
        filesystem=catalog_properties.filesystem,
    )
|
1361
|
+
|
1362
|
+
|
1363
|
+
def delete_namespace(
    namespace: str,
    purge: bool = False,
    *args,
    **kwargs,
) -> None:
    """
    Drops the given table namespace by committing a DELETE transaction whose
    single operation targets the namespace's metafile.

    The `purge` flag is accepted but is not read by this function.

    Raises a ValueError naming the requested namespace if it does not exist.
    """
    # Keep the lookup result in its own variable: rebinding `namespace` would
    # replace the requested name with None before the error message is built.
    namespace_metafile: Optional[Namespace] = get_namespace(
        *args,
        namespace=namespace,
        **kwargs,
    )

    if not namespace_metafile:
        raise ValueError(f"Namespace `{namespace}` does not exist.")

    transaction = Transaction.of(
        txn_type=TransactionType.DELETE,
        txn_operations=[
            TransactionOperation.of(
                operation_type=TransactionOperationType.DELETE,
                dest_metafile=namespace_metafile,
            )
        ],
    )
    catalog_properties = get_catalog_properties(**kwargs)
    transaction.commit(
        catalog_root_dir=catalog_properties.root,
        filesystem=catalog_properties.filesystem,
    )
|
1396
|
+
|
1397
|
+
|
1398
|
+
def get_stream_by_id(
    table_version_locator: TableVersionLocator,
    stream_id: str,
    *args,
    **kwargs,
) -> Optional[Stream]:
    """
    Gets the stream for the given table version locator and stream ID.

    Builds a stream locator with no stream format and hands a Stream
    metafile built from it to `_latest`, whose result is returned as-is.
    Returns None if the stream does not exist.
    """
    locator = StreamLocator.of(
        table_version_locator=table_version_locator,
        stream_id=stream_id,
        stream_format=None,
    )
    return _latest(
        *args,
        metafile=Stream.of(locator=locator, partition_scheme=None),
        **kwargs,
    )
|
1419
|
+
|
1420
|
+
|
1421
|
+
def get_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    stream_format: StreamFormat = StreamFormat.DELTACAT,
    *args,
    **kwargs,
) -> Optional[Stream]:
    """
    Looks up the most recently committed stream of a table version.

    When no table version is supplied, the latest active table version ID is
    resolved first (without failing if there is none). The stream format
    defaults to the DeltaCAT format. The result of `_latest` for a COMMITTED
    Stream metafile at that location is returned unchanged.
    """
    if not table_version:
        table_version = _resolve_latest_active_table_version_id(
            *args,
            namespace=namespace,
            table_name=table_name,
            fail_if_no_active_table_version=False,
            **kwargs,
        )
    # no stream ID: the lookup is by table version + format only
    stream_locator = StreamLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        stream_id=None,
        stream_format=stream_format,
    )
    stream_query = Stream.of(
        locator=stream_locator,
        partition_scheme=None,
        state=CommitState.COMMITTED,
    )
    return _latest(*args, metafile=stream_query, **kwargs)
|
1459
|
+
|
1460
|
+
|
1461
|
+
def stream_exists(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    stream_format: StreamFormat = StreamFormat.DELTACAT,
    *args,
    **kwargs,
) -> bool:
    """
    Returns True if the given Stream exists, False if not.

    Resolves to the latest active table version if no table version is given.
    Resolves to the DeltaCAT stream format if no stream format is given.
    The answer is whatever `_exists` reports for a COMMITTED Stream metafile
    at the resolved location.
    """
    if not table_version:
        table_version = _resolve_latest_active_table_version_id(
            *args,
            namespace=namespace,
            table_name=table_name,
            fail_if_no_active_table_version=False,
            **kwargs,
        )
    locator = StreamLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        stream_id=None,
        stream_format=stream_format,
    )
    return _exists(
        *args,
        metafile=Stream.of(
            locator=locator,
            partition_scheme=None,
            state=CommitState.COMMITTED,
        ),
        **kwargs,
    )
|
1499
|
+
|
1500
|
+
|
1501
|
+
def stage_partition(
    stream: Stream,
    partition_values: Optional[PartitionValues] = None,
    partition_scheme_id: Optional[str] = None,
    *args,
    **kwargs,
) -> Partition:
    """
    Stages a new partition for the given stream and partition values. Returns
    the staged partition. If a committed partition with the same partition
    values and scheme already exists, the staged partition's previous
    partition ID is set to that partition's ID. Partition values should not
    be specified for unpartitioned tables.

    The partition_values must represent the results of transforms in a partition
    spec specified in the stream.

    Raises a ValueError if the stream's table version is not found, if
    `partition_scheme_id` or the stream's own partition scheme ID is not one
    of the table version's partition scheme IDs, or if the newly generated
    partition ID collides with the existing partition's ID.
    """
    # TODO(pdames): Cache last retrieved metafile revisions in memory to resolve
    #   potentially high cost of staging many partitions.
    table_version = get_table_version(
        *args,
        namespace=stream.namespace,
        table_name=stream.table_name,
        table_version=stream.table_version,
        **kwargs,
    )
    if not table_version:
        raise ValueError(
            f"Table version not found: {stream.namespace}.{stream.table_name}."
            f"{stream.table_version}."
        )
    # Collect scheme IDs once so both validations compare IDs against IDs
    # (an ID is never a member of a list of partition scheme objects).
    known_scheme_ids = [ps.id for ps in table_version.partition_schemes or []]
    if partition_scheme_id not in known_scheme_ids:
        raise ValueError(
            f"Invalid partition scheme ID `{partition_scheme_id}` (not found "
            f"in parent table version `{stream.namespace}.{stream.table_name}"
            f".{table_version.table_version}` partition scheme IDs)."
        )
    if stream.partition_scheme.id not in known_scheme_ids:
        # this should never happen, but just in case
        raise ValueError(
            f"Invalid stream partition scheme ID `{stream.partition_scheme.id}` "
            f"(not found in parent table version `{stream.namespace}."
            f"{stream.table_name}.{table_version.table_version}` partition "
            f"scheme IDs)."
        )
    locator = PartitionLocator.of(
        stream_locator=stream.locator,
        partition_values=partition_values,
        partition_id=str(uuid.uuid4()),
    )
    partition = Partition.of(
        locator=locator,
        schema=table_version.schema,
        content_types=table_version.content_types,
        state=CommitState.STAGED,
        previous_stream_position=None,
        partition_values=partition_values,
        previous_partition_id=None,
        stream_position=None,
        partition_scheme_id=partition_scheme_id,
    )
    # link the staged partition to the committed partition it would replace
    prev_partition = get_partition(
        *args,
        stream_locator=stream.locator,
        partition_values=partition_values,
        partition_scheme_id=partition_scheme_id,
        **kwargs,
    )
    if prev_partition:
        if prev_partition.partition_id == partition.partition_id:
            raise ValueError(
                f"Partition to stage has the same ID as existing partition: {prev_partition}."
            )
        partition.previous_partition_id = prev_partition.partition_id
    transaction = Transaction.of(
        txn_type=TransactionType.APPEND,
        txn_operations=[
            TransactionOperation.of(
                operation_type=TransactionOperationType.CREATE,
                dest_metafile=partition,
            )
        ],
    )
    catalog_properties = get_catalog_properties(**kwargs)
    transaction.commit(
        catalog_root_dir=catalog_properties.root,
        filesystem=catalog_properties.filesystem,
    )
    return partition
|
1591
|
+
|
1592
|
+
|
1593
|
+
def commit_partition(
    partition: Partition,
    previous_partition: Optional[Partition] = None,
    *args,
    **kwargs,
) -> Partition:
    """
    Commits the staged partition to its associated table version stream,
    replacing any previously committed partition registered for the same
    stream, partition values, and partition scheme ID.

    Passing `previous_partition` is not supported yet and raises a
    NotImplementedError (delta prepending is unimplemented).

    Returns the committed partition. Raises a ValueError if the partition has
    no ID or stream locator, if no partition with that ID is found in the
    stream, if the found partition is not in the STAGED state, if the
    previously committed partition's ID does not match the staged partition's
    previous partition ID, or if it has the same ID as the partition being
    committed.
    """
    if previous_partition:
        raise NotImplementedError(
            f"delta prepending from previous partition {previous_partition} "
            f"is not yet implemented"
        )
    if not partition.partition_id:
        raise ValueError("Partition ID to commit must be set to a staged partition ID.")
    if not partition.stream_locator:
        raise ValueError(
            "Partition to commit must have its stream locator "
            "set to the parent of its staged partition ID."
        )
    prev_staged_partition = get_partition_by_id(
        *args,
        stream_locator=partition.stream_locator,
        partition_id=partition.partition_id,
        **kwargs,
    )
    if not prev_staged_partition:
        raise ValueError(
            f"Partition at stream {partition.stream_locator} with ID "
            f"{partition.partition_id} not found."
        )
    if prev_staged_partition.state != CommitState.STAGED:
        raise ValueError(
            f"Expected to find a `{CommitState.STAGED}` partition at stream "
            f"{partition.stream_locator} with ID {partition.partition_id},"
            f"but found a `{prev_staged_partition.state}` partition."
        )
    # from here on, work with an update built from the stored staged revision
    partition: Partition = Metafile.update_for(prev_staged_partition)
    partition.state = CommitState.COMMITTED
    # The keyword must be `partition_values`: a misspelled keyword would be
    # swallowed by **kwargs and the lookup would run with no partition values.
    prev_committed_partition = get_partition(
        *args,
        stream_locator=partition.stream_locator,
        partition_values=partition.partition_values,
        partition_scheme_id=partition.partition_scheme_id,
        **kwargs,
    )
    # the first transaction operation updates the staged partition commit state
    txn_type = TransactionType.ALTER
    txn_ops = [
        TransactionOperation.of(
            operation_type=TransactionOperationType.UPDATE,
            dest_metafile=partition,
            src_metafile=prev_staged_partition,
        )
    ]
    if prev_committed_partition:
        if prev_committed_partition.partition_id != partition.previous_partition_id:
            raise ValueError(
                f"Previous partition ID mismatch Expected "
                f"{partition.previous_partition_id} but found "
                f"{prev_committed_partition.partition_id}."
            )
        # TODO(pdames): Add previous partition stream position validation.
        if prev_committed_partition.partition_id == partition.partition_id:
            raise ValueError(
                f"Partition to commit has the same ID as existing partition: "
                f"{prev_committed_partition}."
            )
        # there's a previously committed partition, so update the transaction
        # type to overwrite the previously committed partition, and add another
        # transaction operation to replace it with the staged partition
        txn_type = TransactionType.OVERWRITE
        txn_ops.append(
            TransactionOperation.of(
                operation_type=TransactionOperationType.UPDATE,
                dest_metafile=partition,
                src_metafile=prev_committed_partition,
            )
        )
    transaction = Transaction.of(
        txn_type=txn_type,
        txn_operations=txn_ops,
    )
    catalog_properties = get_catalog_properties(**kwargs)
    transaction.commit(
        catalog_root_dir=catalog_properties.root,
        filesystem=catalog_properties.filesystem,
    )
    return partition
|
1697
|
+
|
1698
|
+
|
1699
|
+
def delete_partition(
    stream_locator: StreamLocator,
    partition_values: Optional[PartitionValues] = None,
    partition_scheme_id: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """
    Deletes the given partition from the specified stream. Partition
    values should not be specified for unpartitioned tables.

    The partition is looked up via `get_partition`, marked DEPRECATED, and
    removed with a DELETE transaction.

    Raises a ValueError if the partition does not exist.
    """
    partition_to_delete = get_partition(
        *args,
        stream_locator=stream_locator,
        partition_values=partition_values,
        partition_scheme_id=partition_scheme_id,
        **kwargs,
    )
    if not partition_to_delete:
        raise ValueError(
            f"Partition with values {partition_values} and scheme "
            f"{partition_scheme_id} not found in stream: {stream_locator}"
        )
    partition_to_delete.state = CommitState.DEPRECATED
    transaction = Transaction.of(
        txn_type=TransactionType.DELETE,
        txn_operations=[
            TransactionOperation.of(
                operation_type=TransactionOperationType.DELETE,
                # DELETE operations name their target as the destination
                # metafile, matching the other delete_* functions in this file.
                dest_metafile=partition_to_delete,
            )
        ],
    )
    catalog_properties = get_catalog_properties(**kwargs)
    transaction.commit(
        catalog_root_dir=catalog_properties.root,
        filesystem=catalog_properties.filesystem,
    )
|
1739
|
+
|
1740
|
+
|
1741
|
+
def get_partition_by_id(
    stream_locator: StreamLocator,
    partition_id: str,
    *args,
    **kwargs,
) -> Optional[Partition]:
    """
    Looks up a partition by its ID within the given stream.

    A partition locator carrying only the stream locator and partition ID
    (no partition values) is wrapped in a Partition metafile and passed to
    `_latest`; that call's result is returned unchanged.
    """
    partition_locator = PartitionLocator.of(
        stream_locator=stream_locator,
        partition_values=None,
        partition_id=partition_id,
    )
    partition_query = Partition.of(
        locator=partition_locator,
        schema=None,
        content_types=None,
    )
    return _latest(*args, metafile=partition_query, **kwargs)
|
1766
|
+
|
1767
|
+
|
1768
|
+
def get_partition(
    stream_locator: StreamLocator,
    partition_values: Optional[PartitionValues] = None,
    partition_scheme_id: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Partition]:
    """
    Looks up the most recently committed partition for a stream locator and
    set of partition values. Partition values should not be specified for
    unpartitioned tables.

    When no partition scheme ID is supplied, it is taken from the partition
    scheme of the stream returned by `get_stream` for the locator's namespace,
    table name, and table version; a ValueError is raised if that stream is
    not found. The result of `_latest` for a COMMITTED Partition metafile is
    returned unchanged.
    """
    partition_locator = PartitionLocator.of(
        stream_locator=stream_locator,
        partition_values=partition_values,
        partition_id=None,
    )
    if not partition_scheme_id:
        # fall back to the partition scheme of the current revision of the
        # table version's `deltacat` stream
        parent_stream = get_stream(
            *args,
            namespace=stream_locator.namespace,
            table_name=stream_locator.table_name,
            table_version=stream_locator.table_version,
            **kwargs,
        )
        if not parent_stream:
            raise ValueError(f"Stream {stream_locator} not found.")
        partition_scheme_id = parent_stream.partition_scheme.id
    partition_query = Partition.of(
        locator=partition_locator,
        schema=None,
        content_types=None,
        state=CommitState.COMMITTED,
        partition_scheme_id=partition_scheme_id,
    )
    return _latest(*args, metafile=partition_query, **kwargs)
|
1812
|
+
|
1813
|
+
|
1814
|
+
def stage_delta(
    data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
    partition: Partition,
    delta_type: DeltaType = DeltaType.UPSERT,
    max_records_per_entry: Optional[int] = None,
    author: Optional[ManifestAuthor] = None,
    properties: Optional[DeltaProperties] = None,
    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
    content_type: ContentType = ContentType.PARQUET,
    entry_params: Optional[EntryParams] = None,
    *args,
    **kwargs,
) -> Delta:
    """
    Placeholder: this storage implementation does not stage deltas yet and
    always raises NotImplementedError.

    Intended contract (not implemented here): write the given data to one or
    more files and return an unregistered delta whose manifest entries point
    to the written files, applying any schema consistency policies configured
    for the parent table version.
    """
    raise NotImplementedError("stage_delta not implemented")
|
1840
|
+
|
1841
|
+
|
1842
|
+
def commit_delta(delta: Delta, *args, **kwargs) -> Delta:
    """
    Placeholder: this storage implementation does not commit deltas yet and
    always raises NotImplementedError.

    Intended contract (not implemented here): register a new delta with its
    target table version and partition and return the registered delta,
    rejecting the commit when a specified previous stream position does not
    match the partition's actual previous stream position.
    """
    raise NotImplementedError("commit_delta not implemented")
|
1852
|
+
|
1853
|
+
|
1854
|
+
def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
    """
    Fetches namespace metadata for the named table namespace by passing a
    Namespace metafile for that name to `_latest`. Returns None if the given
    namespace does not exist.
    """
    namespace_query = Namespace.of(NamespaceLocator.of(namespace))
    return _latest(*args, metafile=namespace_query, **kwargs)
|
1864
|
+
|
1865
|
+
|
1866
|
+
def namespace_exists(namespace: str, *args, **kwargs) -> bool:
    """
    Reports whether the named table namespace exists, as answered by
    `_exists` for a Namespace metafile with that name.
    """
    namespace_query = Namespace.of(NamespaceLocator.of(namespace))
    return _exists(*args, metafile=namespace_query, **kwargs)
|
1875
|
+
|
1876
|
+
|
1877
|
+
def get_table(namespace: str, table_name: str, *args, **kwargs) -> Optional[Table]:
    """
    Fetches table metadata for the table at `namespace`.`table_name` by
    passing a Table metafile for that location to `_latest`. Returns None if
    the given table does not exist.
    """
    table_query = Table.of(
        locator=TableLocator.at(namespace=namespace, table_name=table_name)
    )
    return _latest(*args, metafile=table_query, **kwargs)
|
1888
|
+
|
1889
|
+
|
1890
|
+
def table_exists(namespace: str, table_name: str, *args, **kwargs) -> bool:
    """
    Reports whether the table at `namespace`.`table_name` exists, as answered
    by `_exists` for a Table metafile at that location.
    """
    table_query = Table.of(
        locator=TableLocator.at(namespace=namespace, table_name=table_name)
    )
    return _exists(*args, metafile=table_query, **kwargs)
|
1900
|
+
|
1901
|
+
|
1902
|
+
def get_table_version(
    namespace: str,
    table_name: str,
    table_version: str,
    *args,
    **kwargs,
) -> Optional[TableVersion]:
    """
    Fetches table version metadata for the given namespace, table name, and
    table version by passing a schemaless TableVersion metafile for that
    location to `_latest`. Returns None if the given table version does not
    exist.
    """
    version_locator = TableVersionLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
    )
    # schema is left unset: the metafile only identifies what to look up
    version_query = TableVersion.of(locator=version_locator, schema=None)
    return _latest(*args, metafile=version_query, **kwargs)
|
1927
|
+
|
1928
|
+
|
1929
|
+
def get_latest_table_version(
    namespace: str, table_name: str, *args, **kwargs
) -> Optional[TableVersion]:
    """
    Fetches table version metadata for the latest version of the given table.

    The latest table version ID is resolved first; if none is resolved, None
    is returned without a further lookup. Otherwise the result of
    `get_table_version` for that ID is returned.
    """
    latest_id = _resolve_latest_table_version_id(
        *args,
        namespace=namespace,
        table_name=table_name,
        fail_if_no_active_table_version=False,
        **kwargs,
    )
    if not latest_id:
        return None
    return get_table_version(
        *args,
        namespace=namespace,
        table_name=table_name,
        table_version=latest_id,
        **kwargs,
    )
|
1956
|
+
|
1957
|
+
|
1958
|
+
def get_latest_active_table_version(
    namespace: str, table_name: str, *args, **kwargs
) -> Optional[TableVersion]:
    """
    Fetches table version metadata for the latest active version of the given
    table.

    The latest active table version ID is resolved first (without failing if
    there is none); if none is resolved, None is returned without a further
    lookup. Otherwise the result of `get_table_version` for that ID is
    returned.
    """
    active_id = _resolve_latest_active_table_version_id(
        *args,
        namespace=namespace,
        table_name=table_name,
        fail_if_no_active_table_version=False,
        **kwargs,
    )
    if not active_id:
        return None
    return get_table_version(
        *args,
        namespace=namespace,
        table_name=table_name,
        table_version=active_id,
        **kwargs,
    )
|
1984
|
+
|
1985
|
+
|
1986
|
+
def get_table_version_column_names(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[List[str]]:
    """
    Gets a list of column names for the specified table version, or for the
    latest active table version if none is specified. The names are read from
    the Arrow form of the schema returned by `get_table_version_schema`.

    Returns None for schemaless tables.
    """
    # Forward *args/**kwargs like every other lookup in this module so that
    # caller-supplied catalog configuration reaches the schema lookup.
    schema = get_table_version_schema(
        *args,
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        **kwargs,
    )
    return schema.arrow.names if schema else None
|
2007
|
+
|
2008
|
+
|
2009
|
+
def get_table_version_schema(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Schema]:
    """
    Gets the schema for the specified table version, or for the latest active
    table version if none is specified. Returns None if the table version is
    schemaless.

    Raises a ValueError if the requested table version does not exist, or if
    no table version was given and no active table version was resolved.
    """
    if table_version:
        resolved_version = get_table_version(
            *args,
            namespace=namespace,
            table_name=table_name,
            table_version=table_version,
            **kwargs,
        )
    else:
        resolved_version = get_latest_active_table_version(
            *args,
            namespace=namespace,
            table_name=table_name,
            **kwargs,
        )
    if not resolved_version:
        # Fail with a descriptive error instead of an AttributeError raised by
        # reading `.schema` from None.
        if table_version:
            raise ValueError(
                f"Table version not found: {namespace}.{table_name}."
                f"{table_version}."
            )
        raise ValueError(
            f"No active table version found for table: {namespace}.{table_name}."
        )
    return resolved_version.schema
|
2038
|
+
|
2039
|
+
|
2040
|
+
def table_version_exists(
    namespace: str,
    table_name: str,
    table_version: str,
    *args,
    **kwargs,
) -> bool:
    """
    Reports whether the given table version exists, as answered by `_exists`
    for a schemaless TableVersion metafile at that location.
    """
    version_locator = TableVersionLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
    )
    # schema is left unset: the metafile only identifies what to look up
    version_query = TableVersion.of(locator=version_locator, schema=None)
    return _exists(*args, metafile=version_query, **kwargs)
|
2064
|
+
|
2065
|
+
|
2066
|
+
def can_categorize(e: BaseException, *args, **kwargs) -> bool:
    """
    Return whether input error is from storage implementation layer.

    Not implemented for this storage implementation: always raises
    NotImplementedError.
    """
    # carry a message, like the other unimplemented functions in this module
    raise NotImplementedError("can_categorize not implemented")
|
2071
|
+
|
2072
|
+
|
2073
|
+
def raise_categorized_error(e: BaseException, *args, **kwargs):
    """
    Raise and handle storage implementation layer specific errors.

    Not implemented for this storage implementation: always raises
    NotImplementedError.
    """
    # carry a message, like the other unimplemented functions in this module
    raise NotImplementedError("raise_categorized_error not implemented")
|