deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/catalog/main/impl.py (new file, +720 lines)
@@ -0,0 +1,720 @@
```python
from typing import Any, Dict, List, Optional, Union, Tuple
import logging

from deltacat.catalog import CatalogProperties
from deltacat.exceptions import (
    NamespaceAlreadyExistsError,
    StreamNotFoundError,
    TableAlreadyExistsError,
    TableVersionNotFoundError,
)
from deltacat.catalog.model.table_definition import TableDefinition
from deltacat.storage.model.sort_key import SortScheme
from deltacat.storage.model.list_result import ListResult
from deltacat.storage.model.namespace import Namespace, NamespaceProperties
from deltacat.storage.model.schema import Schema
from deltacat.storage.model.table import TableProperties, Table
from deltacat.storage.model.types import (
    DistributedDataset,
    LifecycleState,
    LocalDataset,
    LocalTable,
    StreamFormat,
)
from deltacat.storage.model.partition import (
    Partition,
    PartitionLocator,
    PartitionScheme,
)
from deltacat.storage.model.table_version import TableVersion
from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams
from deltacat.storage.model.delta import DeltaType
from deltacat.types.media import ContentType, TableType, DistributedDatasetType
from deltacat.types.tables import TableWriteMode
from deltacat.compute.merge_on_read import MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE
from deltacat import logs
from deltacat.constants import DEFAULT_NAMESPACE
from deltacat.storage import metastore as storage_impl

logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

"""
This is the default implementation for the Catalog interface, using DeltaCAT native storage

Note that, when this catalog implementation gets called through the normal pattern of `delegate.py`, all functions
will be called the kwarg "inner" equal to the `CatalogProperties` this was initialized with.

`CatalogProperties` has all state required to implement catalog functions, such as metastore root URI
"""


# catalog functions
def initialize(config: CatalogProperties = None, *args, **kwargs) -> CatalogProperties:
    """
    Initializes the data catalog with the given arguments.

    returns CatalogProperties as the "inner" state value for a DC native catalog
    """
    if config is not None:
        return config
    else:
        return CatalogProperties(*args, **kwargs)


# table functions
def write_to_table(
    data: Union[LocalTable, LocalDataset, DistributedDataset],  # type: ignore
    table: str,
    *args,
    namespace: Optional[str] = None,
    mode: TableWriteMode = TableWriteMode.AUTO,
    content_type: ContentType = ContentType.PARQUET,
    **kwargs,
) -> None:
    """Write local or distributed data to a table. Raises an error if the
    table does not exist and the table write mode is not CREATE or AUTO.

    When creating a table, all `create_table` parameters may be optionally
    specified as additional keyword arguments. When appending to, or replacing,
    an existing table, all `alter_table` parameters may be optionally specified
    as additional keyword arguments."""
    raise NotImplementedError("write_to_table not implemented")


def read_table(
    table: str,
    *args,
    namespace: Optional[str] = None,
    table_version: Optional[str] = None,
    table_type: Optional[TableType] = TableType.PYARROW,
    distributed_dataset_type: Optional[
        DistributedDatasetType
    ] = DistributedDatasetType.RAY_DATASET,
    partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
    stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
    merge_on_read: Optional[bool] = False,
    reader_kwargs: Optional[Dict[Any, Any]] = None,
    **kwargs,
) -> DistributedDataset:  # type: ignore
    """Read a table into a distributed dataset."""

    if reader_kwargs is None:
        reader_kwargs = {}

    _validate_read_table_args(
        namespace=namespace,
        table_type=table_type,
        distributed_dataset_type=distributed_dataset_type,
        merge_on_read=merge_on_read,
        **kwargs,
    )

    table_version_obj = _get_latest_or_given_table_version(
        namespace=namespace,
        table_name=table,
        table_version=table_version,
        **kwargs,
    )
    table_version = table_version_obj.table_version

    if (
        table_version_obj.content_types is None
        or len(table_version_obj.content_types) != 1
    ):
        raise ValueError(
            "Expected exactly one content type but "
            f"found {table_version_obj.content_types}."
        )

    logger.info(
        f"Reading metadata for table={namespace}/{table}/{table_version} "
        f"with partition_filters={partition_filter} and stream position"
        f" range={stream_position_range_inclusive}"
    )

    if partition_filter is None:
        logger.info(
            f"Reading all partitions metadata in the table={table} "
            "as partition_filter was None."
        )
        partition_filter = (
            _get_storage(**kwargs)
            .list_partitions(
                table_name=table,
                namespace=namespace,
                table_version=table_version,
                **kwargs,
            )
            .all_items()
        )

    qualified_deltas = _get_deltas_from_partition_filter(
        stream_position_range_inclusive=stream_position_range_inclusive,
        partition_filter=partition_filter,
        **kwargs,
    )

    logger.info(
        f"Total qualified deltas={len(qualified_deltas)} "
        f"from {len(partition_filter)} partitions."
    )

    merge_on_read_params = MergeOnReadParams.of(
        {
            "deltas": qualified_deltas,
            "deltacat_storage": _get_storage(**kwargs),
            "deltacat_storage_kwargs": {**kwargs},
            "reader_kwargs": reader_kwargs,
        }
    )

    return MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE[distributed_dataset_type.value](
        params=merge_on_read_params, **kwargs
    )


def alter_table(
    table: str,
    *args,
    namespace: Optional[str] = None,
    lifecycle_state: Optional[LifecycleState] = None,
    schema_updates: Optional[Dict[str, Any]] = None,
    partition_updates: Optional[Dict[str, Any]] = None,
    sort_keys: Optional[SortScheme] = None,
    description: Optional[str] = None,
    properties: Optional[TableProperties] = None,
    **kwargs,
) -> None:
    """Alter deltacat table/table_version definition.

    Modifies various aspects of a table's metadata including lifecycle state,
    schema, partitioning, sort keys, description, and properties.

    Args:
        table: Name of the table to alter.
        namespace: Optional namespace of the table. Uses default namespace if not specified.
        lifecycle_state: New lifecycle state for the table.
        schema_updates: Map of schema updates to apply.
        partition_updates: Map of partition scheme updates to apply.
        sort_keys: New sort keys scheme.
        description: New description for the table.
        properties: New table properties.

    Returns:
        None

    Raises:
        TableNotFoundError: If the table does not already exist.
    """
    namespace = namespace or default_namespace()

    _get_storage(**kwargs).update_table(
        *args,
        namespace=namespace,
        table_name=table,
        description=description,
        properties=properties,
        lifecycle_state=lifecycle_state,
        **kwargs,
    )

    table_version = _get_storage(**kwargs).get_latest_table_version(
        namespace, table, **kwargs
    )
    _get_storage(**kwargs).update_table_version(
        *args,
        namespace=namespace,
        table_name=table,
        table_version=table_version.id,
        description=description,
        schema_updates=schema_updates,
        partition_updates=partition_updates,
        sort_keys=sort_keys,
        **kwargs,
    )


def create_table(
    name: str,
    *args,
    namespace: Optional[str] = None,
    version: Optional[str] = None,
    lifecycle_state: Optional[LifecycleState] = LifecycleState.ACTIVE,
    schema: Optional[Schema] = None,
    partition_scheme: Optional[PartitionScheme] = None,
    sort_keys: Optional[SortScheme] = None,
    description: Optional[str] = None,
    table_properties: Optional[TableProperties] = None,
    namespace_properties: Optional[NamespaceProperties] = None,
    content_types: Optional[List[ContentType]] = None,
    fail_if_exists: bool = True,
    **kwargs,
) -> TableDefinition:
    """Create an empty table in the catalog.

    If a namespace isn't provided, the table will be created within the default deltacat namespace.
    Additionally if the provided namespace does not exist, it will be created for you.


    Args:
        name: Name of the table to create.
        namespace: Optional namespace for the table. Uses default namespace if not specified.
        version: Optional version identifier for the table.
        lifecycle_state: Lifecycle state of the new table. Defaults to ACTIVE.
        schema: Schema definition for the table.
        partition_scheme: Optional partitioning scheme for the table.
        sort_keys: Optional sort keys for the table.
        description: Optional description of the table.
        table_properties: Optional properties for the table.
        namespace_properties: Optional properties for the namespace if it needs to be created.
        content_types: Optional list of allowed content types for the table.
        fail_if_exists: If True, raises an error if table already exists. If False, returns existing table.

    Returns:
        TableDefinition object for the created or existing table.

    Raises:
        TableAlreadyExistsError: If the table already exists and fail_if_exists is True.
        NamespaceNotFoundError: If the provided namespace does not exist.
    """
    namespace = namespace or default_namespace()

    table = get_table(*args, name, namespace=namespace, table_version=version, **kwargs)
    if table is not None:
        if fail_if_exists:
            raise TableAlreadyExistsError(f"Table {namespace}.{name} already exists")
        return table

    if not namespace_exists(*args, namespace, **kwargs):
        create_namespace(
            *args, namespace=namespace, properties=namespace_properties, **kwargs
        )

    (table, table_version, stream) = _get_storage(**kwargs).create_table_version(
        *args,
        namespace=namespace,
        table_name=name,
        table_version=version,
        schema=schema,
        partition_scheme=partition_scheme,
        sort_keys=sort_keys,
        table_version_description=description,
        table_description=description,
        table_properties=table_properties,
        lifecycle_state=lifecycle_state or LifecycleState.ACTIVE,
        supported_content_types=content_types,
        **kwargs,
    )

    return TableDefinition.of(
        table=table,
        table_version=table_version,
        stream=stream,
    )


def drop_table(
    name: str,
    *args,
    namespace: Optional[str] = None,
    table_version: Optional[str] = None,
    purge: bool = False,
    **kwargs,
) -> None:
    """Drop a table from the catalog and optionally purges underlying data.

    Args:
        name: Name of the table to drop.
        namespace: Optional namespace of the table. Uses default namespace if not specified.
        purge: If True, permanently delete the table data. If False, only remove from catalog.

    Returns:
        None

    Raises:
        TableNotFoundError: If the table does not exist.

    TODO: Honor purge once garbage collection is implemented.
    TODO: Drop table version if specified, possibly create a delete_table_version api.
    """
    if purge:
        raise NotImplementedError("Purge flag is not currently supported.")

    namespace = namespace or default_namespace()
    _get_storage(**kwargs).delete_table(
        *args, namespace=namespace, name=name, purge=purge, **kwargs
    )


def refresh_table(table: str, *args, namespace: Optional[str] = None, **kwargs) -> None:
    """Refresh metadata cached on the Ray cluster for the given table.

    Args:
        table: Name of the table to refresh.
        namespace: Optional namespace of the table. Uses default namespace if not specified.

    Returns:
        None
    """
    raise NotImplementedError("refresh_table not implemented")


def list_tables(
    *args, namespace: Optional[str] = None, **kwargs
) -> ListResult[TableDefinition]:
    """List a page of table definitions.

    Args:
        namespace: Optional namespace to list tables from. Uses default namespace if not specified.

    Returns:
        ListResult containing TableDefinition objects for tables in the namespace.
    """
    namespace = namespace or default_namespace()
    tables = _get_storage(**kwargs).list_tables(*args, namespace=namespace, **kwargs)
    table_definitions = [
        get_table(*args, table.table_name, namespace, **kwargs)
        for table in tables.all_items()
    ]

    return ListResult(items=table_definitions)


def get_table(
    name: str,
    *args,
    namespace: Optional[str] = None,
    table_version: Optional[str] = None,
    stream_format: StreamFormat = StreamFormat.DELTACAT,
    **kwargs,
) -> Optional[TableDefinition]:
    """Get table definition metadata.

    Args:
        name: Name of the table to retrieve.
        namespace: Optional namespace of the table. Uses default namespace if not specified.
        table_version: Optional specific version of the table to retrieve.
            If not specified, the latest version is used.
        stream_format: Optional stream format to retrieve. Uses the default Deltacat stream
            format if not specified.

    Returns:
        Deltacat TableDefinition if the table exists, None otherwise.

    Raises:
        TableVersionNotFoundError: If the table version does not exist.
        StreamNotFoundError: If the stream does not exist.
    """
    namespace = namespace or default_namespace()
    table: Optional[Table] = _get_storage(**kwargs).get_table(
        *args, table_name=name, namespace=namespace, **kwargs
    )

    if table is None:
        return None

    table_version: Optional[TableVersion] = _get_storage(**kwargs).get_table_version(
        *args, namespace, name, table_version or table.latest_table_version, **kwargs
    )

    if table_version is None:
        raise TableVersionNotFoundError(
            f"TableVersion {namespace}.{name}.{table_version} does not exist."
        )

    stream = _get_storage(**kwargs).get_stream(
        *args,
        namespace=namespace,
        table_name=name,
        table_version=table_version.id,
        stream_format=stream_format,
        **kwargs,
    )

    if stream is None:
        raise StreamNotFoundError(
            f"Stream {namespace}.{table}.{table_version}.{stream} does not exist."
        )

    return TableDefinition.of(
        table=table,
        table_version=table_version,
        stream=stream,
    )


def truncate_table(
    table: str, *args, namespace: Optional[str] = None, **kwargs
) -> None:
    """Truncate table data.

    Args:
        table: Name of the table to truncate.
        namespace: Optional namespace of the table. Uses default namespace if not specified.

    Returns:
        None
    """
    raise NotImplementedError("truncate_table not implemented")


def rename_table(
    table: str, new_name: str, *args, namespace: Optional[str] = None, **kwargs
) -> None:
    """Rename an existing table.

    Args:
        table: Current name of the table.
        new_name: New name for the table.
        namespace: Optional namespace of the table. Uses default namespace if not specified.

    Returns:
        None

    Raises:
        TableNotFoundError: If the table does not exist.
    """
    namespace = namespace or default_namespace()
    _get_storage(**kwargs).update_table(
        *args, table_name=table, new_table_name=new_name, namespace=namespace, **kwargs
    )


def table_exists(table: str, *args, namespace: Optional[str] = None, **kwargs) -> bool:
    """Check if a table exists in the catalog.

    Args:
        table: Name of the table to check.
        namespace: Optional namespace of the table. Uses default namespace if not specified.

    Returns:
        True if the table exists, False otherwise.
    """
    namespace = namespace or default_namespace()
    return _get_storage(**kwargs).table_exists(
        *args, table_name=table, namespace=namespace, **kwargs
    )


def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
    """List a page of table namespaces.

    Args:
        catalog: Catalog properties instance.

    Returns:
        ListResult containing Namespace objects.
    """
    return _get_storage(**kwargs).list_namespaces(*args, **kwargs)


def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
    """Get metadata for a specific table namespace.

    Args:
        namespace: Name of the namespace to retrieve.

    Returns:
        Namespace object if the namespace exists, None otherwise.
    """
    return _get_storage(**kwargs).get_namespace(*args, namespace=namespace, **kwargs)


def namespace_exists(namespace: str, *args, **kwargs) -> bool:
    """Check if a namespace exists.

    Args:
        namespace: Name of the namespace to check.

    Returns:
        True if the namespace exists, False otherwise.
    """
    return _get_storage(**kwargs).namespace_exists(*args, namespace=namespace, **kwargs)


def create_namespace(
    namespace: str, *args, properties: Optional[NamespaceProperties] = None, **kwargs
) -> Namespace:
    """Create a new namespace.

    Args:
        namespace: Name of the namespace to create.
        properties: Optional properties for the namespace.

    Returns:
        Created Namespace object.

    Raises:
        NamespaceAlreadyExistsError: If the namespace already exists.
    """
    if namespace_exists(namespace, **kwargs):
        raise NamespaceAlreadyExistsError(f"Namespace {namespace} already exists")

    return _get_storage(**kwargs).create_namespace(
        *args, namespace=namespace, properties=properties, **kwargs
    )


def alter_namespace(
    namespace: str,
    *args,
    properties: Optional[NamespaceProperties] = None,
    new_namespace: Optional[str] = None,
    **kwargs,
) -> None:
    """Alter a namespace definition.

    Args:
        namespace: Name of the namespace to alter.
        properties: Optional new properties for the namespace.
        new_namespace: Optional new name for the namespace.

    Returns:
        None
    """
    _get_storage(**kwargs).update_namespace(
        namespace=namespace,
        properties=properties,
        new_namespace=new_namespace,
        *args,
        **kwargs,
    )


def drop_namespace(namespace: str, *args, purge: bool = False, **kwargs) -> None:
    """Drop a namespace and all of its tables from the catalog.

    Args:
        namespace: Name of the namespace to drop.
        purge: If True, permanently delete all tables in the namespace.
            If False, only remove from catalog.

    Returns:
        None

    TODO: Honor purge once garbage collection is implemented.
    """
    if purge:
        raise NotImplementedError("Purge flag is not currently supported.")

    _get_storage(**kwargs).delete_namespace(
        *args, namespace=namespace, purge=purge, **kwargs
    )


def default_namespace(*args, **kwargs) -> str:
    """Return the default namespace for the catalog.

    Returns:
        String name of the default namespace.
    """
    return DEFAULT_NAMESPACE  # table functions


def _validate_read_table_args(
    namespace: Optional[str] = None,
    table_type: Optional[TableType] = None,
    distributed_dataset_type: Optional[DistributedDatasetType] = None,
    merge_on_read: Optional[bool] = None,
    **kwargs,
):
    storage = _get_storage(**kwargs)
    if storage is None:
        raise ValueError(
            "Catalog not initialized. Did you miss calling "
            "initialize(ds=<deltacat_storage>)?"
        )

    if merge_on_read:
        raise ValueError("Merge on read not supported currently.")

    if table_type is not TableType.PYARROW:
        raise ValueError("Only PYARROW table type is supported as of now")

    if distributed_dataset_type is not DistributedDatasetType.DAFT:
        raise ValueError("Only DAFT dataset type is supported as of now")

    if namespace is None:
        raise ValueError(
            "namespace must be passed to uniquely identify a table in the catalog."
        )


def _get_latest_or_given_table_version(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> TableVersion:
    table_version_obj = None
    if table_version is None:
        table_version_obj = _get_storage(**kwargs).get_latest_table_version(
            namespace=namespace, table_name=table_name, *args, **kwargs
        )
        table_version = table_version_obj.table_version
    else:
        table_version_obj = _get_storage(**kwargs).get_table_version(
            namespace=namespace,
            table_name=table_name,
            table_version=table_version,
            *args,
            **kwargs,
        )

    return table_version_obj


def _get_deltas_from_partition_filter(
    partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
    stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
    *args,
    **kwargs,
):

    result_deltas = []
    start_stream_position, end_stream_position = stream_position_range_inclusive or (
        None,
        None,
    )
    for partition_like in partition_filter:
        deltas = (
            _get_storage(**kwargs)
            .list_partition_deltas(
                partition_like=partition_like,
                ascending_order=True,
                include_manifest=True,
                start_stream_position=start_stream_position,
                last_stream_position=end_stream_position,
                *args,
                **kwargs,
            )
            .all_items()
        )

        for delta in deltas:
            if (
                start_stream_position is None
                or delta.stream_position >= start_stream_position
            ) and (
                end_stream_position is None
                or delta.stream_position <= end_stream_position
            ):
                if delta.type == DeltaType.DELETE:
                    raise ValueError("DELETE type deltas are not supported")
                result_deltas.append(delta)

    return result_deltas


def _get_storage(**kwargs):
    """
    Returns the implementation of `deltacat.storage.interface` to use with this catalog.

    This is configured in the `CatalogProperties` stored during initialization and passed through `delegate.py`
    """
    properties: Optional[CatalogProperties] = kwargs.get("inner")
    if properties is not None and properties.storage is not None:
        return properties.storage
    else:
        return storage_impl
```
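
The module above exposes the DeltaCAT-native catalog as plain module-level functions that receive their `CatalogProperties` state through the `inner` keyword argument (normally supplied by `delegate.py`). The sketch below is not part of the package diff; it only illustrates how those functions could be driven directly, assuming the new module is importable as `deltacat.catalog.main.impl` and that a default `CatalogProperties` can be constructed. Namespace and table names are placeholders.

```python
# Illustrative sketch only -- not shipped in deltacat 2.0.
from deltacat.catalog.main import impl as catalog  # assumed import path for the new module

# initialize() returns the CatalogProperties used as the catalog's "inner" state;
# any constructor arguments (e.g., a metastore root URI, per the module docstring)
# would be forwarded to CatalogProperties here.
props = catalog.initialize()

# When calling the functions directly (instead of through delegate.py), the
# properties are passed explicitly via the "inner" kwarg that _get_storage() reads.
if not catalog.namespace_exists("demo", inner=props):
    catalog.create_namespace("demo", inner=props)

table_def = catalog.create_table(
    "events",
    namespace="demo",
    fail_if_exists=False,  # return the existing table instead of raising
    inner=props,
)
print(catalog.table_exists("events", namespace="demo", inner=props))
```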
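
On the read path, `_validate_read_table_args` currently requires an explicit namespace, `TableType.PYARROW`, and `DistributedDatasetType.DAFT` (the `RAY_DATASET` default is rejected), and merge-on-read must remain disabled. A hedged sketch of a direct call, reusing `props` from the previous example with placeholder names:

```python
# Illustrative sketch only -- not shipped in deltacat 2.0.
from deltacat.types.media import TableType, DistributedDatasetType
from deltacat.catalog.main import impl as catalog  # assumed import path

dataset = catalog.read_table(
    "events",
    namespace="demo",  # required by _validate_read_table_args
    table_type=TableType.PYARROW,  # only table type accepted today
    distributed_dataset_type=DistributedDatasetType.DAFT,  # RAY_DATASET default is rejected
    stream_position_range_inclusive=(0, 100),  # optional inclusive delta stream-position range
    inner=props,  # CatalogProperties returned by initialize()
)
```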