deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.0b2.dist-info/METADATA +65 -0
- deltacat-2.0.0b2.dist-info/RECORD +349 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/storage/interface.py
CHANGED
@@ -1,38 +1,40 @@
|
|
1
|
-
from typing import Any, Callable, Dict, List, Optional,
|
2
|
-
|
3
|
-
import pyarrow as pa
|
1
|
+
from typing import Any, Callable, Dict, List, Optional, Union, Tuple
|
4
2
|
|
5
3
|
from deltacat.storage import (
|
6
|
-
|
4
|
+
EntryParams,
|
7
5
|
Delta,
|
8
6
|
DeltaLocator,
|
7
|
+
DeltaProperties,
|
9
8
|
DeltaType,
|
10
9
|
DistributedDataset,
|
11
10
|
LifecycleState,
|
12
11
|
ListResult,
|
13
12
|
LocalDataset,
|
14
13
|
LocalTable,
|
15
|
-
Manifest,
|
16
14
|
ManifestAuthor,
|
17
15
|
Namespace,
|
16
|
+
NamespaceProperties,
|
18
17
|
Partition,
|
19
|
-
|
18
|
+
PartitionLocator,
|
19
|
+
PartitionScheme,
|
20
|
+
PartitionValues,
|
21
|
+
Schema,
|
22
|
+
SortScheme,
|
20
23
|
Stream,
|
24
|
+
StreamFormat,
|
21
25
|
StreamLocator,
|
22
26
|
Table,
|
27
|
+
TableProperties,
|
23
28
|
TableVersion,
|
24
|
-
|
25
|
-
|
26
|
-
PartitionFilter,
|
27
|
-
PartitionValues,
|
28
|
-
DeltaPartitionSpec,
|
29
|
-
StreamPartitionSpec,
|
29
|
+
TableVersionLocator,
|
30
|
+
TableVersionProperties,
|
30
31
|
)
|
32
|
+
from deltacat.storage.model.manifest import Manifest
|
31
33
|
from deltacat.types.media import (
|
32
34
|
ContentType,
|
35
|
+
DistributedDatasetType,
|
33
36
|
StorageType,
|
34
37
|
TableType,
|
35
|
-
DistributedDatasetType,
|
36
38
|
)
|
37
39
|
from deltacat.utils.common import ReadKwargsProvider
|
38
40
|
|
@@ -64,12 +66,26 @@ def list_table_versions(
|
|
64
66
|
raise NotImplementedError("list_table_versions not implemented")
|
65
67
|
|
66
68
|
|
69
|
+
def list_streams(
|
70
|
+
namespace: str,
|
71
|
+
table_name: str,
|
72
|
+
table_version: str,
|
73
|
+
*args,
|
74
|
+
**kwargs,
|
75
|
+
) -> ListResult[Stream]:
|
76
|
+
"""
|
77
|
+
Lists a page of streams for the given table version.
|
78
|
+
Raises an error if the table version does not exist.
|
79
|
+
"""
|
80
|
+
raise NotImplementedError("list_streams not implemented")
|
81
|
+
|
82
|
+
|
67
83
|
def list_partitions(
|
68
84
|
namespace: str,
|
69
85
|
table_name: str,
|
70
86
|
table_version: Optional[str] = None,
|
71
87
|
*args,
|
72
|
-
**kwargs
|
88
|
+
**kwargs,
|
73
89
|
) -> ListResult[Partition]:
|
74
90
|
"""
|
75
91
|
Lists a page of partitions for the given table version. Partitions are
|
@@ -96,9 +112,9 @@ def list_deltas(
|
|
96
112
|
last_stream_position: Optional[int] = None,
|
97
113
|
ascending_order: Optional[bool] = None,
|
98
114
|
include_manifest: bool = False,
|
99
|
-
|
115
|
+
partition_scheme_id: Optional[str] = None,
|
100
116
|
*args,
|
101
|
-
**kwargs
|
117
|
+
**kwargs,
|
102
118
|
) -> ListResult[Delta]:
|
103
119
|
"""
|
104
120
|
Lists a page of deltas for the given table version and committed partition.
|
@@ -106,15 +122,13 @@ def list_deltas(
|
|
106
122
|
limited to inclusive first and last stream positions. Deltas are returned by
|
107
123
|
descending stream position by default. Table version resolves to the latest
|
108
124
|
active table version if not specified. Partition values should not be
|
109
|
-
specified for unpartitioned tables.
|
110
|
-
version
|
125
|
+
specified for unpartitioned tables. Partition scheme ID resolves to the
|
126
|
+
table version's current partition scheme by default. Raises an error if the
|
127
|
+
given table version or partition does not exist.
|
111
128
|
|
112
129
|
To conserve memory, the deltas returned do not include manifests by
|
113
130
|
default. The manifests can either be optionally retrieved as part of this
|
114
131
|
call or lazily loaded via subsequent calls to `get_delta_manifest`.
|
115
|
-
|
116
|
-
Note: partition_values is deprecated and will be removed in future releases.
|
117
|
-
Use partition_filter instead.
|
118
132
|
"""
|
119
133
|
raise NotImplementedError("list_deltas not implemented")
|
120
134
|
|
@@ -126,7 +140,7 @@ def list_partition_deltas(
|
|
126
140
|
ascending_order: bool = False,
|
127
141
|
include_manifest: bool = False,
|
128
142
|
*args,
|
129
|
-
**kwargs
|
143
|
+
**kwargs,
|
130
144
|
) -> ListResult[Delta]:
|
131
145
|
"""
|
132
146
|
Lists a page of deltas committed to the given partition.
|
@@ -145,22 +159,21 @@ def get_delta(
|
|
145
159
|
partition_values: Optional[PartitionValues] = None,
|
146
160
|
table_version: Optional[str] = None,
|
147
161
|
include_manifest: bool = False,
|
148
|
-
|
162
|
+
partition_scheme_id: Optional[str] = None,
|
149
163
|
*args,
|
150
|
-
**kwargs
|
164
|
+
**kwargs,
|
151
165
|
) -> Optional[Delta]:
|
152
166
|
"""
|
153
167
|
Gets the delta for the given table version, partition, and stream position.
|
154
168
|
Table version resolves to the latest active table version if not specified.
|
155
|
-
Partition values should not be specified for unpartitioned tables.
|
156
|
-
|
169
|
+
Partition values should not be specified for unpartitioned tables. Partition
|
170
|
+
scheme ID resolves to the table version's current partition scheme by
|
171
|
+
default. Raises an error if the given table version or partition does not
|
172
|
+
exist.
|
157
173
|
|
158
174
|
To conserve memory, the delta returned does not include a manifest by
|
159
175
|
default. The manifest can either be optionally retrieved as part of this
|
160
176
|
call or lazily loaded via a subsequent call to `get_delta_manifest`.
|
161
|
-
|
162
|
-
Note: partition_values is deprecated and will be removed in future releases.
|
163
|
-
Use partition_filter instead.
|
164
177
|
"""
|
165
178
|
raise NotImplementedError("get_delta not implemented")
|
166
179
|
|
@@ -171,23 +184,21 @@ def get_latest_delta(
|
|
171
184
|
partition_values: Optional[PartitionValues] = None,
|
172
185
|
table_version: Optional[str] = None,
|
173
186
|
include_manifest: bool = False,
|
174
|
-
|
187
|
+
partition_scheme_id: Optional[str] = None,
|
175
188
|
*args,
|
176
|
-
**kwargs
|
189
|
+
**kwargs,
|
177
190
|
) -> Optional[Delta]:
|
178
191
|
"""
|
179
192
|
Gets the latest delta (i.e. the delta with the greatest stream position) for
|
180
193
|
the given table version and partition. Table version resolves to the latest
|
181
194
|
active table version if not specified. Partition values should not be
|
182
|
-
specified for unpartitioned tables.
|
183
|
-
version
|
195
|
+
specified for unpartitioned tables. Partition scheme ID resolves to the
|
196
|
+
table version's current partition scheme by default. Raises an error if the
|
197
|
+
given table version or partition does not exist.
|
184
198
|
|
185
199
|
To conserve memory, the delta returned does not include a manifest by
|
186
200
|
default. The manifest can either be optionally retrieved as part of this
|
187
201
|
call or lazily loaded via a subsequent call to `get_delta_manifest`.
|
188
|
-
|
189
|
-
Note: partition_values is deprecated and will be removed in future releases.
|
190
|
-
Use partition_filter instead.
|
191
202
|
"""
|
192
203
|
raise NotImplementedError("get_latest_delta not implemented")
|
193
204
|
|
@@ -201,9 +212,8 @@ def download_delta(
|
|
201
212
|
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
202
213
|
ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
|
203
214
|
distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
|
204
|
-
partition_filter: Optional[PartitionFilter] = None,
|
205
215
|
*args,
|
206
|
-
**kwargs
|
216
|
+
**kwargs,
|
207
217
|
) -> Union[LocalDataset, DistributedDataset]: # type: ignore
|
208
218
|
"""
|
209
219
|
Download the given delta or delta locator into either a list of
|
@@ -211,10 +221,6 @@ def download_delta(
|
|
211
221
|
across this Ray cluster's object store memory. Ordered table N of a local
|
212
222
|
table list, or ordered block N of a distributed dataset, always contain
|
213
223
|
the contents of ordered delta manifest entry N.
|
214
|
-
|
215
|
-
partition_filter is an optional parameter which determines which files to
|
216
|
-
download from the delta manifest. A delta manifest contains all the data files
|
217
|
-
for a given delta.
|
218
224
|
"""
|
219
225
|
raise NotImplementedError("download_delta not implemented")
|
220
226
|
|
@@ -226,7 +232,7 @@ def download_delta_manifest_entry(
|
|
226
232
|
columns: Optional[List[str]] = None,
|
227
233
|
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
228
234
|
*args,
|
229
|
-
**kwargs
|
235
|
+
**kwargs,
|
230
236
|
) -> LocalTable:
|
231
237
|
"""
|
232
238
|
Downloads a single manifest entry into the specified table type for the
|
@@ -244,17 +250,21 @@ def get_delta_manifest(
|
|
244
250
|
) -> Manifest:
|
245
251
|
"""
|
246
252
|
Get the manifest associated with the given delta or delta locator. This
|
247
|
-
always retrieves the authoritative
|
248
|
-
never the local manifest defined for any input delta.
|
253
|
+
always retrieves the authoritative durable copy of the delta manifest, and
|
254
|
+
never the local manifest defined for any input delta. Raises an error if
|
255
|
+
the delta can't be found, or if it doesn't contain a manifest.
|
249
256
|
"""
|
250
257
|
raise NotImplementedError("get_delta_manifest not implemented")
|
251
258
|
|
252
259
|
|
253
260
|
def create_namespace(
|
254
|
-
namespace: str,
|
261
|
+
namespace: str,
|
262
|
+
properties: Optional[NamespaceProperties] = None,
|
263
|
+
*args,
|
264
|
+
**kwargs,
|
255
265
|
) -> Namespace:
|
256
266
|
"""
|
257
|
-
Creates a table namespace with the given name and
|
267
|
+
Creates a table namespace with the given name and properties. Returns
|
258
268
|
the created namespace.
|
259
269
|
"""
|
260
270
|
raise NotImplementedError("create_namespace not implemented")
|
@@ -262,13 +272,13 @@ def create_namespace(
|
|
262
272
|
|
263
273
|
def update_namespace(
|
264
274
|
namespace: str,
|
265
|
-
|
275
|
+
properties: Optional[NamespaceProperties] = None,
|
266
276
|
new_namespace: Optional[str] = None,
|
267
277
|
*args,
|
268
|
-
**kwargs
|
278
|
+
**kwargs,
|
269
279
|
) -> None:
|
270
280
|
"""
|
271
|
-
Updates a table namespace's name and/or
|
281
|
+
Updates a table namespace's name and/or properties. Raises an error if the
|
272
282
|
given namespace does not exist.
|
273
283
|
"""
|
274
284
|
raise NotImplementedError("update_namespace not implemented")
|
@@ -278,52 +288,28 @@ def create_table_version(
|
|
278
288
|
namespace: str,
|
279
289
|
table_name: str,
|
280
290
|
table_version: Optional[str] = None,
|
281
|
-
schema: Optional[
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
sort_keys: Optional[List[SortKey]] = None,
|
291
|
+
schema: Optional[Schema] = None,
|
292
|
+
partition_scheme: Optional[PartitionScheme] = None,
|
293
|
+
# TODO(pdames): rename to `sort_scheme`
|
294
|
+
sort_keys: Optional[SortScheme] = None,
|
286
295
|
table_version_description: Optional[str] = None,
|
287
|
-
table_version_properties: Optional[
|
288
|
-
table_permissions: Optional[Dict[str, Any]] = None,
|
296
|
+
table_version_properties: Optional[TableVersionProperties] = None,
|
289
297
|
table_description: Optional[str] = None,
|
290
|
-
table_properties: Optional[
|
298
|
+
table_properties: Optional[TableProperties] = None,
|
291
299
|
supported_content_types: Optional[List[ContentType]] = None,
|
292
|
-
partition_spec: Optional[StreamPartitionSpec] = None,
|
293
300
|
*args,
|
294
|
-
**kwargs
|
295
|
-
) -> Stream:
|
301
|
+
**kwargs,
|
302
|
+
) -> Tuple[Optional[Table], TableVersion, Stream]:
|
296
303
|
"""
|
297
304
|
Create a table version with an unreleased lifecycle state and an empty delta
|
298
|
-
stream. Table versions may be schemaless and unpartitioned
|
299
|
-
|
300
|
-
|
301
|
-
used with schemaless tables. This can be useful for creating logical shards
|
302
|
-
of a delta stream where partition keys are known but not projected onto each
|
303
|
-
row of the table (e.g. all rows of a customer orders table are known to
|
304
|
-
correspond to a given order day, even if this column doesn't exist in the
|
305
|
-
table). Primary and sort keys must exist within the table's schema.
|
306
|
-
Permissions specified at the table level override any conflicting
|
307
|
-
permissions specified at the table namespace level. Returns the stream
|
308
|
-
for the created table version. Raises an error if the given namespace does
|
309
|
-
not exist.
|
310
|
-
|
311
|
-
Schemas are optional for DeltaCAT tables and can be used to inform the data
|
312
|
-
consistency checks run for each field. If a schema is present, it can be
|
313
|
-
used to enforce the following column-level data consistency policies at
|
314
|
-
table load time:
|
315
|
-
|
316
|
-
None: No consistency checks are run. May be mixed with the below two
|
317
|
-
policies by specifying column names to pass through together with
|
318
|
-
column names to coerce/validate.
|
305
|
+
stream. Table versions may be schemaless and unpartitioned to improve write
|
306
|
+
performance, or have their writes governed by a schema and partition scheme
|
307
|
+
to improve data consistency and read performance.
|
319
308
|
|
320
|
-
|
321
|
-
|
309
|
+
Returns a tuple containing the created/updated table, table version, and
|
310
|
+
stream (respectively).
|
322
311
|
|
323
|
-
|
324
|
-
explicit subset of column names to validate may optionally be specified.
|
325
|
-
|
326
|
-
Either partition_keys or partition_spec must be specified but not both.
|
312
|
+
Raises an error if the given namespace does not exist.
|
327
313
|
"""
|
328
314
|
raise NotImplementedError("create_table_version not implemented")
|
329
315
|
|
@@ -331,18 +317,17 @@ def create_table_version(
|
|
331
317
|
def update_table(
|
332
318
|
namespace: str,
|
333
319
|
table_name: str,
|
334
|
-
permissions: Optional[Dict[str, Any]] = None,
|
335
320
|
description: Optional[str] = None,
|
336
|
-
properties: Optional[
|
321
|
+
properties: Optional[TableProperties] = None,
|
337
322
|
new_table_name: Optional[str] = None,
|
338
323
|
*args,
|
339
|
-
**kwargs
|
324
|
+
**kwargs,
|
340
325
|
) -> None:
|
341
326
|
"""
|
342
327
|
Update table metadata describing the table versions it contains. By default,
|
343
|
-
a table's properties are empty, and its description
|
344
|
-
|
345
|
-
|
328
|
+
a table's properties are empty, and its description is equal to that given
|
329
|
+
when its first table version was created. Raises an error if the given
|
330
|
+
table does not exist.
|
346
331
|
"""
|
347
332
|
raise NotImplementedError("update_table not implemented")
|
348
333
|
|
@@ -352,12 +337,14 @@ def update_table_version(
|
|
352
337
|
table_name: str,
|
353
338
|
table_version: str,
|
354
339
|
lifecycle_state: Optional[LifecycleState] = None,
|
355
|
-
schema: Optional[
|
356
|
-
schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
|
340
|
+
schema: Optional[Schema] = None,
|
357
341
|
description: Optional[str] = None,
|
358
|
-
properties: Optional[
|
342
|
+
properties: Optional[TableVersionProperties] = None,
|
343
|
+
partition_scheme: Optional[PartitionScheme] = None,
|
344
|
+
# TODO(pdames): rename to `sort_scheme`
|
345
|
+
sort_keys: Optional[SortScheme] = None,
|
359
346
|
*args,
|
360
|
-
**kwargs
|
347
|
+
**kwargs,
|
361
348
|
) -> None:
|
362
349
|
"""
|
363
350
|
Update a table version. Notably, updating an unreleased table version's
|
@@ -375,18 +362,27 @@ def stage_stream(
|
|
375
362
|
namespace: str,
|
376
363
|
table_name: str,
|
377
364
|
table_version: Optional[str] = None,
|
365
|
+
stream_format: StreamFormat = StreamFormat.DELTACAT,
|
378
366
|
*args,
|
379
|
-
**kwargs
|
367
|
+
**kwargs,
|
380
368
|
) -> Stream:
|
381
369
|
"""
|
382
370
|
Stages a new delta stream for the given table version. Resolves to the
|
383
|
-
latest active table version if no table version is given.
|
384
|
-
|
371
|
+
latest active table version if no table version is given. Resolves to the
|
372
|
+
DeltaCAT stream format if no stream format is given. If this stream
|
373
|
+
will replace another stream with the same format and scheme, then it will
|
374
|
+
have its previous stream ID set to the ID of the stream being replaced.
|
375
|
+
Returns the staged stream. Raises an error if the table version does not
|
376
|
+
exist.
|
385
377
|
"""
|
386
378
|
raise NotImplementedError("stage_stream not implemented")
|
387
379
|
|
388
380
|
|
389
|
-
def commit_stream(
|
381
|
+
def commit_stream(
|
382
|
+
stream: Stream,
|
383
|
+
*args,
|
384
|
+
**kwargs,
|
385
|
+
) -> Stream:
|
390
386
|
"""
|
391
387
|
Registers a delta stream with a target table version, replacing any
|
392
388
|
previous stream registered for the same table version. Returns the
|
@@ -399,43 +395,112 @@ def delete_stream(
|
|
399
395
|
namespace: str,
|
400
396
|
table_name: str,
|
401
397
|
table_version: Optional[str] = None,
|
398
|
+
stream_format: StreamFormat = StreamFormat.DELTACAT,
|
402
399
|
*args,
|
403
|
-
**kwargs
|
400
|
+
**kwargs,
|
404
401
|
) -> None:
|
405
402
|
"""
|
406
403
|
Deletes the delta stream currently registered with the given table version.
|
407
404
|
Resolves to the latest active table version if no table version is given.
|
408
|
-
|
405
|
+
Resolves to the DeltaCAT stream format if no stream format is given.
|
406
|
+
Raises an error if the stream does not exist.
|
409
407
|
"""
|
410
408
|
raise NotImplementedError("delete_stream not implemented")
|
411
409
|
|
412
410
|
|
411
|
+
def delete_table(
|
412
|
+
namespace: str,
|
413
|
+
name: str,
|
414
|
+
purge: bool = False,
|
415
|
+
*args,
|
416
|
+
**kwargs,
|
417
|
+
) -> None:
|
418
|
+
"""
|
419
|
+
Drops the given table and all its contents (table versions, streams, partitions,
|
420
|
+
and deltas). If purge is True, also removes all data files associated with the table.
|
421
|
+
Raises an error if the given table does not exist.
|
422
|
+
"""
|
423
|
+
raise NotImplementedError("delete_table not implemented")
|
424
|
+
|
425
|
+
|
426
|
+
def delete_namespace(
|
427
|
+
namespace: str,
|
428
|
+
purge: bool = False,
|
429
|
+
*args,
|
430
|
+
**kwargs,
|
431
|
+
) -> None:
|
432
|
+
"""
|
433
|
+
Drops a table namespace and all its contents. If purge is True, then all
|
434
|
+
tables, table versions, and deltas will be deleted. Otherwise, the namespace
|
435
|
+
will be dropped only if it is empty. Raises an error if the given namespace
|
436
|
+
does not exist.
|
437
|
+
"""
|
438
|
+
raise NotImplementedError("drop_namespace not implemented")
|
439
|
+
|
440
|
+
|
441
|
+
def get_stream_by_id(
|
442
|
+
table_version_locator: TableVersionLocator,
|
443
|
+
stream_id: str,
|
444
|
+
*args,
|
445
|
+
**kwargs,
|
446
|
+
) -> Optional[Partition]:
|
447
|
+
"""
|
448
|
+
Gets the stream for the given table version locator and stream ID.
|
449
|
+
Returns None if the stream does not exist. Raises an error if the given
|
450
|
+
table version locator does not exist.
|
451
|
+
"""
|
452
|
+
raise NotImplementedError("get_stream_by_id not implemented")
|
453
|
+
|
454
|
+
|
413
455
|
def get_stream(
|
414
456
|
namespace: str,
|
415
457
|
table_name: str,
|
416
458
|
table_version: Optional[str] = None,
|
459
|
+
stream_format: StreamFormat = StreamFormat.DELTACAT,
|
417
460
|
*args,
|
418
|
-
**kwargs
|
461
|
+
**kwargs,
|
419
462
|
) -> Optional[Stream]:
|
420
463
|
"""
|
421
|
-
Gets the most recently committed stream for the given table version
|
422
|
-
|
423
|
-
|
464
|
+
Gets the most recently committed stream for the given table version.
|
465
|
+
Resolves to the latest active table version if no table version is given.
|
466
|
+
Resolves to the DeltaCAT stream format if no stream format is given.
|
467
|
+
Returns None if the table version or stream format does not exist.
|
424
468
|
"""
|
425
469
|
raise NotImplementedError("get_stream not implemented")
|
426
470
|
|
427
471
|
|
472
|
+
def stream_exists(
|
473
|
+
namespace: str,
|
474
|
+
table_name: str,
|
475
|
+
table_version: Optional[str] = None,
|
476
|
+
stream_format: StreamFormat = StreamFormat.DELTACAT,
|
477
|
+
*args,
|
478
|
+
**kwargs,
|
479
|
+
) -> bool:
|
480
|
+
"""
|
481
|
+
Returns True if the given Stream exists, False if not.
|
482
|
+
Resolves to the latest active table version if no table version is given.
|
483
|
+
Resolves to the DeltaCAT stream format if no stream format is given.
|
484
|
+
Returns False if the table version or stream format does not exist.
|
485
|
+
"""
|
486
|
+
raise NotImplementedError("stream_exists not implemented")
|
487
|
+
|
488
|
+
|
428
489
|
def stage_partition(
|
429
|
-
stream: Stream,
|
490
|
+
stream: Stream,
|
491
|
+
partition_values: Optional[PartitionValues] = None,
|
492
|
+
partition_scheme_id: Optional[str] = None,
|
493
|
+
*args,
|
494
|
+
**kwargs,
|
430
495
|
) -> Partition:
|
431
496
|
"""
|
432
497
|
Stages a new partition for the given stream and partition values. Returns
|
433
498
|
the staged partition. If this partition will replace another partition
|
434
|
-
with the same partition values, then it will have its previous
|
435
|
-
set to the ID of the partition being replaced. Partition
|
436
|
-
specified for unpartitioned tables.
|
499
|
+
with the same partition values and scheme, then it will have its previous
|
500
|
+
partition ID set to the ID of the partition being replaced. Partition values
|
501
|
+
should not be specified for unpartitioned tables.
|
437
502
|
|
438
|
-
The partition_values must
|
503
|
+
The partition_values must represent the results of transforms in a partition
|
439
504
|
spec specified in the stream.
|
440
505
|
"""
|
441
506
|
raise NotImplementedError("stage_partition not implemented")
|
@@ -445,13 +510,18 @@ def commit_partition(
|
|
445
510
|
partition: Partition,
|
446
511
|
previous_partition: Optional[Partition] = None,
|
447
512
|
*args,
|
448
|
-
**kwargs
|
513
|
+
**kwargs,
|
449
514
|
) -> Partition:
|
450
515
|
"""
|
451
|
-
Commits the
|
452
|
-
replacing any previous partition
|
516
|
+
Commits the staged partition to its associated table version stream,
|
517
|
+
replacing any previous partition registered for the same stream and
|
453
518
|
partition values.
|
454
|
-
|
519
|
+
|
520
|
+
If previous partition is given then it will be replaced with its deltas
|
521
|
+
prepended to the new partition being committed. Otherwise the latest
|
522
|
+
committed partition with the same keys and partition scheme ID will be
|
523
|
+
retrieved.
|
524
|
+
|
455
525
|
Returns the registered partition. If the partition's
|
456
526
|
previous delta stream position is specified, then the commit will
|
457
527
|
be rejected if it does not match the actual previous stream position of
|
@@ -463,33 +533,48 @@ def commit_partition(
|
|
463
533
|
|
464
534
|
|
465
535
|
def delete_partition(
|
466
|
-
|
467
|
-
table_name: str,
|
468
|
-
table_version: Optional[str] = None,
|
536
|
+
stream_locator: StreamLocator,
|
469
537
|
partition_values: Optional[PartitionValues] = None,
|
538
|
+
partition_scheme_id: Optional[str] = None,
|
470
539
|
*args,
|
471
|
-
**kwargs
|
540
|
+
**kwargs,
|
472
541
|
) -> None:
|
473
542
|
"""
|
474
|
-
Deletes the given partition from the specified
|
475
|
-
the latest active table version if no table version is given. Partition
|
543
|
+
Deletes the given partition from the specified stream. Partition
|
476
544
|
values should not be specified for unpartitioned tables. Raises an error
|
477
|
-
if the
|
545
|
+
if the partition does not exist.
|
478
546
|
"""
|
479
547
|
raise NotImplementedError("delete_partition not implemented")
|
480
548
|
|
481
549
|
|
550
|
+
def get_partition_by_id(
|
551
|
+
stream_locator: StreamLocator,
|
552
|
+
partition_id: str,
|
553
|
+
*args,
|
554
|
+
**kwargs,
|
555
|
+
) -> Optional[Partition]:
|
556
|
+
"""
|
557
|
+
Gets the partition for the given stream locator and partition ID.
|
558
|
+
Returns None if the partition does not exist. Raises an error if the
|
559
|
+
given stream locator does not exist.
|
560
|
+
"""
|
561
|
+
raise NotImplementedError("get_partition_by_id not implemented")
|
562
|
+
|
563
|
+
|
482
564
|
def get_partition(
|
483
565
|
stream_locator: StreamLocator,
|
484
566
|
partition_values: Optional[PartitionValues] = None,
|
567
|
+
partition_scheme_id: Optional[str] = None,
|
485
568
|
*args,
|
486
|
-
**kwargs
|
569
|
+
**kwargs,
|
487
570
|
) -> Optional[Partition]:
|
488
571
|
"""
|
489
572
|
Gets the most recently committed partition for the given stream locator and
|
490
573
|
partition key values. Returns None if no partition has been committed for
|
491
574
|
the given table version and/or partition key values. Partition values
|
492
|
-
should not be specified for unpartitioned tables.
|
575
|
+
should not be specified for unpartitioned tables. Partition scheme ID
|
576
|
+
resolves to the table version's current partition scheme by default.
|
577
|
+
Raises an error if the given stream locator does not exist.
|
493
578
|
"""
|
494
579
|
raise NotImplementedError("get_partition not implemented")
|
495
580
|
|
@@ -500,14 +585,12 @@ def stage_delta(
|
|
500
585
|
delta_type: DeltaType = DeltaType.UPSERT,
|
501
586
|
max_records_per_entry: Optional[int] = None,
|
502
587
|
author: Optional[ManifestAuthor] = None,
|
503
|
-
properties: Optional[
|
588
|
+
properties: Optional[DeltaProperties] = None,
|
504
589
|
s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
505
590
|
content_type: ContentType = ContentType.PARQUET,
|
506
|
-
|
507
|
-
partition_spec: Optional[DeltaPartitionSpec] = None,
|
508
|
-
partition_values: Optional[PartitionValues] = None,
|
591
|
+
entry_params: Optional[EntryParams] = None,
|
509
592
|
*args,
|
510
|
-
**kwargs
|
593
|
+
**kwargs,
|
511
594
|
) -> Delta:
|
512
595
|
"""
|
513
596
|
Writes the given table to 1 or more S3 files. Returns an unregistered
|
@@ -601,7 +684,7 @@ def get_table_version_column_names(
|
|
601
684
|
table_name: str,
|
602
685
|
table_version: Optional[str] = None,
|
603
686
|
*args,
|
604
|
-
**kwargs
|
687
|
+
**kwargs,
|
605
688
|
) -> Optional[List[str]]:
|
606
689
|
"""
|
607
690
|
Gets a list of column names for the specified table version, or for the
|
@@ -619,8 +702,8 @@ def get_table_version_schema(
|
|
619
702
|
table_name: str,
|
620
703
|
table_version: Optional[str] = None,
|
621
704
|
*args,
|
622
|
-
**kwargs
|
623
|
-
) -> Optional[
|
705
|
+
**kwargs,
|
706
|
+
) -> Optional[Schema]:
|
624
707
|
"""
|
625
708
|
Gets the schema for the specified table version, or for the latest active
|
626
709
|
table version if none is specified. Returns None if the table version is
|
File without changes
|