deltacat 1.1.9__py3-none-any.whl → 1.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/redshift/model/manifest.py +16 -0
- deltacat/aws/s3u.py +19 -13
- deltacat/compute/compactor/compaction_session.py +5 -1
- deltacat/compute/compactor/repartition_session.py +1 -0
- deltacat/compute/compactor/utils/round_completion_file.py +39 -9
- deltacat/compute/compactor_v2/compaction_session.py +15 -11
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
- deltacat/io/dataset.py +5 -17
- deltacat/storage/__init__.py +24 -0
- deltacat/storage/interface.py +42 -6
- deltacat/storage/model/delta.py +23 -3
- deltacat/storage/model/partition.py +6 -7
- deltacat/storage/model/partition_spec.py +71 -0
- deltacat/storage/model/stream.py +38 -1
- deltacat/storage/model/transform.py +127 -0
- deltacat/tests/aws/test_s3u.py +2 -0
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +201 -36
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
- deltacat/tests/compute/test_util_common.py +19 -4
- deltacat/tests/local_deltacat_storage/__init__.py +83 -19
- deltacat/tests/test_utils/pyarrow.py +4 -1
- deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
- deltacat/utils/numpy.py +3 -3
- deltacat/utils/pandas.py +3 -3
- deltacat/utils/pyarrow.py +3 -3
- deltacat/utils/ray_utils/dataset.py +7 -7
- {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/METADATA +5 -4
- {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/RECORD +34 -31
- deltacat/io/aws/redshift/redshift_datasource.py +0 -578
- {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
- {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
- {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
deltacat/aws/redshift/model/manifest.py
CHANGED
@@ -99,6 +99,8 @@ class Manifest(dict):
         total_source_content_length = 0
         content_type = None
         content_encoding = None
+        partition_values_set = set()
+        partition_values = None
         if entries:
             content_type = entries[0].meta.content_type
             content_encoding = entries[0].meta.content_encoding
@@ -127,6 +129,12 @@ class Manifest(dict):
                 total_record_count += meta.record_count or 0
                 total_content_length += meta.content_length or 0
                 total_source_content_length += meta.source_content_length or 0
+                if len(partition_values_set) <= 1:
+                    partition_values_set.add(entry.meta.partition_values)
+
+            if len(partition_values_set) == 1:
+                partition_values = partition_values_set.pop()
+
         meta = ManifestMeta.of(
             total_record_count,
             total_content_length,
@@ -134,6 +142,7 @@ class Manifest(dict):
             content_encoding,
             total_source_content_length,
             entry_type=entry_type,
+            partition_values=partition_values,
         )
         manifest = Manifest._build_manifest(meta, entries, author, uuid, entry_type)
         return manifest
@@ -185,6 +194,7 @@ class ManifestMeta(dict):
         credentials: Optional[Dict[str, str]] = None,
         content_type_parameters: Optional[List[Dict[str, str]]] = None,
         entry_type: Optional[EntryType] = None,
+        partition_values: Optional[List[str]] = None,
     ) -> ManifestMeta:
         manifest_meta = ManifestMeta()
         if record_count is not None:
@@ -203,6 +213,8 @@ class ManifestMeta(dict):
             manifest_meta["credentials"] = credentials
         if entry_type is not None:
             manifest_meta["entry_type"] = entry_type.value
+        if partition_values is not None:
+            manifest_meta["partition_values"] = partition_values
         return manifest_meta
 
     @property
@@ -244,6 +256,10 @@ class ManifestMeta(dict):
             return EntryType(self["entry_type"])
         return val
 
+    @property
+    def partition_values(self) -> Optional[List[str]]:
+        return self.get("partition_values")
+
 
 class ManifestAuthor(dict):
     @staticmethod
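The merge logic above only records partition_values on the aggregated ManifestMeta when every entry agrees on them. A standalone sketch of that rule (plain Python, with entry partition values modeled as hashable tuples rather than the real manifest objects):

```python
from typing import List, Optional, Tuple

PartitionValues = Optional[Tuple[str, ...]]


def aggregate_partition_values(entry_values: List[PartitionValues]) -> PartitionValues:
    """Return the unanimous partition values across entries, or None if they differ."""
    partition_values_set = set()
    for values in entry_values:
        # Mirrors the `<= 1` guard: stop collecting once divergence is detected.
        if len(partition_values_set) <= 1:
            partition_values_set.add(values)
    return partition_values_set.pop() if len(partition_values_set) == 1 else None


# All entries share the same values -> propagated to the merged manifest meta.
assert aggregate_partition_values([("2024-01-01",), ("2024-01-01",)]) == ("2024-01-01",)
# Mixed values -> the merged meta leaves partition_values unset (None).
assert aggregate_partition_values([("2024-01-01",), ("2024-01-02",)]) is None
```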
deltacat/aws/s3u.py
CHANGED
@@ -21,7 +21,7 @@ from boto3.resources.base import ServiceResource
 from botocore.client import BaseClient
 from botocore.exceptions import ClientError
 from ray.data.block import Block, BlockAccessor, BlockMetadata
-from ray.data.datasource import BlockWritePathProvider
+from ray.data.datasource import FilenameProvider
 from ray.types import ObjectRef
 from tenacity import (
     Retrying,
@@ -70,9 +70,6 @@ from deltacat.exceptions import categorize_errors
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-# TODO(raghumdani): refactor redshift datasource to reuse the
-# same module for writing output files.
-
 
 class CapturedBlockWritePaths:
     def __init__(self):
@@ -100,12 +97,15 @@ class CapturedBlockWritePaths:
         return self._block_refs
 
 
-class UuidBlockWritePathProvider(BlockWritePathProvider):
+class UuidBlockWritePathProvider(FilenameProvider):
     """Block write path provider implementation that writes each
     dataset block out to a file of the form: {base_path}/{uuid}
     """
 
-    def __init__(self, capture_object: CapturedBlockWritePaths):
+    def __init__(
+        self, capture_object: CapturedBlockWritePaths, base_path: Optional[str] = None
+    ):
+        self.base_path = base_path
         self.write_paths: List[str] = []
         self.block_refs: List[ObjectRef[Block]] = []
         self.capture_object = capture_object
@@ -117,6 +117,19 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
             self.block_refs,
         )
 
+    def get_filename_for_block(
+        self, block: Any, task_index: int, block_index: int
+    ) -> str:
+        if self.base_path is None:
+            raise ValueError(
+                "Base path must be provided to UuidBlockWritePathProvider",
+            )
+        return self._get_write_path_for_block(
+            base_path=self.base_path,
+            block=block,
+            block_index=block_index,
+        )
+
     def _get_write_path_for_block(
         self,
         base_path: str,
@@ -143,13 +156,6 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
         block_index: Optional[int] = None,
         file_format: Optional[str] = None,
     ) -> str:
-        """
-        TODO: BlockWritePathProvider is deprecated as of Ray version 2.20.0. Please use FilenameProvider.
-        See: https://docs.ray.io/en/master/data/api/doc/ray.data.datasource.FilenameProvider.html
-        Also See: https://github.com/ray-project/deltacat/issues/299
-
-        Hence, this class only works with Ray version 2.20.0 or lower when used in Ray Dataset.
-        """
        return self._get_write_path_for_block(
            base_path,
            filesystem=filesystem,
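UuidBlockWritePathProvider now implements Ray's FilenameProvider interface, so it needs an explicit base_path when Ray Data asks it for a filename. A small usage sketch (bucket path is illustrative; assumes deltacat and a recent Ray are installed):

```python
from deltacat.aws.s3u import CapturedBlockWritePaths, UuidBlockWritePathProvider

capture = CapturedBlockWritePaths()

# New constructor shape: base_path is accepted and required for filename generation.
provider = UuidBlockWritePathProvider(capture, base_path="s3://my-bucket/compacted")

# Without base_path, filename generation fails fast rather than writing to an
# undefined location (the block argument is not consulted before the check).
try:
    UuidBlockWritePathProvider(capture).get_filename_for_block(
        None, task_index=0, block_index=0
    )
except ValueError as err:
    print(err)
```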
deltacat/compute/compactor/compaction_session.py
CHANGED
@@ -193,6 +193,7 @@ def compact_partition(
         round_completion_file_s3_url = rcf.write_round_completion_file(
             compaction_artifact_s3_bucket,
             new_rcf_partition_locator,
+            partition.locator,
             new_rci,
             **s3_client_kwargs,
         )
@@ -312,7 +313,10 @@ def _execute_compaction_round(
     round_completion_info = None
     if not rebase_source_partition_locator:
         round_completion_info = rcf.read_round_completion_file(
-            compaction_artifact_s3_bucket, source_partition_locator, **s3_client_kwargs
+            compaction_artifact_s3_bucket,
+            source_partition_locator,
+            destination_partition_locator,
+            **s3_client_kwargs,
         )
     if not round_completion_info:
         logger.info(
deltacat/compute/compactor/utils/round_completion_file.py
CHANGED
@@ -12,10 +12,17 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def get_round_completion_file_s3_url(
-    bucket: str, source_partition_locator: PartitionLocator
+    bucket: str,
+    source_partition_locator: PartitionLocator,
+    destination_partition_locator: Optional[PartitionLocator] = None,
 ) -> str:
 
     base_url = source_partition_locator.path(f"s3://{bucket}")
+    if destination_partition_locator:
+        base_url = destination_partition_locator.path(
+            f"s3://{bucket}/{source_partition_locator.hexdigest()}"
+        )
+
     return f"{base_url}.json"
 
 
@@ -23,20 +30,41 @@ def get_round_completion_file_s3_url(
 def read_round_completion_file(
     bucket: str,
     source_partition_locator: PartitionLocator,
+    destination_partition_locator: Optional[PartitionLocator] = None,
     **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> RoundCompletionInfo:
 
-    round_completion_file_url = get_round_completion_file_s3_url(
+    all_uris = []
+    if destination_partition_locator:
+        round_completion_file_url_with_destination = get_round_completion_file_s3_url(
+            bucket,
+            source_partition_locator,
+            destination_partition_locator,
+        )
+        all_uris.append(round_completion_file_url_with_destination)
+
+    # Note: we read from RCF at two different URI for backward
+    # compatibility reasons.
+    round_completion_file_url_prev = get_round_completion_file_s3_url(
         bucket,
         source_partition_locator,
     )
-    logger.info(f"reading round completion file from: {round_completion_file_url}")
+
+    all_uris.append(round_completion_file_url_prev)
+
     round_completion_info = None
-    result = s3_utils.download(round_completion_file_url, False, **s3_client_kwargs)
-    if result:
-        json_str = result["Body"].read().decode("utf-8")
-        round_completion_info = RoundCompletionInfo(json.loads(json_str))
-        logger.info(f"read round completion info: {round_completion_info}")
+
+    for rcf_uri in all_uris:
+        logger.info(f"Reading round completion file from: {rcf_uri}")
+        result = s3_utils.download(rcf_uri, False, **s3_client_kwargs)
+        if result:
+            json_str = result["Body"].read().decode("utf-8")
+            round_completion_info = RoundCompletionInfo(json.loads(json_str))
+            logger.info(f"Read round completion info: {round_completion_info}")
+            break
+        else:
+            logger.warn(f"Round completion file not present at {rcf_uri}")
+
     return round_completion_info
 
 
@@ -44,8 +72,9 @@ def read_round_completion_file(
 def write_round_completion_file(
     bucket: Optional[str],
     source_partition_locator: Optional[PartitionLocator],
+    destination_partition_locator: Optional[PartitionLocator],
     round_completion_info: RoundCompletionInfo,
-    completion_file_s3_url: str = None,
+    completion_file_s3_url: Optional[str] = None,
     **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> str:
     if bucket is None and completion_file_s3_url is None:
@@ -56,6 +85,7 @@ def write_round_completion_file(
         completion_file_s3_url = get_round_completion_file_s3_url(
             bucket,
             source_partition_locator,
+            destination_partition_locator,
         )
     logger.info(f"writing round completion file to: {completion_file_s3_url}")
     s3_utils.upload(
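The URL helper now produces a destination-scoped path when a destination locator is given, and readers fall back to the legacy source-only path for files written by older releases. A standalone sketch of the layout (FakeLocator is a stand-in that models only the path() and hexdigest() calls used above; names and digests are illustrative):

```python
from typing import Optional


class FakeLocator:
    """Stand-in for PartitionLocator exposing only path() and hexdigest()."""

    def __init__(self, name: str, digest: str):
        self._name, self._digest = name, digest

    def path(self, prefix: str) -> str:
        return f"{prefix}/{self._name}"

    def hexdigest(self) -> str:
        return self._digest


def rcf_url(bucket: str, source: FakeLocator, destination: Optional[FakeLocator] = None) -> str:
    # Mirrors get_round_completion_file_s3_url(): destination-scoped when a
    # destination locator is provided, otherwise the legacy source-only URL.
    base_url = source.path(f"s3://{bucket}")
    if destination:
        base_url = destination.path(f"s3://{bucket}/{source.hexdigest()}")
    return f"{base_url}.json"


src = FakeLocator("source-partition", "abc123")
dst = FakeLocator("destination-partition", "def456")
print(rcf_url("artifacts", src))       # legacy: s3://artifacts/source-partition.json
print(rcf_url("artifacts", src, dst))  # new: s3://artifacts/abc123/destination-partition.json
```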
deltacat/compute/compactor_v2/compaction_session.py
CHANGED
@@ -24,7 +24,7 @@ from deltacat.compute.compactor import (
 )
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
 from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
-from deltacat.compute.compactor_v2.model.compaction_session import (
+from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
     ExecutionCompactionResult,
 )
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
@@ -78,6 +78,7 @@ from deltacat.compute.compactor_v2.utils.task_options import (
 )
 from deltacat.compute.compactor.model.compactor_version import CompactorVersion
 from deltacat.exceptions import categorize_errors
+from deltacat.compute.compactor_v2.constants import COMPACT_PARTITION_METRIC_PREFIX
 
 if importlib.util.find_spec("memray"):
     import memray
@@ -86,7 +87,7 @@ if importlib.util.find_spec("memray"):
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-@metrics
+@metrics(prefix=COMPACT_PARTITION_METRIC_PREFIX)
 @categorize_errors
 def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]:
     assert (
@@ -109,7 +110,6 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
         f"Partition-{params.source_partition_locator} -> "
         f"{compaction_session_type} Compaction session data processing completed"
     )
-    round_completion_file_s3_url: Optional[str] = None
     if execute_compaction_result.new_compacted_partition:
         previous_partition: Optional[Partition] = None
         if execute_compaction_result.is_inplace_compacted:
@@ -131,19 +131,13 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
             **params.deltacat_storage_kwargs,
         )
         logger.info(f"Committed compacted partition: {committed_partition}")
-        round_completion_file_s3_url = rcf.write_round_completion_file(
-            params.compaction_artifact_s3_bucket,
-            execute_compaction_result.new_round_completion_file_partition_locator,
-            execute_compaction_result.new_round_completion_info,
-            **params.s3_client_kwargs,
-        )
     else:
         logger.warning("No new partition was committed during compaction.")
 
     logger.info(
         f"Completed compaction session for: {params.source_partition_locator}"
     )
-    return round_completion_file_s3_url
+    return execute_compaction_result.round_completion_file_s3_url
 
 
 def _execute_compaction(
@@ -188,6 +182,7 @@ def _execute_compaction(
         round_completion_info = rcf.read_round_completion_file(
             params.compaction_artifact_s3_bucket,
             params.source_partition_locator,
+            params.destination_partition_locator,
             **params.s3_client_kwargs,
         )
     if not round_completion_info:
@@ -684,9 +679,18 @@ def _execute_compaction(
             f"and rcf source partition_id of {rcf_source_partition_locator.partition_id}."
         )
         rcf_source_partition_locator = compacted_partition.locator
+
+    round_completion_file_s3_url = rcf.write_round_completion_file(
+        params.compaction_artifact_s3_bucket,
+        rcf_source_partition_locator,
+        compacted_partition.locator,
+        new_round_completion_info,
+        **params.s3_client_kwargs,
+    )
+
     return ExecutionCompactionResult(
         compacted_partition,
         new_round_completion_info,
-        rcf_source_partition_locator,
+        round_completion_file_s3_url,
         is_inplace_compacted,
     )
deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py}
RENAMED
@@ -2,7 +2,6 @@ from dataclasses import dataclass, fields
 
 from deltacat.storage import (
     Partition,
-    PartitionLocator,
 )
 from deltacat.compute.compactor import (
     RoundCompletionInfo,
@@ -14,7 +13,7 @@ from typing import Optional
 class ExecutionCompactionResult:
     new_compacted_partition: Optional[Partition]
     new_round_completion_info: Optional[RoundCompletionInfo]
-    new_round_completion_file_partition_locator: Optional[PartitionLocator]
+    round_completion_file_s3_url: Optional[str]
     is_inplace_compacted: bool
 
     def __iter__(self):
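With this change, compact_partition() no longer writes the round completion file itself; _execute_compaction() writes it and surfaces the URL through the renamed result model. A minimal sketch of the reshaped record (a simplified stand-in rather than the real class, with illustrative values):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class ExecutionCompactionResultSketch:
    new_compacted_partition: Optional[object]
    new_round_completion_info: Optional[object]
    # Replaces the old new_round_completion_file_partition_locator field.
    round_completion_file_s3_url: Optional[str]
    is_inplace_compacted: bool


result = ExecutionCompactionResultSketch(
    new_compacted_partition=None,
    new_round_completion_info=None,
    round_completion_file_s3_url="s3://my-artifact-bucket/abc123/destination-partition.json",
    is_inplace_compacted=False,
)
# compact_partition() now simply returns this URL to its caller.
print(result.round_completion_file_s3_url)
```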
deltacat/io/dataset.py
CHANGED
@@ -6,9 +6,6 @@ from typing import Any, Callable, Dict, Optional, TypeVar, Union, cast
 import pyarrow as pa
 import s3fs
 from ray.data import Dataset
-from ray.data.datasource import BlockWritePathProvider, DefaultBlockWritePathProvider
-
-from deltacat.io.aws.redshift.redshift_datasource import RedshiftDatasource
 
 T = TypeVar("T")
 
@@ -27,7 +24,6 @@ class DeltacatDataset(Dataset[T]):
         filesystem: Optional[Union[pa.fs.FileSystem, s3fs.S3FileSystem]] = None,
         try_create_dir: bool = True,
         arrow_open_stream_args: Optional[Dict[str, Any]] = None,
-        block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
         arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
         **arrow_parquet_args,
     ) -> None:
@@ -59,9 +55,8 @@ class DeltacatDataset(Dataset[T]):
             if True. Does nothing if all directories already exist.
         arrow_open_stream_args: kwargs passed to
             pyarrow.fs.FileSystem.open_output_stream
-        block_path_provider: BlockWritePathProvider implementation
-            to write each dataset block to a custom output path.
-            DefaultBlockWritePathProvider if None.
+        filename_provider: FilenameProvider implementation
+            to write each dataset block to a custom output path.
         arrow_parquet_args_fn: Callable that returns a dictionary of write
             arguments to use when writing each block to a file. Overrides
             any duplicate keys from arrow_parquet_args. This should be used
@@ -72,14 +67,7 @@ class DeltacatDataset(Dataset[T]):
             pyarrow.parquet.write_table(), which is used to write out each
             block to a file.
         """
-
-
-
-            dataset_uuid=self._uuid,
-            filesystem=filesystem,
-            try_create_dir=try_create_dir,
-            open_stream_args=arrow_open_stream_args,
-            block_path_provider=block_path_provider,
-            write_args_fn=arrow_parquet_args_fn,
-            **arrow_parquet_args,
+        raise NotImplementedError(
+            "Writing to Redshift is not yet supported. "
+            "Please use DeltacatDataset.write_parquet() instead."
         )
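Since DeltacatDataset.write_redshift() now raises NotImplementedError, existing callers need to switch to the standard Parquet writer that DeltacatDataset inherits from ray.data.Dataset. An illustrative migration sketch (output path is a placeholder):

```python
import ray

# Any Ray dataset works here; DeltacatDataset inherits write_parquet() from ray.data.Dataset.
ds = ray.data.range(100)

# Previously: deltacat_dataset.write_redshift("s3://bucket/prefix", ...)
# Now: write Parquet files directly; Redshift can ingest them via COPY.
ds.write_parquet("/tmp/deltacat_write_parquet_example")
```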
deltacat/storage/__init__.py
CHANGED
@@ -14,6 +14,20 @@ from deltacat.storage.model.stream import Stream, StreamLocator
 from deltacat.storage.model.table import Table, TableLocator
 from deltacat.storage.model.table_version import TableVersion, TableVersionLocator
 from deltacat.storage.model.delete_parameters import DeleteParameters
+from deltacat.storage.model.partition_spec import (
+    PartitionFilter,
+    PartitionValues,
+    DeltaPartitionSpec,
+    StreamPartitionSpec,
+)
+from deltacat.storage.model.transform import (
+    Transform,
+    TransformName,
+    TransformParameters,
+    BucketingStrategy,
+    BucketTransformParameters,
+    IdentityTransformParameters,
+)
 
 from deltacat.storage.model.types import (
     CommitState,
@@ -56,4 +70,14 @@ __all__ = [
     "TableVersionLocator",
     "SortKey",
     "SortOrder",
+    "PartitionFilter",
+    "PartitionValues",
+    "DeltaPartitionSpec",
+    "StreamPartitionSpec",
+    "Transform",
+    "TransformName",
+    "TransformParameters",
+    "BucketingStrategy",
+    "BucketTransformParameters",
+    "IdentityTransformParameters",
 ]
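The new partition-spec and transform primitives are re-exported from the top-level storage package, so downstream code can import them without reaching into the model modules. A minimal import smoke test (imports only, since the constructor signatures are not shown in this diff):

```python
from deltacat.storage import (
    BucketingStrategy,
    BucketTransformParameters,
    DeltaPartitionSpec,
    IdentityTransformParameters,
    PartitionFilter,
    PartitionValues,
    StreamPartitionSpec,
    Transform,
    TransformName,
    TransformParameters,
)

# All ten names resolve from deltacat.storage after this release.
print(StreamPartitionSpec, DeltaPartitionSpec, TransformName)
```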
deltacat/storage/interface.py
CHANGED
@@ -23,6 +23,10 @@ from deltacat.storage import (
     TableVersion,
     SortKey,
     PartitionLocator,
+    PartitionFilter,
+    PartitionValues,
+    DeltaPartitionSpec,
+    StreamPartitionSpec,
 )
 from deltacat.types.media import (
     ContentType,
@@ -86,12 +90,13 @@ def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partit
 def list_deltas(
     namespace: str,
     table_name: str,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     first_stream_position: Optional[int] = None,
     last_stream_position: Optional[int] = None,
     ascending_order: Optional[bool] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs
 ) -> ListResult[Delta]:
@@ -107,6 +112,9 @@ def list_deltas(
     To conserve memory, the deltas returned do not include manifests by
     default. The manifests can either be optionally retrieved as part of this
     call or lazily loaded via subsequent calls to `get_delta_manifest`.
+
+    Note: partition_values is deprecated and will be removed in future releases.
+    Use partition_filter instead.
     """
     raise NotImplementedError("list_deltas not implemented")
 
@@ -134,9 +142,10 @@ def get_delta(
     namespace: str,
     table_name: str,
     stream_position: int,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs
 ) -> Optional[Delta]:
@@ -149,6 +158,9 @@ def get_delta(
     To conserve memory, the delta returned does not include a manifest by
     default. The manifest can either be optionally retrieved as part of this
     call or lazily loaded via a subsequent call to `get_delta_manifest`.
+
+    Note: partition_values is deprecated and will be removed in future releases.
+    Use partition_filter instead.
     """
     raise NotImplementedError("get_delta not implemented")
 
@@ -156,9 +168,10 @@ def get_delta(
 def get_latest_delta(
     namespace: str,
     table_name: str,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs
 ) -> Optional[Delta]:
@@ -172,6 +185,9 @@ def get_latest_delta(
     To conserve memory, the delta returned does not include a manifest by
     default. The manifest can either be optionally retrieved as part of this
     call or lazily loaded via a subsequent call to `get_delta_manifest`.
+
+    Note: partition_values is deprecated and will be removed in future releases.
+    Use partition_filter instead.
     """
     raise NotImplementedError("get_latest_delta not implemented")
 
@@ -185,6 +201,7 @@ def download_delta(
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
     distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs
 ) -> Union[LocalDataset, DistributedDataset]: # type: ignore
@@ -194,6 +211,10 @@ def download_delta(
     across this Ray cluster's object store memory. Ordered table N of a local
     table list, or ordered block N of a distributed dataset, always contain
     the contents of ordered delta manifest entry N.
+
+    partition_filter is an optional parameter which determines which files to
+    download from the delta manifest. A delta manifest contains all the data files
+    for a given delta.
     """
     raise NotImplementedError("download_delta not implemented")
 
@@ -268,6 +289,7 @@ def create_table_version(
     table_description: Optional[str] = None,
     table_properties: Optional[Dict[str, str]] = None,
     supported_content_types: Optional[List[ContentType]] = None,
+    partition_spec: Optional[StreamPartitionSpec] = None,
     *args,
     **kwargs
 ) -> Stream:
@@ -300,6 +322,8 @@ def create_table_version(
 
     Validate: Raise an error for any fields that don't fit the schema. An
     explicit subset of column names to validate may optionally be specified.
+
+    Either partition_keys or partition_spec must be specified but not both.
     """
     raise NotImplementedError("create_table_version not implemented")
 
@@ -402,7 +426,7 @@ def get_stream(
 
 
 def stage_partition(
-    stream: Stream, partition_values: Optional[List[Any]] = None, *args, **kwargs
+    stream: Stream, partition_values: Optional[PartitionValues] = None, *args, **kwargs
 ) -> Partition:
     """
     Stages a new partition for the given stream and partition values. Returns
@@ -410,6 +434,9 @@ def stage_partition(
     with the same partition values, then it will have its previous partition ID
     set to the ID of the partition being replaced. Partition keys should not be
     specified for unpartitioned tables.
+
+    The partition_values must represents the results of transforms in a partition
+    spec specified in the stream.
     """
     raise NotImplementedError("stage_partition not implemented")
 
@@ -439,7 +466,7 @@ def delete_partition(
     namespace: str,
     table_name: str,
     table_version: Optional[str] = None,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs
 ) -> None:
@@ -454,7 +481,7 @@ def delete_partition(
 
 def get_partition(
     stream_locator: StreamLocator,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs
 ) -> Optional[Partition]:
@@ -477,6 +504,8 @@ def stage_delta(
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     content_type: ContentType = ContentType.PARQUET,
     delete_parameters: Optional[DeleteParameters] = None,
+    partition_spec: Optional[DeltaPartitionSpec] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs
 ) -> Delta:
@@ -484,6 +513,13 @@ def stage_delta(
     Writes the given table to 1 or more S3 files. Returns an unregistered
     delta whose manifest entries point to the uploaded files. Applies any
     schema consistency policies configured for the parent table version.
+
+    The partition spec will be used to split the input table into
+    multiple files. Optionally, partition_values can be provided to avoid
+    this method to recompute partition_values from the provided data.
+
+    Raises an error if the provided data does not conform to a unique ordered
+    list of partition_values
     """
     raise NotImplementedError("stage_delta not implemented")
 
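download_delta's new partition_filter narrows which manifest entries are fetched; combined with the per-entry partition_values recorded in manifest metadata (see the manifest.py change above), an implementation can skip files outside the requested partitions. A standalone sketch of that selection step (simplified stand-ins, not the real PartitionFilter or manifest classes):

```python
from typing import List, Optional, Sequence, Tuple

PartitionValues = Tuple[str, ...]


def select_entry_indices(
    entry_partition_values: Sequence[Optional[PartitionValues]],
    allowed_partition_values: Optional[List[PartitionValues]] = None,
) -> List[int]:
    """Return indices of manifest entries whose partition values pass the filter."""
    if allowed_partition_values is None:
        # No filter: download every entry in the delta manifest.
        return list(range(len(entry_partition_values)))
    allowed = set(allowed_partition_values)
    return [i for i, values in enumerate(entry_partition_values) if values in allowed]


entries = [("2024-01-01",), ("2024-01-02",), ("2024-01-01",)]
print(select_entry_indices(entries, [("2024-01-01",)]))  # -> [0, 2]
```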