deltacat 1.1.8__py3-none-any.whl → 1.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +6 -0
- deltacat/aws/redshift/model/manifest.py +16 -0
- deltacat/aws/s3u.py +65 -38
- deltacat/compute/compactor/compaction_session.py +5 -1
- deltacat/compute/compactor/model/compact_partition_params.py +12 -1
- deltacat/compute/compactor/model/materialize_result.py +0 -4
- deltacat/compute/compactor/repartition_session.py +1 -0
- deltacat/compute/compactor/utils/round_completion_file.py +39 -9
- deltacat/compute/compactor_v2/compaction_session.py +26 -16
- deltacat/compute/compactor_v2/constants.py +5 -11
- deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
- deltacat/compute/compactor_v2/model/merge_input.py +6 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
- deltacat/compute/compactor_v2/steps/merge.py +12 -12
- deltacat/compute/compactor_v2/utils/merge.py +1 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +2 -12
- deltacat/exceptions.py +342 -7
- deltacat/io/dataset.py +5 -17
- deltacat/io/memcached_object_store.py +7 -4
- deltacat/storage/__init__.py +24 -0
- deltacat/storage/interface.py +56 -6
- deltacat/storage/model/delta.py +23 -3
- deltacat/storage/model/partition.py +6 -7
- deltacat/storage/model/partition_spec.py +71 -0
- deltacat/storage/model/stream.py +38 -1
- deltacat/storage/model/transform.py +127 -0
- deltacat/tests/aws/test_s3u.py +2 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
- deltacat/tests/compute/compact_partition_test_cases.py +4 -2
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +204 -37
- deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
- deltacat/tests/compute/test_util_common.py +19 -4
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -2
- deltacat/tests/local_deltacat_storage/__init__.py +124 -29
- deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
- deltacat/tests/test_exceptions.py +100 -0
- deltacat/tests/test_logs.py +1 -0
- deltacat/tests/test_utils/pyarrow.py +4 -1
- deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
- deltacat/tests/utils/test_daft.py +0 -1
- deltacat/tests/utils/test_resources.py +0 -28
- deltacat/utils/daft.py +3 -0
- deltacat/utils/numpy.py +3 -3
- deltacat/utils/pandas.py +3 -3
- deltacat/utils/pyarrow.py +11 -8
- deltacat/utils/ray_utils/dataset.py +7 -7
- deltacat/utils/ray_utils/runtime.py +2 -2
- deltacat/utils/resources.py +0 -45
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/METADATA +6 -5
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/RECORD +58 -51
- deltacat/io/aws/redshift/redshift_datasource.py +0 -578
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
deltacat/storage/interface.py
CHANGED
@@ -23,6 +23,10 @@ from deltacat.storage import (
|
|
23
23
|
TableVersion,
|
24
24
|
SortKey,
|
25
25
|
PartitionLocator,
|
26
|
+
PartitionFilter,
|
27
|
+
PartitionValues,
|
28
|
+
DeltaPartitionSpec,
|
29
|
+
StreamPartitionSpec,
|
26
30
|
)
|
27
31
|
from deltacat.types.media import (
|
28
32
|
ContentType,
|
@@ -86,12 +90,13 @@ def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partit
|
|
86
90
|
def list_deltas(
|
87
91
|
namespace: str,
|
88
92
|
table_name: str,
|
89
|
-
partition_values: Optional[
|
93
|
+
partition_values: Optional[PartitionValues] = None,
|
90
94
|
table_version: Optional[str] = None,
|
91
95
|
first_stream_position: Optional[int] = None,
|
92
96
|
last_stream_position: Optional[int] = None,
|
93
97
|
ascending_order: Optional[bool] = None,
|
94
98
|
include_manifest: bool = False,
|
99
|
+
partition_filter: Optional[PartitionFilter] = None,
|
95
100
|
*args,
|
96
101
|
**kwargs
|
97
102
|
) -> ListResult[Delta]:
|
@@ -107,6 +112,9 @@ def list_deltas(
|
|
107
112
|
To conserve memory, the deltas returned do not include manifests by
|
108
113
|
default. The manifests can either be optionally retrieved as part of this
|
109
114
|
call or lazily loaded via subsequent calls to `get_delta_manifest`.
|
115
|
+
|
116
|
+
Note: partition_values is deprecated and will be removed in future releases.
|
117
|
+
Use partition_filter instead.
|
110
118
|
"""
|
111
119
|
raise NotImplementedError("list_deltas not implemented")
|
112
120
|
|
@@ -134,9 +142,10 @@ def get_delta(
|
|
134
142
|
namespace: str,
|
135
143
|
table_name: str,
|
136
144
|
stream_position: int,
|
137
|
-
partition_values: Optional[
|
145
|
+
partition_values: Optional[PartitionValues] = None,
|
138
146
|
table_version: Optional[str] = None,
|
139
147
|
include_manifest: bool = False,
|
148
|
+
partition_filter: Optional[PartitionFilter] = None,
|
140
149
|
*args,
|
141
150
|
**kwargs
|
142
151
|
) -> Optional[Delta]:
|
@@ -149,6 +158,9 @@ def get_delta(
|
|
149
158
|
To conserve memory, the delta returned does not include a manifest by
|
150
159
|
default. The manifest can either be optionally retrieved as part of this
|
151
160
|
call or lazily loaded via a subsequent call to `get_delta_manifest`.
|
161
|
+
|
162
|
+
Note: partition_values is deprecated and will be removed in future releases.
|
163
|
+
Use partition_filter instead.
|
152
164
|
"""
|
153
165
|
raise NotImplementedError("get_delta not implemented")
|
154
166
|
|
@@ -156,9 +168,10 @@ def get_delta(
|
|
156
168
|
def get_latest_delta(
|
157
169
|
namespace: str,
|
158
170
|
table_name: str,
|
159
|
-
partition_values: Optional[
|
171
|
+
partition_values: Optional[PartitionValues] = None,
|
160
172
|
table_version: Optional[str] = None,
|
161
173
|
include_manifest: bool = False,
|
174
|
+
partition_filter: Optional[PartitionFilter] = None,
|
162
175
|
*args,
|
163
176
|
**kwargs
|
164
177
|
) -> Optional[Delta]:
|
@@ -172,6 +185,9 @@ def get_latest_delta(
|
|
172
185
|
To conserve memory, the delta returned does not include a manifest by
|
173
186
|
default. The manifest can either be optionally retrieved as part of this
|
174
187
|
call or lazily loaded via a subsequent call to `get_delta_manifest`.
|
188
|
+
|
189
|
+
Note: partition_values is deprecated and will be removed in future releases.
|
190
|
+
Use partition_filter instead.
|
175
191
|
"""
|
176
192
|
raise NotImplementedError("get_latest_delta not implemented")
|
177
193
|
|
@@ -185,6 +201,7 @@ def download_delta(
|
|
185
201
|
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
186
202
|
ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
|
187
203
|
distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
|
204
|
+
partition_filter: Optional[PartitionFilter] = None,
|
188
205
|
*args,
|
189
206
|
**kwargs
|
190
207
|
) -> Union[LocalDataset, DistributedDataset]: # type: ignore
|
@@ -194,6 +211,10 @@ def download_delta(
|
|
194
211
|
across this Ray cluster's object store memory. Ordered table N of a local
|
195
212
|
table list, or ordered block N of a distributed dataset, always contain
|
196
213
|
the contents of ordered delta manifest entry N.
|
214
|
+
|
215
|
+
partition_filter is an optional parameter which determines which files to
|
216
|
+
download from the delta manifest. A delta manifest contains all the data files
|
217
|
+
for a given delta.
|
197
218
|
"""
|
198
219
|
raise NotImplementedError("download_delta not implemented")
|
199
220
|
|
@@ -268,6 +289,7 @@ def create_table_version(
|
|
268
289
|
table_description: Optional[str] = None,
|
269
290
|
table_properties: Optional[Dict[str, str]] = None,
|
270
291
|
supported_content_types: Optional[List[ContentType]] = None,
|
292
|
+
partition_spec: Optional[StreamPartitionSpec] = None,
|
271
293
|
*args,
|
272
294
|
**kwargs
|
273
295
|
) -> Stream:
|
@@ -300,6 +322,8 @@ def create_table_version(
|
|
300
322
|
|
301
323
|
Validate: Raise an error for any fields that don't fit the schema. An
|
302
324
|
explicit subset of column names to validate may optionally be specified.
|
325
|
+
|
326
|
+
Either partition_keys or partition_spec must be specified but not both.
|
303
327
|
"""
|
304
328
|
raise NotImplementedError("create_table_version not implemented")
|
305
329
|
|
@@ -402,7 +426,7 @@ def get_stream(
|
|
402
426
|
|
403
427
|
|
404
428
|
def stage_partition(
|
405
|
-
stream: Stream, partition_values: Optional[
|
429
|
+
stream: Stream, partition_values: Optional[PartitionValues] = None, *args, **kwargs
|
406
430
|
) -> Partition:
|
407
431
|
"""
|
408
432
|
Stages a new partition for the given stream and partition values. Returns
|
@@ -410,6 +434,9 @@ def stage_partition(
|
|
410
434
|
with the same partition values, then it will have its previous partition ID
|
411
435
|
set to the ID of the partition being replaced. Partition keys should not be
|
412
436
|
specified for unpartitioned tables.
|
437
|
+
|
438
|
+
The partition_values must represent the results of transforms in a partition
|
439
|
+
spec specified in the stream.
|
413
440
|
"""
|
414
441
|
raise NotImplementedError("stage_partition not implemented")
|
415
442
|
|
@@ -439,7 +466,7 @@ def delete_partition(
|
|
439
466
|
namespace: str,
|
440
467
|
table_name: str,
|
441
468
|
table_version: Optional[str] = None,
|
442
|
-
partition_values: Optional[
|
469
|
+
partition_values: Optional[PartitionValues] = None,
|
443
470
|
*args,
|
444
471
|
**kwargs
|
445
472
|
) -> None:
|
@@ -454,7 +481,7 @@ def delete_partition(
|
|
454
481
|
|
455
482
|
def get_partition(
|
456
483
|
stream_locator: StreamLocator,
|
457
|
-
partition_values: Optional[
|
484
|
+
partition_values: Optional[PartitionValues] = None,
|
458
485
|
*args,
|
459
486
|
**kwargs
|
460
487
|
) -> Optional[Partition]:
|
@@ -477,6 +504,8 @@ def stage_delta(
|
|
477
504
|
s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
478
505
|
content_type: ContentType = ContentType.PARQUET,
|
479
506
|
delete_parameters: Optional[DeleteParameters] = None,
|
507
|
+
partition_spec: Optional[DeltaPartitionSpec] = None,
|
508
|
+
partition_values: Optional[PartitionValues] = None,
|
480
509
|
*args,
|
481
510
|
**kwargs
|
482
511
|
) -> Delta:
|
@@ -484,6 +513,13 @@ def stage_delta(
|
|
484
513
|
Writes the given table to 1 or more S3 files. Returns an unregistered
|
485
514
|
delta whose manifest entries point to the uploaded files. Applies any
|
486
515
|
schema consistency policies configured for the parent table version.
|
516
|
+
|
517
|
+
The partition spec will be used to split the input table into
|
518
|
+
multiple files. Optionally, partition_values can be provided to avoid
|
519
|
+
having this method recompute partition_values from the provided data.
|
520
|
+
|
521
|
+
Raises an error if the provided data does not conform to a unique ordered
|
522
|
+
list of partition_values
|
487
523
|
"""
|
488
524
|
raise NotImplementedError("stage_delta not implemented")
|
489
525
|
|
@@ -600,3 +636,17 @@ def table_version_exists(
|
|
600
636
|
Returns True if the given table version exists, False if not.
|
601
637
|
"""
|
602
638
|
raise NotImplementedError("table_version_exists not implemented")
|
639
|
+
|
640
|
+
|
641
|
+
def can_categorize(e: BaseException, *args, **kwargs) -> bool:
|
642
|
+
"""
|
643
|
+
Return whether input error is from storage implementation layer.
|
644
|
+
"""
|
645
|
+
raise NotImplementedError
|
646
|
+
|
647
|
+
|
648
|
+
def raise_categorized_error(e: BaseException, *args, **kwargs):
|
649
|
+
"""
|
650
|
+
Raise and handle storage implementation layer specific errors.
|
651
|
+
"""
|
652
|
+
raise NotImplementedError
|
deltacat/storage/model/delta.py
CHANGED
@@ -12,6 +12,7 @@ from deltacat.storage.model.stream import StreamLocator
|
|
12
12
|
from deltacat.storage.model.table import TableLocator
|
13
13
|
from deltacat.storage.model.table_version import TableVersionLocator
|
14
14
|
from deltacat.storage.model.types import DeltaType
|
15
|
+
from deltacat.storage.model.partition_spec import DeltaPartitionSpec, PartitionValues
|
15
16
|
|
16
17
|
|
17
18
|
class Delta(dict):
|
@@ -24,6 +25,7 @@ class Delta(dict):
|
|
24
25
|
manifest: Optional[Manifest],
|
25
26
|
previous_stream_position: Optional[int] = None,
|
26
27
|
delete_parameters: Optional[DeleteParameters] = None,
|
28
|
+
partition_spec: Optional[DeltaPartitionSpec] = None,
|
27
29
|
) -> Delta:
|
28
30
|
"""
|
29
31
|
Creates a Delta metadata model with the given Delta Locator, Delta Type,
|
@@ -38,6 +40,7 @@ class Delta(dict):
|
|
38
40
|
delta.manifest = manifest
|
39
41
|
delta.previous_stream_position = previous_stream_position
|
40
42
|
delta.delete_parameters = delete_parameters
|
43
|
+
delta.partition_spec = partition_spec
|
41
44
|
return delta
|
42
45
|
|
43
46
|
@staticmethod
|
@@ -90,6 +93,12 @@ class Delta(dict):
|
|
90
93
|
f"Deltas to merge must all share the same delta type "
|
91
94
|
f"(found {len(distinct_delta_types)} delta types)."
|
92
95
|
)
|
96
|
+
distinct_partition_spec = set([d.partition_spec for d in deltas])
|
97
|
+
if len(distinct_partition_spec) > 1:
|
98
|
+
raise ValueError(
|
99
|
+
f"Deltas to merge must all share the same partition spec "
|
100
|
+
f"(found {len(distinct_partition_spec)} partition specs)."
|
101
|
+
)
|
93
102
|
merged_manifest = Manifest.merge_manifests(
|
94
103
|
manifests,
|
95
104
|
manifest_author,
|
@@ -252,7 +261,7 @@ class Delta(dict):
|
|
252
261
|
return None
|
253
262
|
|
254
263
|
@property
|
255
|
-
def partition_values(self) -> Optional[
|
264
|
+
def partition_values(self) -> Optional[PartitionValues]:
|
256
265
|
delta_locator = self.locator
|
257
266
|
if delta_locator:
|
258
267
|
return delta_locator.partition_values
|
@@ -276,6 +285,17 @@ class Delta(dict):
|
|
276
285
|
def delete_parameters(self, delete_parameters: Optional[DeleteParameters]) -> None:
|
277
286
|
self["delete_parameters"] = delete_parameters
|
278
287
|
|
288
|
+
@property
|
289
|
+
def partition_spec(self) -> Optional[DeltaPartitionSpec]:
|
290
|
+
val: Dict[str, Any] = self.get("partitionSpec")
|
291
|
+
if val is not None and not isinstance(val, DeltaPartitionSpec):
|
292
|
+
self.partition_spec = val = DeltaPartitionSpec(val)
|
293
|
+
return val
|
294
|
+
|
295
|
+
@partition_spec.setter
|
296
|
+
def partition_spec(self, value: Optional[DeltaPartitionSpec]) -> None:
|
297
|
+
self["partitionSpec"] = value
|
298
|
+
|
279
299
|
|
280
300
|
class DeltaLocator(Locator, dict):
|
281
301
|
@staticmethod
|
@@ -299,7 +319,7 @@ class DeltaLocator(Locator, dict):
|
|
299
319
|
table_version: Optional[str],
|
300
320
|
stream_id: Optional[str],
|
301
321
|
storage_type: Optional[str],
|
302
|
-
partition_values: Optional[
|
322
|
+
partition_values: Optional[PartitionValues],
|
303
323
|
partition_id: Optional[str],
|
304
324
|
stream_position: Optional[int],
|
305
325
|
) -> DeltaLocator:
|
@@ -365,7 +385,7 @@ class DeltaLocator(Locator, dict):
|
|
365
385
|
return None
|
366
386
|
|
367
387
|
@property
|
368
|
-
def partition_values(self) -> Optional[
|
388
|
+
def partition_values(self) -> Optional[PartitionValues]:
|
369
389
|
partition_locator = self.partition_locator
|
370
390
|
if partition_locator:
|
371
391
|
return partition_locator.partition_values
|
@@ -1,10 +1,9 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
|
-
|
4
3
|
from typing import Any, Dict, List, Optional, Union
|
5
4
|
|
6
5
|
import pyarrow as pa
|
7
|
-
|
6
|
+
from deltacat.storage.model.partition_spec import PartitionValues
|
8
7
|
from deltacat.storage.model.locator import Locator
|
9
8
|
from deltacat.storage.model.namespace import NamespaceLocator
|
10
9
|
from deltacat.storage.model.stream import StreamLocator
|
@@ -127,7 +126,7 @@ class Partition(dict):
|
|
127
126
|
return None
|
128
127
|
|
129
128
|
@property
|
130
|
-
def partition_values(self) -> Optional[
|
129
|
+
def partition_values(self) -> Optional[PartitionValues]:
|
131
130
|
partition_locator = self.locator
|
132
131
|
if partition_locator:
|
133
132
|
return partition_locator.partition_values
|
@@ -199,7 +198,7 @@ class PartitionLocator(Locator, dict):
|
|
199
198
|
@staticmethod
|
200
199
|
def of(
|
201
200
|
stream_locator: Optional[StreamLocator],
|
202
|
-
partition_values: Optional[
|
201
|
+
partition_values: Optional[PartitionValues],
|
203
202
|
partition_id: Optional[str],
|
204
203
|
) -> PartitionLocator:
|
205
204
|
"""
|
@@ -225,7 +224,7 @@ class PartitionLocator(Locator, dict):
|
|
225
224
|
table_version: Optional[str],
|
226
225
|
stream_id: Optional[str],
|
227
226
|
storage_type: Optional[str],
|
228
|
-
partition_values: Optional[
|
227
|
+
partition_values: Optional[PartitionValues],
|
229
228
|
partition_id: Optional[str],
|
230
229
|
) -> PartitionLocator:
|
231
230
|
stream_locator = StreamLocator.at(
|
@@ -253,11 +252,11 @@ class PartitionLocator(Locator, dict):
|
|
253
252
|
self["streamLocator"] = stream_locator
|
254
253
|
|
255
254
|
@property
|
256
|
-
def partition_values(self) -> Optional[
|
255
|
+
def partition_values(self) -> Optional[PartitionValues]:
|
257
256
|
return self.get("partitionValues")
|
258
257
|
|
259
258
|
@partition_values.setter
|
260
|
-
def partition_values(self, partition_values: Optional[
|
259
|
+
def partition_values(self, partition_values: Optional[PartitionValues]) -> None:
|
261
260
|
self["partitionValues"] = partition_values
|
262
261
|
|
263
262
|
@property
|
@@ -0,0 +1,71 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import List, Optional, Any
|
3
|
+
from deltacat.storage.model.transform import Transform
|
4
|
+
|
5
|
+
"""
|
6
|
+
An ordered list of partition values determining the values of
|
7
|
+
ordered transforms specified in the partition spec.
|
8
|
+
"""
|
9
|
+
PartitionValues = List[Any]
|
10
|
+
|
11
|
+
|
12
|
+
class PartitionFilter(dict):
|
13
|
+
"""
|
14
|
+
This class represents a filter for partitions.
|
15
|
+
It is used to filter partitions based on certain criteria.
|
16
|
+
"""
|
17
|
+
|
18
|
+
@staticmethod
|
19
|
+
def of(
|
20
|
+
partition_values: Optional[PartitionValues] = None,
|
21
|
+
) -> PartitionFilter:
|
22
|
+
"""
|
23
|
+
Creates a new PartitionFilter instance with the specified partition values.
|
24
|
+
"""
|
25
|
+
partition_filter = PartitionFilter()
|
26
|
+
partition_filter["partitionValues"] = partition_values
|
27
|
+
return partition_filter
|
28
|
+
|
29
|
+
@property
|
30
|
+
def partition_values(self) -> Optional[PartitionValues]:
|
31
|
+
return self.get("partitionValues")
|
32
|
+
|
33
|
+
|
34
|
+
class PartitionSpec(dict):
|
35
|
+
"""
|
36
|
+
This class determines how the underlying entities in the
|
37
|
+
hierarchy are partitioned. Stream partitions deltas and
|
38
|
+
delta partitions files.
|
39
|
+
"""
|
40
|
+
|
41
|
+
@staticmethod
|
42
|
+
def of(ordered_transforms: List[Transform] = None) -> PartitionSpec:
|
43
|
+
partition_spec = PartitionSpec()
|
44
|
+
partition_spec.ordered_transforms = ordered_transforms
|
45
|
+
return partition_spec
|
46
|
+
|
47
|
+
@property
|
48
|
+
def ordered_transforms(self) -> List[Transform]:
|
49
|
+
return self.get("orderedTransforms")
|
50
|
+
|
51
|
+
@ordered_transforms.setter
|
52
|
+
def ordered_transforms(self, value: List[Transform]) -> None:
|
53
|
+
self["orderedTransforms"] = value
|
54
|
+
|
55
|
+
|
56
|
+
class StreamPartitionSpec(PartitionSpec):
|
57
|
+
"""
|
58
|
+
A class representing a stream partition specification.
|
59
|
+
A stream partitions deltas into multiple different Partitions.
|
60
|
+
"""
|
61
|
+
|
62
|
+
pass
|
63
|
+
|
64
|
+
|
65
|
+
class DeltaPartitionSpec(PartitionSpec):
|
66
|
+
"""
|
67
|
+
A class representing a delta partition specification.
|
68
|
+
The manifest entries in delta are partitioned based on this spec.
|
69
|
+
"""
|
70
|
+
|
71
|
+
pass
|
deltacat/storage/model/stream.py
CHANGED
@@ -8,6 +8,7 @@ from deltacat.storage.model.namespace import NamespaceLocator
|
|
8
8
|
from deltacat.storage.model.table import TableLocator
|
9
9
|
from deltacat.storage.model.table_version import TableVersionLocator
|
10
10
|
from deltacat.storage.model.types import CommitState
|
11
|
+
from deltacat.storage.model.partition_spec import StreamPartitionSpec, PartitionValues
|
11
12
|
|
12
13
|
|
13
14
|
class Stream(dict):
|
@@ -17,12 +18,14 @@ class Stream(dict):
|
|
17
18
|
partition_keys: Optional[List[Dict[str, Any]]],
|
18
19
|
state: Optional[CommitState] = None,
|
19
20
|
previous_stream_digest: Optional[bytes] = None,
|
21
|
+
partition_spec: Optional[StreamPartitionSpec] = None,
|
20
22
|
) -> Stream:
|
21
23
|
stream = Stream()
|
22
24
|
stream.locator = locator
|
23
25
|
stream.partition_keys = partition_keys
|
24
26
|
stream.state = state
|
25
27
|
stream.previous_stream_digest = previous_stream_digest
|
28
|
+
stream.partition_spec = partition_spec
|
26
29
|
return stream
|
27
30
|
|
28
31
|
@property
|
@@ -38,6 +41,14 @@ class Stream(dict):
|
|
38
41
|
|
39
42
|
@property
|
40
43
|
def partition_keys(self) -> Optional[List[Dict[str, Any]]]:
|
44
|
+
"""
|
45
|
+
Ordered list of unique column names in the table schema on
|
46
|
+
which the underlying data is partitioned. Either partition_spec
|
47
|
+
or partition_keys must be specified but not both.
|
48
|
+
|
49
|
+
(Deprecated): Partition keys will be deprecated in favor
|
50
|
+
of partition_spec in future releases.
|
51
|
+
"""
|
41
52
|
return self.get("partitionKeys")
|
42
53
|
|
43
54
|
@partition_keys.setter
|
@@ -46,6 +57,9 @@ class Stream(dict):
|
|
46
57
|
|
47
58
|
@property
|
48
59
|
def previous_stream_digest(self) -> Optional[str]:
|
60
|
+
"""
|
61
|
+
Previous stream digest
|
62
|
+
"""
|
49
63
|
return self.get("previousStreamDigest")
|
50
64
|
|
51
65
|
@previous_stream_digest.setter
|
@@ -54,6 +68,9 @@ class Stream(dict):
|
|
54
68
|
|
55
69
|
@property
|
56
70
|
def state(self) -> Optional[CommitState]:
|
71
|
+
"""
|
72
|
+
The commit state of a stream.
|
73
|
+
"""
|
57
74
|
state = self.get("state")
|
58
75
|
return None if state is None else CommitState(state)
|
59
76
|
|
@@ -61,6 +78,26 @@ class Stream(dict):
|
|
61
78
|
def state(self, state: Optional[CommitState]) -> None:
|
62
79
|
self["state"] = state
|
63
80
|
|
81
|
+
@property
|
82
|
+
def partition_spec(self) -> Optional[StreamPartitionSpec]:
|
83
|
+
"""
|
84
|
+
If a table uses complex partitioning instead of identity,
|
85
|
+
partition spec can be specified to define that strategy.
|
86
|
+
For example, a partition spec can define a bucketing strategy
|
87
|
+
on composite column values or can define iceberg compliant
|
88
|
+
bucketing.
|
89
|
+
|
90
|
+
Either partition_spec or partition_keys must be specified but not both.
|
91
|
+
"""
|
92
|
+
val: Dict[str, Any] = self.get("partitionSpec")
|
93
|
+
if val is not None and not isinstance(val, StreamPartitionSpec):
|
94
|
+
self.partition_spec = val = StreamPartitionSpec(val)
|
95
|
+
return val
|
96
|
+
|
97
|
+
@partition_spec.setter
|
98
|
+
def partition_spec(self, spec: StreamPartitionSpec) -> None:
|
99
|
+
self["partitionSpec"] = spec
|
100
|
+
|
64
101
|
@property
|
65
102
|
def namespace_locator(self) -> Optional[NamespaceLocator]:
|
66
103
|
stream_locator = self.locator
|
@@ -110,7 +147,7 @@ class Stream(dict):
|
|
110
147
|
return stream_locator.table_version
|
111
148
|
return None
|
112
149
|
|
113
|
-
def validate_partition_values(self, partition_values: Optional[
|
150
|
+
def validate_partition_values(self, partition_values: Optional[PartitionValues]):
|
114
151
|
# TODO (pdames): ensure value data types match key data types
|
115
152
|
partition_keys = self.partition_keys
|
116
153
|
num_keys = len(partition_keys) if partition_keys else 0
|
@@ -0,0 +1,127 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import List
|
3
|
+
from enum import Enum
|
4
|
+
|
5
|
+
|
6
|
+
class TransformName(str, Enum):
|
7
|
+
IDENTITY = "identity"
|
8
|
+
BUCKET = "bucket"
|
9
|
+
|
10
|
+
|
11
|
+
class TransformParameters(dict):
|
12
|
+
"""
|
13
|
+
This is a parent class that contains properties
|
14
|
+
to be passed to the corresponding transform
|
15
|
+
"""
|
16
|
+
|
17
|
+
pass
|
18
|
+
|
19
|
+
|
20
|
+
class IdentityTransformParameters(TransformParameters):
|
21
|
+
"""
|
22
|
+
This class is used to pass parameters to the identity transform
|
23
|
+
"""
|
24
|
+
|
25
|
+
@staticmethod
|
26
|
+
def of(column_name: str) -> IdentityTransformParameters:
|
27
|
+
identify_transform_parameters = IdentityTransformParameters()
|
28
|
+
identify_transform_parameters["columnName"] = column_name
|
29
|
+
return identify_transform_parameters
|
30
|
+
|
31
|
+
@property
|
32
|
+
def column_name(self) -> str:
|
33
|
+
"""
|
34
|
+
The name of the column to use for identity transform
|
35
|
+
"""
|
36
|
+
return self["columnName"]
|
37
|
+
|
38
|
+
@column_name.setter
|
39
|
+
def column_name(self, value: str) -> None:
|
40
|
+
self["columnName"] = value
|
41
|
+
|
42
|
+
|
43
|
+
class BucketingStrategy(str, Enum):
|
44
|
+
"""
|
45
|
+
A bucketing strategy for the transform
|
46
|
+
"""
|
47
|
+
|
48
|
+
# Uses default deltacat bucketing strategy.
|
49
|
+
# This strategy supports hashing on composite keys
|
50
|
+
# and uses SHA1 hashing for determining the bucket.
|
51
|
+
# If no columns passed, it will use a random UUID
|
52
|
+
# for determining the bucket.
|
53
|
+
DEFAULT = "default"
|
54
|
+
|
55
|
+
# Uses iceberg compliant bucketing strategy.
|
56
|
+
# As indicated in the iceberg spec, it does not support
|
57
|
+
# composite keys and uses murmur3 hash for determining
|
58
|
+
# the bucket.
|
59
|
+
# See https://iceberg.apache.org/spec/#partitioning
|
60
|
+
ICEBERG = "iceberg"
|
61
|
+
|
62
|
+
|
63
|
+
class BucketTransformParameters(TransformParameters):
|
64
|
+
"""
|
65
|
+
Encapsulates parameters for the bucket transform.
|
66
|
+
"""
|
67
|
+
|
68
|
+
def of(
|
69
|
+
self,
|
70
|
+
num_buckets: int,
|
71
|
+
column_names: List[str],
|
72
|
+
bucketing_strategy: BucketingStrategy,
|
73
|
+
) -> BucketTransformParameters:
|
74
|
+
bucket_transform_parameters = BucketTransformParameters()
|
75
|
+
bucket_transform_parameters["numBuckets"] = num_buckets
|
76
|
+
bucket_transform_parameters["columnNames"] = column_names
|
77
|
+
bucket_transform_parameters["bucketingStrategy"] = bucketing_strategy
|
78
|
+
|
79
|
+
return bucket_transform_parameters
|
80
|
+
|
81
|
+
@property
|
82
|
+
def num_buckets(self) -> int:
|
83
|
+
"""
|
84
|
+
The total number of buckets to create for values of the column
|
85
|
+
"""
|
86
|
+
return self["numBuckets"]
|
87
|
+
|
88
|
+
@property
|
89
|
+
def column_names(self) -> List[str]:
|
90
|
+
"""
|
91
|
+
An ordered list of unique column names from the table schema
|
92
|
+
to use for bucketing.
|
93
|
+
"""
|
94
|
+
return self["columnNames"]
|
95
|
+
|
96
|
+
@property
|
97
|
+
def bucketing_strategy(self) -> BucketingStrategy:
|
98
|
+
"""
|
99
|
+
The bucketing strategy to use.
|
100
|
+
"""
|
101
|
+
return self["bucketingStrategy"]
|
102
|
+
|
103
|
+
|
104
|
+
class Transform(dict):
|
105
|
+
"""
|
106
|
+
A transform represents how a particular column value can be
|
107
|
+
transformed into a new value. This is mostly used in the context
|
108
|
+
of partitioning the data files in a table.
|
109
|
+
"""
|
110
|
+
|
111
|
+
@staticmethod
|
112
|
+
def of(
|
113
|
+
name: TransformName,
|
114
|
+
parameters: TransformParameters,
|
115
|
+
) -> Transform:
|
116
|
+
partition_transform = Transform()
|
117
|
+
partition_transform["name"] = name
|
118
|
+
partition_transform["parameters"] = parameters
|
119
|
+
return partition_transform
|
120
|
+
|
121
|
+
@property
|
122
|
+
def name(self) -> TransformName:
|
123
|
+
return self["name"]
|
124
|
+
|
125
|
+
@property
|
126
|
+
def parameters(self) -> TransformParameters:
|
127
|
+
return self["parameters"]
|
deltacat/tests/aws/test_s3u.py
CHANGED
@@ -20,6 +20,7 @@ from botocore.exceptions import (
|
|
20
20
|
ConnectTimeoutError,
|
21
21
|
HTTPClientError,
|
22
22
|
)
|
23
|
+
from ray.data.datasource import FilenameProvider
|
23
24
|
from deltacat.exceptions import NonRetryableError
|
24
25
|
from moto import mock_s3
|
25
26
|
from tenacity import RetryError
|
@@ -34,6 +35,7 @@ class TestUuidBlockWritePathProvider(unittest.TestCase):
|
|
34
35
|
|
35
36
|
result = provider("base_path")
|
36
37
|
|
38
|
+
self.assertTrue(isinstance(provider, FilenameProvider))
|
37
39
|
self.assertRegex(result, r"^base_path/[\w-]{36}$")
|
38
40
|
|
39
41
|
|