deltacat 1.1.9__py3-none-any.whl → 1.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/redshift/model/manifest.py +16 -0
- deltacat/aws/s3u.py +19 -13
- deltacat/compute/compactor/compaction_session.py +5 -1
- deltacat/compute/compactor/repartition_session.py +1 -0
- deltacat/compute/compactor/utils/round_completion_file.py +39 -9
- deltacat/compute/compactor_v2/compaction_session.py +15 -11
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +1 -1
- deltacat/exceptions.py +5 -2
- deltacat/io/dataset.py +5 -17
- deltacat/storage/__init__.py +24 -0
- deltacat/storage/interface.py +42 -6
- deltacat/storage/model/delta.py +23 -3
- deltacat/storage/model/partition.py +6 -7
- deltacat/storage/model/partition_spec.py +71 -0
- deltacat/storage/model/stream.py +38 -1
- deltacat/storage/model/transform.py +127 -0
- deltacat/tests/aws/test_s3u.py +2 -0
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +231 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +201 -36
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
- deltacat/tests/compute/test_util_common.py +19 -4
- deltacat/tests/local_deltacat_storage/__init__.py +83 -19
- deltacat/tests/test_utils/pyarrow.py +4 -1
- deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
- deltacat/utils/numpy.py +3 -3
- deltacat/utils/pandas.py +3 -3
- deltacat/utils/pyarrow.py +3 -3
- deltacat/utils/ray_utils/dataset.py +7 -7
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/METADATA +6 -5
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/RECORD +36 -33
- deltacat/io/aws/redshift/redshift_datasource.py +0 -578
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/LICENSE +0 -0
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/WHEEL +0 -0
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/top_level.txt +0 -0
deltacat/storage/model/delta.py
CHANGED
@@ -12,6 +12,7 @@ from deltacat.storage.model.stream import StreamLocator
 from deltacat.storage.model.table import TableLocator
 from deltacat.storage.model.table_version import TableVersionLocator
 from deltacat.storage.model.types import DeltaType
+from deltacat.storage.model.partition_spec import DeltaPartitionSpec, PartitionValues
 
 
 class Delta(dict):
@@ -24,6 +25,7 @@ class Delta(dict):
         manifest: Optional[Manifest],
         previous_stream_position: Optional[int] = None,
         delete_parameters: Optional[DeleteParameters] = None,
+        partition_spec: Optional[DeltaPartitionSpec] = None,
     ) -> Delta:
         """
         Creates a Delta metadata model with the given Delta Locator, Delta Type,
@@ -38,6 +40,7 @@ class Delta(dict):
         delta.manifest = manifest
         delta.previous_stream_position = previous_stream_position
         delta.delete_parameters = delete_parameters
+        delta.partition_spec = partition_spec
         return delta
 
     @staticmethod
@@ -90,6 +93,12 @@ class Delta(dict):
                 f"Deltas to merge must all share the same delta type "
                 f"(found {len(distinct_delta_types)} delta types)."
             )
+        distinct_partition_spec = set([d.partition_spec for d in deltas])
+        if len(distinct_partition_spec) > 1:
+            raise ValueError(
+                f"Deltas to merge must all share the same partition spec "
+                f"(found {len(distinct_partition_spec)} partition specs)."
+            )
         merged_manifest = Manifest.merge_manifests(
             manifests,
             manifest_author,
@@ -252,7 +261,7 @@
         return None
 
     @property
-    def partition_values(self) -> Optional[List[Any]]:
+    def partition_values(self) -> Optional[PartitionValues]:
         delta_locator = self.locator
         if delta_locator:
             return delta_locator.partition_values
@@ -276,6 +285,17 @@
     def delete_parameters(self, delete_parameters: Optional[DeleteParameters]) -> None:
         self["delete_parameters"] = delete_parameters
 
+    @property
+    def partition_spec(self) -> Optional[DeltaPartitionSpec]:
+        val: Dict[str, Any] = self.get("partitionSpec")
+        if val is not None and not isinstance(val, DeltaPartitionSpec):
+            self.partition_spec = val = DeltaPartitionSpec(val)
+        return val
+
+    @partition_spec.setter
+    def partition_spec(self, value: Optional[DeltaPartitionSpec]) -> None:
+        self["partitionSpec"] = value
+
 
 class DeltaLocator(Locator, dict):
     @staticmethod
@@ -299,7 +319,7 @@ class DeltaLocator(Locator, dict):
         table_version: Optional[str],
         stream_id: Optional[str],
         storage_type: Optional[str],
-        partition_values: Optional[List[Any]],
+        partition_values: Optional[PartitionValues],
         partition_id: Optional[str],
         stream_position: Optional[int],
     ) -> DeltaLocator:
@@ -365,7 +385,7 @@
         return None
 
     @property
-    def partition_values(self) -> Optional[List[Any]]:
+    def partition_values(self) -> Optional[PartitionValues]:
         partition_locator = self.partition_locator
         if partition_locator:
            return partition_locator.partition_values
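The new partition_spec accessor on Delta lazily wraps stored metadata, and merge_deltas now refuses to merge deltas with differing partition specs. A minimal illustrative sketch (the metadata dict below is hypothetical, not taken from the package):

from deltacat.storage.model.delta import Delta
from deltacat.storage.model.partition_spec import DeltaPartitionSpec

# hypothetical delta metadata, e.g. as deserialized from JSON
delta = Delta({"partitionSpec": {"orderedTransforms": []}})

# first access wraps the plain dict into a DeltaPartitionSpec
spec = delta.partition_spec
assert isinstance(spec, DeltaPartitionSpec)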
deltacat/storage/model/partition.py
CHANGED
@@ -1,10 +1,9 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
-
 from typing import Any, Dict, List, Optional, Union
 
 import pyarrow as pa
-
+from deltacat.storage.model.partition_spec import PartitionValues
 from deltacat.storage.model.locator import Locator
 from deltacat.storage.model.namespace import NamespaceLocator
 from deltacat.storage.model.stream import StreamLocator
@@ -127,7 +126,7 @@ class Partition(dict):
         return None
 
     @property
-    def partition_values(self) -> Optional[List[Any]]:
+    def partition_values(self) -> Optional[PartitionValues]:
         partition_locator = self.locator
         if partition_locator:
             return partition_locator.partition_values
@@ -199,7 +198,7 @@ class PartitionLocator(Locator, dict):
     @staticmethod
     def of(
         stream_locator: Optional[StreamLocator],
-        partition_values: Optional[List[Any]],
+        partition_values: Optional[PartitionValues],
         partition_id: Optional[str],
     ) -> PartitionLocator:
         """
@@ -225,7 +224,7 @@ class PartitionLocator(Locator, dict):
         table_version: Optional[str],
         stream_id: Optional[str],
         storage_type: Optional[str],
-        partition_values: Optional[List[Any]],
+        partition_values: Optional[PartitionValues],
         partition_id: Optional[str],
     ) -> PartitionLocator:
         stream_locator = StreamLocator.at(
@@ -253,11 +252,11 @@ class PartitionLocator(Locator, dict):
         self["streamLocator"] = stream_locator
 
     @property
-    def partition_values(self) -> Optional[List[Any]]:
+    def partition_values(self) -> Optional[PartitionValues]:
         return self.get("partitionValues")
 
     @partition_values.setter
-    def partition_values(self, partition_values: Optional[List[Any]]) -> None:
+    def partition_values(self, partition_values: Optional[PartitionValues]) -> None:
         self["partitionValues"] = partition_values
 
     @property
deltacat/storage/model/partition_spec.py
ADDED
@@ -0,0 +1,71 @@
+from __future__ import annotations
+from typing import List, Optional, Any
+from deltacat.storage.model.transform import Transform
+
+"""
+An ordered list of partition values determining the values of
+ordered transforms specified in the partition spec.
+"""
+PartitionValues = List[Any]
+
+
+class PartitionFilter(dict):
+    """
+    This class represents a filter for partitions.
+    It is used to filter partitions based on certain criteria.
+    """
+
+    @staticmethod
+    def of(
+        partition_values: Optional[PartitionValues] = None,
+    ) -> PartitionFilter:
+        """
+        Creates a new PartitionFilter instance with the specified partition key and value.
+        """
+        partition_filter = PartitionFilter()
+        partition_filter["partitionValues"] = partition_values
+        return partition_filter
+
+    @property
+    def partition_values(self) -> Optional[PartitionValues]:
+        return self.get("partitionValues")
+
+
+class PartitionSpec(dict):
+    """
+    This class determines how the underlying entities in the
+    hierarchy are partitioned. Stream partitions deltas and
+    delta partitions files.
+    """
+
+    @staticmethod
+    def of(ordered_transforms: List[Transform] = None) -> PartitionSpec:
+        partition_spec = PartitionSpec()
+        partition_spec.ordered_transforms = ordered_transforms
+        return partition_spec
+
+    @property
+    def ordered_transforms(self) -> List[Transform]:
+        return self.get("orderedTransforms")
+
+    @ordered_transforms.setter
+    def ordered_transforms(self, value: List[Transform]) -> None:
+        self["orderedTransforms"] = value
+
+
+class StreamPartitionSpec(PartitionSpec):
+    """
+    A class representing a stream partition specification.
+    A stream partitions deltas into multiple different Partition
+    """
+
+    pass
+
+
+class DeltaPartitionSpec(PartitionSpec):
+    """
+    A class representing delta partition specification.
+    The manifest entries in delta are partitioned based on this spec.
+    """
+
+    pass
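A short usage sketch of the new partition spec and filter models, combined with the Transform model added in deltacat/storage/model/transform.py later in this diff (the "region" column and "us-east-1" value are hypothetical):

from deltacat.storage.model.partition_spec import PartitionFilter, PartitionSpec
from deltacat.storage.model.transform import (
    IdentityTransformParameters,
    Transform,
    TransformName,
)

# a spec that identity-partitions on a single column
spec = PartitionSpec.of(
    ordered_transforms=[
        Transform.of(
            name=TransformName.IDENTITY,
            parameters=IdentityTransformParameters.of(column_name="region"),
        )
    ]
)
assert spec.ordered_transforms[0].name == TransformName.IDENTITY

# a filter selecting a single partition value
partition_filter = PartitionFilter.of(partition_values=["us-east-1"])
assert partition_filter.partition_values == ["us-east-1"]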
deltacat/storage/model/stream.py
CHANGED
@@ -8,6 +8,7 @@ from deltacat.storage.model.namespace import NamespaceLocator
 from deltacat.storage.model.table import TableLocator
 from deltacat.storage.model.table_version import TableVersionLocator
 from deltacat.storage.model.types import CommitState
+from deltacat.storage.model.partition_spec import StreamPartitionSpec, PartitionValues
 
 
 class Stream(dict):
@@ -17,12 +18,14 @@ class Stream(dict):
         partition_keys: Optional[List[Dict[str, Any]]],
         state: Optional[CommitState] = None,
         previous_stream_digest: Optional[bytes] = None,
+        partition_spec: Optional[StreamPartitionSpec] = None,
     ) -> Stream:
         stream = Stream()
         stream.locator = locator
         stream.partition_keys = partition_keys
         stream.state = state
         stream.previous_stream_digest = previous_stream_digest
+        stream.partition_spec = partition_spec
         return stream
 
     @property
@@ -38,6 +41,14 @@ class Stream(dict):
 
     @property
     def partition_keys(self) -> Optional[List[Dict[str, Any]]]:
+        """
+        Ordered list of unique column names in the table schema on
+        which the underlying data is partitioned. Either partition_spec
+        or partition_keys must be specified but not both.
+
+        (Deprecated): Partition keys will be deprecated in the favor
+        of partition_spec in future releases.
+        """
         return self.get("partitionKeys")
 
     @partition_keys.setter
@@ -46,6 +57,9 @@ class Stream(dict):
 
     @property
     def previous_stream_digest(self) -> Optional[str]:
+        """
+        Previous stream digest
+        """
         return self.get("previousStreamDigest")
 
     @previous_stream_digest.setter
@@ -54,6 +68,9 @@ class Stream(dict):
 
     @property
     def state(self) -> Optional[CommitState]:
+        """
+        The commit state of a stream.
+        """
         state = self.get("state")
         return None if state is None else CommitState(state)
 
@@ -61,6 +78,26 @@ class Stream(dict):
     def state(self, state: Optional[CommitState]) -> None:
         self["state"] = state
 
+    @property
+    def partition_spec(self) -> Optional[StreamPartitionSpec]:
+        """
+        If a table uses complex partitioning instead of identity,
+        partition spec can be specified to define that strategy.
+        For example, a partition spec can define a bucketing strategy
+        on composite column values or can define iceberg compliant
+        bucketing.
+
+        Either partition_spec or partition_keys must be specified but not both.
+        """
+        val: Dict[str, Any] = self.get("partitionSpec")
+        if val is not None and not isinstance(val, StreamPartitionSpec):
+            self.partition_spec = val = StreamPartitionSpec(val)
+        return val
+
+    @partition_spec.setter
+    def partition_spec(self, spec: StreamPartitionSpec) -> None:
+        self["partitionSpec"] = spec
+
     @property
     def namespace_locator(self) -> Optional[NamespaceLocator]:
         stream_locator = self.locator
@@ -110,7 +147,7 @@ class Stream(dict):
             return stream_locator.table_version
         return None
 
-    def validate_partition_values(self, partition_values: Optional[List[Any]]):
+    def validate_partition_values(self, partition_values: Optional[PartitionValues]):
         # TODO (pdames): ensure value data types match key data types
         partition_keys = self.partition_keys
         num_keys = len(partition_keys) if partition_keys else 0
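An illustrative sketch of attaching a partition spec to a stream via the new Stream.of parameter; locator and partition_keys are left as None purely for brevity (a real stream would carry a StreamLocator, and partition_keys is meant to be omitted when partition_spec is supplied):

from deltacat.storage.model.stream import Stream
from deltacat.storage.model.partition_spec import StreamPartitionSpec

# an empty spec purely for illustration; real specs would carry transforms
spec = StreamPartitionSpec.of(ordered_transforms=[])

stream = Stream.of(None, None, partition_spec=spec)

# the getter re-wraps the stored value as a StreamPartitionSpec; dict equality holds
assert stream.partition_spec == spec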
deltacat/storage/model/transform.py
ADDED
@@ -0,0 +1,127 @@
+from __future__ import annotations
+from typing import List
+from enum import Enum
+
+
+class TransformName(str, Enum):
+    IDENTITY = "identity"
+    BUCKET = "bucket"
+
+
+class TransformParameters(dict):
+    """
+    This is a parent class that contains properties
+    to be passed to the corresponding transform
+    """
+
+    pass
+
+
+class IdentityTransformParameters(TransformParameters):
+    """
+    This class is used to pass parameters to the identity transform
+    """
+
+    @staticmethod
+    def of(column_name: str) -> IdentityTransformParameters:
+        identify_transform_parameters = IdentityTransformParameters()
+        identify_transform_parameters["columnName"] = column_name
+        return identify_transform_parameters
+
+    @property
+    def column_name(self) -> str:
+        """
+        The name of the column to use for identity transform
+        """
+        return self["columnName"]
+
+    @column_name.setter
+    def column_name(self, value: str) -> None:
+        self["columnName"] = value
+
+
+class BucketingStrategy(str, Enum):
+    """
+    A bucketing strategy for the transform
+    """
+
+    # Uses default deltacat bucketing strategy.
+    # This strategy supports hashing on composite keys
+    # and uses SHA1 hashing for determining the bucket.
+    # If no columns passed, it will use a random UUID
+    # for determining the bucket.
+    DEFAULT = "default"
+
+    # Uses iceberg compliant bucketing strategy.
+    # As indicated in the iceberg spec, it does not support
+    # composite keys and uses murmur3 hash for determining
+    # the bucket.
+    # See https://iceberg.apache.org/spec/#partitioning
+    ICEBERG = "iceberg"
+
+
+class BucketTransformParameters(TransformParameters):
+    """
+    Encapsulates parameters for the bucket transform.
+    """
+
+    def of(
+        self,
+        num_buckets: int,
+        column_names: List[str],
+        bucketing_strategy: BucketingStrategy,
+    ) -> BucketTransformParameters:
+        bucket_transform_parameters = BucketTransformParameters()
+        bucket_transform_parameters["numBuckets"] = num_buckets
+        bucket_transform_parameters["columnNames"] = column_names
+        bucket_transform_parameters["bucketingStrategy"] = bucketing_strategy
+
+        return bucket_transform_parameters
+
+    @property
+    def num_buckets(self) -> int:
+        """
+        The total number of buckets to create for values of the column
+        """
+        return self["numBuckets"]
+
+    @property
+    def column_names(self) -> List[str]:
+        """
+        An ordered list of unique column names from the table schema
+        to use for bucketings.
+        """
+        return self["columnNames"]
+
+    @property
+    def bucketing_strategy(self) -> BucketingStrategy:
+        """
+        The bucketing strategy to used.
+        """
+        return self["bucketingStrategy"]
+
+
+class Transform(dict):
+    """
+    A transform is represents how a particular column value can be
+    transformed into a new value. This is mostly used in the context
+    of partitioning the data files in a table.
+    """
+
+    @staticmethod
+    def of(
+        name: TransformName,
+        parameters: TransformParameters,
+    ) -> Transform:
+        partition_transform = Transform()
+        partition_transform["name"] = name
+        partition_transform["parameters"] = parameters
+        return partition_transform
+
+    @property
+    def name(self) -> TransformName:
+        return self["name"]
+
+    @property
+    def parameters(self) -> TransformParameters:
+        return self["parameters"]
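An illustrative sketch of building a bucket transform from the new model (the column name is hypothetical; note that in this release BucketTransformParameters.of is declared as an instance method, so it is invoked on a fresh instance here):

from deltacat.storage.model.transform import (
    BucketingStrategy,
    BucketTransformParameters,
    Transform,
    TransformName,
)

# bucket on one column using the default (SHA1-based) deltacat strategy
bucket_params = BucketTransformParameters().of(
    num_buckets=8,
    column_names=["customer_id"],
    bucketing_strategy=BucketingStrategy.DEFAULT,
)

bucket_transform = Transform.of(
    name=TransformName.BUCKET,
    parameters=bucket_params,
)
assert bucket_transform.parameters.num_buckets == 8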
deltacat/tests/aws/test_s3u.py
CHANGED
@@ -20,6 +20,7 @@ from botocore.exceptions import (
     ConnectTimeoutError,
     HTTPClientError,
 )
+from ray.data.datasource import FilenameProvider
 from deltacat.exceptions import NonRetryableError
 from moto import mock_s3
 from tenacity import RetryError
@@ -34,6 +35,7 @@ class TestUuidBlockWritePathProvider(unittest.TestCase):
 
         result = provider("base_path")
 
+        self.assertTrue(isinstance(provider, FilenameProvider))
         self.assertRegex(result, r"^base_path/[\w-]{36}$")
 
 
deltacat/tests/compute/compactor/utils/test_round_completion_file.py
ADDED
@@ -0,0 +1,231 @@
+import pytest
+import os
+from moto import mock_s3
+import boto3
+from boto3.resources.base import ServiceResource
+from deltacat.compute.compactor.utils.round_completion_file import (
+    read_round_completion_file,
+    write_round_completion_file,
+)
+from deltacat.tests.compute.test_util_common import get_test_partition_locator
+from deltacat.compute.compactor import RoundCompletionInfo
+
+RCF_BUCKET_NAME = "rcf-bucket"
+
+
+@pytest.fixture(autouse=True, scope="module")
+def mock_aws_credential():
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
+    os.environ["AWS_SECURITY_TOKEN"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+    yield
+
+
+@pytest.fixture(autouse=True, scope="module")
+def s3_resource(mock_aws_credential):
+    with mock_s3():
+        yield boto3.resource("s3")
+
+
+@pytest.fixture(autouse=True, scope="function")
+def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
+    s3_resource.create_bucket(
+        ACL="authenticated-read",
+        Bucket=RCF_BUCKET_NAME,
+    )
+    yield
+    s3_resource.Bucket(RCF_BUCKET_NAME).objects.all().delete()
+
+
+class TestReadWriteRoundCompletionFile:
+    def test_read_when_rcf_written_without_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+        destination_locator = get_test_partition_locator("destination")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        rcf_url = write_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, None, expected_rcf
+        )
+
+        rcf = read_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator
+        )
+
+        assert (
+            rcf_url == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507.json"
+        )
+        assert rcf == expected_rcf
+
+    def test_read_when_rcf_written_with_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+        destination_locator = get_test_partition_locator("destination")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        rcf_url = write_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
+        )
+
+        rcf = read_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator
+        )
+
+        assert (
+            rcf_url
+            == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507/e9939deadc091b3289a2eb0ca56b1ba86b9892f4.json"
+        )
+        assert rcf == expected_rcf
+
+    def test_read_without_destination_when_rcf_written_with_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+        destination_locator = get_test_partition_locator("destination")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        write_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
+        )
+
+        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
+
+        assert rcf is None
+
+    def test_read_without_destination_when_rcf_written_without_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)
+
+        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
+
+        assert rcf == expected_rcf
+
+    def test_read_when_rcf_written_both_with_and_without_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+        destination_locator = get_test_partition_locator("destination")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        expected_rcf_2 = RoundCompletionInfo.of(
+            high_watermark=1223,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=1233,
+        )
+
+        write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)
+
+        write_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf_2
+        )
+
+        rcf = read_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator
+        )
+
+        assert rcf == expected_rcf_2
+
+    def test_read_when_none_destination_partition_id(self):
+
+        source_locator = get_test_partition_locator("source")
+        destination_locator = get_test_partition_locator(None)
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        write_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
+        )
+
+        rcf = read_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator
+        )
+
+        assert rcf == expected_rcf
+
+    def test_write_when_custom_url_is_passed(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        completion_file_s3_url = f"s3://{RCF_BUCKET_NAME}/test.json"
+        rcf_url = write_round_completion_file(
+            RCF_BUCKET_NAME,
+            source_locator,
+            None,
+            expected_rcf,
+            completion_file_s3_url=completion_file_s3_url,
+        )
+
+        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
+
+        assert rcf_url == completion_file_s3_url
+        assert rcf is None