deltacat 0.1.18b3__py3-none-any.whl → 0.1.18b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/compaction_session.py +184 -29
- deltacat/compute/compactor/model/compact_partition_params.py +153 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +725 -0
- deltacat/compute/compactor/model/dedupe_result.py +3 -0
- deltacat/compute/compactor/model/delta_file_envelope.py +8 -0
- deltacat/compute/compactor/model/delta_file_locator.py +11 -6
- deltacat/compute/compactor/model/hash_bucket_result.py +3 -0
- deltacat/compute/compactor/model/materialize_result.py +27 -6
- deltacat/compute/compactor/model/round_completion_info.py +9 -0
- deltacat/compute/compactor/steps/dedupe.py +35 -19
- deltacat/compute/compactor/steps/hash_bucket.py +41 -16
- deltacat/compute/compactor/steps/materialize.py +73 -70
- deltacat/compute/compactor/utils/io.py +15 -0
- deltacat/compute/compactor/utils/primary_key_index.py +9 -15
- deltacat/compute/compactor/utils/round_completion_file.py +13 -4
- deltacat/compute/compactor/utils/system_columns.py +32 -0
- deltacat/io/__init__.py +0 -7
- deltacat/io/file_object_store.py +48 -0
- deltacat/io/memcached_object_store.py +121 -0
- deltacat/io/object_store.py +51 -0
- deltacat/io/ray_plasma_object_store.py +23 -0
- deltacat/io/redis_object_store.py +114 -0
- deltacat/io/s3_object_store.py +44 -0
- deltacat/storage/model/delta.py +2 -1
- deltacat/tests/compactor/test_compact_partition_params.py +237 -0
- deltacat/tests/compactor/utils/test_io.py +27 -5
- deltacat/tests/io/__init__.py +0 -0
- deltacat/tests/io/test_file_object_store.py +86 -0
- deltacat/tests/io/test_memcached_object_store.py +158 -0
- deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
- deltacat/tests/io/test_redis_object_store.py +103 -0
- deltacat/tests/io/test_s3_object_store.py +59 -0
- deltacat/tests/utils/test_record_batch_tables.py +1 -1
- deltacat/tests/utils/test_resources.py +9 -0
- deltacat/utils/ray_utils/concurrency.py +0 -2
- deltacat/utils/resources.py +30 -18
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/METADATA +3 -1
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/RECORD +42 -27
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/WHEEL +0 -0
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/top_level.txt +0 -0
deltacat/io/object_store.py
ADDED
@@ -0,0 +1,51 @@
+from typing import List, Any
+
+
+class IObjectStore:
+    """
+    An object store interface.
+    """
+
+    def setup(self, *args, **kwargs) -> Any:
+        ...
+
+        """
+        Sets up everything needed to run the object store.
+        """
+
+    def put(self, obj: object, *args, **kwargs) -> Any:
+        """
+        Put a single object into the object store. Depending
+        on the implementation, this method can be sync or async.
+        """
+        return self.put_many([obj])[0]
+
+    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+        ...
+
+        """
+        Put many objects into the object store. It would return an ordered list
+        of object references corresponding to each object in the input.
+        """
+
+    def get(self, ref: Any, *args, **kwargs) -> object:
+        """
+        Get a single object from an object store.
+        """
+        return self.get_many([ref])[0]
+
+    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+        ...
+
+        """
+        Get a list of objects from the object store. Use this method to
+        avoid multiple get calls. Note that depending on implementation it may
+        or may not return ordered results.
+        """
+
+    def clear(self, *args, **kwargs) -> bool:
+        ...
+
+        """
+        Clears the object store and all the associated data in it.
+        """
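
For orientation, the interface above only requires put_many, get_many, and optionally clear; the single-object put and get methods delegate to their batch counterparts. Below is a minimal, hypothetical dict-backed implementation written against this interface purely for illustration; it is not part of the released package.

# Illustrative only -- not shipped with deltacat.
from typing import Any, Dict, List
import uuid

from deltacat.io.object_store import IObjectStore


class InMemoryObjectStore(IObjectStore):
    """A trivial dict-backed store; put()/get() are inherited from
    IObjectStore and delegate to the batch methods implemented here."""

    def __init__(self) -> None:
        self._data: Dict[str, object] = {}

    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
        refs = []
        for obj in objects:
            ref = str(uuid.uuid4())  # opaque reference handed back to callers
            self._data[ref] = obj
            refs.append(ref)
        return refs

    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
        return [self._data[ref] for ref in refs]

    def clear(self, *args, **kwargs) -> bool:
        self._data.clear()
        return True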
deltacat/io/ray_plasma_object_store.py
ADDED
@@ -0,0 +1,23 @@
+import ray
+from ray import cloudpickle
+from deltacat.io.object_store import IObjectStore
+from typing import Any, List
+
+
+class RayPlasmaObjectStore(IObjectStore):
+    """
+    An implementation of object store that uses Ray plasma object store.
+    """
+
+    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+        result = []
+        for obj in objects:
+            object_ref = ray.put(obj)
+            pickled = cloudpickle.dumps(object_ref)
+            result.append(pickled)
+
+        return result
+
+    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+        loaded_refs = [cloudpickle.loads(obj_id) for obj_id in refs]
+        return ray.get(loaded_refs)
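
A short usage sketch of the plasma-backed store above, assuming a local Ray runtime can be started (the import path follows the file list at the top of this diff):

# Illustrative usage sketch -- assumes Ray is installed and initializable locally.
import ray

from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore

ray.init(ignore_reinit_error=True)

store = RayPlasmaObjectStore()
refs = store.put_many([{"a": 1}, {"b": 2}])  # pickled Ray ObjectRefs
assert store.get_many(refs) == [{"a": 1}, {"b": 2}]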
deltacat/io/redis_object_store.py
ADDED
@@ -0,0 +1,114 @@
+import logging
+from ray import cloudpickle
+import time
+from deltacat.io.object_store import IObjectStore
+from typing import Any, List
+from deltacat import logs
+import uuid
+import socket
+import redis
+from collections import defaultdict
+
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class RedisObjectStore(IObjectStore):
+    """
+    An implementation of object store that uses Redis in memory DB.
+    """
+
+    def __init__(self) -> None:
+        self.client_cache = {}
+        self.current_ip = None
+        self.SEPARATOR = "_"
+        super().__init__()
+
+    def put(self, obj: object, *args, **kwargs) -> Any:
+        serialized = cloudpickle.dumps(obj)
+        uid = uuid.uuid4()
+        current_ip = self._get_current_ip()
+        ref = self._create_ref(uid, current_ip)
+
+        client = self._get_client_by_ip(current_ip)
+        if client.set(uid.__str__(), serialized):
+            return ref
+        else:
+            raise RuntimeError(f"Unable to write {ref} to cache")
+
+    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+        input = {}
+        result = []
+        current_ip = self._get_current_ip()
+        for obj in objects:
+            serialized = cloudpickle.dumps(obj)
+            uid = uuid.uuid4()
+            ref = self._create_ref(uid, current_ip)
+            input[uid.__str__()] = serialized
+            result.append(ref)
+
+        client = self._get_client_by_ip(current_ip)
+
+        if client.mset(input):
+            return result
+        else:
+            raise RuntimeError("Unable to update cache")
+
+    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+        result = []
+        uid_per_ip = defaultdict(lambda: [])
+
+        start = time.monotonic()
+        for ref in refs:
+            uid, ip = ref.split(self.SEPARATOR)
+            uid_per_ip[ip].append(uid)
+
+        for (ip, uids) in uid_per_ip.items():
+            client = self._get_client_by_ip(ip)
+            cache_result = client.mget(uids)
+            assert len(cache_result) == len(
+                uids
+            ), "Not all values were returned from cache"
+
+            total_bytes = 0
+
+            deserialize_start = time.monotonic()
+            for serialized in cache_result:
+                deserialized = cloudpickle.loads(serialized)
+                total_bytes += len(serialized)
+                result.append(deserialized)
+
+            deserialize_end = time.monotonic()
+            logger.debug(
+                f"The time taken to deserialize {total_bytes} bytes is: {deserialize_end - deserialize_start}",
+            )
+
+        end = time.monotonic()
+
+        logger.info(f"The total time taken to read all objects is: {end - start}")
+
+        return result
+
+    def get(self, ref: Any, *args, **kwargs) -> object:
+        uid, ip = ref.split(self.SEPARATOR)
+        client = self._get_client_by_ip(ip)
+        serialized = client.get(uid)
+        return cloudpickle.loads(serialized)
+
+    def _get_client_by_ip(self, ip_address: str):
+        if ip_address in self.client_cache:
+            return self.client_cache[ip_address]
+
+        base_client = redis.Redis(ip_address, 7777)
+
+        self.client_cache[ip_address] = base_client
+        return base_client
+
+    def _get_current_ip(self):
+        if self.current_ip is None:
+            self.current_ip = socket.gethostbyname(socket.gethostname())
+
+        return self.current_ip
+
+    def _create_ref(self, uid, ip):
+        return f"{uid}{self.SEPARATOR}{ip}"
deltacat/io/s3_object_store.py
ADDED
@@ -0,0 +1,44 @@
+import logging
+from ray import cloudpickle
+import time
+from deltacat.io.object_store import IObjectStore
+from typing import Any, List
+from deltacat import logs
+import uuid
+from deltacat.aws import s3u as s3_utils
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class S3ObjectStore(IObjectStore):
+    """
+    An implementation of object store that uses S3.
+    """
+
+    def __init__(self, bucket_prefix: str) -> None:
+        self.bucket = bucket_prefix
+        super().__init__()
+
+    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+        result = []
+        for obj in objects:
+            serialized = cloudpickle.dumps(obj)
+            ref = uuid.uuid4()
+
+            s3_utils.upload(f"s3://{self.bucket}/{ref}", serialized)
+            result.append(ref)
+
+        return result
+
+    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+        result = []
+        start = time.monotonic()
+        for ref in refs:
+            cur = s3_utils.download(f"s3://{self.bucket}/{ref}")
+            serialized = cur["Body"].read()
+            loaded = cloudpickle.loads(serialized)
+            result.append(loaded)
+        end = time.monotonic()
+
+        logger.info(f"The total time taken to read all objects is: {end - start}")
+        return result
deltacat/storage/model/delta.py
CHANGED
@@ -256,7 +256,8 @@ class Delta(dict):
 class DeltaLocator(Locator, dict):
     @staticmethod
     def of(
-        partition_locator: Optional[PartitionLocator]
+        partition_locator: Optional[PartitionLocator] = None,
+        stream_position: Optional[int] = None,
     ) -> DeltaLocator:
         """
         Creates a partition delta locator. Stream Position, if provided, should
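
With this change both parameters become optional and default to None, so a stream position can be supplied explicitly when constructing a locator. An illustrative call under that assumption (using the module path shown above):

from deltacat.storage.model.delta import DeltaLocator

# Both arguments now default to None; stream_position may be passed by keyword.
locator = DeltaLocator.of(partition_locator=None, stream_position=1688000000000)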
deltacat/tests/compactor/test_compact_partition_params.py
ADDED
@@ -0,0 +1,237 @@
+import json
+
+import unittest
+
+
+class TestCompactPartitionParams(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        from deltacat.types.media import ContentType
+
+        cls.VALID_COMPACT_PARTITION_PARAMS = {
+            "compaction_artifact_s3_bucket": "foobar",
+            "compacted_file_content_type": ContentType.PARQUET,
+            "deltacat_storage": "foobar",
+            "destination_partition_locator": {
+                "streamLocator": {
+                    "tableVersionLocator": {
+                        "tableLocator": {
+                            "namespaceLocator": {"namespace": "testNamespaceLocator"},
+                            "tableName": "TABLE_FOO",
+                        },
+                        "tableVersion": "1",
+                    },
+                    "streamId": "foobar",
+                    "storageType": "fooType",
+                },
+                "partitionValues": [],
+                "partitionId": None,
+            },
+            "hash_bucket_count": None,
+            "last_stream_position_to_compact": 168000000000,
+            "list_deltas_kwargs": {"equivalent_table_types": []},
+            "primary_keys": {"id"},
+            "properties": {
+                "parent_stream_position": "1688000000000",
+            },
+            "read_kwargs_provider": "foo",
+            "rebase_source_partition_high_watermark": 1688000000000,
+            "rebase_source_partition_locator": {
+                "stream_locator": {
+                    "table_version_locator": {
+                        "table_locator": {
+                            "namespace_locator": {"namespace": "testNamespaceLocator"},
+                            "tableName": "TABLE_FOO",
+                        },
+                        "table_version": "1",
+                    },
+                    "streamId": "foobar",
+                    "storageType": "fooType",
+                },
+                "partitionValues": [],
+                "partitionId": "79612ea39ac5493eae925abe60767d42",
+            },
+            "s3_table_writer_kwargs": {
+                "version": "1.0",
+                "flavor": "foobar",
+                "coerce_timestamps": "ms",
+            },
+            "source_partition_locator": {
+                "streamLocator": {
+                    "tableVersionLocator": {
+                        "tableLocator": {
+                            "namespaceLocator": {"namespace": "testNamespaceLocator"},
+                            "tableName": "TABLE_FOO",
+                        },
+                        "tableVersion": "2",
+                    },
+                    "streamId": "foobar",
+                    "storageType": "fooType",
+                },
+                "partitionValues": [],
+                "partitionId": "79612ea39ac5493eae925abe60767d42",
+            },
+        }
+
+        super().setUpClass()
+
+    def test_destination_partition_locator_is_optional(self):
+        from deltacat.compute.compactor.model.compact_partition_params import (
+            CompactPartitionParams,
+        )
+
+        params = CompactPartitionParams.of({})
+        assert params.destination_partition_locator is None
+
+    def test_serialize_returns_json_string(self):
+        from deltacat.compute.compactor.model.compact_partition_params import (
+            CompactPartitionParams,
+        )
+
+        params = CompactPartitionParams.of(
+            {"destination_partition_locator": "my-partition"}
+        )
+        serialized_params = params.serialize()
+        assert isinstance(serialized_params, str)
+        assert json.loads(serialized_params) == {
+            "compacted_file_content_type": None,
+            "compaction_artifact_s3_bucket": None,
+            "deltacat_storage": None,
+            "hash_bucket_count": None,
+            "last_stream_position_to_compact": None,
+            "list_deltas_kwargs": None,
+            "pg_config": None,
+            "primary_keys": None,
+            "properties": None,
+            "read_kwargs_provider": None,
+            "rebase_source_partition_high_watermark": None,
+            "rebase_source_partition_locator": None,
+            "s3_table_writer_kwargs": None,
+            "source_partition_locator": None,
+            "destination_partition_locator": "my-partition",
+        }
+
+    def test_serialize_returns_json_string_with_all_fields(self):
+        from deltacat.compute.compactor.model.compact_partition_params import (
+            CompactPartitionParams,
+        )
+
+        params = CompactPartitionParams.of(
+            TestCompactPartitionParams.VALID_COMPACT_PARTITION_PARAMS
+        )
+        serialized_params = params.serialize()
+        assert isinstance(serialized_params, str)
+        assert (
+            json.loads(serialized_params)["compacted_file_content_type"]
+            == params.compacted_file_content_type
+        )
+        assert (
+            json.loads(serialized_params)["compaction_artifact_s3_bucket"]
+            == params.compaction_artifact_s3_bucket
+        )
+        assert (
+            json.loads(serialized_params)["hash_bucket_count"]
+            == params.hash_bucket_count
+        )
+        assert (
+            json.loads(serialized_params)["last_stream_position_to_compact"]
+            == params.last_stream_position_to_compact
+        )
+        assert (
+            json.loads(serialized_params)["list_deltas_kwargs"]
+            == params.list_deltas_kwargs
+        )
+        assert json.loads(serialized_params)["primary_keys"] == params.primary_keys
+        assert json.loads(serialized_params)["properties"] == params.properties
+        assert (
+            json.loads(serialized_params)["rebase_source_partition_high_watermark"]
+            == params.rebase_source_partition_high_watermark
+        )
+        assert (
+            json.loads(serialized_params)["rebase_source_partition_locator"]
+            == params.rebase_source_partition_locator
+        )
+        assert (
+            json.loads(serialized_params)["source_partition_locator"]
+            == params.source_partition_locator
+        )
+        assert (
+            json.loads(serialized_params)["destination_partition_locator"]
+            == params.destination_partition_locator
+        )
+
+    def test_serialize_handles_sets(self):
+        from deltacat.compute.compactor.model.compact_partition_params import (
+            CompactPartitionParams,
+        )
+
+        params = CompactPartitionParams.of({"primary_keys": {"foo", "bar", "baz"}})
+        serialized_params = params.serialize()
+        self.assertCountEqual(
+            json.loads(serialized_params)["primary_keys"], ["foo", "bar", "baz"]
+        )
+
+    def test_serialize_handles_objects_with_toJSON_method(self):
+        from deltacat.compute.compactor.model.compact_partition_params import (
+            CompactPartitionParams,
+        )
+
+        class MyObject:
+            def toJSON(self) -> str:
+                return "my-json-object"
+
+        params = CompactPartitionParams.of({"compacted_file_content_type": MyObject()})
+        serialized_params = params.serialize()
+        assert (
+            json.loads(serialized_params)["compacted_file_content_type"]
+            == "my-json-object"
+        )
+
+    def test_json_handler_for_compact_partition_params_serializes_set_to_list(self):
+        from deltacat.compute.compactor.model.compact_partition_params import (
+            CompactPartitionParams,
+        )
+
+        my_set = {1, 2, 3}
+        json_string = json.dumps(
+            my_set,
+            default=CompactPartitionParams.json_handler_for_compact_partition_params,
+        )
+        assert json.loads(json_string) == [1, 2, 3]
+
+    def test_json_handler_for_compact_partition_params_serializes_object_with_toJSON_method_to_dict(
+        self,
+    ):
+        from dataclasses import dataclass
+
+        from deltacat.compute.compactor.model.compact_partition_params import (
+            CompactPartitionParams,
+        )
+
+        @dataclass
+        class DummyObject:
+            some_property: str = "foo"
+
+            def toJSON(self):
+                return self.__dict__
+
+        dummy_object = DummyObject()
+        json_string = json.dumps(
+            dummy_object,
+            default=CompactPartitionParams.json_handler_for_compact_partition_params,
+        )
+        assert json.loads(json_string) == {"some_property": "foo"}
+
+    def test_json_handler_for_compact_partition_params_returns_class_name_for_unknown_objects(
+        self,
+    ):
+        from deltacat.compute.compactor.model.compact_partition_params import (
+            CompactPartitionParams,
+        )
+
+        my_object = object()
+        json_string = json.dumps(
+            my_object,
+            default=CompactPartitionParams.json_handler_for_compact_partition_params,
+        )
+        assert json.loads(json_string) == "object"
deltacat/tests/compactor/utils/test_io.py
CHANGED
@@ -9,8 +9,18 @@ class TestFitInputDeltas(unittest.TestCase):
         cls.module_patcher = mock.patch.dict("sys.modules", {"ray": mock.MagicMock()})
         cls.module_patcher.start()
 
+        from deltacat.compute.compactor.model.compaction_session_audit_info import (
+            CompactionSessionAuditInfo,
+        )
+
+        cls.COMPACTION_AUDIT = CompactionSessionAuditInfo("1.0", "test")
+
         super().setUpClass()
 
+    @classmethod
+    def tearDownClass(cls) -> None:
+        cls.module_patcher.stop()
+
     def test_sanity(self):
         from deltacat.compute.compactor.utils import io
 
@@ -19,12 +29,18 @@ class TestFitInputDeltas(unittest.TestCase):
             hash_bucket_count,
             high_watermark,
             require_multiple_rounds,
-        ) = io.fit_input_deltas(
+        ) = io.fit_input_deltas(
+            [TEST_DELTA], {"CPU": 1, "memory": 20000000}, self.COMPACTION_AUDIT, None
+        )
 
         self.assertIsNotNone(hash_bucket_count)
         self.assertTrue(1, len(delta_list))
         self.assertIsNotNone(high_watermark)
         self.assertFalse(require_multiple_rounds)
+        self.assertIsNotNone(hash_bucket_count, self.COMPACTION_AUDIT.hash_bucket_count)
+        self.assertIsNotNone(self.COMPACTION_AUDIT.input_file_count)
+        self.assertIsNotNone(self.COMPACTION_AUDIT.input_size_bytes)
+        self.assertIsNotNone(self.COMPACTION_AUDIT.total_cluster_memory_bytes)
 
     def test_when_hash_bucket_count_overridden(self):
         from deltacat.compute.compactor.utils import io
@@ -34,7 +50,9 @@ class TestFitInputDeltas(unittest.TestCase):
             hash_bucket_count,
             high_watermark,
             require_multiple_rounds,
-        ) = io.fit_input_deltas(
+        ) = io.fit_input_deltas(
+            [TEST_DELTA], {"CPU": 1, "memory": 20000000}, self.COMPACTION_AUDIT, 20
+        )
 
         self.assertEqual(20, hash_bucket_count)
         self.assertEqual(1, len(delta_list))
@@ -49,7 +67,9 @@ class TestFitInputDeltas(unittest.TestCase):
             hash_bucket_count,
             high_watermark,
             require_multiple_rounds,
-        ) = io.fit_input_deltas(
+        ) = io.fit_input_deltas(
+            [TEST_DELTA], {"CPU": 2, "memory": 10}, self.COMPACTION_AUDIT, 20
+        )
 
         self.assertIsNotNone(hash_bucket_count)
         self.assertTrue(2, len(delta_list))
@@ -60,10 +80,12 @@ class TestFitInputDeltas(unittest.TestCase):
         from deltacat.compute.compactor.utils import io
 
         with self.assertRaises(AssertionError):
-            io.fit_input_deltas(
+            io.fit_input_deltas(
+                [], {"CPU": 100, "memory": 20000.0}, self.COMPACTION_AUDIT, None
+            )
 
     def test_when_cpu_resources_is_not_passed(self):
         from deltacat.compute.compactor.utils import io
 
         with self.assertRaises(KeyError):
-            io.fit_input_deltas([], {}, None)
+            io.fit_input_deltas([], {}, self.COMPACTION_AUDIT, None)
deltacat/tests/io/__init__.py
File without changes
deltacat/tests/io/test_file_object_store.py
ADDED
@@ -0,0 +1,86 @@
+import unittest
+from unittest import mock
+
+
+class TestFileObjectStore(unittest.TestCase):
+
+    TEST_VALUE = "test-value"
+
+    @classmethod
+    def setUpClass(cls):
+        cls.ray_mock = mock.MagicMock()
+        cls.os_mock = mock.MagicMock()
+
+        cls.module_patcher = mock.patch.dict(
+            "sys.modules", {"ray": cls.ray_mock, "os": cls.os_mock}
+        )
+        cls.module_patcher.start()
+
+        super().setUpClass()
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        cls.module_patcher.stop()
+
+    @mock.patch(
+        "deltacat.io.file_object_store.open",
+        new_callable=mock.mock_open,
+        read_data="data",
+    )
+    def test_put_many_sanity(self, mock_file):
+        from deltacat.io.file_object_store import FileObjectStore
+
+        object_store = FileObjectStore(dir_path="")
+        self.ray_mock.cloudpickle.dumps.return_value = self.TEST_VALUE
+        result = object_store.put_many(["a", "b"])
+
+        self.assertEqual(2, len(result))
+        self.assertEqual(2, mock_file.call_count)
+
+    @mock.patch(
+        "deltacat.io.file_object_store.open",
+        new_callable=mock.mock_open,
+        read_data="data",
+    )
+    def test_get_many_sanity(self, mock_file):
+        from deltacat.io.file_object_store import FileObjectStore
+
+        object_store = FileObjectStore(dir_path="")
+        self.ray_mock.cloudpickle.loads.return_value = self.TEST_VALUE
+
+        result = object_store.get_many(["test", "test"])
+
+        self.assertEqual(2, len(result))
+        self.assertEqual(2, mock_file.call_count)
+
+    @mock.patch(
+        "deltacat.io.file_object_store.open",
+        new_callable=mock.mock_open,
+        read_data="data",
+    )
+    def test_get_sanity(self, mock_file):
+        from deltacat.io.file_object_store import FileObjectStore
+
+        object_store = FileObjectStore(dir_path="")
+        self.ray_mock.cloudpickle.loads.return_value = self.TEST_VALUE
+
+        result = object_store.get("test")
+
+        self.assertEqual(self.TEST_VALUE, result)
+        self.assertEqual(1, mock_file.call_count)
+
+    @mock.patch(
+        "deltacat.io.file_object_store.open",
+        new_callable=mock.mock_open,
+        read_data="data",
+    )
+    def test_put_sanity(self, mock_file):
+        from deltacat.io.file_object_store import FileObjectStore
+
+        object_store = FileObjectStore(dir_path="")
+        self.ray_mock.cloudpickle.dumps.return_value = self.TEST_VALUE
+
+        result = object_store.put("test")
+
+        self.assertIsNotNone(result)
+        self.assertEqual(1, mock_file.call_count)