deltacat 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +12 -2
- deltacat/aws/constants.py +5 -1
- deltacat/aws/s3u.py +8 -1
- deltacat/compute/compactor/model/compact_partition_params.py +24 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +11 -0
- deltacat/compute/compactor_v2/compaction_session.py +44 -6
- deltacat/compute/compactor_v2/constants.py +28 -0
- deltacat/compute/compactor_v2/deletes/utils.py +3 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat/compute/compactor_v2/model/merge_input.py +6 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +11 -3
- deltacat/compute/compactor_v2/steps/merge.py +35 -6
- deltacat/compute/compactor_v2/utils/io.py +3 -0
- deltacat/compute/compactor_v2/utils/merge.py +3 -0
- deltacat/compute/compactor_v2/utils/task_options.py +94 -8
- deltacat/io/memcached_object_store.py +20 -0
- deltacat/logs.py +29 -2
- deltacat/tests/compute/test_compact_partition_params.py +5 -0
- deltacat/tests/io/test_memcached_object_store.py +19 -0
- deltacat/tests/utils/test_metrics.py +575 -0
- deltacat/utils/metrics.py +158 -23
- deltacat/utils/resources.py +5 -3
- {deltacat-1.1.0.dist-info → deltacat-1.1.2.dist-info}/METADATA +1 -1
- {deltacat-1.1.0.dist-info → deltacat-1.1.2.dist-info}/RECORD +28 -27
- {deltacat-1.1.0.dist-info → deltacat-1.1.2.dist-info}/LICENSE +0 -0
- {deltacat-1.1.0.dist-info → deltacat-1.1.2.dist-info}/WHEEL +0 -0
- {deltacat-1.1.0.dist-info → deltacat-1.1.2.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,10 @@
|
|
1
1
|
import botocore
|
2
2
|
import logging
|
3
|
-
from typing import Dict, Optional, List, Tuple
|
3
|
+
from typing import Dict, Optional, List, Tuple, Any
|
4
4
|
from deltacat import logs
|
5
|
+
from deltacat.compute.compactor_v2.model.merge_file_group import (
|
6
|
+
LocalMergeFileGroupsProvider,
|
7
|
+
)
|
5
8
|
from deltacat.types.media import ContentEncoding, ContentType
|
6
9
|
from deltacat.types.partial_download import PartialParquetParameters
|
7
10
|
from deltacat.storage import (
|
@@ -15,7 +18,6 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
|
|
15
18
|
hash_group_index_to_hash_bucket_indices,
|
16
19
|
)
|
17
20
|
from deltacat.compute.compactor_v2.constants import (
|
18
|
-
TOTAL_MEMORY_BUFFER_PERCENTAGE,
|
19
21
|
PARQUET_TO_PYARROW_INFLATION,
|
20
22
|
)
|
21
23
|
|
@@ -133,8 +135,10 @@ def hash_bucket_resource_options_provider(
|
|
133
135
|
item: DeltaAnnotated,
|
134
136
|
previous_inflation: float,
|
135
137
|
average_record_size_bytes: float,
|
138
|
+
total_memory_buffer_percentage: int,
|
136
139
|
primary_keys: List[str] = None,
|
137
140
|
ray_custom_resources: Optional[Dict] = None,
|
141
|
+
memory_logs_enabled: Optional[bool] = None,
|
138
142
|
**kwargs,
|
139
143
|
) -> Dict:
|
140
144
|
debug_memory_params = {"hash_bucket_task_index": index}
|
@@ -189,10 +193,11 @@ def hash_bucket_resource_options_provider(
|
|
189
193
|
debug_memory_params["average_record_size_bytes"] = average_record_size_bytes
|
190
194
|
|
191
195
|
# Consider buffer
|
192
|
-
total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
|
196
|
+
total_memory = total_memory * (1 + total_memory_buffer_percentage / 100.0)
|
193
197
|
debug_memory_params["total_memory_with_buffer"] = total_memory
|
194
|
-
logger.debug(
|
195
|
-
f"[Hash bucket task {index}]: Params used for calculating hash bucketing memory: {debug_memory_params}"
|
198
|
+
logger.debug_conditional(
|
199
|
+
f"[Hash bucket task {index}]: Params used for calculating hash bucketing memory: {debug_memory_params}",
|
200
|
+
memory_logs_enabled,
|
196
201
|
)
|
197
202
|
|
198
203
|
return get_task_options(0.01, total_memory, ray_custom_resources)
|
@@ -204,12 +209,14 @@ def merge_resource_options_provider(
|
|
204
209
|
num_hash_groups: int,
|
205
210
|
hash_group_size_bytes: Dict[int, int],
|
206
211
|
hash_group_num_rows: Dict[int, int],
|
212
|
+
total_memory_buffer_percentage: int,
|
207
213
|
round_completion_info: Optional[RoundCompletionInfo] = None,
|
208
214
|
compacted_delta_manifest: Optional[Manifest] = None,
|
209
215
|
ray_custom_resources: Optional[Dict] = None,
|
210
216
|
primary_keys: Optional[List[str]] = None,
|
211
217
|
deltacat_storage=unimplemented_deltacat_storage,
|
212
218
|
deltacat_storage_kwargs: Optional[Dict] = {},
|
219
|
+
memory_logs_enabled: Optional[bool] = None,
|
213
220
|
**kwargs,
|
214
221
|
) -> Dict:
|
215
222
|
debug_memory_params = {"merge_task_index": index}
|
@@ -224,6 +231,84 @@ def merge_resource_options_provider(
|
|
224
231
|
pk_size_bytes = data_size
|
225
232
|
incremental_index_array_size = num_rows * 4
|
226
233
|
|
234
|
+
return get_merge_task_options(
|
235
|
+
index,
|
236
|
+
hb_group_idx,
|
237
|
+
data_size,
|
238
|
+
pk_size_bytes,
|
239
|
+
num_rows,
|
240
|
+
num_hash_groups,
|
241
|
+
total_memory_buffer_percentage,
|
242
|
+
incremental_index_array_size,
|
243
|
+
debug_memory_params,
|
244
|
+
ray_custom_resources,
|
245
|
+
round_completion_info=round_completion_info,
|
246
|
+
compacted_delta_manifest=compacted_delta_manifest,
|
247
|
+
primary_keys=primary_keys,
|
248
|
+
deltacat_storage=deltacat_storage,
|
249
|
+
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
250
|
+
memory_logs_enabled=memory_logs_enabled,
|
251
|
+
)
|
252
|
+
|
253
|
+
|
254
|
+
def local_merge_resource_options_provider(
|
255
|
+
estimated_da_size: float,
|
256
|
+
estimated_num_rows: int,
|
257
|
+
total_memory_buffer_percentage: int,
|
258
|
+
round_completion_info: Optional[RoundCompletionInfo] = None,
|
259
|
+
compacted_delta_manifest: Optional[Manifest] = None,
|
260
|
+
ray_custom_resources: Optional[Dict] = None,
|
261
|
+
primary_keys: Optional[List[str]] = None,
|
262
|
+
deltacat_storage=unimplemented_deltacat_storage,
|
263
|
+
deltacat_storage_kwargs: Optional[Dict] = {},
|
264
|
+
memory_logs_enabled: Optional[bool] = None,
|
265
|
+
**kwargs,
|
266
|
+
) -> Dict:
|
267
|
+
index = hb_group_idx = LocalMergeFileGroupsProvider.LOCAL_HASH_BUCKET_INDEX
|
268
|
+
debug_memory_params = {"merge_task_index": index}
|
269
|
+
|
270
|
+
# upper bound for pk size of incremental
|
271
|
+
pk_size_bytes = estimated_da_size
|
272
|
+
incremental_index_array_size = estimated_num_rows * 4
|
273
|
+
|
274
|
+
return get_merge_task_options(
|
275
|
+
index=index,
|
276
|
+
hb_group_idx=hb_group_idx,
|
277
|
+
data_size=estimated_da_size,
|
278
|
+
pk_size_bytes=pk_size_bytes,
|
279
|
+
num_rows=estimated_num_rows,
|
280
|
+
num_hash_groups=1,
|
281
|
+
incremental_index_array_size=incremental_index_array_size,
|
282
|
+
total_memory_buffer_percentage=total_memory_buffer_percentage,
|
283
|
+
debug_memory_params=debug_memory_params,
|
284
|
+
ray_custom_resources=ray_custom_resources,
|
285
|
+
round_completion_info=round_completion_info,
|
286
|
+
compacted_delta_manifest=compacted_delta_manifest,
|
287
|
+
primary_keys=primary_keys,
|
288
|
+
deltacat_storage=deltacat_storage,
|
289
|
+
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
290
|
+
memory_logs_enabled=memory_logs_enabled,
|
291
|
+
)
|
292
|
+
|
293
|
+
|
294
|
+
def get_merge_task_options(
|
295
|
+
index: int,
|
296
|
+
hb_group_idx: int,
|
297
|
+
data_size: float,
|
298
|
+
pk_size_bytes: float,
|
299
|
+
num_rows: int,
|
300
|
+
num_hash_groups: int,
|
301
|
+
total_memory_buffer_percentage: int,
|
302
|
+
incremental_index_array_size: int,
|
303
|
+
debug_memory_params: Dict[str, Any],
|
304
|
+
ray_custom_resources: Optional[Dict],
|
305
|
+
round_completion_info: Optional[RoundCompletionInfo] = None,
|
306
|
+
compacted_delta_manifest: Optional[Manifest] = None,
|
307
|
+
primary_keys: Optional[List[str]] = None,
|
308
|
+
deltacat_storage=unimplemented_deltacat_storage,
|
309
|
+
deltacat_storage_kwargs: Optional[Dict] = {},
|
310
|
+
memory_logs_enabled: Optional[bool] = None,
|
311
|
+
) -> Dict[str, Any]:
|
227
312
|
if (
|
228
313
|
round_completion_info
|
229
314
|
and compacted_delta_manifest
|
@@ -296,10 +381,11 @@ def merge_resource_options_provider(
|
|
296
381
|
debug_memory_params["incremental_index_array_size"] = incremental_index_array_size
|
297
382
|
debug_memory_params["total_memory"] = total_memory
|
298
383
|
|
299
|
-
total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
|
384
|
+
total_memory = total_memory * (1 + total_memory_buffer_percentage / 100.0)
|
300
385
|
debug_memory_params["total_memory_with_buffer"] = total_memory
|
301
|
-
logger.debug(
|
302
|
-
f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}"
|
386
|
+
logger.debug_conditional(
|
387
|
+
f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
|
388
|
+
memory_logs_enabled,
|
303
389
|
)
|
304
390
|
|
305
391
|
return get_task_options(0.01, total_memory, ray_custom_resources)
|
@@ -181,15 +181,35 @@ class MemcachedObjectStore(IObjectStore):
|
|
181
181
|
for chunk_index in range(chunk_count):
|
182
182
|
ref = self._create_ref(uid, ip, chunk_index)
|
183
183
|
chunk = client.get(ref)
|
184
|
+
if chunk is None:
|
185
|
+
raise ValueError(
|
186
|
+
f"Expected uid: {uid}, chunk index: {chunk_index} from client ip: {ip}"
|
187
|
+
f" to be non-empty."
|
188
|
+
)
|
184
189
|
serialized.extend(chunk)
|
185
190
|
|
186
191
|
return cloudpickle.loads(serialized)
|
187
192
|
|
193
|
+
def clear(self) -> bool:
|
194
|
+
flushed = all(
|
195
|
+
[
|
196
|
+
self._get_client_by_ip(ip).flush_all(noreply=False)
|
197
|
+
for ip in self.storage_node_ips
|
198
|
+
]
|
199
|
+
)
|
200
|
+
self.client_cache.clear()
|
201
|
+
|
202
|
+
if flushed:
|
203
|
+
logger.info("Successfully cleared cache contents.")
|
204
|
+
|
205
|
+
return flushed
|
206
|
+
|
188
207
|
def close(self) -> None:
|
189
208
|
for client in self.client_cache.values():
|
190
209
|
client.close()
|
191
210
|
|
192
211
|
self.client_cache.clear()
|
212
|
+
logger.info("Successfully closed object store clients.")
|
193
213
|
|
194
214
|
def _create_ref(self, uid, ip, chunk_index) -> str:
|
195
215
|
return f"{uid}{self.SEPARATOR}{ip}{self.SEPARATOR}{chunk_index}"
|
deltacat/logs.py
CHANGED
@@ -2,7 +2,7 @@ import logging
|
|
2
2
|
import os
|
3
3
|
import pathlib
|
4
4
|
from logging import FileHandler, Handler, Logger, LoggerAdapter, handlers
|
5
|
-
from typing import Union
|
5
|
+
from typing import Any, Dict, Optional, Union
|
6
6
|
|
7
7
|
import ray
|
8
8
|
from ray.runtime_context import RuntimeContext
|
@@ -26,7 +26,32 @@ DEFAULT_MAX_BYTES_PER_LOG = 2 ^ 20 * 256 # 256 MiB
|
|
26
26
|
DEFAULT_BACKUP_COUNT = 0
|
27
27
|
|
28
28
|
|
29
|
-
class RayRuntimeContextLoggerAdapter(logging.LoggerAdapter):
|
29
|
+
class DeltaCATLoggerAdapter(logging.LoggerAdapter):
|
30
|
+
"""
|
31
|
+
Logger Adapter class with additional functionality
|
32
|
+
"""
|
33
|
+
|
34
|
+
def __init__(self, logger: Logger, extra: Optional[Dict[str, Any]] = {}):
|
35
|
+
super().__init__(logger, extra)
|
36
|
+
|
37
|
+
def debug_conditional(self, msg, do_print: bool, *args, **kwargs):
|
38
|
+
if do_print:
|
39
|
+
self.debug(msg, *args, **kwargs)
|
40
|
+
|
41
|
+
def info_conditional(self, msg, do_print: bool, *args, **kwargs):
|
42
|
+
if do_print:
|
43
|
+
self.info(msg, *args, **kwargs)
|
44
|
+
|
45
|
+
def warning_conditional(self, msg, do_print: bool, *args, **kwargs):
|
46
|
+
if do_print:
|
47
|
+
self.warning(msg, *args, **kwargs)
|
48
|
+
|
49
|
+
def error_conditional(self, msg, do_print: bool, *args, **kwargs):
|
50
|
+
if do_print:
|
51
|
+
self.error(msg, *args, **kwargs)
|
52
|
+
|
53
|
+
|
54
|
+
class RayRuntimeContextLoggerAdapter(DeltaCATLoggerAdapter):
|
30
55
|
"""
|
31
56
|
Logger Adapter for injecting Ray Runtime Context into logging messages.
|
32
57
|
"""
|
@@ -147,6 +172,8 @@ def _configure_logger(
|
|
147
172
|
ray_runtime_ctx = ray.get_runtime_context()
|
148
173
|
if ray_runtime_ctx.worker.connected:
|
149
174
|
logger = RayRuntimeContextLoggerAdapter(logger, ray_runtime_ctx)
|
175
|
+
else:
|
176
|
+
logger = DeltaCATLoggerAdapter(logger)
|
150
177
|
|
151
178
|
return logger
|
152
179
|
|
@@ -72,6 +72,7 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
72
72
|
"partitionValues": [],
|
73
73
|
"partitionId": "79612ea39ac5493eae925abe60767d42",
|
74
74
|
},
|
75
|
+
"memory_logs_enabled": True,
|
75
76
|
"metrics_config": MetricsConfig("us-east-1", MetricsTarget.CLOUDWATCH_EMF),
|
76
77
|
}
|
77
78
|
|
@@ -135,6 +136,10 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
135
136
|
json.loads(serialized_params)["destination_partition_locator"]
|
136
137
|
== params.destination_partition_locator
|
137
138
|
)
|
139
|
+
assert (
|
140
|
+
json.loads(serialized_params)["memory_logs_enabled"]
|
141
|
+
== params.memory_logs_enabled
|
142
|
+
)
|
138
143
|
assert (
|
139
144
|
json.loads(serialized_params)["metrics_config"]["metrics_target"]
|
140
145
|
== params.metrics_config.metrics_target
|
@@ -25,6 +25,10 @@ class MockPyMemcacheClient:
|
|
25
25
|
def get(self, key, *args, **kwargs):
|
26
26
|
return self.store.get(key)
|
27
27
|
|
28
|
+
def flush_all(self, *args, **kwargs):
|
29
|
+
for key, value in self.store.items():
|
30
|
+
self.store[key] = None
|
31
|
+
|
28
32
|
|
29
33
|
class TestMemcachedObjectStore(unittest.TestCase):
|
30
34
|
|
@@ -192,3 +196,18 @@ class TestMemcachedObjectStore(unittest.TestCase):
|
|
192
196
|
# assert
|
193
197
|
result = self.object_store.get(ref)
|
194
198
|
self.assertEqual(result, self.TEST_VALUE_LARGE)
|
199
|
+
|
200
|
+
@mock.patch("deltacat.io.memcached_object_store.Client")
|
201
|
+
@mock.patch("deltacat.io.memcached_object_store.RetryingClient")
|
202
|
+
def test_clear_sanity(self, mock_retrying_client, mock_client):
|
203
|
+
# setup
|
204
|
+
mock_client.return_value = MockPyMemcacheClient()
|
205
|
+
mock_retrying_client.return_value = mock_client.return_value
|
206
|
+
|
207
|
+
# action
|
208
|
+
ref = self.object_store.put(self.TEST_VALUE_LARGE)
|
209
|
+
self.object_store.clear()
|
210
|
+
|
211
|
+
# assert
|
212
|
+
with self.assertRaises(ValueError):
|
213
|
+
self.object_store.get(ref)
|