deltacat 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a public registry. It is provided for informational purposes only.
@@ -1,7 +1,10 @@
 import botocore
 import logging
-from typing import Dict, Optional, List, Tuple
+from typing import Dict, Optional, List, Tuple, Any
 from deltacat import logs
+from deltacat.compute.compactor_v2.model.merge_file_group import (
+    LocalMergeFileGroupsProvider,
+)
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from deltacat.storage import (
@@ -15,7 +18,6 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
     hash_group_index_to_hash_bucket_indices,
 )
 from deltacat.compute.compactor_v2.constants import (
-    TOTAL_MEMORY_BUFFER_PERCENTAGE,
     PARQUET_TO_PYARROW_INFLATION,
 )
 
@@ -133,8 +135,10 @@ def hash_bucket_resource_options_provider(
     item: DeltaAnnotated,
     previous_inflation: float,
     average_record_size_bytes: float,
+    total_memory_buffer_percentage: int,
     primary_keys: List[str] = None,
     ray_custom_resources: Optional[Dict] = None,
+    memory_logs_enabled: Optional[bool] = None,
     **kwargs,
 ) -> Dict:
     debug_memory_params = {"hash_bucket_task_index": index}
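Note: total_memory_buffer_percentage is a new required parameter and memory_logs_enabled a new optional one, so callers that bind this provider as a Ray task-options callback must now supply them. A minimal sketch, assuming the provider is imported from the module shown in this diff (the viewer does not display its path) and bound with functools.partial; all values are illustrative:

    import functools

    hb_options_provider = functools.partial(
        hash_bucket_resource_options_provider,
        previous_inflation=7.0,  # illustrative value
        average_record_size_bytes=512.0,  # illustrative value
        total_memory_buffer_percentage=30,  # new required argument in 1.1.2
        primary_keys=["id"],  # illustrative value
        memory_logs_enabled=True,  # new optional flag
    )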
@@ -189,10 +193,11 @@ def hash_bucket_resource_options_provider(
     debug_memory_params["average_record_size_bytes"] = average_record_size_bytes
 
     # Consider buffer
-    total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+    total_memory = total_memory * (1 + total_memory_buffer_percentage / 100.0)
     debug_memory_params["total_memory_with_buffer"] = total_memory
-    logger.debug(
-        f"[Hash bucket task {index}]: Params used for calculating hash bucketing memory: {debug_memory_params}"
+    logger.debug_conditional(
+        f"[Hash bucket task {index}]: Params used for calculating hash bucketing memory: {debug_memory_params}",
+        memory_logs_enabled,
     )
 
     return get_task_options(0.01, total_memory, ray_custom_resources)
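The memory buffer applied above is now caller-tunable instead of being fixed by the removed TOTAL_MEMORY_BUFFER_PERCENTAGE constant. The arithmetic, with illustrative numbers:

    total_memory = 10 * 1024**3  # 10 GiB estimated for the hash bucket task
    total_memory_buffer_percentage = 30  # caller-supplied buffer
    total_memory = total_memory * (1 + total_memory_buffer_percentage / 100.0)
    assert total_memory == 13 * 1024**3  # 13 GiB requested from Ray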
@@ -204,12 +209,14 @@ def merge_resource_options_provider(
     num_hash_groups: int,
     hash_group_size_bytes: Dict[int, int],
     hash_group_num_rows: Dict[int, int],
+    total_memory_buffer_percentage: int,
     round_completion_info: Optional[RoundCompletionInfo] = None,
     compacted_delta_manifest: Optional[Manifest] = None,
     ray_custom_resources: Optional[Dict] = None,
     primary_keys: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict] = {},
+    memory_logs_enabled: Optional[bool] = None,
     **kwargs,
 ) -> Dict:
     debug_memory_params = {"merge_task_index": index}
@@ -224,6 +231,84 @@ def merge_resource_options_provider(
     pk_size_bytes = data_size
     incremental_index_array_size = num_rows * 4
 
+    return get_merge_task_options(
+        index,
+        hb_group_idx,
+        data_size,
+        pk_size_bytes,
+        num_rows,
+        num_hash_groups,
+        total_memory_buffer_percentage,
+        incremental_index_array_size,
+        debug_memory_params,
+        ray_custom_resources,
+        round_completion_info=round_completion_info,
+        compacted_delta_manifest=compacted_delta_manifest,
+        primary_keys=primary_keys,
+        deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
+        memory_logs_enabled=memory_logs_enabled,
+    )
+
+
+def local_merge_resource_options_provider(
+    estimated_da_size: float,
+    estimated_num_rows: int,
+    total_memory_buffer_percentage: int,
+    round_completion_info: Optional[RoundCompletionInfo] = None,
+    compacted_delta_manifest: Optional[Manifest] = None,
+    ray_custom_resources: Optional[Dict] = None,
+    primary_keys: Optional[List[str]] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict] = {},
+    memory_logs_enabled: Optional[bool] = None,
+    **kwargs,
+) -> Dict:
+    index = hb_group_idx = LocalMergeFileGroupsProvider.LOCAL_HASH_BUCKET_INDEX
+    debug_memory_params = {"merge_task_index": index}
+
+    # upper bound for pk size of incremental
+    pk_size_bytes = estimated_da_size
+    incremental_index_array_size = estimated_num_rows * 4
+
+    return get_merge_task_options(
+        index=index,
+        hb_group_idx=hb_group_idx,
+        data_size=estimated_da_size,
+        pk_size_bytes=pk_size_bytes,
+        num_rows=estimated_num_rows,
+        num_hash_groups=1,
+        incremental_index_array_size=incremental_index_array_size,
+        total_memory_buffer_percentage=total_memory_buffer_percentage,
+        debug_memory_params=debug_memory_params,
+        ray_custom_resources=ray_custom_resources,
+        round_completion_info=round_completion_info,
+        compacted_delta_manifest=compacted_delta_manifest,
+        primary_keys=primary_keys,
+        deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
+        memory_logs_enabled=memory_logs_enabled,
+    )
+
+
+def get_merge_task_options(
+    index: int,
+    hb_group_idx: int,
+    data_size: float,
+    pk_size_bytes: float,
+    num_rows: int,
+    num_hash_groups: int,
+    total_memory_buffer_percentage: int,
+    incremental_index_array_size: int,
+    debug_memory_params: Dict[str, Any],
+    ray_custom_resources: Optional[Dict],
+    round_completion_info: Optional[RoundCompletionInfo] = None,
+    compacted_delta_manifest: Optional[Manifest] = None,
+    primary_keys: Optional[List[str]] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict] = {},
+    memory_logs_enabled: Optional[bool] = None,
+) -> Dict[str, Any]:
     if (
         round_completion_info
         and compacted_delta_manifest
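This hunk extracts the shared memory-estimation logic into get_merge_task_options; merge_resource_options_provider and the new local_merge_resource_options_provider both delegate to it, with the local variant using LocalMergeFileGroupsProvider.LOCAL_HASH_BUCKET_INDEX as both the task index and hash bucket group index, and a single hash group. A hedged usage sketch of the local variant; the sizes are invented and the storage arguments are left at their defaults:

    opts = local_merge_resource_options_provider(
        estimated_da_size=2 * 1024**3,  # ~2 GiB of incremental delta data (invented)
        estimated_num_rows=50_000_000,  # invented
        total_memory_buffer_percentage=30,
        memory_logs_enabled=True,
    )
    # opts is a Ray task-options dict (fractional CPU, memory, custom resources).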
@@ -296,10 +381,11 @@ def merge_resource_options_provider(
     debug_memory_params["incremental_index_array_size"] = incremental_index_array_size
     debug_memory_params["total_memory"] = total_memory
 
-    total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+    total_memory = total_memory * (1 + total_memory_buffer_percentage / 100.0)
     debug_memory_params["total_memory_with_buffer"] = total_memory
-    logger.debug(
-        f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}"
+    logger.debug_conditional(
+        f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
+        memory_logs_enabled,
     )
 
     return get_task_options(0.01, total_memory, ray_custom_resources)
@@ -181,15 +181,35 @@ class MemcachedObjectStore(IObjectStore):
         for chunk_index in range(chunk_count):
             ref = self._create_ref(uid, ip, chunk_index)
             chunk = client.get(ref)
+            if chunk is None:
+                raise ValueError(
+                    f"Expected uid: {uid}, chunk index: {chunk_index} from client ip: {ip}"
+                    f" to be non-empty."
+                )
             serialized.extend(chunk)
 
         return cloudpickle.loads(serialized)
 
+    def clear(self) -> bool:
+        flushed = all(
+            [
+                self._get_client_by_ip(ip).flush_all(noreply=False)
+                for ip in self.storage_node_ips
+            ]
+        )
+        self.client_cache.clear()
+
+        if flushed:
+            logger.info("Successfully cleared cache contents.")
+
+        return flushed
+
     def close(self) -> None:
         for client in self.client_cache.values():
             client.close()
 
         self.client_cache.clear()
+        logger.info("Successfully closed object store clients.")
 
     def _create_ref(self, uid, ip, chunk_index) -> str:
         return f"{uid}{self.SEPARATOR}{ip}{self.SEPARATOR}{chunk_index}"
deltacat/logs.py CHANGED
@@ -2,7 +2,7 @@ import logging
 import os
 import pathlib
 from logging import FileHandler, Handler, Logger, LoggerAdapter, handlers
-from typing import Union
+from typing import Any, Dict, Optional, Union
 
 import ray
 from ray.runtime_context import RuntimeContext
@@ -26,7 +26,32 @@ DEFAULT_MAX_BYTES_PER_LOG = 2 ^ 20 * 256  # 256 MiB
 DEFAULT_BACKUP_COUNT = 0
 
 
-class RayRuntimeContextLoggerAdapter(logging.LoggerAdapter):
+class DeltaCATLoggerAdapter(logging.LoggerAdapter):
+    """
+    Logger Adapter class with additional functionality
+    """
+
+    def __init__(self, logger: Logger, extra: Optional[Dict[str, Any]] = {}):
+        super().__init__(logger, extra)
+
+    def debug_conditional(self, msg, do_print: bool, *args, **kwargs):
+        if do_print:
+            self.debug(msg, *args, **kwargs)
+
+    def info_conditional(self, msg, do_print: bool, *args, **kwargs):
+        if do_print:
+            self.info(msg, *args, **kwargs)
+
+    def warning_conditional(self, msg, do_print: bool, *args, **kwargs):
+        if do_print:
+            self.warning(msg, *args, **kwargs)
+
+    def error_conditional(self, msg, do_print: bool, *args, **kwargs):
+        if do_print:
+            self.error(msg, *args, **kwargs)
+
+
+class RayRuntimeContextLoggerAdapter(DeltaCATLoggerAdapter):
     """
     Logger Adapter for injecting Ray Runtime Context into logging messages.
     """
@@ -147,6 +172,8 @@ def _configure_logger(
         ray_runtime_ctx = ray.get_runtime_context()
         if ray_runtime_ctx.worker.connected:
             logger = RayRuntimeContextLoggerAdapter(logger, ray_runtime_ctx)
+    else:
+        logger = DeltaCATLoggerAdapter(logger)
 
     return logger
 
@@ -72,6 +72,7 @@ class TestCompactPartitionParams(unittest.TestCase):
                 "partitionValues": [],
                 "partitionId": "79612ea39ac5493eae925abe60767d42",
             },
+            "memory_logs_enabled": True,
             "metrics_config": MetricsConfig("us-east-1", MetricsTarget.CLOUDWATCH_EMF),
         }
 
@@ -135,6 +136,10 @@ class TestCompactPartitionParams(unittest.TestCase):
             json.loads(serialized_params)["destination_partition_locator"]
             == params.destination_partition_locator
         )
+        assert (
+            json.loads(serialized_params)["memory_logs_enabled"]
+            == params.memory_logs_enabled
+        )
         assert (
             json.loads(serialized_params)["metrics_config"]["metrics_target"]
             == params.metrics_config.metrics_target
@@ -25,6 +25,10 @@ class MockPyMemcacheClient:
     def get(self, key, *args, **kwargs):
         return self.store.get(key)
 
+    def flush_all(self, *args, **kwargs):
+        for key, value in self.store.items():
+            self.store[key] = None
+
 
 class TestMemcachedObjectStore(unittest.TestCase):
 
@@ -192,3 +196,18 @@ class TestMemcachedObjectStore(unittest.TestCase):
         # assert
         result = self.object_store.get(ref)
         self.assertEqual(result, self.TEST_VALUE_LARGE)
+
+    @mock.patch("deltacat.io.memcached_object_store.Client")
+    @mock.patch("deltacat.io.memcached_object_store.RetryingClient")
+    def test_clear_sanity(self, mock_retrying_client, mock_client):
+        # setup
+        mock_client.return_value = MockPyMemcacheClient()
+        mock_retrying_client.return_value = mock_client.return_value
+
+        # action
+        ref = self.object_store.put(self.TEST_VALUE_LARGE)
+        self.object_store.clear()
+
+        # assert
+        with self.assertRaises(ValueError):
+            self.object_store.get(ref)