deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +0 -1
  3. deltacat/compute/compactor/model/compact_partition_params.py +76 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
  5. deltacat/compute/compactor/model/delta_annotated.py +16 -9
  6. deltacat/compute/compactor_v2/constants.py +3 -0
  7. deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
  8. deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
  9. deltacat/compute/compactor_v2/utils/io.py +28 -14
  10. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  11. deltacat/compute/compactor_v2/utils/task_options.py +128 -183
  12. deltacat/compute/resource_estimation/__init__.py +27 -0
  13. deltacat/compute/resource_estimation/delta.py +271 -0
  14. deltacat/compute/resource_estimation/manifest.py +394 -0
  15. deltacat/compute/resource_estimation/model.py +165 -0
  16. deltacat/compute/resource_estimation/parquet.py +108 -0
  17. deltacat/constants.py +5 -0
  18. deltacat/exceptions.py +2 -4
  19. deltacat/logs.py +8 -0
  20. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
  21. deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
  22. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
  23. deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
  24. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
  25. deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
  26. deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
  27. deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
  28. deltacat/tests/compute/test_util_common.py +2 -0
  29. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
  30. deltacat/tests/test_logs.py +34 -0
  31. deltacat/tests/test_utils/pyarrow.py +15 -5
  32. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
  33. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
  34. deltacat/compute/metastats/meta_stats.py +0 -479
  35. deltacat/compute/metastats/model/__init__.py +0 -0
  36. deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
  37. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
  38. deltacat/compute/metastats/stats.py +0 -182
  39. deltacat/compute/metastats/utils/__init__.py +0 -0
  40. deltacat/compute/metastats/utils/constants.py +0 -16
  41. deltacat/compute/metastats/utils/io.py +0 -223
  42. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
  43. deltacat/compute/metastats/utils/ray_utils.py +0 -129
  44. deltacat/compute/stats/basic.py +0 -226
  45. deltacat/compute/stats/models/__init__.py +0 -0
  46. deltacat/compute/stats/models/delta_column_stats.py +0 -98
  47. deltacat/compute/stats/models/delta_stats.py +0 -233
  48. deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
  49. deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
  50. deltacat/compute/stats/models/stats_result.py +0 -104
  51. deltacat/compute/stats/utils/__init__.py +0 -0
  52. deltacat/compute/stats/utils/intervals.py +0 -94
  53. deltacat/compute/stats/utils/io.py +0 -230
  54. deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
  55. deltacat/tests/stats/__init__.py +0 -0
  56. deltacat/tests/stats/test_intervals.py +0 -49
  57. /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
  58. /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
  59. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
  60. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
  61. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
@@ -1,479 +0,0 @@
1
- # Allow classes to use self-referencing Type hints in Python 3.7.
2
- from __future__ import annotations
3
-
4
- import functools
5
- import logging
6
- import os
7
- import pathlib
8
- from typing import Any, Dict, List, Optional, Set
9
-
10
- import ray
11
- from ray.types import ObjectRef
12
-
13
- from deltacat import logs
14
- from deltacat.compute.compactor import DeltaAnnotated
15
- from deltacat.compute.metastats.model.stats_cluster_size_estimator import (
16
- StatsClusterSizeEstimator,
17
- )
18
- from deltacat.compute.metastats.stats import start_stats_collection
19
- from deltacat.compute.metastats.utils.constants import (
20
- DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
21
- DEFAULT_JOB_RUN_TRACE_ID,
22
- HEAD_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO,
23
- MANIFEST_FILE_COUNT_PER_CPU,
24
- STATS_CLUSTER_R5_INSTANCE_TYPE,
25
- WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO,
26
- )
27
- from deltacat.compute.metastats.utils.io import read_cached_partition_stats
28
- from deltacat.compute.metastats.utils.pyarrow_memory_estimation_function import (
29
- estimation_function,
30
- )
31
- from deltacat.compute.metastats.utils.ray_utils import replace_cluster_cfg_vars
32
- from deltacat.compute.stats.models.delta_stats import DeltaStats
33
- from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
34
- from deltacat.compute.stats.utils.intervals import DeltaRange, merge_intervals
35
- from deltacat.compute.stats.utils.io import get_deltas_from_range
36
- from deltacat.constants import (
37
- BYTES_PER_GIBIBYTE,
38
- PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS,
39
- )
40
- from deltacat.storage import Delta, DeltaLocator, PartitionLocator
41
- from deltacat.storage import interface as unimplemented_deltacat_storage
42
- from deltacat.utils.performance import timed_invocation
43
-
44
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
45
-
46
-
47
def collect_metastats(
    source_partition_locators: List[PartitionLocator],
    columns: Optional[List[str]] = None,
    file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
    stat_results_s3_bucket: Optional[str] = None,
    metastats_results_s3_bucket: Optional[str] = None,
    deltacat_storage=unimplemented_deltacat_storage,
    *args,
    **kwargs,
) -> Dict[str, list]:
    """Collect stats for every given partition and summarize them per partition.

    One `collect_from_partition` remote task is submitted per partition
    locator; all tasks are submitted before any result is awaited.

    Args:
        source_partition_locators: Partitions to collect stats for.
        columns: Column names forwarded to each partition task.
        file_count_per_cpu: Forwarded to each partition task.
        stat_results_s3_bucket: Forwarded to each partition task.
        metastats_results_s3_bucket: Forwarded to each partition task.
        deltacat_storage: Storage implementation forwarded to each task.
        *args: Forwarded to each partition task.
        **kwargs: Forwarded to each partition task.

    Returns:
        A dict keyed by partition value string whose values are two-item
        lists: `[summed rowCount, summed pyarrowTableBytes]`. This is the
        aggregate used for compaction result validation, not the raw
        per-delta stats.
    """
    # TODO: Add CompactionEventDispatcher for metastats collection started event
    pending_by_partition: Dict[str, ObjectRef] = {}
    for partition_locator in source_partition_locators:
        partition_id = partition_locator.partition_id
        if partition_locator.partition_values:
            partition_value_string = "_".join(partition_locator.partition_values)
        else:
            partition_value_string = f"no_partition_value_{partition_id}"
        # NOTE(review): two locators that yield the same partition value string
        # overwrite each other's pending result here — confirm this cannot happen.
        pending_by_partition[partition_value_string] = collect_from_partition.remote(
            source_partition_locator=partition_locator,
            partition_value_string=partition_value_string,
            partition_canonical_string=partition_locator.canonical_string(),
            columns=columns,
            stat_results_s3_bucket=stat_results_s3_bucket,
            metastats_results_s3_bucket=metastats_results_s3_bucket,
            file_count_per_cpu=file_count_per_cpu,
            deltacat_storage=deltacat_storage,
            *args,
            **kwargs,
        )

    # Resolve results only after every task has been submitted.
    stats_res_all_partitions: Dict[str, Dict[int, DeltaStats]] = {}
    for pv, stats_res_obj_ref in pending_by_partition.items():
        stats_res_all_partitions[pv] = ray.get(stats_res_obj_ref)
    # TODO: Add CompactionEventDispatcher for metastats collection completed event

    logger.info(f"stats_res_all_partitions: {stats_res_all_partitions}")

    # For compaction result validation purpose only
    aggregate_partition_stats_for_validation: Dict[str, list] = {}
    for partition_val, delta_stream_range_set in stats_res_all_partitions.items():
        partition_stats_sum_row_count = 0
        partition_pyarrow_sum = 0
        for stats_column_result in delta_stream_range_set.values():
            # Row counts are read from the first column's manifest stats only.
            for cs in stats_column_result.column_stats[0].manifest_stats.stats:
                partition_stats_sum_row_count += cs.get("rowCount")

            # PyArrow table bytes are summed across every column.
            for stats in stats_column_result.get("column_stats"):
                partition_pyarrow_sum += stats.get("stats").get("pyarrowTableBytes")
        aggregate_partition_stats_for_validation[partition_val] = [
            partition_stats_sum_row_count,
            partition_pyarrow_sum,
        ]
        logger.info(
            f"partitions_stats_result for partition value: {partition_val}: rowCount: {partition_stats_sum_row_count}; pyarrowTableBytes: {partition_pyarrow_sum}"
        )
    return aggregate_partition_stats_for_validation
106
-
107
- # return stats_res_all_partitions
108
-
109
-
110
@ray.remote(num_cpus=1)
def collect_from_partition(
    source_partition_locator: PartitionLocator,
    partition_value_string,
    partition_canonical_string,
    delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
    columns: Optional[List[str]] = None,
    file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
    stat_results_s3_bucket: Optional[str] = None,
    metastats_results_s3_bucket: Optional[str] = None,
    deltacat_storage=unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
    *args,
    **kwargs,
) -> ObjectRef[Dict[int, DeltaStats]]:
    """Ray remote task that collects stats for the deltas of one partition.

    Args:
        source_partition_locator: Partition whose deltas are inspected.
        partition_value_string: Label for this partition, forwarded downstream.
        partition_canonical_string: Canonical partition string, forwarded
            downstream.
        delta_stream_position_range_set: Optional stream-position ranges that
            restrict which deltas are looked up; `None` means unbounded.
        columns: Columns to collect stats for. When empty or `None`, the
            table version's column names are fetched from `deltacat_storage`.
        file_count_per_cpu: Forwarded to the stats collection step.
        stat_results_s3_bucket: Forwarded to the stats collection step.
        metastats_results_s3_bucket: Forwarded to the stats collection step.
        deltacat_storage: Storage implementation used for lookups.
        deltacat_storage_kwargs: Defaulted to `{}` when `None`; not otherwise
            used by this function.
        **kwargs: Recognized keys are `trace_id` and `cpus_per_instance`;
            each falls back to its module default when absent.

    Returns:
        The result of `_start_all_stats_collection_from_deltas` for this
        partition's deltas.
    """
    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}
    if not columns:
        columns = deltacat_storage.get_table_version_column_names(
            source_partition_locator.namespace,
            source_partition_locator.table_name,
            source_partition_locator.table_version,
        )
    deltas = _find_deltas(
        source_partition_locator, delta_stream_position_range_set, deltacat_storage
    )

    logger.info(f"Find {len(deltas)} deltas!")
    if "trace_id" in kwargs:
        trace_id = kwargs.get("trace_id")
    else:
        trace_id = DEFAULT_JOB_RUN_TRACE_ID
        logger.warning(
            f"No job run trace id specified, default to {DEFAULT_JOB_RUN_TRACE_ID}"
        )

    # The key must be the string "cpus_per_instance". Testing the integer
    # default as a key (as this code previously did) never matches, which
    # silently discarded any caller-provided override.
    if "cpus_per_instance" in kwargs:
        cpus_per_instance = kwargs.get("cpus_per_instance")
    else:
        cpus_per_instance = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE
        logger.info(
            f"Stats cluster CPUS per instance not specified, default to {DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE}"
        )

    stats_res_obj_ref = _start_all_stats_collection_from_deltas(
        deltas,
        partition_value_string,
        partition_canonical_string,
        columns,
        trace_id,
        file_count_per_cpu,
        cpus_per_instance,
        stat_results_s3_bucket,
        metastats_results_s3_bucket,
        deltacat_storage,
    )
    return stats_res_obj_ref
167
-
168
-
169
def _find_deltas(
    source_partition_locator: PartitionLocator,
    delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
    deltacat_storage=unimplemented_deltacat_storage,
) -> List[Delta]:
    """Look up every delta of a partition that falls in the given ranges.

    The ranges are merged via `merge_intervals`, one remote
    `get_deltas_from_range` lookup is issued per merged range, and the
    per-range results are flattened into a single list.

    Args:
        source_partition_locator: Partition to search.
        delta_stream_position_range_set: Stream-position ranges to search;
            `None` is treated as the single unbounded range `(None, None)`.
        deltacat_storage: Storage implementation passed to each lookup.

    Returns:
        All deltas found, in the order the range lookups return them.
    """
    ranges = (
        {(None, None)}
        if delta_stream_position_range_set is None
        else delta_stream_position_range_set
    )

    # Submit every range lookup before blocking on any of them.
    pending_lookups: List[ObjectRef[List[Delta]]] = [
        get_deltas_from_range.remote(
            source_partition_locator, begin, end, deltacat_storage
        )
        for begin, end in merge_intervals(ranges)
    ]

    deltas_per_range: List[List[Delta]] = ray.get(pending_lookups)
    return [found for chunk in deltas_per_range for found in chunk]
189
-
190
-
191
def _start_all_stats_collection_from_deltas(
    deltas: List[Delta],
    partition_value_string: Optional[str],
    partition_canonical_string: Optional[str],
    columns: Optional[List[str]] = None,
    trace_id: Optional[str] = None,
    file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
    cpus_per_instance: Optional[int] = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
    stat_results_s3_bucket: Optional[str] = None,
    metastats_results_s3_bucket: Optional[str] = None,
    deltacat_storage=unimplemented_deltacat_storage,
) -> Dict[int, DeltaStats]:
    """Return stats for `deltas`, reusing cached results where available.

    When `stat_results_s3_bucket` is set, cached partition stats are read
    first. Deltas with a cache hit contribute their cached stats; deltas
    with a cache miss, or with no cache entry at all, are queued for a
    fresh stats collection.

    Args:
        deltas: Deltas to gather stats for.
        partition_value_string: Partition label forwarded to collection.
        partition_canonical_string: Canonical partition string; also the
            key used to read the cache.
        columns: Columns to collect stats for.
        trace_id: Job trace id forwarded to collection.
        file_count_per_cpu: Forwarded to collection.
        cpus_per_instance: Forwarded to collection.
        stat_results_s3_bucket: Bucket holding cached stats; when falsy the
            cache is skipped and every delta is recomputed.
        metastats_results_s3_bucket: Forwarded to collection.
        deltacat_storage: Storage implementation forwarded to collection.

    Returns:
        Stats keyed by delta stream position. Freshly collected stats take
        precedence over cached stats for the same stream position.
    """
    delta_stats_compute_list: List[DeltaLocator] = []
    meta_stats_list_ready: List[DeltaLocator] = []
    meta_stats_list_to_compute: List[DeltaLocator] = []

    # Always bind the cache map. It used to be assigned only inside the
    # `if` below, so running without a stats bucket raised UnboundLocalError
    # on the first delta.
    # NOTE(review): the value type is annotated as a list, but each value is
    # used below as a single cache result (`.hits` / `.misses`) — confirm
    # against `read_cached_partition_stats`.
    found_columns_stats_map: Optional[Dict[int, List[DeltaStatsCacheResult]]] = None
    if stat_results_s3_bucket:
        found_columns_stats_map = read_cached_partition_stats(
            partition_canonical_string, stat_results_s3_bucket
        )

    delta_cache_res: List[DeltaStats] = []
    for delta in deltas:
        if found_columns_stats_map and delta.stream_position in found_columns_stats_map:
            cached_result = found_columns_stats_map[delta.stream_position]
            if cached_result.hits:
                delta_cache_res.append(cached_result.hits)
                meta_stats_list_ready.append(
                    cached_result.hits.column_stats[0].manifest_stats.delta_locator
                )

            if cached_result.misses:
                delta_locator: DeltaLocator = cached_result.misses.delta_locator
                delta_stats_compute_list.append(delta_locator)
                meta_stats_list_to_compute.append(delta_locator)
        else:
            # No cache entry for this delta: compute everything for it.
            delta_stats_compute_list.append(delta.locator)
            meta_stats_list_to_compute.append(delta.locator)

    logger.info(f"Collecting stats on {len(delta_stats_compute_list)} deltas!")
    delta_stats_compute_res: Dict[int, DeltaStats] = {}
    if delta_stats_compute_list:
        delta_stats_compute_res = _start_metadata_stats_collection(
            delta_stats_compute_list=delta_stats_compute_list,
            meta_stats_list_ready=meta_stats_list_ready,
            meta_stats_list_to_compute=meta_stats_list_to_compute,
            partition_value_string=partition_value_string,
            partition_canonical_string=partition_canonical_string,
            columns=columns,
            trace_id=trace_id,
            file_count_per_cpu=file_count_per_cpu,
            cpus_per_instance=cpus_per_instance,
            stat_results_s3_bucket=stat_results_s3_bucket,
            metastats_results_s3_bucket=metastats_results_s3_bucket,
            deltacat_storage=deltacat_storage,
        )

    delta_stream_range_stats: Dict[int, DeltaStats] = {}
    for delta_column_stats in delta_cache_res:
        assert (
            len(delta_column_stats.column_stats) > 0
        ), f"Expected columns of `{delta_column_stats}` to be non-empty"
        stream_position = delta_column_stats.column_stats[
            0
        ].manifest_stats.delta_locator.stream_position
        delta_stream_range_stats[stream_position] = delta_column_stats

    # stats collection result: if we have cached stats and missed column stats
    # for the same delta, stats collection for this delta is still needed and
    # the final result uses the newly collected stats (later keys win).
    stats_collection_res: Dict[int, DeltaStats] = {
        **delta_stream_range_stats,
        **delta_stats_compute_res,
    }

    return stats_collection_res
269
-
270
-
271
def _start_metadata_stats_collection(
    delta_stats_compute_list: List[DeltaLocator],
    meta_stats_list_ready: List[DeltaLocator],
    meta_stats_list_to_compute: List[DeltaLocator],
    partition_value_string: Optional[str],
    partition_canonical_string: Optional[str],
    columns: Optional[List[str]] = None,
    trace_id: Optional[str] = None,
    file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
    cpus_per_instance: Optional[int] = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
    stat_results_s3_bucket: Optional[str] = None,
    metastats_results_s3_bucket: Optional[str] = None,
    deltacat_storage=unimplemented_deltacat_storage,
) -> Dict[int, DeltaStats]:
    """Batch the deltas that need stats and run stats collection on them.

    Args:
        delta_stats_compute_list: Locators of deltas to collect stats for.
        meta_stats_list_ready: Locators of deltas whose stats were cached.
        meta_stats_list_to_compute: Locators of deltas lacking cached stats.
        partition_value_string: Partition label (not used in this function).
        partition_canonical_string: Passed on as the partition label for
            latency logging.
        columns: Columns to collect stats for.
        trace_id: Job trace id (not used in this function).
        file_count_per_cpu: Passed to the batching step.
        cpus_per_instance: Passed to the batching step.
        stat_results_s3_bucket: Forwarded to stats collection.
        metastats_results_s3_bucket: Forwarded to stats collection.
        deltacat_storage: Storage implementation used to fetch manifests.

    Returns:
        Collected stats as returned by `_start_stats_cluster`.

    Raises:
        IndexError: If both `meta_stats_list_ready` and
            `meta_stats_list_to_compute` are empty.
    """
    # NOTE(review): the three tallies below are computed but never read in
    # this function — presumably intended as inputs for cluster sizing.
    # They are kept so the manifest lookups still happen as before.
    meta_stats_res_ready: Dict[int, int] = {}
    for delta_locator in meta_stats_list_ready:
        manifest = deltacat_storage.get_delta_manifest(delta_locator)
        delta = Delta.of(delta_locator, None, None, None, manifest)
        meta_stats_res_ready[delta.stream_position] = sum(
            entry.meta.content_length for entry in delta.manifest.entries
        )

    # Content type and encoding are sampled from a single delta's manifest
    # and applied to the whole batch.
    first_delta_locator = (
        meta_stats_list_ready[0]
        if meta_stats_list_ready
        else meta_stats_list_to_compute[0]
    )
    manifest = deltacat_storage.get_delta_manifest(first_delta_locator)
    content_type = manifest.meta.content_type
    # Read the encoding from its own field; this previously re-read
    # `content_type`, handing the estimator a content type as its encoding.
    content_encoding = manifest.meta.content_encoding

    meta_stats_to_compute: Dict[int, int] = {}
    manifest_file_count_to_compute: Dict[int, int] = {}
    for delta_locator in meta_stats_list_to_compute:
        manifest = deltacat_storage.get_delta_manifest(delta_locator)
        delta = Delta.of(delta_locator, None, None, None, manifest)
        manifest_file_count_to_compute[delta.stream_position] = len(
            delta.manifest.entries
        )
        meta_stats_to_compute[delta.stream_position] = sum(
            entry.meta.content_length for entry in delta.manifest.entries
        )

    batched_delta_stats_compute_list = _batch_deltas(
        delta_stats_compute_list,
        file_count_per_cpu,
        cpus_per_instance,
        deltacat_storage,
        content_type,
        content_encoding,
    )

    # Dedicated stats-cluster setup (`_setup_stats_cluster`) is disabled, so
    # no cluster config is passed along.
    out_cluster_cfg = None
    delta_stats_res: Dict[int, DeltaStats] = _start_stats_cluster(
        out_cluster_cfg,
        batched_delta_stats_compute_list,
        columns,
        stat_results_s3_bucket,
        metastats_results_s3_bucket,
        deltacat_storage,
        partition_canonical_string,
    )

    return delta_stats_res
344
-
345
-
346
def _start_stats_cluster(
    out_cluster_cfg: str,
    batched_delta_stats_compute_list: List[DeltaAnnotated],
    columns: List[str],
    stat_results_s3_bucket: Optional[str] = None,
    metastats_results_s3_bucket: Optional[str] = None,
    deltacat_storage=unimplemented_deltacat_storage,
    partition_val: Optional[str] = "partition_val",
):
    """Run stats collection over the batched deltas and log how long it took.

    Args:
        out_cluster_cfg: Cluster config reference; not used by this
            function (the cluster bring-up and tear-down steps that would
            consume it are disabled).
        batched_delta_stats_compute_list: Annotated delta batches to process.
        columns: Columns to collect stats for.
        stat_results_s3_bucket: Forwarded to `start_stats_collection`.
        metastats_results_s3_bucket: Forwarded to `start_stats_collection`.
        deltacat_storage: Storage implementation forwarded to collection.
        partition_val: Label included in the latency log line.

    Returns:
        Whatever `start_stats_collection` returns for the given batches.
    """
    collection_kwargs = {
        "batched_delta_stats_compute_list": batched_delta_stats_compute_list,
        "columns": columns,
        "stat_results_s3_bucket": stat_results_s3_bucket,
        "metastats_results_s3_bucket": metastats_results_s3_bucket,
        "deltacat_storage": deltacat_storage,
    }
    # `timed_invocation` yields the call's result together with its latency.
    collected, stats_collection_latency = timed_invocation(
        func=start_stats_collection, **collection_kwargs
    )
    logger.info(
        f"actual_stats_collection_latency: {partition_val}: {stats_collection_latency}"
    )
    return collected
379
-
380
-
381
def _estimate_cpus_needed(
    meta_stats_to_compute,
    memory_gb_per_cpu,
    file_count_per_cpu,
    manifest_file_count_to_compute,
    partition_val,
):
    """Estimate the minimum CPU count for collecting stats on a partition.

    Args:
        meta_stats_to_compute: Mapping whose values are content lengths.
        memory_gb_per_cpu: Memory per CPU before the object-store reserve
            is subtracted.
        file_count_per_cpu: Number of manifest files one CPU should handle.
        manifest_file_count_to_compute: Mapping whose values are manifest
            file counts.
        partition_val: Label included in the log lines.

    Returns:
        The CPU estimate from `StatsClusterSizeEstimator.estimate_cpus_needed`.
    """
    total_content_length = sum(meta_stats_to_compute.values())
    manifest_file_count_sum = sum(manifest_file_count_to_compute.values())

    # Inflate the on-disk content length to an in-memory estimate, in GiB.
    estimated_memory_gib_needed = (
        total_content_length * PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
    ) / BYTES_PER_GIBIBYTE

    logger.info(
        f"estimated_memory_gib_needed: {partition_val} : {estimated_memory_gib_needed}"
    )
    logger.info(f"manifest_file_count_sum: {partition_val} : {manifest_file_count_sum}")

    # Only the share of memory not reserved for the object store is usable.
    usable_memory_per_cpu = memory_gb_per_cpu * (
        1 - WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO
    )
    return StatsClusterSizeEstimator.estimate_cpus_needed(
        StatsClusterSizeEstimator.of(
            usable_memory_per_cpu,
            file_count_per_cpu,
            estimated_memory_gib_needed,
            manifest_file_count_sum,
        )
    )
415
-
416
-
417
def _batch_deltas(
    delta_stats_compute_list,
    file_count_per_cpu,
    cpu_per_instance,
    deltacat_storage,
    content_type,
    content_encoding,
) -> List[DeltaAnnotated]:
    """Annotate the given deltas and rebatch them for stats collection.

    Args:
        delta_stats_compute_list: Locators of the deltas to batch.
        file_count_per_cpu: Passed to `DeltaAnnotated.rebatch`.
        cpu_per_instance: Scaled into the size limit passed to rebatching.
        deltacat_storage: Storage implementation used to fetch manifests.
        content_type: Bound into the size estimation function.
        content_encoding: Bound into the size estimation function.

    Returns:
        The rebatched list of annotated deltas.
    """
    # NOTE(review): this multiplies a CPU count by bytes-per-GiB, which
    # looks like it assumes 1 GiB per CPU — confirm the intended unit.
    usable_fraction = 1 - WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO
    worker_node_mem = cpu_per_instance * usable_fraction * BYTES_PER_GIBIBYTE

    size_estimator = functools.partial(
        estimation_function,
        content_type=content_type,
        content_encoding=content_encoding,
    )

    annotated_deltas = [
        DeltaAnnotated.of(
            Delta.of(
                locator,
                None,
                None,
                None,
                deltacat_storage.get_delta_manifest(locator),
            )
        )
        for locator in delta_stats_compute_list
    ]

    rebatched_da_list = DeltaAnnotated.rebatch(
        annotated_deltas,
        worker_node_mem,
        file_count_per_cpu,
        size_estimator,
    )

    logger.info(f"Rebatched_delta_list_length: {len(rebatched_da_list)}")

    return rebatched_da_list
454
-
455
-
456
def _setup_stats_cluster(
    min_workers, partition_value_string, trace_id, cpus_per_instance
):
    """Render a stats cluster config file from the bundled example template.

    Args:
        min_workers: Minimum worker count written into the config.
        partition_value_string: Passed to the template renderer as the
            partition canonical string.
        trace_id: Job trace id written into the config.
        cpus_per_instance: CPU count used to derive the r5 instance size
            (`cpus_per_instance // 4`); when falsy the module default
            instance type is used instead.

    Returns:
        The value returned by `replace_cluster_cfg_vars` (named as the
        output config file path by the original code).
    """
    if cpus_per_instance:
        instance_size = int(cpus_per_instance // 4)
    else:
        instance_size = STATS_CLUSTER_R5_INSTANCE_TYPE
    instance_type = f"r5.{instance_size}xlarge".strip()

    # The template lives in the `config` directory next to this module.
    module_dir = pathlib.Path(__file__).parent.resolve()
    template_path = os.path.join(module_dir, "config", "stats_cluster_example.yaml")

    # Head and worker nodes use the same instance type.
    return replace_cluster_cfg_vars(
        partition_canonical_string=partition_value_string,
        trace_id=trace_id,
        file_path=template_path,
        min_workers=min_workers,
        head_type=instance_type,
        worker_type=instance_type,
        head_object_store_memory_pct=HEAD_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO * 100,
        worker_object_store_memory_pct=WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO
        * 100,
    )
File without changes
@@ -1,34 +0,0 @@
1
- # Allow classes to use self-referencing Type hints in Python 3.7.
2
- from __future__ import annotations
3
-
4
- from typing import Dict
5
-
6
- from deltacat.compute.stats.models.delta_stats import DeltaStats
7
-
8
-
9
class PartitionStats(dict):
    """Dict-backed record pairing a partition with the stats of its deltas.

    Stored keys are `"delta_stats"` and `"partition_canonical_string"`.
    """

    @staticmethod
    def of(
        delta_stats: Dict[int, DeltaStats], partition_canonical_string: str
    ) -> PartitionStats:
        """Build a `PartitionStats` from already-constructed delta stats.

        Args:
            delta_stats: Delta stats keyed by stream position.
                NOTE(review): keys are assumed to be ints — confirm for
                data that has been through serialization.
            partition_canonical_string: Canonical string of the partition.

        Returns:
            A new `PartitionStats` holding both values.
        """
        ps = PartitionStats()
        ps["delta_stats"] = delta_stats
        ps["partition_canonical_string"] = partition_canonical_string
        return ps

    @staticmethod
    def build_from_dict(partition_stats: dict) -> PartitionStats:
        """Rebuild a `PartitionStats` from its plain-dict form.

        Args:
            partition_stats: Mapping with a `"delta_stats"` mapping (stream
                position to plain-dict delta stats) and a
                `"partition_canonical_string"` entry.

        Returns:
            A `PartitionStats` whose delta stats values have each been
            rebuilt with `DeltaStats.build_from_dict`.

        Raises:
            KeyError: If either expected key is missing.
        """
        delta_stats_dict = {
            stream_position: DeltaStats.build_from_dict(delta_stats)
            for stream_position, delta_stats in partition_stats["delta_stats"].items()
        }
        return PartitionStats.of(
            delta_stats_dict, partition_stats["partition_canonical_string"]
        )

    @property
    def delta_stats(self) -> Dict[int, DeltaStats]:
        """Delta stats keyed by stream position."""
        return self["delta_stats"]

    @property
    def partition_canonical_string(self) -> str:
        """Canonical string of the partition these stats belong to."""
        return self["partition_canonical_string"]
@@ -1,68 +0,0 @@
1
- # Allow classes to use self-referencing Type hints in Python 3.7.
2
- from __future__ import annotations
3
-
4
- from deltacat.compute.stats.models.delta_stats import DeltaStats
5
-
6
-
7
class StatsClusterSizeEstimator(dict):
    """Dict-backed inputs for estimating how many CPUs a stats run needs.

    Stored keys are `"memory_per_cpu"`, `"file_count_per_cpu"`,
    `"total_memory_needed"` and `"total_file_count"`.
    """

    @staticmethod
    def of(
        memory_per_cpu: int,
        file_count_per_cpu: int,
        total_memory_needed: int,
        total_file_count: int,
    ) -> StatsClusterSizeEstimator:
        """Build an estimator from per-CPU capacities and total demand.

        Args:
            memory_per_cpu: Memory available to each CPU.
            file_count_per_cpu: Number of files each CPU should handle.
            total_memory_needed: Total memory required, in the same unit
                as `memory_per_cpu`.
            total_file_count: Total number of files to process.

        Returns:
            A new `StatsClusterSizeEstimator` holding the four values.
        """
        estimator = StatsClusterSizeEstimator()
        estimator["memory_per_cpu"] = memory_per_cpu
        estimator["file_count_per_cpu"] = file_count_per_cpu
        estimator["total_memory_needed"] = total_memory_needed
        estimator["total_file_count"] = total_file_count
        return estimator

    @property
    def memory_per_cpu(self) -> int:
        """
        Returns the memory available to each CPU.
        """
        return self["memory_per_cpu"]

    @property
    def file_count_per_cpu(self) -> int:
        """
        Returns the number of files each CPU should handle.
        """
        return self["file_count_per_cpu"]

    @property
    def total_memory_needed(self) -> int:
        """
        Returns the total memory required.
        """
        return self["total_memory_needed"]

    @property
    def total_file_count(self) -> int:
        """
        Returns the total number of files to process.
        """
        return self["total_file_count"]

    @staticmethod
    def estimate_cpus_needed(estimator: StatsClusterSizeEstimator):
        """Return the CPU count satisfying both memory and file-count limits.

        Each limit is computed as `total // per_cpu + 1`, and the larger of
        the two is returned. The `+ 1` means an exact multiple still gets
        one extra CPU.

        Args:
            estimator: The capacities and totals to size against.

        Returns:
            The larger of the two estimates.

        Raises:
            ZeroDivisionError: If `memory_per_cpu` or `file_count_per_cpu`
                is zero.
        """
        # TODO(zyiqin): Current implementation is only a rough guess using the PYARROW_INFLATION_MULTIPLIER;
        #  note the inflation rate is for content_length to pyarrow_table_bytes for all columns.
        #  The full implementation logic should be like:
        #  1. Linear regression with 99% confidence level: pull metastats data for all deltas for this partition if len(datapoints) > 30.
        #  2. If not enough previous stats were collected for the same partition: fall back to datapoints for all partitions of the same table.
        #  3. If not enough stats were collected for this table: use average content length to each content_type and content_encoding inflation rates.
        #  4. If not enough stats for this content_type and content_encoding combination: use the basic PYARROW_INFLATION_MULTIPLIER instead.
        #  Only option 4 is implemented here, since the prerequisites for the first 3 options are not met in the first round of metastats & stats collection.

        min_cpus_based_on_memory = (
            estimator.total_memory_needed // estimator.memory_per_cpu
        ) + 1
        min_cpus_based_on_file_count = (
            estimator.total_file_count // estimator.file_count_per_cpu
        ) + 1
        return max(min_cpus_based_on_memory, min_cpus_based_on_file_count)