deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +0 -1
- deltacat/compute/compactor/model/compact_partition_params.py +76 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
- deltacat/compute/compactor/model/delta_annotated.py +16 -9
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
- deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
- deltacat/compute/compactor_v2/utils/io.py +28 -14
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +128 -183
- deltacat/compute/resource_estimation/__init__.py +27 -0
- deltacat/compute/resource_estimation/delta.py +271 -0
- deltacat/compute/resource_estimation/manifest.py +394 -0
- deltacat/compute/resource_estimation/model.py +165 -0
- deltacat/compute/resource_estimation/parquet.py +108 -0
- deltacat/constants.py +5 -0
- deltacat/exceptions.py +2 -4
- deltacat/logs.py +8 -0
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
- deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
- deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
- deltacat/tests/compute/test_util_common.py +2 -0
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
- deltacat/tests/test_logs.py +34 -0
- deltacat/tests/test_utils/pyarrow.py +15 -5
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
- deltacat/compute/metastats/meta_stats.py +0 -479
- deltacat/compute/metastats/model/__init__.py +0 -0
- deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
- deltacat/compute/metastats/stats.py +0 -182
- deltacat/compute/metastats/utils/__init__.py +0 -0
- deltacat/compute/metastats/utils/constants.py +0 -16
- deltacat/compute/metastats/utils/io.py +0 -223
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
- deltacat/compute/metastats/utils/ray_utils.py +0 -129
- deltacat/compute/stats/basic.py +0 -226
- deltacat/compute/stats/models/__init__.py +0 -0
- deltacat/compute/stats/models/delta_column_stats.py +0 -98
- deltacat/compute/stats/models/delta_stats.py +0 -233
- deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
- deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
- deltacat/compute/stats/models/stats_result.py +0 -104
- deltacat/compute/stats/utils/__init__.py +0 -0
- deltacat/compute/stats/utils/intervals.py +0 -94
- deltacat/compute/stats/utils/io.py +0 -230
- deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
- deltacat/tests/stats/__init__.py +0 -0
- deltacat/tests/stats/test_intervals.py +0 -49
- /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
- /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
deltacat/compute/metastats/meta_stats.py
@@ -1,479 +0,0 @@
-# Allow classes to use self-referencing Type hints in Python 3.7.
-from __future__ import annotations
-
-import functools
-import logging
-import os
-import pathlib
-from typing import Any, Dict, List, Optional, Set
-
-import ray
-from ray.types import ObjectRef
-
-from deltacat import logs
-from deltacat.compute.compactor import DeltaAnnotated
-from deltacat.compute.metastats.model.stats_cluster_size_estimator import (
-    StatsClusterSizeEstimator,
-)
-from deltacat.compute.metastats.stats import start_stats_collection
-from deltacat.compute.metastats.utils.constants import (
-    DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
-    DEFAULT_JOB_RUN_TRACE_ID,
-    HEAD_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO,
-    MANIFEST_FILE_COUNT_PER_CPU,
-    STATS_CLUSTER_R5_INSTANCE_TYPE,
-    WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO,
-)
-from deltacat.compute.metastats.utils.io import read_cached_partition_stats
-from deltacat.compute.metastats.utils.pyarrow_memory_estimation_function import (
-    estimation_function,
-)
-from deltacat.compute.metastats.utils.ray_utils import replace_cluster_cfg_vars
-from deltacat.compute.stats.models.delta_stats import DeltaStats
-from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
-from deltacat.compute.stats.utils.intervals import DeltaRange, merge_intervals
-from deltacat.compute.stats.utils.io import get_deltas_from_range
-from deltacat.constants import (
-    BYTES_PER_GIBIBYTE,
-    PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS,
-)
-from deltacat.storage import Delta, DeltaLocator, PartitionLocator
-from deltacat.storage import interface as unimplemented_deltacat_storage
-from deltacat.utils.performance import timed_invocation
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
-def collect_metastats(
-    source_partition_locators: List[PartitionLocator],
-    columns: Optional[List[str]] = None,
-    file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
-    stat_results_s3_bucket: Optional[str] = None,
-    metastats_results_s3_bucket: Optional[str] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-    *args,
-    **kwargs,
-) -> Dict[str, Dict[int, DeltaStats]]:
-
-    # TODO: Add CompactionEventDispatcher for metastats collection started event
-    stats_res_all_partitions: Dict[str, Dict[int, DeltaStats]] = {}
-    stats_res_obj_ref_all_partitions: Dict[str, ObjectRef] = {}
-    for partition_locator in source_partition_locators:
-        partition_id = partition_locator.partition_id
-        if partition_locator.partition_values:
-            partition_value_string = "_".join(partition_locator.partition_values)
-        else:
-            partition_value_string = f"no_partition_value_{partition_id}"
-        partition_canonical_string = partition_locator.canonical_string()
-        stats_res_obj_ref = collect_from_partition.remote(
-            source_partition_locator=partition_locator,
-            partition_value_string=partition_value_string,
-            partition_canonical_string=partition_canonical_string,
-            columns=columns,
-            stat_results_s3_bucket=stat_results_s3_bucket,
-            metastats_results_s3_bucket=metastats_results_s3_bucket,
-            file_count_per_cpu=file_count_per_cpu,
-            deltacat_storage=deltacat_storage,
-            *args,
-            **kwargs,
-        )
-        stats_res_obj_ref_all_partitions[partition_value_string] = stats_res_obj_ref
-    for pv, stats_res_obj_ref in stats_res_obj_ref_all_partitions.items():
-        stats_res_all_partitions[pv] = ray.get(stats_res_obj_ref)
-    # TODO: Add CompactionEventDispatcher for metastats collection completed event
-
-    logger.info(f"stats_res_all_partitions: {stats_res_all_partitions}")
-
-    # For compaction result validation purpose only
-    aggregate_partition_stats_for_validation: Dict[str, list] = {}
-    for partition_val, delta_stream_range_set in stats_res_all_partitions.items():
-        partition_stats_sum_row_count = 0
-        partition_pyarrow_sum = 0
-        for stream_pos, stats_column_result in delta_stream_range_set.items():
-            for cs in stats_column_result.column_stats[0].manifest_stats.stats:
-                partition_stats_sum_row_count += cs.get("rowCount")
-
-            for stats in stats_column_result.get("column_stats"):
-                partition_pyarrow_sum += stats.get("stats").get("pyarrowTableBytes")
-        aggregate_partition_stats_for_validation[partition_val] = [
-            partition_stats_sum_row_count,
-            partition_pyarrow_sum,
-        ]
-        logger.info(
-            f"partitions_stats_result for partition value: {partition_val}: rowCount: {partition_stats_sum_row_count}; pyarrowTableBytes: {partition_pyarrow_sum}"
-        )
-    return aggregate_partition_stats_for_validation
-
-    # return stats_res_all_partitions
-
-
-@ray.remote(num_cpus=1)
-def collect_from_partition(
-    source_partition_locator: PartitionLocator,
-    partition_value_string,
-    partition_canonical_string,
-    delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
-    columns: Optional[List[str]] = None,
-    file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
-    stat_results_s3_bucket: Optional[str] = None,
-    metastats_results_s3_bucket: Optional[str] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
-    *args,
-    **kwargs,
-) -> ObjectRef[Dict[int, DeltaStats]]:
-    if deltacat_storage_kwargs is None:
-        deltacat_storage_kwargs = {}
-    if not columns:
-        columns = deltacat_storage.get_table_version_column_names(
-            source_partition_locator.namespace,
-            source_partition_locator.table_name,
-            source_partition_locator.table_version,
-        )
-    deltas = _find_deltas(
-        source_partition_locator, delta_stream_position_range_set, deltacat_storage
-    )
-
-    logger.info(f"Find {len(deltas)} deltas!")
-    trace_id = DEFAULT_JOB_RUN_TRACE_ID
-    if "trace_id" in kwargs:
-        trace_id = kwargs.get("trace_id")
-    else:
-        logger.warning(
-            f"No job run trace id specified, default to {DEFAULT_JOB_RUN_TRACE_ID}"
-        )
-
-    cpus_per_instance = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE
-    if cpus_per_instance in kwargs:
-        cpus_per_instance = kwargs.get("cpus_per_instance")
-    else:
-        logger.info(
-            f"Stats cluster CPUS per instance not specified, default to {DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE}"
-        )
-
-    stats_res_obj_ref = _start_all_stats_collection_from_deltas(
-        deltas,
-        partition_value_string,
-        partition_canonical_string,
-        columns,
-        trace_id,
-        file_count_per_cpu,
-        cpus_per_instance,
-        stat_results_s3_bucket,
-        metastats_results_s3_bucket,
-        deltacat_storage,
-    )
-    return stats_res_obj_ref
-
-
-def _find_deltas(
-    source_partition_locator: PartitionLocator,
-    delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-) -> List[Delta]:
-
-    if delta_stream_position_range_set is None:
-        delta_stream_position_range_set = {(None, None)}
-    delta_range_lookup_pending: List[ObjectRef[List[Delta]]] = []
-
-    for range_pair in merge_intervals(delta_stream_position_range_set):
-        begin, end = range_pair
-        promise: ObjectRef[List[Delta]] = get_deltas_from_range.remote(
-            source_partition_locator, begin, end, deltacat_storage
-        )
-        delta_range_lookup_pending.append(promise)
-
-    delta_list_by_ranges: List[List[Delta]] = ray.get(delta_range_lookup_pending)
-    deltas = [delta for delta_list in delta_list_by_ranges for delta in delta_list]
-    return deltas
-
-
-def _start_all_stats_collection_from_deltas(
-    deltas: List[Delta],
-    partition_value_string: Optional[str],
-    partition_canonical_string: Optional[str],
-    columns: Optional[List[str]] = None,
-    trace_id: Optional[str] = None,
-    file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
-    cpus_per_instance: Optional[int] = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
-    stat_results_s3_bucket: Optional[str] = None,
-    metastats_results_s3_bucket: Optional[str] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-) -> Dict[int, DeltaStats]:
-
-    delta_stats_compute_list: List[DeltaLocator] = []
-    meta_stats_list_ready: List[DeltaLocator] = []
-    meta_stats_list_to_compute: List[DeltaLocator] = []
-
-    if stat_results_s3_bucket:
-        found_columns_stats_map: Dict[
-            int, List[DeltaStatsCacheResult]
-        ] = read_cached_partition_stats(
-            partition_canonical_string, stat_results_s3_bucket
-        )
-
-    delta_cache_res: List[DeltaStats] = []
-    for delta in deltas:
-        if found_columns_stats_map and delta.stream_position in found_columns_stats_map:
-            cached_result = found_columns_stats_map[delta.stream_position]
-            if cached_result.hits:
-                delta_cache_res.append(cached_result.hits)
-                meta_stats_list_ready.append(
-                    cached_result.hits.column_stats[0].manifest_stats.delta_locator
-                )
-
-            if cached_result.misses:
-                delta_locator: DeltaLocator = cached_result.misses.delta_locator
-                delta_stats_compute_list.append(delta_locator)
-                meta_stats_list_to_compute.append(delta_locator)
-        else:
-            delta_stats_compute_list.append(delta.locator)
-            meta_stats_list_to_compute.append(delta.locator)
-
-    logger.info(f"Collecting stats on {len(delta_stats_compute_list)} deltas!")
-    delta_stats_compute_res: Dict[int, DeltaStats] = {}
-    if delta_stats_compute_list:
-        delta_stats_compute_res = _start_metadata_stats_collection(
-            delta_stats_compute_list=delta_stats_compute_list,
-            meta_stats_list_ready=meta_stats_list_ready,
-            meta_stats_list_to_compute=meta_stats_list_to_compute,
-            partition_value_string=partition_value_string,
-            partition_canonical_string=partition_canonical_string,
-            columns=columns,
-            trace_id=trace_id,
-            file_count_per_cpu=file_count_per_cpu,
-            cpus_per_instance=cpus_per_instance,
-            stat_results_s3_bucket=stat_results_s3_bucket,
-            metastats_results_s3_bucket=metastats_results_s3_bucket,
-            deltacat_storage=deltacat_storage,
-        )
-
-    delta_stream_range_stats: Dict[int, DeltaStats] = {}
-    for delta_column_stats in delta_cache_res:
-        assert (
-            len(delta_column_stats.column_stats) > 0
-        ), f"Expected columns of `{delta_column_stats}` to be non-empty"
-        stream_position = delta_column_stats.column_stats[
-            0
-        ].manifest_stats.delta_locator.stream_position
-        delta_stream_range_stats[stream_position] = delta_column_stats
-
-    # stats collection result: if we have cached stats and missed column stats for same delta, stats collection for this delta is still needed
-    # and the final result will use the newly collected stats for this delta.
-    stats_collection_res: Dict[int, DeltaStats] = {
-        **delta_stream_range_stats,
-        **delta_stats_compute_res,
-    }
-
-    return stats_collection_res
-
-
-def _start_metadata_stats_collection(
-    delta_stats_compute_list: List[DeltaLocator],
-    meta_stats_list_ready: List[DeltaLocator],
-    meta_stats_list_to_compute: List[DeltaLocator],
-    partition_value_string: Optional[str],
-    partition_canonical_string: Optional[str],
-    columns: Optional[List[str]] = None,
-    trace_id: Optional[str] = None,
-    file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
-    cpus_per_instance: Optional[int] = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
-    stat_results_s3_bucket: Optional[str] = None,
-    metastats_results_s3_bucket: Optional[str] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-) -> Dict[int, DeltaStats]:
-
-    meta_stats_res_ready: Dict[int, int] = {}
-
-    for delta_locator in meta_stats_list_ready:
-        delta_meta_count = 0
-        manifest = deltacat_storage.get_delta_manifest(delta_locator)
-        delta = Delta.of(delta_locator, None, None, None, manifest)
-
-        for entry in delta.manifest.entries:
-            delta_meta_count += entry.meta.content_length
-        meta_stats_res_ready[delta.stream_position] = delta_meta_count
-
-    first_delta_locator = (
-        meta_stats_list_ready[0]
-        if meta_stats_list_ready
-        else meta_stats_list_to_compute[0]
-    )
-    manifest = deltacat_storage.get_delta_manifest(first_delta_locator)
-    content_type = manifest.meta.content_type
-    content_encoding = manifest.meta.content_type
-
-    meta_stats_to_compute: Dict[int, int] = {}
-    manifest_file_count_to_compute: Dict[int, int] = {}
-
-    for delta_locator in meta_stats_list_to_compute:
-        delta_meta_count = 0
-        manifest = deltacat_storage.get_delta_manifest(delta_locator)
-        delta = Delta.of(delta_locator, None, None, None, manifest)
-        file_count = len(delta.manifest.entries)
-        manifest_file_count_to_compute[delta.stream_position] = file_count
-        for entry in delta.manifest.entries:
-            delta_meta_count += entry.meta.content_length
-        meta_stats_to_compute[delta.stream_position] = delta_meta_count
-
-    batched_delta_stats_compute_list = _batch_deltas(
-        delta_stats_compute_list,
-        file_count_per_cpu,
-        cpus_per_instance,
-        deltacat_storage,
-        content_type,
-        content_encoding,
-    )
-
-    # out_cluster_cfg = _setup_stats_cluster(min_workers,
-    #                                        partition_value_string,
-    #                                        trace_id,
-    #                                        cpus_per_instance)
-    out_cluster_cfg = None
-    delta_stats_res: Dict[int, DeltaStats] = _start_stats_cluster(
-        out_cluster_cfg,
-        batched_delta_stats_compute_list,
-        columns,
-        stat_results_s3_bucket,
-        metastats_results_s3_bucket,
-        deltacat_storage,
-        partition_canonical_string,
-    )
-
-    return delta_stats_res
-
-
-def _start_stats_cluster(
-    out_cluster_cfg: str,
-    batched_delta_stats_compute_list: List[DeltaAnnotated],
-    columns: List[str],
-    stat_results_s3_bucket: Optional[str] = None,
-    metastats_results_s3_bucket: Optional[str] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-    partition_val: Optional[str] = "partition_val",
-):
-    # ray_up_latency = timed_invocation(
-    #     func=ray_up,
-    #     cluster_cfg=out_cluster_cfg
-    # )
-    # logger.info(f"ray_up_latency: {partition_val}:{ray_up_latency}")
-
-    # head_node_ip = get_head_node_ip(out_cluster_cfg)
-    # client = ray_init(head_node_ip, 10001)
-    # with client:
-    delta_stream_range_stats, stats_collection_latency = timed_invocation(
-        func=start_stats_collection,
-        batched_delta_stats_compute_list=batched_delta_stats_compute_list,
-        columns=columns,
-        stat_results_s3_bucket=stat_results_s3_bucket,
-        metastats_results_s3_bucket=metastats_results_s3_bucket,
-        deltacat_storage=deltacat_storage,
-    )
-    logger.info(
-        f"actual_stats_collection_latency: {partition_val}: {stats_collection_latency}"
-    )
-    # client.disconnect()
-    # ray_down(out_cluster_cfg)
-    # clean_up_cluster_cfg_file(out_cluster_cfg)
-    return delta_stream_range_stats
-
-
-def _estimate_cpus_needed(
-    meta_stats_to_compute,
-    memory_gb_per_cpu,
-    file_count_per_cpu,
-    manifest_file_count_to_compute,
-    partition_val,
-):
-    content_length_sum = 0
-    for val in meta_stats_to_compute.values():
-        content_length_sum += val
-    manifest_file_count_sum = 0
-    for val in manifest_file_count_to_compute.values():
-        manifest_file_count_sum += val
-    estimated_memory_bytes_needed = (
-        content_length_sum * PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
-    )
-    estimated_memory_gib_needed = estimated_memory_bytes_needed / BYTES_PER_GIBIBYTE
-
-    logger.info(
-        f"estimated_memory_gib_needed: {partition_val} : {estimated_memory_gib_needed}"
-    )
-    logger.info(f"manifest_file_count_sum: {partition_val} : {manifest_file_count_sum}")
-
-    memory_per_cpu_available = memory_gb_per_cpu * (
-        1 - WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO
-    )
-    estimator = StatsClusterSizeEstimator.of(
-        memory_per_cpu_available,
-        file_count_per_cpu,
-        estimated_memory_gib_needed,
-        manifest_file_count_sum,
-    )
-    min_cpus = StatsClusterSizeEstimator.estimate_cpus_needed(estimator)
-    return min_cpus
-
-
-def _batch_deltas(
-    delta_stats_compute_list,
-    file_count_per_cpu,
-    cpu_per_instance,
-    deltacat_storage,
-    content_type,
-    content_encoding,
-) -> List[DeltaAnnotated]:
-    worker_node_mem = (
-        cpu_per_instance
-        * (1 - WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO)
-        * BYTES_PER_GIBIBYTE
-    )
-    delta_list = []
-
-    estimate_based_on_content_length = functools.partial(
-        estimation_function,
-        content_type=content_type,
-        content_encoding=content_encoding,
-    )
-
-    for delta_locator in delta_stats_compute_list:
-        manifest = deltacat_storage.get_delta_manifest(delta_locator)
-        delta = Delta.of(delta_locator, None, None, None, manifest)
-        delta_annotated = DeltaAnnotated.of(delta)
-        delta_list.append(delta_annotated)
-
-    rebatched_da_list = DeltaAnnotated.rebatch(
-        delta_list,
-        worker_node_mem,
-        file_count_per_cpu,
-        estimate_based_on_content_length,
-    )
-
-    logger.info(f"Rebatched_delta_list_length: {len(rebatched_da_list)}")
-
-    return rebatched_da_list
-
-
-def _setup_stats_cluster(
-    min_workers, partition_value_string, trace_id, cpus_per_instance
-):
-    stats_cluster_instance_type = (
-        int(cpus_per_instance // 4)
-        if cpus_per_instance
-        else STATS_CLUSTER_R5_INSTANCE_TYPE
-    )
-    stats_cluster_instance_type_str = f"r5.{stats_cluster_instance_type}xlarge".strip()
-    parent_dir_path = pathlib.Path(__file__).parent.resolve()
-    in_cfg = os.path.join(parent_dir_path, "config", "stats_cluster_example.yaml")
-    out_cluster_cfg_file_path = replace_cluster_cfg_vars(
-        partition_canonical_string=partition_value_string,
-        trace_id=trace_id,
-        file_path=in_cfg,
-        min_workers=min_workers,
-        head_type=stats_cluster_instance_type_str,
-        worker_type=stats_cluster_instance_type_str,
-        head_object_store_memory_pct=HEAD_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO * 100,
-        worker_object_store_memory_pct=WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO
-        * 100,
-    )

-    return out_cluster_cfg_file_path
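The removed collect_metastats() above fans out one Ray task per partition and then gathers the results. For readers unfamiliar with that pattern, here is a minimal, self-contained sketch of the same fan-out/fan-in flow; it uses only core Ray APIs, and collect_stats_for_partition is a hypothetical placeholder rather than a deltacat function.

```python
# Minimal sketch (not deltacat's API): the fan-out/fan-in pattern used by the
# removed collect_metastats() -- one Ray task per partition, gathered via ray.get().
from typing import Dict, List

import ray


@ray.remote(num_cpus=1)
def collect_stats_for_partition(partition_id: str) -> Dict[str, int]:
    # Placeholder work; the removed code computed per-delta stats here.
    return {"row_count": 100, "pyarrow_table_bytes": 4096}


def collect_all(partition_ids: List[str]) -> Dict[str, Dict[str, int]]:
    # Fan out: submit one remote task per partition and keep the ObjectRefs.
    pending = {pid: collect_stats_for_partition.remote(pid) for pid in partition_ids}
    # Fan in: block on each ObjectRef and aggregate the results by partition.
    return {pid: ray.get(ref) for pid, ref in pending.items()}


if __name__ == "__main__":
    ray.init(ignore_reinit_error=True)
    print(collect_all(["p1", "p2"]))
```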
deltacat/compute/metastats/model/__init__.py
File without changes (empty file removed; no content to show)
deltacat/compute/metastats/model/partition_stats_dict.py
@@ -1,34 +0,0 @@
-# Allow classes to use self-referencing Type hints in Python 3.7.
-from __future__ import annotations
-
-from typing import Dict
-
-from deltacat.compute.stats.models.delta_stats import DeltaStats
-
-
-class PartitionStats(dict):
-    @staticmethod
-    def of(
-        delta_stats: Dict[DeltaStats], partition_canonical_string: str
-    ) -> PartitionStats:
-        ps = PartitionStats()
-        ps["delta_stats"] = delta_stats
-        ps["partition_canonical_string"] = partition_canonical_string
-        return ps
-
-    @staticmethod
-    def build_from_dict(partition_stats: str) -> PartitionStats:
-        delta_stats_dict = {}
-        for stream_position, delta_stats in partition_stats["delta_stats"].items():
-            delta_stats_dict[stream_position] = DeltaStats.build_from_dict(delta_stats)
-        return PartitionStats.of(
-            delta_stats_dict, partition_stats["partition_canonical_string"]
-        )
-
-    @property
-    def delta_stats(self) -> Dict[DeltaStats]:
-        return self["delta_stats"]
-
-    @property
-    def partition_canonical_string(self) -> str:
-        return self["partition_canonical_string"]
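The removed PartitionStats class follows the dict-backed model convention used throughout these modules: a dict subclass with an of() factory, typed property accessors, and a build_from_dict() that rehydrates nested models after serialization. A generic sketch of that pattern, using hypothetical Metric and Report classes rather than deltacat types:

```python
# Generic sketch of the dict-backed model pattern; Metric and Report are
# hypothetical illustration classes, not deltacat models.
from __future__ import annotations

from typing import Dict


class Metric(dict):
    @staticmethod
    def of(name: str, value: int) -> Metric:
        m = Metric()
        m["name"] = name
        m["value"] = value
        return m


class Report(dict):
    @staticmethod
    def of(metrics: Dict[str, Metric], source: str) -> Report:
        r = Report()
        r["metrics"] = metrics
        r["source"] = source
        return r

    @staticmethod
    def build_from_dict(raw: dict) -> Report:
        # Rehydrate nested plain dicts (e.g. after a JSON round-trip) into models.
        metrics = {k: Metric.of(v["name"], v["value"]) for k, v in raw["metrics"].items()}
        return Report.of(metrics, raw["source"])

    @property
    def metrics(self) -> Dict[str, Metric]:
        return self["metrics"]

    @property
    def source(self) -> str:
        return self["source"]


report = Report.of({"rows": Metric.of("rows", 42)}, "partition-1")
assert Report.build_from_dict(dict(report)).metrics["rows"]["value"] == 42
```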
deltacat/compute/metastats/model/stats_cluster_size_estimator.py
@@ -1,68 +0,0 @@
-# Allow classes to use self-referencing Type hints in Python 3.7.
-from __future__ import annotations
-
-from deltacat.compute.stats.models.delta_stats import DeltaStats
-
-
-class StatsClusterSizeEstimator(dict):
-    @staticmethod
-    def of(
-        memory_per_cpu: int,
-        file_count_per_cpu: int,
-        total_memory_needed: int,
-        total_file_count: int,
-    ) -> DeltaStats:
-        estimator = StatsClusterSizeEstimator()
-        estimator["memory_per_cpu"] = memory_per_cpu
-        estimator["file_count_per_cpu"] = file_count_per_cpu
-        estimator["total_memory_needed"] = total_memory_needed
-        estimator["total_file_count"] = total_file_count
-        return estimator
-
-    @property
-    def memory_per_cpu(self) -> int:
-        """
-        Returns a list of stats associated to each column in this delta.
-        """
-        return self["memory_per_cpu"]
-
-    @property
-    def file_count_per_cpu(self) -> int:
-        """
-        Returns a list of stats associated to each column in this delta.
-        """
-        return self["file_count_per_cpu"]
-
-    @property
-    def total_memory_needed(self) -> int:
-        """
-        Returns a list of stats associated to each column in this delta.
-        """
-        return self["total_memory_needed"]
-
-    @property
-    def total_file_count(self) -> int:
-        """
-        Returns a list of stats associated to each column in this delta.
-        """
-        return self["total_file_count"]
-
-    @staticmethod
-    def estimate_cpus_needed(estimator: StatsClusterSizeEstimator):
-
-        # TODO(zyiqin): Current implementation is only for a rough guess using the PYARROW_INFLATION_MULTIPLIER,
-        # note the inflation rate is for content_length to pyarrow_table_bytes for all columns.
-        # The full implementation logic should be like:
-        # 1. liner regression with 99 confidence level: pull metastats data for all deltas for this partition if len(datapoints) > 30.
-        # 2. if not enough previous stats collected for same partition: Fall back to datapoints for all paritions for same table.
-        # 3. If not enough stats collected for this table: use average content length to each content_type and content_encoding inflation rates
-        # 4. If not enough stats for this content_type and content_encoding combination: use the basic PYARROW_INFLATION_MULTIPLIER instead.
-        # So, only option 4 is implemented here since this pre-requirement for first 3 options are not met for first round of metastats&stats collection.
-
-        min_cpus_based_on_memory = (
-            estimator.total_memory_needed // estimator.memory_per_cpu
-        ) + 1
-        min_cpus_based_on_file_count = (
-            estimator.total_file_count // estimator.file_count_per_cpu
-        ) + 1
-        return max(min_cpus_based_on_memory, min_cpus_based_on_file_count)
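The removed estimate_cpus_needed() takes the larger of two lower bounds: the CPU count needed to fit the estimated in-memory (PyArrow) footprint, and the CPU count needed to stay under the per-CPU manifest file limit. A self-contained sketch of that arithmetic with illustrative numbers (the values below are assumptions, not deltacat defaults):

```python
# Self-contained sketch of the removed sizing heuristic; the example numbers
# are illustrative assumptions, not deltacat defaults.
def estimate_cpus_needed(
    total_memory_needed_gib: float,
    memory_per_cpu_gib: float,
    total_file_count: int,
    file_count_per_cpu: int,
) -> int:
    # Lower bound from memory: floor-divide and add one, as in the removed code.
    min_cpus_based_on_memory = int(total_memory_needed_gib // memory_per_cpu_gib) + 1
    # Lower bound from manifest file count.
    min_cpus_based_on_file_count = (total_file_count // file_count_per_cpu) + 1
    return max(min_cpus_based_on_memory, min_cpus_based_on_file_count)


# Example: 500 GiB estimated PyArrow footprint, 24 GiB usable per CPU,
# 2_000 manifest files, 200 files per CPU -> max(21, 11) = 21 CPUs.
assert estimate_cpus_needed(500, 24, 2_000, 200) == 21
```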