deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
@@ -1,52 +1,59 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
-
import ray
|
5
|
-
import os
|
6
4
|
import functools
|
7
5
|
import logging
|
8
|
-
|
6
|
+
import os
|
9
7
|
import pathlib
|
8
|
+
from typing import Dict, List, Optional, Set
|
10
9
|
|
11
|
-
|
12
|
-
|
13
|
-
from deltacat.compute.stats.models.delta_stats import DeltaStats
|
10
|
+
import ray
|
14
11
|
from ray.types import ObjectRef
|
15
12
|
|
16
13
|
from deltacat import logs
|
17
|
-
from deltacat.
|
18
|
-
from deltacat.compute.
|
19
|
-
|
14
|
+
from deltacat.compute.compactor import DeltaAnnotated
|
15
|
+
from deltacat.compute.metastats.model.stats_cluster_size_estimator import (
|
16
|
+
StatsClusterSizeEstimator,
|
17
|
+
)
|
18
|
+
from deltacat.compute.metastats.stats import start_stats_collection
|
19
|
+
from deltacat.compute.metastats.utils.constants import (
|
20
|
+
DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
|
21
|
+
DEFAULT_JOB_RUN_TRACE_ID,
|
22
|
+
HEAD_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO,
|
23
|
+
MANIFEST_FILE_COUNT_PER_CPU,
|
24
|
+
STATS_CLUSTER_R5_INSTANCE_TYPE,
|
25
|
+
WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO,
|
26
|
+
)
|
20
27
|
from deltacat.compute.metastats.utils.io import read_cached_partition_stats
|
28
|
+
from deltacat.compute.metastats.utils.pyarrow_memory_estimation_function import (
|
29
|
+
estimation_function,
|
30
|
+
)
|
31
|
+
from deltacat.compute.metastats.utils.ray_utils import replace_cluster_cfg_vars
|
32
|
+
from deltacat.compute.stats.models.delta_stats import DeltaStats
|
33
|
+
from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
|
34
|
+
from deltacat.compute.stats.utils.intervals import DeltaRange, merge_intervals
|
21
35
|
from deltacat.compute.stats.utils.io import get_deltas_from_range
|
22
|
-
from deltacat.
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
from deltacat.compute.metastats.model.stats_cluster_size_estimator import StatsClusterSizeEstimator
|
28
|
-
from deltacat.compute.metastats.utils.pyarrow_memory_estimation_function import estimation_function
|
29
|
-
|
30
|
-
from deltacat.storage import PartitionLocator, DeltaLocator, Delta
|
36
|
+
from deltacat.constants import (
|
37
|
+
BYTES_PER_GIBIBYTE,
|
38
|
+
PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS,
|
39
|
+
)
|
40
|
+
from deltacat.storage import Delta, DeltaLocator, PartitionLocator
|
31
41
|
from deltacat.storage import interface as unimplemented_deltacat_storage
|
32
|
-
|
33
|
-
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
|
34
|
-
from deltacat.compute.compactor import DeltaAnnotated
|
35
|
-
from deltacat.compute.metastats.stats import start_stats_collection
|
36
|
-
|
37
42
|
from deltacat.utils.performance import timed_invocation
|
38
43
|
|
39
44
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
40
45
|
|
41
46
|
|
42
|
-
def collect_metastats(
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
47
|
+
def collect_metastats(
|
48
|
+
source_partition_locators: List[PartitionLocator],
|
49
|
+
columns: Optional[List[str]] = None,
|
50
|
+
file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
|
51
|
+
stat_results_s3_bucket: Optional[str] = None,
|
52
|
+
metastats_results_s3_bucket: Optional[str] = None,
|
53
|
+
deltacat_storage=unimplemented_deltacat_storage,
|
54
|
+
*args,
|
55
|
+
**kwargs,
|
56
|
+
) -> Dict[str, Dict[int, DeltaStats]]:
|
50
57
|
|
51
58
|
# TODO: Add CompactionEventDispatcher for metastats collection started event
|
52
59
|
stats_res_all_partitions: Dict[str, Dict[int, DeltaStats]] = {}
|
@@ -68,7 +75,7 @@ def collect_metastats(source_partition_locators: List[PartitionLocator],
|
|
68
75
|
file_count_per_cpu=file_count_per_cpu,
|
69
76
|
deltacat_storage=deltacat_storage,
|
70
77
|
*args,
|
71
|
-
**kwargs
|
78
|
+
**kwargs,
|
72
79
|
)
|
73
80
|
stats_res_obj_ref_all_partitions[partition_value_string] = stats_res_obj_ref
|
74
81
|
for pv, stats_res_obj_ref in stats_res_obj_ref_all_partitions.items():
|
@@ -88,63 +95,80 @@ def collect_metastats(source_partition_locators: List[PartitionLocator],
|
|
88
95
|
|
89
96
|
for stats in stats_column_result.get("column_stats"):
|
90
97
|
partition_pyarrow_sum += stats.get("stats").get("pyarrowTableBytes")
|
91
|
-
aggregate_partition_stats_for_validation[partition_val] = [
|
92
|
-
|
98
|
+
aggregate_partition_stats_for_validation[partition_val] = [
|
99
|
+
partition_stats_sum_row_count,
|
100
|
+
partition_pyarrow_sum,
|
101
|
+
]
|
102
|
+
logger.info(
|
103
|
+
f"partitions_stats_result for partition value: {partition_val}: rowCount: {partition_stats_sum_row_count}; pyarrowTableBytes: {partition_pyarrow_sum}"
|
104
|
+
)
|
93
105
|
return aggregate_partition_stats_for_validation
|
94
106
|
|
95
107
|
# return stats_res_all_partitions
|
96
108
|
|
109
|
+
|
97
110
|
@ray.remote(num_cpus=1)
|
98
|
-
def collect_from_partition(
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
111
|
+
def collect_from_partition(
|
112
|
+
source_partition_locator: PartitionLocator,
|
113
|
+
partition_value_string,
|
114
|
+
partition_canonical_string,
|
115
|
+
delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
|
116
|
+
columns: Optional[List[str]] = None,
|
117
|
+
file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
|
118
|
+
stat_results_s3_bucket: Optional[str] = None,
|
119
|
+
metastats_results_s3_bucket: Optional[str] = None,
|
120
|
+
deltacat_storage=unimplemented_deltacat_storage,
|
121
|
+
*args,
|
122
|
+
**kwargs,
|
123
|
+
) -> ObjectRef[Dict[int, DeltaStats]]:
|
109
124
|
|
110
125
|
if not columns:
|
111
|
-
columns = deltacat_storage.get_table_version_column_names(
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
126
|
+
columns = deltacat_storage.get_table_version_column_names(
|
127
|
+
source_partition_locator.namespace,
|
128
|
+
source_partition_locator.table_name,
|
129
|
+
source_partition_locator.table_version,
|
130
|
+
)
|
131
|
+
deltas = _find_deltas(
|
132
|
+
source_partition_locator, delta_stream_position_range_set, deltacat_storage
|
133
|
+
)
|
117
134
|
|
118
135
|
logger.info(f"Find {len(deltas)} deltas!")
|
119
136
|
trace_id = DEFAULT_JOB_RUN_TRACE_ID
|
120
137
|
if "trace_id" in kwargs:
|
121
138
|
trace_id = kwargs.get("trace_id")
|
122
139
|
else:
|
123
|
-
logger.warning(
|
140
|
+
logger.warning(
|
141
|
+
f"No job run trace id specified, default to {DEFAULT_JOB_RUN_TRACE_ID}"
|
142
|
+
)
|
124
143
|
|
125
144
|
cpus_per_instance = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE
|
126
145
|
if cpus_per_instance in kwargs:
|
127
146
|
cpus_per_instance = kwargs.get("cpus_per_instance")
|
128
147
|
else:
|
129
|
-
logger.info(
|
148
|
+
logger.info(
|
149
|
+
f"Stats cluster CPUS per instance not specified, default to {DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE}"
|
150
|
+
)
|
130
151
|
|
131
152
|
stats_res_obj_ref = _start_all_stats_collection_from_deltas(
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
153
|
+
deltas,
|
154
|
+
partition_value_string,
|
155
|
+
partition_canonical_string,
|
156
|
+
columns,
|
157
|
+
trace_id,
|
158
|
+
file_count_per_cpu,
|
159
|
+
cpus_per_instance,
|
160
|
+
stat_results_s3_bucket,
|
161
|
+
metastats_results_s3_bucket,
|
162
|
+
deltacat_storage,
|
163
|
+
)
|
142
164
|
return stats_res_obj_ref
|
143
165
|
|
144
166
|
|
145
|
-
def _find_deltas(
|
146
|
-
|
147
|
-
|
167
|
+
def _find_deltas(
|
168
|
+
source_partition_locator: PartitionLocator,
|
169
|
+
delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
|
170
|
+
deltacat_storage=unimplemented_deltacat_storage,
|
171
|
+
) -> List[Delta]:
|
148
172
|
|
149
173
|
if delta_stream_position_range_set is None:
|
150
174
|
delta_stream_position_range_set = {(None, None)}
|
@@ -152,8 +176,9 @@ def _find_deltas(source_partition_locator: PartitionLocator,
|
|
152
176
|
|
153
177
|
for range_pair in merge_intervals(delta_stream_position_range_set):
|
154
178
|
begin, end = range_pair
|
155
|
-
promise: ObjectRef[List[Delta]] = get_deltas_from_range.remote(
|
156
|
-
|
179
|
+
promise: ObjectRef[List[Delta]] = get_deltas_from_range.remote(
|
180
|
+
source_partition_locator, begin, end, deltacat_storage
|
181
|
+
)
|
157
182
|
delta_range_lookup_pending.append(promise)
|
158
183
|
|
159
184
|
delta_list_by_ranges: List[List[Delta]] = ray.get(delta_range_lookup_pending)
|
@@ -162,25 +187,28 @@ def _find_deltas(source_partition_locator: PartitionLocator,
|
|
162
187
|
|
163
188
|
|
164
189
|
def _start_all_stats_collection_from_deltas(
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
190
|
+
deltas: List[Delta],
|
191
|
+
partition_value_string: Optional[str],
|
192
|
+
partition_canonical_string: Optional[str],
|
193
|
+
columns: Optional[List[str]] = None,
|
194
|
+
trace_id: Optional[str] = None,
|
195
|
+
file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
|
196
|
+
cpus_per_instance: Optional[int] = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
|
197
|
+
stat_results_s3_bucket: Optional[str] = None,
|
198
|
+
metastats_results_s3_bucket: Optional[str] = None,
|
199
|
+
deltacat_storage=unimplemented_deltacat_storage,
|
200
|
+
) -> Dict[int, DeltaStats]:
|
201
|
+
|
177
202
|
delta_stats_compute_list: List[DeltaLocator] = []
|
178
203
|
meta_stats_list_ready: List[DeltaLocator] = []
|
179
204
|
meta_stats_list_to_compute: List[DeltaLocator] = []
|
180
205
|
|
181
206
|
if stat_results_s3_bucket:
|
182
|
-
found_columns_stats_map: Dict[
|
183
|
-
|
207
|
+
found_columns_stats_map: Dict[
|
208
|
+
int, List[DeltaStatsCacheResult]
|
209
|
+
] = read_cached_partition_stats(
|
210
|
+
partition_canonical_string, stat_results_s3_bucket
|
211
|
+
)
|
184
212
|
|
185
213
|
delta_cache_res: List[DeltaStats] = []
|
186
214
|
for delta in deltas:
|
@@ -188,10 +216,11 @@ def _start_all_stats_collection_from_deltas(
|
|
188
216
|
cached_result = found_columns_stats_map[delta.stream_position]
|
189
217
|
if cached_result.hits:
|
190
218
|
delta_cache_res.append(cached_result.hits)
|
191
|
-
meta_stats_list_ready.append(
|
219
|
+
meta_stats_list_ready.append(
|
220
|
+
cached_result.hits.column_stats[0].manifest_stats.delta_locator
|
221
|
+
)
|
192
222
|
|
193
223
|
if cached_result.misses:
|
194
|
-
missed_column_names: List[str] = cached_result.misses.column_names
|
195
224
|
delta_locator: DeltaLocator = cached_result.misses.delta_locator
|
196
225
|
delta_stats_compute_list.append(delta_locator)
|
197
226
|
meta_stats_list_to_compute.append(delta_locator)
|
@@ -202,46 +231,55 @@ def _start_all_stats_collection_from_deltas(
|
|
202
231
|
logger.info(f"Collecting stats on {len(delta_stats_compute_list)} deltas!")
|
203
232
|
delta_stats_compute_res: Dict[int, DeltaStats] = {}
|
204
233
|
if delta_stats_compute_list:
|
205
|
-
delta_stats_compute_res = _start_metadata_stats_collection(
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
234
|
+
delta_stats_compute_res = _start_metadata_stats_collection(
|
235
|
+
delta_stats_compute_list=delta_stats_compute_list,
|
236
|
+
meta_stats_list_ready=meta_stats_list_ready,
|
237
|
+
meta_stats_list_to_compute=meta_stats_list_to_compute,
|
238
|
+
partition_value_string=partition_value_string,
|
239
|
+
partition_canonical_string=partition_canonical_string,
|
240
|
+
columns=columns,
|
241
|
+
trace_id=trace_id,
|
242
|
+
file_count_per_cpu=file_count_per_cpu,
|
243
|
+
cpus_per_instance=cpus_per_instance,
|
244
|
+
stat_results_s3_bucket=stat_results_s3_bucket,
|
245
|
+
metastats_results_s3_bucket=metastats_results_s3_bucket,
|
246
|
+
deltacat_storage=deltacat_storage,
|
247
|
+
)
|
217
248
|
|
218
249
|
delta_stream_range_stats: Dict[int, DeltaStats] = {}
|
219
250
|
for delta_column_stats in delta_cache_res:
|
220
|
-
assert
|
221
|
-
|
222
|
-
|
251
|
+
assert (
|
252
|
+
len(delta_column_stats.column_stats) > 0
|
253
|
+
), f"Expected columns of `{delta_column_stats}` to be non-empty"
|
254
|
+
stream_position = delta_column_stats.column_stats[
|
255
|
+
0
|
256
|
+
].manifest_stats.delta_locator.stream_position
|
223
257
|
delta_stream_range_stats[stream_position] = delta_column_stats
|
224
258
|
|
225
259
|
# stats collection result: if we have cached stats and missed column stats for same delta, stats collection for this delta is still needed
|
226
260
|
# and the final result will use the newly collected stats for this delta.
|
227
|
-
stats_collection_res: Dict[int, DeltaStats] = {
|
261
|
+
stats_collection_res: Dict[int, DeltaStats] = {
|
262
|
+
**delta_stream_range_stats,
|
263
|
+
**delta_stats_compute_res,
|
264
|
+
}
|
228
265
|
|
229
266
|
return stats_collection_res
|
230
267
|
|
231
268
|
|
232
269
|
def _start_metadata_stats_collection(
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
270
|
+
delta_stats_compute_list: List[DeltaLocator],
|
271
|
+
meta_stats_list_ready: List[DeltaLocator],
|
272
|
+
meta_stats_list_to_compute: List[DeltaLocator],
|
273
|
+
partition_value_string: Optional[str],
|
274
|
+
partition_canonical_string: Optional[str],
|
275
|
+
columns: Optional[List[str]] = None,
|
276
|
+
trace_id: Optional[str] = None,
|
277
|
+
file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
|
278
|
+
cpus_per_instance: Optional[int] = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
|
279
|
+
stat_results_s3_bucket: Optional[str] = None,
|
280
|
+
metastats_results_s3_bucket: Optional[str] = None,
|
281
|
+
deltacat_storage=unimplemented_deltacat_storage,
|
282
|
+
) -> Dict[int, DeltaStats]:
|
245
283
|
|
246
284
|
meta_stats_res_ready: Dict[int, int] = {}
|
247
285
|
|
@@ -254,7 +292,11 @@ def _start_metadata_stats_collection(
|
|
254
292
|
delta_meta_count += entry.meta.content_length
|
255
293
|
meta_stats_res_ready[delta.stream_position] = delta_meta_count
|
256
294
|
|
257
|
-
first_delta_locator =
|
295
|
+
first_delta_locator = (
|
296
|
+
meta_stats_list_ready[0]
|
297
|
+
if meta_stats_list_ready
|
298
|
+
else meta_stats_list_to_compute[0]
|
299
|
+
)
|
258
300
|
manifest = deltacat_storage.get_delta_manifest(first_delta_locator)
|
259
301
|
content_type = manifest.meta.content_type
|
260
302
|
content_encoding = manifest.meta.content_type
|
@@ -272,40 +314,42 @@ def _start_metadata_stats_collection(
|
|
272
314
|
delta_meta_count += entry.meta.content_length
|
273
315
|
meta_stats_to_compute[delta.stream_position] = delta_meta_count
|
274
316
|
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
content_type,
|
284
|
-
content_encoding)
|
317
|
+
batched_delta_stats_compute_list = _batch_deltas(
|
318
|
+
delta_stats_compute_list,
|
319
|
+
file_count_per_cpu,
|
320
|
+
cpus_per_instance,
|
321
|
+
deltacat_storage,
|
322
|
+
content_type,
|
323
|
+
content_encoding,
|
324
|
+
)
|
285
325
|
|
286
326
|
# out_cluster_cfg = _setup_stats_cluster(min_workers,
|
287
327
|
# partition_value_string,
|
288
328
|
# trace_id,
|
289
329
|
# cpus_per_instance)
|
290
330
|
out_cluster_cfg = None
|
291
|
-
delta_stats_res: Dict[int, DeltaStats] = _start_stats_cluster(
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
331
|
+
delta_stats_res: Dict[int, DeltaStats] = _start_stats_cluster(
|
332
|
+
out_cluster_cfg,
|
333
|
+
batched_delta_stats_compute_list,
|
334
|
+
columns,
|
335
|
+
stat_results_s3_bucket,
|
336
|
+
metastats_results_s3_bucket,
|
337
|
+
deltacat_storage,
|
338
|
+
partition_canonical_string,
|
339
|
+
)
|
298
340
|
|
299
341
|
return delta_stats_res
|
300
342
|
|
301
343
|
|
302
|
-
def _start_stats_cluster(
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
344
|
+
def _start_stats_cluster(
|
345
|
+
out_cluster_cfg: str,
|
346
|
+
batched_delta_stats_compute_list: List[DeltaAnnotated],
|
347
|
+
columns: List[str],
|
348
|
+
stat_results_s3_bucket: Optional[str] = None,
|
349
|
+
metastats_results_s3_bucket: Optional[str] = None,
|
350
|
+
deltacat_storage=unimplemented_deltacat_storage,
|
351
|
+
partition_val: Optional[str] = "partition_val",
|
352
|
+
):
|
309
353
|
# ray_up_latency = timed_invocation(
|
310
354
|
# func=ray_up,
|
311
355
|
# cluster_cfg=out_cluster_cfg
|
@@ -321,46 +365,72 @@ def _start_stats_cluster(out_cluster_cfg: str,
|
|
321
365
|
columns=columns,
|
322
366
|
stat_results_s3_bucket=stat_results_s3_bucket,
|
323
367
|
metastats_results_s3_bucket=metastats_results_s3_bucket,
|
324
|
-
deltacat_storage=deltacat_storage
|
368
|
+
deltacat_storage=deltacat_storage,
|
369
|
+
)
|
370
|
+
logger.info(
|
371
|
+
f"actual_stats_collection_latency: {partition_val}: {stats_collection_latency}"
|
325
372
|
)
|
326
|
-
logger.info(f"actual_stats_collection_latency: {partition_val}: {stats_collection_latency}")
|
327
373
|
# client.disconnect()
|
328
374
|
# ray_down(out_cluster_cfg)
|
329
375
|
# clean_up_cluster_cfg_file(out_cluster_cfg)
|
330
376
|
return delta_stream_range_stats
|
331
377
|
|
332
378
|
|
333
|
-
def _estimate_cpus_needed(
|
379
|
+
def _estimate_cpus_needed(
|
380
|
+
meta_stats_to_compute,
|
381
|
+
memory_gb_per_cpu,
|
382
|
+
file_count_per_cpu,
|
383
|
+
manifest_file_count_to_compute,
|
384
|
+
partition_val,
|
385
|
+
):
|
334
386
|
content_length_sum = 0
|
335
387
|
for val in meta_stats_to_compute.values():
|
336
388
|
content_length_sum += val
|
337
389
|
manifest_file_count_sum = 0
|
338
390
|
for val in manifest_file_count_to_compute.values():
|
339
391
|
manifest_file_count_sum += val
|
340
|
-
estimated_memory_bytes_needed =
|
392
|
+
estimated_memory_bytes_needed = (
|
393
|
+
content_length_sum * PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
|
394
|
+
)
|
341
395
|
estimated_memory_gib_needed = estimated_memory_bytes_needed / BYTES_PER_GIBIBYTE
|
342
396
|
|
343
|
-
logger.info(
|
397
|
+
logger.info(
|
398
|
+
f"estimated_memory_gib_needed: {partition_val} : {estimated_memory_gib_needed}"
|
399
|
+
)
|
344
400
|
logger.info(f"manifest_file_count_sum: {partition_val} : {manifest_file_count_sum}")
|
345
401
|
|
346
|
-
memory_per_cpu_available = memory_gb_per_cpu * (
|
347
|
-
|
348
|
-
|
402
|
+
memory_per_cpu_available = memory_gb_per_cpu * (
|
403
|
+
1 - WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO
|
404
|
+
)
|
405
|
+
estimator = StatsClusterSizeEstimator.of(
|
406
|
+
memory_per_cpu_available,
|
407
|
+
file_count_per_cpu,
|
408
|
+
estimated_memory_gib_needed,
|
409
|
+
manifest_file_count_sum,
|
410
|
+
)
|
349
411
|
min_cpus = StatsClusterSizeEstimator.estimate_cpus_needed(estimator)
|
350
412
|
return min_cpus
|
351
413
|
|
352
414
|
|
353
|
-
def _batch_deltas(
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
415
|
+
def _batch_deltas(
|
416
|
+
delta_stats_compute_list,
|
417
|
+
file_count_per_cpu,
|
418
|
+
cpu_per_instance,
|
419
|
+
deltacat_storage,
|
420
|
+
content_type,
|
421
|
+
content_encoding,
|
422
|
+
) -> List[DeltaAnnotated]:
|
423
|
+
worker_node_mem = (
|
424
|
+
cpu_per_instance
|
425
|
+
* (1 - WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO)
|
426
|
+
* BYTES_PER_GIBIBYTE
|
427
|
+
)
|
360
428
|
delta_list = []
|
361
429
|
|
362
430
|
estimate_based_on_content_length = functools.partial(
|
363
|
-
estimation_function,
|
431
|
+
estimation_function,
|
432
|
+
content_type=content_type,
|
433
|
+
content_encoding=content_encoding,
|
364
434
|
)
|
365
435
|
|
366
436
|
for delta_locator in delta_stats_compute_list:
|
@@ -381,8 +451,14 @@ def _batch_deltas(delta_stats_compute_list,
|
|
381
451
|
return rebatched_da_list
|
382
452
|
|
383
453
|
|
384
|
-
def _setup_stats_cluster(
|
385
|
-
|
454
|
+
def _setup_stats_cluster(
|
455
|
+
min_workers, partition_value_string, trace_id, cpus_per_instance
|
456
|
+
):
|
457
|
+
stats_cluster_instance_type = (
|
458
|
+
int(cpus_per_instance // 4)
|
459
|
+
if cpus_per_instance
|
460
|
+
else STATS_CLUSTER_R5_INSTANCE_TYPE
|
461
|
+
)
|
386
462
|
stats_cluster_instance_type_str = f"r5.{stats_cluster_instance_type}xlarge".strip()
|
387
463
|
parent_dir_path = pathlib.Path(__file__).parent.resolve()
|
388
464
|
in_cfg = os.path.join(parent_dir_path, "config", "stats_cluster_example.yaml")
|
@@ -393,7 +469,9 @@ def _setup_stats_cluster(min_workers, partition_value_string, trace_id, cpus_per
|
|
393
469
|
min_workers=min_workers,
|
394
470
|
head_type=stats_cluster_instance_type_str,
|
395
471
|
worker_type=stats_cluster_instance_type_str,
|
396
|
-
head_object_store_memory_pct=HEAD_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO*100,
|
397
|
-
worker_object_store_memory_pct=WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO
|
472
|
+
head_object_store_memory_pct=HEAD_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO * 100,
|
473
|
+
worker_object_store_memory_pct=WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO
|
474
|
+
* 100,
|
475
|
+
)
|
398
476
|
|
399
477
|
return out_cluster_cfg_file_path
|
@@ -1,20 +1,16 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
-
from
|
5
|
-
from typing import List, Dict, Optional, Set, Any, NamedTuple
|
4
|
+
from typing import Dict
|
6
5
|
|
7
6
|
from deltacat.compute.stats.models.delta_stats import DeltaStats
|
8
|
-
from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
|
9
|
-
from deltacat.compute.stats.models.stats_result import StatsResult
|
10
|
-
from deltacat.compute.stats.types import StatsType
|
11
|
-
from deltacat.storage import DeltaLocator
|
12
7
|
|
13
8
|
|
14
9
|
class PartitionStats(dict):
|
15
|
-
|
16
10
|
@staticmethod
|
17
|
-
def of(
|
11
|
+
def of(
|
12
|
+
delta_stats: Dict[DeltaStats], partition_canonical_string: str
|
13
|
+
) -> PartitionStats:
|
18
14
|
ps = PartitionStats()
|
19
15
|
ps["delta_stats"] = delta_stats
|
20
16
|
ps["partition_canonical_string"] = partition_canonical_string
|
@@ -25,7 +21,9 @@ class PartitionStats(dict):
|
|
25
21
|
delta_stats_dict = {}
|
26
22
|
for stream_position, delta_stats in partition_stats["delta_stats"].items():
|
27
23
|
delta_stats_dict[stream_position] = DeltaStats.build_from_dict(delta_stats)
|
28
|
-
return PartitionStats.of(
|
24
|
+
return PartitionStats.of(
|
25
|
+
delta_stats_dict, partition_stats["partition_canonical_string"]
|
26
|
+
)
|
29
27
|
|
30
28
|
@property
|
31
29
|
def delta_stats(self) -> Dict[DeltaStats]:
|
@@ -34,4 +32,3 @@ class PartitionStats(dict):
|
|
34
32
|
@property
|
35
33
|
def partition_canonical_string(self) -> str:
|
36
34
|
return self["partition_canonical_string"]
|
37
|
-
|
@@ -5,9 +5,13 @@ from deltacat.compute.stats.models.delta_stats import DeltaStats
|
|
5
5
|
|
6
6
|
|
7
7
|
class StatsClusterSizeEstimator(dict):
|
8
|
-
|
9
8
|
@staticmethod
|
10
|
-
def of(
|
9
|
+
def of(
|
10
|
+
memory_per_cpu: int,
|
11
|
+
file_count_per_cpu: int,
|
12
|
+
total_memory_needed: int,
|
13
|
+
total_file_count: int,
|
14
|
+
) -> DeltaStats:
|
11
15
|
estimator = StatsClusterSizeEstimator()
|
12
16
|
estimator["memory_per_cpu"] = memory_per_cpu
|
13
17
|
estimator["file_count_per_cpu"] = file_count_per_cpu
|
@@ -55,6 +59,10 @@ class StatsClusterSizeEstimator(dict):
|
|
55
59
|
# 4. If not enough stats for this content_type and content_encoding combination: use the basic PYARROW_INFLATION_MULTIPLIER instead.
|
56
60
|
# So, only option 4 is implemented here since this pre-requirement for first 3 options are not met for first round of metastats&stats collection.
|
57
61
|
|
58
|
-
min_cpus_based_on_memory = (
|
59
|
-
|
60
|
-
|
62
|
+
min_cpus_based_on_memory = (
|
63
|
+
estimator.total_memory_needed // estimator.memory_per_cpu
|
64
|
+
) + 1
|
65
|
+
min_cpus_based_on_file_count = (
|
66
|
+
estimator.total_file_count // estimator.file_count_per_cpu
|
67
|
+
) + 1
|
68
|
+
return max(min_cpus_based_on_memory, min_cpus_based_on_file_count)
|