deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +0 -1
- deltacat/compute/compactor/model/compact_partition_params.py +76 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
- deltacat/compute/compactor/model/delta_annotated.py +16 -9
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
- deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
- deltacat/compute/compactor_v2/utils/io.py +28 -14
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +128 -183
- deltacat/compute/resource_estimation/__init__.py +27 -0
- deltacat/compute/resource_estimation/delta.py +271 -0
- deltacat/compute/resource_estimation/manifest.py +394 -0
- deltacat/compute/resource_estimation/model.py +165 -0
- deltacat/compute/resource_estimation/parquet.py +108 -0
- deltacat/constants.py +5 -0
- deltacat/exceptions.py +2 -4
- deltacat/logs.py +8 -0
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
- deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
- deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
- deltacat/tests/compute/test_util_common.py +2 -0
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
- deltacat/tests/test_logs.py +34 -0
- deltacat/tests/test_utils/pyarrow.py +15 -5
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
- deltacat/compute/metastats/meta_stats.py +0 -479
- deltacat/compute/metastats/model/__init__.py +0 -0
- deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
- deltacat/compute/metastats/stats.py +0 -182
- deltacat/compute/metastats/utils/__init__.py +0 -0
- deltacat/compute/metastats/utils/constants.py +0 -16
- deltacat/compute/metastats/utils/io.py +0 -223
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
- deltacat/compute/metastats/utils/ray_utils.py +0 -129
- deltacat/compute/stats/basic.py +0 -226
- deltacat/compute/stats/models/__init__.py +0 -0
- deltacat/compute/stats/models/delta_column_stats.py +0 -98
- deltacat/compute/stats/models/delta_stats.py +0 -233
- deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
- deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
- deltacat/compute/stats/models/stats_result.py +0 -104
- deltacat/compute/stats/utils/__init__.py +0 -0
- deltacat/compute/stats/utils/intervals.py +0 -94
- deltacat/compute/stats/utils/io.py +0 -230
- deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
- deltacat/tests/stats/__init__.py +0 -0
- deltacat/tests/stats/test_intervals.py +0 -49
- /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
- /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
@@ -4,11 +4,8 @@ from deltacat import logs
|
|
4
4
|
from deltacat.compute.compactor_v2.model.merge_file_group import (
|
5
5
|
LocalMergeFileGroupsProvider,
|
6
6
|
)
|
7
|
-
from deltacat.types.media import ContentEncoding, ContentType
|
8
|
-
from deltacat.types.partial_download import PartialParquetParameters
|
9
7
|
from deltacat.storage import (
|
10
8
|
Manifest,
|
11
|
-
ManifestEntry,
|
12
9
|
interface as unimplemented_deltacat_storage,
|
13
10
|
)
|
14
11
|
from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
|
@@ -16,50 +13,25 @@ from deltacat.compute.compactor.model.round_completion_info import RoundCompleti
|
|
16
13
|
from deltacat.compute.compactor_v2.utils.primary_key_index import (
|
17
14
|
hash_group_index_to_hash_bucket_indices,
|
18
15
|
)
|
19
|
-
from deltacat.compute.
|
20
|
-
|
16
|
+
from deltacat.compute.resource_estimation.manifest import (
|
17
|
+
estimate_manifest_entry_num_rows,
|
18
|
+
estimate_manifest_entry_size_bytes,
|
19
|
+
estimate_manifest_entry_column_size_bytes,
|
20
|
+
)
|
21
|
+
from deltacat.compute.resource_estimation.model import (
|
22
|
+
EstimateResourcesParams,
|
23
|
+
OperationType,
|
21
24
|
)
|
22
25
|
from deltacat.exceptions import RetryableError
|
23
26
|
|
24
27
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
25
28
|
|
26
29
|
|
27
|
-
def
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
and entry.meta.content_type == ContentType.PARQUET
|
33
|
-
and entry.meta.content_encoding == ContentEncoding.IDENTITY
|
34
|
-
and entry.meta.content_type_parameters
|
35
|
-
):
|
36
|
-
for type_params in entry.meta.content_type_parameters:
|
37
|
-
if isinstance(type_params, PartialParquetParameters):
|
38
|
-
return type_params
|
39
|
-
return None
|
40
|
-
|
41
|
-
|
42
|
-
def _calculate_parquet_column_size(
|
43
|
-
type_params: PartialParquetParameters, columns: List[str]
|
44
|
-
):
|
45
|
-
column_size = 0.0
|
46
|
-
for rg in type_params.row_groups_to_download:
|
47
|
-
columns_found = 0
|
48
|
-
row_group_meta = type_params.pq_metadata.row_group(rg)
|
49
|
-
for col in range(row_group_meta.num_columns):
|
50
|
-
column_meta = row_group_meta.column(col)
|
51
|
-
if column_meta.path_in_schema in columns:
|
52
|
-
columns_found += 1
|
53
|
-
column_size += column_meta.total_uncompressed_size
|
54
|
-
assert columns_found == len(columns), (
|
55
|
-
"Columns not found in the parquet data as "
|
56
|
-
f"{columns_found} != {len(columns)}"
|
57
|
-
)
|
58
|
-
return column_size * PARQUET_TO_PYARROW_INFLATION
|
59
|
-
|
60
|
-
|
61
|
-
def get_task_options(
|
62
|
-
cpu: float, memory: float, ray_custom_resources: Optional[Dict] = None
|
30
|
+
def _get_task_options(
|
31
|
+
cpu: float,
|
32
|
+
memory: float,
|
33
|
+
ray_custom_resources: Optional[Dict] = None,
|
34
|
+
scheduling_strategy: str = "SPREAD",
|
63
35
|
) -> Dict:
|
64
36
|
|
65
37
|
# NOTE: With DEFAULT scheduling strategy in Ray 2.20.0, autoscaler does
|
@@ -67,7 +39,11 @@ def get_task_options(
|
|
67
39
|
# 20 tasks get scheduled out of 100 tasks in queue. Hence, we use SPREAD
|
68
40
|
# which is also ideal for merge and hash bucket tasks.
|
69
41
|
# https://docs.ray.io/en/latest/ray-core/scheduling/index.html
|
70
|
-
task_opts = {
|
42
|
+
task_opts = {
|
43
|
+
"num_cpus": cpu,
|
44
|
+
"memory": memory,
|
45
|
+
"scheduling_strategy": scheduling_strategy,
|
46
|
+
}
|
71
47
|
|
72
48
|
if ray_custom_resources:
|
73
49
|
task_opts["resources"] = ray_custom_resources
|
@@ -81,53 +57,110 @@ def get_task_options(
|
|
81
57
|
return task_opts
|
82
58
|
|
83
59
|
|
84
|
-
def
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
60
|
+
def _get_merge_task_options(
|
61
|
+
index: int,
|
62
|
+
hb_group_idx: int,
|
63
|
+
data_size: float,
|
64
|
+
pk_size_bytes: float,
|
65
|
+
num_rows: int,
|
66
|
+
num_hash_groups: int,
|
67
|
+
total_memory_buffer_percentage: int,
|
68
|
+
incremental_index_array_size: int,
|
69
|
+
debug_memory_params: Dict[str, Any],
|
70
|
+
ray_custom_resources: Optional[Dict],
|
71
|
+
estimate_resources_params: EstimateResourcesParams,
|
72
|
+
round_completion_info: Optional[RoundCompletionInfo] = None,
|
73
|
+
compacted_delta_manifest: Optional[Manifest] = None,
|
74
|
+
primary_keys: Optional[List[str]] = None,
|
75
|
+
deltacat_storage=unimplemented_deltacat_storage,
|
76
|
+
deltacat_storage_kwargs: Optional[Dict] = {},
|
77
|
+
memory_logs_enabled: Optional[bool] = None,
|
78
|
+
) -> Dict[str, Any]:
|
79
|
+
if (
|
80
|
+
round_completion_info
|
81
|
+
and compacted_delta_manifest
|
82
|
+
and round_completion_info.hb_index_to_entry_range
|
83
|
+
):
|
94
84
|
|
95
|
-
|
85
|
+
previous_inflation = (
|
86
|
+
round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
|
87
|
+
/ round_completion_info.compacted_pyarrow_write_result.file_bytes
|
88
|
+
)
|
89
|
+
debug_memory_params["previous_inflation"] = previous_inflation
|
96
90
|
|
91
|
+
average_record_size = (
|
92
|
+
round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
|
93
|
+
/ round_completion_info.compacted_pyarrow_write_result.records
|
94
|
+
)
|
95
|
+
debug_memory_params["average_record_size"] = average_record_size
|
97
96
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
previous_inflation: float,
|
102
|
-
**kwargs,
|
103
|
-
) -> int:
|
104
|
-
if entry.meta.record_count:
|
105
|
-
return entry.meta.record_count
|
97
|
+
iterable = hash_group_index_to_hash_bucket_indices(
|
98
|
+
hb_group_idx, round_completion_info.hash_bucket_count, num_hash_groups
|
99
|
+
)
|
106
100
|
|
107
|
-
|
101
|
+
for hb_idx in iterable:
|
102
|
+
if round_completion_info.hb_index_to_entry_range.get(str(hb_idx)) is None:
|
103
|
+
continue
|
108
104
|
|
109
|
-
|
110
|
-
|
105
|
+
entry_start, entry_end = round_completion_info.hb_index_to_entry_range[
|
106
|
+
str(hb_idx)
|
107
|
+
]
|
108
|
+
for entry_index in range(entry_start, entry_end):
|
109
|
+
entry = compacted_delta_manifest.entries[entry_index]
|
111
110
|
|
112
|
-
|
113
|
-
|
114
|
-
|
111
|
+
current_entry_size = estimate_manifest_entry_size_bytes(
|
112
|
+
entry=entry,
|
113
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
114
|
+
estimate_resources_params=estimate_resources_params,
|
115
|
+
)
|
116
|
+
current_entry_rows = estimate_manifest_entry_num_rows(
|
117
|
+
entry=entry,
|
118
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
119
|
+
estimate_resources_params=estimate_resources_params,
|
120
|
+
)
|
115
121
|
|
116
|
-
|
122
|
+
data_size += current_entry_size
|
123
|
+
num_rows += current_entry_rows
|
117
124
|
|
125
|
+
if primary_keys:
|
126
|
+
pk_size = estimate_manifest_entry_column_size_bytes(
|
127
|
+
entry=entry,
|
128
|
+
columns=primary_keys,
|
129
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
130
|
+
estimate_resources_params=estimate_resources_params,
|
131
|
+
)
|
118
132
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
return 0
|
133
|
+
if pk_size is None:
|
134
|
+
pk_size_bytes += current_entry_size
|
135
|
+
else:
|
136
|
+
pk_size_bytes += pk_size
|
124
137
|
|
125
|
-
|
138
|
+
# total data downloaded + primary key hash column + pyarrow-to-numpy conversion
|
139
|
+
# + primary key column + hashlib inefficiency + dict size for merge + incremental index array size
|
140
|
+
total_memory = (
|
141
|
+
data_size
|
142
|
+
+ pk_size_bytes
|
143
|
+
+ pk_size_bytes
|
144
|
+
+ num_rows * 20
|
145
|
+
+ num_rows * 20
|
146
|
+
+ num_rows * 20
|
147
|
+
+ incremental_index_array_size
|
148
|
+
)
|
149
|
+
debug_memory_params["data_size"] = data_size
|
150
|
+
debug_memory_params["num_rows"] = num_rows
|
151
|
+
debug_memory_params["pk_size_bytes"] = pk_size_bytes
|
152
|
+
debug_memory_params["incremental_index_array_size"] = incremental_index_array_size
|
153
|
+
debug_memory_params["total_memory"] = total_memory
|
154
|
+
debug_memory_params["estimate_resources_params"] = estimate_resources_params
|
126
155
|
|
127
|
-
|
128
|
-
|
156
|
+
total_memory = total_memory * (1 + total_memory_buffer_percentage / 100.0)
|
157
|
+
debug_memory_params["total_memory_with_buffer"] = total_memory
|
158
|
+
logger.debug_conditional(
|
159
|
+
f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
|
160
|
+
memory_logs_enabled,
|
161
|
+
)
|
129
162
|
|
130
|
-
return
|
163
|
+
return _get_task_options(0.01, total_memory, ray_custom_resources)
|
131
164
|
|
132
165
|
|
133
166
|
def hash_bucket_resource_options_provider(
|
@@ -136,6 +169,7 @@ def hash_bucket_resource_options_provider(
|
|
136
169
|
previous_inflation: float,
|
137
170
|
average_record_size_bytes: float,
|
138
171
|
total_memory_buffer_percentage: int,
|
172
|
+
estimate_resources_params: EstimateResourcesParams,
|
139
173
|
primary_keys: List[str] = None,
|
140
174
|
ray_custom_resources: Optional[Dict] = None,
|
141
175
|
memory_logs_enabled: Optional[bool] = None,
|
@@ -153,19 +187,25 @@ def hash_bucket_resource_options_provider(
|
|
153
187
|
|
154
188
|
for entry in item.manifest.entries:
|
155
189
|
entry_size = estimate_manifest_entry_size_bytes(
|
156
|
-
entry=entry,
|
190
|
+
entry=entry,
|
191
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
192
|
+
estimate_resources_params=estimate_resources_params,
|
193
|
+
**kwargs,
|
157
194
|
)
|
158
195
|
num_rows += estimate_manifest_entry_num_rows(
|
159
196
|
entry=entry,
|
160
|
-
|
161
|
-
|
197
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
198
|
+
estimate_resources_params=estimate_resources_params,
|
199
|
+
**kwargs,
|
162
200
|
)
|
163
201
|
size_bytes += entry_size
|
164
202
|
|
165
203
|
if primary_keys:
|
166
204
|
pk_size = estimate_manifest_entry_column_size_bytes(
|
167
205
|
entry=entry,
|
206
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
168
207
|
columns=primary_keys,
|
208
|
+
estimate_resources_params=estimate_resources_params,
|
169
209
|
)
|
170
210
|
|
171
211
|
if pk_size is None:
|
@@ -187,6 +227,7 @@ def hash_bucket_resource_options_provider(
|
|
187
227
|
debug_memory_params["num_rows"] = num_rows
|
188
228
|
debug_memory_params["total_pk_size"] = total_pk_size
|
189
229
|
debug_memory_params["total_memory"] = total_memory
|
230
|
+
debug_memory_params["estimate_resources_params"] = estimate_resources_params
|
190
231
|
|
191
232
|
debug_memory_params["previous_inflation"] = previous_inflation
|
192
233
|
debug_memory_params["average_record_size_bytes"] = average_record_size_bytes
|
@@ -199,7 +240,7 @@ def hash_bucket_resource_options_provider(
|
|
199
240
|
memory_logs_enabled,
|
200
241
|
)
|
201
242
|
|
202
|
-
return get_task_options(0.01, total_memory, ray_custom_resources)
|
243
|
+
return _get_task_options(0.01, total_memory, ray_custom_resources)
|
203
244
|
|
204
245
|
|
205
246
|
def merge_resource_options_provider(
|
@@ -209,6 +250,7 @@ def merge_resource_options_provider(
|
|
209
250
|
hash_group_size_bytes: Dict[int, int],
|
210
251
|
hash_group_num_rows: Dict[int, int],
|
211
252
|
total_memory_buffer_percentage: int,
|
253
|
+
estimate_resources_params: EstimateResourcesParams,
|
212
254
|
round_completion_info: Optional[RoundCompletionInfo] = None,
|
213
255
|
compacted_delta_manifest: Optional[Manifest] = None,
|
214
256
|
ray_custom_resources: Optional[Dict] = None,
|
@@ -230,7 +272,7 @@ def merge_resource_options_provider(
|
|
230
272
|
pk_size_bytes = data_size
|
231
273
|
incremental_index_array_size = num_rows * 4
|
232
274
|
|
233
|
-
return get_merge_task_options(
|
275
|
+
return _get_merge_task_options(
|
234
276
|
index,
|
235
277
|
hb_group_idx,
|
236
278
|
data_size,
|
@@ -247,6 +289,7 @@ def merge_resource_options_provider(
|
|
247
289
|
deltacat_storage=deltacat_storage,
|
248
290
|
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
249
291
|
memory_logs_enabled=memory_logs_enabled,
|
292
|
+
estimate_resources_params=estimate_resources_params,
|
250
293
|
)
|
251
294
|
|
252
295
|
|
@@ -254,6 +297,7 @@ def local_merge_resource_options_provider(
|
|
254
297
|
estimated_da_size: float,
|
255
298
|
estimated_num_rows: int,
|
256
299
|
total_memory_buffer_percentage: int,
|
300
|
+
estimate_resources_params: EstimateResourcesParams,
|
257
301
|
round_completion_info: Optional[RoundCompletionInfo] = None,
|
258
302
|
compacted_delta_manifest: Optional[Manifest] = None,
|
259
303
|
ray_custom_resources: Optional[Dict] = None,
|
@@ -270,7 +314,7 @@ def local_merge_resource_options_provider(
|
|
270
314
|
pk_size_bytes = estimated_da_size
|
271
315
|
incremental_index_array_size = estimated_num_rows * 4
|
272
316
|
|
273
|
-
return get_merge_task_options(
|
317
|
+
return _get_merge_task_options(
|
274
318
|
index=index,
|
275
319
|
hb_group_idx=hb_group_idx,
|
276
320
|
data_size=estimated_da_size,
|
@@ -287,104 +331,5 @@ def local_merge_resource_options_provider(
|
|
287
331
|
deltacat_storage=deltacat_storage,
|
288
332
|
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
289
333
|
memory_logs_enabled=memory_logs_enabled,
|
334
|
+
estimate_resources_params=estimate_resources_params,
|
290
335
|
)
|
291
|
-
|
292
|
-
|
293
|
-
def get_merge_task_options(
|
294
|
-
index: int,
|
295
|
-
hb_group_idx: int,
|
296
|
-
data_size: float,
|
297
|
-
pk_size_bytes: float,
|
298
|
-
num_rows: int,
|
299
|
-
num_hash_groups: int,
|
300
|
-
total_memory_buffer_percentage: int,
|
301
|
-
incremental_index_array_size: int,
|
302
|
-
debug_memory_params: Dict[str, Any],
|
303
|
-
ray_custom_resources: Optional[Dict],
|
304
|
-
round_completion_info: Optional[RoundCompletionInfo] = None,
|
305
|
-
compacted_delta_manifest: Optional[Manifest] = None,
|
306
|
-
primary_keys: Optional[List[str]] = None,
|
307
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
308
|
-
deltacat_storage_kwargs: Optional[Dict] = {},
|
309
|
-
memory_logs_enabled: Optional[bool] = None,
|
310
|
-
) -> Dict[str, Any]:
|
311
|
-
if (
|
312
|
-
round_completion_info
|
313
|
-
and compacted_delta_manifest
|
314
|
-
and round_completion_info.hb_index_to_entry_range
|
315
|
-
):
|
316
|
-
|
317
|
-
previous_inflation = (
|
318
|
-
round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
|
319
|
-
/ round_completion_info.compacted_pyarrow_write_result.file_bytes
|
320
|
-
)
|
321
|
-
debug_memory_params["previous_inflation"] = previous_inflation
|
322
|
-
|
323
|
-
average_record_size = (
|
324
|
-
round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
|
325
|
-
/ round_completion_info.compacted_pyarrow_write_result.records
|
326
|
-
)
|
327
|
-
debug_memory_params["average_record_size"] = average_record_size
|
328
|
-
|
329
|
-
iterable = hash_group_index_to_hash_bucket_indices(
|
330
|
-
hb_group_idx, round_completion_info.hash_bucket_count, num_hash_groups
|
331
|
-
)
|
332
|
-
|
333
|
-
for hb_idx in iterable:
|
334
|
-
if round_completion_info.hb_index_to_entry_range.get(str(hb_idx)) is None:
|
335
|
-
continue
|
336
|
-
|
337
|
-
entry_start, entry_end = round_completion_info.hb_index_to_entry_range[
|
338
|
-
str(hb_idx)
|
339
|
-
]
|
340
|
-
for entry_index in range(entry_start, entry_end):
|
341
|
-
entry = compacted_delta_manifest.entries[entry_index]
|
342
|
-
|
343
|
-
current_entry_size = estimate_manifest_entry_size_bytes(
|
344
|
-
entry=entry, previous_inflation=previous_inflation
|
345
|
-
)
|
346
|
-
current_entry_rows = estimate_manifest_entry_num_rows(
|
347
|
-
entry=entry,
|
348
|
-
average_record_size_bytes=average_record_size,
|
349
|
-
previous_inflation=previous_inflation,
|
350
|
-
)
|
351
|
-
|
352
|
-
data_size += current_entry_size
|
353
|
-
num_rows += current_entry_rows
|
354
|
-
|
355
|
-
if primary_keys:
|
356
|
-
pk_size = estimate_manifest_entry_column_size_bytes(
|
357
|
-
entry=entry,
|
358
|
-
columns=primary_keys,
|
359
|
-
)
|
360
|
-
|
361
|
-
if pk_size is None:
|
362
|
-
pk_size_bytes += current_entry_size
|
363
|
-
else:
|
364
|
-
pk_size_bytes += pk_size
|
365
|
-
|
366
|
-
# total data downloaded + primary key hash column + pyarrow-to-numpy conversion
|
367
|
-
# + primary key column + hashlib inefficiency + dict size for merge + incremental index array size
|
368
|
-
total_memory = (
|
369
|
-
data_size
|
370
|
-
+ pk_size_bytes
|
371
|
-
+ pk_size_bytes
|
372
|
-
+ num_rows * 20
|
373
|
-
+ num_rows * 20
|
374
|
-
+ num_rows * 20
|
375
|
-
+ incremental_index_array_size
|
376
|
-
)
|
377
|
-
debug_memory_params["data_size"] = data_size
|
378
|
-
debug_memory_params["num_rows"] = num_rows
|
379
|
-
debug_memory_params["pk_size_bytes"] = pk_size_bytes
|
380
|
-
debug_memory_params["incremental_index_array_size"] = incremental_index_array_size
|
381
|
-
debug_memory_params["total_memory"] = total_memory
|
382
|
-
|
383
|
-
total_memory = total_memory * (1 + total_memory_buffer_percentage / 100.0)
|
384
|
-
debug_memory_params["total_memory_with_buffer"] = total_memory
|
385
|
-
logger.debug_conditional(
|
386
|
-
f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
|
387
|
-
memory_logs_enabled,
|
388
|
-
)
|
389
|
-
|
390
|
-
return get_task_options(0.01, total_memory, ray_custom_resources)
|
@@ -0,0 +1,27 @@
|
|
1
|
+
from deltacat.compute.resource_estimation.model import (
|
2
|
+
ResourceEstimationMethod,
|
3
|
+
EstimatedResources,
|
4
|
+
Statistics,
|
5
|
+
EstimateResourcesParams,
|
6
|
+
OperationType,
|
7
|
+
)
|
8
|
+
from deltacat.compute.resource_estimation.manifest import (
|
9
|
+
estimate_manifest_entry_column_size_bytes,
|
10
|
+
estimate_manifest_entry_num_rows,
|
11
|
+
estimate_manifest_entry_size_bytes,
|
12
|
+
)
|
13
|
+
from deltacat.compute.resource_estimation.delta import (
|
14
|
+
estimate_resources_required_to_process_delta,
|
15
|
+
)
|
16
|
+
|
17
|
+
__all__ = [
|
18
|
+
"ResourceEstimationMethod",
|
19
|
+
"EstimatedResources",
|
20
|
+
"EstimateResourcesParams",
|
21
|
+
"Statistics",
|
22
|
+
"estimate_resources_required_to_process_delta",
|
23
|
+
"estimate_manifest_entry_size_bytes",
|
24
|
+
"estimate_manifest_entry_num_rows",
|
25
|
+
"estimate_manifest_entry_column_size_bytes",
|
26
|
+
"OperationType",
|
27
|
+
]
|