deltacat 1.1.18__py3-none-any.whl → 1.1.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/model/compact_partition_params.py +76 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
- deltacat/compute/compactor/model/delta_annotated.py +16 -9
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +11 -5
- deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
- deltacat/compute/compactor_v2/utils/io.py +28 -14
- deltacat/compute/compactor_v2/utils/task_options.py +128 -183
- deltacat/compute/resource_estimation/__init__.py +27 -0
- deltacat/compute/resource_estimation/delta.py +271 -0
- deltacat/compute/resource_estimation/manifest.py +394 -0
- deltacat/compute/resource_estimation/model.py +165 -0
- deltacat/compute/resource_estimation/parquet.py +108 -0
- deltacat/constants.py +5 -0
- deltacat/logs.py +8 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
- deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
- deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
- deltacat/tests/compute/test_util_common.py +2 -0
- deltacat/tests/test_logs.py +34 -0
- deltacat/tests/test_utils/pyarrow.py +15 -5
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/METADATA +2 -2
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/RECORD +30 -46
- deltacat/compute/metastats/meta_stats.py +0 -479
- deltacat/compute/metastats/model/__init__.py +0 -0
- deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
- deltacat/compute/metastats/stats.py +0 -182
- deltacat/compute/metastats/utils/__init__.py +0 -0
- deltacat/compute/metastats/utils/constants.py +0 -16
- deltacat/compute/metastats/utils/io.py +0 -223
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
- deltacat/compute/metastats/utils/ray_utils.py +0 -129
- deltacat/compute/stats/basic.py +0 -226
- deltacat/compute/stats/models/__init__.py +0 -0
- deltacat/compute/stats/models/delta_column_stats.py +0 -98
- deltacat/compute/stats/models/delta_stats.py +0 -233
- deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
- deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
- deltacat/compute/stats/models/stats_result.py +0 -104
- deltacat/compute/stats/utils/__init__.py +0 -0
- deltacat/compute/stats/utils/intervals.py +0 -94
- deltacat/compute/stats/utils/io.py +0 -230
- deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
- deltacat/tests/stats/__init__.py +0 -0
- deltacat/tests/stats/test_intervals.py +0 -49
- /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
- /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/LICENSE +0 -0
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/WHEEL +0 -0
- {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,271 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Optional, Dict, Any
|
3
|
+
from deltacat import logs
|
4
|
+
from deltacat.storage import (
|
5
|
+
Delta,
|
6
|
+
interface as unimplemented_deltacat_storage,
|
7
|
+
)
|
8
|
+
from deltacat.compute.compactor_v2.utils.content_type_params import (
|
9
|
+
append_content_type_params,
|
10
|
+
)
|
11
|
+
from deltacat.compute.resource_estimation.model import (
|
12
|
+
OperationType,
|
13
|
+
EstimateResourcesParams,
|
14
|
+
ResourceEstimationMethod,
|
15
|
+
EstimatedResources,
|
16
|
+
Statistics,
|
17
|
+
)
|
18
|
+
from deltacat.compute.resource_estimation.manifest import (
|
19
|
+
estimate_manifest_entry_size_bytes,
|
20
|
+
estimate_manifest_entry_num_rows,
|
21
|
+
)
|
22
|
+
|
23
|
+
# Module-level logger for this file, set up through deltacat's logging helper.
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
24
|
+
|
25
|
+
|
26
|
+
def _estimate_resources_required_to_process_delta_using_previous_inflation(
    delta: Delta,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    deltacat_storage: unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Dict[str, Any],
    **kwargs,
) -> Optional[EstimatedResources]:
    """
    Estimate the resources for a delta by scaling its on-disk size with a
    previously observed inflation factor.

    The in-memory size is ``delta.meta.content_length * previous_inflation``
    and the record count is that size divided by
    ``average_record_size_bytes``, truncated to an int.

    Args:
        delta: Delta whose ``meta.content_length`` is used as on-disk size.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        estimate_resources_params: Supplies ``previous_inflation`` and
            ``average_record_size_bytes``.
        deltacat_storage: Unused here; accepted so every delta estimator
            shares one call signature.
        deltacat_storage_kwargs: Unused here (see ``deltacat_storage``).
        **kwargs: Ignored.

    Returns:
        The estimated resources for the delta.

    Raises:
        AssertionError: If the operation type is unsupported or
            ``previous_inflation`` is missing or zero.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"

    inflation = estimate_resources_params.previous_inflation
    assert inflation, "Previous inflation must be provided to estimate delta size"

    on_disk_bytes = delta.meta.content_length
    estimated_bytes = on_disk_bytes * inflation

    # NOTE: average_record_size_bytes is not validated here; it must be a
    # non-zero number for this division to succeed.
    estimated_rows = int(
        estimated_bytes / estimate_resources_params.average_record_size_bytes
    )

    stats = Statistics.of(
        in_memory_size_bytes=estimated_bytes,
        record_count=estimated_rows,
        on_disk_size_bytes=on_disk_bytes,
    )
    return EstimatedResources.of(memory_bytes=estimated_bytes, statistics=stats)
|
55
|
+
|
56
|
+
|
57
|
+
def _estimate_resources_required_to_process_delta_using_type_params(
    delta: Delta,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    deltacat_storage: unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Dict[str, Any],
    **kwargs,
) -> Optional[EstimatedResources]:
    """
    Estimate the resources for a delta by summing per-entry estimates of
    its manifest entries.

    The manifest is fetched through ``deltacat_storage`` when the delta does
    not already carry one, content type params are appended to the delta, and
    then every entry is passed to ``estimate_manifest_entry_size_bytes`` and
    ``estimate_manifest_entry_num_rows``.

    Args:
        delta: Delta to estimate; its ``manifest`` may be populated in place.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        estimate_resources_params: Estimation settings; when its
            ``parquet_to_pyarrow_inflation`` is None nothing is estimated.
        deltacat_storage: Storage used to fetch the manifest.
        deltacat_storage_kwargs: Extra keyword arguments for storage calls.
        **kwargs: Forwarded to the per-entry estimators.

    Returns:
        The summed estimate, a zero estimate when the delta has no manifest
        entries, or None when the inflation is missing or any single entry
        cannot be estimated.

    Raises:
        AssertionError: If the operation type is unsupported.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"

    if estimate_resources_params.parquet_to_pyarrow_inflation is None:
        return None

    # Fetch the manifest only when the delta does not already hold one.
    if not delta.manifest:
        delta.manifest = deltacat_storage.get_delta_manifest(
            delta.locator, **deltacat_storage_kwargs
        )

    if not (delta.manifest and delta.manifest.entries):
        # Nothing to download, so nothing will be held in memory.
        empty_stats = Statistics.of(
            in_memory_size_bytes=0,
            record_count=0,
            on_disk_size_bytes=delta.meta.content_length,
        )
        return EstimatedResources.of(memory_bytes=0, statistics=empty_stats)

    append_content_type_params(
        delta=delta,
        deltacat_storage=deltacat_storage,
        deltacat_storage_kwargs=deltacat_storage_kwargs,
    )

    total_bytes = 0.0
    total_rows = 0

    for manifest_entry in delta.manifest.entries:
        entry_bytes = estimate_manifest_entry_size_bytes(
            entry=manifest_entry,
            operation_type=operation_type,
            estimate_resources_params=estimate_resources_params,
            **kwargs,
        )
        entry_rows = estimate_manifest_entry_num_rows(
            entry=manifest_entry,
            operation_type=operation_type,
            estimate_resources_params=estimate_resources_params,
            **kwargs,
        )

        # A single unknown entry makes the whole delta estimate unknown.
        if entry_bytes is None or entry_rows is None:
            return None

        total_bytes += entry_bytes
        total_rows += entry_rows

    stats = Statistics.of(
        in_memory_size_bytes=total_bytes,
        record_count=total_rows,
        on_disk_size_bytes=delta.meta.content_length,
    )
    return EstimatedResources.of(memory_bytes=total_bytes, statistics=stats)
|
126
|
+
|
127
|
+
|
128
|
+
def _estimate_resources_required_to_process_delta_using_file_sampling(
    delta: Delta,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    deltacat_storage: unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Dict[str, Any],
    **kwargs,
) -> Optional[EstimatedResources]:
    """
    Estimate the resources for a delta by downloading a sample of its
    manifest entries and extrapolating to the whole delta.

    Up to ``max_files_to_sample`` entries (taken from the start of the
    manifest) are downloaded. The ratio of their in-memory size to their
    on-disk size is applied to ``delta.meta.content_length``, and the sampled
    row count is scaled by the same factor.

    Args:
        delta: Delta to estimate; its ``manifest`` may be populated in place.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        estimate_resources_params: Supplies ``max_files_to_sample``.
        deltacat_storage: Storage used to fetch the manifest and download
            the sampled entries.
        deltacat_storage_kwargs: Extra keyword arguments for storage calls.
        **kwargs: Ignored.

    Returns:
        The extrapolated estimate; a zero estimate when the delta has no
        manifest entries or the sampled entries have no on-disk size; or
        None when ``max_files_to_sample`` is unset or zero.

    Raises:
        AssertionError: If the operation type is unsupported.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"

    if not delta.manifest:
        delta.manifest = deltacat_storage.get_delta_manifest(
            delta.locator,
            **deltacat_storage_kwargs,
        )

    if not delta.manifest or not delta.manifest.entries:
        return EstimatedResources.of(
            memory_bytes=0,
            statistics=Statistics.of(
                in_memory_size_bytes=0,
                record_count=0,
                on_disk_size_bytes=delta.meta.content_length,
            ),
        )

    if not estimate_resources_params.max_files_to_sample:
        # we cannot calculate if we cannot sample
        return None

    sampled_in_memory_size = 0.0
    sampled_on_disk_size = 0.0
    sampled_num_rows = 0

    num_files_to_sample = min(
        estimate_resources_params.max_files_to_sample, len(delta.manifest.entries)
    )
    for entry_index in range(num_files_to_sample):
        tbl = deltacat_storage.download_delta_manifest_entry(
            delta,
            entry_index,
            **deltacat_storage_kwargs,
        )
        sampled_in_memory_size += tbl.nbytes
        sampled_on_disk_size += delta.manifest.entries[entry_index].meta.content_length
        sampled_num_rows += len(tbl)

    if not sampled_on_disk_size:
        return EstimatedResources.of(
            memory_bytes=0,
            statistics=Statistics.of(
                in_memory_size_bytes=0,
                record_count=0,
                on_disk_size_bytes=delta.meta.content_length,
            ),
        )

    sampled_inflation = sampled_in_memory_size / sampled_on_disk_size

    in_memory_size = sampled_inflation * delta.meta.content_length

    if sampled_in_memory_size:
        num_rows = int(in_memory_size / sampled_in_memory_size * sampled_num_rows)
    else:
        # The sampled tables occupied no memory (e.g. empty files that still
        # have a non-zero on-disk size). The expression above would be
        # 0.0 / 0.0 and raise ZeroDivisionError, so scale the row count by
        # the on-disk ratio instead, which is the same factor algebraically.
        num_rows = int(
            delta.meta.content_length / sampled_on_disk_size * sampled_num_rows
        )

    return EstimatedResources.of(
        memory_bytes=in_memory_size,
        statistics=Statistics.of(
            in_memory_size_bytes=in_memory_size,
            record_count=num_rows,
            on_disk_size_bytes=delta.meta.content_length,
        ),
    )
|
200
|
+
|
201
|
+
|
202
|
+
# Delta-level estimators for each resource estimation method, in priority
# order: estimate_resources_required_to_process_delta tries them one by one
# and returns the first result that is not None.
RESOURCE_ESTIMATION_METHOD_TO_DELTA_RESOURCE_ESTIMATION_FUNCTIONS = {
    # Single-strategy methods.
    ResourceEstimationMethod.PREVIOUS_INFLATION: [
        _estimate_resources_required_to_process_delta_using_previous_inflation,
    ],
    ResourceEstimationMethod.CONTENT_TYPE_META: [
        _estimate_resources_required_to_process_delta_using_type_params,
    ],
    ResourceEstimationMethod.INTELLIGENT_ESTIMATION: [
        _estimate_resources_required_to_process_delta_using_type_params,
    ],
    ResourceEstimationMethod.FILE_SAMPLING: [
        _estimate_resources_required_to_process_delta_using_file_sampling,
    ],
    ResourceEstimationMethod.DEFAULT: [
        _estimate_resources_required_to_process_delta_using_previous_inflation,
    ],
    # Fallback chain: type params, then file sampling, then previous inflation.
    ResourceEstimationMethod.DEFAULT_V2: [
        _estimate_resources_required_to_process_delta_using_type_params,
        _estimate_resources_required_to_process_delta_using_file_sampling,
        _estimate_resources_required_to_process_delta_using_previous_inflation,
    ],
}
|
224
|
+
|
225
|
+
|
226
|
+
def estimate_resources_required_to_process_delta(
    delta: Delta,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams = None,
    deltacat_storage=unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> Optional[EstimatedResources]:
    """
    Estimate the memory and statistics needed to process a delta.

    When the delta metadata already carries both ``record_count`` and
    ``source_content_length`` those values are returned directly. Otherwise
    the estimators registered for the configured resource estimation method
    are tried in order and the first result that is not None is returned.

    Args:
        delta: Delta to estimate.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        estimate_resources_params: Estimation settings; defaults to
            ``EstimateResourcesParams.of()`` when None.
        deltacat_storage: Storage implementation handed to the estimators.
        deltacat_storage_kwargs: Extra keyword arguments for storage calls;
            None is treated as an empty dict.
        **kwargs: Forwarded to the estimators.

    Returns:
        The estimated resources, or None when no estimator could produce one.

    Raises:
        AssertionError: If the operation type is unsupported.
        ValueError: If no estimators are registered for the configured
            resource estimation method.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"

    # Use a fresh dict per call rather than a shared mutable default.
    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}

    if delta.meta.record_count and delta.meta.source_content_length:
        # No need to estimate
        return EstimatedResources.of(
            memory_bytes=delta.meta.source_content_length,
            statistics=Statistics.of(
                in_memory_size_bytes=delta.meta.source_content_length,
                record_count=delta.meta.record_count,
                on_disk_size_bytes=delta.meta.content_length,
            ),
        )

    if estimate_resources_params is None:
        estimate_resources_params = EstimateResourcesParams.of()

    functions = RESOURCE_ESTIMATION_METHOD_TO_DELTA_RESOURCE_ESTIMATION_FUNCTIONS.get(
        estimate_resources_params.resource_estimation_method
    )

    if functions is None:
        # Fail with a clear error, as the manifest-entry estimators do,
        # rather than a TypeError from iterating over None.
        raise ValueError(
            "Unsupported resource estimation method"
            f": {estimate_resources_params.resource_estimation_method} "
            f"for delta: {delta.locator}"
        )

    for func in functions:
        resources = func(
            delta=delta,
            operation_type=operation_type,
            estimate_resources_params=estimate_resources_params,
            deltacat_storage=deltacat_storage,
            deltacat_storage_kwargs=deltacat_storage_kwargs,
            **kwargs,
        )
        if resources is not None:
            logger.debug(
                f"Estimated resources for delta={delta.locator} is {resources} using {func}"
            )
            return resources

    return None
|
@@ -0,0 +1,394 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Optional, List
|
3
|
+
from deltacat import logs
|
4
|
+
from deltacat.constants import NULL_SIZE_BYTES
|
5
|
+
from deltacat.compute.resource_estimation.parquet import (
|
6
|
+
parquet_column_chunk_size_estimator,
|
7
|
+
)
|
8
|
+
from deltacat.types.media import ContentEncoding, ContentType
|
9
|
+
from deltacat.types.partial_download import PartialParquetParameters
|
10
|
+
from deltacat.storage import (
|
11
|
+
ManifestEntry,
|
12
|
+
)
|
13
|
+
from deltacat.compute.resource_estimation.model import (
|
14
|
+
OperationType,
|
15
|
+
EstimateResourcesParams,
|
16
|
+
ResourceEstimationMethod,
|
17
|
+
)
|
18
|
+
|
19
|
+
# Module-level logger for this file, set up through deltacat's logging helper.
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
20
|
+
|
21
|
+
|
22
|
+
def _get_parquet_type_params_if_exist(
    entry: ManifestEntry,
) -> Optional[PartialParquetParameters]:
    """
    Return the first ``PartialParquetParameters`` attached to the entry.

    Only entries whose metadata declares the PARQUET content type with
    IDENTITY content encoding are considered.

    Args:
        entry: Manifest entry whose ``meta`` is inspected.

    Returns:
        The first matching content type parameter, or None when the entry
        has no metadata, is not identity-encoded parquet, or carries no
        ``PartialParquetParameters``.
    """
    meta = entry.meta
    if not meta:
        return None

    is_plain_parquet = (
        meta.content_type == ContentType.PARQUET
        and meta.content_encoding == ContentEncoding.IDENTITY
    )
    if not is_plain_parquet or not meta.content_type_parameters:
        return None

    return next(
        (
            candidate
            for candidate in meta.content_type_parameters
            if isinstance(candidate, PartialParquetParameters)
        ),
        None,
    )
|
35
|
+
|
36
|
+
|
37
|
+
def _calculate_parquet_column_size(
    type_params: PartialParquetParameters,
    parquet_to_pyarrow_inflation: float,
    column: str,
    enable_intelligent_size_estimation: bool,
) -> float:
    """
    Estimate the inflated size of one column across the row groups that
    will be downloaded.

    For every row group in ``type_params.row_groups_to_download`` the column
    chunk whose ``path_in_schema`` equals ``column`` is sized, either with
    ``parquet_column_chunk_size_estimator`` or with the chunk's
    ``total_uncompressed_size``. A row group without that column contributes
    ``NULL_SIZE_BYTES`` per row.

    Args:
        type_params: Parquet parameters providing ``pq_metadata`` and
            ``row_groups_to_download``.
        parquet_to_pyarrow_inflation: Multiplier applied to the summed size.
        column: Column path to look up in each row group.
        enable_intelligent_size_estimation: Selects the chunk size estimator.

    Returns:
        The summed size multiplied by ``parquet_to_pyarrow_inflation``.

    Raises:
        ValueError: If a row group contains the column more than once.
    """
    if enable_intelligent_size_estimation:
        estimate_chunk = parquet_column_chunk_size_estimator
    else:

        def estimate_chunk(column_meta):
            return column_meta.total_uncompressed_size

    total = 0.0
    for row_group_index in type_params.row_groups_to_download:
        rg_meta = type_params.pq_metadata.row_group(row_group_index)
        all_chunks = (rg_meta.column(idx) for idx in range(rg_meta.num_columns))
        matching = [chunk for chunk in all_chunks if chunk.path_in_schema == column]

        for chunk in matching:
            total += estimate_chunk(column_meta=chunk)

        if not matching:
            # This indicates a null column
            total += NULL_SIZE_BYTES * rg_meta.num_rows
        elif len(matching) > 1:
            raise ValueError(f"Duplicate column found in parquet file: {column}")

    return total * parquet_to_pyarrow_inflation
|
66
|
+
|
67
|
+
|
68
|
+
def _estimate_manifest_entry_size_bytes_using_previous_inflation(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    **kwargs,
) -> Optional[float]:
    """
    Estimate an entry's in-memory size as its on-disk size multiplied by
    the previously observed inflation.

    Args:
        entry: Manifest entry whose ``meta.content_length`` is scaled.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        estimate_resources_params: Supplies ``previous_inflation``.
        **kwargs: Ignored.

    Returns:
        ``entry.meta.content_length * previous_inflation``.

    Raises:
        AssertionError: If the operation type is unsupported or
            ``previous_inflation`` is None.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Size can only be estimated for PYARROW_DOWNLOAD operation"

    inflation = estimate_resources_params.previous_inflation
    assert (
        inflation is not None
    ), "Expected previous_inflation when resource estimation method is PREVIOUS_INFLATION"

    on_disk_bytes = entry.meta.content_length
    return on_disk_bytes * inflation
|
83
|
+
|
84
|
+
|
85
|
+
def _estimate_manifest_entry_size_bytes_using_content_type_meta(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    **kwargs,
) -> Optional[float]:
    """
    Estimate an entry's in-memory size from the size recorded in its
    parquet content type parameters.

    Args:
        entry: Manifest entry to estimate.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        estimate_resources_params: Supplies ``parquet_to_pyarrow_inflation``.
        **kwargs: Ignored.

    Returns:
        ``in_memory_size_bytes * parquet_to_pyarrow_inflation``; 0 when no
        row groups are to be downloaded; None when the entry has no parquet
        parameters or the inflation is not configured.

    Raises:
        AssertionError: If the operation type is unsupported.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Size can only be estimated for PYARROW_DOWNLOAD operation"

    parquet_params = _get_parquet_type_params_if_exist(entry=entry)
    if not parquet_params:
        return None

    inflation = estimate_resources_params.parquet_to_pyarrow_inflation
    if inflation is None:
        return None

    if not parquet_params.row_groups_to_download:
        return 0

    return parquet_params.in_memory_size_bytes * inflation
|
111
|
+
|
112
|
+
|
113
|
+
def _estimate_manifest_entry_size_bytes_using_intelligent_estimation(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    **kwargs,
) -> Optional[float]:
    """
    Estimate an entry's in-memory size by summing per-column estimates
    over every column listed in the first row group of its parquet metadata.

    Args:
        entry: Manifest entry to estimate.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        estimate_resources_params: Supplies ``parquet_to_pyarrow_inflation``
            and is forwarded to the column size estimator.
        **kwargs: Ignored.

    Returns:
        The result of ``estimate_manifest_entry_column_size_bytes`` for all
        columns; 0 when no row groups are to be downloaded; None when the
        entry has no parquet parameters or the inflation is not configured.

    Raises:
        AssertionError: If the operation type is unsupported.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Size can only be estimated for PYARROW_DOWNLOAD operation"

    parquet_params = _get_parquet_type_params_if_exist(entry=entry)
    if not parquet_params:
        return None
    if estimate_resources_params.parquet_to_pyarrow_inflation is None:
        return None

    if not parquet_params.row_groups_to_download:
        return 0

    # Column paths are read from the first row group of the file metadata.
    all_columns = []
    for column_index in range(parquet_params.pq_metadata.num_columns):
        chunk_meta = parquet_params.pq_metadata.row_group(0).column(column_index)
        all_columns.append(chunk_meta.path_in_schema)

    return estimate_manifest_entry_column_size_bytes(
        entry=entry,
        operation_type=operation_type,
        columns=all_columns,
        estimate_resources_params=estimate_resources_params,
    )
|
145
|
+
|
146
|
+
|
147
|
+
# Per-entry size estimators for each resource estimation method, in priority
# order: estimate_manifest_entry_size_bytes tries them one by one and returns
# the first result that is not None.
RESOURCE_ESTIMATION_METHOD_TO_SIZE_ESTIMATION_FUNCTIONS = {
    # Single-strategy methods.
    ResourceEstimationMethod.PREVIOUS_INFLATION: [
        _estimate_manifest_entry_size_bytes_using_previous_inflation,
    ],
    ResourceEstimationMethod.CONTENT_TYPE_META: [
        _estimate_manifest_entry_size_bytes_using_content_type_meta,
    ],
    ResourceEstimationMethod.INTELLIGENT_ESTIMATION: [
        _estimate_manifest_entry_size_bytes_using_intelligent_estimation,
    ],
    # Fallback chains ending in previous inflation.
    ResourceEstimationMethod.DEFAULT: [
        _estimate_manifest_entry_size_bytes_using_content_type_meta,
        _estimate_manifest_entry_size_bytes_using_previous_inflation,
    ],
    ResourceEstimationMethod.DEFAULT_V2: [
        _estimate_manifest_entry_size_bytes_using_intelligent_estimation,
        _estimate_manifest_entry_size_bytes_using_previous_inflation,
    ],
}
|
166
|
+
|
167
|
+
|
168
|
+
def _estimate_manifest_entry_num_rows_using_previous_inflation(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    **kwargs,
) -> Optional[int]:
    """
    Estimate an entry's row count as its estimated in-memory size divided
    by the average record size.

    Args:
        entry: Manifest entry to estimate.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        estimate_resources_params: Supplies ``previous_inflation`` and
            ``average_record_size_bytes``.
        **kwargs: Forwarded to ``estimate_manifest_entry_size_bytes``.

    Returns:
        The estimated size divided by ``average_record_size_bytes``,
        truncated to an int.

    Raises:
        AssertionError: If the operation type is unsupported, or either
            ``previous_inflation`` or ``average_record_size_bytes`` is None.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"
    assert (
        estimate_resources_params.previous_inflation is not None
    ), "Expected previous_inflation when resource estimation method is PREVIOUS_INFLATION"

    record_size = estimate_resources_params.average_record_size_bytes
    assert (
        record_size is not None
    ), "Expected average_record_size_bytes when resource estimation method is PREVIOUS_INFLATION"

    # The size goes through the public dispatcher, so it follows the
    # configured estimation method rather than always using inflation.
    estimated_bytes = estimate_manifest_entry_size_bytes(
        entry=entry,
        operation_type=operation_type,
        estimate_resources_params=estimate_resources_params,
        **kwargs,
    )

    return int(estimated_bytes / estimate_resources_params.average_record_size_bytes)
|
192
|
+
|
193
|
+
|
194
|
+
def _estimate_manifest_entry_num_rows_using_content_type_meta(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    **kwargs,
) -> Optional[int]:
    """
    Read an entry's row count from its parquet content type parameters.

    Args:
        entry: Manifest entry to inspect.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        estimate_resources_params: Unused here; accepted so every row count
            estimator shares one call signature.
        **kwargs: Ignored.

    Returns:
        ``num_rows`` of the parquet parameters, or None when the entry has
        no parquet parameters.

    Raises:
        AssertionError: If the operation type is unsupported.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"

    parquet_params = _get_parquet_type_params_if_exist(entry=entry)
    return parquet_params.num_rows if parquet_params else None
|
210
|
+
|
211
|
+
|
212
|
+
def _estimate_manifest_entry_num_rows_using_intelligent_estimation(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    **kwargs,
) -> Optional[int]:
    """
    Read an entry's row count from its parquet content type parameters.

    The logic is the same as the CONTENT_TYPE_META row count estimator in
    this module.

    Args:
        entry: Manifest entry to inspect.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        estimate_resources_params: Unused here; accepted so every row count
            estimator shares one call signature.
        **kwargs: Ignored.

    Returns:
        ``num_rows`` of the parquet parameters, or None when the entry has
        no parquet parameters.

    Raises:
        AssertionError: If the operation type is unsupported.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"

    parquet_params = _get_parquet_type_params_if_exist(entry=entry)
    return parquet_params.num_rows if parquet_params else None
|
228
|
+
|
229
|
+
|
230
|
+
# Per-entry row count estimators for each resource estimation method, in
# priority order: estimate_manifest_entry_num_rows tries them one by one and
# returns the first result that is not None.
RESOURCE_ESTIMATION_METHOD_TO_NUM_ROWS_ESTIMATION_FUNCTIONS = {
    # Single-strategy methods.
    ResourceEstimationMethod.PREVIOUS_INFLATION: [
        _estimate_manifest_entry_num_rows_using_previous_inflation,
    ],
    ResourceEstimationMethod.CONTENT_TYPE_META: [
        _estimate_manifest_entry_num_rows_using_content_type_meta,
    ],
    ResourceEstimationMethod.INTELLIGENT_ESTIMATION: [
        _estimate_manifest_entry_num_rows_using_intelligent_estimation,
    ],
    # Fallback chains ending in previous inflation.
    ResourceEstimationMethod.DEFAULT: [
        _estimate_manifest_entry_num_rows_using_content_type_meta,
        _estimate_manifest_entry_num_rows_using_previous_inflation,
    ],
    ResourceEstimationMethod.DEFAULT_V2: [
        _estimate_manifest_entry_num_rows_using_intelligent_estimation,
        _estimate_manifest_entry_num_rows_using_previous_inflation,
    ],
}
|
249
|
+
|
250
|
+
|
251
|
+
def estimate_manifest_entry_size_bytes(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams = None,
    **kwargs,
) -> Optional[float]:
    """
    Estimate the in-memory size of a manifest entry.

    If the entry metadata already records ``source_content_length`` that
    value is returned as is. Otherwise the size estimators registered for
    the configured resource estimation method are tried in order.

    Args:
        entry: Manifest entry to estimate.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        estimate_resources_params: Estimation settings; defaults to
            ``EstimateResourcesParams.of()`` when None.
        **kwargs: Forwarded to the estimators.

    Returns:
        The first estimate that is not None, or None when every estimator
        declines.

    Raises:
        AssertionError: If the operation type is unsupported.
        ValueError: If no estimators are registered for the method.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Size can only be estimated for PYARROW_DOWNLOAD operation"

    known_size = entry.meta.source_content_length
    if known_size:
        # No need to estimate size as source_content_length is already present
        return known_size

    if estimate_resources_params is None:
        estimate_resources_params = EstimateResourcesParams.of()

    estimators = RESOURCE_ESTIMATION_METHOD_TO_SIZE_ESTIMATION_FUNCTIONS.get(
        estimate_resources_params.resource_estimation_method
    )
    if estimators is None:
        raise ValueError(
            "Unsupported size estimation method"
            f": {estimate_resources_params.resource_estimation_method} for entry: {entry}"
        )

    for func in estimators:
        size_bytes = func(
            entry=entry,
            operation_type=operation_type,
            estimate_resources_params=estimate_resources_params,
            **kwargs,
        )
        if size_bytes is None:
            # This estimator could not answer; fall through to the next one.
            continue
        logger.debug(
            f"Estimated size for entry={entry.uri} is {size_bytes} using {func}"
        )
        return size_bytes

    return None
|
292
|
+
|
293
|
+
|
294
|
+
def estimate_manifest_entry_num_rows(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams = None,
    **kwargs,
) -> Optional[int]:
    """
    Estimate number of records in the manifest entry file.

    If the entry metadata already records ``record_count`` that value is
    returned as is. Otherwise the row count estimators registered for the
    configured resource estimation method are tried in order.

    Args:
        entry: Manifest entry to estimate.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        estimate_resources_params: Estimation settings; defaults to
            ``EstimateResourcesParams.of()`` when None.
        **kwargs: Forwarded to the estimators.

    Returns:
        The first estimate that is not None, or None when every estimator
        declines.

    Raises:
        AssertionError: If the operation type is unsupported.
        ValueError: If no estimators are registered for the method.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"

    known_count = entry.meta.record_count
    if known_count:
        # No need to estimate as record_count is already present
        return known_count

    if estimate_resources_params is None:
        estimate_resources_params = EstimateResourcesParams.of()

    estimators = RESOURCE_ESTIMATION_METHOD_TO_NUM_ROWS_ESTIMATION_FUNCTIONS.get(
        estimate_resources_params.resource_estimation_method
    )
    if estimators is None:
        raise ValueError(
            "Unsupported num rows estimation method"
            f": {estimate_resources_params.resource_estimation_method} for entry: {entry}"
        )

    for func in estimators:
        num_rows = func(
            entry=entry,
            operation_type=operation_type,
            estimate_resources_params=estimate_resources_params,
            **kwargs,
        )
        if num_rows is None:
            # This estimator could not answer; fall through to the next one.
            continue
        logger.debug(
            f"Estimated number of rows for entry={entry.uri} is {num_rows} using {func}"
        )
        return num_rows

    return None
|
338
|
+
|
339
|
+
|
340
|
+
def estimate_manifest_entry_column_size_bytes(
    entry: ManifestEntry,
    operation_type: OperationType,
    columns: Optional[List[str]] = None,
    estimate_resources_params: EstimateResourcesParams = None,
) -> Optional[float]:
    """
    Estimate the size of specified columns in the manifest entry file.
    This method only supports parquet. For other types, it returns None.

    Args:
        entry: Manifest entry to estimate.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        columns: Column paths to size; None or empty yields 0.
        estimate_resources_params: Estimation settings; defaults to
            ``EstimateResourcesParams.of()`` when None.

    Returns:
        The summed, inflated size of the requested columns; 0 when there are
        no columns or no row groups to download; None when the entry has no
        parquet parameters or metadata, or ``parquet_to_pyarrow_inflation``
        is unset or zero.

    Raises:
        AssertionError: If the operation type is unsupported.
        ValueError: If a requested column appears more than once in a row
            group (raised by ``_calculate_parquet_column_size``).
    """

    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Resources can only be estimated for PYARROW_DOWNLOAD operation"

    # Apply the default before the first attribute access below; otherwise
    # the default None raises AttributeError on parquet_to_pyarrow_inflation.
    if estimate_resources_params is None:
        estimate_resources_params = EstimateResourcesParams.of()

    type_params = _get_parquet_type_params_if_exist(entry=entry)

    if (
        not type_params
        or not type_params.pq_metadata
        or not estimate_resources_params.parquet_to_pyarrow_inflation
    ):
        return None

    if not columns or not type_params.row_groups_to_download:
        return 0

    is_intelligent_estimation = (
        estimate_resources_params.resource_estimation_method
        == ResourceEstimationMethod.INTELLIGENT_ESTIMATION
        or estimate_resources_params.resource_estimation_method
        == ResourceEstimationMethod.DEFAULT_V2
    )

    columns_size = 0.0
    for column_name in columns:
        columns_size += _calculate_parquet_column_size(
            type_params=type_params,
            column=column_name,
            parquet_to_pyarrow_inflation=estimate_resources_params.parquet_to_pyarrow_inflation,
            enable_intelligent_size_estimation=is_intelligent_estimation,
        )
    return columns_size
|
386
|
+
|
387
|
+
|
388
|
+
def does_require_content_type_params(
    resource_estimation_method: ResourceEstimationMethod,
) -> bool:
    """
    Report whether the method is DEFAULT_V2 or INTELLIGENT_ESTIMATION.

    Args:
        resource_estimation_method: Method to check.

    Returns:
        True for ``ResourceEstimationMethod.DEFAULT_V2`` and
        ``ResourceEstimationMethod.INTELLIGENT_ESTIMATION``; False otherwise.
    """
    if resource_estimation_method == ResourceEstimationMethod.DEFAULT_V2:
        return True
    return resource_estimation_method == ResourceEstimationMethod.INTELLIGENT_ESTIMATION
|