deltacat 1.1.18__py3-none-any.whl → 1.1.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/model/compact_partition_params.py +76 -0
  3. deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
  4. deltacat/compute/compactor/model/delta_annotated.py +16 -9
  5. deltacat/compute/compactor_v2/constants.py +3 -0
  6. deltacat/compute/compactor_v2/private/compaction_utils.py +11 -5
  7. deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
  8. deltacat/compute/compactor_v2/utils/io.py +28 -14
  9. deltacat/compute/compactor_v2/utils/task_options.py +128 -183
  10. deltacat/compute/resource_estimation/__init__.py +27 -0
  11. deltacat/compute/resource_estimation/delta.py +271 -0
  12. deltacat/compute/resource_estimation/manifest.py +394 -0
  13. deltacat/compute/resource_estimation/model.py +165 -0
  14. deltacat/compute/resource_estimation/parquet.py +108 -0
  15. deltacat/constants.py +5 -0
  16. deltacat/logs.py +8 -0
  17. deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
  18. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
  19. deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
  20. deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
  21. deltacat/tests/compute/test_util_common.py +2 -0
  22. deltacat/tests/test_logs.py +34 -0
  23. deltacat/tests/test_utils/pyarrow.py +15 -5
  24. {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/METADATA +2 -2
  25. {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/RECORD +30 -46
  26. deltacat/compute/metastats/meta_stats.py +0 -479
  27. deltacat/compute/metastats/model/__init__.py +0 -0
  28. deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
  29. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
  30. deltacat/compute/metastats/stats.py +0 -182
  31. deltacat/compute/metastats/utils/__init__.py +0 -0
  32. deltacat/compute/metastats/utils/constants.py +0 -16
  33. deltacat/compute/metastats/utils/io.py +0 -223
  34. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
  35. deltacat/compute/metastats/utils/ray_utils.py +0 -129
  36. deltacat/compute/stats/basic.py +0 -226
  37. deltacat/compute/stats/models/__init__.py +0 -0
  38. deltacat/compute/stats/models/delta_column_stats.py +0 -98
  39. deltacat/compute/stats/models/delta_stats.py +0 -233
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
  41. deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
  42. deltacat/compute/stats/models/stats_result.py +0 -104
  43. deltacat/compute/stats/utils/__init__.py +0 -0
  44. deltacat/compute/stats/utils/intervals.py +0 -94
  45. deltacat/compute/stats/utils/io.py +0 -230
  46. deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
  47. deltacat/tests/stats/__init__.py +0 -0
  48. deltacat/tests/stats/test_intervals.py +0 -49
  49. /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
  50. /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
  51. {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/LICENSE +0 -0
  52. {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/WHEEL +0 -0
  53. {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,271 @@
1
+ import logging
2
+ from typing import Optional, Dict, Any
3
+ from deltacat import logs
4
+ from deltacat.storage import (
5
+ Delta,
6
+ interface as unimplemented_deltacat_storage,
7
+ )
8
+ from deltacat.compute.compactor_v2.utils.content_type_params import (
9
+ append_content_type_params,
10
+ )
11
+ from deltacat.compute.resource_estimation.model import (
12
+ OperationType,
13
+ EstimateResourcesParams,
14
+ ResourceEstimationMethod,
15
+ EstimatedResources,
16
+ Statistics,
17
+ )
18
+ from deltacat.compute.resource_estimation.manifest import (
19
+ estimate_manifest_entry_size_bytes,
20
+ estimate_manifest_entry_num_rows,
21
+ )
22
+
23
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
24
+
25
+
26
def _estimate_resources_required_to_process_delta_using_previous_inflation(
    delta: Delta,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    deltacat_storage: unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Dict[str, Any],
    **kwargs,
) -> Optional[EstimatedResources]:
    """
    Estimate the resources needed to process a delta from a previously
    observed inflation ratio.

    The in-memory size is the delta's on-disk content length multiplied by
    ``previous_inflation``; the record count is that size divided by
    ``average_record_size_bytes``. ``deltacat_storage``,
    ``deltacat_storage_kwargs`` and ``kwargs`` are accepted only so that all
    delta estimators share one call signature; they are not used here.

    Raises:
        AssertionError: if the operation type is not PYARROW_DOWNLOAD, or if
            ``previous_inflation`` or ``average_record_size_bytes`` is
            missing or zero.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"
    assert (
        estimate_resources_params.previous_inflation
    ), "Previous inflation must be provided to estimate delta size"
    # Validate the divisor up front so a missing/zero value surfaces as a
    # descriptive assertion rather than a TypeError/ZeroDivisionError below.
    assert (
        estimate_resources_params.average_record_size_bytes
    ), "Average record size must be provided to estimate delta record count"

    in_memory_size = (
        delta.meta.content_length * estimate_resources_params.previous_inflation
    )
    num_rows = int(in_memory_size / estimate_resources_params.average_record_size_bytes)

    return EstimatedResources.of(
        memory_bytes=in_memory_size,
        statistics=Statistics.of(
            in_memory_size_bytes=in_memory_size,
            record_count=num_rows,
            on_disk_size_bytes=delta.meta.content_length,
        ),
    )
55
+
56
+
57
def _estimate_resources_required_to_process_delta_using_type_params(
    delta: Delta,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    deltacat_storage: unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Dict[str, Any],
    **kwargs,
) -> Optional[EstimatedResources]:
    """
    Estimate delta resources by summing per-entry estimates.

    Returns None when ``parquet_to_pyarrow_inflation`` is not configured or
    when any single manifest entry cannot be estimated. A delta whose
    manifest is missing or has no entries yields a zero-sized estimate. The
    manifest is fetched from storage (and cached on ``delta``) if absent.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"

    if estimate_resources_params.parquet_to_pyarrow_inflation is None:
        return None

    if not delta.manifest:
        delta.manifest = deltacat_storage.get_delta_manifest(
            delta.locator, **deltacat_storage_kwargs
        )

    if not (delta.manifest and delta.manifest.entries):
        # Nothing to download: report zero memory/rows for this delta.
        empty_stats = Statistics.of(
            in_memory_size_bytes=0,
            record_count=0,
            on_disk_size_bytes=delta.meta.content_length,
        )
        return EstimatedResources.of(memory_bytes=0, statistics=empty_stats)

    append_content_type_params(
        delta=delta,
        deltacat_storage=deltacat_storage,
        deltacat_storage_kwargs=deltacat_storage_kwargs,
    )

    total_bytes = 0.0
    total_rows = 0

    for manifest_entry in delta.manifest.entries:
        entry_bytes = estimate_manifest_entry_size_bytes(
            entry=manifest_entry,
            operation_type=operation_type,
            estimate_resources_params=estimate_resources_params,
            **kwargs,
        )
        entry_rows = estimate_manifest_entry_num_rows(
            entry=manifest_entry,
            operation_type=operation_type,
            estimate_resources_params=estimate_resources_params,
            **kwargs,
        )

        # One un-estimable entry invalidates the whole delta estimate.
        if entry_bytes is None or entry_rows is None:
            return None

        total_bytes += entry_bytes
        total_rows += entry_rows

    delta_stats = Statistics.of(
        in_memory_size_bytes=total_bytes,
        record_count=total_rows,
        on_disk_size_bytes=delta.meta.content_length,
    )
    return EstimatedResources.of(memory_bytes=total_bytes, statistics=delta_stats)
126
+
127
+
128
def _estimate_resources_required_to_process_delta_using_file_sampling(
    delta: Delta,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    deltacat_storage: unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Dict[str, Any],
    **kwargs,
) -> Optional[EstimatedResources]:
    """
    Estimate delta resources by downloading a sample of manifest entries.

    Up to ``max_files_to_sample`` leading manifest entries are downloaded.
    The ratio of their in-memory size to their on-disk size is extrapolated
    to the whole delta, and the record count is scaled by the same factor.

    Returns None when ``max_files_to_sample`` is unset or zero. Returns a
    zero-sized estimate when the manifest is missing or has no entries, or
    when the sampled entries report no on-disk bytes. The manifest is
    fetched from storage (and cached on ``delta``) if absent.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"

    if not delta.manifest:
        delta.manifest = deltacat_storage.get_delta_manifest(
            delta.locator,
            **deltacat_storage_kwargs,
        )

    if not delta.manifest or not delta.manifest.entries:
        return EstimatedResources.of(
            memory_bytes=0,
            statistics=Statistics.of(
                in_memory_size_bytes=0,
                record_count=0,
                on_disk_size_bytes=delta.meta.content_length,
            ),
        )

    if not estimate_resources_params.max_files_to_sample:
        # we cannot calculate if we cannot sample
        return None

    sampled_in_memory_size = 0.0
    sampled_on_disk_size = 0.0
    sampled_num_rows = 0

    sample_count = min(
        estimate_resources_params.max_files_to_sample, len(delta.manifest.entries)
    )
    for entry_index in range(sample_count):
        # NOTE(review): assumes the download returns an object exposing
        # `nbytes` and `len()` (e.g. a pyarrow Table) - confirm with storage.
        tbl = deltacat_storage.download_delta_manifest_entry(
            delta,
            entry_index,
            **deltacat_storage_kwargs,
        )
        sampled_in_memory_size += tbl.nbytes
        sampled_on_disk_size += delta.manifest.entries[entry_index].meta.content_length
        sampled_num_rows += len(tbl)

    if not sampled_on_disk_size:
        return EstimatedResources.of(
            memory_bytes=0,
            statistics=Statistics.of(
                in_memory_size_bytes=0,
                record_count=0,
                on_disk_size_bytes=delta.meta.content_length,
            ),
        )

    sampled_inflation = sampled_in_memory_size / sampled_on_disk_size

    in_memory_size = sampled_inflation * delta.meta.content_length
    if sampled_in_memory_size:
        num_rows = int(in_memory_size / sampled_in_memory_size * sampled_num_rows)
    else:
        # The sample occupied zero bytes in memory (e.g. zero-row files), so
        # the in-memory ratio is undefined. Scale rows by the equivalent
        # on-disk ratio instead of dividing by zero.
        num_rows = int(
            delta.meta.content_length / sampled_on_disk_size * sampled_num_rows
        )

    return EstimatedResources.of(
        memory_bytes=in_memory_size,
        statistics=Statistics.of(
            in_memory_size_bytes=in_memory_size,
            record_count=num_rows,
            on_disk_size_bytes=delta.meta.content_length,
        ),
    )
200
+
201
+
202
# Maps each resource estimation method to the ordered list of delta-level
# estimators to try. estimate_resources_required_to_process_delta walks the
# list in order and returns the first non-None result.
RESOURCE_ESTIMATION_METHOD_TO_DELTA_RESOURCE_ESTIMATION_FUNCTIONS = {
    # Single-strategy methods.
    ResourceEstimationMethod.PREVIOUS_INFLATION: [
        _estimate_resources_required_to_process_delta_using_previous_inflation,
    ],
    ResourceEstimationMethod.CONTENT_TYPE_META: [
        _estimate_resources_required_to_process_delta_using_type_params,
    ],
    ResourceEstimationMethod.INTELLIGENT_ESTIMATION: [
        _estimate_resources_required_to_process_delta_using_type_params,
    ],
    ResourceEstimationMethod.FILE_SAMPLING: [
        _estimate_resources_required_to_process_delta_using_file_sampling,
    ],
    ResourceEstimationMethod.DEFAULT: [
        _estimate_resources_required_to_process_delta_using_previous_inflation,
    ],
    # Fallback chain: type params, then file sampling, then previous inflation.
    ResourceEstimationMethod.DEFAULT_V2: [
        _estimate_resources_required_to_process_delta_using_type_params,
        _estimate_resources_required_to_process_delta_using_file_sampling,
        _estimate_resources_required_to_process_delta_using_previous_inflation,
    ],
}
224
+
225
+
226
def estimate_resources_required_to_process_delta(
    delta: Delta,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams = None,
    deltacat_storage=unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> Optional[EstimatedResources]:
    """
    Estimate the resources required to process the given delta.

    If the delta metadata already carries both ``record_count`` and
    ``source_content_length``, those values are returned directly. Otherwise
    the estimators registered for the configured resource estimation method
    are tried in order and the first non-None result is returned.

    Args:
        delta: The delta to estimate.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        estimate_resources_params: Estimation settings; defaults to
            ``EstimateResourcesParams.of()`` when None.
        deltacat_storage: Storage implementation forwarded to the estimators.
        deltacat_storage_kwargs: Keyword arguments forwarded to storage calls;
            treated as an empty dict when None.
        **kwargs: Forwarded unchanged to each estimator.

    Returns:
        The estimated resources, or None if no estimator produced a result.

    Raises:
        AssertionError: if the operation type is not PYARROW_DOWNLOAD.
        ValueError: if no estimators are registered for the configured method.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"

    if delta.meta.record_count and delta.meta.source_content_length:
        # No need to estimate
        return EstimatedResources.of(
            memory_bytes=delta.meta.source_content_length,
            statistics=Statistics.of(
                in_memory_size_bytes=delta.meta.source_content_length,
                record_count=delta.meta.record_count,
                on_disk_size_bytes=delta.meta.content_length,
            ),
        )

    if estimate_resources_params is None:
        estimate_resources_params = EstimateResourcesParams.of()

    # Use a fresh dict per call rather than a shared mutable default.
    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}

    functions = RESOURCE_ESTIMATION_METHOD_TO_DELTA_RESOURCE_ESTIMATION_FUNCTIONS.get(
        estimate_resources_params.resource_estimation_method
    )

    if functions is None:
        # Mirror the manifest-entry dispatchers: fail with a descriptive error
        # instead of a TypeError from iterating over None.
        raise ValueError(
            "Unsupported resource estimation method"
            f": {estimate_resources_params.resource_estimation_method} "
            f"for delta: {delta.locator}"
        )

    for func in functions:
        resources = func(
            delta=delta,
            operation_type=operation_type,
            estimate_resources_params=estimate_resources_params,
            deltacat_storage=deltacat_storage,
            deltacat_storage_kwargs=deltacat_storage_kwargs,
            **kwargs,
        )
        if resources is not None:
            logger.debug(
                f"Estimated resources for delta={delta.locator} is {resources} using {func}"
            )
            return resources

    return None
@@ -0,0 +1,394 @@
1
+ import logging
2
+ from typing import Optional, List
3
+ from deltacat import logs
4
+ from deltacat.constants import NULL_SIZE_BYTES
5
+ from deltacat.compute.resource_estimation.parquet import (
6
+ parquet_column_chunk_size_estimator,
7
+ )
8
+ from deltacat.types.media import ContentEncoding, ContentType
9
+ from deltacat.types.partial_download import PartialParquetParameters
10
+ from deltacat.storage import (
11
+ ManifestEntry,
12
+ )
13
+ from deltacat.compute.resource_estimation.model import (
14
+ OperationType,
15
+ EstimateResourcesParams,
16
+ ResourceEstimationMethod,
17
+ )
18
+
19
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
20
+
21
+
22
def _get_parquet_type_params_if_exist(
    entry: ManifestEntry,
) -> Optional[PartialParquetParameters]:
    """
    Return the entry's PartialParquetParameters, if it has any.

    Only entries whose metadata declares PARQUET content with IDENTITY
    encoding are considered; the first PartialParquetParameters instance in
    the content type parameters is returned. Returns None otherwise.
    """
    meta = entry.meta
    if not meta:
        return None

    is_plain_parquet = (
        meta.content_type == ContentType.PARQUET
        and meta.content_encoding == ContentEncoding.IDENTITY
    )
    if not is_plain_parquet:
        return None

    candidates = meta.content_type_parameters
    if not candidates:
        return None

    return next(
        (
            candidate
            for candidate in candidates
            if isinstance(candidate, PartialParquetParameters)
        ),
        None,
    )
35
+
36
+
37
def _calculate_parquet_column_size(
    type_params: PartialParquetParameters,
    parquet_to_pyarrow_inflation: float,
    column: str,
    enable_intelligent_size_estimation: bool,
) -> float:
    """
    Estimate the inflated size of one column across the row groups to download.

    Each matching column chunk contributes either the result of
    parquet_column_chunk_size_estimator (intelligent estimation) or its
    ``total_uncompressed_size``. A row group with no matching chunk is
    treated as a null column and contributes NULL_SIZE_BYTES per row. The
    summed size is multiplied by ``parquet_to_pyarrow_inflation``.

    Raises:
        ValueError: if a row group contains the column more than once.
    """
    if enable_intelligent_size_estimation:
        estimate_chunk = parquet_column_chunk_size_estimator
    else:

        def estimate_chunk(column_meta):
            return column_meta.total_uncompressed_size

    total_size = 0.0
    for row_group_index in type_params.row_groups_to_download:
        rg_meta = type_params.pq_metadata.row_group(row_group_index)
        matches = 0
        for position in range(rg_meta.num_columns):
            chunk_meta = rg_meta.column(position)
            if chunk_meta.path_in_schema != column:
                continue
            matches += 1
            total_size += estimate_chunk(column_meta=chunk_meta)

        if matches > 1:
            raise ValueError(f"Duplicate column found in parquet file: {column}")
        if matches == 0:
            # This indicates a null column
            total_size += NULL_SIZE_BYTES * rg_meta.num_rows

    return total_size * parquet_to_pyarrow_inflation
66
+
67
+
68
def _estimate_manifest_entry_size_bytes_using_previous_inflation(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    **kwargs,
) -> Optional[float]:
    """
    Estimate an entry's in-memory size as its content length multiplied by
    the configured ``previous_inflation``.

    Raises AssertionError if the operation type is not PYARROW_DOWNLOAD or
    ``previous_inflation`` is None.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Size can only be estimated for PYARROW_DOWNLOAD operation"

    inflation = estimate_resources_params.previous_inflation
    assert (
        inflation is not None
    ), "Expected previous_inflation when resource estimation method is PREVIOUS_INFLATION"

    on_disk_bytes = entry.meta.content_length
    return on_disk_bytes * inflation
83
+
84
+
85
def _estimate_manifest_entry_size_bytes_using_content_type_meta(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    **kwargs,
) -> Optional[float]:
    """
    Estimate an entry's in-memory size from its parquet type parameters.

    Returns None when the entry has no parquet type parameters or when
    ``parquet_to_pyarrow_inflation`` is not configured, 0 when there are no
    row groups to download, and otherwise the type parameters'
    ``in_memory_size_bytes`` multiplied by the inflation factor.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Size can only be estimated for PYARROW_DOWNLOAD operation"

    parquet_params = _get_parquet_type_params_if_exist(entry=entry)
    if not parquet_params:
        return None

    inflation = estimate_resources_params.parquet_to_pyarrow_inflation
    if inflation is None:
        return None

    if not parquet_params.row_groups_to_download:
        return 0

    return parquet_params.in_memory_size_bytes * inflation
111
+
112
+
113
def _estimate_manifest_entry_size_bytes_using_intelligent_estimation(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    **kwargs,
) -> Optional[float]:
    """
    Estimate an entry's in-memory size by summing per-column estimates.

    Returns None when the entry has no parquet type parameters or when
    ``parquet_to_pyarrow_inflation`` is not configured, and 0 when there are
    no row groups to download. Otherwise the column names are read from the
    first row group and passed to estimate_manifest_entry_column_size_bytes.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Size can only be estimated for PYARROW_DOWNLOAD operation"

    parquet_params = _get_parquet_type_params_if_exist(entry=entry)
    if not parquet_params:
        return None
    if estimate_resources_params.parquet_to_pyarrow_inflation is None:
        return None

    if not parquet_params.row_groups_to_download:
        return 0

    pq_metadata = parquet_params.pq_metadata
    all_columns = [
        pq_metadata.row_group(0).column(position).path_in_schema
        for position in range(pq_metadata.num_columns)
    ]

    return estimate_manifest_entry_column_size_bytes(
        entry=entry,
        operation_type=operation_type,
        columns=all_columns,
        estimate_resources_params=estimate_resources_params,
    )
145
+
146
+
147
# Maps each resource estimation method to the ordered list of per-entry size
# estimators. estimate_manifest_entry_size_bytes tries them in order and
# returns the first non-None result.
RESOURCE_ESTIMATION_METHOD_TO_SIZE_ESTIMATION_FUNCTIONS = {
    # Single-strategy methods.
    ResourceEstimationMethod.PREVIOUS_INFLATION: [
        _estimate_manifest_entry_size_bytes_using_previous_inflation,
    ],
    ResourceEstimationMethod.CONTENT_TYPE_META: [
        _estimate_manifest_entry_size_bytes_using_content_type_meta,
    ],
    ResourceEstimationMethod.INTELLIGENT_ESTIMATION: [
        _estimate_manifest_entry_size_bytes_using_intelligent_estimation,
    ],
    # Fallback chains ending in the previous-inflation estimator.
    ResourceEstimationMethod.DEFAULT: [
        _estimate_manifest_entry_size_bytes_using_content_type_meta,
        _estimate_manifest_entry_size_bytes_using_previous_inflation,
    ],
    ResourceEstimationMethod.DEFAULT_V2: [
        _estimate_manifest_entry_size_bytes_using_intelligent_estimation,
        _estimate_manifest_entry_size_bytes_using_previous_inflation,
    ],
}
166
+
167
+
168
def _estimate_manifest_entry_num_rows_using_previous_inflation(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    **kwargs,
) -> Optional[int]:
    """
    Estimate an entry's row count as its estimated in-memory size divided by
    ``average_record_size_bytes``, truncated to an int.

    Raises AssertionError if the operation type is not PYARROW_DOWNLOAD or if
    ``previous_inflation`` or ``average_record_size_bytes`` is None.
    """
    params = estimate_resources_params

    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"
    assert (
        params.previous_inflation is not None
    ), "Expected previous_inflation when resource estimation method is PREVIOUS_INFLATION"
    assert (
        params.average_record_size_bytes is not None
    ), "Expected average_record_size_bytes when resource estimation method is PREVIOUS_INFLATION"

    estimated_bytes = estimate_manifest_entry_size_bytes(
        entry=entry,
        operation_type=operation_type,
        estimate_resources_params=params,
        **kwargs,
    )
    rows = estimated_bytes / params.average_record_size_bytes
    return int(rows)
192
+
193
+
194
def _estimate_manifest_entry_num_rows_using_content_type_meta(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    **kwargs,
) -> Optional[int]:
    """
    Return the row count recorded in the entry's parquet type parameters,
    or None when the entry has no such parameters.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"

    parquet_params = _get_parquet_type_params_if_exist(entry=entry)
    return parquet_params.num_rows if parquet_params else None
210
+
211
+
212
def _estimate_manifest_entry_num_rows_using_intelligent_estimation(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams,
    **kwargs,
) -> Optional[int]:
    """
    Return the row count recorded in the entry's parquet type parameters,
    or None when the entry has no such parameters.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"

    parquet_params = _get_parquet_type_params_if_exist(entry=entry)
    if parquet_params:
        return parquet_params.num_rows
    return None
228
+
229
+
230
# Maps each resource estimation method to the ordered list of per-entry row
# count estimators. estimate_manifest_entry_num_rows tries them in order and
# returns the first non-None result.
RESOURCE_ESTIMATION_METHOD_TO_NUM_ROWS_ESTIMATION_FUNCTIONS = {
    # Single-strategy methods.
    ResourceEstimationMethod.PREVIOUS_INFLATION: [
        _estimate_manifest_entry_num_rows_using_previous_inflation,
    ],
    ResourceEstimationMethod.CONTENT_TYPE_META: [
        _estimate_manifest_entry_num_rows_using_content_type_meta,
    ],
    ResourceEstimationMethod.INTELLIGENT_ESTIMATION: [
        _estimate_manifest_entry_num_rows_using_intelligent_estimation,
    ],
    # Fallback chains ending in the previous-inflation estimator.
    ResourceEstimationMethod.DEFAULT: [
        _estimate_manifest_entry_num_rows_using_content_type_meta,
        _estimate_manifest_entry_num_rows_using_previous_inflation,
    ],
    ResourceEstimationMethod.DEFAULT_V2: [
        _estimate_manifest_entry_num_rows_using_intelligent_estimation,
        _estimate_manifest_entry_num_rows_using_previous_inflation,
    ],
}
249
+
250
+
251
def estimate_manifest_entry_size_bytes(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams = None,
    **kwargs,
) -> Optional[float]:
    """
    Estimate the in-memory size of the manifest entry file.

    If the entry metadata already has a ``source_content_length`` it is
    returned as-is. Otherwise the size estimators registered for the
    configured method are tried in order and the first non-None result is
    returned; None is returned if none of them can estimate.

    Raises ValueError if no estimators are registered for the method.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Size can only be estimated for PYARROW_DOWNLOAD operation"

    known_size = entry.meta.source_content_length
    if known_size:
        # No need to estimate size as source_content_length is already present
        return known_size

    params = (
        estimate_resources_params
        if estimate_resources_params is not None
        else EstimateResourcesParams.of()
    )
    method = params.resource_estimation_method

    estimators = RESOURCE_ESTIMATION_METHOD_TO_SIZE_ESTIMATION_FUNCTIONS.get(method)
    if estimators is None:
        raise ValueError(
            "Unsupported size estimation method" f": {method} for entry: {entry}"
        )

    for estimator in estimators:
        estimate = estimator(
            entry=entry,
            operation_type=operation_type,
            estimate_resources_params=params,
            **kwargs,
        )
        if estimate is None:
            continue
        logger.debug(
            f"Estimated size for entry={entry.uri} is {estimate} using {estimator}"
        )
        return estimate

    return None
292
+
293
+
294
def estimate_manifest_entry_num_rows(
    entry: ManifestEntry,
    operation_type: OperationType,
    estimate_resources_params: EstimateResourcesParams = None,
    **kwargs,
) -> Optional[int]:
    """
    Estimate number of records in the manifest entry file.

    If the entry metadata already has a ``record_count`` it is returned
    as-is. Otherwise the row count estimators registered for the configured
    method are tried in order and the first non-None result is returned;
    None is returned if none of them can estimate.

    Raises ValueError if no estimators are registered for the method.
    """
    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"

    known_rows = entry.meta.record_count
    if known_rows:
        # No need to estimate as record_count is already present
        return known_rows

    params = (
        estimate_resources_params
        if estimate_resources_params is not None
        else EstimateResourcesParams.of()
    )
    method = params.resource_estimation_method

    estimators = RESOURCE_ESTIMATION_METHOD_TO_NUM_ROWS_ESTIMATION_FUNCTIONS.get(
        method
    )
    if estimators is None:
        raise ValueError(
            "Unsupported num rows estimation method" f": {method} for entry: {entry}"
        )

    for estimator in estimators:
        estimate = estimator(
            entry=entry,
            operation_type=operation_type,
            estimate_resources_params=params,
            **kwargs,
        )
        if estimate is None:
            continue
        logger.debug(
            f"Estimated number of rows for entry={entry.uri} is {estimate} using {estimator}"
        )
        return estimate

    return None
338
+
339
+
340
def estimate_manifest_entry_column_size_bytes(
    entry: ManifestEntry,
    operation_type: OperationType,
    columns: Optional[List[str]] = None,
    estimate_resources_params: EstimateResourcesParams = None,
) -> Optional[float]:
    """
    Estimate the size of specified columns in the manifest entry file.
    This method only supports parquet. For other types, it returns None.

    Args:
        entry: The manifest entry whose columns are estimated.
        operation_type: Must be ``OperationType.PYARROW_DOWNLOAD``.
        columns: Names of the columns to estimate.
        estimate_resources_params: Estimation settings; defaults to
            ``EstimateResourcesParams.of()`` when None.

    Returns:
        None if the entry has no parquet type parameters, no parquet
        metadata, or no ``parquet_to_pyarrow_inflation`` is configured;
        0 if no columns are requested or there are no row groups to
        download; otherwise the summed estimated size of the columns.

    Raises:
        AssertionError: if the operation type is not PYARROW_DOWNLOAD.
    """

    assert (
        operation_type == OperationType.PYARROW_DOWNLOAD
    ), "Resources can only be estimated for PYARROW_DOWNLOAD operation"

    # Resolve the default params BEFORE reading any attribute from them, so
    # the documented default (None) no longer raises AttributeError.
    if estimate_resources_params is None:
        estimate_resources_params = EstimateResourcesParams.of()

    type_params = _get_parquet_type_params_if_exist(entry=entry)

    if (
        not type_params
        or not type_params.pq_metadata
        or not estimate_resources_params.parquet_to_pyarrow_inflation
    ):
        return None

    if not columns or not type_params.row_groups_to_download:
        return 0

    # Both INTELLIGENT_ESTIMATION and DEFAULT_V2 use the per-chunk estimator.
    is_intelligent_estimation = (
        estimate_resources_params.resource_estimation_method
        == ResourceEstimationMethod.INTELLIGENT_ESTIMATION
        or estimate_resources_params.resource_estimation_method
        == ResourceEstimationMethod.DEFAULT_V2
    )

    columns_size = 0.0
    for column_name in columns:
        columns_size += _calculate_parquet_column_size(
            type_params=type_params,
            column=column_name,
            parquet_to_pyarrow_inflation=estimate_resources_params.parquet_to_pyarrow_inflation,
            enable_intelligent_size_estimation=is_intelligent_estimation,
        )
    return columns_size
386
+
387
+
388
def does_require_content_type_params(
    resource_estimation_method: ResourceEstimationMethod,
) -> bool:
    """
    Return True when the given method is DEFAULT_V2 or INTELLIGENT_ESTIMATION.
    """
    if resource_estimation_method == ResourceEstimationMethod.DEFAULT_V2:
        return True
    return resource_estimation_method == ResourceEstimationMethod.INTELLIGENT_ESTIMATION