deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +0 -1
- deltacat/compute/compactor/model/compact_partition_params.py +76 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
- deltacat/compute/compactor/model/delta_annotated.py +16 -9
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
- deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
- deltacat/compute/compactor_v2/utils/io.py +28 -14
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +128 -183
- deltacat/compute/resource_estimation/__init__.py +27 -0
- deltacat/compute/resource_estimation/delta.py +271 -0
- deltacat/compute/resource_estimation/manifest.py +394 -0
- deltacat/compute/resource_estimation/model.py +165 -0
- deltacat/compute/resource_estimation/parquet.py +108 -0
- deltacat/constants.py +5 -0
- deltacat/exceptions.py +2 -4
- deltacat/logs.py +8 -0
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
- deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
- deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
- deltacat/tests/compute/test_util_common.py +2 -0
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
- deltacat/tests/test_logs.py +34 -0
- deltacat/tests/test_utils/pyarrow.py +15 -5
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
- deltacat/compute/metastats/meta_stats.py +0 -479
- deltacat/compute/metastats/model/__init__.py +0 -0
- deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
- deltacat/compute/metastats/stats.py +0 -182
- deltacat/compute/metastats/utils/__init__.py +0 -0
- deltacat/compute/metastats/utils/constants.py +0 -16
- deltacat/compute/metastats/utils/io.py +0 -223
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
- deltacat/compute/metastats/utils/ray_utils.py +0 -129
- deltacat/compute/stats/basic.py +0 -226
- deltacat/compute/stats/models/__init__.py +0 -0
- deltacat/compute/stats/models/delta_column_stats.py +0 -98
- deltacat/compute/stats/models/delta_stats.py +0 -233
- deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
- deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
- deltacat/compute/stats/models/stats_result.py +0 -104
- deltacat/compute/stats/utils/__init__.py +0 -0
- deltacat/compute/stats/utils/intervals.py +0 -94
- deltacat/compute/stats/utils/io.py +0 -230
- deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
- deltacat/tests/stats/__init__.py +0 -0
- deltacat/tests/stats/test_intervals.py +0 -49
- /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
- /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,605 @@
|
|
1
|
+
import deltacat.tests.local_deltacat_storage as ds
|
2
|
+
from deltacat.types.media import ContentType
|
3
|
+
import os
|
4
|
+
import pytest
|
5
|
+
from deltacat.storage import Delta
|
6
|
+
from deltacat.compute.resource_estimation.delta import (
|
7
|
+
estimate_resources_required_to_process_delta,
|
8
|
+
)
|
9
|
+
from deltacat.compute.resource_estimation.model import (
|
10
|
+
OperationType,
|
11
|
+
EstimateResourcesParams,
|
12
|
+
ResourceEstimationMethod,
|
13
|
+
)
|
14
|
+
|
15
|
+
# CSV file that every delta fixture in this module is created from.
DELTA_CSV_FILE_PATH = (
    "deltacat/tests/compute/resource_estimation/data/date_pk_table.csv"
)
|
18
|
+
|
19
|
+
"""
|
20
|
+
Function scoped fixtures
|
21
|
+
"""
|
22
|
+
|
23
|
+
|
24
|
+
@pytest.fixture(scope="function")
def local_deltacat_storage_kwargs():
    """Yield the kwargs for the local deltacat storage, then clean up.

    The yielded dict has a single ``db_file_path`` entry pointing at the
    sqlite file used by the tests. Once the requesting test has finished,
    that file is deleted if it exists.
    """
    # see deltacat/tests/local_deltacat_storage/README.md for documentation
    sqlite_file = "deltacat/tests/local_deltacat_storage/db_test.sqlite"

    yield {"db_file_path": sqlite_file}

    # Teardown: nothing to delete when no database file is present.
    if not os.path.exists(sqlite_file):
        return
    os.remove(sqlite_file)
|
37
|
+
|
38
|
+
|
39
|
+
@pytest.fixture(scope="function")
def parquet_delta_with_manifest(local_deltacat_storage_kwargs):
    """
    Build a PARQUET delta from the CSV test file and zero out its size metadata.

    ``source_content_length`` and ``record_count`` are set to 0 on the delta
    meta and on the meta of every manifest entry.

    These fixtures are function scoped as functions can modify the delta.
    """
    from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file

    zeroed_fields = ("source_content_length", "record_count")

    parquet_delta = create_delta_from_csv_file(
        "test_namespace",
        file_paths=[DELTA_CSV_FILE_PATH],
        content_type=ContentType.PARQUET,
        **local_deltacat_storage_kwargs
    )

    # Delta-level meta first, then each manifest entry.
    for field in zeroed_fields:
        parquet_delta.meta[field] = 0
    for manifest_entry in parquet_delta.manifest.entries:
        for field in zeroed_fields:
            manifest_entry.meta[field] = 0

    return parquet_delta
|
60
|
+
|
61
|
+
|
62
|
+
@pytest.fixture(scope="function")
def utsv_delta_with_manifest(local_deltacat_storage_kwargs):
    """
    Build an UNESCAPED_TSV delta from the CSV test file and zero out its
    size metadata.

    ``source_content_length`` and ``record_count`` are set to 0 on the delta
    meta and on the meta of every manifest entry.
    """
    from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file

    zeroed_fields = ("source_content_length", "record_count")

    utsv_delta = create_delta_from_csv_file(
        "test_namespace",
        file_paths=[DELTA_CSV_FILE_PATH],
        content_type=ContentType.UNESCAPED_TSV,
        **local_deltacat_storage_kwargs
    )

    # Delta-level meta first, then each manifest entry.
    for field in zeroed_fields:
        utsv_delta.meta[field] = 0
    for manifest_entry in utsv_delta.manifest.entries:
        for field in zeroed_fields:
            manifest_entry.meta[field] = 0

    return utsv_delta
|
80
|
+
|
81
|
+
|
82
|
+
@pytest.fixture(scope="function")
def delta_without_manifest(local_deltacat_storage_kwargs):
    """
    Build a PARQUET delta from the CSV test file, then strip its manifest.

    The delta-level ``source_content_length`` and ``record_count`` are also
    set to 0.
    """
    from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file

    stripped_delta = create_delta_from_csv_file(
        "test_namespace",
        file_paths=[DELTA_CSV_FILE_PATH],
        content_type=ContentType.PARQUET,
        **local_deltacat_storage_kwargs
    )

    # now we intentionally remove manifest
    stripped_delta.manifest = None
    for field in ("source_content_length", "record_count"):
        stripped_delta.meta[field] = 0

    return stripped_delta
|
99
|
+
|
100
|
+
|
101
|
+
@pytest.fixture(scope="function")
def delta_with_populated_meta(local_deltacat_storage_kwargs):
    """
    Build a PARQUET delta from the CSV test file and return it unmodified,
    i.e. with whatever meta ``create_delta_from_csv_file`` produced.
    """
    from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file

    return create_delta_from_csv_file(
        "test_namespace",
        file_paths=[DELTA_CSV_FILE_PATH],
        content_type=ContentType.PARQUET,
        **local_deltacat_storage_kwargs
    )
|
113
|
+
|
114
|
+
|
115
|
+
class TestEstimateResourcesRequiredToProcessDelta:
    """
    Tests for ``estimate_resources_required_to_process_delta``.

    Every test asks for a ``PYARROW_DOWNLOAD`` estimate against the local
    deltacat storage module; only the delta fixture and the
    ``EstimateResourcesParams`` differ between tests.
    """

    # ------------------------------------------------------------------
    # Private helpers (names do not start with "test", so pytest does not
    # collect them).
    # ------------------------------------------------------------------

    def _estimate(self, delta, storage_kwargs, **extra_kwargs):
        """
        Call the estimator for a PYARROW_DOWNLOAD of ``delta``.

        ``extra_kwargs`` is forwarded as-is, so each test decides whether
        ``estimate_resources_params`` is passed at all.
        """
        return estimate_resources_required_to_process_delta(
            delta=delta,
            operation_type=OperationType.PYARROW_DOWNLOAD,
            deltacat_storage=ds,
            deltacat_storage_kwargs=storage_kwargs,
            **extra_kwargs
        )

    def _assert_on_disk_size_matches_meta(self, estimate, delta):
        """Assert the reported on-disk size equals ``delta.meta.content_length``."""
        assert estimate.statistics.on_disk_size_bytes == delta.meta.content_length

    def _assert_scaled_by_previous_inflation(self, estimate, delta, params):
        """
        Assert the estimate is ``content_length * previous_inflation`` and that
        the record count is that memory size divided by the average record size.
        """
        meta = delta.meta
        memory_bytes = estimate.memory_bytes

        assert memory_bytes != meta.source_content_length
        assert memory_bytes == meta.content_length * params.previous_inflation
        assert estimate.statistics.in_memory_size_bytes == memory_bytes
        self._assert_on_disk_size_matches_meta(estimate, delta)

        expected_record_count = int(memory_bytes / params.average_record_size_bytes)
        assert estimate.statistics.record_count == expected_record_count

    def _assert_meta_based_estimate(self, estimate, delta, expected_memory_bytes):
        """
        Assert the delta has a manifest, both memory figures truncate to
        ``expected_memory_bytes``, the on-disk size matches the delta meta and
        exactly 7 records are reported.
        """
        assert delta.manifest is not None
        assert int(estimate.memory_bytes) == expected_memory_bytes
        assert int(estimate.statistics.in_memory_size_bytes) == expected_memory_bytes
        self._assert_on_disk_size_matches_meta(estimate, delta)
        assert estimate.statistics.record_count == 7

    def _assert_memory_estimated(self, estimate, delta):
        """Assert a memory estimate exists and the on-disk size matches the meta."""
        assert estimate.memory_bytes is not None
        self._assert_on_disk_size_matches_meta(estimate, delta)

    # ------------------------------------------------------------------
    # Pre-populated meta
    # ------------------------------------------------------------------

    def test_delta_with_prepopulated_meta_returns_directly(
        self, local_deltacat_storage_kwargs, delta_with_populated_meta: Delta
    ):
        """With no params passed, the estimate mirrors the delta's own meta."""
        estimate = self._estimate(
            delta_with_populated_meta, local_deltacat_storage_kwargs
        )
        meta = delta_with_populated_meta.meta

        assert estimate.memory_bytes == meta.source_content_length
        assert estimate.statistics.in_memory_size_bytes == meta.source_content_length
        assert estimate.statistics.on_disk_size_bytes == meta.content_length
        assert estimate.statistics.record_count == meta.record_count

    # ------------------------------------------------------------------
    # DEFAULT method
    # ------------------------------------------------------------------

    def test_delta_manifest_empty_when_default_method(
        self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
    ):
        """DEFAULT method on a manifest-less delta scales by previous inflation."""
        default_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT,
            previous_inflation=7,
            average_record_size_bytes=1000,
        )

        estimate = self._estimate(
            delta_without_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=default_params,
        )

        self._assert_scaled_by_previous_inflation(
            estimate, delta_without_manifest, default_params
        )

    def test_delta_manifest_exists_when_default_method(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """DEFAULT method on a delta with a manifest scales by previous inflation."""
        default_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT,
            previous_inflation=7,
            average_record_size_bytes=1000,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=default_params,
        )

        self._assert_scaled_by_previous_inflation(
            estimate, parquet_delta_with_manifest, default_params
        )

    def test_previous_inflation_arg_not_passed_when_default_method(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """DEFAULT method without ``previous_inflation`` raises AssertionError."""
        with pytest.raises(AssertionError):
            # Both statements stay inside the context: an AssertionError from
            # either the params factory or the estimator satisfies the test.
            incomplete_params = EstimateResourcesParams.of(
                resource_estimation_method=ResourceEstimationMethod.DEFAULT,
                average_record_size_bytes=1000,
            )

            self._estimate(
                parquet_delta_with_manifest,
                local_deltacat_storage_kwargs,
                estimate_resources_params=incomplete_params,
            )

    def test_estimate_resources_params_not_passed_assumes_default(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """
        Params that omit ``resource_estimation_method`` behave like DEFAULT.

        Despite the test name, params ARE passed here; only the estimation
        method is left unspecified.
        """
        methodless_params = EstimateResourcesParams.of(
            previous_inflation=7,
            average_record_size_bytes=1000,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=methodless_params,
        )

        self._assert_scaled_by_previous_inflation(
            estimate, parquet_delta_with_manifest, methodless_params
        )

    # ------------------------------------------------------------------
    # CONTENT_TYPE_META method
    # ------------------------------------------------------------------

    def test_delta_manifest_empty_when_content_type_meta(
        self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
    ):
        """CONTENT_TYPE_META on a manifest-less delta yields 84 bytes, 7 records."""
        meta_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.CONTENT_TYPE_META,
            parquet_to_pyarrow_inflation=2,
        )

        estimate = self._estimate(
            delta_without_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=meta_params,
        )

        # The manifest was None going in; it must be populated afterwards.
        self._assert_meta_based_estimate(estimate, delta_without_manifest, 84)

    def test_delta_manifest_exists_when_content_type_meta(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """CONTENT_TYPE_META on a delta with manifest yields 464 bytes, 7 records."""
        meta_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.CONTENT_TYPE_META,
            parquet_to_pyarrow_inflation=2,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=meta_params,
        )

        self._assert_meta_based_estimate(estimate, parquet_delta_with_manifest, 464)

    # ------------------------------------------------------------------
    # INTELLIGENT_ESTIMATION method
    # ------------------------------------------------------------------

    def test_delta_manifest_empty_when_intelligent_estimation(
        self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
    ):
        """INTELLIGENT_ESTIMATION on a manifest-less delta: 84 bytes, 7 records."""
        intelligent_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
            parquet_to_pyarrow_inflation=2,
        )

        estimate = self._estimate(
            delta_without_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=intelligent_params,
        )

        # The manifest was None going in; it must be populated afterwards.
        self._assert_meta_based_estimate(estimate, delta_without_manifest, 84)

    def test_delta_manifest_exists_when_intelligent_estimation(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """INTELLIGENT_ESTIMATION on a delta with manifest: 168 bytes, 7 records."""
        intelligent_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
            parquet_to_pyarrow_inflation=2,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=intelligent_params,
        )

        self._assert_meta_based_estimate(estimate, parquet_delta_with_manifest, 168)

    def test_delta_manifest_exists_inflation_absent_when_intelligent_estimation(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """INTELLIGENT_ESTIMATION with no parquet inflation returns None."""
        params_without_inflation = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
            parquet_to_pyarrow_inflation=None,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=params_without_inflation,
        )

        assert estimate is None

    def test_delta_utsv_data_when_intelligent_estimation(
        self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
    ):
        """INTELLIGENT_ESTIMATION on an unescaped-TSV delta returns None."""
        intelligent_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
            parquet_to_pyarrow_inflation=2,
        )

        estimate = self._estimate(
            utsv_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=intelligent_params,
        )

        assert estimate is None

    # ------------------------------------------------------------------
    # FILE_SAMPLING method
    # ------------------------------------------------------------------

    def test_empty_delta_sampled_when_file_sampling(
        self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
    ):
        """FILE_SAMPLING on a manifest-less delta populates manifest and memory."""
        sampling_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
            max_files_to_sample=2,
        )

        estimate = self._estimate(
            delta_without_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=sampling_params,
        )

        assert delta_without_manifest.manifest is not None
        self._assert_memory_estimated(estimate, delta_without_manifest)

    def test_delta_manifest_parquet_when_file_sampling(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """FILE_SAMPLING on a parquet delta produces a memory estimate."""
        sampling_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
            max_files_to_sample=2,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=sampling_params,
        )

        self._assert_memory_estimated(estimate, parquet_delta_with_manifest)

    def test_delta_manifest_utsv_when_file_sampling(
        self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
    ):
        """FILE_SAMPLING on an unescaped-TSV delta produces a memory estimate."""
        sampling_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
            max_files_to_sample=2,
        )

        estimate = self._estimate(
            utsv_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=sampling_params,
        )

        self._assert_memory_estimated(estimate, utsv_delta_with_manifest)

    def test_delta_manifest_utsv_when_file_sampling_zero_files_to_sample(
        self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
    ):
        """
        FILE_SAMPLING with ``max_files_to_sample=None`` returns None.

        Note the test name says "zero", but the value passed is ``None``.
        """
        sampling_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
            max_files_to_sample=None,
        )

        estimate = self._estimate(
            utsv_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=sampling_params,
        )

        assert estimate is None

    # ------------------------------------------------------------------
    # DEFAULT_V2 method
    # ------------------------------------------------------------------

    def test_empty_delta_when_default_v2(
        self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
    ):
        """DEFAULT_V2 on a manifest-less delta populates manifest and memory."""
        v2_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
            max_files_to_sample=2,
            previous_inflation=7,
            average_record_size_bytes=1000,
        )

        estimate = self._estimate(
            delta_without_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=v2_params,
        )

        assert delta_without_manifest.manifest is not None
        self._assert_memory_estimated(estimate, delta_without_manifest)

    def test_parquet_delta_when_default_v2(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """DEFAULT_V2 on a parquet delta produces a memory estimate."""
        v2_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
            max_files_to_sample=2,
            previous_inflation=7,
            average_record_size_bytes=1000,
            parquet_to_pyarrow_inflation=1,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=v2_params,
        )

        assert parquet_delta_with_manifest.manifest is not None
        self._assert_memory_estimated(estimate, parquet_delta_with_manifest)

    def test_parquet_delta_when_default_v2_and_files_to_sample_zero(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """DEFAULT_V2 with ``max_files_to_sample=0`` still produces an estimate."""
        v2_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
            max_files_to_sample=0,
            previous_inflation=7,
            average_record_size_bytes=1000,
            parquet_to_pyarrow_inflation=1,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=v2_params,
        )

        assert parquet_delta_with_manifest.manifest is not None
        self._assert_memory_estimated(estimate, parquet_delta_with_manifest)

    def test_utsv_delta_when_default_v2(
        self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
    ):
        """DEFAULT_V2 on an unescaped-TSV delta produces a memory estimate."""
        v2_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
            max_files_to_sample=2,
            previous_inflation=7,
            average_record_size_bytes=1000,
            parquet_to_pyarrow_inflation=1,
        )

        estimate = self._estimate(
            utsv_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=v2_params,
        )

        assert utsv_delta_with_manifest.manifest is not None
        self._assert_memory_estimated(estimate, utsv_delta_with_manifest)

    def test_parquet_delta_without_inflation_when_default_v2(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """DEFAULT_V2 with no parquet inflation still produces an estimate."""
        v2_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
            max_files_to_sample=2,
            previous_inflation=7,
            average_record_size_bytes=1000,
            parquet_to_pyarrow_inflation=None,  # inflation is None
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=v2_params,
        )

        assert parquet_delta_with_manifest.manifest is not None
        self._assert_memory_estimated(estimate, parquet_delta_with_manifest)
|