deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +0 -1
  3. deltacat/compute/compactor/model/compact_partition_params.py +76 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
  5. deltacat/compute/compactor/model/delta_annotated.py +16 -9
  6. deltacat/compute/compactor_v2/constants.py +3 -0
  7. deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
  8. deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
  9. deltacat/compute/compactor_v2/utils/io.py +28 -14
  10. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  11. deltacat/compute/compactor_v2/utils/task_options.py +128 -183
  12. deltacat/compute/resource_estimation/__init__.py +27 -0
  13. deltacat/compute/resource_estimation/delta.py +271 -0
  14. deltacat/compute/resource_estimation/manifest.py +394 -0
  15. deltacat/compute/resource_estimation/model.py +165 -0
  16. deltacat/compute/resource_estimation/parquet.py +108 -0
  17. deltacat/constants.py +5 -0
  18. deltacat/exceptions.py +2 -4
  19. deltacat/logs.py +8 -0
  20. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
  21. deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
  22. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
  23. deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
  24. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
  25. deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
  26. deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
  27. deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
  28. deltacat/tests/compute/test_util_common.py +2 -0
  29. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
  30. deltacat/tests/test_logs.py +34 -0
  31. deltacat/tests/test_utils/pyarrow.py +15 -5
  32. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
  33. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
  34. deltacat/compute/metastats/meta_stats.py +0 -479
  35. deltacat/compute/metastats/model/__init__.py +0 -0
  36. deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
  37. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
  38. deltacat/compute/metastats/stats.py +0 -182
  39. deltacat/compute/metastats/utils/__init__.py +0 -0
  40. deltacat/compute/metastats/utils/constants.py +0 -16
  41. deltacat/compute/metastats/utils/io.py +0 -223
  42. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
  43. deltacat/compute/metastats/utils/ray_utils.py +0 -129
  44. deltacat/compute/stats/basic.py +0 -226
  45. deltacat/compute/stats/models/__init__.py +0 -0
  46. deltacat/compute/stats/models/delta_column_stats.py +0 -98
  47. deltacat/compute/stats/models/delta_stats.py +0 -233
  48. deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
  49. deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
  50. deltacat/compute/stats/models/stats_result.py +0 -104
  51. deltacat/compute/stats/utils/__init__.py +0 -0
  52. deltacat/compute/stats/utils/intervals.py +0 -94
  53. deltacat/compute/stats/utils/io.py +0 -230
  54. deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
  55. deltacat/tests/stats/__init__.py +0 -0
  56. deltacat/tests/stats/test_intervals.py +0 -49
  57. /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
  58. /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
  59. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
  60. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
  61. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,605 @@
1
+ import deltacat.tests.local_deltacat_storage as ds
2
+ from deltacat.types.media import ContentType
3
+ import os
4
+ import pytest
5
+ from deltacat.storage import Delta
6
+ from deltacat.compute.resource_estimation.delta import (
7
+ estimate_resources_required_to_process_delta,
8
+ )
9
+ from deltacat.compute.resource_estimation.model import (
10
+ OperationType,
11
+ EstimateResourcesParams,
12
+ ResourceEstimationMethod,
13
+ )
14
+
15
# Relative path of the CSV file that every delta fixture in this module is
# created from (passed as ``file_paths`` to ``create_delta_from_csv_file``).
DELTA_CSV_FILE_PATH = (
    "deltacat/tests/compute/resource_estimation/data/"
    "date_pk_table.csv"
)
18
+
19
+ """
20
+ Function scoped fixtures
21
+ """
22
+
23
+
24
@pytest.fixture(scope="function")
def local_deltacat_storage_kwargs():
    """Yield the kwargs dict for the local deltacat storage, then clean up.

    The yielded dict has a single ``db_file_path`` entry pointing at the test
    sqlite file. After the test finishes, that file is deleted if it exists.
    """
    sqlite_file = "deltacat/tests/local_deltacat_storage/db_test.sqlite"
    # see deltacat/tests/local_deltacat_storage/README.md for documentation
    storage_kwargs = {"db_file_path": sqlite_file}

    yield storage_kwargs

    # Teardown: drop the database file so it does not outlive the test.
    if os.path.exists(sqlite_file):
        os.remove(sqlite_file)
37
+
38
+
39
@pytest.fixture(scope="function")
def parquet_delta_with_manifest(local_deltacat_storage_kwargs):
    """Create a PARQUET delta from the CSV data file with zeroed size metadata.

    ``source_content_length`` and ``record_count`` are set to 0 on the delta
    meta and on every manifest entry meta before the delta is returned.

    These fixtures are function scoped as functions can modify the delta.
    """
    from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file

    def _zero_out(meta):
        # Reset the two metadata fields this fixture blanks out.
        meta["source_content_length"] = 0
        meta["record_count"] = 0

    delta = create_delta_from_csv_file(
        "test_namespace",
        file_paths=[DELTA_CSV_FILE_PATH],
        content_type=ContentType.PARQUET,
        **local_deltacat_storage_kwargs
    )

    _zero_out(delta.meta)
    for manifest_entry in delta.manifest.entries:
        _zero_out(manifest_entry.meta)

    return delta
60
+
61
+
62
@pytest.fixture(scope="function")
def utsv_delta_with_manifest(local_deltacat_storage_kwargs):
    """Create an UNESCAPED_TSV delta from the CSV data file with zeroed size metadata.

    ``source_content_length`` and ``record_count`` are set to 0 on the delta
    meta and on every manifest entry meta before the delta is returned.
    """
    from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file

    def _zero_out(meta):
        # Reset the two metadata fields this fixture blanks out.
        meta["source_content_length"] = 0
        meta["record_count"] = 0

    delta = create_delta_from_csv_file(
        "test_namespace",
        file_paths=[DELTA_CSV_FILE_PATH],
        content_type=ContentType.UNESCAPED_TSV,
        **local_deltacat_storage_kwargs
    )

    _zero_out(delta.meta)
    for manifest_entry in delta.manifest.entries:
        _zero_out(manifest_entry.meta)

    return delta
80
+
81
+
82
@pytest.fixture(scope="function")
def delta_without_manifest(local_deltacat_storage_kwargs):
    """Create a PARQUET delta, then strip its manifest and zero its size metadata.

    The returned delta has ``manifest`` set to ``None`` and both
    ``source_content_length`` and ``record_count`` set to 0 on its meta.
    """
    from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file

    manifestless_delta = create_delta_from_csv_file(
        "test_namespace",
        file_paths=[DELTA_CSV_FILE_PATH],
        content_type=ContentType.PARQUET,
        **local_deltacat_storage_kwargs
    )

    # now we intentionally remove manifest
    manifestless_delta.manifest = None

    delta_meta = manifestless_delta.meta
    delta_meta["source_content_length"] = 0
    delta_meta["record_count"] = 0

    return manifestless_delta
99
+
100
+
101
@pytest.fixture(scope="function")
def delta_with_populated_meta(local_deltacat_storage_kwargs):
    """Create a PARQUET delta from the CSV data file and return it unmodified."""
    from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file

    return create_delta_from_csv_file(
        "test_namespace",
        file_paths=[DELTA_CSV_FILE_PATH],
        content_type=ContentType.PARQUET,
        **local_deltacat_storage_kwargs
    )
113
+
114
+
115
class TestEstimateResourcesRequiredToProcessDelta:
    """Tests for ``estimate_resources_required_to_process_delta``.

    Every test calls the estimator with ``OperationType.PYARROW_DOWNLOAD``
    against the local deltacat storage module (``ds``) and checks the returned
    estimate for one resource estimation method and delta fixture.
    """

    @staticmethod
    def _estimate(delta, storage_kwargs, **extra_kwargs):
        """Call the estimator with the arguments shared by all tests.

        ``extra_kwargs`` is forwarded untouched, so ``estimate_resources_params``
        is only passed to the estimator when a test supplies it.
        """
        return estimate_resources_required_to_process_delta(
            delta=delta,
            operation_type=OperationType.PYARROW_DOWNLOAD,
            deltacat_storage=ds,
            deltacat_storage_kwargs=storage_kwargs,
            **extra_kwargs
        )

    @staticmethod
    def _check_on_disk_size(estimate, delta):
        """Assert the on-disk size statistic equals the delta's content length."""
        assert estimate.statistics.on_disk_size_bytes == delta.meta.content_length

    @staticmethod
    def _check_previous_inflation_estimate(estimate, delta, estimate_params):
        """Assert the estimate is content length scaled by ``previous_inflation``.

        Also checks that the record count is the in-memory size divided by
        ``average_record_size_bytes`` (truncated to int).
        """
        assert estimate.memory_bytes != delta.meta.source_content_length
        assert (
            estimate.memory_bytes
            == delta.meta.content_length * estimate_params.previous_inflation
        )
        assert estimate.statistics.in_memory_size_bytes == estimate.memory_bytes
        assert estimate.statistics.on_disk_size_bytes == delta.meta.content_length
        assert estimate.statistics.record_count == int(
            estimate.memory_bytes / estimate_params.average_record_size_bytes
        )

    @classmethod
    def _check_estimate_present(cls, estimate, delta):
        """Assert a memory estimate exists and the on-disk size matches the delta."""
        assert estimate.memory_bytes is not None
        cls._check_on_disk_size(estimate, delta)

    def test_delta_with_prepopulated_meta_returns_directly(
        self, local_deltacat_storage_kwargs, delta_with_populated_meta: Delta
    ):
        """Without params, the estimate mirrors the values already on the delta meta."""
        estimate = self._estimate(
            delta_with_populated_meta, local_deltacat_storage_kwargs
        )
        meta = delta_with_populated_meta.meta

        assert estimate.memory_bytes == meta.source_content_length
        assert estimate.statistics.in_memory_size_bytes == meta.source_content_length
        assert estimate.statistics.on_disk_size_bytes == meta.content_length
        assert estimate.statistics.record_count == meta.record_count

    def test_delta_manifest_empty_when_default_method(
        self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
    ):
        """DEFAULT method on a manifest-less delta scales by ``previous_inflation``."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT,
            previous_inflation=7,
            average_record_size_bytes=1000,
        )

        estimate = self._estimate(
            delta_without_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        self._check_previous_inflation_estimate(
            estimate, delta_without_manifest, estimate_params
        )

    def test_delta_manifest_exists_when_default_method(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """DEFAULT method on a parquet delta scales by ``previous_inflation``."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT,
            previous_inflation=7,
            average_record_size_bytes=1000,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        self._check_previous_inflation_estimate(
            estimate, parquet_delta_with_manifest, estimate_params
        )

    def test_previous_inflation_arg_not_passed_when_default_method(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """DEFAULT method without ``previous_inflation`` raises ``AssertionError``.

        NOTE(review): both the params construction and the estimator call sit
        inside the ``pytest.raises`` block, so either may be the one raising.
        """
        with pytest.raises(AssertionError):
            estimate_params = EstimateResourcesParams.of(
                resource_estimation_method=ResourceEstimationMethod.DEFAULT,
                average_record_size_bytes=1000,
            )

            self._estimate(
                parquet_delta_with_manifest,
                local_deltacat_storage_kwargs,
                estimate_resources_params=estimate_params,
            )

    def test_estimate_resources_params_not_passed_assumes_default(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """Params without an explicit method give the same result as DEFAULT.

        Only ``resource_estimation_method`` is omitted here; the params object
        itself is still passed to the estimator.
        """
        estimate_params = EstimateResourcesParams.of(
            previous_inflation=7,
            average_record_size_bytes=1000,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        self._check_previous_inflation_estimate(
            estimate, parquet_delta_with_manifest, estimate_params
        )

    def test_delta_manifest_empty_when_content_type_meta(
        self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
    ):
        """CONTENT_TYPE_META on a manifest-less delta populates the manifest."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.CONTENT_TYPE_META,
            parquet_to_pyarrow_inflation=2,
        )

        estimate = self._estimate(
            delta_without_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        # The fixture set the manifest to None; it must be set after the call.
        assert delta_without_manifest.manifest is not None
        assert int(estimate.memory_bytes) == 84
        assert int(estimate.statistics.in_memory_size_bytes) == 84
        self._check_on_disk_size(estimate, delta_without_manifest)
        assert estimate.statistics.record_count == 7

    def test_delta_manifest_exists_when_content_type_meta(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """CONTENT_TYPE_META on a parquet delta yields the expected fixed estimate."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.CONTENT_TYPE_META,
            parquet_to_pyarrow_inflation=2,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        assert parquet_delta_with_manifest.manifest is not None
        assert int(estimate.memory_bytes) == 464
        assert int(estimate.statistics.in_memory_size_bytes) == int(
            estimate.memory_bytes
        )
        self._check_on_disk_size(estimate, parquet_delta_with_manifest)
        assert estimate.statistics.record_count == 7

    def test_delta_manifest_empty_when_intelligent_estimation(
        self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
    ):
        """INTELLIGENT_ESTIMATION on a manifest-less delta populates the manifest."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
            parquet_to_pyarrow_inflation=2,
        )

        estimate = self._estimate(
            delta_without_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        # The fixture set the manifest to None; it must be set after the call.
        assert delta_without_manifest.manifest is not None
        assert int(estimate.memory_bytes) == 84
        assert int(estimate.statistics.in_memory_size_bytes) == 84
        self._check_on_disk_size(estimate, delta_without_manifest)
        assert estimate.statistics.record_count == 7

    def test_delta_manifest_exists_when_intelligent_estimation(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """INTELLIGENT_ESTIMATION on a parquet delta yields the expected fixed estimate."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
            parquet_to_pyarrow_inflation=2,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        assert parquet_delta_with_manifest.manifest is not None
        assert int(estimate.memory_bytes) == 168
        assert int(estimate.statistics.in_memory_size_bytes) == int(
            estimate.memory_bytes
        )
        self._check_on_disk_size(estimate, parquet_delta_with_manifest)
        assert estimate.statistics.record_count == 7

    def test_delta_manifest_exists_inflation_absent_when_intelligent_estimation(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """INTELLIGENT_ESTIMATION without a parquet inflation returns ``None``."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
            parquet_to_pyarrow_inflation=None,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        assert estimate is None

    def test_delta_utsv_data_when_intelligent_estimation(
        self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
    ):
        """INTELLIGENT_ESTIMATION on an unescaped-TSV delta returns ``None``."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
            parquet_to_pyarrow_inflation=2,
        )

        estimate = self._estimate(
            utsv_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        assert estimate is None

    def test_empty_delta_sampled_when_file_sampling(
        self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
    ):
        """FILE_SAMPLING on a manifest-less delta populates the manifest and estimates."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
            max_files_to_sample=2,
        )

        estimate = self._estimate(
            delta_without_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        assert delta_without_manifest.manifest is not None
        self._check_estimate_present(estimate, delta_without_manifest)

    def test_delta_manifest_parquet_when_file_sampling(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """FILE_SAMPLING on a parquet delta produces a memory estimate."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
            max_files_to_sample=2,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        self._check_estimate_present(estimate, parquet_delta_with_manifest)

    def test_delta_manifest_utsv_when_file_sampling(
        self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
    ):
        """FILE_SAMPLING on an unescaped-TSV delta produces a memory estimate."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
            max_files_to_sample=2,
        )

        estimate = self._estimate(
            utsv_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        self._check_estimate_present(estimate, utsv_delta_with_manifest)

    def test_delta_manifest_utsv_when_file_sampling_zero_files_to_sample(
        self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
    ):
        """FILE_SAMPLING with no sample budget returns ``None``.

        NOTE(review): despite the test name, ``max_files_to_sample`` is passed
        as ``None`` here rather than ``0``.
        """
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
            max_files_to_sample=None,
        )

        estimate = self._estimate(
            utsv_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        assert estimate is None

    def test_empty_delta_when_default_v2(
        self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
    ):
        """DEFAULT_V2 on a manifest-less delta populates the manifest and estimates."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
            max_files_to_sample=2,
            previous_inflation=7,
            average_record_size_bytes=1000,
        )

        estimate = self._estimate(
            delta_without_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        assert delta_without_manifest.manifest is not None
        self._check_estimate_present(estimate, delta_without_manifest)

    def test_parquet_delta_when_default_v2(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """DEFAULT_V2 on a parquet delta with all params set produces an estimate."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
            max_files_to_sample=2,
            previous_inflation=7,
            average_record_size_bytes=1000,
            parquet_to_pyarrow_inflation=1,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        assert parquet_delta_with_manifest.manifest is not None
        self._check_estimate_present(estimate, parquet_delta_with_manifest)

    def test_parquet_delta_when_default_v2_and_files_to_sample_zero(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """DEFAULT_V2 still produces an estimate when ``max_files_to_sample`` is 0."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
            max_files_to_sample=0,
            previous_inflation=7,
            average_record_size_bytes=1000,
            parquet_to_pyarrow_inflation=1,
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        assert parquet_delta_with_manifest.manifest is not None
        self._check_estimate_present(estimate, parquet_delta_with_manifest)

    def test_utsv_delta_when_default_v2(
        self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
    ):
        """DEFAULT_V2 on an unescaped-TSV delta produces an estimate."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
            max_files_to_sample=2,
            previous_inflation=7,
            average_record_size_bytes=1000,
            parquet_to_pyarrow_inflation=1,
        )

        estimate = self._estimate(
            utsv_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        assert utsv_delta_with_manifest.manifest is not None
        self._check_estimate_present(estimate, utsv_delta_with_manifest)

    def test_parquet_delta_without_inflation_when_default_v2(
        self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
    ):
        """DEFAULT_V2 still produces an estimate without a parquet inflation."""
        estimate_params = EstimateResourcesParams.of(
            resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
            max_files_to_sample=2,
            previous_inflation=7,
            average_record_size_bytes=1000,
            parquet_to_pyarrow_inflation=None,  # inflation is None
        )

        estimate = self._estimate(
            parquet_delta_with_manifest,
            local_deltacat_storage_kwargs,
            estimate_resources_params=estimate_params,
        )

        assert parquet_delta_with_manifest.manifest is not None
        self._check_estimate_present(estimate, parquet_delta_with_manifest)