deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +0 -1
  3. deltacat/compute/compactor/model/compact_partition_params.py +76 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
  5. deltacat/compute/compactor/model/delta_annotated.py +16 -9
  6. deltacat/compute/compactor_v2/constants.py +3 -0
  7. deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
  8. deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
  9. deltacat/compute/compactor_v2/utils/io.py +28 -14
  10. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  11. deltacat/compute/compactor_v2/utils/task_options.py +128 -183
  12. deltacat/compute/resource_estimation/__init__.py +27 -0
  13. deltacat/compute/resource_estimation/delta.py +271 -0
  14. deltacat/compute/resource_estimation/manifest.py +394 -0
  15. deltacat/compute/resource_estimation/model.py +165 -0
  16. deltacat/compute/resource_estimation/parquet.py +108 -0
  17. deltacat/constants.py +5 -0
  18. deltacat/exceptions.py +2 -4
  19. deltacat/logs.py +8 -0
  20. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
  21. deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
  22. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
  23. deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
  24. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
  25. deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
  26. deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
  27. deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
  28. deltacat/tests/compute/test_util_common.py +2 -0
  29. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
  30. deltacat/tests/test_logs.py +34 -0
  31. deltacat/tests/test_utils/pyarrow.py +15 -5
  32. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
  33. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
  34. deltacat/compute/metastats/meta_stats.py +0 -479
  35. deltacat/compute/metastats/model/__init__.py +0 -0
  36. deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
  37. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
  38. deltacat/compute/metastats/stats.py +0 -182
  39. deltacat/compute/metastats/utils/__init__.py +0 -0
  40. deltacat/compute/metastats/utils/constants.py +0 -16
  41. deltacat/compute/metastats/utils/io.py +0 -223
  42. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
  43. deltacat/compute/metastats/utils/ray_utils.py +0 -129
  44. deltacat/compute/stats/basic.py +0 -226
  45. deltacat/compute/stats/models/__init__.py +0 -0
  46. deltacat/compute/stats/models/delta_column_stats.py +0 -98
  47. deltacat/compute/stats/models/delta_stats.py +0 -233
  48. deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
  49. deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
  50. deltacat/compute/stats/models/stats_result.py +0 -104
  51. deltacat/compute/stats/utils/__init__.py +0 -0
  52. deltacat/compute/stats/utils/intervals.py +0 -94
  53. deltacat/compute/stats/utils/io.py +0 -230
  54. deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
  55. deltacat/tests/stats/__init__.py +0 -0
  56. deltacat/tests/stats/test_intervals.py +0 -49
  57. /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
  58. /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
  59. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
  60. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
  61. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,921 @@
1
+ import pytest
2
+ from deltacat.storage import ManifestEntry, ManifestMeta
3
+ import pyarrow.parquet as papq
4
+ from deltacat.types.partial_download import PartialParquetParameters
5
+ from deltacat.compute.resource_estimation.manifest import (
6
+ estimate_manifest_entry_column_size_bytes,
7
+ estimate_manifest_entry_num_rows,
8
+ estimate_manifest_entry_size_bytes,
9
+ )
10
+ from deltacat.compute.resource_estimation.model import (
11
+ OperationType,
12
+ EstimateResourcesParams,
13
+ ResourceEstimationMethod,
14
+ )
15
+
16
# Sample Parquet files read by the manifest entry fixtures in this module.
# NOTE(review): both paths are relative, so they presumably resolve against
# the directory pytest is launched from (the repository root) - confirm.
PARQUET_FILE_PATH_NO_STATS = (
    "deltacat/tests/compute/resource_estimation/data/sample_no_stats.parquet"
)
PARQUET_FILE_PATH_WITH_STATS = (
    "deltacat/tests/compute/resource_estimation/data/sample_with_stats.parquet"
)
22
+
23
+
24
@pytest.fixture(scope="module")
def sample_no_stats_entry():
    """Return a manifest entry describing ``PARQUET_FILE_PATH_NO_STATS``.

    The entry's only content type parameter wraps the Parquet metadata read
    from that file; the remaining manifest meta fields are fixed literals.
    """
    parquet_metadata = papq.ParquetFile(PARQUET_FILE_PATH_NO_STATS).metadata
    type_params = [PartialParquetParameters.of(pq_metadata=parquet_metadata)]
    meta = ManifestMeta.of(
        content_length=113629,
        record_count=0,
        content_type="application/parquet",
        content_encoding="identity",
        content_type_parameters=type_params,
    )
    entry = ManifestEntry.of(
        url=PARQUET_FILE_PATH_NO_STATS,
        uri=PARQUET_FILE_PATH_NO_STATS,
        mandatory=True,
        uuid="test",
        meta=meta,
    )
    return entry
44
+
45
+
46
@pytest.fixture(scope="module")
def sample_with_no_type_params():
    """Return a manifest entry whose content type parameters list is empty.

    The entry points at ``PARQUET_FILE_PATH_NO_STATS`` but, unlike the other
    fixtures in this module, the file itself is never opened here.
    """
    meta = ManifestMeta.of(
        content_length=113629,
        record_count=0,
        content_type="application/parquet",
        content_encoding="identity",
        content_type_parameters=[],
    )
    entry = ManifestEntry.of(
        url=PARQUET_FILE_PATH_NO_STATS,
        uri=PARQUET_FILE_PATH_NO_STATS,
        mandatory=True,
        uuid="test",
        meta=meta,
    )
    return entry
62
+
63
+
64
@pytest.fixture(scope="module")
def sample_with_stats_entry():
    """Return a manifest entry describing ``PARQUET_FILE_PATH_WITH_STATS``.

    The entry's only content type parameter wraps the Parquet metadata read
    from that file; the remaining manifest meta fields are fixed literals.
    """
    parquet_metadata = papq.ParquetFile(PARQUET_FILE_PATH_WITH_STATS).metadata
    type_params = [PartialParquetParameters.of(pq_metadata=parquet_metadata)]
    meta = ManifestMeta.of(
        content_length=113629,
        record_count=0,
        content_type="application/parquet",
        content_encoding="identity",
        content_type_parameters=type_params,
    )
    entry = ManifestEntry.of(
        url=PARQUET_FILE_PATH_WITH_STATS,
        uri=PARQUET_FILE_PATH_WITH_STATS,
        mandatory=True,
        uuid="test",
        meta=meta,
    )
    return entry
84
+
85
+
86
class TestEstimateManifestEntryColumnSizeBytes:
    """Pinned results of ``estimate_manifest_entry_column_size_bytes``.

    Every case requests ``OperationType.PYARROW_DOWNLOAD`` and compares the
    estimate for the sample manifest entry fixtures against a fixed value.
    """

    @staticmethod
    def _estimate(entry, params, **selection):
        """Invoke the estimator under test for a PyArrow download.

        ``selection`` is forwarded untouched, so ``columns`` reaches the
        estimator only when the calling test supplies it.
        """
        return estimate_manifest_entry_column_size_bytes(
            entry,
            operation_type=OperationType.PYARROW_DOWNLOAD,
            estimate_resources_params=params,
            **selection,
        )

    def test_when_no_columns_passed_sanity(
        self, sample_no_stats_entry, sample_with_stats_entry
    ):
        """With no column selection both sample entries truncate to 0."""
        params = EstimateResourcesParams.of(parquet_to_pyarrow_inflation=1)

        no_stats_size = self._estimate(sample_no_stats_entry, params)
        assert int(no_stats_size) == 0

        with_stats_size = self._estimate(sample_with_stats_entry, params)
        assert int(with_stats_size) == 0

    def test_when_no_columns_passed_with_intelligent_estimation(
        self, sample_no_stats_entry, sample_with_stats_entry
    ):
        """Same as the sanity case, but with INTELLIGENT_ESTIMATION selected."""
        params = EstimateResourcesParams.of(
            parquet_to_pyarrow_inflation=1,
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
        )

        no_stats_size = self._estimate(sample_no_stats_entry, params)
        assert int(no_stats_size) == 0

        with_stats_size = self._estimate(sample_with_stats_entry, params)
        assert int(with_stats_size) == 0

    def test_when_one_string_column_passed(
        self, sample_no_stats_entry, sample_with_stats_entry
    ):
        """Pin the estimate for the single ``first_name`` column."""
        params = EstimateResourcesParams.of(parquet_to_pyarrow_inflation=1)
        wanted = ["first_name"]

        no_stats_size = self._estimate(sample_no_stats_entry, params, columns=wanted)
        assert int(no_stats_size) == 2988

        with_stats_size = self._estimate(
            sample_with_stats_entry, params, columns=wanted
        )
        assert int(with_stats_size) == 2989

    def test_when_invalid_column_passed_assumes_null(self, sample_no_stats_entry):
        """A column name absent from the file still produces an estimate."""
        params = EstimateResourcesParams.of(parquet_to_pyarrow_inflation=1)

        size = self._estimate(
            sample_no_stats_entry, params, columns=["invalid_column"]
        )
        assert int(size) == 4000

    def test_when_multiple_columns_passed(
        self, sample_no_stats_entry, sample_with_stats_entry
    ):
        """Pin the estimate for ``first_name`` and ``id`` together."""
        params = EstimateResourcesParams.of(parquet_to_pyarrow_inflation=1)
        wanted = ["first_name", "id"]

        no_stats_size = self._estimate(sample_no_stats_entry, params, columns=wanted)
        assert int(no_stats_size) == 7031

        with_stats_size = self._estimate(
            sample_with_stats_entry, params, columns=wanted
        )
        assert int(with_stats_size) == 8314

    def test_when_timestamp_column_passed(
        self, sample_no_stats_entry, sample_with_stats_entry
    ):
        """Pin the ``registration_dttm`` estimate with an inflation of 2."""
        params = EstimateResourcesParams.of(parquet_to_pyarrow_inflation=2)
        wanted = ["registration_dttm"]

        no_stats_size = self._estimate(sample_no_stats_entry, params, columns=wanted)
        assert int(no_stats_size) == 26540

        with_stats_size = self._estimate(
            sample_with_stats_entry, params, columns=wanted
        )
        assert int(with_stats_size) == 18602

    def test_when_intelligent_estimation_enabled_single_column(
        self, sample_no_stats_entry, sample_with_stats_entry
    ):
        """INTELLIGENT_ESTIMATION on ``first_name`` for both sample entries."""
        params = EstimateResourcesParams.of(
            parquet_to_pyarrow_inflation=1,
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
        )
        wanted = ["first_name"]

        no_stats_size = self._estimate(sample_no_stats_entry, params, columns=wanted)
        assert int(no_stats_size) == 2988

        with_stats_size = self._estimate(
            sample_with_stats_entry, params, columns=wanted
        )
        assert int(with_stats_size) == 7000

    def test_when_intelligent_estimation_enabled_timestamp_column(
        self, sample_no_stats_entry, sample_with_stats_entry
    ):
        """INTELLIGENT_ESTIMATION on ``registration_dttm`` for both entries."""
        params = EstimateResourcesParams.of(
            parquet_to_pyarrow_inflation=1,
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
        )
        wanted = ["registration_dttm"]

        no_stats_size = self._estimate(sample_no_stats_entry, params, columns=wanted)
        assert int(no_stats_size) == 12000

        with_stats_size = self._estimate(
            sample_with_stats_entry, params, columns=wanted
        )
        assert int(with_stats_size) == 8000

    def test_when_intelligent_estimation_enabled_int_column(
        self, sample_no_stats_entry, sample_with_stats_entry
    ):
        """INTELLIGENT_ESTIMATION on ``id`` gives 4000 for both entries."""
        params = EstimateResourcesParams.of(
            parquet_to_pyarrow_inflation=1,
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
        )
        wanted = ["id"]

        no_stats_size = self._estimate(sample_no_stats_entry, params, columns=wanted)
        assert int(no_stats_size) == 4000

        with_stats_size = self._estimate(
            sample_with_stats_entry, params, columns=wanted
        )
        assert int(with_stats_size) == 4000

    def test_when_intelligent_estimation_enabled_double_column(
        self, sample_no_stats_entry, sample_with_stats_entry
    ):
        """INTELLIGENT_ESTIMATION on ``salary`` gives 8000 for both entries."""
        params = EstimateResourcesParams.of(
            parquet_to_pyarrow_inflation=1,
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
        )
        wanted = ["salary"]

        no_stats_size = self._estimate(sample_no_stats_entry, params, columns=wanted)
        assert int(no_stats_size) == 8000

        with_stats_size = self._estimate(
            sample_with_stats_entry, params, columns=wanted
        )
        assert int(with_stats_size) == 8000

    def test_when_intelligent_estimation_enabled_multiple_columns(
        self, sample_no_stats_entry, sample_with_stats_entry
    ):
        """INTELLIGENT_ESTIMATION on ``first_name`` plus ``id``."""
        params = EstimateResourcesParams.of(
            parquet_to_pyarrow_inflation=1,
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
        )
        wanted = ["first_name", "id"]

        no_stats_size = self._estimate(sample_no_stats_entry, params, columns=wanted)
        assert int(no_stats_size) == 6988

        with_stats_size = self._estimate(
            sample_with_stats_entry, params, columns=wanted
        )
        assert int(with_stats_size) == 11000

    def test_when_default_v2_enabled_multiple_columns(
        self, sample_no_stats_entry, sample_with_stats_entry
    ):
        """DEFAULT_V2 on ``first_name`` plus ``id`` for both sample entries."""
        params = EstimateResourcesParams.of(
            parquet_to_pyarrow_inflation=1,
            resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
        )
        wanted = ["first_name", "id"]

        no_stats_size = self._estimate(sample_no_stats_entry, params, columns=wanted)
        assert int(no_stats_size) == 6988

        with_stats_size = self._estimate(
            sample_with_stats_entry, params, columns=wanted
        )
        assert int(with_stats_size) == 11000

    def test_when_default_v2_enabled_multiple_columns_and_inflation_not_passed(
        self, sample_no_stats_entry
    ):
        """DEFAULT_V2 with a ``None`` inflation returns ``None``."""
        params = EstimateResourcesParams.of(
            parquet_to_pyarrow_inflation=None,
            resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
        )

        size = self._estimate(
            sample_no_stats_entry, params, columns=["first_name", "id"]
        )
        assert size is None

    def test_when_intelligent_estimation_enabled_with_no_type_params(
        self, sample_with_no_type_params
    ):
        """INTELLIGENT_ESTIMATION returns ``None`` without type params."""
        params = EstimateResourcesParams.of(
            parquet_to_pyarrow_inflation=1,
            resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
        )

        size = self._estimate(
            sample_with_no_type_params, params, columns=["first_name"]
        )
        assert size is None

    def test_when_previous_inflation_method_with_no_type_params(
        self, sample_with_no_type_params
    ):
        """PREVIOUS_INFLATION returns ``None`` without type params."""
        params = EstimateResourcesParams.of(
            parquet_to_pyarrow_inflation=1,
            resource_estimation_method=ResourceEstimationMethod.PREVIOUS_INFLATION,
        )

        size = self._estimate(
            sample_with_no_type_params, params, columns=["first_name"]
        )
        assert size is None

    def test_when_default_v2_method_with_no_type_params(
        self, sample_with_no_type_params
    ):
        """DEFAULT_V2 returns ``None`` without type params."""
        params = EstimateResourcesParams.of(
            parquet_to_pyarrow_inflation=1,
            resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
        )

        size = self._estimate(
            sample_with_no_type_params, params, columns=["first_name"]
        )
        assert size is None
501
+
502
+
503
class TestEstimateManifestEntryNumRows:
    """Pinned results of ``estimate_manifest_entry_num_rows``.

    Every case requests ``OperationType.PYARROW_DOWNLOAD`` with a previous
    inflation of 7 and an average record size of 1000 bytes; only the entry
    fixture, the estimation method and the Parquet inflation vary.
    """

    @staticmethod
    def _params(method, parquet_to_pyarrow_inflation=1):
        """Build the estimation params shared by every case in this class."""
        return EstimateResourcesParams.of(
            parquet_to_pyarrow_inflation=parquet_to_pyarrow_inflation,
            previous_inflation=7,
            average_record_size_bytes=1000,
            resource_estimation_method=method,
        )

    @staticmethod
    def _num_rows(entry, params):
        """Invoke the estimator under test for a PyArrow download."""
        return estimate_manifest_entry_num_rows(
            entry,
            operation_type=OperationType.PYARROW_DOWNLOAD,
            estimate_resources_params=params,
        )

    def test_sanity(self, sample_no_stats_entry):
        """DEFAULT on the no-stats entry reports 1000 rows."""
        params = self._params(ResourceEstimationMethod.DEFAULT)
        rows = self._num_rows(sample_no_stats_entry, params)
        assert rows == 1000

    def test_when_previous_inflation_forced(self, sample_no_stats_entry):
        """PREVIOUS_INFLATION on the no-stats entry reports 795 rows."""
        params = self._params(ResourceEstimationMethod.PREVIOUS_INFLATION)
        rows = self._num_rows(sample_no_stats_entry, params)
        assert rows == 795

    def test_when_type_params_absent_default_method(self, sample_with_no_type_params):
        """DEFAULT without type params reports 795 rows."""
        params = self._params(ResourceEstimationMethod.DEFAULT)
        rows = self._num_rows(sample_with_no_type_params, params)
        assert rows == 795

    def test_when_type_params_absent_intelligent_estimation(
        self, sample_with_no_type_params
    ):
        """INTELLIGENT_ESTIMATION without type params returns ``None``."""
        params = self._params(ResourceEstimationMethod.INTELLIGENT_ESTIMATION)
        rows = self._num_rows(sample_with_no_type_params, params)
        assert rows is None

    def test_when_type_params_absent_content_type_meta(
        self, sample_with_no_type_params
    ):
        """CONTENT_TYPE_META without type params returns ``None``."""
        params = self._params(ResourceEstimationMethod.CONTENT_TYPE_META)
        rows = self._num_rows(sample_with_no_type_params, params)
        assert rows is None

    def test_when_type_params_absent_previous_inflation(
        self, sample_with_no_type_params
    ):
        """PREVIOUS_INFLATION without type params reports 795 rows."""
        params = self._params(ResourceEstimationMethod.PREVIOUS_INFLATION)
        rows = self._num_rows(sample_with_no_type_params, params)
        assert rows == 795

    def test_when_type_params_absent_default_v2(self, sample_with_no_type_params):
        """DEFAULT_V2 without type params reports 795 rows."""
        params = self._params(ResourceEstimationMethod.DEFAULT_V2)
        rows = self._num_rows(sample_with_no_type_params, params)
        assert rows == 795  # same as previous inflation

    def test_when_type_params_no_stats_with_default_v2(self, sample_no_stats_entry):
        """DEFAULT_V2 on the no-stats entry reports 1000 rows."""
        params = self._params(ResourceEstimationMethod.DEFAULT_V2)
        rows = self._num_rows(sample_no_stats_entry, params)
        assert rows == 1000

    def test_when_type_params_parquet_inflation_absent_with_default_v2(
        self, sample_no_stats_entry
    ):
        """DEFAULT_V2 still reports 1000 rows when the inflation is ``None``."""
        params = self._params(
            ResourceEstimationMethod.DEFAULT_V2, parquet_to_pyarrow_inflation=None
        )
        rows = self._num_rows(sample_no_stats_entry, params)
        assert rows == 1000

    def test_when_type_params_with_default_v2(self, sample_with_stats_entry):
        """DEFAULT_V2 on the with-stats entry reports 1000 rows."""
        params = self._params(ResourceEstimationMethod.DEFAULT_V2)
        rows = self._num_rows(sample_with_stats_entry, params)
        assert rows == 1000
677
+
678
+
679
class TestEstimateManifestEntrySizeBytes:
    """Pinned results of ``estimate_manifest_entry_size_bytes``.

    Every case requests ``OperationType.PYARROW_DOWNLOAD`` with a previous
    inflation of 7; only the entry fixture, the estimation method and the
    Parquet inflation vary.
    """

    @staticmethod
    def _params(method, parquet_to_pyarrow_inflation=2):
        """Build the estimation params shared by every case in this class."""
        return EstimateResourcesParams.of(
            parquet_to_pyarrow_inflation=parquet_to_pyarrow_inflation,
            previous_inflation=7,
            resource_estimation_method=method,
        )

    @staticmethod
    def _size_bytes(entry, params):
        """Invoke the estimator under test for a PyArrow download."""
        return estimate_manifest_entry_size_bytes(
            entry,
            operation_type=OperationType.PYARROW_DOWNLOAD,
            estimate_resources_params=params,
        )

    def test_sanity(self, sample_no_stats_entry):
        """DEFAULT on the no-stats entry truncates to 224984 bytes."""
        params = self._params(ResourceEstimationMethod.DEFAULT)
        size = self._size_bytes(sample_no_stats_entry, params)
        assert int(size) == 224984

    def test_when_previous_inflation_forced(self, sample_no_stats_entry):
        """PREVIOUS_INFLATION on the no-stats entry truncates to 795403."""
        params = self._params(ResourceEstimationMethod.PREVIOUS_INFLATION)
        size = self._size_bytes(sample_no_stats_entry, params)
        assert int(size) == 795403

    def test_when_type_params_absent_default(self, sample_with_no_type_params):
        """DEFAULT without type params truncates to 795403 bytes."""
        params = self._params(ResourceEstimationMethod.DEFAULT)
        size = self._size_bytes(sample_with_no_type_params, params)
        assert int(size) == 795403

    def test_when_type_params_absent_intelligent_estimation(
        self, sample_with_no_type_params
    ):
        """INTELLIGENT_ESTIMATION without type params returns ``None``."""
        params = self._params(ResourceEstimationMethod.INTELLIGENT_ESTIMATION)
        size = self._size_bytes(sample_with_no_type_params, params)
        assert size is None

    def test_when_type_params_absent_content_meta(self, sample_with_no_type_params):
        """CONTENT_TYPE_META without type params returns ``None``."""
        params = self._params(ResourceEstimationMethod.CONTENT_TYPE_META)
        size = self._size_bytes(sample_with_no_type_params, params)
        assert size is None

    def test_when_type_params_absent_previous_inflation(
        self, sample_with_no_type_params
    ):
        """PREVIOUS_INFLATION without type params equals 795403 exactly."""
        params = self._params(ResourceEstimationMethod.PREVIOUS_INFLATION)
        size = self._size_bytes(sample_with_no_type_params, params)
        # Compared without int() truncation, as an exact value.
        assert size == 795403

    def test_when_intelligent_estimation_sanity(self, sample_no_stats_entry):
        """INTELLIGENT_ESTIMATION on the no-stats entry truncates to 223096."""
        params = self._params(ResourceEstimationMethod.INTELLIGENT_ESTIMATION)
        size = self._size_bytes(sample_no_stats_entry, params)
        assert int(size) == 223096

    def test_when_type_params_with_stats_default_method(self, sample_with_stats_entry):
        """DEFAULT on the with-stats entry truncates to 227794 bytes."""
        params = self._params(ResourceEstimationMethod.DEFAULT)
        size = self._size_bytes(sample_with_stats_entry, params)
        assert int(size) == 227794

    def test_when_type_params_with_stats_intelligent_method(
        self, sample_with_stats_entry
    ):
        """INTELLIGENT_ESTIMATION on the with-stats entry truncates to 290222."""
        params = self._params(ResourceEstimationMethod.INTELLIGENT_ESTIMATION)
        size = self._size_bytes(sample_with_stats_entry, params)
        assert int(size) == 290222

    def test_when_type_params_with_content_type_meta_method(
        self, sample_with_stats_entry
    ):
        """CONTENT_TYPE_META on the with-stats entry truncates to 227794."""
        params = self._params(ResourceEstimationMethod.CONTENT_TYPE_META)
        size = self._size_bytes(sample_with_stats_entry, params)
        assert int(size) == 227794

    def test_when_type_params_with_stats_default_v2_method(
        self, sample_with_stats_entry
    ):
        """DEFAULT_V2 on the with-stats entry truncates to 290222 bytes."""
        params = self._params(ResourceEstimationMethod.DEFAULT_V2)
        size = self._size_bytes(sample_with_stats_entry, params)
        assert int(size) == 290222  # same result as intelligent estimation

    def test_when_type_params_without_stats_default_v2_method(
        self, sample_no_stats_entry
    ):
        """DEFAULT_V2 on the no-stats entry truncates to 223096 bytes."""
        params = self._params(ResourceEstimationMethod.DEFAULT_V2)
        size = self._size_bytes(sample_no_stats_entry, params)
        assert int(size) == 223096

    def test_when_no_type_params_default_v2_method(self, sample_with_no_type_params):
        """DEFAULT_V2 without type params equals 795403 exactly."""
        params = self._params(ResourceEstimationMethod.DEFAULT_V2)
        size = self._size_bytes(sample_with_no_type_params, params)
        assert size == 795403  # same as previous inflation

    def test_when_type_params_but_inflation_absent_default_v2_method(
        self, sample_with_stats_entry
    ):
        """DEFAULT_V2 with a ``None`` inflation equals 795403 exactly."""
        params = self._params(
            ResourceEstimationMethod.DEFAULT_V2, parquet_to_pyarrow_inflation=None
        )
        size = self._size_bytes(sample_with_stats_entry, params)
        assert size == 795403  # same as previous inflation