deltacat 1.1.36__py3-none-any.whl → 1.1.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +14 -1
- deltacat/compute/resource_estimation/delta.py +8 -4
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +66 -0
- {deltacat-1.1.36.dist-info → deltacat-1.1.37.dist-info}/METADATA +1 -1
- {deltacat-1.1.36.dist-info → deltacat-1.1.37.dist-info}/RECORD +10 -10
- {deltacat-1.1.36.dist-info → deltacat-1.1.37.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-1.1.37.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-1.1.37.dist-info}/top_level.txt +0 -0
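deltacat/__init__.py
CHANGED
(the +1 -1 hunk for this file is not present in this extract)
deltacat/compute/compactor_v2/utils/primary_key_index.py
CHANGED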
@@ -78,13 +78,25 @@ def _append_table_by_hash_bucket(
|
|
78
78
|
f"Grouping a pki table of length {len(pki_table)} took {groupby_latency}s"
|
79
79
|
)
|
80
80
|
|
81
|
+
hb_pk_grouped_by = hb_pk_grouped_by.sort_by(sc._HASH_BUCKET_IDX_COLUMN_NAME)
|
81
82
|
group_count_array = hb_pk_grouped_by[f"{sc._HASH_BUCKET_IDX_COLUMN_NAME}_count"]
|
82
83
|
hb_group_array = hb_pk_grouped_by[sc._HASH_BUCKET_IDX_COLUMN_NAME]
|
83
84
|
|
84
85
|
result_len = 0
|
85
86
|
for i, group_count in enumerate(group_count_array):
|
86
87
|
hb_idx = hb_group_array[i].as_py()
|
87
|
-
|
88
|
+
group_count_py = group_count.as_py()
|
89
|
+
pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count_py)
|
90
|
+
assert group_count_py == len(
|
91
|
+
pyarrow_table
|
92
|
+
), f"Group count {group_count_py} not equal to {len(pyarrow_table)}"
|
93
|
+
all_buckets = pc.unique(pyarrow_table[sc._HASH_BUCKET_IDX_COLUMN_NAME])
|
94
|
+
assert (
|
95
|
+
len(all_buckets) == 1
|
96
|
+
), f"Only one hash bucket is allowed by found {len(all_buckets)}"
|
97
|
+
assert (
|
98
|
+
all_buckets[0].as_py() == hb_idx
|
99
|
+
), f"Hash bucket not equal, {all_buckets[0]} and {hb_idx}"
|
88
100
|
pyarrow_table = pyarrow_table.drop(
|
89
101
|
[sc._HASH_BUCKET_IDX_COLUMN_NAME, sc._PK_HASH_STRING_COLUMN_NAME]
|
90
102
|
)
|
@@ -141,6 +153,7 @@ def _optimized_group_record_batches_by_hash_bucket(
|
|
141
153
|
record_batches.append(record_batch)
|
142
154
|
|
143
155
|
if record_batches:
|
156
|
+
print(f"{len(record_batches)} -- END")
|
144
157
|
appended_len, append_latency = timed_invocation(
|
145
158
|
_append_table_by_hash_bucket,
|
146
159
|
pa.Table.from_batches(record_batches),
|
deltacat/compute/resource_estimation/delta.py
CHANGED
@@ -170,6 +170,10 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
|
|
170
170
|
operation_type == OperationType.PYARROW_DOWNLOAD
|
171
171
|
), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"
|
172
172
|
|
173
|
+
if not estimate_resources_params.max_files_to_sample:
|
174
|
+
# we cannot calculate if we cannot sample
|
175
|
+
return None
|
176
|
+
|
173
177
|
if not delta.manifest:
|
174
178
|
delta.manifest = deltacat_storage.get_delta_manifest(
|
175
179
|
delta.locator,
|
@@ -186,10 +190,6 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
|
|
186
190
|
),
|
187
191
|
)
|
188
192
|
|
189
|
-
if not estimate_resources_params.max_files_to_sample:
|
190
|
-
# we cannot calculate if we cannot sample
|
191
|
-
return None
|
192
|
-
|
193
193
|
sampled_in_memory_size = 0.0
|
194
194
|
sampled_on_disk_size = 0.0
|
195
195
|
sampled_num_rows = 0
|
@@ -252,6 +252,10 @@ RESOURCE_ESTIMATION_METHOD_TO_DELTA_RESOURCE_ESTIMATION_FUNCTIONS = {
|
|
252
252
|
_estimate_resources_required_to_process_delta_using_file_sampling,
|
253
253
|
_estimate_resources_required_to_process_delta_using_previous_inflation,
|
254
254
|
],
|
255
|
+
ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION: [
|
256
|
+
_estimate_resources_required_to_process_delta_using_file_sampling,
|
257
|
+
_estimate_resources_required_to_process_delta_using_previous_inflation,
|
258
|
+
],
|
255
259
|
}
|
256
260
|
|
257
261
|
|
deltacat/compute/resource_estimation/model.py
CHANGED
@@ -23,6 +23,14 @@ class ResourceEstimationMethod(str, Enum):
|
|
23
23
|
"""
|
24
24
|
DEFAULT_V2 = "DEFAULT_V2"
|
25
25
|
|
26
|
+
"""
|
27
|
+
This approach combines file sampling estimation and inflation based methods
|
28
|
+
and runs them in the order specified below:
|
29
|
+
1. FILE_SAMPLING
|
30
|
+
2. PREVIOUS_INFLATION
|
31
|
+
"""
|
32
|
+
FILE_SAMPLING_WITH_PREVIOUS_INFLATION = "FILE_SAMPLING_WITH_PREVIOUS_INFLATION"
|
33
|
+
|
26
34
|
"""
|
27
35
|
This approach strictly uses previous inflation and average record size to arrive
|
28
36
|
at a resource estimate. It requires users to pass in previous inflation and average
|
deltacat/tests/compute/resource_estimation/test_delta.py
CHANGED
@@ -416,6 +416,29 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
416
416
|
== delta_without_manifest.meta.content_length
|
417
417
|
)
|
418
418
|
|
419
|
+
def test_empty_delta_sampled_when_file_sampling_with_previous_inflation(
|
420
|
+
self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
|
421
|
+
):
|
422
|
+
params = EstimateResourcesParams.of(
|
423
|
+
resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION,
|
424
|
+
max_files_to_sample=2,
|
425
|
+
)
|
426
|
+
|
427
|
+
result = estimate_resources_required_to_process_delta(
|
428
|
+
delta=delta_without_manifest,
|
429
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
430
|
+
deltacat_storage=ds,
|
431
|
+
deltacat_storage_kwargs=local_deltacat_storage_kwargs,
|
432
|
+
estimate_resources_params=params,
|
433
|
+
)
|
434
|
+
|
435
|
+
assert delta_without_manifest.manifest is not None
|
436
|
+
assert result.memory_bytes is not None
|
437
|
+
assert (
|
438
|
+
result.statistics.on_disk_size_bytes
|
439
|
+
== delta_without_manifest.meta.content_length
|
440
|
+
)
|
441
|
+
|
419
442
|
def test_delta_manifest_parquet_when_file_sampling(
|
420
443
|
self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
421
444
|
):
|
@@ -437,6 +460,27 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
437
460
|
== parquet_delta_with_manifest.meta.content_length
|
438
461
|
)
|
439
462
|
|
463
|
+
def test_delta_manifest_parquet_when_file_sampling_with_previous_inflation(
|
464
|
+
self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
465
|
+
):
|
466
|
+
params = EstimateResourcesParams.of(
|
467
|
+
resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION,
|
468
|
+
max_files_to_sample=2,
|
469
|
+
)
|
470
|
+
|
471
|
+
result = estimate_resources_required_to_process_delta(
|
472
|
+
delta=parquet_delta_with_manifest,
|
473
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
474
|
+
deltacat_storage=ds,
|
475
|
+
deltacat_storage_kwargs=local_deltacat_storage_kwargs,
|
476
|
+
estimate_resources_params=params,
|
477
|
+
)
|
478
|
+
assert result.memory_bytes is not None
|
479
|
+
assert (
|
480
|
+
result.statistics.on_disk_size_bytes
|
481
|
+
== parquet_delta_with_manifest.meta.content_length
|
482
|
+
)
|
483
|
+
|
440
484
|
def test_parquet_delta_when_file_sampling_and_arrow_size_zero(
|
441
485
|
self,
|
442
486
|
local_deltacat_storage_kwargs,
|
@@ -512,6 +556,28 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
512
556
|
)
|
513
557
|
assert result is None
|
514
558
|
|
559
|
+
def test_delta_manifest_utsv_when_file_sampling_with_previous_inflation_zero_files_to_sample(
|
560
|
+
self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
|
561
|
+
):
|
562
|
+
previous_inflation = 7
|
563
|
+
params = EstimateResourcesParams.of(
|
564
|
+
resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION,
|
565
|
+
max_files_to_sample=None,
|
566
|
+
previous_inflation=previous_inflation,
|
567
|
+
)
|
568
|
+
|
569
|
+
result = estimate_resources_required_to_process_delta(
|
570
|
+
delta=utsv_delta_with_manifest,
|
571
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
572
|
+
deltacat_storage=ds,
|
573
|
+
deltacat_storage_kwargs=local_deltacat_storage_kwargs,
|
574
|
+
estimate_resources_params=params,
|
575
|
+
)
|
576
|
+
assert result is not None
|
577
|
+
assert result.memory_bytes == (
|
578
|
+
utsv_delta_with_manifest.meta.content_length * previous_inflation
|
579
|
+
)
|
580
|
+
|
515
581
|
def test_empty_delta_when_default_v2(
|
516
582
|
self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
|
517
583
|
):
|
{deltacat-1.1.36.dist-info → deltacat-1.1.37.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
deltacat/__init__.py,sha256=
|
1
|
+
deltacat/__init__.py,sha256=u00X92zHfZJzS08a-2kx3kCLcz40L-THm0HowDiBOiA,1778
|
2
2
|
deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
|
3
3
|
deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
|
4
4
|
deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
|
@@ -76,7 +76,7 @@ deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hd
|
|
76
76
|
deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
|
77
77
|
deltacat/compute/compactor_v2/utils/io.py,sha256=Xjs7_D-0xKSetvllIe4o96aM1elfdjt1Ii7YfsHPvZs,6108
|
78
78
|
deltacat/compute/compactor_v2/utils/merge.py,sha256=fAzEYwQYH2ia8MLdEFdZFivWHpi6qZu8AyyEK0H0vwE,5363
|
79
|
-
deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=
|
79
|
+
deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=Qsn0BQrlBWSLqu4srd-LJUX8BaVqG6Wo1oAros7LYWw,12677
|
80
80
|
deltacat/compute/compactor_v2/utils/task_options.py,sha256=0GoB_DLkCN1q8CVKTlWlDYt55qnpTDIa9fPyXJwB-cU,13801
|
81
81
|
deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
|
82
82
|
deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
|
@@ -85,9 +85,9 @@ deltacat/compute/merge_on_read/model/merge_on_read_params.py,sha256=Q51znagh8PtL
|
|
85
85
|
deltacat/compute/merge_on_read/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
86
86
|
deltacat/compute/merge_on_read/utils/delta.py,sha256=e4BtOHa5XPpUnR4r0HqBKjXckBsTI8qBwdUWwpJfkWQ,1367
|
87
87
|
deltacat/compute/resource_estimation/__init__.py,sha256=4bfBXcq-VAt9JCmjvj3yAmn0lEHVGdGsUCCoMGxjEqA,799
|
88
|
-
deltacat/compute/resource_estimation/delta.py,sha256=
|
88
|
+
deltacat/compute/resource_estimation/delta.py,sha256=zd1ivoA3EzdrjgJYYBXY3wrhwZDlt-Xoqke0e5xz6AY,10815
|
89
89
|
deltacat/compute/resource_estimation/manifest.py,sha256=gSqOyIda-pYq3vRsKFq3IiZvwhV3mMqrWPtsmUH9dD8,13035
|
90
|
-
deltacat/compute/resource_estimation/model.py,sha256=
|
90
|
+
deltacat/compute/resource_estimation/model.py,sha256=1svgVfhNIAyyVkHy-QXcOzO0UVigbVH8M7xyAlgvCbg,5741
|
91
91
|
deltacat/compute/resource_estimation/parquet.py,sha256=5_apma4EKbKcm-nfV73-qN2nfnCeyhFW23ZHX3jz0Kw,3158
|
92
92
|
deltacat/compute/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
93
93
|
deltacat/compute/stats/types.py,sha256=cp0lT8nITTKbnkc03OysRjXfcfXzQml9a4wqCnR6kqs,215
|
@@ -159,7 +159,7 @@ deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py,sha256=eoi
|
|
159
159
|
deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
|
160
160
|
deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=YDQKUKv3Vv8S1fe0YQmjHTrwnWSliqKHIWGu0fEdKnI,11478
|
161
161
|
deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
162
|
-
deltacat/tests/compute/resource_estimation/test_delta.py,sha256=
|
162
|
+
deltacat/tests/compute/resource_estimation/test_delta.py,sha256=vbqKwZOxrNtfbuXWz08nUvi_srR4y2aMQmUwLR2jDcs,28446
|
163
163
|
deltacat/tests/compute/resource_estimation/test_manifest.py,sha256=yrMvqDjolExdRf6Vtg5XaKDuaKz9ok15PCZ7_aJOYrI,32893
|
164
164
|
deltacat/tests/compute/resource_estimation/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
165
165
|
deltacat/tests/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -212,8 +212,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
|
|
212
212
|
deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
|
213
213
|
deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
|
214
214
|
deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
|
215
|
-
deltacat-1.1.
|
216
|
-
deltacat-1.1.
|
217
|
-
deltacat-1.1.
|
218
|
-
deltacat-1.1.
|
219
|
-
deltacat-1.1.
|
215
|
+
deltacat-1.1.37.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
216
|
+
deltacat-1.1.37.dist-info/METADATA,sha256=iHlaZ9sS-CrQby0kxCrOigl1ZGZKpniwf9LyYbagwzI,1733
|
217
|
+
deltacat-1.1.37.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
218
|
+
deltacat-1.1.37.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
219
|
+
deltacat-1.1.37.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|