deltacat 1.1.14__py3-none-any.whl → 1.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/compaction_session.py +3 -2
- deltacat/compute/compactor/model/compact_partition_params.py +11 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +2 -2
- deltacat/compute/compactor/model/delta_annotated.py +2 -4
- deltacat/compute/compactor/steps/hash_bucket.py +2 -3
- deltacat/compute/compactor_v2/compaction_session.py +27 -33
- deltacat/compute/compactor_v2/constants.py +4 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +112 -67
- deltacat/compute/compactor_v2/steps/merge.py +0 -3
- deltacat/compute/compactor_v2/utils/delta.py +2 -3
- deltacat/compute/compactor_v2/utils/io.py +0 -2
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +855 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +147 -1
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +330 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +118 -0
- deltacat/tests/local_deltacat_storage/__init__.py +8 -5
- {deltacat-1.1.14.dist-info → deltacat-1.1.16.dist-info}/METADATA +1 -1
- {deltacat-1.1.14.dist-info → deltacat-1.1.16.dist-info}/RECORD +24 -22
- {deltacat-1.1.14.dist-info → deltacat-1.1.16.dist-info}/LICENSE +0 -0
- {deltacat-1.1.14.dist-info → deltacat-1.1.16.dist-info}/WHEEL +0 -0
- {deltacat-1.1.14.dist-info → deltacat-1.1.16.dist-info}/top_level.txt +0 -0
@@ -87,6 +87,7 @@ class TestCompactionSession:
|
|
87
87
|
INCREMENTAL_FILE_PATH = (
|
88
88
|
"deltacat/tests/compute/compactor_v2/data/incremental_source_date_pk.csv"
|
89
89
|
)
|
90
|
+
ERROR_RATE = 0.05
|
90
91
|
|
91
92
|
def test_compact_partition_when_no_input_deltas_to_compact(
|
92
93
|
self, local_deltacat_storage_kwargs
|
@@ -178,7 +179,7 @@ class TestCompactionSession:
|
|
178
179
|
},
|
179
180
|
"primary_keys": [],
|
180
181
|
"rebase_source_partition_locator": source_delta.partition_locator,
|
181
|
-
"rebase_source_partition_high_watermark":
|
182
|
+
"rebase_source_partition_high_watermark": source_delta.stream_position,
|
182
183
|
"records_per_compacted_file": 4000,
|
183
184
|
"s3_client_kwargs": {},
|
184
185
|
"source_partition_locator": source_delta.partition_locator,
|
@@ -253,3 +254,148 @@ class TestCompactionSession:
|
|
253
254
|
# as it should be running incremental
|
254
255
|
assert compaction_audit.uniform_deltas_created == 1
|
255
256
|
assert compaction_audit.input_records == 6
|
257
|
+
|
258
|
+
def test_compact_partition_when_incremental_then_rcf_stats_accurate(
    self, s3_resource, local_deltacat_storage_kwargs
):
    """
    A test case which asserts the RCF stats are correctly generated for
    a rebase and incremental use-case.

    Round one rebases the backfill delta into a fresh destination
    partition; round two commits the incremental file to the same source
    partition and compacts again without any rebase parameters. After each
    round the round completion file (RCF) and the compaction audit it
    points to are read back and checked.
    """

    def run_compaction(
        destination_locator,
        last_stream_position,
        rebase_locator,
        rebase_high_watermark,
        source_locator,
    ):
        # Both rounds share every setting except the five arguments above.
        return compact_partition(
            CompactPartitionParams.of(
                {
                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
                    "compacted_file_content_type": ContentType.PARQUET,
                    "dd_max_parallelism_ratio": 1.0,
                    "deltacat_storage": ds,
                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
                    "destination_partition_locator": destination_locator,
                    "drop_duplicates": True,
                    "hash_bucket_count": 2,
                    "last_stream_position_to_compact": last_stream_position,
                    "list_deltas_kwargs": {
                        **local_deltacat_storage_kwargs,
                        **{"equivalent_table_types": []},
                    },
                    "primary_keys": ["pk"],
                    "rebase_source_partition_locator": rebase_locator,
                    "rebase_source_partition_high_watermark": rebase_high_watermark,
                    "records_per_compacted_file": 4000,
                    "s3_client_kwargs": {},
                    "source_partition_locator": source_locator,
                }
            )
        )

    def load_rcf_and_audit(url):
        # Returns the RCF at `url` together with the audit info it references.
        rcf = get_rcf(s3_resource, url)
        audit_url = rcf.compaction_audit_url
        # str.strip("s3://") removes any of the characters 's', '3', ':', '/'
        # from BOTH ends of the string, so it can eat into the bucket name or
        # the key. Drop the scheme prefix explicitly instead.
        scheme = "s3://"
        if audit_url.startswith(scheme):
            audit_url = audit_url[len(scheme) :]
        _, audit_key = audit_url.split("/", 1)
        audit = CompactionSessionAuditInfo(
            **read_s3_contents(s3_resource, TEST_S3_RCF_BUCKET_NAME, audit_key)
        )
        return rcf, audit

    # setup
    staged_source = stage_partition_from_file_paths(
        self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
    )

    source_delta = commit_delta_to_staged_partition(
        staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
    )

    staged_dest = stage_partition_from_file_paths(
        self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
    )
    dest_partition = ds.commit_partition(
        staged_dest, **local_deltacat_storage_kwargs
    )

    # action
    rcf_url = run_compaction(
        destination_locator=dest_partition.locator,
        last_stream_position=source_delta.stream_position,
        rebase_locator=source_delta.partition_locator,
        rebase_high_watermark=source_delta.stream_position,
        source_locator=source_delta.partition_locator,
    )

    backfill_rcf, compaction_audit = load_rcf_and_audit(rcf_url)

    assert abs(backfill_rcf.input_inflation - 0.05235042735042735) <= 1e-5
    assert abs(backfill_rcf.input_average_record_size_bytes - 12.25) <= 1e-5

    assert compaction_audit.input_records == 4
    assert compaction_audit.records_deduped == 0
    assert compaction_audit.records_deleted == 0
    assert compaction_audit.untouched_file_count == 0
    assert compaction_audit.untouched_record_count == 0
    assert compaction_audit.untouched_size_bytes == 0
    assert compaction_audit.untouched_file_ratio == 0
    assert compaction_audit.uniform_deltas_created == 1
    assert compaction_audit.hash_bucket_count == 2
    assert compaction_audit.input_file_count == 1
    assert compaction_audit.output_file_count == 2
    assert abs(compaction_audit.output_size_bytes - 1832) / 1832 <= self.ERROR_RATE
    assert abs(compaction_audit.input_size_bytes - 936) / 936 <= self.ERROR_RATE

    # Now run an incremental compaction and verify if the previous RCF was read properly.
    new_source_delta = commit_delta_to_partition(
        source_delta.partition_locator,
        [self.INCREMENTAL_FILE_PATH],
        **local_deltacat_storage_kwargs,
    )

    new_destination_partition = ds.get_partition(
        dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
    )

    new_rcf_url = run_compaction(
        destination_locator=new_destination_partition.locator,
        last_stream_position=new_source_delta.stream_position,
        rebase_locator=None,
        rebase_high_watermark=None,
        source_locator=new_source_delta.partition_locator,
    )

    new_rcf, compaction_audit = load_rcf_and_audit(new_rcf_url)

    # as it should be running incremental
    assert abs(new_rcf.input_inflation - 0.027292576419213975) <= 1e-5
    assert abs(new_rcf.input_average_record_size_bytes - 12.5) <= 1e-5

    assert compaction_audit.input_records == 6
    assert compaction_audit.records_deduped == 1
    assert compaction_audit.records_deleted == 0
    assert compaction_audit.untouched_file_count == 1
    assert compaction_audit.untouched_record_count == 2
    assert (
        abs(compaction_audit.untouched_size_bytes - 916) / 916 <= self.ERROR_RATE
    )  # 5% error
    assert abs(compaction_audit.untouched_file_ratio - 50) <= 1e-5
    assert compaction_audit.uniform_deltas_created == 1
    assert compaction_audit.hash_bucket_count == 2
    assert compaction_audit.input_file_count == 3
    assert compaction_audit.output_file_count == 2
    assert abs(compaction_audit.output_size_bytes - 1843) / 1843 <= self.ERROR_RATE
    assert abs(compaction_audit.input_size_bytes - 2748) / 2748 <= self.ERROR_RATE
|
@@ -0,0 +1,330 @@
|
|
1
|
+
import ray
|
2
|
+
import os
|
3
|
+
from moto import mock_s3
|
4
|
+
import pytest
|
5
|
+
import boto3
|
6
|
+
from boto3.resources.base import ServiceResource
|
7
|
+
import pyarrow as pa
|
8
|
+
from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
|
9
|
+
from pytest_benchmark.fixture import BenchmarkFixture
|
10
|
+
|
11
|
+
from deltacat.tests.compute.test_util_constant import (
|
12
|
+
TEST_S3_RCF_BUCKET_NAME,
|
13
|
+
DEFAULT_NUM_WORKERS,
|
14
|
+
DEFAULT_WORKER_INSTANCE_CPUS,
|
15
|
+
)
|
16
|
+
from deltacat.tests.compute.test_util_common import (
|
17
|
+
get_rcf,
|
18
|
+
)
|
19
|
+
from deltacat.tests.test_utils.utils import read_s3_contents
|
20
|
+
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
21
|
+
from deltacat.tests.compute.test_util_common import (
|
22
|
+
get_compacted_delta_locator_from_rcf,
|
23
|
+
)
|
24
|
+
from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
25
|
+
CompactionSessionAuditInfo,
|
26
|
+
)
|
27
|
+
from deltacat.tests.compute.test_util_create_table_deltas_repo import (
|
28
|
+
multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy,
|
29
|
+
)
|
30
|
+
from deltacat.tests.compute.compact_partition_multiple_rounds_test_cases import (
|
31
|
+
MULTIPLE_ROUNDS_TEST_CASES,
|
32
|
+
)
|
33
|
+
from typing import Any, Callable, Dict, List, Optional, Set
|
34
|
+
from deltacat.types.media import StorageType
|
35
|
+
from deltacat.storage import (
|
36
|
+
DeltaLocator,
|
37
|
+
Partition,
|
38
|
+
)
|
39
|
+
from deltacat.types.media import ContentType
|
40
|
+
from deltacat.compute.compactor.model.compact_partition_params import (
|
41
|
+
CompactPartitionParams,
|
42
|
+
)
|
43
|
+
from deltacat.compute.compactor import (
|
44
|
+
RoundCompletionInfo,
|
45
|
+
)
|
46
|
+
from deltacat.utils.placement import (
|
47
|
+
PlacementGroupManager,
|
48
|
+
)
|
49
|
+
|
50
|
+
DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
|
51
|
+
"db_file_path",
|
52
|
+
"deltacat/tests/local_deltacat_storage/db_test.sqlite",
|
53
|
+
)
|
54
|
+
|
55
|
+
|
56
|
+
"""
|
57
|
+
MODULE scoped fixtures
|
58
|
+
"""
|
59
|
+
|
60
|
+
|
61
|
+
@pytest.fixture(autouse=True, scope="module")
def setup_ray_cluster():
    """Initialise Ray in local mode for this module and shut it down afterwards."""
    ray_init_options = {"local_mode": True, "ignore_reinit_error": True}
    ray.init(**ray_init_options)
    yield
    ray.shutdown()
|
66
|
+
|
67
|
+
|
68
|
+
@pytest.fixture(autouse=True, scope="module")
|
69
|
+
def mock_aws_credential():
|
70
|
+
os.environ["AWS_ACCESS_KEY_ID"] = "testing"
|
71
|
+
os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
|
72
|
+
os.environ["AWS_SECURITY_TOKEN"] = "testing"
|
73
|
+
os.environ["AWS_SESSION_TOKEN"] = "testing"
|
74
|
+
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
|
75
|
+
yield
|
76
|
+
|
77
|
+
|
78
|
+
@pytest.fixture(autouse=True, scope="module")
def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
    """Remove the local storage database file around this module's tests.

    A stale file is removed before the first test (the pre-existing
    behaviour), and again once every test in the module has finished, which
    is what the fixture's name promises.
    """

    def remove_database_file():
        if os.path.exists(DATABASE_FILE_PATH_VALUE):
            os.remove(DATABASE_FILE_PATH_VALUE)

    remove_database_file()
    # Without a yield the fixture body only ever ran at setup time, so the
    # "after all tests" cleanup never happened.
    yield
    # make sure the database file is deleted after all the compactor package tests are completed
    remove_database_file()
|
83
|
+
|
84
|
+
|
85
|
+
@pytest.fixture(scope="module")
def s3_resource(mock_aws_credential):
    """Yield a boto3 S3 resource created while the moto S3 mock is active."""
    with mock_s3():
        mocked_resource = boto3.resource("s3")
        yield mocked_resource
|
89
|
+
|
90
|
+
|
91
|
+
@pytest.fixture(autouse=True, scope="module")
def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
    """Create the bucket named by TEST_S3_RCF_BUCKET_NAME once per module."""
    bucket_settings = {
        "ACL": "authenticated-read",
        "Bucket": TEST_S3_RCF_BUCKET_NAME,
    }
    s3_resource.create_bucket(**bucket_settings)
    yield
|
98
|
+
|
99
|
+
|
100
|
+
"""
|
101
|
+
FUNCTION scoped fixtures
|
102
|
+
"""
|
103
|
+
|
104
|
+
|
105
|
+
@pytest.fixture(scope="function")
def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
    """Yield the kwargs for the local deltacat storage, then delete its database file.

    See deltacat/tests/local_deltacat_storage/README.md for documentation.
    """
    storage_kwargs: Dict[str, Any] = {}
    storage_kwargs[DATABASE_FILE_PATH_KEY] = DATABASE_FILE_PATH_VALUE
    yield storage_kwargs
    # Teardown: each test starts from a database file that does not exist yet.
    database_file_present = os.path.exists(DATABASE_FILE_PATH_VALUE)
    if database_file_present:
        os.remove(DATABASE_FILE_PATH_VALUE)
|
114
|
+
|
115
|
+
|
116
|
+
@pytest.mark.parametrize(
    [
        "test_name",
        "primary_keys",
        "sort_keys",
        "partition_keys_param",
        "partition_values_param",
        "input_deltas_param",
        "expected_terminal_compact_partition_result",
        "expected_terminal_exception",
        "expected_terminal_exception_message",
        "create_placement_group_param",
        "records_per_compacted_file_param",
        "hash_bucket_count_param",
        "read_kwargs_provider_param",
        "drop_duplicates_param",
        "skip_enabled_compact_partition_drivers",
        "assert_compaction_audit",
        "rebase_expected_compact_partition_result",
        "num_rounds_param",
        "compact_partition_func",
        "compactor_version",
    ],
    [
        # Values are forwarded positionally, in the order each test case
        # yields them, so they line up with the argument names above without
        # a second, hand-maintained list of labels that can drift out of order.
        (test_name, *test_case)
        for test_name, test_case in MULTIPLE_ROUNDS_TEST_CASES.items()
    ],
    ids=list(MULTIPLE_ROUNDS_TEST_CASES),
)
def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
    mocker,
    s3_resource: ServiceResource,
    local_deltacat_storage_kwargs: Dict[str, Any],
    test_name: str,
    primary_keys: Set[str],
    sort_keys: List[Optional[Any]],
    partition_keys_param: Optional[List[Any]],
    partition_values_param: List[Optional[str]],
    input_deltas_param: List[pa.Array],
    expected_terminal_compact_partition_result: pa.Table,
    expected_terminal_exception: BaseException,
    expected_terminal_exception_message: Optional[str],
    create_placement_group_param: bool,
    records_per_compacted_file_param: int,
    hash_bucket_count_param: int,
    drop_duplicates_param: bool,
    read_kwargs_provider_param: Any,
    rebase_expected_compact_partition_result: pa.Table,
    skip_enabled_compact_partition_drivers: List[CompactorVersion],
    assert_compaction_audit: Optional[Callable],
    compactor_version: Optional[CompactorVersion],
    compact_partition_func: Callable,
    num_rounds_param: int,
    benchmark: BenchmarkFixture,
):
    """
    This test tests different multi-round compaction rebase configurations,
    as specified in compact_partition_multiple_rounds_test_cases.py
    These tests do not test multi-round compaction backfill, which is
    currently unsupported.
    """
    import deltacat.tests.local_deltacat_storage as ds

    ds_mock_kwargs = local_deltacat_storage_kwargs

    # setup: create the source and rebase tables and commit the input deltas
    (
        source_table_stream,
        _,
        rebased_table_stream,
        _,
    ) = multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy(
        primary_keys,
        sort_keys,
        partition_keys_param,
        input_deltas_param,
        partition_values_param,
        ds_mock_kwargs,
    )
    source_partition: Partition = ds.get_partition(
        source_table_stream.locator,
        partition_values_param,
        **ds_mock_kwargs,
    )
    rebased_partition: Partition = ds.get_partition(
        rebased_table_stream.locator,
        partition_values_param,
        **ds_mock_kwargs,
    )
    total_cpus = DEFAULT_NUM_WORKERS * DEFAULT_WORKER_INSTANCE_CPUS
    pgm = None
    if create_placement_group_param:
        pgm = PlacementGroupManager(
            1, total_cpus, DEFAULT_WORKER_INSTANCE_CPUS, memory_per_bundle=4000000
        ).pgs[0]
    compact_partition_params = CompactPartitionParams.of(
        {
            "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
            "compacted_file_content_type": ContentType.PARQUET,
            "dd_max_parallelism_ratio": 1.0,
            "deltacat_storage": ds,
            "deltacat_storage_kwargs": ds_mock_kwargs,
            "destination_partition_locator": rebased_partition.locator,
            "hash_bucket_count": hash_bucket_count_param,
            "last_stream_position_to_compact": source_partition.stream_position,
            "list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
            "object_store": RayPlasmaObjectStore(),
            "pg_config": pgm,
            "primary_keys": primary_keys,
            "read_kwargs_provider": read_kwargs_provider_param,
            "rebase_source_partition_locator": source_partition.locator,
            "rebase_source_partition_high_watermark": rebased_partition.stream_position,
            "records_per_compacted_file": records_per_compacted_file_param,
            "s3_client_kwargs": {},
            "source_partition_locator": rebased_partition.locator,
            "sort_keys": sort_keys if sort_keys else None,
            "num_rounds": num_rounds_param,
            "drop_duplicates": drop_duplicates_param,
            "min_delta_bytes": 560,
        }
    )
    if expected_terminal_exception:
        with pytest.raises(expected_terminal_exception) as exc_info:
            benchmark(compact_partition_func, compact_partition_params)
        assert expected_terminal_exception_message in str(exc_info.value)
        return
    from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
        ExecutionCompactionResult,
    )

    execute_compaction_result_spy = mocker.spy(ExecutionCompactionResult, "__init__")

    # execute
    rcf_file_s3_uri = benchmark(compact_partition_func, compact_partition_params)

    round_completion_info: RoundCompletionInfo = get_rcf(s3_resource, rcf_file_s3_uri)
    audit_bucket, audit_key = RoundCompletionInfo.get_audit_bucket_name_and_key(
        round_completion_info.compaction_audit_url
    )

    compaction_audit_obj: Dict[str, Any] = read_s3_contents(
        s3_resource, audit_bucket, audit_key
    )
    compaction_audit: CompactionSessionAuditInfo = CompactionSessionAuditInfo(
        **compaction_audit_obj
    )

    # Assert not in-place compacted
    assert (
        execute_compaction_result_spy.call_args.args[-1] is False
    ), "Table version erroneously marked as in-place compacted!"
    compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
        s3_resource, rcf_file_s3_uri
    )
    tables = ds.download_delta(
        compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
    )
    actual_rebase_compacted_table = pa.concat_tables(tables)
    # if no primary key is specified then sort by sort_key for consistent assertion
    sorting_cols: List[Any] = (
        [(val, "ascending") for val in primary_keys] if primary_keys else sort_keys
    )
    rebase_expected_compact_partition_result = (
        rebase_expected_compact_partition_result.combine_chunks().sort_by(sorting_cols)
    )
    actual_rebase_compacted_table = (
        actual_rebase_compacted_table.combine_chunks().sort_by(sorting_cols)
    )
    assert actual_rebase_compacted_table.equals(
        rebase_expected_compact_partition_result
    ), f"{actual_rebase_compacted_table} does not match {rebase_expected_compact_partition_result}"

    if assert_compaction_audit:
        assert assert_compaction_audit(
            compactor_version, compaction_audit
        ), "Compaction audit assertion failed"
|
@@ -279,7 +279,7 @@ def test_compact_partition_rebase_same_source_and_destination(
|
|
279
279
|
execute_compaction_result_spy = mocker.spy(ExecutionCompactionResult, "__init__")
|
280
280
|
|
281
281
|
# execute
|
282
|
-
rcf_file_s3_uri = compact_partition_func
|
282
|
+
rcf_file_s3_uri = benchmark(compact_partition_func, compact_partition_params)
|
283
283
|
|
284
284
|
round_completion_info: RoundCompletionInfo = get_rcf(s3_resource, rcf_file_s3_uri)
|
285
285
|
audit_bucket, audit_key = RoundCompletionInfo.get_audit_bucket_name_and_key(
|
@@ -269,3 +269,121 @@ def create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
|
269
269
|
destination_table_stream,
|
270
270
|
rebased_stream_after_committed,
|
271
271
|
)
|
272
|
+
|
273
|
+
|
274
|
+
def multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy(
    primary_keys: Set[str],
    sort_keys: Optional[List[Any]],
    partition_keys: Optional[List[PartitionKey]],
    input_deltas: List[pa.Table],
    partition_values: Optional[List[Any]],
    ds_mock_kwargs: Optional[Dict[str, Any]],
) -> Tuple[Stream, Stream, Optional[Stream], bool]:
    """Create source, destination and rebase tables for multi-round tests.

    The same ``input_deltas`` are staged and committed, one delta per entry,
    into a partition of the source table and into a partition of the rebase
    table. The destination table is created but receives no deltas.

    Args:
        primary_keys: Forwarded to the table-creation helpers.
        sort_keys: Forwarded to the table-creation helpers.
        partition_keys: Forwarded to the table-creation helpers.
        input_deltas: Iterable of ``(delta, delta_type, delete_parameters)``
            triples, as unpacked by the loop below.
        partition_values: Partition values used when staging both partitions.
        ds_mock_kwargs: Keyword arguments forwarded to every storage call.

    Returns:
        ``(source_stream, destination_stream, rebase_stream, is_delete)``;
        the source and rebase streams are re-read after their partitions are
        committed, and ``is_delete`` is True when any input delta has type
        ``DeltaType.DELETE``.
    """
    import deltacat.tests.local_deltacat_storage as ds
    from deltacat.storage import Partition, Stream

    def _get_stream(namespace, table_name, table_version) -> Stream:
        # Look up the stream of the given table version.
        return ds.get_stream(
            namespace=namespace,
            table_name=table_name,
            table_version=table_version,
            **ds_mock_kwargs,
        )

    def _commit_input_deltas(stream: Stream) -> bool:
        # Stage a partition on `stream`, commit every input delta into it,
        # commit the partition, and report whether a DELETE delta was seen.
        staged_partition: Partition = ds.stage_partition(
            stream, partition_values, **ds_mock_kwargs
        )
        saw_delete = False
        for input_delta, input_delta_type, input_delta_parameters in input_deltas:
            if input_delta_type is DeltaType.DELETE:
                saw_delete = True
            staged_delta = ds.stage_delta(
                input_delta,
                staged_partition,
                input_delta_type,
                delete_parameters=input_delta_parameters,
                **ds_mock_kwargs,
            )
            ds.commit_delta(
                staged_delta,
                **ds_mock_kwargs,
            )
        ds.commit_partition(staged_partition, **ds_mock_kwargs)
        return saw_delete

    # create and populate the source table
    source_namespace, source_table_name, source_table_version = create_src_table(
        primary_keys, sort_keys, partition_keys, ds_mock_kwargs
    )
    source_table_stream: Stream = _get_stream(
        source_namespace, source_table_name, source_table_version
    )
    source_has_delete = _commit_input_deltas(source_table_stream)
    source_table_stream_after_committed: Stream = _get_stream(
        source_namespace, source_table_name, source_table_version
    )

    # create the destination table
    (
        destination_table_namespace,
        destination_table_name,
        destination_table_version,
    ) = create_destination_table(
        primary_keys, sort_keys, partition_keys, ds_mock_kwargs
    )

    # create and populate the rebase table
    (
        rebase_table_namespace,
        rebase_table_name,
        rebase_table_version,
    ) = create_rebase_table(primary_keys, sort_keys, partition_keys, ds_mock_kwargs)
    rebasing_table_stream: Stream = _get_stream(
        rebase_table_namespace, rebase_table_name, rebase_table_version
    )
    rebase_has_delete = _commit_input_deltas(rebasing_table_stream)
    is_delete = source_has_delete or rebase_has_delete

    # get streams
    destination_table_stream: Stream = _get_stream(
        destination_table_namespace,
        destination_table_name,
        destination_table_version,
    )
    rebased_stream_after_committed: Stream = _get_stream(
        rebase_table_namespace, rebase_table_name, rebase_table_version
    )
    return (
        source_table_stream_after_committed,
        destination_table_stream,
        rebased_stream_after_committed,
        is_delete,
    )
|
@@ -394,12 +394,13 @@ def download_delta(
|
|
394
394
|
**kwargs,
|
395
395
|
) -> Union[LocalDataset, DistributedDataset]: # type: ignore
|
396
396
|
result = []
|
397
|
-
|
398
|
-
|
397
|
+
if isinstance(delta_like, Delta) and delta_like.manifest is not None:
|
398
|
+
manifest = Delta(delta_like).manifest
|
399
|
+
else:
|
400
|
+
manifest = get_delta_manifest(delta_like, *args, **kwargs)
|
399
401
|
partition_values: PartitionValues = None
|
400
402
|
if partition_filter is not None:
|
401
403
|
partition_values = partition_filter.partition_values
|
402
|
-
|
403
404
|
for entry_index in range(len(manifest.entries)):
|
404
405
|
if (
|
405
406
|
partition_values is not None
|
@@ -440,8 +441,10 @@ def download_delta_manifest_entry(
|
|
440
441
|
**kwargs,
|
441
442
|
) -> LocalTable:
|
442
443
|
cur, con = _get_sqlite3_cursor_con(kwargs)
|
443
|
-
|
444
|
-
|
444
|
+
if isinstance(delta_like, Delta) and delta_like.manifest is not None:
|
445
|
+
manifest = Delta(delta_like).manifest
|
446
|
+
else:
|
447
|
+
manifest = get_delta_manifest(delta_like, *args, **kwargs)
|
445
448
|
if entry_index >= len(manifest.entries):
|
446
449
|
raise IndexError(
|
447
450
|
f"Manifest entry index {entry_index} does not exist. "
|