deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +17 -6
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/compute/compactor/compaction_session.py +42 -18
- deltacat/compute/compactor/model/compact_partition_params.py +297 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
- deltacat/compute/compactor/model/delta_annotated.py +95 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +4 -1
- deltacat/compute/compactor/steps/dedupe.py +9 -6
- deltacat/compute/compactor/steps/hash_bucket.py +24 -3
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +22 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/sort_key.py +5 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +509 -0
- deltacat/compute/compactor_v2/constants.py +37 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +143 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +469 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
- deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
- deltacat/compute/compactor_v2/utils/io.py +152 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
- deltacat/compute/compactor_v2/utils/task_options.py +221 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/interface.py +8 -1
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/test_clients.py +16 -3
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
- deltacat/tests/compute/testcases.py +395 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +62 -19
- deltacat/tests/test_utils/pyarrow.py +49 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +83 -0
- deltacat/types/tables.py +6 -0
- deltacat/utils/arguments.py +25 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +218 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
- /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,12 @@
|
|
1
1
|
import unittest
|
2
2
|
from unittest import mock
|
3
3
|
from deltacat.tests.test_utils.constants import TEST_DELTA
|
4
|
+
from typing import Any, Dict
|
5
|
+
|
6
|
+
DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
|
7
|
+
"db_file_path",
|
8
|
+
"deltacat/tests/local_deltacat_storage/db_test.sqlite",
|
9
|
+
)
|
4
10
|
|
5
11
|
|
6
12
|
class TestFitInputDeltas(unittest.TestCase):
|
@@ -13,6 +19,10 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
13
19
|
CompactionSessionAuditInfo,
|
14
20
|
)
|
15
21
|
|
22
|
+
cls.kwargs_for_local_deltacat_storage: Dict[str, Any] = {
|
23
|
+
DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
|
24
|
+
}
|
25
|
+
|
16
26
|
cls.COMPACTION_AUDIT = CompactionSessionAuditInfo("1.0", "test")
|
17
27
|
|
18
28
|
super().setUpClass()
|
@@ -23,6 +33,7 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
23
33
|
|
24
34
|
def test_sanity(self):
|
25
35
|
from deltacat.compute.compactor.utils import io
|
36
|
+
import deltacat.tests.local_deltacat_storage as ds
|
26
37
|
|
27
38
|
(
|
28
39
|
delta_list,
|
@@ -30,7 +41,12 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
30
41
|
high_watermark,
|
31
42
|
require_multiple_rounds,
|
32
43
|
) = io.fit_input_deltas(
|
33
|
-
[TEST_DELTA],
|
44
|
+
[TEST_DELTA],
|
45
|
+
{"CPU": 1, "memory": 20000000},
|
46
|
+
self.COMPACTION_AUDIT,
|
47
|
+
None,
|
48
|
+
ds,
|
49
|
+
self.kwargs_for_local_deltacat_storage,
|
34
50
|
)
|
35
51
|
|
36
52
|
self.assertIsNotNone(hash_bucket_count)
|
@@ -44,6 +60,7 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
44
60
|
|
45
61
|
def test_when_hash_bucket_count_overridden(self):
|
46
62
|
from deltacat.compute.compactor.utils import io
|
63
|
+
import deltacat.tests.local_deltacat_storage as ds
|
47
64
|
|
48
65
|
(
|
49
66
|
delta_list,
|
@@ -51,7 +68,12 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
51
68
|
high_watermark,
|
52
69
|
require_multiple_rounds,
|
53
70
|
) = io.fit_input_deltas(
|
54
|
-
[TEST_DELTA],
|
71
|
+
[TEST_DELTA],
|
72
|
+
{"CPU": 1, "memory": 20000000},
|
73
|
+
self.COMPACTION_AUDIT,
|
74
|
+
20,
|
75
|
+
ds,
|
76
|
+
self.kwargs_for_local_deltacat_storage,
|
55
77
|
)
|
56
78
|
|
57
79
|
self.assertEqual(20, hash_bucket_count)
|
@@ -61,6 +83,7 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
61
83
|
|
62
84
|
def test_when_not_enough_memory_splits_manifest_entries(self):
|
63
85
|
from deltacat.compute.compactor.utils import io
|
86
|
+
import deltacat.tests.local_deltacat_storage as ds
|
64
87
|
|
65
88
|
(
|
66
89
|
delta_list,
|
@@ -68,7 +91,12 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
68
91
|
high_watermark,
|
69
92
|
require_multiple_rounds,
|
70
93
|
) = io.fit_input_deltas(
|
71
|
-
[TEST_DELTA],
|
94
|
+
[TEST_DELTA],
|
95
|
+
{"CPU": 2, "memory": 10},
|
96
|
+
self.COMPACTION_AUDIT,
|
97
|
+
20,
|
98
|
+
ds,
|
99
|
+
self.kwargs_for_local_deltacat_storage,
|
72
100
|
)
|
73
101
|
|
74
102
|
self.assertIsNotNone(hash_bucket_count)
|
@@ -78,14 +106,28 @@ class TestFitInputDeltas(unittest.TestCase):
|
|
78
106
|
|
79
107
|
def test_when_no_input_deltas(self):
|
80
108
|
from deltacat.compute.compactor.utils import io
|
109
|
+
import deltacat.tests.local_deltacat_storage as ds
|
81
110
|
|
82
111
|
with self.assertRaises(AssertionError):
|
83
112
|
io.fit_input_deltas(
|
84
|
-
[],
|
113
|
+
[],
|
114
|
+
{"CPU": 100, "memory": 20000.0},
|
115
|
+
self.COMPACTION_AUDIT,
|
116
|
+
None,
|
117
|
+
ds,
|
118
|
+
self.kwargs_for_local_deltacat_storage,
|
85
119
|
)
|
86
120
|
|
87
121
|
def test_when_cpu_resources_is_not_passed(self):
|
88
122
|
from deltacat.compute.compactor.utils import io
|
123
|
+
import deltacat.tests.local_deltacat_storage as ds
|
89
124
|
|
90
125
|
with self.assertRaises(KeyError):
|
91
|
-
io.fit_input_deltas(
|
126
|
+
io.fit_input_deltas(
|
127
|
+
[],
|
128
|
+
{},
|
129
|
+
self.COMPACTION_AUDIT,
|
130
|
+
None,
|
131
|
+
ds,
|
132
|
+
self.kwargs_for_local_deltacat_storage,
|
133
|
+
)
|
File without changes
|
@@ -27,7 +27,7 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
27
27
|
"partitionValues": [],
|
28
28
|
"partitionId": None,
|
29
29
|
},
|
30
|
-
"hash_bucket_count":
|
30
|
+
"hash_bucket_count": 200,
|
31
31
|
"last_stream_position_to_compact": 168000000000,
|
32
32
|
"list_deltas_kwargs": {"equivalent_table_types": []},
|
33
33
|
"primary_keys": {"id"},
|
@@ -75,41 +75,16 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
75
75
|
|
76
76
|
super().setUpClass()
|
77
77
|
|
78
|
-
def test_destination_partition_locator_is_optional(self):
|
79
|
-
from deltacat.compute.compactor.model.compact_partition_params import (
|
80
|
-
CompactPartitionParams,
|
81
|
-
)
|
82
|
-
|
83
|
-
params = CompactPartitionParams.of({})
|
84
|
-
assert params.destination_partition_locator is None
|
85
|
-
|
86
78
|
def test_serialize_returns_json_string(self):
|
87
79
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
88
80
|
CompactPartitionParams,
|
89
81
|
)
|
90
82
|
|
91
83
|
params = CompactPartitionParams.of(
|
92
|
-
|
84
|
+
TestCompactPartitionParams.VALID_COMPACT_PARTITION_PARAMS
|
93
85
|
)
|
94
86
|
serialized_params = params.serialize()
|
95
87
|
assert isinstance(serialized_params, str)
|
96
|
-
assert json.loads(serialized_params) == {
|
97
|
-
"compacted_file_content_type": None,
|
98
|
-
"compaction_artifact_s3_bucket": None,
|
99
|
-
"deltacat_storage": None,
|
100
|
-
"hash_bucket_count": None,
|
101
|
-
"last_stream_position_to_compact": None,
|
102
|
-
"list_deltas_kwargs": None,
|
103
|
-
"pg_config": None,
|
104
|
-
"primary_keys": None,
|
105
|
-
"properties": None,
|
106
|
-
"read_kwargs_provider": None,
|
107
|
-
"rebase_source_partition_high_watermark": None,
|
108
|
-
"rebase_source_partition_locator": None,
|
109
|
-
"s3_table_writer_kwargs": None,
|
110
|
-
"source_partition_locator": None,
|
111
|
-
"destination_partition_locator": "my-partition",
|
112
|
-
}
|
113
88
|
|
114
89
|
def test_serialize_returns_json_string_with_all_fields(self):
|
115
90
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
@@ -142,7 +117,6 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
142
117
|
== params.list_deltas_kwargs
|
143
118
|
)
|
144
119
|
assert json.loads(serialized_params)["primary_keys"] == params.primary_keys
|
145
|
-
assert json.loads(serialized_params)["properties"] == params.properties
|
146
120
|
assert (
|
147
121
|
json.loads(serialized_params)["rebase_source_partition_high_watermark"]
|
148
122
|
== params.rebase_source_partition_high_watermark
|
@@ -165,7 +139,12 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
165
139
|
CompactPartitionParams,
|
166
140
|
)
|
167
141
|
|
168
|
-
params = CompactPartitionParams.of(
|
142
|
+
params = CompactPartitionParams.of(
|
143
|
+
{
|
144
|
+
**TestCompactPartitionParams.VALID_COMPACT_PARTITION_PARAMS,
|
145
|
+
"primary_keys": {"foo", "bar", "baz"},
|
146
|
+
}
|
147
|
+
)
|
169
148
|
serialized_params = params.serialize()
|
170
149
|
self.assertCountEqual(
|
171
150
|
json.loads(serialized_params)["primary_keys"], ["foo", "bar", "baz"]
|
@@ -180,7 +159,12 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
180
159
|
def toJSON(self) -> str:
|
181
160
|
return "my-json-object"
|
182
161
|
|
183
|
-
params = CompactPartitionParams.of(
|
162
|
+
params = CompactPartitionParams.of(
|
163
|
+
{
|
164
|
+
**TestCompactPartitionParams.VALID_COMPACT_PARTITION_PARAMS,
|
165
|
+
"compacted_file_content_type": MyObject(),
|
166
|
+
}
|
167
|
+
)
|
184
168
|
serialized_params = params.serialize()
|
185
169
|
assert (
|
186
170
|
json.loads(serialized_params)["compacted_file_content_type"]
|
@@ -0,0 +1,363 @@
|
|
1
|
+
import ray
|
2
|
+
from moto import mock_s3
|
3
|
+
import pytest
|
4
|
+
import os
|
5
|
+
import json
|
6
|
+
import boto3
|
7
|
+
from typing import Any, Dict, List, Optional, Set
|
8
|
+
from boto3.resources.base import ServiceResource
|
9
|
+
import pyarrow as pa
|
10
|
+
from deltacat.tests.test_utils.utils import read_s3_contents
|
11
|
+
from deltacat.tests.compute.common import (
|
12
|
+
setup_sort_and_partition_keys,
|
13
|
+
PartitionKey,
|
14
|
+
TEST_S3_RCF_BUCKET_NAME,
|
15
|
+
BASE_TEST_SOURCE_NAMESPACE,
|
16
|
+
BASE_TEST_SOURCE_TABLE_NAME,
|
17
|
+
BASE_TEST_DESTINATION_NAMESPACE,
|
18
|
+
BASE_TEST_DESTINATION_TABLE_NAME,
|
19
|
+
)
|
20
|
+
from deltacat.tests.compute.testcases import (
|
21
|
+
INCREMENTAL_TEST_CASES,
|
22
|
+
)
|
23
|
+
|
24
|
+
DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
|
25
|
+
"db_file_path",
|
26
|
+
"deltacat/tests/local_deltacat_storage/db_test.sqlite",
|
27
|
+
)
|
28
|
+
|
29
|
+
"""
|
30
|
+
MODULE scoped fixtures
|
31
|
+
"""
|
32
|
+
|
33
|
+
|
34
|
+
@pytest.fixture(autouse=True, scope="module")
|
35
|
+
def mock_aws_credential():
|
36
|
+
os.environ["AWS_ACCESS_KEY_ID"] = "testing"
|
37
|
+
os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
|
38
|
+
os.environ["AWS_SECURITY_TOKEN"] = "testing"
|
39
|
+
os.environ["AWS_SESSION_TOKEN"] = "testing"
|
40
|
+
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
|
41
|
+
yield
|
42
|
+
|
43
|
+
|
44
|
+
@pytest.fixture(autouse=True, scope="module")
|
45
|
+
def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
|
46
|
+
# make sure the database file is deleted after all the compactor package tests are completed
|
47
|
+
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
48
|
+
os.remove(DATABASE_FILE_PATH_VALUE)
|
49
|
+
|
50
|
+
|
51
|
+
@pytest.fixture(scope="module")
|
52
|
+
def setup_s3_resource(mock_aws_credential):
|
53
|
+
with mock_s3():
|
54
|
+
yield boto3.resource("s3")
|
55
|
+
|
56
|
+
|
57
|
+
@pytest.fixture(scope="module")
|
58
|
+
def setup_compaction_artifacts_s3_bucket(setup_s3_resource: ServiceResource):
|
59
|
+
setup_s3_resource.create_bucket(
|
60
|
+
ACL="authenticated-read",
|
61
|
+
Bucket=TEST_S3_RCF_BUCKET_NAME,
|
62
|
+
)
|
63
|
+
yield
|
64
|
+
|
65
|
+
|
66
|
+
"""
|
67
|
+
FUNCTION scoped fixtures
|
68
|
+
"""
|
69
|
+
|
70
|
+
|
71
|
+
@pytest.fixture(scope="function")
|
72
|
+
def teardown_local_deltacat_storage_db():
|
73
|
+
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
74
|
+
os.remove(DATABASE_FILE_PATH_VALUE)
|
75
|
+
|
76
|
+
|
77
|
+
@pytest.fixture(scope="function")
|
78
|
+
def setup_local_deltacat_storage_conn():
|
79
|
+
# see deltacat/tests/local_deltacat_storage/README.md for documentation
|
80
|
+
kwargs_for_local_deltacat_storage: Dict[str, Any] = {
|
81
|
+
DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
|
82
|
+
}
|
83
|
+
yield kwargs_for_local_deltacat_storage
|
84
|
+
|
85
|
+
|
86
|
+
def setup_incremental_source_and_destination_tables(
|
87
|
+
source_table_version: str,
|
88
|
+
destination_table_version: str,
|
89
|
+
primary_keys: Set[str],
|
90
|
+
sort_keys: Optional[List[Any]],
|
91
|
+
partition_keys: Optional[List[PartitionKey]],
|
92
|
+
column_names: List[str],
|
93
|
+
arrow_arrays: List[pa.Array],
|
94
|
+
partition_values: Optional[List[Any]],
|
95
|
+
ds_mock_kwargs: Optional[Dict[str, Any]],
|
96
|
+
source_namespace: str = BASE_TEST_SOURCE_NAMESPACE,
|
97
|
+
source_table_name: str = BASE_TEST_SOURCE_TABLE_NAME,
|
98
|
+
destination_namespace: str = BASE_TEST_DESTINATION_NAMESPACE,
|
99
|
+
destination_table_name: str = BASE_TEST_DESTINATION_TABLE_NAME,
|
100
|
+
):
|
101
|
+
import deltacat.tests.local_deltacat_storage as ds
|
102
|
+
from deltacat.types.media import ContentType
|
103
|
+
from deltacat.storage import Partition, Stream
|
104
|
+
|
105
|
+
ds.create_namespace(source_namespace, {}, **ds_mock_kwargs)
|
106
|
+
ds.create_table_version(
|
107
|
+
source_namespace,
|
108
|
+
source_table_name,
|
109
|
+
source_table_version,
|
110
|
+
primary_key_column_names=list(primary_keys),
|
111
|
+
sort_keys=sort_keys,
|
112
|
+
partition_keys=partition_keys,
|
113
|
+
supported_content_types=[ContentType.PARQUET],
|
114
|
+
**ds_mock_kwargs,
|
115
|
+
)
|
116
|
+
source_table_stream: Stream = ds.get_stream(
|
117
|
+
namespace=source_namespace,
|
118
|
+
table_name=source_table_name,
|
119
|
+
table_version=source_table_version,
|
120
|
+
**ds_mock_kwargs,
|
121
|
+
)
|
122
|
+
test_table: pa.Table = pa.Table.from_arrays(arrow_arrays, names=column_names)
|
123
|
+
staged_partition: Partition = ds.stage_partition(
|
124
|
+
source_table_stream, partition_values, **ds_mock_kwargs
|
125
|
+
)
|
126
|
+
ds.commit_delta(
|
127
|
+
ds.stage_delta(test_table, staged_partition, **ds_mock_kwargs), **ds_mock_kwargs
|
128
|
+
)
|
129
|
+
ds.commit_partition(staged_partition, **ds_mock_kwargs)
|
130
|
+
# create the destination table
|
131
|
+
ds.create_namespace(destination_namespace, {}, **ds_mock_kwargs)
|
132
|
+
ds.create_table_version(
|
133
|
+
destination_namespace,
|
134
|
+
destination_table_name,
|
135
|
+
destination_table_version,
|
136
|
+
primary_key_column_names=list(primary_keys),
|
137
|
+
sort_keys=sort_keys,
|
138
|
+
partition_keys=partition_keys,
|
139
|
+
supported_content_types=[ContentType.PARQUET],
|
140
|
+
**ds_mock_kwargs,
|
141
|
+
)
|
142
|
+
destination_table_stream: Stream = ds.get_stream(
|
143
|
+
namespace=destination_namespace,
|
144
|
+
table_name=destination_table_name,
|
145
|
+
table_version=destination_table_version,
|
146
|
+
**ds_mock_kwargs,
|
147
|
+
)
|
148
|
+
source_table_stream_after_committed: Stream = ds.get_stream(
|
149
|
+
namespace=source_namespace,
|
150
|
+
table_name=source_table_name,
|
151
|
+
table_version=source_table_version,
|
152
|
+
**ds_mock_kwargs,
|
153
|
+
)
|
154
|
+
return source_table_stream_after_committed, destination_table_stream
|
155
|
+
|
156
|
+
|
157
|
+
@pytest.mark.parametrize(
|
158
|
+
[
|
159
|
+
"test_name",
|
160
|
+
"source_table_version",
|
161
|
+
"destination_table_version",
|
162
|
+
"primary_keys_param",
|
163
|
+
"sort_keys_param",
|
164
|
+
"partition_keys_param",
|
165
|
+
"column_names_param",
|
166
|
+
"arrow_arrays_param",
|
167
|
+
"rebase_source_partition_locator_param",
|
168
|
+
"partition_values_param",
|
169
|
+
"expected_result",
|
170
|
+
"validation_callback_func",
|
171
|
+
"validation_callback_func_kwargs",
|
172
|
+
"do_teardown_local_deltacat_storage_db",
|
173
|
+
"use_prev_compacted",
|
174
|
+
"create_placement_group_param",
|
175
|
+
"records_per_compacted_file_param",
|
176
|
+
"hash_bucket_count_param",
|
177
|
+
"compact_partition_func",
|
178
|
+
],
|
179
|
+
[
|
180
|
+
(
|
181
|
+
test_name,
|
182
|
+
source_table_version,
|
183
|
+
destination_table_version,
|
184
|
+
primary_keys_param,
|
185
|
+
sort_keys_param,
|
186
|
+
partition_keys_param,
|
187
|
+
column_names_param,
|
188
|
+
arrow_arrays_param,
|
189
|
+
rebase_source_partition_locator_param,
|
190
|
+
partition_values_param,
|
191
|
+
expected_result,
|
192
|
+
validation_callback_func,
|
193
|
+
validation_callback_func_kwargs,
|
194
|
+
do_teardown_local_deltacat_storage_db,
|
195
|
+
use_prev_compacted,
|
196
|
+
create_placement_group_param,
|
197
|
+
records_per_compacted_file_param,
|
198
|
+
hash_bucket_count_param,
|
199
|
+
compact_partition_func,
|
200
|
+
)
|
201
|
+
for test_name, (
|
202
|
+
source_table_version,
|
203
|
+
destination_table_version,
|
204
|
+
primary_keys_param,
|
205
|
+
sort_keys_param,
|
206
|
+
partition_keys_param,
|
207
|
+
column_names_param,
|
208
|
+
arrow_arrays_param,
|
209
|
+
rebase_source_partition_locator_param,
|
210
|
+
partition_values_param,
|
211
|
+
expected_result,
|
212
|
+
validation_callback_func,
|
213
|
+
validation_callback_func_kwargs,
|
214
|
+
do_teardown_local_deltacat_storage_db,
|
215
|
+
use_prev_compacted,
|
216
|
+
create_placement_group_param,
|
217
|
+
records_per_compacted_file_param,
|
218
|
+
hash_bucket_count_param,
|
219
|
+
compact_partition_func,
|
220
|
+
) in INCREMENTAL_TEST_CASES.items()
|
221
|
+
],
|
222
|
+
ids=[test_name for test_name in INCREMENTAL_TEST_CASES],
|
223
|
+
indirect=[],
|
224
|
+
)
|
225
|
+
def test_compact_partition_incremental(
|
226
|
+
request: pytest.FixtureRequest,
|
227
|
+
setup_s3_resource: ServiceResource,
|
228
|
+
setup_local_deltacat_storage_conn: Dict[str, Any],
|
229
|
+
setup_compaction_artifacts_s3_bucket: None,
|
230
|
+
test_name: str,
|
231
|
+
source_table_version: str,
|
232
|
+
destination_table_version: str,
|
233
|
+
primary_keys_param: Set[str],
|
234
|
+
sort_keys_param,
|
235
|
+
partition_keys_param,
|
236
|
+
column_names_param: List[str],
|
237
|
+
arrow_arrays_param: List[pa.Array],
|
238
|
+
rebase_source_partition_locator_param,
|
239
|
+
partition_values_param,
|
240
|
+
expected_result,
|
241
|
+
# use and implement func and func_kwargs if you want to run additional validations apart from the ones in the test
|
242
|
+
validation_callback_func,
|
243
|
+
validation_callback_func_kwargs,
|
244
|
+
do_teardown_local_deltacat_storage_db,
|
245
|
+
use_prev_compacted,
|
246
|
+
create_placement_group_param,
|
247
|
+
records_per_compacted_file_param,
|
248
|
+
hash_bucket_count_param,
|
249
|
+
compact_partition_func,
|
250
|
+
):
|
251
|
+
import deltacat.tests.local_deltacat_storage as ds
|
252
|
+
from deltacat.types.media import ContentType
|
253
|
+
from deltacat.storage import (
|
254
|
+
PartitionLocator,
|
255
|
+
)
|
256
|
+
from deltacat.compute.compactor.model.compact_partition_params import (
|
257
|
+
CompactPartitionParams,
|
258
|
+
)
|
259
|
+
from deltacat.utils.placement import (
|
260
|
+
PlacementGroupManager,
|
261
|
+
)
|
262
|
+
from deltacat.compute.compactor import (
|
263
|
+
RoundCompletionInfo,
|
264
|
+
)
|
265
|
+
|
266
|
+
ds_mock_kwargs = setup_local_deltacat_storage_conn
|
267
|
+
|
268
|
+
# setup
|
269
|
+
sort_keys, partition_keys = setup_sort_and_partition_keys(
|
270
|
+
sort_keys_param, partition_keys_param
|
271
|
+
)
|
272
|
+
(
|
273
|
+
source_table_stream,
|
274
|
+
destination_table_stream,
|
275
|
+
) = setup_incremental_source_and_destination_tables(
|
276
|
+
source_table_version,
|
277
|
+
destination_table_version,
|
278
|
+
primary_keys_param,
|
279
|
+
sort_keys,
|
280
|
+
partition_keys,
|
281
|
+
column_names_param,
|
282
|
+
arrow_arrays_param,
|
283
|
+
partition_values_param,
|
284
|
+
ds_mock_kwargs,
|
285
|
+
f"{test_name}_src_namespace",
|
286
|
+
f"{test_name}_table_src",
|
287
|
+
f"{test_name}_dest_namespace",
|
288
|
+
f"{test_name}_table_dest",
|
289
|
+
)
|
290
|
+
ray.shutdown()
|
291
|
+
ray.init(local_mode=True)
|
292
|
+
assert ray.is_initialized()
|
293
|
+
source_partition = ds.get_partition(
|
294
|
+
source_table_stream.locator,
|
295
|
+
partition_values_param,
|
296
|
+
**ds_mock_kwargs,
|
297
|
+
)
|
298
|
+
destination_partition_locator = PartitionLocator.of(
|
299
|
+
destination_table_stream.locator,
|
300
|
+
partition_values_param,
|
301
|
+
None,
|
302
|
+
)
|
303
|
+
num_workers, worker_instance_cpu = 1, 1
|
304
|
+
total_cpus = num_workers * worker_instance_cpu
|
305
|
+
pgm = None
|
306
|
+
if create_placement_group_param:
|
307
|
+
pgm = PlacementGroupManager(
|
308
|
+
1, total_cpus, worker_instance_cpu, memory_per_bundle=4000000
|
309
|
+
).pgs[0]
|
310
|
+
compact_partition_params = CompactPartitionParams.of(
|
311
|
+
{
|
312
|
+
"compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
|
313
|
+
"compacted_file_content_type": ContentType.PARQUET,
|
314
|
+
"dd_max_parallelism_ratio": 1.0,
|
315
|
+
"deltacat_storage": ds,
|
316
|
+
"deltacat_storage_kwargs": ds_mock_kwargs,
|
317
|
+
"destination_partition_locator": destination_partition_locator,
|
318
|
+
"hash_bucket_count": hash_bucket_count_param,
|
319
|
+
"last_stream_position_to_compact": source_partition.stream_position,
|
320
|
+
"list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
|
321
|
+
"pg_config": pgm,
|
322
|
+
"primary_keys": primary_keys_param,
|
323
|
+
"rebase_source_partition_locator": rebase_source_partition_locator_param,
|
324
|
+
"records_per_compacted_file": records_per_compacted_file_param,
|
325
|
+
"s3_client_kwargs": {},
|
326
|
+
"source_partition_locator": source_partition.locator,
|
327
|
+
"sort_keys": sort_keys if sort_keys else None,
|
328
|
+
}
|
329
|
+
)
|
330
|
+
# execute
|
331
|
+
rcf_file_s3_uri = compact_partition_func(compact_partition_params)
|
332
|
+
# validate
|
333
|
+
_, rcf_object_key = rcf_file_s3_uri.rsplit("/", 1)
|
334
|
+
rcf_file_output: Dict[str, Any] = read_s3_contents(
|
335
|
+
setup_s3_resource, TEST_S3_RCF_BUCKET_NAME, rcf_object_key
|
336
|
+
)
|
337
|
+
round_completion_info = RoundCompletionInfo(**rcf_file_output)
|
338
|
+
print(f"rcf_file_output: {json.dumps(rcf_file_output, indent=2)}")
|
339
|
+
compacted_delta_locator = round_completion_info.compacted_delta_locator
|
340
|
+
tables = ds.download_delta(compacted_delta_locator, **ds_mock_kwargs)
|
341
|
+
compacted_table = pa.concat_tables(tables)
|
342
|
+
|
343
|
+
# the compacted table may contain multiple files and chunks
|
344
|
+
# and order of records may be incorrect due to multiple files.
|
345
|
+
expected_result = expected_result.combine_chunks().sort_by(
|
346
|
+
[(val, "ascending") for val in primary_keys_param]
|
347
|
+
)
|
348
|
+
compacted_table = compacted_table.combine_chunks().sort_by(
|
349
|
+
[(val, "ascending") for val in primary_keys_param]
|
350
|
+
)
|
351
|
+
|
352
|
+
assert compacted_table.equals(
|
353
|
+
expected_result
|
354
|
+
), f"{compacted_table} does not match {expected_result}"
|
355
|
+
if (
|
356
|
+
validation_callback_func is not None
|
357
|
+
and validation_callback_func_kwargs is not None
|
358
|
+
):
|
359
|
+
validation_callback_func(**validation_callback_func_kwargs)
|
360
|
+
# https://docs.pytest.org/en/7.1.x/reference/reference.html#pytest.FixtureRequest.getfixturevalue
|
361
|
+
if do_teardown_local_deltacat_storage_db:
|
362
|
+
request.getfixturevalue("teardown_local_deltacat_storage_db")
|
363
|
+
return
|