deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +3 -2
- deltacat/aws/clients.py +123 -3
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat/benchmarking/conftest.py +61 -0
- deltacat/catalog/delegate.py +1 -1
- deltacat/catalog/interface.py +1 -1
- deltacat/compute/compactor/__init__.py +0 -3
- deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- deltacat/compute/compactor/model/primary_key_index.py +1 -1
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +5 -3
- deltacat/compute/compactor/steps/dedupe.py +10 -8
- deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat/compute/compactor/utils/sort_key.py +57 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/__init__.py +0 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +11 -2
- deltacat/storage/model/sort_key.py +33 -0
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/__init__.py +0 -0
- deltacat/tests/aws/test_clients.py +80 -0
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/performance.py +4 -2
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
- deltacat-0.1.18b15.dist-info/RECORD +176 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13.dist-info/RECORD +0 -136
- /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,199 @@
|
|
1
|
+
import unittest
|
2
|
+
import sqlite3
|
3
|
+
import ray
|
4
|
+
import os
|
5
|
+
from collections import defaultdict
|
6
|
+
from deltacat.compute.compactor import DeltaAnnotated
|
7
|
+
import deltacat.tests.local_deltacat_storage as ds
|
8
|
+
from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
|
9
|
+
from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
|
10
|
+
from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
|
11
|
+
from deltacat.compute.compactor_v2.steps.hash_bucket import hash_bucket
|
12
|
+
from deltacat.utils.common import current_time_ms
|
13
|
+
from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file
|
14
|
+
|
15
|
+
|
16
|
+
class TestHashBucket(unittest.TestCase):
    """Tests for the compactor v2 ``hash_bucket`` step.

    Each test stages a delta from a CSV file in the local (SQLite-backed)
    deltacat storage, runs ``hash_bucket`` as a Ray remote task, and checks
    the record/column counts of the tables stored in the object store.
    """

    HASH_BUCKET_NAMESPACE = "test_hash_bucket"
    # Evaluated once at class-definition time, so every test in this class
    # shares the same database file.
    DB_FILE_PATH = f"{current_time_ms()}.db"
    STRING_PK_FILE_PATH = (
        "deltacat/tests/compute/compactor_v2/steps/data/string_pk_table.csv"
    )
    DATE_PK_FILE_PATH = (
        "deltacat/tests/compute/compactor_v2/steps/data/date_pk_table.csv"
    )
    MULTIPLE_PK_FILE_PATH = (
        "deltacat/tests/compute/compactor_v2/steps/data/multiple_pk_table.csv"
    )
    NO_PK_FILE_PATH = "deltacat/tests/compute/compactor_v2/steps/data/no_pk_table.csv"

    @classmethod
    def setUpClass(cls):
        """Start Ray in local mode and open the shared SQLite connection."""
        ray.init(local_mode=True, ignore_reinit_error=True)

        con = sqlite3.connect(cls.DB_FILE_PATH)
        cur = con.cursor()
        # kwargs handed to create_delta_from_csv_file (connection + cursor)
        cls.kwargs = {ds.SQLITE_CON_ARG: con, ds.SQLITE_CUR_ARG: cur}
        # kwargs handed to the hash_bucket step (database file path only)
        cls.deltacat_storage_kwargs = {ds.DB_FILE_PATH_ARG: cls.DB_FILE_PATH}

        super().setUpClass()

    @classmethod
    def tearDownClass(cls) -> None:
        """Close the shared SQLite connection and delete the database file.

        This uses the public ``tearDownClass`` hook rather than overriding
        ``doClassCleanups``: the latter is the framework's own runner for
        ``addClassCleanup`` callbacks and is absent before Python 3.8.
        """
        try:
            cls.kwargs[ds.SQLITE_CON_ARG].close()
        finally:
            if os.path.exists(cls.DB_FILE_PATH):
                os.remove(cls.DB_FILE_PATH)
            super().tearDownClass()

    def test_single_string_pk_correctly_hashes(self):
        hb_result, object_store = self._run_hash_bucket(
            self.STRING_PK_FILE_PATH,
            primary_keys=["pk"],
            num_hash_buckets=3,
            num_hash_groups=2,
        )

        # PK hash column is also persisted.
        self._validate_hash_bucket_result(
            hb_result,
            record_count=6,
            num_hash_buckets=3,
            num_columns=3,
            object_store=object_store,
        )

    def test_single_date_pk_correctly_hashes(self):
        hb_result, object_store = self._run_hash_bucket(
            self.DATE_PK_FILE_PATH,
            primary_keys=["pk"],
            num_hash_buckets=2,
            num_hash_groups=1,
        )

        self._validate_hash_bucket_result(
            hb_result,
            record_count=7,
            num_hash_buckets=2,
            num_columns=3,
            object_store=object_store,
        )

    def test_no_pk_does_not_hash(self):
        hb_result, object_store = self._run_hash_bucket(
            self.NO_PK_FILE_PATH,
            primary_keys=[],
            num_hash_buckets=2,
            num_hash_groups=1,
        )

        self._validate_hash_bucket_result(
            hb_result,
            record_count=6,
            num_hash_buckets=2,
            num_columns=3,
            object_store=object_store,
        )

    def test_multiple_pk_correctly_hashes(self):
        hb_result, object_store = self._run_hash_bucket(
            self.MULTIPLE_PK_FILE_PATH,
            primary_keys=["pk1", "pk2"],
            num_hash_buckets=2,
            num_hash_groups=1,
        )

        self._validate_hash_bucket_result(
            hb_result,
            record_count=6,
            num_hash_buckets=2,
            num_columns=4,
            object_store=object_store,
        )

    def _run_hash_bucket(
        self,
        csv_file_path: str,
        primary_keys,
        num_hash_buckets: int,
        num_hash_groups: int,
    ):
        """Stage ``csv_file_path`` as a delta and hash-bucket it.

        Returns:
            A ``(hb_result, object_store)`` pair: the ``HashBucketResult``
            obtained from the remote task and the object store instance that
            was passed to it.
        """
        # setup
        delta = create_delta_from_csv_file(
            self.HASH_BUCKET_NAMESPACE, [csv_file_path], **self.kwargs
        )
        annotated_delta = DeltaAnnotated.of(delta)
        object_store = RayPlasmaObjectStore()
        hb_input = HashBucketInput.of(
            annotated_delta=annotated_delta,
            primary_keys=primary_keys,
            num_hash_buckets=num_hash_buckets,
            num_hash_groups=num_hash_groups,
            deltacat_storage=ds,
            deltacat_storage_kwargs=self.deltacat_storage_kwargs,
            object_store=object_store,
        )

        # action
        hb_result_promise = hash_bucket.remote(hb_input)
        hb_result: HashBucketResult = ray.get(hb_result_promise)
        return hb_result, object_store

    def _validate_hash_bucket_result(
        self,
        hb_result: HashBucketResult,
        record_count: int,
        num_hash_buckets: int,
        num_columns: int,
        object_store,
    ):
        """Assert that ``hb_result`` matches the expected counts.

        Checks the reported record count and telemetry fields, then fetches
        every non-empty entry of ``hash_bucket_group_to_obj_id_tuple`` from
        ``object_store`` and verifies that each delta file envelope's table
        has ``num_columns`` columns, that at most ``num_hash_buckets``
        distinct bucket indexes are populated, and that the table lengths
        sum to ``record_count``.
        """
        # Guard first so a missing result fails with a clear message instead
        # of an AttributeError on the attribute accesses below.
        self.assertIsNotNone(hb_result)
        self.assertEqual(hb_result.hb_record_count, record_count)
        self.assertIsNotNone(hb_result.peak_memory_usage_bytes)
        self.assertIsNotNone(hb_result.task_completed_at)
        self.assertIsNotNone(hb_result.telemetry_time_in_seconds)

        hb_index_to_dfes = defaultdict(list)
        total_records_in_result = 0
        for object_id in hb_result.hash_bucket_group_to_obj_id_tuple:
            if not object_id:
                continue
            # the first element of the entry is the key handed to the store
            obj = object_store.get(object_id[0])
            for hb_idx, dfes in enumerate(obj):
                if dfes is None:
                    continue
                hb_index_to_dfes[hb_idx].extend(dfes)
                for dfe in dfes:
                    self.assertIsNotNone(dfe)
                    total_records_in_result += len(dfe.table)
                    self.assertEqual(num_columns, len(dfe.table.column_names))

        self.assertLessEqual(len(hb_index_to_dfes), num_hash_buckets)
        self.assertEqual(total_records_in_result, record_count)
|
@@ -27,7 +27,7 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
27
27
|
"partitionValues": [],
|
28
28
|
"partitionId": None,
|
29
29
|
},
|
30
|
-
"hash_bucket_count":
|
30
|
+
"hash_bucket_count": 200,
|
31
31
|
"last_stream_position_to_compact": 168000000000,
|
32
32
|
"list_deltas_kwargs": {"equivalent_table_types": []},
|
33
33
|
"primary_keys": {"id"},
|
@@ -75,41 +75,16 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
75
75
|
|
76
76
|
super().setUpClass()
|
77
77
|
|
78
|
-
def test_destination_partition_locator_is_optional(self):
|
79
|
-
from deltacat.compute.compactor.model.compact_partition_params import (
|
80
|
-
CompactPartitionParams,
|
81
|
-
)
|
82
|
-
|
83
|
-
params = CompactPartitionParams.of({})
|
84
|
-
assert params.destination_partition_locator is None
|
85
|
-
|
86
78
|
def test_serialize_returns_json_string(self):
|
87
79
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
88
80
|
CompactPartitionParams,
|
89
81
|
)
|
90
82
|
|
91
83
|
params = CompactPartitionParams.of(
|
92
|
-
|
84
|
+
TestCompactPartitionParams.VALID_COMPACT_PARTITION_PARAMS
|
93
85
|
)
|
94
86
|
serialized_params = params.serialize()
|
95
87
|
assert isinstance(serialized_params, str)
|
96
|
-
assert json.loads(serialized_params) == {
|
97
|
-
"compacted_file_content_type": None,
|
98
|
-
"compaction_artifact_s3_bucket": None,
|
99
|
-
"deltacat_storage": None,
|
100
|
-
"hash_bucket_count": None,
|
101
|
-
"last_stream_position_to_compact": None,
|
102
|
-
"list_deltas_kwargs": None,
|
103
|
-
"pg_config": None,
|
104
|
-
"primary_keys": None,
|
105
|
-
"properties": None,
|
106
|
-
"read_kwargs_provider": None,
|
107
|
-
"rebase_source_partition_high_watermark": None,
|
108
|
-
"rebase_source_partition_locator": None,
|
109
|
-
"s3_table_writer_kwargs": None,
|
110
|
-
"source_partition_locator": None,
|
111
|
-
"destination_partition_locator": "my-partition",
|
112
|
-
}
|
113
88
|
|
114
89
|
def test_serialize_returns_json_string_with_all_fields(self):
|
115
90
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
@@ -142,7 +117,6 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
142
117
|
== params.list_deltas_kwargs
|
143
118
|
)
|
144
119
|
assert json.loads(serialized_params)["primary_keys"] == params.primary_keys
|
145
|
-
assert json.loads(serialized_params)["properties"] == params.properties
|
146
120
|
assert (
|
147
121
|
json.loads(serialized_params)["rebase_source_partition_high_watermark"]
|
148
122
|
== params.rebase_source_partition_high_watermark
|
@@ -165,7 +139,12 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
165
139
|
CompactPartitionParams,
|
166
140
|
)
|
167
141
|
|
168
|
-
params = CompactPartitionParams.of(
|
142
|
+
params = CompactPartitionParams.of(
|
143
|
+
{
|
144
|
+
**TestCompactPartitionParams.VALID_COMPACT_PARTITION_PARAMS,
|
145
|
+
"primary_keys": {"foo", "bar", "baz"},
|
146
|
+
}
|
147
|
+
)
|
169
148
|
serialized_params = params.serialize()
|
170
149
|
self.assertCountEqual(
|
171
150
|
json.loads(serialized_params)["primary_keys"], ["foo", "bar", "baz"]
|
@@ -180,7 +159,12 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
180
159
|
def toJSON(self) -> str:
|
181
160
|
return "my-json-object"
|
182
161
|
|
183
|
-
params = CompactPartitionParams.of(
|
162
|
+
params = CompactPartitionParams.of(
|
163
|
+
{
|
164
|
+
**TestCompactPartitionParams.VALID_COMPACT_PARTITION_PARAMS,
|
165
|
+
"compacted_file_content_type": MyObject(),
|
166
|
+
}
|
167
|
+
)
|
184
168
|
serialized_params = params.serialize()
|
185
169
|
assert (
|
186
170
|
json.loads(serialized_params)["compacted_file_content_type"]
|
@@ -0,0 +1,348 @@
|
|
1
|
+
import ray
|
2
|
+
from moto import mock_s3
|
3
|
+
import pytest
|
4
|
+
import os
|
5
|
+
import json
|
6
|
+
import boto3
|
7
|
+
from typing import Any, Dict, List, Optional, Set
|
8
|
+
from boto3.resources.base import ServiceResource
|
9
|
+
import pyarrow as pa
|
10
|
+
from deltacat.tests.test_utils.utils import read_s3_contents
|
11
|
+
from deltacat.tests.compute.common import (
|
12
|
+
setup_sort_and_partition_keys,
|
13
|
+
PartitionKey,
|
14
|
+
TEST_S3_RCF_BUCKET_NAME,
|
15
|
+
BASE_TEST_SOURCE_NAMESPACE,
|
16
|
+
BASE_TEST_SOURCE_TABLE_NAME,
|
17
|
+
BASE_TEST_DESTINATION_NAMESPACE,
|
18
|
+
BASE_TEST_DESTINATION_TABLE_NAME,
|
19
|
+
)
|
20
|
+
from deltacat.tests.compute.testcases import (
|
21
|
+
INCREMENTAL_TEST_CASES,
|
22
|
+
)
|
23
|
+
|
24
|
+
# Keyword-argument name / value pair identifying the SQLite file used by the
# local deltacat storage in this module's tests.
DATABASE_FILE_PATH_KEY = "db_file_path"
DATABASE_FILE_PATH_VALUE = "deltacat/tests/local_deltacat_storage/db_test.sqlite"
|
28
|
+
|
29
|
+
"""
|
30
|
+
MODULE scoped fixtures
|
31
|
+
"""
|
32
|
+
|
33
|
+
|
34
|
+
@pytest.fixture(autouse=True, scope="module")
def mock_aws_credential():
    """Export dummy AWS credentials and a default region for this module.

    The values are placeholders ("testing"); they are written to
    ``os.environ`` before the module's tests run and are not removed
    afterwards.
    """
    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
    # The secret-key variable read by boto3 is AWS_SECRET_ACCESS_KEY; the
    # previous name (AWS_SECRET_ACCESS_ID) is not a recognised variable.
    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
    os.environ["AWS_SECURITY_TOKEN"] = "testing"
    os.environ["AWS_SESSION_TOKEN"] = "testing"
    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
    yield
|
42
|
+
|
43
|
+
|
44
|
+
@pytest.fixture(autouse=True, scope="module")
def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
    """Remove the local storage database file around this module's tests.

    The file is deleted once before the first test (so a file left behind by
    an earlier run is not reused) and once more after the last test of the
    module has finished.
    """
    if os.path.exists(DATABASE_FILE_PATH_VALUE):
        os.remove(DATABASE_FILE_PATH_VALUE)
    yield
    # make sure the database file is deleted after all the compactor package tests are completed
    if os.path.exists(DATABASE_FILE_PATH_VALUE):
        os.remove(DATABASE_FILE_PATH_VALUE)
|
49
|
+
|
50
|
+
|
51
|
+
@pytest.fixture(scope="module")
def setup_s3_resource(mock_aws_credential):
    """Yield a boto3 S3 resource created while ``mock_s3`` is active.

    The mock stays active until the module's tests are done with the
    resource.
    """
    s3_mock = mock_s3()
    with s3_mock:
        s3_resource = boto3.resource("s3")
        yield s3_resource
|
55
|
+
|
56
|
+
|
57
|
+
@pytest.fixture(scope="module")
def setup_compaction_artifacts_s3_bucket(setup_s3_resource: ServiceResource):
    """Create the ``TEST_S3_RCF_BUCKET_NAME`` bucket once per module."""
    bucket_settings = {
        "ACL": "authenticated-read",
        "Bucket": TEST_S3_RCF_BUCKET_NAME,
    }
    setup_s3_resource.create_bucket(**bucket_settings)
    yield
|
64
|
+
|
65
|
+
|
66
|
+
"""
|
67
|
+
FUNCTION scoped fixtures
|
68
|
+
"""
|
69
|
+
|
70
|
+
|
71
|
+
@pytest.fixture(scope="function")
def teardown_local_deltacat_storage_db():
    """Delete the local storage database file if it is present.

    The removal happens when the fixture is instantiated.
    """
    db_file_is_present = os.path.exists(DATABASE_FILE_PATH_VALUE)
    if not db_file_is_present:
        return
    os.remove(DATABASE_FILE_PATH_VALUE)
|
75
|
+
|
76
|
+
|
77
|
+
@pytest.fixture(scope="function")
def setup_local_deltacat_storage_conn():
    """Yield the kwargs dict that points local deltacat storage at the test DB."""
    # see deltacat/tests/local_deltacat_storage/README.md for documentation
    storage_kwargs: Dict[str, Any] = {}
    storage_kwargs[DATABASE_FILE_PATH_KEY] = DATABASE_FILE_PATH_VALUE
    yield storage_kwargs
|
84
|
+
|
85
|
+
|
86
|
+
def setup_incremental_source_and_destination_tables(
    source_table_version: str,
    destination_table_version: str,
    primary_keys: Set[str],
    sort_keys: Optional[List[Any]],
    partition_keys: Optional[List[PartitionKey]],
    column_names: List[str],
    arrow_arrays: List[pa.Array],
    partition_values: Optional[List[Any]],
    ds_mock_kwargs: Optional[Dict[str, Any]],
    source_namespace: str = BASE_TEST_SOURCE_NAMESPACE,
    source_table_name: str = BASE_TEST_SOURCE_TABLE_NAME,
    destination_namespace: str = BASE_TEST_DESTINATION_NAMESPACE,
    destination_table_name: str = BASE_TEST_DESTINATION_TABLE_NAME,
):
    """Create a populated source table and an empty destination table.

    In the local deltacat storage this creates the source namespace and table
    version, builds one Arrow table from ``arrow_arrays`` / ``column_names``,
    commits it as a single delta in a freshly staged partition, and then
    creates the destination namespace and table version with the same primary,
    sort and partition keys.

    Args:
        ds_mock_kwargs: Extra keyword arguments forwarded to every storage
            call. ``None`` is treated as "no extra arguments".

    Returns:
        A ``(source_stream, destination_stream)`` tuple, where the source
        stream is fetched again after the partition has been committed.
    """
    import deltacat.tests.local_deltacat_storage as ds
    from deltacat.types.media import ContentType
    from deltacat.storage import Partition, Stream

    # The parameter is Optional, but it is **-unpacked into every call below;
    # normalise None so that it does not raise a TypeError.
    if ds_mock_kwargs is None:
        ds_mock_kwargs = {}

    ds.create_namespace(source_namespace, {}, **ds_mock_kwargs)
    ds.create_table_version(
        source_namespace,
        source_table_name,
        source_table_version,
        primary_key_column_names=list(primary_keys),
        sort_keys=sort_keys,
        partition_keys=partition_keys,
        supported_content_types=[ContentType.PARQUET],
        **ds_mock_kwargs,
    )
    source_table_stream: Stream = ds.get_stream(
        namespace=source_namespace,
        table_name=source_table_name,
        table_version=source_table_version,
        **ds_mock_kwargs,
    )
    test_table: pa.Table = pa.Table.from_arrays(arrow_arrays, names=column_names)
    staged_partition: Partition = ds.stage_partition(
        source_table_stream, partition_values, **ds_mock_kwargs
    )
    ds.commit_delta(
        ds.stage_delta(test_table, staged_partition, **ds_mock_kwargs), **ds_mock_kwargs
    )
    ds.commit_partition(staged_partition, **ds_mock_kwargs)
    # create the destination table
    ds.create_namespace(destination_namespace, {}, **ds_mock_kwargs)
    ds.create_table_version(
        destination_namespace,
        destination_table_name,
        destination_table_version,
        primary_key_column_names=list(primary_keys),
        sort_keys=sort_keys,
        partition_keys=partition_keys,
        supported_content_types=[ContentType.PARQUET],
        **ds_mock_kwargs,
    )
    destination_table_stream: Stream = ds.get_stream(
        namespace=destination_namespace,
        table_name=destination_table_name,
        table_version=destination_table_version,
        **ds_mock_kwargs,
    )
    # re-read the source stream now that the partition has been committed
    source_table_stream_after_committed: Stream = ds.get_stream(
        namespace=source_namespace,
        table_name=source_table_name,
        table_version=source_table_version,
        **ds_mock_kwargs,
    )
    return source_table_stream_after_committed, destination_table_stream
|
155
|
+
|
156
|
+
|
157
|
+
@pytest.mark.parametrize(
    [
        "test_name",
        "source_table_version",
        "destination_table_version",
        "primary_keys_param",
        "sort_keys_param",
        "partition_keys_param",
        "column_names_param",
        "arrow_arrays_param",
        "rebase_source_partition_locator_param",
        "partition_values_param",
        "expected_result",
        "validation_callback_func",
        "validation_callback_func_kwargs",
        "do_teardown_local_deltacat_storage_db",
        "use_prev_compacted",
        "create_placement_group_param",
        "records_per_compacted_file_param",
        "hash_bucket_count_param",
        "compact_partition_func",
    ],
    [
        # each case is keyed by its test name; the value supplies the
        # remaining parameters in the order listed above
        (test_name, *test_case_params)
        for test_name, test_case_params in INCREMENTAL_TEST_CASES.items()
    ],
    ids=list(INCREMENTAL_TEST_CASES),
    indirect=[],
)
def test_compact_partition_incremental(
    request: pytest.FixtureRequest,
    setup_s3_resource: ServiceResource,
    setup_local_deltacat_storage_conn: Dict[str, Any],
    setup_compaction_artifacts_s3_bucket: None,
    test_name: str,
    source_table_version: str,
    destination_table_version: str,
    primary_keys_param: Set[str],
    sort_keys_param,
    partition_keys_param,
    column_names_param: List[str],
    arrow_arrays_param: List[pa.Array],
    rebase_source_partition_locator_param,
    partition_values_param,
    expected_result,
    validation_callback_func,  # use and implement func and func_kwargs if you want to run additional validations apart from the ones in the test
    validation_callback_func_kwargs,
    do_teardown_local_deltacat_storage_db,
    use_prev_compacted,
    create_placement_group_param,
    records_per_compacted_file_param,
    hash_bucket_count_param,
    compact_partition_func,
):
    """Run one incremental compaction case and compare against the expectation.

    The test builds source/destination tables in the local deltacat storage,
    invokes ``compact_partition_func`` with the assembled
    ``CompactPartitionParams``, reads the round completion file it reports
    from the mocked S3 bucket, downloads the compacted delta and asserts that
    it equals ``expected_result``. When both ``validation_callback_func`` and
    its kwargs are given, the callback is invoked afterwards.

    If ``do_teardown_local_deltacat_storage_db`` is truthy, the database
    teardown fixture is requested whether or not the test body succeeds.
    """
    import deltacat.tests.local_deltacat_storage as ds
    from deltacat.types.media import ContentType
    from deltacat.storage import (
        PartitionLocator,
    )
    from deltacat.compute.compactor.model.compact_partition_params import (
        CompactPartitionParams,
    )
    from deltacat.utils.placement import (
        PlacementGroupManager,
    )
    from deltacat.compute.compactor import (
        RoundCompletionInfo,
    )

    ds_mock_kwargs = setup_local_deltacat_storage_conn

    try:
        # setup
        sort_keys, partition_keys = setup_sort_and_partition_keys(
            sort_keys_param, partition_keys_param
        )
        (
            source_table_stream,
            destination_table_stream,
        ) = setup_incremental_source_and_destination_tables(
            source_table_version,
            destination_table_version,
            primary_keys_param,
            sort_keys,
            partition_keys,
            column_names_param,
            arrow_arrays_param,
            partition_values_param,
            ds_mock_kwargs,
        )
        # restart Ray so every case runs against a fresh local-mode instance
        ray.shutdown()
        ray.init(local_mode=True)
        assert ray.is_initialized()
        source_partition = ds.get_partition(
            source_table_stream.locator,
            partition_values_param,
            **ds_mock_kwargs,
        )
        destination_partition_locator = PartitionLocator.of(
            destination_table_stream.locator,
            partition_values_param,
            None,
        )
        num_workers, worker_instance_cpu = 1, 1
        total_cpus = num_workers * worker_instance_cpu
        pgm = None
        if create_placement_group_param:
            pgm = PlacementGroupManager(
                1, total_cpus, worker_instance_cpu, memory_per_bundle=4000000
            ).pgs[0]
        compact_partition_params = CompactPartitionParams.of(
            {
                "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
                "compacted_file_content_type": ContentType.PARQUET,
                "dd_max_parallelism_ratio": 1.0,
                "deltacat_storage": ds,
                "deltacat_storage_kwargs": ds_mock_kwargs,
                "destination_partition_locator": destination_partition_locator,
                "hash_bucket_count": hash_bucket_count_param,
                "last_stream_position_to_compact": source_partition.stream_position,
                "list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
                "pg_config": pgm,
                "primary_keys": primary_keys_param,
                "rebase_source_partition_locator": rebase_source_partition_locator_param,
                "records_per_compacted_file": records_per_compacted_file_param,
                "s3_client_kwargs": {},
                "source_partition_locator": source_partition.locator,
                "sort_keys": sort_keys if sort_keys else None,
            }
        )
        # execute
        rcf_file_s3_uri = compact_partition_func(compact_partition_params)
        # validate
        # only the last path segment of the returned URI is used as the key
        _, rcf_object_key = rcf_file_s3_uri.rsplit("/", 1)
        rcf_file_output: Dict[str, Any] = read_s3_contents(
            setup_s3_resource, TEST_S3_RCF_BUCKET_NAME, rcf_object_key
        )
        round_completion_info = RoundCompletionInfo(**rcf_file_output)
        print(f"rcf_file_output: {json.dumps(rcf_file_output, indent=2)}")
        compacted_delta_locator = round_completion_info.compacted_delta_locator
        tables = ds.download_delta(compacted_delta_locator, **ds_mock_kwargs)
        compacted_table = pa.concat_tables(tables)
        assert compacted_table.equals(
            expected_result
        ), f"{compacted_table} does not match {expected_result}"
        if (
            validation_callback_func is not None
            and validation_callback_func_kwargs is not None
        ):
            validation_callback_func(**validation_callback_func_kwargs)
    finally:
        # Request the teardown even when an assertion above failed; otherwise
        # the database written by a failing case is still there for the
        # cases that follow it.
        # https://docs.pytest.org/en/7.1.x/reference/reference.html#pytest.FixtureRequest.getfixturevalue
        if do_teardown_local_deltacat_storage_db:
            request.getfixturevalue("teardown_local_deltacat_storage_db")
|