deltacat 1.1.8__py3-none-any.whl → 1.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these package versions as they appear in their respective public registries.
Files changed (59)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +6 -0
  3. deltacat/aws/redshift/model/manifest.py +16 -0
  4. deltacat/aws/s3u.py +65 -38
  5. deltacat/compute/compactor/compaction_session.py +5 -1
  6. deltacat/compute/compactor/model/compact_partition_params.py +12 -1
  7. deltacat/compute/compactor/model/materialize_result.py +0 -4
  8. deltacat/compute/compactor/repartition_session.py +1 -0
  9. deltacat/compute/compactor/utils/round_completion_file.py +39 -9
  10. deltacat/compute/compactor_v2/compaction_session.py +26 -16
  11. deltacat/compute/compactor_v2/constants.py +5 -11
  12. deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
  13. deltacat/compute/compactor_v2/model/merge_input.py +6 -0
  14. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
  15. deltacat/compute/compactor_v2/steps/merge.py +12 -12
  16. deltacat/compute/compactor_v2/utils/merge.py +1 -0
  17. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  18. deltacat/compute/compactor_v2/utils/task_options.py +2 -12
  19. deltacat/exceptions.py +342 -7
  20. deltacat/io/dataset.py +5 -17
  21. deltacat/io/memcached_object_store.py +7 -4
  22. deltacat/storage/__init__.py +24 -0
  23. deltacat/storage/interface.py +56 -6
  24. deltacat/storage/model/delta.py +23 -3
  25. deltacat/storage/model/partition.py +6 -7
  26. deltacat/storage/model/partition_spec.py +71 -0
  27. deltacat/storage/model/stream.py +38 -1
  28. deltacat/storage/model/transform.py +127 -0
  29. deltacat/tests/aws/test_s3u.py +2 -0
  30. deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
  31. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
  32. deltacat/tests/compute/compact_partition_test_cases.py +4 -2
  33. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
  34. deltacat/tests/compute/compactor_v2/test_compaction_session.py +204 -37
  35. deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
  36. deltacat/tests/compute/test_util_common.py +19 -4
  37. deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
  38. deltacat/tests/io/test_memcached_object_store.py +5 -2
  39. deltacat/tests/local_deltacat_storage/__init__.py +124 -29
  40. deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
  41. deltacat/tests/test_exceptions.py +100 -0
  42. deltacat/tests/test_logs.py +1 -0
  43. deltacat/tests/test_utils/pyarrow.py +4 -1
  44. deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
  45. deltacat/tests/utils/test_daft.py +0 -1
  46. deltacat/tests/utils/test_resources.py +0 -28
  47. deltacat/utils/daft.py +3 -0
  48. deltacat/utils/numpy.py +3 -3
  49. deltacat/utils/pandas.py +3 -3
  50. deltacat/utils/pyarrow.py +11 -8
  51. deltacat/utils/ray_utils/dataset.py +7 -7
  52. deltacat/utils/ray_utils/runtime.py +2 -2
  53. deltacat/utils/resources.py +0 -45
  54. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/METADATA +6 -5
  55. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/RECORD +58 -51
  56. deltacat/io/aws/redshift/redshift_datasource.py +0 -578
  57. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
  58. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
  59. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0

deltacat/tests/compute/compact_partition_rebase_test_cases.py (new file)
@@ -0,0 +1,88 @@
+import pyarrow as pa
+from deltacat.tests.compute.test_util_common import (
+    PartitionKey,
+    PartitionKeyType,
+)
+from deltacat.tests.compute.test_util_constant import (
+    DEFAULT_MAX_RECORDS_PER_FILE,
+    DEFAULT_HASH_BUCKET_COUNT,
+)
+from dataclasses import dataclass
+
+
+from deltacat.storage import (
+    DeltaType,
+)
+
+from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+
+from deltacat.storage.model.sort_key import SortKey
+
+from deltacat.tests.compute.compact_partition_test_cases import (
+    BaseCompactorTestCase,
+    with_compactor_version_func_test_param,
+)
+
+
+@dataclass(frozen=True)
+class RebaseCompactionTestCaseParams(BaseCompactorTestCase):
+    """
+    A pytest parameterized test case for the `compact_partition` function with rebase compaction.
+
+    Args:
+        * (inherited from CompactorTestCase): see CompactorTestCase docstring for details
+        rebase_expected_compact_partition_result: pa.Table - expected table after rebase compaction runs. An output that is asserted on in Rebase unit tests
+    """
+
+    rebase_expected_compact_partition_result: pa.Table
+
+
+REBASE_TEST_CASES = {
+    "1-rebase-sanity": RebaseCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[
+            SortKey.of(key_name="sk_col_1"),
+            SortKey.of(key_name="sk_col_2"),
+        ],
+        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
+        partition_values=["1"],
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        rebase_expected_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(20, 30)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(40, 50)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        expected_terminal_exception=None,
+        expected_terminal_exception_message=None,
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
+    ),
+}
+
+REBASE_TEST_CASES = with_compactor_version_func_test_param(REBASE_TEST_CASES)

deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py
@@ -37,6 +37,7 @@ from deltacat.tests.compute.compact_partition_test_cases import (
     EMPTY_UTSV_PATH,
 )
 from deltacat.storage import DeleteParameters
+from deltacat.exceptions import ValidationError


 @dataclass(frozen=True)
@@ -1538,8 +1539,8 @@ REBASE_THEN_INCREMENTAL_DELETE_DELTA_TYPE_TEST_CASES = {
                 ]
             ),
         ),
-        expected_terminal_exception=AssertionError,
-        expected_terminal_exception_message="Delete type deltas are required to have delete parameters defined",
+        expected_terminal_exception=ValidationError,
+        expected_terminal_exception_message="One of the assertions in DeltaCAT has failed",
         do_create_placement_group=True,
        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
        hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
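
The hunks above swap the expected AssertionError for DeltaCAT's new ValidationError, imported from deltacat/exceptions.py (+342 lines in this release). A minimal sketch of how a test might assert on it, assuming ValidationError is a plain Exception subclass that carries the generic message shown above; the helper run_invalid_compaction is hypothetical and stands in for a compact_partition call with invalid parameters:

    import pytest
    from deltacat.exceptions import ValidationError

    def run_invalid_compaction():
        # Hypothetical stand-in for a compact_partition call that trips a
        # DeltaCAT validation check (e.g. a DELETE delta without delete parameters).
        raise ValidationError("One of the assertions in DeltaCAT has failed")

    def test_invalid_params_surface_validation_error():
        # pytest.raises(match=...) runs a regex search against str(exception).
        with pytest.raises(
            ValidationError, match="One of the assertions in DeltaCAT has failed"
        ):
            run_invalid_compaction()
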
deltacat/tests/compute/compact_partition_test_cases.py
@@ -28,6 +28,8 @@ from deltacat.compute.compactor.model.compactor_version import CompactorVersion

 from deltacat.storage.model.sort_key import SortKey

+from deltacat.exceptions import ValidationError
+
 ZERO_VALUED_SORT_KEY, ZERO_VALUED_PARTITION_VALUES_PARAM = [], []
 ZERO_VALUED_PARTITION_KEYS_PARAM = None
 ZERO_VALUED_PRIMARY_KEY = {}
@@ -570,8 +572,8 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
             ],
             names=["pk_col_1", "sk_col_1"],
         ),
-        expected_terminal_exception=AssertionError,
-        expected_terminal_exception_message="hash_bucket_count is a required arg for compactor v2",
+        expected_terminal_exception=ValidationError,
+        expected_terminal_exception_message="One of the assertions in DeltaCAT has failed",
         do_create_placement_group=False,
         records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
         hash_bucket_count=None,

deltacat/tests/compute/compactor/utils/test_round_completion_file.py (new file)
@@ -0,0 +1,209 @@
+import pytest
+import os
+from moto import mock_s3
+import boto3
+from boto3.resources.base import ServiceResource
+from deltacat.compute.compactor.utils.round_completion_file import (
+    read_round_completion_file,
+    write_round_completion_file,
+)
+from deltacat.tests.compute.test_util_common import get_test_partition_locator
+from deltacat.compute.compactor import RoundCompletionInfo
+
+RCF_BUCKET_NAME = "rcf-bucket"
+
+
+@pytest.fixture(autouse=True, scope="module")
+def mock_aws_credential():
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
+    os.environ["AWS_SECURITY_TOKEN"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+    yield
+
+
+@pytest.fixture(autouse=True, scope="module")
+def s3_resource(mock_aws_credential):
+    with mock_s3():
+        yield boto3.resource("s3")
+
+
+@pytest.fixture(autouse=True, scope="function")
+def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
+    s3_resource.create_bucket(
+        ACL="authenticated-read",
+        Bucket=RCF_BUCKET_NAME,
+    )
+    yield
+    s3_resource.Bucket(RCF_BUCKET_NAME).objects.all().delete()
+
+
+class TestReadWriteRoundCompletionFile:
+    def test_read_when_rcf_written_without_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+        destination_locator = get_test_partition_locator("destination")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        rcf_url = write_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, None, expected_rcf
+        )
+
+        rcf = read_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator
+        )
+
+        assert (
+            rcf_url == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507.json"
+        )
+        assert rcf == expected_rcf
+
+    def test_read_when_rcf_written_with_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+        destination_locator = get_test_partition_locator("destination")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        rcf_url = write_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
+        )
+
+        rcf = read_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator
+        )
+
+        assert (
+            rcf_url
+            == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507/e9939deadc091b3289a2eb0ca56b1ba86b9892f4.json"
+        )
+        assert rcf == expected_rcf
+
+    def test_read_without_destination_when_rcf_written_with_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+        destination_locator = get_test_partition_locator("destination")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        write_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
+        )
+
+        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
+
+        assert rcf is None
+
+    def test_read_without_destination_when_rcf_written_without_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)
+
+        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
+
+        assert rcf == expected_rcf
+
+    def test_read_when_rcf_written_both_with_and_without_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+        destination_locator = get_test_partition_locator("destination")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        expected_rcf_2 = RoundCompletionInfo.of(
+            high_watermark=1223,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=1233,
+        )
+
+        write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)
+
+        write_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf_2
+        )
+
+        rcf = read_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator
+        )
+
+        assert rcf == expected_rcf_2
+
+    def test_write_when_custom_url_is_passed(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        completion_file_s3_url = f"s3://{RCF_BUCKET_NAME}/test.json"
+        rcf_url = write_round_completion_file(
+            RCF_BUCKET_NAME,
+            source_locator,
+            None,
+            expected_rcf,
+            completion_file_s3_url=completion_file_s3_url,
+        )
+
+        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
+
+        assert rcf_url == completion_file_s3_url
+        assert rcf is None
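
The new tests above exercise the destination-aware round completion file (RCF) path added in round_completion_file.py (+39 -9). A condensed sketch of the round trip they verify, using only the call signatures that appear in the tests; the bucket name and locators are placeholders, and an S3 endpoint (real or moto-mocked) is assumed:

    from deltacat.compute.compactor import RoundCompletionInfo
    from deltacat.compute.compactor.utils.round_completion_file import (
        read_round_completion_file,
        write_round_completion_file,
    )

    def round_trip_rcf(bucket, source_locator, destination_locator):
        rcf = RoundCompletionInfo.of(
            high_watermark=122,
            compacted_delta_locator={},
            compacted_pyarrow_write_result={},
            sort_keys_bit_width=12,
        )
        # Writing with a destination locator keys the RCF under
        # <source-hash>/<destination-hash>.json instead of <source-hash>.json.
        write_round_completion_file(bucket, source_locator, destination_locator, rcf)
        # A read that passes the same destination locator resolves the new path,
        # and the tests above show it also falls back to the legacy
        # source-only path when the RCF was written without a destination.
        return read_round_completion_file(bucket, source_locator, destination_locator)
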
deltacat/tests/compute/compactor_v2/test_compaction_session.py
@@ -1,88 +1,255 @@
-import unittest
-import sqlite3
+from typing import Dict, Any
 import ray
 import os
-from unittest.mock import patch
+import pytest
+import boto3
+from deltacat.compute.compactor.model.compaction_session_audit_info import (
+    CompactionSessionAuditInfo,
+)
+from boto3.resources.base import ServiceResource
 import deltacat.tests.local_deltacat_storage as ds
 from deltacat.types.media import ContentType
-from deltacat.compute.compactor_v2.compaction_session import compact_partition
+from deltacat.compute.compactor_v2.compaction_session import (
+    compact_partition,
+)
 from deltacat.compute.compactor.model.compact_partition_params import (
     CompactPartitionParams,
 )
-from deltacat.utils.common import current_time_ms
-from deltacat.tests.test_utils.pyarrow import stage_partition_from_file_paths
+from deltacat.tests.test_utils.utils import read_s3_contents
+from deltacat.tests.compute.test_util_constant import (
+    TEST_S3_RCF_BUCKET_NAME,
+)
+from deltacat.tests.compute.test_util_common import get_rcf
+from deltacat.tests.test_utils.pyarrow import (
+    stage_partition_from_file_paths,
+    commit_delta_to_staged_partition,
+    commit_delta_to_partition,
+)
+from moto import mock_s3
+
+DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
+    "db_file_path",
+    "deltacat/tests/local_deltacat_storage/db_test.sqlite",
+)
+
+
+@pytest.fixture(autouse=True, scope="module")
+def setup_ray_cluster():
+    ray.init(local_mode=True, ignore_reinit_error=True)
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture(autouse=True, scope="module")
+def mock_aws_credential():
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
+    os.environ["AWS_SECURITY_TOKEN"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+    yield
+

+@pytest.fixture(scope="module")
+def s3_resource(mock_aws_credential):
+    with mock_s3():
+        yield boto3.resource("s3")

-class TestCompactionSession(unittest.TestCase):
+
+@pytest.fixture(autouse=True, scope="module")
+def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
+    s3_resource.create_bucket(
+        ACL="authenticated-read",
+        Bucket=TEST_S3_RCF_BUCKET_NAME,
+    )
+    yield
+
+
+@pytest.fixture(scope="function")
+def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
+    kwargs_for_local_deltacat_storage: Dict[str, Any] = {
+        DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
+    }
+    yield kwargs_for_local_deltacat_storage
+    if os.path.exists(DATABASE_FILE_PATH_VALUE):
+        os.remove(DATABASE_FILE_PATH_VALUE)
+
+
+class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
     """

-    DB_FILE_PATH = f"{current_time_ms()}.db"
     NAMESPACE = "compact_partition_v2_namespace"
+    BACKFILL_FILE_PATH = (
+        "deltacat/tests/compute/compactor_v2/data/backfill_source_date_pk.csv"
+    )
+    INCREMENTAL_FILE_PATH = (
+        "deltacat/tests/compute/compactor_v2/data/incremental_source_date_pk.csv"
+    )
+
+    def test_compact_partition_when_no_input_deltas_to_compact(
+        self, local_deltacat_storage_kwargs
+    ):
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["test"], **local_deltacat_storage_kwargs
+        )
+        source_partition = ds.commit_partition(
+            staged_source, **local_deltacat_storage_kwargs
+        )

-    @classmethod
-    def setUpClass(cls):
-        ray.init(local_mode=True, ignore_reinit_error=True)
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )

-        con = sqlite3.connect(cls.DB_FILE_PATH)
-        cur = con.cursor()
-        cls.kwargs = {ds.SQLITE_CON_ARG: con, ds.SQLITE_CUR_ARG: cur}
-        cls.deltacat_storage_kwargs = {ds.DB_FILE_PATH_ARG: cls.DB_FILE_PATH}
+        # action
+        rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 2,
+                    "last_stream_position_to_compact": source_partition.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": None,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_partition.locator,
+                }
+            )
+        )

-        super().setUpClass()
+        # verify that no RCF is written
+        assert rcf_url is None

-    @classmethod
-    def doClassCleanups(cls) -> None:
-        os.remove(cls.DB_FILE_PATH)
-        ray.shutdown()
-        super().tearDownClass()
+    def test_compact_partition_when_rcf_was_written_by_past_commit(
+        self, s3_resource, local_deltacat_storage_kwargs
+    ):
+        """
+        Backward compatibility test for when a RCF was written by a previous commit.
+        """

-    @patch("deltacat.compute.compactor_v2.compaction_session.rcf")
-    @patch("deltacat.compute.compactor_v2.compaction_session.s3_utils")
-    def test_compact_partition_when_no_input_deltas_to_compact(self, s3_utils, rcf_url):
         # setup
-        rcf_url.read_round_completion_file.return_value = None
         staged_source = stage_partition_from_file_paths(
-            self.NAMESPACE, ["test"], **self.deltacat_storage_kwargs
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
         )
-        source_partition = ds.commit_partition(
-            staged_source, **self.deltacat_storage_kwargs
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
         )

         staged_dest = stage_partition_from_file_paths(
-            self.NAMESPACE, ["destination"], **self.deltacat_storage_kwargs
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
         )
         dest_partition = ds.commit_partition(
-            staged_dest, **self.deltacat_storage_kwargs
+            staged_dest, **local_deltacat_storage_kwargs
         )

         # action
         rcf_url = compact_partition(
             CompactPartitionParams.of(
                 {
-                    "compaction_artifact_s3_bucket": "test_bucket",
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
                     "compacted_file_content_type": ContentType.PARQUET,
                     "dd_max_parallelism_ratio": 1.0,
                     "deltacat_storage": ds,
-                    "deltacat_storage_kwargs": self.deltacat_storage_kwargs,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
                     "destination_partition_locator": dest_partition.locator,
                     "drop_duplicates": True,
                     "hash_bucket_count": 1,
-                    "last_stream_position_to_compact": source_partition.stream_position,
+                    "last_stream_position_to_compact": source_delta.stream_position,
                     "list_deltas_kwargs": {
-                        **self.deltacat_storage_kwargs,
+                        **local_deltacat_storage_kwargs,
                         **{"equivalent_table_types": []},
                     },
                     "primary_keys": [],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                }
+            )
+        )
+
+        bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
+        assert bucket == TEST_S3_RCF_BUCKET_NAME
+
+        # Now delete the RCF at new location and copy it to old location
+        # Copy the RCF from rcf_url to another location
+        s3_resource.Object(TEST_S3_RCF_BUCKET_NAME, f"{backfill_key1}.json").copy_from(
+            CopySource=f"{TEST_S3_RCF_BUCKET_NAME}/{backfill_key1}/{backfill_key2}"
+        )
+
+        s3_resource.Object(
+            TEST_S3_RCF_BUCKET_NAME, f"{backfill_key1}/{backfill_key2}"
+        ).delete()
+
+        # Now run an incremental compaction and verify if the previous RCF was read properly.
+
+        new_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            [self.INCREMENTAL_FILE_PATH],
+            **local_deltacat_storage_kwargs,
+        )
+
+        new_rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": new_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
                     "rebase_source_partition_locator": None,
                     "rebase_source_partition_high_watermark": None,
                     "records_per_compacted_file": 4000,
                     "s3_client_kwargs": {},
-                    "source_partition_locator": source_partition.locator,
+                    "source_partition_locator": new_source_delta.partition_locator,
                 }
             )
         )

-        # verify that no RCF is written
-        self.assertIsNone(rcf_url)
+        new_bucket, incremental_key1, incremental_key2 = new_rcf_url.strip(
+            "s3://"
+        ).split("/")
+
+        assert new_bucket == TEST_S3_RCF_BUCKET_NAME
+        assert backfill_key1 == incremental_key1
+        assert backfill_key2 != incremental_key2
+
+        rcf = get_rcf(s3_resource, new_rcf_url)
+
+        _, compaction_audit_key = rcf.compaction_audit_url.strip("s3://").split("/", 1)
+        compaction_audit = CompactionSessionAuditInfo(
+            **read_s3_contents(
+                s3_resource, TEST_S3_RCF_BUCKET_NAME, compaction_audit_key
+            )
+        )
+
+        # as it should be running incremental
+        assert compaction_audit.uniform_deltas_created == 1
+        assert compaction_audit.input_records == 6