deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/clients.py +17 -6
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/compute/compactor/compaction_session.py +42 -18
  6. deltacat/compute/compactor/model/compact_partition_params.py +297 -58
  7. deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
  8. deltacat/compute/compactor/model/delta_annotated.py +95 -9
  9. deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
  10. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  11. deltacat/compute/compactor/repartition_session.py +4 -1
  12. deltacat/compute/compactor/steps/dedupe.py +9 -6
  13. deltacat/compute/compactor/steps/hash_bucket.py +24 -3
  14. deltacat/compute/compactor/steps/materialize.py +11 -6
  15. deltacat/compute/compactor/steps/repartition.py +22 -1
  16. deltacat/compute/compactor/utils/io.py +40 -23
  17. deltacat/compute/compactor/utils/sort_key.py +5 -0
  18. deltacat/compute/compactor/utils/system_columns.py +43 -0
  19. deltacat/compute/compactor_v2/compaction_session.py +509 -0
  20. deltacat/compute/compactor_v2/constants.py +37 -0
  21. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  22. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  23. deltacat/compute/compactor_v2/model/merge_input.py +143 -0
  24. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  25. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  26. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  27. deltacat/compute/compactor_v2/steps/merge.py +469 -0
  28. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
  30. deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
  31. deltacat/compute/compactor_v2/utils/io.py +152 -0
  32. deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
  33. deltacat/compute/compactor_v2/utils/task_options.py +221 -0
  34. deltacat/compute/metastats/meta_stats.py +4 -2
  35. deltacat/compute/metastats/stats.py +1 -0
  36. deltacat/compute/metastats/utils/io.py +4 -0
  37. deltacat/compute/stats/utils/io.py +20 -5
  38. deltacat/exceptions.py +4 -0
  39. deltacat/io/memcached_object_store.py +37 -14
  40. deltacat/logs.py +4 -3
  41. deltacat/storage/interface.py +8 -1
  42. deltacat/storage/model/types.py +2 -1
  43. deltacat/tests/aws/test_clients.py +16 -3
  44. deltacat/tests/compute/__init__.py +0 -0
  45. deltacat/tests/compute/common.py +96 -0
  46. deltacat/tests/compute/compactor/__init__.py +0 -0
  47. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  48. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
  49. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  50. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  51. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  52. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  53. deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
  54. deltacat/tests/compute/testcases.py +395 -0
  55. deltacat/tests/io/test_memcached_object_store.py +5 -4
  56. deltacat/tests/local_deltacat_storage/__init__.py +62 -19
  57. deltacat/tests/test_utils/pyarrow.py +49 -0
  58. deltacat/tests/test_utils/utils.py +13 -0
  59. deltacat/tests/utils/data/__init__.py +0 -0
  60. deltacat/tests/utils/test_daft.py +76 -0
  61. deltacat/tests/utils/test_pyarrow.py +133 -0
  62. deltacat/tests/utils/test_resources.py +23 -20
  63. deltacat/types/media.py +1 -0
  64. deltacat/types/partial_download.py +83 -0
  65. deltacat/types/tables.py +6 -0
  66. deltacat/utils/arguments.py +25 -0
  67. deltacat/utils/daft.py +87 -0
  68. deltacat/utils/placement.py +20 -3
  69. deltacat/utils/pyarrow.py +218 -1
  70. deltacat/utils/ray_utils/concurrency.py +26 -1
  71. deltacat/utils/resources.py +72 -1
  72. deltacat/utils/s3fs.py +21 -0
  73. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
  74. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
  75. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
  76. /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
  77. /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
  78. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
  79. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,12 @@
1
1
  import unittest
2
2
  from unittest import mock
3
3
  from deltacat.tests.test_utils.constants import TEST_DELTA
4
+ from typing import Any, Dict
5
+
6
+ DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
7
+ "db_file_path",
8
+ "deltacat/tests/local_deltacat_storage/db_test.sqlite",
9
+ )
4
10
 
5
11
 
6
12
  class TestFitInputDeltas(unittest.TestCase):
@@ -13,6 +19,10 @@ class TestFitInputDeltas(unittest.TestCase):
13
19
  CompactionSessionAuditInfo,
14
20
  )
15
21
 
22
+ cls.kwargs_for_local_deltacat_storage: Dict[str, Any] = {
23
+ DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
24
+ }
25
+
16
26
  cls.COMPACTION_AUDIT = CompactionSessionAuditInfo("1.0", "test")
17
27
 
18
28
  super().setUpClass()
@@ -23,6 +33,7 @@ class TestFitInputDeltas(unittest.TestCase):
23
33
 
24
34
  def test_sanity(self):
25
35
  from deltacat.compute.compactor.utils import io
36
+ import deltacat.tests.local_deltacat_storage as ds
26
37
 
27
38
  (
28
39
  delta_list,
@@ -30,7 +41,12 @@ class TestFitInputDeltas(unittest.TestCase):
30
41
  high_watermark,
31
42
  require_multiple_rounds,
32
43
  ) = io.fit_input_deltas(
33
- [TEST_DELTA], {"CPU": 1, "memory": 20000000}, self.COMPACTION_AUDIT, None
44
+ [TEST_DELTA],
45
+ {"CPU": 1, "memory": 20000000},
46
+ self.COMPACTION_AUDIT,
47
+ None,
48
+ ds,
49
+ self.kwargs_for_local_deltacat_storage,
34
50
  )
35
51
 
36
52
  self.assertIsNotNone(hash_bucket_count)
@@ -44,6 +60,7 @@ class TestFitInputDeltas(unittest.TestCase):
44
60
 
45
61
  def test_when_hash_bucket_count_overridden(self):
46
62
  from deltacat.compute.compactor.utils import io
63
+ import deltacat.tests.local_deltacat_storage as ds
47
64
 
48
65
  (
49
66
  delta_list,
@@ -51,7 +68,12 @@ class TestFitInputDeltas(unittest.TestCase):
51
68
  high_watermark,
52
69
  require_multiple_rounds,
53
70
  ) = io.fit_input_deltas(
54
- [TEST_DELTA], {"CPU": 1, "memory": 20000000}, self.COMPACTION_AUDIT, 20
71
+ [TEST_DELTA],
72
+ {"CPU": 1, "memory": 20000000},
73
+ self.COMPACTION_AUDIT,
74
+ 20,
75
+ ds,
76
+ self.kwargs_for_local_deltacat_storage,
55
77
  )
56
78
 
57
79
  self.assertEqual(20, hash_bucket_count)
@@ -61,6 +83,7 @@ class TestFitInputDeltas(unittest.TestCase):
61
83
 
62
84
  def test_when_not_enough_memory_splits_manifest_entries(self):
63
85
  from deltacat.compute.compactor.utils import io
86
+ import deltacat.tests.local_deltacat_storage as ds
64
87
 
65
88
  (
66
89
  delta_list,
@@ -68,7 +91,12 @@ class TestFitInputDeltas(unittest.TestCase):
68
91
  high_watermark,
69
92
  require_multiple_rounds,
70
93
  ) = io.fit_input_deltas(
71
- [TEST_DELTA], {"CPU": 2, "memory": 10}, self.COMPACTION_AUDIT, 20
94
+ [TEST_DELTA],
95
+ {"CPU": 2, "memory": 10},
96
+ self.COMPACTION_AUDIT,
97
+ 20,
98
+ ds,
99
+ self.kwargs_for_local_deltacat_storage,
72
100
  )
73
101
 
74
102
  self.assertIsNotNone(hash_bucket_count)
@@ -78,14 +106,28 @@ class TestFitInputDeltas(unittest.TestCase):
78
106
 
79
107
  def test_when_no_input_deltas(self):
80
108
  from deltacat.compute.compactor.utils import io
109
+ import deltacat.tests.local_deltacat_storage as ds
81
110
 
82
111
  with self.assertRaises(AssertionError):
83
112
  io.fit_input_deltas(
84
- [], {"CPU": 100, "memory": 20000.0}, self.COMPACTION_AUDIT, None
113
+ [],
114
+ {"CPU": 100, "memory": 20000.0},
115
+ self.COMPACTION_AUDIT,
116
+ None,
117
+ ds,
118
+ self.kwargs_for_local_deltacat_storage,
85
119
  )
86
120
 
87
121
  def test_when_cpu_resources_is_not_passed(self):
88
122
  from deltacat.compute.compactor.utils import io
123
+ import deltacat.tests.local_deltacat_storage as ds
89
124
 
90
125
  with self.assertRaises(KeyError):
91
- io.fit_input_deltas([], {}, self.COMPACTION_AUDIT, None)
126
+ io.fit_input_deltas(
127
+ [],
128
+ {},
129
+ self.COMPACTION_AUDIT,
130
+ None,
131
+ ds,
132
+ self.kwargs_for_local_deltacat_storage,
133
+ )
File without changes
@@ -27,7 +27,7 @@ class TestCompactPartitionParams(unittest.TestCase):
27
27
  "partitionValues": [],
28
28
  "partitionId": None,
29
29
  },
30
- "hash_bucket_count": None,
30
+ "hash_bucket_count": 200,
31
31
  "last_stream_position_to_compact": 168000000000,
32
32
  "list_deltas_kwargs": {"equivalent_table_types": []},
33
33
  "primary_keys": {"id"},
@@ -75,41 +75,16 @@ class TestCompactPartitionParams(unittest.TestCase):
75
75
 
76
76
  super().setUpClass()
77
77
 
78
- def test_destination_partition_locator_is_optional(self):
79
- from deltacat.compute.compactor.model.compact_partition_params import (
80
- CompactPartitionParams,
81
- )
82
-
83
- params = CompactPartitionParams.of({})
84
- assert params.destination_partition_locator is None
85
-
86
78
  def test_serialize_returns_json_string(self):
87
79
  from deltacat.compute.compactor.model.compact_partition_params import (
88
80
  CompactPartitionParams,
89
81
  )
90
82
 
91
83
  params = CompactPartitionParams.of(
92
- {"destination_partition_locator": "my-partition"}
84
+ TestCompactPartitionParams.VALID_COMPACT_PARTITION_PARAMS
93
85
  )
94
86
  serialized_params = params.serialize()
95
87
  assert isinstance(serialized_params, str)
96
- assert json.loads(serialized_params) == {
97
- "compacted_file_content_type": None,
98
- "compaction_artifact_s3_bucket": None,
99
- "deltacat_storage": None,
100
- "hash_bucket_count": None,
101
- "last_stream_position_to_compact": None,
102
- "list_deltas_kwargs": None,
103
- "pg_config": None,
104
- "primary_keys": None,
105
- "properties": None,
106
- "read_kwargs_provider": None,
107
- "rebase_source_partition_high_watermark": None,
108
- "rebase_source_partition_locator": None,
109
- "s3_table_writer_kwargs": None,
110
- "source_partition_locator": None,
111
- "destination_partition_locator": "my-partition",
112
- }
113
88
 
114
89
  def test_serialize_returns_json_string_with_all_fields(self):
115
90
  from deltacat.compute.compactor.model.compact_partition_params import (
@@ -142,7 +117,6 @@ class TestCompactPartitionParams(unittest.TestCase):
142
117
  == params.list_deltas_kwargs
143
118
  )
144
119
  assert json.loads(serialized_params)["primary_keys"] == params.primary_keys
145
- assert json.loads(serialized_params)["properties"] == params.properties
146
120
  assert (
147
121
  json.loads(serialized_params)["rebase_source_partition_high_watermark"]
148
122
  == params.rebase_source_partition_high_watermark
@@ -165,7 +139,12 @@ class TestCompactPartitionParams(unittest.TestCase):
165
139
  CompactPartitionParams,
166
140
  )
167
141
 
168
- params = CompactPartitionParams.of({"primary_keys": {"foo", "bar", "baz"}})
142
+ params = CompactPartitionParams.of(
143
+ {
144
+ **TestCompactPartitionParams.VALID_COMPACT_PARTITION_PARAMS,
145
+ "primary_keys": {"foo", "bar", "baz"},
146
+ }
147
+ )
169
148
  serialized_params = params.serialize()
170
149
  self.assertCountEqual(
171
150
  json.loads(serialized_params)["primary_keys"], ["foo", "bar", "baz"]
@@ -180,7 +159,12 @@ class TestCompactPartitionParams(unittest.TestCase):
180
159
  def toJSON(self) -> str:
181
160
  return "my-json-object"
182
161
 
183
- params = CompactPartitionParams.of({"compacted_file_content_type": MyObject()})
162
+ params = CompactPartitionParams.of(
163
+ {
164
+ **TestCompactPartitionParams.VALID_COMPACT_PARTITION_PARAMS,
165
+ "compacted_file_content_type": MyObject(),
166
+ }
167
+ )
184
168
  serialized_params = params.serialize()
185
169
  assert (
186
170
  json.loads(serialized_params)["compacted_file_content_type"]
@@ -0,0 +1,363 @@
1
+ import ray
2
+ from moto import mock_s3
3
+ import pytest
4
+ import os
5
+ import json
6
+ import boto3
7
+ from typing import Any, Dict, List, Optional, Set
8
+ from boto3.resources.base import ServiceResource
9
+ import pyarrow as pa
10
+ from deltacat.tests.test_utils.utils import read_s3_contents
11
+ from deltacat.tests.compute.common import (
12
+ setup_sort_and_partition_keys,
13
+ PartitionKey,
14
+ TEST_S3_RCF_BUCKET_NAME,
15
+ BASE_TEST_SOURCE_NAMESPACE,
16
+ BASE_TEST_SOURCE_TABLE_NAME,
17
+ BASE_TEST_DESTINATION_NAMESPACE,
18
+ BASE_TEST_DESTINATION_TABLE_NAME,
19
+ )
20
+ from deltacat.tests.compute.testcases import (
21
+ INCREMENTAL_TEST_CASES,
22
+ )
23
+
24
+ DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
25
+ "db_file_path",
26
+ "deltacat/tests/local_deltacat_storage/db_test.sqlite",
27
+ )
28
+
29
+ """
30
+ MODULE scoped fixtures
31
+ """
32
+
33
+
34
+ @pytest.fixture(autouse=True, scope="module")
35
+ def mock_aws_credential():
36
+ os.environ["AWS_ACCESS_KEY_ID"] = "testing"
37
+ os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
38
+ os.environ["AWS_SECURITY_TOKEN"] = "testing"
39
+ os.environ["AWS_SESSION_TOKEN"] = "testing"
40
+ os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
41
+ yield
42
+
43
+
44
+ @pytest.fixture(autouse=True, scope="module")
45
+ def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
46
+ # make sure the database file is deleted after all the compactor package tests are completed
47
+ if os.path.exists(DATABASE_FILE_PATH_VALUE):
48
+ os.remove(DATABASE_FILE_PATH_VALUE)
49
+
50
+
51
+ @pytest.fixture(scope="module")
52
+ def setup_s3_resource(mock_aws_credential):
53
+ with mock_s3():
54
+ yield boto3.resource("s3")
55
+
56
+
57
+ @pytest.fixture(scope="module")
58
+ def setup_compaction_artifacts_s3_bucket(setup_s3_resource: ServiceResource):
59
+ setup_s3_resource.create_bucket(
60
+ ACL="authenticated-read",
61
+ Bucket=TEST_S3_RCF_BUCKET_NAME,
62
+ )
63
+ yield
64
+
65
+
66
+ """
67
+ FUNCTION scoped fixtures
68
+ """
69
+
70
+
71
+ @pytest.fixture(scope="function")
72
+ def teardown_local_deltacat_storage_db():
73
+ if os.path.exists(DATABASE_FILE_PATH_VALUE):
74
+ os.remove(DATABASE_FILE_PATH_VALUE)
75
+
76
+
77
+ @pytest.fixture(scope="function")
78
+ def setup_local_deltacat_storage_conn():
79
+ # see deltacat/tests/local_deltacat_storage/README.md for documentation
80
+ kwargs_for_local_deltacat_storage: Dict[str, Any] = {
81
+ DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
82
+ }
83
+ yield kwargs_for_local_deltacat_storage
84
+
85
+
86
+ def setup_incremental_source_and_destination_tables(
87
+ source_table_version: str,
88
+ destination_table_version: str,
89
+ primary_keys: Set[str],
90
+ sort_keys: Optional[List[Any]],
91
+ partition_keys: Optional[List[PartitionKey]],
92
+ column_names: List[str],
93
+ arrow_arrays: List[pa.Array],
94
+ partition_values: Optional[List[Any]],
95
+ ds_mock_kwargs: Optional[Dict[str, Any]],
96
+ source_namespace: str = BASE_TEST_SOURCE_NAMESPACE,
97
+ source_table_name: str = BASE_TEST_SOURCE_TABLE_NAME,
98
+ destination_namespace: str = BASE_TEST_DESTINATION_NAMESPACE,
99
+ destination_table_name: str = BASE_TEST_DESTINATION_TABLE_NAME,
100
+ ):
101
+ import deltacat.tests.local_deltacat_storage as ds
102
+ from deltacat.types.media import ContentType
103
+ from deltacat.storage import Partition, Stream
104
+
105
+ ds.create_namespace(source_namespace, {}, **ds_mock_kwargs)
106
+ ds.create_table_version(
107
+ source_namespace,
108
+ source_table_name,
109
+ source_table_version,
110
+ primary_key_column_names=list(primary_keys),
111
+ sort_keys=sort_keys,
112
+ partition_keys=partition_keys,
113
+ supported_content_types=[ContentType.PARQUET],
114
+ **ds_mock_kwargs,
115
+ )
116
+ source_table_stream: Stream = ds.get_stream(
117
+ namespace=source_namespace,
118
+ table_name=source_table_name,
119
+ table_version=source_table_version,
120
+ **ds_mock_kwargs,
121
+ )
122
+ test_table: pa.Table = pa.Table.from_arrays(arrow_arrays, names=column_names)
123
+ staged_partition: Partition = ds.stage_partition(
124
+ source_table_stream, partition_values, **ds_mock_kwargs
125
+ )
126
+ ds.commit_delta(
127
+ ds.stage_delta(test_table, staged_partition, **ds_mock_kwargs), **ds_mock_kwargs
128
+ )
129
+ ds.commit_partition(staged_partition, **ds_mock_kwargs)
130
+ # create the destination table
131
+ ds.create_namespace(destination_namespace, {}, **ds_mock_kwargs)
132
+ ds.create_table_version(
133
+ destination_namespace,
134
+ destination_table_name,
135
+ destination_table_version,
136
+ primary_key_column_names=list(primary_keys),
137
+ sort_keys=sort_keys,
138
+ partition_keys=partition_keys,
139
+ supported_content_types=[ContentType.PARQUET],
140
+ **ds_mock_kwargs,
141
+ )
142
+ destination_table_stream: Stream = ds.get_stream(
143
+ namespace=destination_namespace,
144
+ table_name=destination_table_name,
145
+ table_version=destination_table_version,
146
+ **ds_mock_kwargs,
147
+ )
148
+ source_table_stream_after_committed: Stream = ds.get_stream(
149
+ namespace=source_namespace,
150
+ table_name=source_table_name,
151
+ table_version=source_table_version,
152
+ **ds_mock_kwargs,
153
+ )
154
+ return source_table_stream_after_committed, destination_table_stream
155
+
156
+
157
+ @pytest.mark.parametrize(
158
+ [
159
+ "test_name",
160
+ "source_table_version",
161
+ "destination_table_version",
162
+ "primary_keys_param",
163
+ "sort_keys_param",
164
+ "partition_keys_param",
165
+ "column_names_param",
166
+ "arrow_arrays_param",
167
+ "rebase_source_partition_locator_param",
168
+ "partition_values_param",
169
+ "expected_result",
170
+ "validation_callback_func",
171
+ "validation_callback_func_kwargs",
172
+ "do_teardown_local_deltacat_storage_db",
173
+ "use_prev_compacted",
174
+ "create_placement_group_param",
175
+ "records_per_compacted_file_param",
176
+ "hash_bucket_count_param",
177
+ "compact_partition_func",
178
+ ],
179
+ [
180
+ (
181
+ test_name,
182
+ source_table_version,
183
+ destination_table_version,
184
+ primary_keys_param,
185
+ sort_keys_param,
186
+ partition_keys_param,
187
+ column_names_param,
188
+ arrow_arrays_param,
189
+ rebase_source_partition_locator_param,
190
+ partition_values_param,
191
+ expected_result,
192
+ validation_callback_func,
193
+ validation_callback_func_kwargs,
194
+ do_teardown_local_deltacat_storage_db,
195
+ use_prev_compacted,
196
+ create_placement_group_param,
197
+ records_per_compacted_file_param,
198
+ hash_bucket_count_param,
199
+ compact_partition_func,
200
+ )
201
+ for test_name, (
202
+ source_table_version,
203
+ destination_table_version,
204
+ primary_keys_param,
205
+ sort_keys_param,
206
+ partition_keys_param,
207
+ column_names_param,
208
+ arrow_arrays_param,
209
+ rebase_source_partition_locator_param,
210
+ partition_values_param,
211
+ expected_result,
212
+ validation_callback_func,
213
+ validation_callback_func_kwargs,
214
+ do_teardown_local_deltacat_storage_db,
215
+ use_prev_compacted,
216
+ create_placement_group_param,
217
+ records_per_compacted_file_param,
218
+ hash_bucket_count_param,
219
+ compact_partition_func,
220
+ ) in INCREMENTAL_TEST_CASES.items()
221
+ ],
222
+ ids=[test_name for test_name in INCREMENTAL_TEST_CASES],
223
+ indirect=[],
224
+ )
225
+ def test_compact_partition_incremental(
226
+ request: pytest.FixtureRequest,
227
+ setup_s3_resource: ServiceResource,
228
+ setup_local_deltacat_storage_conn: Dict[str, Any],
229
+ setup_compaction_artifacts_s3_bucket: None,
230
+ test_name: str,
231
+ source_table_version: str,
232
+ destination_table_version: str,
233
+ primary_keys_param: Set[str],
234
+ sort_keys_param,
235
+ partition_keys_param,
236
+ column_names_param: List[str],
237
+ arrow_arrays_param: List[pa.Array],
238
+ rebase_source_partition_locator_param,
239
+ partition_values_param,
240
+ expected_result,
241
+ # use and implement func and func_kwargs if you want to run additional validations apart from the ones in the test
242
+ validation_callback_func,
243
+ validation_callback_func_kwargs,
244
+ do_teardown_local_deltacat_storage_db,
245
+ use_prev_compacted,
246
+ create_placement_group_param,
247
+ records_per_compacted_file_param,
248
+ hash_bucket_count_param,
249
+ compact_partition_func,
250
+ ):
251
+ import deltacat.tests.local_deltacat_storage as ds
252
+ from deltacat.types.media import ContentType
253
+ from deltacat.storage import (
254
+ PartitionLocator,
255
+ )
256
+ from deltacat.compute.compactor.model.compact_partition_params import (
257
+ CompactPartitionParams,
258
+ )
259
+ from deltacat.utils.placement import (
260
+ PlacementGroupManager,
261
+ )
262
+ from deltacat.compute.compactor import (
263
+ RoundCompletionInfo,
264
+ )
265
+
266
+ ds_mock_kwargs = setup_local_deltacat_storage_conn
267
+
268
+ # setup
269
+ sort_keys, partition_keys = setup_sort_and_partition_keys(
270
+ sort_keys_param, partition_keys_param
271
+ )
272
+ (
273
+ source_table_stream,
274
+ destination_table_stream,
275
+ ) = setup_incremental_source_and_destination_tables(
276
+ source_table_version,
277
+ destination_table_version,
278
+ primary_keys_param,
279
+ sort_keys,
280
+ partition_keys,
281
+ column_names_param,
282
+ arrow_arrays_param,
283
+ partition_values_param,
284
+ ds_mock_kwargs,
285
+ f"{test_name}_src_namespace",
286
+ f"{test_name}_table_src",
287
+ f"{test_name}_dest_namespace",
288
+ f"{test_name}_table_dest",
289
+ )
290
+ ray.shutdown()
291
+ ray.init(local_mode=True)
292
+ assert ray.is_initialized()
293
+ source_partition = ds.get_partition(
294
+ source_table_stream.locator,
295
+ partition_values_param,
296
+ **ds_mock_kwargs,
297
+ )
298
+ destination_partition_locator = PartitionLocator.of(
299
+ destination_table_stream.locator,
300
+ partition_values_param,
301
+ None,
302
+ )
303
+ num_workers, worker_instance_cpu = 1, 1
304
+ total_cpus = num_workers * worker_instance_cpu
305
+ pgm = None
306
+ if create_placement_group_param:
307
+ pgm = PlacementGroupManager(
308
+ 1, total_cpus, worker_instance_cpu, memory_per_bundle=4000000
309
+ ).pgs[0]
310
+ compact_partition_params = CompactPartitionParams.of(
311
+ {
312
+ "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
313
+ "compacted_file_content_type": ContentType.PARQUET,
314
+ "dd_max_parallelism_ratio": 1.0,
315
+ "deltacat_storage": ds,
316
+ "deltacat_storage_kwargs": ds_mock_kwargs,
317
+ "destination_partition_locator": destination_partition_locator,
318
+ "hash_bucket_count": hash_bucket_count_param,
319
+ "last_stream_position_to_compact": source_partition.stream_position,
320
+ "list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
321
+ "pg_config": pgm,
322
+ "primary_keys": primary_keys_param,
323
+ "rebase_source_partition_locator": rebase_source_partition_locator_param,
324
+ "records_per_compacted_file": records_per_compacted_file_param,
325
+ "s3_client_kwargs": {},
326
+ "source_partition_locator": source_partition.locator,
327
+ "sort_keys": sort_keys if sort_keys else None,
328
+ }
329
+ )
330
+ # execute
331
+ rcf_file_s3_uri = compact_partition_func(compact_partition_params)
332
+ # validate
333
+ _, rcf_object_key = rcf_file_s3_uri.rsplit("/", 1)
334
+ rcf_file_output: Dict[str, Any] = read_s3_contents(
335
+ setup_s3_resource, TEST_S3_RCF_BUCKET_NAME, rcf_object_key
336
+ )
337
+ round_completion_info = RoundCompletionInfo(**rcf_file_output)
338
+ print(f"rcf_file_output: {json.dumps(rcf_file_output, indent=2)}")
339
+ compacted_delta_locator = round_completion_info.compacted_delta_locator
340
+ tables = ds.download_delta(compacted_delta_locator, **ds_mock_kwargs)
341
+ compacted_table = pa.concat_tables(tables)
342
+
343
+ # the compacted table may contain multiple files and chunks
344
+ # and order of records may be incorrect due to multiple files.
345
+ expected_result = expected_result.combine_chunks().sort_by(
346
+ [(val, "ascending") for val in primary_keys_param]
347
+ )
348
+ compacted_table = compacted_table.combine_chunks().sort_by(
349
+ [(val, "ascending") for val in primary_keys_param]
350
+ )
351
+
352
+ assert compacted_table.equals(
353
+ expected_result
354
+ ), f"{compacted_table} does not match {expected_result}"
355
+ if (
356
+ validation_callback_func is not None
357
+ and validation_callback_func_kwargs is not None
358
+ ):
359
+ validation_callback_func(**validation_callback_func_kwargs)
360
+ # https://docs.pytest.org/en/7.1.x/reference/reference.html#pytest.FixtureRequest.getfixturevalue
361
+ if do_teardown_local_deltacat_storage_db:
362
+ request.getfixturevalue("teardown_local_deltacat_storage_db")
363
+ return