deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/clients.py +17 -6
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/compute/compactor/compaction_session.py +42 -18
  6. deltacat/compute/compactor/model/compact_partition_params.py +287 -58
  7. deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
  8. deltacat/compute/compactor/model/delta_annotated.py +91 -9
  9. deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
  10. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  11. deltacat/compute/compactor/repartition_session.py +2 -1
  12. deltacat/compute/compactor/steps/dedupe.py +9 -6
  13. deltacat/compute/compactor/steps/hash_bucket.py +24 -3
  14. deltacat/compute/compactor/steps/materialize.py +11 -6
  15. deltacat/compute/compactor/steps/repartition.py +16 -1
  16. deltacat/compute/compactor/utils/io.py +40 -23
  17. deltacat/compute/compactor/utils/sort_key.py +5 -0
  18. deltacat/compute/compactor/utils/system_columns.py +43 -0
  19. deltacat/compute/compactor_v2/compaction_session.py +506 -0
  20. deltacat/compute/compactor_v2/constants.py +34 -0
  21. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  22. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  23. deltacat/compute/compactor_v2/model/merge_input.py +127 -0
  24. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  25. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  26. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  27. deltacat/compute/compactor_v2/steps/merge.py +41 -0
  28. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
  30. deltacat/compute/compactor_v2/utils/io.py +149 -0
  31. deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
  32. deltacat/compute/compactor_v2/utils/task_options.py +228 -0
  33. deltacat/compute/metastats/meta_stats.py +4 -2
  34. deltacat/compute/metastats/stats.py +1 -0
  35. deltacat/compute/metastats/utils/io.py +4 -0
  36. deltacat/compute/stats/utils/io.py +20 -5
  37. deltacat/exceptions.py +4 -0
  38. deltacat/io/memcached_object_store.py +37 -14
  39. deltacat/logs.py +4 -3
  40. deltacat/storage/interface.py +8 -1
  41. deltacat/storage/model/types.py +2 -1
  42. deltacat/tests/aws/test_clients.py +16 -3
  43. deltacat/tests/compute/__init__.py +0 -0
  44. deltacat/tests/compute/common.py +96 -0
  45. deltacat/tests/compute/compactor/__init__.py +0 -0
  46. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  47. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
  48. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  49. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  50. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  51. deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  52. deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
  53. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  54. deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
  55. deltacat/tests/compute/testcases.py +390 -0
  56. deltacat/tests/io/test_memcached_object_store.py +5 -4
  57. deltacat/tests/local_deltacat_storage/__init__.py +62 -19
  58. deltacat/tests/test_utils/pyarrow.py +32 -0
  59. deltacat/tests/test_utils/utils.py +13 -0
  60. deltacat/tests/utils/data/__init__.py +0 -0
  61. deltacat/tests/utils/test_daft.py +76 -0
  62. deltacat/tests/utils/test_pyarrow.py +133 -0
  63. deltacat/tests/utils/test_resources.py +23 -20
  64. deltacat/types/media.py +1 -0
  65. deltacat/types/partial_download.py +82 -0
  66. deltacat/types/tables.py +1 -0
  67. deltacat/utils/arguments.py +26 -0
  68. deltacat/utils/daft.py +87 -0
  69. deltacat/utils/placement.py +20 -3
  70. deltacat/utils/pyarrow.py +213 -1
  71. deltacat/utils/ray_utils/concurrency.py +26 -1
  72. deltacat/utils/resources.py +72 -1
  73. deltacat/utils/s3fs.py +21 -0
  74. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +17 -3
  75. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/RECORD +80 -47
  76. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
  77. /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
  78. /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
  79. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
  80. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
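The list above shows that the bulk of this release is the new deltacat/compute/compactor_v2 package (a v2 compaction session plus hash-bucket and merge steps, their input/result models, and supporting utilities), along with new tests under deltacat/tests/compute. As a rough orientation only, the sketch below mirrors how the new test_hash_bucket.py test (reproduced further down in this diff) drives the v2 hash-bucket step against the local SQLite-backed test storage. Every call shown is taken from that test rather than from any compactor_v2 documentation, and the names "example.db" and "example.csv" are placeholders.

# Minimal sketch of the compactor_v2 hash-bucket step, mirroring the new
# deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py shown below.
# "example.db" and "example.csv" are illustrative placeholders, not shipped files.
import ray
import sqlite3

import deltacat.tests.local_deltacat_storage as ds
from deltacat.compute.compactor import DeltaAnnotated
from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
from deltacat.compute.compactor_v2.steps.hash_bucket import hash_bucket
from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file

ray.init(local_mode=True, ignore_reinit_error=True)

# Local, SQLite-backed test storage, set up the same way as the test's setUpClass.
con = sqlite3.connect("example.db")
storage_kwargs = {ds.SQLITE_CON_ARG: con, ds.SQLITE_CUR_ARG: con.cursor()}

# Stage a small CSV-backed delta, then hash-bucket it into 3 buckets across 2 groups.
delta = create_delta_from_csv_file("example_namespace", ["example.csv"], **storage_kwargs)
hb_input = HashBucketInput.of(
    annotated_delta=DeltaAnnotated.of(delta),
    primary_keys=["pk"],
    num_hash_buckets=3,
    num_hash_groups=2,
    deltacat_storage=ds,
    deltacat_storage_kwargs={ds.DB_FILE_PATH_ARG: "example.db"},
    object_store=RayPlasmaObjectStore(),
)
hb_result: HashBucketResult = ray.get(hash_bucket.remote(hb_input))
print(hb_result.hb_record_count)

The new test_compaction_session_incremental.py test at the end of this diff shows the corresponding end-to-end path, building CompactPartitionParams and validating the resulting round completion info.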
deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py (new file, entry 52 above)
@@ -0,0 +1,199 @@
+import unittest
+import sqlite3
+import ray
+import os
+from collections import defaultdict
+from deltacat.compute.compactor import DeltaAnnotated
+import deltacat.tests.local_deltacat_storage as ds
+from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
+from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
+from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
+from deltacat.compute.compactor_v2.steps.hash_bucket import hash_bucket
+from deltacat.utils.common import current_time_ms
+from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file
+
+
+class TestHashBucket(unittest.TestCase):
+    HASH_BUCKET_NAMESPACE = "test_hash_bucket"
+    DB_FILE_PATH = f"{current_time_ms()}.db"
+    STRING_PK_FILE_PATH = (
+        "deltacat/tests/compute/compactor_v2/steps/data/string_pk_table.csv"
+    )
+    DATE_PK_FILE_PATH = (
+        "deltacat/tests/compute/compactor_v2/steps/data/date_pk_table.csv"
+    )
+    MULTIPLE_PK_FILE_PATH = (
+        "deltacat/tests/compute/compactor_v2/steps/data/multiple_pk_table.csv"
+    )
+    NO_PK_FILE_PATH = "deltacat/tests/compute/compactor_v2/steps/data/no_pk_table.csv"
+
+    @classmethod
+    def setUpClass(cls):
+        ray.init(local_mode=True, ignore_reinit_error=True)
+
+        con = sqlite3.connect(cls.DB_FILE_PATH)
+        cur = con.cursor()
+        cls.kwargs = {ds.SQLITE_CON_ARG: con, ds.SQLITE_CUR_ARG: cur}
+        cls.deltacat_storage_kwargs = {ds.DB_FILE_PATH_ARG: cls.DB_FILE_PATH}
+
+        super().setUpClass()
+
+    @classmethod
+    def doClassCleanups(cls) -> None:
+        os.remove(cls.DB_FILE_PATH)
+
+    def test_single_string_pk_correctly_hashes(self):
+        # setup
+        delta = create_delta_from_csv_file(
+            self.HASH_BUCKET_NAMESPACE, [self.STRING_PK_FILE_PATH], **self.kwargs
+        )
+
+        annotated_delta = DeltaAnnotated.of(delta)
+        object_store = RayPlasmaObjectStore()
+        hb_input = HashBucketInput.of(
+            annotated_delta=annotated_delta,
+            primary_keys=["pk"],
+            num_hash_buckets=3,
+            num_hash_groups=2,
+            deltacat_storage=ds,
+            deltacat_storage_kwargs=self.deltacat_storage_kwargs,
+            object_store=object_store,
+        )
+
+        # action
+        hb_result_promise = hash_bucket.remote(hb_input)
+        hb_result: HashBucketResult = ray.get(hb_result_promise)
+
+        # assert
+        # PK hash column is also persisted.
+        self._validate_hash_bucket_result(
+            hb_result,
+            record_count=6,
+            num_hash_buckets=3,
+            num_columns=3,
+            object_store=object_store,
+        )
+
+    def test_single_date_pk_correctly_hashes(self):
+        # setup
+        delta = create_delta_from_csv_file(
+            self.HASH_BUCKET_NAMESPACE, [self.DATE_PK_FILE_PATH], **self.kwargs
+        )
+
+        annotated_delta = DeltaAnnotated.of(delta)
+        object_store = RayPlasmaObjectStore()
+        hb_input = HashBucketInput.of(
+            annotated_delta=annotated_delta,
+            primary_keys=["pk"],
+            num_hash_buckets=2,
+            num_hash_groups=1,
+            deltacat_storage=ds,
+            deltacat_storage_kwargs=self.deltacat_storage_kwargs,
+            object_store=object_store,
+        )
+
+        # action
+        hb_result_promise = hash_bucket.remote(hb_input)
+        hb_result: HashBucketResult = ray.get(hb_result_promise)
+
+        # assert
+        self._validate_hash_bucket_result(
+            hb_result,
+            record_count=7,
+            num_hash_buckets=2,
+            num_columns=3,
+            object_store=object_store,
+        )
+
+    def test_no_pk_does_not_hash(self):
+        # setup
+        delta = create_delta_from_csv_file(
+            self.HASH_BUCKET_NAMESPACE, [self.NO_PK_FILE_PATH], **self.kwargs
+        )
+
+        annotated_delta = DeltaAnnotated.of(delta)
+        object_store = RayPlasmaObjectStore()
+        hb_input = HashBucketInput.of(
+            annotated_delta=annotated_delta,
+            primary_keys=[],
+            num_hash_buckets=2,
+            num_hash_groups=1,
+            deltacat_storage=ds,
+            deltacat_storage_kwargs=self.deltacat_storage_kwargs,
+            object_store=object_store,
+        )
+
+        # action
+        hb_result_promise = hash_bucket.remote(hb_input)
+        hb_result: HashBucketResult = ray.get(hb_result_promise)
+
+        # assert
+        self._validate_hash_bucket_result(
+            hb_result,
+            record_count=6,
+            num_hash_buckets=2,
+            num_columns=3,
+            object_store=object_store,
+        )
+
+    def test_multiple_pk_correctly_hashes(self):
+        # setup
+        delta = create_delta_from_csv_file(
+            self.HASH_BUCKET_NAMESPACE, [self.MULTIPLE_PK_FILE_PATH], **self.kwargs
+        )
+
+        annotated_delta = DeltaAnnotated.of(delta)
+        object_store = RayPlasmaObjectStore()
+        hb_input = HashBucketInput.of(
+            annotated_delta=annotated_delta,
+            primary_keys=["pk1", "pk2"],
+            num_hash_buckets=2,
+            num_hash_groups=1,
+            deltacat_storage=ds,
+            deltacat_storage_kwargs=self.deltacat_storage_kwargs,
+            object_store=object_store,
+        )
+
+        # action
+        hb_result_promise = hash_bucket.remote(hb_input)
+        hb_result: HashBucketResult = ray.get(hb_result_promise)
+
+        # assert
+        self._validate_hash_bucket_result(
+            hb_result,
+            record_count=6,
+            num_hash_buckets=2,
+            num_columns=4,
+            object_store=object_store,
+        )
+
+    def _validate_hash_bucket_result(
+        self,
+        hb_result: HashBucketResult,
+        record_count: int,
+        num_hash_buckets: int,
+        num_columns: int,
+        object_store,
+    ):
+
+        self.assertEqual(hb_result.hb_record_count, record_count)
+        self.assertIsNotNone(hb_result)
+        self.assertIsNotNone(hb_result.peak_memory_usage_bytes)
+        self.assertIsNotNone(hb_result.task_completed_at)
+        self.assertIsNotNone(hb_result.telemetry_time_in_seconds)
+
+        hb_index_to_dfes = defaultdict(list)
+        total_records_in_result = 0
+        for _, object_id in enumerate(hb_result.hash_bucket_group_to_obj_id_tuple):
+            if object_id:
+                obj = object_store.get(object_id[0])
+                for hb_idx, dfes in enumerate(obj):
+                    if dfes is not None:
+                        hb_index_to_dfes[hb_idx].extend(dfes)
+                        for dfe in dfes:
+                            self.assertIsNotNone(dfe)
+                            total_records_in_result += len(dfe.table)
+                            self.assertEqual(num_columns, len(dfe.table.column_names))
+
+        self.assertTrue(len(hb_index_to_dfes) <= num_hash_buckets)
+        self.assertEqual(total_records_in_result, record_count)
deltacat/tests/compute/test_compact_partition_params.py (renamed from deltacat/tests/compactor/test_compact_partition_params.py, entry 53 above)
@@ -27,7 +27,7 @@ class TestCompactPartitionParams(unittest.TestCase):
             "partitionValues": [],
             "partitionId": None,
         },
-        "hash_bucket_count": None,
+        "hash_bucket_count": 200,
         "last_stream_position_to_compact": 168000000000,
         "list_deltas_kwargs": {"equivalent_table_types": []},
         "primary_keys": {"id"},
@@ -75,41 +75,16 @@ class TestCompactPartitionParams(unittest.TestCase):
 
         super().setUpClass()
 
-    def test_destination_partition_locator_is_optional(self):
-        from deltacat.compute.compactor.model.compact_partition_params import (
-            CompactPartitionParams,
-        )
-
-        params = CompactPartitionParams.of({})
-        assert params.destination_partition_locator is None
-
     def test_serialize_returns_json_string(self):
         from deltacat.compute.compactor.model.compact_partition_params import (
             CompactPartitionParams,
         )
 
         params = CompactPartitionParams.of(
-            {"destination_partition_locator": "my-partition"}
+            TestCompactPartitionParams.VALID_COMPACT_PARTITION_PARAMS
         )
         serialized_params = params.serialize()
         assert isinstance(serialized_params, str)
-        assert json.loads(serialized_params) == {
-            "compacted_file_content_type": None,
-            "compaction_artifact_s3_bucket": None,
-            "deltacat_storage": None,
-            "hash_bucket_count": None,
-            "last_stream_position_to_compact": None,
-            "list_deltas_kwargs": None,
-            "pg_config": None,
-            "primary_keys": None,
-            "properties": None,
-            "read_kwargs_provider": None,
-            "rebase_source_partition_high_watermark": None,
-            "rebase_source_partition_locator": None,
-            "s3_table_writer_kwargs": None,
-            "source_partition_locator": None,
-            "destination_partition_locator": "my-partition",
-        }
 
     def test_serialize_returns_json_string_with_all_fields(self):
         from deltacat.compute.compactor.model.compact_partition_params import (
@@ -142,7 +117,6 @@ class TestCompactPartitionParams(unittest.TestCase):
             == params.list_deltas_kwargs
         )
         assert json.loads(serialized_params)["primary_keys"] == params.primary_keys
-        assert json.loads(serialized_params)["properties"] == params.properties
         assert (
             json.loads(serialized_params)["rebase_source_partition_high_watermark"]
             == params.rebase_source_partition_high_watermark
@@ -165,7 +139,12 @@ class TestCompactPartitionParams(unittest.TestCase):
             CompactPartitionParams,
         )
 
-        params = CompactPartitionParams.of({"primary_keys": {"foo", "bar", "baz"}})
+        params = CompactPartitionParams.of(
+            {
+                **TestCompactPartitionParams.VALID_COMPACT_PARTITION_PARAMS,
+                "primary_keys": {"foo", "bar", "baz"},
+            }
+        )
         serialized_params = params.serialize()
         self.assertCountEqual(
             json.loads(serialized_params)["primary_keys"], ["foo", "bar", "baz"]
@@ -180,7 +159,12 @@ class TestCompactPartitionParams(unittest.TestCase):
             def toJSON(self) -> str:
                 return "my-json-object"
 
-        params = CompactPartitionParams.of({"compacted_file_content_type": MyObject()})
+        params = CompactPartitionParams.of(
+            {
+                **TestCompactPartitionParams.VALID_COMPACT_PARTITION_PARAMS,
+                "compacted_file_content_type": MyObject(),
+            }
+        )
         serialized_params = params.serialize()
         assert (
             json.loads(serialized_params)["compacted_file_content_type"]
deltacat/tests/compute/test_compaction_session_incremental.py (new file, entry 54 above)
@@ -0,0 +1,348 @@
+import ray
+from moto import mock_s3
+import pytest
+import os
+import json
+import boto3
+from typing import Any, Dict, List, Optional, Set
+from boto3.resources.base import ServiceResource
+import pyarrow as pa
+from deltacat.tests.test_utils.utils import read_s3_contents
+from deltacat.tests.compute.common import (
+    setup_sort_and_partition_keys,
+    PartitionKey,
+    TEST_S3_RCF_BUCKET_NAME,
+    BASE_TEST_SOURCE_NAMESPACE,
+    BASE_TEST_SOURCE_TABLE_NAME,
+    BASE_TEST_DESTINATION_NAMESPACE,
+    BASE_TEST_DESTINATION_TABLE_NAME,
+)
+from deltacat.tests.compute.testcases import (
+    INCREMENTAL_TEST_CASES,
+)
+
+DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
+    "db_file_path",
+    "deltacat/tests/local_deltacat_storage/db_test.sqlite",
+)
+
+"""
+MODULE scoped fixtures
+"""
+
+
+@pytest.fixture(autouse=True, scope="module")
+def mock_aws_credential():
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
+    os.environ["AWS_SECURITY_TOKEN"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+    yield
+
+
+@pytest.fixture(autouse=True, scope="module")
+def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
+    # make sure the database file is deleted after all the compactor package tests are completed
+    if os.path.exists(DATABASE_FILE_PATH_VALUE):
+        os.remove(DATABASE_FILE_PATH_VALUE)
+
+
+@pytest.fixture(scope="module")
+def setup_s3_resource(mock_aws_credential):
+    with mock_s3():
+        yield boto3.resource("s3")
+
+
+@pytest.fixture(scope="module")
+def setup_compaction_artifacts_s3_bucket(setup_s3_resource: ServiceResource):
+    setup_s3_resource.create_bucket(
+        ACL="authenticated-read",
+        Bucket=TEST_S3_RCF_BUCKET_NAME,
+    )
+    yield
+
+
+"""
+FUNCTION scoped fixtures
+"""
+
+
+@pytest.fixture(scope="function")
+def teardown_local_deltacat_storage_db():
+    if os.path.exists(DATABASE_FILE_PATH_VALUE):
+        os.remove(DATABASE_FILE_PATH_VALUE)
+
+
+@pytest.fixture(scope="function")
+def setup_local_deltacat_storage_conn():
+    # see deltacat/tests/local_deltacat_storage/README.md for documentation
+    kwargs_for_local_deltacat_storage: Dict[str, Any] = {
+        DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
+    }
+    yield kwargs_for_local_deltacat_storage
+
+
+def setup_incremental_source_and_destination_tables(
+    source_table_version: str,
+    destination_table_version: str,
+    primary_keys: Set[str],
+    sort_keys: Optional[List[Any]],
+    partition_keys: Optional[List[PartitionKey]],
+    column_names: List[str],
+    arrow_arrays: List[pa.Array],
+    partition_values: Optional[List[Any]],
+    ds_mock_kwargs: Optional[Dict[str, Any]],
+    source_namespace: str = BASE_TEST_SOURCE_NAMESPACE,
+    source_table_name: str = BASE_TEST_SOURCE_TABLE_NAME,
+    destination_namespace: str = BASE_TEST_DESTINATION_NAMESPACE,
+    destination_table_name: str = BASE_TEST_DESTINATION_TABLE_NAME,
+):
+    import deltacat.tests.local_deltacat_storage as ds
+    from deltacat.types.media import ContentType
+    from deltacat.storage import Partition, Stream
+
+    ds.create_namespace(source_namespace, {}, **ds_mock_kwargs)
+    ds.create_table_version(
+        source_namespace,
+        source_table_name,
+        source_table_version,
+        primary_key_column_names=list(primary_keys),
+        sort_keys=sort_keys,
+        partition_keys=partition_keys,
+        supported_content_types=[ContentType.PARQUET],
+        **ds_mock_kwargs,
+    )
+    source_table_stream: Stream = ds.get_stream(
+        namespace=source_namespace,
+        table_name=source_table_name,
+        table_version=source_table_version,
+        **ds_mock_kwargs,
+    )
+    test_table: pa.Table = pa.Table.from_arrays(arrow_arrays, names=column_names)
+    staged_partition: Partition = ds.stage_partition(
+        source_table_stream, partition_values, **ds_mock_kwargs
+    )
+    ds.commit_delta(
+        ds.stage_delta(test_table, staged_partition, **ds_mock_kwargs), **ds_mock_kwargs
+    )
+    ds.commit_partition(staged_partition, **ds_mock_kwargs)
+    # create the destination table
+    ds.create_namespace(destination_namespace, {}, **ds_mock_kwargs)
+    ds.create_table_version(
+        destination_namespace,
+        destination_table_name,
+        destination_table_version,
+        primary_key_column_names=list(primary_keys),
+        sort_keys=sort_keys,
+        partition_keys=partition_keys,
+        supported_content_types=[ContentType.PARQUET],
+        **ds_mock_kwargs,
+    )
+    destination_table_stream: Stream = ds.get_stream(
+        namespace=destination_namespace,
+        table_name=destination_table_name,
+        table_version=destination_table_version,
+        **ds_mock_kwargs,
+    )
+    source_table_stream_after_committed: Stream = ds.get_stream(
+        namespace=source_namespace,
+        table_name=source_table_name,
+        table_version=source_table_version,
+        **ds_mock_kwargs,
+    )
+    return source_table_stream_after_committed, destination_table_stream
+
+
+@pytest.mark.parametrize(
+    [
+        "test_name",
+        "source_table_version",
+        "destination_table_version",
+        "primary_keys_param",
+        "sort_keys_param",
+        "partition_keys_param",
+        "column_names_param",
+        "arrow_arrays_param",
+        "rebase_source_partition_locator_param",
+        "partition_values_param",
+        "expected_result",
+        "validation_callback_func",
+        "validation_callback_func_kwargs",
+        "do_teardown_local_deltacat_storage_db",
+        "use_prev_compacted",
+        "create_placement_group_param",
+        "records_per_compacted_file_param",
+        "hash_bucket_count_param",
+        "compact_partition_func",
+    ],
+    [
+        (
+            test_name,
+            source_table_version,
+            destination_table_version,
+            primary_keys_param,
+            sort_keys_param,
+            partition_keys_param,
+            column_names_param,
+            arrow_arrays_param,
+            rebase_source_partition_locator_param,
+            partition_values_param,
+            expected_result,
+            validation_callback_func,
+            validation_callback_func_kwargs,
+            do_teardown_local_deltacat_storage_db,
+            use_prev_compacted,
+            create_placement_group_param,
+            records_per_compacted_file_param,
+            hash_bucket_count_param,
+            compact_partition_func,
+        )
+        for test_name, (
+            source_table_version,
+            destination_table_version,
+            primary_keys_param,
+            sort_keys_param,
+            partition_keys_param,
+            column_names_param,
+            arrow_arrays_param,
+            rebase_source_partition_locator_param,
+            partition_values_param,
+            expected_result,
+            validation_callback_func,
+            validation_callback_func_kwargs,
+            do_teardown_local_deltacat_storage_db,
+            use_prev_compacted,
+            create_placement_group_param,
+            records_per_compacted_file_param,
+            hash_bucket_count_param,
+            compact_partition_func,
+        ) in INCREMENTAL_TEST_CASES.items()
+    ],
+    ids=[test_name for test_name in INCREMENTAL_TEST_CASES],
+    indirect=[],
+)
+def test_compact_partition_incremental(
+    request: pytest.FixtureRequest,
+    setup_s3_resource: ServiceResource,
+    setup_local_deltacat_storage_conn: Dict[str, Any],
+    setup_compaction_artifacts_s3_bucket: None,
+    test_name: str,
+    source_table_version: str,
+    destination_table_version: str,
+    primary_keys_param: Set[str],
+    sort_keys_param,
+    partition_keys_param,
+    column_names_param: List[str],
+    arrow_arrays_param: List[pa.Array],
+    rebase_source_partition_locator_param,
+    partition_values_param,
+    expected_result,
+    validation_callback_func,  # use and implement func and func_kwargs if you want to run additional validations apart from the ones in the test
+    validation_callback_func_kwargs,
+    do_teardown_local_deltacat_storage_db,
+    use_prev_compacted,
+    create_placement_group_param,
+    records_per_compacted_file_param,
+    hash_bucket_count_param,
+    compact_partition_func,
+):
+    import deltacat.tests.local_deltacat_storage as ds
+    from deltacat.types.media import ContentType
+    from deltacat.storage import (
+        PartitionLocator,
+    )
+    from deltacat.compute.compactor.model.compact_partition_params import (
+        CompactPartitionParams,
+    )
+    from deltacat.utils.placement import (
+        PlacementGroupManager,
+    )
+    from deltacat.compute.compactor import (
+        RoundCompletionInfo,
+    )
+
+    ds_mock_kwargs = setup_local_deltacat_storage_conn
+
+    # setup
+    sort_keys, partition_keys = setup_sort_and_partition_keys(
+        sort_keys_param, partition_keys_param
+    )
+    (
+        source_table_stream,
+        destination_table_stream,
+    ) = setup_incremental_source_and_destination_tables(
+        source_table_version,
+        destination_table_version,
+        primary_keys_param,
+        sort_keys,
+        partition_keys,
+        column_names_param,
+        arrow_arrays_param,
+        partition_values_param,
+        ds_mock_kwargs,
+    )
+    ray.shutdown()
+    ray.init(local_mode=True)
+    assert ray.is_initialized()
+    source_partition = ds.get_partition(
+        source_table_stream.locator,
+        partition_values_param,
+        **ds_mock_kwargs,
+    )
+    destination_partition_locator = PartitionLocator.of(
+        destination_table_stream.locator,
+        partition_values_param,
+        None,
+    )
+    num_workers, worker_instance_cpu = 1, 1
+    total_cpus = num_workers * worker_instance_cpu
+    pgm = None
+    if create_placement_group_param:
+        pgm = PlacementGroupManager(
+            1, total_cpus, worker_instance_cpu, memory_per_bundle=4000000
+        ).pgs[0]
+    compact_partition_params = CompactPartitionParams.of(
+        {
+            "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+            "compacted_file_content_type": ContentType.PARQUET,
+            "dd_max_parallelism_ratio": 1.0,
+            "deltacat_storage": ds,
+            "deltacat_storage_kwargs": ds_mock_kwargs,
+            "destination_partition_locator": destination_partition_locator,
+            "hash_bucket_count": hash_bucket_count_param,
+            "last_stream_position_to_compact": source_partition.stream_position,
+            "list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
+            "pg_config": pgm,
+            "primary_keys": primary_keys_param,
+            "rebase_source_partition_locator": rebase_source_partition_locator_param,
+            "records_per_compacted_file": records_per_compacted_file_param,
+            "s3_client_kwargs": {},
+            "source_partition_locator": source_partition.locator,
+            "sort_keys": sort_keys if sort_keys else None,
+        }
+    )
+    # execute
+    rcf_file_s3_uri = compact_partition_func(compact_partition_params)
+    # validate
+    _, rcf_object_key = rcf_file_s3_uri.rsplit("/", 1)
+    rcf_file_output: Dict[str, Any] = read_s3_contents(
+        setup_s3_resource, TEST_S3_RCF_BUCKET_NAME, rcf_object_key
+    )
+    round_completion_info = RoundCompletionInfo(**rcf_file_output)
+    print(f"rcf_file_output: {json.dumps(rcf_file_output, indent=2)}")
+    compacted_delta_locator = round_completion_info.compacted_delta_locator
+    tables = ds.download_delta(compacted_delta_locator, **ds_mock_kwargs)
+    compacted_table = pa.concat_tables(tables)
+    assert compacted_table.equals(
+        expected_result
+    ), f"{compacted_table} does not match {expected_result}"
+    if (
+        validation_callback_func is not None
+        and validation_callback_func_kwargs is not None
+    ):
+        validation_callback_func(**validation_callback_func_kwargs)
+    # https://docs.pytest.org/en/7.1.x/reference/reference.html#pytest.FixtureRequest.getfixturevalue
+    if do_teardown_local_deltacat_storage_db:
+        request.getfixturevalue("teardown_local_deltacat_storage_db")
+    return