deltacat 1.1.8__py3-none-any.whl → 1.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +6 -0
  3. deltacat/aws/redshift/model/manifest.py +16 -0
  4. deltacat/aws/s3u.py +65 -38
  5. deltacat/compute/compactor/compaction_session.py +5 -1
  6. deltacat/compute/compactor/model/compact_partition_params.py +12 -1
  7. deltacat/compute/compactor/model/materialize_result.py +0 -4
  8. deltacat/compute/compactor/repartition_session.py +1 -0
  9. deltacat/compute/compactor/utils/round_completion_file.py +39 -9
  10. deltacat/compute/compactor_v2/compaction_session.py +26 -16
  11. deltacat/compute/compactor_v2/constants.py +5 -11
  12. deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
  13. deltacat/compute/compactor_v2/model/merge_input.py +6 -0
  14. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
  15. deltacat/compute/compactor_v2/steps/merge.py +12 -12
  16. deltacat/compute/compactor_v2/utils/merge.py +1 -0
  17. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  18. deltacat/compute/compactor_v2/utils/task_options.py +2 -12
  19. deltacat/exceptions.py +342 -7
  20. deltacat/io/dataset.py +5 -17
  21. deltacat/io/memcached_object_store.py +7 -4
  22. deltacat/storage/__init__.py +24 -0
  23. deltacat/storage/interface.py +56 -6
  24. deltacat/storage/model/delta.py +23 -3
  25. deltacat/storage/model/partition.py +6 -7
  26. deltacat/storage/model/partition_spec.py +71 -0
  27. deltacat/storage/model/stream.py +38 -1
  28. deltacat/storage/model/transform.py +127 -0
  29. deltacat/tests/aws/test_s3u.py +2 -0
  30. deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
  31. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
  32. deltacat/tests/compute/compact_partition_test_cases.py +4 -2
  33. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
  34. deltacat/tests/compute/compactor_v2/test_compaction_session.py +204 -37
  35. deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
  36. deltacat/tests/compute/test_util_common.py +19 -4
  37. deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
  38. deltacat/tests/io/test_memcached_object_store.py +5 -2
  39. deltacat/tests/local_deltacat_storage/__init__.py +124 -29
  40. deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
  41. deltacat/tests/test_exceptions.py +100 -0
  42. deltacat/tests/test_logs.py +1 -0
  43. deltacat/tests/test_utils/pyarrow.py +4 -1
  44. deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
  45. deltacat/tests/utils/test_daft.py +0 -1
  46. deltacat/tests/utils/test_resources.py +0 -28
  47. deltacat/utils/daft.py +3 -0
  48. deltacat/utils/numpy.py +3 -3
  49. deltacat/utils/pandas.py +3 -3
  50. deltacat/utils/pyarrow.py +11 -8
  51. deltacat/utils/ray_utils/dataset.py +7 -7
  52. deltacat/utils/ray_utils/runtime.py +2 -2
  53. deltacat/utils/resources.py +0 -45
  54. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/METADATA +6 -5
  55. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/RECORD +58 -51
  56. deltacat/io/aws/redshift/redshift_datasource.py +0 -578
  57. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
  58. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
  59. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,289 @@
1
+ import ray
2
+ import os
3
+ from moto import mock_s3
4
+ import pytest
5
+ import boto3
6
+ from boto3.resources.base import ServiceResource
7
+ import pyarrow as pa
8
+ from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
9
+ from pytest_benchmark.fixture import BenchmarkFixture
10
+
11
+ from deltacat.tests.compute.test_util_constant import (
12
+ TEST_S3_RCF_BUCKET_NAME,
13
+ DEFAULT_NUM_WORKERS,
14
+ DEFAULT_WORKER_INSTANCE_CPUS,
15
+ )
16
+ from deltacat.compute.compactor.model.compactor_version import CompactorVersion
17
+ from deltacat.tests.compute.test_util_common import (
18
+ get_compacted_delta_locator_from_rcf,
19
+ )
20
+ from deltacat.tests.compute.test_util_create_table_deltas_repo import (
21
+ create_src_w_deltas_destination_rebase_w_deltas_strategy,
22
+ )
23
+ from deltacat.tests.compute.compact_partition_rebase_test_cases import (
24
+ REBASE_TEST_CASES,
25
+ )
26
+ from typing import Any, Callable, Dict, List, Optional, Set
27
+ from deltacat.types.media import StorageType
28
+ from deltacat.storage import (
29
+ DeltaLocator,
30
+ Partition,
31
+ )
32
+ from deltacat.types.media import ContentType
33
+ from deltacat.compute.compactor.model.compact_partition_params import (
34
+ CompactPartitionParams,
35
+ )
36
+ from deltacat.utils.placement import (
37
+ PlacementGroupManager,
38
+ )
39
+
40
# Keyword-argument name and file path used to build the local deltacat
# storage kwargs in this test module.
DATABASE_FILE_PATH_KEY = "db_file_path"
DATABASE_FILE_PATH_VALUE = "deltacat/tests/local_deltacat_storage/db_test.sqlite"
44
+
45
+
46
+ """
47
+ MODULE scoped fixtures
48
+ """
49
+
50
+
51
@pytest.fixture(scope="module", autouse=True)
def setup_ray_cluster():
    """Initialise Ray in local mode for this module; shut it down at module teardown."""
    # ignore_reinit_error lets the fixture run even if Ray was already initialised.
    ray.init(ignore_reinit_error=True, local_mode=True)
    yield
    ray.shutdown()
56
+
57
+
58
@pytest.fixture(autouse=True, scope="module")
def mock_aws_credential():
    """Export placeholder AWS credentials and a default region for this module's tests.

    Every credential variable is set to the dummy value ``"testing"``.
    The variables are not restored after the module finishes.
    """
    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
    # The secret-key variable is AWS_SECRET_ACCESS_KEY; the previously used
    # "AWS_SECRET_ACCESS_ID" is not a variable boto3/botocore reads.
    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
    os.environ["AWS_SECURITY_TOKEN"] = "testing"
    os.environ["AWS_SESSION_TOKEN"] = "testing"
    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
    yield
66
+
67
+
68
@pytest.fixture(autouse=True, scope="module")
def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
    """Delete the local storage database file before and after this module's tests.

    Without a ``yield`` the removal only ran at fixture setup, i.e. before
    the tests. The setup-time removal is kept and a teardown removal is added
    so the file is also deleted once the module's tests have completed.
    """

    def _remove_database_file():
        # Remove the database file if it is present; do nothing otherwise.
        if os.path.exists(DATABASE_FILE_PATH_VALUE):
            os.remove(DATABASE_FILE_PATH_VALUE)

    # Pre-existing behaviour: drop any file that is already on disk at setup.
    _remove_database_file()
    yield
    # make sure the database file is deleted after all the compactor package tests are completed
    _remove_database_file()
73
+
74
+
75
@pytest.fixture(scope="module")
def s3_resource(mock_aws_credential):
    """Yield a boto3 S3 resource created inside an active ``mock_s3`` context.

    Requesting ``mock_aws_credential`` ensures that fixture has run first.
    """
    with mock_s3():
        mocked_resource = boto3.resource("s3")
        yield mocked_resource
79
+
80
+
81
@pytest.fixture(autouse=True, scope="module")
def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
    """Create the bucket named by TEST_S3_RCF_BUCKET_NAME once per module."""
    bucket_settings = {
        "ACL": "authenticated-read",
        "Bucket": TEST_S3_RCF_BUCKET_NAME,
    }
    s3_resource.create_bucket(**bucket_settings)
    yield
88
+
89
+
90
+ """
91
+ FUNCTION scoped fixtures
92
+ """
93
+
94
+
95
@pytest.fixture(scope="function")
def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
    """Yield the local deltacat storage kwargs, then delete the database file.

    The yielded dict maps DATABASE_FILE_PATH_KEY to DATABASE_FILE_PATH_VALUE.
    After the test finishes, the database file is removed if it exists.
    """
    # see deltacat/tests/local_deltacat_storage/README.md for documentation
    storage_kwargs: Dict[str, Any] = {}
    storage_kwargs[DATABASE_FILE_PATH_KEY] = DATABASE_FILE_PATH_VALUE
    yield storage_kwargs
    database_file_present = os.path.exists(DATABASE_FILE_PATH_VALUE)
    if database_file_present:
        os.remove(DATABASE_FILE_PATH_VALUE)
104
+
105
+
106
# NOTE(review): the argnames list places "read_kwargs_provider_param" before
# "drop_duplicates_param", while the value tuples below place the local named
# `drop_duplicates_param` before `read_kwargs_provider`. Values are bound to
# argnames by position, so which pair of names is mislabeled depends on the
# field order of REBASE_TEST_CASES entries, which is not visible here.
# The original ordering is preserved unchanged — TODO confirm.
@pytest.mark.parametrize(
    [
        "test_name",
        "primary_keys",
        "sort_keys",
        "partition_keys_param",
        "partition_values_param",
        "input_deltas_param",
        "input_deltas_delta_type",
        "expected_terminal_compact_partition_result",
        "expected_terminal_exception",
        "expected_terminal_exception_message",
        "create_placement_group_param",
        "records_per_compacted_file_param",
        "hash_bucket_count_param",
        "read_kwargs_provider_param",
        "drop_duplicates_param",
        "skip_enabled_compact_partition_drivers",
        "rebase_expected_compact_partition_result",
        "compact_partition_func",
    ],
    [
        (
            test_name,
            primary_keys,
            sort_keys,
            partition_keys_param,
            partition_values_param,
            input_deltas,
            input_deltas_delta_type,
            expected_terminal_compact_partition_result,
            expected_terminal_exception,
            expected_terminal_exception_message,
            create_placement_group_param,
            records_per_compacted_file_param,
            hash_bucket_count_param,
            drop_duplicates_param,
            read_kwargs_provider,
            skip_enabled_compact_partition_drivers,
            rebase_expected_compact_partition_result,
            compact_partition_func,
        )
        for test_name, (
            primary_keys,
            sort_keys,
            partition_keys_param,
            partition_values_param,
            input_deltas,
            input_deltas_delta_type,
            expected_terminal_compact_partition_result,
            expected_terminal_exception,
            expected_terminal_exception_message,
            create_placement_group_param,
            records_per_compacted_file_param,
            hash_bucket_count_param,
            drop_duplicates_param,
            read_kwargs_provider,
            skip_enabled_compact_partition_drivers,
            rebase_expected_compact_partition_result,
            compact_partition_func,
        ) in REBASE_TEST_CASES.items()
    ],
    ids=list(REBASE_TEST_CASES),
)
def test_compact_partition_rebase_same_source_and_destination(
    mocker,
    s3_resource: ServiceResource,
    local_deltacat_storage_kwargs: Dict[str, Any],
    test_name: str,
    primary_keys: Set[str],
    sort_keys: List[Optional[Any]],
    partition_keys_param: Optional[List[Any]],
    partition_values_param: List[Optional[str]],
    input_deltas_param: List[pa.Array],
    input_deltas_delta_type: str,
    expected_terminal_compact_partition_result: pa.Table,
    expected_terminal_exception: BaseException,
    expected_terminal_exception_message: Optional[str],
    create_placement_group_param: bool,
    records_per_compacted_file_param: int,
    hash_bucket_count_param: int,
    drop_duplicates_param: bool,
    read_kwargs_provider_param: Any,
    rebase_expected_compact_partition_result: pa.Table,
    skip_enabled_compact_partition_drivers: List[CompactorVersion],
    compact_partition_func: Callable,
    benchmark: BenchmarkFixture,
):
    """
    This test tests the scenario where source partition locator == destination partition locator,
    but rebase source partition locator is different.
    This scenario could occur when hash bucket count changes.
    """
    import deltacat.tests.local_deltacat_storage as ds

    ds_mock_kwargs = local_deltacat_storage_kwargs
    partition_keys = partition_keys_param

    # setup: create the source and rebased tables; the destination stream
    # returned in the middle position is not used by this test.
    (
        source_table_stream,
        _,
        rebased_table_stream,
    ) = create_src_w_deltas_destination_rebase_w_deltas_strategy(
        primary_keys,
        sort_keys,
        partition_keys,
        input_deltas_param,
        input_deltas_delta_type,
        partition_values_param,
        ds_mock_kwargs,
    )
    source_partition: Partition = ds.get_partition(
        source_table_stream.locator,
        partition_values_param,
        **ds_mock_kwargs,
    )
    rebased_partition: Partition = ds.get_partition(
        rebased_table_stream.locator,
        partition_values_param,
        **ds_mock_kwargs,
    )
    num_workers, worker_instance_cpu = DEFAULT_NUM_WORKERS, DEFAULT_WORKER_INSTANCE_CPUS
    total_cpus = num_workers * worker_instance_cpu
    pgm = None
    # NOTE(review): the parametrized value is overridden here, so the branch
    # below never runs and `pgm` is always None — confirm this is intentional.
    create_placement_group_param = False
    if create_placement_group_param:
        pgm = PlacementGroupManager(
            1, total_cpus, worker_instance_cpu, memory_per_bundle=4000000
        ).pgs[0]
    # Source and destination are both the rebased partition; the rebase source
    # is the original source partition.
    compact_partition_params = CompactPartitionParams.of(
        {
            "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
            "compacted_file_content_type": ContentType.PARQUET,
            "dd_max_parallelism_ratio": 1.0,
            "deltacat_storage": ds,
            "deltacat_storage_kwargs": ds_mock_kwargs,
            "destination_partition_locator": rebased_partition.locator,
            "hash_bucket_count": hash_bucket_count_param,
            "last_stream_position_to_compact": source_partition.stream_position,
            "list_deltas_kwargs": {**ds_mock_kwargs, "equivalent_table_types": []},
            "object_store": RayPlasmaObjectStore(),
            "pg_config": pgm,
            "primary_keys": primary_keys,
            "read_kwargs_provider": read_kwargs_provider_param,
            "rebase_source_partition_locator": source_partition.locator,
            "records_per_compacted_file": records_per_compacted_file_param,
            "s3_client_kwargs": {},
            "source_partition_locator": rebased_partition.locator,
            "sort_keys": sort_keys or None,
        }
    )

    from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
        ExecutionCompactionResult,
    )

    # Spy on the constructor so the arguments it receives can be inspected.
    execute_compaction_result_spy = mocker.spy(ExecutionCompactionResult, "__init__")

    # execute
    rcf_file_s3_uri = compact_partition_func(compact_partition_params)

    # Assert not in-place compacted
    assert (
        execute_compaction_result_spy.call_args.args[-1] is False
    ), "Table version erroneously marked as in-place compacted!"
    compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
        s3_resource, rcf_file_s3_uri
    )
    tables = ds.download_delta(
        compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
    )
    actual_rebase_compacted_table = pa.concat_tables(tables)
    # if no primary key is specified then sort by sort_key for consistent assertion
    sorting_cols: List[Any] = (
        [(val, "ascending") for val in primary_keys] if primary_keys else sort_keys
    )
    rebase_expected_compact_partition_result = (
        rebase_expected_compact_partition_result.combine_chunks().sort_by(sorting_cols)
    )
    actual_rebase_compacted_table = (
        actual_rebase_compacted_table.combine_chunks().sort_by(sorting_cols)
    )
    assert actual_rebase_compacted_table.equals(
        rebase_expected_compact_partition_result
    ), f"{actual_rebase_compacted_table} does not match {rebase_expected_compact_partition_result}"
@@ -24,6 +24,12 @@ from deltacat.compute.compactor import (
24
24
  RoundCompletionInfo,
25
25
  )
26
26
 
27
+ from deltacat.storage.model.partition import PartitionLocator
28
+ from deltacat.storage.model.stream import StreamLocator
29
+ from deltacat.storage.model.table_version import TableVersionLocator
30
+ from deltacat.storage.model.table import TableLocator
31
+ from deltacat.storage.model.namespace import NamespaceLocator
32
+
27
33
 
28
34
  class PartitionKeyType(str, Enum):
29
35
  INT = "int"
@@ -51,6 +57,18 @@ UTILS
51
57
  """
52
58
 
53
59
 
60
def get_test_partition_locator(partition_id):
    """Build a PartitionLocator for ``partition_id`` under fixed test identifiers.

    The locator chain uses namespace "default", table "test_table", table
    version "1", and a stream created with "test_stream_id" and "local".
    The partition values are an empty list.
    """
    namespace_locator = NamespaceLocator.of("default")
    table_locator = TableLocator.of(namespace_locator, "test_table")
    table_version_locator = TableVersionLocator.of(table_locator, "1")
    stream_locator = StreamLocator.of(table_version_locator, "test_stream_id", "local")
    return PartitionLocator.of(
        stream_locator, partition_id=partition_id, partition_values=[]
    )
70
+
71
+
54
72
  def _create_table(
55
73
  namespace: str,
56
74
  table_name: str,
@@ -140,7 +158,7 @@ def create_rebase_table(
140
158
  def get_rcf(s3_resource, rcf_file_s3_uri: str) -> RoundCompletionInfo:
141
159
  from deltacat.tests.test_utils.utils import read_s3_contents
142
160
 
143
- _, rcf_object_key = rcf_file_s3_uri.rsplit("/", 1)
161
+ _, rcf_object_key = rcf_file_s3_uri.strip("s3://").split("/", 1)
144
162
  rcf_file_output: Dict[str, Any] = read_s3_contents(
145
163
  s3_resource, TEST_S3_RCF_BUCKET_NAME, rcf_object_key
146
164
  )
@@ -151,9 +169,6 @@ def get_compacted_delta_locator_from_rcf(
151
169
  s3_resource: ServiceResource, rcf_file_s3_uri: str
152
170
  ):
153
171
  from deltacat.storage import DeltaLocator
154
- from deltacat.compute.compactor import (
155
- RoundCompletionInfo,
156
- )
157
172
 
158
173
  round_completion_info: RoundCompletionInfo = get_rcf(s3_resource, rcf_file_s3_uri)
159
174
 
@@ -251,6 +251,7 @@ def create_src_w_deltas_destination_rebase_w_deltas_strategy(
251
251
  ds.commit_partition(staged_partition, **ds_mock_kwargs)
252
252
 
253
253
  # get streams
254
+ # TODO: Add deltas to destination stream
254
255
  destination_table_stream: Stream = ds.get_stream(
255
256
  namespace=destination_table_namespace,
256
257
  table_name=destination_table_name,
@@ -1,6 +1,9 @@
1
1
  import unittest
2
2
  import numpy as np
3
3
  from unittest import mock
4
+ from deltacat.exceptions import (
5
+ PymemcachedPutObjectError,
6
+ )
4
7
 
5
8
 
6
9
  class MockPyMemcacheClient:
@@ -86,7 +89,7 @@ class TestMemcachedObjectStore(unittest.TestCase):
86
89
  mock_retrying_client.return_value = mock_client.return_value
87
90
  mock_client.return_value.set_many.return_value = ["abcd"]
88
91
 
89
- with self.assertRaises(RuntimeError):
92
+ with self.assertRaises(PymemcachedPutObjectError):
90
93
  self.object_store.put_many(["a", "b"])
91
94
 
92
95
  self.assertEqual(1, mock_client.return_value.set_many.call_count)
@@ -169,7 +172,7 @@ class TestMemcachedObjectStore(unittest.TestCase):
169
172
  mock_client.return_value.set.return_value = False
170
173
  mock_retrying_client.return_value = mock_client.return_value
171
174
 
172
- with self.assertRaises(RuntimeError):
175
+ with self.assertRaises(PymemcachedPutObjectError):
173
176
  self.object_store.put("test_ip")
174
177
 
175
178
  self.assertEqual(1, mock_client.return_value.set.call_count)