deltacat 1.1.9__py3-none-any.whl → 1.1.11__py3-none-any.whl
This diff shows the content of publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- deltacat/__init__.py +1 -1
- deltacat/aws/redshift/model/manifest.py +16 -0
- deltacat/aws/s3u.py +19 -13
- deltacat/compute/compactor/compaction_session.py +5 -1
- deltacat/compute/compactor/repartition_session.py +1 -0
- deltacat/compute/compactor/utils/round_completion_file.py +39 -9
- deltacat/compute/compactor_v2/compaction_session.py +15 -11
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +1 -1
- deltacat/exceptions.py +5 -2
- deltacat/io/dataset.py +5 -17
- deltacat/storage/__init__.py +24 -0
- deltacat/storage/interface.py +42 -6
- deltacat/storage/model/delta.py +23 -3
- deltacat/storage/model/partition.py +6 -7
- deltacat/storage/model/partition_spec.py +71 -0
- deltacat/storage/model/stream.py +38 -1
- deltacat/storage/model/transform.py +127 -0
- deltacat/tests/aws/test_s3u.py +2 -0
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +231 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +201 -36
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
- deltacat/tests/compute/test_util_common.py +19 -4
- deltacat/tests/local_deltacat_storage/__init__.py +83 -19
- deltacat/tests/test_utils/pyarrow.py +4 -1
- deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
- deltacat/utils/numpy.py +3 -3
- deltacat/utils/pandas.py +3 -3
- deltacat/utils/pyarrow.py +3 -3
- deltacat/utils/ray_utils/dataset.py +7 -7
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/METADATA +6 -5
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/RECORD +36 -33
- deltacat/io/aws/redshift/redshift_datasource.py +0 -578
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/LICENSE +0 -0
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/WHEEL +0 -0
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/top_level.txt +0 -0
deltacat/tests/compute/compactor_v2/test_compaction_session.py

@@ -1,8 +1,12 @@
-import
-import sqlite3
+from typing import Dict, Any
 import ray
 import os
-
+import pytest
+import boto3
+from deltacat.compute.compactor.model.compaction_session_audit_info import (
+    CompactionSessionAuditInfo,
+)
+from boto3.resources.base import ServiceResource
 import deltacat.tests.local_deltacat_storage as ds
 from deltacat.types.media import ContentType
 from deltacat.compute.compactor_v2.compaction_session import (
@@ -11,80 +15,241 @@ from deltacat.compute.compactor_v2.compaction_session import (
 from deltacat.compute.compactor.model.compact_partition_params import (
     CompactPartitionParams,
 )
-from deltacat.utils
-from deltacat.tests.
+from deltacat.tests.test_utils.utils import read_s3_contents
+from deltacat.tests.compute.test_util_constant import (
+    TEST_S3_RCF_BUCKET_NAME,
+)
+from deltacat.tests.compute.test_util_common import get_rcf
+from deltacat.tests.test_utils.pyarrow import (
+    stage_partition_from_file_paths,
+    commit_delta_to_staged_partition,
+    commit_delta_to_partition,
+)
+from moto import mock_s3
+
+DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
+    "db_file_path",
+    "deltacat/tests/local_deltacat_storage/db_test.sqlite",
+)
+
+
+@pytest.fixture(autouse=True, scope="module")
+def setup_ray_cluster():
+    ray.init(local_mode=True, ignore_reinit_error=True)
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture(autouse=True, scope="module")
+def mock_aws_credential():
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
+    os.environ["AWS_SECURITY_TOKEN"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+    yield
+
+
+@pytest.fixture(scope="module")
+def s3_resource(mock_aws_credential):
+    with mock_s3():
+        yield boto3.resource("s3")
 
 
-
+@pytest.fixture(autouse=True, scope="module")
+def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
+    s3_resource.create_bucket(
+        ACL="authenticated-read",
+        Bucket=TEST_S3_RCF_BUCKET_NAME,
+    )
+    yield
+
+
+@pytest.fixture(scope="function")
+def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
+    kwargs_for_local_deltacat_storage: Dict[str, Any] = {
+        DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
+    }
+    yield kwargs_for_local_deltacat_storage
+    if os.path.exists(DATABASE_FILE_PATH_VALUE):
+        os.remove(DATABASE_FILE_PATH_VALUE)
+
+
+class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
     """
 
-    DB_FILE_PATH = f"{current_time_ms()}.db"
     NAMESPACE = "compact_partition_v2_namespace"
+    BACKFILL_FILE_PATH = (
+        "deltacat/tests/compute/compactor_v2/data/backfill_source_date_pk.csv"
+    )
+    INCREMENTAL_FILE_PATH = (
+        "deltacat/tests/compute/compactor_v2/data/incremental_source_date_pk.csv"
+    )
 
-
-
-
+    def test_compact_partition_when_no_input_deltas_to_compact(
+        self, local_deltacat_storage_kwargs
+    ):
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["test"], **local_deltacat_storage_kwargs
+        )
+        source_partition = ds.commit_partition(
+            staged_source, **local_deltacat_storage_kwargs
+        )
 
-
-
-
-
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
 
-
+        # action
+        rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 2,
+                    "last_stream_position_to_compact": source_partition.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": None,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_partition.locator,
+                }
+            )
+        )
 
-
-
-
-
-
+        # verify that no RCF is written
+        assert rcf_url is None
+
+    def test_compact_partition_when_rcf_was_written_by_past_commit(
+        self, s3_resource, local_deltacat_storage_kwargs
+    ):
+        """
+        Backward compatibility test for when a RCF was written by a previous commit.
+        """
 
-    @patch("deltacat.compute.compactor_v2.compaction_session.rcf")
-    @patch("deltacat.compute.compactor_v2.compaction_session.s3_utils")
-    def test_compact_partition_when_no_input_deltas_to_compact(self, s3_utils, rcf_url):
         # setup
-        rcf_url.read_round_completion_file.return_value = None
         staged_source = stage_partition_from_file_paths(
-            self.NAMESPACE, ["
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
         )
-
-
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
         )
 
         staged_dest = stage_partition_from_file_paths(
-            self.NAMESPACE, ["destination"], **
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
         )
         dest_partition = ds.commit_partition(
-            staged_dest, **
+            staged_dest, **local_deltacat_storage_kwargs
         )
 
         # action
         rcf_url = compact_partition(
             CompactPartitionParams.of(
                 {
-                    "compaction_artifact_s3_bucket":
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
                     "compacted_file_content_type": ContentType.PARQUET,
                     "dd_max_parallelism_ratio": 1.0,
                     "deltacat_storage": ds,
-                    "deltacat_storage_kwargs":
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
                     "destination_partition_locator": dest_partition.locator,
                     "drop_duplicates": True,
                     "hash_bucket_count": 1,
-                    "last_stream_position_to_compact":
+                    "last_stream_position_to_compact": source_delta.stream_position,
                     "list_deltas_kwargs": {
-                        **
+                        **local_deltacat_storage_kwargs,
                         **{"equivalent_table_types": []},
                     },
                     "primary_keys": [],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                }
+            )
+        )
+
+        bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
+        assert bucket == TEST_S3_RCF_BUCKET_NAME
+
+        # Now delete the RCF at new location and copy it to old location
+        # Copy the RCF from rcf_url to another location
+        s3_resource.Object(TEST_S3_RCF_BUCKET_NAME, f"{backfill_key1}.json").copy_from(
+            CopySource=f"{TEST_S3_RCF_BUCKET_NAME}/{backfill_key1}/{backfill_key2}"
+        )
+
+        s3_resource.Object(
+            TEST_S3_RCF_BUCKET_NAME, f"{backfill_key1}/{backfill_key2}"
+        ).delete()
+
+        # Now run an incremental compaction and verify if the previous RCF was read properly.
+
+        new_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            [self.INCREMENTAL_FILE_PATH],
+            **local_deltacat_storage_kwargs,
+        )
+
+        new_rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": new_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
                     "rebase_source_partition_locator": None,
                     "rebase_source_partition_high_watermark": None,
                     "records_per_compacted_file": 4000,
                     "s3_client_kwargs": {},
-                    "source_partition_locator":
+                    "source_partition_locator": new_source_delta.partition_locator,
                 }
             )
         )
 
-
-
+        new_bucket, incremental_key1, incremental_key2 = new_rcf_url.strip(
+            "s3://"
+        ).split("/")
+
+        assert new_bucket == TEST_S3_RCF_BUCKET_NAME
+        assert backfill_key1 == incremental_key1
+        assert backfill_key2 != incremental_key2
+
+        rcf = get_rcf(s3_resource, new_rcf_url)
+
+        _, compaction_audit_key = rcf.compaction_audit_url.strip("s3://").split("/", 1)
+        compaction_audit = CompactionSessionAuditInfo(
+            **read_s3_contents(
+                s3_resource, TEST_S3_RCF_BUCKET_NAME, compaction_audit_key
+            )
+        )
+
+        # as it should be running incremental
+        assert compaction_audit.uniform_deltas_created == 1
+        assert compaction_audit.input_records == 6
deltacat/tests/compute/test_compact_partition_rebase.py

@@ -254,7 +254,7 @@ def test_compact_partition_rebase_same_source_and_destination(
         }
     )
 
-    from deltacat.compute.compactor_v2.model.
+    from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
         ExecutionCompactionResult,
     )
 
deltacat/tests/compute/test_util_common.py

@@ -24,6 +24,12 @@ from deltacat.compute.compactor import (
     RoundCompletionInfo,
 )
 
+from deltacat.storage.model.partition import PartitionLocator
+from deltacat.storage.model.stream import StreamLocator
+from deltacat.storage.model.table_version import TableVersionLocator
+from deltacat.storage.model.table import TableLocator
+from deltacat.storage.model.namespace import NamespaceLocator
+
 
 class PartitionKeyType(str, Enum):
     INT = "int"
@@ -51,6 +57,18 @@ UTILS
 """
 
 
+def get_test_partition_locator(partition_id):
+    tv_locator = TableVersionLocator.of(
+        TableLocator.of(NamespaceLocator.of("default"), "test_table"), "1"
+    )
+    stream_locator = StreamLocator.of(tv_locator, "test_stream_id", "local")
+    partition_locator = PartitionLocator.of(
+        stream_locator, partition_id=partition_id, partition_values=[]
+    )
+
+    return partition_locator
+
+
 def _create_table(
     namespace: str,
     table_name: str,
@@ -140,7 +158,7 @@ def create_rebase_table(
 def get_rcf(s3_resource, rcf_file_s3_uri: str) -> RoundCompletionInfo:
     from deltacat.tests.test_utils.utils import read_s3_contents
 
-    _, rcf_object_key = rcf_file_s3_uri.
+    _, rcf_object_key = rcf_file_s3_uri.strip("s3://").split("/", 1)
     rcf_file_output: Dict[str, Any] = read_s3_contents(
         s3_resource, TEST_S3_RCF_BUCKET_NAME, rcf_object_key
     )
@@ -151,9 +169,6 @@ def get_compacted_delta_locator_from_rcf(
     s3_resource: ServiceResource, rcf_file_s3_uri: str
 ):
     from deltacat.storage import DeltaLocator
-    from deltacat.compute.compactor import (
-        RoundCompletionInfo,
-    )
 
     round_completion_info: RoundCompletionInfo = get_rcf(s3_resource, rcf_file_s3_uri)
 
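The new `get_test_partition_locator` helper above composes the full locator chain (namespace → table → table version → stream → partition) for a fixed test table. A minimal usage sketch, assuming the helper is imported from the test utilities module it was added to; the partition id is a hypothetical value:

```python
# Usage sketch for the new test helper (hypothetical partition id).
from deltacat.tests.compute.test_util_common import get_test_partition_locator

# Chains NamespaceLocator("default") -> TableLocator("test_table")
# -> TableVersionLocator("1") -> StreamLocator("test_stream_id", "local")
# -> PartitionLocator with the given partition id and empty partition values.
partition_locator = get_test_partition_locator("my-test-partition-id")
```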
deltacat/tests/local_deltacat_storage/__init__.py

@@ -41,6 +41,12 @@ from deltacat.storage import (
     ManifestEntry,
     ManifestEntryList,
     DeleteParameters,
+    PartitionFilter,
+    PartitionValues,
+    DeltaPartitionSpec,
+    StreamPartitionSpec,
+    TransformName,
+    IdentityTransformParameters,
 )
 from deltacat.types.media import (
     ContentType,
@@ -194,12 +200,13 @@ def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partit
 def list_deltas(
     namespace: str,
     table_name: str,
-    partition_values: Optional[
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     first_stream_position: Optional[int] = None,
     last_stream_position: Optional[int] = None,
     ascending_order: Optional[bool] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> ListResult[Delta]:
@@ -207,6 +214,13 @@ def list_deltas(
     if stream is None:
         return ListResult.of([], None, None)
 
+    if partition_values is not None and partition_filter is not None:
+        raise ValueError(
+            "Only one of partition_values or partition_filter must be provided"
+        )
+    if partition_filter is not None:
+        partition_values = partition_filter.partition_values
+
     partition = get_partition(stream.locator, partition_values, *args, **kwargs)
 
     all_deltas = list_partition_deltas(
@@ -297,15 +311,25 @@ def get_delta(
     namespace: str,
     table_name: str,
     stream_position: int,
-    partition_values: Optional[
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> Optional[Delta]:
     cur, con = _get_sqlite3_cursor_con(kwargs)
 
     stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
+
+    if partition_values is not None and partition_filter is not None:
+        raise ValueError(
+            "Only one of partition_values or partition_filter must be provided"
+        )
+
+    if partition_filter is not None:
+        partition_values = partition_filter.partition_values
+
     partition = get_partition(stream.locator, partition_values, *args, **kwargs)
     delta_locator = DeltaLocator.of(partition.locator, stream_position)
 
@@ -328,22 +352,24 @@ def get_delta(
 def get_latest_delta(
     namespace: str,
     table_name: str,
-    partition_values: Optional[
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> Optional[Delta]:
 
     deltas = list_deltas(
-        namespace,
-        table_name,
-        partition_values,
-        table_version,
-        None,
-        None,
-        False,
-        include_manifest,
+        namespace=namespace,
+        table_name=table_name,
+        partition_values=partition_values,
+        table_version=table_version,
+        first_stream_position=None,
+        last_stream_position=None,
+        ascending_order=False,
+        include_manifest=include_manifest,
+        partition_filter=partition_filter,
         *args,
         **kwargs,
     ).all_items()
@@ -363,13 +389,24 @@ def download_delta(
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
     distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
     result = []
     manifest = get_delta_manifest(delta_like, *args, **kwargs)
 
+    partition_values: PartitionValues = None
+    if partition_filter is not None:
+        partition_values = partition_filter.partition_values
+
     for entry_index in range(len(manifest.entries)):
+        if (
+            partition_values is not None
+            and partition_values != manifest.entries[entry_index].meta.partition_values
+        ):
+            continue
+
         result.append(
             download_delta_manifest_entry(
                 delta_like=delta_like,
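The storage changes above add a `partition_filter` argument next to the existing `partition_values` argument on `list_deltas`, `get_delta`, `get_latest_delta`, and `download_delta`: the two are mutually exclusive, and when a filter is given its `partition_values` are used. A standalone sketch of that resolution logic, using illustrative names rather than deltacat's actual classes:

```python
from typing import Any, List, Optional


def resolve_partition_values(
    partition_values: Optional[List[Any]] = None,
    partition_filter: Optional[Any] = None,
) -> Optional[List[Any]]:
    # Mirrors the validation added above: callers may pass either
    # partition_values or a partition_filter, but not both.
    if partition_values is not None and partition_filter is not None:
        raise ValueError(
            "Only one of partition_values or partition_filter must be provided"
        )
    if partition_filter is not None:
        # A partition filter carries the partition values to match on.
        return partition_filter.partition_values
    return partition_values
```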
@@ -524,11 +561,29 @@ def create_table_version(
     table_description: Optional[str] = None,
     table_properties: Optional[Dict[str, str]] = None,
     supported_content_types: Optional[List[ContentType]] = None,
+    partition_spec: Optional[StreamPartitionSpec] = None,
     *args,
     **kwargs,
 ) -> Stream:
     cur, con = _get_sqlite3_cursor_con(kwargs)
 
+    if partition_keys is not None and partition_spec is not None:
+        raise ValueError(
+            "Only one of partition_keys or partition_spec must be provided"
+        )
+    if partition_spec is not None:
+        assert (
+            partition_spec.ordered_transforms is not None
+        ), "Ordered transforms must be specified when partition_spec is specified"
+        partition_keys = []
+        for transform in partition_spec.ordered_transforms:
+            assert transform.name == TransformName.IDENTITY, (
+                "Local DeltaCAT storage does not support creating table versions "
+                "with non identity transform partition spec"
+            )
+            transform_params: IdentityTransformParameters = transform.parameters
+            partition_keys.append(transform_params.column_name)
+
     latest_version = get_latest_table_version(namespace, table_name, *args, **kwargs)
     if (
         table_version is not None
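In `create_table_version`, a `StreamPartitionSpec` may now be passed instead of `partition_keys`, but this local (SQLite-backed) test storage only accepts identity transforms and flattens them back into plain partition key names. A standalone sketch of that conversion, using illustrative stand-in types rather than deltacat's transform model:

```python
from dataclasses import dataclass
from typing import List


@dataclass
class IdentityTransform:
    # Illustrative stand-in for one entry of a partition spec's
    # ordered_transforms; deltacat's real transform classes differ.
    name: str
    column_name: str


def partition_keys_from_spec(ordered_transforms: List[IdentityTransform]) -> List[str]:
    # Mirrors the loop added to create_table_version above: every transform
    # must be an identity transform, and its column name becomes a partition
    # key, preserving the spec's ordering.
    keys: List[str] = []
    for transform in ordered_transforms:
        if transform.name != "identity":
            raise ValueError(
                "Local DeltaCAT storage only supports identity transforms"
            )
        keys.append(transform.column_name)
    return keys


# Example: a spec that partitions by region, then by day.
print(partition_keys_from_spec(
    [IdentityTransform("identity", "region"), IdentityTransform("identity", "day")]
))  # ['region', 'day']
```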
@@ -776,7 +831,7 @@ def delete_stream(
 
 
 def stage_partition(
-    stream: Stream, partition_values: Optional[
+    stream: Stream, partition_values: Optional[PartitionValues] = None, *args, **kwargs
 ) -> Partition:
     cur, con = _get_sqlite3_cursor_con(kwargs)
     partition_id = uuid.uuid4().__str__()
@@ -877,7 +932,7 @@ def delete_partition(
     namespace: str,
     table_name: str,
     table_version: Optional[str] = None,
-    partition_values: Optional[
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs,
 ) -> None:
@@ -894,7 +949,7 @@ def delete_partition(
 
 def get_partition(
     stream_locator: StreamLocator,
-    partition_values: Optional[
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs,
 ) -> Optional[Partition]:
@@ -935,12 +990,14 @@ def stage_delta(
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     content_type: ContentType = ContentType.PARQUET,
     delete_parameters: Optional[DeleteParameters] = None,
+    partition_spec: Optional[DeltaPartitionSpec] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs,
 ) -> Delta:
     cur, con = _get_sqlite3_cursor_con(kwargs)
-
-    uri = _get_manifest_entry_uri(
+    manifest_id = uuid.uuid4().__str__()
+    uri = _get_manifest_entry_uri(manifest_id)
 
     if data is None:
         delta = create_empty_delta(
@@ -948,7 +1005,7 @@ def stage_delta(
             delta_type,
             author,
             properties=properties,
-            manifest_entry_id=
+            manifest_entry_id=manifest_id,
         )
         cur.execute("INSERT OR IGNORE INTO data VALUES (?, ?)", (uri, None))
         params = (delta.locator.canonical_string(), "staged_delta", json.dumps(delta))
@@ -956,6 +1013,12 @@ def stage_delta(
         con.commit()
         return delta
 
+    if partition_spec:
+        assert partition_values is not None, (
+            "partition_values must be provided as local "
+            "storage does not support computing it from input data"
+        )
+
     serialized_data = None
     if content_type == ContentType.PARQUET:
         buffer = io.BytesIO()
@@ -980,18 +1043,19 @@ def stage_delta(
         content_type=content_type,
         content_encoding=ContentEncoding.IDENTITY,
         source_content_length=data.nbytes,
+        partition_values=partition_values,
     )
 
     manifest = Manifest.of(
         entries=ManifestEntryList.of(
             [
                 ManifestEntry.of(
-                    uri=uri, url=uri, meta=meta, mandatory=True, uuid=
+                    uri=uri, url=uri, meta=meta, mandatory=True, uuid=manifest_id
                 )
             ]
         ),
         author=author,
-        uuid=
+        uuid=manifest_id,
     )
 
     delta = Delta.of(
deltacat/tests/test_utils/pyarrow.py

@@ -66,7 +66,10 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
 
 
 def commit_delta_to_partition(
-    partition: Partition,
+    partition: Union[Partition, PartitionLocator],
+    file_paths: List[str],
+    *args,
+    **kwargs,
 ) -> Delta:
     tables = []
 
deltacat/tests/utils/ray_utils/test_dataset.py (new file)

@@ -0,0 +1,66 @@
+from ray.data import from_items
+from typing import Any
+import pytest
+import fsspec
+from fsspec import AbstractFileSystem
+from ray.data.datasource import FilenameProvider
+from deltacat.types.media import ContentType
+import ray
+
+
+class TestDatasetToFile:
+
+    BASE_PATH = "/tmp"
+    SUB_PATH = "abcd"
+
+    @pytest.fixture(autouse=True, scope="module")
+    def ensure_ray_down(self):
+        # ray.data fails when ray is instantiated in local mode
+        ray.shutdown()
+
+    @pytest.fixture(scope="module")
+    def mock_dataset(self):
+        return from_items([{"col1": i, "col2": i * 2} for i in range(1000)])
+
+    @pytest.fixture(scope="module")
+    def mock_filename_provider(self):
+        class MockFilenameProvider(FilenameProvider):
+            def get_filename_for_block(
+                self, block: Any, task_index: int, block_index: int
+            ) -> str:
+                return TestDatasetToFile.SUB_PATH
+
+        return MockFilenameProvider()
+
+    def test_parquet_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("local")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            file_system=fs,
+            block_path_provider=mock_filename_provider,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+        fs.delete(file_expected_at)
+
+    def test_csv_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("local")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            file_system=fs,
+            block_path_provider=mock_filename_provider,
+            content_type=ContentType.CSV.value,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+        fs.delete(file_expected_at)