deltacat-1.1.9-py3-none-any.whl → deltacat-1.1.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/redshift/model/manifest.py +16 -0
  3. deltacat/aws/s3u.py +19 -13
  4. deltacat/compute/compactor/compaction_session.py +5 -1
  5. deltacat/compute/compactor/repartition_session.py +1 -0
  6. deltacat/compute/compactor/utils/round_completion_file.py +39 -9
  7. deltacat/compute/compactor_v2/compaction_session.py +15 -11
  8. deltacat/compute/compactor_v2/constants.py +3 -0
  9. deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
  10. deltacat/compute/compactor_v2/utils/primary_key_index.py +1 -1
  11. deltacat/exceptions.py +5 -2
  12. deltacat/io/dataset.py +5 -17
  13. deltacat/storage/__init__.py +24 -0
  14. deltacat/storage/interface.py +42 -6
  15. deltacat/storage/model/delta.py +23 -3
  16. deltacat/storage/model/partition.py +6 -7
  17. deltacat/storage/model/partition_spec.py +71 -0
  18. deltacat/storage/model/stream.py +38 -1
  19. deltacat/storage/model/transform.py +127 -0
  20. deltacat/tests/aws/test_s3u.py +2 -0
  21. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +231 -0
  22. deltacat/tests/compute/compactor_v2/test_compaction_session.py +201 -36
  23. deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
  24. deltacat/tests/compute/test_util_common.py +19 -4
  25. deltacat/tests/local_deltacat_storage/__init__.py +83 -19
  26. deltacat/tests/test_utils/pyarrow.py +4 -1
  27. deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
  28. deltacat/utils/numpy.py +3 -3
  29. deltacat/utils/pandas.py +3 -3
  30. deltacat/utils/pyarrow.py +3 -3
  31. deltacat/utils/ray_utils/dataset.py +7 -7
  32. {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/METADATA +6 -5
  33. {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/RECORD +36 -33
  34. deltacat/io/aws/redshift/redshift_datasource.py +0 -578
  35. {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/LICENSE +0 -0
  36. {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/WHEEL +0 -0
  37. {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/top_level.txt +0 -0
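The deltacat/__init__.py and METADATA entries above carry the 1.1.9 → 1.1.11 version bump. As a quick sanity check (not part of the diff; standard library only), the installed wheel can be confirmed like this:

from importlib.metadata import version

# Expect "1.1.11" once the new wheel is installed.
print(version("deltacat"))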
deltacat/tests/compute/compactor_v2/test_compaction_session.py
@@ -1,8 +1,12 @@
-import unittest
-import sqlite3
+from typing import Dict, Any
 import ray
 import os
-from unittest.mock import patch
+import pytest
+import boto3
+from deltacat.compute.compactor.model.compaction_session_audit_info import (
+    CompactionSessionAuditInfo,
+)
+from boto3.resources.base import ServiceResource
 import deltacat.tests.local_deltacat_storage as ds
 from deltacat.types.media import ContentType
 from deltacat.compute.compactor_v2.compaction_session import (
@@ -11,80 +15,241 @@ from deltacat.compute.compactor_v2.compaction_session import (
 from deltacat.compute.compactor.model.compact_partition_params import (
     CompactPartitionParams,
 )
-from deltacat.utils.common import current_time_ms
-from deltacat.tests.test_utils.pyarrow import stage_partition_from_file_paths
+from deltacat.tests.test_utils.utils import read_s3_contents
+from deltacat.tests.compute.test_util_constant import (
+    TEST_S3_RCF_BUCKET_NAME,
+)
+from deltacat.tests.compute.test_util_common import get_rcf
+from deltacat.tests.test_utils.pyarrow import (
+    stage_partition_from_file_paths,
+    commit_delta_to_staged_partition,
+    commit_delta_to_partition,
+)
+from moto import mock_s3
+
+DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
+    "db_file_path",
+    "deltacat/tests/local_deltacat_storage/db_test.sqlite",
+)
+
+
+@pytest.fixture(autouse=True, scope="module")
+def setup_ray_cluster():
+    ray.init(local_mode=True, ignore_reinit_error=True)
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture(autouse=True, scope="module")
+def mock_aws_credential():
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
+    os.environ["AWS_SECURITY_TOKEN"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+    yield
+
+
+@pytest.fixture(scope="module")
+def s3_resource(mock_aws_credential):
+    with mock_s3():
+        yield boto3.resource("s3")
 
 
-class TestCompactionSession(unittest.TestCase):
+@pytest.fixture(autouse=True, scope="module")
+def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
+    s3_resource.create_bucket(
+        ACL="authenticated-read",
+        Bucket=TEST_S3_RCF_BUCKET_NAME,
+    )
+    yield
+
+
+@pytest.fixture(scope="function")
+def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
+    kwargs_for_local_deltacat_storage: Dict[str, Any] = {
+        DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
+    }
+    yield kwargs_for_local_deltacat_storage
+    if os.path.exists(DATABASE_FILE_PATH_VALUE):
+        os.remove(DATABASE_FILE_PATH_VALUE)
+
+
+class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
     """
 
-    DB_FILE_PATH = f"{current_time_ms()}.db"
     NAMESPACE = "compact_partition_v2_namespace"
+    BACKFILL_FILE_PATH = (
+        "deltacat/tests/compute/compactor_v2/data/backfill_source_date_pk.csv"
+    )
+    INCREMENTAL_FILE_PATH = (
+        "deltacat/tests/compute/compactor_v2/data/incremental_source_date_pk.csv"
+    )
 
-    @classmethod
-    def setUpClass(cls):
-        ray.init(local_mode=True, ignore_reinit_error=True)
+    def test_compact_partition_when_no_input_deltas_to_compact(
+        self, local_deltacat_storage_kwargs
+    ):
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["test"], **local_deltacat_storage_kwargs
+        )
+        source_partition = ds.commit_partition(
+            staged_source, **local_deltacat_storage_kwargs
+        )
 
-        con = sqlite3.connect(cls.DB_FILE_PATH)
-        cur = con.cursor()
-        cls.kwargs = {ds.SQLITE_CON_ARG: con, ds.SQLITE_CUR_ARG: cur}
-        cls.deltacat_storage_kwargs = {ds.DB_FILE_PATH_ARG: cls.DB_FILE_PATH}
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
 
-        super().setUpClass()
+        # action
+        rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 2,
+                    "last_stream_position_to_compact": source_partition.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": None,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_partition.locator,
+                }
+            )
+        )
 
-    @classmethod
-    def doClassCleanups(cls) -> None:
-        os.remove(cls.DB_FILE_PATH)
-        ray.shutdown()
-        super().tearDownClass()
+        # verify that no RCF is written
+        assert rcf_url is None
+
+    def test_compact_partition_when_rcf_was_written_by_past_commit(
+        self, s3_resource, local_deltacat_storage_kwargs
+    ):
+        """
+        Backward compatibility test for when a RCF was written by a previous commit.
+        """
 
-    @patch("deltacat.compute.compactor_v2.compaction_session.rcf")
-    @patch("deltacat.compute.compactor_v2.compaction_session.s3_utils")
-    def test_compact_partition_when_no_input_deltas_to_compact(self, s3_utils, rcf_url):
         # setup
-        rcf_url.read_round_completion_file.return_value = None
         staged_source = stage_partition_from_file_paths(
-            self.NAMESPACE, ["test"], **self.deltacat_storage_kwargs
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
         )
-        source_partition = ds.commit_partition(
-            staged_source, **self.deltacat_storage_kwargs
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
         )
 
         staged_dest = stage_partition_from_file_paths(
-            self.NAMESPACE, ["destination"], **self.deltacat_storage_kwargs
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
         )
         dest_partition = ds.commit_partition(
-            staged_dest, **self.deltacat_storage_kwargs
+            staged_dest, **local_deltacat_storage_kwargs
         )
 
         # action
         rcf_url = compact_partition(
             CompactPartitionParams.of(
                 {
-                    "compaction_artifact_s3_bucket": "test_bucket",
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
                     "compacted_file_content_type": ContentType.PARQUET,
                     "dd_max_parallelism_ratio": 1.0,
                     "deltacat_storage": ds,
-                    "deltacat_storage_kwargs": self.deltacat_storage_kwargs,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
                     "destination_partition_locator": dest_partition.locator,
                     "drop_duplicates": True,
                     "hash_bucket_count": 1,
-                    "last_stream_position_to_compact": source_partition.stream_position,
+                    "last_stream_position_to_compact": source_delta.stream_position,
                     "list_deltas_kwargs": {
-                        **self.deltacat_storage_kwargs,
+                        **local_deltacat_storage_kwargs,
                         **{"equivalent_table_types": []},
                     },
                     "primary_keys": [],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                }
+            )
+        )
+
+        bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
+        assert bucket == TEST_S3_RCF_BUCKET_NAME
+
+        # Now delete the RCF at new location and copy it to old location
+        # Copy the RCF from rcf_url to another location
+        s3_resource.Object(TEST_S3_RCF_BUCKET_NAME, f"{backfill_key1}.json").copy_from(
+            CopySource=f"{TEST_S3_RCF_BUCKET_NAME}/{backfill_key1}/{backfill_key2}"
+        )
+
+        s3_resource.Object(
+            TEST_S3_RCF_BUCKET_NAME, f"{backfill_key1}/{backfill_key2}"
+        ).delete()
+
+        # Now run an incremental compaction and verify if the previous RCF was read properly.
+
+        new_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            [self.INCREMENTAL_FILE_PATH],
+            **local_deltacat_storage_kwargs,
+        )
+
+        new_rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": new_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
                     "rebase_source_partition_locator": None,
                     "rebase_source_partition_high_watermark": None,
                     "records_per_compacted_file": 4000,
                     "s3_client_kwargs": {},
-                    "source_partition_locator": source_partition.locator,
+                    "source_partition_locator": new_source_delta.partition_locator,
                 }
             )
         )
 
-        # verify that no RCF is written
-        self.assertIsNone(rcf_url)
+        new_bucket, incremental_key1, incremental_key2 = new_rcf_url.strip(
+            "s3://"
+        ).split("/")
+
+        assert new_bucket == TEST_S3_RCF_BUCKET_NAME
+        assert backfill_key1 == incremental_key1
+        assert backfill_key2 != incremental_key2
+
+        rcf = get_rcf(s3_resource, new_rcf_url)
+
+        _, compaction_audit_key = rcf.compaction_audit_url.strip("s3://").split("/", 1)
+        compaction_audit = CompactionSessionAuditInfo(
+            **read_s3_contents(
+                s3_resource, TEST_S3_RCF_BUCKET_NAME, compaction_audit_key
+            )
+        )
+
+        # as it should be running incremental
+        assert compaction_audit.uniform_deltas_created == 1
+        assert compaction_audit.input_records == 6
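The rewrite above drops the unittest.mock patches of s3_utils and rcf in favor of moto, so every S3 object a test writes lives only in memory. A minimal standalone sketch of that pattern follows (bucket and key names here are made up for illustration and are not taken from the diff):

import boto3
from moto import mock_s3

# Everything created inside mock_s3() is held by an in-memory fake of S3;
# nothing touches a real AWS account.
with mock_s3():
    s3 = boto3.resource("s3", region_name="us-east-1")
    s3.create_bucket(Bucket="example-compaction-artifacts")  # hypothetical bucket name
    obj = s3.Object("example-compaction-artifacts", "rcf/example.json")
    obj.put(Body=b"{}")
    assert obj.get()["Body"].read() == b"{}"  # object round-trips within the mock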
deltacat/tests/compute/test_compact_partition_rebase.py
@@ -254,7 +254,7 @@ def test_compact_partition_rebase_same_source_and_destination(
         }
     )
 
-    from deltacat.compute.compactor_v2.model.compaction_session import (
+    from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
         ExecutionCompactionResult,
     )
 
deltacat/tests/compute/test_util_common.py
@@ -24,6 +24,12 @@ from deltacat.compute.compactor import (
     RoundCompletionInfo,
 )
 
+from deltacat.storage.model.partition import PartitionLocator
+from deltacat.storage.model.stream import StreamLocator
+from deltacat.storage.model.table_version import TableVersionLocator
+from deltacat.storage.model.table import TableLocator
+from deltacat.storage.model.namespace import NamespaceLocator
+
 
 class PartitionKeyType(str, Enum):
     INT = "int"
@@ -51,6 +57,18 @@ UTILS
 """
 
 
+def get_test_partition_locator(partition_id):
+    tv_locator = TableVersionLocator.of(
+        TableLocator.of(NamespaceLocator.of("default"), "test_table"), "1"
+    )
+    stream_locator = StreamLocator.of(tv_locator, "test_stream_id", "local")
+    partition_locator = PartitionLocator.of(
+        stream_locator, partition_id=partition_id, partition_values=[]
+    )
+
+    return partition_locator
+
+
 def _create_table(
     namespace: str,
     table_name: str,
@@ -140,7 +158,7 @@ def create_rebase_table(
 def get_rcf(s3_resource, rcf_file_s3_uri: str) -> RoundCompletionInfo:
     from deltacat.tests.test_utils.utils import read_s3_contents
 
-    _, rcf_object_key = rcf_file_s3_uri.rsplit("/", 1)
+    _, rcf_object_key = rcf_file_s3_uri.strip("s3://").split("/", 1)
     rcf_file_output: Dict[str, Any] = read_s3_contents(
         s3_resource, TEST_S3_RCF_BUCKET_NAME, rcf_object_key
     )
@@ -151,9 +169,6 @@ def get_compacted_delta_locator_from_rcf(
     s3_resource: ServiceResource, rcf_file_s3_uri: str
 ):
     from deltacat.storage import DeltaLocator
-    from deltacat.compute.compactor import (
-        RoundCompletionInfo,
-    )
 
     round_completion_info: RoundCompletionInfo = get_rcf(s3_resource, rcf_file_s3_uri)
 
deltacat/tests/local_deltacat_storage/__init__.py
@@ -41,6 +41,12 @@ from deltacat.storage import (
     ManifestEntry,
     ManifestEntryList,
     DeleteParameters,
+    PartitionFilter,
+    PartitionValues,
+    DeltaPartitionSpec,
+    StreamPartitionSpec,
+    TransformName,
+    IdentityTransformParameters,
 )
 from deltacat.types.media import (
     ContentType,
@@ -194,12 +200,13 @@ def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partit
 def list_deltas(
     namespace: str,
     table_name: str,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     first_stream_position: Optional[int] = None,
     last_stream_position: Optional[int] = None,
     ascending_order: Optional[bool] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> ListResult[Delta]:
@@ -207,6 +214,13 @@ def list_deltas(
     if stream is None:
         return ListResult.of([], None, None)
 
+    if partition_values is not None and partition_filter is not None:
+        raise ValueError(
+            "Only one of partition_values or partition_filter must be provided"
+        )
+    if partition_filter is not None:
+        partition_values = partition_filter.partition_values
+
     partition = get_partition(stream.locator, partition_values, *args, **kwargs)
 
     all_deltas = list_partition_deltas(
@@ -297,15 +311,25 @@ def get_delta(
     namespace: str,
     table_name: str,
     stream_position: int,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> Optional[Delta]:
     cur, con = _get_sqlite3_cursor_con(kwargs)
 
     stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
+
+    if partition_values is not None and partition_filter is not None:
+        raise ValueError(
+            "Only one of partition_values or partition_filter must be provided"
+        )
+
+    if partition_filter is not None:
+        partition_values = partition_filter.partition_values
+
     partition = get_partition(stream.locator, partition_values, *args, **kwargs)
     delta_locator = DeltaLocator.of(partition.locator, stream_position)
 
@@ -328,22 +352,24 @@ def get_delta(
 def get_latest_delta(
     namespace: str,
     table_name: str,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> Optional[Delta]:
 
     deltas = list_deltas(
-        namespace,
-        table_name,
-        partition_values,
-        table_version,
-        None,
-        None,
-        False,
-        include_manifest,
+        namespace=namespace,
+        table_name=table_name,
+        partition_values=partition_values,
+        table_version=table_version,
+        first_stream_position=None,
+        last_stream_position=None,
+        ascending_order=False,
+        include_manifest=include_manifest,
+        partition_filter=partition_filter,
         *args,
         **kwargs,
     ).all_items()
@@ -363,13 +389,24 @@ def download_delta(
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
     distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
     result = []
     manifest = get_delta_manifest(delta_like, *args, **kwargs)
 
+    partition_values: PartitionValues = None
+    if partition_filter is not None:
+        partition_values = partition_filter.partition_values
+
     for entry_index in range(len(manifest.entries)):
+        if (
+            partition_values is not None
+            and partition_values != manifest.entries[entry_index].meta.partition_values
+        ):
+            continue
+
         result.append(
             download_delta_manifest_entry(
                 delta_like=delta_like,
@@ -524,11 +561,29 @@ def create_table_version(
     table_description: Optional[str] = None,
     table_properties: Optional[Dict[str, str]] = None,
     supported_content_types: Optional[List[ContentType]] = None,
+    partition_spec: Optional[StreamPartitionSpec] = None,
    *args,
    **kwargs,
 ) -> Stream:
     cur, con = _get_sqlite3_cursor_con(kwargs)
 
+    if partition_keys is not None and partition_spec is not None:
+        raise ValueError(
+            "Only one of partition_keys or partition_spec must be provided"
+        )
+    if partition_spec is not None:
+        assert (
+            partition_spec.ordered_transforms is not None
+        ), "Ordered transforms must be specified when partition_spec is specified"
+        partition_keys = []
+        for transform in partition_spec.ordered_transforms:
+            assert transform.name == TransformName.IDENTITY, (
+                "Local DeltaCAT storage does not support creating table versions "
+                "with non identity transform partition spec"
+            )
+            transform_params: IdentityTransformParameters = transform.parameters
+            partition_keys.append(transform_params.column_name)
+
     latest_version = get_latest_table_version(namespace, table_name, *args, **kwargs)
     if (
         table_version is not None
@@ -776,7 +831,7 @@ def delete_stream(
 
 
 def stage_partition(
-    stream: Stream, partition_values: Optional[List[Any]] = None, *args, **kwargs
+    stream: Stream, partition_values: Optional[PartitionValues] = None, *args, **kwargs
 ) -> Partition:
     cur, con = _get_sqlite3_cursor_con(kwargs)
     partition_id = uuid.uuid4().__str__()
@@ -877,7 +932,7 @@ def delete_partition(
     namespace: str,
     table_name: str,
     table_version: Optional[str] = None,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs,
 ) -> None:
@@ -894,7 +949,7 @@
 
 def get_partition(
     stream_locator: StreamLocator,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs,
 ) -> Optional[Partition]:
@@ -935,12 +990,14 @@ def stage_delta(
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     content_type: ContentType = ContentType.PARQUET,
     delete_parameters: Optional[DeleteParameters] = None,
+    partition_spec: Optional[DeltaPartitionSpec] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs,
 ) -> Delta:
     cur, con = _get_sqlite3_cursor_con(kwargs)
-    manifest_entry_id = uuid.uuid4().__str__()
-    uri = _get_manifest_entry_uri(manifest_entry_id)
+    manifest_id = uuid.uuid4().__str__()
+    uri = _get_manifest_entry_uri(manifest_id)
 
     if data is None:
         delta = create_empty_delta(
@@ -948,7 +1005,7 @@
             delta_type,
             author,
             properties=properties,
-            manifest_entry_id=manifest_entry_id,
+            manifest_entry_id=manifest_id,
         )
         cur.execute("INSERT OR IGNORE INTO data VALUES (?, ?)", (uri, None))
         params = (delta.locator.canonical_string(), "staged_delta", json.dumps(delta))
@@ -956,6 +1013,12 @@
         con.commit()
         return delta
 
+    if partition_spec:
+        assert partition_values is not None, (
+            "partition_values must be provided as local "
+            "storage does not support computing it from input data"
+        )
+
     serialized_data = None
     if content_type == ContentType.PARQUET:
         buffer = io.BytesIO()
@@ -980,18 +1043,19 @@
         content_type=content_type,
         content_encoding=ContentEncoding.IDENTITY,
         source_content_length=data.nbytes,
+        partition_values=partition_values,
     )
 
     manifest = Manifest.of(
         entries=ManifestEntryList.of(
             [
                 ManifestEntry.of(
-                    uri=uri, url=uri, meta=meta, mandatory=True, uuid=manifest_entry_id
+                    uri=uri, url=uri, meta=meta, mandatory=True, uuid=manifest_id
                )
            ]
        ),
        author=author,
-        uuid=manifest_entry_id,
+        uuid=manifest_id,
    )
 
     delta = Delta.of(
deltacat/tests/test_utils/pyarrow.py
@@ -66,7 +66,10 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
 
 
 def commit_delta_to_partition(
-    partition: Partition, file_paths: List[str], *args, **kwargs
+    partition: Union[Partition, PartitionLocator],
+    file_paths: List[str],
+    *args,
+    **kwargs,
 ) -> Delta:
     tables = []
 
deltacat/tests/utils/ray_utils/test_dataset.py (new file)
@@ -0,0 +1,66 @@
+from ray.data import from_items
+from typing import Any
+import pytest
+import fsspec
+from fsspec import AbstractFileSystem
+from ray.data.datasource import FilenameProvider
+from deltacat.types.media import ContentType
+import ray
+
+
+class TestDatasetToFile:
+
+    BASE_PATH = "/tmp"
+    SUB_PATH = "abcd"
+
+    @pytest.fixture(autouse=True, scope="module")
+    def ensure_ray_down(self):
+        # ray.data fails when ray is instantiated in local mode
+        ray.shutdown()
+
+    @pytest.fixture(scope="module")
+    def mock_dataset(self):
+        return from_items([{"col1": i, "col2": i * 2} for i in range(1000)])
+
+    @pytest.fixture(scope="module")
+    def mock_filename_provider(self):
+        class MockFilenameProvider(FilenameProvider):
+            def get_filename_for_block(
+                self, block: Any, task_index: int, block_index: int
+            ) -> str:
+                return TestDatasetToFile.SUB_PATH
+
+        return MockFilenameProvider()
+
+    def test_parquet_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("local")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            file_system=fs,
+            block_path_provider=mock_filename_provider,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+        fs.delete(file_expected_at)
+
+    def test_csv_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("local")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            file_system=fs,
+            block_path_provider=mock_filename_provider,
+            content_type=ContentType.CSV.value,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+        fs.delete(file_expected_at)