deltacat 1.1.8__py3-none-any.whl → 1.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +6 -0
  3. deltacat/aws/redshift/model/manifest.py +16 -0
  4. deltacat/aws/s3u.py +65 -38
  5. deltacat/compute/compactor/compaction_session.py +5 -1
  6. deltacat/compute/compactor/model/compact_partition_params.py +12 -1
  7. deltacat/compute/compactor/model/materialize_result.py +0 -4
  8. deltacat/compute/compactor/repartition_session.py +1 -0
  9. deltacat/compute/compactor/utils/round_completion_file.py +39 -9
  10. deltacat/compute/compactor_v2/compaction_session.py +26 -16
  11. deltacat/compute/compactor_v2/constants.py +5 -11
  12. deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
  13. deltacat/compute/compactor_v2/model/merge_input.py +6 -0
  14. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
  15. deltacat/compute/compactor_v2/steps/merge.py +12 -12
  16. deltacat/compute/compactor_v2/utils/merge.py +1 -0
  17. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  18. deltacat/compute/compactor_v2/utils/task_options.py +2 -12
  19. deltacat/exceptions.py +342 -7
  20. deltacat/io/dataset.py +5 -17
  21. deltacat/io/memcached_object_store.py +7 -4
  22. deltacat/storage/__init__.py +24 -0
  23. deltacat/storage/interface.py +56 -6
  24. deltacat/storage/model/delta.py +23 -3
  25. deltacat/storage/model/partition.py +6 -7
  26. deltacat/storage/model/partition_spec.py +71 -0
  27. deltacat/storage/model/stream.py +38 -1
  28. deltacat/storage/model/transform.py +127 -0
  29. deltacat/tests/aws/test_s3u.py +2 -0
  30. deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
  31. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
  32. deltacat/tests/compute/compact_partition_test_cases.py +4 -2
  33. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
  34. deltacat/tests/compute/compactor_v2/test_compaction_session.py +204 -37
  35. deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
  36. deltacat/tests/compute/test_util_common.py +19 -4
  37. deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
  38. deltacat/tests/io/test_memcached_object_store.py +5 -2
  39. deltacat/tests/local_deltacat_storage/__init__.py +124 -29
  40. deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
  41. deltacat/tests/test_exceptions.py +100 -0
  42. deltacat/tests/test_logs.py +1 -0
  43. deltacat/tests/test_utils/pyarrow.py +4 -1
  44. deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
  45. deltacat/tests/utils/test_daft.py +0 -1
  46. deltacat/tests/utils/test_resources.py +0 -28
  47. deltacat/utils/daft.py +3 -0
  48. deltacat/utils/numpy.py +3 -3
  49. deltacat/utils/pandas.py +3 -3
  50. deltacat/utils/pyarrow.py +11 -8
  51. deltacat/utils/ray_utils/dataset.py +7 -7
  52. deltacat/utils/ray_utils/runtime.py +2 -2
  53. deltacat/utils/resources.py +0 -45
  54. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/METADATA +6 -5
  55. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/RECORD +58 -51
  56. deltacat/io/aws/redshift/redshift_datasource.py +0 -578
  57. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
  58. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
  59. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
deltacat/storage/interface.py

@@ -23,6 +23,10 @@ from deltacat.storage import (
     TableVersion,
     SortKey,
     PartitionLocator,
+    PartitionFilter,
+    PartitionValues,
+    DeltaPartitionSpec,
+    StreamPartitionSpec,
 )
 from deltacat.types.media import (
     ContentType,
@@ -86,12 +90,13 @@ def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partit
 def list_deltas(
     namespace: str,
     table_name: str,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     first_stream_position: Optional[int] = None,
     last_stream_position: Optional[int] = None,
     ascending_order: Optional[bool] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs
 ) -> ListResult[Delta]:
@@ -107,6 +112,9 @@ def list_deltas(
     To conserve memory, the deltas returned do not include manifests by
     default. The manifests can either be optionally retrieved as part of this
     call or lazily loaded via subsequent calls to `get_delta_manifest`.
+
+    Note: partition_values is deprecated and will be removed in future
+    releases. Use partition_filter instead.
     """
     raise NotImplementedError("list_deltas not implemented")
 
@@ -134,9 +142,10 @@ def get_delta(
     namespace: str,
     table_name: str,
     stream_position: int,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs
 ) -> Optional[Delta]:
@@ -149,6 +158,9 @@ def get_delta(
     To conserve memory, the delta returned does not include a manifest by
     default. The manifest can either be optionally retrieved as part of this
     call or lazily loaded via a subsequent call to `get_delta_manifest`.
+
+    Note: partition_values is deprecated and will be removed in future
+    releases. Use partition_filter instead.
     """
     raise NotImplementedError("get_delta not implemented")
 
@@ -156,9 +168,10 @@ def get_delta(
 def get_latest_delta(
     namespace: str,
     table_name: str,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs
 ) -> Optional[Delta]:
@@ -172,6 +185,9 @@ def get_latest_delta(
     To conserve memory, the delta returned does not include a manifest by
     default. The manifest can either be optionally retrieved as part of this
     call or lazily loaded via a subsequent call to `get_delta_manifest`.
+
+    Note: partition_values is deprecated and will be removed in future
+    releases. Use partition_filter instead.
     """
     raise NotImplementedError("get_latest_delta not implemented")
 
@@ -185,6 +201,7 @@ def download_delta(
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
     distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs
 ) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
@@ -194,6 +211,10 @@ def download_delta(
     across this Ray cluster's object store memory. Ordered table N of a local
     table list, or ordered block N of a distributed dataset, always contain
     the contents of ordered delta manifest entry N.
+
+    partition_filter is an optional parameter that determines which files to
+    download from the delta manifest. A delta manifest contains all the data
+    files for a given delta.
     """
     raise NotImplementedError("download_delta not implemented")
 
@@ -268,6 +289,7 @@ def create_table_version(
     table_description: Optional[str] = None,
     table_properties: Optional[Dict[str, str]] = None,
     supported_content_types: Optional[List[ContentType]] = None,
+    partition_spec: Optional[StreamPartitionSpec] = None,
     *args,
     **kwargs
 ) -> Stream:
@@ -300,6 +322,8 @@ def create_table_version(
 
     Validate: Raise an error for any fields that don't fit the schema. An
     explicit subset of column names to validate may optionally be specified.
+
+    Either partition_keys or partition_spec must be specified, but not both.
     """
     raise NotImplementedError("create_table_version not implemented")
 
@@ -402,7 +426,7 @@ def get_stream(
 
 
 def stage_partition(
-    stream: Stream, partition_values: Optional[List[Any]] = None, *args, **kwargs
+    stream: Stream, partition_values: Optional[PartitionValues] = None, *args, **kwargs
 ) -> Partition:
     """
     Stages a new partition for the given stream and partition values. Returns
@@ -410,6 +434,9 @@ def stage_partition(
     with the same partition values, then it will have its previous partition ID
     set to the ID of the partition being replaced. Partition keys should not be
     specified for unpartitioned tables.
+
+    The partition_values must represent the results of the transforms in the
+    partition spec specified on the stream.
     """
     raise NotImplementedError("stage_partition not implemented")
 
@@ -439,7 +466,7 @@ def delete_partition(
     namespace: str,
     table_name: str,
     table_version: Optional[str] = None,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs
 ) -> None:
@@ -454,7 +481,7 @@ def delete_partition(
 
 def get_partition(
     stream_locator: StreamLocator,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs
 ) -> Optional[Partition]:
@@ -477,6 +504,8 @@ def stage_delta(
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     content_type: ContentType = ContentType.PARQUET,
     delete_parameters: Optional[DeleteParameters] = None,
+    partition_spec: Optional[DeltaPartitionSpec] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs
 ) -> Delta:
@@ -484,6 +513,13 @@ def stage_delta(
     Writes the given table to 1 or more S3 files. Returns an unregistered
     delta whose manifest entries point to the uploaded files. Applies any
     schema consistency policies configured for the parent table version.
+
+    The partition spec will be used to split the input table into multiple
+    files. Optionally, partition_values can be provided so that this method
+    does not need to recompute them from the provided data.
+
+    Raises an error if the provided data does not conform to a unique,
+    ordered list of partition_values.
     """
     raise NotImplementedError("stage_delta not implemented")
 
@@ -600,3 +636,17 @@ def table_version_exists(
     Returns True if the given table version exists, False if not.
     """
     raise NotImplementedError("table_version_exists not implemented")
+
+
+def can_categorize(e: BaseException, *args, **kwargs) -> bool:
+    """
+    Return whether the input error is from the storage implementation layer.
+    """
+    raise NotImplementedError
+
+
+def raise_categorized_error(e: BaseException, *args, **kwargs):
+    """
+    Raise and handle storage implementation layer specific errors.
+    """
+    raise NotImplementedError
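
A minimal usage sketch of the new partition_filter parameter, assuming some concrete implementation `storage` of this interface; the namespace, table name, and partition values below are hypothetical:

from deltacat.storage import PartitionFilter

# List deltas for a single partition by filtering on its transformed
# partition values instead of the deprecated partition_values parameter.
deltas = storage.list_deltas(
    namespace="my_namespace",
    table_name="my_table",
    partition_filter=PartitionFilter.of(partition_values=["2024-01-01"]),
    include_manifest=False,
).all_items()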
deltacat/storage/model/delta.py

@@ -12,6 +12,7 @@ from deltacat.storage.model.stream import StreamLocator
 from deltacat.storage.model.table import TableLocator
 from deltacat.storage.model.table_version import TableVersionLocator
 from deltacat.storage.model.types import DeltaType
+from deltacat.storage.model.partition_spec import DeltaPartitionSpec, PartitionValues
 
 
 class Delta(dict):
@@ -24,6 +25,7 @@ class Delta(dict):
         manifest: Optional[Manifest],
         previous_stream_position: Optional[int] = None,
         delete_parameters: Optional[DeleteParameters] = None,
+        partition_spec: Optional[DeltaPartitionSpec] = None,
     ) -> Delta:
         """
         Creates a Delta metadata model with the given Delta Locator, Delta Type,
@@ -38,6 +40,7 @@ class Delta(dict):
         delta.manifest = manifest
         delta.previous_stream_position = previous_stream_position
         delta.delete_parameters = delete_parameters
+        delta.partition_spec = partition_spec
         return delta
 
     @staticmethod
@@ -90,6 +93,12 @@ class Delta(dict):
                 f"Deltas to merge must all share the same delta type "
                 f"(found {len(distinct_delta_types)} delta types)."
             )
+        distinct_partition_spec = set([d.partition_spec for d in deltas])
+        if len(distinct_partition_spec) > 1:
+            raise ValueError(
+                f"Deltas to merge must all share the same partition spec "
+                f"(found {len(distinct_partition_spec)} partition specs)."
+            )
         merged_manifest = Manifest.merge_manifests(
             manifests,
             manifest_author,
@@ -252,7 +261,7 @@ class Delta(dict):
         return None
 
     @property
-    def partition_values(self) -> Optional[List[Any]]:
+    def partition_values(self) -> Optional[PartitionValues]:
         delta_locator = self.locator
         if delta_locator:
             return delta_locator.partition_values
@@ -276,6 +285,17 @@ class Delta(dict):
     def delete_parameters(self, delete_parameters: Optional[DeleteParameters]) -> None:
         self["delete_parameters"] = delete_parameters
 
+    @property
+    def partition_spec(self) -> Optional[DeltaPartitionSpec]:
+        val: Dict[str, Any] = self.get("partitionSpec")
+        if val is not None and not isinstance(val, DeltaPartitionSpec):
+            self.partition_spec = val = DeltaPartitionSpec(val)
+        return val
+
+    @partition_spec.setter
+    def partition_spec(self, value: Optional[DeltaPartitionSpec]) -> None:
+        self["partitionSpec"] = value
+
 
 class DeltaLocator(Locator, dict):
     @staticmethod
@@ -299,7 +319,7 @@ class DeltaLocator(Locator, dict):
         table_version: Optional[str],
         stream_id: Optional[str],
         storage_type: Optional[str],
-        partition_values: Optional[List[Any]],
+        partition_values: Optional[PartitionValues],
         partition_id: Optional[str],
         stream_position: Optional[int],
     ) -> DeltaLocator:
@@ -365,7 +385,7 @@ class DeltaLocator(Locator, dict):
         return None
 
     @property
-    def partition_values(self) -> Optional[List[Any]]:
+    def partition_values(self) -> Optional[PartitionValues]:
         partition_locator = self.partition_locator
         if partition_locator:
             return partition_locator.partition_values
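
A short sketch of the lazy-wrapping behavior added by the new partition_spec property; constructing a Delta directly from a plain dict here is purely illustrative:

from deltacat.storage import Delta
from deltacat.storage.model.partition_spec import DeltaPartitionSpec

# A raw dict stored under "partitionSpec" is wrapped into a
# DeltaPartitionSpec on first property access and cached back.
delta = Delta({"partitionSpec": {"orderedTransforms": []}})
assert isinstance(delta.partition_spec, DeltaPartitionSpec)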
deltacat/storage/model/partition.py

@@ -1,10 +1,9 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
-
 from typing import Any, Dict, List, Optional, Union
 
 import pyarrow as pa
-
+from deltacat.storage.model.partition_spec import PartitionValues
 from deltacat.storage.model.locator import Locator
 from deltacat.storage.model.namespace import NamespaceLocator
 from deltacat.storage.model.stream import StreamLocator
@@ -127,7 +126,7 @@ class Partition(dict):
         return None
 
     @property
-    def partition_values(self) -> Optional[List[Any]]:
+    def partition_values(self) -> Optional[PartitionValues]:
         partition_locator = self.locator
         if partition_locator:
             return partition_locator.partition_values
@@ -199,7 +198,7 @@ class PartitionLocator(Locator, dict):
     @staticmethod
     def of(
         stream_locator: Optional[StreamLocator],
-        partition_values: Optional[List[Any]],
+        partition_values: Optional[PartitionValues],
         partition_id: Optional[str],
     ) -> PartitionLocator:
         """
@@ -225,7 +224,7 @@ class PartitionLocator(Locator, dict):
         table_version: Optional[str],
         stream_id: Optional[str],
         storage_type: Optional[str],
-        partition_values: Optional[List[Any]],
+        partition_values: Optional[PartitionValues],
         partition_id: Optional[str],
     ) -> PartitionLocator:
         stream_locator = StreamLocator.at(
@@ -253,11 +252,11 @@ class PartitionLocator(Locator, dict):
         self["streamLocator"] = stream_locator
 
     @property
-    def partition_values(self) -> Optional[List[Any]]:
+    def partition_values(self) -> Optional[PartitionValues]:
         return self.get("partitionValues")
 
     @partition_values.setter
-    def partition_values(self, partition_values: Optional[List[Any]]) -> None:
+    def partition_values(self, partition_values: Optional[PartitionValues]) -> None:
         self["partitionValues"] = partition_values
 
     @property
deltacat/storage/model/partition_spec.py (new file)

@@ -0,0 +1,71 @@
+from __future__ import annotations
+from typing import List, Optional, Any
+from deltacat.storage.model.transform import Transform
+
+"""
+An ordered list of partition values determining the values of
+ordered transforms specified in the partition spec.
+"""
+PartitionValues = List[Any]
+
+
+class PartitionFilter(dict):
+    """
+    This class represents a filter for partitions.
+    It is used to filter partitions based on certain criteria.
+    """
+
+    @staticmethod
+    def of(
+        partition_values: Optional[PartitionValues] = None,
+    ) -> PartitionFilter:
+        """
+        Creates a new PartitionFilter instance with the specified partition values.
+        """
+        partition_filter = PartitionFilter()
+        partition_filter["partitionValues"] = partition_values
+        return partition_filter
+
+    @property
+    def partition_values(self) -> Optional[PartitionValues]:
+        return self.get("partitionValues")
+
+
+class PartitionSpec(dict):
+    """
+    This class determines how the underlying entities in the
+    hierarchy are partitioned: a stream partitions deltas, and
+    a delta partitions files.
+    """
+
+    @staticmethod
+    def of(ordered_transforms: List[Transform] = None) -> PartitionSpec:
+        partition_spec = PartitionSpec()
+        partition_spec.ordered_transforms = ordered_transforms
+        return partition_spec
+
+    @property
+    def ordered_transforms(self) -> List[Transform]:
+        return self.get("orderedTransforms")
+
+    @ordered_transforms.setter
+    def ordered_transforms(self, value: List[Transform]) -> None:
+        self["orderedTransforms"] = value
+
+
+class StreamPartitionSpec(PartitionSpec):
+    """
+    A class representing a stream partition specification.
+    A stream partitions deltas into multiple partitions.
+    """
+
+    pass
+
+
+class DeltaPartitionSpec(PartitionSpec):
+    """
+    A class representing a delta partition specification.
+    The manifest entries in a delta are partitioned based on this spec.
+    """
+
+    pass
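
To make the relationship between these classes concrete, a hedged usage sketch; the column name and partition value are hypothetical:

from deltacat.storage.model.partition_spec import (
    PartitionFilter,
    StreamPartitionSpec,
)
from deltacat.storage.model.transform import (
    IdentityTransformParameters,
    Transform,
    TransformName,
)

# A stream partition spec built from a single identity transform on "region".
spec = StreamPartitionSpec.of(
    ordered_transforms=[
        Transform.of(
            name=TransformName.IDENTITY,
            parameters=IdentityTransformParameters.of(column_name="region"),
        )
    ]
)

# A filter selecting the partition whose ordered transform results
# are ["eu-west-1"].
partition_filter = PartitionFilter.of(partition_values=["eu-west-1"])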
deltacat/storage/model/stream.py

@@ -8,6 +8,7 @@ from deltacat.storage.model.namespace import NamespaceLocator
 from deltacat.storage.model.table import TableLocator
 from deltacat.storage.model.table_version import TableVersionLocator
 from deltacat.storage.model.types import CommitState
+from deltacat.storage.model.partition_spec import StreamPartitionSpec, PartitionValues
 
 
 class Stream(dict):
@@ -17,12 +18,14 @@ class Stream(dict):
         partition_keys: Optional[List[Dict[str, Any]]],
         state: Optional[CommitState] = None,
         previous_stream_digest: Optional[bytes] = None,
+        partition_spec: Optional[StreamPartitionSpec] = None,
     ) -> Stream:
         stream = Stream()
         stream.locator = locator
         stream.partition_keys = partition_keys
         stream.state = state
         stream.previous_stream_digest = previous_stream_digest
+        stream.partition_spec = partition_spec
         return stream
 
     @property
@@ -38,6 +41,14 @@ class Stream(dict):
 
     @property
     def partition_keys(self) -> Optional[List[Dict[str, Any]]]:
+        """
+        Ordered list of unique column names in the table schema on
+        which the underlying data is partitioned. Either partition_spec
+        or partition_keys must be specified, but not both.
+
+        (Deprecated): partition_keys will be deprecated in favor of
+        partition_spec in future releases.
+        """
         return self.get("partitionKeys")
 
     @partition_keys.setter
@@ -46,6 +57,9 @@ class Stream(dict):
 
     @property
     def previous_stream_digest(self) -> Optional[str]:
+        """
+        Previous stream digest.
+        """
         return self.get("previousStreamDigest")
 
     @previous_stream_digest.setter
@@ -54,6 +68,9 @@ class Stream(dict):
 
     @property
     def state(self) -> Optional[CommitState]:
+        """
+        The commit state of the stream.
+        """
         state = self.get("state")
         return None if state is None else CommitState(state)
 
@@ -61,6 +78,26 @@ class Stream(dict):
     def state(self, state: Optional[CommitState]) -> None:
         self["state"] = state
 
+    @property
+    def partition_spec(self) -> Optional[StreamPartitionSpec]:
+        """
+        If a table uses complex partitioning instead of identity,
+        a partition spec can be specified to define that strategy.
+        For example, a partition spec can define a bucketing strategy
+        on composite column values or can define Iceberg-compliant
+        bucketing.
+
+        Either partition_spec or partition_keys must be specified, but not both.
+        """
+        val: Dict[str, Any] = self.get("partitionSpec")
+        if val is not None and not isinstance(val, StreamPartitionSpec):
+            self.partition_spec = val = StreamPartitionSpec(val)
+        return val
+
+    @partition_spec.setter
+    def partition_spec(self, spec: StreamPartitionSpec) -> None:
+        self["partitionSpec"] = spec
+
     @property
     def namespace_locator(self) -> Optional[NamespaceLocator]:
         stream_locator = self.locator
@@ -110,7 +147,7 @@ class Stream(dict):
             return stream_locator.table_version
         return None
 
-    def validate_partition_values(self, partition_values: Optional[List[Any]]):
+    def validate_partition_values(self, partition_values: Optional[PartitionValues]):
         # TODO (pdames): ensure value data types match key data types
         partition_keys = self.partition_keys
         num_keys = len(partition_keys) if partition_keys else 0
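
A minimal sketch of attaching a partition spec to a stream, given the mutual exclusivity described in the docstrings above; the locator is omitted for brevity:

from deltacat.storage import Stream
from deltacat.storage.model.partition_spec import StreamPartitionSpec

spec = StreamPartitionSpec.of(ordered_transforms=[])
stream = Stream.of(
    locator=None,          # a real StreamLocator in practice
    partition_keys=None,   # must not be set together with partition_spec
    partition_spec=spec,
)
assert isinstance(stream.partition_spec, StreamPartitionSpec)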
deltacat/storage/model/transform.py (new file)

@@ -0,0 +1,127 @@
+from __future__ import annotations
+from typing import List
+from enum import Enum
+
+
+class TransformName(str, Enum):
+    IDENTITY = "identity"
+    BUCKET = "bucket"
+
+
+class TransformParameters(dict):
+    """
+    This is a parent class that contains properties
+    to be passed to the corresponding transform.
+    """
+
+    pass
+
+
+class IdentityTransformParameters(TransformParameters):
+    """
+    This class is used to pass parameters to the identity transform.
+    """
+
+    @staticmethod
+    def of(column_name: str) -> IdentityTransformParameters:
+        identity_transform_parameters = IdentityTransformParameters()
+        identity_transform_parameters["columnName"] = column_name
+        return identity_transform_parameters
+
+    @property
+    def column_name(self) -> str:
+        """
+        The name of the column to use for the identity transform.
+        """
+        return self["columnName"]
+
+    @column_name.setter
+    def column_name(self, value: str) -> None:
+        self["columnName"] = value
+
+
+class BucketingStrategy(str, Enum):
+    """
+    A bucketing strategy for the transform.
+    """
+
+    # Uses the default deltacat bucketing strategy.
+    # This strategy supports hashing on composite keys
+    # and uses SHA1 hashing for determining the bucket.
+    # If no columns are passed, it will use a random UUID
+    # for determining the bucket.
+    DEFAULT = "default"
+
+    # Uses an Iceberg-compliant bucketing strategy.
+    # As indicated in the Iceberg spec, it does not support
+    # composite keys and uses the murmur3 hash for determining
+    # the bucket.
+    # See https://iceberg.apache.org/spec/#partitioning
+    ICEBERG = "iceberg"
+
+
+class BucketTransformParameters(TransformParameters):
+    """
+    Encapsulates parameters for the bucket transform.
+    """
+
+    @staticmethod
+    def of(
+        num_buckets: int,
+        column_names: List[str],
+        bucketing_strategy: BucketingStrategy,
+    ) -> BucketTransformParameters:
+        bucket_transform_parameters = BucketTransformParameters()
+        bucket_transform_parameters["numBuckets"] = num_buckets
+        bucket_transform_parameters["columnNames"] = column_names
+        bucket_transform_parameters["bucketingStrategy"] = bucketing_strategy
+
+        return bucket_transform_parameters
+
+    @property
+    def num_buckets(self) -> int:
+        """
+        The total number of buckets to create for values of the columns.
+        """
+        return self["numBuckets"]
+
+    @property
+    def column_names(self) -> List[str]:
+        """
+        An ordered list of unique column names from the table schema
+        to use for bucketing.
+        """
+        return self["columnNames"]
+
+    @property
+    def bucketing_strategy(self) -> BucketingStrategy:
+        """
+        The bucketing strategy to use.
+        """
+        return self["bucketingStrategy"]
+
+
+class Transform(dict):
+    """
+    A transform represents how a particular column value can be
+    transformed into a new value. This is mostly used in the context
+    of partitioning the data files in a table.
+    """
+
+    @staticmethod
+    def of(
+        name: TransformName,
+        parameters: TransformParameters,
+    ) -> Transform:
+        partition_transform = Transform()
+        partition_transform["name"] = name
+        partition_transform["parameters"] = parameters
+        return partition_transform
+
+    @property
+    def name(self) -> TransformName:
+        return self["name"]
+
+    @property
+    def parameters(self) -> TransformParameters:
+        return self["parameters"]
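
An assumed-usage sketch of the bucket transform defined above; the column name and bucket count are hypothetical:

from deltacat.storage.model.transform import (
    BucketingStrategy,
    BucketTransformParameters,
    Transform,
    TransformName,
)

# 16 SHA1-based buckets over a single key column (the default strategy
# also supports composite keys).
params = BucketTransformParameters.of(
    num_buckets=16,
    column_names=["customer_id"],
    bucketing_strategy=BucketingStrategy.DEFAULT,
)
transform = Transform.of(name=TransformName.BUCKET, parameters=params)
assert transform.parameters.num_buckets == 16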
deltacat/tests/aws/test_s3u.py

@@ -20,6 +20,7 @@ from botocore.exceptions import (
     ConnectTimeoutError,
     HTTPClientError,
 )
+from ray.data.datasource import FilenameProvider
 from deltacat.exceptions import NonRetryableError
 from moto import mock_s3
 from tenacity import RetryError
@@ -34,6 +35,7 @@ class TestUuidBlockWritePathProvider(unittest.TestCase):
 
         result = provider("base_path")
 
+        self.assertTrue(isinstance(provider, FilenameProvider))
         self.assertRegex(result, r"^base_path/[\w-]{36}$")
 