deltacat 1.1.9__py3-none-any.whl → 1.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/redshift/model/manifest.py +16 -0
  3. deltacat/aws/s3u.py +19 -13
  4. deltacat/compute/compactor/compaction_session.py +5 -1
  5. deltacat/compute/compactor/repartition_session.py +1 -0
  6. deltacat/compute/compactor/utils/round_completion_file.py +39 -9
  7. deltacat/compute/compactor_v2/compaction_session.py +15 -11
  8. deltacat/compute/compactor_v2/constants.py +3 -0
  9. deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
  10. deltacat/io/dataset.py +5 -17
  11. deltacat/storage/__init__.py +24 -0
  12. deltacat/storage/interface.py +42 -6
  13. deltacat/storage/model/delta.py +23 -3
  14. deltacat/storage/model/partition.py +6 -7
  15. deltacat/storage/model/partition_spec.py +71 -0
  16. deltacat/storage/model/stream.py +38 -1
  17. deltacat/storage/model/transform.py +127 -0
  18. deltacat/tests/aws/test_s3u.py +2 -0
  19. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
  20. deltacat/tests/compute/compactor_v2/test_compaction_session.py +201 -36
  21. deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
  22. deltacat/tests/compute/test_util_common.py +19 -4
  23. deltacat/tests/local_deltacat_storage/__init__.py +83 -19
  24. deltacat/tests/test_utils/pyarrow.py +4 -1
  25. deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
  26. deltacat/utils/numpy.py +3 -3
  27. deltacat/utils/pandas.py +3 -3
  28. deltacat/utils/pyarrow.py +3 -3
  29. deltacat/utils/ray_utils/dataset.py +7 -7
  30. {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/METADATA +5 -4
  31. {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/RECORD +34 -31
  32. deltacat/io/aws/redshift/redshift_datasource.py +0 -578
  33. {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
  34. {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
  35. {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode

  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))

- __version__ = "1.1.9"
+ __version__ = "1.1.10"


  __all__ = [
deltacat/aws/redshift/model/manifest.py CHANGED
@@ -99,6 +99,8 @@ class Manifest(dict):
  total_source_content_length = 0
  content_type = None
  content_encoding = None
+ partition_values_set = set()
+ partition_values = None
  if entries:
  content_type = entries[0].meta.content_type
  content_encoding = entries[0].meta.content_encoding
@@ -127,6 +129,12 @@ class Manifest(dict):
  total_record_count += meta.record_count or 0
  total_content_length += meta.content_length or 0
  total_source_content_length += meta.source_content_length or 0
+ if len(partition_values_set) <= 1:
+ partition_values_set.add(entry.meta.partition_values)
+
+ if len(partition_values_set) == 1:
+ partition_values = partition_values_set.pop()
+
  meta = ManifestMeta.of(
  total_record_count,
  total_content_length,
@@ -134,6 +142,7 @@ class Manifest(dict):
  content_encoding,
  total_source_content_length,
  entry_type=entry_type,
+ partition_values=partition_values,
  )
  manifest = Manifest._build_manifest(meta, entries, author, uuid, entry_type)
  return manifest
@@ -185,6 +194,7 @@ class ManifestMeta(dict):
  credentials: Optional[Dict[str, str]] = None,
  content_type_parameters: Optional[List[Dict[str, str]]] = None,
  entry_type: Optional[EntryType] = None,
+ partition_values: Optional[List[str]] = None,
  ) -> ManifestMeta:
  manifest_meta = ManifestMeta()
  if record_count is not None:
@@ -203,6 +213,8 @@ class ManifestMeta(dict):
  manifest_meta["credentials"] = credentials
  if entry_type is not None:
  manifest_meta["entry_type"] = entry_type.value
+ if partition_values is not None:
+ manifest_meta["partition_values"] = partition_values
  return manifest_meta

  @property
@@ -244,6 +256,10 @@ class ManifestMeta(dict):
  return EntryType(self["entry_type"])
  return val

+ @property
+ def partition_values(self) -> Optional[List[str]]:
+ return self.get("partition_values")
+

  class ManifestAuthor(dict):
  @staticmethod
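
The sketch below (not part of the diff) illustrates the new partition_values plumbing: ManifestMeta.of() now accepts an optional partition_values argument and exposes it through the new partition_values property. Positional argument order follows the ManifestMeta.of(...) call in the Manifest hunk above; every concrete value is illustrative.

# Hedged sketch of the 1.1.10 partition_values metadata; all values are made up.
from deltacat.aws.redshift.model.manifest import ManifestMeta

meta = ManifestMeta.of(
    100,                    # record_count
    2048,                   # content_length
    "application/parquet",  # content_type (illustrative string)
    "identity",             # content_encoding (illustrative string)
    4096,                   # source_content_length
    partition_values=["2024-08-01", "us-east-1"],  # hypothetical partition values
)

# The property simply reads the stored key; Manifest.of(...) only propagates a
# value onto the merged meta when every entry shares the same partition_values
# (see the partition_values_set logic above).
assert meta.partition_values == ["2024-08-01", "us-east-1"]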
deltacat/aws/s3u.py CHANGED
@@ -21,7 +21,7 @@ from boto3.resources.base import ServiceResource
  from botocore.client import BaseClient
  from botocore.exceptions import ClientError
  from ray.data.block import Block, BlockAccessor, BlockMetadata
- from ray.data.datasource import BlockWritePathProvider
+ from ray.data.datasource import FilenameProvider
  from ray.types import ObjectRef
  from tenacity import (
  Retrying,
@@ -70,9 +70,6 @@ from deltacat.exceptions import categorize_errors

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

- # TODO(raghumdani): refactor redshift datasource to reuse the
- # same module for writing output files.
-

  class CapturedBlockWritePaths:
  def __init__(self):
@@ -100,12 +97,15 @@ class CapturedBlockWritePaths:
  return self._block_refs


- class UuidBlockWritePathProvider(BlockWritePathProvider):
+ class UuidBlockWritePathProvider(FilenameProvider):
  """Block write path provider implementation that writes each
  dataset block out to a file of the form: {base_path}/{uuid}
  """

- def __init__(self, capture_object: CapturedBlockWritePaths):
+ def __init__(
+ self, capture_object: CapturedBlockWritePaths, base_path: Optional[str] = None
+ ):
+ self.base_path = base_path
  self.write_paths: List[str] = []
  self.block_refs: List[ObjectRef[Block]] = []
  self.capture_object = capture_object
@@ -117,6 +117,19 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
  self.block_refs,
  )

+ def get_filename_for_block(
+ self, block: Any, task_index: int, block_index: int
+ ) -> str:
+ if self.base_path is None:
+ raise ValueError(
+ "Base path must be provided to UuidBlockWritePathProvider",
+ )
+ return self._get_write_path_for_block(
+ base_path=self.base_path,
+ block=block,
+ block_index=block_index,
+ )
+
  def _get_write_path_for_block(
  self,
  base_path: str,
@@ -143,13 +156,6 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
  block_index: Optional[int] = None,
  file_format: Optional[str] = None,
  ) -> str:
- """
- TODO: BlockWritePathProvider is deprecated as of Ray version 2.20.0. Please use FilenameProvider.
- See: https://docs.ray.io/en/master/data/api/doc/ray.data.datasource.FilenameProvider.html
- Also See: https://github.com/ray-project/deltacat/issues/299
-
- Hence, this class only works with Ray version 2.20.0 or lower when used in Ray Dataset.
- """
  return self._get_write_path_for_block(
  base_path,
  filesystem=filesystem,
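
As a quick, hedged illustration of the migration above: the provider is now a FilenameProvider and needs base_path up front, otherwise the Ray-facing get_filename_for_block() entry point raises. The S3 path below is a placeholder, not a value from this release.

# Sketch only; the bucket path is hypothetical.
from deltacat.aws.s3u import CapturedBlockWritePaths, UuidBlockWritePathProvider

capture = CapturedBlockWritePaths()
provider = UuidBlockWritePathProvider(
    capture,
    base_path="s3://example-bucket/compacted",  # now required for Ray Dataset writes
)

# Constructed without base_path, the new Ray entry point refuses to produce a name:
legacy_style = UuidBlockWritePathProvider(capture)
try:
    legacy_style.get_filename_for_block(block=None, task_index=0, block_index=0)
except ValueError:
    pass  # "Base path must be provided to UuidBlockWritePathProvider"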
deltacat/compute/compactor/compaction_session.py CHANGED
@@ -193,6 +193,7 @@ def compact_partition(
  round_completion_file_s3_url = rcf.write_round_completion_file(
  compaction_artifact_s3_bucket,
  new_rcf_partition_locator,
+ partition.locator,
  new_rci,
  **s3_client_kwargs,
  )
@@ -312,7 +313,10 @@ def _execute_compaction_round(
  round_completion_info = None
  if not rebase_source_partition_locator:
  round_completion_info = rcf.read_round_completion_file(
- compaction_artifact_s3_bucket, source_partition_locator, **s3_client_kwargs
+ compaction_artifact_s3_bucket,
+ source_partition_locator,
+ destination_partition_locator,
+ **s3_client_kwargs,
  )
  if not round_completion_info:
  logger.info(
deltacat/compute/compactor/repartition_session.py CHANGED
@@ -177,6 +177,7 @@ def repartition(
  s3_client_kwargs = {}

  return rcf.write_round_completion_file(
+ None,
  None,
  None,
  repartition_completion_info,
deltacat/compute/compactor/utils/round_completion_file.py CHANGED
@@ -12,10 +12,17 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


  def get_round_completion_file_s3_url(
- bucket: str, source_partition_locator: PartitionLocator
+ bucket: str,
+ source_partition_locator: PartitionLocator,
+ destination_partition_locator: Optional[PartitionLocator] = None,
  ) -> str:

  base_url = source_partition_locator.path(f"s3://{bucket}")
+ if destination_partition_locator:
+ base_url = destination_partition_locator.path(
+ f"s3://{bucket}/{source_partition_locator.hexdigest()}"
+ )
+
  return f"{base_url}.json"


@@ -23,20 +30,41 @@ get_round_completion_file_s3_url(
  def read_round_completion_file(
  bucket: str,
  source_partition_locator: PartitionLocator,
+ destination_partition_locator: Optional[PartitionLocator] = None,
  **s3_client_kwargs: Optional[Dict[str, Any]],
  ) -> RoundCompletionInfo:

- round_completion_file_url = get_round_completion_file_s3_url(
+ all_uris = []
+ if destination_partition_locator:
+ round_completion_file_url_with_destination = get_round_completion_file_s3_url(
+ bucket,
+ source_partition_locator,
+ destination_partition_locator,
+ )
+ all_uris.append(round_completion_file_url_with_destination)
+
+ # Note: we read from RCF at two different URI for backward
+ # compatibility reasons.
+ round_completion_file_url_prev = get_round_completion_file_s3_url(
  bucket,
  source_partition_locator,
  )
- logger.info(f"reading round completion file from: {round_completion_file_url}")
+
+ all_uris.append(round_completion_file_url_prev)
+
  round_completion_info = None
- result = s3_utils.download(round_completion_file_url, False, **s3_client_kwargs)
- if result:
- json_str = result["Body"].read().decode("utf-8")
- round_completion_info = RoundCompletionInfo(json.loads(json_str))
- logger.info(f"read round completion info: {round_completion_info}")
+
+ for rcf_uri in all_uris:
+ logger.info(f"Reading round completion file from: {rcf_uri}")
+ result = s3_utils.download(rcf_uri, False, **s3_client_kwargs)
+ if result:
+ json_str = result["Body"].read().decode("utf-8")
+ round_completion_info = RoundCompletionInfo(json.loads(json_str))
+ logger.info(f"Read round completion info: {round_completion_info}")
+ break
+ else:
+ logger.warn(f"Round completion file not present at {rcf_uri}")
+
  return round_completion_info


@@ -44,8 +72,9 @@ def read_round_completion_file(
  def write_round_completion_file(
  bucket: Optional[str],
  source_partition_locator: Optional[PartitionLocator],
+ destination_partition_locator: Optional[PartitionLocator],
  round_completion_info: RoundCompletionInfo,
- completion_file_s3_url: str = None,
+ completion_file_s3_url: Optional[str] = None,
  **s3_client_kwargs: Optional[Dict[str, Any]],
  ) -> str:
  if bucket is None and completion_file_s3_url is None:
@@ -56,6 +85,7 @@ def write_round_completion_file(
  completion_file_s3_url = get_round_completion_file_s3_url(
  bucket,
  source_partition_locator,
+ destination_partition_locator,
  )
  logger.info(f"writing round completion file to: {completion_file_s3_url}")
  s3_utils.upload(
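
To make the new lookup order explicit, here is a hedged sketch that restates the logic of the hunks above with a made-up helper name: the destination-scoped URI introduced in 1.1.10 is tried first, and the legacy source-only URI is kept as a backward-compatible fallback. It only uses the PartitionLocator methods the real code calls (path() and hexdigest()).

from typing import List


# Hypothetical helper; mirrors the URI order used by read_round_completion_file().
def candidate_rcf_uris(bucket, source_locator, destination_locator=None) -> List[str]:
    uris = []
    if destination_locator:
        # New layout: s3://{bucket}/{source hexdigest}/{destination locator path}.json
        uris.append(
            destination_locator.path(f"s3://{bucket}/{source_locator.hexdigest()}")
            + ".json"
        )
    # Legacy (pre-1.1.10) layout: s3://{bucket}/{source locator path}.json
    uris.append(source_locator.path(f"s3://{bucket}") + ".json")
    return uris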
deltacat/compute/compactor_v2/compaction_session.py CHANGED
@@ -24,7 +24,7 @@ from deltacat.compute.compactor import (
  )
  from deltacat.compute.compactor_v2.model.merge_result import MergeResult
  from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
- from deltacat.compute.compactor_v2.model.compaction_session import (
+ from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
  ExecutionCompactionResult,
  )
  from deltacat.compute.compactor.model.materialize_result import MaterializeResult
@@ -78,6 +78,7 @@ from deltacat.compute.compactor_v2.utils.task_options import (
  )
  from deltacat.compute.compactor.model.compactor_version import CompactorVersion
  from deltacat.exceptions import categorize_errors
+ from deltacat.compute.compactor_v2.constants import COMPACT_PARTITION_METRIC_PREFIX

  if importlib.util.find_spec("memray"):
  import memray
@@ -86,7 +87,7 @@ if importlib.util.find_spec("memray"):
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


- @metrics
+ @metrics(prefix=COMPACT_PARTITION_METRIC_PREFIX)
  @categorize_errors
  def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]:
  assert (
@@ -109,7 +110,6 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
  f"Partition-{params.source_partition_locator} -> "
  f"{compaction_session_type} Compaction session data processing completed"
  )
- round_completion_file_s3_url: Optional[str] = None
  if execute_compaction_result.new_compacted_partition:
  previous_partition: Optional[Partition] = None
  if execute_compaction_result.is_inplace_compacted:
@@ -131,19 +131,13 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
  **params.deltacat_storage_kwargs,
  )
  logger.info(f"Committed compacted partition: {committed_partition}")
- round_completion_file_s3_url = rcf.write_round_completion_file(
- params.compaction_artifact_s3_bucket,
- execute_compaction_result.new_round_completion_file_partition_locator,
- execute_compaction_result.new_round_completion_info,
- **params.s3_client_kwargs,
- )
  else:
  logger.warning("No new partition was committed during compaction.")

  logger.info(
  f"Completed compaction session for: {params.source_partition_locator}"
  )
- return round_completion_file_s3_url
+ return execute_compaction_result.round_completion_file_s3_url


  def _execute_compaction(
@@ -188,6 +182,7 @@ def _execute_compaction(
  round_completion_info = rcf.read_round_completion_file(
  params.compaction_artifact_s3_bucket,
  params.source_partition_locator,
+ params.destination_partition_locator,
  **params.s3_client_kwargs,
  )
  if not round_completion_info:
@@ -684,9 +679,18 @@ def _execute_compaction(
  f"and rcf source partition_id of {rcf_source_partition_locator.partition_id}."
  )
  rcf_source_partition_locator = compacted_partition.locator
+
+ round_completion_file_s3_url = rcf.write_round_completion_file(
+ params.compaction_artifact_s3_bucket,
+ rcf_source_partition_locator,
+ compacted_partition.locator,
+ new_round_completion_info,
+ **params.s3_client_kwargs,
+ )
+
  return ExecutionCompactionResult(
  compacted_partition,
  new_round_completion_info,
- rcf_source_partition_locator,
+ round_completion_file_s3_url,
  is_inplace_compacted,
  )
deltacat/compute/compactor_v2/constants.py CHANGED
@@ -68,3 +68,6 @@ DISCOVER_DELTAS_METRIC_PREFIX = "discover_deltas"

  # Metric prefix for prepare deletes
  PREPARE_DELETES_METRIC_PREFIX = "prepare_deletes"
+
+ # Metric prefix for compact partition method
+ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} RENAMED
@@ -2,7 +2,6 @@ from dataclasses import dataclass, fields

  from deltacat.storage import (
  Partition,
- PartitionLocator,
  )
  from deltacat.compute.compactor import (
  RoundCompletionInfo,
@@ -14,7 +13,7 @@ from typing import Optional
  class ExecutionCompactionResult:
  new_compacted_partition: Optional[Partition]
  new_round_completion_info: Optional[RoundCompletionInfo]
- new_round_completion_file_partition_locator: Optional[PartitionLocator]
+ round_completion_file_s3_url: Optional[str]
  is_inplace_compacted: bool

  def __iter__(self):
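
A hedged sketch of the caller-side effect of this rename and field change: the round completion file is now written inside _execute_compaction, so callers read its URL from the result instead of writing the file themselves.

from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
    ExecutionCompactionResult,
)


def log_rcf_location(result: ExecutionCompactionResult) -> None:
    # Pre-1.1.10 callers read result.new_round_completion_file_partition_locator
    # and wrote the RCF themselves; that field no longer exists.
    if result.new_compacted_partition:
        # The RCF has already been written; only its S3 URL is surfaced here.
        print(f"Round completion file: {result.round_completion_file_s3_url}")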
deltacat/io/dataset.py CHANGED
@@ -6,9 +6,6 @@ from typing import Any, Callable, Dict, Optional, TypeVar, Union, cast
  import pyarrow as pa
  import s3fs
  from ray.data import Dataset
- from ray.data.datasource import BlockWritePathProvider, DefaultBlockWritePathProvider
-
- from deltacat.io.aws.redshift.redshift_datasource import RedshiftDatasource

  T = TypeVar("T")

@@ -27,7 +24,6 @@ class DeltacatDataset(Dataset[T]):
  filesystem: Optional[Union[pa.fs.FileSystem, s3fs.S3FileSystem]] = None,
  try_create_dir: bool = True,
  arrow_open_stream_args: Optional[Dict[str, Any]] = None,
- block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
  arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
  **arrow_parquet_args,
  ) -> None:
@@ -59,9 +55,8 @@ class DeltacatDataset(Dataset[T]):
  if True. Does nothing if all directories already exist.
  arrow_open_stream_args: kwargs passed to
  pyarrow.fs.FileSystem.open_output_stream
- block_path_provider: BlockWritePathProvider implementation
- to write each dataset block to a custom output path. Uses
- DefaultBlockWritePathProvider if None.
+ filename_provider: FilenameProvider implementation
+ to write each dataset block to a custom output path.
  arrow_parquet_args_fn: Callable that returns a dictionary of write
  arguments to use when writing each block to a file. Overrides
  any duplicate keys from arrow_parquet_args. This should be used
@@ -72,14 +67,7 @@ class DeltacatDataset(Dataset[T]):
  pyarrow.parquet.write_table(), which is used to write out each
  block to a file.
  """
- self.write_datasource(
- RedshiftDatasource(),
- path=path,
- dataset_uuid=self._uuid,
- filesystem=filesystem,
- try_create_dir=try_create_dir,
- open_stream_args=arrow_open_stream_args,
- block_path_provider=block_path_provider,
- write_args_fn=arrow_parquet_args_fn,
- **arrow_parquet_args,
+ raise NotImplementedError(
+ "Writing to Redshift is not yet supported. "
+ "Please use DeltacatDataset.write_parquet() instead."
  )
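
A short, hedged sketch of the fallback the new error message points to: DeltacatDataset still inherits Ray Dataset's write_parquet(), so callers that previously relied on write_redshift() can write Parquet output directly. The path below is a placeholder.

def write_output(ds, path: str = "s3://example-bucket/output/") -> None:
    # `ds` is assumed to be an existing DeltacatDataset instance.
    # write_redshift(...) now raises NotImplementedError (see the hunk above);
    # write_parquet() is inherited from ray.data.Dataset.
    ds.write_parquet(path)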
deltacat/storage/__init__.py CHANGED
@@ -14,6 +14,20 @@ from deltacat.storage.model.stream import Stream, StreamLocator
  from deltacat.storage.model.table import Table, TableLocator
  from deltacat.storage.model.table_version import TableVersion, TableVersionLocator
  from deltacat.storage.model.delete_parameters import DeleteParameters
+ from deltacat.storage.model.partition_spec import (
+ PartitionFilter,
+ PartitionValues,
+ DeltaPartitionSpec,
+ StreamPartitionSpec,
+ )
+ from deltacat.storage.model.transform import (
+ Transform,
+ TransformName,
+ TransformParameters,
+ BucketingStrategy,
+ BucketTransformParameters,
+ IdentityTransformParameters,
+ )

  from deltacat.storage.model.types import (
  CommitState,
@@ -56,4 +70,14 @@ __all__ = [
  "TableVersionLocator",
  "SortKey",
  "SortOrder",
+ "PartitionFilter",
+ "PartitionValues",
+ "DeltaPartitionSpec",
+ "StreamPartitionSpec",
+ "Transform",
+ "TransformName",
+ "TransformParameters",
+ "BucketingStrategy",
+ "BucketTransformParameters",
+ "IdentityTransformParameters",
  ]
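
Because these names are now re-exported at package level, downstream code can import the partition-spec and transform models directly from deltacat.storage, for example:

# Only exercises the re-exports added to __all__ above; how these classes are
# constructed is defined in partition_spec.py and transform.py, which this
# diff does not show.
from deltacat.storage import (
    PartitionFilter,
    PartitionValues,
    DeltaPartitionSpec,
    StreamPartitionSpec,
    Transform,
    TransformName,
    TransformParameters,
    BucketingStrategy,
    BucketTransformParameters,
    IdentityTransformParameters,
)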
deltacat/storage/interface.py CHANGED
@@ -23,6 +23,10 @@ from deltacat.storage import (
  TableVersion,
  SortKey,
  PartitionLocator,
+ PartitionFilter,
+ PartitionValues,
+ DeltaPartitionSpec,
+ StreamPartitionSpec,
  )
  from deltacat.types.media import (
  ContentType,
@@ -86,12 +90,13 @@ def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partit
  def list_deltas(
  namespace: str,
  table_name: str,
- partition_values: Optional[List[Any]] = None,
+ partition_values: Optional[PartitionValues] = None,
  table_version: Optional[str] = None,
  first_stream_position: Optional[int] = None,
  last_stream_position: Optional[int] = None,
  ascending_order: Optional[bool] = None,
  include_manifest: bool = False,
+ partition_filter: Optional[PartitionFilter] = None,
  *args,
  **kwargs
  ) -> ListResult[Delta]:
@@ -107,6 +112,9 @@
  To conserve memory, the deltas returned do not include manifests by
  default. The manifests can either be optionally retrieved as part of this
  call or lazily loaded via subsequent calls to `get_delta_manifest`.
+
+ Note: partition_values is deprecated and will be removed in future releases.
+ Use partition_filter instead.
  """
  raise NotImplementedError("list_deltas not implemented")
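
A hedged sketch of the migration the note above asks for, assuming `storage` is any module or object implementing deltacat.storage.interface and that a PartitionFilter instance has been built elsewhere (its construction API lives in partition_spec.py, which this diff does not show):

from typing import Any, Optional

from deltacat.storage import PartitionFilter


def list_table_deltas(storage: Any, partition_filter: Optional[PartitionFilter] = None):
    # Pass partition_filter instead of the deprecated partition_values.
    return storage.list_deltas(
        namespace="my_namespace",  # placeholder
        table_name="my_table",     # placeholder
        partition_filter=partition_filter,
        include_manifest=False,
    )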
@@ -134,9 +142,10 @@
  namespace: str,
  table_name: str,
  stream_position: int,
- partition_values: Optional[List[Any]] = None,
+ partition_values: Optional[PartitionValues] = None,
  table_version: Optional[str] = None,
  include_manifest: bool = False,
+ partition_filter: Optional[PartitionFilter] = None,
  *args,
  **kwargs
  ) -> Optional[Delta]:
@@ -149,6 +158,9 @@ def get_delta(
  To conserve memory, the delta returned does not include a manifest by
  default. The manifest can either be optionally retrieved as part of this
  call or lazily loaded via a subsequent call to `get_delta_manifest`.
+
+ Note: partition_values is deprecated and will be removed in future releases.
+ Use partition_filter instead.
  """
  raise NotImplementedError("get_delta not implemented")

@@ -156,9 +168,10 @@ def get_latest_delta(
  def get_latest_delta(
  namespace: str,
  table_name: str,
- partition_values: Optional[List[Any]] = None,
+ partition_values: Optional[PartitionValues] = None,
  table_version: Optional[str] = None,
  include_manifest: bool = False,
+ partition_filter: Optional[PartitionFilter] = None,
  *args,
  **kwargs
  ) -> Optional[Delta]:
@@ -172,6 +185,9 @@
  To conserve memory, the delta returned does not include a manifest by
  default. The manifest can either be optionally retrieved as part of this
  call or lazily loaded via a subsequent call to `get_delta_manifest`.
+
+ Note: partition_values is deprecated and will be removed in future releases.
+ Use partition_filter instead.
  """
  raise NotImplementedError("get_latest_delta not implemented")

@@ -185,6 +201,7 @@ def download_delta(
  file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
  ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
  distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
+ partition_filter: Optional[PartitionFilter] = None,
  *args,
  **kwargs
  ) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
@@ -194,6 +211,10 @@
  across this Ray cluster's object store memory. Ordered table N of a local
  table list, or ordered block N of a distributed dataset, always contain
  the contents of ordered delta manifest entry N.
+
+ partition_filter is an optional parameter which determines which files to
+ download from the delta manifest. A delta manifest contains all the data files
+ for a given delta.
  """
  raise NotImplementedError("download_delta not implemented")

@@ -268,6 +289,7 @@ def create_table_version(
  table_description: Optional[str] = None,
  table_properties: Optional[Dict[str, str]] = None,
  supported_content_types: Optional[List[ContentType]] = None,
+ partition_spec: Optional[StreamPartitionSpec] = None,
  *args,
  **kwargs
  ) -> Stream:
@@ -300,6 +322,8 @@

  Validate: Raise an error for any fields that don't fit the schema. An
  explicit subset of column names to validate may optionally be specified.
+
+ Either partition_keys or partition_spec must be specified but not both.
  """
  raise NotImplementedError("create_table_version not implemented")

@@ -402,7 +426,7 @@ def get_stream(


  def stage_partition(
- stream: Stream, partition_values: Optional[List[Any]] = None, *args, **kwargs
+ stream: Stream, partition_values: Optional[PartitionValues] = None, *args, **kwargs
  ) -> Partition:
  """
  Stages a new partition for the given stream and partition values. Returns
@@ -410,6 +434,9 @@
  with the same partition values, then it will have its previous partition ID
  set to the ID of the partition being replaced. Partition keys should not be
  specified for unpartitioned tables.
+
+ The partition_values must represents the results of transforms in a partition
+ spec specified in the stream.
  """
  raise NotImplementedError("stage_partition not implemented")

@@ -439,7 +466,7 @@ def delete_partition(
  namespace: str,
  table_name: str,
  table_version: Optional[str] = None,
- partition_values: Optional[List[Any]] = None,
+ partition_values: Optional[PartitionValues] = None,
  *args,
  **kwargs
  ) -> None:
@@ -454,7 +481,7 @@

  def get_partition(
  stream_locator: StreamLocator,
- partition_values: Optional[List[Any]] = None,
+ partition_values: Optional[PartitionValues] = None,
  *args,
  **kwargs
  ) -> Optional[Partition]:
@@ -477,6 +504,8 @@ def stage_delta(
  s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
  content_type: ContentType = ContentType.PARQUET,
  delete_parameters: Optional[DeleteParameters] = None,
+ partition_spec: Optional[DeltaPartitionSpec] = None,
+ partition_values: Optional[PartitionValues] = None,
  *args,
  **kwargs
  ) -> Delta:
@@ -484,6 +513,13 @@
  Writes the given table to 1 or more S3 files. Returns an unregistered
  delta whose manifest entries point to the uploaded files. Applies any
  schema consistency policies configured for the parent table version.
+
+ The partition spec will be used to split the input table into
+ multiple files. Optionally, partition_values can be provided to avoid
+ this method to recompute partition_values from the provided data.
+
+ Raises an error if the provided data does not conform to a unique ordered
+ list of partition_values
  """
  raise NotImplementedError("stage_delta not implemented")