deltacat-0.1.18b13-py3-none-any.whl → deltacat-0.1.18b15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. deltacat/__init__.py +3 -2
  2. deltacat/aws/clients.py +123 -3
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
  6. deltacat/benchmarking/conftest.py +61 -0
  7. deltacat/catalog/delegate.py +1 -1
  8. deltacat/catalog/interface.py +1 -1
  9. deltacat/compute/compactor/__init__.py +0 -3
  10. deltacat/compute/compactor/compaction_session.py +45 -20
  11. deltacat/compute/compactor/model/compact_partition_params.py +287 -58
  12. deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
  13. deltacat/compute/compactor/model/delta_annotated.py +91 -9
  14. deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
  15. deltacat/compute/compactor/model/primary_key_index.py +1 -1
  16. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  17. deltacat/compute/compactor/repartition_session.py +5 -3
  18. deltacat/compute/compactor/steps/dedupe.py +10 -8
  19. deltacat/compute/compactor/steps/hash_bucket.py +25 -4
  20. deltacat/compute/compactor/steps/materialize.py +11 -6
  21. deltacat/compute/compactor/steps/repartition.py +16 -1
  22. deltacat/compute/compactor/utils/io.py +40 -23
  23. deltacat/compute/compactor/utils/primary_key_index.py +1 -15
  24. deltacat/compute/compactor/utils/sort_key.py +57 -0
  25. deltacat/compute/compactor/utils/system_columns.py +43 -0
  26. deltacat/compute/compactor_v2/compaction_session.py +506 -0
  27. deltacat/compute/compactor_v2/constants.py +34 -0
  28. deltacat/compute/compactor_v2/model/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  30. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  31. deltacat/compute/compactor_v2/model/merge_input.py +127 -0
  32. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  33. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  34. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  35. deltacat/compute/compactor_v2/steps/merge.py +41 -0
  36. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  37. deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
  38. deltacat/compute/compactor_v2/utils/io.py +149 -0
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
  40. deltacat/compute/compactor_v2/utils/task_options.py +228 -0
  41. deltacat/compute/metastats/meta_stats.py +4 -2
  42. deltacat/compute/metastats/stats.py +1 -0
  43. deltacat/compute/metastats/utils/io.py +4 -0
  44. deltacat/compute/stats/utils/io.py +20 -5
  45. deltacat/exceptions.py +4 -0
  46. deltacat/io/memcached_object_store.py +37 -14
  47. deltacat/logs.py +4 -3
  48. deltacat/storage/__init__.py +3 -0
  49. deltacat/storage/interface.py +11 -2
  50. deltacat/storage/model/sort_key.py +33 -0
  51. deltacat/storage/model/table_version.py +11 -0
  52. deltacat/storage/model/types.py +2 -1
  53. deltacat/tests/aws/__init__.py +0 -0
  54. deltacat/tests/aws/test_clients.py +80 -0
  55. deltacat/tests/compute/__init__.py +0 -0
  56. deltacat/tests/compute/common.py +96 -0
  57. deltacat/tests/compute/compactor/__init__.py +0 -0
  58. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  59. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
  60. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  61. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  62. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  63. deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  64. deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
  65. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  66. deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
  67. deltacat/tests/compute/testcases.py +390 -0
  68. deltacat/tests/io/test_memcached_object_store.py +5 -4
  69. deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
  70. deltacat/tests/test_utils/pyarrow.py +32 -0
  71. deltacat/tests/test_utils/utils.py +13 -0
  72. deltacat/tests/utils/data/__init__.py +0 -0
  73. deltacat/tests/utils/test_daft.py +76 -0
  74. deltacat/tests/utils/test_pyarrow.py +133 -0
  75. deltacat/tests/utils/test_resources.py +23 -20
  76. deltacat/types/media.py +1 -0
  77. deltacat/types/partial_download.py +82 -0
  78. deltacat/types/tables.py +1 -0
  79. deltacat/utils/arguments.py +26 -0
  80. deltacat/utils/daft.py +87 -0
  81. deltacat/utils/performance.py +4 -2
  82. deltacat/utils/placement.py +20 -3
  83. deltacat/utils/pyarrow.py +213 -1
  84. deltacat/utils/ray_utils/concurrency.py +26 -1
  85. deltacat/utils/resources.py +72 -1
  86. deltacat/utils/s3fs.py +21 -0
  87. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
  88. deltacat-0.1.18b15.dist-info/RECORD +176 -0
  89. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
  90. deltacat/compute/compactor/model/sort_key.py +0 -98
  91. deltacat-0.1.18b13.dist-info/RECORD +0 -136
  92. /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
  93. /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
  94. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
  95. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0

deltacat/compute/compactor/model/delta_annotated.py

@@ -2,7 +2,9 @@
 from __future__ import annotations
 
 import logging
-from types import FunctionType
+import copy
+from deltacat.types.media import ContentType, ContentEncoding
+from deltacat.types.partial_download import PartialParquetParameters
 from typing import Callable, List, Optional, Union
 
 from deltacat import logs
@@ -64,7 +66,9 @@ class DeltaAnnotated(Delta):
         annotated_deltas: List[DeltaAnnotated],
         min_delta_bytes: float,
         min_file_counts: Optional[Union[int, float]] = float("inf"),
-        estimation_function: Optional[Callable] = None,
+        estimation_function: Optional[
+            Callable[[ManifestEntry], float]
+        ] = lambda entry: entry.meta.content_length,
     ) -> List[DeltaAnnotated]:
         """
         Simple greedy algorithm to split/merge 1 or more annotated deltas into
@@ -76,11 +80,16 @@ class DeltaAnnotated(Delta):
         of bytes at rest for the associated object. Returns the list of annotated
         delta groups.
         """
-        groups = []
+        split_annotated_deltas: List[DeltaAnnotated] = []
+        groups: List[DeltaAnnotated] = []
         new_da = DeltaAnnotated()
         new_da_bytes = 0
         da_group_entry_count = 0
-        for src_da in annotated_deltas:
+
+        for delta_annotated in annotated_deltas:
+            split_annotated_deltas.extend(DeltaAnnotated._split_single(delta_annotated))
+
+        for src_da in split_annotated_deltas:
             src_da_annotations = src_da.annotations
             src_da_entries = src_da.manifest.entries
             assert (
@@ -105,11 +114,7 @@
                     src_da, new_da, src_entry, src_da_annotations[i]
                 )
                 # TODO: Fetch s3_obj["Size"] if entry content length undefined?
-                estimated_new_da_bytes = (
-                    estimation_function(src_entry.meta.content_length)
-                    if type(estimation_function) is FunctionType
-                    else src_entry.meta.content_length
-                )
+                estimated_new_da_bytes = estimation_function(src_entry)
                 new_da_bytes += estimated_new_da_bytes
                 da_group_entry_count += 1
                 if (
@@ -132,6 +137,7 @@
                     da_group_entry_count = 0
         if new_da:
             groups.append(new_da)
+
         return groups
 
     @staticmethod
@@ -207,3 +213,79 @@
             dst_da.type = None
         entries.append(src_entry)
         dst_da.annotations.append(src_annotation)
+
+    @staticmethod
+    def _split_single(delta_annotated: DeltaAnnotated) -> List[DeltaAnnotated]:
+        """
+        Split a single delta annotated into multiple granular
+        annotated entries. Note that split is not always guaranteed.
+
+        Note: Currently we are only able to split the Parquet File downloads.
+        """
+
+        result = []
+
+        if (
+            delta_annotated.meta
+            and delta_annotated.manifest
+            and delta_annotated.meta.content_type == ContentType.PARQUET
+            and delta_annotated.meta.content_encoding == ContentEncoding.IDENTITY
+        ):
+            # we split by row groups
+            for entry_index, entry in enumerate(delta_annotated.manifest.entries):
+                input_split_params = None
+                if entry.meta and entry.meta.content_type_parameters:
+                    for type_params in entry.meta.content_type_parameters:
+                        if (
+                            isinstance(type_params, PartialParquetParameters)
+                            and type_params.num_row_groups > 1
+                            and type_params.pq_metadata
+                        ):
+                            input_split_params = type_params
+                            break
+
+                if input_split_params:
+                    logger.info(
+                        f"Splitting input file with URI: {entry.uri} into "
+                        f"different {input_split_params.num_row_groups} entries"
+                    )
+
+                    for rg in input_split_params.row_groups_to_download:
+                        new_da = DeltaAnnotated()
+                        new_entry_dict = copy.deepcopy(entry)
+                        new_entry = ManifestEntry(new_entry_dict)
+
+                        row_group_meta = input_split_params.pq_metadata.row_group(rg)
+
+                        new_partial_params = PartialParquetParameters.of(
+                            row_groups_to_download=[rg],
+                            num_row_groups=1,
+                            num_rows=row_group_meta.num_rows,
+                            in_memory_size_bytes=row_group_meta.total_byte_size,
+                            pq_metadata=input_split_params.pq_metadata,
+                        )
+
+                        new_entry.meta.content_type_parameters = [new_partial_params]
+                        for type_params in entry.meta.content_type_parameters:
+                            if not isinstance(type_params, PartialParquetParameters):
+                                new_entry.meta.content_type_parameters.append(
+                                    type_params
+                                )
+
+                        DeltaAnnotated._append_annotated_entry(
+                            delta_annotated,
+                            new_da,
+                            new_entry,
+                            delta_annotated.annotations[entry_index],
+                        )
+
+                        result.append(new_da)
+
+        if result:
+            return result
+        else:
+            logger.info(
+                f"Split was not performed on the delta with locator: {delta_annotated.locator}"
+            )
+
+        return [delta_annotated]
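
The rebatch change above replaces the old FunctionType check with an estimation_function that is called on each manifest entry (defaulting to the entry's content_length), and _split_single pre-splits Parquet entries by row group before grouping. As a rough, self-contained sketch of that greedy grouping pattern (FakeEntry and greedy_rebatch below are illustrative stand-ins, not DeltaCat types):

    from dataclasses import dataclass
    from typing import Callable, List

    @dataclass
    class FakeEntry:
        # Stand-in for a manifest entry; only the size matters for this sketch.
        content_length: int

    def greedy_rebatch(
        entries: List[FakeEntry],
        min_batch_bytes: float,
        estimate: Callable[[FakeEntry], float] = lambda e: e.content_length,
    ) -> List[List[FakeEntry]]:
        """Group entries until each group's estimated size reaches min_batch_bytes."""
        groups, current, current_bytes = [], [], 0.0
        for entry in entries:
            current.append(entry)
            current_bytes += estimate(entry)
            if current_bytes >= min_batch_bytes:
                groups.append(current)
                current, current_bytes = [], 0.0
        if current:
            groups.append(current)
        return groups

    # e.g. assume on-disk Parquet expands roughly 3x when decoded in memory
    batches = greedy_rebatch(
        [FakeEntry(100), FakeEntry(400), FakeEntry(250)],
        min_batch_bytes=1000,
        estimate=lambda e: e.content_length * 3,
    )

Passing a custom estimate mirrors how a caller could supply an in-memory size estimate instead of the raw byte count at rest.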

deltacat/compute/compactor/model/delta_file_envelope.py

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import numpy as np
+import pyarrow as pa
 
 from deltacat.storage import DeltaType, LocalTable
 
@@ -14,9 +15,9 @@ class DeltaFileEnvelope(dict):
     @staticmethod
     def of(
         stream_position: int,
-        file_index: int,
         delta_type: DeltaType,
         table: LocalTable,
+        file_index: int = None,
         is_src_delta: np.bool_ = True,
         file_record_count: Optional[int] = None,
     ) -> DeltaFileEnvelope:
@@ -37,8 +38,6 @@
         """
         if stream_position is None:
             raise ValueError("Missing delta file envelope stream position.")
-        if file_index is None:
-            raise ValueError("Missing delta file envelope file index.")
         if delta_type is None:
             raise ValueError("Missing Delta file envelope delta type.")
         if table is None:
@@ -75,3 +74,16 @@
     @property
     def file_record_count(self) -> int:
         return self["file_record_count"]
+
+    @property
+    def table_size_bytes(self) -> int:
+        if isinstance(self.table, pa.Table):
+            return self.table.nbytes
+        else:
+            raise ValueError(
+                f"Table type: {type(self.table)} not for supported for size method."
+            )
+
+    @property
+    def table_num_rows(self) -> int:
+        return len(self.table)
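
The new table_size_bytes property only supports PyArrow tables and defers to pa.Table.nbytes, while table_num_rows is simply len(table). A quick standalone illustration of what those PyArrow calls report:

    import pyarrow as pa

    table = pa.table({"pk": ["a", "b", "c"], "value": [1, 2, 3]})
    print(table.nbytes)  # total bytes held by the table's buffers
    print(len(table))    # row count, i.e. what table_num_rows returns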

deltacat/compute/compactor/model/primary_key_index.py

@@ -4,7 +4,7 @@ from __future__ import annotations
 from typing import Any, Dict, List
 from uuid import uuid4
 
-from deltacat.compute.compactor.model.sort_key import SortKey
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.storage import Locator, PartitionLocator
 from deltacat.utils.common import sha1_hexdigest
 

deltacat/compute/compactor/model/round_completion_info.py

@@ -1,6 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+from typing import Tuple
 from deltacat.storage import DeltaLocator, PartitionLocator
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -40,9 +41,11 @@ class RoundCompletionInfo(dict):
         compacted_delta_locator: DeltaLocator,
         compacted_pyarrow_write_result: PyArrowWriteResult,
         sort_keys_bit_width: int,
-        rebase_source_partition_locator: Optional[PartitionLocator],
+        rebase_source_partition_locator: Optional[PartitionLocator] = None,
         manifest_entry_copied_by_reference_ratio: Optional[float] = None,
         compaction_audit_url: Optional[str] = None,
+        hash_bucket_count: Optional[int] = None,
+        hb_index_to_entry_range: Optional[Dict[int, Tuple[int, int]]] = None,
     ) -> RoundCompletionInfo:
 
         rci = RoundCompletionInfo()
@@ -55,6 +58,8 @@
             "manifestEntryCopiedByReferenceRatio"
         ] = manifest_entry_copied_by_reference_ratio
         rci["compactionAuditUrl"] = compaction_audit_url
+        rci["hashBucketCount"] = hash_bucket_count
+        rci["hbIndexToEntryRange"] = hb_index_to_entry_range
         return rci
 
     @property
@@ -97,3 +102,14 @@
     @property
     def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
         return self["manifestEntryCopiedByReferenceRatio"]
+
+    @property
+    def hash_bucket_count(self) -> Optional[int]:
+        return self["hashBucketCount"]
+
+    @property
+    def hb_index_to_entry_range(self) -> Optional[Dict[int, Tuple[int, int]]]:
+        """
+        The start index is inclusive and end index is exclusive by default.
+        """
+        return self["hbIndexToEntryRange"]
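
Per the docstring, hb_index_to_entry_range maps a hash bucket index to a start-inclusive, end-exclusive index range, i.e. ordinary Python slice semantics. A hypothetical illustration (the mapping and entry list below are invented, not read from a real round completion file):

    # Hypothetical: hash bucket index -> [start, end) range of manifest entry indices.
    hb_index_to_entry_range = {0: (0, 3), 1: (3, 5)}
    manifest_entries = ["e0", "e1", "e2", "e3", "e4"]

    for hb_index, (start, end) in hb_index_to_entry_range.items():
        # start is inclusive, end is exclusive -- plain slice semantics.
        print(hb_index, manifest_entries[start:end])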

deltacat/compute/compactor/repartition_session.py

@@ -7,8 +7,8 @@ import functools
 import itertools
 from deltacat.compute.compactor import (
     RoundCompletionInfo,
-    SortKey,
 )
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.types.media import ContentType
 from deltacat.compute.compactor import DeltaAnnotated
 from deltacat.utils.ray_utils.concurrency import (
@@ -31,6 +31,7 @@ from deltacat.storage import (
     interface as unimplemented_deltacat_storage,
 )
 from deltacat.utils.metrics import MetricsConfig
+from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -90,7 +91,7 @@ def repartition(
             source_partition_locator.partition_values,
         ).stream_position,
         deltacat_storage,
-        **list_deltas_kwargs,
+        list_deltas_kwargs,
     )
 
     uniform_deltas = []
@@ -157,10 +158,11 @@
         new_compacted_partition_locator,
         compacted_delta.stream_position,
     )
-    bit_width_of_sort_keys = SortKey.validate_sort_keys(
+    bit_width_of_sort_keys = validate_sort_keys(
         source_partition_locator,
         sort_keys,
         deltacat_storage,
+        deltacat_storage_kwargs={},
    )
    repartition_completion_info = RoundCompletionInfo.of(
        last_stream_position_to_compact,
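
Note the switch from **list_deltas_kwargs to list_deltas_kwargs above: the kwargs dict is now passed as a single argument rather than unpacked into keyword arguments, which presumably matches a callee that accepts the dict explicitly. The difference in plain Python:

    def takes_dict(opts: dict):
        return opts.get("limit")

    def takes_kwargs(**opts):
        return opts.get("limit")

    list_deltas_kwargs = {"limit": 10}
    takes_dict(list_deltas_kwargs)      # pass the dict itself as one argument
    takes_kwargs(**list_deltas_kwargs)  # unpack it into keyword arguments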

deltacat/compute/compactor/steps/dedupe.py

@@ -12,11 +12,10 @@ import ray
 
 from deltacat import logs
 from deltacat.compute.compactor import (
-    SortKey,
-    SortOrder,
     DeltaFileEnvelope,
     DeltaFileLocator,
 )
+from deltacat.storage.model.sort_key import SortKey, SortOrder
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.utils import system_columns as sc
 from deltacat.utils.ray_utils.runtime import (
@@ -108,20 +107,21 @@ def _timed_dedupe(
     dedupe_task_index: int,
     enable_profiler: bool,
     object_store: Optional[IObjectStore],
+    **kwargs,
 ):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
         f"dedupe_{worker_id}_{task_id}.bin"
     ) if enable_profiler else nullcontext():
-        # TODO (pdames): mitigate risk of running out of memory here in cases of
-        # severe skew of primary key updates in deltas
+        # TODO (pdames): mitigate risk of running out of memory here in cases of severe skew of primary key updates in deltas
         logger.info(
             f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
             f"groups for {len(object_ids)} object refs..."
         )
-
-        delta_file_envelope_groups_list = object_store.get_many(object_ids)
+        delta_file_envelope_groups_list: List[object] = object_store.get_many(
+            object_ids
+        )
         hb_index_to_delta_file_envelopes_list = defaultdict(list)
         for delta_file_envelope_groups in delta_file_envelope_groups_list:
             for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -172,7 +172,8 @@
 
             hb_table_record_count = len(table)
             table, drop_time = timed_invocation(
-                func=_drop_duplicates_by_primary_key_hash, table=table
+                func=_drop_duplicates_by_primary_key_hash,
+                table=table,
             )
             deduped_record_count = hb_table_record_count - len(table)
             total_deduped_records += deduped_record_count
@@ -228,7 +229,6 @@
         )
 
         peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
-
         return DedupeResult(
             mat_bucket_to_dd_idx_obj_id,
             np.int64(total_deduped_records),
@@ -247,6 +247,7 @@ def dedupe(
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     object_store: Optional[IObjectStore],
+    **kwargs,
 ) -> DedupeResult:
     logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     dedupe_result, duration = timed_invocation(
@@ -257,6 +258,7 @@
         dedupe_task_index=dedupe_task_index,
         enable_profiler=enable_profiler,
         object_store=object_store,
+        **kwargs,
     )
 
     emit_metrics_time = 0.0

deltacat/compute/compactor/steps/hash_bucket.py

@@ -3,7 +3,7 @@ import logging
 import time
 from contextlib import nullcontext
 from itertools import chain
-from typing import Generator, List, Optional, Tuple
+from typing import Any, Dict, Generator, List, Optional, Tuple
 import numpy as np
 import pyarrow as pa
 import ray
@@ -11,9 +11,9 @@ from deltacat import logs
 from deltacat.compute.compactor import (
     DeltaAnnotated,
     DeltaFileEnvelope,
-    SortKey,
     RoundCompletionInfo,
 )
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
 from deltacat.compute.compactor.utils import system_columns as sc
@@ -91,7 +91,11 @@ def _group_file_records_by_pk_hash_bucket(
     is_src_delta: np.bool_ = True,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[Optional[DeltaFileEnvelopeGroups], int]:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     # read input parquet s3 objects into a list of delta file envelopes
     delta_file_envelopes, total_record_count = _read_delta_file_envelopes(
         annotated_delta,
@@ -99,6 +103,8 @@
         sort_key_names,
         read_kwargs_provider,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )
     if delta_file_envelopes is None:
         return None, 0
@@ -134,8 +140,11 @@ def _read_delta_file_envelopes(
     sort_key_names: List[str],
     read_kwargs_provider: Optional[ReadKwargsProvider],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[Optional[List[DeltaFileEnvelope]], int]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     columns_to_read = list(chain(primary_keys, sort_key_names))
     # TODO (rootliu) compare performance of column read from unpartitioned vs partitioned file
     # https://arrow.apache.org/docs/python/parquet.html#writing-to-partitioned-datasets
@@ -145,6 +154,7 @@
         columns=columns_to_read,
         file_reader_kwargs_provider=read_kwargs_provider,
         storage_type=StorageType.LOCAL,
+        **deltacat_storage_kwargs,
     )
     annotations = annotated_delta.annotations
     assert (
@@ -182,7 +192,11 @@ def _timed_hash_bucket(
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ):
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
@@ -207,6 +221,8 @@
             is_src_delta,
             read_kwargs_provider,
             deltacat_storage,
+            deltacat_storage_kwargs,
+            **kwargs,
         )
         hash_bucket_group_to_obj_id, _ = group_hash_bucket_indices(
             delta_file_envelope_groups, num_buckets, num_groups, object_store
@@ -235,8 +251,11 @@ def hash_bucket(
     read_kwargs_provider: Optional[ReadKwargsProvider],
     object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> HashBucketResult:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     logger.info(f"Starting hash bucket task...")
     hash_bucket_result, duration = timed_invocation(
         func=_timed_hash_bucket,
@@ -250,6 +269,8 @@
         read_kwargs_provider=read_kwargs_provider,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
+        **kwargs,
     )
 
     emit_metrics_time = 0.0
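
The deltacat_storage_kwargs parameters threaded through these tasks all follow the same idiom: default to None and replace it with a fresh dict inside the body, which avoids a shared mutable default while still letting callers forward storage-specific options. A generic sketch of the pattern (the names below are illustrative, not the DeltaCat API):

    from typing import Any, Dict, Optional

    def download_with_storage(
        path: str,
        storage,  # any object exposing download(path, **kwargs)
        storage_kwargs: Optional[Dict[str, Any]] = None,
    ):
        # A `storage_kwargs={}` default would be shared across calls,
        # so None is used as the sentinel and replaced per call.
        if storage_kwargs is None:
            storage_kwargs = {}
        return storage.download(path, **storage_kwargs)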

deltacat/compute/compactor/steps/materialize.py

@@ -69,7 +69,11 @@ def materialize(
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ):
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+
     def _stage_delta_from_manifest_entry_reference_list(
         manifest_entry_list_reference: List[ManifestEntry],
         partition: Partition,
@@ -105,6 +109,7 @@
             max_records_per_entry=max_records_per_output_file,
             content_type=compacted_file_content_type,
             s3_table_writer_kwargs=s3_table_writer_kwargs,
+            **deltacat_storage_kwargs,
         )
         compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
             compacted_table
@@ -116,11 +121,10 @@
         )
         manifest = delta.manifest
         manifest_records = manifest.meta.record_count
-        assert (
-            manifest_records == len(compacted_table),
+        assert manifest_records == len(compacted_table), (
             f"Unexpected Error: Materialized delta manifest record count "
             f"({manifest_records}) does not equal compacted table record count "
-            f"({len(compacted_table)})",
+            f"({len(compacted_table)})"
         )
         materialize_result = MaterializeResult.of(
             delta=delta,
@@ -187,10 +191,11 @@
             src_stream_position_np.item(),
         )
         dl_digest = delta_locator.digest()
-
         manifest = manifest_cache.setdefault(
             dl_digest,
-            deltacat_storage.get_delta_manifest(delta_locator),
+            deltacat_storage.get_delta_manifest(
+                delta_locator, **deltacat_storage_kwargs
+            ),
         )
 
         if read_kwargs_provider is None:
@@ -236,6 +241,7 @@
             Delta.of(delta_locator, None, None, None, manifest),
             src_file_idx_np.item(),
             file_reader_kwargs_provider=read_kwargs_provider,
+            **deltacat_storage_kwargs,
         )
         logger.debug(
             f"Time taken for materialize task"
@@ -253,7 +259,6 @@
         materialized_results.append(_materialize(record_batch_tables.remaining))
 
     logger.info(f"Got {count_of_src_dfl} source delta files during materialize")
-
     referenced_manifest_delta = (
         _stage_delta_from_manifest_entry_reference_list(
             manifest_entry_list_reference, partition
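
The assertion rewrite above fixes a classic Python pitfall: assert (condition, message) asserts a non-empty tuple, which is always truthy, so the check can never fail. Keeping the condition outside the parentheses and parenthesizing only the message restores the intended behavior:

    records_expected, records_found = 10, 9

    # Always passes: the operand is a 2-tuple, and non-empty tuples are truthy.
    assert (records_expected == records_found, "record counts differ")

    # Fails as intended (raises AssertionError) because the condition is evaluated on its own.
    assert records_expected == records_found, (
        f"record counts differ: {records_expected} != {records_found}"
    )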

deltacat/compute/compactor/steps/repartition.py

@@ -4,7 +4,7 @@ from contextlib import nullcontext
 import pyarrow.compute as pc
 from deltacat.constants import SIGNED_INT64_MIN_VALUE, SIGNED_INT64_MAX_VALUE
 import pyarrow as pa
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 from deltacat.types.media import StorageType, ContentType
 import ray
 from deltacat import logs
@@ -58,6 +58,8 @@ def repartition_range(
     max_records_per_output_file: int,
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ):
     """
     Repartitions a list of Arrow tables based on specified ranges and stores the repartitioned tables.
@@ -85,6 +87,8 @@
     in the tables, an error will be raised. For each partition range, a new file is created. This could result in
     more output files than input files.
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     column: str = repartition_args["column"]
     partition_ranges: List = repartition_args["ranges"]
     if len(partition_ranges) == 0:
@@ -141,6 +145,7 @@
             destination_partition,
             max_records_per_entry=max_records_per_output_file,
             content_type=repartitioned_file_content_type,
+            **deltacat_storage_kwargs,
         )
         partition_deltas.append(partition_delta)
 
@@ -163,7 +168,11 @@ def _timed_repartition(
     read_kwargs_provider: Optional[ReadKwargsProvider],
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> RepartitionResult:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
@@ -182,6 +191,7 @@
                 max_records_per_output_file=max_records_per_output_file,
                 repartitioned_file_content_type=repartitioned_file_content_type,
                 deltacat_storage=deltacat_storage,
+                deltacat_storage_kwargs=deltacat_storage_kwargs,
             )
         else:
             raise NotImplementedError(
@@ -201,7 +211,11 @@ def repartition(
     read_kwargs_provider: Optional[ReadKwargsProvider],
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> RepartitionResult:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     logger.info(f"Starting repartition task...")
     repartition_result, duration = timed_invocation(
         func=_timed_repartition,
@@ -214,6 +228,7 @@
         read_kwargs_provider=read_kwargs_provider,
         repartitioned_file_content_type=repartitioned_file_content_type,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
     )
     if metrics_config:
         emit_timer_metrics(