deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/clients.py +17 -6
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/compute/compactor/compaction_session.py +42 -18
  6. deltacat/compute/compactor/model/compact_partition_params.py +287 -58
  7. deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
  8. deltacat/compute/compactor/model/delta_annotated.py +91 -9
  9. deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
  10. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  11. deltacat/compute/compactor/repartition_session.py +2 -1
  12. deltacat/compute/compactor/steps/dedupe.py +9 -6
  13. deltacat/compute/compactor/steps/hash_bucket.py +24 -3
  14. deltacat/compute/compactor/steps/materialize.py +11 -6
  15. deltacat/compute/compactor/steps/repartition.py +16 -1
  16. deltacat/compute/compactor/utils/io.py +40 -23
  17. deltacat/compute/compactor/utils/sort_key.py +5 -0
  18. deltacat/compute/compactor/utils/system_columns.py +43 -0
  19. deltacat/compute/compactor_v2/compaction_session.py +506 -0
  20. deltacat/compute/compactor_v2/constants.py +34 -0
  21. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  22. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  23. deltacat/compute/compactor_v2/model/merge_input.py +127 -0
  24. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  25. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  26. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  27. deltacat/compute/compactor_v2/steps/merge.py +41 -0
  28. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
  30. deltacat/compute/compactor_v2/utils/io.py +149 -0
  31. deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
  32. deltacat/compute/compactor_v2/utils/task_options.py +228 -0
  33. deltacat/compute/metastats/meta_stats.py +4 -2
  34. deltacat/compute/metastats/stats.py +1 -0
  35. deltacat/compute/metastats/utils/io.py +4 -0
  36. deltacat/compute/stats/utils/io.py +20 -5
  37. deltacat/exceptions.py +4 -0
  38. deltacat/io/memcached_object_store.py +37 -14
  39. deltacat/logs.py +4 -3
  40. deltacat/storage/interface.py +8 -1
  41. deltacat/storage/model/types.py +2 -1
  42. deltacat/tests/aws/test_clients.py +16 -3
  43. deltacat/tests/compute/__init__.py +0 -0
  44. deltacat/tests/compute/common.py +96 -0
  45. deltacat/tests/compute/compactor/__init__.py +0 -0
  46. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  47. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
  48. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  49. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  50. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  51. deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  52. deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
  53. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  54. deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
  55. deltacat/tests/compute/testcases.py +390 -0
  56. deltacat/tests/io/test_memcached_object_store.py +5 -4
  57. deltacat/tests/local_deltacat_storage/__init__.py +62 -19
  58. deltacat/tests/test_utils/pyarrow.py +32 -0
  59. deltacat/tests/test_utils/utils.py +13 -0
  60. deltacat/tests/utils/data/__init__.py +0 -0
  61. deltacat/tests/utils/test_daft.py +76 -0
  62. deltacat/tests/utils/test_pyarrow.py +133 -0
  63. deltacat/tests/utils/test_resources.py +23 -20
  64. deltacat/types/media.py +1 -0
  65. deltacat/types/partial_download.py +82 -0
  66. deltacat/types/tables.py +1 -0
  67. deltacat/utils/arguments.py +26 -0
  68. deltacat/utils/daft.py +87 -0
  69. deltacat/utils/placement.py +20 -3
  70. deltacat/utils/pyarrow.py +213 -1
  71. deltacat/utils/ray_utils/concurrency.py +26 -1
  72. deltacat/utils/resources.py +72 -1
  73. deltacat/utils/s3fs.py +21 -0
  74. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +17 -3
  75. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/RECORD +80 -47
  76. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
  77. /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
  78. /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
  79. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
  80. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/utils/primary_key_index.py ADDED
@@ -0,0 +1,308 @@
+ import logging
+ from typing import List, Optional, Iterable
+
+ import numpy as np
+ import pyarrow as pa
+ import pyarrow.compute as pc
+ import uuid
+ import hashlib
+ from deltacat.compute.compactor_v2.constants import (
+     TOTAL_BYTES_IN_SHA1_HASH,
+     PK_DELIMITER,
+     MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
+ )
+ import time
+ from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
+ from deltacat import logs
+ from deltacat.compute.compactor.utils import system_columns as sc
+ from deltacat.io.object_store import IObjectStore
+ from deltacat.utils.performance import timed_invocation
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def _append_sha1_hash_to_table(table: pa.Table, hash_column: pa.Array) -> pa.Table:
+     hash_column_np = hash_column.to_numpy()
+
+     result = []
+     for hash_value in hash_column_np:
+         result.append(hashlib.sha1(hash_value.encode("utf-8")).hexdigest())
+
+     return sc.append_pk_hash_string_column(table, result)
+
+
+ def _is_sha1_desired(hash_column: pa.Array) -> bool:
+     return hash_column.nbytes > TOTAL_BYTES_IN_SHA1_HASH * len(hash_column)
+
+
+ def _append_table_by_hash_bucket(
+     pki_table: pa.Table, hash_bucket_to_table: np.ndarray
+ ) -> int:
+
+     hb_pk_table, sort_latency = timed_invocation(
+         lambda: pki_table.sort_by(sc._HASH_BUCKET_IDX_COLUMN_NAME)
+     )
+     logger.info(f"Sorting a pk table of length {len(pki_table)} took {sort_latency}s")
+
+     hb_pk_grouped_by, groupby_latency = timed_invocation(
+         lambda: hb_pk_table.group_by(sc._HASH_BUCKET_IDX_COLUMN_NAME).aggregate(
+             [(sc._HASH_BUCKET_IDX_COLUMN_NAME, "count")]
+         )
+     )
+
+     logger.info(
+         f"Grouping a pki table of length {len(pki_table)} took {groupby_latency}s"
+     )
+
+     group_count_array = hb_pk_grouped_by[f"{sc._HASH_BUCKET_IDX_COLUMN_NAME}_count"]
+     hb_group_array = hb_pk_grouped_by[sc._HASH_BUCKET_IDX_COLUMN_NAME]
+
+     result_len = 0
+     for i, group_count in enumerate(group_count_array):
+         hb_idx = hb_group_array[i].as_py()
+         pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count.as_py())
+         pyarrow_table = pyarrow_table.drop([sc._HASH_BUCKET_IDX_COLUMN_NAME])
+         if hash_bucket_to_table[hb_idx] is None:
+             hash_bucket_to_table[hb_idx] = []
+         hash_bucket_to_table[hb_idx].append(pyarrow_table)
+         result_len += len(pyarrow_table)
+
+     return result_len
+
+
+ def _optimized_group_record_batches_by_hash_bucket(
+     pki_table: pa.Table, num_buckets: int
+ ):
+
+     input_table_len = len(pki_table)
+
+     hash_bucket_to_tables = np.empty([num_buckets], dtype="object")
+     hb_to_table = np.empty([num_buckets], dtype="object")
+
+     # This split will ensure that the sort is not performed on a very huge table
+     # resulting in ArrowInvalid: offset overflow while concatenating arrays
+     # Known issue with Arrow: https://github.com/apache/arrow/issues/25822
+     table_batches, to_batches_latency = timed_invocation(lambda: pki_table.to_batches())
+
+     logger.info(f"to_batches took {to_batches_latency} for {len(pki_table)} rows")
+
+     current_bytes = 0
+     record_batches = []
+     result_len = 0
+     for record_batch in table_batches:
+         current_bytes += record_batch.nbytes
+         record_batches.append(record_batch)
+         if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
+             logger.info(
+                 f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
+                 f"is {len(record_batches)} and size {current_bytes}"
+             )
+             appended_len, append_latency = timed_invocation(
+                 _append_table_by_hash_bucket,
+                 pa.Table.from_batches(record_batches),
+                 hash_bucket_to_tables,
+             )
+             logger.info(
+                 f"Appended the hash bucketed batch of {appended_len} in {append_latency}s"
+             )
+
+             result_len += appended_len
+             current_bytes = 0
+             record_batches.clear()
+
+     if record_batches:
+         appended_len, append_latency = timed_invocation(
+             _append_table_by_hash_bucket,
+             pa.Table.from_batches(record_batches),
+             hash_bucket_to_tables,
+         )
+         result_len += appended_len
+         current_bytes = 0
+         record_batches.clear()
+
+     concat_start = time.monotonic()
+     for hb, tables in enumerate(hash_bucket_to_tables):
+         if tables:
+             assert hb_to_table[hb] is None, f"The HB index is repeated {hb}"
+             hb_to_table[hb] = pa.concat_tables(tables)
+
+     concat_end = time.monotonic()
+     logger.info(
+         f"Total time taken to concat all record batches with length "
+         f"{input_table_len}: {concat_end - concat_start}s"
+     )
+
+     assert (
+         input_table_len == result_len
+     ), f"Grouping has resulted in record loss as {result_len} != {input_table_len}"
+
+     return hb_to_table
+
+
+ def group_by_pk_hash_bucket(
+     table: pa.Table, num_buckets: int, primary_keys: List[str]
+ ) -> np.ndarray:
+     table = generate_pk_hash_column(table, primary_keys, requires_sha1=True)
+
+     # group hash bucket record indices
+     result = group_record_indices_by_hash_bucket(
+         table,
+         num_buckets,
+     )
+
+     return result
+
+
+ def generate_pk_hash_column(
+     table: pa.Table,
+     primary_keys: Optional[List[str]] = None,
+     requires_sha1: bool = False,
+ ) -> pa.Table:
+     """
+     Returns a new table after generating the primary key hash if desired.
+
+     1. If there are no primary keys, each hash will be unique uuid/sha1 hex
+     2. If there are more than 0 primary keys, returns a table with new columns appended.
+     """
+
+     start = time.monotonic()
+
+     can_sha1 = False
+     if primary_keys:
+         pk_columns = []
+         for pk_name in primary_keys:
+             pk_columns.append(pc.cast(table[pk_name], pa.string()))
+
+         pk_columns.append(PK_DELIMITER)
+         hash_column = pc.binary_join_element_wise(*pk_columns)
+
+         can_sha1 = requires_sha1 or _is_sha1_desired(hash_column)
+     else:
+         hash_column = pa.array(
+             [uuid.uuid4().hex for _ in range(len(table))], pa.string()
+         )
+
+     logger.info(
+         f"can_generate_sha1={can_sha1} for the table with hash column size"
+         f"={hash_column.nbytes} bytes, num_rows={len(hash_column)}, "
+         f"and requires_sha1={requires_sha1}"
+     )
+
+     if can_sha1:
+         table = _append_sha1_hash_to_table(table, hash_column)
+     else:
+         table = table.append_column(sc._PK_HASH_STRING_COLUMN_FIELD, hash_column)
+
+     end = time.monotonic()
+
+     logger.info(
+         f"Took {end - start}s to generate pk hash of len: {len(hash_column)}"
+         f" and size: {hash_column.nbytes} bytes"
+     )
+
+     return table
+
+
+ def group_record_indices_by_hash_bucket(
+     pki_table: pa.Table, num_buckets: int
+ ) -> np.ndarray:
+     """
+     Groups the record indices by it's corresponding hash bucket. Hence, this method may
+     create num_buckets tables as a result.
+     """
+
+     input_table_len = len(pki_table)
+
+     hash_bucket_id_col_list = np.empty([input_table_len], dtype="int32")
+     bucketing_start_time = time.monotonic()
+
+     for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
+         hash_bucket = pk_digest_to_hash_bucket_index(hash_value, num_buckets)
+         hash_bucket_id_col_list[index] = hash_bucket
+
+     pki_table = sc.append_hash_bucket_idx_col(pki_table, hash_bucket_id_col_list)
+     bucketing_end_time = time.monotonic()
+
+     logger.info(
+         f"Took {bucketing_end_time - bucketing_start_time}s to generate the "
+         f"hb index for {len(pki_table)} rows"
+     )
+
+     result, group_latency = timed_invocation(
+         _optimized_group_record_batches_by_hash_bucket,
+         pki_table=pki_table,
+         num_buckets=num_buckets,
+     )
+
+     logger.info(
+         f"Final grouping of table with {input_table_len} records took: {group_latency}s"
+     )
+
+     return result
+
+
+ def group_hash_bucket_indices(
+     hash_bucket_object_groups: np.ndarray,
+     num_buckets: int,
+     num_groups: int,
+     object_store: Optional[IObjectStore] = None,
+ ) -> np.ndarray:
+     """
+     This method persists all tables for a given hash bucket into the object store
+     and returns the object references for each hash group.
+     """
+
+     hash_bucket_group_to_obj_id_size_tuple = np.empty([num_groups], dtype="object")
+
+     if hash_bucket_object_groups is None:
+         return hash_bucket_group_to_obj_id_size_tuple
+
+     hb_group_to_object = np.empty([num_groups], dtype="object")
+     hash_group_to_size = np.empty([num_groups], dtype="int64")
+     hash_group_to_num_rows = np.empty([num_groups], dtype="int64")
+
+     for hb_index, obj in enumerate(hash_bucket_object_groups):
+         if obj:
+             hb_group = hash_bucket_index_to_hash_group_index(hb_index, num_groups)
+             if hb_group_to_object[hb_group] is None:
+                 hb_group_to_object[hb_group] = np.empty([num_buckets], dtype="object")
+                 hash_group_to_size[hb_group] = np.int64(0)
+                 hash_group_to_num_rows[hb_group] = np.int64(0)
+             hb_group_to_object[hb_group][hb_index] = obj
+             for dfe in obj:
+                 casted_dfe: DeltaFileEnvelope = dfe
+                 hash_group_to_size[hb_group] += casted_dfe.table_size_bytes
+                 hash_group_to_num_rows[hb_group] += casted_dfe.table_num_rows
+
+     for hb_group, obj in enumerate(hb_group_to_object):
+         if obj is None:
+             continue
+         object_ref = object_store.put(obj)
+         hash_bucket_group_to_obj_id_size_tuple[hb_group] = (
+             object_ref,
+             hash_group_to_size[hb_group],
+             hash_group_to_num_rows[hb_group],
+         )
+         del object_ref
+     return hash_bucket_group_to_obj_id_size_tuple
+
+
+ def hash_bucket_index_to_hash_group_index(hb_index: int, num_groups: int) -> int:
+     return hb_index % num_groups
+
+
+ def hash_group_index_to_hash_bucket_indices(
+     hb_group: int, num_buckets: int, num_groups: int
+ ) -> Iterable[int]:
+
+     if hb_group > num_buckets:
+         return []
+
+     return range(hb_group, num_groups, num_buckets)
+
+
+ def pk_digest_to_hash_bucket_index(digest: str, num_buckets: int) -> int:
+     """
+     Generates the hash bucket index from the given digest.
+     """
+     return int(digest, 16) % num_buckets
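The new primary_key_index module joins the stringified primary key columns with PK_DELIMITER, hashes the joined key (SHA1 when forced or when the joined keys are larger on average than the digest, per _is_sha1_desired), and assigns each record a bucket via pk_digest_to_hash_bucket_index, i.e. int(digest, 16) % num_buckets. Below is a minimal standalone sketch of that bucketing arithmetic; it mirrors the logic above rather than importing deltacat, and the delimiter value used here is an assumption.

import hashlib

PK_DELIMITER = ","  # assumed delimiter; the real value lives in deltacat.compute.compactor_v2.constants


def bucket_for_primary_key(pk_values, num_buckets: int) -> int:
    # Join the stringified primary key columns, SHA1-hash the joined key,
    # then map the hex digest to a bucket the same way
    # pk_digest_to_hash_bucket_index does: int(digest, 16) % num_buckets.
    joined = PK_DELIMITER.join(str(v) for v in pk_values)
    digest = hashlib.sha1(joined.encode("utf-8")).hexdigest()
    return int(digest, 16) % num_buckets


# Equal primary keys always map to the same bucket, which is what lets each
# bucket be deduplicated independently by the merge step.
assert bucket_for_primary_key(["user-1", "2023-01-01"], 8) == bucket_for_primary_key(
    ["user-1", "2023-01-01"], 8
)
print(bucket_for_primary_key(["user-1", "2023-01-01"], 8))  # prints a value in 0..7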
deltacat/compute/compactor_v2/utils/task_options.py ADDED
@@ -0,0 +1,228 @@
+ from typing import Dict, Optional, List, Tuple
+ from deltacat.types.media import ContentEncoding, ContentType
+ from deltacat.types.partial_download import PartialParquetParameters
+ from deltacat.storage import (
+     Delta,
+     ManifestEntry,
+     interface as unimplemented_deltacat_storage,
+ )
+ from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
+ from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
+ from deltacat.compute.compactor_v2.utils.primary_key_index import (
+     hash_group_index_to_hash_bucket_indices,
+ )
+ from deltacat.compute.compactor_v2.utils.content_type_params import (
+     append_content_type_params,
+ )
+ from deltacat.compute.compactor_v2.constants import TOTAL_MEMORY_BUFFER_PERCENTAGE
+
+
+ def _get_parquet_type_params_if_exist(
+     entry: ManifestEntry,
+ ) -> Optional[PartialParquetParameters]:
+     if (
+         entry.meta
+         and entry.meta.content_type == ContentType.PARQUET
+         and entry.meta.content_encoding == ContentEncoding.IDENTITY
+     ):
+         for type_params in entry.meta.content_type_parameters:
+             if isinstance(type_params, PartialParquetParameters):
+                 return type_params
+     return None
+
+
+ def _calculate_parquet_column_size(
+     type_params: PartialParquetParameters, columns: List[str]
+ ):
+     column_size = 0.0
+     for rg in type_params.row_groups_to_download:
+         columns_found = 0
+         row_group_meta = type_params.pq_metadata.row_group(rg)
+         for col in range(row_group_meta.num_columns):
+             column_meta = row_group_meta.column(col)
+             if column_meta.path_in_schema in columns:
+                 columns_found += 1
+                 column_size += column_meta.total_uncompressed_size
+         assert columns_found == len(columns), (
+             "Columns not found in the parquet data as "
+             f"{columns_found} != {len(columns)}"
+         )
+     return column_size
+
+
+ def estimate_manifest_entry_size_bytes(
+     entry: ManifestEntry, previous_inflation: float, **kwargs
+ ) -> float:
+     if entry.meta.source_content_length:
+         return entry.meta.source_content_length
+
+     type_params = _get_parquet_type_params_if_exist(entry=entry)
+
+     if type_params:
+         return type_params.in_memory_size_bytes
+
+     return entry.meta.content_length * previous_inflation
+
+
+ def estimate_manifest_entry_num_rows(
+     entry: ManifestEntry,
+     average_record_size_bytes: float,
+     previous_inflation: float,
+     **kwargs,
+ ) -> int:
+     if entry.meta.record_count:
+         return entry.meta.record_count
+
+     type_params = _get_parquet_type_params_if_exist(entry=entry)
+
+     if type_params:
+         return type_params.num_rows
+
+     total_size_bytes = estimate_manifest_entry_size_bytes(
+         entry=entry, previous_inflation=previous_inflation, **kwargs
+     )
+
+     return int(total_size_bytes / average_record_size_bytes)
+
+
+ def estimate_manifest_entry_column_size_bytes(
+     entry: ManifestEntry, columns: Optional[List[str]] = None
+ ) -> Optional[float]:
+     if not columns:
+         return 0
+
+     type_params = _get_parquet_type_params_if_exist(entry=entry)
+
+     if type_params.pq_metadata:
+         return _calculate_parquet_column_size(type_params=type_params, columns=columns)
+
+     return None
+
+
+ def hash_bucket_resource_options_provider(
+     index: int,
+     item: DeltaAnnotated,
+     previous_inflation: float,
+     average_record_size_bytes: float,
+     primary_keys: List[str] = None,
+     **kwargs,
+ ) -> Dict:
+     size_bytes = 0.0
+     num_rows = 0
+     total_pk_size = 0
+
+     if not item.manifest or not item.manifest.entries:
+         return {"CPU": 0.01}
+
+     for entry in item.manifest.entries:
+         entry_size = estimate_manifest_entry_size_bytes(
+             entry=entry, previous_inflation=previous_inflation
+         )
+         num_rows += estimate_manifest_entry_num_rows(
+             entry=entry,
+             previous_inflation=previous_inflation,
+             average_record_size_bytes=average_record_size_bytes,
+         )
+         size_bytes += entry_size
+
+         if primary_keys:
+             pk_size = estimate_manifest_entry_column_size_bytes(
+                 entry=entry,
+                 columns=primary_keys,
+             )
+
+             if pk_size is None:
+                 total_pk_size += entry_size
+             else:
+                 total_pk_size += pk_size
+
+     # total size + pk size + pk hash column + hash bucket index column
+     # Refer to hash_bucket step for more details.
+     total_memory = size_bytes + total_pk_size + num_rows * 20 + num_rows * 4
+
+     # Consider buffer
+     total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+
+     return {"num_cpus": 0.01, "memory": total_memory}
+
+
+ def merge_resource_options_provider(
+     index: int,
+     item: Tuple[int, List],
+     num_hash_groups: int,
+     hash_group_size_bytes: Dict[int, int],
+     hash_group_num_rows: Dict[int, int],
+     round_completion_info: Optional[RoundCompletionInfo] = None,
+     compacted_delta: Optional[Delta] = None,
+     primary_keys: Optional[List[str]] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict] = {},
+     **kwargs,
+ ) -> Dict:
+     hb_group_idx = item[0]
+
+     data_size = hash_group_size_bytes.get(hb_group_idx, 0)
+     num_rows = hash_group_num_rows.get(hb_group_idx, 0)
+
+     pk_size_bytes = 0
+
+     if (
+         round_completion_info
+         and compacted_delta
+         and round_completion_info.hb_index_to_entry_range_both_inclusive
+     ):
+
+         previous_inflation = (
+             round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+             / round_completion_info.compacted_pyarrow_write_result.file_bytes
+         )
+         average_record_size = (
+             round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+             / round_completion_info.compacted_pyarrow_write_result.records
+         )
+
+         iterable = hash_group_index_to_hash_bucket_indices(
+             hb_group_idx, round_completion_info.hash_bucket_count, num_hash_groups
+         )
+
+         for hb_idx in iterable:
+             entry_start, entry_end = round_completion_info.hb_index_to_entry_range[
+                 hb_idx
+             ]
+             for entry_index in range(entry_start, entry_end):
+                 entry = append_content_type_params(
+                     compacted_delta,
+                     entry_index=entry_index,
+                     deltacat_storage=deltacat_storage,
+                     deltacat_storage_kwargs=deltacat_storage_kwargs,
+                 )
+
+                 current_entry_size = estimate_manifest_entry_size_bytes(
+                     entry=entry, previous_inflation=previous_inflation
+                 )
+                 current_entry_rows = estimate_manifest_entry_num_rows(
+                     entry=entry,
+                     average_record_size_bytes=average_record_size,
+                     previous_inflation=previous_inflation,
+                 )
+
+                 data_size += current_entry_size
+                 num_rows += current_entry_rows
+
+                 if primary_keys:
+                     pk_size = estimate_manifest_entry_column_size_bytes(
+                         entry=entry,
+                         columns=primary_keys,
+                     )
+
+                     if pk_size is None:
+                         pk_size_bytes += current_entry_size
+                     else:
+                         pk_size_bytes += pk_size
+
+     # total data downloaded + primary key hash column + primary key column + dict size for merge
+     total_memory = data_size + pk_size_bytes + num_rows * 20 + num_rows * 20
+
+     total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+
+     return {"num_cpus": 0.01, "memory": total_memory}
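task_options.py sizes each hash-bucket and merge task by estimating the bytes it will download (preferring the recorded source content length, then parquet metadata, then content_length times the previous inflation factor), adding per-row overhead for the primary-key hash and hash-bucket-index columns, and padding the total by TOTAL_MEMORY_BUFFER_PERCENTAGE. The resulting dict is shaped for Ray task options. A rough sketch of that arithmetic and how such a dict would typically be applied via Ray's .options() follows; the buffer percentage and sizes below are made-up numbers, not deltacat's real constants.

import ray

TOTAL_MEMORY_BUFFER_PERCENTAGE = 30  # assumed value; the real constant is in compactor_v2/constants.py


def estimated_task_options(data_size_bytes: float, pk_size_bytes: float, num_rows: int) -> dict:
    # Same shape as hash_bucket_resource_options_provider above: payload bytes
    # + primary key bytes + ~20 B/row for the pk hash column + ~4 B/row for the
    # hash bucket index column, then a safety buffer on top.
    total_memory = data_size_bytes + pk_size_bytes + num_rows * 20 + num_rows * 4
    total_memory *= 1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0
    return {"num_cpus": 0.01, "memory": total_memory}


@ray.remote
def hash_bucket_task(rows):
    return len(rows)


opts = estimated_task_options(
    data_size_bytes=512 * 1024**2, pk_size_bytes=64 * 1024**2, num_rows=1_000_000
)
# Ray reserves the requested memory and CPU share when scheduling the task.
result = ray.get(hash_bucket_task.options(**opts).remote(list(range(10))))
print(result)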
deltacat/compute/metastats/meta_stats.py CHANGED
@@ -5,7 +5,7 @@ import functools
  import logging
  import os
  import pathlib
- from typing import Dict, List, Optional, Set
+ from typing import Any, Dict, List, Optional, Set

  import ray
  from ray.types import ObjectRef
@@ -118,10 +118,12 @@ def collect_from_partition(
      stat_results_s3_bucket: Optional[str] = None,
      metastats_results_s3_bucket: Optional[str] = None,
      deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
      *args,
      **kwargs,
  ) -> ObjectRef[Dict[int, DeltaStats]]:
-
+     if deltacat_storage_kwargs is None:
+         deltacat_storage_kwargs = {}
      if not columns:
          columns = deltacat_storage.get_table_version_column_names(
              source_partition_locator.namespace,
deltacat/compute/metastats/stats.py CHANGED
@@ -33,6 +33,7 @@ def start_stats_collection(
      stat_results_s3_bucket: Optional[str] = None,
      metastats_results_s3_bucket: Optional[str] = None,
      deltacat_storage=unimplemented_deltacat_storage,
+     **kwargs,
  ) -> Dict[str, List[DeltaStats]]:
      """Collects statistics on deltas, given a set of delta stream position ranges.
      Example:
deltacat/compute/metastats/utils/io.py CHANGED
@@ -171,6 +171,7 @@ def collect_stats_by_columns(
      delta_annotated: DeltaAnnotated,
      columns_to_compute: Optional[List[str]] = None,
      deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
  ) -> Dict[str, Any]:
      """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.

@@ -182,6 +183,8 @@
      Returns:
          A delta wide stats container
      """
+     if deltacat_storage_kwargs is None:
+         deltacat_storage_kwargs = {}
      total_tables_size = 0

      # Mapping of column_name -> [stats_file_idx_1, stats_file_idx_2, ... stats_file_idx_n]
@@ -198,6 +201,7 @@
                  TableType.PYARROW,
                  columns_to_compute,
                  equivalent_table_types="uncompacted",
+                 **deltacat_storage_kwargs,
              )
          )
          assert isinstance(entry_pyarrow_table, pyarrow.Table), (
deltacat/compute/stats/utils/io.py CHANGED
@@ -1,6 +1,6 @@
  import logging
  from collections import defaultdict
- from typing import Dict, List, Optional
+ from typing import Any, Dict, List, Optional

  import pyarrow
  import ray
@@ -83,6 +83,7 @@ def get_delta_stats(
      delta_locator: DeltaLocator,
      columns: Optional[List[str]] = None,
      deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
  ) -> DeltaStats:
      """Ray distributed task to compute and collect stats for a requested delta.
      If no columns are requested, stats will be computed for all columns.
@@ -93,10 +94,15 @@
      Returns:
          A delta wide stats container
      """
-
-     manifest = deltacat_storage.get_delta_manifest(delta_locator)
+     if deltacat_storage_kwargs is None:
+         deltacat_storage_kwargs = {}
+     manifest = deltacat_storage.get_delta_manifest(
+         delta_locator, **deltacat_storage_kwargs
+     )
      delta = Delta.of(delta_locator, None, None, None, manifest)
-     return _collect_stats_by_columns(delta, columns, deltacat_storage)
+     return _collect_stats_by_columns(
+         delta, columns, deltacat_storage, deltacat_storage_kwargs
+     )


  @ray.remote
@@ -105,6 +111,7 @@ def get_deltas_from_range(
      start_position_inclusive: DeltaRange,
      end_position_inclusive: DeltaRange,
      deltacat_storage=unimplemented_deltacat_storage,
+     **kwargs,
  ) -> List[Delta]:
      """Looks up deltas in the specified partition using Ray, given both starting and ending delta stream positions.

@@ -137,6 +144,7 @@
          end_position_inclusive,
          ascending_order=True,
          include_manifest=False,
+         **kwargs,
      )
      return deltas_list_result.all_items()

@@ -145,6 +153,7 @@ def _collect_stats_by_columns(
      delta: Delta,
      columns_to_compute: Optional[List[str]] = None,
      deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
  ) -> DeltaStats:
      """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.
      Args:
@@ -154,6 +163,8 @@
      Returns:
          A delta wide stats container
      """
+     if deltacat_storage_kwargs is None:
+         deltacat_storage_kwargs = {}
      assert (
          delta.manifest is not None
      ), f"Manifest should not be missing from delta for stats calculation: {delta}"
@@ -167,7 +178,11 @@
      for file_idx, manifest in enumerate(delta.manifest.entries):
          entry_pyarrow_table: LocalTable = (
              deltacat_storage.download_delta_manifest_entry(
-                 delta, file_idx, TableType.PYARROW, columns_to_compute
+                 delta,
+                 file_idx,
+                 TableType.PYARROW,
+                 columns_to_compute,
+                 **deltacat_storage_kwargs,
              )
          )
          assert isinstance(entry_pyarrow_table, pyarrow.Table), (
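Across meta_stats.py, stats.py, and the two io.py modules, the change is the same plumbing: each function gains a deltacat_storage_kwargs parameter (or **kwargs) that defaults to None to avoid a shared mutable default, is normalized to an empty dict, and is splatted into every deltacat_storage call. A small sketch of the pattern with a hypothetical stand-in storage function (function names and the kwarg key are illustrative only):

from typing import Any, Dict, Optional


def fake_get_delta_manifest(delta_locator: str, **kwargs) -> Dict[str, Any]:
    # Stand-in for a deltacat_storage call that accepts implementation-specific kwargs.
    return {"locator": delta_locator, "storage_kwargs": kwargs}


def get_delta_stats_sketch(
    delta_locator: str,
    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    # Default to None (not {}) so callers never share one mutable dict,
    # then forward the kwargs to the underlying storage implementation.
    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}
    return fake_get_delta_manifest(delta_locator, **deltacat_storage_kwargs)


print(get_delta_stats_sketch("my-delta", {"db_file_path": "/tmp/deltacat.db"}))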
deltacat/exceptions.py CHANGED
@@ -8,3 +8,7 @@ class NonRetryableError(Exception):

  class ConcurrentModificationError(Exception):
      pass
+
+
+ class ValidationError(NonRetryableError):
+     pass
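The new ValidationError subclasses NonRetryableError, so any caller that already treats NonRetryableError as terminal (for example, a retry wrapper around compaction steps) will stop retrying on validation failures without further changes. A small self-contained illustration with locally re-declared classes mirroring exceptions.py; the retry helper and validate function are hypothetical, not deltacat code:

class NonRetryableError(Exception):
    # Mirrors deltacat/exceptions.py for this self-contained example.
    pass


class ValidationError(NonRetryableError):
    pass


def run_with_retries(fn, attempts: int = 3):
    # Hypothetical retry helper: NonRetryableError (and thus ValidationError)
    # aborts immediately instead of being retried.
    last_error = None
    for _ in range(attempts):
        try:
            return fn()
        except NonRetryableError:
            raise
        except Exception as e:  # retryable path
            last_error = e
    raise last_error


def validate(hash_bucket_count: int) -> None:
    if hash_bucket_count <= 0:
        raise ValidationError(f"hash_bucket_count must be positive, got {hash_bucket_count}")


try:
    run_with_retries(lambda: validate(0))
except NonRetryableError as e:
    print(f"compaction aborted without retry: {e}")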