deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. deltacat/__init__.py +3 -2
  2. deltacat/aws/clients.py +123 -3
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
  6. deltacat/benchmarking/conftest.py +61 -0
  7. deltacat/catalog/delegate.py +1 -1
  8. deltacat/catalog/interface.py +1 -1
  9. deltacat/compute/compactor/__init__.py +0 -3
  10. deltacat/compute/compactor/compaction_session.py +45 -20
  11. deltacat/compute/compactor/model/compact_partition_params.py +287 -58
  12. deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
  13. deltacat/compute/compactor/model/delta_annotated.py +91 -9
  14. deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
  15. deltacat/compute/compactor/model/primary_key_index.py +1 -1
  16. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  17. deltacat/compute/compactor/repartition_session.py +5 -3
  18. deltacat/compute/compactor/steps/dedupe.py +10 -8
  19. deltacat/compute/compactor/steps/hash_bucket.py +25 -4
  20. deltacat/compute/compactor/steps/materialize.py +11 -6
  21. deltacat/compute/compactor/steps/repartition.py +16 -1
  22. deltacat/compute/compactor/utils/io.py +40 -23
  23. deltacat/compute/compactor/utils/primary_key_index.py +1 -15
  24. deltacat/compute/compactor/utils/sort_key.py +57 -0
  25. deltacat/compute/compactor/utils/system_columns.py +43 -0
  26. deltacat/compute/compactor_v2/compaction_session.py +506 -0
  27. deltacat/compute/compactor_v2/constants.py +34 -0
  28. deltacat/compute/compactor_v2/model/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  30. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  31. deltacat/compute/compactor_v2/model/merge_input.py +127 -0
  32. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  33. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  34. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  35. deltacat/compute/compactor_v2/steps/merge.py +41 -0
  36. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  37. deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
  38. deltacat/compute/compactor_v2/utils/io.py +149 -0
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
  40. deltacat/compute/compactor_v2/utils/task_options.py +228 -0
  41. deltacat/compute/metastats/meta_stats.py +4 -2
  42. deltacat/compute/metastats/stats.py +1 -0
  43. deltacat/compute/metastats/utils/io.py +4 -0
  44. deltacat/compute/stats/utils/io.py +20 -5
  45. deltacat/exceptions.py +4 -0
  46. deltacat/io/memcached_object_store.py +37 -14
  47. deltacat/logs.py +4 -3
  48. deltacat/storage/__init__.py +3 -0
  49. deltacat/storage/interface.py +11 -2
  50. deltacat/storage/model/sort_key.py +33 -0
  51. deltacat/storage/model/table_version.py +11 -0
  52. deltacat/storage/model/types.py +2 -1
  53. deltacat/tests/aws/__init__.py +0 -0
  54. deltacat/tests/aws/test_clients.py +80 -0
  55. deltacat/tests/compute/__init__.py +0 -0
  56. deltacat/tests/compute/common.py +96 -0
  57. deltacat/tests/compute/compactor/__init__.py +0 -0
  58. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  59. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
  60. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  61. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  62. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  63. deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  64. deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
  65. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  66. deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
  67. deltacat/tests/compute/testcases.py +390 -0
  68. deltacat/tests/io/test_memcached_object_store.py +5 -4
  69. deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
  70. deltacat/tests/test_utils/pyarrow.py +32 -0
  71. deltacat/tests/test_utils/utils.py +13 -0
  72. deltacat/tests/utils/data/__init__.py +0 -0
  73. deltacat/tests/utils/test_daft.py +76 -0
  74. deltacat/tests/utils/test_pyarrow.py +133 -0
  75. deltacat/tests/utils/test_resources.py +23 -20
  76. deltacat/types/media.py +1 -0
  77. deltacat/types/partial_download.py +82 -0
  78. deltacat/types/tables.py +1 -0
  79. deltacat/utils/arguments.py +26 -0
  80. deltacat/utils/daft.py +87 -0
  81. deltacat/utils/performance.py +4 -2
  82. deltacat/utils/placement.py +20 -3
  83. deltacat/utils/pyarrow.py +213 -1
  84. deltacat/utils/ray_utils/concurrency.py +26 -1
  85. deltacat/utils/resources.py +72 -1
  86. deltacat/utils/s3fs.py +21 -0
  87. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
  88. deltacat-0.1.18b15.dist-info/RECORD +176 -0
  89. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
  90. deltacat/compute/compactor/model/sort_key.py +0 -98
  91. deltacat-0.1.18b13.dist-info/RECORD +0 -136
  92. /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
  93. /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
  94. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
  95. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/compaction_session.py (new file)
@@ -0,0 +1,506 @@
+ import importlib
+ from contextlib import nullcontext
+ import numpy as np
+ import functools
+ import logging
+ import ray
+ import time
+ import json
+ from deltacat.aws import s3u as s3_utils
+ import deltacat
+ from deltacat import logs
+ from deltacat.compute.compactor import (
+     PyArrowWriteResult,
+     RoundCompletionInfo,
+ )
+ from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+ from deltacat.compute.compactor_v2.model.merge_result import MergeResult
+ from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
+ from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
+ from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+ from deltacat.storage import (
+     Delta,
+     DeltaLocator,
+     Partition,
+ )
+ from deltacat.compute.compactor.model.compact_partition_params import (
+     CompactPartitionParams,
+ )
+ from deltacat.utils.ray_utils.concurrency import (
+     invoke_parallel,
+     task_resource_options_provider,
+ )
+ from deltacat.compute.compactor_v2.steps import merge as mg
+ from deltacat.compute.compactor_v2.steps import hash_bucket as hb
+ from deltacat.compute.compactor_v2.utils import io
+ from deltacat.compute.compactor.utils import round_completion_file as rcf
+
+ from typing import List, Optional, Tuple
+ from collections import defaultdict
+ from deltacat.compute.compactor.model.compaction_session_audit_info import (
+     CompactionSessionAuditInfo,
+ )
+ from deltacat.utils.resources import (
+     get_current_node_peak_memory_usage_in_bytes,
+ )
+ from deltacat.compute.compactor_v2.utils.task_options import (
+     hash_bucket_resource_options_provider,
+     merge_resource_options_provider,
+ )
+ from deltacat.utils.resources import ClusterUtilizationOverTimeRange
+
+ if importlib.util.find_spec("memray"):
+     import memray
+
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]:
+
+     assert (
+         params.hash_bucket_count is not None and params.hash_bucket_count >= 1
+     ), "hash_bucket_count is a required arg for compactor v2"
+
+     with memray.Tracker(
+         f"compaction_partition.bin"
+     ) if params.enable_profiler else nullcontext(), ClusterUtilizationOverTimeRange() as cluster_util:
+         (new_partition, new_rci, new_rcf_partition_locator,) = _execute_compaction(
+             params,
+             cluster_util=cluster_util,
+             **kwargs,
+         )
+
+         logger.info(
+             f"Partition-{params.source_partition_locator} -> "
+             f"Compaction session data processing completed"
+         )
+         round_completion_file_s3_url = None
+         if new_partition:
+             logger.info(f"Committing compacted partition to: {new_partition.locator}")
+             partition = params.deltacat_storage.commit_partition(
+                 new_partition, **params.deltacat_storage_kwargs
+             )
+             logger.info(f"Committed compacted partition: {partition}")
+
+             round_completion_file_s3_url = rcf.write_round_completion_file(
+                 params.compaction_artifact_s3_bucket,
+                 new_rcf_partition_locator,
+                 new_rci,
+                 **params.s3_client_kwargs,
+             )
+         else:
+             logger.warn("No new partition was committed during compaction.")
+
+         logger.info(
+             f"Completed compaction session for: {params.source_partition_locator}"
+         )
+         return round_completion_file_s3_url
+
+
+ def _execute_compaction(
+     params: CompactPartitionParams, **kwargs
+ ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
+
+     rcf_source_partition_locator = (
+         params.rebase_source_partition_locator or params.source_partition_locator
+     )
+
+     base_audit_url = rcf_source_partition_locator.path(
+         f"s3://{params.compaction_artifact_s3_bucket}/compaction-audit"
+     )
+     audit_url = f"{base_audit_url}.json"
+     logger.info(f"Compaction audit will be written to {audit_url}")
+     compaction_audit = CompactionSessionAuditInfo(deltacat.__version__, audit_url)
+
+     compaction_audit.set_hash_bucket_count(params.hash_bucket_count)
+
+     compaction_start = time.monotonic()
+
+     task_max_parallelism = params.task_max_parallelism
+
+     if params.pg_config:
+         logger.info(
+             "pg_config specified. Tasks will be scheduled in a placement group."
+         )
+         cluster_resources = params.pg_config.resource
+         cluster_cpus = cluster_resources["CPU"]
+         cluster_memory = cluster_resources["memory"]
+         task_max_parallelism = cluster_cpus
+         compaction_audit.set_cluster_cpu_max(cluster_cpus)
+         compaction_audit.set_total_cluster_memory_bytes(cluster_memory)
+
+     # read the results from any previously completed compaction round
+     round_completion_info = None
+     high_watermark = None
+     previous_compacted_delta = None
+
+     if not params.rebase_source_partition_locator:
+         round_completion_info = rcf.read_round_completion_file(
+             params.compaction_artifact_s3_bucket,
+             params.source_partition_locator,
+             **params.s3_client_kwargs,
+         )
+         if not round_completion_info:
+             logger.info(
+                 f"Both rebase partition and round completion file not found. Performing an entire backfill on source."
+             )
+         else:
+             compacted_delta_locator = round_completion_info.compacted_delta_locator
+             previous_compacted_delta = params.deltacat_storage.get_delta(
+                 namespace=compacted_delta_locator.namespace,
+                 table_name=compacted_delta_locator.table_name,
+                 table_version=compacted_delta_locator.table_version,
+                 stream_position=compacted_delta_locator.stream_position,
+                 include_manifest=True,
+                 **params.deltacat_storage_kwargs,
+             )
+
+             high_watermark = round_completion_info.high_watermark
+             logger.info(f"Setting round completion high watermark: {high_watermark}")
+             assert (
+                 params.hash_bucket_count == round_completion_info.hash_bucket_count
+             ), (
+                 "The hash bucket count has changed. "
+                 "Kindly run rebase compaction and trigger incremental again. "
+                 f"Hash Bucket count in RCF={round_completion_info.hash_bucket_count} "
+                 f"not equal to Hash bucket count in args={params.hash_bucket_count}."
+             )
+
+         logger.info(f"Round completion file: {round_completion_info}")
+
+     delta_discovery_start = time.monotonic()
+
+     input_deltas = io.discover_deltas(
+         params.source_partition_locator,
+         params.last_stream_position_to_compact,
+         params.rebase_source_partition_locator,
+         params.rebase_source_partition_high_watermark,
+         high_watermark,
+         params.deltacat_storage,
+         params.deltacat_storage_kwargs,
+         params.list_deltas_kwargs,
+     )
+
+     delta_discovery_end = time.monotonic()
+     compaction_audit.set_delta_discovery_time_in_seconds(
+         delta_discovery_end - delta_discovery_start
+     )
+
+     s3_utils.upload(
+         compaction_audit.audit_url,
+         str(json.dumps(compaction_audit)),
+         **params.s3_client_kwargs,
+     )
+
+     if not input_deltas:
+         logger.info("No input deltas found to compact.")
+         return None, None, None
+
+     uniform_deltas = io.create_uniform_input_deltas(
+         input_deltas=input_deltas,
+         hash_bucket_count=params.hash_bucket_count,
+         compaction_audit=compaction_audit,
+         deltacat_storage=params.deltacat_storage,
+         previous_inflation=params.previous_inflation,
+         min_delta_bytes=params.min_delta_bytes_in_batch,
+         min_file_counts=params.min_files_in_batch,
+         deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+     )
+
+     compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
+
+     hb_options_provider = functools.partial(
+         task_resource_options_provider,
+         pg_config=params.pg_config,
+         resource_amount_provider=hash_bucket_resource_options_provider,
+         previous_inflation=params.previous_inflation,
+         average_record_size_bytes=params.average_record_size_bytes,
+         primary_keys=params.primary_keys,
+     )
+
+     hb_start = time.monotonic()
+
+     hash_bucket_input_provider = lambda index, item: {
+         "input": HashBucketInput.of(
+             item,
+             primary_keys=params.primary_keys,
+             num_hash_buckets=params.hash_bucket_count,
+             num_hash_groups=params.hash_group_count,
+             enable_profiler=params.enable_profiler,
+             metrics_config=params.metrics_config,
+             read_kwargs_provider=params.read_kwargs_provider,
+             object_store=params.object_store,
+             deltacat_storage=params.deltacat_storage,
+             deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+         )
+     }
+
+     hb_tasks_pending = invoke_parallel(
+         items=uniform_deltas,
+         ray_task=hb.hash_bucket,
+         max_parallelism=task_max_parallelism,
+         options_provider=hb_options_provider,
+         kwargs_provider=hash_bucket_input_provider,
+     )
+
+     hb_invoke_end = time.monotonic()
+
+     logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
+     hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
+     logger.info(f"Got {len(hb_results)} hash bucket results.")
+     hb_end = time.monotonic()
+
+     # we use time.time() here because time.monotonic() has no reference point
+     # whereas time.time() measures epoch seconds. Hence, it will be reasonable
+     # to compare time.time()s captured in different nodes.
+     hb_results_retrieved_at = time.time()
+
+     telemetry_time_hb = compaction_audit.save_step_stats(
+         CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
+         hb_results,
+         hb_results_retrieved_at,
+         hb_invoke_end - hb_start,
+         hb_end - hb_start,
+     )
+
+     s3_utils.upload(
+         compaction_audit.audit_url,
+         str(json.dumps(compaction_audit)),
+         **params.s3_client_kwargs,
+     )
+
+     all_hash_group_idx_to_obj_id = defaultdict(list)
+     all_hash_group_idx_to_size_bytes = defaultdict(int)
+     all_hash_group_idx_to_num_rows = defaultdict(int)
+     hb_data_processed_size_bytes = np.int64(0)
+     total_hb_record_count = np.int64(0)
+
+     # initialize all hash groups
+     for hb_group in range(params.hash_group_count):
+         all_hash_group_idx_to_num_rows[hb_group] = 0
+         all_hash_group_idx_to_obj_id[hb_group] = []
+         all_hash_group_idx_to_size_bytes[hb_group] = 0
+
+     for hb_result in hb_results:
+         hb_data_processed_size_bytes += hb_result.hb_size_bytes
+         total_hb_record_count += hb_result.hb_record_count
+
+         for hash_group_index, object_id_size_tuple in enumerate(
+             hb_result.hash_bucket_group_to_obj_id_tuple
+         ):
+             if object_id_size_tuple:
+                 all_hash_group_idx_to_obj_id[hash_group_index].append(
+                     object_id_size_tuple[0]
+                 )
+                 all_hash_group_idx_to_size_bytes[
+                     hash_group_index
+                 ] += object_id_size_tuple[1].item()
+                 all_hash_group_idx_to_num_rows[
+                     hash_group_index
+                 ] += object_id_size_tuple[2].item()
+
+     logger.info(
+         f"Got {total_hb_record_count} hash bucket records from hash bucketing step..."
+     )
+
+     compaction_audit.set_input_records(total_hb_record_count.item())
+     compaction_audit.set_hash_bucket_processed_size_bytes(
+         hb_data_processed_size_bytes.item()
+     )
+
+     # create a new stream for this round
+     compacted_stream_locator = params.destination_partition_locator.stream_locator
+     compacted_stream = params.deltacat_storage.get_stream(
+         compacted_stream_locator.namespace,
+         compacted_stream_locator.table_name,
+         compacted_stream_locator.table_version,
+         **params.deltacat_storage_kwargs,
+     )
+     compacted_partition = params.deltacat_storage.stage_partition(
+         compacted_stream,
+         params.destination_partition_locator.partition_values,
+         **params.deltacat_storage_kwargs,
+     )
+
+     # BSP Step 2: Merge
+     merge_options_provider = functools.partial(
+         task_resource_options_provider,
+         pg_config=params.pg_config,
+         resource_amount_provider=merge_resource_options_provider,
+         num_hash_groups=params.hash_group_count,
+         hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
+         hash_group_num_rows=all_hash_group_idx_to_num_rows,
+         round_completion_info=round_completion_info,
+         compacted_delta=previous_compacted_delta,
+         primary_keys=params.primary_keys,
+         deltacat_storage=params.deltacat_storage,
+         deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+     )
+
+     merge_input_provider = lambda index, item: {
+         "input": MergeInput.of(
+             dfe_groups_refs=item[1],
+             write_to_partition=compacted_partition,
+             compacted_file_content_type=params.compacted_file_content_type,
+             primary_keys=params.primary_keys,
+             sort_keys=params.sort_keys,
+             merge_task_index=index,
+             hash_group_index=item[0],
+             num_hash_groups=params.hash_group_count,
+             max_records_per_output_file=params.records_per_compacted_file,
+             enable_profiler=params.enable_profiler,
+             metrics_config=params.metrics_config,
+             s3_table_writer_kwargs=params.s3_table_writer_kwargs,
+             read_kwargs_provider=params.read_kwargs_provider,
+             round_completion_info=round_completion_info,
+             object_store=params.object_store,
+             deltacat_storage=params.deltacat_storage,
+             deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+         )
+     }
+
+     merge_start = time.monotonic()
+
+     merge_tasks_pending = invoke_parallel(
+         items=all_hash_group_idx_to_obj_id.items(),
+         ray_task=mg.merge,
+         max_parallelism=task_max_parallelism,
+         options_provider=merge_options_provider,
+         kwargs_provider=merge_input_provider,
+     )
+
+     merge_invoke_end = time.monotonic()
+     logger.info(f"Getting {len(merge_tasks_pending)} merge results...")
+     merge_results: List[MergeResult] = ray.get(merge_tasks_pending)
+     logger.info(f"Got {len(merge_results)} merge results.")
+
+     merge_results_retrieved_at = time.time()
+     merge_end = time.monotonic()
+
+     total_dd_record_count = sum([ddr.deduped_record_count for ddr in merge_results])
+     logger.info(f"Deduped {total_dd_record_count} records...")
+
+     telemetry_time_merge = compaction_audit.save_step_stats(
+         CompactionSessionAuditInfo.MERGE_STEP_NAME,
+         merge_results,
+         merge_results_retrieved_at,
+         merge_invoke_end - merge_start,
+         merge_end - merge_start,
+     )
+
+     compaction_audit.set_records_deduped(total_dd_record_count.item())
+
+     mat_results = []
+     for merge_result in merge_results:
+         mat_results.extend(merge_result.materialize_results)
+
+     mat_results: List[MaterializeResult] = sorted(
+         mat_results, key=lambda m: m.task_index
+     )
+
+     deltas = [m.delta for m in mat_results]
+
+     hb_id_to_entry_indices_range = {}
+     file_index = 0
+     previous_task_index = -1
+
+     for m in mat_results:
+         assert m.pyarrow_write_result.files >= 1, "Atleast file must be materialized"
+         assert m.task_index != previous_task_index, (
+             "Multiple materialize results found for a " f"hash bucket: {m.task_index}"
+         )
+
+         hb_id_to_entry_indices_range[str(m.task_index)] = (
+             file_index,
+             file_index + m.pyarrow_write_result.files - 1,
+         )
+
+         file_index += m.pyarrow_write_result.files
+         previous_task_index = m.task_index
+
+     s3_utils.upload(
+         compaction_audit.audit_url,
+         str(json.dumps(compaction_audit)),
+         **params.s3_client_kwargs,
+     )
+
+     mat_results = sorted(mat_results, key=lambda m: m.task_index)
+     deltas = [m.delta for m in mat_results]
+
+     # Note: An appropriate last stream position must be set
+     # to avoid correctness issue.
+     merged_delta = Delta.merge_deltas(
+         deltas,
+         stream_position=params.last_stream_position_to_compact,
+     )
+
+     record_info_msg = (
+         f"Hash bucket records: {total_hb_record_count},"
+         f" Deduped records: {total_dd_record_count}, "
+         f" Materialized records: {merged_delta.meta.record_count}"
+     )
+     logger.info(record_info_msg)
+
+     compacted_delta = params.deltacat_storage.commit_delta(
+         merged_delta,
+         properties=kwargs.get("properties", {}),
+         **params.deltacat_storage_kwargs,
+     )
+
+     logger.info(f"Committed compacted delta: {compacted_delta}")
+
+     compaction_end = time.monotonic()
+     compaction_audit.set_compaction_time_in_seconds(compaction_end - compaction_start)
+
+     new_compacted_delta_locator = DeltaLocator.of(
+         compacted_partition.locator,
+         compacted_delta.stream_position,
+     )
+
+     pyarrow_write_result = PyArrowWriteResult.union(
+         [m.pyarrow_write_result for m in mat_results]
+     )
+
+     session_peak_memory = get_current_node_peak_memory_usage_in_bytes()
+     compaction_audit.set_peak_memory_used_bytes_by_compaction_session_process(
+         session_peak_memory
+     )
+
+     compaction_audit.save_round_completion_stats(
+         mat_results, telemetry_time_hb + telemetry_time_merge
+     )
+
+     cluster_util: ClusterUtilizationOverTimeRange = kwargs.get("cluster_util")
+
+     if cluster_util:
+         compaction_audit.set_total_cpu_seconds(cluster_util.total_vcpu_seconds)
+         compaction_audit.set_used_cpu_seconds(cluster_util.used_vcpu_seconds)
+
+     s3_utils.upload(
+         compaction_audit.audit_url,
+         str(json.dumps(compaction_audit)),
+         **params.s3_client_kwargs,
+     )
+
+     new_round_completion_info = RoundCompletionInfo.of(
+         high_watermark=params.last_stream_position_to_compact,
+         compacted_delta_locator=new_compacted_delta_locator,
+         compacted_pyarrow_write_result=pyarrow_write_result,
+         sort_keys_bit_width=params.bit_width_of_sort_keys,
+         manifest_entry_copied_by_reference_ratio=compaction_audit.untouched_file_ratio,
+         compaction_audit_url=audit_url,
+         hash_bucket_count=params.hash_bucket_count,
+         hb_index_to_entry_range=hb_id_to_entry_indices_range,
+     )
+
+     logger.info(
+         f"partition-{params.source_partition_locator.partition_values},"
+         f"compacted at: {params.last_stream_position_to_compact},"
+     )
+
+     return (
+         compacted_partition,
+         new_round_completion_info,
+         rcf_source_partition_locator,
+     )
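For orientation, here is a hedged sketch (not part of the diff) of how this new v2 entry point might be driven, assuming CompactPartitionParams.of accepts a dict keyed by the same property names that _execute_compaction reads above. The locators, storage implementation, and literal values are hypothetical placeholders.

    import ray

    from deltacat.compute.compactor.model.compact_partition_params import (
        CompactPartitionParams,
    )
    from deltacat.compute.compactor_v2.compaction_session import compact_partition
    from deltacat.types.media import ContentType

    ray.init()

    # All locator/storage values below are hypothetical placeholders.
    params = CompactPartitionParams.of(
        {
            "source_partition_locator": source_partition_locator,
            "destination_partition_locator": destination_partition_locator,
            "last_stream_position_to_compact": source_stream_position,
            "primary_keys": ["id"],
            "hash_bucket_count": 16,  # required: compactor v2 asserts hash_bucket_count >= 1
            "compaction_artifact_s3_bucket": "my-compaction-artifacts",
            "compacted_file_content_type": ContentType.PARQUET,
            "deltacat_storage": my_deltacat_storage_impl,
        }
    )

    # Returns the S3 URL of the round completion file, or None when no new
    # partition was committed (e.g., no input deltas were discovered).
    round_completion_file_url = compact_partition(params)

The returned URL points at the round completion file that a later incremental run reads back through rcf.read_round_completion_file, as shown in _execute_compaction above.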
deltacat/compute/compactor_v2/constants.py (new file)
@@ -0,0 +1,34 @@
+ TOTAL_BYTES_IN_SHA1_HASH = 20
+
+ PK_DELIMITER = "L6kl7u5f"
+
+ MAX_RECORDS_PER_COMPACTED_FILE = 4_000_000
+
+ # The maximum amount of delta bytes allowed in a batch.
+ # A single task will not process more than these many bytes
+ # unless a single manifest entry (non-parquet) or single row
+ # group (parquet) is bigger than this size.
+ MIN_DELTA_BYTES_IN_BATCH = 5_000_000_000
+
+ # The total number of files that can be processed in a
+ # batch. Hence, if there are tiny files, this value can be
+ # limited so that enough parallelism can be attained.
+ MIN_FILES_IN_BATCH = float("inf")
+
+ # The average record size in a table.
+ AVERAGE_RECORD_SIZE_BYTES = 1000
+
+ # Maximum parallelism for the tasks at each BSP step.
+ # Default is the number of vCPUs in about 168
+ # r5.8xlarge EC2 instances.
+ TASK_MAX_PARALLELISM = 5367
+
+ # The percentage of memory that needs to be estimated
+ # as buffer. This value will ensure the job doesn't run out
+ # of memory by considering buffer for uncertainities.
+ TOTAL_MEMORY_BUFFER_PERCENTAGE = 20
+
+ # The total size of records that will be hash bucketed at once
+ # Since, sorting is nlogn, we ensure that is not performed
+ # on a very large dataset for best performance.
+ MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
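As a quick illustration (not part of the diff) of how a percentage buffer like TOTAL_MEMORY_BUFFER_PERCENTAGE is typically applied, the helper below pads a raw per-task memory estimate before it would be handed to a resource options provider; the helper itself is hypothetical.

    from deltacat.compute.compactor_v2.constants import TOTAL_MEMORY_BUFFER_PERCENTAGE


    def with_memory_buffer(estimated_task_memory_bytes: float) -> float:
        # Pad the raw estimate by the configured percentage so a task is less
        # likely to be OOM-killed when the estimate is slightly off.
        return estimated_task_memory_bytes * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100)


    # A 10 GiB estimate becomes a ~12 GiB request with the default 20% buffer.
    print(with_memory_buffer(10 * 1024**3))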
deltacat/compute/compactor_v2/model/__init__.py — File without changes
deltacat/compute/compactor_v2/model/hash_bucket_input.py (new file)
@@ -0,0 +1,78 @@
+ from __future__ import annotations
+
+ from typing import Dict, List, Optional, Any
+ from deltacat.utils.metrics import MetricsConfig
+ from deltacat.utils.common import ReadKwargsProvider
+ from deltacat.io.object_store import IObjectStore
+ from deltacat.storage import interface as unimplemented_deltacat_storage
+ from deltacat.compute.compactor import DeltaAnnotated
+
+
+ class HashBucketInput(Dict):
+     @staticmethod
+     def of(
+         annotated_delta: DeltaAnnotated,
+         primary_keys: List[str],
+         num_hash_buckets: int,
+         num_hash_groups: int,
+         enable_profiler: Optional[bool] = False,
+         metrics_config: Optional[MetricsConfig] = None,
+         read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+         object_store: Optional[IObjectStore] = None,
+         deltacat_storage=unimplemented_deltacat_storage,
+         deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> HashBucketInput:
+
+         result = HashBucketInput()
+         result["annotated_delta"] = annotated_delta
+         result["primary_keys"] = primary_keys
+         result["num_hash_buckets"] = num_hash_buckets
+         result["num_hash_groups"] = num_hash_groups
+         result["enable_profiler"] = enable_profiler
+         result["metrics_config"] = metrics_config
+         result["read_kwargs_provider"] = read_kwargs_provider
+         result["object_store"] = object_store
+         result["deltacat_storage"] = deltacat_storage
+         result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
+
+         return result
+
+     @property
+     def annotated_delta(self) -> DeltaAnnotated:
+         return self["annotated_delta"]
+
+     @property
+     def primary_keys(self) -> List[str]:
+         return self["primary_keys"]
+
+     @property
+     def num_hash_buckets(self) -> int:
+         return self["num_hash_buckets"]
+
+     @property
+     def num_hash_groups(self) -> int:
+         return self["num_hash_groups"]
+
+     @property
+     def enable_profiler(self) -> Optional[bool]:
+         return self.get("enable_profiler")
+
+     @property
+     def metrics_config(self) -> Optional[MetricsConfig]:
+         return self.get("metrics_config")
+
+     @property
+     def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
+         return self.get("read_kwargs_provider")
+
+     @property
+     def object_store(self) -> Optional[IObjectStore]:
+         return self.get("object_store")
+
+     @property
+     def deltacat_storage(self) -> unimplemented_deltacat_storage:
+         return self.get("deltacat_storage")
+
+     @property
+     def deltacat_storage_kwargs(self) -> Optional[Dict[str, Any]]:
+         return self.get("deltacat_storage_kwargs")
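A brief usage sketch (not from the diff) mirroring the hash_bucket_input_provider in compaction_session.py above; annotated_delta stands in for a DeltaAnnotated produced by io.create_uniform_input_deltas and is a hypothetical placeholder here.

    from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput

    hb_input = HashBucketInput.of(
        annotated_delta,  # hypothetical DeltaAnnotated from create_uniform_input_deltas
        primary_keys=["id"],
        num_hash_buckets=16,
        num_hash_groups=4,
    )

    # HashBucketInput is a plain dict, so it pickles cleanly for Ray tasks, and
    # the optional fields fall back to None ({} for deltacat_storage_kwargs).
    assert hb_input.num_hash_buckets == 16
    assert hb_input.metrics_config is None
    assert hb_input.deltacat_storage_kwargs == {}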
deltacat/compute/compactor_v2/model/hash_bucket_result.py (new file)
@@ -0,0 +1,12 @@
+ from typing import NamedTuple
+
+ import numpy as np
+
+
+ class HashBucketResult(NamedTuple):
+     hash_bucket_group_to_obj_id_tuple: np.ndarray
+     hb_size_bytes: np.int64
+     hb_record_count: np.int64
+     peak_memory_usage_bytes: np.double
+     telemetry_time_in_seconds: np.double
+     task_completed_at: np.double
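To show the shape of the data a hash bucket task hands back, here is an illustrative construction (not from the diff) with arbitrary values; the driver aggregates hb_size_bytes and hb_record_count from these results exactly as _execute_compaction does above.

    import time

    import numpy as np

    from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult

    result = HashBucketResult(
        # one slot per hash group; 4 groups is an arbitrary choice here
        hash_bucket_group_to_obj_id_tuple=np.empty(4, dtype="object"),
        hb_size_bytes=np.int64(1_048_576),
        hb_record_count=np.int64(1_000),
        peak_memory_usage_bytes=np.double(2 * 1_048_576),
        telemetry_time_in_seconds=np.double(0.05),
        task_completed_at=np.double(time.time()),
    )

    total_record_count = np.int64(0)
    total_record_count += result.hb_record_count
    print(total_record_count.item())  # 1000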