deltacat 0.2.10__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/s3u.py +250 -111
- deltacat/catalog/default_catalog_impl/__init__.py +369 -0
- deltacat/compute/compactor_v2/compaction_session.py +175 -152
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
- deltacat/compute/compactor_v2/model/merge_input.py +8 -24
- deltacat/compute/compactor_v2/model/merge_result.py +1 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
- deltacat/compute/compactor_v2/steps/merge.py +106 -171
- deltacat/compute/compactor_v2/utils/delta.py +97 -0
- deltacat/compute/compactor_v2/utils/merge.py +126 -0
- deltacat/compute/compactor_v2/utils/task_options.py +16 -4
- deltacat/compute/merge_on_read/__init__.py +4 -0
- deltacat/compute/merge_on_read/daft.py +40 -0
- deltacat/compute/merge_on_read/model/__init__.py +0 -0
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
- deltacat/compute/merge_on_read/utils/__init__.py +0 -0
- deltacat/compute/merge_on_read/utils/delta.py +42 -0
- deltacat/storage/interface.py +10 -2
- deltacat/storage/model/types.py +3 -11
- deltacat/tests/catalog/__init__.py +0 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
- deltacat/tests/compute/compact_partition_test_cases.py +126 -1
- deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
- deltacat/tests/local_deltacat_storage/__init__.py +19 -2
- deltacat/tests/test_utils/pyarrow.py +33 -14
- deltacat/tests/utils/test_daft.py +42 -2
- deltacat/types/media.py +5 -0
- deltacat/types/tables.py +7 -1
- deltacat/utils/daft.py +78 -13
- {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/METADATA +2 -2
- {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/RECORD +37 -25
- {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/LICENSE +0 -0
- {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/WHEEL +0 -0
- {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/compaction_session.py
@@ -6,18 +6,24 @@ import logging
 import ray
 import time
 import json
+
+from deltacat.compute.compactor_v2.model.merge_file_group import (
+    RemoteMergeFileGroupsProvider,
+)
+from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
+
+from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+
 from deltacat.aws import s3u as s3_utils
 import deltacat
 from deltacat import logs
-from deltacat.compute.compactor import (
-    PyArrowWriteResult,
-    RoundCompletionInfo,
-)
-from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+from deltacat.compute.compactor import PyArrowWriteResult, RoundCompletionInfo
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
-from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
 from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+from deltacat.compute.compactor_v2.utils.merge import (
+    generate_local_merge_input,
+)
 from deltacat.storage import (
     Delta,
     DeltaLocator,
@@ -210,107 +216,6 @@ def _execute_compaction(
         logger.info("No input deltas found to compact.")
         return None, None, None
 
-    hb_options_provider = functools.partial(
-        task_resource_options_provider,
-        pg_config=params.pg_config,
-        resource_amount_provider=hash_bucket_resource_options_provider,
-        previous_inflation=params.previous_inflation,
-        average_record_size_bytes=params.average_record_size_bytes,
-        primary_keys=params.primary_keys,
-        ray_custom_resources=params.ray_custom_resources,
-    )
-
-    hb_start = time.monotonic()
-
-    def hash_bucket_input_provider(index, item):
-        return {
-            "input": HashBucketInput.of(
-                item,
-                primary_keys=params.primary_keys,
-                num_hash_buckets=params.hash_bucket_count,
-                num_hash_groups=params.hash_group_count,
-                enable_profiler=params.enable_profiler,
-                metrics_config=params.metrics_config,
-                read_kwargs_provider=params.read_kwargs_provider,
-                object_store=params.object_store,
-                deltacat_storage=params.deltacat_storage,
-                deltacat_storage_kwargs=params.deltacat_storage_kwargs,
-            )
-        }
-
-    hb_tasks_pending = invoke_parallel(
-        items=uniform_deltas,
-        ray_task=hb.hash_bucket,
-        max_parallelism=task_max_parallelism,
-        options_provider=hb_options_provider,
-        kwargs_provider=hash_bucket_input_provider,
-    )
-
-    hb_invoke_end = time.monotonic()
-
-    logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
-    hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
-    logger.info(f"Got {len(hb_results)} hash bucket results.")
-    hb_end = time.monotonic()
-
-    # we use time.time() here because time.monotonic() has no reference point
-    # whereas time.time() measures epoch seconds. Hence, it will be reasonable
-    # to compare time.time()s captured in different nodes.
-    hb_results_retrieved_at = time.time()
-
-    telemetry_time_hb = compaction_audit.save_step_stats(
-        CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
-        hb_results,
-        hb_results_retrieved_at,
-        hb_invoke_end - hb_start,
-        hb_end - hb_start,
-    )
-
-    s3_utils.upload(
-        compaction_audit.audit_url,
-        str(json.dumps(compaction_audit)),
-        **params.s3_client_kwargs,
-    )
-
-    all_hash_group_idx_to_obj_id = defaultdict(list)
-    all_hash_group_idx_to_size_bytes = defaultdict(int)
-    all_hash_group_idx_to_num_rows = defaultdict(int)
-    hb_data_processed_size_bytes = np.int64(0)
-    total_hb_record_count = np.int64(0)
-
-    # initialize all hash groups
-    for hb_group in range(params.hash_group_count):
-        all_hash_group_idx_to_num_rows[hb_group] = 0
-        all_hash_group_idx_to_obj_id[hb_group] = []
-        all_hash_group_idx_to_size_bytes[hb_group] = 0
-
-    for hb_result in hb_results:
-        hb_data_processed_size_bytes += hb_result.hb_size_bytes
-        total_hb_record_count += hb_result.hb_record_count
-
-        for hash_group_index, object_id_size_tuple in enumerate(
-            hb_result.hash_bucket_group_to_obj_id_tuple
-        ):
-            if object_id_size_tuple:
-                all_hash_group_idx_to_obj_id[hash_group_index].append(
-                    object_id_size_tuple[0]
-                )
-                all_hash_group_idx_to_size_bytes[
-                    hash_group_index
-                ] += object_id_size_tuple[1].item()
-                all_hash_group_idx_to_num_rows[
-                    hash_group_index
-                ] += object_id_size_tuple[2].item()
-
-    logger.info(
-        f"Got {total_hb_record_count} hash bucket records from hash bucketing step..."
-    )
-
-    compaction_audit.set_input_records(total_hb_record_count.item())
-    compaction_audit.set_hash_bucket_processed_size_bytes(
-        hb_data_processed_size_bytes.item()
-    )
-
     # create a new stream for this round
     compacted_stream_locator = params.destination_partition_locator.stream_locator
     compacted_stream = params.deltacat_storage.get_stream(
@@ -325,60 +230,176 @@ def _execute_compaction(
         **params.deltacat_storage_kwargs,
     )
 
-
-    merge_options_provider = functools.partial(
+    hb_options_provider = functools.partial(
         task_resource_options_provider,
         pg_config=params.pg_config,
-        resource_amount_provider=merge_resource_options_provider,
-        num_hash_groups=params.hash_group_count,
-        hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
-        hash_group_num_rows=all_hash_group_idx_to_num_rows,
-        round_completion_info=round_completion_info,
-        compacted_delta_manifest=previous_compacted_delta_manifest,
+        resource_amount_provider=hash_bucket_resource_options_provider,
+        previous_inflation=params.previous_inflation,
+        average_record_size_bytes=params.average_record_size_bytes,
         primary_keys=params.primary_keys,
-        deltacat_storage=params.deltacat_storage,
-        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
         ray_custom_resources=params.ray_custom_resources,
     )
 
-    [24 lines removed here: the previous merge_input_provider definition (content not rendered in this extraction)]
+    total_input_records_count = np.int64(0)
+    total_hb_record_count = np.int64(0)
+    telemetry_time_hb = 0
+    if params.hash_bucket_count == 1:
+        merge_start = time.monotonic()
+        local_merge_input = generate_local_merge_input(
+            params, uniform_deltas, compacted_partition, round_completion_info
+        )
+        local_merge_result = ray.get(mg.merge.remote(local_merge_input))
+        total_input_records_count += local_merge_result.input_record_count
+        merge_results = [local_merge_result]
+        merge_invoke_end = time.monotonic()
+    else:
+        hb_start = time.monotonic()
+
+        def hash_bucket_input_provider(index, item):
+            return {
+                "input": HashBucketInput.of(
+                    item,
+                    primary_keys=params.primary_keys,
+                    hb_task_index=index,
+                    num_hash_buckets=params.hash_bucket_count,
+                    num_hash_groups=params.hash_group_count,
+                    enable_profiler=params.enable_profiler,
+                    metrics_config=params.metrics_config,
+                    read_kwargs_provider=params.read_kwargs_provider,
+                    object_store=params.object_store,
+                    deltacat_storage=params.deltacat_storage,
+                    deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+                )
+            }
+
+        all_hash_group_idx_to_obj_id = defaultdict(list)
+        all_hash_group_idx_to_size_bytes = defaultdict(int)
+        all_hash_group_idx_to_num_rows = defaultdict(int)
+        hb_tasks_pending = invoke_parallel(
+            items=uniform_deltas,
+            ray_task=hb.hash_bucket,
+            max_parallelism=task_max_parallelism,
+            options_provider=hb_options_provider,
+            kwargs_provider=hash_bucket_input_provider,
+        )
 
-    [1 line removed here (content not rendered in this extraction)]
+        hb_invoke_end = time.monotonic()
 
-    [7 lines removed here: the previous parallel merge invocation (content not rendered in this extraction)]
+        logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
+        hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
+        logger.info(f"Got {len(hb_results)} hash bucket results.")
+        hb_end = time.monotonic()
+
+        # we use time.time() here because time.monotonic() has no reference point
+        # whereas time.time() measures epoch seconds. Hence, it will be reasonable
+        # to compare time.time()s captured in different nodes.
+        hb_results_retrieved_at = time.time()
+
+        telemetry_time_hb = compaction_audit.save_step_stats(
+            CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
+            hb_results,
+            hb_results_retrieved_at,
+            hb_invoke_end - hb_start,
+            hb_end - hb_start,
+        )
+
+        s3_utils.upload(
+            compaction_audit.audit_url,
+            str(json.dumps(compaction_audit)),
+            **params.s3_client_kwargs,
+        )
+
+        hb_data_processed_size_bytes = np.int64(0)
+
+        # initialize all hash groups
+        for hb_group in range(params.hash_group_count):
+            all_hash_group_idx_to_num_rows[hb_group] = 0
+            all_hash_group_idx_to_obj_id[hb_group] = []
+            all_hash_group_idx_to_size_bytes[hb_group] = 0
+
+        for hb_result in hb_results:
+            hb_data_processed_size_bytes += hb_result.hb_size_bytes
+            total_input_records_count += hb_result.hb_record_count
+
+            for hash_group_index, object_id_size_tuple in enumerate(
+                hb_result.hash_bucket_group_to_obj_id_tuple
+            ):
+                if object_id_size_tuple:
+                    all_hash_group_idx_to_obj_id[hash_group_index].append(
+                        object_id_size_tuple[0],
+                    )
+                    all_hash_group_idx_to_size_bytes[
+                        hash_group_index
+                    ] += object_id_size_tuple[1].item()
+                    all_hash_group_idx_to_num_rows[
+                        hash_group_index
+                    ] += object_id_size_tuple[2].item()
+
+        logger.info(
+            f"Got {total_input_records_count} hash bucket records from hash bucketing step..."
+        )
+
+        total_hb_record_count = total_input_records_count
+        compaction_audit.set_hash_bucket_processed_size_bytes(
+            hb_data_processed_size_bytes.item()
+        )
+
+        # BSP Step 2: Merge
+        merge_options_provider = functools.partial(
+            task_resource_options_provider,
+            pg_config=params.pg_config,
+            resource_amount_provider=merge_resource_options_provider,
+            num_hash_groups=params.hash_group_count,
+            hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
+            hash_group_num_rows=all_hash_group_idx_to_num_rows,
+            round_completion_info=round_completion_info,
+            compacted_delta_manifest=previous_compacted_delta_manifest,
+            primary_keys=params.primary_keys,
+            deltacat_storage=params.deltacat_storage,
+            deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+            ray_custom_resources=params.ray_custom_resources,
+        )
+
+        def merge_input_provider(index, item):
+            return {
+                "input": MergeInput.of(
+                    merge_file_groups_provider=RemoteMergeFileGroupsProvider(
+                        hash_group_index=item[0],
+                        dfe_groups_refs=item[1],
+                        hash_bucket_count=params.hash_bucket_count,
+                        num_hash_groups=params.hash_group_count,
+                        object_store=params.object_store,
+                    ),
+                    write_to_partition=compacted_partition,
+                    compacted_file_content_type=params.compacted_file_content_type,
+                    primary_keys=params.primary_keys,
+                    sort_keys=params.sort_keys,
+                    merge_task_index=index,
+                    drop_duplicates=params.drop_duplicates,
+                    max_records_per_output_file=params.records_per_compacted_file,
+                    enable_profiler=params.enable_profiler,
+                    metrics_config=params.metrics_config,
+                    s3_table_writer_kwargs=params.s3_table_writer_kwargs,
+                    read_kwargs_provider=params.read_kwargs_provider,
+                    round_completion_info=round_completion_info,
+                    object_store=params.object_store,
+                    deltacat_storage=params.deltacat_storage,
+                    deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+                )
+            }
+
+        merge_start = time.monotonic()
+        merge_tasks_pending = invoke_parallel(
+            items=all_hash_group_idx_to_obj_id.items(),
+            ray_task=mg.merge,
+            max_parallelism=task_max_parallelism,
+            options_provider=merge_options_provider,
+            kwargs_provider=merge_input_provider,
+        )
+        merge_invoke_end = time.monotonic()
+        logger.info(f"Getting {len(merge_tasks_pending)} merge results...")
+        merge_results: List[MergeResult] = ray.get(merge_tasks_pending)
 
-    merge_invoke_end = time.monotonic()
-    logger.info(f"Getting {len(merge_tasks_pending)} merge results...")
-    merge_results: List[MergeResult] = ray.get(merge_tasks_pending)
     logger.info(f"Got {len(merge_results)} merge results.")
 
     merge_results_retrieved_at = time.time()
@@ -387,6 +408,8 @@ def _execute_compaction(
     total_dd_record_count = sum([ddr.deduped_record_count for ddr in merge_results])
     logger.info(f"Deduped {total_dd_record_count} records...")
 
+    compaction_audit.set_input_records(total_input_records_count.item())
+
     telemetry_time_merge = compaction_audit.save_step_stats(
         CompactionSessionAuditInfo.MERGE_STEP_NAME,
         merge_results,
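The net effect of the compaction_session.py changes above is a new fork in `_execute_compaction`: when `hash_bucket_count == 1`, hash bucketing is skipped and all deltas are merged by a single local merge task; otherwise the parallel hash bucket and merge steps run as before. A minimal, self-contained sketch of that control flow follows; `run_local_merge` and `run_parallel_hash_bucket_and_merge` are hypothetical stand-ins for the Ray task invocations shown in the diff, not DeltaCAT APIs.

```python
from typing import Any, Callable, List


def choose_merge_path(
    hash_bucket_count: int,
    uniform_deltas: List[Any],
    run_local_merge: Callable[[List[Any]], Any],
    run_parallel_hash_bucket_and_merge: Callable[[List[Any]], List[Any]],
) -> List[Any]:
    """Mirror the 1.0.0 branching: one local merge task vs. hash bucket + merge."""
    if hash_bucket_count == 1:
        # A single hash bucket gives hash bucketing nothing to parallelize,
        # so every delta is read and merged inside one task.
        return [run_local_merge(uniform_deltas)]
    # Multiple hash buckets: bucket the deltas first, then merge per hash group.
    return run_parallel_hash_bucket_and_merge(uniform_deltas)
```

Both paths leave `merge_results` in the same shape, which is why the audit and telemetry code after the branch only changes to use the new `total_input_records_count` bookkeeping.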
deltacat/compute/compactor_v2/model/hash_bucket_input.py
@@ -15,6 +15,7 @@ class HashBucketInput(Dict):
         primary_keys: List[str],
         num_hash_buckets: int,
         num_hash_groups: int,
+        hb_task_index: Optional[int] = 0,
         enable_profiler: Optional[bool] = False,
         metrics_config: Optional[MetricsConfig] = None,
         read_kwargs_provider: Optional[ReadKwargsProvider] = None,
@@ -26,6 +27,7 @@ class HashBucketInput(Dict):
         result = HashBucketInput()
         result["annotated_delta"] = annotated_delta
         result["primary_keys"] = primary_keys
+        result["hb_task_index"] = hb_task_index
         result["num_hash_buckets"] = num_hash_buckets
         result["num_hash_groups"] = num_hash_groups
         result["enable_profiler"] = enable_profiler
@@ -45,6 +47,10 @@ class HashBucketInput(Dict):
     def primary_keys(self) -> List[str]:
         return self["primary_keys"]
 
+    @property
+    def hb_task_index(self) -> List[str]:
+        return self["hb_task_index"]
+
     @property
     def num_hash_buckets(self) -> int:
         return self["num_hash_buckets"]
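With the new `hb_task_index` field, each hash bucket task can be told its ordinal from `invoke_parallel` (see `hash_bucket_input_provider` above). A small usage sketch, assuming `deltacat>=1.0.0` is installed and the remaining keyword arguments of `HashBucketInput.of` keep their defaults; `None` is a placeholder for a real annotated delta.

```python
from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput

# Build the dict-backed parameter object consumed by the hash bucket task.
hb_input = HashBucketInput.of(
    None,  # placeholder for the DeltaAnnotated normally passed as `item`
    primary_keys=["id"],
    num_hash_buckets=8,
    num_hash_groups=2,
    hb_task_index=3,  # new in 1.0.0: the task's index from invoke_parallel
)
assert hb_input.hb_task_index == 3
```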
deltacat/compute/compactor_v2/model/merge_file_group.py (new file)
@@ -0,0 +1,213 @@
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
+
+import logging
+import time
+from abc import ABC, abstractmethod
+from collections import defaultdict
+
+from deltacat.utils.common import ReadKwargsProvider
+from ray.types import ObjectRef
+
+from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
+from deltacat.compute.compactor_v2.utils.delta import read_delta_file_envelopes
+
+from deltacat.compute.compactor_v2.utils.primary_key_index import (
+    hash_group_index_to_hash_bucket_indices,
+)
+
+from deltacat.storage import interface as unimplemented_deltacat_storage
+
+from deltacat.io.object_store import IObjectStore
+
+from deltacat import logs
+
+from deltacat.compute.compactor import DeltaFileEnvelope, DeltaAnnotated
+
+from typing import List, Optional
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class MergeFileGroup(dict):
+    @staticmethod
+    def of(hb_index: int, dfe_groups: Optional[List[List[DeltaFileEnvelope]]] = None):
+        """
+        Creates a container with delta file envelope groupings and other
+        additional properties used primarily for the merging step.
+
+        Args:
+            hb_index: This signifies the hash bucket index corresponding to the envelope delta file groups.
+            dfe_groups: A list of delta file envelope groups.
+                If not present, the provided hash bucket index is a copy by reference candidate during the merge step.
+
+        Returns:
+            A dict
+
+        """
+        d = MergeFileGroup()
+        d["hb_index"] = hb_index
+        d["dfe_groups"] = dfe_groups
+        return d
+
+    @property
+    def dfe_groups(self) -> Optional[List[List[DeltaFileEnvelope]]]:
+        return self["dfe_groups"]
+
+    @property
+    def hb_index(self) -> int:
+        return self["hb_index"]
+
+
+class MergeFileGroupsProvider(ABC):
+    @abstractmethod
+    def create(self) -> List[MergeFileGroup]:
+        """
+        Creates a list of merge file groups.
+
+        Returns: a list of merge file groups.
+
+        """
+        raise NotImplementedError("Method not implemented")
+
+    @property
+    @abstractmethod
+    def hash_group_index(self):
+        raise NotImplementedError("Method not implemented")
+
+
+class LocalMergeFileGroupsProvider(MergeFileGroupsProvider):
+    """
+    A factory class for producing merge file groups given local delta file envelopes.
+    """
+
+    LOCAL_HASH_BUCKET_INDEX = 0
+    LOCAL_HASH_GROUP_INDEX = 0
+
+    def __init__(
+        self,
+        uniform_deltas: List[DeltaAnnotated],
+        read_kwargs_provider: Optional[ReadKwargsProvider],
+        deltacat_storage=unimplemented_deltacat_storage,
+        deltacat_storage_kwargs: Optional[dict] = None,
+    ):
+        self._deltas = uniform_deltas
+        self._read_kwargs_provider = read_kwargs_provider
+        self._deltacat_storage = deltacat_storage
+        self._deltacat_storage_kwargs = deltacat_storage_kwargs
+        self._loaded_deltas = False
+
+    def _read_deltas_locally(self):
+        local_dfe_list = []
+        input_records_count = 0
+        uniform_deltas = self._deltas
+        logger.info(f"Getting {len(uniform_deltas)} DFE Tasks.")
+        dfe_start = time.monotonic()
+        for annotated_delta in uniform_deltas:
+            (
+                delta_file_envelopes,
+                total_record_count,
+                total_size_bytes,
+            ) = read_delta_file_envelopes(
+                annotated_delta,
+                self._read_kwargs_provider,
+                self._deltacat_storage,
+                self._deltacat_storage_kwargs,
+            )
+            if delta_file_envelopes:
+                local_dfe_list.extend(delta_file_envelopes)
+                input_records_count += total_record_count
+        dfe_end = time.monotonic()
+        logger.info(
+            f"Retrieved {len(local_dfe_list)} DFE Tasks in {dfe_end - dfe_start}s."
+        )
+
+        self._dfe_groups = [local_dfe_list] if len(local_dfe_list) > 0 else None
+        self._loaded_deltas = True
+
+    def create(self) -> List[MergeFileGroup]:
+        if not self._loaded_deltas:
+            self._read_deltas_locally()
+
+        # Since hash bucketing is skipped for local merges, we use a fixed index here.
+        return [
+            MergeFileGroup.of(
+                hb_index=LocalMergeFileGroupsProvider.LOCAL_HASH_BUCKET_INDEX,
+                dfe_groups=self._dfe_groups,
+            )
+        ]
+
+    @property
+    def hash_group_index(self):
+        return LocalMergeFileGroupsProvider.LOCAL_HASH_GROUP_INDEX
+
+
+class RemoteMergeFileGroupsProvider(MergeFileGroupsProvider):
+    """
+    A factory class for producing merge file groups given delta file envelope object refs
+    and hash bucketing parameters. Delta file envelopes are pulled from the object store
+    remotely and loaded with in-memory pyarrow tables.
+    """
+
+    def __init__(
+        self,
+        hash_group_index: int,
+        dfe_groups_refs: List[ObjectRef[DeltaFileEnvelopeGroups]],
+        hash_bucket_count: int,
+        num_hash_groups: int,
+        object_store: IObjectStore,
+    ):
+        self.hash_bucket_count = hash_bucket_count
+        self.num_hash_groups = num_hash_groups
+        self.object_store = object_store
+        self._hash_group_index = hash_group_index
+        self._dfe_groups_refs = dfe_groups_refs
+        self._dfe_groups = []
+        self._loaded_from_object_store = False
+
+    def _load_deltas_from_object_store(self):
+        delta_file_envelope_groups_list = self.object_store.get_many(
+            self._dfe_groups_refs
+        )
+        hb_index_to_delta_file_envelopes_list = defaultdict(list)
+        for delta_file_envelope_groups in delta_file_envelope_groups_list:
+            assert self.hash_bucket_count == len(delta_file_envelope_groups), (
+                f"The hash bucket count must match the dfe size as {self.hash_bucket_count}"
+                f" != {len(delta_file_envelope_groups)}"
+            )
+
+            for hb_idx, dfes in enumerate(delta_file_envelope_groups):
+                if dfes:
+                    hb_index_to_delta_file_envelopes_list[hb_idx].append(dfes)
+        valid_hb_indices_iterable = hash_group_index_to_hash_bucket_indices(
+            self.hash_group_index, self.hash_bucket_count, self.num_hash_groups
+        )
+
+        total_dfes_found = 0
+        dfe_list_groups = []
+        for hb_idx in valid_hb_indices_iterable:
+            dfe_list = hb_index_to_delta_file_envelopes_list.get(hb_idx)
+            if dfe_list:
+                total_dfes_found += 1
+                dfe_list_groups.append(
+                    MergeFileGroup.of(hb_index=hb_idx, dfe_groups=dfe_list)
+                )
+            else:
+                dfe_list_groups.append(MergeFileGroup.of(hb_index=hb_idx))
+
+        assert total_dfes_found == len(hb_index_to_delta_file_envelopes_list), (
+            "The total dfe list does not match the input dfes from hash bucket as "
+            f"{total_dfes_found} != {len(hb_index_to_delta_file_envelopes_list)}"
+        )
+        self._dfe_groups = dfe_list_groups
+        self._loaded_from_object_store = True
+
+    def create(self) -> List[MergeFileGroup]:
+        if not self._loaded_from_object_store:
+            self._load_deltas_from_object_store()
+
+        return self._dfe_groups
+
+    @property
+    def hash_group_index(self):
+        return self._hash_group_index
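The two providers above encapsulate where merge inputs come from: `LocalMergeFileGroupsProvider` reads deltas in-process for the single-bucket path, while `RemoteMergeFileGroupsProvider` pulls hash-bucketed envelopes from the object store. A brief usage sketch of the `MergeFileGroup` container itself, assuming `deltacat>=1.0.0` is installed; the empty inner list is a stand-in for real `DeltaFileEnvelope` groups.

```python
from deltacat.compute.compactor_v2.model.merge_file_group import (
    LocalMergeFileGroupsProvider,
    MergeFileGroup,
)

# A group carrying delta file envelopes for hash bucket 4.
group = MergeFileGroup.of(hb_index=4, dfe_groups=[[]])
assert group.hb_index == 4

# A group without envelopes marks hash bucket 7 as a copy-by-reference
# candidate during the merge step (dfe_groups defaults to None).
untouched = MergeFileGroup.of(hb_index=7)
assert untouched.dfe_groups is None

# Local merges skip hash bucketing, so the local provider always emits a
# single group at this fixed index.
print(LocalMergeFileGroupsProvider.LOCAL_HASH_BUCKET_INDEX)  # 0
```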
deltacat/compute/compactor_v2/model/merge_input.py
@@ -1,7 +1,10 @@
 from __future__ import annotations
 
-from ray.types import ObjectRef
 from typing import Dict, List, Optional, Any
+
+from deltacat.compute.compactor_v2.model.merge_file_group import (
+    MergeFileGroupsProvider,
+)
 from deltacat.utils.metrics import MetricsConfig
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.io.object_store import IObjectStore
@@ -16,19 +19,15 @@ from deltacat.compute.compactor_v2.constants import (
 )
 from deltacat.types.media import ContentType
 from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
-from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
 
 
 class MergeInput(Dict):
     @staticmethod
     def of(
-        dfe_groups_refs: List[ObjectRef[DeltaFileEnvelopeGroups]],
+        merge_file_groups_provider: MergeFileGroupsProvider,
         write_to_partition: Partition,
         compacted_file_content_type: ContentType,
         primary_keys: List[str],
-        hash_group_index: int,
-        num_hash_groups: int,
-        hash_bucket_count: int,
         drop_duplicates: Optional[bool] = DROP_DUPLICATES,
         sort_keys: Optional[List[SortKey]] = None,
         merge_task_index: Optional[int] = 0,
@@ -44,13 +43,10 @@ class MergeInput(Dict):
     ) -> MergeInput:
 
         result = MergeInput()
-        result["dfe_groups_refs"] = dfe_groups_refs
+        result["merge_file_groups_provider"] = merge_file_groups_provider
        result["write_to_partition"] = write_to_partition
         result["compacted_file_content_type"] = compacted_file_content_type
         result["primary_keys"] = primary_keys
-        result["hash_group_index"] = hash_group_index
-        result["num_hash_groups"] = num_hash_groups
-        result["hash_bucket_count"] = hash_bucket_count
         result["drop_duplicates"] = drop_duplicates
         result["sort_keys"] = sort_keys
         result["merge_task_index"] = merge_task_index
@@ -67,8 +63,8 @@ class MergeInput(Dict):
         return result
 
     @property
-    def dfe_groups_refs(self) -> List[ObjectRef[DeltaFileEnvelopeGroups]]:
-        return self["dfe_groups_refs"]
+    def merge_file_groups_provider(self) -> MergeFileGroupsProvider:
+        return self["merge_file_groups_provider"]
 
     @property
     def write_to_partition(self) -> Partition:
@@ -82,18 +78,6 @@ class MergeInput(Dict):
     def primary_keys(self) -> List[str]:
         return self["primary_keys"]
 
-    @property
-    def hash_group_index(self) -> int:
-        return self["hash_group_index"]
-
-    @property
-    def num_hash_groups(self) -> int:
-        return self["num_hash_groups"]
-
-    @property
-    def hash_bucket_count(self) -> int:
-        return self["hash_bucket_count"]
-
     @property
     def drop_duplicates(self) -> int:
         return self["drop_duplicates"]