deltacat 0.2.10__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/s3u.py +250 -111
- deltacat/catalog/default_catalog_impl/__init__.py +369 -0
- deltacat/compute/compactor_v2/compaction_session.py +175 -152
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
- deltacat/compute/compactor_v2/model/merge_input.py +8 -24
- deltacat/compute/compactor_v2/model/merge_result.py +1 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
- deltacat/compute/compactor_v2/steps/merge.py +106 -171
- deltacat/compute/compactor_v2/utils/delta.py +97 -0
- deltacat/compute/compactor_v2/utils/merge.py +126 -0
- deltacat/compute/compactor_v2/utils/task_options.py +16 -4
- deltacat/compute/merge_on_read/__init__.py +4 -0
- deltacat/compute/merge_on_read/daft.py +40 -0
- deltacat/compute/merge_on_read/model/__init__.py +0 -0
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
- deltacat/compute/merge_on_read/utils/__init__.py +0 -0
- deltacat/compute/merge_on_read/utils/delta.py +42 -0
- deltacat/storage/interface.py +10 -2
- deltacat/storage/model/types.py +3 -11
- deltacat/tests/catalog/__init__.py +0 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
- deltacat/tests/compute/compact_partition_test_cases.py +126 -1
- deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
- deltacat/tests/local_deltacat_storage/__init__.py +19 -2
- deltacat/tests/test_utils/pyarrow.py +33 -14
- deltacat/tests/utils/test_daft.py +42 -2
- deltacat/types/media.py +5 -0
- deltacat/types/tables.py +7 -1
- deltacat/utils/daft.py +78 -13
- {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/METADATA +2 -2
- {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/RECORD +37 -25
- {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/LICENSE +0 -0
- {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/WHEEL +0 -0
- {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
deltacat/aws/s3u.py CHANGED
@@ -22,7 +22,7 @@ from tenacity import (
     stop_after_delay,
     wait_random_exponential,
 )
-
+from deltacat.utils.ray_utils.concurrency import invoke_parallel
 import deltacat.aws.clients as aws_utils
 from deltacat import logs
 from deltacat.aws.constants import TIMEOUT_ERROR_CODES
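The new `invoke_parallel` import backs the Ray fan-out used by `_download_manifest_entries_ray_data_distributed` at the end of this diff. A minimal sketch of the call shape as it appears there; anything about the helper beyond the keywords this diff shows (item passed as the task's first argument, object refs returned) is an assumption:

import ray

from deltacat.utils.ray_utils.concurrency import invoke_parallel

ray.init(ignore_reinit_error=True)


@ray.remote
def square(x: int) -> int:
    return x * x


# Mirrors the later call: invoke_parallel(items, ray_task, *task_args,
# max_parallelism=..., options_provider=...); the result is object refs.
pending = invoke_parallel([1, 2, 3], square, max_parallelism=2)
print(ray.get(pending))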
@@ -35,10 +35,17 @@ from deltacat.storage import (
     ManifestEntry,
     ManifestEntryList,
 )
-from deltacat.types.media import ContentEncoding, ContentType, TableType
+from deltacat.types.media import (
+    ContentEncoding,
+    ContentType,
+    TableType,
+    DistributedDatasetType,
+)
 from deltacat.types.tables import (
     TABLE_CLASS_TO_SIZE_FUNC,
     TABLE_TYPE_TO_READER_FUNC,
+    TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS,
+    DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC,
     get_table_length,
 )
 from deltacat.types.partial_download import PartialFileDownloadParams
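These imports introduce the `DistributedDatasetType` enum and two dispatch tables keyed by table/dataset type. A self-contained sketch of the registry-dispatch pattern they enable (the enum member values and the reader here are illustrative assumptions, not deltacat's actual definitions):

from enum import Enum
from typing import Callable, Dict, List


class DatasetType(str, Enum):
    # Stand-in for DistributedDatasetType; RAY_DATASET appears in this diff.
    RAY_DATASET = "ray_dataset"


def read_as_ray_dataset(uris: List[str], **kwargs) -> None:
    # Placeholder reader for the sketch.
    print(f"reading {len(uris)} files as a Ray Dataset")


# Keyed by the enum's .value, matching the lookup later in this diff:
# DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[distributed_dataset_type.value](...)
READER_FUNCS: Dict[str, Callable] = {
    DatasetType.RAY_DATASET.value: read_as_ray_dataset,
}

READER_FUNCS[DatasetType.RAY_DATASET.value](
    uris=["s3://bucket/a.parquet", "s3://bucket/b.parquet"]
)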
@@ -284,59 +291,6 @@ def upload_sliced_table(
     return manifest_entries
 
 
-@ray.remote
-def _block_metadata(block: Block) -> BlockMetadata:
-    return BlockAccessor.for_block(block).get_metadata(
-        input_files=None,
-        exec_stats=None,
-    )
-
-
-def _get_metadata(
-    table: Union[LocalTable, DistributedDataset],
-    write_paths: List[str],
-    block_refs: List[ObjectRef[Block]],
-) -> List[BlockMetadata]:
-    metadata: List[BlockMetadata] = []
-    if not block_refs:
-        # this must be a local table - ensure it was written to only 1 file
-        assert len(write_paths) == 1, (
-            f"Expected table of type '{type(table)}' to be written to 1 "
-            f"file, but found {len(write_paths)} files."
-        )
-        table_size = None
-        table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
-        if table_size_func:
-            table_size = table_size_func(table)
-        else:
-            logger.warning(f"Unable to estimate '{type(table)}' table size.")
-        metadata.append(
-            BlockMetadata(
-                num_rows=get_table_length(table),
-                size_bytes=table_size,
-                schema=None,
-                input_files=None,
-                exec_stats=None,
-            )
-        )
-    else:
-        # TODO(pdames): Expose BlockList metadata getter from Ray Dataset?
-        # ray 1.10
-        # metadata = dataset._blocks.get_metadata()
-        # ray 2.0.0dev
-        metadata = table._plan.execute().get_metadata()
-        if (
-            not metadata
-            or metadata[0].size_bytes is None
-            or metadata[0].num_rows is None
-        ):
-            metadata_futures = [
-                _block_metadata.remote(block_ref) for block_ref in block_refs
-            ]
-            metadata = ray.get(metadata_futures)
-    return metadata
-
-
 def upload_table(
     table: Union[LocalTable, DistributedDataset],
     s3_base_url: str,
@@ -403,17 +357,7 @@ def download_manifest_entry(
     content_encoding: Optional[ContentEncoding] = None,
 ) -> LocalTable:
 
-
-    s3_client_kwargs = (
-        {
-            "aws_access_key_id": token_holder["accessKeyId"],
-            "aws_secret_access_key": token_holder["secretAccessKey"],
-            "aws_session_token": token_holder["sessionToken"],
-            "config": conf,
-        }
-        if token_holder
-        else {"config": conf}
-    )
+    s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
     if not content_type:
         content_type = manifest_entry.meta.content_type
     assert (
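The eleven-line inline credentials block becomes a single call to the new module-private helper `_get_s3_client_kwargs_from_token`, whose definition appears at the end of this diff. A hedged sketch of its two return shapes, with a hypothetical token payload:

# Hypothetical token payload; the keys match those the helper reads.
token_holder = {
    "accessKeyId": "AKIA...",
    "secretAccessKey": "<secret>",
    "sessionToken": "<session-token>",
}

# With a token: temporary credentials plus an adaptive-retry botocore Config.
s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)

# Without a token: just {"config": <botocore Config>}.
anonymous_kwargs = _get_s3_client_kwargs_from_token(token_holder=None)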
@@ -458,51 +402,9 @@ def download_manifest_entry(
     return table
 
 
-def _download_manifest_entries(
-    manifest: Manifest,
-    token_holder: Optional[Dict[str, Any]] = None,
-    table_type: TableType = TableType.PYARROW,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-) -> LocalDataset:
-
-    return [
-        download_manifest_entry(
-            e,
-            token_holder,
-            table_type,
-            column_names,
-            include_columns,
-            file_reader_kwargs_provider,
-        )
-        for e in manifest.entries
-    ]
-
-
-def _download_manifest_entries_parallel(
-    manifest: Manifest,
-    token_holder: Optional[Dict[str, Any]] = None,
-    table_type: TableType = TableType.PYARROW,
-    max_parallelism: Optional[int] = None,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-) -> LocalDataset:
-
-    tables = []
-    pool = multiprocessing.Pool(max_parallelism)
-    downloader = partial(
-        download_manifest_entry,
-        token_holder=token_holder,
-        table_type=table_type,
-        column_names=column_names,
-        include_columns=include_columns,
-        file_reader_kwargs_provider=file_reader_kwargs_provider,
-    )
-    for table in pool.map(downloader, [e for e in manifest.entries]):
-        tables.append(table)
-    return tables
+@ray.remote
+def download_manifest_entry_ray(*args, **kwargs) -> ObjectRef[LocalTable]:
+    return download_manifest_entry(*args, **kwargs)
 
 
 def download_manifest_entries(
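The two local download helpers are not deleted here; they move to the end of the file. In their place this hunk adds `download_manifest_entry_ray`, a thin `@ray.remote` wrapper that lets each manifest entry download run as its own Ray task. A minimal sketch of direct use, assuming an initialized Ray runtime and a `Manifest` in scope:

import ray

ray.init(ignore_reinit_error=True)

# One Ray task per manifest entry; each ObjectRef resolves to a LocalTable.
pending = [download_manifest_entry_ray.remote(e) for e in manifest.entries]
tables = ray.get(pending)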
@@ -536,6 +438,42 @@ def download_manifest_entries(
     )
 
 
+def download_manifest_entries_distributed(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = 1000,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+    distributed_dataset_type: Optional[
+        DistributedDatasetType
+    ] = DistributedDatasetType.RAY_DATASET,
+) -> DistributedDataset:
+
+    params = {
+        "manifest": manifest,
+        "token_holder": token_holder,
+        "table_type": table_type,
+        "max_parallelism": max_parallelism,
+        "column_names": column_names,
+        "include_columns": include_columns,
+        "file_reader_kwargs_provider": file_reader_kwargs_provider,
+        "ray_options_provider": ray_options_provider,
+        "distributed_dataset_type": distributed_dataset_type,
+    }
+
+    if distributed_dataset_type == DistributedDatasetType.RAY_DATASET:
+        return _download_manifest_entries_ray_data_distributed(**params)
+    elif distributed_dataset_type is not None:
+        return _download_manifest_entries_all_dataset_distributed(**params)
+    else:
+        raise ValueError(
+            f"Distributed dataset type {distributed_dataset_type} not supported."
+        )
+
+
 def upload(s3_url: str, body, **s3_client_kwargs) -> Dict[str, Any]:
 
     # TODO (pdames): add tenacity retrying
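`download_manifest_entries_distributed` is the new public entry point added by this hunk: `RAY_DATASET` requests take the `invoke_parallel` fan-out path, any other non-None dataset type falls through to the registry-backed reader, and `None` raises. A hedged usage sketch, assuming a `Manifest` in scope and an initialized Ray cluster:

ds = download_manifest_entries_distributed(
    manifest=manifest,
    table_type=TableType.PYARROW,
    max_parallelism=1000,
    distributed_dataset_type=DistributedDatasetType.RAY_DATASET,
)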
@@ -574,3 +512,204 @@ def download(
     else:
         logger.info(f"file not found: {s3_url}")
     return None
+
+
+def _download_manifest_entries_parallel(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = None,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:
+
+    tables = []
+    pool = multiprocessing.Pool(max_parallelism)
+    downloader = partial(
+        download_manifest_entry,
+        token_holder=token_holder,
+        table_type=table_type,
+        column_names=column_names,
+        include_columns=include_columns,
+        file_reader_kwargs_provider=file_reader_kwargs_provider,
+    )
+    for table in pool.map(downloader, [e for e in manifest.entries]):
+        tables.append(table)
+    return tables
+
+
+def _download_manifest_entries(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:
+
+    return [
+        download_manifest_entry(
+            manifest_entry=e,
+            token_holder=token_holder,
+            table_type=table_type,
+            column_names=column_names,
+            include_columns=include_columns,
+            file_reader_kwargs_provider=file_reader_kwargs_provider,
+        )
+        for e in manifest.entries
+    ]
+
+
+@ray.remote
+def _block_metadata(block: Block) -> BlockMetadata:
+    return BlockAccessor.for_block(block).get_metadata(
+        input_files=None,
+        exec_stats=None,
+    )
+
+
+def _get_s3_client_kwargs_from_token(token_holder) -> Dict[Any, Any]:
+    conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
+    return (
+        {
+            "aws_access_key_id": token_holder["accessKeyId"],
+            "aws_secret_access_key": token_holder["secretAccessKey"],
+            "aws_session_token": token_holder["sessionToken"],
+            "config": conf,
+        }
+        if token_holder
+        else {"config": conf}
+    )
+
+
+def _get_metadata(
+    table: Union[LocalTable, DistributedDataset],
+    write_paths: List[str],
+    block_refs: List[ObjectRef[Block]],
+) -> List[BlockMetadata]:
+    metadata: List[BlockMetadata] = []
+    if not block_refs:
+        # this must be a local table - ensure it was written to only 1 file
+        assert len(write_paths) == 1, (
+            f"Expected table of type '{type(table)}' to be written to 1 "
+            f"file, but found {len(write_paths)} files."
+        )
+        table_size = None
+        table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
+        if table_size_func:
+            table_size = table_size_func(table)
+        else:
+            logger.warning(f"Unable to estimate '{type(table)}' table size.")
+        metadata.append(
+            BlockMetadata(
+                num_rows=get_table_length(table),
+                size_bytes=table_size,
+                schema=None,
+                input_files=None,
+                exec_stats=None,
+            )
+        )
+    else:
+        # TODO(pdames): Expose BlockList metadata getter from Ray Dataset?
+        # ray 1.10
+        # metadata = dataset._blocks.get_metadata()
+        # ray 2.0.0dev
+        metadata = table._plan.execute().get_metadata()
+        if (
+            not metadata
+            or metadata[0].size_bytes is None
+            or metadata[0].num_rows is None
+        ):
+            metadata_futures = [
+                _block_metadata.remote(block_ref) for block_ref in block_refs
+            ]
+            metadata = ray.get(metadata_futures)
+    return metadata
+
+
+def _download_manifest_entries_ray_data_distributed(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = 1000,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+) -> DistributedDataset:
+
+    table_pending_ids = []
+    manifest_entries = manifest.entries
+    if manifest_entries:
+        table_pending_ids = invoke_parallel(
+            manifest_entries,
+            download_manifest_entry_ray,
+            token_holder,
+            table_type,
+            column_names,
+            include_columns,
+            file_reader_kwargs_provider,
+            max_parallelism=max_parallelism,
+            options_provider=ray_options_provider,
+        )
+    return TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS[table_type](table_pending_ids)
+
+
+def _download_manifest_entries_all_dataset_distributed(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = 1000,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+    distributed_dataset_type: Optional[
+        DistributedDatasetType
+    ] = DistributedDatasetType.RAY_DATASET,
+) -> DistributedDataset:
+
+    entry_content_type = None
+    entry_content_encoding = None
+    uris = []
+    for entry in manifest.entries or []:
+        if (
+            entry_content_type is not None
+            and entry_content_type != entry.meta.content_type
+        ):
+            raise ValueError(
+                f"Mixed content types of ({entry_content_type},"
+                f" {entry.meta.content_type}) is not supported."
+            )
+
+        if (
+            entry_content_encoding is not None
+            and entry_content_encoding != entry.meta.content_encoding
+        ):
+            raise ValueError(
+                f"Mixed content encoding of {entry_content_encoding},"
+                f" {entry.meta.content_encoding} is not supported."
+            )
+
+        entry_content_type = entry.meta.content_type
+        entry_content_encoding = entry.meta.content_encoding
+        uris.append(entry.uri)
+
+    s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
+
+    if distributed_dataset_type in DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC:
+        return DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[distributed_dataset_type.value](
+            uris=uris,
+            content_type=entry_content_type,
+            content_encoding=entry_content_encoding,
+            column_names=column_names,
+            include_columns=include_columns,
+            read_func_kwargs_provider=file_reader_kwargs_provider,
+            ray_options_provider=ray_options_provider,
+            s3_client_kwargs=s3_client_kwargs,
+        )
+    else:
+        raise ValueError(
+            f"Unsupported distributed dataset type={distributed_dataset_type}"
+        )
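`_download_manifest_entries_all_dataset_distributed` insists that every manifest entry share one content type and one content encoding before handing the full URI list to a single reader. A self-contained sketch of that guard in isolation (the `_Meta` type is a hypothetical stand-in for the entry metadata):

from dataclasses import dataclass
from typing import Iterable, Tuple


@dataclass
class _Meta:
    # Hypothetical stand-in for a manifest entry's metadata.
    content_type: str
    content_encoding: str


def validate_homogeneous(metas: Iterable[_Meta]) -> Tuple[str, str]:
    # Mirrors the guard above: the first entry sets the expectation,
    # and any later mismatch raises.
    content_type = content_encoding = None
    for meta in metas:
        if content_type is not None and content_type != meta.content_type:
            raise ValueError(
                f"Mixed content types of ({content_type},"
                f" {meta.content_type}) is not supported."
            )
        if (
            content_encoding is not None
            and content_encoding != meta.content_encoding
        ):
            raise ValueError(
                f"Mixed content encoding of {content_encoding},"
                f" {meta.content_encoding} is not supported."
            )
        content_type = meta.content_type
        content_encoding = meta.content_encoding
    return content_type, content_encoding


# Homogeneous entries pass; mixing types would raise ValueError.
print(validate_homogeneous([_Meta("application/parquet", "identity")] * 2))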