deltacat 0.2.11__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions exactly as they appear in their public registries.
Files changed (37)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/s3u.py +250 -111
  3. deltacat/catalog/default_catalog_impl/__init__.py +369 -0
  4. deltacat/compute/compactor_v2/compaction_session.py +175 -152
  5. deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
  6. deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
  7. deltacat/compute/compactor_v2/model/merge_input.py +8 -24
  8. deltacat/compute/compactor_v2/model/merge_result.py +1 -0
  9. deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
  10. deltacat/compute/compactor_v2/steps/merge.py +106 -171
  11. deltacat/compute/compactor_v2/utils/delta.py +97 -0
  12. deltacat/compute/compactor_v2/utils/merge.py +126 -0
  13. deltacat/compute/compactor_v2/utils/task_options.py +16 -4
  14. deltacat/compute/merge_on_read/__init__.py +4 -0
  15. deltacat/compute/merge_on_read/daft.py +40 -0
  16. deltacat/compute/merge_on_read/model/__init__.py +0 -0
  17. deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
  18. deltacat/compute/merge_on_read/utils/__init__.py +0 -0
  19. deltacat/compute/merge_on_read/utils/delta.py +42 -0
  20. deltacat/storage/interface.py +10 -2
  21. deltacat/storage/model/types.py +3 -11
  22. deltacat/tests/catalog/__init__.py +0 -0
  23. deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
  24. deltacat/tests/compute/compact_partition_test_cases.py +126 -1
  25. deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
  26. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
  27. deltacat/tests/local_deltacat_storage/__init__.py +19 -2
  28. deltacat/tests/test_utils/pyarrow.py +33 -14
  29. deltacat/tests/utils/test_daft.py +42 -2
  30. deltacat/types/media.py +5 -0
  31. deltacat/types/tables.py +7 -1
  32. deltacat/utils/daft.py +84 -14
  33. {deltacat-0.2.11.dist-info → deltacat-1.0.1.dist-info}/METADATA +2 -2
  34. {deltacat-0.2.11.dist-info → deltacat-1.0.1.dist-info}/RECORD +37 -25
  35. {deltacat-0.2.11.dist-info → deltacat-1.0.1.dist-info}/LICENSE +0 -0
  36. {deltacat-0.2.11.dist-info → deltacat-1.0.1.dist-info}/WHEEL +0 -0
  37. {deltacat-0.2.11.dist-info → deltacat-1.0.1.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "0.2.11"
+__version__ = "1.0.1"
 
 
 __all__ = [
deltacat/aws/s3u.py CHANGED
@@ -22,7 +22,7 @@ from tenacity import (
     stop_after_delay,
     wait_random_exponential,
 )
-
+from deltacat.utils.ray_utils.concurrency import invoke_parallel
 import deltacat.aws.clients as aws_utils
 from deltacat import logs
 from deltacat.aws.constants import TIMEOUT_ERROR_CODES
@@ -35,10 +35,17 @@ from deltacat.storage import (
     ManifestEntry,
     ManifestEntryList,
 )
-from deltacat.types.media import ContentEncoding, ContentType, TableType
+from deltacat.types.media import (
+    ContentEncoding,
+    ContentType,
+    TableType,
+    DistributedDatasetType,
+)
 from deltacat.types.tables import (
     TABLE_CLASS_TO_SIZE_FUNC,
     TABLE_TYPE_TO_READER_FUNC,
+    TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS,
+    DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC,
     get_table_length,
 )
 from deltacat.types.partial_download import PartialFileDownloadParams
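
Note: the new DistributedDatasetType import and the two dispatch tables pulled in from deltacat.types.tables point at a registry pattern: each dataset type's string value maps to a reader callable, which the new distributed download path later looks up via distributed_dataset_type.value. A minimal sketch of that pattern, with assumed enum members and a hypothetical reader stub (not deltacat's actual definitions):

    from enum import Enum
    from typing import Callable, Dict

    class DistributedDatasetType(str, Enum):
        # assumed members; the real enum lives in deltacat/types/media.py
        DAFT = "daft"
        RAY_DATASET = "ray_dataset"

    def read_daft(uris, **kwargs):
        # hypothetical stub standing in for a real Daft-backed reader
        return f"dataset over {len(uris)} files"

    # registry keyed by the enum's string value, mirroring the
    # DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC lookup used later in this diff
    DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC: Dict[str, Callable] = {
        DistributedDatasetType.DAFT.value: read_daft,
    }

    reader = DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[DistributedDatasetType.DAFT.value]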
@@ -284,59 +291,6 @@ def upload_sliced_table(
     return manifest_entries
 
 
-@ray.remote
-def _block_metadata(block: Block) -> BlockMetadata:
-    return BlockAccessor.for_block(block).get_metadata(
-        input_files=None,
-        exec_stats=None,
-    )
-
-
-def _get_metadata(
-    table: Union[LocalTable, DistributedDataset],
-    write_paths: List[str],
-    block_refs: List[ObjectRef[Block]],
-) -> List[BlockMetadata]:
-    metadata: List[BlockMetadata] = []
-    if not block_refs:
-        # this must be a local table - ensure it was written to only 1 file
-        assert len(write_paths) == 1, (
-            f"Expected table of type '{type(table)}' to be written to 1 "
-            f"file, but found {len(write_paths)} files."
-        )
-        table_size = None
-        table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
-        if table_size_func:
-            table_size = table_size_func(table)
-        else:
-            logger.warning(f"Unable to estimate '{type(table)}' table size.")
-        metadata.append(
-            BlockMetadata(
-                num_rows=get_table_length(table),
-                size_bytes=table_size,
-                schema=None,
-                input_files=None,
-                exec_stats=None,
-            )
-        )
-    else:
-        # TODO(pdames): Expose BlockList metadata getter from Ray Dataset?
-        # ray 1.10
-        # metadata = dataset._blocks.get_metadata()
-        # ray 2.0.0dev
-        metadata = table._plan.execute().get_metadata()
-        if (
-            not metadata
-            or metadata[0].size_bytes is None
-            or metadata[0].num_rows is None
-        ):
-            metadata_futures = [
-                _block_metadata.remote(block_ref) for block_ref in block_refs
-            ]
-            metadata = ray.get(metadata_futures)
-    return metadata
-
-
 def upload_table(
     table: Union[LocalTable, DistributedDataset],
     s3_base_url: str,
@@ -403,17 +357,7 @@ def download_manifest_entry(
     content_encoding: Optional[ContentEncoding] = None,
 ) -> LocalTable:
 
-    conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
-    s3_client_kwargs = (
-        {
-            "aws_access_key_id": token_holder["accessKeyId"],
-            "aws_secret_access_key": token_holder["secretAccessKey"],
-            "aws_session_token": token_holder["sessionToken"],
-            "config": conf,
-        }
-        if token_holder
-        else {"config": conf}
-    )
+    s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
     if not content_type:
         content_type = manifest_entry.meta.content_type
     assert (
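
Note: the inline boto3 client kwargs construction removed above is replaced by a call to _get_s3_client_kwargs_from_token, which this diff adds near the end of the file. Judging from the keys the helper reads, token_holder is a plain dict of STS-style credentials; a hedged usage sketch with placeholder values:

    from deltacat.aws.s3u import _get_s3_client_kwargs_from_token

    # placeholder credentials; the dict shape is inferred from the keys the
    # helper reads, not from a documented API
    token_holder = {
        "accessKeyId": "AKIA...",
        "secretAccessKey": "...",
        "sessionToken": "...",
    }
    s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)

    # with no token, the helper returns only {"config": conf}, deferring to the
    # default boto3 credential chain while keeping adaptive retries
    anonymous_kwargs = _get_s3_client_kwargs_from_token(token_holder=None)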
@@ -458,51 +402,9 @@ def download_manifest_entry(
     return table
 
 
-def _download_manifest_entries(
-    manifest: Manifest,
-    token_holder: Optional[Dict[str, Any]] = None,
-    table_type: TableType = TableType.PYARROW,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-) -> LocalDataset:
-
-    return [
-        download_manifest_entry(
-            e,
-            token_holder,
-            table_type,
-            column_names,
-            include_columns,
-            file_reader_kwargs_provider,
-        )
-        for e in manifest.entries
-    ]
-
-
-def _download_manifest_entries_parallel(
-    manifest: Manifest,
-    token_holder: Optional[Dict[str, Any]] = None,
-    table_type: TableType = TableType.PYARROW,
-    max_parallelism: Optional[int] = None,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-) -> LocalDataset:
-
-    tables = []
-    pool = multiprocessing.Pool(max_parallelism)
-    downloader = partial(
-        download_manifest_entry,
-        token_holder=token_holder,
-        table_type=table_type,
-        column_names=column_names,
-        include_columns=include_columns,
-        file_reader_kwargs_provider=file_reader_kwargs_provider,
-    )
-    for table in pool.map(downloader, [e for e in manifest.entries]):
-        tables.append(table)
-    return tables
+@ray.remote
+def download_manifest_entry_ray(*args, **kwargs) -> ObjectRef[LocalTable]:
+    return download_manifest_entry(*args, **kwargs)
 
 
 def download_manifest_entries(
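
Note: the serial and multiprocessing download helpers are moved to the end of the file (final hunk below), and a thin @ray.remote wrapper takes their place here so the same entry reader can be fanned out as Ray tasks. A self-contained sketch of the wrapper pattern, with a dummy reader standing in for the real one:

    import ray

    def download_manifest_entry(entry, table_type="pyarrow"):
        # dummy stand-in for the real serial reader
        return (entry, table_type)

    @ray.remote
    def download_manifest_entry_ray(*args, **kwargs):
        # thin pass-through: Ray schedules this as a task and returns an ObjectRef
        return download_manifest_entry(*args, **kwargs)

    ray.init(ignore_reinit_error=True)
    refs = [download_manifest_entry_ray.remote(e) for e in ["a.parquet", "b.parquet"]]
    tables = ray.get(refs)  # blocks until all remote downloads finish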
@@ -536,6 +438,42 @@ def download_manifest_entries(
     )
 
 
+def download_manifest_entries_distributed(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = 1000,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+    distributed_dataset_type: Optional[
+        DistributedDatasetType
+    ] = DistributedDatasetType.RAY_DATASET,
+) -> DistributedDataset:
+
+    params = {
+        "manifest": manifest,
+        "token_holder": token_holder,
+        "table_type": table_type,
+        "max_parallelism": max_parallelism,
+        "column_names": column_names,
+        "include_columns": include_columns,
+        "file_reader_kwargs_provider": file_reader_kwargs_provider,
+        "ray_options_provider": ray_options_provider,
+        "distributed_dataset_type": distributed_dataset_type,
+    }
+
+    if distributed_dataset_type == DistributedDatasetType.RAY_DATASET:
+        return _download_manifest_entries_ray_data_distributed(**params)
+    elif distributed_dataset_type is not None:
+        return _download_manifest_entries_all_dataset_distributed(**params)
+    else:
+        raise ValueError(
+            f"Distributed dataset type {distributed_dataset_type} not supported."
+        )
+
+
 def upload(s3_url: str, body, **s3_client_kwargs) -> Dict[str, Any]:
 
     # TODO (pdames): add tenacity retrying
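
Note: download_manifest_entries_distributed is the new public entry point; it packs its arguments into params and routes to either the Ray Dataset path or the generic reader path. A hedged call sketch, assuming a manifest already fetched from a storage implementation (not constructed here):

    from deltacat.aws.s3u import download_manifest_entries_distributed
    from deltacat.types.media import TableType, DistributedDatasetType

    # manifest is assumed to come from the storage layer; for the non-Ray-Dataset
    # path its entries must all share one content type and encoding (see the
    # validation in _download_manifest_entries_all_dataset_distributed below)
    dataset = download_manifest_entries_distributed(
        manifest,
        table_type=TableType.PYARROW,
        max_parallelism=1000,
        distributed_dataset_type=DistributedDatasetType.RAY_DATASET,
    )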
@@ -574,3 +512,204 @@ def download(
         else:
             logger.info(f"file not found: {s3_url}")
     return None
+
+
+def _download_manifest_entries_parallel(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = None,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:
+
+    tables = []
+    pool = multiprocessing.Pool(max_parallelism)
+    downloader = partial(
+        download_manifest_entry,
+        token_holder=token_holder,
+        table_type=table_type,
+        column_names=column_names,
+        include_columns=include_columns,
+        file_reader_kwargs_provider=file_reader_kwargs_provider,
+    )
+    for table in pool.map(downloader, [e for e in manifest.entries]):
+        tables.append(table)
+    return tables
+
+
+def _download_manifest_entries(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:
+
+    return [
+        download_manifest_entry(
+            manifest_entry=e,
+            token_holder=token_holder,
+            table_type=table_type,
+            column_names=column_names,
+            include_columns=include_columns,
+            file_reader_kwargs_provider=file_reader_kwargs_provider,
+        )
+        for e in manifest.entries
+    ]
+
+
+@ray.remote
+def _block_metadata(block: Block) -> BlockMetadata:
+    return BlockAccessor.for_block(block).get_metadata(
+        input_files=None,
+        exec_stats=None,
+    )
+
+
+def _get_s3_client_kwargs_from_token(token_holder) -> Dict[Any, Any]:
+    conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
+    return (
+        {
+            "aws_access_key_id": token_holder["accessKeyId"],
+            "aws_secret_access_key": token_holder["secretAccessKey"],
+            "aws_session_token": token_holder["sessionToken"],
+            "config": conf,
+        }
+        if token_holder
+        else {"config": conf}
+    )
+
+
+def _get_metadata(
+    table: Union[LocalTable, DistributedDataset],
+    write_paths: List[str],
+    block_refs: List[ObjectRef[Block]],
+) -> List[BlockMetadata]:
+    metadata: List[BlockMetadata] = []
+    if not block_refs:
+        # this must be a local table - ensure it was written to only 1 file
+        assert len(write_paths) == 1, (
+            f"Expected table of type '{type(table)}' to be written to 1 "
+            f"file, but found {len(write_paths)} files."
+        )
+        table_size = None
+        table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
+        if table_size_func:
+            table_size = table_size_func(table)
+        else:
+            logger.warning(f"Unable to estimate '{type(table)}' table size.")
+        metadata.append(
+            BlockMetadata(
+                num_rows=get_table_length(table),
+                size_bytes=table_size,
+                schema=None,
+                input_files=None,
+                exec_stats=None,
+            )
+        )
+    else:
+        # TODO(pdames): Expose BlockList metadata getter from Ray Dataset?
+        # ray 1.10
+        # metadata = dataset._blocks.get_metadata()
+        # ray 2.0.0dev
+        metadata = table._plan.execute().get_metadata()
+        if (
+            not metadata
+            or metadata[0].size_bytes is None
+            or metadata[0].num_rows is None
+        ):
+            metadata_futures = [
+                _block_metadata.remote(block_ref) for block_ref in block_refs
+            ]
+            metadata = ray.get(metadata_futures)
+    return metadata
+
+
+def _download_manifest_entries_ray_data_distributed(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = 1000,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+) -> DistributedDataset:
+
+    table_pending_ids = []
+    manifest_entries = manifest.entries
+    if manifest_entries:
+        table_pending_ids = invoke_parallel(
+            manifest_entries,
+            download_manifest_entry_ray,
+            token_holder,
+            table_type,
+            column_names,
+            include_columns,
+            file_reader_kwargs_provider,
+            max_parallelism=max_parallelism,
+            options_provider=ray_options_provider,
+        )
+    return TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS[table_type](table_pending_ids)
+
+
+def _download_manifest_entries_all_dataset_distributed(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = 1000,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+    distributed_dataset_type: Optional[
+        DistributedDatasetType
+    ] = DistributedDatasetType.RAY_DATASET,
+) -> DistributedDataset:
+
+    entry_content_type = None
+    entry_content_encoding = None
+    uris = []
+    for entry in manifest.entries or []:
+        if (
+            entry_content_type is not None
+            and entry_content_type != entry.meta.content_type
+        ):
+            raise ValueError(
+                f"Mixed content types of ({entry_content_type},"
+                f" {entry.meta.content_type}) is not supported."
+            )
+
+        if (
+            entry_content_encoding is not None
+            and entry_content_encoding != entry.meta.content_encoding
+        ):
+            raise ValueError(
+                f"Mixed content encoding of {entry_content_encoding},"
+                f" {entry.meta.content_encoding} is not supported."
+            )
+
+        entry_content_type = entry.meta.content_type
+        entry_content_encoding = entry.meta.content_encoding
+        uris.append(entry.uri)
+
+    s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
+
+    if distributed_dataset_type in DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC:
+        return DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[distributed_dataset_type.value](
+            uris=uris,
+            content_type=entry_content_type,
+            content_encoding=entry_content_encoding,
+            column_names=column_names,
+            include_columns=include_columns,
+            read_func_kwargs_provider=file_reader_kwargs_provider,
+            ray_options_provider=ray_options_provider,
+            s3_client_kwargs=s3_client_kwargs,
+        )
+    else:
+        raise ValueError(
+            f"Unsupported distributed dataset type={distributed_dataset_type}"
+        )