deltacat 0.2.9__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (39)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/redshift/__init__.py +4 -0
  3. deltacat/aws/redshift/model/manifest.py +93 -1
  4. deltacat/aws/s3u.py +250 -111
  5. deltacat/catalog/default_catalog_impl/__init__.py +369 -0
  6. deltacat/compute/compactor_v2/compaction_session.py +175 -152
  7. deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
  8. deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
  9. deltacat/compute/compactor_v2/model/merge_input.py +8 -24
  10. deltacat/compute/compactor_v2/model/merge_result.py +1 -0
  11. deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
  12. deltacat/compute/compactor_v2/steps/merge.py +106 -171
  13. deltacat/compute/compactor_v2/utils/delta.py +97 -0
  14. deltacat/compute/compactor_v2/utils/merge.py +126 -0
  15. deltacat/compute/compactor_v2/utils/task_options.py +47 -4
  16. deltacat/compute/merge_on_read/__init__.py +4 -0
  17. deltacat/compute/merge_on_read/daft.py +40 -0
  18. deltacat/compute/merge_on_read/model/__init__.py +0 -0
  19. deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
  20. deltacat/compute/merge_on_read/utils/__init__.py +0 -0
  21. deltacat/compute/merge_on_read/utils/delta.py +42 -0
  22. deltacat/storage/interface.py +10 -2
  23. deltacat/storage/model/types.py +3 -11
  24. deltacat/tests/catalog/__init__.py +0 -0
  25. deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
  26. deltacat/tests/compute/compact_partition_test_cases.py +126 -1
  27. deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
  28. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
  29. deltacat/tests/local_deltacat_storage/__init__.py +19 -2
  30. deltacat/tests/test_utils/pyarrow.py +33 -14
  31. deltacat/tests/utils/test_daft.py +42 -2
  32. deltacat/types/media.py +5 -0
  33. deltacat/types/tables.py +7 -1
  34. deltacat/utils/daft.py +78 -13
  35. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/METADATA +2 -2
  36. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/RECORD +39 -27
  37. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/LICENSE +0 -0
  38. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/WHEEL +0 -0
  39. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode

  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))

- __version__ = "0.2.9"
+ __version__ = "1.0.0"


  __all__ = [
deltacat/aws/redshift/__init__.py CHANGED
@@ -1,4 +1,6 @@
  from deltacat.aws.redshift.model.manifest import (
+     EntryFileParams,
+     EntryType,
      Manifest,
      ManifestAuthor,
      ManifestEntry,
@@ -7,6 +9,8 @@ from deltacat.aws.redshift.model.manifest import (
  )

  __all__ = [
+     "EntryFileParams",
+     "EntryType",
      "Manifest",
      "ManifestAuthor",
      "ManifestEntry",
deltacat/aws/redshift/model/manifest.py CHANGED
@@ -5,12 +5,63 @@ import itertools
  import logging
  from typing import Any, Dict, List, Optional
  from uuid import uuid4
+ from enum import Enum

  from deltacat import logs

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


+ class EntryType(str, Enum):
+     """
+     Enum representing all possible content categories of a manifest entry file.
+     """
+
+     DATA = "data"
+     POSITIONAL_DELETE = "positional_delete"
+     EQUALITY_DELETE = "equality_delete"
+
+     @classmethod
+     def get_default(cls):
+         return EntryType.DATA
+
+     @classmethod
+     def list(cls):
+         return [c.value for c in EntryType]
+
+
+ class EntryFileParams(dict):
+     """
+     Represents parameters relevant to the underlying contents of a manifest entry. Contains all parameters required to support DELETEs.
+     equality_column_names: List of column names used to determine row equality for equality deletes. Relevant only to equality deletes.
+     position: Ordinal position of a deleted row in the target data file identified by uri, starting at 0. Relevant only to positional deletes.
+     """
+
+     @staticmethod
+     def of(
+         equality_column_names: Optional[List[str]] = None,
+         position: Optional[int] = None,
+     ) -> EntryFileParams:
+         entry_file_params = EntryFileParams()
+         if equality_column_names is not None:
+             entry_file_params["equality_column_names"] = equality_column_names
+         if position is not None:
+             entry_file_params["position"] = position
+         return entry_file_params
+
+     @property
+     def equality_column_names(self) -> Optional[List[str]]:
+         return self.get("equality_column_names")
+
+     @property
+     def url(self) -> Optional[str]:
+         return self.get("url")
+
+     @property
+     def position(self) -> Optional[int]:
+         return self.get("position")
+
+
  class Manifest(dict):
      @staticmethod
      def _build_manifest(
@@ -18,6 +69,7 @@ class Manifest(dict):
          entries: Optional[ManifestEntryList],
          author: Optional[ManifestAuthor] = None,
          uuid: str = None,
+         entry_type: Optional[EntryType] = None,
      ) -> Manifest:
          if not uuid:
              uuid = str(uuid4())
@@ -29,6 +81,8 @@ class Manifest(dict):
          manifest["entries"] = entries
          if author is not None:
              manifest["author"] = author
+         if entry_type is not None:
+             manifest["entry_type"] = entry_type.value
          return manifest

      @staticmethod
@@ -36,6 +90,7 @@ class Manifest(dict):
          entries: ManifestEntryList,
          author: Optional[ManifestAuthor] = None,
          uuid: str = None,
+         entry_type: Optional[EntryType] = None,
      ) -> Manifest:
          if not uuid:
              uuid = str(uuid4())
@@ -78,8 +133,9 @@ class Manifest(dict):
              content_type,
              content_encoding,
              total_source_content_length,
+             entry_type=entry_type,
          )
-         manifest = Manifest._build_manifest(meta, entries, author, uuid)
+         manifest = Manifest._build_manifest(meta, entries, author, uuid, entry_type)
          return manifest

      @staticmethod
@@ -128,6 +184,7 @@ class ManifestMeta(dict):
          source_content_length: Optional[int] = None,
          credentials: Optional[Dict[str, str]] = None,
          content_type_parameters: Optional[List[Dict[str, str]]] = None,
+         entry_type: Optional[EntryType] = None,
      ) -> ManifestMeta:
          manifest_meta = ManifestMeta()
          if record_count is not None:
@@ -144,6 +201,8 @@ class ManifestMeta(dict):
              manifest_meta["content_encoding"] = content_encoding
          if credentials is not None:
              manifest_meta["credentials"] = credentials
+         if entry_type is not None:
+             manifest_meta["entry_type"] = entry_type.value
          return manifest_meta

      @property
@@ -178,6 +237,13 @@ class ManifestMeta(dict):
      def credentials(self) -> Optional[Dict[str, str]]:
          return self.get("credentials")

+     @property
+     def entry_type(self) -> Optional[EntryType]:
+         val = self.get("entry_type")
+         if val is not None:
+             return EntryType(self["entry_type"])
+         return val
+

  class ManifestAuthor(dict):
      @staticmethod
@@ -206,6 +272,8 @@ class ManifestEntry(dict):
          mandatory: bool = True,
          uri: Optional[str] = None,
          uuid: Optional[str] = None,
+         entry_type: Optional[EntryType] = None,
+         entry_file_params: Optional[EntryFileParams] = None,
      ) -> ManifestEntry:
          manifest_entry = ManifestEntry()
          if not (uri or url):
@@ -222,6 +290,16 @@ class ManifestEntry(dict):
          manifest_entry["mandatory"] = mandatory
          if uuid is not None:
              manifest_entry["id"] = uuid
+         if entry_type is not None:
+             manifest_entry["entry_type"] = entry_type.value
+         if entry_file_params is not None:
+             if entry_file_params.get("url") != manifest_entry.get("url"):
+                 msg = (
+                     f"Expected manifest entry url: {manifest_entry.url}"
+                     f" and entry_file_params: '{entry_file_params.url}' to match"
+                 )
+                 raise ValueError(msg)
+             manifest_entry["entry_file_params"] = entry_file_params
          return manifest_entry

      @staticmethod
@@ -268,6 +346,20 @@ class ManifestEntry(dict):
      def id(self) -> Optional[str]:
          return self.get("id")

+     @property
+     def entry_type(self) -> Optional[EntryType]:
+         val = self.get("entry_type")
+         if val is not None:
+             return EntryType(self["entry_type"])
+         return val
+
+     @property
+     def entry_file_params(self) -> Optional[EntryFileParams]:
+         val: Dict[str, Any] = self.get("entry_file_params")
+         if val is not None and not isinstance(val, EntryFileParams):
+             self["entry_file_params"] = val = EntryFileParams(val)
+         return val
+

  class ManifestEntryList(List[ManifestEntry]):
      @staticmethod
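The new types let a manifest entry be tagged as plain data, a positional delete, or an equality delete, with EntryFileParams carrying the delete-specific metadata. A minimal sketch of round-tripping the new fields through the accessor properties; the entry dict is built directly here for illustration (ManifestEntry is a dict subclass) rather than through the package's builder method, and the bucket/key values are placeholders:

    from deltacat.aws.redshift import EntryFileParams, EntryType, ManifestEntry

    # A hypothetical equality-delete entry keyed on the "pk" column.
    entry = ManifestEntry(
        {
            "url": "s3://example-bucket/deletes/00001.parquet",
            "entry_type": EntryType.EQUALITY_DELETE.value,
            "entry_file_params": {"equality_column_names": ["pk"]},
        }
    )

    # The new properties decode the stored values back into typed wrappers.
    assert entry.entry_type is EntryType.EQUALITY_DELETE
    assert isinstance(entry.entry_file_params, EntryFileParams)
    assert entry.entry_file_params.equality_column_names == ["pk"]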
deltacat/aws/s3u.py CHANGED
@@ -22,7 +22,7 @@ from tenacity import (
      stop_after_delay,
      wait_random_exponential,
  )
-
+ from deltacat.utils.ray_utils.concurrency import invoke_parallel
  import deltacat.aws.clients as aws_utils
  from deltacat import logs
  from deltacat.aws.constants import TIMEOUT_ERROR_CODES
@@ -35,10 +35,17 @@ from deltacat.storage import (
      ManifestEntry,
      ManifestEntryList,
  )
- from deltacat.types.media import ContentEncoding, ContentType, TableType
+ from deltacat.types.media import (
+     ContentEncoding,
+     ContentType,
+     TableType,
+     DistributedDatasetType,
+ )
  from deltacat.types.tables import (
      TABLE_CLASS_TO_SIZE_FUNC,
      TABLE_TYPE_TO_READER_FUNC,
+     TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS,
+     DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC,
      get_table_length,
  )
  from deltacat.types.partial_download import PartialFileDownloadParams
@@ -284,59 +291,6 @@ def upload_sliced_table(
      return manifest_entries


- @ray.remote
- def _block_metadata(block: Block) -> BlockMetadata:
-     return BlockAccessor.for_block(block).get_metadata(
-         input_files=None,
-         exec_stats=None,
-     )
-
-
- def _get_metadata(
-     table: Union[LocalTable, DistributedDataset],
-     write_paths: List[str],
-     block_refs: List[ObjectRef[Block]],
- ) -> List[BlockMetadata]:
-     metadata: List[BlockMetadata] = []
-     if not block_refs:
-         # this must be a local table - ensure it was written to only 1 file
-         assert len(write_paths) == 1, (
-             f"Expected table of type '{type(table)}' to be written to 1 "
-             f"file, but found {len(write_paths)} files."
-         )
-         table_size = None
-         table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
-         if table_size_func:
-             table_size = table_size_func(table)
-         else:
-             logger.warning(f"Unable to estimate '{type(table)}' table size.")
-         metadata.append(
-             BlockMetadata(
-                 num_rows=get_table_length(table),
-                 size_bytes=table_size,
-                 schema=None,
-                 input_files=None,
-                 exec_stats=None,
-             )
-         )
-     else:
-         # TODO(pdames): Expose BlockList metadata getter from Ray Dataset?
-         # ray 1.10
-         # metadata = dataset._blocks.get_metadata()
-         # ray 2.0.0dev
-         metadata = table._plan.execute().get_metadata()
-         if (
-             not metadata
-             or metadata[0].size_bytes is None
-             or metadata[0].num_rows is None
-         ):
-             metadata_futures = [
-                 _block_metadata.remote(block_ref) for block_ref in block_refs
-             ]
-             metadata = ray.get(metadata_futures)
-     return metadata
-
-
  def upload_table(
      table: Union[LocalTable, DistributedDataset],
      s3_base_url: str,
@@ -403,17 +357,7 @@ def download_manifest_entry(
      content_encoding: Optional[ContentEncoding] = None,
  ) -> LocalTable:

-     conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
-     s3_client_kwargs = (
-         {
-             "aws_access_key_id": token_holder["accessKeyId"],
-             "aws_secret_access_key": token_holder["secretAccessKey"],
-             "aws_session_token": token_holder["sessionToken"],
-             "config": conf,
-         }
-         if token_holder
-         else {"config": conf}
-     )
+     s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
      if not content_type:
          content_type = manifest_entry.meta.content_type
      assert (
@@ -458,51 +402,9 @@
      return table


- def _download_manifest_entries(
-     manifest: Manifest,
-     token_holder: Optional[Dict[str, Any]] = None,
-     table_type: TableType = TableType.PYARROW,
-     column_names: Optional[List[str]] = None,
-     include_columns: Optional[List[str]] = None,
-     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
- ) -> LocalDataset:
-
-     return [
-         download_manifest_entry(
-             e,
-             token_holder,
-             table_type,
-             column_names,
-             include_columns,
-             file_reader_kwargs_provider,
-         )
-         for e in manifest.entries
-     ]
-
-
- def _download_manifest_entries_parallel(
-     manifest: Manifest,
-     token_holder: Optional[Dict[str, Any]] = None,
-     table_type: TableType = TableType.PYARROW,
-     max_parallelism: Optional[int] = None,
-     column_names: Optional[List[str]] = None,
-     include_columns: Optional[List[str]] = None,
-     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
- ) -> LocalDataset:
-
-     tables = []
-     pool = multiprocessing.Pool(max_parallelism)
-     downloader = partial(
-         download_manifest_entry,
-         token_holder=token_holder,
-         table_type=table_type,
-         column_names=column_names,
-         include_columns=include_columns,
-         file_reader_kwargs_provider=file_reader_kwargs_provider,
-     )
-     for table in pool.map(downloader, [e for e in manifest.entries]):
-         tables.append(table)
-     return tables
+ @ray.remote
+ def download_manifest_entry_ray(*args, **kwargs) -> ObjectRef[LocalTable]:
+     return download_manifest_entry(*args, **kwargs)


  def download_manifest_entries(
@@ -536,6 +438,42 @@ def download_manifest_entries(
      )


+ def download_manifest_entries_distributed(
+     manifest: Manifest,
+     token_holder: Optional[Dict[str, Any]] = None,
+     table_type: TableType = TableType.PYARROW,
+     max_parallelism: Optional[int] = 1000,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+     distributed_dataset_type: Optional[
+         DistributedDatasetType
+     ] = DistributedDatasetType.RAY_DATASET,
+ ) -> DistributedDataset:
+
+     params = {
+         "manifest": manifest,
+         "token_holder": token_holder,
+         "table_type": table_type,
+         "max_parallelism": max_parallelism,
+         "column_names": column_names,
+         "include_columns": include_columns,
+         "file_reader_kwargs_provider": file_reader_kwargs_provider,
+         "ray_options_provider": ray_options_provider,
+         "distributed_dataset_type": distributed_dataset_type,
+     }
+
+     if distributed_dataset_type == DistributedDatasetType.RAY_DATASET:
+         return _download_manifest_entries_ray_data_distributed(**params)
+     elif distributed_dataset_type is not None:
+         return _download_manifest_entries_all_dataset_distributed(**params)
+     else:
+         raise ValueError(
+             f"Distributed dataset type {distributed_dataset_type} not supported."
+         )
+
+
  def upload(s3_url: str, body, **s3_client_kwargs) -> Dict[str, Any]:

      # TODO (pdames): add tenacity retrying
@@ -574,3 +512,204 @@ def download(
      else:
          logger.info(f"file not found: {s3_url}")
          return None
+
+
+ def _download_manifest_entries_parallel(
+     manifest: Manifest,
+     token_holder: Optional[Dict[str, Any]] = None,
+     table_type: TableType = TableType.PYARROW,
+     max_parallelism: Optional[int] = None,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+ ) -> LocalDataset:
+
+     tables = []
+     pool = multiprocessing.Pool(max_parallelism)
+     downloader = partial(
+         download_manifest_entry,
+         token_holder=token_holder,
+         table_type=table_type,
+         column_names=column_names,
+         include_columns=include_columns,
+         file_reader_kwargs_provider=file_reader_kwargs_provider,
+     )
+     for table in pool.map(downloader, [e for e in manifest.entries]):
+         tables.append(table)
+     return tables
+
+
+ def _download_manifest_entries(
+     manifest: Manifest,
+     token_holder: Optional[Dict[str, Any]] = None,
+     table_type: TableType = TableType.PYARROW,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+ ) -> LocalDataset:
+
+     return [
+         download_manifest_entry(
+             manifest_entry=e,
+             token_holder=token_holder,
+             table_type=table_type,
+             column_names=column_names,
+             include_columns=include_columns,
+             file_reader_kwargs_provider=file_reader_kwargs_provider,
+         )
+         for e in manifest.entries
+     ]
+
+
+ @ray.remote
+ def _block_metadata(block: Block) -> BlockMetadata:
+     return BlockAccessor.for_block(block).get_metadata(
+         input_files=None,
+         exec_stats=None,
+     )
+
+
+ def _get_s3_client_kwargs_from_token(token_holder) -> Dict[Any, Any]:
+     conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
+     return (
+         {
+             "aws_access_key_id": token_holder["accessKeyId"],
+             "aws_secret_access_key": token_holder["secretAccessKey"],
+             "aws_session_token": token_holder["sessionToken"],
+             "config": conf,
+         }
+         if token_holder
+         else {"config": conf}
+     )
+
+
+ def _get_metadata(
+     table: Union[LocalTable, DistributedDataset],
+     write_paths: List[str],
+     block_refs: List[ObjectRef[Block]],
+ ) -> List[BlockMetadata]:
+     metadata: List[BlockMetadata] = []
+     if not block_refs:
+         # this must be a local table - ensure it was written to only 1 file
+         assert len(write_paths) == 1, (
+             f"Expected table of type '{type(table)}' to be written to 1 "
+             f"file, but found {len(write_paths)} files."
+         )
+         table_size = None
+         table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
+         if table_size_func:
+             table_size = table_size_func(table)
+         else:
+             logger.warning(f"Unable to estimate '{type(table)}' table size.")
+         metadata.append(
+             BlockMetadata(
+                 num_rows=get_table_length(table),
+                 size_bytes=table_size,
+                 schema=None,
+                 input_files=None,
+                 exec_stats=None,
+             )
+         )
+     else:
+         # TODO(pdames): Expose BlockList metadata getter from Ray Dataset?
+         # ray 1.10
+         # metadata = dataset._blocks.get_metadata()
+         # ray 2.0.0dev
+         metadata = table._plan.execute().get_metadata()
+         if (
+             not metadata
+             or metadata[0].size_bytes is None
+             or metadata[0].num_rows is None
+         ):
+             metadata_futures = [
+                 _block_metadata.remote(block_ref) for block_ref in block_refs
+             ]
+             metadata = ray.get(metadata_futures)
+     return metadata
+
+
+ def _download_manifest_entries_ray_data_distributed(
+     manifest: Manifest,
+     token_holder: Optional[Dict[str, Any]] = None,
+     table_type: TableType = TableType.PYARROW,
+     max_parallelism: Optional[int] = 1000,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+ ) -> DistributedDataset:
+
+     table_pending_ids = []
+     manifest_entries = manifest.entries
+     if manifest_entries:
+         table_pending_ids = invoke_parallel(
+             manifest_entries,
+             download_manifest_entry_ray,
+             token_holder,
+             table_type,
+             column_names,
+             include_columns,
+             file_reader_kwargs_provider,
+             max_parallelism=max_parallelism,
+             options_provider=ray_options_provider,
+         )
+     return TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS[table_type](table_pending_ids)
+
+
+ def _download_manifest_entries_all_dataset_distributed(
+     manifest: Manifest,
+     token_holder: Optional[Dict[str, Any]] = None,
+     table_type: TableType = TableType.PYARROW,
+     max_parallelism: Optional[int] = 1000,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+     distributed_dataset_type: Optional[
+         DistributedDatasetType
+     ] = DistributedDatasetType.RAY_DATASET,
+ ) -> DistributedDataset:
+
+     entry_content_type = None
+     entry_content_encoding = None
+     uris = []
+     for entry in manifest.entries or []:
+         if (
+             entry_content_type is not None
+             and entry_content_type != entry.meta.content_type
+         ):
+             raise ValueError(
+                 f"Mixed content types of ({entry_content_type},"
+                 f" {entry.meta.content_type}) is not supported."
+             )
+
+         if (
+             entry_content_encoding is not None
+             and entry_content_encoding != entry.meta.content_encoding
+         ):
+             raise ValueError(
+                 f"Mixed content encoding of {entry_content_encoding},"
+                 f" {entry.meta.content_encoding} is not supported."
+             )
+
+         entry_content_type = entry.meta.content_type
+         entry_content_encoding = entry.meta.content_encoding
+         uris.append(entry.uri)
+
+     s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
+
+     if distributed_dataset_type in DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC:
+         return DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[distributed_dataset_type.value](
+             uris=uris,
+             content_type=entry_content_type,
+             content_encoding=entry_content_encoding,
+             column_names=column_names,
+             include_columns=include_columns,
+             read_func_kwargs_provider=file_reader_kwargs_provider,
+             ray_options_provider=ray_options_provider,
+             s3_client_kwargs=s3_client_kwargs,
+         )
+     else:
+         raise ValueError(
+             f"Unsupported distributed dataset type={distributed_dataset_type}"
+         )
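Together, these additions give s3u a distributed read path: manifest entries can be fanned out as Ray tasks (one download_manifest_entry_ray call per entry via invoke_parallel) and assembled into a distributed dataset, instead of being collected into a local list of tables. A minimal sketch of calling the new entry point, assuming Ray is already initialized and `manifest` is an existing deltacat Manifest whose entries share one content type and encoding:

    import ray
    from deltacat.aws import s3u
    from deltacat.types.media import DistributedDatasetType, TableType

    ray.init(ignore_reinit_error=True)

    # Default path: each manifest entry is downloaded by a Ray task and the
    # results are assembled via TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS.
    dataset = s3u.download_manifest_entries_distributed(
        manifest=manifest,
        table_type=TableType.PYARROW,
        max_parallelism=1000,
        distributed_dataset_type=DistributedDatasetType.RAY_DATASET,
    )

Passing any other supported DistributedDatasetType instead routes through _download_manifest_entries_all_dataset_distributed, which collects the entry URIs and dispatches to the matching reader in DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC.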