deltacat 0.2.9__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/redshift/__init__.py +4 -0
- deltacat/aws/redshift/model/manifest.py +93 -1
- deltacat/aws/s3u.py +250 -111
- deltacat/catalog/default_catalog_impl/__init__.py +369 -0
- deltacat/compute/compactor_v2/compaction_session.py +175 -152
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
- deltacat/compute/compactor_v2/model/merge_input.py +8 -24
- deltacat/compute/compactor_v2/model/merge_result.py +1 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
- deltacat/compute/compactor_v2/steps/merge.py +106 -171
- deltacat/compute/compactor_v2/utils/delta.py +97 -0
- deltacat/compute/compactor_v2/utils/merge.py +126 -0
- deltacat/compute/compactor_v2/utils/task_options.py +47 -4
- deltacat/compute/merge_on_read/__init__.py +4 -0
- deltacat/compute/merge_on_read/daft.py +40 -0
- deltacat/compute/merge_on_read/model/__init__.py +0 -0
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
- deltacat/compute/merge_on_read/utils/__init__.py +0 -0
- deltacat/compute/merge_on_read/utils/delta.py +42 -0
- deltacat/storage/interface.py +10 -2
- deltacat/storage/model/types.py +3 -11
- deltacat/tests/catalog/__init__.py +0 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
- deltacat/tests/compute/compact_partition_test_cases.py +126 -1
- deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
- deltacat/tests/local_deltacat_storage/__init__.py +19 -2
- deltacat/tests/test_utils/pyarrow.py +33 -14
- deltacat/tests/utils/test_daft.py +42 -2
- deltacat/types/media.py +5 -0
- deltacat/types/tables.py +7 -1
- deltacat/utils/daft.py +78 -13
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/METADATA +2 -2
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/RECORD +39 -27
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/LICENSE +0 -0
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/WHEEL +0 -0
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/top_level.txt +0 -0
deltacat/aws/redshift/__init__.py
CHANGED
@@ -1,4 +1,6 @@
 from deltacat.aws.redshift.model.manifest import (
+    EntryFileParams,
+    EntryType,
     Manifest,
     ManifestAuthor,
     ManifestEntry,
@@ -7,6 +9,8 @@ from deltacat.aws.redshift.model.manifest import (
 )
 
 __all__ = [
+    "EntryFileParams",
+    "EntryType",
     "Manifest",
     "ManifestAuthor",
    "ManifestEntry",
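With this release, the two new manifest classes are re-exported from the package root alongside the existing manifest types, so callers can import them directly. A minimal sketch of the expanded public import surface (assuming deltacat 1.0.0 is installed):

    from deltacat.aws.redshift import EntryFileParams, EntryType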
deltacat/aws/redshift/model/manifest.py
CHANGED
@@ -5,12 +5,63 @@ import itertools
 import logging
 from typing import Any, Dict, List, Optional
 from uuid import uuid4
+from enum import Enum
 
 from deltacat import logs
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
+class EntryType(str, Enum):
+    """
+    Enum representing all possible content categories of an manifest entry file
+    """
+
+    DATA = "data"
+    POSITIONAL_DELETE = "positional_delete"
+    EQUALITY_DELETE = "equality_delete"
+
+    @classmethod
+    def get_default(cls):
+        return EntryType.DATA
+
+    @classmethod
+    def list(cls):
+        return [c.value for c in EntryType]
+
+
+class EntryFileParams(dict):
+    """
+    Represents parameters relevant to the underlying contents of manifest entry. Contains all parameters required to support DELETEs
+    equality_column_names: List of column names that would be used to determine row equality for equality deletes. Relevant only to equality deletes
+    position: Ordinal position of a deleted row in the target data file identified by uri, starting at 0. Relevant only to positional deletes
+    """
+
+    @staticmethod
+    def of(
+        equality_column_names: Optional[List[str]] = None,
+        position: Optional[int] = None,
+    ) -> EntryFileParams:
+        entry_file_params = EntryFileParams()
+        if equality_column_names is not None:
+            entry_file_params["equality_column_names"] = equality_column_names
+        if position is not None:
+            entry_file_params["position"] = position
+        return entry_file_params
+
+    @property
+    def equality_column_names(self) -> Optional[List[str]]:
+        return self.get("equality_column_names")
+
+    @property
+    def url(self) -> Optional[str]:
+        return self.get("url")
+
+    @property
+    def position(self) -> Optional[int]:
+        return self.get("position")
+
+
 class Manifest(dict):
     @staticmethod
     def _build_manifest(
@@ -18,6 +69,7 @@ class Manifest(dict):
         entries: Optional[ManifestEntryList],
         author: Optional[ManifestAuthor] = None,
         uuid: str = None,
+        entry_type: Optional[EntryType] = None,
     ) -> Manifest:
         if not uuid:
             uuid = str(uuid4())
@@ -29,6 +81,8 @@
             manifest["entries"] = entries
         if author is not None:
             manifest["author"] = author
+        if entry_type is not None:
+            manifest["entry_type"] = entry_type.value
         return manifest
 
     @staticmethod
@@ -36,6 +90,7 @@ class Manifest(dict):
         entries: ManifestEntryList,
         author: Optional[ManifestAuthor] = None,
         uuid: str = None,
+        entry_type: Optional[EntryType] = None,
     ) -> Manifest:
         if not uuid:
             uuid = str(uuid4())
@@ -78,8 +133,9 @@
             content_type,
             content_encoding,
             total_source_content_length,
+            entry_type=entry_type,
         )
-        manifest = Manifest._build_manifest(meta, entries, author, uuid)
+        manifest = Manifest._build_manifest(meta, entries, author, uuid, entry_type)
         return manifest
 
     @staticmethod
@@ -128,6 +184,7 @@ class ManifestMeta(dict):
         source_content_length: Optional[int] = None,
         credentials: Optional[Dict[str, str]] = None,
         content_type_parameters: Optional[List[Dict[str, str]]] = None,
+        entry_type: Optional[EntryType] = None,
     ) -> ManifestMeta:
         manifest_meta = ManifestMeta()
         if record_count is not None:
@@ -144,6 +201,8 @@
             manifest_meta["content_encoding"] = content_encoding
         if credentials is not None:
             manifest_meta["credentials"] = credentials
+        if entry_type is not None:
+            manifest_meta["entry_type"] = entry_type.value
         return manifest_meta
 
     @property
@@ -178,6 +237,13 @@ class ManifestMeta(dict):
     def credentials(self) -> Optional[Dict[str, str]]:
         return self.get("credentials")
 
+    @property
+    def entry_type(self) -> Optional[EntryType]:
+        val = self.get("entry_type")
+        if val is not None:
+            return EntryType(self["entry_type"])
+        return val
+
 
 class ManifestAuthor(dict):
     @staticmethod
@@ -206,6 +272,8 @@ class ManifestEntry(dict):
         mandatory: bool = True,
         uri: Optional[str] = None,
         uuid: Optional[str] = None,
+        entry_type: Optional[EntryType] = None,
+        entry_file_params: Optional[EntryFileParams] = None,
     ) -> ManifestEntry:
         manifest_entry = ManifestEntry()
         if not (uri or url):
@@ -222,6 +290,16 @@
             manifest_entry["mandatory"] = mandatory
         if uuid is not None:
             manifest_entry["id"] = uuid
+        if entry_type is not None:
+            manifest_entry["entry_type"] = entry_type.value
+        if entry_file_params is not None:
+            if entry_file_params.get("url") != manifest_entry.get("url"):
+                msg = (
+                    f"Expected manifest entry url: {manifest_entry.url}"
+                    f" and entry_file_params: '{entry_file_params.url}' to match"
+                )
+                raise ValueError(msg)
+            manifest_entry["entry_file_params"] = entry_file_params
         return manifest_entry
 
     @staticmethod
@@ -268,6 +346,20 @@
     def id(self) -> Optional[str]:
         return self.get("id")
 
+    @property
+    def entry_type(self) -> Optional[EntryType]:
+        val = self.get("entry_type")
+        if val is not None:
+            return EntryType(self["entry_type"])
+        return val
+
+    @property
+    def entry_file_params(self) -> Optional[EntryFileParams]:
+        val: Dict[str, Any] = self.get("entry_file_params")
+        if val is not None and not isinstance(val, EntryFileParams):
+            self["entry_file_params"] = val = EntryFileParams(val)
+        return val
+
 
 class ManifestEntryList(List[ManifestEntry]):
     @staticmethod
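Taken together, the additions above let a manifest and each of its entries declare whether an entry file carries data, positional deletes, or equality deletes, and attach the delete-specific parameters. A minimal usage sketch based only on the signatures shown in this hunk; the column name "order_id" and the position value are hypothetical illustration values:

    from deltacat.aws.redshift.model.manifest import EntryFileParams, EntryType

    # Entry types default to DATA; list() returns the raw string values.
    assert EntryType.get_default() is EntryType.DATA
    assert EntryType.list() == ["data", "positional_delete", "equality_delete"]

    # Equality-delete file: rows are matched on the listed columns ("order_id" is hypothetical).
    eq_params = EntryFileParams.of(equality_column_names=["order_id"])
    assert eq_params.equality_column_names == ["order_id"]

    # Positional-delete file: ordinal position of the deleted row in the target file, starting at 0.
    pos_params = EntryFileParams.of(position=42)
    assert pos_params.position == 42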
deltacat/aws/s3u.py
CHANGED
@@ -22,7 +22,7 @@ from tenacity import (
     stop_after_delay,
     wait_random_exponential,
 )
-
+from deltacat.utils.ray_utils.concurrency import invoke_parallel
 import deltacat.aws.clients as aws_utils
 from deltacat import logs
 from deltacat.aws.constants import TIMEOUT_ERROR_CODES
@@ -35,10 +35,17 @@ from deltacat.storage import (
     ManifestEntry,
     ManifestEntryList,
 )
-from deltacat.types.media import
+from deltacat.types.media import (
+    ContentEncoding,
+    ContentType,
+    TableType,
+    DistributedDatasetType,
+)
 from deltacat.types.tables import (
     TABLE_CLASS_TO_SIZE_FUNC,
     TABLE_TYPE_TO_READER_FUNC,
+    TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS,
+    DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC,
     get_table_length,
 )
 from deltacat.types.partial_download import PartialFileDownloadParams
@@ -284,59 +291,6 @@ def upload_sliced_table(
     return manifest_entries
 
 
-@ray.remote
-def _block_metadata(block: Block) -> BlockMetadata:
-    return BlockAccessor.for_block(block).get_metadata(
-        input_files=None,
-        exec_stats=None,
-    )
-
-
-def _get_metadata(
-    table: Union[LocalTable, DistributedDataset],
-    write_paths: List[str],
-    block_refs: List[ObjectRef[Block]],
-) -> List[BlockMetadata]:
-    metadata: List[BlockMetadata] = []
-    if not block_refs:
-        # this must be a local table - ensure it was written to only 1 file
-        assert len(write_paths) == 1, (
-            f"Expected table of type '{type(table)}' to be written to 1 "
-            f"file, but found {len(write_paths)} files."
-        )
-        table_size = None
-        table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
-        if table_size_func:
-            table_size = table_size_func(table)
-        else:
-            logger.warning(f"Unable to estimate '{type(table)}' table size.")
-        metadata.append(
-            BlockMetadata(
-                num_rows=get_table_length(table),
-                size_bytes=table_size,
-                schema=None,
-                input_files=None,
-                exec_stats=None,
-            )
-        )
-    else:
-        # TODO(pdames): Expose BlockList metadata getter from Ray Dataset?
-        # ray 1.10
-        # metadata = dataset._blocks.get_metadata()
-        # ray 2.0.0dev
-        metadata = table._plan.execute().get_metadata()
-        if (
-            not metadata
-            or metadata[0].size_bytes is None
-            or metadata[0].num_rows is None
-        ):
-            metadata_futures = [
-                _block_metadata.remote(block_ref) for block_ref in block_refs
-            ]
-            metadata = ray.get(metadata_futures)
-    return metadata
-
-
 def upload_table(
     table: Union[LocalTable, DistributedDataset],
     s3_base_url: str,
@@ -403,17 +357,7 @@ def download_manifest_entry(
     content_encoding: Optional[ContentEncoding] = None,
 ) -> LocalTable:
 
-
-    s3_client_kwargs = (
-        {
-            "aws_access_key_id": token_holder["accessKeyId"],
-            "aws_secret_access_key": token_holder["secretAccessKey"],
-            "aws_session_token": token_holder["sessionToken"],
-            "config": conf,
-        }
-        if token_holder
-        else {"config": conf}
-    )
+    s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
     if not content_type:
         content_type = manifest_entry.meta.content_type
     assert (
@@ -458,51 +402,9 @@ def download_manifest_entry(
     return table
 
 
-def _download_manifest_entries(
-    manifest: Manifest,
-    token_holder: Optional[Dict[str, Any]] = None,
-    table_type: TableType = TableType.PYARROW,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-) -> LocalDataset:
-
-    return [
-        download_manifest_entry(
-            e,
-            token_holder,
-            table_type,
-            column_names,
-            include_columns,
-            file_reader_kwargs_provider,
-        )
-        for e in manifest.entries
-    ]
-
-
-def _download_manifest_entries_parallel(
-    manifest: Manifest,
-    token_holder: Optional[Dict[str, Any]] = None,
-    table_type: TableType = TableType.PYARROW,
-    max_parallelism: Optional[int] = None,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-) -> LocalDataset:
-
-    tables = []
-    pool = multiprocessing.Pool(max_parallelism)
-    downloader = partial(
-        download_manifest_entry,
-        token_holder=token_holder,
-        table_type=table_type,
-        column_names=column_names,
-        include_columns=include_columns,
-        file_reader_kwargs_provider=file_reader_kwargs_provider,
-    )
-    for table in pool.map(downloader, [e for e in manifest.entries]):
-        tables.append(table)
-    return tables
+@ray.remote
+def download_manifest_entry_ray(*args, **kwargs) -> ObjectRef[LocalTable]:
+    return download_manifest_entry(*args, **kwargs)
 
 
 def download_manifest_entries(
@@ -536,6 +438,42 @@ def download_manifest_entries(
         )
 
 
+def download_manifest_entries_distributed(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = 1000,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+    distributed_dataset_type: Optional[
+        DistributedDatasetType
+    ] = DistributedDatasetType.RAY_DATASET,
+) -> DistributedDataset:
+
+    params = {
+        "manifest": manifest,
+        "token_holder": token_holder,
+        "table_type": table_type,
+        "max_parallelism": max_parallelism,
+        "column_names": column_names,
+        "include_columns": include_columns,
+        "file_reader_kwargs_provider": file_reader_kwargs_provider,
+        "ray_options_provider": ray_options_provider,
+        "distributed_dataset_type": distributed_dataset_type,
+    }
+
+    if distributed_dataset_type == DistributedDatasetType.RAY_DATASET:
+        return _download_manifest_entries_ray_data_distributed(**params)
+    elif distributed_dataset_type is not None:
+        return _download_manifest_entries_all_dataset_distributed(**params)
+    else:
+        raise ValueError(
+            f"Distributed dataset type {distributed_dataset_type} not supported."
+        )
+
+
 def upload(s3_url: str, body, **s3_client_kwargs) -> Dict[str, Any]:
 
     # TODO (pdames): add tenacity retrying
@@ -574,3 +512,204 @@ def download(
     else:
         logger.info(f"file not found: {s3_url}")
         return None
+
+
+def _download_manifest_entries_parallel(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = None,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:
+
+    tables = []
+    pool = multiprocessing.Pool(max_parallelism)
+    downloader = partial(
+        download_manifest_entry,
+        token_holder=token_holder,
+        table_type=table_type,
+        column_names=column_names,
+        include_columns=include_columns,
+        file_reader_kwargs_provider=file_reader_kwargs_provider,
+    )
+    for table in pool.map(downloader, [e for e in manifest.entries]):
+        tables.append(table)
+    return tables
+
+
+def _download_manifest_entries(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:
+
+    return [
+        download_manifest_entry(
+            manifest_entry=e,
+            token_holder=token_holder,
+            table_type=table_type,
+            column_names=column_names,
+            include_columns=include_columns,
+            file_reader_kwargs_provider=file_reader_kwargs_provider,
+        )
+        for e in manifest.entries
+    ]
+
+
+@ray.remote
+def _block_metadata(block: Block) -> BlockMetadata:
+    return BlockAccessor.for_block(block).get_metadata(
+        input_files=None,
+        exec_stats=None,
+    )
+
+
+def _get_s3_client_kwargs_from_token(token_holder) -> Dict[Any, Any]:
+    conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
+    return (
+        {
+            "aws_access_key_id": token_holder["accessKeyId"],
+            "aws_secret_access_key": token_holder["secretAccessKey"],
+            "aws_session_token": token_holder["sessionToken"],
+            "config": conf,
+        }
+        if token_holder
+        else {"config": conf}
+    )
+
+
+def _get_metadata(
+    table: Union[LocalTable, DistributedDataset],
+    write_paths: List[str],
+    block_refs: List[ObjectRef[Block]],
+) -> List[BlockMetadata]:
+    metadata: List[BlockMetadata] = []
+    if not block_refs:
+        # this must be a local table - ensure it was written to only 1 file
+        assert len(write_paths) == 1, (
+            f"Expected table of type '{type(table)}' to be written to 1 "
+            f"file, but found {len(write_paths)} files."
+        )
+        table_size = None
+        table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
+        if table_size_func:
+            table_size = table_size_func(table)
+        else:
+            logger.warning(f"Unable to estimate '{type(table)}' table size.")
+        metadata.append(
+            BlockMetadata(
+                num_rows=get_table_length(table),
+                size_bytes=table_size,
+                schema=None,
+                input_files=None,
+                exec_stats=None,
+            )
+        )
+    else:
+        # TODO(pdames): Expose BlockList metadata getter from Ray Dataset?
+        # ray 1.10
+        # metadata = dataset._blocks.get_metadata()
+        # ray 2.0.0dev
+        metadata = table._plan.execute().get_metadata()
+        if (
+            not metadata
+            or metadata[0].size_bytes is None
+            or metadata[0].num_rows is None
+        ):
+            metadata_futures = [
+                _block_metadata.remote(block_ref) for block_ref in block_refs
+            ]
+            metadata = ray.get(metadata_futures)
+    return metadata
+
+
+def _download_manifest_entries_ray_data_distributed(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = 1000,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+) -> DistributedDataset:
+
+    table_pending_ids = []
+    manifest_entries = manifest.entries
+    if manifest_entries:
+        table_pending_ids = invoke_parallel(
+            manifest_entries,
+            download_manifest_entry_ray,
+            token_holder,
+            table_type,
+            column_names,
+            include_columns,
+            file_reader_kwargs_provider,
+            max_parallelism=max_parallelism,
+            options_provider=ray_options_provider,
+        )
+    return TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS[table_type](table_pending_ids)
+
+
+def _download_manifest_entries_all_dataset_distributed(
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = 1000,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+    distributed_dataset_type: Optional[
+        DistributedDatasetType
+    ] = DistributedDatasetType.RAY_DATASET,
+) -> DistributedDataset:
+
+    entry_content_type = None
+    entry_content_encoding = None
+    uris = []
+    for entry in manifest.entries or []:
+        if (
+            entry_content_type is not None
+            and entry_content_type != entry.meta.content_type
+        ):
+            raise ValueError(
+                f"Mixed content types of ({entry_content_type},"
+                f" {entry.meta.content_type}) is not supported."
+            )
+
+        if (
+            entry_content_encoding is not None
+            and entry_content_encoding != entry.meta.content_encoding
+        ):
+            raise ValueError(
+                f"Mixed content encoding of {entry_content_encoding},"
+                f" {entry.meta.content_encoding} is not supported."
+            )
+
+        entry_content_type = entry.meta.content_type
+        entry_content_encoding = entry.meta.content_encoding
+        uris.append(entry.uri)
+
+    s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
+
+    if distributed_dataset_type in DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC:
+        return DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[distributed_dataset_type.value](
+            uris=uris,
+            content_type=entry_content_type,
+            content_encoding=entry_content_encoding,
+            column_names=column_names,
+            include_columns=include_columns,
+            read_func_kwargs_provider=file_reader_kwargs_provider,
+            ray_options_provider=ray_options_provider,
+            s3_client_kwargs=s3_client_kwargs,
+        )
+    else:
+        raise ValueError(
+            f"Unsupported distributed dataset type={distributed_dataset_type}"
+        )