deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/io/aws/redshift/redshift_datasource.py
CHANGED
@@ -1,40 +1,45 @@
 import json
 import logging
+from collections import OrderedDict, defaultdict
+from enum import Enum
+from errno import ENOENT
 from os import strerror
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import pyarrow as pa
 import ray
 import s3fs
-
-from errno import ENOENT
-from enum import Enum
-from collections import OrderedDict, defaultdict
-
-from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
-from pyarrow.fs import FileType, FileSystem, S3FileSystem
 from pyarrow import parquet as pq
-
-from ray.data.
-
+from pyarrow.fs import FileSystem, FileType, S3FileSystem
+from ray.data.block import Block, BlockMetadata
+from ray.data.datasource import (
+    BlockWritePathProvider,
+    CSVDatasource,
+    DefaultBlockWritePathProvider,
+    DefaultFileMetadataProvider,
+    ParquetBaseDatasource,
+    ParquetMetadataProvider,
+    PathPartitionParser,
+)
+from ray.data.datasource.datasource import ArrowRow, Datasource, ReadTask, WriteResult
+from ray.data.datasource.file_based_datasource import _resolve_paths_and_filesystem
 from ray.data.datasource.file_meta_provider import FastFileMetadataProvider
-from ray.data.datasource.partitioning import PartitionStyle
 from ray.types import ObjectRef
-from ray.data.datasource import CSVDatasource, BlockWritePathProvider, \
-    DefaultBlockWritePathProvider, ParquetMetadataProvider, \
-    DefaultFileMetadataProvider, ParquetBaseDatasource, PathPartitionParser
-from ray.data.datasource.datasource import ReadTask, WriteResult, Datasource, \
-    ArrowRow
-from ray.data.block import Block, BlockMetadata
-
-from deltacat import ContentType, ContentEncoding
-from deltacat import logs
-from deltacat.aws.redshift.model.manifest import Manifest, ManifestEntryList, \
-    ManifestEntry, ManifestMeta
-
-from typing import Any, Callable, List, Optional, Union, Dict, Tuple

-from deltacat
-
+from deltacat import ContentEncoding, ContentType, logs
+from deltacat.aws.redshift.model.manifest import (
+    Manifest,
+    ManifestEntry,
+    ManifestEntryList,
+    ManifestMeta,
+)
+from deltacat.aws.s3u import (
+    S3Url,
+    filter_objects_by_prefix,
+    objects_to_paths,
+    parse_s3_url,
+)
+from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
 from deltacat.utils.common import ReadKwargsProvider

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -43,15 +48,12 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 class CapturingBlockWritePathProvider(BlockWritePathProvider):
     """Delegating block write path provider that saves an ordered dictionary of
     input keyword arguments for every block write path returned."""
+
     def __init__(self, block_write_path_provider: BlockWritePathProvider):
         self.block_write_path_provider = block_write_path_provider
         self.write_path_kwargs: Dict[str, Dict[str, Any]] = OrderedDict()

-    def _get_write_path_for_block(
-            self,
-            base_path: str,
-            *args,
-            **kwargs) -> str:
+    def _get_write_path_for_block(self, base_path: str, *args, **kwargs) -> str:
         write_path = self.block_write_path_provider(
             base_path,
             *args,
@@ -73,10 +75,10 @@ class CachedFileMetadataProvider(
         return self._meta_cache

     def _get_block_metadata(
-
-
-
-
+        self,
+        paths: List[str],
+        schema: Optional[Union[type, pa.Schema]],
+        **kwargs,
     ) -> BlockMetadata:
         agg_block_metadata = BlockMetadata(
             num_rows=0,
@@ -103,9 +105,9 @@ class CachedFileMetadataProvider(

 class HivePartitionParser(PathPartitionParser):
     def __init__(
-
-
-
+        self,
+        base_dir: Optional[str] = None,
+        filter_fn: Optional[Callable[[Dict[str, str]], bool]] = None,
     ):
         super(HivePartitionParser, self).__init__(
             base_dir=base_dir,
@@ -115,17 +117,17 @@ class HivePartitionParser(PathPartitionParser):

 class RedshiftUnloadTextArgs:
     def __init__(
-
-
-
-
-
-
-
-
-
-
-
+        self,
+        csv: bool = False,
+        header: bool = False,
+        delimiter: Optional[str] = None,
+        bzip2: bool = False,
+        gzip: bool = False,
+        zstd: bool = False,
+        add_quotes: Optional[bool] = None,
+        null_as: str = "",
+        escape: bool = False,
+        fixed_width: bool = False,
     ):
         self.header = header
         self.delimiter = delimiter if delimiter else "," if csv else "|"
@@ -149,20 +151,22 @@ class RedshiftUnloadTextArgs:
                 raise ValueError(
                     f"Multiple Redshift UNLOAD compression types specified "
                     f"({codecs_enabled}). Please ensure that only one "
-                    f"compression type is set and try again."
+                    f"compression type is set and try again."
+                )
             if flag:
                 arrow_compression_codec_name = encoding
         return arrow_compression_codec_name

     def to_arrow_reader_kwargs(
-
-
-            schema: Optional[pa.Schema]) -> Dict[str, Any]:
+        self, include_columns: Optional[List[str]], schema: Optional[pa.Schema]
+    ) -> Dict[str, Any]:
         from pyarrow import csv
+
         if self.fixed_width:
             raise NotImplementedError(
                 "Redshift text files unloaded with FIXEDWIDTH are not "
-                "currently supported."
+                "currently supported."
+            )
         open_stream_args = {}
         arrow_compression_codec_name = self._get_arrow_compression_codec_name()
         if arrow_compression_codec_name:
@@ -217,8 +221,8 @@ class RedshiftWriteResult:


 def _normalize_s3_paths_for_filesystem(
-
-
+    paths: Union[str, List[str]],
+    filesystem: Union[S3FileSystem, s3fs.S3FileSystem],
 ) -> Tuple[List[str], List[S3Url]]:
     if isinstance(paths, str):
         paths = [paths]
@@ -234,9 +238,9 @@ def _normalize_s3_paths_for_filesystem(


 def _read_manifest_entry_paths(
-
-
-
+    entries: ManifestEntryList,
+    manifest_content_type: Optional[str],
+    content_type_provider: Callable[[str], ContentType],
 ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
     # support manifests with heterogenous content types
     content_type_to_paths = defaultdict(list)
@@ -261,9 +265,9 @@ def _read_manifest_entry_paths(


 def _expand_manifest_paths(
-
-
-
+    paths: List[str],
+    filesystem: Optional[Union[S3FileSystem, s3fs.S3FileSystem]],
+    content_type_provider: Callable[[str], ContentType],
 ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
     assert len(paths) == 1, f"Expected 1 manifest path, found {len(paths)}."
     path = paths[0]
@@ -286,8 +290,8 @@ def _expand_manifest_paths(


 def _infer_content_types_from_paths(
-
-
+    paths: List[str],
+    content_type_provider: Callable[[str], ContentType],
 ) -> Dict[ContentType, List[str]]:
     content_type_to_paths = defaultdict(list)
     for path in paths:
@@ -297,27 +301,30 @@ def _infer_content_types_from_paths(


 def _expand_prefix_paths(
-
-
-
+    urls: List[S3Url],
+    content_type_provider: Callable[[str], ContentType],
+    **s3_client_kwargs,
 ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
     assert len(urls) == 1, f"Expected 1 S3 prefix, found {len(urls)}."
-    objects = list(
-        urls[0].bucket,
-
-
-
-
-
-
-    )
-    meta_cache: Dict[str, BlockMetadata] = {
-
-
-
-
-
-
+    objects = list(
+        filter_objects_by_prefix(urls[0].bucket, urls[0].key, **s3_client_kwargs)
+    )
+    paths = list(
+        objects_to_paths(
+            urls[0].bucket,
+            objects,
+        )
+    )
+    meta_cache: Dict[str, BlockMetadata] = {
+        path: BlockMetadata(
+            num_rows=None,
+            size_bytes=objects[i]["ContentLength"],
+            schema=None,
+            input_files=[],
+            exec_stats=None,
+        )
+        for i, path in enumerate(paths)
+    }
     content_type_to_paths = _infer_content_types_from_paths(
         paths,
         content_type_provider,
@@ -326,13 +333,13 @@ def _expand_prefix_paths(


 def _expand_paths_by_content_type(
-
-
-
-
-
-
-
+    base_paths: Union[str, List[str]],
+    base_urls: List[S3Url],
+    content_type_provider: Callable[[str], ContentType],
+    path_type: S3PathType,
+    user_fs: Optional[Union[S3FileSystem, s3fs.S3FileSystem]],
+    resolved_fs: S3FileSystem,
+    **s3_client_kwargs,
 ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
     if path_type == S3PathType.MANIFEST:
         content_type_to_paths, meta_provider = _expand_manifest_paths(
@@ -348,16 +355,22 @@ def _expand_paths_by_content_type(
         )
     elif path_type == S3PathType.FILES_AND_FOLDERS:
         # TODO(pdames): Only allow files and call get_object(file_path)?
-        base_paths, file_infos = DefaultFileMetadataProvider()
-
+        base_paths, file_infos = DefaultFileMetadataProvider().expand_paths(
+            base_paths, resolved_fs
+        )
         file_sizes = [file_info.size for file_info in file_infos]
-        meta_provider = CachedFileMetadataProvider(
-
-
-
-
-
-
+        meta_provider = CachedFileMetadataProvider(
+            {
+                path: BlockMetadata(
+                    num_rows=None,
+                    size_bytes=file_sizes[i],
+                    schema=None,
+                    input_files=[],
+                    exec_stats=None,
+                )
+                for i, path in enumerate(base_paths)
+            }
+        )
         content_type_to_paths = _infer_content_types_from_paths(
             base_paths,
             content_type_provider,
@@ -374,28 +387,30 @@ def _expand_paths_by_content_type(
         )
         content_type_to_paths[content_type] = paths
     # normalize block metadata provider S3 file paths based on the filesystem
-    meta_provider = CachedFileMetadataProvider(
-
-
-
+    meta_provider = CachedFileMetadataProvider(
+        {
+            _normalize_s3_paths_for_filesystem(path, user_fs)[0][0]: metadata
+            for path, metadata in meta_provider.get_meta_cache().items()
+        }
+    )
     return content_type_to_paths, meta_provider


 class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
     def prepare_read(
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self,
+        parallelism: int,
+        paths: Union[str, List[str]],
+        content_type_provider: Callable[[str], ContentType],
+        path_type: S3PathType = S3PathType.MANIFEST,
+        filesystem: Optional[Union[S3FileSystem, s3fs.S3FileSystem]] = None,
+        columns: Optional[List[str]] = None,
+        schema: Optional[pa.Schema] = None,
+        unload_args: RedshiftUnloadTextArgs = RedshiftUnloadTextArgs(),
+        partitioning: HivePartitionParser = None,
+        open_stream_args: Optional[Dict[str, Any]] = None,
+        read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+        **s3_client_kwargs,
     ) -> List[ReadTask]:
         # default to pyarrow.fs.S3FileSystem if no filesystem given
         if filesystem is None:
@@ -445,7 +460,8 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
                 prepare_read_kwargs["columns"] = columns
             elif content_type in DELIMITED_TEXT_CONTENT_TYPES:
                 prepare_read_kwargs.update(
-                    unload_args.to_arrow_reader_kwargs(columns, schema)
+                    unload_args.to_arrow_reader_kwargs(columns, schema)
+                )
             else:
                 raise NotImplementedError(f"Unsupported content type: {content_type}")
             # merge any provided reader kwargs for this content type with those
@@ -464,19 +480,18 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
         return all_read_tasks

     def do_write(
-
-
-
-
-
-
-
-
-
-
-
-
-        **write_args,
+        self,
+        blocks: List[ObjectRef[Block]],
+        metadata: List[BlockMetadata],
+        path: str,
+        dataset_uuid: str,
+        filesystem: Optional[FileSystem] = None,
+        try_create_dir: bool = True,
+        open_stream_args: Optional[Dict[str, Any]] = None,
+        block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
+        write_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
+        _block_udf: Optional[Callable[[Block], Block]] = None,
+        **write_args,
     ) -> List[ObjectRef[WriteResult]]:
         if filesystem is None:
             filesystem = S3FileSystem()
@@ -484,8 +499,7 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
         paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
         assert len(paths) == 1, f"Expected 1 write path, found {len(paths)}."
         path = paths[0]
-        block_path_provider = CapturingBlockWritePathProvider(
-            block_path_provider)
+        block_path_provider = CapturingBlockWritePathProvider(block_path_provider)
         writer = ParquetBaseDatasource()
         write_results = writer.do_write(
             blocks,
@@ -513,21 +527,21 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
             write_results.append(rwr_obj_ref)
         return write_results

-    def on_write_complete(self, write_results: List[WriteResult], **kwargs)
-            -> None:
+    def on_write_complete(self, write_results: List[WriteResult], **kwargs) -> None:
         # TODO (pdames): time latency of this operation - overall redshift write times
         # are 2-3x pure read_parquet_fast() times
         # restore the write operation summary from the last write result
-        result: RedshiftWriteResult = write_results[len(write_results)-1]
+        result: RedshiftWriteResult = write_results[len(write_results) - 1]
         write_path_args = result.block_write_path_provider.write_path_kwargs
         blocks_written = len(write_path_args)
         expected_blocks_written = len(result.metadata)
         # TODO(pdames): Corner cases where mismatch is expected? Emply blocks?
         # Blocks filtered/split/merged to more/less write paths?
-        assert blocks_written == expected_blocks_written,
-            f"Dataset write result validation failed. Found "
-            f"{blocks_written}/{expected_blocks_written} Dataset blocks "
+        assert blocks_written == expected_blocks_written, (
+            f"Dataset write result validation failed. Found "
+            f"{blocks_written}/{expected_blocks_written} Dataset blocks "
             f"written. Refusing to commit Redshift Manifest."
+        )
         manifest_entries = ManifestEntryList()
         for block_idx, path in enumerate(write_path_args.keys()):
             file_info = result.filesystem.get_file_info(path)
@@ -554,11 +568,11 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
         manifest_path = f"{result.path}/manifest"
         logger.debug(f"Write succeeded for Dataset ID: {result.dataset_uuid}")
         with result.filesystem.open_output_stream(
-
-
-
-
-
+            manifest_path,
+            # Also See:
+            # docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonRequestHeaders.html
+            # Arrow s3fs.cc: tinyurl.com/2axa6m9m
+            metadata={"Content-Type": ContentType.JSON.value},
         ) as f:
             f.write(json.dumps(manifest).encode("utf-8"))
             logger.debug(f"Manifest committed to: {manifest_path}")
deltacat/io/dataset.py
CHANGED
@@ -1,18 +1,16 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations

+from typing import Any, Callable, Dict, Optional, TypeVar, Union, cast
+
 import pyarrow as pa
 import s3fs
-
-from typing import Optional, Union, Callable, Dict, Any, cast, TypeVar
-
 from ray.data import Dataset
-from ray.data.datasource import
-    BlockWritePathProvider
+from ray.data.datasource import BlockWritePathProvider, DefaultBlockWritePathProvider

 from deltacat.io.aws.redshift.redshift_datasource import RedshiftDatasource

-T = TypeVar(
+T = TypeVar("T")


 class DeltacatDataset(Dataset[T]):
@@ -23,17 +21,16 @@ class DeltacatDataset(Dataset[T]):
         return cast(DeltacatDataset[T], dataset)

     def write_redshift(
-
-
-
-
-
-
-
-
-
-
-        **arrow_parquet_args) -> None:
+        self,
+        path: str,
+        *,
+        filesystem: Optional[Union[pa.fs.FileSystem, s3fs.S3FileSystem]] = None,
+        try_create_dir: bool = True,
+        arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+        block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
+        arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
+        **arrow_parquet_args,
+    ) -> None:
         """Writes the dataset to Parquet files and commits a Redshift manifest
         back to S3 indexing the files written. The output can be loaded into
         Redshift by providing it to the Redshift COPY command, or via AWS Data
deltacat/io/read_api.py
CHANGED
@@ -1,35 +1,38 @@
-import
-import pyarrow as pa
-from deltacat.utils.common import ReadKwargsProvider
+from typing import Any, Callable, Dict, List, Optional, Union

+import pyarrow as pa
+import s3fs
 from ray.data import read_datasource
 from ray.data._internal.arrow_block import ArrowRow

 from deltacat import ContentType
+from deltacat.io.aws.redshift.redshift_datasource import (
+    HivePartitionParser,
+    RedshiftDatasource,
+    RedshiftUnloadTextArgs,
+    S3PathType,
+)
 from deltacat.io.dataset import DeltacatDataset
-from deltacat.
-    RedshiftDatasource, RedshiftUnloadTextArgs, S3PathType, HivePartitionParser
-
-from typing import Optional, Union, List, Dict, Any, Callable
+from deltacat.utils.common import ReadKwargsProvider


 def read_redshift(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    paths: Union[str, List[str]],
+    *,
+    path_type: S3PathType = S3PathType.MANIFEST,
+    filesystem: Optional[Union[pa.fs.S3FileSystem, s3fs.S3FileSystem]] = None,
+    columns: Optional[List[str]] = None,
+    schema: Optional[pa.Schema] = None,
+    unload_text_args: RedshiftUnloadTextArgs = RedshiftUnloadTextArgs(),
+    partitioning: HivePartitionParser = None,
+    content_type_provider: Callable[[str], ContentType] = lambda p: ContentType.PARQUET
+    if p.endswith(".parquet")
+    else ContentType.CSV,
+    parallelism: int = 200,
+    ray_remote_args: Dict[str, Any] = None,
+    arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    **kwargs,
 ) -> DeltacatDataset[ArrowRow]:
     """Reads Redshift UNLOAD results from either S3 Parquet or delimited text
     files into a Ray Dataset.
@@ -38,7 +41,7 @@ def read_redshift(
     >>> # Read all files contained in a Redshift Manifest:
     >>> import deltacat as dc
     >>> dc.io.read_redshift("/bucket/dir/manifest")
-
+
     >>> # Read all files matching the given key prefix. If this prefix
     >>> # refers to multiple files, like s3://bucket/data.parquet,
     >>> # s3://bucket/data.1.csv, etc. then all will be read. The dataset
@@ -55,19 +58,19 @@ def read_redshift(
     >>> dc.io.read_redshift(
     >>>     "/bucket/dir",
     >>>     path_type=S3PathType.PREFIX)
-
+
     >>> # Read multiple files and folders:
     >>> dc.io.read_redshift(
-    >>> ["/bucket/file1", "/bucket/folder1/"],
+    >>>     ["/bucket/file1", "/bucket/folder1/"],
     >>>     path_type=S3PathType.FILES_AND_FOLDERS)

     >>> # Read multiple Parquet and CSV files. The dataset schema will be
-    >>> # inferred from the first parquet file and used for explicit type
+    >>> # inferred from the first parquet file and used for explicit type
     >>> # conversion of all CSV files:
     >>> dc.io.read_redshift(
     >>>     ["/bucket/file.parquet", "/bucket/file.csv"],
     >>>     path_type=S3PathType.FILES_AND_FOLDERS)
-
+
     Args:
         paths: Paths to S3 files and folders to read. If `path_type` is
             `MANIFEST` then this must be an S3 Redshift Manifest JSON file. If
@@ -93,27 +96,27 @@ def read_redshift(
             discovered is used instead.
         unload_text_args: Arguments used when running Redshift `UNLOAD` to
             text file formats (e.g. CSV). These arguments ensure that all input
-            text files will be correctly parsed. If not specified, then all
-            text files read are assumed to use Redshift UNLOAD's default
+            text files will be correctly parsed. If not specified, then all
+            text files read are assumed to use Redshift UNLOAD's default
             pipe-delimited text format.
         partition_base_dir: Base directory to start searching for partitions
             (exclusive). File paths outside of this directory will not be parsed
             for partitions and automatically added to the dataset without passing
             through any partition filter. Specify `None` or an empty string to
            search for partitions in all file path directories.
-        partition_filter_fn: Callback used to filter `PARTITION` columns. Receives a
+        partition_filter_fn: Callback used to filter `PARTITION` columns. Receives a
            dictionary mapping partition keys to values as input, returns `True` to
            read a partition, and `False` to skip it. Each partition key and value
            is a string parsed directly from an S3 key using hive-style
            partition directory names of the form "{key}={value}". For example:
-            ``lambda x:
+            ``lambda x:
            True if x["month"] == "January" and x["year"] == "2022" else False``
        content_type_provider: Takes a file path as input and returns the file
            content type as output.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the number of files of the dataset.
        ray_remote_args: kwargs passed to `ray.remote` in the read tasks.
-        arrow_open_stream_args: kwargs passed to to
+        arrow_open_stream_args: kwargs passed to to
            `pa.fs.open_input_stream()`.
        pa_read_func_kwargs_provider: Callback that takes a `ContentType` value
            string as input, and provides read options to pass to either
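As a consolidated illustration of the reworked keyword-only `read_redshift` signature above, a minimal sketch of reading an UNLOAD manifest (the bucket path is a placeholder; all other arguments rely on the defaults shown in the signature):

    import deltacat as dc
    from deltacat.io.aws.redshift.redshift_datasource import S3PathType

    # Read every file listed in a Redshift UNLOAD manifest into a Ray-backed
    # DeltacatDataset of Arrow rows; paths ending in ".parquet" are treated as
    # Parquet and everything else as CSV by the default content_type_provider.
    ds = dc.io.read_redshift(
        "s3://example-bucket/unload/manifest",  # placeholder manifest path
        path_type=S3PathType.MANIFEST,
        parallelism=64,
    )
    print(ds.count())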
|