deltacat 1.1.9__py3-none-any.whl → 1.1.11__py3-none-any.whl
This diff compares the contents of package versions that were publicly released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/redshift/model/manifest.py +16 -0
- deltacat/aws/s3u.py +19 -13
- deltacat/compute/compactor/compaction_session.py +5 -1
- deltacat/compute/compactor/repartition_session.py +1 -0
- deltacat/compute/compactor/utils/round_completion_file.py +39 -9
- deltacat/compute/compactor_v2/compaction_session.py +15 -11
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +1 -1
- deltacat/exceptions.py +5 -2
- deltacat/io/dataset.py +5 -17
- deltacat/storage/__init__.py +24 -0
- deltacat/storage/interface.py +42 -6
- deltacat/storage/model/delta.py +23 -3
- deltacat/storage/model/partition.py +6 -7
- deltacat/storage/model/partition_spec.py +71 -0
- deltacat/storage/model/stream.py +38 -1
- deltacat/storage/model/transform.py +127 -0
- deltacat/tests/aws/test_s3u.py +2 -0
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +231 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +201 -36
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
- deltacat/tests/compute/test_util_common.py +19 -4
- deltacat/tests/local_deltacat_storage/__init__.py +83 -19
- deltacat/tests/test_utils/pyarrow.py +4 -1
- deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
- deltacat/utils/numpy.py +3 -3
- deltacat/utils/pandas.py +3 -3
- deltacat/utils/pyarrow.py +3 -3
- deltacat/utils/ray_utils/dataset.py +7 -7
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/METADATA +6 -5
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/RECORD +36 -33
- deltacat/io/aws/redshift/redshift_datasource.py +0 -578
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/LICENSE +0 -0
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/WHEEL +0 -0
- {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/top_level.txt +0 -0
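The only source file removed outright in this range is deltacat/io/aws/redshift/redshift_datasource.py, deleted in full in the hunk below. Its RedshiftDatasource class wired Redshift UNLOAD output on S3 (manifests, prefixes, or individual files and folders) into Ray Data's legacy Datasource interface (prepare_read/do_write). As a rough, hypothetical migration sketch, not something introduced by this diff, Parquet files under an S3 prefix can be read with Ray Data's public read_parquet API; the bucket, prefix, and column names below are placeholders.

```python
# Hypothetical sketch (not part of this package diff): read Parquet files that
# Redshift UNLOAD wrote to an S3 prefix via Ray Data's public API, covering
# roughly what the removed RedshiftDatasource did for S3PathType.PREFIX.
import ray
from pyarrow.fs import S3FileSystem

ds = ray.data.read_parquet(
    "example-bucket/redshift/unload/",  # placeholder bucket/prefix (no "s3://" with pyarrow fs)
    filesystem=S3FileSystem(),          # credentials resolved from the environment
    columns=["id", "value"],            # placeholder column projection
)
print(ds.schema())
```

Delimited text unloads would instead go through ray.data.read_csv with pyarrow CSV options mirroring the UNLOAD settings that the removed RedshiftUnloadTextArgs translated (delimiter, quoting, escaping, NULL AS).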
--- deltacat/io/aws/redshift/redshift_datasource.py
+++ /dev/null
@@ -1,578 +0,0 @@
-import json
-import logging
-from collections import OrderedDict, defaultdict
-from enum import Enum
-from errno import ENOENT
-from os import strerror
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import pyarrow as pa
-import ray
-import s3fs
-from pyarrow import parquet as pq
-from pyarrow.fs import FileSystem, FileType, S3FileSystem
-from ray.data.block import Block, BlockMetadata
-from ray.data.datasource import (
-    BlockWritePathProvider,
-    CSVDatasource,
-    DefaultBlockWritePathProvider,
-    DefaultFileMetadataProvider,
-    ParquetBaseDatasource,
-    ParquetMetadataProvider,
-    PathPartitionParser,
-)
-from ray.data.datasource.datasource import ArrowRow, Datasource, ReadTask, WriteResult
-from ray.data.datasource.file_based_datasource import _resolve_paths_and_filesystem
-from ray.data.datasource.file_meta_provider import FastFileMetadataProvider
-from ray.types import ObjectRef
-
-from deltacat import ContentEncoding, ContentType, logs
-from deltacat.aws.redshift.model.manifest import (
-    Manifest,
-    ManifestEntry,
-    ManifestEntryList,
-    ManifestMeta,
-)
-from deltacat.aws.s3u import (
-    S3Url,
-    filter_objects_by_prefix,
-    objects_to_paths,
-    parse_s3_url,
-)
-from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
-from deltacat.utils.common import ReadKwargsProvider
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
-class CapturingBlockWritePathProvider(BlockWritePathProvider):
-    """Delegating block write path provider that saves an ordered dictionary of
-    input keyword arguments for every block write path returned."""
-
-    def __init__(self, block_write_path_provider: BlockWritePathProvider):
-        self.block_write_path_provider = block_write_path_provider
-        self.write_path_kwargs: Dict[str, Dict[str, Any]] = OrderedDict()
-
-    def _get_write_path_for_block(self, base_path: str, *args, **kwargs) -> str:
-        write_path = self.block_write_path_provider(
-            base_path,
-            *args,
-            **kwargs,
-        )
-        kwargs["base_path"] = base_path
-        self.write_path_kwargs[write_path] = kwargs
-        return write_path
-
-
-class CachedFileMetadataProvider(
-    FastFileMetadataProvider,
-    ParquetMetadataProvider,
-):
-    def __init__(self, meta_cache: Dict[str, BlockMetadata]):
-        self._meta_cache = meta_cache
-
-    def get_meta_cache(self) -> Dict[str, BlockMetadata]:
-        return self._meta_cache
-
-    def _get_block_metadata(
-        self,
-        paths: List[str],
-        schema: Optional[Union[type, pa.Schema]],
-        **kwargs,
-    ) -> BlockMetadata:
-        agg_block_metadata = BlockMetadata(
-            num_rows=0,
-            size_bytes=0,
-            schema=schema,
-            input_files=[],
-            exec_stats=None,
-        )
-        for path in paths:
-            block_metadata = self._meta_cache.get(path)
-            if block_metadata is None:
-                raise ValueError(f"Block metadata not found for path: {path}")
-            if block_metadata.num_rows is None:
-                agg_block_metadata.num_rows = None
-            elif agg_block_metadata.num_rows is not None:
-                agg_block_metadata.num_rows += block_metadata.num_rows
-            if block_metadata.size_bytes is None:
-                agg_block_metadata.size_bytes = None
-            elif agg_block_metadata.size_bytes is not None:
-                agg_block_metadata.size_bytes += block_metadata.size_bytes
-            agg_block_metadata.input_files.append(path)
-        return agg_block_metadata
-
-
-class HivePartitionParser(PathPartitionParser):
-    def __init__(
-        self,
-        base_dir: Optional[str] = None,
-        filter_fn: Optional[Callable[[Dict[str, str]], bool]] = None,
-    ):
-        super(HivePartitionParser, self).__init__(
-            base_dir=base_dir,
-            filter_fn=filter_fn,
-        )
-
-
-class RedshiftUnloadTextArgs:
-    def __init__(
-        self,
-        csv: bool = False,
-        header: bool = False,
-        delimiter: Optional[str] = None,
-        bzip2: bool = False,
-        gzip: bool = False,
-        zstd: bool = False,
-        add_quotes: Optional[bool] = None,
-        null_as: str = "",
-        escape: bool = False,
-        fixed_width: bool = False,
-    ):
-        self.header = header
-        self.delimiter = delimiter if delimiter else "," if csv else "|"
-        self.bzip2 = bzip2
-        self.gzip = gzip
-        self.zstd = zstd
-        self.add_quotes = add_quotes if add_quotes else True if csv else False
-        self.null_as = null_as
-        self.escape = escape
-        self.fixed_width = fixed_width
-
-    def _get_arrow_compression_codec_name(self) -> str:
-        arrow_compression_codec_name = None
-        codecs_enabled = {
-            "bz2": self.bzip2,
-            "gzip": self.gzip,
-            "zstd": self.zstd,
-        }
-        for encoding, flag in codecs_enabled.items():
-            if arrow_compression_codec_name and flag:
-                raise ValueError(
-                    f"Multiple Redshift UNLOAD compression types specified "
-                    f"({codecs_enabled}). Please ensure that only one "
-                    f"compression type is set and try again."
-                )
-            if flag:
-                arrow_compression_codec_name = encoding
-        return arrow_compression_codec_name
-
-    def to_arrow_reader_kwargs(
-        self, include_columns: Optional[List[str]], schema: Optional[pa.Schema]
-    ) -> Dict[str, Any]:
-        from pyarrow import csv
-
-        if self.fixed_width:
-            raise NotImplementedError(
-                "Redshift text files unloaded with FIXEDWIDTH are not "
-                "currently supported."
-            )
-        open_stream_args = {}
-        arrow_compression_codec_name = self._get_arrow_compression_codec_name()
-        if arrow_compression_codec_name:
-            open_stream_args["compression"] = arrow_compression_codec_name
-        column_names = None
-        if schema:
-            column_names = schema.names
-        autogen_column_names = False if self.header or column_names else True
-        read_options = csv.ReadOptions(
-            use_threads=False,
-            column_names=column_names,
-            autogenerate_column_names=autogen_column_names,
-        )
-        parse_options = csv.ParseOptions(
-            delimiter=self.delimiter,
-            quote_char='"' if self.add_quotes else False,
-            escape_char="\\" if self.escape else False,
-            double_quote=False if self.escape else True,
-        )
-        convert_options = csv.ConvertOptions(
-            column_types=schema,
-            null_values=[self.null_as] if self.null_as is not None else [],
-            true_values=["t"],
-            false_values=["f"],
-            strings_can_be_null=True if self.null_as is not None else False,
-            quoted_strings_can_be_null=True if self.null_as else False,
-            include_columns=include_columns,
-        )
-        return {
-            "open_stream_args": open_stream_args,
-            "read_options": read_options,
-            "parse_options": parse_options,
-            "convert_options": convert_options,
-        }
-
-
-class S3PathType(str, Enum):
-    MANIFEST = "manifest"
-    PREFIX = "prefix"
-    FILES_AND_FOLDERS = "files_and_folders"
-
-
-class RedshiftWriteResult:
-    def __init__(self):
-        self.metadata = None
-        self.path = None
-        self.dataset_uuid = None
-        self.block_write_path_provider = None
-        self.content_type = None
-        self.content_encoding = None
-        self.filesystem = None
-
-
-def _normalize_s3_paths_for_filesystem(
-    paths: Union[str, List[str]],
-    filesystem: Union[S3FileSystem, s3fs.S3FileSystem],
-) -> Tuple[List[str], List[S3Url]]:
-    if isinstance(paths, str):
-        paths = [paths]
-    urls = [parse_s3_url(url) for url in paths]
-    if isinstance(filesystem, FileSystem):
-        # pyarrow.fs.FileSystem paths should not start with "s3://"
-        # pyarrow.fs.FileSystem paths should not end with "/"
-        paths = [f"{u.bucket}/{u.key}".rstrip("/") for u in urls]
-    else:
-        # s3fs.S3FileSystem can start with "s3://" (presumably others can too)
-        paths = [u.url.rstrip("/") for u in urls]
-    return paths, urls
-
-
-def _read_manifest_entry_paths(
-    entries: ManifestEntryList,
-    manifest_content_type: Optional[str],
-    content_type_provider: Callable[[str], ContentType],
-) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
-    # support manifests with heterogenous content types
-    content_type_to_paths = defaultdict(list)
-    meta_cache: Dict[str, BlockMetadata] = {}
-    for e in entries:
-        url = e.url if e.url else e.uri
-        # get manifest entry content type or fall back to manifest content type
-        content_type = e.meta.content_type or manifest_content_type
-        if content_type:
-            content_type_to_paths[ContentType(content_type)] = url
-        else:
-            # fall back to content type inference by file extension
-            content_type_to_paths[content_type_provider(url)].append(url)
-        meta_cache[url] = BlockMetadata(
-            num_rows=e.meta.record_count,
-            size_bytes=e.meta.content_length,
-            schema=None,
-            input_files=[],
-            exec_stats=None,
-        )
-    return content_type_to_paths, CachedFileMetadataProvider(meta_cache)
-
-
-def _expand_manifest_paths(
-    paths: List[str],
-    filesystem: Optional[Union[S3FileSystem, s3fs.S3FileSystem]],
-    content_type_provider: Callable[[str], ContentType],
-) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
-    assert len(paths) == 1, f"Expected 1 manifest path, found {len(paths)}."
-    path = paths[0]
-    with filesystem.open_input_file(path) as f:
-        manifest = Manifest(json.loads(f.read()))
-    content_type_to_paths = {}
-    meta_provider = CachedFileMetadataProvider({})
-    if not manifest.entries:
-        logger.warning(f"No entries to read in Redshift Manifest: {path}")
-    else:
-        content_type_to_paths, meta_provider = _read_manifest_entry_paths(
-            manifest.entries,
-            manifest.meta.content_type if manifest.meta else None,
-            content_type_provider,
-        )
-    # TODO(pdames): infer the schema from a verbose manifest if available?
-    # if not schema and ContentType.PARQUET not in content_type_to_paths:
-    #     schema = _infer_schema_from_manifest(manifest)
-    return content_type_to_paths, meta_provider
-
-
-def _infer_content_types_from_paths(
-    paths: List[str],
-    content_type_provider: Callable[[str], ContentType],
-) -> Dict[ContentType, List[str]]:
-    content_type_to_paths = defaultdict(list)
-    for path in paths:
-        if not path.endswith("/"):
-            content_type_to_paths[content_type_provider(path)].append(path)
-    return content_type_to_paths
-
-
-def _expand_prefix_paths(
-    urls: List[S3Url],
-    content_type_provider: Callable[[str], ContentType],
-    **s3_client_kwargs,
-) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
-    assert len(urls) == 1, f"Expected 1 S3 prefix, found {len(urls)}."
-    objects = list(
-        filter_objects_by_prefix(urls[0].bucket, urls[0].key, **s3_client_kwargs)
-    )
-    paths = list(
-        objects_to_paths(
-            urls[0].bucket,
-            objects,
-        )
-    )
-    meta_cache: Dict[str, BlockMetadata] = {
-        path: BlockMetadata(
-            num_rows=None,
-            size_bytes=objects[i]["ContentLength"],
-            schema=None,
-            input_files=[],
-            exec_stats=None,
-        )
-        for i, path in enumerate(paths)
-    }
-    content_type_to_paths = _infer_content_types_from_paths(
-        paths,
-        content_type_provider,
-    )
-    return content_type_to_paths, CachedFileMetadataProvider(meta_cache)
-
-
-def _expand_paths_by_content_type(
-    base_paths: Union[str, List[str]],
-    base_urls: List[S3Url],
-    content_type_provider: Callable[[str], ContentType],
-    path_type: S3PathType,
-    user_fs: Optional[Union[S3FileSystem, s3fs.S3FileSystem]],
-    resolved_fs: S3FileSystem,
-    **s3_client_kwargs,
-) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
-    if path_type == S3PathType.MANIFEST:
-        content_type_to_paths, meta_provider = _expand_manifest_paths(
-            base_paths,
-            resolved_fs,
-            content_type_provider,
-        )
-    elif path_type == S3PathType.PREFIX:
-        content_type_to_paths, meta_provider = _expand_prefix_paths(
-            base_urls,
-            content_type_provider,
-            **s3_client_kwargs,
-        )
-    elif path_type == S3PathType.FILES_AND_FOLDERS:
-        # TODO(pdames): Only allow files and call get_object(file_path)?
-        base_paths, file_infos = DefaultFileMetadataProvider().expand_paths(
-            base_paths, resolved_fs
-        )
-        file_sizes = [file_info.size for file_info in file_infos]
-        meta_provider = CachedFileMetadataProvider(
-            {
-                path: BlockMetadata(
-                    num_rows=None,
-                    size_bytes=file_sizes[i],
-                    schema=None,
-                    input_files=[],
-                    exec_stats=None,
-                )
-                for i, path in enumerate(base_paths)
-            }
-        )
-        content_type_to_paths = _infer_content_types_from_paths(
-            base_paths,
-            content_type_provider,
-        )
-    else:
-        raise NotImplementedError(f"Unsupported S3 path type: {path_type}")
-    # TODO(pdames): normalize S3 file paths before adding them to either
-    #  content_type_to_paths or meta_provider
-    # normalize S3 file paths for each content type based on the filesystem
-    for content_type, paths in content_type_to_paths.items():
-        paths, urls = _normalize_s3_paths_for_filesystem(
-            paths,
-            user_fs,
-        )
-        content_type_to_paths[content_type] = paths
-    # normalize block metadata provider S3 file paths based on the filesystem
-    meta_provider = CachedFileMetadataProvider(
-        {
-            _normalize_s3_paths_for_filesystem(path, user_fs)[0][0]: metadata
-            for path, metadata in meta_provider.get_meta_cache().items()
-        }
-    )
-    return content_type_to_paths, meta_provider
-
-
-class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
-    def prepare_read(
-        self,
-        parallelism: int,
-        paths: Union[str, List[str]],
-        content_type_provider: Callable[[str], ContentType],
-        path_type: S3PathType = S3PathType.MANIFEST,
-        filesystem: Optional[Union[S3FileSystem, s3fs.S3FileSystem]] = None,
-        columns: Optional[List[str]] = None,
-        schema: Optional[pa.Schema] = None,
-        unload_args: RedshiftUnloadTextArgs = RedshiftUnloadTextArgs(),
-        partitioning: HivePartitionParser = None,
-        open_stream_args: Optional[Dict[str, Any]] = None,
-        read_kwargs_provider: Optional[ReadKwargsProvider] = None,
-        **s3_client_kwargs,
-    ) -> List[ReadTask]:
-        # default to pyarrow.fs.S3FileSystem if no filesystem given
-        if filesystem is None:
-            filesystem = S3FileSystem()
-        # normalize s3 paths to work with the filesystem provided
-        paths, urls = _normalize_s3_paths_for_filesystem(paths, filesystem)
-        paths, resolved_fs = _resolve_paths_and_filesystem(
-            paths,
-            filesystem,
-        )
-        # find all files in manifests, prefixes, and folders
-        content_type_to_paths, meta_provider = _expand_paths_by_content_type(
-            paths,
-            urls,
-            content_type_provider,
-            path_type,
-            filesystem,
-            resolved_fs,
-            **s3_client_kwargs,
-        )
-        num_content_types = len(content_type_to_paths)
-        if num_content_types > 1 and not schema:
-            # infer schema from a single parquet file
-            # TODO (pdames): read verbose manifest schema if available, and infer
-            #  schema from a sample parquet dataset if not
-            path = content_type_to_paths[ContentType.PARQUET][0]
-            with resolved_fs.open_input_file(path, **open_stream_args) as f:
-                schema = pq.read_schema(f)
-        content_type_to_reader = {
-            ContentType.PARQUET: ParquetBaseDatasource(),
-            ContentType.CSV: CSVDatasource(),
-        }
-        all_read_tasks = []
-        for content_type, paths in content_type_to_paths.items():
-            reader = content_type_to_reader.get(content_type)
-            assert reader, f"No datasource found for: {content_type}"
-            prepare_read_kwargs = {
-                "parallelism": parallelism,
-                "paths": paths,
-                "filesystem": resolved_fs,
-                "schema": schema,
-                "meta_provider": meta_provider,
-                "partitioning": partitioning,
-            }
-            if content_type == ContentType.PARQUET:
-                if columns:
-                    prepare_read_kwargs["columns"] = columns
-            elif content_type in DELIMITED_TEXT_CONTENT_TYPES:
-                prepare_read_kwargs.update(
-                    unload_args.to_arrow_reader_kwargs(columns, schema)
-                )
-            else:
-                raise NotImplementedError(f"Unsupported content type: {content_type}")
-            # merge any provided reader kwargs for this content type with those
-            # inferred from Redshift UNLOAD args
-            if read_kwargs_provider:
-                prepare_read_kwargs = read_kwargs_provider(
-                    content_type,
-                    prepare_read_kwargs,
-                )
-            # explicitly specified `open_stream_args` override those inferred
-            # from Redshift UNLOAD args
-            if open_stream_args:
-                prepare_read_kwargs["open_stream_args"] = open_stream_args
-            read_tasks = reader.prepare_read(**prepare_read_kwargs)
-            all_read_tasks.extend(read_tasks)
-        return all_read_tasks
-
-    def do_write(
-        self,
-        blocks: List[ObjectRef[Block]],
-        metadata: List[BlockMetadata],
-        path: str,
-        dataset_uuid: str,
-        filesystem: Optional[FileSystem] = None,
-        try_create_dir: bool = True,
-        open_stream_args: Optional[Dict[str, Any]] = None,
-        block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
-        write_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
-        _block_udf: Optional[Callable[[Block], Block]] = None,
-        **write_args,
-    ) -> List[ObjectRef[WriteResult]]:
-        if filesystem is None:
-            filesystem = S3FileSystem()
-        paths, _ = _normalize_s3_paths_for_filesystem(path, filesystem)
-        paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
-        assert len(paths) == 1, f"Expected 1 write path, found {len(paths)}."
-        path = paths[0]
-        block_path_provider = CapturingBlockWritePathProvider(block_path_provider)
-        writer = ParquetBaseDatasource()
-        write_results = writer.do_write(
-            blocks,
-            metadata,
-            path,
-            dataset_uuid,
-            filesystem,
-            try_create_dir,
-            open_stream_args,
-            block_path_provider,
-            write_args_fn,
-            _block_udf,
-            **write_args,
-        )
-        # append a summary of this write operation in the last write result
-        rwr = RedshiftWriteResult()
-        rwr.metadata = metadata
-        rwr.path = path
-        rwr.dataset_uuid = dataset_uuid
-        rwr.block_write_path_provider = block_path_provider
-        rwr.content_type = ContentType.PARQUET.value
-        rwr.content_encoding = ContentEncoding.IDENTITY.value
-        rwr.filesystem = filesystem
-        rwr_obj_ref = ray.put(rwr)
-        write_results.append(rwr_obj_ref)
-        return write_results
-
-    def on_write_complete(self, write_results: List[WriteResult], **kwargs) -> None:
-        # TODO (pdames): time latency of this operation - overall redshift write times
-        #  are 2-3x pure read_parquet_fast() times
-        # restore the write operation summary from the last write result
-        result: RedshiftWriteResult = write_results[len(write_results) - 1]
-        write_path_args = result.block_write_path_provider.write_path_kwargs
-        blocks_written = len(write_path_args)
-        expected_blocks_written = len(result.metadata)
-        # TODO(pdames): Corner cases where mismatch is expected? Emply blocks?
-        #  Blocks filtered/split/merged to more/less write paths?
-        assert blocks_written == expected_blocks_written, (
-            f"Dataset write result validation failed. Found "
-            f"{blocks_written}/{expected_blocks_written} Dataset blocks "
-            f"written. Refusing to commit Redshift Manifest."
-        )
-        manifest_entries = ManifestEntryList()
-        for block_idx, path in enumerate(write_path_args.keys()):
-            file_info = result.filesystem.get_file_info(path)
-            if file_info.type == FileType.File:
-                content_length = file_info.size
-            else:
-                raise FileNotFoundError(ENOENT, strerror(ENOENT), path)
-            num_rows = result.metadata[block_idx].num_rows
-            source_content_length = result.metadata[block_idx].size_bytes
-            manifest_entry_meta = ManifestMeta.of(
-                int(num_rows) if num_rows is not None else None,
-                int(content_length) if content_length is not None else None,
-                result.content_type,
-                result.content_encoding,
-                int(source_content_length) if source_content_length else None,
-            )
-            parsed_url = parse_s3_url(path)
-            manifest_entry = ManifestEntry.of(
-                parsed_url.url,
-                manifest_entry_meta,
-            )
-            manifest_entries.append(manifest_entry)
-        manifest = Manifest.of(manifest_entries)
-        manifest_path = f"{result.path}/manifest"
-        logger.debug(f"Write succeeded for Dataset ID: {result.dataset_uuid}")
-        with result.filesystem.open_output_stream(
-            manifest_path,
-            # Also See:
-            # docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonRequestHeaders.html
-            # Arrow s3fs.cc: tinyurl.com/2axa6m9m
-            metadata={"Content-Type": ContentType.JSON.value},
-        ) as f:
-            f.write(json.dumps(manifest).encode("utf-8"))
-        logger.debug(f"Manifest committed to: {manifest_path}")
File without changes: {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/LICENSE
File without changes: {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/WHEEL
File without changes: {deltacat-1.1.9.dist-info → deltacat-1.1.11.dist-info}/top_level.txt