deltacat 1.1.9__py3-none-any.whl → 1.1.10__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (35)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/redshift/model/manifest.py +16 -0
  3. deltacat/aws/s3u.py +19 -13
  4. deltacat/compute/compactor/compaction_session.py +5 -1
  5. deltacat/compute/compactor/repartition_session.py +1 -0
  6. deltacat/compute/compactor/utils/round_completion_file.py +39 -9
  7. deltacat/compute/compactor_v2/compaction_session.py +15 -11
  8. deltacat/compute/compactor_v2/constants.py +3 -0
  9. deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
  10. deltacat/io/dataset.py +5 -17
  11. deltacat/storage/__init__.py +24 -0
  12. deltacat/storage/interface.py +42 -6
  13. deltacat/storage/model/delta.py +23 -3
  14. deltacat/storage/model/partition.py +6 -7
  15. deltacat/storage/model/partition_spec.py +71 -0
  16. deltacat/storage/model/stream.py +38 -1
  17. deltacat/storage/model/transform.py +127 -0
  18. deltacat/tests/aws/test_s3u.py +2 -0
  19. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
  20. deltacat/tests/compute/compactor_v2/test_compaction_session.py +201 -36
  21. deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
  22. deltacat/tests/compute/test_util_common.py +19 -4
  23. deltacat/tests/local_deltacat_storage/__init__.py +83 -19
  24. deltacat/tests/test_utils/pyarrow.py +4 -1
  25. deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
  26. deltacat/utils/numpy.py +3 -3
  27. deltacat/utils/pandas.py +3 -3
  28. deltacat/utils/pyarrow.py +3 -3
  29. deltacat/utils/ray_utils/dataset.py +7 -7
  30. {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/METADATA +5 -4
  31. {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/RECORD +34 -31
  32. deltacat/io/aws/redshift/redshift_datasource.py +0 -578
  33. {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
  34. {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
  35. {deltacat-1.1.9.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
deltacat/io/aws/redshift/redshift_datasource.py
@@ -1,578 +0,0 @@
- import json
- import logging
- from collections import OrderedDict, defaultdict
- from enum import Enum
- from errno import ENOENT
- from os import strerror
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
- import pyarrow as pa
- import ray
- import s3fs
- from pyarrow import parquet as pq
- from pyarrow.fs import FileSystem, FileType, S3FileSystem
- from ray.data.block import Block, BlockMetadata
- from ray.data.datasource import (
-     BlockWritePathProvider,
-     CSVDatasource,
-     DefaultBlockWritePathProvider,
-     DefaultFileMetadataProvider,
-     ParquetBaseDatasource,
-     ParquetMetadataProvider,
-     PathPartitionParser,
- )
- from ray.data.datasource.datasource import ArrowRow, Datasource, ReadTask, WriteResult
- from ray.data.datasource.file_based_datasource import _resolve_paths_and_filesystem
- from ray.data.datasource.file_meta_provider import FastFileMetadataProvider
- from ray.types import ObjectRef
-
- from deltacat import ContentEncoding, ContentType, logs
- from deltacat.aws.redshift.model.manifest import (
-     Manifest,
-     ManifestEntry,
-     ManifestEntryList,
-     ManifestMeta,
- )
- from deltacat.aws.s3u import (
-     S3Url,
-     filter_objects_by_prefix,
-     objects_to_paths,
-     parse_s3_url,
- )
- from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
- from deltacat.utils.common import ReadKwargsProvider
-
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
- class CapturingBlockWritePathProvider(BlockWritePathProvider):
-     """Delegating block write path provider that saves an ordered dictionary of
-     input keyword arguments for every block write path returned."""
-
-     def __init__(self, block_write_path_provider: BlockWritePathProvider):
-         self.block_write_path_provider = block_write_path_provider
-         self.write_path_kwargs: Dict[str, Dict[str, Any]] = OrderedDict()
-
-     def _get_write_path_for_block(self, base_path: str, *args, **kwargs) -> str:
-         write_path = self.block_write_path_provider(
-             base_path,
-             *args,
-             **kwargs,
-         )
-         kwargs["base_path"] = base_path
-         self.write_path_kwargs[write_path] = kwargs
-         return write_path
-
-
- class CachedFileMetadataProvider(
-     FastFileMetadataProvider,
-     ParquetMetadataProvider,
- ):
-     def __init__(self, meta_cache: Dict[str, BlockMetadata]):
-         self._meta_cache = meta_cache
-
-     def get_meta_cache(self) -> Dict[str, BlockMetadata]:
-         return self._meta_cache
-
-     def _get_block_metadata(
-         self,
-         paths: List[str],
-         schema: Optional[Union[type, pa.Schema]],
-         **kwargs,
-     ) -> BlockMetadata:
-         agg_block_metadata = BlockMetadata(
-             num_rows=0,
-             size_bytes=0,
-             schema=schema,
-             input_files=[],
-             exec_stats=None,
-         )
-         for path in paths:
-             block_metadata = self._meta_cache.get(path)
-             if block_metadata is None:
-                 raise ValueError(f"Block metadata not found for path: {path}")
-             if block_metadata.num_rows is None:
-                 agg_block_metadata.num_rows = None
-             elif agg_block_metadata.num_rows is not None:
-                 agg_block_metadata.num_rows += block_metadata.num_rows
-             if block_metadata.size_bytes is None:
-                 agg_block_metadata.size_bytes = None
-             elif agg_block_metadata.size_bytes is not None:
-                 agg_block_metadata.size_bytes += block_metadata.size_bytes
-             agg_block_metadata.input_files.append(path)
-         return agg_block_metadata
-
-
- class HivePartitionParser(PathPartitionParser):
-     def __init__(
-         self,
-         base_dir: Optional[str] = None,
-         filter_fn: Optional[Callable[[Dict[str, str]], bool]] = None,
-     ):
-         super(HivePartitionParser, self).__init__(
-             base_dir=base_dir,
-             filter_fn=filter_fn,
-         )
-
-
- class RedshiftUnloadTextArgs:
-     def __init__(
-         self,
-         csv: bool = False,
-         header: bool = False,
-         delimiter: Optional[str] = None,
-         bzip2: bool = False,
-         gzip: bool = False,
-         zstd: bool = False,
-         add_quotes: Optional[bool] = None,
-         null_as: str = "",
-         escape: bool = False,
-         fixed_width: bool = False,
-     ):
-         self.header = header
-         self.delimiter = delimiter if delimiter else "," if csv else "|"
-         self.bzip2 = bzip2
-         self.gzip = gzip
-         self.zstd = zstd
-         self.add_quotes = add_quotes if add_quotes else True if csv else False
-         self.null_as = null_as
-         self.escape = escape
-         self.fixed_width = fixed_width
-
-     def _get_arrow_compression_codec_name(self) -> str:
-         arrow_compression_codec_name = None
-         codecs_enabled = {
-             "bz2": self.bzip2,
-             "gzip": self.gzip,
-             "zstd": self.zstd,
-         }
-         for encoding, flag in codecs_enabled.items():
-             if arrow_compression_codec_name and flag:
-                 raise ValueError(
-                     f"Multiple Redshift UNLOAD compression types specified "
-                     f"({codecs_enabled}). Please ensure that only one "
-                     f"compression type is set and try again."
-                 )
-             if flag:
-                 arrow_compression_codec_name = encoding
-         return arrow_compression_codec_name
-
-     def to_arrow_reader_kwargs(
-         self, include_columns: Optional[List[str]], schema: Optional[pa.Schema]
-     ) -> Dict[str, Any]:
-         from pyarrow import csv
-
-         if self.fixed_width:
-             raise NotImplementedError(
-                 "Redshift text files unloaded with FIXEDWIDTH are not "
-                 "currently supported."
-             )
-         open_stream_args = {}
-         arrow_compression_codec_name = self._get_arrow_compression_codec_name()
-         if arrow_compression_codec_name:
-             open_stream_args["compression"] = arrow_compression_codec_name
-         column_names = None
-         if schema:
-             column_names = schema.names
-         autogen_column_names = False if self.header or column_names else True
-         read_options = csv.ReadOptions(
-             use_threads=False,
-             column_names=column_names,
-             autogenerate_column_names=autogen_column_names,
-         )
-         parse_options = csv.ParseOptions(
-             delimiter=self.delimiter,
-             quote_char='"' if self.add_quotes else False,
-             escape_char="\\" if self.escape else False,
-             double_quote=False if self.escape else True,
-         )
-         convert_options = csv.ConvertOptions(
-             column_types=schema,
-             null_values=[self.null_as] if self.null_as is not None else [],
-             true_values=["t"],
-             false_values=["f"],
-             strings_can_be_null=True if self.null_as is not None else False,
-             quoted_strings_can_be_null=True if self.null_as else False,
-             include_columns=include_columns,
-         )
-         return {
-             "open_stream_args": open_stream_args,
-             "read_options": read_options,
-             "parse_options": parse_options,
-             "convert_options": convert_options,
-         }
-
-
- class S3PathType(str, Enum):
-     MANIFEST = "manifest"
-     PREFIX = "prefix"
-     FILES_AND_FOLDERS = "files_and_folders"
-
-
- class RedshiftWriteResult:
-     def __init__(self):
-         self.metadata = None
-         self.path = None
-         self.dataset_uuid = None
-         self.block_write_path_provider = None
-         self.content_type = None
-         self.content_encoding = None
-         self.filesystem = None
-
-
- def _normalize_s3_paths_for_filesystem(
-     paths: Union[str, List[str]],
-     filesystem: Union[S3FileSystem, s3fs.S3FileSystem],
- ) -> Tuple[List[str], List[S3Url]]:
-     if isinstance(paths, str):
-         paths = [paths]
-     urls = [parse_s3_url(url) for url in paths]
-     if isinstance(filesystem, FileSystem):
-         # pyarrow.fs.FileSystem paths should not start with "s3://"
-         # pyarrow.fs.FileSystem paths should not end with "/"
-         paths = [f"{u.bucket}/{u.key}".rstrip("/") for u in urls]
-     else:
-         # s3fs.S3FileSystem can start with "s3://" (presumably others can too)
-         paths = [u.url.rstrip("/") for u in urls]
-     return paths, urls
-
-
- def _read_manifest_entry_paths(
-     entries: ManifestEntryList,
-     manifest_content_type: Optional[str],
-     content_type_provider: Callable[[str], ContentType],
- ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
-     # support manifests with heterogeneous content types
-     content_type_to_paths = defaultdict(list)
-     meta_cache: Dict[str, BlockMetadata] = {}
-     for e in entries:
-         url = e.url if e.url else e.uri
-         # get manifest entry content type or fall back to manifest content type
-         content_type = e.meta.content_type or manifest_content_type
-         if content_type:
-             content_type_to_paths[ContentType(content_type)].append(url)
-         else:
-             # fall back to content type inference by file extension
-             content_type_to_paths[content_type_provider(url)].append(url)
-         meta_cache[url] = BlockMetadata(
-             num_rows=e.meta.record_count,
-             size_bytes=e.meta.content_length,
-             schema=None,
-             input_files=[],
-             exec_stats=None,
-         )
-     return content_type_to_paths, CachedFileMetadataProvider(meta_cache)
-
-
- def _expand_manifest_paths(
-     paths: List[str],
-     filesystem: Optional[Union[S3FileSystem, s3fs.S3FileSystem]],
-     content_type_provider: Callable[[str], ContentType],
- ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
-     assert len(paths) == 1, f"Expected 1 manifest path, found {len(paths)}."
-     path = paths[0]
-     with filesystem.open_input_file(path) as f:
-         manifest = Manifest(json.loads(f.read()))
-     content_type_to_paths = {}
-     meta_provider = CachedFileMetadataProvider({})
-     if not manifest.entries:
-         logger.warning(f"No entries to read in Redshift Manifest: {path}")
-     else:
-         content_type_to_paths, meta_provider = _read_manifest_entry_paths(
-             manifest.entries,
-             manifest.meta.content_type if manifest.meta else None,
-             content_type_provider,
-         )
-     # TODO(pdames): infer the schema from a verbose manifest if available?
-     # if not schema and ContentType.PARQUET not in content_type_to_paths:
-     #     schema = _infer_schema_from_manifest(manifest)
-     return content_type_to_paths, meta_provider
-
-
- def _infer_content_types_from_paths(
-     paths: List[str],
-     content_type_provider: Callable[[str], ContentType],
- ) -> Dict[ContentType, List[str]]:
-     content_type_to_paths = defaultdict(list)
-     for path in paths:
-         if not path.endswith("/"):
-             content_type_to_paths[content_type_provider(path)].append(path)
-     return content_type_to_paths
-
-
- def _expand_prefix_paths(
-     urls: List[S3Url],
-     content_type_provider: Callable[[str], ContentType],
-     **s3_client_kwargs,
- ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
-     assert len(urls) == 1, f"Expected 1 S3 prefix, found {len(urls)}."
-     objects = list(
-         filter_objects_by_prefix(urls[0].bucket, urls[0].key, **s3_client_kwargs)
-     )
-     paths = list(
-         objects_to_paths(
-             urls[0].bucket,
-             objects,
-         )
-     )
-     meta_cache: Dict[str, BlockMetadata] = {
-         path: BlockMetadata(
-             num_rows=None,
-             size_bytes=objects[i]["ContentLength"],
-             schema=None,
-             input_files=[],
-             exec_stats=None,
-         )
-         for i, path in enumerate(paths)
-     }
-     content_type_to_paths = _infer_content_types_from_paths(
-         paths,
-         content_type_provider,
-     )
-     return content_type_to_paths, CachedFileMetadataProvider(meta_cache)
-
-
- def _expand_paths_by_content_type(
-     base_paths: Union[str, List[str]],
-     base_urls: List[S3Url],
-     content_type_provider: Callable[[str], ContentType],
-     path_type: S3PathType,
-     user_fs: Optional[Union[S3FileSystem, s3fs.S3FileSystem]],
-     resolved_fs: S3FileSystem,
-     **s3_client_kwargs,
- ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
-     if path_type == S3PathType.MANIFEST:
-         content_type_to_paths, meta_provider = _expand_manifest_paths(
-             base_paths,
-             resolved_fs,
-             content_type_provider,
-         )
-     elif path_type == S3PathType.PREFIX:
-         content_type_to_paths, meta_provider = _expand_prefix_paths(
-             base_urls,
-             content_type_provider,
-             **s3_client_kwargs,
-         )
-     elif path_type == S3PathType.FILES_AND_FOLDERS:
-         # TODO(pdames): Only allow files and call get_object(file_path)?
-         base_paths, file_infos = DefaultFileMetadataProvider().expand_paths(
-             base_paths, resolved_fs
-         )
-         file_sizes = [file_info.size for file_info in file_infos]
-         meta_provider = CachedFileMetadataProvider(
-             {
-                 path: BlockMetadata(
-                     num_rows=None,
-                     size_bytes=file_sizes[i],
-                     schema=None,
-                     input_files=[],
-                     exec_stats=None,
-                 )
-                 for i, path in enumerate(base_paths)
-             }
-         )
-         content_type_to_paths = _infer_content_types_from_paths(
-             base_paths,
-             content_type_provider,
-         )
-     else:
-         raise NotImplementedError(f"Unsupported S3 path type: {path_type}")
-     # TODO(pdames): normalize S3 file paths before adding them to either
-     #  content_type_to_paths or meta_provider
-     # normalize S3 file paths for each content type based on the filesystem
-     for content_type, paths in content_type_to_paths.items():
-         paths, urls = _normalize_s3_paths_for_filesystem(
-             paths,
-             user_fs,
-         )
-         content_type_to_paths[content_type] = paths
-     # normalize block metadata provider S3 file paths based on the filesystem
-     meta_provider = CachedFileMetadataProvider(
-         {
-             _normalize_s3_paths_for_filesystem(path, user_fs)[0][0]: metadata
-             for path, metadata in meta_provider.get_meta_cache().items()
-         }
-     )
-     return content_type_to_paths, meta_provider
-
-
- class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
-     def prepare_read(
-         self,
-         parallelism: int,
-         paths: Union[str, List[str]],
-         content_type_provider: Callable[[str], ContentType],
-         path_type: S3PathType = S3PathType.MANIFEST,
-         filesystem: Optional[Union[S3FileSystem, s3fs.S3FileSystem]] = None,
-         columns: Optional[List[str]] = None,
-         schema: Optional[pa.Schema] = None,
-         unload_args: RedshiftUnloadTextArgs = RedshiftUnloadTextArgs(),
-         partitioning: HivePartitionParser = None,
-         open_stream_args: Optional[Dict[str, Any]] = None,
-         read_kwargs_provider: Optional[ReadKwargsProvider] = None,
-         **s3_client_kwargs,
-     ) -> List[ReadTask]:
-         # default to pyarrow.fs.S3FileSystem if no filesystem given
-         if filesystem is None:
-             filesystem = S3FileSystem()
-         # normalize s3 paths to work with the filesystem provided
-         paths, urls = _normalize_s3_paths_for_filesystem(paths, filesystem)
-         paths, resolved_fs = _resolve_paths_and_filesystem(
-             paths,
-             filesystem,
-         )
-         # find all files in manifests, prefixes, and folders
-         content_type_to_paths, meta_provider = _expand_paths_by_content_type(
-             paths,
-             urls,
-             content_type_provider,
-             path_type,
-             filesystem,
-             resolved_fs,
-             **s3_client_kwargs,
-         )
-         num_content_types = len(content_type_to_paths)
-         if num_content_types > 1 and not schema:
-             # infer schema from a single parquet file
-             # TODO (pdames): read verbose manifest schema if available, and infer
-             #  schema from a sample parquet dataset if not
-             path = content_type_to_paths[ContentType.PARQUET][0]
-             with resolved_fs.open_input_file(path, **open_stream_args) as f:
-                 schema = pq.read_schema(f)
-         content_type_to_reader = {
-             ContentType.PARQUET: ParquetBaseDatasource(),
-             ContentType.CSV: CSVDatasource(),
-         }
-         all_read_tasks = []
-         for content_type, paths in content_type_to_paths.items():
-             reader = content_type_to_reader.get(content_type)
-             assert reader, f"No datasource found for: {content_type}"
-             prepare_read_kwargs = {
-                 "parallelism": parallelism,
-                 "paths": paths,
-                 "filesystem": resolved_fs,
-                 "schema": schema,
-                 "meta_provider": meta_provider,
-                 "partitioning": partitioning,
-             }
-             if content_type == ContentType.PARQUET:
-                 if columns:
-                     prepare_read_kwargs["columns"] = columns
-             elif content_type in DELIMITED_TEXT_CONTENT_TYPES:
-                 prepare_read_kwargs.update(
-                     unload_args.to_arrow_reader_kwargs(columns, schema)
-                 )
-             else:
-                 raise NotImplementedError(f"Unsupported content type: {content_type}")
-             # merge any provided reader kwargs for this content type with those
-             # inferred from Redshift UNLOAD args
-             if read_kwargs_provider:
-                 prepare_read_kwargs = read_kwargs_provider(
-                     content_type,
-                     prepare_read_kwargs,
-                 )
-             # explicitly specified `open_stream_args` override those inferred
-             # from Redshift UNLOAD args
-             if open_stream_args:
-                 prepare_read_kwargs["open_stream_args"] = open_stream_args
-             read_tasks = reader.prepare_read(**prepare_read_kwargs)
-             all_read_tasks.extend(read_tasks)
-         return all_read_tasks
-
-     def do_write(
-         self,
-         blocks: List[ObjectRef[Block]],
-         metadata: List[BlockMetadata],
-         path: str,
-         dataset_uuid: str,
-         filesystem: Optional[FileSystem] = None,
-         try_create_dir: bool = True,
-         open_stream_args: Optional[Dict[str, Any]] = None,
-         block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
-         write_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
-         _block_udf: Optional[Callable[[Block], Block]] = None,
-         **write_args,
-     ) -> List[ObjectRef[WriteResult]]:
-         if filesystem is None:
-             filesystem = S3FileSystem()
-         paths, _ = _normalize_s3_paths_for_filesystem(path, filesystem)
-         paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
-         assert len(paths) == 1, f"Expected 1 write path, found {len(paths)}."
-         path = paths[0]
-         block_path_provider = CapturingBlockWritePathProvider(block_path_provider)
-         writer = ParquetBaseDatasource()
-         write_results = writer.do_write(
-             blocks,
-             metadata,
-             path,
-             dataset_uuid,
-             filesystem,
-             try_create_dir,
-             open_stream_args,
-             block_path_provider,
-             write_args_fn,
-             _block_udf,
-             **write_args,
-         )
-         # append a summary of this write operation in the last write result
-         rwr = RedshiftWriteResult()
-         rwr.metadata = metadata
-         rwr.path = path
-         rwr.dataset_uuid = dataset_uuid
-         rwr.block_write_path_provider = block_path_provider
-         rwr.content_type = ContentType.PARQUET.value
-         rwr.content_encoding = ContentEncoding.IDENTITY.value
-         rwr.filesystem = filesystem
-         rwr_obj_ref = ray.put(rwr)
-         write_results.append(rwr_obj_ref)
-         return write_results
-
-     def on_write_complete(self, write_results: List[WriteResult], **kwargs) -> None:
-         # TODO (pdames): time latency of this operation - overall redshift write times
-         #  are 2-3x pure read_parquet_fast() times
-         # restore the write operation summary from the last write result
-         result: RedshiftWriteResult = write_results[len(write_results) - 1]
-         write_path_args = result.block_write_path_provider.write_path_kwargs
-         blocks_written = len(write_path_args)
-         expected_blocks_written = len(result.metadata)
-         # TODO(pdames): Corner cases where mismatch is expected? Empty blocks?
-         #  Blocks filtered/split/merged to more/less write paths?
-         assert blocks_written == expected_blocks_written, (
-             f"Dataset write result validation failed. Found "
-             f"{blocks_written}/{expected_blocks_written} Dataset blocks "
-             f"written. Refusing to commit Redshift Manifest."
-         )
-         manifest_entries = ManifestEntryList()
-         for block_idx, path in enumerate(write_path_args.keys()):
-             file_info = result.filesystem.get_file_info(path)
-             if file_info.type == FileType.File:
-                 content_length = file_info.size
-             else:
-                 raise FileNotFoundError(ENOENT, strerror(ENOENT), path)
-             num_rows = result.metadata[block_idx].num_rows
-             source_content_length = result.metadata[block_idx].size_bytes
-             manifest_entry_meta = ManifestMeta.of(
-                 int(num_rows) if num_rows is not None else None,
-                 int(content_length) if content_length is not None else None,
-                 result.content_type,
-                 result.content_encoding,
-                 int(source_content_length) if source_content_length else None,
-             )
-             parsed_url = parse_s3_url(path)
-             manifest_entry = ManifestEntry.of(
-                 parsed_url.url,
-                 manifest_entry_meta,
-             )
-             manifest_entries.append(manifest_entry)
-         manifest = Manifest.of(manifest_entries)
-         manifest_path = f"{result.path}/manifest"
-         logger.debug(f"Write succeeded for Dataset ID: {result.dataset_uuid}")
-         with result.filesystem.open_output_stream(
-             manifest_path,
-             # Also See:
-             # docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonRequestHeaders.html
-             # Arrow s3fs.cc: tinyurl.com/2axa6m9m
-             metadata={"Content-Type": ContentType.JSON.value},
-         ) as f:
-             f.write(json.dumps(manifest).encode("utf-8"))
-         logger.debug(f"Manifest committed to: {manifest_path}")