deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/io/aws/redshift/redshift_datasource.py CHANGED
@@ -1,40 +1,45 @@
 import json
 import logging
+from collections import OrderedDict, defaultdict
+from enum import Enum
+from errno import ENOENT
 from os import strerror
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import pyarrow as pa
 import ray
 import s3fs
-
-from errno import ENOENT
-from enum import Enum
-from collections import OrderedDict, defaultdict
-
-from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
-from pyarrow.fs import FileType, FileSystem, S3FileSystem
 from pyarrow import parquet as pq
-
-from ray.data.datasource.file_based_datasource import \
-    _resolve_paths_and_filesystem
+from pyarrow.fs import FileSystem, FileType, S3FileSystem
+from ray.data.block import Block, BlockMetadata
+from ray.data.datasource import (
+    BlockWritePathProvider,
+    CSVDatasource,
+    DefaultBlockWritePathProvider,
+    DefaultFileMetadataProvider,
+    ParquetBaseDatasource,
+    ParquetMetadataProvider,
+    PathPartitionParser,
+)
+from ray.data.datasource.datasource import ArrowRow, Datasource, ReadTask, WriteResult
+from ray.data.datasource.file_based_datasource import _resolve_paths_and_filesystem
 from ray.data.datasource.file_meta_provider import FastFileMetadataProvider
-from ray.data.datasource.partitioning import PartitionStyle
 from ray.types import ObjectRef
-from ray.data.datasource import CSVDatasource, BlockWritePathProvider, \
-    DefaultBlockWritePathProvider, ParquetMetadataProvider, \
-    DefaultFileMetadataProvider, ParquetBaseDatasource, PathPartitionParser
-from ray.data.datasource.datasource import ReadTask, WriteResult, Datasource, \
-    ArrowRow
-from ray.data.block import Block, BlockMetadata
-
-from deltacat import ContentType, ContentEncoding
-from deltacat import logs
-from deltacat.aws.redshift.model.manifest import Manifest, ManifestEntryList, \
-    ManifestEntry, ManifestMeta
-
-from typing import Any, Callable, List, Optional, Union, Dict, Tuple

-from deltacat.aws.s3u import parse_s3_url, S3Url, filter_objects_by_prefix, \
-    objects_to_paths
+from deltacat import ContentEncoding, ContentType, logs
+from deltacat.aws.redshift.model.manifest import (
+    Manifest,
+    ManifestEntry,
+    ManifestEntryList,
+    ManifestMeta,
+)
+from deltacat.aws.s3u import (
+    S3Url,
+    filter_objects_by_prefix,
+    objects_to_paths,
+    parse_s3_url,
+)
+from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
 from deltacat.utils.common import ReadKwargsProvider

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -43,15 +48,12 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 class CapturingBlockWritePathProvider(BlockWritePathProvider):
     """Delegating block write path provider that saves an ordered dictionary of
     input keyword arguments for every block write path returned."""
+
     def __init__(self, block_write_path_provider: BlockWritePathProvider):
         self.block_write_path_provider = block_write_path_provider
         self.write_path_kwargs: Dict[str, Dict[str, Any]] = OrderedDict()

-    def _get_write_path_for_block(
-            self,
-            base_path: str,
-            *args,
-            **kwargs) -> str:
+    def _get_write_path_for_block(self, base_path: str, *args, **kwargs) -> str:
         write_path = self.block_write_path_provider(
             base_path,
             *args,
@@ -73,10 +75,10 @@ class CachedFileMetadataProvider(
         return self._meta_cache

     def _get_block_metadata(
-            self,
-            paths: List[str],
-            schema: Optional[Union[type, pa.Schema]],
-            **kwargs,
+        self,
+        paths: List[str],
+        schema: Optional[Union[type, pa.Schema]],
+        **kwargs,
     ) -> BlockMetadata:
         agg_block_metadata = BlockMetadata(
             num_rows=0,
@@ -103,9 +105,9 @@ class CachedFileMetadataProvider(

 class HivePartitionParser(PathPartitionParser):
     def __init__(
-            self,
-            base_dir: Optional[str] = None,
-            filter_fn: Optional[Callable[[Dict[str, str]], bool]] = None,
+        self,
+        base_dir: Optional[str] = None,
+        filter_fn: Optional[Callable[[Dict[str, str]], bool]] = None,
     ):
         super(HivePartitionParser, self).__init__(
             base_dir=base_dir,
@@ -115,17 +117,17 @@ class HivePartitionParser(PathPartitionParser):

 class RedshiftUnloadTextArgs:
     def __init__(
-            self,
-            csv: bool = False,
-            header: bool = False,
-            delimiter: Optional[str] = None,
-            bzip2: bool = False,
-            gzip: bool = False,
-            zstd: bool = False,
-            add_quotes: Optional[bool] = None,
-            null_as: str = "",
-            escape: bool = False,
-            fixed_width: bool = False,
+        self,
+        csv: bool = False,
+        header: bool = False,
+        delimiter: Optional[str] = None,
+        bzip2: bool = False,
+        gzip: bool = False,
+        zstd: bool = False,
+        add_quotes: Optional[bool] = None,
+        null_as: str = "",
+        escape: bool = False,
+        fixed_width: bool = False,
     ):
         self.header = header
         self.delimiter = delimiter if delimiter else "," if csv else "|"
@@ -149,20 +151,22 @@ class RedshiftUnloadTextArgs:
             raise ValueError(
                 f"Multiple Redshift UNLOAD compression types specified "
                 f"({codecs_enabled}). Please ensure that only one "
-                f"compression type is set and try again.")
+                f"compression type is set and try again."
+            )
         if flag:
             arrow_compression_codec_name = encoding
         return arrow_compression_codec_name

     def to_arrow_reader_kwargs(
-            self,
-            include_columns: Optional[List[str]],
-            schema: Optional[pa.Schema]) -> Dict[str, Any]:
+        self, include_columns: Optional[List[str]], schema: Optional[pa.Schema]
+    ) -> Dict[str, Any]:
         from pyarrow import csv
+
         if self.fixed_width:
             raise NotImplementedError(
                 "Redshift text files unloaded with FIXEDWIDTH are not "
-                "currently supported.")
+                "currently supported."
+            )
         open_stream_args = {}
         arrow_compression_codec_name = self._get_arrow_compression_codec_name()
         if arrow_compression_codec_name:
@@ -217,8 +221,8 @@ class RedshiftWriteResult:


 def _normalize_s3_paths_for_filesystem(
-        paths: Union[str, List[str]],
-        filesystem: Union[S3FileSystem, s3fs.S3FileSystem],
+    paths: Union[str, List[str]],
+    filesystem: Union[S3FileSystem, s3fs.S3FileSystem],
 ) -> Tuple[List[str], List[S3Url]]:
     if isinstance(paths, str):
         paths = [paths]
@@ -234,9 +238,9 @@ def _normalize_s3_paths_for_filesystem(


 def _read_manifest_entry_paths(
-        entries: ManifestEntryList,
-        manifest_content_type: Optional[str],
-        content_type_provider: Callable[[str], ContentType],
+    entries: ManifestEntryList,
+    manifest_content_type: Optional[str],
+    content_type_provider: Callable[[str], ContentType],
 ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
     # support manifests with heterogenous content types
     content_type_to_paths = defaultdict(list)
@@ -261,9 +265,9 @@ def _read_manifest_entry_paths(


 def _expand_manifest_paths(
-        paths: List[str],
-        filesystem: Optional[Union[S3FileSystem, s3fs.S3FileSystem]],
-        content_type_provider: Callable[[str], ContentType],
+    paths: List[str],
+    filesystem: Optional[Union[S3FileSystem, s3fs.S3FileSystem]],
+    content_type_provider: Callable[[str], ContentType],
 ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
     assert len(paths) == 1, f"Expected 1 manifest path, found {len(paths)}."
     path = paths[0]
@@ -286,8 +290,8 @@ def _expand_manifest_paths(


 def _infer_content_types_from_paths(
-        paths: List[str],
-        content_type_provider: Callable[[str], ContentType],
+    paths: List[str],
+    content_type_provider: Callable[[str], ContentType],
 ) -> Dict[ContentType, List[str]]:
     content_type_to_paths = defaultdict(list)
     for path in paths:
@@ -297,27 +301,30 @@ def _infer_content_types_from_paths(


 def _expand_prefix_paths(
-        urls: List[S3Url],
-        content_type_provider: Callable[[str], ContentType],
-        **s3_client_kwargs,
+    urls: List[S3Url],
+    content_type_provider: Callable[[str], ContentType],
+    **s3_client_kwargs,
 ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
     assert len(urls) == 1, f"Expected 1 S3 prefix, found {len(urls)}."
-    objects = list(filter_objects_by_prefix(
-        urls[0].bucket,
-        urls[0].key,
-        **s3_client_kwargs
-    ))
-    paths = list(objects_to_paths(
-        urls[0].bucket,
-        objects,
-    ))
-    meta_cache: Dict[str, BlockMetadata] = {path: BlockMetadata(
-        num_rows=None,
-        size_bytes=objects[i]["ContentLength"],
-        schema=None,
-        input_files=[],
-        exec_stats=None,
-    ) for i, path in enumerate(paths)}
+    objects = list(
+        filter_objects_by_prefix(urls[0].bucket, urls[0].key, **s3_client_kwargs)
+    )
+    paths = list(
+        objects_to_paths(
+            urls[0].bucket,
+            objects,
+        )
+    )
+    meta_cache: Dict[str, BlockMetadata] = {
+        path: BlockMetadata(
+            num_rows=None,
+            size_bytes=objects[i]["ContentLength"],
+            schema=None,
+            input_files=[],
+            exec_stats=None,
+        )
+        for i, path in enumerate(paths)
+    }
     content_type_to_paths = _infer_content_types_from_paths(
         paths,
         content_type_provider,
@@ -326,13 +333,13 @@ def _expand_prefix_paths(


 def _expand_paths_by_content_type(
-        base_paths: Union[str, List[str]],
-        base_urls: List[S3Url],
-        content_type_provider: Callable[[str], ContentType],
-        path_type: S3PathType,
-        user_fs: Optional[Union[S3FileSystem, s3fs.S3FileSystem]],
-        resolved_fs: S3FileSystem,
-        **s3_client_kwargs,
+    base_paths: Union[str, List[str]],
+    base_urls: List[S3Url],
+    content_type_provider: Callable[[str], ContentType],
+    path_type: S3PathType,
+    user_fs: Optional[Union[S3FileSystem, s3fs.S3FileSystem]],
+    resolved_fs: S3FileSystem,
+    **s3_client_kwargs,
 ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
     if path_type == S3PathType.MANIFEST:
         content_type_to_paths, meta_provider = _expand_manifest_paths(
@@ -348,16 +355,22 @@ def _expand_paths_by_content_type(
         )
     elif path_type == S3PathType.FILES_AND_FOLDERS:
         # TODO(pdames): Only allow files and call get_object(file_path)?
-        base_paths, file_infos = DefaultFileMetadataProvider()\
-            .expand_paths(base_paths, resolved_fs)
+        base_paths, file_infos = DefaultFileMetadataProvider().expand_paths(
+            base_paths, resolved_fs
+        )
         file_sizes = [file_info.size for file_info in file_infos]
-        meta_provider = CachedFileMetadataProvider({path: BlockMetadata(
-            num_rows=None,
-            size_bytes=file_sizes[i],
-            schema=None,
-            input_files=[],
-            exec_stats=None,
-        ) for i, path in enumerate(base_paths)})
+        meta_provider = CachedFileMetadataProvider(
+            {
+                path: BlockMetadata(
+                    num_rows=None,
+                    size_bytes=file_sizes[i],
+                    schema=None,
+                    input_files=[],
+                    exec_stats=None,
+                )
+                for i, path in enumerate(base_paths)
+            }
+        )
         content_type_to_paths = _infer_content_types_from_paths(
             base_paths,
             content_type_provider,
@@ -374,28 +387,30 @@ def _expand_paths_by_content_type(
         )
         content_type_to_paths[content_type] = paths
     # normalize block metadata provider S3 file paths based on the filesystem
-    meta_provider = CachedFileMetadataProvider({
-        _normalize_s3_paths_for_filesystem(path, user_fs)[0][0]: metadata
-        for path, metadata in meta_provider.get_meta_cache().items()
-    })
+    meta_provider = CachedFileMetadataProvider(
+        {
+            _normalize_s3_paths_for_filesystem(path, user_fs)[0][0]: metadata
+            for path, metadata in meta_provider.get_meta_cache().items()
+        }
+    )
     return content_type_to_paths, meta_provider


 class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
     def prepare_read(
-            self,
-            parallelism: int,
-            paths: Union[str, List[str]],
-            content_type_provider: Callable[[str], ContentType],
-            path_type: S3PathType = S3PathType.MANIFEST,
-            filesystem: Optional[Union[S3FileSystem, s3fs.S3FileSystem]] = None,
-            columns: Optional[List[str]] = None,
-            schema: Optional[pa.Schema] = None,
-            unload_args: RedshiftUnloadTextArgs = RedshiftUnloadTextArgs(),
-            partitioning: HivePartitionParser = None,
-            open_stream_args: Optional[Dict[str, Any]] = None,
-            read_kwargs_provider: Optional[ReadKwargsProvider] = None,
-            **s3_client_kwargs,
+        self,
+        parallelism: int,
+        paths: Union[str, List[str]],
+        content_type_provider: Callable[[str], ContentType],
+        path_type: S3PathType = S3PathType.MANIFEST,
+        filesystem: Optional[Union[S3FileSystem, s3fs.S3FileSystem]] = None,
+        columns: Optional[List[str]] = None,
+        schema: Optional[pa.Schema] = None,
+        unload_args: RedshiftUnloadTextArgs = RedshiftUnloadTextArgs(),
+        partitioning: HivePartitionParser = None,
+        open_stream_args: Optional[Dict[str, Any]] = None,
+        read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+        **s3_client_kwargs,
     ) -> List[ReadTask]:
         # default to pyarrow.fs.S3FileSystem if no filesystem given
         if filesystem is None:
@@ -445,7 +460,8 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
                 prepare_read_kwargs["columns"] = columns
             elif content_type in DELIMITED_TEXT_CONTENT_TYPES:
                 prepare_read_kwargs.update(
-                    unload_args.to_arrow_reader_kwargs(columns, schema))
+                    unload_args.to_arrow_reader_kwargs(columns, schema)
+                )
             else:
                 raise NotImplementedError(f"Unsupported content type: {content_type}")
             # merge any provided reader kwargs for this content type with those
@@ -464,19 +480,18 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
         return all_read_tasks

     def do_write(
-            self,
-            blocks: List[ObjectRef[Block]],
-            metadata: List[BlockMetadata],
-            path: str,
-            dataset_uuid: str,
-            filesystem: Optional[FileSystem] = None,
-            try_create_dir: bool = True,
-            open_stream_args: Optional[Dict[str, Any]] = None,
-            block_path_provider: BlockWritePathProvider =
-            DefaultBlockWritePathProvider(),
-            write_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
-            _block_udf: Optional[Callable[[Block], Block]] = None,
-            **write_args,
+        self,
+        blocks: List[ObjectRef[Block]],
+        metadata: List[BlockMetadata],
+        path: str,
+        dataset_uuid: str,
+        filesystem: Optional[FileSystem] = None,
+        try_create_dir: bool = True,
+        open_stream_args: Optional[Dict[str, Any]] = None,
+        block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
+        write_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
+        _block_udf: Optional[Callable[[Block], Block]] = None,
+        **write_args,
     ) -> List[ObjectRef[WriteResult]]:
         if filesystem is None:
             filesystem = S3FileSystem()
@@ -484,8 +499,7 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
         paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
         assert len(paths) == 1, f"Expected 1 write path, found {len(paths)}."
         path = paths[0]
-        block_path_provider = CapturingBlockWritePathProvider(
-            block_path_provider)
+        block_path_provider = CapturingBlockWritePathProvider(block_path_provider)
         writer = ParquetBaseDatasource()
         write_results = writer.do_write(
             blocks,
@@ -513,21 +527,21 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
         write_results.append(rwr_obj_ref)
         return write_results

-    def on_write_complete(self, write_results: List[WriteResult], **kwargs) \
-            -> None:
+    def on_write_complete(self, write_results: List[WriteResult], **kwargs) -> None:
         # TODO (pdames): time latency of this operation - overall redshift write times
         # are 2-3x pure read_parquet_fast() times
         # restore the write operation summary from the last write result
-        result: RedshiftWriteResult = write_results[len(write_results)-1]
+        result: RedshiftWriteResult = write_results[len(write_results) - 1]
         write_path_args = result.block_write_path_provider.write_path_kwargs
         blocks_written = len(write_path_args)
         expected_blocks_written = len(result.metadata)
         # TODO(pdames): Corner cases where mismatch is expected? Emply blocks?
         # Blocks filtered/split/merged to more/less write paths?
-        assert blocks_written == expected_blocks_written, \
-            f"Dataset write result validation failed. Found " \
-            f"{blocks_written}/{expected_blocks_written} Dataset blocks " \
+        assert blocks_written == expected_blocks_written, (
+            f"Dataset write result validation failed. Found "
+            f"{blocks_written}/{expected_blocks_written} Dataset blocks "
             f"written. Refusing to commit Redshift Manifest."
+        )
         manifest_entries = ManifestEntryList()
         for block_idx, path in enumerate(write_path_args.keys()):
             file_info = result.filesystem.get_file_info(path)
@@ -554,11 +568,11 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
         manifest_path = f"{result.path}/manifest"
         logger.debug(f"Write succeeded for Dataset ID: {result.dataset_uuid}")
         with result.filesystem.open_output_stream(
-                manifest_path,
-                # Also See:
-                # docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonRequestHeaders.html
-                # Arrow s3fs.cc: tinyurl.com/2axa6m9m
-                metadata={"Content-Type": ContentType.JSON.value},
+            manifest_path,
+            # Also See:
+            # docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonRequestHeaders.html
+            # Arrow s3fs.cc: tinyurl.com/2axa6m9m
+            metadata={"Content-Type": ContentType.JSON.value},
         ) as f:
             f.write(json.dumps(manifest).encode("utf-8"))
         logger.debug(f"Manifest committed to: {manifest_path}")
deltacat/io/dataset.py CHANGED
@@ -1,18 +1,16 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations

+from typing import Any, Callable, Dict, Optional, TypeVar, Union, cast
+
 import pyarrow as pa
 import s3fs
-
-from typing import Optional, Union, Callable, Dict, Any, cast, TypeVar
-
 from ray.data import Dataset
-from ray.data.datasource import DefaultBlockWritePathProvider, \
-    BlockWritePathProvider
+from ray.data.datasource import BlockWritePathProvider, DefaultBlockWritePathProvider

 from deltacat.io.aws.redshift.redshift_datasource import RedshiftDatasource

-T = TypeVar('T')
+T = TypeVar("T")


 class DeltacatDataset(Dataset[T]):
@@ -23,17 +21,16 @@ class DeltacatDataset(Dataset[T]):
         return cast(DeltacatDataset[T], dataset)

     def write_redshift(
-            self,
-            path: str,
-            *,
-            filesystem: Optional[
-                Union[pa.fs.FileSystem, s3fs.S3FileSystem]] = None,
-            try_create_dir: bool = True,
-            arrow_open_stream_args: Optional[Dict[str, Any]] = None,
-            block_path_provider: BlockWritePathProvider =
-            DefaultBlockWritePathProvider(),
-            arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
-            **arrow_parquet_args) -> None:
+        self,
+        path: str,
+        *,
+        filesystem: Optional[Union[pa.fs.FileSystem, s3fs.S3FileSystem]] = None,
+        try_create_dir: bool = True,
+        arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+        block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
+        arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
+        **arrow_parquet_args,
+    ) -> None:
         """Writes the dataset to Parquet files and commits a Redshift manifest
         back to S3 indexing the files written. The output can be loaded into
         Redshift by providing it to the Redshift COPY command, or via AWS Data
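
A hedged usage sketch for the write_redshift signature reformatted above. The bucket paths and Parquet options are placeholders, a running Ray cluster with AWS credentials and s3fs is assumed, and dc.io.read_redshift (shown in the next file) is used only to obtain a DeltacatDataset.

    import deltacat as dc

    # Sketch only; paths are hypothetical, and the extra Parquet kwargs are
    # assumed to be forwarded to Arrow's Parquet writer as the signature suggests.
    ds = dc.io.read_redshift("s3://my-bucket/unload/manifest")
    ds.write_redshift(
        "s3://my-bucket/curated",
        try_create_dir=True,
        arrow_parquet_args_fn=lambda: {"compression": "snappy"},
    )
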
deltacat/io/read_api.py CHANGED
@@ -1,35 +1,38 @@
-import s3fs
-import pyarrow as pa
-from deltacat.utils.common import ReadKwargsProvider
+from typing import Any, Callable, Dict, List, Optional, Union

+import pyarrow as pa
+import s3fs
 from ray.data import read_datasource
 from ray.data._internal.arrow_block import ArrowRow

 from deltacat import ContentType
+from deltacat.io.aws.redshift.redshift_datasource import (
+    HivePartitionParser,
+    RedshiftDatasource,
+    RedshiftUnloadTextArgs,
+    S3PathType,
+)
 from deltacat.io.dataset import DeltacatDataset
-from deltacat.io.aws.redshift.redshift_datasource import \
-    RedshiftDatasource, RedshiftUnloadTextArgs, S3PathType, HivePartitionParser
-
-from typing import Optional, Union, List, Dict, Any, Callable
+from deltacat.utils.common import ReadKwargsProvider


 def read_redshift(
-        paths: Union[str, List[str]],
-        *,
-        path_type: S3PathType = S3PathType.MANIFEST,
-        filesystem: Optional[
-            Union[pa.fs.S3FileSystem, s3fs.S3FileSystem]] = None,
-        columns: Optional[List[str]] = None,
-        schema: Optional[pa.Schema] = None,
-        unload_text_args: RedshiftUnloadTextArgs = RedshiftUnloadTextArgs(),
-        partitioning: HivePartitionParser = None,
-        content_type_provider: Callable[[str], ContentType] = lambda p:
-        ContentType.PARQUET if p.endswith(".parquet") else ContentType.CSV,
-        parallelism: int = 200,
-        ray_remote_args: Dict[str, Any] = None,
-        arrow_open_stream_args: Optional[Dict[str, Any]] = None,
-        pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
-        **kwargs,
+    paths: Union[str, List[str]],
+    *,
+    path_type: S3PathType = S3PathType.MANIFEST,
+    filesystem: Optional[Union[pa.fs.S3FileSystem, s3fs.S3FileSystem]] = None,
+    columns: Optional[List[str]] = None,
+    schema: Optional[pa.Schema] = None,
+    unload_text_args: RedshiftUnloadTextArgs = RedshiftUnloadTextArgs(),
+    partitioning: HivePartitionParser = None,
+    content_type_provider: Callable[[str], ContentType] = lambda p: ContentType.PARQUET
+    if p.endswith(".parquet")
+    else ContentType.CSV,
+    parallelism: int = 200,
+    ray_remote_args: Dict[str, Any] = None,
+    arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    **kwargs,
 ) -> DeltacatDataset[ArrowRow]:
     """Reads Redshift UNLOAD results from either S3 Parquet or delimited text
     files into a Ray Dataset.
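
A hedged usage sketch of the read_redshift signature reformatted above, combining a prefix read with the HivePartitionParser and RedshiftUnloadTextArgs classes shown earlier in this diff. Bucket names and the partition layout are placeholders.

    import deltacat as dc
    from deltacat.io.aws.redshift.redshift_datasource import (
        HivePartitionParser,
        RedshiftUnloadTextArgs,
        S3PathType,
    )

    # Read gzip-compressed CSV UNLOAD output under a key prefix, keeping only
    # partitions where year=2022 and month=January (hypothetical hive-style layout).
    ds = dc.io.read_redshift(
        "s3://my-bucket/unload/",
        path_type=S3PathType.PREFIX,
        unload_text_args=RedshiftUnloadTextArgs(csv=True, header=True, gzip=True),
        partitioning=HivePartitionParser(
            base_dir="s3://my-bucket/unload/",
            filter_fn=lambda kv: kv.get("year") == "2022" and kv.get("month") == "January",
        ),
    )
    print(ds.count())
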
@@ -38,7 +41,7 @@ def read_redshift(
     >>> # Read all files contained in a Redshift Manifest:
     >>> import deltacat as dc
     >>> dc.io.read_redshift("/bucket/dir/manifest")
-
+
     >>> # Read all files matching the given key prefix. If this prefix
     >>> # refers to multiple files, like s3://bucket/data.parquet,
     >>> # s3://bucket/data.1.csv, etc. then all will be read. The dataset
@@ -55,19 +58,19 @@ def read_redshift(
     >>> dc.io.read_redshift(
     >>>     "/bucket/dir",
     >>>     path_type=S3PathType.PREFIX)
-
+
     >>> # Read multiple files and folders:
     >>> dc.io.read_redshift(
-    >>>     ["/bucket/file1", "/bucket/folder1/"],
+    >>>     ["/bucket/file1", "/bucket/folder1/"],
     >>>     path_type=S3PathType.FILES_AND_FOLDERS)

     >>> # Read multiple Parquet and CSV files. The dataset schema will be
-    >>> # inferred from the first parquet file and used for explicit type
+    >>> # inferred from the first parquet file and used for explicit type
     >>> # conversion of all CSV files:
     >>> dc.io.read_redshift(
     >>>     ["/bucket/file.parquet", "/bucket/file.csv"],
     >>>     path_type=S3PathType.FILES_AND_FOLDERS)
-
+
     Args:
         paths: Paths to S3 files and folders to read. If `path_type` is
             `MANIFEST` then this must be an S3 Redshift Manifest JSON file. If
@@ -93,27 +96,27 @@ def read_redshift(
             discovered is used instead.
         unload_text_args: Arguments used when running Redshift `UNLOAD` to
             text file formats (e.g. CSV). These arguments ensure that all input
-            text files will be correctly parsed. If not specified, then all
-            text files read are assumed to use Redshift UNLOAD's default
+            text files will be correctly parsed. If not specified, then all
+            text files read are assumed to use Redshift UNLOAD's default
             pipe-delimited text format.
         partition_base_dir: Base directory to start searching for partitions
             (exclusive). File paths outside of this directory will not be parsed
             for partitions and automatically added to the dataset without passing
             through any partition filter. Specify `None` or an empty string to
             search for partitions in all file path directories.
-        partition_filter_fn: Callback used to filter `PARTITION` columns. Receives a
+        partition_filter_fn: Callback used to filter `PARTITION` columns. Receives a
             dictionary mapping partition keys to values as input, returns `True` to
             read a partition, and `False` to skip it. Each partition key and value
             is a string parsed directly from an S3 key using hive-style
             partition directory names of the form "{key}={value}". For example:
-            ``lambda x:
+            ``lambda x:
             True if x["month"] == "January" and x["year"] == "2022" else False``
         content_type_provider: Takes a file path as input and returns the file
             content type as output.
         parallelism: The requested parallelism of the read. Parallelism may be
             limited by the number of files of the dataset.
         ray_remote_args: kwargs passed to `ray.remote` in the read tasks.
-        arrow_open_stream_args: kwargs passed to to
+        arrow_open_stream_args: kwargs passed to to
             `pa.fs.open_input_stream()`.
         pa_read_func_kwargs_provider: Callback that takes a `ContentType` value
             string as input, and provides read options to pass to either