deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/io/aws/redshift/redshift_datasource.py
CHANGED
@@ -1,40 +1,45 @@
 import json
 import logging
+from collections import OrderedDict, defaultdict
+from enum import Enum
+from errno import ENOENT
 from os import strerror
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import pyarrow as pa
 import ray
 import s3fs
-
-from errno import ENOENT
-from enum import Enum
-from collections import OrderedDict, defaultdict
-
-from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
-from pyarrow.fs import FileType, FileSystem, S3FileSystem
 from pyarrow import parquet as pq
-
-from ray.data.
-
+from pyarrow.fs import FileSystem, FileType, S3FileSystem
+from ray.data.block import Block, BlockMetadata
+from ray.data.datasource import (
+    BlockWritePathProvider,
+    CSVDatasource,
+    DefaultBlockWritePathProvider,
+    DefaultFileMetadataProvider,
+    ParquetBaseDatasource,
+    ParquetMetadataProvider,
+    PathPartitionParser,
+)
+from ray.data.datasource.datasource import ArrowRow, Datasource, ReadTask, WriteResult
+from ray.data.datasource.file_based_datasource import _resolve_paths_and_filesystem
 from ray.data.datasource.file_meta_provider import FastFileMetadataProvider
-from ray.data.datasource.partitioning import PartitionStyle
 from ray.types import ObjectRef
-from ray.data.datasource import CSVDatasource, BlockWritePathProvider, \
-    DefaultBlockWritePathProvider, ParquetMetadataProvider, \
-    DefaultFileMetadataProvider, ParquetBaseDatasource, PathPartitionParser
-from ray.data.datasource.datasource import ReadTask, WriteResult, Datasource, \
-    ArrowRow
-from ray.data.block import Block, BlockMetadata
-
-from deltacat import ContentType, ContentEncoding
-from deltacat import logs
-from deltacat.aws.redshift.model.manifest import Manifest, ManifestEntryList, \
-    ManifestEntry, ManifestMeta
-
-from typing import Any, Callable, List, Optional, Union, Dict, Tuple

-from deltacat
-
+from deltacat import ContentEncoding, ContentType, logs
+from deltacat.aws.redshift.model.manifest import (
+    Manifest,
+    ManifestEntry,
+    ManifestEntryList,
+    ManifestMeta,
+)
+from deltacat.aws.s3u import (
+    S3Url,
+    filter_objects_by_prefix,
+    objects_to_paths,
+    parse_s3_url,
+)
+from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
 from deltacat.utils.common import ReadKwargsProvider

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -43,15 +48,12 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 class CapturingBlockWritePathProvider(BlockWritePathProvider):
     """Delegating block write path provider that saves an ordered dictionary of
     input keyword arguments for every block write path returned."""
+
     def __init__(self, block_write_path_provider: BlockWritePathProvider):
         self.block_write_path_provider = block_write_path_provider
         self.write_path_kwargs: Dict[str, Dict[str, Any]] = OrderedDict()

-    def _get_write_path_for_block(
-            self,
-            base_path: str,
-            *args,
-            **kwargs) -> str:
+    def _get_write_path_for_block(self, base_path: str, *args, **kwargs) -> str:
         write_path = self.block_write_path_provider(
             base_path,
             *args,
@@ -73,10 +75,10 @@ class CachedFileMetadataProvider(
         return self._meta_cache

     def _get_block_metadata(
-
-
-
-
+        self,
+        paths: List[str],
+        schema: Optional[Union[type, pa.Schema]],
+        **kwargs,
     ) -> BlockMetadata:
         agg_block_metadata = BlockMetadata(
             num_rows=0,
@@ -103,9 +105,9 @@ class CachedFileMetadataProvider(

 class HivePartitionParser(PathPartitionParser):
     def __init__(
-
-
-
+        self,
+        base_dir: Optional[str] = None,
+        filter_fn: Optional[Callable[[Dict[str, str]], bool]] = None,
     ):
         super(HivePartitionParser, self).__init__(
             base_dir=base_dir,
@@ -115,17 +117,17 @@ class HivePartitionParser(PathPartitionParser):

 class RedshiftUnloadTextArgs:
     def __init__(
-
-
-
-
-
-
-
-
-
-
-
+        self,
+        csv: bool = False,
+        header: bool = False,
+        delimiter: Optional[str] = None,
+        bzip2: bool = False,
+        gzip: bool = False,
+        zstd: bool = False,
+        add_quotes: Optional[bool] = None,
+        null_as: str = "",
+        escape: bool = False,
+        fixed_width: bool = False,
     ):
         self.header = header
         self.delimiter = delimiter if delimiter else "," if csv else "|"
@@ -149,20 +151,22 @@ class RedshiftUnloadTextArgs:
                 raise ValueError(
                     f"Multiple Redshift UNLOAD compression types specified "
                     f"({codecs_enabled}). Please ensure that only one "
-                    f"compression type is set and try again."
+                    f"compression type is set and try again."
+                )
             if flag:
                 arrow_compression_codec_name = encoding
         return arrow_compression_codec_name

     def to_arrow_reader_kwargs(
-
-
-            schema: Optional[pa.Schema]) -> Dict[str, Any]:
+        self, include_columns: Optional[List[str]], schema: Optional[pa.Schema]
+    ) -> Dict[str, Any]:
         from pyarrow import csv
+
         if self.fixed_width:
             raise NotImplementedError(
                 "Redshift text files unloaded with FIXEDWIDTH are not "
-                "currently supported."
+                "currently supported."
+            )
         open_stream_args = {}
         arrow_compression_codec_name = self._get_arrow_compression_codec_name()
         if arrow_compression_codec_name:
@@ -217,8 +221,8 @@ class RedshiftWriteResult:


 def _normalize_s3_paths_for_filesystem(
-
-
+    paths: Union[str, List[str]],
+    filesystem: Union[S3FileSystem, s3fs.S3FileSystem],
 ) -> Tuple[List[str], List[S3Url]]:
     if isinstance(paths, str):
         paths = [paths]
@@ -234,9 +238,9 @@ def _normalize_s3_paths_for_filesystem(


 def _read_manifest_entry_paths(
-
-
-
+    entries: ManifestEntryList,
+    manifest_content_type: Optional[str],
+    content_type_provider: Callable[[str], ContentType],
 ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
     # support manifests with heterogenous content types
     content_type_to_paths = defaultdict(list)
@@ -261,9 +265,9 @@ def _read_manifest_entry_paths(


 def _expand_manifest_paths(
-
-
-
+    paths: List[str],
+    filesystem: Optional[Union[S3FileSystem, s3fs.S3FileSystem]],
+    content_type_provider: Callable[[str], ContentType],
 ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
     assert len(paths) == 1, f"Expected 1 manifest path, found {len(paths)}."
     path = paths[0]
@@ -286,8 +290,8 @@ def _expand_manifest_paths(


 def _infer_content_types_from_paths(
-
-
+    paths: List[str],
+    content_type_provider: Callable[[str], ContentType],
 ) -> Dict[ContentType, List[str]]:
     content_type_to_paths = defaultdict(list)
     for path in paths:
@@ -297,27 +301,30 @@ def _infer_content_types_from_paths(


 def _expand_prefix_paths(
-
-
-
+    urls: List[S3Url],
+    content_type_provider: Callable[[str], ContentType],
+    **s3_client_kwargs,
 ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
     assert len(urls) == 1, f"Expected 1 S3 prefix, found {len(urls)}."
-    objects = list(
-        urls[0].bucket,
-
-
-
-
-
-
-    )
-    meta_cache: Dict[str, BlockMetadata] = {
-
-
-
-
-
-
+    objects = list(
+        filter_objects_by_prefix(urls[0].bucket, urls[0].key, **s3_client_kwargs)
+    )
+    paths = list(
+        objects_to_paths(
+            urls[0].bucket,
+            objects,
+        )
+    )
+    meta_cache: Dict[str, BlockMetadata] = {
+        path: BlockMetadata(
+            num_rows=None,
+            size_bytes=objects[i]["ContentLength"],
+            schema=None,
+            input_files=[],
+            exec_stats=None,
+        )
+        for i, path in enumerate(paths)
+    }
     content_type_to_paths = _infer_content_types_from_paths(
         paths,
         content_type_provider,
@@ -326,13 +333,13 @@ def _expand_prefix_paths(


 def _expand_paths_by_content_type(
-
-
-
-
-
-
-
+    base_paths: Union[str, List[str]],
+    base_urls: List[S3Url],
+    content_type_provider: Callable[[str], ContentType],
+    path_type: S3PathType,
+    user_fs: Optional[Union[S3FileSystem, s3fs.S3FileSystem]],
+    resolved_fs: S3FileSystem,
+    **s3_client_kwargs,
 ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
     if path_type == S3PathType.MANIFEST:
         content_type_to_paths, meta_provider = _expand_manifest_paths(
@@ -348,16 +355,22 @@ def _expand_paths_by_content_type(
         )
     elif path_type == S3PathType.FILES_AND_FOLDERS:
         # TODO(pdames): Only allow files and call get_object(file_path)?
-        base_paths, file_infos = DefaultFileMetadataProvider()
-
+        base_paths, file_infos = DefaultFileMetadataProvider().expand_paths(
+            base_paths, resolved_fs
+        )
         file_sizes = [file_info.size for file_info in file_infos]
-        meta_provider = CachedFileMetadataProvider(
-
-
-
-
-
-
+        meta_provider = CachedFileMetadataProvider(
+            {
+                path: BlockMetadata(
+                    num_rows=None,
+                    size_bytes=file_sizes[i],
+                    schema=None,
+                    input_files=[],
+                    exec_stats=None,
+                )
+                for i, path in enumerate(base_paths)
+            }
+        )
         content_type_to_paths = _infer_content_types_from_paths(
             base_paths,
             content_type_provider,
@@ -374,28 +387,30 @@ def _expand_paths_by_content_type(
         )
         content_type_to_paths[content_type] = paths
     # normalize block metadata provider S3 file paths based on the filesystem
-    meta_provider = CachedFileMetadataProvider(
-
-
-
+    meta_provider = CachedFileMetadataProvider(
+        {
+            _normalize_s3_paths_for_filesystem(path, user_fs)[0][0]: metadata
+            for path, metadata in meta_provider.get_meta_cache().items()
+        }
+    )
     return content_type_to_paths, meta_provider


 class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
     def prepare_read(
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self,
+        parallelism: int,
+        paths: Union[str, List[str]],
+        content_type_provider: Callable[[str], ContentType],
+        path_type: S3PathType = S3PathType.MANIFEST,
+        filesystem: Optional[Union[S3FileSystem, s3fs.S3FileSystem]] = None,
+        columns: Optional[List[str]] = None,
+        schema: Optional[pa.Schema] = None,
+        unload_args: RedshiftUnloadTextArgs = RedshiftUnloadTextArgs(),
+        partitioning: HivePartitionParser = None,
+        open_stream_args: Optional[Dict[str, Any]] = None,
+        read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+        **s3_client_kwargs,
     ) -> List[ReadTask]:
         # default to pyarrow.fs.S3FileSystem if no filesystem given
         if filesystem is None:
@@ -445,7 +460,8 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
                 prepare_read_kwargs["columns"] = columns
             elif content_type in DELIMITED_TEXT_CONTENT_TYPES:
                 prepare_read_kwargs.update(
-                    unload_args.to_arrow_reader_kwargs(columns, schema)
+                    unload_args.to_arrow_reader_kwargs(columns, schema)
+                )
             else:
                 raise NotImplementedError(f"Unsupported content type: {content_type}")
             # merge any provided reader kwargs for this content type with those
@@ -464,19 +480,18 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
         return all_read_tasks

     def do_write(
-
-
-
-
-
-
-
-
-
-
-
-
-        **write_args,
+        self,
+        blocks: List[ObjectRef[Block]],
+        metadata: List[BlockMetadata],
+        path: str,
+        dataset_uuid: str,
+        filesystem: Optional[FileSystem] = None,
+        try_create_dir: bool = True,
+        open_stream_args: Optional[Dict[str, Any]] = None,
+        block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
+        write_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
+        _block_udf: Optional[Callable[[Block], Block]] = None,
+        **write_args,
     ) -> List[ObjectRef[WriteResult]]:
         if filesystem is None:
             filesystem = S3FileSystem()
@@ -484,8 +499,7 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
         paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
         assert len(paths) == 1, f"Expected 1 write path, found {len(paths)}."
         path = paths[0]
-        block_path_provider = CapturingBlockWritePathProvider(
-            block_path_provider)
+        block_path_provider = CapturingBlockWritePathProvider(block_path_provider)
         writer = ParquetBaseDatasource()
         write_results = writer.do_write(
             blocks,
@@ -513,21 +527,21 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
             write_results.append(rwr_obj_ref)
         return write_results

-    def on_write_complete(self, write_results: List[WriteResult], **kwargs)
-            -> None:
+    def on_write_complete(self, write_results: List[WriteResult], **kwargs) -> None:
         # TODO (pdames): time latency of this operation - overall redshift write times
         # are 2-3x pure read_parquet_fast() times
         # restore the write operation summary from the last write result
-        result: RedshiftWriteResult = write_results[len(write_results)-1]
+        result: RedshiftWriteResult = write_results[len(write_results) - 1]
         write_path_args = result.block_write_path_provider.write_path_kwargs
         blocks_written = len(write_path_args)
         expected_blocks_written = len(result.metadata)
         # TODO(pdames): Corner cases where mismatch is expected? Emply blocks?
         # Blocks filtered/split/merged to more/less write paths?
-        assert blocks_written == expected_blocks_written,
-            f"Dataset write result validation failed. Found "
-            f"{blocks_written}/{expected_blocks_written} Dataset blocks "
+        assert blocks_written == expected_blocks_written, (
+            f"Dataset write result validation failed. Found "
+            f"{blocks_written}/{expected_blocks_written} Dataset blocks "
             f"written. Refusing to commit Redshift Manifest."
+        )
         manifest_entries = ManifestEntryList()
         for block_idx, path in enumerate(write_path_args.keys()):
             file_info = result.filesystem.get_file_info(path)
@@ -554,11 +568,11 @@ class RedshiftDatasource(Datasource[Union[ArrowRow, Any]]):
         manifest_path = f"{result.path}/manifest"
         logger.debug(f"Write succeeded for Dataset ID: {result.dataset_uuid}")
         with result.filesystem.open_output_stream(
-
-
-
-
-
+            manifest_path,
+            # Also See:
+            # docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonRequestHeaders.html
+            # Arrow s3fs.cc: tinyurl.com/2axa6m9m
+            metadata={"Content-Type": ContentType.JSON.value},
         ) as f:
             f.write(json.dumps(manifest).encode("utf-8"))
             logger.debug(f"Manifest committed to: {manifest_path}")
deltacat/io/dataset.py
CHANGED
@@ -1,18 +1,16 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations

+from typing import Any, Callable, Dict, Optional, TypeVar, Union, cast
+
 import pyarrow as pa
 import s3fs
-
-from typing import Optional, Union, Callable, Dict, Any, cast, TypeVar
-
 from ray.data import Dataset
-from ray.data.datasource import
-    BlockWritePathProvider
+from ray.data.datasource import BlockWritePathProvider, DefaultBlockWritePathProvider

 from deltacat.io.aws.redshift.redshift_datasource import RedshiftDatasource

-T = TypeVar(
+T = TypeVar("T")


 class DeltacatDataset(Dataset[T]):
@@ -23,17 +21,16 @@ class DeltacatDataset(Dataset[T]):
         return cast(DeltacatDataset[T], dataset)

     def write_redshift(
-
-
-
-
-
-
-
-
-
-
-        **arrow_parquet_args) -> None:
+        self,
+        path: str,
+        *,
+        filesystem: Optional[Union[pa.fs.FileSystem, s3fs.S3FileSystem]] = None,
+        try_create_dir: bool = True,
+        arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+        block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
+        arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
+        **arrow_parquet_args,
+    ) -> None:
         """Writes the dataset to Parquet files and commits a Redshift manifest
         back to S3 indexing the files written. The output can be loaded into
         Redshift by providing it to the Redshift COPY command, or via AWS Data
deltacat/io/read_api.py
CHANGED
@@ -1,35 +1,38 @@
-import
-import pyarrow as pa
-from deltacat.utils.common import ReadKwargsProvider
+from typing import Any, Callable, Dict, List, Optional, Union

+import pyarrow as pa
+import s3fs
 from ray.data import read_datasource
 from ray.data._internal.arrow_block import ArrowRow

 from deltacat import ContentType
+from deltacat.io.aws.redshift.redshift_datasource import (
+    HivePartitionParser,
+    RedshiftDatasource,
+    RedshiftUnloadTextArgs,
+    S3PathType,
+)
 from deltacat.io.dataset import DeltacatDataset
-from deltacat.
-    RedshiftDatasource, RedshiftUnloadTextArgs, S3PathType, HivePartitionParser
-
-from typing import Optional, Union, List, Dict, Any, Callable
+from deltacat.utils.common import ReadKwargsProvider


 def read_redshift(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    paths: Union[str, List[str]],
+    *,
+    path_type: S3PathType = S3PathType.MANIFEST,
+    filesystem: Optional[Union[pa.fs.S3FileSystem, s3fs.S3FileSystem]] = None,
+    columns: Optional[List[str]] = None,
+    schema: Optional[pa.Schema] = None,
+    unload_text_args: RedshiftUnloadTextArgs = RedshiftUnloadTextArgs(),
+    partitioning: HivePartitionParser = None,
+    content_type_provider: Callable[[str], ContentType] = lambda p: ContentType.PARQUET
+    if p.endswith(".parquet")
+    else ContentType.CSV,
+    parallelism: int = 200,
+    ray_remote_args: Dict[str, Any] = None,
+    arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    **kwargs,
 ) -> DeltacatDataset[ArrowRow]:
     """Reads Redshift UNLOAD results from either S3 Parquet or delimited text
     files into a Ray Dataset.
@@ -38,7 +41,7 @@ def read_redshift(
     >>> # Read all files contained in a Redshift Manifest:
     >>> import deltacat as dc
     >>> dc.io.read_redshift("/bucket/dir/manifest")
-
+
     >>> # Read all files matching the given key prefix. If this prefix
     >>> # refers to multiple files, like s3://bucket/data.parquet,
     >>> # s3://bucket/data.1.csv, etc. then all will be read. The dataset
@@ -55,19 +58,19 @@ def read_redshift(
     >>> dc.io.read_redshift(
     >>>     "/bucket/dir",
     >>>     path_type=S3PathType.PREFIX)
-
+
     >>> # Read multiple files and folders:
     >>> dc.io.read_redshift(
-    >>> ["/bucket/file1", "/bucket/folder1/"],
+    >>>     ["/bucket/file1", "/bucket/folder1/"],
     >>>     path_type=S3PathType.FILES_AND_FOLDERS)

     >>> # Read multiple Parquet and CSV files. The dataset schema will be
-    >>> # inferred from the first parquet file and used for explicit type
+    >>> # inferred from the first parquet file and used for explicit type
     >>> # conversion of all CSV files:
     >>> dc.io.read_redshift(
     >>>     ["/bucket/file.parquet", "/bucket/file.csv"],
     >>>     path_type=S3PathType.FILES_AND_FOLDERS)
-
+
     Args:
         paths: Paths to S3 files and folders to read. If `path_type` is
             `MANIFEST` then this must be an S3 Redshift Manifest JSON file. If
@@ -93,27 +96,27 @@ def read_redshift(
             discovered is used instead.
         unload_text_args: Arguments used when running Redshift `UNLOAD` to
             text file formats (e.g. CSV). These arguments ensure that all input
-            text files will be correctly parsed. If not specified, then all
-            text files read are assumed to use Redshift UNLOAD's default
+            text files will be correctly parsed. If not specified, then all
+            text files read are assumed to use Redshift UNLOAD's default
             pipe-delimited text format.
         partition_base_dir: Base directory to start searching for partitions
             (exclusive). File paths outside of this directory will not be parsed
             for partitions and automatically added to the dataset without passing
             through any partition filter. Specify `None` or an empty string to
            search for partitions in all file path directories.
-        partition_filter_fn: Callback used to filter `PARTITION` columns. Receives a
+        partition_filter_fn: Callback used to filter `PARTITION` columns. Receives a
            dictionary mapping partition keys to values as input, returns `True` to
            read a partition, and `False` to skip it. Each partition key and value
            is a string parsed directly from an S3 key using hive-style
            partition directory names of the form "{key}={value}". For example:
-            ``lambda x:
+            ``lambda x:
            True if x["month"] == "January" and x["year"] == "2022" else False``
        content_type_provider: Takes a file path as input and returns the file
            content type as output.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the number of files of the dataset.
        ray_remote_args: kwargs passed to `ray.remote` in the read tasks.
-        arrow_open_stream_args: kwargs passed to to
+        arrow_open_stream_args: kwargs passed to to
            `pa.fs.open_input_stream()`.
        pa_read_func_kwargs_provider: Callback that takes a `ContentType` value
            string as input, and provides read options to pass to either
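As a consolidated illustration of the reworked keyword-only `read_redshift` signature above, a minimal sketch of reading an UNLOAD manifest (the bucket path is a placeholder; all other arguments rely on the defaults shown in the signature):

    import deltacat as dc
    from deltacat.io.aws.redshift.redshift_datasource import S3PathType

    # Read every file listed in a Redshift UNLOAD manifest into a Ray-backed
    # DeltacatDataset of Arrow rows; paths ending in ".parquet" are treated as
    # Parquet and everything else as CSV by the default content_type_provider.
    ds = dc.io.read_redshift(
        "s3://example-bucket/unload/manifest",  # placeholder manifest path
        path_type=S3PathType.MANIFEST,
        parallelism=64,
    )
    print(ds.count())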
|