deltacat 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (105)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +188 -218
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +259 -316
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +152 -259
  22. deltacat/compute/compactor/steps/hash_bucket.py +57 -73
  23. deltacat/compute/compactor/steps/materialize.py +138 -99
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +131 -90
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -42
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +8 -10
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  64. deltacat/types/media.py +3 -4
  65. deltacat/types/tables.py +31 -21
  66. deltacat/utils/common.py +5 -11
  67. deltacat/utils/numpy.py +20 -22
  68. deltacat/utils/pandas.py +73 -100
  69. deltacat/utils/performance.py +3 -9
  70. deltacat/utils/placement.py +276 -231
  71. deltacat/utils/pyarrow.py +302 -89
  72. deltacat/utils/ray_utils/collections.py +2 -1
  73. deltacat/utils/ray_utils/concurrency.py +38 -32
  74. deltacat/utils/ray_utils/dataset.py +28 -28
  75. deltacat/utils/ray_utils/performance.py +5 -9
  76. deltacat/utils/ray_utils/runtime.py +9 -10
  77. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA +22 -12
  78. deltacat-0.1.11.dist-info/RECORD +110 -0
  79. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
  80. deltacat/autoscaler/events/__init__.py +0 -0
  81. deltacat/autoscaler/events/compaction/__init__.py +0 -0
  82. deltacat/autoscaler/events/compaction/cluster.py +0 -82
  83. deltacat/autoscaler/events/compaction/collections/__init__.py +0 -0
  84. deltacat/autoscaler/events/compaction/collections/partition_key_value.py +0 -36
  85. deltacat/autoscaler/events/compaction/dispatcher.py +0 -28
  86. deltacat/autoscaler/events/compaction/input.py +0 -27
  87. deltacat/autoscaler/events/compaction/process.py +0 -25
  88. deltacat/autoscaler/events/compaction/session_manager.py +0 -13
  89. deltacat/autoscaler/events/compaction/utils.py +0 -216
  90. deltacat/autoscaler/events/compaction/workflow.py +0 -303
  91. deltacat/autoscaler/events/dispatcher.py +0 -95
  92. deltacat/autoscaler/events/dynamodb/__init__.py +0 -0
  93. deltacat/autoscaler/events/dynamodb/event_store.py +0 -164
  94. deltacat/autoscaler/events/event_store.py +0 -55
  95. deltacat/autoscaler/events/exceptions.py +0 -6
  96. deltacat/autoscaler/events/processor.py +0 -177
  97. deltacat/autoscaler/events/session_manager.py +0 -25
  98. deltacat/autoscaler/events/states.py +0 -88
  99. deltacat/autoscaler/events/workflow.py +0 -54
  100. deltacat/autoscaler/node_group.py +0 -230
  101. deltacat/autoscaler/utils.py +0 -69
  102. deltacat-0.1.8.dist-info/RECORD +0 -131
  103. /deltacat/{autoscaler → tests/utils}/__init__.py +0 -0
  104. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
  105. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/aws/s3u.py CHANGED
@@ -1,51 +1,58 @@
-import ray
-import deltacat.aws.clients as aws_utils
 import logging
 import multiprocessing
-import s3fs
-import pyarrow as pa
-
 from functools import partial
+from typing import Any, Callable, Dict, Generator, List, Optional, Union
 from uuid import uuid4
 
-from ray.types import ObjectRef
-from ray.data.datasource import BlockWritePathProvider
+import pyarrow as pa
+import ray
+import s3fs
+from boto3.resources.base import ServiceResource
+from botocore.client import BaseClient
+from botocore.exceptions import ClientError
 from ray.data.block import Block, BlockAccessor, BlockMetadata
+from ray.data.datasource import BlockWritePathProvider
+from ray.types import ObjectRef
+from tenacity import (
+    Retrying,
+    retry_if_exception_type,
+    retry_if_not_exception_type,
+    stop_after_delay,
+    wait_random_exponential,
+)
 
+import deltacat.aws.clients as aws_utils
 from deltacat import logs
-from deltacat.storage import LocalTable, LocalDataset, DistributedDataset, \
-    Manifest, ManifestEntry, ManifestEntryList
 from deltacat.aws.constants import TIMEOUT_ERROR_CODES
-from deltacat.exceptions import RetryableError, NonRetryableError
-from deltacat.types.media import ContentType, ContentEncoding
-from deltacat.types.tables import TABLE_TYPE_TO_READER_FUNC, \
-    TABLE_CLASS_TO_SIZE_FUNC, get_table_length
-from deltacat.types.media import TableType
+from deltacat.exceptions import NonRetryableError, RetryableError
+from deltacat.storage import (
+    DistributedDataset,
+    LocalDataset,
+    LocalTable,
+    Manifest,
+    ManifestEntry,
+    ManifestEntryList,
+)
+from deltacat.types.media import ContentEncoding, ContentType, TableType
+from deltacat.types.tables import (
+    TABLE_CLASS_TO_SIZE_FUNC,
+    TABLE_TYPE_TO_READER_FUNC,
+    get_table_length,
+)
 from deltacat.utils.common import ReadKwargsProvider
 
-from boto3.resources.base import ServiceResource
-from botocore.client import BaseClient
-from botocore.exceptions import ClientError
-from tenacity import Retrying
-from tenacity import wait_random_exponential
-from tenacity import stop_after_delay
-from tenacity import retry_if_exception_type, retry_if_not_exception_type
-
-from typing import Any, Callable, Dict, List, Optional, Generator, Union, Tuple
-
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
+# TODO(raghumdani): refactor redshift datasource to reuse the
+# same module for writing output files.
+
 
-@ray.remote
 class CapturedBlockWritePaths:
     def __init__(self):
         self._write_paths: List[str] = []
         self._block_refs: List[ObjectRef[Block]] = []
 
-    def extend(
-            self,
-            write_paths: List[str],
-            block_refs: List[ObjectRef[Block]]):
+    def extend(self, write_paths: List[str], block_refs: List[ObjectRef[Block]]):
         try:
             iter(write_paths)
         except TypeError:
@@ -70,27 +77,29 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
     """Block write path provider implementation that writes each
    dataset block out to a file of the form: {base_path}/{uuid}
    """
-    def __init__(self, capture_actor: CapturedBlockWritePaths):
+
+    def __init__(self, capture_object: CapturedBlockWritePaths):
         self.write_paths: List[str] = []
         self.block_refs: List[ObjectRef[Block]] = []
-        self.capture_actor = capture_actor
+        self.capture_object = capture_object
 
     def __del__(self):
         if self.write_paths or self.block_refs:
-            self.capture_actor.extend.remote(
+            self.capture_object.extend(
                 self.write_paths,
                 self.block_refs,
             )
 
     def _get_write_path_for_block(
-            self,
-            base_path: str,
-            *,
-            filesystem: Optional[pa.filesystem.FileSystem] = None,
-            dataset_uuid: Optional[str] = None,
-            block: Optional[ObjectRef[Block]] = None,
-            block_index: Optional[int] = None,
-            file_format: Optional[str] = None) -> str:
+        self,
+        base_path: str,
+        *,
+        filesystem: Optional[pa.filesystem.FileSystem] = None,
+        dataset_uuid: Optional[str] = None,
+        block: Optional[ObjectRef[Block]] = None,
+        block_index: Optional[int] = None,
+        file_format: Optional[str] = None,
+    ) -> str:
         write_path = f"{base_path}/{str(uuid4())}"
         self.write_paths.append(write_path)
         if block:
@@ -99,24 +108,18 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
 
 
 class S3Url:
-    def __init__(
-            self,
-            url: str):
+    def __init__(self, url: str):
 
         from urllib.parse import urlparse
 
-        self._parsed = urlparse(
-            url,
-            allow_fragments=False  # support '#' in path
-        )
+        self._parsed = urlparse(url, allow_fragments=False)  # support '#' in path
         if not self._parsed.scheme:  # support paths w/o 's3://' scheme
             url = f"s3://{url}"
             self._parsed = urlparse(url, allow_fragments=False)
         if self._parsed.query:  # support '?' in path
-            self.key = \
-                f"{self._parsed.path.lstrip('/')}?{self._parsed.query}"
+            self.key = f"{self._parsed.path.lstrip('/')}?{self._parsed.query}"
         else:
-            self.key = self._parsed.path.lstrip('/')
+            self.key = self._parsed.path.lstrip("/")
         self.bucket = self._parsed.netloc
         self.url = self._parsed.geturl()
 
@@ -125,9 +128,7 @@ def parse_s3_url(url: str) -> S3Url:
     return S3Url(url)
 
 
-def s3_resource_cache(
-        region: Optional[str],
-        **kwargs) -> ServiceResource:
+def s3_resource_cache(region: Optional[str], **kwargs) -> ServiceResource:
 
     return aws_utils.resource_cache(
         "s3",
@@ -136,36 +137,20 @@ def s3_resource_cache(
     )
 
 
-def s3_client_cache(
-        region: Optional[str],
-        **kwargs) -> BaseClient:
+def s3_client_cache(region: Optional[str], **kwargs) -> BaseClient:
 
-    return aws_utils.client_cache(
-        "s3",
-        region,
-        **kwargs
-    )
+    return aws_utils.client_cache("s3", region, **kwargs)
 
 
-def get_object_at_url(
-        url: str,
-        **s3_client_kwargs) -> Dict[str, Any]:
+def get_object_at_url(url: str, **s3_client_kwargs) -> Dict[str, Any]:
 
-    s3 = s3_client_cache(
-        None,
-        **s3_client_kwargs)
+    s3 = s3_client_cache(None, **s3_client_kwargs)
 
     parsed_s3_url = parse_s3_url(url)
-    return s3.get_object(
-        Bucket=parsed_s3_url.bucket,
-        Key=parsed_s3_url.key
-    )
+    return s3.get_object(Bucket=parsed_s3_url.bucket, Key=parsed_s3_url.key)
 
 
-def delete_files_by_prefix(
-        bucket: str,
-        prefix: str,
-        **s3_client_kwargs) -> None:
+def delete_files_by_prefix(bucket: str, prefix: str, **s3_client_kwargs) -> None:
 
     s3 = s3_resource_cache(None, **s3_client_kwargs)
     bucket = s3.Bucket(bucket)
@@ -189,14 +174,10 @@ def get_path_from_object(bucket, obj):
 
 
 def filter_objects_by_prefix(
-        bucket: str,
-        prefix: str,
-        **s3_client_kwargs) -> Generator[Dict[str, Any], None, None]:
+    bucket: str, prefix: str, **s3_client_kwargs
+) -> Generator[Dict[str, Any], None, None]:
 
-    s3 = s3_client_cache(
-        None,
-        **s3_client_kwargs
-    )
+    s3 = s3_client_cache(None, **s3_client_kwargs)
     params = {"Bucket": bucket, "Prefix": prefix}
     more_objects_to_list = True
     while more_objects_to_list:
@@ -209,14 +190,15 @@ def filter_objects_by_prefix(
 
 
 def read_file(
-        s3_url: str,
-        content_type: ContentType,
-        content_encoding: ContentEncoding = ContentEncoding.IDENTITY,
-        table_type: TableType = TableType.PYARROW,
-        column_names: Optional[List[str]] = None,
-        include_columns: Optional[List[str]] = None,
-        file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-        **s3_client_kwargs) -> LocalTable:
+    s3_url: str,
+    content_type: ContentType,
+    content_encoding: ContentEncoding = ContentEncoding.IDENTITY,
+    table_type: TableType = TableType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    **s3_client_kwargs,
+) -> LocalTable:
 
     reader = TABLE_TYPE_TO_READER_FUNC[table_type.value]
     try:
@@ -227,34 +209,33 @@ def read_file(
             column_names,
             include_columns,
             file_reader_kwargs_provider,
-            **s3_client_kwargs
+            **s3_client_kwargs,
         )
         return table
     except ClientError as e:
         if e.response["Error"]["Code"] in TIMEOUT_ERROR_CODES:
             # Timeout error not caught by botocore
-            raise RetryableError(f"Retry table download from: {s3_url}") \
-                from e
-        raise NonRetryableError(f"Failed table download from: {s3_url}") \
-            from e
+            raise RetryableError(f"Retry table download from: {s3_url}") from e
+        raise NonRetryableError(f"Failed table download from: {s3_url}") from e
 
 
 def upload_sliced_table(
-        table: Union[LocalTable, DistributedDataset],
-        s3_url_prefix: str,
-        s3_file_system: s3fs.S3FileSystem,
-        max_records_per_entry: Optional[int],
-        s3_table_writer_func: Callable,
-        table_slicer_func: Callable,
-        s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
-        content_type: ContentType = ContentType.PARQUET,
-        **s3_client_kwargs) -> ManifestEntryList:
+    table: Union[LocalTable, DistributedDataset],
+    s3_url_prefix: str,
+    s3_file_system: s3fs.S3FileSystem,
+    max_records_per_entry: Optional[int],
+    s3_table_writer_func: Callable,
+    table_slicer_func: Callable,
+    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    content_type: ContentType = ContentType.PARQUET,
+    **s3_client_kwargs,
+) -> ManifestEntryList:
 
     # @retry decorator can't be pickled by Ray, so wrap upload in Retrying
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
         stop=stop_after_delay(30 * 60),
-        retry=retry_if_exception_type(RetryableError)
+        retry=retry_if_exception_type(RetryableError),
     )
 
     manifest_entries = ManifestEntryList()
@@ -270,14 +251,11 @@ def upload_sliced_table(
             s3_table_writer_func,
             s3_table_writer_kwargs,
             content_type,
-            **s3_client_kwargs
+            **s3_client_kwargs,
         )
     else:
         # iteratively write table slices
-        table_slices = table_slicer_func(
-            table,
-            max_records_per_entry
-        )
+        table_slices = table_slicer_func(table, max_records_per_entry)
         for table_slice in table_slices:
             slice_entries = retrying(
                 upload_table,
@@ -287,7 +265,7 @@ def upload_sliced_table(
                 s3_table_writer_func,
                 s3_table_writer_kwargs,
                 content_type,
-                **s3_client_kwargs
+                **s3_client_kwargs,
             )
             manifest_entries.extend(slice_entries)
 
@@ -303,15 +281,17 @@ def _block_metadata(block: Block) -> BlockMetadata:
 
 
 def _get_metadata(
-        table: Union[LocalTable, DistributedDataset],
-        write_paths: List[str],
-        block_refs: List[ObjectRef[Block]])-> List[BlockMetadata]:
+    table: Union[LocalTable, DistributedDataset],
+    write_paths: List[str],
+    block_refs: List[ObjectRef[Block]],
+) -> List[BlockMetadata]:
     metadata: List[BlockMetadata] = []
     if not block_refs:
         # this must be a local table - ensure it was written to only 1 file
-        assert len(write_paths) == 1, \
-            f"Expected table of type '{type(table)}' to be written to 1 " \
+        assert len(write_paths) == 1, (
+            f"Expected table of type '{type(table)}' to be written to 1 "
             f"file, but found {len(write_paths)} files."
+        )
         table_size = None
         table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
         if table_size_func:
@@ -333,23 +313,27 @@ def _get_metadata(
         # metadata = dataset._blocks.get_metadata()
         # ray 2.0.0dev
         metadata = table._plan.execute().get_metadata()
-        if not metadata or metadata[0].size_bytes is None or \
-                metadata[0].num_rows is None:
-            metadata_futures = [_block_metadata.remote(block_ref)
-                                for block_ref
-                                in block_refs]
+        if (
+            not metadata
+            or metadata[0].size_bytes is None
+            or metadata[0].num_rows is None
+        ):
+            metadata_futures = [
+                _block_metadata.remote(block_ref) for block_ref in block_refs
+            ]
             metadata = ray.get(metadata_futures)
     return metadata
 
 
 def upload_table(
-        table: Union[LocalTable, DistributedDataset],
-        s3_base_url: str,
-        s3_file_system: s3fs.S3FileSystem,
-        s3_table_writer_func: Callable,
-        s3_table_writer_kwargs: Optional[Dict[str, Any]],
-        content_type: ContentType = ContentType.PARQUET,
-        **s3_client_kwargs) -> ManifestEntryList:
+    table: Union[LocalTable, DistributedDataset],
+    s3_base_url: str,
+    s3_file_system: s3fs.S3FileSystem,
+    s3_table_writer_func: Callable,
+    s3_table_writer_kwargs: Optional[Dict[str, Any]],
+    content_type: ContentType = ContentType.PARQUET,
+    **s3_client_kwargs,
+) -> ManifestEntryList:
     """
     Writes the given table to 1 or more S3 files and return Redshift
     manifest entries describing the uploaded files.
@@ -357,20 +341,20 @@ def upload_table(
     if s3_table_writer_kwargs is None:
         s3_table_writer_kwargs = {}
 
-    capture_actor = CapturedBlockWritePaths.remote()
-    block_write_path_provider = UuidBlockWritePathProvider(capture_actor)
+    capture_object = CapturedBlockWritePaths()
+    block_write_path_provider = UuidBlockWritePathProvider(capture_object)
     s3_table_writer_func(
         table,
         s3_base_url,
         s3_file_system,
         block_write_path_provider,
         content_type.value,
-        **s3_table_writer_kwargs
+        **s3_table_writer_kwargs,
     )
     # TODO: Add a proper fix for block_refs and write_paths not persisting in Ray actors
     del block_write_path_provider
-    block_refs = ray.get(capture_actor.block_refs.remote())
-    write_paths = ray.get(capture_actor.write_paths.remote())
+    block_refs = capture_object.block_refs()
+    write_paths = capture_object.write_paths()
     metadata = _get_metadata(table, write_paths, block_refs)
     manifest_entries = ManifestEntryList()
     for block_idx, s3_url in enumerate(write_paths):
@@ -385,37 +369,42 @@ def upload_table(
         except ClientError as e:
             if e.response["Error"]["Code"] == "NoSuchKey":
                 # s3fs may swallow S3 errors - we were probably throttled
-                raise RetryableError(f"Retry table upload to: {s3_url}") \
-                    from e
-            raise NonRetryableError(f"Failed table upload to: {s3_url}") \
-                from e
+                raise RetryableError(f"Retry table upload to: {s3_url}") from e
+            raise NonRetryableError(f"Failed table upload to: {s3_url}") from e
     return manifest_entries
 
 
 def download_manifest_entry(
-        manifest_entry: ManifestEntry,
-        token_holder: Optional[Dict[str, Any]] = None,
-        table_type: TableType = TableType.PYARROW,
-        column_names: Optional[List[str]] = None,
-        include_columns: Optional[List[str]] = None,
-        file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-        content_type: Optional[ContentType] = None,
-        content_encoding: Optional[ContentEncoding] = None) -> LocalTable:
-
-    s3_client_kwargs = {
-        "aws_access_key_id": token_holder["accessKeyId"],
-        "aws_secret_access_key": token_holder["secretAccessKey"],
-        "aws_session_token": token_holder["sessionToken"]
-    } if token_holder else {}
+    manifest_entry: ManifestEntry,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    content_type: Optional[ContentType] = None,
+    content_encoding: Optional[ContentEncoding] = None,
+) -> LocalTable:
+
+    s3_client_kwargs = (
+        {
+            "aws_access_key_id": token_holder["accessKeyId"],
+            "aws_secret_access_key": token_holder["secretAccessKey"],
+            "aws_session_token": token_holder["sessionToken"],
+        }
+        if token_holder
+        else {}
+    )
     if not content_type:
         content_type = manifest_entry.meta.content_type
-        assert content_type, \
-            f"Unknown content type for manifest entry: {manifest_entry}"
+        assert (
+            content_type
+        ), f"Unknown content type for manifest entry: {manifest_entry}"
     content_type = ContentType(content_type)
     if not content_encoding:
         content_encoding = manifest_entry.meta.content_encoding
-        assert content_encoding, \
-            f"Unknown content encoding for manifest entry: {manifest_entry}"
+        assert (
+            content_encoding
+        ), f"Unknown content encoding for manifest entry: {manifest_entry}"
     content_encoding = ContentEncoding(content_encoding)
     s3_url = manifest_entry.uri
     if s3_url is None:
@@ -424,7 +413,7 @@ def download_manifest_entry(
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
         stop=stop_after_delay(30 * 60),
-        retry=retry_if_not_exception_type(NonRetryableError)
+        retry=retry_if_not_exception_type(NonRetryableError),
     )
     table = retrying(
         read_file,
@@ -441,46 +430,36 @@ def download_manifest_entry(
 
 
 def _download_manifest_entries(
-        manifest: Manifest,
-        token_holder: Optional[Dict[str, Any]] = None,
-        table_type: TableType = TableType.PYARROW,
-        ignore_missing_manifest: bool = False,
-        column_names: Optional[List[str]] = None,
-        include_columns: Optional[List[str]] = None,
-        file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None) \
-        -> Tuple[LocalDataset,Optional[List[int]]]:
-
-    if ignore_missing_manifest:
-        result = []
-        missing = []
-        for ide, e in enumerate(manifest.entries):
-            try:
-                tmp = download_manifest_entry(e, token_holder, table_type, column_names,
-                                              include_columns, file_reader_kwargs_provider)
-                result.append(tmp)
-            except Exception as e:
-                missing.append(ide)
-                logger.info(f"missing {len(missing)} manifest_entry")
-                pass
-
-        return result, missing
-    else:
-        return [
-            download_manifest_entry(e, token_holder, table_type, column_names,
-                                    include_columns, file_reader_kwargs_provider)
-            for e in manifest.entries
-        ]
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:
+
+    return [
+        download_manifest_entry(
+            e,
+            token_holder,
+            table_type,
+            column_names,
+            include_columns,
+            file_reader_kwargs_provider,
+        )
+        for e in manifest.entries
+    ]
+
 
 def _download_manifest_entries_parallel(
-        manifest: Manifest,
-        token_holder: Optional[Dict[str, Any]] = None,
-        table_type: TableType = TableType.PYARROW,
-        ignore_missing_manifest: bool = False,
-        max_parallelism: Optional[int] = None,
-        column_names: Optional[List[str]] = None,
-        include_columns: Optional[List[str]] = None,
-        file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None) \
-        -> LocalDataset:
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = None,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:
 
     tables = []
     pool = multiprocessing.Pool(max_parallelism)
@@ -498,22 +477,20 @@ def _download_manifest_entries_parallel(
 
 
 def download_manifest_entries(
-        manifest: Manifest,
-        token_holder: Optional[Dict[str, Any]] = None,
-        table_type: TableType = TableType.PYARROW,
-        ignore_missing_manifest: bool = False,
-        max_parallelism: Optional[int] = 1,
-        column_names: Optional[List[str]] = None,
-        include_columns: Optional[List[str]] = None,
-        file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None) \
-        -> Tuple[LocalDataset,Optional[List[int]]]:
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = 1,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:
 
     if max_parallelism and max_parallelism <= 1:
         return _download_manifest_entries(
             manifest,
             token_holder,
             table_type,
-            ignore_missing_manifest,
             column_names,
             include_columns,
             file_reader_kwargs_provider,
@@ -523,7 +500,6 @@ def download_manifest_entries(
         manifest,
         token_holder,
         table_type,
-        ignore_missing_manifest,
         max_parallelism,
         column_names,
         include_columns,
@@ -531,10 +507,7 @@ def download_manifest_entries(
     )
 
 
-def upload(
-        s3_url: str,
-        body,
-        **s3_client_kwargs) -> Dict[str, Any]:
+def upload(s3_url: str, body, **s3_client_kwargs) -> Dict[str, Any]:
 
     # TODO (pdames): add tenacity retrying
     parsed_s3_url = parse_s3_url(s3_url)
@@ -547,9 +520,8 @@ def upload(
 
 
 def download(
-        s3_url: str,
-        fail_if_not_found: bool = True,
-        **s3_client_kwargs) -> Optional[Dict[str, Any]]:
+    s3_url: str, fail_if_not_found: bool = True, **s3_client_kwargs
+) -> Optional[Dict[str, Any]]:
 
     # TODO (pdames): add tenacity retrying
     parsed_s3_url = parse_s3_url(s3_url)
@@ -563,15 +535,13 @@ def download(
         if fail_if_not_found:
             raise
         else:
-            if e.response['Error']['Code'] != "404":
-                if e.response['Error']['Code'] != 'NoSuchKey':
+            if e.response["Error"]["Code"] != "404":
+                if e.response["Error"]["Code"] != "NoSuchKey":
                     raise
-            logger.info(
-                f"file not found: {s3_url}")
+            logger.info(f"file not found: {s3_url}")
     except s3.exceptions.NoSuchKey:
         if fail_if_not_found:
             raise
         else:
-            logger.info(
-                f"file not found: {s3_url}")
+            logger.info(f"file not found: {s3_url}")
     return None