deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/utils/pyarrow.py
CHANGED
@@ -1,25 +1,31 @@
-
-import
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
+
 import bz2
+import gzip
 import io
 import logging
-
 from functools import partial
-from
-
-from pyarrow import feather as paf, parquet as papq, csv as pacsv, \
-    json as pajson
+from typing import Any, Callable, Dict, Iterable, List, Optional

+import pyarrow as pa
+from fsspec import AbstractFileSystem
+from pyarrow import csv as pacsv
+from pyarrow import feather as paf
+from pyarrow import json as pajson
+from pyarrow import parquet as papq
 from ray.data.datasource import BlockWritePathProvider

 from deltacat import logs
-from deltacat.types.media import
-    DELIMITED_TEXT_CONTENT_TYPES,
-
+from deltacat.types.media import (
+    DELIMITED_TEXT_CONTENT_TYPES,
+    TABULAR_CONTENT_TYPES,
+    ContentEncoding,
+    ContentType,
+)
+from deltacat.utils.common import ContentTypeKwargsProvider, ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation

-from typing import Any, Callable, Dict, List, Optional, Iterable, Union
-
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


@@ -33,35 +39,28 @@ CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     # Pyarrow.orc is disabled in Pyarrow 0.15, 0.16:
     # https://issues.apache.org/jira/browse/ARROW-7811
     # ContentType.ORC.value: paorc.ContentType.ORCFile,
-    ContentType.JSON.value: pajson.read_json
+    ContentType.JSON.value: pajson.read_json,
 }


 def write_feather(
-
-
-        *,
-        filesystem: AbstractFileSystem,
-        **kwargs) -> None:
+    table: pa.Table, path: str, *, filesystem: AbstractFileSystem, **kwargs
+) -> None:

     with filesystem.open(path, "wb") as f:
         paf.write_feather(table, f, **kwargs)


 def write_csv(
-
-
-        *,
-        filesystem: AbstractFileSystem,
-        **kwargs) -> None:
+    table: pa.Table, path: str, *, filesystem: AbstractFileSystem, **kwargs
+) -> None:

     with filesystem.open(path, "wb") as f:
         # TODO (pdames): Add support for client-specified compression types.
         with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
             if kwargs.get("write_options") is None:
                 # column names are kept in table metadata, so omit header
-                kwargs["write_options"] = pacsv.WriteOptions(
-                    include_header=False)
+                kwargs["write_options"] = pacsv.WriteOptions(include_header=False)
             pacsv.write_csv(table, out, **kwargs)


@@ -78,13 +77,11 @@ CONTENT_TYPE_TO_PA_WRITE_FUNC: Dict[str, Callable] = {
 def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
     if content_type == ContentType.UNESCAPED_TSV.value:
         return {
-            "parse_options": pacsv.ParseOptions(
-                delimiter="\t",
-                quote_char=False),
+            "parse_options": pacsv.ParseOptions(delimiter="\t", quote_char=False),
             "convert_options": pacsv.ConvertOptions(
                 null_values=[""],  # pyarrow defaults are ["", "NULL", "null"]
                 strings_can_be_null=True,
-            )
+            ),
         }
     if content_type == ContentType.TSV.value:
         return {"parse_options": pacsv.ParseOptions(delimiter="\t")}
@@ -92,9 +89,11 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
         return {"parse_options": pacsv.ParseOptions(delimiter=",")}
     if content_type == ContentType.PSV.value:
         return {"parse_options": pacsv.ParseOptions(delimiter="|")}
-    if content_type in {
-
-
+    if content_type in {
+        ContentType.PARQUET.value,
+        ContentType.FEATHER.value,
+        ContentType.JSON.value,
+    }:
         return {}
     # Pyarrow.orc is disabled in Pyarrow 0.15, 0.16:
     # https://issues.apache.org/jira/browse/ARROW-7811
@@ -105,15 +104,13 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:

 # TODO (pdames): add deflate and snappy
 ENCODING_TO_FILE_INIT: Dict[str, Callable] = {
-    ContentEncoding.GZIP.value: partial(gzip.GzipFile, mode=
-    ContentEncoding.BZIP2.value: partial(bz2.BZ2File, mode=
+    ContentEncoding.GZIP.value: partial(gzip.GzipFile, mode="rb"),
+    ContentEncoding.BZIP2.value: partial(bz2.BZ2File, mode="rb"),
     ContentEncoding.IDENTITY.value: lambda fileobj: fileobj,
 }


-def slice_table(
-        table: pa.Table,
-        max_len: Optional[int]) -> List[pa.Table]:
+def slice_table(table: pa.Table, max_len: Optional[int]) -> List[pa.Table]:
     """
     Iteratively create 0-copy table slices.
     """
@@ -123,10 +120,7 @@ def slice_table(
     offset = 0
     records_remaining = len(table)
     while records_remaining > 0:
-        records_this_entry = min(
-            max_len,
-            records_remaining
-        )
+        records_this_entry = min(max_len, records_remaining)
         tables.append(table.slice(offset, records_this_entry))
         records_remaining -= records_this_entry
         offset += records_this_entry
@@ -138,21 +132,21 @@ class ReadKwargsProviderPyArrowCsvPureUtf8(ContentTypeKwargsProvider):
     as UTF-8 strings (i.e. disables type inference). Useful for ensuring
     lossless reads of UTF-8 delimited text datasets and improving read
     performance in cases where type casting is not required."""
+
     def __init__(self, include_columns: Optional[Iterable[str]] = None):
         self.include_columns = include_columns

-    def _get_kwargs(
-            self,
-            content_type: str,
-            kwargs: Dict[str, Any]) -> Dict[str, Any]:
+    def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
         if content_type in DELIMITED_TEXT_CONTENT_TYPES:
-            convert_options: pacsv.ConvertOptions =
-                kwargs.get("convert_options")
+            convert_options: pacsv.ConvertOptions = kwargs.get("convert_options")
             if convert_options is None:
                 convert_options = pacsv.ConvertOptions()
             # read only the included columns as strings?
-            column_names =
-
+            column_names = (
+                self.include_columns
+                if self.include_columns
+                else convert_options.include_columns
+            )
             if not column_names:
                 # read all columns as strings?
                 read_options: pacsv.ReadOptions = kwargs.get("read_options")
@@ -171,13 +165,26 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
     """ReadKwargsProvider impl that explicitly maps column names to column types when
     loading dataset files into a PyArrow table. Disables the default type inference
     behavior on the defined columns."""
-
+
+    def __init__(
+        self,
+        schema: Optional[pa.Schema] = None,
+        pq_coerce_int96_timestamp_unit: Optional[str] = None,
+    ):
+        """
+
+        Args:
+            schema: The schema to use for reading the dataset.
+                If unspecified, the schema will be inferred from the source.
+            pq_coerce_int96_timestamp_unit: When reading from parquet files, cast timestamps that are stored in INT96
+                format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ms'
+                and therefore INT96 timestamps will be inferred as timestamps in milliseconds.
+
+        """
         self.schema = schema
+        self.pq_coerce_int96_timestamp_unit = pq_coerce_int96_timestamp_unit

-    def _get_kwargs(
-            self,
-            content_type: str,
-            kwargs: Dict[str, Any]) -> Dict[str, Any]:
+    def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
         if content_type in DELIMITED_TEXT_CONTENT_TYPES:
             convert_options = kwargs.get("convert_options", pacsv.ConvertOptions())
             if self.schema:
@@ -188,14 +195,21 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
             # Only supported in PyArrow 8.0.0+
             if self.schema:
                 kwargs["schema"] = self.schema
+
+            # Coerce deprecated int96 timestamp to millisecond if unspecified
+            kwargs["coerce_int96_timestamp_unit"] = (
+                self.pq_coerce_int96_timestamp_unit or "ms"
+            )
+
         return kwargs


 def _add_column_kwargs(
-
-
-
-
+    content_type: str,
+    column_names: Optional[List[str]],
+    include_columns: Optional[List[str]],
+    kwargs: Dict[str, Any],
+):

     if content_type in DELIMITED_TEXT_CONTENT_TYPES:
         read_options: pacsv.ReadOptions = kwargs.get("read_options")
@@ -219,25 +233,27 @@ def _add_column_kwargs(
         if include_columns:
             logger.warning(
                 f"Ignoring request to include columns {include_columns} "
-                f"for non-tabular content type {content_type}"
+                f"for non-tabular content type {content_type}"
+            )


 def s3_file_to_table(
-
-
-
-
-
-
-
+    s3_url: str,
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    **s3_client_kwargs,
+) -> pa.Table:

     from deltacat.aws import s3u as s3_utils
-
-
-
-
-        **s3_client_kwargs
+
+    logger.debug(
+        f"Reading {s3_url} to PyArrow. Content type: {content_type}. "
+        f"Encoding: {content_encoding}"
     )
+    s3_obj = s3_utils.get_object_at_url(s3_url, **s3_client_kwargs)
     logger.debug(f"Read S3 object from {s3_url}: {s3_obj}")
     pa_read_func = CONTENT_TYPE_TO_PA_READ_FUNC[content_type]
     input_file_init = ENCODING_TO_FILE_INIT[content_encoding]
@@ -251,11 +267,7 @@ def s3_file_to_table(
         kwargs = pa_read_func_kwargs_provider(content_type, kwargs)

     logger.debug(f"Reading {s3_url} via {pa_read_func} with kwargs: {kwargs}")
-    table, latency = timed_invocation(
-        pa_read_func,
-        *args,
-        **kwargs
-    )
+    table, latency = timed_invocation(pa_read_func, *args, **kwargs)
     logger.debug(f"Time to read {s3_url} into PyArrow table: {latency}s")
     return table

@@ -265,12 +277,13 @@ def table_size(table: pa.Table) -> int:


 def table_to_file(
-
-
-
-
-
-
+    table: pa.Table,
+    base_path: str,
+    file_system: AbstractFileSystem,
+    block_path_provider: BlockWritePathProvider,
+    content_type: str = ContentType.PARQUET.value,
+    **kwargs,
+) -> None:
     """
     Writes the given Pyarrow Table to a file.
     """
@@ -279,11 +292,211 @@ def table_to_file(
         raise NotImplementedError(
             f"Pyarrow writer for content type '{content_type}' not "
             f"implemented. Known content types: "
-            f"{CONTENT_TYPE_TO_PA_WRITE_FUNC.keys}"
+            f"{CONTENT_TYPE_TO_PA_WRITE_FUNC.keys}"
+        )
     path = block_path_provider(base_path)
-    writer(
-
-
-
-
-
+    writer(table, path, filesystem=file_system, **kwargs)
+
+
+class RecordBatchTables:
+    def __init__(self, batch_size: int):
+        """
+        Data structure for maintaining a batched list of tables, where each batched table has
+        a record count of some multiple of the specified record batch size.
+
+        Remaining records are stored in a separate list of tables.
+
+        Args:
+            batch_size: Minimum record count per table to batch by. Batched tables are
+            guaranteed to have a record count multiple of the batch_size.
+        """
+        self._batched_tables: List[pa.Table] = []
+        self._batched_record_count: int = 0
+        self._remaining_tables: List[pa.Table] = []
+        self._remaining_record_count: int = 0
+        self._batch_size: int = batch_size
+
+    def append(self, table: pa.Table) -> None:
+        """
+        Appends a table for batching.
+
+        Table record counts are added to any previous remaining record count.
+        If the new remainder record count meets or exceeds the configured batch size record count,
+        the remainder will be shifted over to the list of batched tables in FIFO order via table slicing.
+        Batched tables will always have a record count of some multiple of the configured batch size.
+
+        Record ordering is preserved from input tables whenever tables are shifted from the remainder
+        over to the batched list. Records from Table A will always precede records from Table B,
+        if Table A was appended before Table B. Records from the batched list will always precede records
+        from the remainders.
+
+        Ex:
+            bt = RecordBatchTables(8)
+            col1 = pa.array([i for i in range(10)])
+            test_table = pa.Table.from_arrays([col1], names=["col1"])
+            bt.append(test_table)
+
+            print(bt.batched_records)  # 8
+            print(bt.batched)  # [0, 1, 2, 3, 4, 5, 6, 7]
+            print(bt.remaining_records)  # 2
+            print(bt.remaining)  # [8, 9]
+
+        Args:
+            table: Input table to add
+
+        """
+        if self._remaining_tables:
+            if self._remaining_record_count + len(table) < self._batch_size:
+                self._remaining_tables.append(table)
+                self._remaining_record_count += len(table)
+                return
+
+            records_to_fit = self._batch_size - self._remaining_record_count
+            fitted_table = table.slice(length=records_to_fit)
+            self._remaining_tables.append(fitted_table)
+            self._remaining_record_count += len(fitted_table)
+            table = table.slice(offset=records_to_fit)
+
+        record_count = len(table)
+        record_multiplier, records_leftover = (
+            record_count // self._batch_size,
+            record_count % self._batch_size,
+        )
+
+        if record_multiplier > 0:
+            batched_table = table.slice(length=record_multiplier * self._batch_size)
+            # Add to remainder tables to preserve record ordering
+            self._remaining_tables.append(batched_table)
+            self._remaining_record_count += len(batched_table)
+
+        if self._remaining_tables:
+            self._shift_remaining_to_new_batch()
+
+        if records_leftover > 0:
+            leftover_table = table.slice(offset=record_multiplier * self._batch_size)
+            self._remaining_tables.append(leftover_table)
+            self._remaining_record_count += len(leftover_table)
+
+    def _shift_remaining_to_new_batch(self) -> None:
+        new_batch = pa.concat_tables(self._remaining_tables)
+        self._batched_tables.append(new_batch)
+        self._batched_record_count += self._remaining_record_count
+        self.clear_remaining()
+
+    @staticmethod
+    def from_tables(tables: List[pa.Table], batch_size: int) -> RecordBatchTables:
+        """
+        Static factory for generating batched tables and remainders given a list of input tables.
+
+        Args:
+            tables: A list of input tables with various record counts
+            batch_size: Minimum record count per table to batch by. Batched tables are
+            guaranteed to have a record count multiple of the batch_size.
+
+        Returns: A batched tables object
+
+        """
+        rbt = RecordBatchTables(batch_size)
+        for table in tables:
+            rbt.append(table)
+        return rbt
+
+    @property
+    def batched(self) -> List[pa.Table]:
+        """
+        List of tables batched and ready for processing.
+        Each table has N records, where N records are some multiple of the configured records batch size.
+
+        For example, if the configured batch size is 5, then a list of batched tables
+        could have the following record counts: [60, 5, 30, 10]
+
+        Returns: a list of batched tables
+
+        """
+        return self._batched_tables
+
+    @property
+    def batched_record_count(self) -> int:
+        """
+        The number of total records from the batched list.
+
+        Returns: batched record count
+
+        """
+        return self._batched_record_count
+
+    @property
+    def remaining(self) -> List[pa.Table]:
+        """
+        List of tables carried over from table slicing during the batching operation.
+        The sum of all record counts in the remaining tables is guaranteed to be less than the configured batch size.
+
+        Returns: a list of remaining tables
+
+        """
+        return self._remaining_tables
+
+    @property
+    def remaining_record_count(self) -> int:
+        """
+        The number of total records from the remaining tables list.
+
+        Returns: remaining record count
+
+        """
+        return self._remaining_record_count
+
+    @property
+    def batch_size(self) -> int:
+        """
+        The configured batch size.
+
+        Returns: batch size
+
+        """
+        return self._batch_size
+
+    def has_batches(self) -> bool:
+        """
+        Checks if there are any currently batched tables ready for processing.
+
+        Returns: true if batched records exist, otherwise false
+
+        """
+        return self._batched_record_count > 0
+
+    def has_remaining(self) -> bool:
+        """
+        Checks if any remaining tables exist after batching.
+
+        Returns: true if remaining records exist, otherwise false
+
+        """
+        return self._remaining_record_count > 0
+
+    def evict(self) -> List[pa.Table]:
+        """
+        Evicts all batched tables from this object and returns them.
+
+        Returns: a list of batched tables
+
+        """
+        evicted_tables = [*self.batched]
+        self.clear_batches()
+        return evicted_tables
+
+    def clear_batches(self) -> None:
+        """
+        Removes all batched tables and resets batched records.
+
+        """
+        self._batched_tables.clear()
+        self._batched_record_count = 0
+
+    def clear_remaining(self) -> None:
+        """
+        Removes all remaining tables and resets remaining records.
+
+        """
+        self._remaining_tables.clear()
+        self._remaining_record_count = 0
deltacat/utils/ray_utils/concurrency.py
CHANGED
@@ -1,22 +1,23 @@
-import
+import copy
+import itertools
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

+import ray
 from ray._private.ray_constants import MIN_RESOURCE_GRANULARITY
 from ray.types import ObjectRef

 from deltacat.utils.ray_utils.runtime import current_node_resource_key
-import copy

-from typing import Any, Iterable, Callable, Dict, List, Tuple, Union, Optional
-import itertools

 def invoke_parallel(
-
-
-
-
-
-
-
+    items: Iterable,
+    ray_task: Callable,
+    *args,
+    max_parallelism: Optional[int] = 1000,
+    options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+    kwargs_provider: Callable[[int, Any], Dict[str, Any]] = None,
+    **kwargs,
+) -> List[Union[ObjectRef, Tuple[ObjectRef, ...]]]:
     """
     Creates a limited number of parallel remote invocations of the given ray
     task. By default each task is provided an ordered item from the input
@@ -57,11 +58,11 @@ def invoke_parallel(
                 ray.wait(
                     list(itertools.chain(*pending_ids)),
                     num_returns=int(
-                        len(pending_ids[0])*(len(pending_ids) - max_parallelism)
-                    )
+                        len(pending_ids[0]) * (len(pending_ids) - max_parallelism)
+                    ),
                 )
             else:
-                ray.wait(pending_ids, num_returns=len(pending_ids)-max_parallelism)
+                ray.wait(pending_ids, num_returns=len(pending_ids) - max_parallelism)
         opt = {}
         if options_provider:
             opt = options_provider(i, item)
@@ -79,21 +80,17 @@ def current_node_options_provider(*args, **kwargs) -> Dict[str, Any]:
     """Returns a resource dictionary that can be included with ray remote
     options to pin the task or actor on the current node via:
     `foo.options(current_node_options_provider()).remote()`"""
-    return {
-        "resources": {
-            current_node_resource_key(): MIN_RESOURCE_GRANULARITY
-        }
-    }
+    return {"resources": {current_node_resource_key(): MIN_RESOURCE_GRANULARITY}}


 def round_robin_options_provider(
-
-
-
-
-
-
-
+    i: int,
+    item: Any,
+    resource_keys: List[str],
+    *args,
+    resource_amount_provider: Callable[[int], int] = lambda i: MIN_RESOURCE_GRANULARITY,
+    **kwargs,
+) -> Dict[str, Any]:
     """Returns a resource dictionary that can be included with ray remote
     options to round robin indexed tasks or actors across a list of resource
     keys. For example, the following code round-robins 100 tasks across all
@@ -108,8 +105,10 @@ def round_robin_options_provider(
     opts = kwargs.get("pg_config")
     if opts:
         new_opts = copy.deepcopy(opts)
-        bundle_key_index = i % len(
-
+        bundle_key_index = i % len(
+            new_opts["scheduling_strategy"].placement_group.bundle_specs
+        )
+        new_opts["scheduling_strategy"].placement_group_bundle_index = bundle_key_index
         return new_opts
     else:
         assert resource_keys, f"No resource keys given to round robin!"
|