opengris_parfun-7.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. opengris_parfun-7.3.0.dist-info/METADATA +165 -0
  2. opengris_parfun-7.3.0.dist-info/RECORD +43 -0
  3. opengris_parfun-7.3.0.dist-info/WHEEL +5 -0
  4. opengris_parfun-7.3.0.dist-info/licenses/LICENSE +201 -0
  5. opengris_parfun-7.3.0.dist-info/licenses/LICENSE.spdx +7 -0
  6. opengris_parfun-7.3.0.dist-info/licenses/NOTICE +7 -0
  7. opengris_parfun-7.3.0.dist-info/top_level.txt +1 -0
  8. parfun/__init__.py +26 -0
  9. parfun/about.py +1 -0
  10. parfun/backend/__init__.py +0 -0
  11. parfun/backend/dask.py +151 -0
  12. parfun/backend/local_multiprocessing.py +92 -0
  13. parfun/backend/local_single_process.py +47 -0
  14. parfun/backend/mixins.py +68 -0
  15. parfun/backend/profiled_future.py +50 -0
  16. parfun/backend/scaler.py +226 -0
  17. parfun/backend/utility.py +7 -0
  18. parfun/combine/__init__.py +0 -0
  19. parfun/combine/collection.py +13 -0
  20. parfun/combine/dataframe.py +13 -0
  21. parfun/dataframe.py +175 -0
  22. parfun/decorators.py +135 -0
  23. parfun/entry_point.py +180 -0
  24. parfun/functions.py +71 -0
  25. parfun/kernel/__init__.py +0 -0
  26. parfun/kernel/function_signature.py +197 -0
  27. parfun/kernel/parallel_function.py +262 -0
  28. parfun/object.py +7 -0
  29. parfun/partition/__init__.py +0 -0
  30. parfun/partition/api.py +136 -0
  31. parfun/partition/collection.py +13 -0
  32. parfun/partition/dataframe.py +16 -0
  33. parfun/partition/object.py +50 -0
  34. parfun/partition/primitives.py +317 -0
  35. parfun/partition/utility.py +54 -0
  36. parfun/partition_size_estimator/__init__.py +0 -0
  37. parfun/partition_size_estimator/linear_regression_estimator.py +189 -0
  38. parfun/partition_size_estimator/mixins.py +22 -0
  39. parfun/partition_size_estimator/object.py +19 -0
  40. parfun/profiler/__init__.py +0 -0
  41. parfun/profiler/functions.py +261 -0
  42. parfun/profiler/object.py +68 -0
  43. parfun/py_list.py +56 -0
parfun/kernel/parallel_function.py ADDED
@@ -0,0 +1,262 @@
+ import collections
+ import logging
+ from inspect import Parameter, currentframe
+ from itertools import chain
+ from typing import Callable, Deque, Generator, Iterable, Optional, Tuple, Union
+
+ import attrs
+
+ from parfun.backend.mixins import BackendEngine, ProfiledFuture
+ from parfun.entry_point import get_parallel_backend, set_parallel_backend_context
+ from parfun.kernel.function_signature import FunctionSignature, NamedArguments
+ from parfun.object import FunctionInputType, FunctionOutputType, PartitionType
+ from parfun.partition.object import PartitionGenerator
+ from parfun.partition_size_estimator.linear_regression_estimator import LinearRegessionEstimator
+ from parfun.partition_size_estimator.mixins import PartitionSizeEstimator
+ from parfun.profiler.functions import (
+     export_task_trace, print_profile_trace, timed_combine_with, timed_function, timed_partition,
+ )
+ from parfun.profiler.object import PartitionedTaskTrace, TraceTime
+
+
+ @attrs.define
+ class ParallelFunction:
+     """Wraps a function so that it executes in parallel using a map-reduce/scatter-gather approach.
+
+     See the `@parallel()` decorator for a more user-friendly interface.
+     """
+
+     function: Callable[[FunctionInputType], FunctionOutputType] = attrs.field()
+
+     function_name: str = attrs.field()
+
+     split: Callable[[NamedArguments], Tuple[NamedArguments, PartitionGenerator[NamedArguments]]] = attrs.field()
+
+     combine_with: Callable[[Iterable[FunctionOutputType]], FunctionOutputType] = attrs.field()
+
+     initial_partition_size: Optional[Union[int, Callable[[FunctionInputType], int]]] = attrs.field(default=None)
+     fixed_partition_size: Optional[Union[int, Callable[[FunctionInputType], int]]] = attrs.field(default=None)
+
+     profile: bool = attrs.field(default=None)
+     trace_export: Optional[str] = attrs.field(default=None)
+
+     partition_size_estimator_factory: Callable[[], PartitionSizeEstimator] = attrs.field(
+         default=LinearRegessionEstimator
+     )
+
+     _partition_size_estimator: Optional[PartitionSizeEstimator] = attrs.field(init=False, default=None)
+     _function_signature: FunctionSignature = attrs.field(init=False)
+
+     def __attrs_post_init__(self) -> None:
+         self._function_signature = FunctionSignature.from_function(self.function)
+
+         if self.initial_partition_size is not None and self.fixed_partition_size is not None:
+             raise ValueError("`initial_partition_size` and `fixed_partition_size` cannot be set simultaneously.")
+
+         if self.fixed_partition_size is None:
+             self._partition_size_estimator = self.partition_size_estimator_factory()
+
+         self._validate_function_signature()
+
+     @initial_partition_size.validator
+     @fixed_partition_size.validator
+     def _partition_size_validator(self, attribute, value):
+         if value is not None and not isinstance(value, int) and not callable(value):
+             raise ValueError(f"`{attribute.name}` should be either an integer, a callable or `None`.")
+
+     def _validate_function_signature(self):
+         if self._function_signature.has_var_arg or self._function_signature.has_var_kwarg:
+             return
+
+         if any(arg.kind == Parameter.POSITIONAL_ONLY for arg in self._function_signature.args.values()):
+             raise ValueError("parfun toolkit does not support positional-only parameters yet.")
+
+     def __call__(self, *args, **kwargs) -> FunctionOutputType:
+         current_backend = get_parallel_backend()
+         allows_nested_tasks = current_backend is not None and current_backend.allows_nested_tasks()
+
+         # Note: the is_nested_parallelism() check should appear before any backend check, as unsupported
+         # nested function calls will have an empty backend setup.
+         if is_nested_parallelism() and not allows_nested_tasks:
+             logging.debug(
+                 f"backend does not support nested parallelism. Running {self.function.__name__} sequentially."
+             )
+             return self.function(*args, **kwargs)
+
+         if current_backend is None:
+             logging.warning(f"no parallel backend engine set, running `{self.function_name}(...)` sequentially.")
+             return self.function(*args, **kwargs)
+
+         # Assign a name to each argument based on the decorated function's signature.
+
+         named_args = self._function_signature.assign(args, kwargs)
+
+         # Initialize the partition generator.
+
+         non_partitioned_args, partition_generator = self.split(named_args)
+
+         initial_partition_size, fixed_partition_size = self._get_user_partition_sizes(args, kwargs)
+
+         partitions = timed_partition(
+             partition_generator, self._partition_size_estimator, initial_partition_size, fixed_partition_size
+         )
+
+         # Execute the function.
+
+         if allows_nested_tasks:
+             nested_backend = current_backend
+         else:
+             nested_backend = None
+
+         results = run_function_on_partitions(
+             self.function,
+             non_partitioned_args,
+             partitions,
+             current_backend,
+             nested_backend,
+         )
+
+         # Combine the results.
+
+         combined_result, task_trace = timed_combine_with(self.combine_with, self._partition_size_estimator, results)
+
+         if self.profile:
+             print_profile_trace(self.function, self.function_name, self._partition_size_estimator, task_trace)
+
+         if self.trace_export:
+             export_task_trace(self.trace_export, task_trace)
+
+         logging.info(
+             f"Ran `{self.function_name}(...)` with {task_trace.partition_count} "
+             f"sub-tasks using backend {current_backend.__class__} successfully."
+         )
+
+         return combined_result
+
+     def _get_user_partition_sizes(self, args, kwargs) -> Tuple[Optional[int], Optional[int]]:
+         """Returns the initial partition size and the fixed partition size for the calling function's arguments."""
+
+         if callable(self.initial_partition_size):
+             initial_partition_size = self.initial_partition_size(*args, **kwargs)
+         else:
+             initial_partition_size = self.initial_partition_size
+
+         if callable(self.fixed_partition_size):
+             fixed_partition_size = self.fixed_partition_size(*args, **kwargs)
+         else:
+             fixed_partition_size = self.fixed_partition_size
+
+         return initial_partition_size, fixed_partition_size
+
+
+ def is_nested_parallelism():
+     """Returns True if there is any call to `run_function_in_worker()` in the current call stack."""
+
+     frame = currentframe()
+     while frame is not None:
+         if frame.f_code.co_name == run_function_in_worker.__name__ and frame.f_code.co_filename == __file__:
+             return True
+         frame = frame.f_back
+     return False
+
+
+ def run_function_on_partitions(
+     function: Callable[[PartitionType], FunctionOutputType],
+     non_partitioned_args: NamedArguments,
+     partitions: Generator[Tuple[NamedArguments, PartitionedTaskTrace], None, None],
+     backend: BackendEngine,
+     nested_backend: Optional[BackendEngine],
+ ) -> Generator[Tuple[FunctionOutputType, TraceTime], None, None]:
+     """
+     Applies the provided function to all non-partitioned and partitioned arguments using the provided backend.
+     """
+
+     # First, try to get the first 2 partitions. If we get fewer than 2, we run the function sequentially to
+     # avoid any parallelism overhead.
+
+     iterator = iter(partitions)
+
+     first_values = []
+     try:
+         first_values.append(next(iterator))
+         first_values.append(next(iterator))
+     except StopIteration:
+         # Fewer than 2 partitions: run these sequentially and return.
+         assert len(first_values) <= 2
+
+         for partitioned_args in first_values:
+             yield timed_function(
+                 run_function_in_worker,
+                 function,
+                 non_partitioned_args,
+                 partitioned_args,
+                 backend=None,
+             )
+
+         return
+
+     # At least two partitions: submit these and the remaining ones to the backend.
+
+     with backend.session() as session:
+         preloaded_non_partitioned_args = session.preload_value(non_partitioned_args)
+
+         # We pop futures' references from the queue as we yield them, so that no-longer-needed results are not
+         # retained. Keeping these references would prevent Python's GC from freeing the yielded results once
+         # the consuming function has processed them.
+         futures: Deque[ProfiledFuture] = collections.deque()
+
+         try:
+             for partitioned_args in chain(first_values, iterator):
+                 futures.append(
+                     session.submit(
+                         run_function_in_worker,
+                         function,
+                         preloaded_non_partitioned_args,
+                         partitioned_args,
+                         nested_backend,
+                     )
+                 )
+
+                 # Yields any finished future from the head of the queue.
+                 while len(futures) > 0 and futures[0].done():
+                     yield futures.popleft().result_and_duration()
+
+             # Yields the remaining results.
+             while len(futures) > 0:
+                 yield futures.popleft().result_and_duration()
+         finally:
+             # On any failure, cancel all unfinished tasks.
+             for future in futures:
+                 future.cancel()
+
+
+ def run_function_in_worker(
+     function: Callable[[PartitionType], FunctionOutputType],
+     non_partitioned_args: NamedArguments,
+     partition: Tuple[NamedArguments, PartitionedTaskTrace],
+     backend: Optional[BackendEngine] = None,
+ ) -> Tuple[FunctionOutputType, PartitionedTaskTrace]:
+     """
+     Runs the function on the partitioned object, setting up the expected worker environment.
+
+     :param non_partitioned_args: the function arguments that are identical for every function call.
+     :param partition: the partitioned arguments and the associated partition task trace.
+     :param backend: if not None, set up this backend before executing the function.
+
+     :returns: the function's output and the original partition task trace.
+     """
+
+     partitioned_args, trace = partition
+
+     merged_args = non_partitioned_args.merge(partitioned_args)
+     assert len(non_partitioned_args.var_args) == 0
+
+     args, kwargs = merged_args.as_args_kwargs()
+
+     if backend is not None:
+         with set_parallel_backend_context(backend):
+             result = function(*args, **kwargs)
+     else:
+         result = function(*args, **kwargs)
+
+     return result, trace
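
The hunk above is the kernel behind the `@parallel()` decorator mentioned in the `ParallelFunction` docstring. A minimal usage sketch follows; it assumes `parfun/__init__.py` re-exports `parallel`, `per_argument`, the `py_list.by_chunk`/`py_list.concat` helpers, and a `set_parallel_backend_context` that accepts a backend name such as "local_multiprocessing" (inferred from the imports and the file manifest above, not confirmed by this diff):

.. code:: python

    from typing import List

    import parfun as pf

    @pf.parallel(
        split=pf.per_argument(values=pf.py_list.by_chunk),  # scatter: partition `values` into chunks
        combine_with=pf.py_list.concat,                     # gather: concatenate the partial results
    )
    def list_pow(values: List[float], factor: float) -> List[float]:
        return [v ** factor for v in values]

    if __name__ == "__main__":
        with pf.set_parallel_backend_context("local_multiprocessing"):
            # Each chunk is submitted as a sub-task; `factor` is broadcast unchanged to every call.
            print(list_pow([float(v) for v in range(100)], 2.0))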
parfun/object.py ADDED
@@ -0,0 +1,7 @@
+ from typing import Any, TypeVar
+
+ # TODO: we may specify and constrain these types in the future.
+ FunctionInputType = Any
+ FunctionOutputType = Any
+
+ PartitionType = TypeVar("PartitionType")  # Input and output are identical for partitioning functions.
parfun/partition/__init__.py ADDED
File without changes
parfun/partition/api.py ADDED
@@ -0,0 +1,136 @@
+ from collections import OrderedDict
+ from itertools import chain
+ from typing import Callable, Tuple, Union
+
+ from parfun.kernel.function_signature import NamedArguments
+ from parfun.partition.object import PartitionFunction, PartitionGenerator
+ from parfun.partition.primitives import partition_map, partition_zip
+
+
+ def per_argument(
+     **partition_arg_with: PartitionFunction,
+ ) -> Callable[[NamedArguments], Tuple[NamedArguments, PartitionGenerator[NamedArguments]]]:
+     """
+     Applies multiple partitioning functions simultaneously to different function arguments.
+
+     .. code:: python
+
+         @pf.parallel(
+             split=pf.per_argument(
+                 df=pf.dataframe.by_row,
+                 xs=pf.py_list.by_chunk,
+             )
+         )
+         def func(df: pd.DataFrame, xs: List, constant: int):
+             ...
+
+     """
+
+     partition_arg_names = set(partition_arg_with.keys())
+
+     def partitioning_function(named_args: NamedArguments) -> Tuple[NamedArguments, PartitionGenerator[NamedArguments]]:
+         # Applies all partition functions simultaneously using `partition_zip()`, then rebuilds the
+         # `NamedArguments` object with the partitioned values.
+
+         partitioned_args, non_partitioned_args = named_args.split(partition_arg_names)
+
+         def reassign_partitioned_arguments(*partitioned_values) -> NamedArguments:
+             changes = dict(zip(partition_arg_names, [v[0] for v in partitioned_values]))
+             return partitioned_args.reassigned(**changes)
+
+         partitioned_arg_generators = [
+             partition_arg_with[arg_name](partitioned_args[arg_name]) for arg_name in partition_arg_names
+         ]
+
+         zipped = partition_zip(*partitioned_arg_generators)
+
+         generator = partition_map(reassign_partitioned_arguments, zipped)  # type: ignore[type-var]
+
+         return non_partitioned_args, generator
+
+     return partitioning_function
+
+
+ def multiple_arguments(
+     partition_on: Union[Tuple[str, ...], str], partition_with: PartitionFunction
+ ) -> Callable[[NamedArguments], Tuple[NamedArguments, PartitionGenerator[NamedArguments]]]:
+     """
+     Applies a single partitioning function to multiple, but not all, arguments.
+
+     .. code:: python
+
+         @pf.parallel(
+             split=pf.multiple_arguments(
+                 ("df_1", "df_2"),
+                 pf.dataframe.by_group(by=["year", "month"]),
+             ),
+             ...
+         )
+         def func(df_1: pd.DataFrame, df_2: pd.DataFrame, constant: int):
+             ...
+
+     """
+
+     if isinstance(partition_on, str):
+         partition_on = (partition_on,)
+
+     if not isinstance(partition_on, tuple) or not all(isinstance(i, str) for i in partition_on):
+         raise ValueError(f"`partition_on` must be a string or a tuple of strings, but got: {partition_on}.")
+
+     if len(partition_on) == 0:
+         raise ValueError("empty `partition_on` value.")
+
+     def partitioning_function(named_args: NamedArguments) -> Tuple[NamedArguments, PartitionGenerator]:
+         # Applies the partitioning function to the selected parameters, then rebuilds the `NamedArguments`
+         # object with these partitioned values.
+
+         partitioned_args, non_partitioned_args = named_args.split(set(partition_on))
+
+         arg_values = [partitioned_args[a] for a in partition_on]
+
+         generator = partition_map(
+             lambda *partitioned_values: partitioned_args.reassigned(**dict(zip(partition_on, partitioned_values))),
+             partition_with(*arg_values),
+         )  # type: ignore[type-var]
+
+         return non_partitioned_args, generator
+
+     return partitioning_function
+
+
+ def all_arguments(
+     partition_with: PartitionFunction,
+ ) -> Callable[[NamedArguments], Tuple[NamedArguments, PartitionGenerator[NamedArguments]]]:
+     """
+     Applies a single partitioning function to all arguments.
+
+     .. code:: python
+
+         @pf.parallel(
+             split=pf.all_arguments(pf.dataframe.by_group(by=["year", "month"])),
+             ...
+         )
+         def func(df_1: pd.DataFrame, df_2: pd.DataFrame):
+             ...
+
+     """
+
+     def partitioning_function(named_args: NamedArguments) -> Tuple[NamedArguments, PartitionGenerator]:
+         # Applies the partition function to the named positional parameters first, then keyword, then variable
+         # args, and then rebuilds the partitioned NamedArguments object in the same order.
+
+         def reassign_all_arguments(*partitioned_values) -> NamedArguments:
+             n_args = len(named_args.args)
+             n_kwargs = len(named_args.kwargs)
+
+             args = OrderedDict(zip(named_args.args.keys(), partitioned_values[:n_args]))
+             kwargs = dict(zip(named_args.kwargs.keys(), partitioned_values[n_args : n_args + n_kwargs]))
+             var_args = partitioned_values[n_args + n_kwargs :]
+
+             return NamedArguments(args=args, kwargs=kwargs, var_args=var_args)
+
+         arg_values = chain(named_args.args.values(), named_args.kwargs.values(), named_args.var_args)
+
+         return NamedArguments(), partition_map(reassign_all_arguments, partition_with(*arg_values))
+
+     return partitioning_function
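
The three helpers above differ only in which arguments reach the partitioning function. A hedged side-by-side sketch, reusing the decorator style of the docstrings; `pf.py_list.concat` and `pf.dataframe.concat` are assumed combine helpers suggested by the file manifest, not confirmed by this diff:

.. code:: python

    import pandas as pd

    import parfun as pf

    # Different partitioning function per argument; `constant` is broadcast as-is.
    @pf.parallel(
        split=pf.per_argument(df=pf.dataframe.by_row, xs=pf.py_list.by_chunk),
        combine_with=pf.py_list.concat,
    )
    def f1(df: pd.DataFrame, xs: list, constant: int) -> list:
        ...

    # One partitioning function over a subset of the arguments; `constant` is broadcast as-is.
    @pf.parallel(
        split=pf.multiple_arguments(("df_1", "df_2"), pf.dataframe.by_group(by=["year"])),
        combine_with=pf.dataframe.concat,
    )
    def f2(df_1: pd.DataFrame, df_2: pd.DataFrame, constant: int) -> pd.DataFrame:
        ...

    # One partitioning function over every argument; nothing is broadcast.
    @pf.parallel(
        split=pf.all_arguments(pf.dataframe.by_group(by=["year"])),
        combine_with=pf.dataframe.concat,
    )
    def f3(df_1: pd.DataFrame, df_2: pd.DataFrame) -> pd.DataFrame:
        ...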
parfun/partition/collection.py ADDED
@@ -0,0 +1,13 @@
+ import warnings
+
+ from parfun.py_list import by_chunk
+
+
+ warnings.warn(
+     "parfun.partition.collection is deprecated and will be removed in a future version, use parfun.py_list.",
+     DeprecationWarning
+ )
+
+ list_by_chunk = by_chunk
+
+ __all__ = ["list_by_chunk"]
parfun/partition/dataframe.py ADDED
@@ -0,0 +1,16 @@
+
+ import warnings
+
+ from parfun.dataframe import by_group, by_row
+
+
+ warnings.warn(
+     "parfun.partition.dataframe is deprecated and will be removed in a future version, use parfun.dataframe.",
+     DeprecationWarning
+ )
+
+ df_by_group = by_group
+
+ df_by_row = by_row
+
+ __all__ = ["df_by_group", "df_by_row"]
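
Both shims warn at import time with `DeprecationWarning`, which CPython hides by default unless the warning is triggered from `__main__`. A small sketch (standard library only) showing how to surface it; note the warning fires only on the first import per interpreter, since later imports hit the module cache:

.. code:: python

    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter("always", DeprecationWarning)
        import parfun.partition.dataframe  # noqa: F401  # emits the module-level DeprecationWarning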
parfun/partition/object.py ADDED
@@ -0,0 +1,50 @@
+ from typing import Callable, Generator, Optional, Tuple, Union
+
+ from parfun.object import PartitionType
+
+ SimplePartitionIterator = Generator[PartitionType, None, None]
+
+ SmartPartitionGenerator = Generator[Optional[Tuple[int, PartitionType]], int, None]
+
+ PartitionGenerator = Union[SimplePartitionIterator[PartitionType], SmartPartitionGenerator[PartitionType]]
+ """
+ All partitioning functions must return a Python generator of this type.
+
+ There are two ways of writing a partitioning function:
+
+ * Use regular Python generators (preferred) or iterators, returning partitioned values:
+
+   .. code:: python
+
+       def partition_list_by_chunks(values: List) -> PartitionGenerator[List]:
+           PARTITION_SIZE = max(1, len(values) // 100)
+
+           for begin in range(0, len(values), PARTITION_SIZE):
+               yield values[begin:begin + PARTITION_SIZE]
+
+
+ * Use partition-size-aware Python generators, or smart generators. These are more complex but more efficient.
+   Smart generators must get a suggested partition size through the return value of the ``yield`` statement, and
+   yield each partition size along with its partitioned values:
+
+   .. code:: python
+
+       def partition_list_by_chunks(values: List, constant: int) -> PartitionGenerator[Tuple[List, int]]:
+           # A first empty call to `yield` is required to obtain the first requested partition size.
+           requested_partition_size = yield None
+
+           begin = 0
+           while begin < len(values):
+               end = min(len(values), begin + requested_partition_size)
+
+               partition_size = end - begin
+               partition = (values[begin:end], constant)
+
+               # Yield the actual partition along with its size, and obtain the requested size for the next one.
+               requested_partition_size = yield partition_size, partition
+
+               begin = end
+
+ """
+
+ PartitionFunction = Callable[..., PartitionGenerator[PartitionType]]
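
To make the send-based protocol concrete, here is a hedged, self-contained sketch of a consumer priming and driving a smart generator by hand. It mirrors what parfun's partition-size estimation machinery does, but `by_chunk_smart` below is illustrative, not a parfun API:

.. code:: python

    from typing import Generator, List, Optional, Tuple

    def by_chunk_smart(values: List[int]) -> Generator[Optional[Tuple[int, List[int]]], int, None]:
        # Smart-generator protocol from the docstring above: prime with `yield None`,
        # then yield (actual_size, partition) and receive the next requested size.
        requested = yield None
        begin = 0
        while begin < len(values):
            end = min(len(values), begin + requested)
            requested = yield end - begin, values[begin:end]
            begin = end

    gen = by_chunk_smart(list(range(10)))
    gen.send(None)       # run to the priming `yield None`
    print(gen.send(4))   # (4, [0, 1, 2, 3])
    print(gen.send(3))   # (3, [4, 5, 6])
    print(gen.send(10))  # (3, [7, 8, 9]): the actual size may be smaller than requested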