opengris-parfun 7.3.0 (opengris_parfun-7.3.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opengris_parfun-7.3.0.dist-info/METADATA +165 -0
- opengris_parfun-7.3.0.dist-info/RECORD +43 -0
- opengris_parfun-7.3.0.dist-info/WHEEL +5 -0
- opengris_parfun-7.3.0.dist-info/licenses/LICENSE +201 -0
- opengris_parfun-7.3.0.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_parfun-7.3.0.dist-info/licenses/NOTICE +7 -0
- opengris_parfun-7.3.0.dist-info/top_level.txt +1 -0
- parfun/__init__.py +26 -0
- parfun/about.py +1 -0
- parfun/backend/__init__.py +0 -0
- parfun/backend/dask.py +151 -0
- parfun/backend/local_multiprocessing.py +92 -0
- parfun/backend/local_single_process.py +47 -0
- parfun/backend/mixins.py +68 -0
- parfun/backend/profiled_future.py +50 -0
- parfun/backend/scaler.py +226 -0
- parfun/backend/utility.py +7 -0
- parfun/combine/__init__.py +0 -0
- parfun/combine/collection.py +13 -0
- parfun/combine/dataframe.py +13 -0
- parfun/dataframe.py +175 -0
- parfun/decorators.py +135 -0
- parfun/entry_point.py +180 -0
- parfun/functions.py +71 -0
- parfun/kernel/__init__.py +0 -0
- parfun/kernel/function_signature.py +197 -0
- parfun/kernel/parallel_function.py +262 -0
- parfun/object.py +7 -0
- parfun/partition/__init__.py +0 -0
- parfun/partition/api.py +136 -0
- parfun/partition/collection.py +13 -0
- parfun/partition/dataframe.py +16 -0
- parfun/partition/object.py +50 -0
- parfun/partition/primitives.py +317 -0
- parfun/partition/utility.py +54 -0
- parfun/partition_size_estimator/__init__.py +0 -0
- parfun/partition_size_estimator/linear_regression_estimator.py +189 -0
- parfun/partition_size_estimator/mixins.py +22 -0
- parfun/partition_size_estimator/object.py +19 -0
- parfun/profiler/__init__.py +0 -0
- parfun/profiler/functions.py +261 -0
- parfun/profiler/object.py +68 -0
- parfun/py_list.py +56 -0
parfun/decorators.py
ADDED

@@ -0,0 +1,135 @@

"""
A decorator that helps users run their functions in parallel.
"""

import importlib
import warnings
from functools import wraps
from typing import Callable, Iterable, Optional, Tuple, Union

from parfun.kernel.function_signature import NamedArguments
from parfun.kernel.parallel_function import ParallelFunction
from parfun.object import FunctionInputType, FunctionOutputType
from parfun.partition.object import PartitionGenerator
from parfun.partition_size_estimator.linear_regression_estimator import LinearRegessionEstimator
from parfun.partition_size_estimator.mixins import PartitionSizeEstimator


def parallel(
    split: Callable[[NamedArguments], Tuple[NamedArguments, PartitionGenerator[NamedArguments]]],
    combine_with: Callable[[Iterable[FunctionOutputType]], FunctionOutputType],
    initial_partition_size: Optional[Union[int, Callable[[FunctionInputType], int]]] = None,
    fixed_partition_size: Optional[Union[int, Callable[[FunctionInputType], int]]] = None,
    profile: bool = False,
    trace_export: Optional[str] = None,
    partition_size_estimator_factory: Callable[[], PartitionSizeEstimator] = LinearRegessionEstimator,
) -> Callable:
    """
    Returns a function decorator that automatically parallelizes a function.

    .. code:: python

        @pf.parallel(
            split=pf.per_argument(
                values=pf.py_list.by_chunk,
            ),
            combine_with=pf.py_list.concat
        )
        def multiply_by_constant(values: Iterable[int], constant: int):
            return [v * constant for v in values]

        # This is functionally equivalent to running the function inside a single for loop:

        results = []
        for partition in pf.py_list.by_chunk(values):
            results.append(multiply_by_constant(partition, constant))

        return pf.py_list.concat(results)

    :param split:
        Partitions the data using the provided partitioning function.

        See :py:mod:`~parfun.partition.api` for the list of predefined partitioning functions.

    :param combine_with: aggregates the partitioned results into the function's final result.
    :type combine_with: Callable
    :param initial_partition_size:
        Overrides the first estimate of the partition size estimator.

        If the value is a callable, the function will be provided with the input to be partitioned and shall return
        the initial partition size to use.
    :type initial_partition_size: int | Callable[[PartitionType], int] | None
    :param fixed_partition_size:
        Uses a constant partition size and does not run the partition size estimator.

        If the value is a callable, the function will be provided with the input to be partitioned and shall return
        the partition size to use.
    :type fixed_partition_size: int | Callable[[PartitionType], int] | None
    :param profile: if true, prints additional debugging information about the parallelization overhead.
    :type profile: bool
    :param trace_export: if defined, exports the execution times to the provided CSV file path.
    :type trace_export: str
    :param partition_size_estimator_factory: the partition size estimator class to use.
    :type partition_size_estimator_factory: Callable[[], PartitionSizeEstimator]

    :return: a decorated function
    :rtype: Callable
    """

    def decorator(function: Callable[[FunctionInputType], FunctionOutputType]):
        # Initializes a ParallelFunction object that handles the parallel computations automatically.
        parallel_function = ParallelFunction(
            function=function,
            function_name=function.__name__,
            split=split,
            combine_with=combine_with,
            initial_partition_size=initial_partition_size,
            fixed_partition_size=fixed_partition_size,
            profile=profile,
            trace_export=trace_export,
            partition_size_estimator_factory=partition_size_estimator_factory,
        )

        @wraps(function)
        def wrapped(*args, **kwargs):
            # Remark: we cannot decorate `parallel_function` with `wraps` directly as it's not a regular function.
            return parallel_function(*args, **kwargs)

        # Renames the original function as "_{function_name}_sequential" and adds it to the same module.
        # This is required as Pickle requires all serialized functions to be accessible from a qualified module,
        # which will not be the case for the original function as it gets overridden by the decorator.
        if function.__module__ is not None:
            module = importlib.import_module(function.__module__)
            name = f"_{function.__name__}_sequential"
            parent_qualname, parent_separator, old_qualname = function.__qualname__.rpartition(".")
            qualname = f"{parent_qualname}{parent_separator}_{old_qualname}_sequential"
            setattr(module, name, function)
            getattr(module, name).__name__ = name
            getattr(module, name).__qualname__ = qualname

        return wrapped

    return decorator


def parfun(
    split: Callable[[NamedArguments], Tuple[NamedArguments, PartitionGenerator[NamedArguments]]],
    combine_with: Callable[[Iterable[FunctionOutputType]], FunctionOutputType],
    initial_partition_size: Optional[Union[int, Callable[[FunctionInputType], int]]] = None,
    fixed_partition_size: Optional[Union[int, Callable[[FunctionInputType], int]]] = None,
    profile: bool = False,
    trace_export: Optional[str] = None,
    partition_size_estimator_factory: Callable[[], PartitionSizeEstimator] = LinearRegessionEstimator,
) -> Callable:
    warnings.warn("parfun() is deprecated and will be removed in a future version.", DeprecationWarning)

    return parallel(
        split=split,
        combine_with=combine_with,
        initial_partition_size=initial_partition_size,
        fixed_partition_size=fixed_partition_size,
        profile=profile,
        trace_export=trace_export,
        partition_size_estimator_factory=partition_size_estimator_factory,
    )
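Usage sketch (illustrative, not part of the package): the snippet below shows how the decorator above combines with a backend. It assumes the top-level `parfun` module re-exports `parallel`, `per_argument` and `py_list` as used in the docstring example (`import parfun as pf`); the backend context manager is imported from `parfun.entry_point`, which is added by this same release.

from typing import Iterable, List

import parfun as pf
from parfun.entry_point import set_parallel_backend_context


@pf.parallel(
    split=pf.per_argument(values=pf.py_list.by_chunk),
    combine_with=pf.py_list.concat,
)
def multiply_by_constant(values: Iterable[int], constant: int) -> List[int]:
    return [v * constant for v in values]


if __name__ == "__main__":
    # A parallel backend must be set; otherwise the decorated function runs sequentially.
    with set_parallel_backend_context("local_multiprocessing"):
        print(multiply_by_constant(list(range(10)), 2))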
parfun/entry_point.py
ADDED

@@ -0,0 +1,180 @@

"""
APIs to manage backends and integrate the toolkit with other projects.
"""

import argparse
import atexit
import contextlib
import logging
import os
from contextvars import ContextVar, Token
from typing import Callable, Dict, Optional, Union

from parfun.backend.local_multiprocessing import LocalMultiprocessingBackend
from parfun.backend.local_single_process import LocalSingleProcessBackend
from parfun.backend.mixins import BackendEngine

_backend_engine: ContextVar[Optional[BackendEngine]] = ContextVar("_backend_engine", default=None)

BACKEND_REGISTRY: Dict[str, Callable] = {
    "none": lambda *_args, **_kwargs: None,
    "local_single_process": LocalSingleProcessBackend,
    "local_multiprocessing": LocalMultiprocessingBackend,
}

try:
    from parfun.backend.dask import DaskCurrentBackend, DaskLocalClusterBackend, DaskRemoteClusterBackend

    BACKEND_REGISTRY["dask_local"] = DaskLocalClusterBackend
    BACKEND_REGISTRY["dask_remote"] = DaskRemoteClusterBackend
    BACKEND_REGISTRY["dask_current"] = DaskCurrentBackend
except ImportError:
    logging.debug("Dask backends disabled. Use `pip install 'opengris-parfun[dask]'` to install Dask dependencies.")

try:
    from parfun.backend.scaler import ScalerLocalBackend, ScalerRemoteBackend

    BACKEND_REGISTRY["scaler_local"] = ScalerLocalBackend
    BACKEND_REGISTRY["scaler_remote"] = ScalerRemoteBackend
except ImportError:
    logging.debug(
        "Scaler backends disabled. Use `pip install 'opengris-parfun[scaler]'` to install Scaler dependencies."
    )


def set_parallel_backend(backend: Union[str, BackendEngine], *args, **kwargs) -> None:
    """
    Initializes and sets the current parfun backend.

    .. code:: python

        set_parallel_backend("local_multiprocessing", max_workers=4, is_process=False)

    :param backend:
        Supported backend options:

        * ``"none"``: disables the current parallel backend.

          Functions decorated with :py:func:`~parfun.decorators.parfun` will run sequentially as if not decorated.
          Partitioning and combining functions will be ignored.

        * ``"local_single_process"``: runs the tasks inside the calling Python process.

          Functions decorated with :py:func:`~parfun.decorators.parfun` will partition the input data and run the
          combining function on the output data, but will execute the function itself inside the calling Python
          process. Mostly intended for debugging purposes.

          See :py:mod:`~parfun.backend.local_single_process.LocalSingleProcessBackend`.

        * ``"local_multiprocessing"``: runs the tasks in parallel using Python ``multiprocessing`` processes.

          See :py:mod:`~parfun.backend.local_multiprocessing.LocalMultiprocessingBackend`.

        * ``"scaler_local"``: runs the tasks in parallel using an internally managed Scaler cluster.

          See :py:mod:`~parfun.backend.scaler.ScalerLocalBackend`.

        * ``"scaler_remote"``: runs the tasks in parallel using an externally managed Scaler cluster.

          See :py:mod:`~parfun.backend.scaler.ScalerRemoteBackend`.

        * ``"dask_local"``: runs the tasks in parallel using an internally managed Dask cluster.

          See :py:mod:`~parfun.backend.dask.DaskLocalClusterBackend`.

        * ``"dask_remote"``: runs the tasks in parallel using an externally managed Dask cluster.

          See :py:mod:`~parfun.backend.dask.DaskRemoteClusterBackend`.

        * ``"dask_current"``: runs the tasks in parallel using the currently running Dask client
          (:py:func:`~distributed.get_client`).

          See :py:mod:`~parfun.backend.dask.DaskCurrentBackend`.

    :type backend: Union[str, BackendEngine]

    :param args: Additional positional parameters for the backend constructor.
    :param kwargs: Additional keyword parameters for the backend constructor.
    :rtype: None
    """
    _cleanup_current_backend()
    _set_parallel_backend(backend, *args, **kwargs)


@contextlib.contextmanager
def set_parallel_backend_context(backend: Union[str, BackendEngine], *args, **kwargs):
    """
    Sets a new parallel backend instance within a context manager's scope.

    .. code:: python

        with set_parallel_backend_context("local_single_process"):
            some_parallel_computation()

    :param backend: See :py:func:`set_parallel_backend`.
    :type backend: Union[str, BackendEngine]
    """
    token = _set_parallel_backend(backend, *args, **kwargs)
    try:
        yield
    finally:
        _cleanup_current_backend()

        _backend_engine.reset(token)


def get_parallel_backend() -> Optional[BackendEngine]:
    """
    :return: the current backend instance, or :py:obj:`None` if no backend is currently set.
    :rtype: Optional[BackendEngine]
    """
    return _backend_engine.get()


def add_parallel_options(parser: argparse.ArgumentParser) -> None:
    """
    Adds the argparse options required to initialize this parallel toolkit.

    :type parser: argparse.ArgumentParser
    :rtype: None
    """
    group = parser.add_argument_group()
    group.add_argument(
        "--parallel-backend",
        type=str,
        choices=list(BACKEND_REGISTRY.keys()),
        default="local_multiprocessing",
        help="The backend engine selected to run code. If 'none', disables parallel computations.",
    )


def _set_parallel_backend(backend: Union[str, BackendEngine], *args, **kwargs) -> Token:
    if isinstance(backend, BackendEngine):
        if len(args) > 0 or len(kwargs) > 0:
            raise ValueError("Cannot pass additional arguments when passing a backend instance")

        backend_instance = backend
        backend_name = backend.__class__.__name__
    elif backend in BACKEND_REGISTRY:
        backend_instance = BACKEND_REGISTRY[backend](*args, **kwargs)
        backend_name = backend
    else:
        raise ValueError(f"Supported parallel backends are: {set(BACKEND_REGISTRY.keys())}")

    if backend != "none":
        # Limits NumPy's OpenBLAS to a single thread so that each worker process only uses one thread, which makes
        # resources easier to manage.
        os.environ["OPENBLAS_NUM_THREADS"] = "1"

    logging.info(f"Set up parallel backend: {backend_name}")

    return _backend_engine.set(backend_instance)


@atexit.register
def _cleanup_current_backend():
    engine = _backend_engine.get()
    if engine is not None:
        engine.shutdown()
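Integration sketch (illustrative, not part of the diff): `add_parallel_options()` registers a `--parallel-backend` flag whose argparse destination is `parallel_backend`, and the selected backend name can then be handed to `set_parallel_backend()`.

import argparse

from parfun.entry_point import add_parallel_options, set_parallel_backend


def main() -> None:
    parser = argparse.ArgumentParser(description="Example program using parfun.")
    add_parallel_options(parser)
    args = parser.parse_args()

    # E.g. `python example.py --parallel-backend local_multiprocessing`
    set_parallel_backend(args.parallel_backend)

    # ... run parfun-decorated functions here ...


if __name__ == "__main__":
    main()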
parfun/functions.py
ADDED

@@ -0,0 +1,71 @@

import collections
import logging
from typing import Any, Callable, Deque, Iterable, Optional, Tuple

from parfun.backend.mixins import BackendSession, ProfiledFuture
from parfun.entry_point import get_parallel_backend


def parallel_map(func: Callable, *iterables, backend_session: Optional[BackendSession] = None) -> Iterable:
    """
    Similar to :py:func:`concurrent.futures.Executor.map()` but lazily consumes and returns the iterators' content as
    worker nodes become available.

    .. code:: python

        parallel_map(math.sqrt, [4, 9, 16, 25])  # [2.0, 3.0, 4.0, 5.0]

        parallel_map(operator.add, [10, 7, 15], [12, 15, 5])  # [22, 22, 20]

    :param backend_session: the parallel backend session. If `None`, creates a new session from the current backend.
    """

    # Uses a generator function, so that we can use deque.pop() and thus discard the no longer required futures'
    # references as we yield them.
    def result_generator(backend_session: BackendSession):
        futures: Deque[ProfiledFuture] = collections.deque()
        try:
            for args in zip(*iterables):
                futures.append(backend_session.submit(func, *args))

                # Yields any finished future from the head of the queue.
                while len(futures) > 0 and futures[0].done():
                    yield futures.popleft().result()

            # Yields the remaining results.
            while len(futures) > 0:
                yield futures.popleft().result()
        finally:
            # On any failure, cancels all unfinished tasks.
            for future in futures:
                future.cancel()

    if backend_session is None:
        current_backend = get_parallel_backend()

        if current_backend is None:
            logging.warning(f"no parallel backend engine set, running `{func.__name__}()` sequentially.")
            return map(func, *iterables)

        with current_backend.session() as current_backend_session:
            return result_generator(current_backend_session)
    else:
        return result_generator(backend_session)


def parallel_starmap(
    func: Callable,
    iterable: Iterable[Tuple[Any, ...]],
    backend_session: Optional[BackendSession] = None
) -> Iterable:
    """
    Similar to :py:func:`itertools.starmap` but lazily consumes and returns the iterator's content as worker nodes
    become available.

    .. code:: python

        parallel_starmap(operator.add, [(10, 12), (7, 15), (15, 5)])  # [22, 22, 20]

    """
    yield from parallel_map(func, *zip(*iterable), backend_session=backend_session)
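Usage sketch (illustrative; it assumes a backend has been configured through `parfun.entry_point`, otherwise both calls fall back to a sequential `map()` with a warning, as the code above shows):

import math
import operator

from parfun.entry_point import set_parallel_backend
from parfun.functions import parallel_map, parallel_starmap

set_parallel_backend("local_multiprocessing")

# Both functions return lazy iterables; list() forces the evaluation.
print(list(parallel_map(math.sqrt, [4, 9, 16, 25])))                       # [2.0, 3.0, 4.0, 5.0]
print(list(parallel_starmap(operator.add, [(10, 12), (7, 15), (15, 5)])))  # [22, 22, 20]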
File without changes

parfun/kernel/function_signature.py
ADDED
@@ -0,0 +1,197 @@

import collections
import inspect
from inspect import Parameter
from typing import Any, Callable, Dict, Optional, OrderedDict, Set, Tuple, Type

import attrs


@attrs.define(frozen=True)
class FunctionSignature:
    """
    Helper class to inspect a function's parameter and return types.

    In Python 3.8+, whether an argument is positional only or keyword only can be specified using the / and * syntax,
    respectively. As an example:

        def f(pos1, pos2, /, pos_or_kwd, *, kwd1, kwd2):
              -----------    ----------     ----------
                   |              |              |
                   |       Positional or        |
                   |          keyword      Keyword only
            Positional only

    1. Everything before / is positional only.
    2. Everything after * is keyword only.
    3. Note that order matters: / must come before *.
    4. If you don't explicitly specify positional only or keyword only through this syntax, all arguments are of the
       positional or keyword kind.
    """

    args: OrderedDict[str, inspect.Parameter] = attrs.field()
    kwargs: Dict[str, inspect.Parameter] = attrs.field()

    has_var_arg: bool = attrs.field()
    has_var_kwarg: bool = attrs.field()

    return_type: Optional[Type] = attrs.field()

    @classmethod
    def from_function(cls, function: Callable) -> "FunctionSignature":
        signature = inspect.signature(function)

        if signature.return_annotation not in (inspect.Signature.empty, None):
            return_type = signature.return_annotation
        else:
            return_type = None

        parameters = list(signature.parameters.values())

        args = collections.OrderedDict(
            (p.name, p) for p in parameters if p.kind in [Parameter.POSITIONAL_OR_KEYWORD, Parameter.POSITIONAL_ONLY]
        )
        kwargs = {p.name: p for p in parameters if p.kind in [Parameter.POSITIONAL_OR_KEYWORD, Parameter.KEYWORD_ONLY]}

        has_var_arg = any(p.kind == Parameter.VAR_POSITIONAL for p in parameters)
        has_var_kwarg = any(p.kind == Parameter.VAR_KEYWORD for p in parameters)

        return cls(
            args=args, kwargs=kwargs, has_var_arg=has_var_arg, has_var_kwarg=has_var_kwarg, return_type=return_type
        )

    def assign(self, args, kwargs) -> "NamedArguments":
        """
        Categorizes and names the ``args`` and ``kwargs`` arguments based on the function signature.

        Raises an exception if the arguments do not match the function's signature.

        :returns: the assigned positional, keyword and variable parameters.
        """

        # Assigns positional arguments.

        named_args = collections.OrderedDict(
            (arg_type.name, arg_value) for arg_type, arg_value in zip(self.args.values(), args)
        )

        if len(args) > len(self.args):
            if self.has_var_arg:
                var_args = tuple(args[len(named_args):])
            else:
                raise ValueError(f"expected {len(self.args)} arguments, got {len(args)}.")
        else:
            unassigned_args = [
                a
                for a in list(self.args.values())[len(args):]
                if a.kind == Parameter.POSITIONAL_ONLY and a.default == Parameter.empty
            ]
            if len(unassigned_args) > 0:
                unassigned_arg_names = ", ".join(a.name for a in unassigned_args)
                raise ValueError(f"unassigned positional parameter(s): {unassigned_arg_names}.")

            var_args = tuple()

        # Assigns keyword arguments.

        double_assigned_args = [a for a in kwargs.keys() if a in named_args]
        if len(double_assigned_args) > 0:
            double_assigned_arg_names = ", ".join(a for a in double_assigned_args)
            raise ValueError(f"parameter(s) assigned twice: {double_assigned_arg_names}.")

        if not self.has_var_kwarg:
            invalid_kwargs = [a for a in kwargs.keys() if a not in self.kwargs]
            if len(invalid_kwargs) > 0:
                invalid_kwarg_names = ", ".join(a for a in invalid_kwargs)
                raise ValueError(f"invalid keyword parameter(s): {invalid_kwarg_names}.")

        unassigned_kwargs = [
            a
            for a in self.kwargs.values()
            if a.default == Parameter.empty and a.name not in named_args and a.name not in kwargs
        ]
        if len(unassigned_kwargs) > 0:
            unassigned_kwarg_names = ", ".join(a.name for a in unassigned_kwargs)
            raise ValueError(f"unassigned keyword parameter(s): {unassigned_kwarg_names}.")

        return NamedArguments(args=named_args, kwargs=kwargs, var_args=var_args)


@attrs.define(frozen=True)
class NamedArguments:
    """Contains the argument values of a function call, but associated with their respective names, based on the
    function's signature."""

    args: OrderedDict[str, Any] = attrs.field(factory=OrderedDict)
    kwargs: Dict[str, Any] = attrs.field(factory=dict)

    var_args: Tuple = attrs.field(default=tuple())

    def __getitem__(self, name: str) -> Any:
        """Gets the value of an argument by name."""

        if name in self.args:
            return self.args[name]
        elif name in self.kwargs:
            return self.kwargs[name]
        else:
            raise KeyError(f"unknown argument name: {name}.")

    def as_args_kwargs(self) -> Tuple[Tuple, Dict[str, Any]]:
        """Returns a tuple of positional and keyword parameters that can be used to call the function."""

        return self.var_args, {**self.args, **self.kwargs}

    def keys(self) -> Set[str]:
        """Returns all argument names."""

        keys = set(self.args.keys())
        keys.update(self.kwargs.keys())
        return keys

    def split(self, arg_names: Set[str]) -> Tuple["NamedArguments", "NamedArguments"]:
        """Returns the subset of the arguments that matches the argument names, and those that do not."""

        includes = NamedArguments(
            args=OrderedDict((name, value) for name, value in self.args.items() if name in arg_names),
            kwargs={name: value for name, value in self.kwargs.items() if name in arg_names},
            var_args=tuple(),
        )
        excludes = NamedArguments(
            args=OrderedDict((name, value) for name, value in self.args.items() if name not in arg_names),
            kwargs={name: value for name, value in self.kwargs.items() if name not in arg_names},
            var_args=self.var_args,
        )

        return includes, excludes

    def reassigned(self, **changes) -> "NamedArguments":
        """Returns a new ``NamedArguments`` object with some of the values reassigned.

        .. code:: python

            named_args.reassigned(arg_1="new_value", arg_2="new_value")

        """

        args = self.args.copy()
        kwargs = self.kwargs.copy()

        for arg_name, arg_value in changes.items():
            if arg_name in args:
                args[arg_name] = arg_value
            elif arg_name in kwargs:
                kwargs[arg_name] = arg_value
            else:
                raise ValueError(f"invalid argument key: `{arg_name}`.")

        return attrs.evolve(self, args=args, kwargs=kwargs)

    def merge(self, other: "NamedArguments") -> "NamedArguments":
        """Returns a new ``NamedArguments`` object with the values of both objects merged."""

        args = self.args.copy()
        args.update(other.args)

        return NamedArguments(
            args=args, kwargs={**self.kwargs, **other.kwargs}, var_args=self.var_args + other.var_args
        )
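Illustrative sketch of how these signature helpers compose (the sample function and argument values are made up; only methods defined above are used):

from parfun.kernel.function_signature import FunctionSignature


def scale(values, factor, *, offset=0):
    return [v * factor + offset for v in values]


signature = FunctionSignature.from_function(scale)

# Name the call's arguments according to the signature.
named = signature.assign(args=([1, 2, 3], 2), kwargs={"offset": 1})
print(named["factor"])  # 2

# Separate the argument(s) to be partitioned from the ones shared by every partition.
partitioned, shared = named.split({"values"})

# Rebuild a concrete call from the named arguments.
args, kwargs = named.as_args_kwargs()
print(scale(*args, **kwargs))  # [3, 5, 7]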