opengris-parfun 7.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opengris_parfun-7.3.0.dist-info/METADATA +165 -0
- opengris_parfun-7.3.0.dist-info/RECORD +43 -0
- opengris_parfun-7.3.0.dist-info/WHEEL +5 -0
- opengris_parfun-7.3.0.dist-info/licenses/LICENSE +201 -0
- opengris_parfun-7.3.0.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_parfun-7.3.0.dist-info/licenses/NOTICE +7 -0
- opengris_parfun-7.3.0.dist-info/top_level.txt +1 -0
- parfun/__init__.py +26 -0
- parfun/about.py +1 -0
- parfun/backend/__init__.py +0 -0
- parfun/backend/dask.py +151 -0
- parfun/backend/local_multiprocessing.py +92 -0
- parfun/backend/local_single_process.py +47 -0
- parfun/backend/mixins.py +68 -0
- parfun/backend/profiled_future.py +50 -0
- parfun/backend/scaler.py +226 -0
- parfun/backend/utility.py +7 -0
- parfun/combine/__init__.py +0 -0
- parfun/combine/collection.py +13 -0
- parfun/combine/dataframe.py +13 -0
- parfun/dataframe.py +175 -0
- parfun/decorators.py +135 -0
- parfun/entry_point.py +180 -0
- parfun/functions.py +71 -0
- parfun/kernel/__init__.py +0 -0
- parfun/kernel/function_signature.py +197 -0
- parfun/kernel/parallel_function.py +262 -0
- parfun/object.py +7 -0
- parfun/partition/__init__.py +0 -0
- parfun/partition/api.py +136 -0
- parfun/partition/collection.py +13 -0
- parfun/partition/dataframe.py +16 -0
- parfun/partition/object.py +50 -0
- parfun/partition/primitives.py +317 -0
- parfun/partition/utility.py +54 -0
- parfun/partition_size_estimator/__init__.py +0 -0
- parfun/partition_size_estimator/linear_regression_estimator.py +189 -0
- parfun/partition_size_estimator/mixins.py +22 -0
- parfun/partition_size_estimator/object.py +19 -0
- parfun/profiler/__init__.py +0 -0
- parfun/profiler/functions.py +261 -0
- parfun/profiler/object.py +68 -0
- parfun/py_list.py +56 -0
@@ -0,0 +1,317 @@
from typing import Callable, Optional, Sequence, Tuple, TypeVar, cast

from parfun.partition.object import PartitionGenerator, PartitionType, SimplePartitionIterator, SmartPartitionGenerator

InputPartitionType = TypeVar("InputPartitionType", bound=Tuple)
OutputPartitionType = TypeVar("OutputPartitionType", bound=Tuple)


def partition_map(
    func: Callable[..., OutputPartitionType], generator: PartitionGenerator[InputPartitionType]
) -> PartitionGenerator[OutputPartitionType]:
    """
    Same as Python's built-in ``map()``, but works on partition generators.

    .. code:: python

        partition_map(
            lambda partition_df: partition_df * 2,
            df_by_row(df)
        )

    If the generator is a regular Python generator, the function returns a regular generator. Otherwise, it returns a
    smart generator.
    """

    try:
        first_value = cast(Optional[InputPartitionType], next(generator))

        if first_value is not None:
            # This is a regular generator
            simple_generator = cast(SimplePartitionIterator[InputPartitionType], generator)

            yield func(*first_value)

            while True:
                yield func(*next(simple_generator))
        else:
            smart_generator = cast(SmartPartitionGenerator[InputPartitionType], generator)

            requested_partition_size = yield None

            while True:
                value = smart_generator.send(requested_partition_size)
                _validate_smart_partition_value(value)

                partition_size, partition = value

                requested_partition_size = yield partition_size, func(*partition)
    except StopIteration:
        return


def partition_unit(partition_size: int, partition: PartitionType) -> PartitionGenerator[PartitionType]:
    """Creates a generator returning a single partition."""

    _ = yield None
    yield partition_size, partition


def partition_flatmap(
    func: Callable[[InputPartitionType], PartitionGenerator[OutputPartitionType]],
    generator: PartitionGenerator[InputPartitionType],
) -> PartitionGenerator[OutputPartitionType]:
    """
    Allows the nesting of ``PartitionGenerator``s, similarly to nested for loops:

    .. code:: python

        partition_flatmap(
            lambda partition_df: df_by_row(*partition_df),
            df_by_group(by="year")(df)
        )

    Returns a regular Python generator iff the parent and iterated generators are regular Python generators. Otherwise,
    it returns a smart generator.
    """

    try:
        first_value = cast(Optional[InputPartitionType], next(generator))
    except StopIteration:
        return

    if first_value is not None:
        # The parent generator is a regular generator
        simple_generator = cast(SimplePartitionIterator[InputPartitionType], generator)
        yield from _partition_flatmap_regular_generator(func, first_value, simple_generator)
    else:
        smart_generator = cast(SmartPartitionGenerator[InputPartitionType], generator)
        yield from _partition_flatmap_smart_generator(func, smart_generator)


def _partition_flatmap_regular_generator(
    func: Callable[[InputPartitionType], PartitionGenerator[OutputPartitionType]],
    first_value: InputPartitionType,
    generator: SimplePartitionIterator[InputPartitionType],
) -> PartitionGenerator[OutputPartitionType]:
    """
    `partition_flatmap()` specialisation for parent generators that are regular Python generators.

    The function returns a smart generator iff the iterated function returns smart generators, otherwise it returns a
    regular Python generator.
    """

    def iterate_nested_generator(
        nested_generator: PartitionGenerator[OutputPartitionType], requested_partition_size: Optional[int] = None
    ):
        try:
            first_value = cast(Optional[OutputPartitionType], next(nested_generator))

            if first_value is not None:
                # This is a regular generator
                nested_simple_generator = cast(SimplePartitionIterator[OutputPartitionType], nested_generator)

                if requested_partition_size is not None:
                    raise ValueError(
                        "`partition_flatmap()` doesn't support mixing smart and regular generators in applied function."
                    )

                yield first_value
                yield from nested_simple_generator
            else:
                nested_smart_generator = cast(SmartPartitionGenerator[OutputPartitionType], nested_generator)

                if requested_partition_size is None:  # First nested call value.
                    requested_partition_size = yield None

                while True:
                    value = nested_smart_generator.send(requested_partition_size)
                    _validate_smart_partition_value(value)

                    partition_size, partition = value

                    requested_partition_size = yield partition_size, func(*partition)
        except StopIteration:
            return requested_partition_size

    requested_partition_size = None
    value = first_value

    try:
        while True:
            requested_partition_size = yield from iterate_nested_generator(func(*value), requested_partition_size)
            value = next(generator)
    except StopIteration:
        return


def _partition_flatmap_smart_generator(
    func: Callable[[InputPartitionType], PartitionGenerator[OutputPartitionType]],
    generator: SmartPartitionGenerator[InputPartitionType],
) -> SmartPartitionGenerator[OutputPartitionType]:
    """
    `partition_flatmap()` specialisation for parent generators that are smart generators.

    The function always returns a smart generator.
    """

    def iterate_nested_generator(
        nested_generator: PartitionGenerator[OutputPartitionType],
        requested_partition_size: int,
        parent_partition_size: int,
    ):
        total_size = 0

        try:
            nested_value = next(nested_generator)

            if nested_value is not None:
                # This is a regular nested generator
                nested_simple_generator = cast(SimplePartitionIterator[OutputPartitionType], nested_generator)

                while True:
                    total_size += 1

                    requested_partition_size = yield parent_partition_size, nested_value
                    nested_value = next(nested_simple_generator)
            else:
                # This is a smart nested generator
                nested_smart_generator = cast(SmartPartitionGenerator[OutputPartitionType], nested_generator)

                while True:
                    nested_requested_partition_size = max(1, round(requested_partition_size / parent_partition_size))

                    nested_value = nested_smart_generator.send(nested_requested_partition_size)
                    _validate_smart_partition_value(nested_value)

                    nested_partition_size, nested_partition = nested_value

                    total_size += nested_partition_size

                    requested_partition_size = yield parent_partition_size * nested_partition_size, nested_partition
        except StopIteration:
            return total_size, requested_partition_size

    # Keep track of the nested total size of the previous iteration of the nested function, so that we can
    # estimate the optimal partition size to propagate to the parent's generator.
    total_nested_size = 0
    n_nested = 0

    requested_partition_size = yield None
    parent_requested_partition_size = 1

    try:
        while True:
            value = generator.send(parent_requested_partition_size)
            _validate_smart_partition_value(value)

            parent_partition_size, partition = value

            nested_size, requested_partition_size = yield from iterate_nested_generator(
                func(*partition), requested_partition_size, parent_partition_size
            )

            total_nested_size += nested_size
            n_nested += 1

            avg_nested_size = total_nested_size / n_nested
            parent_requested_partition_size = max(1, round(requested_partition_size / avg_nested_size))
    except StopIteration:
        return


def partition_zip(*generators: PartitionGenerator) -> PartitionGenerator[Tuple]:
    """
    Same as Python's built-in ``zip()``, but works on ``PartitionGenerator``s.
    """

    if len(generators) < 1:
        raise ValueError("at least one partition generator required.")

    try:
        # Detects which generators are partition-size aware

        is_smart = []
        first_values = []

        for generator in generators:
            first_value = next(generator)

            is_smart.append(first_value is None)
            first_values.append(first_value)

        has_smart = any(is_smart)

        # If at least one of the generators is partition-size aware (smart), yields a partition-size aware generator.

        if has_smart:
            requested_partition_size = yield None
        else:
            requested_partition_size = None

        # Collects the first values of the smart generators (we already have the non-smart first values).

        partition_size = None
        for i, generator in enumerate(generators):
            if not is_smart[i]:
                continue

            value = cast(SmartPartitionGenerator, generator).send(requested_partition_size)
            _validate_partition_zip_smart_partition_value(value, partition_size)
            partition_size, first_values[i] = value

        if has_smart:
            requested_partition_size = yield partition_size, tuple(first_values)
        else:
            yield tuple(first_values)

        # Iterates through the next values until one generator finishes.

        while True:
            values = []
            partition_size = None

            for i, generator in enumerate(generators):
                if is_smart[i]:
                    value = cast(SmartPartitionGenerator, generator).send(requested_partition_size)
                    _validate_partition_zip_smart_partition_value(value, partition_size)
                    partition_size, partition = value
                else:
                    partition = next(generator)

                values.append(partition)

            if has_smart:
                requested_partition_size = yield partition_size, tuple(values)
            else:
                yield tuple(values)
    except StopIteration:
        return


def _validate_partition_zip_smart_partition_value(
    partition_value: Tuple[int, PartitionType], partition_size: Optional[int]
):
    """
    Validates the smart partition value iterated by ``partition_zip()``, and validates that its size matches the other
    concurrent partitions' size (``partition_size``).
    """

    _validate_smart_partition_value(partition_value)

    current_partition_size, partition = partition_value

    if partition_size is not None and partition_size != current_partition_size:
        raise ValueError("all smart partition generators should yield identically sized partitions.")

    return current_partition_size, partition


def _validate_smart_partition_value(value):
    if not isinstance(value, Sequence) or len(value) != 2:
        raise ValueError("partition generator should yield a partition with its size.")

    partition_size, _ = value

    if not isinstance(partition_size, int) or partition_size < 1:
        raise ValueError("partition generator should return a strictly positive partition size.")
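
The primitives above distinguish "regular" generators, which simply yield partition tuples, from "smart" generators, which first yield ``None`` and then exchange ``(partition_size, partition)`` pairs through ``send()``. As a rough illustration of the regular case only (a hypothetical sketch, not part of the package; the ``rows`` helper is made up), ``partition_map()`` and ``partition_zip()`` can be driven like ordinary iterators:

    # Hypothetical usage sketch: partitions are tuples, and partition_map()
    # unpacks each tuple into the mapped function's arguments.
    from parfun.partition.primitives import partition_map, partition_zip

    def rows(values):
        # A "regular" partition generator: one single-element tuple per value.
        for v in values:
            yield (v,)

    doubled = partition_map(lambda v: (v * 2,), rows([1, 2, 3]))
    print(list(doubled))    # [(2,), (4,), (6,)]

    zipped = partition_zip(rows([1, 2]), rows([10, 20]))
    print(list(zipped))     # [((1,), (10,)), ((2,), (20,))]
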
@@ -0,0 +1,54 @@
from typing import Callable, Generator, Optional, Union, cast

from parfun.object import PartitionType
from parfun.partition.object import PartitionGenerator, SimplePartitionIterator, SmartPartitionGenerator


def with_partition_size(
    generator: PartitionGenerator[PartitionType], partition_size: Union[int, Callable[[], int]] = 1
) -> Generator[PartitionType, None, None]:
    """
    Runs a partitioning generator without requiring the partition size estimator.

    This function uses the provided partition size input to feed the partitioning generator through Python's
    :py:meth:`generator.send` method, simulating the parallel function's behavior.

    .. code:: python

        # Runs the `by_row` partitioning function with a random partition size generator.
        with_partition_size(
            pf.dataframe.by_row(df_1, df_2),
            partition_size=lambda: random.randint(1, 10)
        )

    :param generator: the partitioning generator to execute
    :param partition_size: a constant partition size, or a function generating partition sizes
    """

    try:
        first_value = cast(Optional[PartitionType], next(generator))

        if first_value is not None:
            # This is a regular generator
            simple_generator = cast(SimplePartitionIterator[PartitionType], generator)

            yield first_value
            yield from simple_generator
        else:
            smart_generator = cast(SmartPartitionGenerator[PartitionType], generator)

            while True:
                if isinstance(partition_size, int):
                    current_partition_size = partition_size
                else:
                    assert callable(partition_size)
                    current_partition_size = partition_size()

                value = smart_generator.send(current_partition_size)

                if value is None or len(value) != 2:
                    raise ValueError("partition generator should yield a partition with its size.")

                yield value[1]
    except StopIteration:
        return
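
For the "smart" case, here is a sketch of how ``with_partition_size()`` might drive a size-aware generator with a fixed size (a hypothetical example, not part of the package; the ``by_chunk`` helper is made up for illustration):

    from parfun.partition.utility import with_partition_size

    def by_chunk(values):
        # Smart protocol as used above: yield None first, then receive the
        # requested size through send() and yield (actual_size, partition) pairs.
        requested_size = yield None
        i = 0
        while i < len(values):
            chunk = values[i:i + requested_size]
            i += len(chunk)
            requested_size = yield len(chunk), (chunk,)

    partitions = with_partition_size(by_chunk([1, 2, 3, 4, 5]), partition_size=2)
    print(list(partitions))  # [([1, 2],), ([3, 4],), ([5],)]
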
File without changes
@@ -0,0 +1,189 @@
import bisect
import logging
from math import ceil
from typing import Callable, List, Optional, Tuple

import attrs
import numpy as np
from attrs.validators import instance_of, is_callable
from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

from parfun.entry_point import get_parallel_backend
from parfun.partition_size_estimator.mixins import PartitionSizeEstimator
from parfun.partition_size_estimator.object import PartitionSizeEstimate, PartitionSizeEstimatorState
from parfun.profiler.object import PartitionedTaskTrace


@attrs.define()
class LinearRegressionCoefficients:
    a: float = attrs.field(validator=instance_of(float))
    b: float = attrs.field(validator=instance_of(float))

    score: float = attrs.field(validator=instance_of(float))

    # The number of traces used to train the estimator.
    trace_count: int = attrs.field(validator=instance_of(int))


@attrs.define()
class LinearRegessionEstimate(PartitionSizeEstimate):
    coefficients: Optional[LinearRegressionCoefficients]


@attrs.define()
class LinearRegessionEstimator(PartitionSizeEstimator[LinearRegessionEstimate]):
    """
    Trains a linear regression model to estimate the optimal partition size, based on the function's initialization
    time and its processing time.
    """

    # Parallel tasks have some constant computational overhead that stays the same whatever the partition size is
    # (e.g. code loading, preprocessing, input checks, initialisation ...).
    #
    # We would not like the parallel functions to spend too much time on these by selecting a small partition size, as
    # this will waste CPU resources while only providing a negligible parallel speedup.
    #
    # This is a tradeoff between computation efficiency and parallelisation. The larger this parameter, the less
    # parallel the task will run, but the more efficiently it will be computed.
    min_parallelism_efficiency: float = attrs.field(validator=instance_of(float), default=0.95)

    # Will partially randomly probe the task's execution profile before making chunk size estimates.
    learning_sample_count: int = attrs.field(validator=instance_of(int), default=5)

    # Will cycle through these partition sizes until the estimator receives `learning_sample_count` samples.
    learning_sample_sizes: List[int] = attrs.field(init=False, default=[64, 8, 96, 32, 256, 1, 128, 48, 4])
    _current_learning_sample: int = attrs.field(init=False, default=0)

    # Will keep up to `max_traces` traces before starting to forget previously added traces.
    max_traces: int = attrs.field(validator=instance_of(int), default=100)

    _run_traces: List[Tuple[int, int]] = attrs.field(init=False, factory=list)

    regressor_factory: Callable[[], BaseEstimator] = attrs.field(
        validator=is_callable(), default=lambda: LinearRegessionEstimator.default_regressor()
    )

    _current_coefficients: Optional[LinearRegressionCoefficients] = attrs.field(default=None)
    _current_estimate: Optional[LinearRegessionEstimate] = attrs.field(default=None)

    def add_partition_trace(self, trace: PartitionedTaskTrace) -> None:
        partition_size = trace.partition_size

        tupled_trace = (partition_size, trace.total_duration // partition_size)

        if len(self._run_traces) < self.max_traces:
            self._run_traces.append(tupled_trace)

            if len(self._run_traces) >= self.max_traces:
                # Next trace, we will have to replace one existing value. Prepare the list for bisect() by sorting.
                self._run_traces.sort(key=lambda t: t[0])
        else:
            # Replaces the existing entry with the closest partition size.
            #
            # As the estimator will converge to similar partition size estimates, this will ensure we keep older but
            # valuable traces from the initial learning phase of the estimator.

            left_idx = bisect.bisect_left(self._run_traces, tupled_trace)
            right_idx = left_idx + 1

            if left_idx <= 0:
                self._run_traces[0] = tupled_trace
            elif right_idx >= len(self._run_traces):
                self._run_traces[-1] = tupled_trace
            else:
                # Replaces the closest value when the value falls between two existing values.
                left_partition_size = self._run_traces[left_idx][0]
                right_partition_size = self._run_traces[right_idx][0]

                if partition_size - left_partition_size < right_partition_size - partition_size:
                    self._run_traces[left_idx] = tupled_trace
                else:
                    self._run_traces[right_idx] = tupled_trace

                assert self._run_traces[left_idx][0] <= self._run_traces[right_idx][0]

        self._current_estimate = None
        self._current_coefficients = None

    def state(self) -> PartitionSizeEstimatorState:
        if len(self._run_traces) < self.learning_sample_count:
            return PartitionSizeEstimatorState.Learning
        else:
            return PartitionSizeEstimatorState.Running

    def coefficients(self) -> LinearRegressionCoefficients:
        """Trains a linear regression ´f(partition_size) = a + b / partition_size´ on the previously recorded task
        runs.

        This estimates fairly accurately the time it takes to process a single item (i.e. a row) of a dataset for a
        given partition size. The larger the partition size, the less the function initialization overhead (`b`)
        weighs compared to the actual processing time of that single item (`a`)."""

        if self._current_coefficients is not None:
            return self._current_coefficients

        regressor = self.regressor_factory()

        numpy_traces = np.array(self._run_traces)
        regressor.fit(numpy_traces[:, 0:1], numpy_traces[:, 1])

        linear_regressor = dict(regressor.steps)["linear"]
        a = linear_regressor.intercept_
        b = linear_regressor.coef_[0]

        score = regressor.score(numpy_traces[:, 0:1], numpy_traces[:, 1])

        self._current_coefficients = LinearRegressionCoefficients(a, b, score, len(self._run_traces))

        return self._current_coefficients

    def estimate(self, dry_run: bool = False) -> LinearRegessionEstimate:
        if self._current_estimate is not None:
            return self._current_estimate

        if self.state() == PartitionSizeEstimatorState.Learning:
            return self._learn_estimate(dry_run=dry_run)

        # Knowing f()'s coefficients, we can accurately compute when the parallel overheads become negligible
        # compared to the actual computation time (`min_parallelism_efficiency`).

        coefficients = self.coefficients()
        a = coefficients.a
        b = coefficients.b
        if b <= 0 or a < 0:
            # TODO: we could use more advanced heuristics, like the error value of the regressor.
            if len(self._run_traces) >= self.max_traces:
                logging.debug("failed to estimate a valid partition size, fallback to learning.")

            return self._learn_estimate(dry_run=dry_run)

        current_backend = get_parallel_backend()

        if current_backend is None:
            raise ValueError("partition size estimator requires a contextual parallel backend instance.")

        # Solves the partition size that satisfies `min_parallelism_efficiency`.
        partition_size = ceil(b / (a * (1 - self.min_parallelism_efficiency)))

        self._current_estimate = LinearRegessionEstimate(partition_size, coefficients)
        return self._current_estimate

    def _learn_estimate(self, dry_run: bool = False) -> LinearRegessionEstimate:
        """Learning estimate. Probes the task execution times before running the actual estimator."""

        partition_size = self.learning_sample_sizes[self._current_learning_sample]

        if not dry_run:
            self._current_learning_sample += 1
            self._current_learning_sample %= len(self.learning_sample_sizes)

        return LinearRegessionEstimate(partition_size, None)

    @staticmethod
    def default_regressor() -> BaseEstimator:
        return Pipeline(
            steps=[("inv", FunctionTransformer(func=lambda xs: 1.0 / xs)), ("linear", LinearRegression(positive=True))]
        )
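
The estimator fits a per-item duration model ``f(partition_size) = a + b / partition_size`` and then solves for the smallest partition size whose amortized per-item overhead stays within ``1 - min_parallelism_efficiency`` of the per-item cost. A small numeric check of that formula, using made-up coefficients:

    # Hypothetical coefficients: a = 2.0 ms per item, b = 100.0 ms constant overhead.
    from math import ceil

    a, b = 2.0, 100.0
    min_parallelism_efficiency = 0.95  # the estimator's default

    partition_size = ceil(b / (a * (1 - min_parallelism_efficiency)))
    print(partition_size)  # 1000 -> the amortized overhead (0.1 ms/item) is 5% of a
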
@@ -0,0 +1,22 @@
import abc
from typing import Generic

import attrs

from parfun.partition_size_estimator.object import PartitionSizeEstimateType, PartitionSizeEstimatorState
from parfun.profiler.object import PartitionedTaskTrace


@attrs.define
class PartitionSizeEstimator(Generic[PartitionSizeEstimateType], metaclass=abc.ABCMeta):
    @abc.abstractmethod
    def add_partition_trace(self, trace: PartitionedTaskTrace) -> None:
        raise NotImplementedError()

    @abc.abstractmethod
    def state(self) -> PartitionSizeEstimatorState:
        raise NotImplementedError()

    @abc.abstractmethod
    def estimate(self, dry_run: bool = False) -> PartitionSizeEstimateType:
        raise NotImplementedError()
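
A minimal concrete estimator implementing this interface might look as follows (an illustrative sketch only, not part of the package): it ignores traces and always suggests the same partition size.

    import attrs

    from parfun.partition_size_estimator.mixins import PartitionSizeEstimator
    from parfun.partition_size_estimator.object import PartitionSizeEstimate, PartitionSizeEstimatorState
    from parfun.profiler.object import PartitionedTaskTrace

    @attrs.define
    class FixedSizeEstimator(PartitionSizeEstimator[PartitionSizeEstimate]):
        fixed_size: int = attrs.field(default=128)

        def add_partition_trace(self, trace: PartitionedTaskTrace) -> None:
            pass  # traces are ignored by this estimator

        def state(self) -> PartitionSizeEstimatorState:
            return PartitionSizeEstimatorState.Running

        def estimate(self, dry_run: bool = False) -> PartitionSizeEstimate:
            return PartitionSizeEstimate(self.fixed_size)
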
@@ -0,0 +1,19 @@
import abc
import enum
from typing import TypeVar

import attrs
from attrs.validators import gt, instance_of


class PartitionSizeEstimatorState(enum.Enum):
    Learning = "learning"
    Running = "running"


@attrs.define
class PartitionSizeEstimate(metaclass=abc.ABCMeta):
    value: int = attrs.field(validator=(instance_of(int), gt(0)))


PartitionSizeEstimateType = TypeVar("PartitionSizeEstimateType", bound=PartitionSizeEstimate)
File without changes