opengris_parfun-7.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. opengris_parfun-7.3.0.dist-info/METADATA +165 -0
  2. opengris_parfun-7.3.0.dist-info/RECORD +43 -0
  3. opengris_parfun-7.3.0.dist-info/WHEEL +5 -0
  4. opengris_parfun-7.3.0.dist-info/licenses/LICENSE +201 -0
  5. opengris_parfun-7.3.0.dist-info/licenses/LICENSE.spdx +7 -0
  6. opengris_parfun-7.3.0.dist-info/licenses/NOTICE +7 -0
  7. opengris_parfun-7.3.0.dist-info/top_level.txt +1 -0
  8. parfun/__init__.py +26 -0
  9. parfun/about.py +1 -0
  10. parfun/backend/__init__.py +0 -0
  11. parfun/backend/dask.py +151 -0
  12. parfun/backend/local_multiprocessing.py +92 -0
  13. parfun/backend/local_single_process.py +47 -0
  14. parfun/backend/mixins.py +68 -0
  15. parfun/backend/profiled_future.py +50 -0
  16. parfun/backend/scaler.py +226 -0
  17. parfun/backend/utility.py +7 -0
  18. parfun/combine/__init__.py +0 -0
  19. parfun/combine/collection.py +13 -0
  20. parfun/combine/dataframe.py +13 -0
  21. parfun/dataframe.py +175 -0
  22. parfun/decorators.py +135 -0
  23. parfun/entry_point.py +180 -0
  24. parfun/functions.py +71 -0
  25. parfun/kernel/__init__.py +0 -0
  26. parfun/kernel/function_signature.py +197 -0
  27. parfun/kernel/parallel_function.py +262 -0
  28. parfun/object.py +7 -0
  29. parfun/partition/__init__.py +0 -0
  30. parfun/partition/api.py +136 -0
  31. parfun/partition/collection.py +13 -0
  32. parfun/partition/dataframe.py +16 -0
  33. parfun/partition/object.py +50 -0
  34. parfun/partition/primitives.py +317 -0
  35. parfun/partition/utility.py +54 -0
  36. parfun/partition_size_estimator/__init__.py +0 -0
  37. parfun/partition_size_estimator/linear_regression_estimator.py +189 -0
  38. parfun/partition_size_estimator/mixins.py +22 -0
  39. parfun/partition_size_estimator/object.py +19 -0
  40. parfun/profiler/__init__.py +0 -0
  41. parfun/profiler/functions.py +261 -0
  42. parfun/profiler/object.py +68 -0
  43. parfun/py_list.py +56 -0
parfun/kernel/parallel_function.py ADDED
@@ -0,0 +1,262 @@
+ import collections
+ import logging
+ from inspect import Parameter, currentframe
+ from itertools import chain
+ from typing import Callable, Deque, Generator, Iterable, Optional, Tuple, Union
+
+ import attrs
+
+ from parfun.backend.mixins import BackendEngine, ProfiledFuture
+ from parfun.entry_point import get_parallel_backend, set_parallel_backend_context
+ from parfun.kernel.function_signature import FunctionSignature, NamedArguments
+ from parfun.object import FunctionInputType, FunctionOutputType, PartitionType
+ from parfun.partition.object import PartitionGenerator
+ from parfun.partition_size_estimator.linear_regression_estimator import LinearRegessionEstimator
+ from parfun.partition_size_estimator.mixins import PartitionSizeEstimator
+ from parfun.profiler.functions import (
+     export_task_trace, print_profile_trace, timed_combine_with, timed_function, timed_partition,
+ )
+ from parfun.profiler.object import PartitionedTaskTrace, TraceTime
+
+
+ @attrs.define
+ class ParallelFunction:
+     """Wraps a function so that it executes in parallel using a map-reduce/scatter-gather approach.
+
+     See the `@parallel()` decorator for a more user-friendly interface.
+     """
+
+     function: Callable[[FunctionInputType], FunctionOutputType] = attrs.field()
+
+     function_name: str = attrs.field()
+
+     split: Callable[[NamedArguments], Tuple[NamedArguments, PartitionGenerator[NamedArguments]]] = attrs.field()
+
+     combine_with: Callable[[Iterable[FunctionOutputType]], FunctionOutputType] = attrs.field()
+
+     initial_partition_size: Optional[Union[int, Callable[[FunctionInputType], int]]] = attrs.field(default=None)
+     fixed_partition_size: Optional[Union[int, Callable[[FunctionInputType], int]]] = attrs.field(default=None)
+
+     profile: bool = attrs.field(default=None)
+     trace_export: Optional[str] = attrs.field(default=None)
+
+     partition_size_estimator_factory: Callable[[], PartitionSizeEstimator] = attrs.field(
+         default=LinearRegessionEstimator
+     )
+
+     _partition_size_estimator: Optional[PartitionSizeEstimator] = attrs.field(init=False, default=None)
+     _function_signature: FunctionSignature = attrs.field(init=False)
+
+     def __attrs_post_init__(self) -> None:
+         self._function_signature = FunctionSignature.from_function(self.function)
+
+         if self.initial_partition_size is not None and self.fixed_partition_size is not None:
+             raise ValueError("`initial_partition_size` and `fixed_partition_size` cannot be set simultaneously.")
+
+         if self.fixed_partition_size is None:
+             self._partition_size_estimator = self.partition_size_estimator_factory()
+
+         self._validate_function_signature()
+
+     @initial_partition_size.validator
+     @fixed_partition_size.validator
+     def _partition_size_validator(self, attribute, value):
+         if value is not None and not isinstance(value, int) and not callable(value):
+             raise ValueError(f"`{attribute.name}` should be either an integer, a callable or `None`.")
+
+     def _validate_function_signature(self):
+         if self._function_signature.has_var_arg or self._function_signature.has_var_kwarg:
+             return
+
+         if any(arg.kind == Parameter.POSITIONAL_ONLY for arg in self._function_signature.args.values()):
+             raise ValueError("parfun toolkit does not support positional-only parameters yet.")
+
+     def __call__(self, *args, **kwargs) -> FunctionOutputType:
+         current_backend = get_parallel_backend()
+         allows_nested_tasks = current_backend is not None and current_backend.allows_nested_tasks()
+
+         # Note: the is_nested_parallelism() check should appear before any backend check, as unsupported
+         # nested function calls will have an empty backend setup.
+         if is_nested_parallelism() and not allows_nested_tasks:
+             logging.debug(
+                 f"backend does not support nested parallelism. Running {self.function.__name__} sequentially."
+             )
+             return self.function(*args, **kwargs)
+
+         if current_backend is None:
+             logging.warning(f"no parallel backend engine set, running `{self.function_name}(...)` sequentially.")
+             return self.function(*args, **kwargs)
+
+         # Assign a name to each argument based on the decorated function's signature.
+
+         named_args = self._function_signature.assign(args, kwargs)
+
+         # Initialize the partition generator.
+
+         non_partitioned_args, partition_generator = self.split(named_args)
+
+         initial_partition_size, fixed_partition_size = self._get_user_partition_sizes(args, kwargs)
+
+         partitions = timed_partition(
+             partition_generator, self._partition_size_estimator, initial_partition_size, fixed_partition_size
+         )
+
+         # Execute the function.
+
+         if allows_nested_tasks:
+             nested_backend = current_backend
+         else:
+             nested_backend = None
+
+         results = run_function_on_partitions(
+             self.function,
+             non_partitioned_args,
+             partitions,
+             current_backend,
+             nested_backend,
+         )
+
+         # Combine the results.
+
+         combined_result, task_trace = timed_combine_with(self.combine_with, self._partition_size_estimator, results)
+
+         if self.profile:
+             print_profile_trace(self.function, self.function_name, self._partition_size_estimator, task_trace)
+
+         if self.trace_export:
+             export_task_trace(self.trace_export, task_trace)
+
+         logging.info(
+             f"Ran `{self.function_name}(...)` with {task_trace.partition_count} "
+             f"sub-tasks using backend {current_backend.__class__} successfully."
+         )
+
+         return combined_result
+
+     def _get_user_partition_sizes(self, args, kwargs) -> Tuple[Optional[int], Optional[int]]:
+         """Returns the initial partition size and the fixed partition size for the calling function's arguments."""
+
+         if callable(self.initial_partition_size):
+             initial_partition_size = self.initial_partition_size(*args, **kwargs)
+         else:
+             initial_partition_size = self.initial_partition_size
+
+         if callable(self.fixed_partition_size):
+             fixed_partition_size = self.fixed_partition_size(*args, **kwargs)
+         else:
+             fixed_partition_size = self.fixed_partition_size
+
+         return initial_partition_size, fixed_partition_size
+
+
+ def is_nested_parallelism():
+     """Returns True if there is any call to `run_function_in_worker()` in the current call stack."""
+
+     frame = currentframe()
+     while frame is not None:
+         if frame.f_code.co_name == run_function_in_worker.__name__ and frame.f_code.co_filename == __file__:
+             return True
+         frame = frame.f_back
+     return False
+
+
+ def run_function_on_partitions(
+     function: Callable[[PartitionType], FunctionOutputType],
+     non_partitioned_args: NamedArguments,
+     partitions: Generator[Tuple[NamedArguments, PartitionedTaskTrace], None, None],
+     backend: BackendEngine,
+     nested_backend: Optional[BackendEngine],
+ ) -> Generator[Tuple[FunctionOutputType, TraceTime], None, None]:
+     """
+     Applies the provided function to all non-partitioned and partitioned arguments using the provided backend.
+     """
+
+     # First, try to get the first 2 partitions. If we get fewer than 2, we run the function sequentially to
+     # avoid any parallelism overhead.
+
+     iterator = iter(partitions)
+
+     first_values = []
+     try:
+         first_values.append(next(iterator))
+         first_values.append(next(iterator))
+     except StopIteration:
+         # Fewer than 2 partitions: run these sequentially and return.
+         assert len(first_values) <= 2
+
+         for partitioned_args in first_values:
+             yield timed_function(
+                 run_function_in_worker,
+                 function,
+                 non_partitioned_args,
+                 partitioned_args,
+                 backend=None,
+             )
+
+         return
+
+     # At least two partitions: submit these and the remaining ones to the backend.
+
+     with backend.session() as session:
+         preloaded_non_partitioned_args = session.preload_value(non_partitioned_args)
+
+         # We pop futures' references from the queue as we yield them, so that no-longer-needed results are not
+         # retained. Keeping these references would prevent Python's GC from freeing the yielded results once
+         # the consuming function has processed them.
+         futures: Deque[ProfiledFuture] = collections.deque()
+
+         try:
+             for partitioned_args in chain(first_values, iterator):
+                 futures.append(
+                     session.submit(
+                         run_function_in_worker,
+                         function,
+                         preloaded_non_partitioned_args,
+                         partitioned_args,
+                         nested_backend,
+                     )
+                 )
+
+                 # Yields any finished future from the head of the queue.
+                 while len(futures) > 0 and futures[0].done():
+                     yield futures.popleft().result_and_duration()
+
+             # Yields the remaining results.
+             while len(futures) > 0:
+                 yield futures.popleft().result_and_duration()
+         finally:
+             # On any failure, cancel all unfinished tasks.
+             for future in futures:
+                 future.cancel()
+
+
+ def run_function_in_worker(
+     function: Callable[[PartitionType], FunctionOutputType],
+     non_partitioned_args: NamedArguments,
+     partition: Tuple[NamedArguments, PartitionedTaskTrace],
+     backend: Optional[BackendEngine] = None,
+ ) -> Tuple[FunctionOutputType, PartitionedTaskTrace]:
+     """
+     Runs the function on the partitioned object, setting up the expected worker environment.
+
+     :param non_partitioned_args: the function arguments that are identical for every function call.
+     :param partition: the partitioned arguments and the associated partition task trace.
+     :param backend: if not None, set up this backend before executing the function.
+
+     :returns: the function's output and the original partition task trace.
+     """
+
+     partitioned_args, trace = partition
+
+     merged_args = non_partitioned_args.merge(partitioned_args)
+     assert len(non_partitioned_args.var_args) == 0
+
+     args, kwargs = merged_args.as_args_kwargs()
+
+     if backend is not None:
+         with set_parallel_backend_context(backend):
+             result = function(*args, **kwargs)
+     else:
+         result = function(*args, **kwargs)
+
+     return result, trace
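
The hunk above is the kernel behind the `@parallel()` decorator mentioned in the `ParallelFunction` docstring. A minimal usage sketch follows; it assumes `parfun/__init__.py` re-exports `parallel`, `per_argument`, the `py_list.by_chunk`/`py_list.concat` helpers, and a `set_parallel_backend_context` that accepts a backend name such as "local_multiprocessing" (inferred from the imports and the file manifest above, not confirmed by this diff):

.. code:: python

    from typing import List

    import parfun as pf

    @pf.parallel(
        split=pf.per_argument(values=pf.py_list.by_chunk),  # scatter: partition `values` into chunks
        combine_with=pf.py_list.concat,                     # gather: concatenate the partial results
    )
    def list_pow(values: List[float], factor: float) -> List[float]:
        return [v ** factor for v in values]

    if __name__ == "__main__":
        with pf.set_parallel_backend_context("local_multiprocessing"):
            # Each chunk is submitted as a sub-task; `factor` is broadcast unchanged to every call.
            print(list_pow([float(v) for v in range(100)], 2.0))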
parfun/object.py ADDED
@@ -0,0 +1,7 @@
+ from typing import Any, TypeVar
+
+ # TODO: we may specify and constrain these types in the future.
+ FunctionInputType = Any
+ FunctionOutputType = Any
+
+ PartitionType = TypeVar("PartitionType")  # Input and output are identical for partitioning functions.
parfun/partition/__init__.py ADDED
File without changes
parfun/partition/api.py ADDED
@@ -0,0 +1,136 @@
+ from collections import OrderedDict
+ from itertools import chain
+ from typing import Callable, Tuple, Union
+
+ from parfun.kernel.function_signature import NamedArguments
+ from parfun.partition.object import PartitionFunction, PartitionGenerator
+ from parfun.partition.primitives import partition_map, partition_zip
+
+
+ def per_argument(
+     **partition_arg_with: PartitionFunction,
+ ) -> Callable[[NamedArguments], Tuple[NamedArguments, PartitionGenerator[NamedArguments]]]:
+     """
+     Applies multiple partitioning functions simultaneously to different function arguments.
+
+     .. code:: python
+
+         @pf.parallel(
+             split=pf.per_argument(
+                 df=pf.dataframe.by_row,
+                 xs=pf.py_list.by_chunk,
+             )
+         )
+         def func(df: pd.DataFrame, xs: List, constant: int):
+             ...
+
+     """
+
+     partition_arg_names = set(partition_arg_with.keys())
+
+     def partitioning_function(named_args: NamedArguments) -> Tuple[NamedArguments, PartitionGenerator[NamedArguments]]:
+         # Applies all partition functions simultaneously using `partition_zip()`, then rebuilds the
+         # `NamedArguments` object with the partitioned values.
+
+         partitioned_args, non_partitioned_args = named_args.split(partition_arg_names)
+
+         def reassign_partitioned_arguments(*partitioned_values) -> NamedArguments:
+             changes = dict(zip(partition_arg_names, [v[0] for v in partitioned_values]))
+             return partitioned_args.reassigned(**changes)
+
+         partitioned_arg_generators = [
+             partition_arg_with[arg_name](partitioned_args[arg_name]) for arg_name in partition_arg_names
+         ]
+
+         zipped = partition_zip(*partitioned_arg_generators)
+
+         generator = partition_map(reassign_partitioned_arguments, zipped)  # type: ignore[type-var]
+
+         return non_partitioned_args, generator
+
+     return partitioning_function
+
+
+ def multiple_arguments(
+     partition_on: Union[Tuple[str, ...], str], partition_with: PartitionFunction
+ ) -> Callable[[NamedArguments], Tuple[NamedArguments, PartitionGenerator[NamedArguments]]]:
+     """
+     Applies a single partitioning function to multiple, but not all, arguments.
+
+     .. code:: python
+
+         @pf.parallel(
+             split=pf.multiple_arguments(
+                 ("df_1", "df_2"),
+                 pf.dataframe.by_group(by=["year", "month"]),
+             ),
+             ...
+         )
+         def func(df_1: pd.DataFrame, df_2: pd.DataFrame, constant: int):
+             ...
+
+     """
+
+     if isinstance(partition_on, str):
+         partition_on = (partition_on,)
+
+     if not isinstance(partition_on, tuple) or not all(isinstance(i, str) for i in partition_on):
+         raise ValueError(f"`partition_on` must be a string or a tuple of strings, but got: {partition_on}.")
+
+     if len(partition_on) == 0:
+         raise ValueError("empty `partition_on` value.")
+
+     def partitioning_function(named_args: NamedArguments) -> Tuple[NamedArguments, PartitionGenerator]:
+         # Applies the partitioning function to the selected parameters, then rebuilds the `NamedArguments`
+         # object with these partitioned values.
+
+         partitioned_args, non_partitioned_args = named_args.split(set(partition_on))
+
+         arg_values = [partitioned_args[a] for a in partition_on]
+
+         generator = partition_map(
+             lambda *partitioned_values: partitioned_args.reassigned(**dict(zip(partition_on, partitioned_values))),
+             partition_with(*arg_values),
+         )  # type: ignore[type-var]
+
+         return non_partitioned_args, generator
+
+     return partitioning_function
+
+
+ def all_arguments(
+     partition_with: PartitionFunction,
+ ) -> Callable[[NamedArguments], Tuple[NamedArguments, PartitionGenerator[NamedArguments]]]:
+     """
+     Applies a single partitioning function to all arguments.
+
+     .. code:: python
+
+         @pf.parallel(
+             split=pf.all_arguments(pf.dataframe.by_group(by=["year", "month"])),
+             ...
+         )
+         def func(df_1: pd.DataFrame, df_2: pd.DataFrame):
+             ...
+
+     """
+
+     def partitioning_function(named_args: NamedArguments) -> Tuple[NamedArguments, PartitionGenerator]:
+         # Applies the partition function to the named positional parameters first, then keyword, then variable
+         # args, and then rebuilds the partitioned NamedArguments object in the same order.
+
+         def reassign_all_arguments(*partitioned_values) -> NamedArguments:
+             n_args = len(named_args.args)
+             n_kwargs = len(named_args.kwargs)
+
+             args = OrderedDict(zip(named_args.args.keys(), partitioned_values[:n_args]))
+             kwargs = dict(zip(named_args.kwargs.keys(), partitioned_values[n_args : n_args + n_kwargs]))
+             var_args = partitioned_values[n_args + n_kwargs :]
+
+             return NamedArguments(args=args, kwargs=kwargs, var_args=var_args)
+
+         arg_values = chain(named_args.args.values(), named_args.kwargs.values(), named_args.var_args)
+
+         return NamedArguments(), partition_map(reassign_all_arguments, partition_with(*arg_values))
+
+     return partitioning_function
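
The three helpers above differ only in which arguments reach the partitioning function. A hedged side-by-side sketch, reusing the decorator style of the docstrings; `pf.py_list.concat` and `pf.dataframe.concat` are assumed combine helpers suggested by the file manifest, not confirmed by this diff:

.. code:: python

    import pandas as pd

    import parfun as pf

    # Different partitioning function per argument; `constant` is broadcast as-is.
    @pf.parallel(
        split=pf.per_argument(df=pf.dataframe.by_row, xs=pf.py_list.by_chunk),
        combine_with=pf.py_list.concat,
    )
    def f1(df: pd.DataFrame, xs: list, constant: int) -> list:
        ...

    # One partitioning function over a subset of the arguments; `constant` is broadcast as-is.
    @pf.parallel(
        split=pf.multiple_arguments(("df_1", "df_2"), pf.dataframe.by_group(by=["year"])),
        combine_with=pf.dataframe.concat,
    )
    def f2(df_1: pd.DataFrame, df_2: pd.DataFrame, constant: int) -> pd.DataFrame:
        ...

    # One partitioning function over every argument; nothing is broadcast.
    @pf.parallel(
        split=pf.all_arguments(pf.dataframe.by_group(by=["year"])),
        combine_with=pf.dataframe.concat,
    )
    def f3(df_1: pd.DataFrame, df_2: pd.DataFrame) -> pd.DataFrame:
        ...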
parfun/partition/collection.py ADDED
@@ -0,0 +1,13 @@
+ import warnings
+
+ from parfun.py_list import by_chunk
+
+
+ warnings.warn(
+     "parfun.partition.collection is deprecated and will be removed in a future version, use parfun.py_list.",
+     DeprecationWarning
+ )
+
+ list_by_chunk = by_chunk
+
+ __all__ = ["list_by_chunk"]
parfun/partition/dataframe.py ADDED
@@ -0,0 +1,16 @@
+
+ import warnings
+
+ from parfun.dataframe import by_group, by_row
+
+
+ warnings.warn(
+     "parfun.partition.dataframe is deprecated and will be removed in a future version, use parfun.dataframe.",
+     DeprecationWarning
+ )
+
+ df_by_group = by_group
+
+ df_by_row = by_row
+
+ __all__ = ["df_by_group", "df_by_row"]
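
Both shims warn at import time with `DeprecationWarning`, which CPython hides by default unless the warning is triggered from `__main__`. A small sketch (standard library only) showing how to surface it; note the warning fires only on the first import per interpreter, since later imports hit the module cache:

.. code:: python

    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter("always", DeprecationWarning)
        import parfun.partition.dataframe  # noqa: F401  # emits the module-level DeprecationWarning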
parfun/partition/object.py ADDED
@@ -0,0 +1,50 @@
+ from typing import Callable, Generator, Optional, Tuple, Union
+
+ from parfun.object import PartitionType
+
+ SimplePartitionIterator = Generator[PartitionType, None, None]
+
+ SmartPartitionGenerator = Generator[Optional[Tuple[int, PartitionType]], int, None]
+
+ PartitionGenerator = Union[SimplePartitionIterator[PartitionType], SmartPartitionGenerator[PartitionType]]
+ """
+ All partitioning functions must return a Python generator of this type.
+
+ There are two ways of writing a partitioning function:
+
+ * Use regular Python generators (preferred) or iterators, returning partitioned values:
+
+   .. code:: python
+
+       def partition_list_by_chunks(values: List) -> PartitionGenerator[List]:
+           PARTITION_SIZE = max(1, len(values) // 100)
+
+           for begin in range(0, len(values), PARTITION_SIZE):
+               yield values[begin:begin + PARTITION_SIZE]
+
+
+ * Use partition-size-aware Python generators, or smart generators. These are more complex but more efficient.
+   Smart generators must get a suggested partition size through the return value of the ``yield`` statement, and
+   yield each partition size along with its partitioned values:
+
+   .. code:: python
+
+       def partition_list_by_chunks(values: List, constant: int) -> PartitionGenerator[Tuple[List, int]]:
+           # A first empty call to `yield` is required to obtain the first requested partition size.
+           requested_partition_size = yield None
+
+           begin = 0
+           while begin < len(values):
+               end = min(len(values), begin + requested_partition_size)
+
+               partition_size = end - begin
+               partition = (values[begin:end], constant)
+
+               # Yield the actual partition along with its size, and obtain the requested size for the next one.
+               requested_partition_size = yield partition_size, partition
+
+               begin = end
+
+ """
+
+ PartitionFunction = Callable[..., PartitionGenerator[PartitionType]]
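
To make the send-based protocol concrete, here is a hedged, self-contained sketch of a consumer priming and driving a smart generator by hand. It mirrors what parfun's partition-size estimation machinery does, but `by_chunk_smart` below is illustrative, not a parfun API:

.. code:: python

    from typing import Generator, List, Optional, Tuple

    def by_chunk_smart(values: List[int]) -> Generator[Optional[Tuple[int, List[int]]], int, None]:
        # Smart-generator protocol from the docstring above: prime with `yield None`,
        # then yield (actual_size, partition) and receive the next requested size.
        requested = yield None
        begin = 0
        while begin < len(values):
            end = min(len(values), begin + requested)
            requested = yield end - begin, values[begin:end]
            begin = end

    gen = by_chunk_smart(list(range(10)))
    gen.send(None)       # run to the priming `yield None`
    print(gen.send(4))   # (4, [0, 1, 2, 3])
    print(gen.send(3))   # (3, [4, 5, 6])
    print(gen.send(10))  # (3, [7, 8, 9]): the actual size may be smaller than requested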