opengris-parfun 7.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opengris_parfun-7.3.0.dist-info/METADATA +165 -0
- opengris_parfun-7.3.0.dist-info/RECORD +43 -0
- opengris_parfun-7.3.0.dist-info/WHEEL +5 -0
- opengris_parfun-7.3.0.dist-info/licenses/LICENSE +201 -0
- opengris_parfun-7.3.0.dist-info/licenses/LICENSE.spdx +7 -0
- opengris_parfun-7.3.0.dist-info/licenses/NOTICE +7 -0
- opengris_parfun-7.3.0.dist-info/top_level.txt +1 -0
- parfun/__init__.py +26 -0
- parfun/about.py +1 -0
- parfun/backend/__init__.py +0 -0
- parfun/backend/dask.py +151 -0
- parfun/backend/local_multiprocessing.py +92 -0
- parfun/backend/local_single_process.py +47 -0
- parfun/backend/mixins.py +68 -0
- parfun/backend/profiled_future.py +50 -0
- parfun/backend/scaler.py +226 -0
- parfun/backend/utility.py +7 -0
- parfun/combine/__init__.py +0 -0
- parfun/combine/collection.py +13 -0
- parfun/combine/dataframe.py +13 -0
- parfun/dataframe.py +175 -0
- parfun/decorators.py +135 -0
- parfun/entry_point.py +180 -0
- parfun/functions.py +71 -0
- parfun/kernel/__init__.py +0 -0
- parfun/kernel/function_signature.py +197 -0
- parfun/kernel/parallel_function.py +262 -0
- parfun/object.py +7 -0
- parfun/partition/__init__.py +0 -0
- parfun/partition/api.py +136 -0
- parfun/partition/collection.py +13 -0
- parfun/partition/dataframe.py +16 -0
- parfun/partition/object.py +50 -0
- parfun/partition/primitives.py +317 -0
- parfun/partition/utility.py +54 -0
- parfun/partition_size_estimator/__init__.py +0 -0
- parfun/partition_size_estimator/linear_regression_estimator.py +189 -0
- parfun/partition_size_estimator/mixins.py +22 -0
- parfun/partition_size_estimator/object.py +19 -0
- parfun/profiler/__init__.py +0 -0
- parfun/profiler/functions.py +261 -0
- parfun/profiler/object.py +68 -0
- parfun/py_list.py +56 -0
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import datetime
|
|
3
|
+
import inspect
|
|
4
|
+
import sys
|
|
5
|
+
import time
|
|
6
|
+
from contextlib import contextmanager
|
|
7
|
+
from typing import Any, Callable, Generator, Iterable, Optional, TextIO, Tuple
|
|
8
|
+
|
|
9
|
+
from parfun.kernel.function_signature import NamedArguments
|
|
10
|
+
from parfun.object import FunctionOutputType
|
|
11
|
+
from parfun.partition.object import PartitionGenerator
|
|
12
|
+
from parfun.partition_size_estimator.mixins import PartitionSizeEstimator
|
|
13
|
+
from parfun.profiler.object import PartitionedTaskTrace, ProfileDuration, TaskTrace, TraceTime
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@contextmanager
def profile(timer_function: Callable[[], TraceTime] = time.process_time_ns):
    """
    Provides a Python ``with`` context that measures the execution time of the enclosing block.

    .. code:: python

        with profile() as duration:
            some_heavy_task()

        print(f"Task duration: {duration.value}ns")

    :param timer_function: zero-argument callable returning the current time in nanoseconds
        (defaults to the per-process CPU time).
    :returns a ``ProfileDuration`` whose ``value`` is filled in when the block exits.
    """

    starts_at = timer_function()

    profile_duration = ProfileDuration()

    try:
        yield profile_duration
    finally:
        # Record the elapsed time even when the enclosed block raises, so callers holding a
        # reference to the ProfileDuration always observe a measurement instead of ``None``.
        profile_duration.value = timer_function() - starts_at
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def timed_function(fn: Callable, *args, **kwargs) -> Tuple[Any, TraceTime]:
    """
    Runs the provided function with the specified args, and returns its execution CPU time and its returned value.
    """

    with profile() as elapsed:
        output = fn(*args, **kwargs)

    return output, elapsed.value
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def timed_partition(
    generator: PartitionGenerator[NamedArguments],
    partition_size_estimator: Optional[PartitionSizeEstimator],
    initial_partition_size: Optional[int],
    fixed_partition_size: Optional[int],
) -> Generator[Tuple[NamedArguments, PartitionedTaskTrace], None, None]:
    """
    Wraps the ``partition_generator`` with performance timers.

    :param generator: the partitioning generator to wrap. Either a regular generator (yields
        partitions directly) or a "smart" generator (first yields ``None``, then expects the
        requested partition size through ``send()`` and yields ``(size, partition)`` pairs).
    :param partition_size_estimator: provides the partition sizes to request from a smart
        generator. Mutually exclusive with ``fixed_partition_size``.
    :param initial_partition_size: the size requested for the first partition, before any
        estimator feedback is available. Mutually exclusive with ``fixed_partition_size``.
    :param fixed_partition_size: when set, this size is always requested instead of
        consulting the estimator.

    :returns the wrapped generator, yielding ``(partition, trace)`` pairs.
    """

    # Validate the mutually exclusive sizing options before touching the generator.
    if initial_partition_size is not None and fixed_partition_size is not None:
        raise ValueError("`initial_partition_size` and `fixed_partition_size` cannot be set simultaneously.")

    if partition_size_estimator is not None and fixed_partition_size is not None:
        raise ValueError("`partition_size_estimator` and `fixed_partition_size` cannot be set simultaneously.")

    if partition_size_estimator is None and fixed_partition_size is None:
        raise ValueError("either `partition_size_estimator` or `fixed_partition_size` must be set.")

    if not inspect.isgenerator(generator):
        raise TypeError(f"partition functions must be generators, got {type(generator).__name__}")

    try:
        # The first yielded value discriminates the two generator kinds: smart generators
        # yield ``None`` first, regular generators yield an actual partition.
        with profile() as first_value_duration:
            first_value = next(generator)

        if first_value is not None:
            # This is a regular generator. Iterates without relying on the partition size estimator.

            # Regular generators expose no size information, so ``partition_size`` is fixed to 1.
            trace = PartitionedTaskTrace(
                partition_size_estimate=None, partition_size=1, partition_duration=first_value_duration.value
            )
            yield first_value, trace

            while True:
                with profile() as partition_duration:
                    partition = next(generator)

                trace = PartitionedTaskTrace(
                    partition_size_estimate=None, partition_size=1, partition_duration=partition_duration.value
                )
                yield partition, trace
        else:
            # This is a smart generator. Iterates while running the partition size estimator.

            # Seed the first requested size: an explicit initial/fixed size wins over the
            # estimator, which is only consulted when neither was provided.
            if initial_partition_size is not None or fixed_partition_size is not None:
                requested_partition_size = initial_partition_size or fixed_partition_size
                partition_size_estimate = None
            else:
                assert partition_size_estimator is not None
                partition_size_estimate = partition_size_estimator.estimate()
                requested_partition_size = partition_size_estimate.value

            assert requested_partition_size is not None

            while True:
                # Smart generators receive the requested size via ``send()`` and report back
                # the actual partition size together with the partition itself.
                with profile() as partition_duration:
                    partition_size, partition = generator.send(requested_partition_size)

                trace = PartitionedTaskTrace(
                    partition_size_estimate=partition_size_estimate,
                    partition_size=partition_size,
                    partition_duration=partition_duration.value,
                )
                yield partition, trace

                # Refresh the requested size for the next iteration: either re-query the
                # estimator, or keep using the fixed size.
                if fixed_partition_size is None:
                    assert partition_size_estimator is not None
                    partition_size_estimate = partition_size_estimator.estimate()
                    requested_partition_size = partition_size_estimate.value
                else:
                    requested_partition_size = fixed_partition_size

    except StopIteration:
        # The wrapped generator is exhausted; terminate this generator normally.
        return
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def timed_combine_with(
    combine_with: Callable[[Iterable[FunctionOutputType]], FunctionOutputType],
    partition_size_estimator: Optional[PartitionSizeEstimator],
    results: Iterable[Tuple[Tuple[FunctionOutputType, PartitionedTaskTrace], TraceTime]],
) -> Tuple[FunctionOutputType, TaskTrace]:
    """
    Wraps the ``combine_with`` function with performance timers.

    :param combine_with: the combining function.
    :param partition_size_estimator: forwards execution feedback to the estimator if not ``None``.
    :param results: the partitioned execution result and their associated partitioning and execution times.

    :returns the combined output, and the task execution time measurements.
    """

    trace = TaskTrace()

    def timed_combine_generator():
        # Lazily feeds the partitioned results to ``combine_with``, attributing the time the
        # consumer spends between two consecutive ``yield``s (i.e. inside the combining
        # function) to the partition that was just handed over.
        for result_with_trace, task_duration in results:
            result, partition_trace = result_with_trace

            partition_trace.task_duration = task_duration

            with profile() as combine_duration:
                yield result

            partition_trace.combine_duration = combine_duration.value

            if partition_size_estimator is not None:
                # Report the fully measured partition back so the estimator can refine its
                # next size estimates.
                partition_size_estimator.add_partition_trace(partition_trace)

            trace.partition_traces.append(partition_trace)

    result = combine_with(timed_combine_generator())

    return result, trace
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def print_profile_trace(
    function: Callable,
    function_name: Optional[str],
    partition_size_estimator: Optional[PartitionSizeEstimator],
    task_trace: TaskTrace,
    file: TextIO = sys.stderr,
) -> None:
    """Prints a human-readable summary of the task's execution times."""

    def emit(line: str) -> None:
        print(line, file=file)

    def as_timedelta(nanoseconds) -> datetime.timedelta:
        # Trace durations are nanoseconds; timedelta's finest unit is the microsecond.
        return datetime.timedelta(microseconds=nanoseconds / 1000)

    def percent_of_total(delta: datetime.timedelta) -> str:
        return f"{delta / cpu_total * 100:.2f}%"

    partitioning_total = as_timedelta(task_trace.total_partition_duration)
    partitioning_avg = partitioning_total / task_trace.partition_count
    compute_total = as_timedelta(task_trace.total_task_duration)
    compute_per_partition = [
        as_timedelta(partition_trace.task_duration) for partition_trace in task_trace.partition_traces
    ]
    combining_total = as_timedelta(task_trace.total_combine_duration)

    cpu_total = partitioning_total + compute_total + combining_total

    overhead = partitioning_total + combining_total

    emit(f"{function_name}()" if function_name is not None else f"{function.__name__}()")

    emit(f"\ttotal CPU execution time: {cpu_total}.")

    # Execution stats

    fastest = min(compute_per_partition)
    slowest = max(compute_per_partition)

    emit(f"\tcompute time: {compute_total} ({percent_of_total(compute_total)})")
    emit(f"\t\tmin.: {fastest}")
    emit(f"\t\tmax.: {slowest}")
    emit(f"\t\tavg.: {compute_total / task_trace.partition_count}")

    # Partitioning / Combining stats

    emit(f"\ttotal parallel overhead: {overhead} ({percent_of_total(overhead)})")
    emit(f"\t\ttotal partitioning: {partitioning_total} ({percent_of_total(partitioning_total)})")
    emit(f"\t\taverage partitioning: {partitioning_avg}")
    emit(f"\t\ttotal combining: {combining_total} ({percent_of_total(combining_total)})")

    # The best achievable speedup is bounded by the slowest partition and by the
    # non-parallelizable overhead (Amdahl-style bound).
    emit(f"\tmaximum speedup (theoretical): {cpu_total / max(overhead, slowest):.2f}x")

    # Partition size estimator state
    emit(f"\ttotal partition count: {task_trace.partition_count}")

    if partition_size_estimator is not None:
        emit(f"\t\testimator state: {partition_size_estimator.state().value}")

        # dry_run avoids mutating the estimator while merely reporting its current estimate.
        emit(f"\t\testimated partition size: {partition_size_estimator.estimate(dry_run=True).value}")
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def export_task_trace(file_path: str, task_trace: TaskTrace) -> None:
    """Exports the task trace as a CSV file.

    :param file_path: destination path; the file is overwritten if it already exists.
    :param task_trace: the collected task trace; one CSV row is written per partition.
    """

    def export_partitioned_task_trace(trace: PartitionedTaskTrace) -> Tuple:
        # The requested size is only known when a partition size estimator was used.
        if trace.partition_size_estimate is not None:
            requested_partition_size = trace.partition_size_estimate.value
        else:
            requested_partition_size = None

        return (
            requested_partition_size,
            trace.partition_size,
            trace.partition_duration,
            trace.task_duration,
            trace.combine_duration,
        )

    # ``newline=""`` is required by the csv module; without it the writer emits spurious
    # blank lines on platforms that translate line endings (e.g. Windows).
    with open(file_path, "w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(
            (
                "requested_partition_size",
                "partition_size",
                "partition_duration",
                "function_duration",
                "combine_duration",
            )
        )
        writer.writerows(export_partitioned_task_trace(trace) for trace in task_trace.partition_traces)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
|
|
3
|
+
import attrs
|
|
4
|
+
from attrs.validators import gt, instance_of, optional
|
|
5
|
+
|
|
6
|
+
from parfun.partition_size_estimator.object import PartitionSizeEstimate
|
|
7
|
+
|
|
8
|
+
TraceTime = int  # Process time in nanoseconds, as used for profiling durations.


@attrs.define
class ProfileDuration:
    """Holds a single measured duration. See :py:func:`parfun.profiler.functions.profile`."""

    # The measured duration in nanoseconds. Remains ``None`` until the profiled block has
    # finished executing.
    value: Optional[TraceTime] = attrs.field(default=None)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@attrs.define
class PartitionedTaskTrace:
    """The profiling traces for a single partitioned task (i.e. a function call with a single partitioned dataset)."""

    # The size estimate that produced this partition's requested size, or ``None`` when the
    # partition size was fixed or no estimator was involved.
    partition_size_estimate: Optional[PartitionSizeEstimate] = attrs.field(
        validator=optional(instance_of(PartitionSizeEstimate))
    )

    # Actual number of items in the partition (strictly positive).
    partition_size: int = attrs.field(validator=(instance_of(int), gt(0)))
    # Time (ns) spent generating this partition.
    partition_duration: TraceTime = attrs.field(validator=instance_of(TraceTime))

    # Time (ns) spent computing the function on this partition; ``None`` until execution.
    task_duration: Optional[TraceTime] = attrs.field(validator=optional(instance_of(TraceTime)), default=None)
    # Time (ns) spent combining this partition's result; ``None`` until combination.
    combine_duration: Optional[TraceTime] = attrs.field(validator=optional(instance_of(TraceTime)), default=None)

    @property
    def total_duration(self) -> TraceTime:
        """Sum of the partitioning, computing, and combining durations (ns).

        :raises ValueError: if the partition has not been fully executed and combined yet.
        """
        if self.task_duration is None or self.combine_duration is None:
            raise ValueError("`task_duration` and `combine_duration` should be initialized.")

        return self.partition_duration + self.task_duration + self.combine_duration
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@attrs.define
class TaskTrace:
    """The profiling traces for a partitioned job (i.e. a `ParallelFunction` call)."""

    # One trace per partition, appended as partitions are executed and combined.
    partition_traces: List[PartitionedTaskTrace] = attrs.field(validator=instance_of(list), factory=list)

    @property
    def partition_count(self) -> int:
        """Returns the number of partitions the task was split into."""
        return len(self.partition_traces)

    @property
    def total_partition_duration(self) -> TraceTime:
        """Returns the total time (ns) spent generating the partitions."""
        return sum((t.partition_duration for t in self.partition_traces), 0)

    @property
    def total_task_duration(self) -> TraceTime:
        """Returns the total CPU time required to schedule and compute the function.

        :raises ValueError: if any partition has not been executed yet.
        """

        # Fixed: the previous error message referenced `function_duration`, a field that does
        # not exist on PartitionedTaskTrace; the field being checked is `task_duration`.
        if any(t.task_duration is None for t in self.partition_traces):
            raise ValueError("`task_duration` values shall be initialized.")

        return sum((t.task_duration for t in self.partition_traces), 0)

    @property
    def total_combine_duration(self) -> TraceTime:
        """Returns the total time (ns) spent combining the partitioned results.

        :raises ValueError: if any partition has not been combined yet.
        """
        if any(t.combine_duration is None for t in self.partition_traces):
            raise ValueError("`combine_duration` values shall be initialized.")

        return sum((t.combine_duration for t in self.partition_traces), 0)
|
parfun/py_list.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""
|
|
2
|
+
A collection of pre-defined APIs to help users partition and combine collections, such as lists, arrays or tuples.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from itertools import chain
|
|
6
|
+
from typing import Iterable, List, Tuple, TypeVar
|
|
7
|
+
|
|
8
|
+
from parfun.partition.object import PartitionGenerator, PartitionType
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
ListValue = TypeVar("ListValue")


def concat(values: Iterable[List[ListValue]]) -> List[ListValue]:
    """
    Chains a collection of lists in a single list.

    .. code:: python

        concat([[1,2], [3], [4, 5]])  # [1, 2, 3, 4, 5]

    """
    flattened: List[ListValue] = []
    for sub_list in values:
        flattened.extend(sub_list)
    return flattened
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def by_chunk(*iterables: Iterable[PartitionType]) -> PartitionGenerator[Tuple[Iterable[PartitionType], ...]]:
    """
    Partition one or multiple iterables by chunks of identical sizes.

    .. code:: python

        ls_1 = [1, 2, 3, 4]
        ls_2 = [1, 4, 9, 16]

        with_partition_size(by_chunk, ls_1, ls_2, partition_size=2)
        # [((1, 2), (1, 4)), ((3, 4), (9, 16))]

    """

    # "Smart" partition generator protocol: yield ``None`` first, then receive the requested
    # chunk size through ``send()`` before building each chunk.
    chunk_size = yield None

    i = 0  # number of rows buffered in the current chunk
    partition = []

    # Iterate all input iterables in lockstep; each ``tuple_item`` is one "row" of values.
    for tuple_item in zip(*iterables):
        if i < chunk_size:
            partition.append(tuple_item)
            i += 1

        if i == chunk_size:
            # Transpose the buffered rows back into per-iterable columns, report the actual
            # chunk size, and receive the size to use for the next chunk.
            chunk_size = yield chunk_size, tuple(zip(*partition))
            i = 0
            partition = []

    if partition:
        # Remainder: the last chunk may be smaller than the requested size.
        yield len(partition), tuple(zip(*partition))
|