opengris-parfun 7.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. opengris_parfun-7.3.0.dist-info/METADATA +165 -0
  2. opengris_parfun-7.3.0.dist-info/RECORD +43 -0
  3. opengris_parfun-7.3.0.dist-info/WHEEL +5 -0
  4. opengris_parfun-7.3.0.dist-info/licenses/LICENSE +201 -0
  5. opengris_parfun-7.3.0.dist-info/licenses/LICENSE.spdx +7 -0
  6. opengris_parfun-7.3.0.dist-info/licenses/NOTICE +7 -0
  7. opengris_parfun-7.3.0.dist-info/top_level.txt +1 -0
  8. parfun/__init__.py +26 -0
  9. parfun/about.py +1 -0
  10. parfun/backend/__init__.py +0 -0
  11. parfun/backend/dask.py +151 -0
  12. parfun/backend/local_multiprocessing.py +92 -0
  13. parfun/backend/local_single_process.py +47 -0
  14. parfun/backend/mixins.py +68 -0
  15. parfun/backend/profiled_future.py +50 -0
  16. parfun/backend/scaler.py +226 -0
  17. parfun/backend/utility.py +7 -0
  18. parfun/combine/__init__.py +0 -0
  19. parfun/combine/collection.py +13 -0
  20. parfun/combine/dataframe.py +13 -0
  21. parfun/dataframe.py +175 -0
  22. parfun/decorators.py +135 -0
  23. parfun/entry_point.py +180 -0
  24. parfun/functions.py +71 -0
  25. parfun/kernel/__init__.py +0 -0
  26. parfun/kernel/function_signature.py +197 -0
  27. parfun/kernel/parallel_function.py +262 -0
  28. parfun/object.py +7 -0
  29. parfun/partition/__init__.py +0 -0
  30. parfun/partition/api.py +136 -0
  31. parfun/partition/collection.py +13 -0
  32. parfun/partition/dataframe.py +16 -0
  33. parfun/partition/object.py +50 -0
  34. parfun/partition/primitives.py +317 -0
  35. parfun/partition/utility.py +54 -0
  36. parfun/partition_size_estimator/__init__.py +0 -0
  37. parfun/partition_size_estimator/linear_regression_estimator.py +189 -0
  38. parfun/partition_size_estimator/mixins.py +22 -0
  39. parfun/partition_size_estimator/object.py +19 -0
  40. parfun/profiler/__init__.py +0 -0
  41. parfun/profiler/functions.py +261 -0
  42. parfun/profiler/object.py +68 -0
  43. parfun/py_list.py +56 -0
@@ -0,0 +1,261 @@
1
+ import csv
2
+ import datetime
3
+ import inspect
4
+ import sys
5
+ import time
6
+ from contextlib import contextmanager
7
+ from typing import Any, Callable, Generator, Iterable, Optional, TextIO, Tuple
8
+
9
+ from parfun.kernel.function_signature import NamedArguments
10
+ from parfun.object import FunctionOutputType
11
+ from parfun.partition.object import PartitionGenerator
12
+ from parfun.partition_size_estimator.mixins import PartitionSizeEstimator
13
+ from parfun.profiler.object import PartitionedTaskTrace, ProfileDuration, TaskTrace, TraceTime
14
+
15
+
16
@contextmanager
def profile(timer_function: Callable[[], TraceTime] = time.process_time_ns):
    """
    Provides a Python ``with`` context that measures the execution time of the enclosing block.

    .. code:: python

        with profile() as duration:
            some_heavy_task()

        print(f"Task duration: {duration.value}ns")

    :param timer_function: a zero-argument callable returning the current time as an integer
        (defaults to the process CPU time, in nanoseconds).
    :returns a context manager yielding a ``ProfileDuration`` whose ``value`` is filled in when
        the block exits.
    """

    starts_at = timer_function()

    profile_duration = ProfileDuration()
    try:
        yield profile_duration
    finally:
        # Record the elapsed time even when the enclosed block raises, so callers holding a
        # reference to the duration object never observe a stale `None` after the block exited.
        profile_duration.value = timer_function() - starts_at
36
+
37
+
38
def timed_function(fn: Callable, *args, **kwargs) -> Tuple[Any, TraceTime]:
    """
    Runs the provided function with the specified args, and returns its execution CPU time and its returned value.
    """

    with profile() as measured:
        output = fn(*args, **kwargs)

    return output, measured.value
47
+
48
+
49
def timed_partition(
    generator: PartitionGenerator[NamedArguments],
    partition_size_estimator: Optional[PartitionSizeEstimator],
    initial_partition_size: Optional[int],
    fixed_partition_size: Optional[int],
) -> Generator[Tuple[NamedArguments, PartitionedTaskTrace], None, None]:
    """
    Wraps the ``partition_generator`` with performance timers.

    :param generator: the partition generator to wrap. Either a regular generator that yields
        partitions directly, or a "smart" generator that first yields ``None`` and then receives
        the requested partition size through ``send()``, yielding ``(partition_size, partition)``
        pairs.
    :param partition_size_estimator: provides the requested partition sizes for smart generators.
        Mutually exclusive with ``fixed_partition_size``.
    :param initial_partition_size: size to request for the first partition, before the estimator
        takes over. Mutually exclusive with ``fixed_partition_size``.
    :param fixed_partition_size: when set, every partition is requested with this exact size and
        no estimator is used.

    :returns the wrapped generator, yielding ``(partition, trace)`` pairs.
    """

    if initial_partition_size is not None and fixed_partition_size is not None:
        raise ValueError("`initial_partition_size` and `fixed_partition_size` cannot be set simultaneously.")

    if partition_size_estimator is not None and fixed_partition_size is not None:
        raise ValueError("`partition_size_estimator` and `fixed_partition_size` cannot be set simultaneously.")

    if partition_size_estimator is None and fixed_partition_size is None:
        raise ValueError("either `partition_size_estimator` or `fixed_partition_size` must be set.")

    if not inspect.isgenerator(generator):
        raise TypeError(f"partition functions must be generators, got {type(generator).__name__}")

    try:
        # The first yielded value distinguishes the two generator protocols: smart generators
        # yield `None` before receiving their first requested partition size through `send()`.
        with profile() as first_value_duration:
            first_value = next(generator)

        if first_value is not None:
            # This is a regular generator. Iterates without relying on the partition size estimator.

            # Regular generators do not report a partition size; it is recorded as 1.
            trace = PartitionedTaskTrace(
                partition_size_estimate=None, partition_size=1, partition_duration=first_value_duration.value
            )
            yield first_value, trace

            while True:
                with profile() as partition_duration:
                    partition = next(generator)

                trace = PartitionedTaskTrace(
                    partition_size_estimate=None, partition_size=1, partition_duration=partition_duration.value
                )
                yield partition, trace
        else:
            # This is a smart generator. Iterates while running the partition size estimator.

            # Picks the size of the first partition: an explicit initial/fixed size takes
            # precedence over the estimator.
            if initial_partition_size is not None or fixed_partition_size is not None:
                requested_partition_size = initial_partition_size or fixed_partition_size
                partition_size_estimate = None
            else:
                assert partition_size_estimator is not None
                partition_size_estimate = partition_size_estimator.estimate()
                requested_partition_size = partition_size_estimate.value

            assert requested_partition_size is not None

            while True:
                with profile() as partition_duration:
                    partition_size, partition = generator.send(requested_partition_size)

                trace = PartitionedTaskTrace(
                    partition_size_estimate=partition_size_estimate,
                    partition_size=partition_size,
                    partition_duration=partition_duration.value,
                )
                yield partition, trace

                # Re-estimates after every partition so the requested size adapts to the
                # observed timings (unless the caller pinned a fixed size).
                if fixed_partition_size is None:
                    assert partition_size_estimator is not None
                    partition_size_estimate = partition_size_estimator.estimate()
                    requested_partition_size = partition_size_estimate.value
                else:
                    requested_partition_size = fixed_partition_size

    except StopIteration:
        # The wrapped generator is exhausted: terminate this generator normally.
        return
126
+
127
+
128
def timed_combine_with(
    combine_with: Callable[[Iterable[FunctionOutputType]], FunctionOutputType],
    partition_size_estimator: Optional[PartitionSizeEstimator],
    results: Iterable[Tuple[Tuple[FunctionOutputType, PartitionedTaskTrace], TraceTime]],
) -> Tuple[FunctionOutputType, TaskTrace]:
    """
    Wraps the ``combine_with`` function with performance timers.

    :param combine_with: the combining function.
    :param partition_size_estimator: forwards execution feedback to the estimator if not ``None``.
    :param results: the partitioned execution result and their associated partitioning and execution times.

    :returns the combined output, and the task execution time measurements.
    """

    trace = TaskTrace()

    def timed_combine_generator():
        # Feeds the partitioned results one by one to `combine_with`, measuring for each
        # yielded value how long the consumer takes before requesting the next one.
        for result_with_trace, task_duration in results:
            result, partition_trace = result_with_trace

            partition_trace.task_duration = task_duration

            # The time elapsed between this `yield` and the next resumption of the generator
            # is the time `combine_with` spent processing `result`.
            with profile() as combine_duration:
                yield result

            partition_trace.combine_duration = combine_duration.value

            if partition_size_estimator is not None:
                # Reports the completed partition's timings so the estimator can refine its
                # partition size estimates.
                partition_size_estimator.add_partition_trace(partition_trace)

            trace.partition_traces.append(partition_trace)

    result = combine_with(timed_combine_generator())

    return result, trace
164
+
165
+
166
def print_profile_trace(
    function: Callable,
    function_name: Optional[str],
    partition_size_estimator: Optional[PartitionSizeEstimator],
    task_trace: TaskTrace,
    file: TextIO = sys.stderr,
) -> None:
    """
    Prints a human-readable summary of the task's execution times.

    :param function: the profiled function; its ``__name__`` is used when ``function_name`` is ``None``.
    :param function_name: an optional display name overriding ``function.__name__``.
    :param partition_size_estimator: when not ``None``, its state and current estimate are reported.
    :param task_trace: the collected timing measurements for the task.
    :param file: the output stream of the report (defaults to stderr).
    """

    def print_to_file(value: str):
        print(value, file=file)

    if function_name is not None:
        display_name = function_name
    else:
        display_name = function.__name__

    # A task that produced no partitions has nothing to aggregate, and would otherwise trigger
    # divisions by zero and `min()`/`max()` on empty sequences below.
    if task_trace.partition_count == 0:
        print_to_file(f"{display_name}()")
        print_to_file("\tno partition executed.")
        return

    partition_timedelta = datetime.timedelta(microseconds=task_trace.total_partition_duration / 1000)
    partition_average_timedelta = partition_timedelta / task_trace.partition_count
    function_timedelta = datetime.timedelta(microseconds=task_trace.total_task_duration / 1000)
    function_timedeltas = [
        datetime.timedelta(microseconds=partition_trace.task_duration / 1000)
        for partition_trace in task_trace.partition_traces
    ]
    combine_timedelta = datetime.timedelta(microseconds=task_trace.total_combine_duration / 1000)

    total_cpu_duration = partition_timedelta + function_timedelta + combine_timedelta

    parallel_overhead = partition_timedelta + combine_timedelta

    def percent_of_total(duration: datetime.timedelta) -> float:
        # Guards against a zero total (all measured durations below timer resolution).
        if not total_cpu_duration:
            return 0.0
        return duration / total_cpu_duration * 100

    print_to_file(f"{display_name}()")

    print_to_file(f"\ttotal CPU execution time: {total_cpu_duration}.")

    # Execution stats

    min_compute = min(function_timedeltas)
    max_compute = max(function_timedeltas)

    print_to_file(f"\tcompute time: {function_timedelta} ({percent_of_total(function_timedelta):.2f}%)")
    print_to_file(f"\t\tmin.: {min_compute}")
    print_to_file(f"\t\tmax.: {max_compute}")
    print_to_file(f"\t\tavg.: {function_timedelta / task_trace.partition_count}")

    # Partitioning / Combining stats

    print_to_file(
        f"\ttotal parallel overhead: {parallel_overhead} ({percent_of_total(parallel_overhead):.2f}%)"
    )
    print_to_file(
        f"\t\ttotal partitioning: {partition_timedelta} ({percent_of_total(partition_timedelta):.2f}%)"
    )
    print_to_file(f"\t\taverage partitioning: {partition_average_timedelta}")
    print_to_file(f"\t\ttotal combining: {combine_timedelta} ({percent_of_total(combine_timedelta):.2f}%)")

    # The best achievable wall-clock time is bounded by the slowest partition's compute time and
    # by the sequential partitioning + combining overhead.
    bottleneck = max(parallel_overhead, max_compute)
    if bottleneck:
        theoretical_speedup = total_cpu_duration / bottleneck
        print_to_file(f"\tmaximum speedup (theoretical): {theoretical_speedup:.2f}x")

    # Partition size estimator state
    print_to_file(f"\ttotal partition count: {task_trace.partition_count}")

    if partition_size_estimator is not None:
        estimator_state = partition_size_estimator.state()
        print_to_file(f"\t\testimator state: {estimator_state.value}")

        # `dry_run` avoids mutating the estimator's internal state while reporting.
        estimate = partition_size_estimator.estimate(dry_run=True).value
        print_to_file(f"\t\testimated partition size: {estimate}")
231
+
232
+
233
def export_task_trace(file_path: str, task_trace: TaskTrace) -> None:
    """
    Exports the task trace as a CSV file.

    Writes one row per partition: the requested (estimated) partition size, the actual partition
    size, and the partitioning, function and combining durations.

    :param file_path: the path of the CSV file to (over)write.
    :param task_trace: the collected timing measurements for the task.
    """

    def export_partitioned_task_trace(trace: PartitionedTaskTrace) -> Tuple:
        # The requested size is only known when an estimator drove the partitioning.
        if trace.partition_size_estimate is not None:
            requested_partition_size = trace.partition_size_estimate.value
        else:
            requested_partition_size = None

        return (
            requested_partition_size,
            trace.partition_size,
            trace.partition_duration,
            trace.task_duration,
            trace.combine_duration,
        )

    # `newline=""` is required by the `csv` module: without it, newline translation inserts
    # spurious blank rows on platforms where the line separator is not "\n".
    with open(file_path, "w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(
            (
                "requested_partition_size",
                "partition_size",
                "partition_duration",
                "function_duration",
                "combine_duration",
            )
        )
        writer.writerows(export_partitioned_task_trace(trace) for trace in task_trace.partition_traces)
@@ -0,0 +1,68 @@
1
+ from typing import List, Optional
2
+
3
+ import attrs
4
+ from attrs.validators import gt, instance_of, optional
5
+
6
+ from parfun.partition_size_estimator.object import PartitionSizeEstimate
7
+
8
+ TraceTime = int # Process time in nanosecond used for profiling
9
+
10
+
11
@attrs.define
class ProfileDuration:
    """
    Mutable holder for a measured duration, in nanoseconds.

    See :py:func:`parfun.profiler.functions.profile`.
    """

    # Stays `None` until the profiled block finishes executing.
    value: Optional[TraceTime] = None
16
+
17
+
18
@attrs.define
class PartitionedTaskTrace:
    """The profiling traces for a single partitioned task (i.e. a function call with a single partitioned dataset)."""

    # The estimate that drove this partition's size, or `None` when no estimator was used.
    partition_size_estimate: Optional[PartitionSizeEstimate] = attrs.field(
        validator=optional(instance_of(PartitionSizeEstimate))
    )

    # Actual number of items in the partition (strictly positive).
    partition_size: int = attrs.field(validator=(instance_of(int), gt(0)))
    # CPU time spent generating the partition.
    partition_duration: TraceTime = attrs.field(validator=instance_of(TraceTime))

    # Filled in after the partitioned function executes and its result is combined, respectively.
    task_duration: Optional[TraceTime] = attrs.field(validator=optional(instance_of(TraceTime)), default=None)
    combine_duration: Optional[TraceTime] = attrs.field(validator=optional(instance_of(TraceTime)), default=None)

    @property
    def total_duration(self) -> TraceTime:
        """Sum of the partitioning, task and combining durations.

        :raises ValueError: if the task and combine durations have not been recorded yet.
        """
        if self.task_duration is None or self.combine_duration is None:
            raise ValueError("`task_duration` and `combine_duration` should be initialized.")

        return self.partition_duration + self.task_duration + self.combine_duration
38
+
39
+
40
@attrs.define
class TaskTrace:
    """The profiling traces for a partitioned job (i.e. a `ParallelFunction` call)."""

    # One trace per executed partition.
    partition_traces: List[PartitionedTaskTrace] = attrs.field(validator=instance_of(list), factory=list)

    @property
    def partition_count(self) -> int:
        """Returns the number of partitions executed for this job."""
        return len(self.partition_traces)

    @property
    def total_partition_duration(self) -> TraceTime:
        """Returns the total CPU time spent generating the partitions."""
        return sum(trace.partition_duration for trace in self.partition_traces)

    @property
    def total_task_duration(self) -> TraceTime:
        """Returns the total CPU time required to schedule and compute the function."""

        durations = [trace.task_duration for trace in self.partition_traces]
        if None in durations:
            raise ValueError("`function_duration` values shall be initialized.")

        return sum(durations, 0)

    @property
    def total_combine_duration(self) -> TraceTime:
        """Returns the total CPU time spent combining the partitioned results."""

        durations = [trace.combine_duration for trace in self.partition_traces]
        if None in durations:
            raise ValueError("`combine_duration` values shall be initialized.")

        return sum(durations, 0)
parfun/py_list.py ADDED
@@ -0,0 +1,56 @@
1
+ """
2
+ A collection of pre-defined APIs to help users partition and combine collections, such as lists, arrays or tuples.
3
+ """
4
+
5
+ from itertools import chain
6
+ from typing import Iterable, List, Tuple, TypeVar
7
+
8
+ from parfun.partition.object import PartitionGenerator, PartitionType
9
+
10
+
11
+ ListValue = TypeVar("ListValue")
12
+
13
+
14
def concat(values: Iterable[List[ListValue]]) -> List[ListValue]:
    """
    Concatenates a collection of lists into one flat list.

    .. code:: python

        concat([[1,2], [3], [4, 5]])  # [1, 2, 3, 4, 5]

    """
    result: List[ListValue] = []
    for sub_list in values:
        result.extend(sub_list)
    return result
24
+
25
+
26
def by_chunk(*iterables: Iterable[PartitionType]) -> PartitionGenerator[Tuple[Iterable[PartitionType], ...]]:
    """
    Partition one or multiple iterables by chunks of identical sizes.

    .. code:: python

        ls_1 = [1, 2, 3, 4]
        ls_2 = [1, 4, 9, 16]

        with_partition_size(by_chunk, ls_1, ls_2, partition_size=2)
        # [((1, 2), (1, 4)), ((3, 4), (9, 16))]

    """

    # Smart-generator protocol: yield `None` first, then receive each requested chunk size
    # through `send()` and answer with a `(chunk_size, partition)` pair.
    requested_size = yield None

    buffer = []
    for row in zip(*iterables):
        if len(buffer) < requested_size:
            buffer.append(row)

        if len(buffer) == requested_size:
            requested_size = yield requested_size, tuple(zip(*buffer))
            buffer = []

    # Emit the trailing, possibly smaller, partition.
    if buffer:
        yield len(buffer), tuple(zip(*buffer))