opengris-parfun 7.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. opengris_parfun-7.3.0.dist-info/METADATA +165 -0
  2. opengris_parfun-7.3.0.dist-info/RECORD +43 -0
  3. opengris_parfun-7.3.0.dist-info/WHEEL +5 -0
  4. opengris_parfun-7.3.0.dist-info/licenses/LICENSE +201 -0
  5. opengris_parfun-7.3.0.dist-info/licenses/LICENSE.spdx +7 -0
  6. opengris_parfun-7.3.0.dist-info/licenses/NOTICE +7 -0
  7. opengris_parfun-7.3.0.dist-info/top_level.txt +1 -0
  8. parfun/__init__.py +26 -0
  9. parfun/about.py +1 -0
  10. parfun/backend/__init__.py +0 -0
  11. parfun/backend/dask.py +151 -0
  12. parfun/backend/local_multiprocessing.py +92 -0
  13. parfun/backend/local_single_process.py +47 -0
  14. parfun/backend/mixins.py +68 -0
  15. parfun/backend/profiled_future.py +50 -0
  16. parfun/backend/scaler.py +226 -0
  17. parfun/backend/utility.py +7 -0
  18. parfun/combine/__init__.py +0 -0
  19. parfun/combine/collection.py +13 -0
  20. parfun/combine/dataframe.py +13 -0
  21. parfun/dataframe.py +175 -0
  22. parfun/decorators.py +135 -0
  23. parfun/entry_point.py +180 -0
  24. parfun/functions.py +71 -0
  25. parfun/kernel/__init__.py +0 -0
  26. parfun/kernel/function_signature.py +197 -0
  27. parfun/kernel/parallel_function.py +262 -0
  28. parfun/object.py +7 -0
  29. parfun/partition/__init__.py +0 -0
  30. parfun/partition/api.py +136 -0
  31. parfun/partition/collection.py +13 -0
  32. parfun/partition/dataframe.py +16 -0
  33. parfun/partition/object.py +50 -0
  34. parfun/partition/primitives.py +317 -0
  35. parfun/partition/utility.py +54 -0
  36. parfun/partition_size_estimator/__init__.py +0 -0
  37. parfun/partition_size_estimator/linear_regression_estimator.py +189 -0
  38. parfun/partition_size_estimator/mixins.py +22 -0
  39. parfun/partition_size_estimator/object.py +19 -0
  40. parfun/profiler/__init__.py +0 -0
  41. parfun/profiler/functions.py +261 -0
  42. parfun/profiler/object.py +68 -0
  43. parfun/py_list.py +56 -0
@@ -0,0 +1,317 @@
+from typing import Callable, Optional, Sequence, Tuple, TypeVar, cast
+
+from parfun.partition.object import PartitionGenerator, PartitionType, SimplePartitionIterator, SmartPartitionGenerator
+
+InputPartitionType = TypeVar("InputPartitionType", bound=Tuple)
+OutputPartitionType = TypeVar("OutputPartitionType", bound=Tuple)
+
+
+def partition_map(
+    func: Callable[..., OutputPartitionType], generator: PartitionGenerator[InputPartitionType]
+) -> PartitionGenerator[OutputPartitionType]:
+    """
+    Same as Python's built-in ``map()``, but works on partition generators.
+
+    .. code:: python
+
+        partition_map(
+            lambda partition_df: partition_df * 2,
+            df_by_row(df)
+        )
+
+    If the generator is a regular Python generator, the function returns a regular generator. Otherwise, it returns a
+    smart generator.
+    """
+
+    try:
+        first_value = cast(Optional[InputPartitionType], next(generator))
+
+        if first_value is not None:
+            # This is a regular generator
+            simple_generator = cast(SimplePartitionIterator[InputPartitionType], generator)
+
+            yield func(*first_value)
+
+            while True:
+                yield func(*next(simple_generator))
+        else:
+            smart_generator = cast(SmartPartitionGenerator[InputPartitionType], generator)
+
+            requested_partition_size = yield None
+
+            while True:
+                value = smart_generator.send(requested_partition_size)
+                _validate_smart_partition_value(value)
+
+                partition_size, partition = value
+
+                requested_partition_size = yield partition_size, func(*partition)
+    except StopIteration:
+        return
+
+
+def partition_unit(partition_size: int, partition: PartitionType) -> PartitionGenerator[PartitionType]:
+    """Creates a generator returning a single partition."""
+
+    _ = yield None
+    yield partition_size, partition
+
+
+def partition_flatmap(
+    func: Callable[[InputPartitionType], PartitionGenerator[OutputPartitionType]],
+    generator: PartitionGenerator[InputPartitionType],
+) -> PartitionGenerator[OutputPartitionType]:
+    """
+    Allows the nesting of ``PartitionGenerator``s, similarly to nested for loops:
+
+    .. code:: python
+
+        partition_flatmap(
+            lambda partition_df: df_by_row(*partition_df),
+            df_by_group(by="year")(df)
+        )
+
+    Returns a regular Python generator iff both the parent generator and the nested generators are regular Python
+    generators. Otherwise, it returns a smart generator.
+    """
+
+    try:
+        first_value = cast(Optional[InputPartitionType], next(generator))
+    except StopIteration:
+        return
+
+    if first_value is not None:
+        # The parent generator is a regular generator
+        simple_generator = cast(SimplePartitionIterator[InputPartitionType], generator)
+        yield from _partition_flatmap_regular_generator(func, first_value, simple_generator)
+    else:
+        smart_generator = cast(SmartPartitionGenerator[InputPartitionType], generator)
+        yield from _partition_flatmap_smart_generator(func, smart_generator)
+
+
+def _partition_flatmap_regular_generator(
+    func: Callable[[InputPartitionType], PartitionGenerator[OutputPartitionType]],
+    first_value: InputPartitionType,
+    generator: SimplePartitionIterator[InputPartitionType],
+) -> PartitionGenerator[OutputPartitionType]:
+    """
+    `partition_flatmap()` specialisation for parent generators that are regular Python generators.
+
+    The function returns a smart generator iff the applied function returns smart generators; otherwise, it returns
+    a regular Python generator.
+    """
+
+    def iterate_nested_generator(
+        nested_generator: PartitionGenerator[OutputPartitionType], requested_partition_size: Optional[int] = None
+    ):
+        try:
+            first_value = cast(Optional[OutputPartitionType], next(nested_generator))
+
+            if first_value is not None:
+                # This is a regular generator
+                nested_simple_generator = cast(SimplePartitionIterator[OutputPartitionType], nested_generator)
+
+                if requested_partition_size is not None:
+                    raise ValueError(
+                        "`partition_flatmap()` doesn't support mixing smart and regular generators in applied function."
+                    )
+
+                yield first_value
+                yield from nested_simple_generator
+            else:
+                nested_smart_generator = cast(SmartPartitionGenerator[OutputPartitionType], nested_generator)
+
+                if requested_partition_size is None:  # First nested call value.
+                    requested_partition_size = yield None
+
+                while True:
+                    value = nested_smart_generator.send(requested_partition_size)
+                    _validate_smart_partition_value(value)
+
+                    partition_size, partition = value
+
+                    requested_partition_size = yield partition_size, partition
+        except StopIteration:
+            return requested_partition_size
+
+    requested_partition_size = None
+    value = first_value
+
+    try:
+        while True:
+            requested_partition_size = yield from iterate_nested_generator(func(*value), requested_partition_size)
+            value = next(generator)
+    except StopIteration:
+        return
+
+
+def _partition_flatmap_smart_generator(
+    func: Callable[[InputPartitionType], PartitionGenerator[OutputPartitionType]],
+    generator: SmartPartitionGenerator[InputPartitionType],
+) -> SmartPartitionGenerator[OutputPartitionType]:
+    """
+    `partition_flatmap()` specialisation for parent generators that are smart generators.
+
+    The function always returns a smart generator.
+    """
+
+    def iterate_nested_generator(
+        nested_generator: PartitionGenerator[OutputPartitionType],
+        requested_partition_size: int,
+        parent_partition_size: int,
+    ):
+        total_size = 0
+
+        try:
+            nested_value = next(nested_generator)
+
+            if nested_value is not None:
+                # This is a regular nested generator
+                nested_simple_generator = cast(SimplePartitionIterator[OutputPartitionType], nested_generator)
+
+                while True:
+                    total_size += 1
+
+                    requested_partition_size = yield parent_partition_size, nested_value
+                    nested_value = next(nested_simple_generator)
+            else:
+                # This is a smart nested generator
+                nested_smart_generator = cast(SmartPartitionGenerator[OutputPartitionType], nested_generator)
+
+                while True:
+                    nested_requested_partition_size = max(1, round(requested_partition_size / parent_partition_size))
+
+                    nested_value = nested_smart_generator.send(nested_requested_partition_size)
+                    _validate_smart_partition_value(nested_value)
+
+                    nested_partition_size, nested_partition = nested_value
+
+                    total_size += nested_partition_size
+
+                    requested_partition_size = yield parent_partition_size * nested_partition_size, nested_partition
+        except StopIteration:
+            return total_size, requested_partition_size
+
+    # Keep track of the nested total size of the previous iteration of the nested function, so that we can
+    # estimate the optimal partition size to propagate to the parent's generator.
+    total_nested_size = 0
+    n_nested = 0
+
+    requested_partition_size = yield None
+    parent_requested_partition_size = 1
+
+    try:
+        while True:
+            value = generator.send(parent_requested_partition_size)
+            _validate_smart_partition_value(value)
+
+            parent_partition_size, partition = value
+
+            nested_size, requested_partition_size = yield from iterate_nested_generator(
+                func(*partition), requested_partition_size, parent_partition_size
+            )
+
+            total_nested_size += nested_size
+            n_nested += 1
+
+            avg_nested_size = total_nested_size / n_nested
+            parent_requested_partition_size = max(1, round(requested_partition_size / avg_nested_size))
+    except StopIteration:
+        return
+
+
+def partition_zip(*generators: PartitionGenerator) -> PartitionGenerator[Tuple]:
+    """
+    Same as Python's built-in ``zip()``, but works on ``PartitionGenerator``s.
+    """
+
+    if len(generators) < 1:
+        raise ValueError("at least one partition generator required.")
+
+    try:
+        # Detects which generators are partition-size aware
+
+        is_smart = []
+        first_values = []
+
+        for generator in generators:
+            first_value = next(generator)
+
+            is_smart.append(first_value is None)
+            first_values.append(first_value)
+
+        has_smart = any(is_smart)
+
+        # If at least one of the generators is partition-size aware (smart), yield a partition-size aware generator.
+
+        if has_smart:
+            requested_partition_size = yield None
+        else:
+            requested_partition_size = None
+
+        # Collects the first values of the smart generators (we already have the non-smart first values).
+
+        partition_size = None
+        for i, generator in enumerate(generators):
+            if not is_smart[i]:
+                continue
+
+            value = cast(SmartPartitionGenerator, generator).send(requested_partition_size)
+            _validate_partition_zip_smart_partition_value(value, partition_size)
+            partition_size, first_values[i] = value
+
+        if has_smart:
+            requested_partition_size = yield partition_size, tuple(first_values)
+        else:
+            yield tuple(first_values)
+
+        # Iterates through the next values until one generator finishes.
+
+        while True:
+            values = []
+            partition_size = None
+
+            for i, generator in enumerate(generators):
+                if is_smart[i]:
+                    value = cast(SmartPartitionGenerator, generator).send(requested_partition_size)
+                    _validate_partition_zip_smart_partition_value(value, partition_size)
+                    partition_size, partition = value
+                else:
+                    partition = next(generator)
+
+                values.append(partition)
+
+            if has_smart:
+                requested_partition_size = yield partition_size, tuple(values)
+            else:
+                yield tuple(values)
+    except StopIteration:
+        return
+
+
+def _validate_partition_zip_smart_partition_value(
+    partition_value: Tuple[int, PartitionType], partition_size: Optional[int]
+):
+    """
+    Validates the smart partition value iterated by ``partition_zip()``, and checks that its size matches the other
+    concurrent partitions' size (``partition_size``).
+    """
+
+    _validate_smart_partition_value(partition_value)
+
+    current_partition_size, partition = partition_value
+
+    if partition_size is not None and partition_size != current_partition_size:
+        raise ValueError("all smart partition generators should yield identically sized partitions.")
+
+    return current_partition_size, partition
+
+
+def _validate_smart_partition_value(value):
+    if not isinstance(value, Sequence) or len(value) != 2:
+        raise ValueError("partition generator should yield a partition with its size.")
+
+    partition_size, _ = value
+
+    if not isinstance(partition_size, int) or partition_size < 1:
+        raise ValueError("partition generator should return a strictly positive partition size.")
@@ -0,0 +1,54 @@
+from typing import Callable, Generator, Optional, Union, cast
+
+from parfun.object import PartitionType
+from parfun.partition.object import PartitionGenerator, SimplePartitionIterator, SmartPartitionGenerator
+
+
+def with_partition_size(
+    generator: PartitionGenerator[PartitionType], partition_size: Union[int, Callable[[], int]] = 1
+) -> Generator[PartitionType, None, None]:
+    """
+    Runs a partitioning generator without requiring the partition size estimator.
+
+    This function uses the provided partition size input to feed the partitioning generator through Python's
+    :py:meth:`generator.send` method, simulating the parallel function's behavior.
+
+    .. code:: python
+
+        # Runs the `by_row` partitioning function with a random partition size generator.
+        with_partition_size(
+            pf.dataframe.by_row(df_1, df_2),
+            partition_size=lambda: random.randint(1, 10)
+        )
+
+    :param generator: the partitioning generator to execute
+    :param partition_size: a constant partition size, or a function generating partition sizes
+    """
+
+    try:
+        first_value = cast(Optional[PartitionType], next(generator))
+
+        if first_value is not None:
+            # This is a regular generator
+            simple_generator = cast(SimplePartitionIterator[PartitionType], generator)
+
+            yield first_value
+            yield from simple_generator
+        else:
+            smart_generator = cast(SmartPartitionGenerator[PartitionType], generator)
+
+            while True:
+                if isinstance(partition_size, int):
+                    current_partition_size = partition_size
+                else:
+                    assert callable(partition_size)
+                    current_partition_size = partition_size()
+
+                value = smart_generator.send(current_partition_size)
+
+                if value is None or len(value) != 2:
+                    raise ValueError("partition generator should yield a partition with its size.")
+
+                yield value[1]
+    except StopIteration:
+        return
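As the hunk above shows, with_partition_size() passes a regular (non-smart) generator through unchanged, which makes it convenient for unit-testing partitioning functions without the partition size estimator. A minimal sketch, using a hypothetical by_pair generator:

    from parfun.partition.utility import with_partition_size

    def by_pair(values: list):
        # Hypothetical regular partition generator: yields argument tuples directly.
        for i in range(0, len(values), 2):
            yield (values[i:i + 2],)

    print(list(with_partition_size(by_pair([1, 2, 3, 4, 5]))))
    # [([1, 2],), ([3, 4],), ([5],)]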
File without changes
@@ -0,0 +1,189 @@
+import bisect
+import logging
+from math import ceil
+from typing import Callable, List, Optional, Tuple
+
+import attrs
+import numpy as np
+from attrs.validators import instance_of, is_callable
+from sklearn.base import BaseEstimator
+from sklearn.linear_model import LinearRegression
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import FunctionTransformer
+
+from parfun.entry_point import get_parallel_backend
+from parfun.partition_size_estimator.mixins import PartitionSizeEstimator
+from parfun.partition_size_estimator.object import PartitionSizeEstimate, PartitionSizeEstimatorState
+from parfun.profiler.object import PartitionedTaskTrace
+
+
+@attrs.define()
+class LinearRegressionCoefficients:
+    a: float = attrs.field(validator=instance_of(float))
+    b: float = attrs.field(validator=instance_of(float))
+
+    score: float = attrs.field(validator=instance_of(float))
+
+    # The number of traces used to train the estimator.
+    trace_count: int = attrs.field(validator=instance_of(int))
+
+
+@attrs.define()
+class LinearRegessionEstimate(PartitionSizeEstimate):
+    coefficients: Optional[LinearRegressionCoefficients]
+
+
+@attrs.define()
+class LinearRegessionEstimator(PartitionSizeEstimator[LinearRegessionEstimate]):
+    """
+    Trains a linear regression model to estimate the optimal partition size, based on the function's initialization
+    time and the function's processing time.
+    """
+
+    # Parallel tasks have some constant computational overhead that stays the same whatever the partition size is
+    # (i.e. code loading, preprocessing, input checks, initialisation ...).
+    #
+    # We would not like the parallel functions to spend too much time on these by selecting a small partition size, as
+    # this would waste CPU resources while only providing a negligible parallel speedup.
+    #
+    # This is a tradeoff between computational efficiency and parallelisation. The larger this parameter, the less
+    # parallel the task will run, but the more efficiently it will be computed.
+    min_parallelism_efficiency: float = attrs.field(validator=instance_of(float), default=0.95)
+
+    # Partially randomly probes the task's execution profile before making partition size estimates.
+    learning_sample_count: int = attrs.field(validator=instance_of(int), default=5)
+
+    # Cycles through these partition sizes until the estimator receives `learning_sample_count` samples.
+    learning_sample_sizes: List[int] = attrs.field(init=False, default=[64, 8, 96, 32, 256, 1, 128, 48, 4])
+    _current_learning_sample: int = attrs.field(init=False, default=0)
+
+    # Keeps up to `max_traces` traces before starting to forget previously added ones.
+    max_traces: int = attrs.field(validator=instance_of(int), default=100)
+
+    _run_traces: List[Tuple[int, int]] = attrs.field(init=False, factory=list)
+
+    regressor_factory: Callable[[], BaseEstimator] = attrs.field(
+        validator=is_callable(), default=lambda: LinearRegessionEstimator.default_regressor()
+    )
+
+    _current_coefficients: Optional[LinearRegressionCoefficients] = attrs.field(default=None)
+    _current_estimate: Optional[LinearRegessionEstimate] = attrs.field(default=None)
+
+    def add_partition_trace(self, trace: PartitionedTaskTrace) -> None:
+        partition_size = trace.partition_size
+
+        tupled_trace = (partition_size, trace.total_duration // partition_size)
+
+        if len(self._run_traces) < self.max_traces:
+            self._run_traces.append(tupled_trace)
+
+            if len(self._run_traces) >= self.max_traces:
+                # Next trace, we will have to replace one existing value. Prepare the list for bisect() by sorting.
+                self._run_traces.sort(key=lambda t: t[0])
+        else:
+            # Replaces the existing entry with the closest partition size.
+            #
+            # As the estimator will converge to similar partition size estimates, this will ensure we keep older but
+            # valuable traces from the initial learning phase of the estimator.
+
+            left_idx = bisect.bisect_left(self._run_traces, tupled_trace)
+            right_idx = left_idx + 1
+
+            if left_idx <= 0:
+                self._run_traces[0] = tupled_trace
+            elif right_idx >= len(self._run_traces):
+                self._run_traces[-1] = tupled_trace
+            else:
+                # Replaces the closest value when the value falls between two existing values.
+                left_partition_size = self._run_traces[left_idx][0]
+                right_partition_size = self._run_traces[right_idx][0]
+
+                if partition_size - left_partition_size < right_partition_size - partition_size:
+                    self._run_traces[left_idx] = tupled_trace
+                else:
+                    self._run_traces[right_idx] = tupled_trace
+
+                assert self._run_traces[left_idx][0] <= self._run_traces[right_idx][0]
+
+        self._current_estimate = None
+        self._current_coefficients = None
+
+    def state(self) -> PartitionSizeEstimatorState:
+        if len(self._run_traces) < self.learning_sample_count:
+            return PartitionSizeEstimatorState.Learning
+        else:
+            return PartitionSizeEstimatorState.Running
+
+    def coefficients(self) -> LinearRegressionCoefficients:
+        """Trains a linear regression ``f(partition_size) = a + b / partition_size`` on the previously recorded task
+        runs.
+
+        This quite accurately estimates the time it takes to process a single item (i.e. a row) when feeding a dataset
+        of a given partition size. The behavior of parallel functions is that the larger the partition size, the less
+        the function initialization overhead (`b`) will be weighted when compared to the actual processing time of
+        that single item (`a`)."""
+
+        if self._current_coefficients is not None:
+            return self._current_coefficients
+
+        regressor = self.regressor_factory()
+
+        numpy_traces = np.array(self._run_traces)
+        regressor.fit(numpy_traces[:, 0:1], numpy_traces[:, 1])
+
+        linear_regressor = dict(regressor.steps)["linear"]
+        a = linear_regressor.intercept_
+        b = linear_regressor.coef_[0]
+
+        score = regressor.score(numpy_traces[:, 0:1], numpy_traces[:, 1])
+
+        self._current_coefficients = LinearRegressionCoefficients(a, b, score, len(self._run_traces))
+
+        return self._current_coefficients
+
+    def estimate(self, dry_run: bool = False) -> LinearRegessionEstimate:
+        if self._current_estimate is not None:
+            return self._current_estimate
+
+        if self.state() == PartitionSizeEstimatorState.Learning:
+            return self._learn_estimate(dry_run=dry_run)
+
+        # Knowing f()'s coefficients, we can accurately compute when the parallel overheads become negligible when
+        # compared to the actual computation time (`min_parallelism_efficiency`).
+
+        coefficients = self.coefficients()
+        a = coefficients.a
+        b = coefficients.b
+        if b <= 0 or a < 0:
+            # TODO: we could use more advanced heuristics, like the error value of the regressor.
+            if len(self._run_traces) >= self.max_traces:
+                logging.debug("failed to estimate a valid partition size, fallback to learning.")
+
+            return self._learn_estimate(dry_run=dry_run)
+
+        current_backend = get_parallel_backend()
+
+        if current_backend is None:
+            raise ValueError("partition size estimator requires a contextual parallel backend instance.")
+
+        # Solves the partition size that satisfies `min_parallelism_efficiency`.
+        partition_size = ceil(b / (a * (1 - self.min_parallelism_efficiency)))
+
+        self._current_estimate = LinearRegessionEstimate(partition_size, coefficients)
+        return self._current_estimate
+
+    def _learn_estimate(self, dry_run: bool = False) -> LinearRegessionEstimate:
+        """Learning estimate. Probes the task execution times before running the actual estimator."""
+
+        partition_size = self.learning_sample_sizes[self._current_learning_sample]
+
+        if not dry_run:
+            self._current_learning_sample += 1
+            self._current_learning_sample %= len(self.learning_sample_sizes)
+
+        return LinearRegessionEstimate(partition_size, None)
+
+    @staticmethod
+    def default_regressor() -> BaseEstimator:
+        return Pipeline(
+            steps=[("inv", FunctionTransformer(func=lambda xs: 1.0 / xs)), ("linear", LinearRegression(positive=True))]
+        )
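The closing formula in estimate() can be checked with made-up numbers (a and b below are hypothetical fitted coefficients: a is the per-item processing time and b the constant per-partition overhead, in the same time unit):

    from math import ceil

    a, b = 10.0, 20_000.0              # hypothetical fitted coefficients
    min_parallelism_efficiency = 0.95  # the estimator's default

    # The per-item cost is modelled as f(partition_size) = a + b / partition_size.
    # Solving for the size where the overhead share drops to (1 - efficiency):
    partition_size = ceil(b / (a * (1 - min_parallelism_efficiency)))
    print(partition_size)  # 40000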
@@ -0,0 +1,22 @@
+import abc
+from typing import Generic
+
+import attrs
+
+from parfun.partition_size_estimator.object import PartitionSizeEstimateType, PartitionSizeEstimatorState
+from parfun.profiler.object import PartitionedTaskTrace
+
+
+@attrs.define
+class PartitionSizeEstimator(Generic[PartitionSizeEstimateType], metaclass=abc.ABCMeta):
+    @abc.abstractmethod
+    def add_partition_trace(self, trace: PartitionedTaskTrace) -> None:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def state(self) -> PartitionSizeEstimatorState:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def estimate(self, dry_run: bool = False) -> PartitionSizeEstimateType:
+        raise NotImplementedError()
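A concrete estimator only needs to implement the three abstract methods above. A minimal hypothetical subclass that always suggests the same partition size (FixedSizeEstimator is illustrative, not part of the package):

    import attrs

    from parfun.partition_size_estimator.mixins import PartitionSizeEstimator
    from parfun.partition_size_estimator.object import PartitionSizeEstimate, PartitionSizeEstimatorState
    from parfun.profiler.object import PartitionedTaskTrace

    @attrs.define
    class FixedSizeEstimator(PartitionSizeEstimator[PartitionSizeEstimate]):
        partition_size: int = 100

        def add_partition_trace(self, trace: PartitionedTaskTrace) -> None:
            pass  # Traces are ignored; the estimate never changes.

        def state(self) -> PartitionSizeEstimatorState:
            return PartitionSizeEstimatorState.Running

        def estimate(self, dry_run: bool = False) -> PartitionSizeEstimate:
            return PartitionSizeEstimate(self.partition_size)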
@@ -0,0 +1,19 @@
+import abc
+import enum
+from typing import TypeVar
+
+import attrs
+from attrs.validators import gt, instance_of
+
+
+class PartitionSizeEstimatorState(enum.Enum):
+    Learning = "learning"
+    Running = "running"
+
+
+@attrs.define
+class PartitionSizeEstimate(metaclass=abc.ABCMeta):
+    value: int = attrs.field(validator=(instance_of(int), gt(0)))
+
+
+PartitionSizeEstimateType = TypeVar("PartitionSizeEstimateType", bound=PartitionSizeEstimate)
File without changes