max-div 0.0.3__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,350 @@
1
+ """
2
+ Methods for sampling WITH constraints.
3
+ """
4
+
5
+ import copy
6
+ import math
7
+
8
+ import numba
9
+ import numpy as np
10
+ from numba import types
11
+
12
+ from max_div.constraints import Constraint
13
+ from max_div.constraints._numba import _np_con_indices, _np_con_max_value, _np_con_min_value
14
+ from max_div.sampling import randint_numba
15
+
16
+
17
+ # =================================================================================================
18
+ # randint_constrained
19
+ # =================================================================================================
20
+ def randint_constrained(
21
+ n: int,
22
+ k: int,
23
+ cons: list[Constraint],
24
+ p: np.ndarray | None = None,
25
+ seed: int | None = None,
26
+ ) -> list[int]:
27
+ """
28
+ Generate `k` unique random integers from the range `[0, n)` while satisfying given constraints.
29
+
30
+ NOTES:
31
+
32
+ * no guarantees are given that constraints are satisfied; a best-effort attempt will be made, with the
33
+ probability of the result satisfying the constraints increasing the simpler the constraints are.
34
+
35
+ * `randint_constrained` is essentially a version of [`randint`][max_div.sampling.uncon.randint] that supports constraints.
36
+
37
+ * It is strongly advised to use the fully equivalent function [randint_constrained_numba][max_div.sampling.con.randint_constrained_numba]
38
+ which is 10-100x faster due to its use of numba JIT compilation and efficient numpy-based data structures.
39
+
40
+ :param n: range to sample from [0, n)
41
+ :param k: number of unique samples to draw (no replacement)
42
+ :param cons: list of Constraint objects, indicating we want to sample at least `min_count`
43
+ and at most `max_count` integers from `int_set`, with all values of `int_set` in range `[0, n)`.
44
+ :param p: optional, target probabilities for each integer in `[0, n)`. No guarantees are given if provided,
45
+ but will help guide qualitative preference of sampling algorithm. Higher p[i] values will increase
46
+ probability of integer `i` being included in the sample, to the extent the constraints allow this.
47
+ :param seed: (int, optional) random seed (not set if None or 0 is provided)
48
+ :return: list of samples
49
+ """
50
+
51
+ # --- initialize --------------------------------------
52
+ samples = []
53
+ k_remaining = k
54
+ cons = copy.deepcopy(cons) # make sure we don't modify the original ones
55
+
56
+ # --- sample ------------------------------------------
57
+ while k_remaining > 0:
58
+ # --- score & thresholds ----------------
59
+
60
+ # determine how much each integer would help us satisfy min_count constraints
61
+ # (at this point, hard-excluding those integers that are already sampled or that would violate max_count)
62
+ score = _compute_score(n, cons, samples, hard_max_constraints=True)
63
+
64
+ # determine how much improvement we need to be able to satisfy all min_count constraints
65
+ # with k_remaining samples
66
+ total_score_needed = sum([con.min_count for con in cons if con.min_count > 0])
67
+ score_threshold = math.ceil(total_score_needed / k_remaining)
68
+
69
+ if max(score) >= score_threshold:
70
+ # at this point, it still seems possible to satisfy all min_count constraints with the
71
+ # remaining # of samples we have.
72
+ # --> STRATEGY 1: focus on those samples that help us enough to satisfy all constraints with the
73
+ # remaining # of samples we have, and do not sample from any of the others.
74
+ pass
75
+ else:
76
+ # we cannot satisfy all constraints with the k remaining samples.
77
+ # --> STRATEGY 2: choose samples with best net effect (help achieve min_count vs not violating max_count),
78
+ # still hard-excluding already sampled integers.
79
+ score = _compute_score(n, cons, samples, hard_max_constraints=False)
80
+ score_threshold = max(score) # this could even be negative; we focus on those samples that do least harm
81
+
82
+ # --- sample according to strategy ------
83
+ # construct modified probabilities, taking into account scores
84
+ if (p is None) or (p.size == 0):
85
+ p_mod = np.ones(n) # uniform probabilities
86
+ else:
87
+ p_mod = p.copy() # avoid modifying input array
88
+
89
+ # make sure no p_mod is == 0, such that we will always have some p_mod[i]>0 after setting some to 0
90
+ p_mod = np.maximum(p_mod, 1e-12 * max(p_mod))
91
+ for i in range(n):
92
+ if score[i] < score_threshold:
93
+ p_mod[i] = 0.0 # exclude from sampling
94
+
95
+ # one sample from p_mod
96
+ s = int(
97
+ randint_numba(
98
+ n=np.int32(n),
99
+ k=np.int32(1),
100
+ p=p_mod.astype(np.float32),
101
+ replace=False,
102
+ seed=np.int64(seed or 0),
103
+ )[0]
104
+ )
105
+
106
+ # --- update stats --------------------------------
107
+ for con in cons:
108
+ if s in con.int_set:
109
+ con.min_count -= 1
110
+ con.max_count -= 1
111
+
112
+ samples.append(s)
113
+ k_remaining -= 1
114
+
115
+ # --- done ----------------------------------------
116
+ return samples
117
+
118
+
119
+ def _compute_score(
120
+ n: int,
121
+ cons: list[Constraint],
122
+ already_sampled: list[int],
123
+ hard_max_constraints: bool,
124
+ ) -> np.ndarray:
125
+ """
126
+ Score each integer in `[0, n)` based on how sampling each integer helps toward satisfying the constraints
127
+ - if it helps achieve a min_count that is not satisfied yet: +1
128
+ - if it would violate a max_count that we already hit: -1 if hard_max_constraints=False
129
+ -2**24 if hard_max_constraints=True
130
+ - if we already sampled it: -2**24 (always, regardless of hard_max_constraints)
131
+
132
+ The basic idea behind the scoring is that integers with score <= 0 will not be sampled, if at all possible.
133
+
134
+ :param n: range to score [0, n)
135
+ :param cons: list of Constraint objects, indicating we want to sample at least `min_count`
136
+ and at most `max_count` integers from `int_set`, with all values of `int_set` in range `[0, n)`.
137
+ :param already_sampled: list of integers already sampled
138
+ :param hard_max_constraints: if True, integers that would violate max_count constraints are penalized such that they
139
+ will never be sampled.
140
+ :return: array of scores for each integer in `[0, n)`
141
+ """
142
+
143
+ # --- init --------------------------------------------
144
+ large_penalty = 2**24
145
+ if hard_max_constraints:
146
+ max_count_penalty = large_penalty
147
+ else:
148
+ max_count_penalty = 1
149
+ scores = np.zeros(n, dtype=np.int32)
150
+
151
+ # --- min_count / max_count ---------------------------
152
+ for con in cons:
153
+ if con.min_count > 0:
154
+ for i in con.int_set:
155
+ scores[i] += 1
156
+ if con.max_count <= 0:
157
+ for i in con.int_set:
158
+ scores[i] -= max_count_penalty
159
+
160
+ # --- already sampled ---------------------------------
161
+ for i in already_sampled:
162
+ scores[i] -= large_penalty
163
+
164
+ return scores
165
+
166
+
167
+ # =================================================================================================
168
+ # randint_constrained_numba
169
+ # =================================================================================================
170
+ @numba.njit(
171
+ types.int32[:](
172
+ types.int32,
173
+ types.int32[:, :],
174
+ types.int32[:],
175
+ types.int32[:],
176
+ types.boolean,
177
+ )
178
+ )
179
+ def _compute_score_numba(
180
+ n: np.int32,
181
+ con_values: np.ndarray[np.int32],
182
+ con_indices: np.ndarray[np.int32],
183
+ already_sampled: np.ndarray[np.int32],
184
+ hard_max_constraints: bool,
185
+ ) -> np.ndarray[np.int32]:
186
+ """
187
+ Numba version of _compute_score.
188
+ Score each integer based on how sampling each integer helps toward satisfying the constraints.
189
+
190
+ :param n: range to score [0, n)
191
+ :param con_values: 2D array (n_cons, 2) with min_count and max_count for each constraint
192
+ :param con_indices: 1D array with constraint indices in the format described in _constraints.py
193
+ :param already_sampled: 1D array of integers already sampled (negative values indicate no more samples)
194
+ :param hard_max_constraints: if True, integers that would violate max_count constraints are heavily penalized
195
+ :return: array of scores for each integer
196
+ """
197
+ n_cons = con_values.shape[0]
198
+
199
+ # --- init --------------------------------------------
200
+ large_penalty = np.int32(2**24)
201
+ if hard_max_constraints:
202
+ max_count_penalty = large_penalty
203
+ else:
204
+ max_count_penalty = np.int32(1)
205
+ scores = np.zeros(n, dtype=np.int32)
206
+
207
+ # --- min_count / max_count ---------------------------
208
+ for i_con in np.arange(n_cons, dtype=np.int32):
209
+ min_val = _np_con_min_value(con_values, i_con)
210
+ max_val = _np_con_max_value(con_values, i_con)
211
+ indices = _np_con_indices(con_indices, i_con)
212
+
213
+ if min_val > 0:
214
+ for idx in indices:
215
+ scores[idx] += 1
216
+ if max_val <= 0:
217
+ for idx in indices:
218
+ scores[idx] -= max_count_penalty
219
+
220
+ # --- already sampled ---------------------------------
221
+ for i in already_sampled:
222
+ if i >= 0: # negative values indicate end of valid samples
223
+ scores[i] -= large_penalty
224
+
225
+ return scores
226
+
227
+
228
+ @numba.njit(
229
+ types.int32[:](
230
+ types.int32,
231
+ types.int32,
232
+ types.int32[:, :],
233
+ types.int32[:],
234
+ types.float32[:],
235
+ types.int64,
236
+ )
237
+ )
238
+ def randint_constrained_numba(
239
+ n: np.int32,
240
+ k: np.int32,
241
+ con_values: np.ndarray[np.int32],
242
+ con_indices: np.ndarray[np.int32],
243
+ p: np.ndarray[np.float32] = np.zeros(0, dtype=np.float32),
244
+ seed: np.int64 = 0,
245
+ ) -> np.ndarray[np.int32]:
246
+ """
247
+ Numba version of randint_constrained, which is 10-100x faster than the pure Python version.
248
+
249
+ Generate `k` unique random integers from the range `[0, n)` while satisfying given constraints.
250
+
251
+ `con_values` & `con_indices` can be obtained by using the `to_numpy` method of the [Constraints][max_div.constraints.constraints.Constraints] class.
252
+
253
+ For benchmark results, see [here](../../../../benchmarks/randint_constrained.md)
254
+
255
+ :param n: range to sample from [0, n)
256
+ :param k: number of unique samples to draw (no replacement)
257
+ :param con_values: 2D array (n_cons, 2) with min_count and max_count for each constraint
258
+ :param con_indices: 1D array with constraint indices in the format described in _constraints.py
259
+ :param p: optional, target probabilities for each integer in `[0, n)`
260
+ :param seed: random seed
261
+ :return: array of samples
262
+ """
263
+ # --- initialize --------------------------------------
264
+ samples = np.empty(k, dtype=np.int32)
265
+ k_remaining = k
266
+ n_cons = con_values.shape[0]
267
+
268
+ # Make a copy of con_values to track current min/max counts
269
+ con_values_working = con_values.copy()
270
+
271
+ sample_idx = np.int32(0)
272
+
273
+ # --- sample ------------------------------------------
274
+ while k_remaining > 0:
275
+ # --- score & thresholds ----------------
276
+
277
+ # Get already sampled integers
278
+ already_sampled = samples[:sample_idx]
279
+
280
+ # determine how much each integer would help us satisfy min_count constraints
281
+ score = _compute_score_numba(n, con_values_working, con_indices, already_sampled, True)
282
+
283
+ # determine how much improvement we need to be able to satisfy all min_count constraints
284
+ total_score_needed = np.int32(0)
285
+ for i_con in range(n_cons):
286
+ min_val = _np_con_min_value(con_values_working, np.int32(i_con))
287
+ if min_val > 0:
288
+ total_score_needed += min_val
289
+
290
+ score_threshold = np.int32((total_score_needed + k_remaining - 1) // k_remaining) # ceil division
291
+
292
+ max_score = np.int32(-(2**30))
293
+ for s in score:
294
+ if s > max_score:
295
+ max_score = s
296
+
297
+ if max_score >= score_threshold:
298
+ # STRATEGY 1: focus on those samples that help satisfy constraints
299
+ pass
300
+ else:
301
+ # STRATEGY 2: choose samples with best net effect
302
+ score = _compute_score_numba(n, con_values_working, con_indices, already_sampled, False)
303
+ max_score = np.int32(-(2**30))
304
+ for s in score:
305
+ if s > max_score:
306
+ max_score = s
307
+ score_threshold = max_score
308
+
309
+ # --- sample according to strategy ------
310
+ # construct modified probabilities
311
+ if p.size == 0:
312
+ p_mod = np.ones(n, dtype=np.float32)
313
+ else:
314
+ p_mod = p.copy()
315
+
316
+ # make sure no p_mod is == 0
317
+ max_p = np.float32(0.0)
318
+ for val in p_mod:
319
+ if val > max_p:
320
+ max_p = val
321
+ min_p = np.float32(1e-12 * max_p)
322
+ for i in range(n):
323
+ if p_mod[i] < min_p:
324
+ p_mod[i] = min_p
325
+
326
+ # zero out probabilities for scores below threshold
327
+ for i in range(n):
328
+ if score[i] < score_threshold:
329
+ p_mod[i] = np.float32(0.0)
330
+
331
+ # sample one integer
332
+ result = randint_numba(n, np.int32(1), False, p_mod, seed)
333
+ s = result[0]
334
+
335
+ # --- update stats --------------------------------
336
+ for i_con in range(n_cons):
337
+ indices = _np_con_indices(con_indices, np.int32(i_con))
338
+ for idx in indices:
339
+ if idx == s:
340
+ # Decrement both min and max count for this constraint
341
+ con_values_working[i_con, 0] -= 1
342
+ con_values_working[i_con, 1] -= 1
343
+ break
344
+
345
+ samples[sample_idx] = s
346
+ sample_idx += 1
347
+ k_remaining -= 1
348
+
349
+ # --- done ----------------------------------------
350
+ return samples
@@ -0,0 +1,269 @@
1
+ """
2
+ Methods for sampling WITHOUT constraints.
3
+ """
4
+
5
+ import numba
6
+ import numpy as np
7
+
8
+ from max_div.internal.math.fast_log import fast_log2_f32_poly
9
+ from max_div.internal.math.random import (
10
+ rand_float32,
11
+ rand_int32,
12
+ rand_int32_array,
13
+ set_seed,
14
+ )
15
+ from max_div.internal.math.select_k_minmax import select_k_min
16
+
17
+
18
+ # =================================================================================================
19
+ # randint
20
+ # =================================================================================================
21
+ def randint(
22
+ n: int,
23
+ k: int | None = None,
24
+ replace: bool = True,
25
+ p: np.ndarray[np.float32] | None = None,
26
+ seed: int | None = None,
27
+ use_numba: bool = True,
28
+ ) -> int | np.ndarray[np.int64]:
29
+ """
30
+ Randomly sample `k` integers from range `[0, n-1]`, optionally with replacement and per-value probabilities.
31
+
32
+ Depending on the value of `use_numba`, computations are executed by...
33
+
34
+ - `use_numba=False`: see [randint_numpy][max_div.sampling.uncon.randint_numpy]
35
+ - `use_numba=True`: see [randint_numba][max_div.sampling.uncon.randint_numba]
36
+
37
+ NOTES:
38
+
39
+ * If you're looking for a way to impose constraints (e.g. sample at least 2 from set {2, 4, 7, 11}),
40
+ check out [`randint_constrained`][max_div.sampling.con.randint_constrained].
41
+
42
+ :param n: defines population to sample from as range [0, n-1]. `n` must be >0.
43
+ :param k: The number of integers to sample (>0). `k=None` indicates a single integer sample.
44
+ :param replace: Whether to sample with replacement.
45
+ :param p: Optional 1D array of probabilities associated with each integer in the range.
46
+ Size must be equal to `n`, with non-negative values (normalized internally, so the sum need not be 1).
47
+ (if `None` or size==0, uniform sampling is performed)
48
+ :param seed: Optional random seed for reproducibility. If `None` or 0, no seed is set.
49
+ :param use_numba: Use the custom numba-accelerated implementation, otherwise we use `np.random.choice`.
50
+ :return: `k=None` --> single integer; `k>=1` --> (k,)-sized array with sampled integers.
51
+ """
52
+ n = np.int32(n)
53
+ k = np.int32(k) if k is not None else None
54
+
55
+ if not use_numba:
56
+ return randint_numpy(n=n, k=k, replace=replace, p=p, seed=seed)
57
+
58
+ else:
59
+ # NOTE: minimal validation to make sure numba doesn't fail to compile
60
+ if (p is not None) and p.ndim != 1:
61
+ raise ValueError(f"p must be a 1D array. (here: ndim={p.ndim})")
62
+
63
+ # NOTE: we need a few if-clauses, since numba does not support optional arguments
64
+
65
+ if k is None:
66
+ # assume k=1 and return an integer
67
+ if p is None:
68
+ return randint_numba(n=n, k=1, replace=replace, seed=seed or 0)[0]
69
+ else:
70
+ return randint_numba(n=n, k=1, replace=replace, p=p, seed=seed or 0)[0]
71
+
72
+ else:
73
+ # k is specified, return array
74
+ if p is None:
75
+ return randint_numba(n=n, k=k, replace=replace, seed=seed or 0)
76
+ else:
77
+ return randint_numba(n=n, k=k, replace=replace, p=p, seed=seed or 0)
78
+
79
+
80
+ # =================================================================================================
81
+ # randint_numpy
82
+ # =================================================================================================
83
+ def randint_numpy(
84
+ n: np.int32,
85
+ k: np.int32 | None = None,
86
+ replace: bool = True,
87
+ p: np.ndarray[float] | None = None,
88
+ seed: np.int64 | None = None,
89
+ ) -> np.int32 | np.ndarray[np.int32]:
90
+ """
91
+ Randomly sample `k` integers from range `[0, n-1]`, optionally with replacement and per-value probabilities.
92
+
93
+ This will always use `np.random.choice` for sampling and is intended to be used to compare against the
94
+ numba-accelerated version. For production-use, use `randint_numba` or `randint` with `use_numba=True`.
95
+
96
+ :param n: defines population to sample from as range [0, n-1]. `n` must be >0.
97
+ :param k: The number of integers to sample (>0). `k=None` indicates a single integer sample.
98
+ :param replace: Whether to sample with replacement.
99
+ :param p: Optional 1D array of probabilities associated with each integer in the range.
100
+ Size must be equal to `n`, and should have non-negative values. Sum is not required to be 1.
101
+ :param seed: Optional random seed for reproducibility. If `None` or 0, no seed is set.
102
+ :return: `k=None` --> single integer; `k>=1` --> (k,)-sized array with sampled integers.
103
+ """
104
+
105
+ # --- argument handling ---------------------------
106
+ if (k == 1) or (k is None):
107
+ replace = True # single sample, replacement makes no difference, so we can fall back to faster methods
108
+
109
+ # --- argument validation -------------------------
110
+ if n < 1:
111
+ raise ValueError(f"n must be >=1. (here: {n})")
112
+ if k is not None:
113
+ if k < 1:
114
+ raise ValueError(f"k must be >=1. (here: {k})")
115
+ if (not replace) and (k > n):
116
+ raise ValueError(f"Cannot sample {k} unique values from range [0, {n}) without replacement.")
117
+ if p is not None:
118
+ if (p.size > 0) and (p.size != n):
119
+ raise ValueError(f"p must be of size n=0 or n={n}. (here: size={p.size})")
120
+ elif p.size == 0:
121
+ p = None # indicate no probabilities specified
122
+
123
+ # --- sampling ------------------------------------
124
+ if seed:
125
+ np.random.seed(seed)
126
+ if p is not None:
127
+ p = p * (1.0 / np.sum(p)) # normalize probabilities
128
+
129
+ if k is None:
130
+ # returns scalar
131
+ return np.int32(np.random.choice(n, size=None, replace=replace, p=p))
132
+ else:
133
+ # returns array
134
+ return np.random.choice(n, size=k, replace=replace, p=p).astype(np.int32)
135
+
136
+
137
+ # =================================================================================================
138
+ # randint_numba
139
+ # =================================================================================================
140
+ @numba.njit(fastmath=True)
141
+ def randint_numba(
142
+ n: np.int32,
143
+ k: np.int32,
144
+ replace: bool,
145
+ p: np.ndarray[np.float32] = np.zeros(0, dtype=np.float32),
146
+ seed: np.int64 = 0,
147
+ ) -> np.ndarray[np.int32]:
148
+ """
149
+ Randomly sample `k` integers from range `[0, n-1]`, optionally with replacement and per-value probabilities.
150
+
151
+ This is a custom numba, speed-optimized implementation, using a different algorithm depending on the case:
152
+
153
+ | `p` specified | `replace` | `k` | Method Used | Complexity |
154
+ |----------------|------------|-------|------------------------------------------|-----------------|
155
+ | No | `True` | *any* | `np.random.randint`, uniform sampling | O(k) |
156
+ | No | `False` | *any* | k-element Fisher-Yates shuffle | O(n) |
157
+ | Yes | *any* | 1 | Multinomial sampling using CDF | O(n + log(n)) |
158
+ | Yes | `True` | >1 | Multinomial sampling using CDF | O(n + k log(n)) |
159
+ | Yes | `False` | >1 | Efraimidis-Spirakis sampling + exponential key sampling (Gumbel-Max Trick). | O(n) |
160
+
161
+ NOTES:
162
+
163
+ - using the np.random.Generator API incurs an extra 3-4 μsec overhead per call compared to using the legacy
164
+ np.random functions. The main reason is that the new interface requires calls through the numpy C-API, while the
165
+ legacy functions are re-implemented in Numba and compiled together with the rest of the numba-accelerated code.
166
+ Instantiating a Generator incurs a ~10 μsec penalty, so should also be avoided to be done repeatedly.
167
+
168
+ - given the intended use-case within max_div, it is acceptable that provided probabilities are only approximately
169
+ taken into account. Therefore, we use float32 representation and use a fast-approx-log function in the
170
+ Efraimidis-Spirakis sampling method. Overall this can result in <1% deviation from target probabilities, i.e.
171
+ p[3] = 0.1 --> actual frequency in samples = [0.099 to 0.101].
172
+
173
+ <br>
174
+
175
+ :param n: defines population to sample from as range [0, n-1]. `n` must be >0.
176
+ :param k: The number of integers to sample (>0). Must be an integer; `None` is not supported here (use `randint` for that).
177
+ :param replace: Whether to sample with replacement.
178
+ :param p: Optional 1D array of probabilities associated with each integer in the range.
179
+ Size must be equal to `n`, and should have non-negative values. Sum is not required to be 1.
180
+ NOTE: if size is 0, indicates no probabilities specified. (=DEFAULT)
181
+ if size > 0, but not equal to `n`, a ValueError is raised.
182
+ :param seed: (default=0) Optional random seed for reproducibility. If `None` or 0, no seed is set.
183
+ :return: (k,)-sized array with sampled integers.
184
+ """
185
+
186
+ if n < 1:
187
+ raise ValueError(f"n must be >=1. (here: {n})")
188
+ if k < 1:
189
+ raise ValueError(f"k must be >=1. (here: {k})")
190
+ if k == 1:
191
+ replace = True # single sample, replacement makes no difference, so we can fall back to faster methods
192
+ elif (not replace) and (k > n):
193
+ raise ValueError(f"Cannot sample {k} unique values from range [0, {n}) without replacement.")
194
+
195
+ if seed != 0:
196
+ rng_state = set_seed(seed)
197
+ else:
198
+ rng_state = set_seed(np.random.randint(1, 1_000_000_000_000))
199
+
200
+ if p.size == 0:
201
+ if replace:
202
+ # UNIFORM sampling with replacement
203
+ return rand_int32_array(rng_state, 0, n, k) # O(k)
204
+ # samples = np.empty(k, dtype=np.int32)
205
+ # for i in range(k):
206
+ # samples[i] = rand_int32(rng_state, 0, n)
207
+ # return samples
208
+ else:
209
+ # UNIFORM sampling without replacement using Fisher-Yates shuffle
210
+ population = np.arange(n, dtype=np.int32) # O(n)
211
+ for i in range(k): # k x O(1)
212
+ j = rand_int32(rng_state, i, n)
213
+ population[i], population[j] = population[j], population[i]
214
+ return population[:k] # O(k)
215
+
216
+ elif p.size == n:
217
+ if replace:
218
+ # NON-UNIFORM sampling with replacement using CDF
219
+ cdf = np.cumsum(p) # O(n)
220
+ p_sum = cdf[-1]
221
+ samples = np.empty(k, dtype=np.int32) # O(k)
222
+ # notes:
223
+ # - computing the below in a loop, is faster than writing a np-vectorized one-liner
224
+ # - implementing & calling a rand_float32_array outside the loop once is not faster
225
+ for i in range(k): # k x O(log(n))
226
+ r = rand_float32(rng_state) * p_sum
227
+ idx = np.searchsorted(cdf, r)
228
+ samples[i] = idx
229
+ return samples
230
+ else:
231
+ # NON-UNIFORM sampling without replacement using Efraimidis-Spirakis + Exponential keys
232
+ # algorithm description:
233
+ # Efraimidis: select k elements corresponding to k largest values of u_i^{1/p_i} (u_i ~ U(0,1))
234
+ # Gumbel-Max Trick: select k smallest values of -log(u_i)/p_i (u_i ~ U(0,1))
235
+ # Ziggurat: INVESTIGATE: generate log(u_i) more efficiently, applying the Ziggurat algorithm
236
+ # to the exponential distribution, which avoids usage of transcendental
237
+ # functions for the majority of the samples.
238
+ # (Initial testing surprisingly did not show improvements)
239
+ if k < n:
240
+ keys = np.empty(n, dtype=np.float32) # O(n)
241
+ # notes:
242
+ # - computing -np.log(u[i]) does not seem to be noticeably slower than np.random.standard_exponential().
243
+ # - implementing & calling a rand_float32_array outside the loop once is not faster
244
+ for i in range(n): # n x O(1)
245
+ if p[i] == 0.0:
246
+ keys[i] = np.inf
247
+ else:
248
+ ui = rand_float32(rng_state)
249
+ # NOTE: we use a fast log2 approximation here for speed; log2 vs log is irrelevant since
250
+ # it's just a scaling factor, and we are only interested in the order of the final list
251
+ keys[i] = -fast_log2_f32_poly(ui, degree=2) / p[i] # using fast log2 approximation
252
+
253
+ # Get indices of k smallest keys
254
+ if k <= (10 + n // 20):
255
+ return select_k_min(keys, np.int32(k)) # most efficient for small k and k/n
256
+ else:
257
+ return np.argpartition(keys, k)[:k].astype(np.int32) # O(n) average case
258
+
259
+ else:
260
+ # corner case: return all elements in random order
261
+ # to this end we perform 1 full Fisher-Yates shuffle
262
+ population = np.arange(n, dtype=np.int32) # O(n)
263
+ for i in range(n): # n x O(1)
264
+ j = rand_int32(rng_state, i, n)
265
+ population[i], population[j] = population[j], population[i]
266
+ return population[:k] # O(k)
267
+
268
+ else:
269
+ raise ValueError(f"p must be of size 0 (no probabilities) or size n={n}. (here: size={p.size})")
@@ -1,16 +1,21 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: max-div
3
- Version: 0.0.3
4
- Summary: Flexible Solver for Maximum Diversity Problems with Fairness Constraints.
3
+ Version: 0.1.1
4
+ Summary: Configurable Solver for Maximum Diversity Problems with Fairness Constraints.
5
5
  Project-URL: Documentation, https://max-div.readthedocs.io/
6
6
  Project-URL: Source, https://github.com/bertpl/max-div
7
- Project-URL: ChangeLog, https://github.com/bertpl/max-div/blob/develop/CHANGELOG.md
7
+ Project-URL: ChangeLog, https://github.com/bertpl/max-div/blob/main/CHANGELOG.md
8
8
  Project-URL: Issues, https://github.com/bertpl/max-div/issues
9
9
  Project-URL: Roadmap, https://github.com/bertpl/max-div/milestones
10
10
  Author-email: Bert Pluymers <bert.pluymers@gmail.com>
11
11
  License-File: LICENSE
12
12
  Requires-Python: >=3.11
13
+ Requires-Dist: click>=8.2.0
14
+ Requires-Dist: icc-rt; platform_machine == 'x86_64' and platform_system == 'Linux'
15
+ Requires-Dist: intel-cmplr-lib-rt; platform_machine == 'x86_64' and platform_system == 'Linux'
16
+ Requires-Dist: numba>=0.57
13
17
  Requires-Dist: numpy>=2.0.0
18
+ Requires-Dist: tqdm>=4.66.0
14
19
  Provides-Extra: docs
15
20
  Requires-Dist: mkdocs-autoapi[python]>=0.4.1; extra == 'docs'
16
21
  Requires-Dist: mkdocs-include-markdown-plugin>=7.2.0; extra == 'docs'
@@ -18,14 +23,14 @@ Requires-Dist: mkdocs-material>=9.0.0; extra == 'docs'
18
23
  Requires-Dist: mkdocs>=1.6.1; extra == 'docs'
19
24
  Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
20
25
  Requires-Dist: ruff>=0.14.0; extra == 'docs'
21
- Provides-Extra: numba
22
- Requires-Dist: numba>=0.57; extra == 'numba'
23
26
  Description-Content-Type: text/markdown
24
27
 
25
28
  ![shields.io-python-versions](https://img.shields.io/badge/python-3.11%20%7C%203.12%20%7C%203.13-blue)
26
- ![genbadge-test-count](https://bertpl.github.io/max-div/version_artifacts/release/v0.0.3/badge-test-count.svg)
27
- ![genbadge-test-coverage](https://bertpl.github.io/max-div/version_artifacts/release/v0.0.3/badge-coverage.svg)
28
- ![max-div logo](https://bertpl.github.io/max-div/version_artifacts/release/v0.0.3/splash.webp)
29
+ ![genbadge-test-count](https://bertpl.github.io/max-div/version_artifacts/release/v0.1.1/badge-test-count.svg)
30
+ ![genbadge-test-coverage](https://bertpl.github.io/max-div/version_artifacts/release/v0.1.1/badge-coverage.svg)
31
+ <p>
32
+ <img src="https://bertpl.github.io/max-div/version_artifacts/release/v0.1.1/splash.webp" alt="max-div logo" style="max-width: max(60%, min(100%,800px)); height: auto;">
33
+ </p>
29
34
 
30
35
  # max-div
31
36
  Configurable Solver for Maximum Diversity Problems with Fairness Constraints.