pyxla 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyxla/sampling.py ADDED
@@ -0,0 +1,508 @@
1
+ """Sampling techniques.
2
+
3
+ A set of functions for sampling.
4
+ """
5
+ import random
6
+ import numpy as np
7
+ import pandas
8
+ from tqdm.auto import tqdm
9
+ from hilbertcurve.hilbertcurve import HilbertCurve
10
+ import math
11
+ from typing import Union, List, Iterable, Tuple, Callable
12
+ import seaborn as sns
13
+ import matplotlib.pyplot as plt
14
+ from .util import plot_3d_
15
+ import logging
16
+
17
+ logging.basicConfig(level=logging.INFO)
18
+
19
def random_walk_sampling(sample_size: int,
                         step_size: Union[float, List[float]],
                         dim: int = 1,
                         num_neighbours = 1,
                         l_bound: Union[float, List[float]] = 0,
                         u_bound: Union[float, List[float]] = 100,
                         seed: int = None) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """Generate a sample consisting of an X (solutions) file
    and an N (neighbourhood) file using a random walk.

    Performs a random walk in the search space and captures
    neighbourhood in the process: from the current point up to
    ``num_neighbours`` in-bounds neighbours are drawn, each is recorded as
    a neighbour of the current point, and the walk then continues from one
    of those neighbours chosen at random.

    Parameters
    ----------
    sample_size : int
        Desired size of the sample. The starting point is always part of
        the sample, so at least one point is returned.
    step_size : Union[float, List[float]]
        A float (or integer) or array of floats (integers) specifying the
        maximum step of the walk in each dimension. If a single value is
        supplied, the same step size is assumed for all the dimensions.
    dim : int, optional
        The dimensionality of the sample, by default 1.
    num_neighbours : int, optional
        Number of neighbours to sample around each visited point, by
        default 1. Must be at least 1.
    l_bound : Union[float, List[float]], optional
        A float (or integer) or array of floats (integers) specifying the
        lower bound of the sample, by default 0. If an array is supplied
        each element corresponds to a dimension. If a single value is
        supplied, the same bound is assumed for all the dimensions.
    u_bound : Union[float, List[float]], optional
        A float (or integer) or array of floats (integers) specifying the
        upper bound of the sample, by default 100. If an array is supplied
        each element corresponds to a dimension. If a single value is
        supplied, the same bound is assumed for all the dimensions.
    seed : int, optional
        Seed for random number generator for reproducibility, by
        default None.

    Returns
    -------
    pandas.DataFrame
        A dataframe consisting of the solutions i.e. an X file, with
        columns ``x0 .. x{dim-1}``.
    pandas.DataFrame
        A dataframe defining neighbourhood among the solutions
        i.e. an N file, with columns ``id1`` and ``id2`` holding row
        positions in the X file.

    Raises
    ------
    ValueError
        If ``num_neighbours`` is smaller than 1 or if a lower bound exceeds
        the corresponding upper bound.

    Examples
    --------
    Generating a 1-dimensional sample:

    >>> from pyxla.sampling import random_walk_sampling
    >>> X, N = random_walk_sampling(100, 5, dim=1, l_bound=0, u_bound=6)  # doctest: +SKIP

    Generating a n-dimensional sample:

    >>> n, dim = 100, 2
    >>> l_bound, u_bound, step = [0, 100], [100, 1000], [5, 100]
    >>> X, N = random_walk_sampling(n, step, dim=dim, l_bound=l_bound, u_bound=u_bound)
    """
    # Without at least one neighbour per step the walk cannot advance.
    if num_neighbours < 1:
        raise ValueError('num_neighbours must be at least 1.')

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    if not isinstance(step_size, Iterable): step_size = [step_size] * dim
    if not isinstance(l_bound, Iterable): l_bound = [l_bound] * dim
    if not isinstance(u_bound, Iterable): u_bound = [u_bound] * dim

    step_size = np.asarray(step_size, dtype=float)
    l_bound = np.asarray(l_bound, dtype=float)
    u_bound = np.asarray(u_bound, dtype=float)

    # An empty domain would make the rejection loop below spin forever.
    if (l_bound > u_bound).any():
        raise ValueError('l_bound must not exceed u_bound in any dimension.')

    # start from a random position in the domain
    current = np.random.uniform(low=l_bound, high=u_bound, size=dim)
    current_idx = 0

    points = [current]
    neighbourhood = []

    while len(points) < sample_size:
        fresh = []  # indices of the neighbours generated around `current`
        while len(fresh) < num_neighbours and len(points) < sample_size:
            candidate = current + np.random.uniform(low=-step_size, high=step_size, size=dim)

            # rejection sampling: redraw the step until the point is in bounds
            while (candidate < l_bound).any() or (candidate > u_bound).any():
                candidate = current + np.random.uniform(low=-step_size, high=step_size, size=dim)

            # record point
            points.append(candidate)
            candidate_idx = len(points) - 1
            fresh.append(candidate_idx)

            # record neighbourhood; `candidate` neighbours `current`
            neighbourhood.append([current_idx, candidate_idx])

        # continue the walk from one of the freshly generated neighbours
        current_idx = random.choice(fresh)
        current = points[current_idx]

    X = pandas.DataFrame(points, columns=[f'x{col}' for col in range(dim)])
    N = pandas.DataFrame(neighbourhood, columns=['id1', 'id2'])

    return X, N
129
+
130
def latin_hypercube_sample():
    """Placeholder for Latin hypercube sampling (not implemented).

    The note left by the author is to install pyDOE and use its ``lhs``
    routine. For now the function takes no arguments and returns ``None``.
    """
    return None
133
+
134
def hilbert_curve_sampling(sample_size: int,
                           dim: int = 2,
                           l_bound: Union[float, List[float]] = 0,
                           u_bound: Union[float, List[float]] = 10,
                           std_dev: float = 0.3,
                           seed: int = None) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """Generate a sample using the Hilbert curve.

    A Hilbert curve is a space-filling curve described by David Hilbert
    in 1891. It has been shown to be a good alternative to random
    sampling and Latin hypercube sampling [1]_. It is applicable in the
    generation of multidimensional samples. To add stochasticity, points
    are sampled around the Hilbert curve vertices using the normal
    distribution. Excess points are then deleted at random and the
    remaining points are rescaled to the requested bounds.

    Parameters
    ----------
    sample_size : int
        Desired size of the sample. Must be at least 1.
    dim : int, optional
        The dimensionality of the sample, by default 2. For ``dim`` < 2 a
        warning is logged because the curve degenerates to a number line.
    l_bound : Union[float, List[float]], optional
        A float (or integer) or array of floats (integers) specifying the
        lower bound of the sample, by default 0. If an array is supplied
        each element corresponds to a dimension. If a single value is
        supplied, the same bound is assumed for all the dimensions.
    u_bound : Union[float, List[float]], optional
        A float (or integer) or array of floats (integers) specifying the
        upper bound of the sample, by default 10. If an array is supplied
        each element corresponds to a dimension. If a single value is
        supplied, the same bound is assumed for all the dimensions.
    std_dev : float, optional
        Standard deviation used to sample points around Hilbert curve
        vertices, by default 0.3, chosen empirically see [1]_.
    seed : int, optional
        Seed for random number generator for reproducibility, by default None

    Returns
    -------
    pandas.DataFrame
        A dataframe consisting of the solutions i.e. an X file.
    pandas.DataFrame
        A dataframe defining neighbourhood among the solutions
        i.e. an N file; consecutive rows of the X file are neighbours.

    Raises
    ------
    ValueError
        If ``sample_size`` is smaller than 1.

    Examples
    --------
    >>> from pyxla import sampling
    >>> n, dim = 100, 2
    >>> l_bound, u_bound = np.array([0, 100]), np.array([100, 1000])
    >>> X, N = sampling.hilbert_curve_sampling(n, dim, l_bound, u_bound) # doctest: +SKIP

    References
    ----------
    .. [1] J. J. Pienaar, A. S. Boman, and K. M. Malan, 'Hilbert curves for efficient exploratory landscape analysis neighbourhood sampling', in International Conference on the Applications of Evolutionary Computation (Part of EvoStar), 2024, pp. 293-309.

    """
    if sample_size < 1:
        raise ValueError('sample_size must be at least 1.')

    if dim < 2: logging.warning('The Hilbert curve with dimension 1 is just a number line. You are sampling around points on a number line.')

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    if not isinstance(l_bound, Iterable): l_bound = [l_bound] * dim
    if not isinstance(u_bound, Iterable): u_bound = [u_bound] * dim

    l_bound = np.array(l_bound).astype(float)
    u_bound = np.array(u_bound).astype(float)

    # num_points_on_curve = 2 ** (order * dim)
    # log2(num_points) = (order * dim) log2(2)
    # The order is clamped to 1 because sample_size == 1 would otherwise
    # request an order-0 curve.
    hcurve_order = max(1, math.ceil(math.log2(sample_size) / dim))

    hilbert_curve = HilbertCurve(p=hcurve_order, n=dim)
    distances = np.arange(hilbert_curve.max_h + 1)
    vertices = hilbert_curve.points_from_distances(distances)

    # sample one random point around every vertex, in curve order
    X = np.array([np.random.normal(vertex, std_dev) for vertex in vertices])

    # delete excess points
    excess = len(X) - sample_size
    if excess > 0:
        drop = random.sample(range(len(X)), k=excess)
        X = np.delete(X, drop, axis=0)

    X = pandas.DataFrame(X, columns=[f'x{col}' for col in range(dim)])

    # scale to the user-supplied bound; a zero span (e.g. a single point)
    # would divide by zero, so such columns are mapped onto the lower bound
    lowest = X.min(axis=0)
    span = X.max(axis=0) - lowest
    span = span.where(span != 0, 1.0)
    X = (X - lowest) * (u_bound - l_bound) / span + l_bound

    # consecutive points along the curve are neighbours: (0, 1), (1, 2), ...
    num_pairs = sample_size - 1
    neighbours = np.column_stack((np.arange(num_pairs), np.arange(1, num_pairs + 1)))

    N = pandas.DataFrame(neighbours, columns=['id1', 'id2'])

    return X, N
253
+
254
def hilbert_curve_neighbour_sampling(X: pandas.DataFrame, binary: bool = False) -> pandas.DataFrame:
    """Generate an N (neighbourhood) file using the Hilbert curve.

    Maps samples from an n-dimensional space onto a 1-d Hilbert curve and
    infers neighbourhood from the order. The inputs are rescaled on a copy;
    the caller's dataframe is left untouched. Taking a 1-d Hilbert curve
    ``[5, 2, 1, 7]``, the following set of neighbourhood pairs is inferred:
    ``[[5, 2], [2, 1], [1, 7]]``.

    Parameters
    ----------
    X : pandas.DataFrame
        Dataframe containing the decision space variable i.e. the X file.
    binary : bool, optional
        Specify whether the sample is binary or not, by default ``False``.
        Binary samples use a Hilbert curve of order 1 and are not rescaled
        to the integer grid.

    Returns
    -------
    pandas.DataFrame
        A 2-d dataframe sorted by ``id1`` where for a row, the solution in
        column ``id2`` can be reached from column ``id1``.

    Raises
    ------
    ValueError
        If ``X`` has fewer than 2 columns. The Hilbert curve with
        dimension 1 is just a number line.

    Examples
    --------
    >>> from pyxla.util import load_sample
    >>> from pyxla.sampling import hilbert_curve_neighbour_sampling
    >>> sample = load_sample('nk_n14_k2_id5_F3_V2', test=True)
    >>> N = hilbert_curve_neighbour_sampling(sample) # doctest: +SKIP
    """
    dimensions = len(X.columns)

    if dimensions < 2:
        raise ValueError('Dimension must be >= 2. The Hilbert curve with dimension 1 is just a number line.')
    n = len(X)

    # calculate order of the Hilbert curve; `binary` is a bool flag
    hcurve_order = 1 if binary else max(1, math.ceil(math.log2(n + 1)))

    # work on a copy so the caller's dataframe is not modified
    X = X.copy()

    # min-max scaling per dimension (only for columns outside [0, 1])
    for col in X.columns:
        low, high = X[col].min(), X[col].max()
        if high > 1 or low < 0:
            if high == low:
                # constant column: avoid 0/0 and place it at the origin
                X[col] = 0.0
            else:
                X[col] = (X[col] - low) / (high - low)

    if not binary:
        # re-scale to the integer grid of the curve; coordinates of an
        # order-p curve range over [0, 2**p - 1]
        max_coord = 2 ** hcurve_order - 1
        X = (X * max_coord).astype(int)

    hilbert_curve = HilbertCurve(p=hcurve_order, n=dimensions)

    # get distances on 1-dim Hilbert curve
    hilbert_idxs = hilbert_curve.distances_from_points(X.to_numpy())

    # use distances to infer indices to order the sample
    ordered = np.argsort(hilbert_idxs)

    # extract neighbourhood
    # i.e [5, 2, 1, 7] -> [[5, 2], [2, 1], [1, 7]]
    neighbours = np.column_stack((ordered[:-1], ordered[1:])).astype(int)

    # sort the pairs by their first index for the N file
    neighbours = neighbours[neighbours[:, 0].argsort()]

    return pandas.DataFrame(neighbours, columns=['id1', 'id2'])
320
+
321
def adaptive_walk_continuous(objective: Callable[[List[float]], float],
                             sample_size: int,
                             step_size: Union[float, List[float]],
                             max: bool = False,
                             dim: int = 1,
                             num_neighbours = 1,
                             step_retries = 10,
                             l_bound: Union[float, List[float]] = 0,
                             u_bound: Union[float, List[float]] = 100,
                             seed: int = None) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """Generate an X (solutions) and N (neighbourhood) file using an
    adaptive walk in a continuous search space.

    Starting from a random point, candidate neighbours are drawn within
    ``step_size`` of the current point and only accepted when they are in
    bounds and strictly fitter than the current point. The walk continues
    from a randomly chosen accepted neighbour. When no fitter neighbour is
    found, the walk restarts from a new random point, which is added to the
    sample without a neighbourhood entry.

    Parameters
    ----------
    objective : Callable[[List[float]], float]
        Single-objective function; it is called with a 1-d numpy array of
        length ``dim``.
    sample_size : int
        Desired size of the sample. The starting point is always part of
        the sample, so at least one point is returned.
    step_size : Union[float, List[float]]
        Maximum step in each dimension. If a single value is supplied, the
        same step size is assumed for all the dimensions.
    max : bool, optional
        If ``True`` a neighbour is fitter when its objective value is
        larger (maximisation), otherwise when it is smaller, by default
        ``False``.
    dim : int, optional
        The dimensionality of the sample, by default 1.
    num_neighbours : int, optional
        Number of fitter neighbours to look for around each visited point,
        by default 1.
    step_retries : int, optional
        Number of times a rejected candidate is redrawn before the attempt
        is given up, by default 10.
    l_bound : Union[float, List[float]], optional
        Lower bound, a single value or one value per dimension, by
        default 0.
    u_bound : Union[float, List[float]], optional
        Upper bound, a single value or one value per dimension, by
        default 100.
    seed : int, optional
        Seed for both the ``random`` and ``numpy.random`` generators for
        reproducibility, by default None.

    Returns
    -------
    pandas.DataFrame
        A dataframe consisting of the solutions i.e. an X file.
    pandas.DataFrame
        A dataframe defining neighbourhood among the solutions i.e. an
        N file; the solution in ``id2`` is fitter than the one in ``id1``.
    """
    # multi-objective??

    # Points are drawn with numpy and the next walk position with `random`,
    # so both generators have to be seeded for reproducible results.
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    if not isinstance(step_size, Iterable): step_size = [step_size] * dim
    step_size = np.array(step_size)

    if not isinstance(l_bound, Iterable): l_bound = [l_bound] * dim
    if not isinstance(u_bound, Iterable): u_bound = [u_bound] * dim

    # start from a random position in the domain
    current_idx = 0
    current = np.random.uniform(low=l_bound, high=u_bound, size=dim)

    points = [current]
    neighbourhood = []

    while len(points) < sample_size:
        # `current` is fixed for this round, so evaluate it only once
        current_fitness = objective(current)

        def is_valid(candidate):
            # in bounds first, so the objective is never evaluated outside
            # the domain
            if not ((candidate >= l_bound).all() and (candidate <= u_bound).all()):
                return False
            fitness = objective(candidate)
            return fitness > current_fitness if max else fitness < current_fitness

        accepted_idxs = []
        neighbour_attempts = 0

        while len(accepted_idxs) < num_neighbours and len(points) < sample_size:
            if neighbour_attempts > num_neighbours: break

            candidate = current + np.random.uniform(low=-step_size, high=step_size, size=dim)
            accepted = is_valid(candidate)

            # redraw until the candidate is in bounds and fitter, or the
            # retry budget is used up
            step_attempts = 0
            while not accepted and step_attempts <= step_retries:
                step_attempts += 1
                candidate = current + np.random.uniform(low=-step_size, high=step_size, size=dim)
                accepted = is_valid(candidate)

            if accepted:
                # record point
                points.append(candidate)
                candidate_idx = len(points) - 1
                accepted_idxs.append(candidate_idx)

                # record neighbourhood; `candidate` neighbours `current`
                neighbourhood.append([current_idx, candidate_idx])

            neighbour_attempts += 1

        if accepted_idxs:
            # choose a neighbour randomly
            current_idx = random.choice(accepted_idxs)
            current = points[current_idx]
        elif len(points) < sample_size:
            # no fitter neighbour found: restart from a random point
            current = np.random.uniform(low=l_bound, high=u_bound, size=dim)
            points.append(current)
            current_idx = len(points) - 1

    X = pandas.DataFrame(points, columns=[f'x{col}' for col in range(dim)])
    N = pandas.DataFrame(neighbourhood, columns=['id1', 'id2'])

    return X, N
396
+
397
def box_scale(X: pandas.DataFrame, l_bound: Union[float, List[float]] = 0,
              u_bound: Union[float, List[float]] = 10):
    """Min-max scale every column of ``X`` into ``[l_bound, u_bound]``.

    Parameters
    ----------
    X : pandas.DataFrame
        Dataframe to rescale; it is not modified.
    l_bound : Union[float, List[float]], optional
        Lower bound, a single value or one value per column, by default 0.
    u_bound : Union[float, List[float]], optional
        Upper bound, a single value or one value per column, by default 10.

    Returns
    -------
    pandas.DataFrame
        A new dataframe in which the minimum of each column maps to its
        lower bound and the maximum to its upper bound. A constant column
        has no spread to scale and is mapped onto its lower bound.
    """
    dim = len(X.columns)
    if not isinstance(l_bound, Iterable): l_bound = [l_bound] * dim
    if not isinstance(u_bound, Iterable): u_bound = [u_bound] * dim

    l_bound = np.array(l_bound).astype(float)
    u_bound = np.array(u_bound).astype(float)

    lowest = X.min(axis=0)
    span = X.max(axis=0) - lowest
    # a zero span would yield 0/0 = NaN; dividing by 1 keeps such columns
    # at the lower bound instead
    span = span.where(span != 0, 1.0)

    return (X - lowest) * (u_bound - l_bound) / span + l_bound
406
+
407
def hilbert_curve_sampling_viz(sample_size: int,
                               dim: int = 2,
                               l_bound = -5,
                               u_bound = 5,
                               std_dev: float = 0.3, obj=None, seed=None):
    """Visualise the stages of Hilbert curve sampling.

    Draws the Hilbert curve vertices, the randomised points sampled around
    them, the points left after deleting the excess, and the points after
    scaling to the bounds. Only the ``x0`` and ``x1`` columns are plotted,
    so ``dim`` must be at least 2.

    Parameters
    ----------
    sample_size : int
        Desired size of the sample.
    dim : int, optional
        The dimensionality of the sample, by default 2.
    l_bound : float or list of float, optional
        Lower bound passed to ``box_scale``, by default -5.
    u_bound : float or list of float, optional
        Upper bound passed to ``box_scale``, by default 5.
    std_dev : float, optional
        Standard deviation used to sample points around the Hilbert curve
        vertices, by default 0.3.
    obj : callable, optional
        Objective applied to every row of the scaled sample; when given,
        an additional pair of 3-d plots is drawn with ``plot_3d_`` (that
        figure is not returned). By default None.
    seed : int, optional
        Seed for both the ``random`` and ``numpy.random`` generators for
        reproducibility, by default None.

    Returns
    -------
    tuple of matplotlib.figure.Figure
        ``(fig, hc_fig, final_fig)``: the three-panel overview, the bare
        Hilbert curve, and the final scaled sample.
    """
    # Points are perturbed with numpy but deleted with `random.sample`, so
    # both generators have to be seeded for reproducible figures.
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    # clamped to 1 because sample_size == 1 would request an order-0 curve
    hcurve_order = max(1, math.ceil(math.log2(sample_size) / dim))
    logging.info('Hilbert curve order: %s', hcurve_order)

    hilbert_curve = HilbertCurve(p=hcurve_order, n=dim)
    distances = list(range(hilbert_curve.max_h + 1))
    points = hilbert_curve.points_from_distances(distances)

    # sample random point around vertices
    points_near_vertices = np.array([np.random.normal(point, std_dev) for point in points])

    columns = [f'x{col}' for col in range(dim)]
    X = pandas.DataFrame(points_near_vertices, columns=columns)
    HC = pandas.DataFrame(points, columns=columns)

    X['type'] = 'randomised'
    HC['type'] = 'HC'
    palette = sns.color_palette()

    data = pandas.concat([X, HC])

    cols = 3

    # figure 1: the bare Hilbert curve
    hc_fig, hc_ax = plt.subplots(ncols=1, figsize=(7, 7))
    ax = sns.scatterplot(HC, x='x0', y='x1', color=palette[1], ax=hc_ax)
    ax.plot(HC['x0'], HC['x1'], ':', color=palette[1])

    # figure 2: three-panel overview of the sampling stages
    fig, axs = plt.subplots(ncols=cols, figsize=(7 * cols, 7))

    ax = sns.scatterplot(data, x='x0', y='x1', hue='type', hue_order=['randomised', 'HC'], ax=axs[0])
    ax.plot(X['x0'], X['x1'], ':', color=palette[0])
    ax.plot(HC['x0'], HC['x1'], ':', color=palette[1])
    ax.set(title=f"All points ({len(X)})")
    ax.set_aspect('equal', adjustable='box')

    # remove excess points; when the curve has exactly `sample_size`
    # vertices nothing is removed and the full frames are plotted
    k = len(X) - sample_size  # excess points
    X_, HC_ = X, HC
    if k > 0:
        idxs = random.sample(range(len(X)), k=k)
        X_ = X[~X.index.isin(idxs)]
        HC_ = HC[~HC.index.isin(idxs)]

    data_ = pandas.concat([X_, HC_])

    ax = sns.scatterplot(data_, x='x0', y='x1', hue='type', hue_order=['randomised', 'HC'], ax=axs[1])
    ax.plot(X_['x0'], X_['x1'], ':', color=palette[0])
    ax.plot(HC_['x0'], HC_['x1'], ':', color=palette[1])
    ax.set(title=f"{k} points deleted to leave {sample_size} points")
    ax.set_aspect('equal', adjustable='box')

    # scale all points to the bounds, then draw a fresh random subset
    X = box_scale(X.drop('type', axis=1), l_bound, u_bound)
    k = len(X) - sample_size  # excess points
    X_ = X
    if k > 0:
        idxs = random.sample(range(len(X)), k=k)
        X_ = X[~X.index.isin(idxs)]

    ax = sns.scatterplot(X_, x='x0', y='x1', ax=axs[2])
    ax.plot(X_['x0'], X_['x1'], ':', color=palette[0])
    ax.set(title=f"{k} points deleted to leave {sample_size} points")
    ax.set_aspect('equal', adjustable='box')

    # figure 3: the final scaled sample on its own
    final_fig, final_ax = plt.subplots(ncols=1, figsize=(7, 7))
    ax = sns.scatterplot(X_, x='x0', y='x1', ax=final_ax)
    ax.plot(X_['x0'], X_['x1'], ':', color=palette[1])

    if obj:
        F = pandas.DataFrame()
        F['f0'] = X.apply(obj, axis=1)

        F_ = pandas.DataFrame()
        F_['f0'] = X_.apply(obj, axis=1)

        fig3d, axs = plt.subplots(ncols=2, subplot_kw=dict(projection='3d'))

        # presumably plot_3d_ draws the (x0, x1, f0) landscape on the given
        # axis -- confirm against pyxla.util
        plot_3d_(X['x0'], X['x1'], F['f0'], axs[0])
        plot_3d_(X_['x0'], X_['x1'], F_['f0'], axs[1])

        fig3d.tight_layout()

    fig.tight_layout()

    return fig, hc_fig, final_fig