PyPI - pyxla - Versions diffs - 0.0.1__py3-none-any.whl - Mend

pyxla 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

pyxla/sampling.py ADDED Viewed

@@ -0,0 +1,508 @@
+"""Sampling techniques.
+A set of functions for sampling.
+"""
+import random
+import numpy as np
+import pandas
+from tqdm.auto import tqdm
+from hilbertcurve.hilbertcurve import HilbertCurve
+import math
+from typing import Union, List, Iterable, Tuple, Callable
+import seaborn as sns
+import matplotlib.pyplot as plt
+from .util import plot_3d_
+import logging
+logging.basicConfig(level=logging.INFO)
def random_walk_sampling(sample_size: int,
                         step_size: Union[float, List[float]],
                         dim: int = 1,
                         num_neighbours = 1,
                         l_bound: Union[float, List[float]] = 0,
                         u_bound: Union[float, List[float]] = 100,
                         seed: int = None) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """Generate a sample consisting of an X (solutions) file
    and an N (neighbourhood) file using random walk.

    Performs a random walk in the search space and captures
    neighbourhood in the process. From the current point up to
    ``num_neighbours`` in-bounds neighbours are generated; one of them is
    then picked at random as the next point of the walk.

    Parameters
    ----------
    sample_size : int
        Desired size of the sample. Must be at least 1.
    step_size : Union[float, List[float]]
        A float (or integer) or array of floats (integers) specifying the
        maximum step of the random walk in each dimension. If a single
        number is supplied, the same step size is used for all the
        dimensions.
    dim : int, optional
        The dimensionality of the sample, by default 1.
    num_neighbours : int, optional
        Number of neighbours to sample around each point of the walk, by
        default 1. Must be at least 1.
    l_bound : Union[float, List[float]], optional
        A float (or integer) or array of floats (integers) specifying the
        lower bound of the sample, by default 0. If an array is supplied
        each element corresponds to a dimension. If a single number is
        supplied, the same bound is used for all the dimensions.
    u_bound : Union[float, List[float]], optional
        A float (or integer) or array of floats (integers) specifying the
        upper bound of the sample, by default 100. If an array is supplied
        each element corresponds to a dimension. If a single number is
        supplied, the same bound is used for all the dimensions.
    seed : int, optional
        Seed for the ``random`` and ``numpy.random`` generators for
        reproducibility, by default None.

    Returns
    -------
    pandas.DataFrame
        A dataframe consisting the solutions i.e an X file, with columns
        ``x0 .. x{dim-1}``.
    pandas.DataFrame
        A dataframe defining neighbourhood among the solutions
        i.e an N file, with columns ``id1`` and ``id2`` holding row
        positions in the X dataframe.

    Raises
    ------
    ValueError
        If ``sample_size`` or ``num_neighbours`` is below 1, or if a lower
        bound exceeds the matching upper bound.

    Examples
    --------
    Generating a 1-dimensional sample:

    >>> from pyxla.sampling import random_walk_sampling
    >>> X, N = random_walk_sampling(100, 5, dim=1, l_bound=0, u_bound=6) # doctest: +SKIP

    Generating a n-dimensional sample:

    >>> n, dim = 100, 2
    >>> l_bound, u_bound, step = [0, 100], [100, 1000], [5, 100]
    >>> X, N = random_walk_sampling(n, step, dim=dim, l_bound=l_bound, u_bound=u_bound)
    """
    if sample_size < 1:
        raise ValueError('sample_size must be at least 1.')
    # with no neighbours there is nothing to continue the walk from
    if num_neighbours < 1:
        raise ValueError('num_neighbours must be at least 1.')
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
    if not isinstance(step_size, Iterable): step_size = [step_size] * dim
    step_size = np.array(step_size)
    if not isinstance(l_bound, Iterable): l_bound = [l_bound] * dim
    if not isinstance(u_bound, Iterable): u_bound = [u_bound] * dim
    l_bound = np.asarray(l_bound)
    u_bound = np.asarray(u_bound)
    # inverted bounds would make the rejection loop below spin forever
    if (l_bound > u_bound).any():
        raise ValueError('l_bound must not exceed u_bound in any dimension.')

    def draw_step():
        # uniform displacement in [-step_size, step_size) per dimension
        return np.random.uniform(low=-step_size, high=step_size, size=dim)

    # start from a random position in the domain
    prev = np.random.uniform(low=l_bound, high=u_bound, size=dim)
    prev_idx = 0
    X = [prev]
    neighbourhood = []
    while len(X) < sample_size:
        neighbours = []
        while len(neighbours) < num_neighbours and len(X) < sample_size:
            candidate = prev + draw_step()
            # redraw until the candidate is within bounds
            while (candidate < l_bound).any() or (candidate > u_bound).any():
                candidate = prev + draw_step()
            # record point
            X.append(candidate)
            current_idx = len(X) - 1
            neighbours.append(current_idx)
            # record neighbourhood; the candidate neighbours `prev`
            neighbourhood.append([prev_idx, current_idx])
        # continue the walk from one of the freshly sampled neighbours
        prev_idx = random.choice(neighbours)
        prev = X[prev_idx]
    X = pandas.DataFrame(X, columns=[f'x{col}' for col in range(dim)])
    N = pandas.DataFrame(neighbourhood, columns=['id1', 'id2'])
    return X, N
def latin_hypercube_sample():
    """Placeholder for Latin hypercube sampling.

    Nothing is implemented here: the function takes no arguments and
    always returns ``None``. Author's note: install pyDOE and use ``lhs``.
    """
    return None
def hilbert_curve_sampling(sample_size: int,
                         dim: int = 2,
                         l_bound: Union[float, List[float]] = 0,
                         u_bound: Union[float, List[float]] = 10,
                         std_dev: float = 0.3,
                         seed: int = None) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """Generate a sample using the Hilbert curve.

    A Hilbert curve is a space-filling curve described by David Hilbert
    in 1891. It has been shown to be a good alternative to random
    sampling and Latin hypercube sampling [1]_. It is applicable in the
    generation of multidimensional samples. To add stochasticity, points
    are sampled around the Hilbert curve vertices using the normal
    distribution. Excess points are deleted at random and the remainder
    is min-max rescaled to the requested bounds.

    Parameters
    ----------
    sample_size : int
        Desired size of the sample. Must be at least 1.
    dim : int, optional
        The dimensionality of the sample, by default 2.
    l_bound : Union[float, List[float]], optional
        A float (or integer) or array of floats (integers) specifying the
        lower bound of the sample, by default 0. If an array is supplied
        each element corresponds to a dimension. If a single number is
        supplied, the same bound is used for all the dimensions.
    u_bound : Union[float, List[float]], optional
        A float (or integer) or array of floats (integers) specifying the
        upper bound of the sample, by default 10. If an array is supplied
        each element corresponds to a dimension. If a single number is
        supplied, the same bound is used for all the dimensions.
    std_dev : float, optional
        Standard deviation used to sample points around Hilbert curve
        vertices, by default 0.3, chosen empirically see [1]_.
    seed : int, optional
        Seed for the ``random`` and ``numpy.random`` generators for
        reproducibility, by default None.

    Returns
    -------
    pandas.DataFrame
        A dataframe consisting the solutions i.e an X file.
    pandas.DataFrame
        A dataframe defining neighbourhood among the solutions
        i.e an N file. Consecutive rows of X are neighbours.

    Raises
    ------
    ValueError
        If ``sample_size`` is below 1.

    Notes
    -----
    No exception is raised for ``dim`` < 2; a warning is logged instead,
    because the Hilbert curve with dimension 1 is just a number line.

    Examples
    --------
    >>> import numpy as np
    >>> from pyxla import sampling
    >>> n, dim = 100, 2
    >>> l_bound, u_bound = np.array([0, 100]), np.array([100, 1000])
    >>> X, N = sampling.hilbert_curve_sampling(n, dim, l_bound, u_bound) # doctest: +SKIP

    References
    ----------
    .. [1] J. J. Pienaar, A. S. Boman, and K. M. Malan, 'Hilbert curves for efficient exploratory landscape analysis neighbourhood sampling', in International Conference on the Applications of Evolutionary Computation (Part of EvoStar), 2024, pp. 293-309.
    """
    if sample_size < 1:
        raise ValueError('sample_size must be at least 1.')
    if dim < 2: logging.warning('The Hilbert curve with dimension 1 is just a number line. You are sampling around points on a number line.')
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
    if not isinstance(l_bound, Iterable): l_bound = [l_bound] * dim
    if not isinstance(u_bound, Iterable): u_bound = [u_bound] * dim
    l_bound = np.array(l_bound).astype(float)
    u_bound = np.array(u_bound).astype(float)
    # num_points_on_curve = 2 ** (order * dim)
    # log2(num_points) = (order * dim) log2(2)
    # the order is floored at 1 so that sample_size == 1 does not ask for
    # an order-0 curve
    hcurve_order = max(1, math.ceil(math.log2(sample_size) / dim))
    hilbert_curve = HilbertCurve(p=hcurve_order, n=dim)
    distances = np.arange(hilbert_curve.max_h + 1)
    points = hilbert_curve.points_from_distances(distances)
    # sample one random point around each vertex, in curve order
    X = np.array([np.random.normal(point, std_dev) for point in points])
    # delete excess points
    k = len(X) - sample_size # excess points
    if k > 0:
        idxs = random.sample(range(len(X)), k=k)
        X = np.delete(X, idxs, axis=0)
    X = pandas.DataFrame(X, columns=[f'x{col}' for col in range(dim)])
    # scale to the user-supplied bound; a zero-range column (e.g. a single
    # remaining point) is mapped to the lower bound instead of NaN
    lowest = X.min(axis=0)
    span = X.max(axis=0) - lowest
    span = span.where(span != 0, 1.0)
    X = (X - lowest) * (u_bound - l_bound) / span + l_bound
    # consecutive points along the curve are neighbours: [i, i + 1]
    first = np.arange(sample_size - 1, dtype=int)
    neighbours = np.column_stack((first, first + 1))
    N = pandas.DataFrame(neighbours, columns=['id1', 'id2'])
    return X, N
def hilbert_curve_neighbour_sampling(X: pandas.DataFrame, binary: bool = False) -> pandas.DataFrame:
    """Generate an N (neighbourhood) file using the hilbert curve.

    Maps samples from an n-dimensional space to positions on a Hilbert
    curve and infers neighbourhood from the resulting 1-d order. The
    inputs are rescaled on a copy; the caller's dataframe is left
    untouched. Taking the 1-d Hilbert order ``[5, 2, 1, 7]``, the following
    set of neighbourhood pairs is inferred: ``[[5, 2], [2, 1], [1, 7]]``.

    Parameters
    ----------
    X : pandas.DataFrame
        Dataframe containing the decision space variable i.e. the X file.
    binary : bool, optional
        Specify whether the sample is binary or not, by default ``False``.
        A binary sample uses a Hilbert curve of order 1 and is not
        stretched onto a finer grid.

    Returns
    -------
    pandas.DataFrame
        A 2-d dataframe sorted by ``id1`` where for a row, the solution in
        column ``id2`` can be reached from column ``id1``. Ids are row
        positions in ``X``.

    Raises
    ------
    ValueError
        If ``X`` has fewer than 2 columns.

    Examples
    --------
    >>> from pyxla.util import load_sample
    >>> from pyxla.sampling import hilbert_curve_neighbour_sampling
    >>> sample = load_sample('nk_n14_k2_id5_F3_V2', test=True)
    >>> N = hilbert_curve_neighbour_sampling(sample) # doctest: +SKIP
    """
    dimensions = len(X.columns)
    if dimensions < 2: raise ValueError('Dimension must be >= 2. The Hilbert curve with dimension 1 is just a number line.')
    n = len(X)
    # fewer than two solutions cannot form a neighbourhood pair
    if n < 2:
        return pandas.DataFrame(np.zeros((0, 2), dtype=int), columns=['id1', 'id2'])
    # work on a copy so that the rescaling below never leaks to the caller
    X = X.copy()
    # calculate order of the Hilbert curve
    hcurve_order = 1 if binary else math.ceil(math.log2(n + 1))
    # min-max scaling per dimension, only for columns outside [0, 1]
    for dim in X:
        col_min, col_max = X[dim].min(), X[dim].max()
        if col_max > 1 or col_min < 0:
            col_range = col_max - col_min
            # a constant column carries no ordering information
            X[dim] = (X[dim] - col_min) / col_range if col_range else 0.0
    if not binary:
        # re-scale to the Hilbert curve grid and convert values to ints;
        # the value 1.0 would land on 2 ** order, one past the largest
        # coordinate expected by the curve, so clamp it into the last cell
        top_cell = 2 ** hcurve_order - 1
        X = (X * 2 ** hcurve_order).astype(int).clip(upper=top_cell)
    hilbert_curve = HilbertCurve(p=hcurve_order, n=dimensions)
    # get distances on 1-dim Hilbert curve
    hilbert_idxs = hilbert_curve.distances_from_points(X.to_numpy())
    # use distances to infer indices to order the sample
    ordered = np.argsort(hilbert_idxs)
    # extract neighbourhood
    # i.e [5, 2, 1, 7] -> [[5, 2], [2, 1], [1, 7]]
    neighbours = np.column_stack((ordered[:-1], ordered[1:])).astype(int)
    # sort the order of indices in the N file
    sorted_idxs = neighbours[:, 0].argsort()
    return pandas.DataFrame(neighbours[sorted_idxs], columns=['id1', 'id2'])
def adaptive_walk_continuous(objective: Callable[[List[float]], float],
                             sample_size: int,
                             step_size: Union[float, List[float]],
                             max: bool = False,
                             dim: int = 1,
                             num_neighbours = 1,
                             step_retries = 10,
                             l_bound: Union[float, List[float]] = 0,
                             u_bound: Union[float, List[float]] = 100,
                             seed: int = None) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """Generate an X (solutions) file and an N (neighbourhood) file using
    an adaptive walk in a continuous search space.

    The walk behaves like a random walk, but a step is only accepted if it
    stays within bounds and strictly improves the objective relative to
    the current point. When no improving neighbour is found within the
    allowed attempts, the walk restarts from a new random point; a restart
    point is added to X without a neighbourhood entry.

    Parameters
    ----------
    objective : Callable[[List[float]], float]
        Single-objective function, called with a 1-d array of length
        ``dim``.
    sample_size : int
        Desired size of the sample.
    step_size : Union[float, List[float]]
        Maximum step in each dimension. If a single number is supplied,
        the same step size is used for all the dimensions.
    max : bool, optional
        If ``True`` a step must increase the objective, otherwise it must
        decrease it, by default ``False``.
    dim : int, optional
        The dimensionality of the sample, by default 1.
    num_neighbours : int, optional
        Number of improving neighbours to look for around each point of
        the walk, by default 1.
    step_retries : int, optional
        Controls how many times a rejected step is redrawn before the
        neighbour attempt is abandoned, by default 10.
    l_bound : Union[float, List[float]], optional
        Lower bound of the sample, by default 0. If a single number is
        supplied, the same bound is used for all the dimensions.
    u_bound : Union[float, List[float]], optional
        Upper bound of the sample, by default 100. If a single number is
        supplied, the same bound is used for all the dimensions.
    seed : int, optional
        Seed for the ``random`` and ``numpy.random`` generators for
        reproducibility, by default None.

    Returns
    -------
    pandas.DataFrame
        A dataframe consisting the solutions i.e an X file, with columns
        ``x0 .. x{dim-1}``.
    pandas.DataFrame
        A dataframe defining neighbourhood among the solutions
        i.e an N file; the solution at ``id2`` was reached from ``id1``.
    """
    # multi-objective??
    if seed is not None:
        random.seed(seed)
        # every step and start point is drawn from numpy's generator, so
        # it has to be seeded as well for the walk to be reproducible
        np.random.seed(seed)
    if not isinstance(step_size, Iterable): step_size = [step_size] * dim
    step_size = np.array(step_size)
    if not isinstance(l_bound, Iterable): l_bound = [l_bound] * dim
    if not isinstance(u_bound, Iterable): u_bound = [u_bound] * dim
    l_bound = np.asarray(l_bound)
    u_bound = np.asarray(u_bound)

    def draw_step():
        # uniform displacement in [-step_size, step_size) per dimension
        return np.random.uniform(low=-step_size, high=step_size, size=dim)

    def acceptable(candidate, reference_fitness):
        # bounds are checked first so the objective is only evaluated for
        # feasible candidates
        if not ((candidate >= l_bound).all() and (candidate <= u_bound).all()):
            return False
        fitness = objective(candidate)
        return fitness > reference_fitness if max else fitness < reference_fitness

    # start from a random position in the domain
    prev_idx = 0
    prev = np.random.uniform(low=l_bound, high=u_bound, size=dim)
    X = [prev]
    neighbourhood = []
    while len(X) < sample_size:
        neighbours = []
        neighbour_attempts = 0
        # fitness of the current point is fixed for all attempts around it
        prev_fitness = objective(prev)
        while len(neighbours) < num_neighbours and len(X) < sample_size:
            if neighbour_attempts > num_neighbours: break
            step_attempts = 0
            candidate = prev + draw_step()
            accepted = acceptable(candidate, prev_fitness)
            # redraw until the step is within bounds and fitter, or give up
            while not accepted:
                if step_attempts > step_retries: break
                step_attempts += 1
                candidate = prev + draw_step()
                accepted = acceptable(candidate, prev_fitness)
            if accepted:
                # record point
                X.append(candidate)
                current_idx = len(X) - 1
                neighbours.append(current_idx)
                # record neighbourhood; the candidate neighbours `prev`
                neighbourhood.append([prev_idx, current_idx])
            neighbour_attempts += 1
        if neighbours:
            # choose a neighbour randomly
            prev_idx = random.choice(neighbours)
            prev = X[prev_idx]
        elif len(X) < sample_size:
            # no improving neighbour found: restart from a random point
            prev = np.random.uniform(low=l_bound, high=u_bound, size=dim)
            X.append(prev)
            prev_idx = len(X) - 1
    X = pandas.DataFrame(X, columns=[f'x{col}' for col in range(dim)])
    N = pandas.DataFrame(neighbourhood, columns=['id1', 'id2'])
    return X, N
def box_scale(X: pandas.DataFrame, l_bound: Union[float, List[float]] = 0,
              u_bound: Union[float, List[float]] = 10):
    """Min-max rescale every column of ``X`` into ``[l_bound, u_bound]``.

    Parameters
    ----------
    X : pandas.DataFrame
        Dataframe whose columns are rescaled independently. It is not
        modified; a new dataframe is returned.
    l_bound : Union[float, List[float]], optional
        Lower bound of the target box, by default 0. If a single number is
        supplied, the same bound is used for all the columns.
    u_bound : Union[float, List[float]], optional
        Upper bound of the target box, by default 10. If a single number
        is supplied, the same bound is used for all the columns.

    Returns
    -------
    pandas.DataFrame
        The rescaled dataframe. A column whose values are all equal is
        mapped to its lower bound.
    """
    dim = len(X.columns)
    if not isinstance(l_bound, Iterable): l_bound = [l_bound] * dim
    if not isinstance(u_bound, Iterable): u_bound = [u_bound] * dim
    l_bound = np.array(l_bound).astype(float)
    u_bound = np.array(u_bound).astype(float)
    lowest = X.min(axis=0)
    span = X.max(axis=0) - lowest
    # a zero range would divide by zero and fill the column with NaN
    span = span.where(span != 0, 1.0)
    return (X - lowest) * (u_bound - l_bound) / span + l_bound
def hilbert_curve_sampling_viz(sample_size: int,
                               dim: int = 2,
                               l_bound = -5,
                               u_bound = 5,
                               std_dev: float = 0.3, obj=None, seed=None):
    """Visualise the stages of Hilbert curve sampling.

    Builds the Hilbert curve, samples a normally distributed point around
    every vertex, deletes excess points at random and rescales to the
    given bounds, plotting columns ``x0`` and ``x1`` at each stage.

    Parameters
    ----------
    sample_size : int
        Desired size of the sample. Must be at least 1.
    dim : int, optional
        The dimensionality of the sample, by default 2. Only the first two
        dimensions are plotted, so it must be at least 2.
    l_bound : float or list of float, optional
        Lower bound used for the rescaled panels, by default -5.
    u_bound : float or list of float, optional
        Upper bound used for the rescaled panels, by default 5.
    std_dev : float, optional
        Standard deviation used to sample points around Hilbert curve
        vertices, by default 0.3.
    obj : callable, optional
        If given, it is applied to every row of the rescaled sample and
        the values are passed to ``plot_3d_`` on a separate 3-d figure
        (which is not returned), by default None.
    seed : int, optional
        Seed for the ``random`` and ``numpy.random`` generators for
        reproducibility, by default None.

    Returns
    -------
    tuple
        ``(fig, hc_fig, final_fig)``: the three-panel stage figure, the
        figure of the bare Hilbert curve and the figure of the final
        rescaled sample.

    Raises
    ------
    ValueError
        If ``sample_size`` is below 1 or ``dim`` is below 2.
    """
    if sample_size < 1:
        raise ValueError('sample_size must be at least 1.')
    if dim < 2:
        raise ValueError('dim must be at least 2 to plot x0 against x1.')
    if seed is not None:
        # `random` picks the deleted points, numpy draws the noise
        random.seed(seed)
        np.random.seed(seed)
    # the order is floored at 1 so that sample_size == 1 does not ask for
    # an order-0 curve
    hcurve_order = max(1, math.ceil(math.log2(sample_size) / dim))
    logging.info('Hilbert curve order: %s', hcurve_order)
    hilbert_curve = HilbertCurve(p=hcurve_order, n=dim)
    distances = list(range(hilbert_curve.max_h + 1))
    points = hilbert_curve.points_from_distances(distances)
    # sample one random point around each vertex, in curve order
    columns = [f'x{col}' for col in range(dim)]
    X = pandas.DataFrame(
        np.array([np.random.normal(point, std_dev) for point in points]),
        columns=columns)
    HC = pandas.DataFrame(points, columns=columns)
    X['type'] = 'randomised'
    HC['type'] = 'HC'
    data = pandas.concat([X, HC])
    cols = 3
    palette = sns.color_palette()

    # figure 1: the bare Hilbert curve
    hc_fig, hc_ax = plt.subplots(ncols=1, figsize=(7, 7))
    ax = sns.scatterplot(HC, x='x0', y='x1', color=palette[1], ax=hc_ax)
    ax.plot(HC['x0'], HC['x1'], ':', color=palette[1])

    # figure 2, panel 1: all randomised points on top of the curve
    fig, axs = plt.subplots(ncols=cols, figsize=(7 * cols, 7))
    ax = sns.scatterplot(data, x='x0', y='x1', hue='type', hue_order=['randomised', 'HC'], ax=axs[0])
    # ax.legend_.remove()
    ax.plot(X['x0'], X['x1'], ':', color=palette[0])
    ax.plot(HC['x0'], HC['x1'], ':', color=palette[1])
    ax.set(title=f"All points ({len(X)})")
    ax.set_aspect('equal', adjustable='box')

    # figure 2, panel 2: remove excess points; with nothing to remove the
    # full frames are plotted as they are
    k = len(X) - sample_size # excess points
    idxs = random.sample(range(len(X)), k=k) if k > 0 else []
    X_ = X[~X.index.isin(idxs)]
    HC_ = HC[~HC.index.isin(idxs)]
    data_ = pandas.concat([X_, HC_])
    ax = sns.scatterplot(data_, x='x0', y='x1', hue='type', hue_order=['randomised', 'HC'], ax=axs[1])
    # ax.legend_.remove()
    ax.plot(X_['x0'], X_['x1'], ':', color=palette[0])
    ax.plot(HC_['x0'], HC_['x1'], ':', color=palette[1])
    ax.set(title=f"{k} points deleted to leave {sample_size} points")
    ax.set_aspect('equal', adjustable='box')

    # figure 2, panel 3: rescale all points to the bounds, then draw a
    # fresh random set of points to delete
    X = box_scale(X.drop('type', axis=1), l_bound, u_bound)
    k = len(X) - sample_size # excess points
    idxs = random.sample(range(len(X)), k=k) if k > 0 else []
    X_ = X[~X.index.isin(idxs)]
    ax = sns.scatterplot(X_, x='x0', y='x1', ax=axs[2])
    ax.plot(X_['x0'], X_['x1'], ':', color=palette[0])
    ax.set(title=f"{k} points deleted to leave {sample_size} points")
    ax.set_aspect('equal', adjustable='box')

    # figure 3: the final rescaled sample on its own
    final_fig, final_ax = plt.subplots(ncols=1, figsize=(7, 7))
    ax = sns.scatterplot(X_, x='x0', y='x1', ax=final_ax)
    ax.plot(X_['x0'], X_['x1'], ':', color=palette[1])

    if obj is not None:
        # objective values before and after deleting the excess points
        F = pandas.DataFrame()
        F['f0'] = X.apply(obj, axis=1)
        F_ = pandas.DataFrame()
        F_['f0'] = X_.apply(obj, axis=1)
        fig3d, axs = plt.subplots(ncols=2, subplot_kw=dict(projection='3d'))
        plot_3d_(X['x0'], X['x1'], F['f0'], axs[0])
        plot_3d_(X_['x0'], X_['x1'], F_['f0'], axs[1])
        fig3d.tight_layout()
    fig.tight_layout()
    return fig, hc_fig, final_fig