PyPI - dclab - Versions diffs - 0.62.16__cp312-cp312-win_amd64.whl → 0.63.0__cp312-cp312-win_amd64.whl - Mend

dclab 0.62.16__cp312-cp312-win_amd64.whl → 0.63.0__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dclab might be problematic. Click here for more details.

Files changed (21) hide show

dclab/__init__.py +23 -5
dclab/_version.py +2 -2
dclab/downsampling.cp312-win_amd64.pyd +0 -0
dclab/external/skimage/_find_contours_cy.cp312-win_amd64.pyd +0 -0
dclab/external/skimage/_pnpoly.cp312-win_amd64.pyd +0 -0
dclab/external/skimage/_shared/geometry.cp312-win_amd64.pyd +0 -0
dclab/kde/__init__.py +1 -0
dclab/kde/base.py +238 -0
dclab/kde/contours.py +222 -0
dclab/kde/methods.py +303 -0
dclab/kde_contours.py +7 -219
dclab/kde_methods.py +9 -301
dclab/rtdc_dataset/core.py +30 -146
dclab/rtdc_dataset/export.py +129 -37
dclab/util.py +20 -0
{dclab-0.62.16.dist-info → dclab-0.63.0.dist-info}/METADATA +4 -4
{dclab-0.62.16.dist-info → dclab-0.63.0.dist-info}/RECORD +21 -17
{dclab-0.62.16.dist-info → dclab-0.63.0.dist-info}/WHEEL +1 -1
{dclab-0.62.16.dist-info → dclab-0.63.0.dist-info}/licenses/LICENSE +1 -61
{dclab-0.62.16.dist-info → dclab-0.63.0.dist-info}/entry_points.txt +0 -0
{dclab-0.62.16.dist-info → dclab-0.63.0.dist-info}/top_level.txt +0 -0

dclab/kde/methods.py ADDED Viewed

@@ -0,0 +1,303 @@
+"""Kernel Density Estimation methods"""
+import numpy as np
+from scipy.interpolate import RectBivariateSpline
+from scipy.stats import gaussian_kde, skew
+from ..cached import Cache
+from ..external.statsmodels.nonparametric.kernel_density import KDEMultivariate
+def bin_num_doane(a):
+    """Compute number of bins based on Doane's formula
+
+    Parameters
+    ----------
+    a: ndarray
+        Input data; nan and inf values are ignored.
+
+    Returns
+    -------
+    num: int
+        Range of the finite data divided by the Doane bin width,
+        rounded to the nearest integer.
+
+    Notes
+    -----
+    If the bin width cannot be determined (no finite values, or a
+    bin width of zero or nan), then a bin number of 5 is returned.
+
+    See Also
+    --------
+    bin_width_doane: method used to compute the bin width
+    """
+    bad = np.isnan(a) | np.isinf(a)
+    data = a[~bad]
+    if data.size == 0:
+        # Without any finite value there is no data range; computing
+        # `data.max()` would raise instead of yielding the fallback.
+        return 5
+    # `data` is already filtered, the helper filters again (no-op)
+    acc = bin_width_doane(data)
+    if acc == 0 or np.isnan(acc):
+        return 5
+    return int(np.round((data.max() - data.min()) / acc))
+def bin_width_doane(a):
+    """Compute contour spacing based on Doane's formula
+
+    Parameters
+    ----------
+    a: ndarray
+        Input data; nan and inf values are ignored.
+
+    Returns
+    -------
+    acc: float
+        Range of the finite data divided by Doane's number of bins,
+        or nan if fewer than three finite values are available.
+
+    References
+    ----------
+    - `<https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width>`_
+    - `<https://stats.stackexchange.com/questions/55134/
+      doanes-formula-for-histogram-binning>`_
+
+    Notes
+    -----
+    Doane's formula is actually designed for histograms. This
+    function is kept here for backwards-compatibility reasons.
+    It is highly recommended to use :func:`bin_width_percentile`
+    instead.
+    """
+    bad = np.isnan(a) | np.isinf(a)
+    data = a[~bad]
+    n = data.size
+    if n < 3:
+        # `sigma_g1` below is zero (n == 2) or not a real number
+        # (n < 2), and `data.max()` raises for n == 0: the bin width
+        # is undefined, which is signalled to the caller with nan.
+        return np.nan
+    g1 = skew(data)
+    # standard error of the skewness for a sample of size n
+    sigma_g1 = np.sqrt(6 * (n - 2) / ((n + 1) * (n + 3)))
+    # Doane's number of bins
+    k = 1 + np.log2(n) + np.log2(1 + np.abs(g1) / sigma_g1)
+    acc = (data.max() - data.min()) / k
+    return acc
+def bin_width_percentile(a):
+    """Compute contour spacing based on data percentiles
+
+    The spacing is the difference between the 90th and the 10th
+    percentile of the finite input data, divided by 23.
+
+    Parameters
+    ----------
+    a: ndarray
+        Input data; nan and inf values are ignored.
+
+    Returns
+    -------
+    acc: float
+        The contour spacing.
+
+    Notes
+    -----
+    The Freedman–Diaconis rule uses the interquartile range and
+    normalizes to the third root of len(a). Such things do not
+    work very well for RT-DC data, because len(a) is huge. Here
+    we use just the top and bottom 10th percentiles with a fixed
+    normalization.
+    """
+    finite = a[~(np.isnan(a) | np.isinf(a))]
+    lower, upper = np.percentile(finite, (10, 90))
+    return (upper - lower) / 23
+def get_bad_vals(x, y):
+    """Return a boolean mask that is True where `x` or `y` is nan or inf"""
+    return ~(np.isfinite(x) & np.isfinite(y))
+def ignore_nan_inf(kde_method):
+    """Ignores nans and infs from the input data
+
+    The wrapped method only sees events (and output coordinates)
+    that are neither nan nor inf. Invalid positions in the
+    resulting density are set to nan.
+
+    Parameters
+    ----------
+    kde_method: callable
+        KDE method with the signature
+        `kde_method(events_x, events_y, xout, yout, *args, **kwargs)`
+
+    Returns
+    -------
+    new_kde_method: callable
+        Wrapped method; its docstring is that of `kde_method` with
+        an appended note.
+
+    Raises
+    ------
+    ValueError
+        Raised by the wrapped method if only one of `xout` and
+        `yout` is given.
+    """
+    def new_kde_method(events_x, events_y, xout=None, yout=None,
+                       *args, **kwargs):
+        if (xout is None) != (yout is None):
+            # Check here, because below a lone `yout` would be dropped
+            # silently and a lone `xout` would fail with a TypeError
+            # before the wrapped method could complain.
+            raise ValueError("Both `xout` and `yout` must be (un)set.")
+        bad_in = get_bad_vals(events_x, events_y)
+        if xout is None:
+            # density is evaluated at the event positions
+            density = np.zeros_like(events_x, dtype=np.float64)
+            bad_out = bad_in
+            xo = yo = None
+        else:
+            density = np.zeros_like(xout, dtype=np.float64)
+            bad_out = get_bad_vals(xout, yout)
+            xo = xout[~bad_out]
+            yo = yout[~bad_out]
+        # Filter events
+        ev_x = events_x[~bad_in]
+        ev_y = events_y[~bad_in]
+        density[~bad_out] = kde_method(ev_x, ev_y,
+                                       xo, yo,
+                                       *args, **kwargs)
+        density[bad_out] = np.nan
+        return density
+    doc_add = "\n    Notes\n" +\
+              "    -----\n" +\
+              "    This is a wrapped version that ignores nan and inf values."
+    # `__doc__` is None if the wrapped callable has no docstring
+    # (e.g. when running with `python -OO`)
+    new_kde_method.__doc__ = (kde_method.__doc__ or "") + doc_add
+    return new_kde_method
+@ignore_nan_inf
+@Cache
+def kde_gauss(events_x, events_y, xout=None, yout=None):
+    """ Gaussian Kernel Density Estimation
+
+    Parameters
+    ----------
+    events_x, events_y: 1D ndarray
+        The input points for kernel density estimation. Input
+        is flattened automatically.
+    xout, yout: ndarray
+        The coordinates at which the KDE should be computed.
+        If set to none, input coordinates are used.
+
+    Returns
+    -------
+    density: ndarray, same shape as `xout`
+        The KDE for the points in (xout, yout); all nan if the
+        estimation fails with a `LinAlgError`
+
+    Raises
+    ------
+    ValueError
+        If only one of `xout` and `yout` is given
+
+    See Also
+    --------
+    `scipy.stats.gaussian_kde`
+    """
+    if (xout is None) != (yout is None):
+        raise ValueError("Both `xout` and `yout` must be (un)set.")
+    if xout is None:
+        # evaluate the KDE at the event positions
+        xout, yout = events_x, events_y
+    out_shape = xout.shape
+    try:
+        kde = gaussian_kde([events_x.flatten(), events_y.flatten()])
+        density = kde.evaluate([xout.flatten(), yout.flatten()])
+    except np.linalg.LinAlgError:
+        # LinAlgError occurs when matrix to solve is singular (issue #117)
+        density = np.full(out_shape, np.nan)
+    return density.reshape(out_shape)
+@ignore_nan_inf
+@Cache
+def kde_histogram(events_x, events_y, xout=None, yout=None, bins=None):
+    """ Histogram-based Kernel Density Estimation
+
+    A normalized 2D histogram of the events is interpolated with
+    a bivariate spline that is evaluated at the output coordinates.
+
+    Parameters
+    ----------
+    events_x, events_y: 1D ndarray
+        The input points for kernel density estimation. Input
+        is flattened automatically.
+    xout, yout: ndarray
+        The coordinates at which the KDE should be computed.
+        If set to none, input coordinates are used.
+    bins: tuple (binsx, binsy)
+        The number of bins to use for the histogram. If not given,
+        :func:`bin_num_doane` is used for each axis, with a minimum
+        of 5 bins.
+
+    Returns
+    -------
+    density: ndarray, same shape as `xout`
+        The KDE for the points in (xout, yout)
+
+    Raises
+    ------
+    ValueError
+        If only one of `xout` and `yout` is given
+
+    See Also
+    --------
+    `numpy.histogram2d`
+    `scipy.interpolate.RectBivariateSpline`
+    """
+    if (xout is None) != (yout is None):
+        raise ValueError("Both `xout` and `yout` must be (un)set.")
+    if xout is None:
+        # evaluate the KDE at the event positions
+        xout, yout = events_x, events_y
+    if bins is None:
+        bins = tuple(max(5, bin_num_doane(ev))
+                     for ev in (events_x, events_y))
+    # Compute the histogram
+    counts, edges_x, edges_y = np.histogram2d(x=events_x,
+                                              y=events_y,
+                                              bins=bins,
+                                              density=True)
+    # Bin centers: upper edges shifted back by half the first bin width
+    centers_x = edges_x[1:] - (edges_x[1] - edges_x[0]) / 2
+    centers_y = edges_y[1:] - (edges_y[1] - edges_y[0]) / 2
+    spline = RectBivariateSpline(x=centers_x, y=centers_y, z=counts)
+    density = spline.ev(xout, yout)
+    # negative interpolated values are set to zero
+    density[density < 0] = 0
+    return density.reshape(xout.shape)
+def kde_none(events_x, events_y, xout=None, yout=None):
+    """No Kernel Density Estimation
+
+    Parameters
+    ----------
+    events_x, events_y: 1D ndarray
+        The input points; only the shape of `events_x` is used
+        (when no output coordinates are given).
+    xout, yout: ndarray
+        The coordinates at which the KDE should be computed.
+        If set to none, input coordinates are used.
+
+    Returns
+    -------
+    density: ndarray, same shape as `xout`
+        An array of ones
+
+    Raises
+    ------
+    ValueError
+        If only one of `xout` and `yout` is given
+
+    Notes
+    -----
+    This method is a convenience method that always returns ones in the shape
+    that the other methods in this module produce.
+    """
+    if (xout is None) != (yout is None):
+        raise ValueError("Both `xout` and `yout` must be (un)set.")
+    reference = events_x if xout is None else xout
+    return np.ones(reference.shape)
+@ignore_nan_inf
+@Cache
+def kde_multivariate(events_x, events_y, xout=None, yout=None, bw=None):
+    """ Multivariate Kernel Density Estimation
+
+    Parameters
+    ----------
+    events_x, events_y: 1D ndarray
+        The input points for kernel density estimation. Input
+        is flattened automatically.
+    xout, yout: ndarray
+        The coordinates at which the KDE should be computed.
+        If set to none, input coordinates are used.
+    bw: tuple (bwx, bwy) or None
+        The bandwidth for kernel density estimation. If None, half
+        of the value returned by :func:`bin_width_doane` is used
+        for each axis.
+
+    Returns
+    -------
+    density: ndarray, same shape as `xout`
+        The KDE for the points in (xout, yout)
+
+    Raises
+    ------
+    ValueError
+        If only one of `xout` and `yout` is given
+
+    See Also
+    --------
+    `statsmodels.nonparametric.kernel_density.KDEMultivariate`
+    """
+    if (xout is None) != (yout is None):
+        raise ValueError("Both `xout` and `yout` must be (un)set.")
+    if xout is None:
+        # evaluate the KDE at the event positions
+        xout, yout = events_x, events_y
+    if bw is None:
+        # divide by 2 to make it comparable to histogram KDE
+        bw = tuple(bin_width_doane(ev) / 2
+                   for ev in (events_x, events_y))
+    out_shape = xout.shape
+    positions = np.vstack([xout.flatten(), yout.flatten()])
+    samples = [events_x.flatten(), events_y.flatten()]
+    estimator_ly = KDEMultivariate(data=samples, var_type='cc', bw=bw)
+    return estimator_ly.pdf(positions).reshape(out_shape)
+# Lookup table from KDE method name to its implementation
+methods = {
+    "gauss": kde_gauss,
+    "histogram": kde_histogram,
+    "none": kde_none,
+    "multivariate": kde_multivariate,
+}

dclab/kde_contours.py CHANGED Viewed

@@ -1,222 +1,10 @@
+import warnings
-import numpy as np
+from .kde.contours import (  # noqa: F401
+    find_contours_level, _find_quantile_level, get_quantile_levels
+)
-from .external.skimage.measure import find_contours, points_in_poly
-import scipy.interpolate as spint
-from .kde_methods import get_bad_vals
-def find_contours_level(density, x, y, level, closed=False):
-    """Find iso-valued density contours for a given level value
-    Parameters
-    ----------
-    density: 2d ndarray of shape (M, N)
-        Kernel density estimate (KDE) for which to compute the contours
-    x: 2d ndarray of shape (M, N) or 1d ndarray of size M
-        X-values corresponding to `density`
-    y: 2d ndarray of shape (M, N) or 1d ndarray of size M
-        Y-values corresponding to `density`
-    level: float between 0 and 1
-        Value along which to find contours in `density` relative
-        to its maximum
-    closed: bool
-        Whether to close contours at the KDE support boundaries
-    Returns
-    -------
-    contours: list of ndarrays of shape (P, 2)
-        Contours found for the given level value
-    See Also
-    --------
-    skimage.measure.find_contours: Contour finding algorithm used
-    """
-    if level >= 1 or level <= 0:
-        raise ValueError("`level` must be in (0,1), got '{}'!".format(level))
-    # level relative to maximum
-    level = level * density.max()
-    # xy coordinates
-    if len(x.shape) == 2:
-        assert np.all(x[:, 0] == x[:, 1])
-        x = x[:, 0]
-    if len(y.shape) == 2:
-        assert np.all(y[0, :] == y[1, :])
-        y = y[0, :]
-    if closed:
-        # find closed contours
-        density = np.pad(density, ((1, 1), (1, 1)), mode="constant")
-        offset = 1
-    else:
-        # leave contours open at kde boundary
-        offset = 0
-    conts_idx = find_contours(density, level)
-    conts_xy = []
-    for cc in conts_idx:
-        cx = np.interp(x=cc[:, 0]-offset,
-                       xp=range(x.size),
-                       fp=x)
-        cy = np.interp(x=cc[:, 1]-offset,
-                       xp=range(y.size),
-                       fp=y)
-        conts_xy.append(np.stack((cx, cy), axis=1))
-    return conts_xy
-def get_quantile_levels(density, x, y, xp, yp, q, normalize=True):
-    """Compute density levels for given quantiles by interpolation
-    For a given 2D density, compute the density levels at which
-    the resulting contours contain the fraction `1-q` of all
-    data points. E.g. for a measurement of 1000 events, all
-    contours at the level corresponding to a quantile of
-    `q=0.95` (95th percentile) contain 50 events (5%).
-    Parameters
-    ----------
-    density: 2d ndarray of shape (M, N)
-        Kernel density estimate for which to compute the contours
-    x: 2d ndarray of shape (M, N) or 1d ndarray of size M
-        X-values corresponding to `density`
-    y: 2d ndarray of shape (M, N) or 1d ndarray of size M
-        Y-values corresponding to `density`
-    xp: 1d ndarray of size D
-        Event x-data from which to compute the quantile
-    yp: 1d ndarray of size D
-        Event y-data from which to compute the quantile
-    q: array_like or float between 0 and 1
-        Quantile along which to find contours in `density` relative
-        to its maximum
-    normalize: bool
-        Whether output levels should be normalized to the maximum
-        of `density`
-    Returns
-    -------
-    level: np.ndarray or float
-        Contours level(s) corresponding to the given quantile
-    Notes
-    -----
-    NaN-values events in `xp` and `yp` are ignored.
-    """
-    # xy coordinates
-    if len(x.shape) == 2:
-        assert np.all(x[:, 0] == x[:, 1])
-        x = x[:, 0]
-    if len(y.shape) == 2:
-        assert np.all(y[0, :] == y[1, :])
-        y = y[0, :]
-    # remove bad events
-    bad = get_bad_vals(xp, yp)
-    xp = xp[~bad]
-    yp = yp[~bad]
-    # Normalize interpolation data such that the spacing for
-    # x and y is about the same during interpolation.
-    x_norm = x.max()
-    x = x / x_norm
-    xp = xp / x_norm
-    y_norm = y.max()
-    y = y / y_norm
-    yp = yp / y_norm
-    # Perform interpolation
-    dp = spint.interpn((x, y), density,
-                       (xp, yp),
-                       method='linear',
-                       bounds_error=False,
-                       fill_value=0)
-    if normalize:
-        dp /= density.max()
-    if not np.isscalar(q):
-        q = np.array(q)
-    plev = np.nanpercentile(dp, q=q*100)
-    return plev
-def _find_quantile_level(density, x, y, xp, yp, quantile, acc=.01,
-                         ret_err=False):
-    """Find density level for a given data quantile by iteration
-    Parameters
-    ----------
-    density: 2d ndarray of shape (M, N)
-        Kernel density estimate for which to compute the contours
-    x: 2d ndarray of shape (M, N) or 1d ndarray of size M
-        X-values corresponding to `density`
-    y: 2d ndarray of shape (M, N) or 1d ndarray of size M
-        Y-values corresponding to `density`
-    xp: 1d ndarray of size D
-        Event x-data from which to compute the quantile
-    yp: 1d ndarray of size D
-        Event y-data from which to compute the quantile
-    quantile: float between 0 and 1
-        Quantile along which to find contours in `density` relative
-        to its maximum
-    acc: float
-        Desired absolute accuracy (stopping criterion) of the
-        contours
-    ret_err: bool
-        If True, also return the absolute error
-    Returns
-    -------
-    level: float
-        Contours level corresponding to the given quantile
-    Notes
-    -----
-    A much more faster method (using interpolation) is implemented in
-    :func:`get_quantile_levels`.
-    NaN-values events in `xp` and `yp` are ignored.
-    See Also
-    --------
-    skimage.measure.find_contours: Contour finding algorithm
-    """
-    if quantile >= 1 or quantile <= 0:
-        raise ValueError("Invalid value for `quantile`: {}".format(quantile))
-    # remove bad events
-    bad = get_bad_vals(xp, yp)
-    xp = xp[~bad]
-    yp = yp[~bad]
-    points = np.concatenate((xp.reshape(-1, 1), yp.reshape(-1, 1)), axis=1)
-    # initial guess
-    level = quantile
-    # error of current iteration
-    err = 1
-    # iteration factor (guarantees convergence)
-    itfac = 1
-    # total number of events
-    nev = xp.size
-    while np.abs(err) > acc:
-        # compute contours
-        conts = find_contours_level(density, x, y, level, closed=True)
-        # compute number of points in contour
-        isin = 0
-        pi = np.array(points, copy=True)
-        for cc in conts:
-            pinc = points_in_poly(points=pi, verts=cc)
-            isin += np.sum(pinc)
-            # ignore these points for the other contours
-            pi = pi[~pinc]
-        err = quantile - (nev - isin) / nev
-        level += err * itfac
-        itfac *= .9
-    if ret_err:
-        return level, err
-    else:
-        return level
+# `stacklevel=2` attributes the warning to the code importing this
+# compatibility module rather than to this module itself.
+warnings.warn("`dclab.kde_contours` is deprecated; please use "
+              "the `dclab.kde.contours` instead",
+              DeprecationWarning,
+              stacklevel=2)