PyPI - datamapplot - Versions diffs - 0.1.0__py3-none-any.whl - Mend

datamapplot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

datamapplot/__init__.py +277 -0
datamapplot/medoids.py +103 -0
datamapplot/overlap_computations.py +160 -0
datamapplot/palette_handling.py +224 -0
datamapplot/plot_rendering.py +587 -0
datamapplot/text_placement.py +324 -0
datamapplot-0.1.0.dist-info/LICENSE +21 -0
datamapplot-0.1.0.dist-info/METADATA +119 -0
datamapplot-0.1.0.dist-info/RECORD +11 -0
datamapplot-0.1.0.dist-info/WHEEL +5 -0
datamapplot-0.1.0.dist-info/top_level.txt +1 -0

datamapplot/__init__.py ADDED Viewed

@@ -0,0 +1,277 @@
+import numpy as np
+import pandas as pd
+import textwrap
+from matplotlib import pyplot as plt
+from datamapplot.palette_handling import (
+    palette_from_datamap,
+    palette_from_cmap_and_datamap,
+    deep_palette,
+    pastel_palette,
+)
+from datamapplot.plot_rendering import render_plot
+from datamapplot.medoids import medoid
+def create_plot(
+    data_map_coords,
+    labels,
+    *,
+    title=None,
+    sub_title=None,
+    noise_label="Unlabelled",
+    noise_color="#999999",
+    color_label_text=True,
+    label_wrap_width=16,
+    label_color_map=None,
+    figsize=(12, 12),
+    dynamic_label_size=False,
+    dpi=plt.rcParams["figure.dpi"],
+    force_matplotlib=False,
+    darkmode=False,
+    highlight_labels=None,
+    palette_hue_shift=0.0,
+    palette_hue_radius_dependence=1.0,
+    use_medoids=False,
+    cmap=None,
+    **render_plot_kwds,
+):
+    """Create a static plot from ``data_map_coords`` with text labels provided by ``labels``.
+    This is the primary function for DataMapPlot and provides the easiest interface to the
+    static plotting functionality. This function provides a number of options, but also
+    passes any further keyword options through to the lower level ``render_plot`` function
+    so be sure to check the documentation for ``render_plot`` to discover further keyword
+    arguments that can be used here as well.
+    Parameters
+    ----------
+    data_map_coords: ndarray of floats of shape (n_samples, 2)
+        The 2D coordinates for the data map. Usually this is produced via a
+        dimension reduction technique such as UMAP, t-SNE, PacMAP, PyMDE etc.
+    labels: ndarray of strings (object) of shape (n_samples,)
+        A string label each data point in the data map. There should ideally by
+        only up to 64 unique labels. Noise or unlabelled points should have the
+        same label as ``noise_label``, which is "Unlabelled" by default.
+    title: str or None (optional, default=None)
+        A title for the plot. If ``None`` then no title is used for the plot.
+        The title should be succint; three to seven words.
+    sub_title: str or None (optional, default=None)
+        A sub-title for the plot. If ``None`` then no sub-title is used for the plot.
+        The sub-title can be significantly longer then the title and provide more information\
+        about the plot and data sources.
+    noise_label: str (optional, default="Unlabelled")
+        The string used in the ``labels`` array to identify the unlabelled or noise points
+        in the dataset.
+    noise_color: str (optional, default="#999999")
+        The colour to use for unlabelled or noise points in the data map. This should usually
+        be a muted or neutral colour to distinguish background points from the labelled clusters.
+    color_label_text: bool (optional, default=True)
+        Whether to use colours for the text labels generated in the plot. If ``False`` then
+        the text labels will default to either black or white depending on ``darkmode``.
+    label_wrap_width: int (optional, default=16)
+        The number of characters to apply text-wrapping at when creating text labels for
+        display in the plot. Note that long words will not be broken, so you can choose
+        relatively small values if you want tight text-wrapping.
+    label_color_map: dict or None (optional, default=None)
+        A colour mapping to use to colour points/clusters in the data map. The mapping should
+        be keyed by the unique cluster labels in ``labels`` and take values that are hex-string
+        representations of colours. If ``None`` then a colour mapping will be auto-generated.
+    figsize: (int, int) (optional, default=(12,12))
+        How big to make the figure in inches (actual pixel size will depend on ``dpi``).
+    dynamic_label_size: bool (optional, default=False)
+        Whether to dynamically resize the text labels based on the relative sizes of the
+        clusters. This can be useful to help highlight larger clusters.
+    dpi: int (optional, default=plt.rcParams["figure.dpi"])
+        The dots-per-inch setting usd when rendering the plot.
+    force_matplotlib: bool (optional, default=False)
+        Force using matplotlib instead of datashader for rendering the scatterplot of the
+        data map. This can be useful if you wish to have a different marker_type, or variably
+        sized markers based on a marker_size_array, neither of which are supported by the
+        datashader based renderer.
+    darkmode: bool (optional, default=False)
+        Whether to render the plot in darkmode (with a dark background) or not.
+    highlight_labels: list of str or None (optional, default=None)
+        A list of unique labels that should have their text highlighted in the resulting plot.
+        Arguments supported by ``render_plot`` can allow for control over how highlighted labels
+        are rendered. By default they are simply rendered in bold text.
+    palette_hue_shift: float (optional, default=0.0)
+        A setting, in degrees clockwise, to shift the hue channel when generating a colour
+        palette and color_mapping for the labels.
+    palette_hue_radius_dependence: float (optional, default=1.0)
+        A setting that determines how dependent on the radius the hue channel is. Larger
+        values will result in more hue variation where there are more outlying points.
+    use_medoids: bool (optional, default=False)
+        Whether to use medoids instead of centroids to determine the "location" of the cluster,
+        both for the label indicator line, and for palette colouring. Note that medoids are
+        more computationally expensive, especially for large plots, so use with some caution.
+    cmap: matplotlib cmap or None (optional, default=None)
+        A linear matplotlib cmap colour map to use as the base for a generated colour mapping.
+        This *should* be a matplotlib cmap that is smooth and linear, and cyclic
+        (see the colorcet package for some good options). If not a cyclic cmap it will be
+        "made" cyclic by reflecting it. If ``None`` then a custom method will be used instead.
+    **render_plot_kwds
+        All opther keyword arguments are passed through the ``render_plot`` which provides
+        significant further control over the aesthetics of the plot.
+    Returns
+    -------
+    fig: matplotlib.Figure
+        The figure that the resulting plot is rendered to.
+    ax: matpolotlib.Axes
+        The axes contained within the figure that the plot is rendered to.
+    """
+    cluster_label_vector = np.asarray(labels)
+    unique_non_noise_labels = [
+        label for label in np.unique(cluster_label_vector) if label != noise_label
+    ]
+    if use_medoids:
+        label_locations = np.asarray(
+            [
+                medoid(data_map_coords[cluster_label_vector == i])
+                for i in unique_non_noise_labels
+            ]
+        )
+    else:
+        label_locations = np.asarray(
+            [
+                data_map_coords[cluster_label_vector == i].mean(axis=0)
+                for i in unique_non_noise_labels
+            ]
+        )
+    label_text = [
+        textwrap.fill(x, width=label_wrap_width, break_long_words=False)
+        for x in unique_non_noise_labels
+    ]
+    if highlight_labels is not None:
+        highlight_labels = [
+            textwrap.fill(x, width=label_wrap_width, break_long_words=False)
+            for x in highlight_labels
+        ]
+    # If we don't have a color map, generate one
+    if label_color_map is None:
+        if cmap is None:
+            palette = palette_from_datamap(
+                data_map_coords,
+                label_locations,
+                hue_shift=palette_hue_shift,
+                radius_weight_power=palette_hue_radius_dependence,
+            )
+        else:
+            palette = palette_from_cmap_and_datamap(
+                cmap,
+                data_map_coords,
+                label_locations,
+                radius_weight_power=palette_hue_radius_dependence,
+            )
+        label_to_index_map = {
+            name: index for index, name in enumerate(unique_non_noise_labels)
+        }
+        color_list = [
+            palette[label_to_index_map[x]] if x in label_to_index_map else noise_color
+            for x in cluster_label_vector
+        ]
+        label_color_map = {
+            x: (
+                palette[label_to_index_map[x]]
+                if x in label_to_index_map
+                else noise_color
+            )
+            for x in np.unique(cluster_label_vector)
+        }
+    else:
+        color_list = [
+            label_color_map[x] if x != noise_label else noise_color
+            for x in cluster_label_vector
+        ]
+    # Darken and reduce chroma of label colors to get text labels
+    if color_label_text:
+        if darkmode:
+            label_text_colors = pastel_palette(
+                [label_color_map[x] for x in unique_non_noise_labels]
+            )
+        else:
+            label_text_colors = deep_palette(
+                [label_color_map[x] for x in unique_non_noise_labels]
+            )
+    else:
+        label_text_colors = None
+    if dynamic_label_size:
+        font_scale_factor = np.sqrt(figsize[0] * figsize[1])
+        cluster_sizes = np.sqrt(pd.Series(cluster_label_vector).value_counts())
+        label_size_adjustments = cluster_sizes - cluster_sizes.min()
+        label_size_adjustments /= label_size_adjustments.max()
+        label_size_adjustments *= (
+            render_plot_kwds.get("label_font_size", font_scale_factor) + 2
+        )
+        label_size_adjustments = dict(label_size_adjustments - 2)
+        label_size_adjustments = [
+            label_size_adjustments[x] for x in unique_non_noise_labels
+        ]
+    else:
+        label_size_adjustments = [0.0] * len(unique_non_noise_labels)
+    # Heuristics for point size and alpha values
+    n_points = data_map_coords.shape[0]
+    if data_map_coords.shape[0] < 100_000 or force_matplotlib:
+        magic_number = np.clip(128 * 4 ** (-np.log10(n_points)), 0.05, 64)
+        point_scale_factor = np.sqrt(figsize[0] * figsize[1])
+        point_size = magic_number * (point_scale_factor / 2)
+        alpha = np.clip(magic_number, 0.05, 1)
+    else:
+        point_size = int(np.sqrt(figsize[0] * figsize[1]) * dpi) // 2048
+        alpha = 1.0
+    if "point_size" in render_plot_kwds:
+        point_size = render_plot_kwds.pop("point_size")
+    if "alpha" in render_plot_kwds:
+        alpha = render_plot_kwds.pop("alpha")
+    fig, ax = render_plot(
+        data_map_coords,
+        color_list,
+        label_text,
+        label_locations,
+        title=title,
+        sub_title=sub_title,
+        point_size=point_size,
+        alpha=alpha,
+        label_colors=None if not color_label_text else label_text_colors,
+        highlight_colors=[label_color_map[x] for x in unique_non_noise_labels],
+        figsize=figsize,
+        noise_color=noise_color,
+        label_size_adjustments=label_size_adjustments,
+        dpi=dpi,
+        force_matplotlib=force_matplotlib,
+        darkmode=darkmode,
+        highlight_labels=highlight_labels,
+        **render_plot_kwds,
+    )
+    return fig, ax

datamapplot/medoids.py ADDED Viewed

@@ -0,0 +1,103 @@
+import numpy as np
+import numba
+# Eagerly compiled for two signatures: writable and read-only C-contiguous
+# one-dimensional float32 arrays, both returning a float32.
+@numba.njit(
+    [
+        "f4(f4[::1],f4[::1])",
+        numba.types.float32(
+            numba.types.Array(numba.types.float32, 1, "C", readonly=True),
+            numba.types.Array(numba.types.float32, 1, "C", readonly=True),
+        ),
+    ],
+    fastmath=True,
+    locals={
+        "result": numba.types.float32,
+        "diff": numba.types.float32,
+        "dim": numba.types.intp,
+        # NOTE(review): the loop index is declared as uint16 -- presumably fine for
+        # low-dimensional data; confirm before using with more than 65535 dimensions.
+        "i": numba.types.uint16,
+    },
+)
+def euclidean(x, y):
+    r"""Standard (non-squared) euclidean distance.
+    The squared differences are accumulated and the square root of the total
+    is returned.
+    .. math::
+        D(x, y) = \sqrt{\sum_i (x_i - y_i)^2}
+    Parameters
+    ----------
+    x: 1D float32 array
+        The first point.
+    y: 1D float32 array
+        The second point; indexed over the length of ``x``.
+    Returns
+    -------
+    float32
+        The euclidean distance between ``x`` and ``y``.
+    """
+    result = 0.0
+    dim = x.shape[0]
+    for i in range(dim):
+        diff = x[i] - y[i]
+        result += diff * diff
+    return np.sqrt(result)
+@numba.njit(parallel=True, nogil=True)
+def chunked_parallel_pairwise_distances(X, Y=None, metric=euclidean, chunk_size=16):
+    if Y is None:
+        XX, symmetrical = X, True
+        row_size = col_size = X.shape[0]
+    else:
+        XX, symmetrical = Y, False
+        row_size, col_size = X.shape[0], Y.shape[0]
+    result = np.zeros((row_size, col_size), dtype=np.float32)
+    n_row_chunks = (row_size // chunk_size) + 1
+    for chunk_idx in numba.prange(n_row_chunks):
+        n = chunk_idx * chunk_size
+        chunk_end_n = min(n + chunk_size, row_size)
+        m_start = n if symmetrical else 0
+        for m in range(m_start, col_size, chunk_size):
+            chunk_end_m = min(m + chunk_size, col_size)
+            for i in range(n, chunk_end_n):
+                for j in range(m, chunk_end_m):
+                    result[i, j] = metric(X[i], XX[j])
+    return result
+@numba.njit()
+def pull_arms(data, arms, num_pulls_per_arm, estimates, pull_counts):
+    """Refine, in place, the running mean-distance estimate of each candidate in ``arms``.
+    Parameters
+    ----------
+    data: 2D array
+        All points; rows are sampled from it as reference points.
+    arms: 1D integer array
+        Row indices into ``data`` of the candidates being evaluated.
+    num_pulls_per_arm: int
+        How many distinct reference rows to sample (sampling is without replacement).
+    estimates: 1D float array
+        Running mean distance per candidate, positionally aligned with ``arms``.
+        Updated in place.
+    pull_counts: 1D integer array
+        Number of reference points seen so far per candidate, positionally aligned
+        with ``arms``. Updated in place.
+    Returns
+    -------
+    None
+        The results are written into ``estimates`` and ``pull_counts``.
+    """
+    # Draw the shared set of reference points for this round
+    other_candidates = np.random.choice(
+        data.shape[0], size=num_pulls_per_arm, replace=False
+    ).astype(np.int32)
+    data_arm = data[arms]
+    data_other = data[other_candidates]
+    # Total distance from each candidate to all sampled reference points
+    distance_sums = np.sum(
+        chunked_parallel_pairwise_distances(data_arm, data_other), axis=1
+    )
+    # Turn the running means back into running sums, add the new distances,
+    # then renormalise by the updated counts.
+    estimates *= pull_counts
+    estimates += distance_sums
+    pull_counts += num_pulls_per_arm
+    estimates /= pull_counts
+@numba.njit()
+def medoid(data, arm_budget=20):
+    pull_counts = np.zeros(data.shape[0], dtype=np.int32)
+    pull_budget = arm_budget * data.shape[0]
+    estimates = np.zeros(data.shape[0], dtype=np.float32)
+    current_active_arms = np.arange(data.shape[0])
+    n_rounds = int(np.ceil(np.log2(data.shape[0])))
+    while current_active_arms.shape[0] > 1:
+        num_pulls_per_arm = max(
+            1,
+            int(
+                min(
+                    data.shape[0],
+                    np.floor(pull_budget / (current_active_arms.shape[0] * n_rounds)),
+                )
+            ),
+        )
+        pull_arms(data, current_active_arms, num_pulls_per_arm, estimates, pull_counts)
+        median = np.median(estimates)
+        mask = estimates <= median
+        current_active_arms = current_active_arms[mask]
+        estimates = estimates[mask]
+        pull_counts = pull_counts[mask]
+    return data[current_active_arms[0]]

datamapplot/overlap_computations.py ADDED Viewed

@@ -0,0 +1,160 @@
+import numpy as np
+import io
+from matplotlib import pyplot as plt
+try:
+    from matplotlib.backend_bases import _get_renderer as matplot_get_renderer
+except ImportError:
+    matplot_get_renderer = None
+def ccw(a, b, c):
+    return (c[1] - a[1]) * (b[0] - a[0]) > (b[1] - a[1]) * (c[0] - a[0])
+def intersect(a, b, c, d):
+    return ccw(a, c, d) != ccw(b, c, d) and ccw(a, b, c) != ccw(a, b, d)
+# From bioframe (https://github.com/open2c/bioframe)
+def arange_multi(starts, stops):
+    lengths = stops - starts
+    if np.isscalar(starts):
+        starts = np.full(len(stops), starts)
+    cat_start = np.repeat(starts, lengths)
+    cat_counter = np.arange(lengths.sum()) - np.repeat(
+        lengths.cumsum() - lengths, lengths
+    )
+    cat_range = cat_start + cat_counter
+    return cat_range
+# From bioframe (https://github.com/open2c/bioframe)
+def overlap_intervals(starts1, ends1, starts2, ends2, closed=False, sort=False):
+    starts1 = np.asarray(starts1)
+    ends1 = np.asarray(ends1)
+    starts2 = np.asarray(starts2)
+    ends2 = np.asarray(ends2)
+    # Concatenate intervals lists
+    n1 = len(starts1)
+    n2 = len(starts2)
+    ids1 = np.arange(0, n1)
+    ids2 = np.arange(0, n2)
+    # Sort all intervals together
+    order1 = np.lexsort([ends1, starts1])
+    order2 = np.lexsort([ends2, starts2])
+    starts1, ends1, ids1 = starts1[order1], ends1[order1], ids1[order1]
+    starts2, ends2, ids2 = starts2[order2], ends2[order2], ids2[order2]
+    # Find interval overlaps
+    match_2in1_starts = np.searchsorted(starts2, starts1, "left")
+    match_2in1_ends = np.searchsorted(starts2, ends1, "right" if closed else "left")
+    # "right" is intentional here to avoid duplication
+    match_1in2_starts = np.searchsorted(starts1, starts2, "right")
+    match_1in2_ends = np.searchsorted(starts1, ends2, "right" if closed else "left")
+    # Ignore self-overlaps
+    match_2in1_mask = match_2in1_ends > match_2in1_starts
+    match_1in2_mask = match_1in2_ends > match_1in2_starts
+    match_2in1_starts, match_2in1_ends = (
+        match_2in1_starts[match_2in1_mask],
+        match_2in1_ends[match_2in1_mask],
+    )
+    match_1in2_starts, match_1in2_ends = (
+        match_1in2_starts[match_1in2_mask],
+        match_1in2_ends[match_1in2_mask],
+    )
+    # Generate IDs of pairs of overlapping intervals
+    overlap_ids = np.block(
+        [
+            [
+                np.repeat(ids1[match_2in1_mask], match_2in1_ends - match_2in1_starts)[
+                    :, None
+                ],
+                ids2[arange_multi(match_2in1_starts, match_2in1_ends)][:, None],
+            ],
+            [
+                ids1[arange_multi(match_1in2_starts, match_1in2_ends)][:, None],
+                np.repeat(ids2[match_1in2_mask], match_1in2_ends - match_1in2_starts)[
+                    :, None
+                ],
+            ],
+        ]
+    )
+    if sort:
+        # Sort overlaps according to the 1st
+        overlap_ids = overlap_ids[np.lexsort([overlap_ids[:, 1], overlap_ids[:, 0]])]
+    return overlap_ids
+# From adjustText (https://github.com/Phlya/adjustText)
+def get_renderer(fig):
+    # If the backend support get_renderer() or renderer, use that.
+    if hasattr(fig.canvas, "get_renderer"):
+        return fig.canvas.get_renderer()
+    if hasattr(fig.canvas, "renderer"):
+        return fig.canvas.renderer
+    # Otherwise, if we have the matplotlib function available, use that.
+    if matplot_get_renderer:
+        return matplot_get_renderer(fig)
+    # No dice, try and guess.
+    # Write the figure to a temp location, and then retrieve whichever
+    # render was used (doesn't work in all matplotlib versions).
+    fig.canvas.print_figure(io.BytesIO())
+    try:
+        return fig._cachedRenderer
+    except AttributeError:
+        # No luck.
+        # We're out of options.
+        raise ValueError("Unable to determine renderer") from None
+# From adjustText (https://github.com/Phlya/adjustText)
+def get_bboxes(objs, r=None, expand=(1, 1), ax=None):
+    ax = ax or plt.gca()
+    r = r or get_renderer(ax.get_figure())
+    return [i.get_window_extent(r).expanded(*expand) for i in objs]
+# From adjustText (https://github.com/Phlya/adjustText)
+def get_2d_coordinates(objs, expand=(1.0, 1.0)):
+    try:
+        ax = objs[0].axes
+    except:
+        ax = objs.axes
+    bboxes = get_bboxes(objs, get_renderer(ax.get_figure()), expand, ax)
+    xs = [
+        (ax.convert_xunits(bbox.xmin), ax.convert_yunits(bbox.xmax)) for bbox in bboxes
+    ]
+    ys = [
+        (ax.convert_xunits(bbox.ymin), ax.convert_yunits(bbox.ymax)) for bbox in bboxes
+    ]
+    coords = np.hstack([np.array(xs), np.array(ys)])
+    return coords
+def text_line_overlaps(text_locations, label_locations, text_bounding_boxes):
+    result = []
+    for i, box in enumerate(text_bounding_boxes):
+        for j in range(text_locations.shape[0]):
+            if i == j:
+                continue
+            if intersect(
+                text_locations[j], label_locations[j], box[[0, 2]], box[[1, 3]]
+            ) or intersect(
+                text_locations[j], label_locations[j], box[[0, 3]], box[[1, 2]]
+            ):
+                result.append((i, j))
+    return result