PyPI - microarray - Versions diffs - 0.1.0__py3-none-any.whl - Mend

microarray 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

microarray/__init__.py +15 -0
microarray/_version.py +3 -0
microarray/datasets/__init__.py +3 -0
microarray/datasets/_arrayexpress.py +1 -0
microarray/datasets/_cdf_files.py +35 -0
microarray/datasets/_geo.py +1 -0
microarray/datasets/_utils.py +143 -0
microarray/io/__init__.py +17 -0
microarray/io/_anndata_converter.py +198 -0
microarray/io/_cdf.py +575 -0
microarray/io/_cel.py +591 -0
microarray/io/_read.py +127 -0
microarray/plotting/__init__.py +28 -0
microarray/plotting/_base.py +253 -0
microarray/plotting/_cel.py +75 -0
microarray/plotting/_de_plots.py +239 -0
microarray/plotting/_diagnostic_plots.py +268 -0
microarray/plotting/_heatmap.py +279 -0
microarray/plotting/_ma_plots.py +136 -0
microarray/plotting/_pca.py +320 -0
microarray/plotting/_qc_plots.py +335 -0
microarray/plotting/_score.py +38 -0
microarray/plotting/_top_table_heatmap.py +98 -0
microarray/plotting/_utils.py +280 -0
microarray/preprocessing/__init__.py +39 -0
microarray/preprocessing/_background.py +862 -0
microarray/preprocessing/_log2.py +77 -0
microarray/preprocessing/_normalize.py +1292 -0
microarray/preprocessing/_rma.py +243 -0
microarray/preprocessing/_robust.py +170 -0
microarray/preprocessing/_summarize.py +318 -0
microarray/py.typed +0 -0
microarray/tools/__init__.py +26 -0
microarray/tools/_biomart.py +416 -0
microarray/tools/_empirical_bayes.py +401 -0
microarray/tools/_fdist.py +171 -0
microarray/tools/_linear_models.py +387 -0
microarray/tools/_mds.py +101 -0
microarray/tools/_pca.py +88 -0
microarray/tools/_score.py +86 -0
microarray/tools/_toptable.py +360 -0
microarray-0.1.0.dist-info/METADATA +75 -0
microarray-0.1.0.dist-info/RECORD +44 -0
microarray-0.1.0.dist-info/WHEEL +4 -0

microarray/plotting/_heatmap.py ADDED Viewed

@@ -0,0 +1,279 @@
+from typing import Any
+import matplotlib.colors as mcolors
+import matplotlib.patches as mpatches
+import matplotlib.pyplot as plt
+import numpy as np
+from anndata import AnnData
+from matplotlib.axes import Axes
+from matplotlib.figure import Figure
+from scipy.cluster.hierarchy import dendrogram, linkage
+from scipy.stats import zscore
+from microarray.plotting._utils import get_default_colors
+def _to_dense_array(x: Any) -> np.ndarray:
+    """Convert sparse or dense array-like values to numpy ndarray."""
+    if hasattr(x, "toarray"):
+        return np.asarray(x.toarray())
+    return np.asarray(x)
+def _style_dendrogram_axis(ax: Axes) -> None:
+    """Remove ticks and frame from dendrogram axis."""
+    ax.set_xticks([])
+    ax.set_yticks([])
+    ax.set_frame_on(False)
+    for spine in ax.spines.values():
+        spine.set_visible(False)
+def _equalize_linkage_levels(z: np.ndarray) -> np.ndarray:
+    """Return a copy of linkage matrix with uniformly spaced merge heights."""
+    z_eq = z.copy()
+    z_eq[:, 2] = np.arange(1, z.shape[0] + 1, dtype=float)
+    return z_eq
+def _resolve_group_colors(
+    group_levels: list[str],
+    group_colors: dict[str, str] | str | mcolors.Colormap | None,
+) -> dict[str, str]:
+    """Resolve group color mapping from explicit map, cmap name/object, or defaults."""
+    if isinstance(group_colors, dict):
+        resolved = dict(group_colors)
+        missing = [g for g in group_levels if g not in resolved]
+        if missing:
+            fallback = get_default_colors(len(missing))
+            resolved.update(dict(zip(missing, fallback, strict=False)))
+        return resolved
+    if group_colors is None:
+        default_colors = get_default_colors(len(group_levels))
+        return dict(zip(group_levels, default_colors, strict=False))
+    cmap_obj = plt.get_cmap(group_colors) if isinstance(group_colors, str) else group_colors
+    if len(group_levels) == 1:
+        return {group_levels[0]: mcolors.to_hex(cmap_obj(0.5))}
+    if isinstance(cmap_obj, mcolors.ListedColormap) and cmap_obj.N > 0:
+        # Sample at bin centers to preserve listed color order (e.g. tab10/tab20)
+        positions = ((np.arange(len(group_levels)) % cmap_obj.N) + 0.5) / cmap_obj.N
+    else:
+        positions = np.linspace(0.0, 1.0, len(group_levels), endpoint=True)
+    sampled = [mcolors.to_hex(cmap_obj(p)) for p in positions]
+    return dict(zip(group_levels, sampled, strict=False))
+def heatmap(
+    adata: AnnData,
+    genes: list[str],
+    groupby: str | None = None,
+    group_colors: dict[str, str] | str | mcolors.Colormap | None = None,
+    swap_axes: bool = False,
+    show_dendrograms: bool = True,
+    dendrogram_axes: str | None = None,
+    z_score: bool = True,
+    cmap: str = "RdBu_r",
+    vmin: float = -2,
+    vmax: float = 2,
+    title: str = "Hierarchical Clustering Heatmap",
+    figsize: tuple[float, float] = (14, 10),
+    colorbar_shrink: float = 0.7,
+    show: bool = True,
+) -> tuple[Figure, dict[str, Axes | None]]:
+    """Plot sample-by-gene heatmap with optional hierarchical clustering.
+    Args:
+            adata: AnnData object containing expression data in ``.X``.
+            genes: Ordered list of genes to display in the heatmap.
+            groupby: Optional ``adata.obs`` column used for sample group annotations.
+            group_colors: Optional group color mapping or colormap (name/object).
+            swap_axes: If True, transpose heatmap so genes are rows and samples are columns.
+            show_dendrograms: Backward-compatible toggle for showing dendrograms.
+            dendrogram_axes: Which dendrograms to display: ``"x"``, ``"y"``, ``"both"``, or ``"none"``.
+            z_score: If True, z-score each gene across samples.
+            cmap: Colormap used for heatmap values.
+            vmin: Lower color limit for heatmap.
+            vmax: Upper color limit for heatmap.
+            title: Figure title.
+            figsize: Figure size passed to matplotlib.
+            colorbar_shrink: Shrink factor for colorbar to reduce visual footprint.
+            show: If True, call ``fig.show()`` before returning.
+    Returns:
+            Tuple of ``(Figure, axes_dict)``. The dictionary contains
+            ``heatmap``, ``dendrogram_row``, ``dendrogram_col``, ``groupbar``, and ``colorbar``.
+    """
+    genes = [g for g in dict.fromkeys(genes) if g in adata.var_names]
+    if len(genes) == 0:
+        raise ValueError("None of the provided genes were found in adata.var_names")
+    x = _to_dense_array(adata[:, adata.var.index.isin(genes)].X)
+    if z_score:
+        x = zscore(x, axis=0, nan_policy="omit")
+        x = np.nan_to_num(x, nan=0.0)
+    sample_names = np.array(adata.obs_names)
+    gene_names = np.array(genes)
+    matrix = x.T if swap_axes else x
+    if dendrogram_axes is None:
+        dendrogram_mode = "both" if show_dendrograms else "none"
+    else:
+        dendrogram_mode = str(dendrogram_axes).lower()
+    valid_dendrogram_modes = {"none", "x", "y", "both"}
+    if dendrogram_mode not in valid_dendrogram_modes:
+        raise ValueError("dendrogram_axes must be one of: 'none', 'x', 'y', 'both'")
+    show_dendrogram_x = dendrogram_mode in {"x", "both"}
+    show_dendrogram_y = dendrogram_mode in {"y", "both"}
+    z_rows = None
+    z_cols = None
+    row_order = np.arange(matrix.shape[0])
+    col_order = np.arange(matrix.shape[1])
+    if show_dendrogram_y and matrix.shape[0] > 1:
+        z_rows = linkage(matrix, method="ward", metric="euclidean")
+        row_order = np.array(dendrogram(z_rows, no_plot=True)["leaves"])
+    if show_dendrogram_x and matrix.shape[1] > 1:
+        z_cols = linkage(matrix.T, method="ward", metric="euclidean")
+        col_order = np.array(dendrogram(z_cols, no_plot=True)["leaves"])
+    x_ord = matrix[np.ix_(row_order, col_order)]
+    if swap_axes:
+        row_labels = gene_names[row_order]
+        col_labels = sample_names[col_order]
+    else:
+        row_labels = sample_names[row_order]
+        col_labels = gene_names[col_order]
+    has_groups = groupby is not None
+    if has_groups and groupby not in adata.obs:
+        raise ValueError(f"Column '{groupby}' not found in adata.obs")
+    use_groupbar_column = has_groups and not swap_axes
+    n_rows = 2 if show_dendrogram_x else 1
+    heatmap_row_idx = 1 if show_dendrogram_x else 0
+    width_ratios = [4.6]
+    if show_dendrogram_y:
+        width_ratios.append(1.0)
+    if use_groupbar_column:
+        width_ratios.append(0.14)
+        width_ratios.append(0.22)
+    fig = plt.figure(figsize=figsize)
+    if n_rows == 2:
+        gs = fig.add_gridspec(
+            2,
+            len(width_ratios),
+            width_ratios=width_ratios,
+            height_ratios=[1.0, 4.6],
+            hspace=0.05,
+            wspace=0.05,
+        )
+    else:
+        gs = fig.add_gridspec(1, len(width_ratios), width_ratios=width_ratios, wspace=0.05)
+    ax_heatmap = fig.add_subplot(gs[heatmap_row_idx, 0])
+    ax_dendro_col = fig.add_subplot(gs[0, 0]) if show_dendrogram_x else None
+    ax_dendro_row = fig.add_subplot(gs[heatmap_row_idx, 1]) if show_dendrogram_y else None
+    group_col_idx = (2 + int(show_dendrogram_y)) if use_groupbar_column else None
+    if ax_dendro_col is not None and z_cols is not None:
+        z_cols_eq = _equalize_linkage_levels(z_cols)
+        dendrogram(
+            z_cols_eq,
+            ax=ax_dendro_col,
+            orientation="top",
+            no_labels=True,
+            link_color_func=lambda _: "black",
+        )
+    if ax_dendro_row is not None and z_rows is not None:
+        z_rows_eq = _equalize_linkage_levels(z_rows)
+        dendrogram(
+            z_rows_eq,
+            ax=ax_dendro_row,
+            orientation="right",
+            no_labels=True,
+            link_color_func=lambda _: "black",
+        )
+    if ax_dendro_col is not None:
+        _style_dendrogram_axis(ax_dendro_col)
+    if ax_dendro_row is not None:
+        _style_dendrogram_axis(ax_dendro_row)
+    im = ax_heatmap.imshow(x_ord, aspect="auto", cmap=cmap, vmin=vmin, vmax=vmax)
+    ax_heatmap.set_xlabel("Samples" if swap_axes else "Genes")
+    ax_heatmap.set_ylabel("Genes" if swap_axes else "Samples")
+    ax_heatmap.set_xticks(np.arange(len(col_labels)))
+    ax_heatmap.set_xticklabels(col_labels, rotation=90)
+    ax_heatmap.set_yticks(np.arange(len(row_labels)))
+    ax_heatmap.set_yticklabels(row_labels)
+    ax_heatmap.yaxis.tick_left()
+    ax_heatmap.yaxis.set_label_position("left")
+    ax_heatmap.tick_params(axis="y", labelleft=True, labelright=False)
+    ax_groupbar = None
+    if has_groups:
+        group_values_full = adata.obs[groupby].astype(str).to_numpy()
+        sample_order = col_order if swap_axes else row_order
+        group_values = group_values_full[sample_order]
+        group_levels = list(dict.fromkeys(group_values.tolist()))
+        group_colors = _resolve_group_colors(group_levels, group_colors)
+        legend_handles = [mpatches.Patch(facecolor=group_colors[g], edgecolor="black", label=g) for g in group_levels]
+        ax_heatmap.legend(
+            handles=legend_handles,
+            title=str(groupby),
+            loc="upper left",
+            bbox_to_anchor=(1.02, 1.2),
+            frameon=False,
+        )
+    if has_groups and group_col_idx is not None:
+        ax_groupbar = fig.add_subplot(gs[heatmap_row_idx, group_col_idx])
+        rgba = np.array([mcolors.to_rgba(group_colors[g]) for g in group_values]).reshape(-1, 1, 4)
+        ax_groupbar.imshow(rgba, aspect="auto", origin="upper")
+        ax_groupbar.set_xticks([])
+        ax_groupbar.set_yticks([])
+        for spine in ax_groupbar.spines.values():
+            spine.set_visible(False)
+    elif has_groups and swap_axes:
+        ax_groupbar = ax_heatmap.inset_axes([0.0, 1.01, 1.0, 0.04], transform=ax_heatmap.transAxes)
+        rgba = np.array([mcolors.to_rgba(group_colors[g]) for g in group_values]).reshape(1, -1, 4)
+        ax_groupbar.imshow(rgba, aspect="auto", origin="upper")
+        ax_groupbar.set_xticks([])
+        ax_groupbar.set_yticks([])
+        for spine in ax_groupbar.spines.values():
+            spine.set_visible(False)
+    shrink = float(np.clip(colorbar_shrink, 0.2, 1.0))
+    cbar_width = 0.20 * shrink
+    cbar_bottom = 0.05
+    cbar_left = 0.95 - cbar_width
+    ax_cbar = fig.add_axes([cbar_left, cbar_bottom, cbar_width, 0.018])
+    cbar = fig.colorbar(im, cax=ax_cbar, orientation="horizontal")
+    cbar.set_label("Z-score" if z_score else "Expression")
+    fig.suptitle(title)
+    if show:
+        fig.show()
+    return fig, {
+        "heatmap": ax_heatmap,
+        "dendrogram_row": ax_dendro_row,
+        "dendrogram_col": ax_dendro_col,
+        "groupbar": ax_groupbar,
+        "colorbar": ax_cbar,
+    }

microarray/plotting/_ma_plots.py ADDED Viewed

@@ -0,0 +1,136 @@
+"""MA and MD plot functions for microarray quality control."""
+from typing import Any
+import matplotlib.pyplot as plt
+import numpy as np
+from anndata import AnnData
+from matplotlib.axes import Axes
+from microarray.plotting._utils import add_loess_curve, add_reference_line, with_highlights
+def ma(
+    adata: AnnData,
+    arrays: tuple[int | str, int | str] | None = None,
+    status: np.ndarray | None = None,
+    span: float = 0.3,
+    xlab: str = "A (average log-expression)",
+    ylab: str = "M (log-ratio)",
+    title: str = "",
+    loess: bool = True,
+    reference_line: bool = True,
+    ax: Axes | None = None,
+    **kwargs: Any,
+) -> Axes:
+    """MA plot (M vs A plot) for comparing two arrays or array vs reference.
+    MA plot displays log-ratio (M) vs average log-expression (A) to visualize
+    differences between two arrays. Useful for quality control and identifying
+    systematic biases.
+    M = log2(array1) - log2(array2)
+    A = 0.5 * (log2(array1) + log2(array2))
+    Args:
+        adata: AnnData object with probe-level expression data in .X
+        arrays: Tuple of two array indices/names to compare. If None, compares
+            first array to pseudo-median reference.
+        status: Status labels for highlighting points (e.g., 'up', 'down', 'not-significant')
+        span: Smoothing span for LOESS curve (0-1). Default 0.3.
+        xlab: X-axis label
+        ylab: Y-axis label
+        title: Plot title
+        loess: Whether to add LOESS smoothing curve. Default True.
+        reference_line: Whether to add horizontal line at M=0. Default True.
+        ax: Existing Axes object. If None, creates new figure.
+        **kwargs: Additional arguments passed to scatter plot
+    Returns:
+        Axes object with MA plot
+    Examples:
+        >>> import anndata as ad
+        >>> import numpy as np
+        >>> from microarray.plotting import ma
+        >>> # Compare two arrays
+        >>> data = np.random.randn(1000, 4)
+        >>> adata = ad.AnnData(data.T)
+        >>> ax = ma(adata, arrays=(0, 1))
+        >>> # Compare to median reference
+        >>> ax = ma(adata)
+    """
+    if ax is None:
+        _, ax = plt.subplots(figsize=(8, 6))
+    # Get expression matrix (probes x samples)
+    # AnnData stores as samples x features, so transpose
+    expr = adata.X.T  # Now probes x samples
+    # Check for multiple arrays
+    if expr.ndim == 1 or expr.shape[1] < 2:
+        raise ValueError("AnnData must contain multiple arrays for MA plot")
+    # Convert to log2 if not already
+    # Check if data appears to be log-transformed (negative values or small range)
+    if expr.min() < 0 or (expr.max() - expr.min()) < 20:
+        # Likely already log-transformed
+        log_expr = expr
+    else:
+        # Apply log2 transformation
+        log_expr = np.log2(expr + 1)  # Add pseudocount to avoid log(0)
+    # Select arrays to compare
+    if arrays is None:
+        # Compare first array to pseudo-median reference
+        array1_idx = 0
+        reference = np.median(log_expr, axis=1)  # Median across all arrays
+        log_array1 = log_expr[:, array1_idx]
+        log_array2 = reference
+        if not title:
+            title = "MA Plot: Array 0 vs Median"
+    else:
+        # Compare two specified arrays
+        if len(arrays) != 2:
+            raise ValueError("arrays must be a tuple of length 2")
+        # Handle array indices or names
+        if isinstance(arrays[0], str):
+            array1_idx = list(adata.obs_names).index(arrays[0])
+        else:
+            array1_idx = arrays[0]
+        if isinstance(arrays[1], str):
+            array2_idx = list(adata.obs_names).index(arrays[1])
+        else:
+            array2_idx = arrays[1]
+        log_array1 = log_expr[:, array1_idx]
+        log_array2 = log_expr[:, array2_idx]
+        if not title:
+            title = f"MA Plot: Array {array1_idx} vs Array {array2_idx}"
+    # Calculate M and A
+    M = log_array1 - log_array2
+    A = 0.5 * (log_array1 + log_array2)
+    # Remove NaN/Inf values
+    mask = np.isfinite(M) & np.isfinite(A)
+    M = M[mask]
+    A = A[mask]
+    if status is not None:
+        status = status[mask]
+    # Create scatter plot with highlighting
+    ax = with_highlights(A, M, status=status, xlab=xlab, ylab=ylab, title=title, ax=ax, **kwargs)
+    # Add reference line at M=0
+    if reference_line:
+        add_reference_line(ax, y=0, color="gray", linestyle="--", linewidth=1)
+    # Add LOESS smoothing curve
+    if loess and len(A) > 10:
+        add_loess_curve(ax, A, M, span=span, color="blue", linewidth=2, label="LOESS")
+        ax.legend(loc="best")
+    return ax