PyPI - microarray - Versions diffs - 0.1.0__py3-none-any.whl - Mend

microarray 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

microarray/__init__.py +15 -0
microarray/_version.py +3 -0
microarray/datasets/__init__.py +3 -0
microarray/datasets/_arrayexpress.py +1 -0
microarray/datasets/_cdf_files.py +35 -0
microarray/datasets/_geo.py +1 -0
microarray/datasets/_utils.py +143 -0
microarray/io/__init__.py +17 -0
microarray/io/_anndata_converter.py +198 -0
microarray/io/_cdf.py +575 -0
microarray/io/_cel.py +591 -0
microarray/io/_read.py +127 -0
microarray/plotting/__init__.py +28 -0
microarray/plotting/_base.py +253 -0
microarray/plotting/_cel.py +75 -0
microarray/plotting/_de_plots.py +239 -0
microarray/plotting/_diagnostic_plots.py +268 -0
microarray/plotting/_heatmap.py +279 -0
microarray/plotting/_ma_plots.py +136 -0
microarray/plotting/_pca.py +320 -0
microarray/plotting/_qc_plots.py +335 -0
microarray/plotting/_score.py +38 -0
microarray/plotting/_top_table_heatmap.py +98 -0
microarray/plotting/_utils.py +280 -0
microarray/preprocessing/__init__.py +39 -0
microarray/preprocessing/_background.py +862 -0
microarray/preprocessing/_log2.py +77 -0
microarray/preprocessing/_normalize.py +1292 -0
microarray/preprocessing/_rma.py +243 -0
microarray/preprocessing/_robust.py +170 -0
microarray/preprocessing/_summarize.py +318 -0
microarray/py.typed +0 -0
microarray/tools/__init__.py +26 -0
microarray/tools/_biomart.py +416 -0
microarray/tools/_empirical_bayes.py +401 -0
microarray/tools/_fdist.py +171 -0
microarray/tools/_linear_models.py +387 -0
microarray/tools/_mds.py +101 -0
microarray/tools/_pca.py +88 -0
microarray/tools/_score.py +86 -0
microarray/tools/_toptable.py +360 -0
microarray-0.1.0.dist-info/METADATA +75 -0
microarray-0.1.0.dist-info/RECORD +44 -0
microarray-0.1.0.dist-info/WHEEL +4 -0

microarray/plotting/_de_plots.py ADDED Viewed

@@ -0,0 +1,239 @@
+"""Differential expression plot functions for microarray analysis."""
+from typing import Any
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.axes import Axes
+from matplotlib.patches import Circle
+from microarray.plotting._utils import add_reference_line, with_highlights
+def volcano(
+    logfc: np.ndarray,
+    pvalues: np.ndarray,
+    logfc_threshold: float = 1.0,
+    pvalue_threshold: float = 0.05,
+    labels: list[str] | np.ndarray | None = None,
+    top_n: int = 10,
+    status: np.ndarray | None = None,
+    xlab: str = "Log2 fold-change",
+    ylab: str = "-Log10(p-value)",
+    title: str = "Volcano Plot",
+    ax: Axes | None = None,
+    **kwargs: Any,
+) -> Axes:
+    """Volcano plot for differential expression results.
+    Volcano plot displays log fold-changes vs statistical significance
+    (-log10 p-values). Points in upper left/right corners represent genes
+    with large fold-changes and high significance.
+    Args:
+        logfc: Array of log2 fold-changes
+        pvalues: Array of p-values
+        logfc_threshold: Fold-change threshold for significance lines. Default 1.0.
+        pvalue_threshold: P-value threshold for significance line. Default 0.05.
+        labels: Gene/probe labels. If provided with top_n, labels top genes.
+        top_n: Number of top genes to label (by significance). Default 10.
+        status: Custom status labels for coloring. If None, automatically determines
+            status based on thresholds (up/down/not-significant).
+        xlab: X-axis label
+        ylab: Y-axis label
+        title: Plot title
+        ax: Existing Axes object. If None, creates new figure.
+        **kwargs: Additional arguments passed to scatter plot
+    Returns:
+        Axes object with volcano plot
+    Examples:
+        >>> import numpy as np
+        >>> from microarray.plotting import volcano
+        >>> logfc = np.random.randn(1000) * 2
+        >>> pvalues = np.random.uniform(0, 1, 1000)
+        >>> ax = volcano(logfc, pvalues)
+    """
+    if ax is None:
+        _, ax = plt.subplots(figsize=(8, 7))
+    # Calculate -log10(p-values)
+    # Handle p-values of 0 by setting a minimum
+    pvalues = np.maximum(pvalues, 1e-300)
+    logp = -np.log10(pvalues)
+    # Determine status if not provided
+    if status is None:
+        status = np.array(["not-significant"] * len(logfc))
+        significant = logp >= -np.log10(pvalue_threshold)
+        up = significant & (logfc >= logfc_threshold)
+        down = significant & (logfc <= -logfc_threshold)
+        status[up] = "up"
+        status[down] = "down"
+    # Create scatter plot with highlighting
+    ax = with_highlights(
+        logfc, logp, status=status, xlab=xlab, ylab=ylab, title=title, ax=ax, legend="upper right", **kwargs
+    )
+    # Volcano-specific styling: no background grid and legend outside without frame.
+    ax.grid(False)
+    legend = ax.get_legend()
+    if legend is not None:
+        legend.set_loc("upper left")
+        legend.set_bbox_to_anchor((1.02, 1.0))
+        legend.set_frame_on(False)
+    # Add threshold lines
+    # Vertical lines for fold-change thresholds
+    add_reference_line(ax, x=logfc_threshold, color="darkgray", linestyle="--", alpha=0.7)
+    add_reference_line(ax, x=-logfc_threshold, color="darkgray", linestyle="--", alpha=0.7)
+    # Horizontal line for p-value threshold
+    add_reference_line(ax, y=-np.log10(pvalue_threshold), color="darkgray", linestyle="--", alpha=0.7)
+    # Label top genes if requested
+    if labels is not None and top_n > 0:
+        # Get indices of top genes by p-value
+        top_indices = np.argsort(logp)[-top_n:]
+        for idx in top_indices:
+            ax.annotate(
+                labels[idx],
+                (logfc[idx], logp[idx]),
+                xytext=(5, 5),
+                textcoords="offset points",
+                fontsize=8,
+                alpha=0.7,
+                bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7, edgecolor="none"),
+            )
+    return ax
+def venn(
+    sets: dict[str, set] | list[set],
+    labels: list[str] | None = None,
+    colors: list[str] | None = None,
+    alpha: float = 0.4,
+    title: str = "Venn Diagram",
+    ax: Axes | None = None,
+) -> Axes:
+    """Venn diagram for visualizing overlap between sets.
+    Creates Venn diagram showing overlap between 2 or 3 sets.
+    Common use case: visualizing overlap of differentially expressed genes
+    across multiple contrasts or conditions.
+    Args:
+        sets: Dictionary mapping labels to sets, or list of sets.
+            If list, labels parameter must be provided.
+        labels: Labels for each set. Required if sets is a list.
+        colors: Colors for each set. If None, uses default palette.
+        alpha: Transparency of circles (0-1)
+        title: Plot title
+        ax: Existing Axes object. If None, creates new figure.
+    Returns:
+        Axes object with Venn diagram
+    Examples:
+        >>> from microarray.plotting import venn
+        >>> set1 = set(["gene1", "gene2", "gene3", "gene4"])
+        >>> set2 = set(["gene3", "gene4", "gene5", "gene6"])
+        >>> ax = venn({"Control": set1, "Treatment": set2})
+        >>> # Three-way Venn
+        >>> set3 = set(["gene1", "gene5", "gene7"])
+        >>> ax = venn({"A": set1, "B": set2, "C": set3})
+    """
+    if ax is None:
+        _, ax = plt.subplots(figsize=(8, 8))
+    # Parse input
+    if isinstance(sets, dict):
+        labels = list(sets.keys())
+        set_list = list(sets.values())
+    else:
+        if labels is None:
+            raise ValueError("labels must be provided when sets is a list")
+        set_list = sets
+    n_sets = len(set_list)
+    if n_sets < 2 or n_sets > 3:
+        raise ValueError("Venn diagrams support 2 or 3 sets only")
+    # Get default colors
+    if colors is None:
+        default_colors = ["#E41A1C", "#377EB8", "#4DAF4A"]
+        colors = default_colors[:n_sets]
+    ax.set_aspect("equal")
+    ax.set_xlim(-2, 2)
+    ax.set_ylim(-2, 2)
+    ax.axis("off")
+    ax.set_title(title, fontsize=14, pad=20)
+    if n_sets == 2:
+        # Two-way Venn diagram
+        set_a, set_b = set_list
+        # Draw circles
+        circle_a = Circle((-0.5, 0), 1, color=colors[0], alpha=alpha, ec="black", linewidth=2)
+        circle_b = Circle((0.5, 0), 1, color=colors[1], alpha=alpha, ec="black", linewidth=2)
+        ax.add_patch(circle_a)
+        ax.add_patch(circle_b)
+        # Calculate counts
+        only_a = len(set_a - set_b)
+        only_b = len(set_b - set_a)
+        both = len(set_a & set_b)
+        # Add text labels
+        ax.text(-0.9, 0, str(only_a), fontsize=16, ha="center", va="center", weight="bold")
+        ax.text(0.9, 0, str(only_b), fontsize=16, ha="center", va="center", weight="bold")
+        ax.text(0, 0, str(both), fontsize=16, ha="center", va="center", weight="bold")
+        # Add set labels
+        ax.text(-0.6, 1.3, labels[0], fontsize=12, ha="center", weight="bold")
+        ax.text(0.6, 1.3, labels[1], fontsize=12, ha="center", weight="bold")
+    elif n_sets == 3:
+        # Three-way Venn diagram
+        set_a, set_b, set_c = set_list
+        # Draw circles
+        r = 1  # radius
+        d = 0.7  # distance from center
+        circle_a = Circle((-d / 2, d / 2), r, color=colors[0], alpha=alpha, ec="black", linewidth=2)
+        circle_b = Circle((d / 2, d / 2), r, color=colors[1], alpha=alpha, ec="black", linewidth=2)
+        circle_c = Circle((0, -d / 2), r, color=colors[2], alpha=alpha, ec="black", linewidth=2)
+        ax.add_patch(circle_a)
+        ax.add_patch(circle_b)
+        ax.add_patch(circle_c)
+        # Calculate counts
+        only_a = len(set_a - set_b - set_c)
+        only_b = len(set_b - set_a - set_c)
+        only_c = len(set_c - set_a - set_b)
+        ab_only = len((set_a & set_b) - set_c)
+        ac_only = len((set_a & set_c) - set_b)
+        bc_only = len((set_b & set_c) - set_a)
+        abc = len(set_a & set_b & set_c)
+        # Add text labels (positioned by eye for typical 3-way Venn)
+        ax.text(-0.8, 0.65, str(only_a), fontsize=14, ha="center", va="center", weight="bold")
+        ax.text(0.8, 0.65, str(only_b), fontsize=14, ha="center", va="center", weight="bold")
+        ax.text(0, -1.0, str(only_c), fontsize=14, ha="center", va="center", weight="bold")
+        ax.text(0, 0.78, str(ab_only), fontsize=14, ha="center", va="center", weight="bold")
+        ax.text(-0.55, -0.3, str(ac_only), fontsize=14, ha="center", va="center", weight="bold")
+        ax.text(0.55, -0.3, str(bc_only), fontsize=14, ha="center", va="center", weight="bold")
+        ax.text(0, 0.2, str(abc), fontsize=14, ha="center", va="center", weight="bold")
+        # Add set labels
+        ax.text(-d, d / 2 + 1.3, labels[0], fontsize=12, ha="center", weight="bold")
+        ax.text(d, d / 2 + 1.3, labels[1], fontsize=12, ha="center", weight="bold")
+        ax.text(0, -d / 2 - 1.3, labels[2], fontsize=12, ha="center", weight="bold")
+    return ax

microarray/plotting/_diagnostic_plots.py ADDED Viewed

@@ -0,0 +1,268 @@
+"""Diagnostic plot functions for microarray data analysis."""
+from typing import Any
+import matplotlib.pyplot as plt
+import numpy as np
+from anndata import AnnData
+from matplotlib.axes import Axes
+from microarray.plotting._utils import get_default_colors
+def mds(
+    adata: AnnData,
+    obsm_key: str = "X_mds",
+    top: int = 500,
+    gene_selection: str = "common",
+    dimensions: int = 2,
+    labels: list[str] | None = None,
+    colors: list[str] | str | None = None,
+    groups: np.ndarray | list | None = None,
+    xlab: str | None = None,
+    ylab: str | None = None,
+    title: str = "MDS Plot",
+    ax: Axes | None = None,
+    **kwargs: Any,
+) -> Axes:
+    """Plot Multidimensional Scaling (MDS) embedding.
+    Visualizes the MDS embedding stored in `.obsm` to show sample relationships
+    in 2D space. Samples that are similar (highly correlated) appear close together,
+    while dissimilar samples are far apart. Essential for quality control and
+    identifying batch effects or outliers.
+    Note:
+        If MDS embedding is not found in `.obsm[obsm_key]`, it will be computed
+        automatically using `microarray.tl.mds()` with the provided parameters.
+    Args:
+        adata: AnnData object with MDS embedding in .obsm or expression data in .X
+        obsm_key: Key in .obsm where the MDS embedding is stored. Default "X_mds".
+        top: Number of top varying probes to use if computing MDS. Default 500.
+        gene_selection: Method for selecting genes if computing MDS. Default "common".
+        dimensions: Number of dimensions to plot (must be 2). Default 2.
+        labels: Custom labels for each sample. If None, uses obs_names.
+        colors: Color(s) for points. Can be single color or list of colors per sample.
+        groups: Group assignments for color coding. If provided with colors as dict,
+            maps groups to colors.
+        xlab: X-axis label. If None, uses "Dimension 1".
+        ylab: Y-axis label. If None, uses "Dimension 2".
+        title: Plot title
+        ax: Existing Axes object. If None, creates new figure.
+        **kwargs: Additional arguments passed to ax.scatter()
+    Returns:
+        Axes object with MDS plot
+    Examples:
+        >>> import anndata as ad
+        >>> import numpy as np
+        >>> import microarray as ma
+        >>> data = np.random.randn(1000, 6)
+        >>> adata = ad.AnnData(data.T)
+        >>> # Compute MDS first
+        >>> ma.tl.mds(adata, top=500)
+        >>> # Then plot it
+        >>> ax = ma.pl.mds(adata)
+        >>> # Or let the plot function compute it automatically
+        >>> ax = ma.pl.mds(adata, top=500)
+        >>> # With group coloring
+        >>> groups = ["control", "control", "control", "treated", "treated", "treated"]
+        >>> ax = ma.pl.mds(adata, groups=groups)
+    """
+    if ax is None:
+        _, ax = plt.subplots(figsize=(8, 7))
+    if dimensions != 2:
+        raise NotImplementedError("Only 2D MDS plots are currently supported")
+    # Check if MDS embedding exists, if not compute it
+    if obsm_key not in adata.obsm:
+        # Import here to avoid circular dependency
+        from microarray.tools import mds as compute_mds
+        compute_mds(adata, top=top, gene_selection=gene_selection, n_components=dimensions, obsm_key=obsm_key)
+    # Get MDS coordinates from obsm
+    coords = adata.obsm[obsm_key]
+    n_samples = coords.shape[0]
+    # Prepare labels
+    if labels is None:
+        labels = list(adata.obs_names) if adata.obs_names is not None else [f"Sample {i}" for i in range(n_samples)]
+    # Prepare colors
+    if groups is not None:
+        unique_groups = np.unique(groups)
+        n_groups = len(unique_groups)
+        # If colors is a dict, map groups to colors
+        if isinstance(colors, dict):
+            color_map = colors
+        else:
+            # Generate default colors for groups
+            default_colors = get_default_colors(n_groups)
+            color_map = dict(zip(unique_groups, default_colors, strict=False))
+        # Plot by group for legend
+        for group in unique_groups:
+            mask = np.array(groups) == group
+            ax.scatter(
+                coords[mask, 0],
+                coords[mask, 1],
+                c=color_map[group],
+                label=str(group),
+                s=100,
+                alpha=0.7,
+                edgecolors="black",
+                linewidth=0.5,
+                **kwargs,
+            )
+        ax.legend(loc="best", frameon=True)
+    else:
+        # Single color or list of colors without grouping
+        if colors is None:
+            colors = get_default_colors(1)[0]
+        if isinstance(colors, str):
+            # Single color for all points
+            ax.scatter(
+                coords[:, 0], coords[:, 1], c=colors, s=100, alpha=0.7, edgecolors="black", linewidth=0.5, **kwargs
+            )
+        else:
+            # List of colors
+            ax.scatter(
+                coords[:, 0], coords[:, 1], c=colors, s=100, alpha=0.7, edgecolors="black", linewidth=0.5, **kwargs
+            )
+    # Add labels to points
+    for i, label in enumerate(labels):
+        ax.annotate(
+            label, (coords[i, 0], coords[i, 1]), xytext=(5, 5), textcoords="offset points", fontsize=9, alpha=0.8
+        )
+    # Set labels
+    if xlab is None:
+        xlab = "Dimension 1"
+    if ylab is None:
+        ylab = "Dimension 2"
+    ax.set_xlabel(xlab)
+    ax.set_ylabel(ylab)
+    ax.set_title(title)
+    ax.grid(True, alpha=0.3, linestyle="--")
+    ax.axhline(y=0, color="gray", linewidth=0.5, alpha=0.5)
+    ax.axvline(x=0, color="gray", linewidth=0.5, alpha=0.5)
+    return ax
+def sa(
+    adata: AnnData,
+    fit_values: np.ndarray | None = None,
+    xlab: str = "Average log-expression",
+    ylab: str = "Sqrt(standard deviation)",
+    title: str = "SA Plot",
+    show_trend: bool = True,
+    ax: Axes | None = None,
+    **kwargs: Any,
+) -> Axes:
+    """Sigma vs average plot for mean-variance relationship.
+    SA plot (also called mean-variance plot) shows the relationship between
+    average expression and variability. Used to assess variance stabilization
+    and the appropriateness of statistical models.
+    Plots sqrt(standard deviation) vs mean log-expression. If fit_values are
+    provided (e.g., from limma's empirical Bayes estimation), shows the
+    smoothed variance trend.
+    Args:
+        adata: AnnData object with probe-level expression data in .X
+        fit_values: Fitted/smoothed variance values from statistical model.
+            If provided, overlays trend line.
+        xlab: X-axis label
+        ylab: Y-axis label
+        title: Plot title
+        show_trend: Whether to show smoothed trend line. Default True.
+        ax: Existing Axes object. If None, creates new figure.
+        **kwargs: Additional arguments passed to ax.scatter()
+    Returns:
+        Axes object with SA plot
+    Examples:
+        >>> import anndata as ad
+        >>> import numpy as np
+        >>> from microarray.plotting import sa
+        >>> data = np.random.randn(1000, 6)
+        >>> adata = ad.AnnData(data.T)
+        >>> ax = sa(adata)
+    """
+    if ax is None:
+        _, ax = plt.subplots(figsize=(8, 6))
+    # Get expression matrix (samples x probes)
+    expr = adata.X
+    # Convert to log2 if not already
+    if expr.min() >= 0 and (expr.max() - expr.min()) > 20:
+        log_expr = np.log2(expr + 1)
+    else:
+        log_expr = expr
+    # Calculate mean and standard deviation for each probe
+    mean_expr = np.mean(log_expr, axis=0)
+    std_expr = np.std(log_expr, axis=0, ddof=1)  # Sample std dev
+    # Remove NaN/Inf values
+    mask = np.isfinite(mean_expr) & np.isfinite(std_expr) & (std_expr > 0)
+    mean_expr = mean_expr[mask]
+    std_expr = std_expr[mask]
+    # Transform standard deviation
+    sqrt_std = np.sqrt(std_expr)
+    # Create scatter plot
+    ax.scatter(mean_expr, sqrt_std, alpha=0.5, s=10, **kwargs)
+    # Add trend line if fit values provided
+    if fit_values is not None:
+        fit_values = fit_values[mask]
+        sqrt_fit = np.sqrt(fit_values)
+        # Sort for plotting
+        sort_idx = np.argsort(mean_expr)
+        ax.plot(mean_expr[sort_idx], sqrt_fit[sort_idx], color="red", linewidth=2, label="Fitted trend")
+        ax.legend(loc="best")
+    elif show_trend:
+        # Calculate simple smoothed trend using local polynomial
+        try:
+            from scipy.signal import savgol_filter
+            # Sort by mean expression
+            sort_idx = np.argsort(mean_expr)
+            sorted_mean = mean_expr[sort_idx]
+            sorted_sqrt_std = sqrt_std[sort_idx]
+            # Apply Savitzky-Golay filter for smoothing
+            window_length = min(51, len(sorted_mean) // 10)
+            if window_length % 2 == 0:
+                window_length += 1
+            if window_length >= 3:
+                smoothed = savgol_filter(sorted_sqrt_std, window_length, 3)
+                ax.plot(sorted_mean, smoothed, color="red", linewidth=2, label="Smoothed trend")
+                ax.legend(loc="best")
+        except ImportError:
+            pass  # Skip trend line if scipy not available
+    # Set labels and title
+    ax.set_xlabel(xlab)
+    ax.set_ylabel(ylab)
+    ax.set_title(title)
+    ax.grid(True, alpha=0.3, linestyle="--")
+    return ax