microarray 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. microarray/__init__.py +15 -0
  2. microarray/_version.py +3 -0
  3. microarray/datasets/__init__.py +3 -0
  4. microarray/datasets/_arrayexpress.py +1 -0
  5. microarray/datasets/_cdf_files.py +35 -0
  6. microarray/datasets/_geo.py +1 -0
  7. microarray/datasets/_utils.py +143 -0
  8. microarray/io/__init__.py +17 -0
  9. microarray/io/_anndata_converter.py +198 -0
  10. microarray/io/_cdf.py +575 -0
  11. microarray/io/_cel.py +591 -0
  12. microarray/io/_read.py +127 -0
  13. microarray/plotting/__init__.py +28 -0
  14. microarray/plotting/_base.py +253 -0
  15. microarray/plotting/_cel.py +75 -0
  16. microarray/plotting/_de_plots.py +239 -0
  17. microarray/plotting/_diagnostic_plots.py +268 -0
  18. microarray/plotting/_heatmap.py +279 -0
  19. microarray/plotting/_ma_plots.py +136 -0
  20. microarray/plotting/_pca.py +320 -0
  21. microarray/plotting/_qc_plots.py +335 -0
  22. microarray/plotting/_score.py +38 -0
  23. microarray/plotting/_top_table_heatmap.py +98 -0
  24. microarray/plotting/_utils.py +280 -0
  25. microarray/preprocessing/__init__.py +39 -0
  26. microarray/preprocessing/_background.py +862 -0
  27. microarray/preprocessing/_log2.py +77 -0
  28. microarray/preprocessing/_normalize.py +1292 -0
  29. microarray/preprocessing/_rma.py +243 -0
  30. microarray/preprocessing/_robust.py +170 -0
  31. microarray/preprocessing/_summarize.py +318 -0
  32. microarray/py.typed +0 -0
  33. microarray/tools/__init__.py +26 -0
  34. microarray/tools/_biomart.py +416 -0
  35. microarray/tools/_empirical_bayes.py +401 -0
  36. microarray/tools/_fdist.py +171 -0
  37. microarray/tools/_linear_models.py +387 -0
  38. microarray/tools/_mds.py +101 -0
  39. microarray/tools/_pca.py +88 -0
  40. microarray/tools/_score.py +86 -0
  41. microarray/tools/_toptable.py +360 -0
  42. microarray-0.1.0.dist-info/METADATA +75 -0
  43. microarray-0.1.0.dist-info/RECORD +44 -0
  44. microarray-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,279 @@
1
+ from typing import Any
2
+
3
+ import matplotlib.colors as mcolors
4
+ import matplotlib.patches as mpatches
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+ from anndata import AnnData
8
+ from matplotlib.axes import Axes
9
+ from matplotlib.figure import Figure
10
+ from scipy.cluster.hierarchy import dendrogram, linkage
11
+ from scipy.stats import zscore
12
+
13
+ from microarray.plotting._utils import get_default_colors
14
+
15
+
16
def _to_dense_array(x: Any) -> np.ndarray:
    """Return ``x`` as a numpy ndarray, densifying it first when possible.

    Objects exposing a ``toarray`` method (as sparse matrices do) are expanded
    through that method; anything else is handed to ``np.asarray`` directly.
    """
    dense = x.toarray() if hasattr(x, "toarray") else x
    return np.asarray(dense)
21
+
22
+
23
def _style_dendrogram_axis(ax: Axes) -> None:
    """Strip tick marks, the frame, and every spine from a dendrogram axis."""
    for clear_ticks in (ax.set_xticks, ax.set_yticks):
        clear_ticks([])
    ax.set_frame_on(False)
    for edge in ax.spines.values():
        edge.set_visible(False)
30
+
31
+
32
def _equalize_linkage_levels(z: np.ndarray) -> np.ndarray:
    """Copy a linkage matrix and replace its heights with 1, 2, ..., n_merges.

    Column 2 of the copy (the merge height) is overwritten so that successive
    merges are evenly spaced; the input array is left untouched.
    """
    leveled = z.copy()
    n_merges = leveled.shape[0]
    uniform_heights = np.arange(1, n_merges + 1, dtype=float)
    leveled[:, 2] = uniform_heights
    return leveled
37
+
38
+
39
def _resolve_group_colors(
    group_levels: list[str],
    group_colors: dict[str, str] | str | mcolors.Colormap | None,
) -> dict[str, str]:
    """Resolve group color mapping from explicit map, cmap name/object, or defaults.

    Args:
        group_levels: Group labels that need a color, in display order.
        group_colors: One of

            * a ``dict`` mapping labels to colors; labels missing from it are
              filled in from ``get_default_colors``,
            * ``None``, in which case every label gets a default color,
            * a colormap name or ``Colormap`` object to sample colors from.

    Returns:
        Dictionary mapping group labels to colors. For the colormap case the
        colors are hex strings produced by ``mcolors.to_hex``.
    """
    if isinstance(group_colors, dict):
        # Work on a copy so the caller's mapping is never mutated.
        resolved = dict(group_colors)
        missing = [g for g in group_levels if g not in resolved]
        if missing:
            fallback = get_default_colors(len(missing))
            resolved.update(zip(missing, fallback, strict=False))
        return resolved

    if group_colors is None:
        default_colors = get_default_colors(len(group_levels))
        return dict(zip(group_levels, default_colors, strict=False))

    cmap_obj = plt.get_cmap(group_colors) if isinstance(group_colors, str) else group_colors
    n_levels = len(group_levels)
    if n_levels == 1:
        return {group_levels[0]: mcolors.to_hex(cmap_obj(0.5))}

    # Only *small* listed colormaps are treated as qualitative palettes.
    # Perceptually uniform maps such as viridis/plasma are ListedColormap
    # objects too, but with 256 entries: walking their first few adjacent bins
    # would give every group a nearly identical color. Matplotlib's built-in
    # qualitative maps have at most 20 entries (tab20 and friends).
    max_qualitative_entries = 20
    is_qualitative = (
        isinstance(cmap_obj, mcolors.ListedColormap)
        and 0 < cmap_obj.N <= max_qualitative_entries
    )
    if is_qualitative:
        # Sample at bin centers to preserve listed color order (e.g. tab10/tab20);
        # wrap around when there are more groups than listed colors.
        positions = ((np.arange(n_levels) % cmap_obj.N) + 0.5) / cmap_obj.N
    else:
        # Continuous map: spread the groups over the full range.
        positions = np.linspace(0.0, 1.0, n_levels, endpoint=True)
    sampled = [mcolors.to_hex(cmap_obj(p)) for p in positions]
    return dict(zip(group_levels, sampled, strict=False))
67
+
68
+
69
def heatmap(
    adata: AnnData,
    genes: list[str],
    groupby: str | None = None,
    group_colors: dict[str, str] | str | mcolors.Colormap | None = None,
    swap_axes: bool = False,
    show_dendrograms: bool = True,
    dendrogram_axes: str | None = None,
    z_score: bool = True,
    cmap: str = "RdBu_r",
    vmin: float = -2,
    vmax: float = 2,
    title: str = "Hierarchical Clustering Heatmap",
    figsize: tuple[float, float] = (14, 10),
    colorbar_shrink: float = 0.7,
    show: bool = True,
) -> tuple[Figure, dict[str, Axes | None]]:
    """Plot sample-by-gene heatmap with optional hierarchical clustering.

    Args:
        adata: AnnData object containing expression data in ``.X``.
        genes: Ordered list of genes to display in the heatmap. Duplicates and
            genes absent from ``adata.var_names`` are dropped; the remaining
            order is the column order used when genes are not clustered.
        groupby: Optional ``adata.obs`` column used for sample group annotations.
        group_colors: Optional group color mapping or colormap (name/object).
        swap_axes: If True, transpose heatmap so genes are rows and samples are columns.
        show_dendrograms: Backward-compatible toggle for showing dendrograms.
            Only consulted when ``dendrogram_axes`` is None.
        dendrogram_axes: Which dendrograms to display: ``"x"``, ``"y"``, ``"both"``, or ``"none"``.
            An axis is only clustered (reordered) when its dendrogram is shown.
        z_score: If True, z-score each gene across samples; NaN results are set to 0.
        cmap: Colormap used for heatmap values.
        vmin: Lower color limit for heatmap.
        vmax: Upper color limit for heatmap.
        title: Figure title.
        figsize: Figure size passed to matplotlib.
        colorbar_shrink: Shrink factor for colorbar to reduce visual footprint
            (clipped to the range 0.2-1.0).
        show: If True, call ``fig.show()`` before returning.

    Returns:
        Tuple of ``(Figure, axes_dict)``. The dictionary contains
        ``heatmap``, ``dendrogram_row``, ``dendrogram_col``, ``groupbar``, and ``colorbar``.

    Raises:
        ValueError: If none of ``genes`` is in ``adata.var_names``, if ``groupby``
            is not a column of ``adata.obs``, or if ``dendrogram_axes`` is invalid.
    """
    genes = [g for g in dict.fromkeys(genes) if g in adata.var_names]
    if not genes:
        raise ValueError("None of the provided genes were found in adata.var_names")

    # Validate the cheap arguments up front, before any clustering work.
    has_groups = groupby is not None
    if has_groups and groupby not in adata.obs:
        raise ValueError(f"Column '{groupby}' not found in adata.obs")

    if dendrogram_axes is None:
        dendrogram_mode = "both" if show_dendrograms else "none"
    else:
        dendrogram_mode = str(dendrogram_axes).lower()

    valid_dendrogram_modes = {"none", "x", "y", "both"}
    if dendrogram_mode not in valid_dendrogram_modes:
        raise ValueError("dendrogram_axes must be one of: 'none', 'x', 'y', 'both'")

    show_dendrogram_x = dendrogram_mode in {"x", "both"}
    show_dendrogram_y = dendrogram_mode in {"y", "both"}

    # Index by name so the matrix columns follow the order of ``genes``.
    # A boolean ``isin`` mask would return columns in ``adata.var`` order and
    # silently mismatch the gene labels built from ``genes`` below.
    x = _to_dense_array(adata[:, genes].X)
    if z_score:
        x = zscore(x, axis=0, nan_policy="omit")
        # Constant genes have zero variance and z-score to NaN; show them as 0.
        x = np.nan_to_num(x, nan=0.0)

    sample_names = np.array(adata.obs_names)
    gene_names = np.array(genes)
    matrix = x.T if swap_axes else x

    z_rows = None
    z_cols = None
    row_order = np.arange(matrix.shape[0])
    col_order = np.arange(matrix.shape[1])

    if show_dendrogram_y and matrix.shape[0] > 1:
        z_rows = linkage(matrix, method="ward", metric="euclidean")
        row_order = np.array(dendrogram(z_rows, no_plot=True)["leaves"])

    if show_dendrogram_x and matrix.shape[1] > 1:
        z_cols = linkage(matrix.T, method="ward", metric="euclidean")
        col_order = np.array(dendrogram(z_cols, no_plot=True)["leaves"])

    x_ord = matrix[np.ix_(row_order, col_order)]

    if swap_axes:
        row_labels = gene_names[row_order]
        col_labels = sample_names[col_order]
    else:
        row_labels = sample_names[row_order]
        col_labels = gene_names[col_order]

    # A dedicated grid column for the group bar is only used when samples are rows.
    use_groupbar_column = has_groups and not swap_axes

    n_rows = 2 if show_dendrogram_x else 1
    heatmap_row_idx = 1 if show_dendrogram_x else 0

    width_ratios = [4.6]
    if show_dendrogram_y:
        width_ratios.append(1.0)
    if use_groupbar_column:
        width_ratios.append(0.14)
    width_ratios.append(0.22)

    fig = plt.figure(figsize=figsize)
    if n_rows == 2:
        gs = fig.add_gridspec(
            2,
            len(width_ratios),
            width_ratios=width_ratios,
            height_ratios=[1.0, 4.6],
            hspace=0.05,
            wspace=0.05,
        )
    else:
        gs = fig.add_gridspec(1, len(width_ratios), width_ratios=width_ratios, wspace=0.05)

    ax_heatmap = fig.add_subplot(gs[heatmap_row_idx, 0])
    ax_dendro_col = fig.add_subplot(gs[0, 0]) if show_dendrogram_x else None
    ax_dendro_row = fig.add_subplot(gs[heatmap_row_idx, 1]) if show_dendrogram_y else None
    # NOTE(review): this index resolves to the final (0.22-wide) grid column, so
    # the 0.14-wide column added above stays empty as spacing — confirm intended.
    group_col_idx = (2 + int(show_dendrogram_y)) if use_groupbar_column else None

    if ax_dendro_col is not None and z_cols is not None:
        z_cols_eq = _equalize_linkage_levels(z_cols)
        dendrogram(
            z_cols_eq,
            ax=ax_dendro_col,
            orientation="top",
            no_labels=True,
            link_color_func=lambda _: "black",
        )
    if ax_dendro_row is not None and z_rows is not None:
        z_rows_eq = _equalize_linkage_levels(z_rows)
        dendrogram(
            z_rows_eq,
            ax=ax_dendro_row,
            orientation="right",
            no_labels=True,
            link_color_func=lambda _: "black",
        )
        # A sideways dendrogram places its first leaf at the bottom, whereas
        # imshow draws row 0 at the top. Flip the tree so each leaf lines up
        # with its heatmap row.
        ax_dendro_row.invert_yaxis()

    if ax_dendro_col is not None:
        _style_dendrogram_axis(ax_dendro_col)
    if ax_dendro_row is not None:
        _style_dendrogram_axis(ax_dendro_row)

    im = ax_heatmap.imshow(x_ord, aspect="auto", cmap=cmap, vmin=vmin, vmax=vmax)
    ax_heatmap.set_xlabel("Samples" if swap_axes else "Genes")
    ax_heatmap.set_ylabel("Genes" if swap_axes else "Samples")
    ax_heatmap.set_xticks(np.arange(len(col_labels)))
    ax_heatmap.set_xticklabels(col_labels, rotation=90)
    ax_heatmap.set_yticks(np.arange(len(row_labels)))
    ax_heatmap.set_yticklabels(row_labels)
    ax_heatmap.yaxis.tick_left()
    ax_heatmap.yaxis.set_label_position("left")
    ax_heatmap.tick_params(axis="y", labelleft=True, labelright=False)

    ax_groupbar = None
    if has_groups:
        group_values_full = adata.obs[groupby].astype(str).to_numpy()
        # Samples sit on the column axis when swapped, on the row axis otherwise.
        sample_order = col_order if swap_axes else row_order
        group_values = group_values_full[sample_order]
        group_levels = list(dict.fromkeys(group_values.tolist()))

        group_colors = _resolve_group_colors(group_levels, group_colors)

        legend_handles = [mpatches.Patch(facecolor=group_colors[g], edgecolor="black", label=g) for g in group_levels]
        ax_heatmap.legend(
            handles=legend_handles,
            title=str(groupby),
            loc="upper left",
            bbox_to_anchor=(1.02, 1.2),
            frameon=False,
        )

    if has_groups and group_col_idx is not None:
        # Vertical bar: one colored cell per sample row.
        ax_groupbar = fig.add_subplot(gs[heatmap_row_idx, group_col_idx])
        rgba = np.array([mcolors.to_rgba(group_colors[g]) for g in group_values]).reshape(-1, 1, 4)
        ax_groupbar.imshow(rgba, aspect="auto", origin="upper")
        ax_groupbar.set_xticks([])
        ax_groupbar.set_yticks([])
        for spine in ax_groupbar.spines.values():
            spine.set_visible(False)
    elif has_groups and swap_axes:
        # Horizontal bar just above the heatmap: one colored cell per sample column.
        ax_groupbar = ax_heatmap.inset_axes([0.0, 1.01, 1.0, 0.04], transform=ax_heatmap.transAxes)
        rgba = np.array([mcolors.to_rgba(group_colors[g]) for g in group_values]).reshape(1, -1, 4)
        ax_groupbar.imshow(rgba, aspect="auto", origin="upper")
        ax_groupbar.set_xticks([])
        ax_groupbar.set_yticks([])
        for spine in ax_groupbar.spines.values():
            spine.set_visible(False)

    # Small horizontal colorbar anchored to the bottom-right of the figure.
    shrink = float(np.clip(colorbar_shrink, 0.2, 1.0))
    cbar_width = 0.20 * shrink
    cbar_bottom = 0.05
    cbar_left = 0.95 - cbar_width
    ax_cbar = fig.add_axes([cbar_left, cbar_bottom, cbar_width, 0.018])
    cbar = fig.colorbar(im, cax=ax_cbar, orientation="horizontal")
    cbar.set_label("Z-score" if z_score else "Expression")

    fig.suptitle(title)

    if show:
        fig.show()

    return fig, {
        "heatmap": ax_heatmap,
        "dendrogram_row": ax_dendro_row,
        "dendrogram_col": ax_dendro_col,
        "groupbar": ax_groupbar,
        "colorbar": ax_cbar,
    }
@@ -0,0 +1,136 @@
1
+ """MA and MD plot functions for microarray quality control."""
2
+
3
+ from typing import Any
4
+
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+ from anndata import AnnData
8
+ from matplotlib.axes import Axes
9
+
10
+ from microarray.plotting._utils import add_loess_curve, add_reference_line, with_highlights
11
+
12
+
13
def ma(
    adata: AnnData,
    arrays: tuple[int | str, int | str] | None = None,
    status: np.ndarray | None = None,
    span: float = 0.3,
    xlab: str = "A (average log-expression)",
    ylab: str = "M (log-ratio)",
    title: str = "",
    loess: bool = True,
    reference_line: bool = True,
    ax: Axes | None = None,
    **kwargs: Any,
) -> Axes:
    """MA plot (M vs A plot) for comparing two arrays or array vs reference.

    MA plot displays log-ratio (M) vs average log-expression (A) to visualize
    differences between two arrays. Useful for quality control and identifying
    systematic biases.

    M = log2(array1) - log2(array2)
    A = 0.5 * (log2(array1) + log2(array2))

    The data are treated as already log-transformed when their finite values
    include a negative number or span a range below 20; otherwise
    ``log2(x + 1)`` is applied first.

    Args:
        adata: AnnData object with probe-level expression data in .X
        arrays: Tuple of two array indices/names to compare. If None, compares
            first array to pseudo-median reference.
        status: Status labels for highlighting points (e.g., 'up', 'down', 'not-significant'),
            one per probe.
        span: Smoothing span for LOESS curve (0-1). Default 0.3.
        xlab: X-axis label
        ylab: Y-axis label
        title: Plot title
        loess: Whether to add LOESS smoothing curve. Default True.
            The curve is only drawn when more than 10 points remain.
        reference_line: Whether to add horizontal line at M=0. Default True.
        ax: Existing Axes object. If None, creates new figure.
        **kwargs: Additional arguments passed to scatter plot

    Returns:
        Axes object with MA plot

    Raises:
        ValueError: If fewer than two arrays are available, if ``arrays`` does
            not have exactly two entries, or if an array name is not found in
            ``adata.obs_names``.

    Examples:
        >>> import anndata as ad
        >>> import numpy as np
        >>> from microarray.plotting import ma
        >>> # Compare two arrays
        >>> data = np.random.randn(1000, 4)
        >>> adata = ad.AnnData(data.T)
        >>> ax = ma(adata, arrays=(0, 1))
        >>> # Compare to median reference
        >>> ax = ma(adata)
    """
    # AnnData stores samples x features; densify sparse input (np.median cannot
    # handle it) and transpose to probes x samples.
    raw = adata.X
    if hasattr(raw, "toarray"):
        raw = raw.toarray()
    expr = np.asarray(raw).T

    # Check for multiple arrays
    if expr.ndim < 2 or expr.shape[1] < 2:
        raise ValueError("AnnData must contain multiple arrays for MA plot")

    # Decide whether the data are already on the log scale using finite values
    # only: a plain min()/max() returns NaN as soon as one value is missing,
    # which makes both comparisons False and would log-transform logged data.
    finite_values = expr[np.isfinite(expr)]
    already_logged = (
        finite_values.size == 0
        or finite_values.min() < 0
        or (finite_values.max() - finite_values.min()) < 20
    )
    if already_logged:
        log_expr = expr
    else:
        log_expr = np.log2(expr + 1)  # Add pseudocount to avoid log(0)

    # Select arrays to compare
    if arrays is None:
        # Compare first array to pseudo-median reference
        log_array1 = log_expr[:, 0]
        log_array2 = np.median(log_expr, axis=1)  # Median across all arrays
        if not title:
            title = "MA Plot: Array 0 vs Median"
    else:
        if len(arrays) != 2:
            raise ValueError("arrays must be a tuple of length 2")

        obs_names = list(adata.obs_names)

        def _array_index(ref: int | str) -> int:
            # Names are looked up in obs_names; integers are used as positions.
            if isinstance(ref, str):
                try:
                    return obs_names.index(ref)
                except ValueError:
                    raise ValueError(f"Array '{ref}' not found in adata.obs_names") from None
            return ref

        array1_idx = _array_index(arrays[0])
        array2_idx = _array_index(arrays[1])

        log_array1 = log_expr[:, array1_idx]
        log_array2 = log_expr[:, array2_idx]
        if not title:
            title = f"MA Plot: Array {array1_idx} vs Array {array2_idx}"

    # Calculate M and A
    M = log_array1 - log_array2
    A = 0.5 * (log_array1 + log_array2)

    # Remove NaN/Inf values, keeping status labels aligned with the points
    mask = np.isfinite(M) & np.isfinite(A)
    M = M[mask]
    A = A[mask]
    if status is not None:
        status = np.asarray(status)[mask]

    # Create the axes only after validation so a failed call leaves no
    # orphaned empty figure behind.
    if ax is None:
        _, ax = plt.subplots(figsize=(8, 6))

    # Create scatter plot with highlighting
    ax = with_highlights(A, M, status=status, xlab=xlab, ylab=ylab, title=title, ax=ax, **kwargs)

    # Add reference line at M=0
    if reference_line:
        add_reference_line(ax, y=0, color="gray", linestyle="--", linewidth=1)

    # Add LOESS smoothing curve
    if loess and len(A) > 10:
        add_loess_curve(ax, A, M, span=span, color="blue", linewidth=2, label="LOESS")
        ax.legend(loc="best")

    return ax