microarray 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- microarray/__init__.py +15 -0
- microarray/_version.py +3 -0
- microarray/datasets/__init__.py +3 -0
- microarray/datasets/_arrayexpress.py +1 -0
- microarray/datasets/_cdf_files.py +35 -0
- microarray/datasets/_geo.py +1 -0
- microarray/datasets/_utils.py +143 -0
- microarray/io/__init__.py +17 -0
- microarray/io/_anndata_converter.py +198 -0
- microarray/io/_cdf.py +575 -0
- microarray/io/_cel.py +591 -0
- microarray/io/_read.py +127 -0
- microarray/plotting/__init__.py +28 -0
- microarray/plotting/_base.py +253 -0
- microarray/plotting/_cel.py +75 -0
- microarray/plotting/_de_plots.py +239 -0
- microarray/plotting/_diagnostic_plots.py +268 -0
- microarray/plotting/_heatmap.py +279 -0
- microarray/plotting/_ma_plots.py +136 -0
- microarray/plotting/_pca.py +320 -0
- microarray/plotting/_qc_plots.py +335 -0
- microarray/plotting/_score.py +38 -0
- microarray/plotting/_top_table_heatmap.py +98 -0
- microarray/plotting/_utils.py +280 -0
- microarray/preprocessing/__init__.py +39 -0
- microarray/preprocessing/_background.py +862 -0
- microarray/preprocessing/_log2.py +77 -0
- microarray/preprocessing/_normalize.py +1292 -0
- microarray/preprocessing/_rma.py +243 -0
- microarray/preprocessing/_robust.py +170 -0
- microarray/preprocessing/_summarize.py +318 -0
- microarray/py.typed +0 -0
- microarray/tools/__init__.py +26 -0
- microarray/tools/_biomart.py +416 -0
- microarray/tools/_empirical_bayes.py +401 -0
- microarray/tools/_fdist.py +171 -0
- microarray/tools/_linear_models.py +387 -0
- microarray/tools/_mds.py +101 -0
- microarray/tools/_pca.py +88 -0
- microarray/tools/_score.py +86 -0
- microarray/tools/_toptable.py +360 -0
- microarray-0.1.0.dist-info/METADATA +75 -0
- microarray-0.1.0.dist-info/RECORD +44 -0
- microarray-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
"""Linear model fitting for microarray differential expression analysis.
|
|
2
|
+
|
|
3
|
+
This module provides functions for fitting linear models to microarray gene
|
|
4
|
+
expression data, supporting both standard least squares and robust regression.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
import warnings
|
|
9
|
+
from typing import Literal
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from anndata import AnnData
|
|
14
|
+
from scipy import linalg
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def lm_fit(
    adata: AnnData,
    groupby: str | np.ndarray,
    method: Literal["ls", "robust"] = "ls",
    copy: bool = True,
    return_fit: bool = False,
) -> AnnData | dict | None:
    """Fit linear models for each gene across arrays.

    Fits a linear model to the expression data for each gene, using a design
    matrix built from ``adata.obs[groupby]`` (or supplied directly). This is
    the first step in limma-style differential expression analysis.

    Parameters
    ----------
    adata : AnnData
        Annotated data object containing normalized expression values in `.X`.
        Rows are samples, columns are features/genes (AnnData convention).
    groupby : str | np.ndarray
        Column in `adata.obs` used to define groups. A design matrix is built
        internally as one-hot encoded group indicators (no intercept).
        For backward compatibility, a NumPy array is also accepted and treated
        as a precomputed design matrix.
    method : {"ls", "robust"}, default="ls"
        Fitting method:
        - "ls": Standard least squares using QR decomposition
        - "robust": Robust regression via iteratively reweighted least squares
    copy : bool, default=True
        Whether to copy the AnnData object. If False, modifies in place.
    return_fit : bool, default=False
        If True, return the fit dictionary. Otherwise, store fit results in
        `adata.uns['lm_fit']` and return `adata` if `copy=True` else None.
        The fit dictionary is also returned automatically for legacy array
        input (so existing callers keep working).

    Returns
    -------
    AnnData | dict | None
        Fit dictionary when `return_fit=True` (or legacy array input);
        otherwise `adata` if `copy=True`, else None.

        The fit object stored in `adata.uns['lm_fit']` contains:
        - coefficients: (n_genes, n_coefficients) estimated coefficients
        - stdev_unscaled: (n_genes, n_coefficients) unscaled standard errors
        - sigma: (n_genes,) residual standard deviation per gene
        - df_residual: (n_genes,) residual degrees of freedom per gene
        - cov_coefficients: (n_coefficients, n_coefficients) shared covariance
        - design: copy of the design matrix used for fitting
        - design_columns / groupby / group_to_column / group_values: design
          metadata (None where not applicable for legacy array input)
        - genes: feature names from `adata.var_names`
        - method: fitting method used

    Raises
    ------
    ValueError
        If `groupby` is missing from `adata.obs`, contains missing values,
        the design row count does not match the sample count, or `method`
        is unknown.

    References
    ----------
    Smyth, G. K. (2004). Linear models and empirical Bayes methods for
    assessing differential expression in microarray experiments. Statistical
    Applications in Genetics and Molecular Biology, 3(1).

    See Also
    --------
    ebayes : Apply empirical Bayes moderation to the fitted model.
    top_table : Extract top differentially expressed genes.
    """
    adata = adata.copy() if copy else adata

    legacy_design_input = isinstance(groupby, np.ndarray)

    if legacy_design_input:
        # Precomputed design matrix: synthesize generic column names.
        design = np.asarray(groupby, dtype=float)
        design_columns = [f"coef_{i}" for i in range(design.shape[1])]
        group_to_column = None
        group_values = None
        groupby_col = None
    else:
        design, design_columns, group_to_column, group_values = _build_group_design(adata, groupby)
        groupby_col = groupby

    # Validate inputs
    if design.shape[0] != adata.n_obs:
        raise ValueError(f"Design matrix rows ({design.shape[0]}) must match number of samples ({adata.n_obs})")

    # Warn (not fail) on rank deficiency: some contrasts may still be estimable.
    rank = np.linalg.matrix_rank(design)
    n_coef = design.shape[1]
    if rank < n_coef:
        warnings.warn(
            f"Design matrix is not full rank (rank={rank}, columns={n_coef}). Some coefficients may not be estimable.",
            category=UserWarning,
            stacklevel=2,
        )

    # AnnData stores samples x genes; transpose to genes x samples for the
    # gene-wise fitting routines.
    expr = adata.X.T

    if method == "ls":
        fit_result = _fit_ls(expr, design)
    elif method == "robust":
        fit_result = _fit_robust(expr, design)
    else:
        raise ValueError(f"Unknown method: {method}. Use 'ls' or 'robust'.")

    # Attach design/group metadata so downstream steps (ebayes, top_table)
    # can interpret the coefficients.
    fit_result["design"] = design.copy()
    fit_result["design_columns"] = list(design_columns)
    fit_result["groupby"] = groupby_col
    fit_result["group_to_column"] = group_to_column
    fit_result["group_values"] = group_values
    fit_result["genes"] = np.asarray(adata.var_names.to_numpy(copy=True))
    fit_result["method"] = method
    fit_result["_moderated"] = False

    fit_for_uns = fit_result.copy()
    adata.uns["lm_fit"] = fit_for_uns

    if return_fit or legacy_design_input:
        # Return a shallow copy holding a reference to the (possibly copied)
        # AnnData so annotations can be retrieved later.
        fit_for_return = fit_for_uns.copy()
        fit_for_return["adata"] = adata
        return fit_for_return
    return adata if copy else None


def _build_group_design(adata: AnnData, groupby: str) -> tuple:
    """Build a one-hot (no intercept) design matrix from ``adata.obs[groupby]``.

    Returns ``(design, design_columns, group_to_column, group_values)``.
    Raises ValueError if the column is missing or contains missing values.
    """
    if groupby not in adata.obs.columns:
        raise ValueError(f"Column '{groupby}' not found in adata.obs")

    group_series = adata.obs[groupby]
    if group_series.isna().any():
        raise ValueError(f"Column '{groupby}' contains missing values, cannot build design matrix")

    # Categorical columns keep their declared category order; otherwise
    # preserve observed order. (isinstance check replaces the deprecated
    # pd.api.types.is_categorical_dtype.)
    if isinstance(group_series.dtype, pd.CategoricalDtype):
        levels = list(group_series.cat.categories)
    else:
        levels = list(pd.unique(group_series))

    base_names = [_snake_case(level) for level in levels]

    # Disambiguate duplicate snake-cased names: first keeps the base name,
    # later duplicates get suffixes _2, _3, ...
    name_counts: dict[str, int] = {}
    design_columns = []
    for name in base_names:
        count = name_counts.get(name, 0)
        design_columns.append(name if count == 0 else f"{name}_{count + 1}")
        name_counts[name] = count + 1

    group_to_column = {levels[i]: design_columns[i] for i in range(len(levels))}
    group_values = np.asarray(group_series)
    level_to_idx = {level: i for i, level in enumerate(levels)}

    # One-hot encode group membership: one indicator column per level.
    design = np.zeros((adata.n_obs, len(levels)), dtype=float)
    row_idx = np.arange(adata.n_obs)
    col_idx = np.array([level_to_idx[val] for val in group_values], dtype=int)
    design[row_idx, col_idx] = 1.0

    return design, design_columns, group_to_column, group_values


def _snake_case(value: object) -> str:
    """Lower-case *value* and collapse non-alphanumeric runs to underscores."""
    text = str(value).strip().lower()
    text = re.sub(r"[^a-z0-9]+", "_", text)
    text = re.sub(r"_+", "_", text).strip("_")
    # Fall back to a generic name for values that normalize to nothing.
    return text or "group"
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _fit_ls(expr: np.ndarray, design: np.ndarray) -> dict:
|
|
240
|
+
"""Fit linear models using least squares.
|
|
241
|
+
|
|
242
|
+
Parameters
|
|
243
|
+
----------
|
|
244
|
+
expr : np.ndarray
|
|
245
|
+
Expression matrix (n_genes, n_samples)
|
|
246
|
+
design : np.ndarray
|
|
247
|
+
Design matrix (n_samples, n_coefficients)
|
|
248
|
+
|
|
249
|
+
Returns:
|
|
250
|
+
-------
|
|
251
|
+
dict
|
|
252
|
+
Fit results including coefficients, sigma, df_residual, etc.
|
|
253
|
+
"""
|
|
254
|
+
n_genes, n_samples = expr.shape
|
|
255
|
+
n_coef = design.shape[1]
|
|
256
|
+
df_residual = n_samples - n_coef
|
|
257
|
+
|
|
258
|
+
if df_residual <= 0:
|
|
259
|
+
raise ValueError(
|
|
260
|
+
f"Not enough samples ({n_samples}) for design with "
|
|
261
|
+
f"{n_coef} coefficients. Need at least {n_coef + 1} samples."
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
# QR decomposition of design for stable computation
|
|
265
|
+
Q, R = linalg.qr(design, mode="economic")
|
|
266
|
+
|
|
267
|
+
# Compute coefficients: beta = R^-1 @ Q.T @ expr.T
|
|
268
|
+
# expr.T is (n_samples, n_genes), Q.T @ expr.T is (n_coef, n_genes)
|
|
269
|
+
coefficients = linalg.solve_triangular(R, Q.T @ expr.T, lower=False).T
|
|
270
|
+
# Now coefficients is (n_genes, n_coef)
|
|
271
|
+
|
|
272
|
+
# Compute fitted values and residuals
|
|
273
|
+
fitted = coefficients @ design.T # (n_genes, n_samples)
|
|
274
|
+
residuals = expr - fitted
|
|
275
|
+
|
|
276
|
+
# Estimate residual variance for each gene
|
|
277
|
+
# sigma^2 = sum(residuals^2) / df_residual
|
|
278
|
+
rss = np.sum(residuals**2, axis=1) # Residual sum of squares per gene
|
|
279
|
+
sigma = np.sqrt(rss / df_residual)
|
|
280
|
+
|
|
281
|
+
# Compute unscaled covariance matrix: (R.T @ R)^-1
|
|
282
|
+
# This is the (X.T @ X)^-1 matrix
|
|
283
|
+
R_inv = linalg.solve_triangular(R, np.eye(n_coef), lower=False)
|
|
284
|
+
cov_coef = R_inv @ R_inv.T
|
|
285
|
+
|
|
286
|
+
# Unscaled standard errors: sqrt(diag(cov_coef))
|
|
287
|
+
# These need to be multiplied by sigma to get actual standard errors
|
|
288
|
+
stdev_unscaled = np.sqrt(np.diag(cov_coef))
|
|
289
|
+
# Broadcast to (n_genes, n_coef)
|
|
290
|
+
stdev_unscaled = np.tile(stdev_unscaled, (n_genes, 1))
|
|
291
|
+
|
|
292
|
+
return {
|
|
293
|
+
"coefficients": coefficients,
|
|
294
|
+
"stdev_unscaled": stdev_unscaled,
|
|
295
|
+
"sigma": sigma,
|
|
296
|
+
"df_residual": np.full(n_genes, df_residual, dtype=float),
|
|
297
|
+
"cov_coefficients": cov_coef,
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _fit_robust(expr: np.ndarray, design: np.ndarray, max_iter: int = 20) -> dict:
|
|
302
|
+
"""Fit linear models using robust regression (IRLS with Huber weights).
|
|
303
|
+
|
|
304
|
+
Parameters
|
|
305
|
+
----------
|
|
306
|
+
expr : np.ndarray
|
|
307
|
+
Expression matrix (n_genes, n_samples)
|
|
308
|
+
design : np.ndarray
|
|
309
|
+
Design matrix (n_samples, n_coefficients)
|
|
310
|
+
max_iter : int, default=20
|
|
311
|
+
Maximum number of IRLS iterations
|
|
312
|
+
|
|
313
|
+
Returns:
|
|
314
|
+
-------
|
|
315
|
+
dict
|
|
316
|
+
Fit results including coefficients, sigma, df_residual, etc.
|
|
317
|
+
"""
|
|
318
|
+
n_genes, n_samples = expr.shape
|
|
319
|
+
n_coef = design.shape[1]
|
|
320
|
+
df_residual = n_samples - n_coef
|
|
321
|
+
|
|
322
|
+
if df_residual <= 0:
|
|
323
|
+
raise ValueError(
|
|
324
|
+
f"Not enough samples ({n_samples}) for design with "
|
|
325
|
+
f"{n_coef} coefficients. Need at least {n_coef + 1} samples."
|
|
326
|
+
)
|
|
327
|
+
|
|
328
|
+
# Initialize with OLS
|
|
329
|
+
Q, R = linalg.qr(design, mode="economic")
|
|
330
|
+
coefficients = linalg.solve_triangular(R, Q.T @ expr.T, lower=False).T
|
|
331
|
+
|
|
332
|
+
# Huber's k parameter
|
|
333
|
+
k = 1.345
|
|
334
|
+
|
|
335
|
+
# Iteratively reweight
|
|
336
|
+
for _ in range(max_iter):
|
|
337
|
+
# Compute residuals
|
|
338
|
+
fitted = coefficients @ design.T
|
|
339
|
+
residuals = expr - fitted
|
|
340
|
+
|
|
341
|
+
# Robust scale estimate using MAD (Median Absolute Deviation)
|
|
342
|
+
mad = np.median(np.abs(residuals - np.median(residuals, axis=1, keepdims=True)), axis=1)
|
|
343
|
+
scale = 1.4826 * mad # Scale factor for normal distribution
|
|
344
|
+
scale[scale < 1e-8] = 1.0 # Avoid division by zero
|
|
345
|
+
|
|
346
|
+
# Compute Huber weights
|
|
347
|
+
scaled_resid = residuals / scale[:, np.newaxis]
|
|
348
|
+
weights = np.where(np.abs(scaled_resid) <= k, 1.0, k / np.abs(scaled_resid))
|
|
349
|
+
|
|
350
|
+
# Weighted least squares
|
|
351
|
+
# For each gene, solve: (X.T @ W @ X) @ beta = X.T @ W @ y
|
|
352
|
+
coefficients_new = np.zeros_like(coefficients)
|
|
353
|
+
for i in range(n_genes):
|
|
354
|
+
W = np.diag(weights[i, :])
|
|
355
|
+
XtWX = design.T @ W @ design
|
|
356
|
+
XtWy = design.T @ W @ expr[i, :]
|
|
357
|
+
try:
|
|
358
|
+
coefficients_new[i, :] = linalg.solve(XtWX, XtWy, assume_a="pos")
|
|
359
|
+
except linalg.LinAlgError:
|
|
360
|
+
# Fall back to OLS for this gene
|
|
361
|
+
coefficients_new[i, :] = coefficients[i, :]
|
|
362
|
+
|
|
363
|
+
# Check convergence
|
|
364
|
+
coef_change = np.max(np.abs(coefficients_new - coefficients))
|
|
365
|
+
coefficients = coefficients_new
|
|
366
|
+
if coef_change < 1e-6:
|
|
367
|
+
break
|
|
368
|
+
|
|
369
|
+
# Final residuals and sigma
|
|
370
|
+
fitted = coefficients @ design.T
|
|
371
|
+
residuals = expr - fitted
|
|
372
|
+
rss = np.sum(residuals**2, axis=1)
|
|
373
|
+
sigma = np.sqrt(rss / df_residual)
|
|
374
|
+
|
|
375
|
+
# Unscaled covariance (use final weights)
|
|
376
|
+
# For simplicity, use OLS covariance structure
|
|
377
|
+
R_inv = linalg.solve_triangular(R, np.eye(n_coef), lower=False)
|
|
378
|
+
cov_coef = R_inv @ R_inv.T
|
|
379
|
+
stdev_unscaled = np.tile(np.sqrt(np.diag(cov_coef)), (n_genes, 1))
|
|
380
|
+
|
|
381
|
+
return {
|
|
382
|
+
"coefficients": coefficients,
|
|
383
|
+
"stdev_unscaled": stdev_unscaled,
|
|
384
|
+
"sigma": sigma,
|
|
385
|
+
"df_residual": np.full(n_genes, df_residual, dtype=float),
|
|
386
|
+
"cov_coefficients": cov_coef,
|
|
387
|
+
}
|
microarray/tools/_mds.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Multidimensional scaling for microarray data."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from anndata import AnnData
|
|
5
|
+
from sklearn.manifold import MDS
|
|
6
|
+
from sklearn.metrics import pairwise_distances
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def mds(
    adata: AnnData,
    top: int = 500,
    gene_selection: str = "common",
    n_components: int = 2,
    obsm_key: str = "X_mds",
    random_state: int = 42,
    copy: bool = False,
) -> AnnData | None:
    """Compute Multidimensional Scaling (MDS) embedding.

    MDS reduces high-dimensional expression data to a lower-dimensional space
    while preserving pairwise distances between samples. This is useful for
    visualizing sample relationships and identifying outliers or batch effects.

    Args:
        adata: AnnData object with probe-level expression data in .X
        top: Number of top varying probes to use for distance calculation. Default 500.
        gene_selection: Method for selecting genes:
            - "common": Use top probes with highest median absolute deviation
            - "pairwise": Use different probes for each pair (not implemented)
        n_components: Number of dimensions to reduce to. Default 2.
        obsm_key: Key to store the MDS embedding in .obsm. Default "X_mds".
        random_state: Random state for reproducibility. Default 42.
        copy: Return a copy instead of writing to adata. Default False.

    Returns:
        Returns None if `copy=False`, else returns an `AnnData` object with the
        MDS embedding stored in `.obsm[obsm_key]`.

    Raises:
        NotImplementedError: If ``gene_selection="pairwise"``.
        ValueError: If ``gene_selection`` is unknown.

    Examples:
        >>> import anndata as ad
        >>> import numpy as np
        >>> import microarray as ma
        >>> data = np.random.randn(1000, 6)
        >>> adata = ad.AnnData(data.T)  # Samples x probes
        >>> ma.tl.mds(adata, top=500)
        >>> print(adata.obsm["X_mds"].shape)
        (6, 2)
    """
    adata = adata.copy() if copy else adata

    # Expression matrix follows AnnData convention: samples x probes.
    expr = adata.X
    n_probes = expr.shape[1]

    # Heuristic: a large non-negative dynamic range suggests raw intensities
    # rather than log-scale data, so log2-transform before computing distances.
    if expr.min() >= 0 and (expr.max() - expr.min()) > 20:
        log_expr = np.log2(expr + 1)
    else:
        log_expr = expr

    # Gene selection
    if gene_selection == "common":
        # Median absolute deviation per probe (computed across samples).
        mad = np.median(np.abs(log_expr - np.median(log_expr, axis=0)), axis=0)
        # Keep the top-MAD probes (argpartition avoids a full sort).
        top_n = min(top, n_probes)
        top_indices = np.argpartition(mad, -top_n)[-top_n:]
        expr_subset = log_expr[:, top_indices]
    elif gene_selection == "pairwise":
        raise NotImplementedError("Pairwise gene selection not yet implemented")
    else:
        raise ValueError(f"Unknown gene_selection method: {gene_selection}")

    # Pairwise Euclidean distances between samples.
    distances = pairwise_distances(expr_subset, metric="euclidean")

    # BUG FIX: sklearn's MDS selects precomputed-dissimilarity input via the
    # ``dissimilarity`` parameter; ``metric`` is a boolean choosing metric vs
    # non-metric MDS, and there is no ``init`` constructor argument. The
    # previous call (metric="precomputed", init="random") either raised a
    # TypeError or silently re-embedded the distance matrix as features.
    mds_model = MDS(
        n_components=n_components,
        dissimilarity="precomputed",
        n_init=4,
        random_state=random_state,
    )
    coords = mds_model.fit_transform(distances)

    # Store embedding and the parameters used to produce it.
    adata.obsm[obsm_key] = coords
    adata.uns[obsm_key] = {
        "params": {
            "top": top,
            "gene_selection": gene_selection,
            "n_components": n_components,
            "random_state": random_state,
        }
    }

    return adata if copy else None
|
microarray/tools/_pca.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Principal component analysis for microarray data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from anndata import AnnData
|
|
7
|
+
from scipy import sparse
|
|
8
|
+
from sklearn.decomposition import PCA
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def pca(
    adata: AnnData,
    n_components: int | None = None,
    obsm_key: str = "X_pca",
    layer: str | None = None,
    scale: bool = False,
    random_state: int | None = 42,
    copy: bool = False,
) -> AnnData | None:
    """Compute a PCA embedding and store it in ``adata.obsm``.

    Args:
        adata: AnnData object with expression values in ``.X`` or ``layer``.
        n_components: Number of principal components to compute; defaults to
            ``adata.n_obs`` when not given.
        obsm_key: Key used to store coordinates in ``adata.obsm``.
        layer: Optional layer key to use instead of ``adata.X``.
        scale: Whether to z-scale each feature before PCA.
        random_state: Random seed used by PCA solvers that require randomness.
        copy: Return a modified copy instead of writing in place.

    Returns:
        Modified AnnData when ``copy=True``, else ``None``.

    Raises:
        ValueError: If ``n_components`` is invalid.
        KeyError: If ``layer`` is provided but missing from ``adata.layers``.
    """
    adata = adata.copy() if copy else adata

    # Pick the source matrix: a named layer when requested, otherwise .X.
    if layer is not None:
        if layer not in adata.layers:
            raise KeyError(f"AnnData .layers has no '{layer}' layer")
        matrix = adata.layers[layer]
    else:
        matrix = adata.X

    # Densify sparse input and coerce to float.
    values = matrix.toarray() if sparse.issparse(matrix) else np.asarray(matrix)
    values = np.asarray(values, dtype=float)

    if values.ndim != 2:
        raise ValueError("Expected a 2D expression matrix")

    n_components = adata.n_obs if n_components is None else n_components
    if n_components < 1:
        raise ValueError("n_components must be at least 1")

    max_components = min(values.shape)
    if n_components > max_components:
        raise ValueError(f"n_components must be <= {max_components} for input with shape {values.shape}")

    if scale:
        # Center every feature; divide only non-constant features so that
        # zero-variance columns do not produce NaNs.
        centered = values - np.nanmean(values, axis=0)
        spread = np.nanstd(values, axis=0)
        nonconstant = spread > 0
        centered[:, nonconstant] /= spread[nonconstant]
        values = centered

    model = PCA(n_components=n_components, random_state=random_state)
    embedding = model.fit_transform(values)

    # Record the embedding plus the model outputs and call parameters.
    adata.obsm[obsm_key] = embedding
    adata.uns[obsm_key] = {
        "variance_ratio": model.explained_variance_ratio_.copy(),
        "variance": model.explained_variance_.copy(),
        "components": model.components_.copy(),
        "params": {
            "n_components": n_components,
            "layer": layer,
            "scale": scale,
            "random_state": random_state,
        },
    }

    return adata if copy else None
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Gene set scoring helpers for sample-level summaries."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from anndata import AnnData
|
|
8
|
+
from scipy import sparse
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def score(
    adata: AnnData,
    gene_set: Sequence[str],
    method: Literal["mean", "median", "zscore"] = "mean",
    *,
    score_name: str = "score",
    var_key: str | None = None,
    layer: str | None = None,
    copy: bool = False,
) -> AnnData | None:
    """Compute per-sample gene set scores and write them to ``adata.obs``.

    Args:
        adata: AnnData object with expression values in ``.X`` or ``layer``.
        gene_set: Sequence of genes/features to score.
        method: Scoring method: ``"mean"``, ``"median"``, or ``"zscore"``.
        score_name: Column name used in ``adata.obs`` for output scores.
        var_key: Optional ``adata.var`` column containing feature identifiers;
            ``adata.var_names`` are used when not provided.
        layer: Optional layer key to use instead of ``adata.X``.
        copy: Return a copy instead of writing into ``adata`` in place.

    Returns:
        Modified AnnData when ``copy=True``, else ``None``.

    Raises:
        ValueError: If ``gene_set`` is empty, method is unknown, or no genes match.
        KeyError: If ``var_key`` or ``layer`` is missing.
    """
    adata = adata.copy() if copy else adata

    if len(gene_set) == 0:
        raise ValueError("gene_set must contain at least one gene")

    # Resolve the feature identifiers to match against.
    if var_key is not None:
        if var_key not in adata.var.columns:
            raise KeyError(f"AnnData .var has no '{var_key}' column")
        feature_ids = adata.var[var_key].astype(str)
    else:
        feature_ids = adata.var_names.astype(str)

    # Match on string identity; at least one gene must be present.
    wanted = {str(gene) for gene in gene_set}
    feature_mask = np.asarray(feature_ids.isin(wanted), dtype=bool)
    if not feature_mask.any():
        raise ValueError("None of the provided genes were found in AnnData features")

    # Pick the source matrix: a named layer when requested, otherwise .X.
    if layer is not None:
        if layer not in adata.layers:
            raise KeyError(f"AnnData .layers has no '{layer}' layer")
        matrix = adata.layers[layer]
    else:
        matrix = adata.X

    # Restrict to the matched features and densify sparse input.
    selected = matrix[:, feature_mask]
    values = selected.toarray() if sparse.issparse(selected) else np.asarray(selected)

    if method == "mean":
        sample_scores = np.nanmean(values, axis=1)
    elif method == "median":
        sample_scores = np.nanmedian(values, axis=1)
    elif method == "zscore":
        # Standardize each gene across samples (constant genes stay zero),
        # then average the z-scores per sample.
        col_mean = np.nanmean(values, axis=0)
        col_std = np.nanstd(values, axis=0)
        standardized = np.zeros_like(values, dtype=float)
        usable = col_std > 0
        standardized[:, usable] = (values[:, usable] - col_mean[usable]) / col_std[usable]
        sample_scores = np.nanmean(standardized, axis=1)
    else:
        raise ValueError(f"Unknown scoring method: {method}")

    adata.obs[score_name] = np.asarray(sample_scores, dtype=float)

    return adata if copy else None
|