microarray 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. microarray/__init__.py +15 -0
  2. microarray/_version.py +3 -0
  3. microarray/datasets/__init__.py +3 -0
  4. microarray/datasets/_arrayexpress.py +1 -0
  5. microarray/datasets/_cdf_files.py +35 -0
  6. microarray/datasets/_geo.py +1 -0
  7. microarray/datasets/_utils.py +143 -0
  8. microarray/io/__init__.py +17 -0
  9. microarray/io/_anndata_converter.py +198 -0
  10. microarray/io/_cdf.py +575 -0
  11. microarray/io/_cel.py +591 -0
  12. microarray/io/_read.py +127 -0
  13. microarray/plotting/__init__.py +28 -0
  14. microarray/plotting/_base.py +253 -0
  15. microarray/plotting/_cel.py +75 -0
  16. microarray/plotting/_de_plots.py +239 -0
  17. microarray/plotting/_diagnostic_plots.py +268 -0
  18. microarray/plotting/_heatmap.py +279 -0
  19. microarray/plotting/_ma_plots.py +136 -0
  20. microarray/plotting/_pca.py +320 -0
  21. microarray/plotting/_qc_plots.py +335 -0
  22. microarray/plotting/_score.py +38 -0
  23. microarray/plotting/_top_table_heatmap.py +98 -0
  24. microarray/plotting/_utils.py +280 -0
  25. microarray/preprocessing/__init__.py +39 -0
  26. microarray/preprocessing/_background.py +862 -0
  27. microarray/preprocessing/_log2.py +77 -0
  28. microarray/preprocessing/_normalize.py +1292 -0
  29. microarray/preprocessing/_rma.py +243 -0
  30. microarray/preprocessing/_robust.py +170 -0
  31. microarray/preprocessing/_summarize.py +318 -0
  32. microarray/py.typed +0 -0
  33. microarray/tools/__init__.py +26 -0
  34. microarray/tools/_biomart.py +416 -0
  35. microarray/tools/_empirical_bayes.py +401 -0
  36. microarray/tools/_fdist.py +171 -0
  37. microarray/tools/_linear_models.py +387 -0
  38. microarray/tools/_mds.py +101 -0
  39. microarray/tools/_pca.py +88 -0
  40. microarray/tools/_score.py +86 -0
  41. microarray/tools/_toptable.py +360 -0
  42. microarray-0.1.0.dist-info/METADATA +75 -0
  43. microarray-0.1.0.dist-info/RECORD +44 -0
  44. microarray-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,387 @@
1
+ """Linear model fitting for microarray differential expression analysis.
2
+
3
+ This module provides functions for fitting linear models to microarray gene
4
+ expression data, supporting both standard least squares and robust regression.
5
+ """
6
+
7
+ import re
8
+ import warnings
9
+ from typing import Literal
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+ from anndata import AnnData
14
+ from scipy import linalg
15
+
16
+
17
def lm_fit(
    adata: AnnData,
    groupby: str | np.ndarray,
    method: Literal["ls", "robust"] = "ls",
    copy: bool = True,
    return_fit: bool = False,
) -> AnnData | dict | None:
    """Fit linear models for each gene across arrays.

    Fits a limma-style linear model to the expression of every gene, using a
    one-hot group-means design matrix built from ``groupby`` (or a
    precomputed design matrix for backward compatibility). This is the first
    step in differential expression analysis; follow with ``ebayes`` and
    ``top_table``.

    Parameters
    ----------
    adata : AnnData
        Annotated data object with normalized expression in ``.X``. Rows are
        samples, columns are features/genes (AnnData convention).
    groupby : str | np.ndarray
        Column in ``adata.obs`` defining groups; a one-hot design matrix
        (no intercept) is built from it. For backward compatibility, a NumPy
        array is also accepted and treated as a precomputed design matrix.
    method : {"ls", "robust"}, default="ls"
        "ls" uses QR-based least squares; "robust" uses iteratively
        reweighted least squares (IRLS) with Huber weights.
    copy : bool, default=True
        Whether to copy the AnnData object. If False, modifies in place.
    return_fit : bool, default=False
        If True, return the fit dictionary. Otherwise, store fit results in
        ``adata.uns['lm_fit']`` and return ``adata`` if ``copy=True`` else None.

    Returns
    -------
    AnnData | dict | None
        The fit dictionary when ``return_fit=True`` (or when a legacy design
        array was passed, preserving the old API); otherwise ``adata`` if
        ``copy=True``, else None. The fit is always stored in
        ``adata.uns['lm_fit']`` and contains ``coefficients``,
        ``stdev_unscaled``, ``sigma``, ``df_residual``, ``cov_coefficients``,
        ``design``, ``design_columns``, ``genes``, ``method`` and grouping
        metadata; the returned copy additionally carries an ``adata``
        reference.

    Raises
    ------
    ValueError
        If ``groupby`` is missing from ``adata.obs`` or contains NaNs, if the
        design row count does not match the sample count, or if ``method`` is
        unknown.

    References
    ----------
    Smyth, G. K. (2004). Linear models and empirical Bayes methods for
    assessing differential expression in microarray experiments. Statistical
    Applications in Genetics and Molecular Biology, 3(1).

    See Also
    --------
    ebayes : Apply empirical Bayes moderation to the fitted model
    top_table : Extract top differentially expressed genes
    """
    adata = adata.copy() if copy else adata

    # Legacy call style: a precomputed design matrix instead of an obs column.
    legacy_design_input = isinstance(groupby, np.ndarray)

    if legacy_design_input:
        design = np.asarray(groupby, dtype=float)
        design_columns = [f"coef_{i}" for i in range(design.shape[1])]
        group_to_column = None
        group_values = None
        groupby_col = None
    else:
        if groupby not in adata.obs.columns:
            raise ValueError(f"Column '{groupby}' not found in adata.obs")

        group_series = adata.obs[groupby]
        if group_series.isna().any():
            raise ValueError(f"Column '{groupby}' contains missing values, cannot build design matrix")

        # Preserve category order for categoricals, observed order otherwise.
        # NOTE: pd.api.types.is_categorical_dtype() is deprecated since
        # pandas 2.1; check the dtype class directly instead.
        if isinstance(group_series.dtype, pd.CategoricalDtype):
            levels = list(group_series.cat.categories)
        else:
            levels = list(pd.unique(group_series))

        def _snake_case(value: object) -> str:
            # Normalize a group label to a safe snake_case column name.
            text = str(value).strip().lower()
            text = re.sub(r"[^a-z0-9]+", "_", text)
            text = re.sub(r"_+", "_", text).strip("_")
            return text or "group"

        # De-duplicate sanitized names so distinct levels keep distinct columns.
        base_names = [_snake_case(level) for level in levels]
        name_counts: dict[str, int] = {}
        design_columns = []
        for name in base_names:
            count = name_counts.get(name, 0)
            if count == 0:
                design_columns.append(name)
            else:
                design_columns.append(f"{name}_{count + 1}")
            name_counts[name] = count + 1

        group_to_column = {levels[i]: design_columns[i] for i in range(len(levels))}
        group_values = np.asarray(group_series)
        level_to_idx = {level: i for i, level in enumerate(levels)}
        # One-hot group-means design (no intercept column).
        design = np.zeros((adata.n_obs, len(levels)), dtype=float)
        row_idx = np.arange(adata.n_obs)
        col_idx = np.array([level_to_idx[val] for val in group_values], dtype=int)
        design[row_idx, col_idx] = 1.0
        groupby_col = groupby

    # Validate inputs
    if design.shape[0] != adata.n_obs:
        raise ValueError(f"Design matrix rows ({design.shape[0]}) must match number of samples ({adata.n_obs})")

    # Warn (rather than fail) on rank deficiency: some contrasts may still be estimable.
    rank = np.linalg.matrix_rank(design)
    n_coef = design.shape[1]
    if rank < n_coef:
        warnings.warn(
            f"Design matrix is not full rank (rank={rank}, columns={n_coef}). Some coefficients may not be estimable.",
            category=UserWarning,
            stacklevel=2,
        )

    # Transpose to genes x samples for gene-wise fitting
    # (AnnData convention is samples x genes).
    expr = adata.X.T

    if method == "ls":
        fit_result = _fit_ls(expr, design)
    elif method == "robust":
        fit_result = _fit_robust(expr, design)
    else:
        raise ValueError(f"Unknown method: {method}. Use 'ls' or 'robust'.")

    # Attach metadata needed by downstream ebayes/top_table.
    fit_result["design"] = design.copy()
    fit_result["design_columns"] = list(design_columns)
    fit_result["groupby"] = groupby_col
    fit_result["group_to_column"] = group_to_column
    fit_result["group_values"] = group_values
    fit_result["genes"] = np.asarray(adata.var_names.to_numpy(copy=True))
    fit_result["method"] = method
    fit_result["_moderated"] = False

    # Store a copy without the AnnData reference in .uns; the returned fit
    # (when requested) additionally carries the AnnData object.
    fit_for_uns = fit_result.copy()
    adata.uns["lm_fit"] = fit_for_uns

    if return_fit or legacy_design_input:
        fit_for_return = fit_for_uns.copy()
        fit_for_return["adata"] = adata
        return fit_for_return
    return adata if copy else None
237
+
238
+
239
def _fit_ls(expr: np.ndarray, design: np.ndarray) -> dict:
    """Ordinary least-squares fit of one linear model per gene.

    Parameters
    ----------
    expr : np.ndarray
        Expression matrix (n_genes, n_samples)
    design : np.ndarray
        Design matrix (n_samples, n_coefficients)

    Returns
    -------
    dict
        Fit results: coefficients, stdev_unscaled, sigma, df_residual,
        cov_coefficients.
    """
    n_genes, n_samples = expr.shape
    n_coef = design.shape[1]
    df_residual = n_samples - n_coef

    if df_residual <= 0:
        raise ValueError(
            f"Not enough samples ({n_samples}) for design with "
            f"{n_coef} coefficients. Need at least {n_coef + 1} samples."
        )

    # Economy-size QR keeps the coefficient solve numerically stable.
    q_mat, r_mat = linalg.qr(design, mode="economic")

    # beta = R^{-1} Q^T y, solved simultaneously for every gene;
    # result transposed back to (n_genes, n_coef).
    betas = linalg.solve_triangular(r_mat, q_mat.T @ expr.T, lower=False).T

    # Residuals and per-gene residual standard deviation.
    residuals = expr - betas @ design.T
    rss = np.sum(residuals * residuals, axis=1)
    sigma = np.sqrt(rss / df_residual)

    # (X^T X)^{-1} via the triangular factor: R^{-1} R^{-T}.
    r_inv = linalg.solve_triangular(r_mat, np.eye(n_coef), lower=False)
    cov_coef = r_inv @ r_inv.T

    # Unscaled standard errors (multiply by sigma for actual SEs),
    # replicated across genes since the design is shared.
    se_row = np.sqrt(np.diag(cov_coef))
    stdev_unscaled = np.tile(se_row, (n_genes, 1))

    return {
        "coefficients": betas,
        "stdev_unscaled": stdev_unscaled,
        "sigma": sigma,
        "df_residual": np.full(n_genes, df_residual, dtype=float),
        "cov_coefficients": cov_coef,
    }
299
+
300
+
301
def _fit_robust(expr: np.ndarray, design: np.ndarray, max_iter: int = 20) -> dict:
    """Robust per-gene linear model fit via IRLS with Huber weights.

    Parameters
    ----------
    expr : np.ndarray
        Expression matrix (n_genes, n_samples)
    design : np.ndarray
        Design matrix (n_samples, n_coefficients)
    max_iter : int, default=20
        Maximum number of IRLS iterations

    Returns
    -------
    dict
        Fit results: coefficients, stdev_unscaled, sigma, df_residual,
        cov_coefficients.
    """
    n_genes, n_samples = expr.shape
    n_coef = design.shape[1]
    df_residual = n_samples - n_coef

    if df_residual <= 0:
        raise ValueError(
            f"Not enough samples ({n_samples}) for design with "
            f"{n_coef} coefficients. Need at least {n_coef + 1} samples."
        )

    # Start from the ordinary least-squares solution.
    q_mat, r_mat = linalg.qr(design, mode="economic")
    betas = linalg.solve_triangular(r_mat, q_mat.T @ expr.T, lower=False).T

    # Standard Huber tuning constant (~95% efficiency at the normal).
    huber_k = 1.345

    for _ in range(max_iter):
        residuals = expr - betas @ design.T

        # Robust per-gene scale: 1.4826 * MAD approximates sigma under normality.
        centered = np.abs(residuals - np.median(residuals, axis=1, keepdims=True))
        scale = 1.4826 * np.median(centered, axis=1)
        scale[scale < 1e-8] = 1.0  # guard against zero scale

        # Huber weights: 1 inside the threshold, k/|r| outside.
        scaled = residuals / scale[:, np.newaxis]
        weights = np.where(np.abs(scaled) <= huber_k, 1.0, huber_k / np.abs(scaled))

        # Gene-wise weighted least squares: (X^T W X) beta = X^T W y.
        updated = np.zeros_like(betas)
        for g in range(n_genes):
            w_diag = np.diag(weights[g, :])
            lhs = design.T @ w_diag @ design
            rhs = design.T @ w_diag @ expr[g, :]
            try:
                updated[g, :] = linalg.solve(lhs, rhs, assume_a="pos")
            except linalg.LinAlgError:
                # Singular weighted system: keep the previous estimate for this gene.
                updated[g, :] = betas[g, :]

        shift = np.max(np.abs(updated - betas))
        betas = updated
        if shift < 1e-6:
            break

    # Final residuals and per-gene residual standard deviation.
    residuals = expr - betas @ design.T
    sigma = np.sqrt(np.sum(residuals**2, axis=1) / df_residual)

    # Approximate covariance with the unweighted OLS structure (as in the
    # least-squares path) for downstream empirical Bayes moderation.
    r_inv = linalg.solve_triangular(r_mat, np.eye(n_coef), lower=False)
    cov_coef = r_inv @ r_inv.T
    stdev_unscaled = np.tile(np.sqrt(np.diag(cov_coef)), (n_genes, 1))

    return {
        "coefficients": betas,
        "stdev_unscaled": stdev_unscaled,
        "sigma": sigma,
        "df_residual": np.full(n_genes, df_residual, dtype=float),
        "cov_coefficients": cov_coef,
    }
@@ -0,0 +1,101 @@
1
+ """Multidimensional scaling for microarray data."""
2
+
3
+ import numpy as np
4
+ from anndata import AnnData
5
+ from sklearn.manifold import MDS
6
+ from sklearn.metrics import pairwise_distances
7
+
8
+
9
+ def mds(
10
+ adata: AnnData,
11
+ top: int = 500,
12
+ gene_selection: str = "common",
13
+ n_components: int = 2,
14
+ obsm_key: str = "X_mds",
15
+ random_state: int = 42,
16
+ copy: bool = False,
17
+ ) -> AnnData | None:
18
+ """Compute Multidimensional Scaling (MDS) embedding.
19
+
20
+ MDS reduces high-dimensional expression data to a lower-dimensional space
21
+ while preserving pairwise distances between samples. This is useful for
22
+ visualizing sample relationships and identifying outliers or batch effects.
23
+
24
+ Args:
25
+ adata: AnnData object with probe-level expression data in .X
26
+ top: Number of top varying probes to use for distance calculation. Default 500.
27
+ gene_selection: Method for selecting genes:
28
+ - "common": Use top probes with highest median absolute deviation
29
+ - "pairwise": Use different probes for each pair (not implemented)
30
+ n_components: Number of dimensions to reduce to. Default 2.
31
+ obsm_key: Key to store the MDS embedding in .obsm. Default "X_mds".
32
+ random_state: Random state for reproducibility. Default 42.
33
+ copy: Return a copy instead of writing to adata. Default False.
34
+
35
+ Returns:
36
+ Returns None if `copy=False`, else returns an `AnnData` object with the
37
+ MDS embedding stored in `.obsm[obsm_key]`.
38
+
39
+ Examples:
40
+ >>> import anndata as ad
41
+ >>> import numpy as np
42
+ >>> import microarray as ma
43
+ >>> data = np.random.randn(1000, 6)
44
+ >>> adata = ad.AnnData(data.T) # Samples x probes
45
+ >>> ma.tl.mds(adata, top=500)
46
+ >>> print(adata.obsm["X_mds"].shape)
47
+ (6, 2)
48
+ """
49
+ adata = adata.copy() if copy else adata
50
+
51
+ # Get expression matrix (samples x probes)
52
+ expr = adata.X
53
+ n_samples, n_probes = expr.shape
54
+
55
+ # Convert to log2 if not already
56
+ if expr.min() >= 0 and (expr.max() - expr.min()) > 20:
57
+ log_expr = np.log2(expr + 1)
58
+ else:
59
+ log_expr = expr
60
+
61
+ # Gene selection
62
+ if gene_selection == "common":
63
+ # Calculate median absolute deviation for each probe
64
+ mad = np.median(np.abs(log_expr - np.median(log_expr, axis=0)), axis=0)
65
+ # Select top varying probes
66
+ top_n = min(top, n_probes)
67
+ top_indices = np.argpartition(mad, -top_n)[-top_n:]
68
+ expr_subset = log_expr[:, top_indices]
69
+ elif gene_selection == "pairwise":
70
+ raise NotImplementedError("Pairwise gene selection not yet implemented")
71
+ else:
72
+ raise ValueError(f"Unknown gene_selection method: {gene_selection}")
73
+
74
+ # Calculate pairwise distances
75
+ # Use Euclidean distance by default
76
+ distances = pairwise_distances(expr_subset, metric="euclidean")
77
+
78
+ # Perform MDS
79
+ mds_model = MDS(
80
+ n_components=n_components,
81
+ metric="precomputed",
82
+ n_init=4,
83
+ init="random",
84
+ random_state=random_state,
85
+ )
86
+ coords = mds_model.fit_transform(distances)
87
+
88
+ # Store in obsm
89
+ adata.obsm[obsm_key] = coords
90
+
91
+ # Store parameters in uns
92
+ adata.uns[obsm_key] = {
93
+ "params": {
94
+ "top": top,
95
+ "gene_selection": gene_selection,
96
+ "n_components": n_components,
97
+ "random_state": random_state,
98
+ }
99
+ }
100
+
101
+ return adata if copy else None
@@ -0,0 +1,88 @@
1
+ """Principal component analysis for microarray data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from anndata import AnnData
7
+ from scipy import sparse
8
+ from sklearn.decomposition import PCA
9
+
10
+
11
def pca(
    adata: AnnData,
    n_components: int | None = None,
    obsm_key: str = "X_pca",
    layer: str | None = None,
    scale: bool = False,
    random_state: int | None = 42,
    copy: bool = False,
) -> AnnData | None:
    """Compute a PCA embedding and store it in ``adata.obsm``.

    Args:
        adata: AnnData object with expression values in ``.X`` or ``layer``.
        n_components: Number of principal components to compute.
            Defaults to the largest feasible value,
            ``min(n_samples, n_features)``.
        obsm_key: Key used to store coordinates in ``adata.obsm``.
        layer: Optional layer key to use instead of ``adata.X``.
        scale: Whether to z-scale each feature before PCA.
        random_state: Random seed used by PCA solvers that require randomness.
        copy: Return a modified copy instead of writing in place.

    Returns:
        Modified AnnData when ``copy=True``, else ``None``.

    Raises:
        ValueError: If ``n_components`` is invalid or the matrix is not 2D.
        KeyError: If ``layer`` is provided but missing from ``adata.layers``.
    """
    adata = adata.copy() if copy else adata

    if layer is None:
        matrix = adata.X
    else:
        if layer not in adata.layers:
            raise KeyError(f"AnnData .layers has no '{layer}' layer")
        matrix = adata.layers[layer]

    # Densify sparse input; sklearn's PCA needs a dense float matrix.
    values = matrix.toarray() if sparse.issparse(matrix) else np.asarray(matrix)
    values = np.asarray(values, dtype=float)

    if values.ndim != 2:
        raise ValueError("Expected a 2D expression matrix")

    # PCA supports at most min(n_samples, n_features) components.
    max_components = min(values.shape)

    # BUGFIX: the previous default (adata.n_obs) was infeasible whenever
    # n_obs > n_features and always raised; default to the largest valid
    # value instead (identical result in every case the old default worked).
    if n_components is None:
        n_components = max_components

    if n_components < 1:
        raise ValueError("n_components must be at least 1")

    if n_components > max_components:
        raise ValueError(f"n_components must be <= {max_components} for input with shape {values.shape}")

    if scale:
        # Center each feature; divide by std only where std > 0 to avoid
        # division by zero on constant features.
        mean = np.nanmean(values, axis=0)
        std = np.nanstd(values, axis=0)
        scaled = values - mean
        valid = std > 0
        scaled[:, valid] = scaled[:, valid] / std[valid]
        values = scaled

    model = PCA(n_components=n_components, random_state=random_state)
    coords = model.fit_transform(values)

    # Store the embedding plus variance/loadings for downstream plotting.
    adata.obsm[obsm_key] = coords
    adata.uns[obsm_key] = {
        "variance_ratio": model.explained_variance_ratio_.copy(),
        "variance": model.explained_variance_.copy(),
        "components": model.components_.copy(),
        "params": {
            "n_components": n_components,
            "layer": layer,
            "scale": scale,
            "random_state": random_state,
        },
    }

    return adata if copy else None
@@ -0,0 +1,86 @@
1
+ """Gene set scoring helpers for sample-level summaries."""
2
+
3
+ from collections.abc import Sequence
4
+ from typing import Literal
5
+
6
+ import numpy as np
7
+ from anndata import AnnData
8
+ from scipy import sparse
9
+
10
+
11
def score(
    adata: AnnData,
    gene_set: Sequence[str],
    method: Literal["mean", "median", "zscore"] = "mean",
    *,
    score_name: str = "score",
    var_key: str | None = None,
    layer: str | None = None,
    copy: bool = False,
) -> AnnData | None:
    """Compute per-sample gene set scores and write them to ``adata.obs``.

    Args:
        adata: AnnData object with expression values in ``.X`` or ``layer``.
        gene_set: Sequence of genes/features to score.
        method: Scoring method: ``"mean"``, ``"median"``, or ``"zscore"``.
        score_name: Column name used in ``adata.obs`` for output scores.
        var_key: Optional ``adata.var`` column containing feature identifiers.
            If not provided, ``adata.var_names`` are used.
        layer: Optional layer key to use instead of ``adata.X``.
        copy: Return a copy instead of writing into ``adata`` in place.

    Returns:
        Modified AnnData when ``copy=True``, else ``None``.

    Raises:
        ValueError: If ``gene_set`` is empty, method is unknown, or no genes match.
        KeyError: If ``var_key`` or ``layer`` is missing.
    """
    adata = adata.copy() if copy else adata

    if not len(gene_set):
        raise ValueError("gene_set must contain at least one gene")

    # Resolve the feature identifiers used for matching.
    if var_key is None:
        feature_ids = adata.var_names.astype(str)
    elif var_key in adata.var.columns:
        feature_ids = adata.var[var_key].astype(str)
    else:
        raise KeyError(f"AnnData .var has no '{var_key}' column")

    # Match requested genes (as strings) against the feature identifiers.
    wanted = {str(gene) for gene in gene_set}
    mask = np.asarray(feature_ids.isin(wanted), dtype=bool)
    if not mask.any():
        raise ValueError("None of the provided genes were found in AnnData features")

    # Pick the expression source.
    if layer is None:
        matrix = adata.X
    elif layer in adata.layers:
        matrix = adata.layers[layer]
    else:
        raise KeyError(f"AnnData .layers has no '{layer}' layer")

    # Restrict to the matched features and densify if sparse.
    selected = matrix[:, mask]
    values = selected.toarray() if sparse.issparse(selected) else np.asarray(selected)

    if method == "mean":
        sample_scores = np.nanmean(values, axis=1)
    elif method == "median":
        sample_scores = np.nanmedian(values, axis=1)
    elif method == "zscore":
        # Standardize each gene across samples (skipping constant genes),
        # then average the z-scores per sample.
        mu = np.nanmean(values, axis=0)
        sd = np.nanstd(values, axis=0)
        standardized = np.zeros_like(values, dtype=float)
        nonzero = sd > 0
        standardized[:, nonzero] = (values[:, nonzero] - mu[nonzero]) / sd[nonzero]
        sample_scores = np.nanmean(standardized, axis=1)
    else:
        raise ValueError(f"Unknown scoring method: {method}")

    adata.obs[score_name] = np.asarray(sample_scores, dtype=float)

    return adata if copy else None
+ return adata if copy else None