microarray 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. microarray/__init__.py +15 -0
  2. microarray/_version.py +3 -0
  3. microarray/datasets/__init__.py +3 -0
  4. microarray/datasets/_arrayexpress.py +1 -0
  5. microarray/datasets/_cdf_files.py +35 -0
  6. microarray/datasets/_geo.py +1 -0
  7. microarray/datasets/_utils.py +143 -0
  8. microarray/io/__init__.py +17 -0
  9. microarray/io/_anndata_converter.py +198 -0
  10. microarray/io/_cdf.py +575 -0
  11. microarray/io/_cel.py +591 -0
  12. microarray/io/_read.py +127 -0
  13. microarray/plotting/__init__.py +28 -0
  14. microarray/plotting/_base.py +253 -0
  15. microarray/plotting/_cel.py +75 -0
  16. microarray/plotting/_de_plots.py +239 -0
  17. microarray/plotting/_diagnostic_plots.py +268 -0
  18. microarray/plotting/_heatmap.py +279 -0
  19. microarray/plotting/_ma_plots.py +136 -0
  20. microarray/plotting/_pca.py +320 -0
  21. microarray/plotting/_qc_plots.py +335 -0
  22. microarray/plotting/_score.py +38 -0
  23. microarray/plotting/_top_table_heatmap.py +98 -0
  24. microarray/plotting/_utils.py +280 -0
  25. microarray/preprocessing/__init__.py +39 -0
  26. microarray/preprocessing/_background.py +862 -0
  27. microarray/preprocessing/_log2.py +77 -0
  28. microarray/preprocessing/_normalize.py +1292 -0
  29. microarray/preprocessing/_rma.py +243 -0
  30. microarray/preprocessing/_robust.py +170 -0
  31. microarray/preprocessing/_summarize.py +318 -0
  32. microarray/py.typed +0 -0
  33. microarray/tools/__init__.py +26 -0
  34. microarray/tools/_biomart.py +416 -0
  35. microarray/tools/_empirical_bayes.py +401 -0
  36. microarray/tools/_fdist.py +171 -0
  37. microarray/tools/_linear_models.py +387 -0
  38. microarray/tools/_mds.py +101 -0
  39. microarray/tools/_pca.py +88 -0
  40. microarray/tools/_score.py +86 -0
  41. microarray/tools/_toptable.py +360 -0
  42. microarray-0.1.0.dist-info/METADATA +75 -0
  43. microarray-0.1.0.dist-info/RECORD +44 -0
  44. microarray-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,862 @@
1
+ """Background correction for microarray data.
2
+
3
+ Implements multiple background correction methods including:
4
+ - Basic methods: none, subtract, half, minimum
5
+ - Advanced methods: edwards, normexp (normal+exponential convolution model)
6
+
7
+ The normexp model is based on Ritchie et al. (2007) and Irizarry et al. (2003).
8
+
9
+ References:
10
+ Ritchie, M.E., Silver, J., Oshlack, A., Holmes, M., Diyagama, D., Holloway, A., and Smyth, G.K. (2007).
11
+ A comparison of background correction methods for two-colour microarrays.
12
+ Bioinformatics 23, 2700-2707.
13
+
14
+ Irizarry, R.A., Hobbs, B., Collin, F., et al. (2003).
15
+ Exploration, normalization, and summaries of high density oligonucleotide array probe level data.
16
+ Biostatistics, 4(2), 249-264.
17
+
18
+ Edwards, D. (2003). Non-linear normalization and background correction in one-channel cDNA microarray studies.
19
+ Bioinformatics 19, 825-833.
20
+ """
21
+
22
+ import warnings
23
+ from typing import Literal
24
+
25
+ import numpy as np
26
+ from anndata import AnnData
27
+ from numpy.typing import NDArray
28
+ from scipy.stats import norm
29
+
30
+
31
def background_correct(
    adata: AnnData,
    method: Literal["none", "subtract", "half", "minimum", "edwards", "normexp", "mas"] = "normexp",
    offset: float = 0,
    normexp_method: Literal["saddle", "mle", "rma", "rma75"] = "saddle",
    edwards_offset: float = 0.1,
    mas_grid_dim: int = 16,
    copy: bool = False,
) -> AnnData | None:
    """Apply background correction to microarray intensity data.

    Dispatches to one of several background-correction strategies ported
    from limma/affy-style pipelines.

    Parameters
    ----------
    adata
        AnnData of shape (n_samples, n_probes) with raw intensities in `.X`.
        Methods 'subtract', 'half', 'minimum' and 'edwards' additionally
        require background intensities in `.layers['background']`.
    method
        Correction strategy:

        - 'none': no correction applied
        - 'subtract': simple subtraction ``E - Eb`` (may go negative)
        - 'half': ``max(E - Eb, 0.5)``
        - 'minimum': subtract, then replace negatives with half the
          per-array minimum positive value
        - 'edwards': log-linear interpolation for dull spots (Edwards 2003)
        - 'normexp': normal+exponential convolution model (default,
          recommended for Affymetrix-like data)
        - 'mas': MAS5 spatial background correction with grid-based smoothing
    offset
        Constant added to the corrected intensities. Default is 0.
    normexp_method
        Parameter-estimation variant for 'normexp': 'saddle' (default),
        'mle', 'rma' or 'rma75' (McGee & Chen 2006).
    edwards_offset
        Threshold fraction for the 'edwards' method. Default is 0.1 (10%).
    mas_grid_dim
        Number of grid regions for 'mas' (default 16); must be a perfect
        square.
    copy
        If True, operate on and return a copy; otherwise modify in place.

    Returns
    -------
    AnnData or None
        The corrected AnnData when ``copy=True``, otherwise None. Corrected
        intensities are stored in `.X` and the correction parameters are
        recorded in `.uns['background_correction']`.

    Examples
    --------
    >>> import microarray as ma
    >>> adata = ma.io.cel_batch_to_anndata(cel_files, cdf_path, annotation_db)
    >>> adata_corrected = ma.pp.background_correct(adata, method="normexp", copy=True)
    """
    adata = adata.copy() if copy else adata

    if adata.X is None:
        raise ValueError("AnnData object must have .X attribute with intensity values")
    if adata.n_obs < 1:
        raise ValueError("AnnData must contain at least one sample")

    # These methods subtract an explicitly measured background layer;
    # 'none'/'normexp' work on .X alone and 'mas' uses spatial coordinates.
    if method in ("subtract", "half", "minimum", "edwards") and "background" not in adata.layers:
        raise ValueError(f"Method '{method}' requires background intensities in adata.layers['background']")

    # Dispatch table: method name -> zero-argument corrector closure.
    dispatch = {
        "none": lambda: _background_none(adata),
        "subtract": lambda: _background_subtract(adata),
        "half": lambda: _background_half(adata),
        "minimum": lambda: _background_minimum(adata),
        "edwards": lambda: _background_edwards(adata, offset=edwards_offset),
        "normexp": lambda: _background_normexp(adata, method=normexp_method),
        "mas": lambda: _background_mas(adata, grid_dim=mas_grid_dim),
    }
    try:
        corrector = dispatch[method]
    except KeyError:
        raise ValueError(f"Unknown background correction method: {method}") from None
    corrector()

    if offset != 0:
        adata.X = adata.X + offset

    # Record what was done so downstream steps can introspect the pipeline.
    metadata = {"method": method, "offset": offset}
    if method == "normexp":
        metadata["normexp_method"] = normexp_method
    elif method == "mas":
        metadata["mas_grid_dim"] = mas_grid_dim
    elif method == "edwards":
        metadata["edwards_offset"] = edwards_offset
    adata.uns["background_correction"] = metadata

    return adata if copy else None
182
+
183
+
184
def _background_none(adata: AnnData) -> None:
    """No-op background correction; intensities are left untouched."""
    return None
188
+
189
+
190
def _background_subtract(adata: AnnData) -> None:
    """Subtract the background layer from the foreground: E = E - Eb.

    Negative corrected values are possible with this method.
    """
    adata.X = adata.X - adata.layers["background"]
197
+
198
+
199
def _background_half(adata: AnnData) -> None:
    """Background subtraction floored at 0.5: E = max(E - Eb, 0.5).

    Guarantees no corrected value falls below 0.5.
    """
    background = adata.layers["background"]
    adata.X = np.maximum(adata.X - background, 0.5)
206
+
207
+
208
def _background_minimum(adata: AnnData) -> None:
    """Background subtraction with per-array replacement of negatives.

    After subtracting the background layer, any value below 1e-18 in an
    array is replaced by half of that array's smallest remaining value; if
    an array has no values >= 1e-18 at all, 1e-6 is used instead.
    """
    corrected = adata.X - adata.layers["background"]

    for row in range(adata.n_obs):
        values = corrected[row, :]
        too_small = values < 1e-18
        if not np.any(too_small):
            continue

        kept = values[~too_small]
        # Half the smallest surviving value, or a small constant if the
        # whole array was effectively non-positive.
        replacement = np.min(kept) / 2 if len(kept) > 0 else 1e-6
        corrected[row, too_small] = replacement

    adata.X = corrected
235
+
236
+
237
def _background_edwards(adata: AnnData, offset: float = 0.1) -> None:
    """Edwards log-linear interpolation for dull spots.

    Applies log-linear interpolation for spots where E - Eb < delta.
    The threshold delta is chosen per array such that the number of spots
    with 0 < E - Eb < delta is approximately offset * (number of spots
    with E - Eb <= 0).

    Parameters
    ----------
    adata
        AnnData object with foreground in .X and background in .layers['background']
    offset
        Fraction controlling threshold (default 0.1 = 10%)

    References
    ----------
    Edwards, D. (2003). Non-linear normalization and background correction
    in one-channel cDNA microarray studies. Bioinformatics 19, 825-833.
    """
    E = adata.X.copy()
    Eb = adata.layers["background"]
    sub = E - Eb

    # Compute delta threshold for each array (sample)
    n_samples = adata.n_obs
    delta = np.zeros((n_samples, 1))

    for i in range(n_samples):
        sub_sample = sub[i, :]
        # Fraction of negative/zero background-subtracted values
        neg_frac = np.mean(sub_sample < 1e-16)
        # Quantile at which interpolation kicks in. Clamp to 1.0: when
        # most spots are non-positive, neg_frac * (1 + offset) exceeds 1
        # and np.quantile would raise ValueError.
        quantile_val = min(neg_frac * (1 + offset), 1.0)
        delta[i, 0] = np.quantile(sub_sample, quantile_val)

    # Apply correction
    # where sub < delta: E = delta * exp(1 - (Eb + delta) / E)
    # where sub >= delta: E = sub
    corrected = np.where(sub < delta, delta * np.exp(1 - (Eb + delta) / E), sub)

    adata.X = corrected
278
+
279
+
280
def _background_normexp(adata: AnnData, method: str = "saddle") -> None:
    """Apply normexp (normal+exponential convolution) background correction.

    For each sample, fits the model parameters (mu, sigma, alpha) and
    replaces the observed intensities with the conditional expectation
    E[signal | observed]. The per-sample parameters are stored in
    ``adata.uns['normexp_params']``.

    Parameters
    ----------
    adata
        AnnData object with intensities in .X
    method
        Parameter estimation method: 'saddle', 'mle', 'rma', or 'rma75'
    """
    intensities = adata.X.copy()
    n_samples = intensities.shape[0]

    # Fitter dispatch. 'saddle' and 'mle' currently share the simplified
    # estimator (full optimization planned in step 6).
    fitters = {
        "rma": _normexp_fit_rma,
        "rma75": _normexp_fit_rma75,
        "saddle": _normexp_fit_simple,
        "mle": _normexp_fit_simple,
    }
    if method not in fitters:
        raise ValueError(f"Unknown normexp method: {method}")
    fit_params = fitters[method]

    # Per-sample parameter record.
    correction_params = {
        "mu": np.zeros(n_samples),
        "sigma": np.zeros(n_samples),
        "alpha": np.zeros(n_samples),
        "method": method,
    }

    for idx in range(n_samples):
        sample = intensities[idx, :]
        fitted = fit_params(sample)

        correction_params["mu"][idx] = fitted["mu"]
        correction_params["sigma"][idx] = fitted["sigma"]
        correction_params["alpha"][idx] = fitted["alpha"]

        intensities[idx, :] = _normexp_signal(fitted, sample)

    adata.X = intensities
    adata.uns["normexp_params"] = correction_params
331
+
332
+
333
+ def _normexp_fit_simple(x: np.ndarray) -> dict[str, float]:
334
+ """Simple parameter estimation for normexp (current RMA implementation).
335
+
336
+ Parameters
337
+ ----------
338
+ x
339
+ Intensity values for one sample
340
+
341
+ Returns:
342
+ -------
343
+ dict
344
+ Dictionary with keys 'mu', 'sigma', 'alpha'
345
+ """
346
+ # Estimate μ from histogram mode (0 to 75th percentile)
347
+ q75 = np.percentile(x, 75)
348
+ x_subset = x[x <= q75]
349
+
350
+ hist, bin_edges = np.histogram(x_subset, bins=100)
351
+ mode_idx = np.argmax(hist)
352
+ mu = (bin_edges[mode_idx] + bin_edges[mode_idx + 1]) / 2
353
+
354
+ # Estimate σ from probes with intensity < mu
355
+ x_low = x[x < mu]
356
+ if len(x_low) > 0:
357
+ sigma = np.std(x_low - mu) * np.sqrt(2)
358
+ else:
359
+ sigma = np.std(x) * 0.5
360
+ warnings.warn(
361
+ f"No probes below estimated μ={mu:.2f}. Using fallback σ estimation.",
362
+ UserWarning,
363
+ stacklevel=3,
364
+ )
365
+
366
+ # Ensure sigma is not too small
367
+ sigma = max(sigma, 1e-6)
368
+
369
+ # Fixed alpha for simple method
370
+ alpha = 0.03
371
+
372
+ return {"mu": mu, "sigma": sigma, "alpha": alpha}
373
+
374
+
375
+ def _normexp_fit_rma(x: np.ndarray) -> dict[str, float]:
376
+ """RMA-style parameter estimation for normexp.
377
+
378
+ Based on the original RMA algorithm from Affymetrix.
379
+ Uses mode-based estimation without mode correction.
380
+
381
+ Parameters
382
+ ----------
383
+ x
384
+ Intensity values for one sample
385
+
386
+ Returns:
387
+ -------
388
+ dict
389
+ Dictionary with keys 'mu', 'sigma', 'alpha'
390
+ """
391
+ # Get mode estimate using 5th percentile as rough approximation
392
+ mu = np.percentile(x, 5)
393
+
394
+ # Estimate sigma from background probes
395
+ bg_data = x[x < mu]
396
+ if len(bg_data) > 1:
397
+ sigma = np.std(bg_data - mu) * np.sqrt(2)
398
+ else:
399
+ # Fallback
400
+ sigma = np.std(x) * 0.5
401
+
402
+ # Ensure sigma > 0
403
+ sigma = max(sigma, 1e-6)
404
+
405
+ # Estimate alpha (mean signal)
406
+ alpha = np.mean(x) - mu
407
+ alpha = max(alpha, 1e-6)
408
+
409
+ return {"mu": mu, "sigma": sigma, "alpha": alpha}
410
+
411
+
412
def _normexp_fit_rma75(x: np.ndarray, n_pts: int = 2**14) -> dict[str, float]:
    """RMA-75 parameter estimation with mode correction.

    Implements the improved RMA-75 method from McGee & Chen (2006),
    which includes mode correction and uses 75th quantile for alpha estimation.

    Parameters
    ----------
    x
        Intensity values for one sample
    n_pts
        Number of points for density estimation (default 16384)

    Returns:
    -------
    dict
        Dictionary with keys 'mu', 'sigma', 'alpha'

    References:
    ----------
    McGee, M. and Chen, Z. (2006). Parameter estimation for the
    exponential-normal convolution model for background correction
    of Affymetrix GeneChip data. Stat Appl Genet Mol Biol, 5(1), Article 24.
    """
    # Local imports keep scipy.optimize off the module import path.
    from scipy.optimize import brentq
    from scipy.stats import gaussian_kde

    def max_density(data, n_pts):
        """Find mode using kernel density estimation."""
        # Degenerate input: KDE needs at least two points.
        if len(data) < 2:
            return np.median(data)
        # Use Epanechnikov kernel equivalent
        # NOTE(review): gaussian_kde uses a Gaussian kernel with Scott's
        # bandwidth; "Epanechnikov equivalent" presumably refers to the R
        # reference implementation — confirm the kernels agree closely enough.
        kde = gaussian_kde(data, bw_method="scott")
        x_range = np.linspace(data.min(), data.max(), n_pts)
        density = kde(x_range)
        # Mode = grid point with the highest estimated density.
        return x_range[np.argmax(density)]

    def mu_est_correct(m, s, a):
        """Mode correction function."""

        # Root of f gives the standardized shift between the observed mode
        # m and the corrected background mean (McGee & Chen 2006, eq. for
        # the mode of the normexp density). Here a is a rate (1/alpha3).
        def f(t):
            z1 = t - s * a
            z2 = m / s + s * a
            return norm.pdf(z1) - s * a * (norm.cdf(z1) + norm.cdf(z2) - 1)

        try:
            # Bracketing interval [-5, 10] in standardized units.
            t = brentq(f, -5, 10, xtol=1e-12)
            return m - s * t
        except ValueError:
            # If root finding fails, return original mode
            return m

    # Get initial mode estimate
    pmbg = max_density(x, min(n_pts, len(x) // 2))
    bg_data = x[x < pmbg]

    # Refine the mode using only the sub-mode (background-dominated) probes.
    if len(bg_data) > 0:
        pmbg = max_density(bg_data, min(n_pts, len(bg_data) // 2))

    mubg = pmbg  # Initial mode

    # Estimate sigma from background
    # RMS deviation about the mode, scaled by sqrt(2) because only the
    # lower half of the background distribution is observed below the mode.
    bg_data = x[x < pmbg]
    if len(bg_data) > 1:
        bg_data_centered = bg_data - pmbg
        bgsd = np.sqrt(np.sum(bg_data_centered**2) / (len(bg_data) - 1)) * np.sqrt(2)
    else:
        bgsd = np.std(x) * 0.5

    # Estimate alpha from 75th quantile
    # Exponential-tail estimate: for Exp(mean theta), q-quantile above mu
    # is -theta * log(1 - q).
    q75 = 0.75
    alpha3 = -(np.quantile(x, q75) - pmbg) / np.log(1 - q75)

    # Apply mode correction
    mu3 = mu_est_correct(m=mubg, s=bgsd, a=1 / alpha3)
    mu3 = (mu3 + mubg) / 2  # Average with original mode

    # Re-estimate sigma with corrected mode
    bg_data3 = x[x < mu3]
    if len(bg_data3) > 1:
        bg_data3_centered = bg_data3 - mu3
        bgsd3 = np.sqrt(np.sum(bg_data3_centered**2) / (len(bg_data3) - 1)) * np.sqrt(2)
    else:
        bgsd3 = bgsd

    # Re-estimate alpha
    alpha3 = -(np.quantile(x, q75) - mu3) / np.log(1 - q75)

    # Returned alpha is 1/alpha3, i.e. a rate-like quantity; sigma and
    # alpha are floored at 1e-6 to keep _normexp_signal's validation happy.
    return {"mu": mu3, "sigma": max(bgsd3, 1e-6), "alpha": max(1 / alpha3, 1e-6)}
501
+
502
+
503
+ def _normexp_signal(params: dict[str, float], x: np.ndarray) -> np.ndarray:
504
+ """Compute expected signal given observed intensity in normexp model.
505
+
506
+ Computes E[signal | observed] using the normal+exponential convolution model.
507
+
508
+ Parameters
509
+ ----------
510
+ params
511
+ Dictionary with keys 'mu', 'sigma', 'alpha'
512
+ x
513
+ Observed intensity values
514
+
515
+ Returns:
516
+ -------
517
+ np.ndarray
518
+ Corrected signal values
519
+
520
+ References:
521
+ ----------
522
+ Ritchie et al. (2007). A comparison of background correction methods
523
+ for two-colour microarrays. Bioinformatics 23, 2700-2707.
524
+ """
525
+ mu = params["mu"]
526
+ sigma = params["sigma"]
527
+ alpha = params["alpha"]
528
+
529
+ if alpha <= 0:
530
+ raise ValueError("alpha must be positive")
531
+ if sigma <= 0:
532
+ raise ValueError("sigma must be positive")
533
+
534
+ # Compute mu.sf = x - mu - sigma²/alpha
535
+ sigma2 = sigma * sigma
536
+ mu_sf = x - mu - sigma2 / alpha
537
+
538
+ # Compute signal = mu_sf + sigma² * exp(log_pdf - log_cdf)
539
+ # Using log-space for numerical stability
540
+ a_std = mu_sf / sigma
541
+
542
+ log_pdf = norm.logpdf(a_std)
543
+ log_cdf = norm.logcdf(a_std)
544
+
545
+ # Handle numerical issues
546
+ with warnings.catch_warnings():
547
+ warnings.filterwarnings("ignore", category=RuntimeWarning)
548
+ correction_term = sigma * np.exp(log_pdf - log_cdf)
549
+ correction_term = np.nan_to_num(correction_term, nan=0.0, posinf=0.0, neginf=0.0)
550
+
551
+ signal = mu_sf + correction_term
552
+
553
+ # Ensure non-negative values
554
+ if np.any(signal < 0):
555
+ warnings.warn(
556
+ "Numerical accuracy limit reached with very low intensity or high background. "
557
+ "Setting adjusted intensities to small positive value.",
558
+ UserWarning,
559
+ stacklevel=3,
560
+ )
561
+ signal = np.maximum(signal, 1e-6)
562
+
563
+ return signal
564
+
565
+
566
def rma_background_correct(
    adata: AnnData,
    copy: bool = False,
) -> AnnData | None:
    """RMA background correction (backward-compatible wrapper).

    Thin wrapper around :func:`background_correct` with ``method='normexp'``
    and ``normexp_method='saddle'``; kept for compatibility with earlier
    releases.

    The convolution model treats each observed intensity as
    normal(mu, sigma^2) background plus exponential signal and replaces it
    with the conditional expectation E[signal | observed]. Per sample:

    1. mu is estimated from the mode of the intensity distribution
    2. sigma is estimated from the spread of low-intensity probes
    3. intensities are replaced by E[signal | observed]

    Parameters
    ----------
    adata
        AnnData object with shape (n_samples, n_probes).
        Must contain raw intensity values in `.X`.
    copy
        If True, return a copy of the AnnData object. If False, modify in place.

    Returns
    -------
    AnnData or None
        Corrected AnnData when ``copy=True``, else None. Parameters are
        stored in `.uns['background_correction']`, `.uns['normexp_params']`
        and, for backward compatibility, `.uns['rma_background']`.

    Examples
    --------
    >>> import microarray as ma
    >>> adata = ma.io.cel_batch_to_anndata(cel_files, cdf_path, annotation_db)
    >>> adata_corrected = ma.pp.rma_background_correct(adata, copy=True)

    See Also
    --------
    background_correct : Unified interface with multiple background correction methods
    """
    result = background_correct(adata, method="normexp", normexp_method="saddle", copy=copy)

    # Mirror the fitted parameters under the legacy key on whichever
    # object actually holds the corrected data.
    target = result if copy else adata
    if "normexp_params" in target.uns:
        target.uns["rma_background"] = target.uns["normexp_params"].copy()
        target.uns["rma_background"]["method"] = "saddle"

    return result if copy else None
638
+
639
+
640
def _background_mas(adata: AnnData, grid_dim: int = 16) -> None:
    """MAS5 spatial background correction with grid-based smoothing.

    Divides the chip into ``grid_dim`` regions, estimates each region's
    background as the mean (and noise as the standard deviation) of its
    lowest 2% of intensities, then corrects every probe using an
    inverse-squared-distance weighted combination of the regional values:
    ``corrected = max(intensity - bg_weighted, 0.5 * noise_weighted)``.

    Parameters
    ----------
    adata : AnnData
        AnnData object with probe intensities. Must have 'x' and 'y' columns in
        `.var` containing probe spatial coordinates.
    grid_dim : int, default=16
        Number of grid regions (must be a perfect square).

    Raises
    ------
    ValueError
        If ``grid_dim`` is not a perfect square or spatial coordinates are
        missing from ``adata.var``.

    References
    ----------
    Affymetrix (2002). Statistical Algorithms Description Document.
    """
    # Validate grid_dim is a perfect square
    grid_dim_1d = int(np.sqrt(grid_dim))
    if grid_dim_1d**2 != grid_dim:
        raise ValueError(f"grid_dim must be a perfect square, got {grid_dim}")

    # Check for spatial coordinates
    if "x" not in adata.var.columns or "y" not in adata.var.columns:
        raise ValueError(
            "MAS5 background correction requires probe spatial coordinates in adata.var['x'] and adata.var['y']"
        )

    x = adata.var["x"].values
    y = adata.var["y"].values

    # Infer chip dimensions from max coordinates
    rows = int(np.max(x)) + 1
    cols = int(np.max(y)) + 1

    n_samples, n_probes = adata.X.shape

    # Grid geometry: centroids, boundaries, per-probe region assignment.
    centroidx, centroidy = _get_centroids(rows, cols, grid_dim_1d, grid_dim_1d)
    gridpt_x, gridpt_y = _get_gridpts(rows, cols, grid_dim)
    whichgrid = _compute_grids(x, y, rows, cols, n_probes, gridpt_x, gridpt_y)

    # (n_probes, grid_dim) inverse-distance weights. The smoothing constant
    # in _compute_weights keeps every weight strictly positive, so the row
    # sums are safe to divide by.
    weights = _compute_weights(x, y, n_probes, grid_dim, centroidx, centroidy)
    weight_sums = weights.sum(axis=1)

    corrected = np.zeros_like(adata.X)

    # Process each sample independently
    for j in range(n_samples):
        probe_intensity = adata.X[j, :]

        # Background and noise per grid region for this sample.
        bg_grid, noise_grid = _compute_background_quadrant(probe_intensity, n_probes, grid_dim, whichgrid)

        # Weighted averages over regions for all probes at once (replaces
        # a per-probe Python loop over _background_correct_probe).
        bg_weighted = (weights @ bg_grid) / weight_sums
        noise_weighted = (weights @ noise_grid) / weight_sums

        # Corrected value = max(intensity - bg, 0.5 * noise)
        corrected[j, :] = np.maximum(probe_intensity - bg_weighted, 0.5 * noise_weighted)

    adata.X = corrected
722
+
723
+
724
+ def _get_centroids(
725
+ rows: int, cols: int, grid_dim_rows: int, grid_dim_cols: int
726
+ ) -> tuple[NDArray[np.floating], NDArray[np.floating]]:
727
+ """Compute centroids of grid regions."""
728
+ grid_dim = grid_dim_rows * grid_dim_cols
729
+
730
+ cuts_x = np.array([(i + 1) * rows / grid_dim_rows - rows / (2.0 * grid_dim_rows) for i in range(grid_dim_rows)])
731
+ cuts_y = np.array([(j + 1) * cols / grid_dim_cols - cols / (2.0 * grid_dim_cols) for j in range(grid_dim_cols)])
732
+
733
+ centroidx = np.zeros(grid_dim)
734
+ centroidy = np.zeros(grid_dim)
735
+
736
+ for j in range(grid_dim_cols):
737
+ for i in range(grid_dim_rows):
738
+ idx = j * grid_dim_rows + i
739
+ centroidx[idx] = cuts_x[idx // grid_dim_rows] + 0.5
740
+ centroidy[idx] = cuts_y[idx % grid_dim_rows] + 0.5
741
+
742
+ return centroidx, centroidy
743
+
744
+
745
+ def _get_gridpts(rows: int, cols: int, grid_dim: int) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
746
+ """Compute grid boundary points."""
747
+ grid_dim_1d = int(np.sqrt(grid_dim))
748
+
749
+ gridpt_x = np.array([(i + 1) * cols // grid_dim_1d for i in range(grid_dim_1d - 1)])
750
+ gridpt_y = np.array([(i + 1) * rows // grid_dim_1d for i in range(grid_dim_1d - 1)])
751
+
752
+ return gridpt_x, gridpt_y
753
+
754
+
755
+ def _compute_grids(
756
+ x: NDArray[np.int_],
757
+ y: NDArray[np.int_],
758
+ rows: int,
759
+ cols: int,
760
+ n_probes: int,
761
+ gridpt_x: NDArray[np.int_],
762
+ gridpt_y: NDArray[np.int_],
763
+ ) -> NDArray[np.int_]:
764
+ """Assign each probe to a grid region."""
765
+ grid_dim_1d = len(gridpt_x) + 1
766
+ whichgrid = np.zeros(n_probes, dtype=np.int32)
767
+
768
+ for i in range(n_probes):
769
+ # Find x grid
770
+ x_grid = 0
771
+ for j in range(len(gridpt_x)):
772
+ if x[i] <= gridpt_x[j]:
773
+ x_grid = j
774
+ break
775
+ else:
776
+ x_grid = len(gridpt_x)
777
+
778
+ # Find y grid
779
+ y_grid = 0
780
+ for j in range(len(gridpt_y)):
781
+ if y[i] <= gridpt_y[j]:
782
+ y_grid = j
783
+ break
784
+ else:
785
+ y_grid = len(gridpt_y)
786
+
787
+ # Grid index (1-based)
788
+ whichgrid[i] = x_grid * grid_dim_1d + y_grid + 1
789
+
790
+ return whichgrid
791
+
792
+
793
+ def _compute_weights(
794
+ x: NDArray[np.int_],
795
+ y: NDArray[np.int_],
796
+ n_probes: int,
797
+ grid_dim: int,
798
+ centroidx: NDArray[np.floating],
799
+ centroidy: NDArray[np.floating],
800
+ ) -> NDArray[np.floating]:
801
+ """Compute inverse distance weights for each probe to each grid centroid."""
802
+ smooth = 100.0
803
+ weights = np.zeros((n_probes, grid_dim), dtype=np.float64)
804
+
805
+ for i in range(n_probes):
806
+ # Compute squared distances to all centroids
807
+ distances_sq = (x[i] - centroidx) ** 2 + (y[i] - centroidy) ** 2
808
+
809
+ # Inverse distance weights
810
+ weights[i, :] = 1.0 / (distances_sq + smooth)
811
+
812
+ return weights
813
+
814
+
815
+ def _compute_background_quadrant(
816
+ probe_intensity: NDArray[np.floating],
817
+ n_probes: int,
818
+ grid_dim: int,
819
+ whichgrid: NDArray[np.int_],
820
+ ) -> tuple[NDArray[np.floating], NDArray[np.floating]]:
821
+ """Compute background and noise for each grid region.
822
+
823
+ Background is mean of lowest 2% of intensities.
824
+ Noise is standard deviation of lowest 2%.
825
+ """
826
+ bg_grid = np.zeros(grid_dim, dtype=np.float64)
827
+ noise_grid = np.zeros(grid_dim, dtype=np.float64)
828
+
829
+ for j in range(grid_dim):
830
+ # Get probes in this grid (whichgrid is 1-based)
831
+ mask = whichgrid == (j + 1)
832
+ if not np.any(mask):
833
+ bg_grid[j] = 0.0
834
+ noise_grid[j] = 1.0
835
+ continue
836
+
837
+ grid_intensities = probe_intensity[mask]
838
+
839
+ # Sort to find lowest 2%
840
+ grid_sorted = np.sort(grid_intensities)
841
+ lower_2pc = max(1, int(0.02 * len(grid_sorted)))
842
+
843
+ # Mean and std of lowest 2%
844
+ lowest = grid_sorted[:lower_2pc]
845
+ bg_grid[j] = np.mean(lowest)
846
+ noise_grid[j] = np.std(lowest, ddof=1) if len(lowest) > 1 else 1.0
847
+
848
+ return bg_grid, noise_grid
849
+
850
+
851
+ def _background_correct_probe(
852
+ x: int,
853
+ y: int,
854
+ grid_dim: int,
855
+ weights: NDArray[np.floating],
856
+ centroid_values: NDArray[np.floating],
857
+ ) -> float:
858
+ """Compute weighted background value for a single probe."""
859
+ weighted_sum = np.sum(weights * centroid_values)
860
+ sum_weights = np.sum(weights)
861
+
862
+ return weighted_sum / sum_weights if sum_weights > 0 else 0.0