microarray 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. microarray/__init__.py +15 -0
  2. microarray/_version.py +3 -0
  3. microarray/datasets/__init__.py +3 -0
  4. microarray/datasets/_arrayexpress.py +1 -0
  5. microarray/datasets/_cdf_files.py +35 -0
  6. microarray/datasets/_geo.py +1 -0
  7. microarray/datasets/_utils.py +143 -0
  8. microarray/io/__init__.py +17 -0
  9. microarray/io/_anndata_converter.py +198 -0
  10. microarray/io/_cdf.py +575 -0
  11. microarray/io/_cel.py +591 -0
  12. microarray/io/_read.py +127 -0
  13. microarray/plotting/__init__.py +28 -0
  14. microarray/plotting/_base.py +253 -0
  15. microarray/plotting/_cel.py +75 -0
  16. microarray/plotting/_de_plots.py +239 -0
  17. microarray/plotting/_diagnostic_plots.py +268 -0
  18. microarray/plotting/_heatmap.py +279 -0
  19. microarray/plotting/_ma_plots.py +136 -0
  20. microarray/plotting/_pca.py +320 -0
  21. microarray/plotting/_qc_plots.py +335 -0
  22. microarray/plotting/_score.py +38 -0
  23. microarray/plotting/_top_table_heatmap.py +98 -0
  24. microarray/plotting/_utils.py +280 -0
  25. microarray/preprocessing/__init__.py +39 -0
  26. microarray/preprocessing/_background.py +862 -0
  27. microarray/preprocessing/_log2.py +77 -0
  28. microarray/preprocessing/_normalize.py +1292 -0
  29. microarray/preprocessing/_rma.py +243 -0
  30. microarray/preprocessing/_robust.py +170 -0
  31. microarray/preprocessing/_summarize.py +318 -0
  32. microarray/py.typed +0 -0
  33. microarray/tools/__init__.py +26 -0
  34. microarray/tools/_biomart.py +416 -0
  35. microarray/tools/_empirical_bayes.py +401 -0
  36. microarray/tools/_fdist.py +171 -0
  37. microarray/tools/_linear_models.py +387 -0
  38. microarray/tools/_mds.py +101 -0
  39. microarray/tools/_pca.py +88 -0
  40. microarray/tools/_score.py +86 -0
  41. microarray/tools/_toptable.py +360 -0
  42. microarray-0.1.0.dist-info/METADATA +75 -0
  43. microarray-0.1.0.dist-info/RECORD +44 -0
  44. microarray-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,1292 @@
1
+ """Normalization methods for microarray data."""
2
+
3
+ import warnings
4
+
5
+ import numpy as np
6
+ from anndata import AnnData
7
+
8
+
9
def normalize_quantile(
    adata: AnnData,
    copy: bool = False,
) -> AnnData | None:
    """Quantile-normalize intensities so every sample shares one distribution.

    Each sample's values are sorted, the per-rank mean across samples becomes
    the common target distribution, and every sample is then mapped back onto
    that target in its original probe order. Rank order within a sample is
    preserved while the marginal distributions become identical across samples,
    removing systematic technical variation.

    Parameters
    ----------
    adata
        AnnData of shape (n_samples, n_probes) with intensity values in `.X`.
    copy
        If True, operate on and return a copy; otherwise modify `adata`
        in place and return None.

    Returns
    -------
    AnnData or None
        The normalized object when `copy=True`, else None. Normalized values
        go to `.X`; run metadata is recorded in `.uns['normalization']`.

    Examples
    --------
    >>> import microarray as ma
    >>> adata = ma.io.cel_batch_to_anndata(cel_files, cdf_path, annotation_db)
    >>> adata_norm = ma.pp.normalize_quantile(adata, copy=True)

    Notes
    -----
    With a single sample there is nothing to normalize against: a warning is
    emitted and the data is returned unchanged.

    Reference:
        Bolstad, B.M., Irizarry, R.A., Astrand, M., Speed, T.P. (2003).
        A comparison of normalization methods for high density oligonucleotide
        array data based on variance and bias. Bioinformatics, 19(2), 185-193.
    """
    adata = adata.copy() if copy else adata

    if adata.X is None:
        raise ValueError("AnnData object must have .X attribute with intensity values")

    n_samples, n_probes = adata.shape

    # A lone sample cannot be normalized against anything else.
    if n_samples == 1:
        warnings.warn(
            "Quantile normalization requires multiple samples for proper normalization. "
            "With only 1 sample, data is returned unchanged.",
            UserWarning,
            stacklevel=2,
        )
        adata.uns["normalization"] = {
            "method": "quantile",
            "n_samples": n_samples,
            "applied": False,
            "reason": "single_sample",
        }
        return adata if copy else None

    intensities = adata.X.copy()

    # Target distribution: mean intensity at each rank, taken across samples.
    per_sample_sorted = np.sort(intensities, axis=1)
    target = per_sample_sorted.mean(axis=0)

    # Rank of every probe within its own sample (double-argsort idiom); used
    # to place the target values back in each sample's original probe order.
    probe_ranks = np.argsort(np.argsort(intensities, axis=1), axis=1)

    adata.X = target[probe_ranks]
    adata.uns["normalization"] = {
        "method": "quantile",
        "n_samples": n_samples,
        "n_probes": n_probes,
        "applied": True,
    }

    return adata if copy else None
110
+
111
+
112
def normalize_constant(
    adata: AnnData,
    refindex: int = 0,
    method: str = "mean",
    target_value: float | None = None,
    copy: bool = False,
) -> AnnData | None:
    """Scale every array so a per-sample summary statistic matches a reference.

    Each sample is multiplied by ``ref_constant / sample_constant``, where the
    per-sample constant is its mean or median intensity. The reference is the
    constant of the sample at ``refindex``, or ``target_value`` when given
    (e.g. MAS5-style scaling to a target of 500).

    Parameters
    ----------
    adata
        AnnData of shape (n_samples, n_probes) with intensity values in `.X`.
    refindex
        0-based index of the reference sample. Ignored when ``target_value``
        is provided.
    method
        Summary statistic for the constants: "mean" or "median".
    target_value
        Fixed value every sample's constant is scaled to, instead of matching
        a reference sample.
    copy
        If True, operate on and return a copy; otherwise modify in place.

    Returns
    -------
    AnnData or None
        Normalized object when `copy=True`, else None. Normalized values go
        to `.X`; run details are stored in `.uns['normalize_constant']`.

    Examples
    --------
    >>> import microarray as ma
    >>> adata = ma.io.cel_batch_to_anndata(cel_files, cdf_path, annotation_db)
    >>> adata_norm = ma.pp.normalize_constant(adata, method="median", copy=True)

    Notes
    -----
    This is the simplest normalization scheme: it assumes most genes are not
    differentially expressed and that technical variation is a uniform
    multiplicative shift per array. The per-sample transform is
    ``X_norm[i] = X[i] * (ref_constant / sample_constant[i])``.
    See ``normalize.constant`` in the affy Bioconductor package documentation.
    """
    adata = adata.copy() if copy else adata

    if adata.X is None:
        raise ValueError("AnnData object must have .X attribute with intensity values")

    if method not in ["mean", "median"]:
        raise ValueError(f"method must be 'mean' or 'median', got '{method}'")

    n_samples, n_probes = adata.shape

    if refindex < 0 or refindex >= n_samples:
        raise ValueError(f"refindex must be between 0 and {n_samples - 1}, got {refindex}")

    intensities = adata.X.copy()

    # Per-sample summary constant.
    summarize = np.mean if method == "mean" else np.median
    constants = summarize(intensities, axis=1)

    # An explicit target wins; otherwise use the chosen reference sample.
    ref_constant = target_value if target_value is not None else constants[refindex]

    # A zero constant would divide by zero; such samples pass through with a
    # unit divisor (i.e. unchanged), as the warning states.
    if np.any(constants == 0):
        warnings.warn(
            "Some samples have zero constant value. These samples will not be normalized.",
            UserWarning,
            stacklevel=2,
        )
        constants = np.where(constants == 0, 1.0, constants)

    scale_factors = ref_constant / constants
    adata.X = intensities * scale_factors[:, np.newaxis]

    adata.uns["normalize_constant"] = {
        "method": method,
        "refindex": refindex if target_value is None else None,
        "target_value": float(target_value) if target_value is not None else None,
        "ref_constant": float(ref_constant),
        "scale_factors": scale_factors.tolist(),
        "n_samples": n_samples,
        "n_probes": n_probes,
        "applied": True,
    }

    return adata if copy else None
229
+
230
+
231
def normalize_quantile_robust(
    adata: AnnData,
    weights: np.ndarray | None = None,
    remove_extreme: str = "none",
    n_remove: int = 0,
    use_median: bool = False,
    use_log2: bool = False,
    copy: bool = False,
) -> AnnData | None:
    """Quantile normalization with chip weighting and optional outlier exclusion.

    An extension of plain quantile normalization for studies where some arrays
    are of lower quality: per-chip weights shape the target distribution, and
    extreme arrays can be left out of the target computation (they are still
    normalized themselves, just not used to define the target).

    Parameters
    ----------
    adata
        AnnData of shape (n_samples, n_probes) with intensity values in `.X`.
    weights
        Optional per-sample weights (length n_samples); larger weights give a
        sample more influence on the target distribution. None means equal.
    remove_extreme
        How extreme samples are identified for exclusion:
        "none" (default), "variance" (highest variance), "mean" (mean
        furthest from the overall median), or "both".
    n_remove
        How many extreme samples to exclude per criterion (default 0).
    use_median
        Use the (unweighted) median of the sorted rows as the target instead
        of the weighted mean.
    use_log2
        Work on the log2(x + 1) scale and transform back afterwards.
    copy
        If True, operate on and return a copy; otherwise modify in place.

    Returns
    -------
    AnnData or None
        Normalized object when `copy=True`, else None. Normalized values go
        to `.X`; details land in `.uns['normalize_quantile_robust']`.

    Examples
    --------
    >>> import microarray as ma
    >>> import numpy as np
    >>> adata = ma.io.cel_batch_to_anndata(cel_files, cdf_path, annotation_db)
    >>> adata_norm = ma.pp.normalize_quantile_robust(adata, remove_extreme="variance", n_remove=2, copy=True)
    >>> weights = np.array([1.0, 0.8, 1.0, 0.9, 1.0])
    >>> adata_norm = ma.pp.normalize_quantile_robust(adata, weights=weights, copy=True)

    Notes
    -----
    Marked experimental in the affy R package; a warning is always emitted.
    When ``use_median=True`` the weights are not applied (plain median).
    See ``normalize.quantiles.robust`` in the affy Bioconductor documentation.
    """
    adata = adata.copy() if copy else adata

    if adata.X is None:
        raise ValueError("AnnData object must have .X attribute with intensity values")

    if remove_extreme not in ["none", "variance", "mean", "both"]:
        raise ValueError(f"remove_extreme must be one of 'none', 'variance', 'mean', 'both', got '{remove_extreme}'")

    n_samples, n_probes = adata.shape

    # Always flag the experimental status, matching the R package's caveat.
    warnings.warn(
        "Robust quantile normalization is marked as experimental. Use with caution and validate results.",
        UserWarning,
        stacklevel=2,
    )

    if n_samples == 1:
        warnings.warn(
            "Quantile normalization requires multiple samples. With only 1 sample, data is returned unchanged.",
            UserWarning,
            stacklevel=2,
        )
        adata.uns["normalize_quantile_robust"] = {
            "method": "quantile_robust",
            "n_samples": n_samples,
            "applied": False,
            "reason": "single_sample",
        }
        return adata if copy else None

    mat = adata.X.copy()

    # Optional log scale; +1 keeps zeros finite.
    if use_log2:
        mat = np.log2(mat + 1)

    if weights is None:
        weights = np.ones(n_samples)
    else:
        weights = np.array(weights)
        if len(weights) != n_samples:
            raise ValueError(f"weights must have length {n_samples}, got {len(weights)}")

    # Mark samples to leave out of the target-distribution computation.
    exclude_mask = np.zeros(n_samples, dtype=bool)

    if remove_extreme != "none" and n_remove > 0:
        if n_remove >= n_samples:
            raise ValueError(f"n_remove ({n_remove}) must be less than n_samples ({n_samples})")

        if remove_extreme in ["variance", "both"]:
            by_variance = np.argsort(np.var(mat, axis=1))[::-1]  # highest first
            exclude_mask[by_variance[:n_remove]] = True

        if remove_extreme in ["mean", "both"]:
            sample_means = np.mean(mat, axis=1)
            # Most extreme = furthest from the overall median of the means.
            deviations = np.abs(sample_means - np.median(sample_means))
            by_deviation = np.argsort(deviations)[::-1]  # highest first
            exclude_mask[by_deviation[:n_remove]] = True

    include_mask = ~exclude_mask
    n_included = np.sum(include_mask)

    if n_included == 0:
        raise ValueError("All samples would be excluded. Reduce n_remove.")

    sorted_rows = np.sort(mat, axis=1)

    # Target distribution over the included samples only.
    if use_median:
        # A true weighted median is more involved; plain median is used here.
        rank_targets = np.median(sorted_rows[include_mask], axis=0)
    else:
        kept_weights = weights[include_mask]
        weighted = sorted_rows[include_mask] * kept_weights[:, np.newaxis]
        rank_targets = np.sum(weighted, axis=0) / np.sum(kept_weights)

    # Map every sample back through its within-sample ranks.
    probe_ranks = np.argsort(np.argsort(mat, axis=1), axis=1)
    normalized = rank_targets[probe_ranks]

    if use_log2:
        normalized = np.power(2, normalized) - 1
        normalized = np.maximum(normalized, 0)  # guard against tiny negatives

    adata.X = normalized
    adata.uns["normalize_quantile_robust"] = {
        "method": "quantile_robust",
        "n_samples": n_samples,
        "n_probes": n_probes,
        "n_included": int(n_included),
        "n_excluded": int(np.sum(exclude_mask)),
        "remove_extreme": remove_extreme,
        "n_remove": n_remove,
        "use_median": use_median,
        "use_log2": use_log2,
        "excluded_indices": np.where(exclude_mask)[0].tolist(),
        "applied": True,
    }

    return adata if copy else None
420
+
421
+
422
+ def _lowess_fit(x: np.ndarray, y: np.ndarray, span: float = 2 / 3) -> np.ndarray:
423
+ """Simple LOWESS (Locally Weighted Scatterplot Smoothing) implementation.
424
+
425
+ Parameters
426
+ ----------
427
+ x
428
+ Independent variable values (sorted).
429
+ y
430
+ Dependent variable values.
431
+ span
432
+ Fraction of data to use for local regression (0 < span <= 1).
433
+
434
+ Returns:
435
+ -------
436
+ np.ndarray
437
+ Smoothed y values.
438
+ """
439
+ n = len(x)
440
+ y_smooth = np.zeros(n)
441
+ k = int(np.ceil(span * n))
442
+
443
+ for i in range(n):
444
+ # Get local neighborhood
445
+ distances = np.abs(x - x[i])
446
+ nearest = np.argsort(distances)[:k]
447
+
448
+ # Compute weights using tricube kernel
449
+ max_dist = distances[nearest[-1]]
450
+ if max_dist > 0:
451
+ weights = (1 - (distances[nearest] / max_dist) ** 3) ** 3
452
+ else:
453
+ weights = np.ones(k)
454
+
455
+ # Weighted linear regression
456
+ X_local = np.column_stack([np.ones(k), x[nearest]])
457
+ W = np.diag(weights)
458
+ try:
459
+ beta = np.linalg.lstsq(X_local.T @ W @ X_local, X_local.T @ W @ y[nearest], rcond=None)[0]
460
+ y_smooth[i] = beta[0] + beta[1] * x[i]
461
+ except np.linalg.LinAlgError:
462
+ # Fallback to weighted mean if regression fails
463
+ y_smooth[i] = np.average(y[nearest], weights=weights)
464
+
465
+ return y_smooth
466
+
467
+
468
def normalize_loess(
    adata: AnnData,
    subset: int = 5000,
    span: float = 2 / 3,
    iterations: int = 1,
    epsilon: float = 0.01,
    copy: bool = False,
) -> AnnData | None:
    """Apply loess (locally weighted scatterplot smoothing) normalization.

    Uses MA plots (M = log-intensity difference, A = average log-intensity)
    to normalize each array against the median reference array, fitting a
    smooth curve to remove intensity-dependent bias.

    Parameters
    ----------
    adata
        AnnData of shape (n_samples, n_probes) with intensity values in `.X`.
    subset
        Number of probes used to fit the loess curve (speed optimization);
        all probes are used when ``subset >= n_probes``. Default is 5000.
    span
        Fraction of points used in each local regression (0 < span <= 1).
        Larger values give smoother curves. Default is 2/3.
    iterations
        Maximum number of normalization passes. Default is 1.
    epsilon
        Convergence threshold: stop early when the maximum change between
        passes falls below this value. Default is 0.01.
    copy
        If True, operate on and return a copy; otherwise modify in place.

    Returns
    -------
    AnnData or None
        Normalized object when `copy=True`, else None. Normalized values go
        to `.X`; run details land in `.uns['normalize_loess']`.

    Examples
    --------
    >>> import microarray as ma
    >>> adata = ma.io.cel_batch_to_anndata(cel_files, cdf_path, annotation_db)
    >>> adata_norm = ma.pp.normalize_loess(adata, subset=3000, iterations=2, copy=True)

    Notes
    -----
    Per pass: (1) the reference is the per-probe median across samples;
    (2) M and A are computed on the probe subset for each sample; (3) a
    loess curve M ~ A is fit; (4) the correction is interpolated to all
    probes and subtracted; (5) repeat until converged or out of iterations.

    Fixes relative to the previous revision: ``iterations_performed`` now
    reports the actual number of passes (it previously reported the
    requested count even after early convergence), and probe subsetting
    uses a local ``np.random.default_rng`` so NumPy's global RNG state is
    no longer mutated.

    Reference:
        Yang, Y.H., et al. (2002). Normalization for cDNA microarray data:
        a robust composite method addressing single and multiple slide
        systematic variation. Nucleic Acids Research, 30(4), e15.
    """
    adata = adata.copy() if copy else adata

    # Validate input
    if adata.X is None:
        raise ValueError("AnnData object must have .X attribute with intensity values")

    if span <= 0 or span > 1:
        raise ValueError(f"span must be between 0 and 1, got {span}")

    if iterations < 1:
        raise ValueError(f"iterations must be at least 1, got {iterations}")

    n_samples, n_probes = adata.shape

    if n_samples == 1:
        warnings.warn(
            "Loess normalization requires multiple samples. With only 1 sample, data is returned unchanged.",
            UserWarning,
            stacklevel=2,
        )
        adata.uns["normalize_loess"] = {
            "method": "loess",
            "n_samples": n_samples,
            "applied": False,
            "reason": "single_sample",
        }
        return adata if copy else None

    # Work on log scale (+1 avoids log(0)).
    X_normalized = np.log2(adata.X + 1)

    # Probe subset for curve fitting. A local Generator keeps the global
    # np.random state untouched (np.random.seed would clobber it).
    n_subset = min(subset, n_probes)
    if n_subset < n_probes:
        rng = np.random.default_rng(42)
        subset_indices = np.sort(rng.choice(n_probes, n_subset, replace=False))
    else:
        subset_indices = np.arange(n_probes)

    iterations_performed = 0
    max_change = np.inf
    for _ in range(iterations):
        X_prev = X_normalized.copy()

        # Reference array: per-probe median across samples.
        reference = np.median(X_normalized, axis=0)

        for i in range(n_samples):
            # M and A on the fitting subset.
            M_subset = X_normalized[i, subset_indices] - reference[subset_indices]
            A_subset = (X_normalized[i, subset_indices] + reference[subset_indices]) / 2

            # _lowess_fit expects x sorted ascending.
            sort_idx = np.argsort(A_subset)
            A_sorted = A_subset[sort_idx]
            M_sorted = M_subset[sort_idx]

            M_smooth = _lowess_fit(A_sorted, M_sorted, span=span)

            # Extend the fitted curve to every probe by interpolation.
            A_all = (X_normalized[i, :] + reference) / 2
            M_correction = np.interp(A_all, A_sorted, M_smooth)

            X_normalized[i, :] = X_normalized[i, :] - M_correction

        iterations_performed += 1
        max_change = np.max(np.abs(X_normalized - X_prev))
        if max_change < epsilon:
            break

    # Convert back to original scale; clamp tiny negatives from round-trip.
    X_normalized = np.power(2, X_normalized) - 1
    X_normalized = np.maximum(X_normalized, 0)

    adata.X = X_normalized
    adata.uns["normalize_loess"] = {
        "method": "loess",
        "n_samples": n_samples,
        "n_probes": n_probes,
        "subset": n_subset,
        "span": span,
        "iterations": iterations,
        "iterations_performed": iterations_performed,
        "epsilon": epsilon,
        "converged": bool(max_change < epsilon) if iterations > 1 else None,
        "applied": True,
    }

    return adata if copy else None
635
+
636
+
637
def normalize_invariantset(
    adata: AnnData,
    baseline_type: str = "mean",
    baseline_index: int | None = None,
    prd_td: tuple[float, float] = (0.003, 0.007),
    max_iterations: int = 10,
    copy: bool = False,
) -> AnnData | None:
    """Apply invariant set normalization.

    This method identifies a set of invariant probes (probes that show consistent
    expression across samples) and uses them to fit a normalization curve via
    smoothing splines. The curve is then applied to all probes.

    Parameters
    ----------
    adata
        AnnData object with shape (n_samples, n_probes).
        Must contain intensity values in `.X`.
    baseline_type
        Method for creating the baseline array:
        - "mean": arithmetic mean across samples
        - "median": median across samples
        - "pseudo-mean": trimmed mean (10% trim)
        - "pseudo-median": weighted median (currently implemented as a plain median)
        - None: use the array at baseline_index
        Default is "mean".
    baseline_index
        Index of the sample to use as baseline (0-based).
        Only used if baseline_type is None.
    prd_td
        Tuple of (lower, upper) thresholds for identifying invariant probes
        based on rank consistency. Default is (0.003, 0.007).
    max_iterations
        Maximum iterations for identifying the invariant set. Default is 10.
    copy
        If True, return a copy of the AnnData object. If False, modify in place.

    Returns
    -------
    AnnData or None
        If `copy=True`, returns the normalized AnnData object.
        If `copy=False`, modifies `adata` in place and returns None.
        The normalized intensities are stored in `.X`.
        Normalization metadata is stored in `.uns['normalize_invariantset']`.

    Examples
    --------
    >>> import microarray as ma
    >>> adata = ma.io.cel_batch_to_anndata(cel_files, cdf_path, annotation_db)
    >>> adata_norm = ma.pp.normalize_invariantset(adata, baseline_type="median", copy=True)

    Notes
    -----
    The algorithm iteratively keeps probes whose |M| (log-difference from the
    baseline) rank fraction falls inside the `prd_td` band, then fits a cubic
    smoothing spline from baseline to sample over those probes and subtracts
    the resulting correction from every probe of the sample. All work happens
    on the log2(x + 1) scale and is transformed back at the end.

    Reference
    ---------
    Li, C. and Wong, W.H. (2001). Model-based analysis of oligonucleotide
    arrays: expression index computation and outlier detection.
    Proceedings of the National Academy of Sciences, 98(1), 31-36.
    """
    # Deferred import keeps scipy off the module import path until needed.
    from scipy.interpolate import UnivariateSpline

    adata = adata.copy() if copy else adata

    # Validate input
    if adata.X is None:
        raise ValueError("AnnData object must have .X attribute with intensity values")

    valid_baseline_types = ["mean", "median", "pseudo-mean", "pseudo-median", None]
    if baseline_type not in valid_baseline_types:
        raise ValueError(f"baseline_type must be one of {valid_baseline_types}, got '{baseline_type}'")

    n_samples, n_probes = adata.shape

    # Nothing to normalize against with a single array.
    if n_samples == 1:
        warnings.warn(
            "Invariant set normalization requires multiple samples. With only 1 sample, data is returned unchanged.",
            UserWarning,
            stacklevel=2,
        )
        adata.uns["normalize_invariantset"] = {
            "method": "invariantset",
            "n_samples": n_samples,
            "applied": False,
            "reason": "single_sample",
        }
        return adata if copy else None

    # Work on log scale; +1 avoids log(0).
    X_log = np.log2(adata.X + 1)

    # Create the baseline array the samples are normalized towards.
    if baseline_type is None:
        if baseline_index is None:
            raise ValueError("baseline_index must be provided when baseline_type is None")
        if baseline_index < 0 or baseline_index >= n_samples:
            raise ValueError(f"baseline_index must be between 0 and {n_samples - 1}, got {baseline_index}")
        baseline = X_log[baseline_index, :]
    elif baseline_type == "mean":
        baseline = np.mean(X_log, axis=0)
    elif baseline_type == "median":
        baseline = np.median(X_log, axis=0)
    elif baseline_type == "pseudo-mean":
        # Trimmed mean (removes top and bottom 10%).
        from scipy.stats import trim_mean

        baseline = trim_mean(X_log, proportiontocut=0.1, axis=0)
    elif baseline_type == "pseudo-median":
        # Plain median stands in for a weighted median here.
        baseline = np.median(X_log, axis=0)

    X_normalized = X_log.copy()
    # Per-sample record of which probes ended up in the invariant set.
    invariant_indices_list = []

    # Normalize each sample independently against the baseline.
    for i in range(n_samples):
        sample = X_log[i, :]

        # Start from all probes and iteratively shrink to the invariant set.
        invariant_mask = np.ones(n_probes, dtype=bool)

        for iter_num in range(max_iterations):
            # M = log-difference from baseline, restricted to current set.
            M = sample[invariant_mask] - baseline[invariant_mask]
            # A = (sample[invariant_mask] + baseline[invariant_mask]) / 2

            # Rank fraction of |M| within the current set (double argsort
            # yields each element's rank).
            M_abs = np.abs(M)
            ranks = np.argsort(np.argsort(M_abs))  # Ranks of |M|
            rank_fraction = ranks / len(ranks)

            # NOTE(review): this keeps only probes whose |M| rank fraction
            # lies INSIDE [prd_td[0], prd_td[1]] — probes below the lower
            # bound (the most invariant ones) are dropped too. The Li-Wong
            # reference method uses a rank-difference threshold interpolated
            # between these bounds instead; confirm this band is intended.
            keep_local = (rank_fraction >= prd_td[0]) & (rank_fraction <= prd_td[1])

            # Translate the local keep decision back to global probe indices.
            temp_mask = np.zeros(n_probes, dtype=bool)
            temp_mask[np.where(invariant_mask)[0][keep_local]] = True

            # Converged: the set no longer changes.
            if np.array_equal(temp_mask, invariant_mask):
                break

            invariant_mask = temp_mask

            # Guard against the set collapsing.
            if np.sum(invariant_mask) < 10:
                warnings.warn(
                    f"Sample {i}: Only {np.sum(invariant_mask)} invariant probes found. "
                    "Using previous mask or all probes.",
                    UserWarning,
                    stacklevel=2,
                )
                # NOTE(review): despite the warning text, no actual revert to
                # the previous mask happens on later iterations — the loop
                # exits with the shrunken mask still applied. Confirm intent.
                if iter_num > 0:
                    break
                else:
                    # First iteration already collapsed: fall back to all probes.
                    invariant_mask = np.ones(n_probes, dtype=bool)
                    break

        invariant_indices_list.append(np.where(invariant_mask)[0].tolist())

        # A cubic spline (k=3) needs a handful of points; below that, skip.
        n_invariant = np.sum(invariant_mask)
        if n_invariant < 5:
            warnings.warn(
                f"Sample {i}: Too few invariant probes ({n_invariant}). Skipping normalization for this sample.",
                UserWarning,
                stacklevel=2,
            )
            # Keep original values for this sample.
            continue

        # Fit the smoothing spline over the invariant probes only.
        baseline_inv = baseline[invariant_mask]
        sample_inv = sample[invariant_mask]

        # UnivariateSpline requires x strictly ordered; sort by baseline.
        sort_idx = np.argsort(baseline_inv)
        baseline_sorted = baseline_inv[sort_idx]
        sample_sorted = sample_inv[sort_idx]

        try:
            # s=None lets scipy pick the smoothing factor automatically.
            spline = UnivariateSpline(baseline_sorted, sample_sorted, s=None, k=3)

            # normalized = sample - (spline(baseline) - baseline), i.e. remove
            # the systematic sample-vs-baseline trend learned on the
            # invariant probes, applied to every probe.
            spline_values = spline(baseline)
            correction = spline_values - baseline
            X_normalized[i, :] = sample - correction

        except Exception as e:  # noqa: BLE001
            warnings.warn(
                f"Sample {i}: Spline fitting failed ({e}). Using linear interpolation instead.",
                UserWarning,
                stacklevel=2,
            )
            # Fallback: piecewise-linear version of the same correction.
            if len(baseline_sorted) >= 2:
                correction = np.interp(baseline, baseline_sorted, sample_sorted - baseline_sorted)
                X_normalized[i, :] = sample - correction
            else:
                # Not even two points to interpolate between: leave sample as-is.
                warnings.warn(
                    f"Sample {i}: Not enough points for interpolation. Skipping.",
                    UserWarning,
                    stacklevel=2,
                )

    # Convert back to original scale; clamp round-trip negatives to zero.
    X_normalized = np.power(2, X_normalized) - 1
    X_normalized = np.maximum(X_normalized, 0)

    # Update AnnData
    adata.X = X_normalized
    adata.uns["normalize_invariantset"] = {
        "method": "invariantset",
        "n_samples": n_samples,
        "n_probes": n_probes,
        "baseline_type": baseline_type,
        "baseline_index": baseline_index,
        "prd_td": prd_td,
        "max_iterations": max_iterations,
        "n_invariant_per_sample": [len(indices) for indices in invariant_indices_list],
        "applied": True,
    }

    return adata if copy else None
875
+
876
+
877
def normalize_qspline(
    adata: AnnData,
    target: np.ndarray | None = None,
    samples: float | int = 0.02,
    fit_iters: int = 5,
    smooth: bool = True,
    spar: float | None = None,
    copy: bool = False,
) -> AnnData | None:
    """Apply cubic spline (qspline) normalization using quantile mapping.

    This method fits cubic splines that map sample quantiles to target
    quantiles, providing a smooth transformation that normalizes the
    intensity distribution. Multiple spline fits with offset quantile
    grids are averaged to reduce variability.

    Parameters
    ----------
    adata
        AnnData object with shape (n_samples, n_probes).
        Must contain intensity values in `.X`.
    target
        Target distribution to normalize towards (length n_probes).
        If None, uses the geometric mean of all samples (default).
    samples
        Number of quantile points to use for spline fitting.
        If < 1, interpreted as sampling rate (e.g., 0.02 = 2% of data).
        If >= 1, interpreted as number of quantile points.
        Default is 0.02 (2% sampling rate).
    fit_iters
        Number of spline interpolations with offset quantile grids to
        average. More iterations provide smoother results. Default is 5.
    smooth
        If True, use smoothing splines. If False, use interpolating splines.
        Default is True.
    spar
        Smoothing parameter for splines (0 to 1).
        If None, automatically determined. Only used if smooth=True.
    copy
        If True, return a copy of the AnnData object. If False, modify in place.

    Returns
    -------
    AnnData or None
        If `copy=True`, returns normalized AnnData object.
        If `copy=False`, modifies `adata` in place and returns None.
        The normalized intensities are stored in `.X`.
        Normalization metadata is stored in `.uns['normalize_qspline']`.

    Raises
    ------
    ValueError
        If `.X` is missing, `samples` is not positive, `fit_iters` is less
        than 1, or `target` does not have length n_probes.

    Examples
    --------
    >>> import microarray as ma
    >>> adata = ma.io.cel_batch_to_anndata(cel_files, cdf_path, annotation_db)
    >>> adata_norm = ma.pp.normalize_qspline(adata, samples=0.05, copy=True)

    Notes
    -----
    Qspline normalization is flexible and can handle complex
    intensity-dependent biases. It's particularly useful when the
    transformation between samples is nonlinear.

    The algorithm:
    1. Compute target distribution (geometric mean if not provided)
    2. For each sample and each iteration:
       - Select quantile points (full-range grid, shifted by a sub-bin
         offset for each iteration)
       - Fit cubic spline mapping sample quantiles to target quantiles
       - Apply spline transformation
    3. Average the transformed values across iterations

    Reference
    ---------
    Workman, C., et al. (2002). A new non-linear normalization method for
    reducing variability in DNA microarray experiments.
    Genome Biology, 3(9), research0048.
    """
    from scipy.interpolate import CubicSpline, UnivariateSpline

    adata = adata.copy() if copy else adata

    # --- validation -------------------------------------------------------
    if adata.X is None:
        raise ValueError("AnnData object must have .X attribute with intensity values")

    if samples <= 0:
        raise ValueError(f"samples must be positive, got {samples}")

    if fit_iters < 1:
        raise ValueError(f"fit_iters must be at least 1, got {fit_iters}")

    n_samples, n_probes = adata.shape

    if n_samples == 1:
        warnings.warn(
            "Qspline normalization requires multiple samples. With only 1 sample, data is returned unchanged.",
            UserWarning,
            stacklevel=2,
        )
        adata.uns["normalize_qspline"] = {
            "method": "qspline",
            "n_samples": n_samples,
            "applied": False,
            "reason": "single_sample",
        }
        return adata if copy else None

    # Work on log scale; log2(x + 1) keeps zero intensities finite.
    X_log = np.log2(adata.X + 1)

    # Target distribution: the mean of log values is the log of the
    # geometric mean, so this matches the documented default.
    if target is None:
        target = np.mean(X_log, axis=0)
        target_provided = False
    else:
        target = np.asarray(target)
        if len(target) != n_probes:
            raise ValueError(f"target must have length {n_probes}, got {len(target)}")
        target = np.log2(target + 1)
        target_provided = True

    # Number of quantile anchor points for spline fitting.
    if samples < 1:
        n_quantiles = max(int(samples * n_probes), 10)
    else:
        n_quantiles = int(samples)

    X_normalized = np.zeros_like(X_log)

    for i in range(n_samples):
        sample = X_log[i, :]
        sample_transformed = np.zeros((fit_iters, n_probes))

        for iter_num in range(fit_iters):
            # BUG FIX: the previous scheme (offset = iter_num / fit_iters,
            # linspace(offset, 1 - offset / 2, n_quantiles)) produced a
            # descending, near-degenerate grid for later iterations (e.g.
            # fit_iters=5, iter 4 -> linspace(0.8, 0.6)), so most probes fell
            # outside the anchor range, were clipped, and mapped flatly.
            # Instead, shift a full-range grid by a fraction of one grid
            # spacing per iteration (Workman et al. 2002), so every
            # iteration covers the whole distribution.
            shift = iter_num / (fit_iters * n_quantiles)
            quantile_levels = np.linspace(shift, 1.0 - 1.0 / n_quantiles + shift, n_quantiles)
            quantile_levels = np.clip(quantile_levels, 0.001, 0.999)

            # Matched quantiles of the sample and the target distribution.
            sample_quantiles = np.quantile(sample, quantile_levels)
            target_quantiles = np.quantile(target, quantile_levels)

            # Splines require strictly increasing x: sort, then drop
            # (near-)duplicate sample quantiles.
            sort_idx = np.argsort(sample_quantiles)
            sample_quantiles = sample_quantiles[sort_idx]
            target_quantiles = target_quantiles[sort_idx]

            unique_mask = np.concatenate([[True], np.diff(sample_quantiles) > 1e-10])
            sample_quantiles = sample_quantiles[unique_mask]
            target_quantiles = target_quantiles[unique_mask]

            if len(sample_quantiles) < 4:
                # Too few unique anchors for a cubic spline; fall back to
                # piecewise-linear quantile mapping.
                sample_transformed[iter_num, :] = np.interp(sample, sample_quantiles, target_quantiles)
                continue

            # Fit the quantile-mapping spline.
            if smooth and spar is not None:
                try:
                    spline = UnivariateSpline(
                        sample_quantiles,
                        target_quantiles,
                        s=spar * len(sample_quantiles),
                        k=3,
                    )
                except Exception:  # noqa: BLE001
                    warnings.warn(
                        "Smoothing spline fitting failed. Falling back to cubic spline.",
                        UserWarning,
                        stacklevel=2,
                    )
                    spline = CubicSpline(sample_quantiles, target_quantiles)
            elif smooth:
                try:
                    spline = UnivariateSpline(sample_quantiles, target_quantiles, s=None, k=3)
                except Exception:  # noqa: BLE001
                    warnings.warn(
                        "Auto-smoothing spline fitting failed. Falling back to cubic spline.",
                        UserWarning,
                        stacklevel=2,
                    )
                    spline = CubicSpline(sample_quantiles, target_quantiles)
            else:
                # Interpolating cubic spline.
                spline = CubicSpline(sample_quantiles, target_quantiles)

            # Clip to the anchor range to avoid wild cubic extrapolation
            # beyond the fitted quantiles.
            sample_clipped = np.clip(sample, sample_quantiles[0], sample_quantiles[-1])
            sample_transformed[iter_num, :] = spline(sample_clipped)

        # Average the transformations across iterations.
        X_normalized[i, :] = np.mean(sample_transformed, axis=0)

    # Back to the original intensity scale; clamp small negatives produced
    # by the inverse log transform.
    X_normalized = np.power(2, X_normalized) - 1
    X_normalized = np.maximum(X_normalized, 0)

    adata.X = X_normalized
    adata.uns["normalize_qspline"] = {
        "method": "qspline",
        "n_samples": n_samples,
        "n_probes": n_probes,
        "n_quantiles": n_quantiles,
        "samples": samples,
        "fit_iters": fit_iters,
        "smooth": smooth,
        "spar": spar,
        "target_provided": target_provided,
        "applied": True,
    }

    return adata if copy else None
1098
+
1099
+
1100
+ def _select_rank_invariant_subset(X: np.ndarray, subset_size: int) -> np.ndarray:
1101
+ """Select probes with small rank-range across samples (maffy.subset).
1102
+
1103
+ Parameters
1104
+ ----------
1105
+ X
1106
+ Intensity matrix (n_samples, n_probes).
1107
+ subset_size
1108
+ Number of probes to select.
1109
+
1110
+ Returns:
1111
+ -------
1112
+ np.ndarray
1113
+ Boolean mask of selected probes.
1114
+ """
1115
+ n_samples, n_probes = X.shape
1116
+
1117
+ # Compute ranks for each sample
1118
+ ranks = np.zeros_like(X)
1119
+ for i in range(n_samples):
1120
+ ranks[i, :] = np.argsort(np.argsort(X[i, :]))
1121
+
1122
+ # Compute rank range for each probe
1123
+ rank_ranges = np.max(ranks, axis=0) - np.min(ranks, axis=0)
1124
+
1125
+ # Select probes with smallest rank range
1126
+ subset_indices = np.argsort(rank_ranges)[:subset_size]
1127
+ mask = np.zeros(n_probes, dtype=bool)
1128
+ mask[subset_indices] = True
1129
+
1130
+ return mask
1131
+
1132
+
1133
def normalize_contrasts(
    adata: AnnData,
    span: float = 2 / 3,
    choose_subset: bool = True,
    subset_size: int = 5000,
    copy: bool = False,
) -> AnnData | None:
    """Apply contrast-based normalization with rank-invariant probe selection.

    This method transforms the data to contrasts (differences from the first
    sample), applies loess smoothing to each contrast dimension, and removes
    the fitted trend. It optionally selects a subset of rank-invariant
    probes for fitting efficiency.

    Parameters
    ----------
    adata
        AnnData object with shape (n_samples, n_probes).
        Must contain intensity values in `.X`.
    span
        Fraction of data points to use for loess regression (0 < span <= 1).
        Larger values produce smoother curves. Default is 2/3.
    choose_subset
        If True, automatically select subset of rank-invariant probes.
        If False, use all probes. Default is True.
    subset_size
        Number of probes to use in subset (if choose_subset=True).
        Default is 5000.
    copy
        If True, return a copy of the AnnData object. If False, modify in place.

    Returns
    -------
    AnnData or None
        If `copy=True`, returns normalized AnnData object.
        If `copy=False`, modifies `adata` in place and returns None.
        The normalized intensities are stored in `.X`.
        Normalization metadata is stored in `.uns['normalize_contrasts']`.

    Raises
    ------
    ValueError
        If `.X` is missing or `span` is outside (0, 1].

    Examples
    --------
    >>> import microarray as ma
    >>> adata = ma.io.cel_batch_to_anndata(cel_files, cdf_path, annotation_db)
    >>> adata_norm = ma.pp.normalize_contrasts(adata, subset_size=3000, copy=True)

    Notes
    -----
    This is a complex method that normalizes using contrast transformations.
    It can be computationally expensive for large datasets.

    The algorithm:
    1. Transform to log scale
    2. Optionally select rank-invariant subset of probes
    3. Compute contrasts: C[i] = X[i] - X[0] for i > 0
    4. For each contrast, fit a loess trend against average intensity
    5. Subtract the fitted trend and transform back

    This implementation uses a simplified approach compared to the full
    multivariate loess in the R affy package, making it more practical
    for large datasets.

    Reference
    ---------
    See normalize.contrasts in the affy Bioconductor package and related
    maffy package documentation.
    """
    adata = adata.copy() if copy else adata

    # --- validation -------------------------------------------------------
    if adata.X is None:
        raise ValueError("AnnData object must have .X attribute with intensity values")

    if span <= 0 or span > 1:
        raise ValueError(f"span must be between 0 and 1, got {span}")

    n_samples, n_probes = adata.shape

    if n_samples == 1:
        warnings.warn(
            "Contrast normalization requires multiple samples. With only 1 sample, data is returned unchanged.",
            UserWarning,
            stacklevel=2,
        )
        adata.uns["normalize_contrasts"] = {
            "method": "contrasts",
            "n_samples": n_samples,
            "applied": False,
            "reason": "single_sample",
        }
        return adata if copy else None

    if n_samples > 20:
        warnings.warn(
            f"Contrast normalization with {n_samples} samples may be slow. "
            "Consider using a simpler method like quantile or loess normalization.",
            UserWarning,
            stacklevel=2,
        )

    # Work on log scale; log2(x + 1) keeps zero intensities finite.
    X_log = np.log2(adata.X + 1)

    # Restrict the loess fit to a rank-invariant subset when requested —
    # this speeds up fitting and avoids differentially expressed probes
    # dragging the trend.
    if choose_subset and subset_size < n_probes:
        subset_mask = _select_rank_invariant_subset(X_log, subset_size)
        n_subset = np.sum(subset_mask)
    else:
        subset_mask = np.ones(n_probes, dtype=bool)
        n_subset = n_probes

    # The first sample is the fixed reference for all contrasts.
    reference = X_log[0, :]

    # Average intensity per probe (the "A" axis of an MA plot).
    A_all = np.mean(X_log, axis=0)

    X_normalized = X_log.copy()

    # Normalize each non-reference sample.
    for i in range(1, n_samples):
        # Contrast ("M"): difference from the reference sample.
        contrast = X_log[i, :] - reference

        # Fit only on the (rank-invariant) subset, sorted by A as the
        # lowess fitter expects.
        contrast_subset = contrast[subset_mask]
        A_subset = A_all[subset_mask]

        sort_idx = np.argsort(A_subset)
        A_sorted = A_subset[sort_idx]
        contrast_sorted = contrast_subset[sort_idx]

        # Smooth intensity-dependent trend of the contrast.
        contrast_smooth = _lowess_fit(A_sorted, contrast_sorted, span=span)

        # Extend the fitted trend to every probe via interpolation on A.
        contrast_correction = np.interp(A_all, A_sorted, contrast_smooth)

        # BUG FIX: the previous code subtracted (fitted - observed), i.e.
        # X_log[i] - (contrast_correction - contrast), which expands to
        # 2*X_log[i] - reference - fitted: when the loess tracks the observed
        # contrast the sample was left unchanged (no normalization), and
        # probe-level deviations from the trend were amplified.  Removing the
        # fitted trend itself is the standard loess/contrast correction; it
        # preserves each probe's residual relative to the reference.
        X_normalized[i, :] = X_log[i, :] - contrast_correction

    # Back to the original intensity scale; clamp small negatives produced
    # by the inverse log transform.
    X_normalized = np.power(2, X_normalized) - 1
    X_normalized = np.maximum(X_normalized, 0)

    adata.X = X_normalized
    adata.uns["normalize_contrasts"] = {
        "method": "contrasts",
        "n_samples": n_samples,
        "n_probes": n_probes,
        "span": span,
        "choose_subset": choose_subset,
        "subset_size": subset_size,
        "n_subset_used": n_subset,
        "reference_index": 0,
        "applied": True,
    }

    return adata if copy else None