microarray 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- microarray/__init__.py +15 -0
- microarray/_version.py +3 -0
- microarray/datasets/__init__.py +3 -0
- microarray/datasets/_arrayexpress.py +1 -0
- microarray/datasets/_cdf_files.py +35 -0
- microarray/datasets/_geo.py +1 -0
- microarray/datasets/_utils.py +143 -0
- microarray/io/__init__.py +17 -0
- microarray/io/_anndata_converter.py +198 -0
- microarray/io/_cdf.py +575 -0
- microarray/io/_cel.py +591 -0
- microarray/io/_read.py +127 -0
- microarray/plotting/__init__.py +28 -0
- microarray/plotting/_base.py +253 -0
- microarray/plotting/_cel.py +75 -0
- microarray/plotting/_de_plots.py +239 -0
- microarray/plotting/_diagnostic_plots.py +268 -0
- microarray/plotting/_heatmap.py +279 -0
- microarray/plotting/_ma_plots.py +136 -0
- microarray/plotting/_pca.py +320 -0
- microarray/plotting/_qc_plots.py +335 -0
- microarray/plotting/_score.py +38 -0
- microarray/plotting/_top_table_heatmap.py +98 -0
- microarray/plotting/_utils.py +280 -0
- microarray/preprocessing/__init__.py +39 -0
- microarray/preprocessing/_background.py +862 -0
- microarray/preprocessing/_log2.py +77 -0
- microarray/preprocessing/_normalize.py +1292 -0
- microarray/preprocessing/_rma.py +243 -0
- microarray/preprocessing/_robust.py +170 -0
- microarray/preprocessing/_summarize.py +318 -0
- microarray/py.typed +0 -0
- microarray/tools/__init__.py +26 -0
- microarray/tools/_biomart.py +416 -0
- microarray/tools/_empirical_bayes.py +401 -0
- microarray/tools/_fdist.py +171 -0
- microarray/tools/_linear_models.py +387 -0
- microarray/tools/_mds.py +101 -0
- microarray/tools/_pca.py +88 -0
- microarray/tools/_score.py +86 -0
- microarray/tools/_toptable.py +360 -0
- microarray-0.1.0.dist-info/METADATA +75 -0
- microarray-0.1.0.dist-info/RECORD +44 -0
- microarray-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,1292 @@
|
|
|
1
|
+
"""Normalization methods for microarray data."""
|
|
2
|
+
|
|
3
|
+
import warnings
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from anndata import AnnData
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def normalize_quantile(
|
|
10
|
+
adata: AnnData,
|
|
11
|
+
copy: bool = False,
|
|
12
|
+
) -> AnnData | None:
|
|
13
|
+
"""Apply quantile normalization to microarray intensity data.
|
|
14
|
+
|
|
15
|
+
Quantile normalization makes the distribution of intensities identical
|
|
16
|
+
across all samples by:
|
|
17
|
+
1. Sorting intensity values for each sample
|
|
18
|
+
2. Replacing sorted values with the mean across samples at each rank
|
|
19
|
+
3. Restoring the original order within each sample
|
|
20
|
+
|
|
21
|
+
This ensures all samples have the same empirical distribution, removing
|
|
22
|
+
systematic technical variation while preserving biological differences.
|
|
23
|
+
|
|
24
|
+
Parameters
|
|
25
|
+
----------
|
|
26
|
+
adata
|
|
27
|
+
AnnData object with shape (n_samples, n_probes).
|
|
28
|
+
Must contain intensity values in `.X`.
|
|
29
|
+
copy
|
|
30
|
+
If True, return a copy of the AnnData object. If False, modify in place.
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
-------
|
|
34
|
+
AnnData or None
|
|
35
|
+
If `copy=True`, returns normalized AnnData object.
|
|
36
|
+
If `copy=False`, modifies `adata` in place and returns None.
|
|
37
|
+
The normalized intensities are stored in `.X`.
|
|
38
|
+
Normalization metadata is stored in `.uns['normalization']`.
|
|
39
|
+
|
|
40
|
+
Examples:
|
|
41
|
+
--------
|
|
42
|
+
>>> import microarray as ma
|
|
43
|
+
>>> adata = ma.io.cel_batch_to_anndata(cel_files, cdf_path, annotation_db)
|
|
44
|
+
>>> adata_norm = ma.pp.normalize_quantile(adata, copy=True)
|
|
45
|
+
|
|
46
|
+
Notes:
|
|
47
|
+
-----
|
|
48
|
+
Quantile normalization requires multiple samples. For a single sample,
|
|
49
|
+
a warning is issued and the data is returned unchanged.
|
|
50
|
+
|
|
51
|
+
The algorithm preserves the rank order of probes within each sample
|
|
52
|
+
while making the marginal distributions identical across samples.
|
|
53
|
+
|
|
54
|
+
Reference:
|
|
55
|
+
Bolstad, B.M., Irizarry, R.A., Astrand, M., Speed, T.P. (2003).
|
|
56
|
+
A comparison of normalization methods for high density oligonucleotide
|
|
57
|
+
array data based on variance and bias. Bioinformatics, 19(2), 185-193.
|
|
58
|
+
"""
|
|
59
|
+
adata = adata.copy() if copy else adata
|
|
60
|
+
|
|
61
|
+
# Validate input
|
|
62
|
+
if adata.X is None:
|
|
63
|
+
raise ValueError("AnnData object must have .X attribute with intensity values")
|
|
64
|
+
|
|
65
|
+
n_samples, n_probes = adata.shape
|
|
66
|
+
|
|
67
|
+
# Check for single sample case
|
|
68
|
+
if n_samples == 1:
|
|
69
|
+
warnings.warn(
|
|
70
|
+
"Quantile normalization requires multiple samples for proper normalization. "
|
|
71
|
+
"With only 1 sample, data is returned unchanged.",
|
|
72
|
+
UserWarning,
|
|
73
|
+
stacklevel=2,
|
|
74
|
+
)
|
|
75
|
+
adata.uns["normalization"] = {
|
|
76
|
+
"method": "quantile",
|
|
77
|
+
"n_samples": n_samples,
|
|
78
|
+
"applied": False,
|
|
79
|
+
"reason": "single_sample",
|
|
80
|
+
}
|
|
81
|
+
return adata if copy else None
|
|
82
|
+
|
|
83
|
+
# Get intensity matrix (samples × probes)
|
|
84
|
+
X = adata.X.copy()
|
|
85
|
+
|
|
86
|
+
# Step 1: Sort each sample (column)
|
|
87
|
+
X_sorted = np.sort(X, axis=1)
|
|
88
|
+
|
|
89
|
+
# Step 2: Compute mean across samples at each rank
|
|
90
|
+
rank_means = np.mean(X_sorted, axis=0)
|
|
91
|
+
|
|
92
|
+
# Step 3: Get ranks for each sample to restore original order
|
|
93
|
+
# argsort gives indices that would sort the array
|
|
94
|
+
# argsort(argsort) gives the rank of each element
|
|
95
|
+
ranks = np.argsort(np.argsort(X, axis=1), axis=1)
|
|
96
|
+
|
|
97
|
+
# Step 4: Replace with rank means
|
|
98
|
+
X_normalized = rank_means[ranks]
|
|
99
|
+
|
|
100
|
+
# Update AnnData
|
|
101
|
+
adata.X = X_normalized
|
|
102
|
+
adata.uns["normalization"] = {
|
|
103
|
+
"method": "quantile",
|
|
104
|
+
"n_samples": n_samples,
|
|
105
|
+
"n_probes": n_probes,
|
|
106
|
+
"applied": True,
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
return adata if copy else None
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def normalize_constant(
    adata: AnnData,
    refindex: int = 0,
    method: str = "mean",
    target_value: float | None = None,
    copy: bool = False,
) -> AnnData | None:
    """Apply constant (scale) normalization to microarray intensity data.

    Each sample is multiplied by a single scale factor so that its summary
    statistic (mean or median) equals either the statistic of a reference
    sample or an explicit target value:

        X_norm[i] = X[i] * (ref_constant / sample_constant[i])

    Parameters
    ----------
    adata
        AnnData object with shape (n_samples, n_probes) and intensity
        values in `.X`.
    refindex
        Index of the reference sample (0-based). Only used when
        `target_value` is None. Default is 0.
    method
        Statistic used as the scaling constant: "mean" or "median".
        Default is "mean".
    target_value
        If given, scale every sample so its statistic equals this value
        (e.g. 500 as in MAS5) instead of matching the reference sample.
    copy
        If True, return a normalized copy. If False, modify in place.

    Returns
    -------
    AnnData or None
        The normalized object when `copy=True`, else None. Metadata is
        recorded in `.uns['normalize_constant']`.

    Raises
    ------
    ValueError
        If `.X` is missing, `method` is unknown, `refindex` is out of
        range, or the reference constant is zero (which would scale all
        samples to zero).

    Notes
    -----
    This is the simplest normalization and assumes technical variation
    affects all probes uniformly; it cannot correct intensity-dependent
    bias. See `normalize.constant` in the affy Bioconductor package.
    """
    adata = adata.copy() if copy else adata

    if adata.X is None:
        raise ValueError("AnnData object must have .X attribute with intensity values")

    if method not in ["mean", "median"]:
        raise ValueError(f"method must be 'mean' or 'median', got '{method}'")

    n_samples, n_probes = adata.shape

    if refindex < 0 or refindex >= n_samples:
        raise ValueError(f"refindex must be between 0 and {n_samples - 1}, got {refindex}")

    X = adata.X.copy()

    # Per-sample scaling constant.
    if method == "mean":
        constants = np.mean(X, axis=1)
    else:  # median
        constants = np.median(X, axis=1)

    # Reference constant: explicit target, or the chosen sample's statistic.
    ref_constant = target_value if target_value is not None else constants[refindex]

    # BUG FIX: previously the reference constant was taken before zero
    # constants were patched, so a zero-constant reference sample silently
    # scaled every sample to zero. Fail loudly instead.
    if ref_constant == 0:
        raise ValueError("Reference constant is zero; cannot scale samples to a zero target.")

    # Samples with a zero constant cannot be scaled meaningfully; their
    # constant is treated as 1 to avoid division by zero.
    if np.any(constants == 0):
        warnings.warn(
            "Some samples have zero constant value. These samples will not be normalized.",
            UserWarning,
            stacklevel=2,
        )
        constants = np.where(constants == 0, 1.0, constants)

    scale_factors = ref_constant / constants

    # Apply the per-sample factor to every probe and record the run.
    adata.X = X * scale_factors[:, np.newaxis]
    adata.uns["normalize_constant"] = {
        "method": method,
        "refindex": refindex if target_value is None else None,
        "target_value": float(target_value) if target_value is not None else None,
        "ref_constant": float(ref_constant),
        "scale_factors": scale_factors.tolist(),
        "n_samples": n_samples,
        "n_probes": n_probes,
        "applied": True,
    }

    return adata if copy else None
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def normalize_quantile_robust(
    adata: AnnData,
    weights: np.ndarray | None = None,
    remove_extreme: str = "none",
    n_remove: int = 0,
    use_median: bool = False,
    use_log2: bool = False,
    copy: bool = False,
) -> AnnData | None:
    """Quantile normalization with per-chip weights and outlier exclusion.

    Works like plain quantile normalization, except that the target
    distribution can be a weighted mean (or plain median) of the
    per-rank values, and samples flagged as extreme — by variance and/or
    mean — can be excluded from the target computation. Excluded samples
    are still normalized; they simply do not contribute to the target.

    Parameters
    ----------
    adata
        AnnData of shape (n_samples, n_probes) with intensities in `.X`.
    weights
        Per-sample weights (length n_samples); equal weighting when None.
    remove_extreme
        Criterion for flagging extreme samples: "none" (default),
        "variance", "mean", or "both".
    n_remove
        How many extreme samples to drop from the target computation
        (default 0). Only used when `remove_extreme` is not "none".
    use_median
        Use the (unweighted) median instead of the weighted mean for the
        target distribution.
    use_log2
        Perform the normalization on log2(x + 1) values and transform
        back afterwards.
    copy
        When True, operate on (and return) a copy; otherwise modify
        `adata` in place and return None.

    Returns
    -------
    AnnData or None
        The normalized object when `copy=True`, else None. Run details
        are stored in `.uns['normalize_quantile_robust']`.

    Notes
    -----
    Ported from the experimental ``normalize.quantiles.robust`` routine
    of the affy Bioconductor package; a warning to that effect is always
    emitted.
    """
    adata = adata.copy() if copy else adata

    if adata.X is None:
        raise ValueError("AnnData object must have .X attribute with intensity values")

    if remove_extreme not in ["none", "variance", "mean", "both"]:
        raise ValueError(f"remove_extreme must be one of 'none', 'variance', 'mean', 'both', got '{remove_extreme}'")

    n_samples, n_probes = adata.shape

    # Always flag the experimental status of this port.
    warnings.warn(
        "Robust quantile normalization is marked as experimental. Use with caution and validate results.",
        UserWarning,
        stacklevel=2,
    )

    if n_samples == 1:
        warnings.warn(
            "Quantile normalization requires multiple samples. With only 1 sample, data is returned unchanged.",
            UserWarning,
            stacklevel=2,
        )
        adata.uns["normalize_quantile_robust"] = {
            "method": "quantile_robust",
            "n_samples": n_samples,
            "applied": False,
            "reason": "single_sample",
        }
        return adata if copy else None

    values = adata.X.copy()

    # Optionally move to log2 scale (+1 offset guards against log(0)).
    if use_log2:
        values = np.log2(values + 1)

    if weights is None:
        weights = np.ones(n_samples)
    else:
        weights = np.array(weights)
        if len(weights) != n_samples:
            raise ValueError(f"weights must have length {n_samples}, got {len(weights)}")

    # Flag samples whose statistics look extreme; they are dropped from
    # the target distribution but still normalized themselves.
    dropped = np.zeros(n_samples, dtype=bool)

    if remove_extreme != "none" and n_remove > 0:
        if n_remove >= n_samples:
            raise ValueError(f"n_remove ({n_remove}) must be less than n_samples ({n_samples})")

        if remove_extreme in ["variance", "both"]:
            by_variance = np.argsort(np.var(values, axis=1))[::-1]  # highest variance first
            dropped[by_variance[:n_remove]] = True

        if remove_extreme in ["mean", "both"]:
            sample_means = np.mean(values, axis=1)
            # "Extreme" means furthest from the median of the sample means.
            deviation = np.abs(sample_means - np.median(sample_means))
            by_deviation = np.argsort(deviation)[::-1]
            dropped[by_deviation[:n_remove]] = True

    kept = ~dropped
    n_included = int(np.sum(kept))

    if n_included == 0:
        raise ValueError("All samples would be excluded. Reduce n_remove.")

    values_sorted = np.sort(values, axis=1)

    # Target distribution over the kept samples only.
    if use_median:
        # A true weighted median is nontrivial; the plain median is used.
        per_rank_target = np.median(values_sorted[kept], axis=0)
    else:
        kept_weights = weights[kept]
        weighted_sum = np.sum(values_sorted[kept] * kept_weights[:, np.newaxis], axis=0)
        per_rank_target = weighted_sum / np.sum(kept_weights)

    # Map the target distribution back to each sample's original order
    # (double argsort gives within-sample ranks).
    order = np.argsort(np.argsort(values, axis=1), axis=1)
    result = per_rank_target[order]

    if use_log2:
        result = np.power(2, result) - 1
        result = np.maximum(result, 0)  # clamp tiny negative round-off

    adata.X = result
    adata.uns["normalize_quantile_robust"] = {
        "method": "quantile_robust",
        "n_samples": n_samples,
        "n_probes": n_probes,
        "n_included": n_included,
        "n_excluded": int(np.sum(dropped)),
        "remove_extreme": remove_extreme,
        "n_remove": n_remove,
        "use_median": use_median,
        "use_log2": use_log2,
        "excluded_indices": np.where(dropped)[0].tolist(),
        "applied": True,
    }

    return adata if copy else None
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def _lowess_fit(x: np.ndarray, y: np.ndarray, span: float = 2 / 3) -> np.ndarray:
|
|
423
|
+
"""Simple LOWESS (Locally Weighted Scatterplot Smoothing) implementation.
|
|
424
|
+
|
|
425
|
+
Parameters
|
|
426
|
+
----------
|
|
427
|
+
x
|
|
428
|
+
Independent variable values (sorted).
|
|
429
|
+
y
|
|
430
|
+
Dependent variable values.
|
|
431
|
+
span
|
|
432
|
+
Fraction of data to use for local regression (0 < span <= 1).
|
|
433
|
+
|
|
434
|
+
Returns:
|
|
435
|
+
-------
|
|
436
|
+
np.ndarray
|
|
437
|
+
Smoothed y values.
|
|
438
|
+
"""
|
|
439
|
+
n = len(x)
|
|
440
|
+
y_smooth = np.zeros(n)
|
|
441
|
+
k = int(np.ceil(span * n))
|
|
442
|
+
|
|
443
|
+
for i in range(n):
|
|
444
|
+
# Get local neighborhood
|
|
445
|
+
distances = np.abs(x - x[i])
|
|
446
|
+
nearest = np.argsort(distances)[:k]
|
|
447
|
+
|
|
448
|
+
# Compute weights using tricube kernel
|
|
449
|
+
max_dist = distances[nearest[-1]]
|
|
450
|
+
if max_dist > 0:
|
|
451
|
+
weights = (1 - (distances[nearest] / max_dist) ** 3) ** 3
|
|
452
|
+
else:
|
|
453
|
+
weights = np.ones(k)
|
|
454
|
+
|
|
455
|
+
# Weighted linear regression
|
|
456
|
+
X_local = np.column_stack([np.ones(k), x[nearest]])
|
|
457
|
+
W = np.diag(weights)
|
|
458
|
+
try:
|
|
459
|
+
beta = np.linalg.lstsq(X_local.T @ W @ X_local, X_local.T @ W @ y[nearest], rcond=None)[0]
|
|
460
|
+
y_smooth[i] = beta[0] + beta[1] * x[i]
|
|
461
|
+
except np.linalg.LinAlgError:
|
|
462
|
+
# Fallback to weighted mean if regression fails
|
|
463
|
+
y_smooth[i] = np.average(y[nearest], weights=weights)
|
|
464
|
+
|
|
465
|
+
return y_smooth
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def normalize_loess(
    adata: AnnData,
    subset: int = 5000,
    span: float = 2 / 3,
    iterations: int = 1,
    epsilon: float = 0.01,
    copy: bool = False,
) -> AnnData | None:
    """Apply loess (locally weighted scatterplot smoothing) normalization.

    Uses MA values (M = log-ratio against a median reference array,
    A = average log-intensity) to fit a smooth curve per sample and
    subtract the intensity-dependent bias. The curve is fit on a probe
    subset for speed and extended to all probes by interpolation; the
    whole procedure is repeated up to `iterations` times or until the
    maximum change drops below `epsilon`.

    Parameters
    ----------
    adata
        AnnData of shape (n_samples, n_probes) with intensities in `.X`.
    subset
        Number of probes used for fitting the loess curve; all probes
        are used when subset >= n_probes. Default is 5000.
    span
        Fraction of points for each local regression (0 < span <= 1).
        Larger values give smoother curves. Default is 2/3.
    iterations
        Maximum number of normalization passes. Default is 1.
    epsilon
        Convergence threshold on the maximum absolute change between
        passes. Default is 0.01.
    copy
        If True, return a normalized copy. If False, modify in place.

    Returns
    -------
    AnnData or None
        The normalized object when `copy=True`, else None. Metadata —
        including the actual number of passes performed and whether the
        last pass converged — is stored in `.uns['normalize_loess']`.

    Raises
    ------
    ValueError
        If `.X` is missing, `span` is outside (0, 1], or `iterations < 1`.

    Notes
    -----
    Fixes over the previous revision: `iterations_performed` now reports
    the actual pass count (early convergence used to be hidden),
    `converged` is reported for single-iteration runs too, and the probe
    subset is drawn from a local RNG instead of mutating NumPy's global
    random state.

    Reference: Yang, Y.H., et al. (2002). Nucleic Acids Research, 30(4), e15.
    """
    adata = adata.copy() if copy else adata

    if adata.X is None:
        raise ValueError("AnnData object must have .X attribute with intensity values")

    if span <= 0 or span > 1:
        raise ValueError(f"span must be between 0 and 1, got {span}")

    if iterations < 1:
        raise ValueError(f"iterations must be at least 1, got {iterations}")

    n_samples, n_probes = adata.shape

    if n_samples == 1:
        warnings.warn(
            "Loess normalization requires multiple samples. With only 1 sample, data is returned unchanged.",
            UserWarning,
            stacklevel=2,
        )
        adata.uns["normalize_loess"] = {
            "method": "loess",
            "n_samples": n_samples,
            "applied": False,
            "reason": "single_sample",
        }
        return adata if copy else None

    # Work on log scale (+1 guards against log(0)).
    X_normalized = np.log2(adata.X + 1)

    # Fit on a fixed random subset of probes for speed. A local Generator
    # keeps the selection reproducible without clobbering the global
    # NumPy random state (the previous revision called np.random.seed).
    n_subset = min(subset, n_probes)
    if n_subset < n_probes:
        rng = np.random.default_rng(42)
        subset_indices = np.sort(rng.choice(n_probes, n_subset, replace=False))
    else:
        subset_indices = np.arange(n_probes)

    iterations_performed = 0
    max_change = None
    for _ in range(iterations):
        iterations_performed += 1
        X_prev = X_normalized.copy()

        # Reference array: per-probe median across samples.
        reference = np.median(X_normalized, axis=0)

        for i in range(n_samples):
            # M (log-ratio) and A (average log-intensity) on the subset.
            M_subset = X_normalized[i, subset_indices] - reference[subset_indices]
            A_subset = (X_normalized[i, subset_indices] + reference[subset_indices]) / 2

            # _lowess_fit expects x in ascending order.
            sort_idx = np.argsort(A_subset)
            A_sorted = A_subset[sort_idx]
            M_smooth = _lowess_fit(A_sorted, M_subset[sort_idx], span=span)

            # Extend the fitted curve to every probe by interpolation and
            # subtract the intensity-dependent bias.
            A_all = (X_normalized[i, :] + reference) / 2
            X_normalized[i, :] = X_normalized[i, :] - np.interp(A_all, A_sorted, M_smooth)

        max_change = np.max(np.abs(X_normalized - X_prev))
        if max_change < epsilon:
            break

    # Back to the original scale, clamping round-off negatives.
    X_normalized = np.power(2, X_normalized) - 1
    X_normalized = np.maximum(X_normalized, 0)

    adata.X = X_normalized
    adata.uns["normalize_loess"] = {
        "method": "loess",
        "n_samples": n_samples,
        "n_probes": n_probes,
        "subset": n_subset,
        "span": span,
        "iterations": iterations,
        "iterations_performed": iterations_performed,  # actual passes (may stop early)
        "epsilon": epsilon,
        "converged": bool(max_change < epsilon),
        "applied": True,
    }

    return adata if copy else None
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
def normalize_invariantset(
|
|
638
|
+
adata: AnnData,
|
|
639
|
+
baseline_type: str = "mean",
|
|
640
|
+
baseline_index: int | None = None,
|
|
641
|
+
prd_td: tuple[float, float] = (0.003, 0.007),
|
|
642
|
+
max_iterations: int = 10,
|
|
643
|
+
copy: bool = False,
|
|
644
|
+
) -> AnnData | None:
|
|
645
|
+
"""Apply invariant set normalization.
|
|
646
|
+
|
|
647
|
+
This method identifies a set of invariant probes (probes that show consistent
|
|
648
|
+
expression across samples) and uses them to fit a normalization curve via
|
|
649
|
+
smoothing splines. The curve is then applied to all probes.
|
|
650
|
+
|
|
651
|
+
Parameters
|
|
652
|
+
----------
|
|
653
|
+
adata
|
|
654
|
+
AnnData object with shape (n_samples, n_probes).
|
|
655
|
+
Must contain intensity values in `.X`.
|
|
656
|
+
baseline_type
|
|
657
|
+
Method for creating baseline array:
|
|
658
|
+
- "mean": arithmetic mean across samples
|
|
659
|
+
- "median": median across samples
|
|
660
|
+
- "pseudo-mean": trimmed mean (10% trim)
|
|
661
|
+
- "pseudo-median": weighted median
|
|
662
|
+
- None: use array at baseline_index
|
|
663
|
+
Default is "mean".
|
|
664
|
+
baseline_index
|
|
665
|
+
Index of sample to use as baseline (0-based).
|
|
666
|
+
Only used if baseline_type is None.
|
|
667
|
+
prd_td
|
|
668
|
+
Tuple of (lower, upper) thresholds for identifying invariant probes
|
|
669
|
+
based on rank consistency. Default is (0.003, 0.007).
|
|
670
|
+
max_iterations
|
|
671
|
+
Maximum iterations for identifying invariant set. Default is 10.
|
|
672
|
+
copy
|
|
673
|
+
If True, return a copy of the AnnData object. If False, modify in place.
|
|
674
|
+
|
|
675
|
+
Returns:
|
|
676
|
+
-------
|
|
677
|
+
AnnData or None
|
|
678
|
+
If `copy=True`, returns normalized AnnData object.
|
|
679
|
+
If `copy=False`, modifies `adata` in place and returns None.
|
|
680
|
+
The normalized intensities are stored in `.X`.
|
|
681
|
+
Normalization metadata is stored in `.uns['normalize_invariantset']`.
|
|
682
|
+
|
|
683
|
+
Examples:
|
|
684
|
+
--------
|
|
685
|
+
>>> import microarray as ma
|
|
686
|
+
>>> adata = ma.io.cel_batch_to_anndata(cel_files, cdf_path, annotation_db)
|
|
687
|
+
>>> adata_norm = ma.pp.normalize_invariantset(adata, baseline_type="median", copy=True)
|
|
688
|
+
|
|
689
|
+
Notes:
|
|
690
|
+
-----
|
|
691
|
+
The algorithm iteratively identifies probes that have consistent ranks
|
|
692
|
+
across arrays. These invariant probes are assumed to not be differentially
|
|
693
|
+
expressed and are used to learn the normalization transformation.
|
|
694
|
+
|
|
695
|
+
A smoothing spline is fit to the invariant probes' intensities in the
|
|
696
|
+
(baseline, sample) space, and this curve is used to normalize all probes.
|
|
697
|
+
|
|
698
|
+
Reference
|
|
699
|
+
-------
|
|
700
|
+
Li, C. and Wong, W.H. (2001). Model-based analysis of oligonucleotide
|
|
701
|
+
arrays: expression index computation and outlier detection.
|
|
702
|
+
Proceedings of the National Academy of Sciences, 98(1), 31-36.
|
|
703
|
+
"""
|
|
704
|
+
from scipy.interpolate import UnivariateSpline
|
|
705
|
+
|
|
706
|
+
adata = adata.copy() if copy else adata
|
|
707
|
+
|
|
708
|
+
# Validate input
|
|
709
|
+
if adata.X is None:
|
|
710
|
+
raise ValueError("AnnData object must have .X attribute with intensity values")
|
|
711
|
+
|
|
712
|
+
valid_baseline_types = ["mean", "median", "pseudo-mean", "pseudo-median", None]
|
|
713
|
+
if baseline_type not in valid_baseline_types:
|
|
714
|
+
raise ValueError(f"baseline_type must be one of {valid_baseline_types}, got '{baseline_type}'")
|
|
715
|
+
|
|
716
|
+
n_samples, n_probes = adata.shape
|
|
717
|
+
|
|
718
|
+
if n_samples == 1:
|
|
719
|
+
warnings.warn(
|
|
720
|
+
"Invariant set normalization requires multiple samples. With only 1 sample, data is returned unchanged.",
|
|
721
|
+
UserWarning,
|
|
722
|
+
stacklevel=2,
|
|
723
|
+
)
|
|
724
|
+
adata.uns["normalize_invariantset"] = {
|
|
725
|
+
"method": "invariantset",
|
|
726
|
+
"n_samples": n_samples,
|
|
727
|
+
"applied": False,
|
|
728
|
+
"reason": "single_sample",
|
|
729
|
+
}
|
|
730
|
+
return adata if copy else None
|
|
731
|
+
|
|
732
|
+
# Work on log scale
|
|
733
|
+
X_log = np.log2(adata.X + 1)
|
|
734
|
+
|
|
735
|
+
# Create baseline
|
|
736
|
+
if baseline_type is None:
|
|
737
|
+
if baseline_index is None:
|
|
738
|
+
raise ValueError("baseline_index must be provided when baseline_type is None")
|
|
739
|
+
if baseline_index < 0 or baseline_index >= n_samples:
|
|
740
|
+
raise ValueError(f"baseline_index must be between 0 and {n_samples - 1}, got {baseline_index}")
|
|
741
|
+
baseline = X_log[baseline_index, :]
|
|
742
|
+
elif baseline_type == "mean":
|
|
743
|
+
baseline = np.mean(X_log, axis=0)
|
|
744
|
+
elif baseline_type == "median":
|
|
745
|
+
baseline = np.median(X_log, axis=0)
|
|
746
|
+
elif baseline_type == "pseudo-mean":
|
|
747
|
+
# Trimmed mean (remove top and bottom 10%)
|
|
748
|
+
from scipy.stats import trim_mean
|
|
749
|
+
|
|
750
|
+
baseline = trim_mean(X_log, proportiontocut=0.1, axis=0)
|
|
751
|
+
elif baseline_type == "pseudo-median":
|
|
752
|
+
# Use median for simplicity
|
|
753
|
+
baseline = np.median(X_log, axis=0)
|
|
754
|
+
|
|
755
|
+
X_normalized = X_log.copy()
|
|
756
|
+
invariant_indices_list = []
|
|
757
|
+
|
|
758
|
+
# Normalize each sample
|
|
759
|
+
for i in range(n_samples):
|
|
760
|
+
sample = X_log[i, :]
|
|
761
|
+
|
|
762
|
+
# Iteratively identify invariant set
|
|
763
|
+
invariant_mask = np.ones(n_probes, dtype=bool)
|
|
764
|
+
|
|
765
|
+
for iter_num in range(max_iterations):
|
|
766
|
+
# Compute M and A
|
|
767
|
+
M = sample[invariant_mask] - baseline[invariant_mask]
|
|
768
|
+
# A = (sample[invariant_mask] + baseline[invariant_mask]) / 2
|
|
769
|
+
|
|
770
|
+
# Compute rank-based criterion
|
|
771
|
+
M_abs = np.abs(M)
|
|
772
|
+
ranks = np.argsort(np.argsort(M_abs)) # Ranks of |M|
|
|
773
|
+
rank_fraction = ranks / len(ranks)
|
|
774
|
+
|
|
775
|
+
# Keep probes with small |M| ranks (between prd_td thresholds)
|
|
776
|
+
keep_local = (rank_fraction >= prd_td[0]) & (rank_fraction <= prd_td[1])
|
|
777
|
+
|
|
778
|
+
# Update invariant mask
|
|
779
|
+
temp_mask = np.zeros(n_probes, dtype=bool)
|
|
780
|
+
temp_mask[np.where(invariant_mask)[0][keep_local]] = True
|
|
781
|
+
|
|
782
|
+
# Check convergence
|
|
783
|
+
if np.array_equal(temp_mask, invariant_mask):
|
|
784
|
+
break
|
|
785
|
+
|
|
786
|
+
invariant_mask = temp_mask
|
|
787
|
+
|
|
788
|
+
# Ensure we have enough invariant probes
|
|
789
|
+
if np.sum(invariant_mask) < 10:
|
|
790
|
+
warnings.warn(
|
|
791
|
+
f"Sample {i}: Only {np.sum(invariant_mask)} invariant probes found. "
|
|
792
|
+
"Using previous mask or all probes.",
|
|
793
|
+
UserWarning,
|
|
794
|
+
stacklevel=2,
|
|
795
|
+
)
|
|
796
|
+
# Revert to previous mask if we have too few probes
|
|
797
|
+
if iter_num > 0:
|
|
798
|
+
break
|
|
799
|
+
else:
|
|
800
|
+
# Use all probes on first iteration if we have too few
|
|
801
|
+
invariant_mask = np.ones(n_probes, dtype=bool)
|
|
802
|
+
break
|
|
803
|
+
|
|
804
|
+
invariant_indices_list.append(np.where(invariant_mask)[0].tolist())
|
|
805
|
+
|
|
806
|
+
# Check if we have enough probes for spline fitting
|
|
807
|
+
n_invariant = np.sum(invariant_mask)
|
|
808
|
+
if n_invariant < 5:
|
|
809
|
+
warnings.warn(
|
|
810
|
+
f"Sample {i}: Too few invariant probes ({n_invariant}). Skipping normalization for this sample.",
|
|
811
|
+
UserWarning,
|
|
812
|
+
stacklevel=2,
|
|
813
|
+
)
|
|
814
|
+
# Keep original values for this sample
|
|
815
|
+
continue
|
|
816
|
+
|
|
817
|
+
# Fit smoothing spline to invariant probes
|
|
818
|
+
baseline_inv = baseline[invariant_mask]
|
|
819
|
+
sample_inv = sample[invariant_mask]
|
|
820
|
+
|
|
821
|
+
# Sort by baseline for spline fitting
|
|
822
|
+
sort_idx = np.argsort(baseline_inv)
|
|
823
|
+
baseline_sorted = baseline_inv[sort_idx]
|
|
824
|
+
sample_sorted = sample_inv[sort_idx]
|
|
825
|
+
|
|
826
|
+
# Fit smoothing spline
|
|
827
|
+
try:
|
|
828
|
+
# Use smoothing spline (s=None for automatic smoothing parameter)
|
|
829
|
+
spline = UnivariateSpline(baseline_sorted, sample_sorted, s=None, k=3)
|
|
830
|
+
|
|
831
|
+
# Apply normalization: correct sample to match baseline
|
|
832
|
+
# Correction is: normalized = baseline + (sample - spline(baseline))
|
|
833
|
+
# But this is equivalent to: normalized = sample - (spline(baseline) - baseline)
|
|
834
|
+
spline_values = spline(baseline)
|
|
835
|
+
correction = spline_values - baseline
|
|
836
|
+
X_normalized[i, :] = sample - correction
|
|
837
|
+
|
|
838
|
+
except Exception as e: # noqa: BLE001
|
|
839
|
+
warnings.warn(
|
|
840
|
+
f"Sample {i}: Spline fitting failed ({e}). Using linear interpolation instead.",
|
|
841
|
+
UserWarning,
|
|
842
|
+
stacklevel=2,
|
|
843
|
+
)
|
|
844
|
+
# Fallback to linear interpolation
|
|
845
|
+
if len(baseline_sorted) >= 2:
|
|
846
|
+
correction = np.interp(baseline, baseline_sorted, sample_sorted - baseline_sorted)
|
|
847
|
+
X_normalized[i, :] = sample - correction
|
|
848
|
+
else:
|
|
849
|
+
# Not enough points, skip normalization
|
|
850
|
+
warnings.warn(
|
|
851
|
+
f"Sample {i}: Not enough points for interpolation. Skipping.",
|
|
852
|
+
UserWarning,
|
|
853
|
+
stacklevel=2,
|
|
854
|
+
)
|
|
855
|
+
|
|
856
|
+
# Convert back to original scale
|
|
857
|
+
X_normalized = np.power(2, X_normalized) - 1
|
|
858
|
+
X_normalized = np.maximum(X_normalized, 0) # Ensure non-negative
|
|
859
|
+
|
|
860
|
+
# Update AnnData
|
|
861
|
+
adata.X = X_normalized
|
|
862
|
+
adata.uns["normalize_invariantset"] = {
|
|
863
|
+
"method": "invariantset",
|
|
864
|
+
"n_samples": n_samples,
|
|
865
|
+
"n_probes": n_probes,
|
|
866
|
+
"baseline_type": baseline_type,
|
|
867
|
+
"baseline_index": baseline_index,
|
|
868
|
+
"prd_td": prd_td,
|
|
869
|
+
"max_iterations": max_iterations,
|
|
870
|
+
"n_invariant_per_sample": [len(indices) for indices in invariant_indices_list],
|
|
871
|
+
"applied": True,
|
|
872
|
+
}
|
|
873
|
+
|
|
874
|
+
return adata if copy else None
|
|
875
|
+
|
|
876
|
+
|
|
877
|
+
def normalize_qspline(
    adata: AnnData,
    target: np.ndarray | None = None,
    samples: float | int = 0.02,
    fit_iters: int = 5,
    smooth: bool = True,
    spar: float | None = None,
    copy: bool = False,
) -> AnnData | None:
    """Apply cubic spline (qspline) normalization using quantile mapping.

    This method fits cubic splines to map sample quantiles to target quantiles,
    providing a smooth transformation that normalizes the intensity distribution.
    Multiple spline fits on slightly shifted quantile grids are averaged to
    reduce variability.

    Parameters
    ----------
    adata
        AnnData object with shape (n_samples, n_probes).
        Must contain intensity values in `.X`.
    target
        Target distribution to normalize towards (length n_probes).
        If None, uses the geometric mean of all samples (default).
    samples
        Number of quantile points to use for spline fitting.
        If < 1, interpreted as sampling rate (e.g., 0.02 = 2% of data).
        If >= 1, interpreted as number of quantile points.
        Default is 0.02 (2% sampling rate).
    fit_iters
        Number of spline interpolations with offsets to average.
        More iterations provide smoother results. Default is 5.
    smooth
        If True, use smoothing splines. If False, use interpolating splines.
        Default is True.
    spar
        Smoothing parameter for splines (0 to 1).
        If None, automatically determined. Only used if smooth=True.
    copy
        If True, return a copy of the AnnData object. If False, modify in place.

    Returns:
    -------
    AnnData or None
        If `copy=True`, returns normalized AnnData object.
        If `copy=False`, modifies `adata` in place and returns None.
        The normalized intensities are stored in `.X`.
        Normalization metadata is stored in `.uns['normalize_qspline']`.

    Examples:
    --------
    >>> import microarray as ma
    >>> adata = ma.io.cel_batch_to_anndata(cel_files, cdf_path, annotation_db)
    >>> adata_norm = ma.pp.normalize_qspline(adata, samples=0.05, copy=True)

    Notes:
    -----
    Qspline normalization is flexible and can handle complex intensity-dependent
    biases. It's particularly useful when the transformation between samples is
    nonlinear.

    The algorithm:
    1. Compute target distribution (geometric mean if not provided)
    2. For each sample and each iteration:
       - Select an evenly spaced quantile grid, shifted by a sub-grid offset
         (a fraction of the spacing between adjacent quantile points)
       - Fit cubic spline mapping sample quantiles to target quantiles
       - Apply spline transformation
       - Average across iterations
    3. Apply averaged transformation to all probe intensities

    Reference
    -------
    Workman, C., et al. (2002). A new non-linear normalization method for
    reducing variability in DNA microarray experiments.
    Genome Biology, 3(9), research0048.
    """
    from scipy.interpolate import CubicSpline, UnivariateSpline

    adata = adata.copy() if copy else adata

    # Validate input
    if adata.X is None:
        raise ValueError("AnnData object must have .X attribute with intensity values")

    if samples <= 0:
        raise ValueError(f"samples must be positive, got {samples}")

    if fit_iters < 1:
        raise ValueError(f"fit_iters must be at least 1, got {fit_iters}")

    n_samples, n_probes = adata.shape

    if n_samples == 1:
        warnings.warn(
            "Qspline normalization requires multiple samples. With only 1 sample, data is returned unchanged.",
            UserWarning,
            stacklevel=2,
        )
        adata.uns["normalize_qspline"] = {
            "method": "qspline",
            "n_samples": n_samples,
            "applied": False,
            "reason": "single_sample",
        }
        return adata if copy else None

    # Work on log scale
    X_log = np.log2(adata.X + 1)

    # Compute target distribution if not provided (geometric mean)
    if target is None:
        # Geometric mean on the original scale == arithmetic mean on log scale
        target = np.mean(X_log, axis=0)
        target_provided = False
    else:
        target = np.asarray(target)
        # Validate length BEFORE transforming, so a bad target fails early
        if len(target) != n_probes:
            raise ValueError(f"target must have length {n_probes}, got {len(target)}")
        target = np.log2(target + 1)
        target_provided = True

    # Determine number of quantile points
    if samples < 1:
        n_quantiles = max(int(samples * n_probes), 10)
    else:
        n_quantiles = int(samples)

    # Evenly spaced quantile grid (midpoints of n_quantiles equal bins).
    # Each fit iteration shifts this grid by a SUB-GRID offset, i.e. a
    # fraction of the spacing between adjacent quantile points.
    # BUG FIX: the previous scheme used `offset = iter_num / fit_iters` over
    # the whole [0, 1] range via linspace(offset, 1 - offset / 2, n), so with
    # the default fit_iters=5 the last iteration fit its spline on the
    # reversed, narrow window [0.8, 0.6] and clipped all intensities into it,
    # severely degrading the averaged transformation. Offsets must stay below
    # one grid spacing (Workman et al., 2002).
    delta = 1.0 / n_quantiles
    base_levels = (np.arange(n_quantiles) + 0.5) * delta

    # Normalize each sample
    X_normalized = np.zeros_like(X_log)

    for i in range(n_samples):
        sample = X_log[i, :]
        sample_transformed = np.zeros((fit_iters, n_probes))

        # Multiple iterations with sub-grid offsets
        for iter_num in range(fit_iters):
            shift = (iter_num / fit_iters) * delta
            quantile_levels = np.clip(base_levels + shift, 0.001, 0.999)

            # Get sample and target quantiles
            sample_quantiles = np.quantile(sample, quantile_levels)
            target_quantiles = np.quantile(target, quantile_levels)

            # Quantiles are monotone in the level, but enforce sorted order
            # for numerical safety before spline fitting
            sort_idx = np.argsort(sample_quantiles)
            sample_quantiles = sample_quantiles[sort_idx]
            target_quantiles = target_quantiles[sort_idx]

            # Remove duplicate sample quantiles: spline x-values must be
            # strictly increasing
            unique_mask = np.concatenate([[True], np.diff(sample_quantiles) > 1e-10])
            sample_quantiles = sample_quantiles[unique_mask]
            target_quantiles = target_quantiles[unique_mask]

            # Check if we have enough unique quantiles for a cubic spline
            if len(sample_quantiles) < 4:
                # Not enough unique points for cubic spline, use linear interpolation
                sample_transformed[iter_num, :] = np.interp(sample, sample_quantiles, target_quantiles)
                continue

            # Fit spline
            if smooth and spar is not None:
                # Smoothing spline with user-supplied smoothing parameter
                try:
                    spline = UnivariateSpline(
                        sample_quantiles,
                        target_quantiles,
                        s=spar * len(sample_quantiles),
                        k=3,
                    )
                except Exception:  # noqa: BLE001
                    warnings.warn(
                        "Smoothing spline fitting failed. Falling back to cubic spline.",
                        UserWarning,
                        stacklevel=2,
                    )
                    spline = CubicSpline(sample_quantiles, target_quantiles)
            elif smooth:
                # Auto-smoothing spline
                try:
                    spline = UnivariateSpline(sample_quantiles, target_quantiles, s=None, k=3)
                except Exception:  # noqa: BLE001
                    warnings.warn(
                        "Auto-smoothing spline fitting failed. Falling back to cubic spline.",
                        UserWarning,
                        stacklevel=2,
                    )
                    spline = CubicSpline(sample_quantiles, target_quantiles)
            else:
                # Interpolating cubic spline
                spline = CubicSpline(sample_quantiles, target_quantiles)

            # Apply transformation; clip to the fitted range to avoid
            # extrapolation artifacts at the distribution tails
            sample_clipped = np.clip(sample, sample_quantiles[0], sample_quantiles[-1])
            sample_transformed[iter_num, :] = spline(sample_clipped)

        # Average across iterations
        X_normalized[i, :] = np.mean(sample_transformed, axis=0)

    # Convert back to original scale
    X_normalized = np.power(2, X_normalized) - 1
    X_normalized = np.maximum(X_normalized, 0)  # Ensure non-negative

    # Update AnnData
    adata.X = X_normalized
    adata.uns["normalize_qspline"] = {
        "method": "qspline",
        "n_samples": n_samples,
        "n_probes": n_probes,
        "n_quantiles": n_quantiles,
        "samples": samples,
        "fit_iters": fit_iters,
        "smooth": smooth,
        "spar": spar,
        "target_provided": target_provided,
        "applied": True,
    }

    return adata if copy else None
|
|
1098
|
+
|
|
1099
|
+
|
|
1100
|
+
def _select_rank_invariant_subset(X: np.ndarray, subset_size: int) -> np.ndarray:
|
|
1101
|
+
"""Select probes with small rank-range across samples (maffy.subset).
|
|
1102
|
+
|
|
1103
|
+
Parameters
|
|
1104
|
+
----------
|
|
1105
|
+
X
|
|
1106
|
+
Intensity matrix (n_samples, n_probes).
|
|
1107
|
+
subset_size
|
|
1108
|
+
Number of probes to select.
|
|
1109
|
+
|
|
1110
|
+
Returns:
|
|
1111
|
+
-------
|
|
1112
|
+
np.ndarray
|
|
1113
|
+
Boolean mask of selected probes.
|
|
1114
|
+
"""
|
|
1115
|
+
n_samples, n_probes = X.shape
|
|
1116
|
+
|
|
1117
|
+
# Compute ranks for each sample
|
|
1118
|
+
ranks = np.zeros_like(X)
|
|
1119
|
+
for i in range(n_samples):
|
|
1120
|
+
ranks[i, :] = np.argsort(np.argsort(X[i, :]))
|
|
1121
|
+
|
|
1122
|
+
# Compute rank range for each probe
|
|
1123
|
+
rank_ranges = np.max(ranks, axis=0) - np.min(ranks, axis=0)
|
|
1124
|
+
|
|
1125
|
+
# Select probes with smallest rank range
|
|
1126
|
+
subset_indices = np.argsort(rank_ranges)[:subset_size]
|
|
1127
|
+
mask = np.zeros(n_probes, dtype=bool)
|
|
1128
|
+
mask[subset_indices] = True
|
|
1129
|
+
|
|
1130
|
+
return mask
|
|
1131
|
+
|
|
1132
|
+
|
|
1133
|
+
def normalize_contrasts(
    adata: AnnData,
    span: float = 2 / 3,
    choose_subset: bool = True,
    subset_size: int = 5000,
    copy: bool = False,
) -> AnnData | None:
    """Apply contrast-based normalization with rank-invariant probe selection.

    This method transforms the data to contrasts (differences from the first sample),
    applies loess smoothing to each contrast dimension, and transforms back.
    It optionally selects a subset of rank-invariant probes for fitting efficiency.

    Parameters
    ----------
    adata
        AnnData object with shape (n_samples, n_probes).
        Must contain intensity values in `.X`.
    span
        Fraction of data points to use for loess regression (0 < span <= 1).
        Larger values produce smoother curves. Default is 2/3.
    choose_subset
        If True, automatically select subset of rank-invariant probes.
        If False, use all probes. Default is True.
    subset_size
        Number of probes to use in subset (if choose_subset=True).
        Default is 5000.
    copy
        If True, return a copy of the AnnData object. If False, modify in place.

    Returns:
    -------
    AnnData or None
        If `copy=True`, returns normalized AnnData object.
        If `copy=False`, modifies `adata` in place and returns None.
        The normalized intensities are stored in `.X`.
        Normalization metadata is stored in `.uns['normalize_contrasts']`.

    Examples:
    --------
    >>> import microarray as ma
    >>> adata = ma.io.cel_batch_to_anndata(cel_files, cdf_path, annotation_db)
    >>> adata_norm = ma.pp.normalize_contrasts(adata, subset_size=3000, copy=True)

    Notes:
    -----
    This is a complex method that normalizes using contrast transformations.
    It can be computationally expensive for large datasets.

    The algorithm:
    1. Transform to log scale
    2. Optionally select rank-invariant subset of probes
    3. Compute contrasts: C[i] = X[i] - X[0] for i > 0
    4. For each contrast, smooth using loess and subtract the smoothed
       intensity-dependent trend
    5. Transform back to the original scale

    This implementation uses a simplified approach compared to the full
    multivariate loess in the R affy package, making it more practical
    for large datasets.

    Reference
    -------
    See normalize.contrasts in the affy Bioconductor package and related
    maffy package documentation.
    """
    adata = adata.copy() if copy else adata

    # Validate input
    if adata.X is None:
        raise ValueError("AnnData object must have .X attribute with intensity values")

    if span <= 0 or span > 1:
        raise ValueError(f"span must be between 0 and 1, got {span}")

    n_samples, n_probes = adata.shape

    if n_samples == 1:
        warnings.warn(
            "Contrast normalization requires multiple samples. With only 1 sample, data is returned unchanged.",
            UserWarning,
            stacklevel=2,
        )
        adata.uns["normalize_contrasts"] = {
            "method": "contrasts",
            "n_samples": n_samples,
            "applied": False,
            "reason": "single_sample",
        }
        return adata if copy else None

    if n_samples > 20:
        warnings.warn(
            f"Contrast normalization with {n_samples} samples may be slow. "
            "Consider using a simpler method like quantile or loess normalization.",
            UserWarning,
            stacklevel=2,
        )

    # Work on log scale
    X_log = np.log2(adata.X + 1)

    # Select subset if requested (fit the loess only on rank-invariant probes)
    if choose_subset and subset_size < n_probes:
        subset_mask = _select_rank_invariant_subset(X_log, subset_size)
        n_subset = np.sum(subset_mask)
    else:
        subset_mask = np.ones(n_probes, dtype=bool)
        n_subset = n_probes

    # Use first sample as reference
    reference = X_log[0, :]

    # Average intensity across all samples per probe (the "A" axis of the fit)
    A_all = np.mean(X_log, axis=0)

    X_normalized = X_log.copy()

    # Normalize each sample (except reference)
    for i in range(1, n_samples):
        # Compute contrast (difference from reference)
        contrast = X_log[i, :] - reference

        # Get subset for fitting
        contrast_subset = contrast[subset_mask]
        A_subset = A_all[subset_mask]

        # Sort by A for loess fitting
        sort_idx = np.argsort(A_subset)
        A_sorted = A_subset[sort_idx]
        contrast_sorted = contrast_subset[sort_idx]

        # Fit loess curve to contrast as a function of average intensity
        contrast_smooth = _lowess_fit(A_sorted, contrast_sorted, span=span)

        # Interpolate the fitted trend to all probes
        contrast_correction = np.interp(A_all, A_sorted, contrast_smooth)

        # Remove the smoothed intensity-dependent contrast trend:
        # new_contrast = contrast - trend, i.e.
        # normalized = reference + (contrast - trend) = sample - trend.
        # BUG FIX: previously `sample - (trend - contrast)` was applied, which
        # re-adds the raw contrast; for a constant shift c between arrays
        # (contrast == trend == c) it returned the sample unchanged, so the
        # bias was never removed.
        X_normalized[i, :] = X_log[i, :] - contrast_correction

    # Convert back to original scale
    X_normalized = np.power(2, X_normalized) - 1
    X_normalized = np.maximum(X_normalized, 0)  # Ensure non-negative

    # Update AnnData
    adata.X = X_normalized
    adata.uns["normalize_contrasts"] = {
        "method": "contrasts",
        "n_samples": n_samples,
        "n_probes": n_probes,
        "span": span,
        "choose_subset": choose_subset,
        "subset_size": subset_size,
        "n_subset_used": n_subset,
        "reference_index": 0,
        "applied": True,
    }

    return adata if copy else None
|