microarray 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- microarray/__init__.py +15 -0
- microarray/_version.py +3 -0
- microarray/datasets/__init__.py +3 -0
- microarray/datasets/_arrayexpress.py +1 -0
- microarray/datasets/_cdf_files.py +35 -0
- microarray/datasets/_geo.py +1 -0
- microarray/datasets/_utils.py +143 -0
- microarray/io/__init__.py +17 -0
- microarray/io/_anndata_converter.py +198 -0
- microarray/io/_cdf.py +575 -0
- microarray/io/_cel.py +591 -0
- microarray/io/_read.py +127 -0
- microarray/plotting/__init__.py +28 -0
- microarray/plotting/_base.py +253 -0
- microarray/plotting/_cel.py +75 -0
- microarray/plotting/_de_plots.py +239 -0
- microarray/plotting/_diagnostic_plots.py +268 -0
- microarray/plotting/_heatmap.py +279 -0
- microarray/plotting/_ma_plots.py +136 -0
- microarray/plotting/_pca.py +320 -0
- microarray/plotting/_qc_plots.py +335 -0
- microarray/plotting/_score.py +38 -0
- microarray/plotting/_top_table_heatmap.py +98 -0
- microarray/plotting/_utils.py +280 -0
- microarray/preprocessing/__init__.py +39 -0
- microarray/preprocessing/_background.py +862 -0
- microarray/preprocessing/_log2.py +77 -0
- microarray/preprocessing/_normalize.py +1292 -0
- microarray/preprocessing/_rma.py +243 -0
- microarray/preprocessing/_robust.py +170 -0
- microarray/preprocessing/_summarize.py +318 -0
- microarray/py.typed +0 -0
- microarray/tools/__init__.py +26 -0
- microarray/tools/_biomart.py +416 -0
- microarray/tools/_empirical_bayes.py +401 -0
- microarray/tools/_fdist.py +171 -0
- microarray/tools/_linear_models.py +387 -0
- microarray/tools/_mds.py +101 -0
- microarray/tools/_pca.py +88 -0
- microarray/tools/_score.py +86 -0
- microarray/tools/_toptable.py +360 -0
- microarray-0.1.0.dist-info/METADATA +75 -0
- microarray-0.1.0.dist-info/RECORD +44 -0
- microarray-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""RMA (Robust Multi-array Average) normalization pipeline."""
|
|
2
|
+
|
|
3
|
+
import warnings
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from anndata import AnnData
|
|
7
|
+
|
|
8
|
+
from microarray.preprocessing._background import rma_background_correct
|
|
9
|
+
from microarray.preprocessing._log2 import _is_log_transformed, log2
|
|
10
|
+
from microarray.preprocessing._normalize import normalize_quantile
|
|
11
|
+
from microarray.preprocessing._summarize import summarize_probesets
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def rma(
    adata: AnnData,
    background_correct: bool = True,
    normalize: bool = True,
    log_transform: bool = True,
    summarize: bool = True,
    output_level: str = "gene",
    copy: bool = True,
) -> AnnData | None:
    """Apply RMA (Robust Multi-array Average) normalization to microarray data.

    RMA is a widely-used preprocessing method for Affymetrix microarrays that
    combines background correction, quantile normalization, log transformation,
    and robust summarization.

    The standard RMA pipeline consists of:
    1. Background correction using a convolution model
    2. Quantile normalization across samples
    3. Log2 transformation
    4. Probeset summarization using median polish

    Each step can be switched off individually; the steps that actually ran
    are recorded in ``.uns['rma']['steps_applied']``.

    Parameters
    ----------
    adata
        AnnData object with shape (n_samples, n_probes).
        Should contain raw intensity values from multiple CEL files.
        Use `microarray.io.cel_batch_to_anndata()` to load data.
    background_correct
        If True, apply RMA background correction.
    normalize
        If True, apply quantile normalization across samples.
        Requires n_samples > 1; skipped with warning for single sample.
    log_transform
        If True, apply log2 transformation to intensities. Skipped with a
        warning when the data already looks log-transformed.
    summarize
        If True, apply median polish summarization to probesets.
        Requires 'gene_id' column in `.var`; skipped with a warning otherwise.
    output_level
        Output granularity when summarize=True:
        - 'gene': Return gene-level data (n_samples, n_genes)
        - 'probe': Keep probe-level data with effects in `.layers`
        Ignored if summarize=False.
    copy
        If True, work on a copy of `adata` and return it. If False, the
        probe-level steps modify `adata` in place.

    Returns
    -------
    AnnData or None
        Processed AnnData object if `copy=True`.
        If `copy=False` and the result is still the input object (probe-level
        output, or summarization not performed), modifies in place and
        returns None.
        If `copy=False` and gene-level summarization produced a new object,
        that new object is returned (the input has still been modified in
        place by the preceding probe-level steps).
        Full RMA parameters and history are stored in `.uns['rma']` of the
        resulting object.

    Raises
    ------
    ValueError
        If input validation fails (e.g., empty data, non-finite values).

    Examples
    --------
    >>> # Standard RMA with gene-level output
    >>> import microarray as ma
    >>> adata = ma.io.cel_batch_to_anndata(
    ...     cel_paths=["sample1.CEL", "sample2.CEL"], cdf_path="chip.cdf", annotation_db_path="annotations.db"
    ... )
    >>> adata_rma = ma.pp.rma(adata)
    >>> print(adata_rma.shape)  # (2, n_genes)

    >>> # Partial RMA: every step except probeset summarization
    >>> adata_norm = ma.pp.rma(adata, background_correct=True, normalize=True, log_transform=True, summarize=False)
    >>> print(adata_norm.shape)  # (2, n_probes) - still probe-level

    >>> # RMA with probe-level output
    >>> adata_probes = ma.pp.rma(adata, output_level="probe")

    Notes
    -----
    RMA was developed for Affymetrix oligonucleotide arrays and is one of
    the most widely-used preprocessing methods due to its robustness and
    performance in reducing technical variation.

    The method assumes:
    - Multiple samples are being processed together (for normalization)
    - Data are from PM (perfect match) probes
    - Probes are grouped into probesets representing genes/transcripts

    References
    ----------
    .. [1] Irizarry, R.A., Hobbs, B., Collin, F., et al. (2003).
           Exploration, normalization, and summaries of high density
           oligonucleotide array probe level data.
           Biostatistics, 4(2), 249-264.

    .. [2] Bolstad, B.M., Irizarry, R.A., Astrand, M., Speed, T.P. (2003).
           A comparison of normalization methods for high density
           oligonucleotide array data based on variance and bias.
           Bioinformatics, 19(2), 185-193.
    """
    # Validate input
    _validate_rma_input(adata, summarize)

    # Warn (but do not abort) when the input looks like it was already processed
    if _is_likely_processed(adata):
        warnings.warn(
            "Input data appears to already be processed (values in log scale or "
            "normalized). Applying RMA to already-processed data may give "
            "incorrect results.",
            UserWarning,
            stacklevel=2,
        )

    # Work on a copy unless the caller asked for in-place processing
    adata_result = adata.copy() if copy else adata

    # Track which steps were actually applied (requested steps may be skipped)
    steps_applied = []

    # Step 1: Background correction
    if background_correct:
        rma_background_correct(adata_result, copy=False)
        steps_applied.append("background_correction")

    # Step 2: Quantile normalization
    if normalize:
        if adata_result.n_obs == 1:
            warnings.warn(
                "Skipping quantile normalization: only 1 sample provided. Normalization requires multiple samples.",
                UserWarning,
                stacklevel=2,
            )
        else:
            normalize_quantile(adata_result, copy=False)
            steps_applied.append("quantile_normalization")

    # Step 3: Log transformation
    if log_transform:
        if not _is_log_transformed(adata_result):
            log2(adata_result, copy=False)
            steps_applied.append("log2_transform")
        else:
            warnings.warn(
                "Data appears to already be log-transformed. Skipping log transformation.",
                UserWarning,
                stacklevel=2,
            )

    # Step 4: Probeset summarization
    summarized = False
    if summarize:
        if "gene_id" not in adata_result.var.columns:
            warnings.warn(
                "Cannot summarize probesets: 'gene_id' column missing from .var. "
                "Load data with annotation_db_path to enable summarization. "
                "Skipping summarization step.",
                UserWarning,
                stacklevel=2,
            )
        else:
            # Probe-level output is written into adata_result in place; any
            # other level is requested as a fresh object.
            result_or_none = summarize_probesets(
                adata_result,
                method="medpolish",
                output_level=output_level,
                copy=output_level != "probe",
            )
            # A non-None return value is a new object that supersedes adata_result
            if result_or_none is not None:
                adata_result = result_or_none
            steps_applied.append("median_polish_summarization")
            summarized = True

    # Store RMA parameters and history
    adata_result.uns["rma"] = {
        "background_correct": background_correct,
        "normalize": normalize,
        "log_transform": log_transform,
        "summarize": summarize,
        # Record the effective level: data stays probe-level whenever the
        # summarization step did not run, even if it was requested.
        "output_level": output_level if summarized else "probe",
        "steps_applied": steps_applied,
        "n_samples": adata.n_obs,
        "n_probes_input": adata.n_vars,
        "n_features_output": adata_result.n_vars,
    }

    # With copy=False the caller already holds the mutated input, so None is
    # returned -- unless summarization created a new object, which would
    # otherwise be lost.
    if copy or adata_result is not adata:
        return adata_result
    return None
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _validate_rma_input(adata: AnnData, summarize: bool) -> None:
    """Check that ``adata`` is usable as input for the RMA pipeline.

    Parameters
    ----------
    adata
        Object to validate; its ``X``, ``n_obs`` and ``n_vars`` are inspected.
    summarize
        Accepted for interface compatibility; not used by the checks below.

    Raises
    ------
    ValueError
        If ``adata`` or ``adata.X`` is None, if there are no samples or no
        probes, or if ``adata.X`` contains NaN or infinite values.

    Warns
    -----
    UserWarning
        If ``adata.X`` contains negative values.
    """
    if adata is None:
        raise ValueError("adata cannot be None")

    intensities = adata.X
    if intensities is None:
        raise ValueError("AnnData.X must contain intensity values")

    if adata.n_obs < 1:
        raise ValueError("AnnData must contain at least one sample")

    if adata.n_vars < 1:
        raise ValueError("AnnData must contain at least one probe")

    # Negative intensities are suspicious but not fatal: warn and carry on.
    has_negative = np.any(intensities < 0)
    if has_negative:
        warnings.warn(
            "Input data contains negative values. RMA expects raw positive intensities.",
            UserWarning,
            stacklevel=2,
        )

    # Non-finite entries (NaN / +-inf) are rejected outright.
    all_finite = np.all(np.isfinite(intensities))
    if not all_finite:
        raise ValueError("Input data contains NaN or infinite values. RMA requires finite intensity values.")
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _is_likely_processed(adata: AnnData) -> bool:
    """Return True when ``adata`` shows signs of earlier preprocessing.

    The data counts as processed if ``_is_log_transformed`` reports it as
    log-transformed, or if ``adata.uns`` already holds a ``"normalization"``
    or ``"rma"`` entry.
    """
    # Value-based check first; the metadata lookups are only a fallback.
    if _is_log_transformed(adata):
        return True

    # Metadata keys looked up in .uns as evidence of earlier processing.
    processing_markers = ("normalization", "rma")
    return any(marker in adata.uns for marker in processing_markers)
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Robust statistical methods for microarray preprocessing.
|
|
2
|
+
|
|
3
|
+
This module provides robust statistical methods used in expression estimation,
|
|
4
|
+
particularly in the MAS5 algorithm.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from numpy.typing import NDArray
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def tukey_biweight(
    x: NDArray[np.floating],
    c: float = 5.0,
    epsilon: float = 0.0001,
    max_iter: int = 50,
    tol: float = 1e-7,
) -> float:
    """Compute Tukey's biweight (bisquare) robust mean.

    This function calculates a robust estimate of the mean using a single
    pass of Tukey's biweight algorithm. It downweights outliers based on their
    distance from the median, providing more robust estimates than the
    arithmetic mean.

    The algorithm computes weights based on standardized residuals:
    - w_i = (1 - u_i^2)^2  for |u_i| <= 1
    - w_i = 0              for |u_i| > 1

    where u_i = (x_i - m) / (c * s + epsilon), m is the median, and s is the
    median absolute deviation (MAD). The result is sum(w_i * x_i) / sum(w_i).

    Parameters
    ----------
    x : NDArray[np.floating]
        Input array of values, 1-dimensional.
    c : float, default=5.0
        Tuning constant that controls the downweighting of outliers.
        Larger values assign more weight to outliers. Common values:
        - c=5: standard for expression summarization (MAS5)
        - c=6: more permissive
        - c=4.685: asymptotically 95% efficiency for normal data
    epsilon : float, default=0.0001
        Small constant added to prevent division by zero when MAD is very small.
        It is also the threshold below which the MAD is treated as zero.
    max_iter : int, default=50
        Currently unused; accepted for a possible iterative version.
    tol : float, default=1e-7
        Currently unused; accepted for a possible iterative version.

    Returns
    -------
    float
        The Tukey biweight estimate of the mean.

    Raises
    ------
    ValueError
        If `x` is not 1-dimensional or is empty.

    Notes
    -----
    This implementation follows the Affymetrix MAS5 algorithm as implemented in
    the affy R package (tukey.biweight.R). The single-pass version is used,
    which computes weights based on deviations from the median.

    When the MAD is smaller than `epsilon` (for example when all values are
    identical, or when more than half of them equal the median), the median
    is returned directly. The median is also returned if every weight is zero.

    References
    ----------
    .. [1] Mosteller, F., and Tukey, J. W. (1977), Data Analysis and Regression:
           A Second Course in Statistics. Addison-Wesley.
    .. [2] Affymetrix (2002). Statistical Algorithms Description Document.

    Examples
    --------
    >>> import numpy as np
    >>> from microarray.preprocessing import tukey_biweight
    >>> x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 100.0])  # one outlier
    >>> tukey_biweight(x)
    3.0720...
    >>> np.mean(x)  # regular mean is heavily influenced
    19.166...
    """
    values = np.asarray(x, dtype=np.float64)

    if values.ndim != 1:
        raise ValueError(f"Input must be 1-dimensional, got shape {values.shape}")

    if values.size == 0:
        raise ValueError("Input array is empty")

    # Robust location (median) and scale (median absolute deviation)
    center = np.median(values)
    mad = np.median(np.abs(values - center))

    # Negligible spread: the median is returned as the estimate
    if mad < epsilon:
        return float(center)

    # Standardized residuals
    u = (values - center) / (c * mad + epsilon)

    # Bisquare weights: (1 - u^2)^2 inside the unit interval, 0 outside.
    # Only the in-range entries are evaluated.
    weights = np.zeros_like(u)
    inside = np.abs(u) <= 1
    weights[inside] = (1 - u[inside] ** 2) ** 2

    weight_total = np.sum(weights)
    if weight_total == 0:
        # No point received a positive weight: fall back to the median
        return float(center)

    return float(np.sum(weights * values) / weight_total)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def tukey_biweight_summary(
    x: NDArray[np.floating],
    c: float = 5.0,
    epsilon: float = 0.0001,
) -> tuple[NDArray[np.floating], NDArray[np.floating]]:
    """Summarize a probeset by taking the Tukey biweight of every column.

    Rows of `x` are probes and columns are samples; `tukey_biweight` is
    evaluated on each column independently.

    Parameters
    ----------
    x : NDArray[np.floating]
        Input array of shape (n_probes, n_samples).
    c : float, default=5.0
        Tuning constant forwarded to `tukey_biweight`.
    epsilon : float, default=0.0001
        Small constant forwarded to `tukey_biweight`.

    Returns
    -------
    exprs : NDArray[np.floating]
        Array of shape (n_samples,) with one biweight estimate per column.
    se_exprs : NDArray[np.floating]
        Array of shape (n_samples,) filled with NaN; standard errors are not
        computed here.

    Raises
    ------
    ValueError
        If `x` is not 2-dimensional.

    Notes
    -----
    The original documentation states that returning NaN standard errors
    mirrors tukeybiweight() in the affy R package, which returns NA.

    Examples
    --------
    >>> import numpy as np
    >>> from microarray.preprocessing import tukey_biweight_summary
    >>> # 5 probes, 3 samples
    >>> x = np.array(
    ...     [[1.0, 10.0, 100.0], [2.0, 11.0, 101.0], [3.0, 12.0, 102.0], [4.0, 13.0, 103.0], [5.0, 14.0, 104.0]]
    ... )
    >>> exprs, se = tukey_biweight_summary(x)
    >>> exprs.shape
    (3,)
    """
    matrix = np.asarray(x, dtype=np.float64)

    if matrix.ndim != 2:
        raise ValueError(f"Input must be 2-dimensional, got shape {matrix.shape}")

    # Iterating over the transpose yields one sample (column) at a time.
    exprs = np.array(
        [tukey_biweight(column, c=c, epsilon=epsilon) for column in matrix.T],
        dtype=np.float64,
    )

    # Placeholder standard errors: same length as exprs, all NaN.
    se_exprs = np.full_like(exprs, np.nan)

    return exprs, se_exprs
|