microarray 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. microarray/__init__.py +15 -0
  2. microarray/_version.py +3 -0
  3. microarray/datasets/__init__.py +3 -0
  4. microarray/datasets/_arrayexpress.py +1 -0
  5. microarray/datasets/_cdf_files.py +35 -0
  6. microarray/datasets/_geo.py +1 -0
  7. microarray/datasets/_utils.py +143 -0
  8. microarray/io/__init__.py +17 -0
  9. microarray/io/_anndata_converter.py +198 -0
  10. microarray/io/_cdf.py +575 -0
  11. microarray/io/_cel.py +591 -0
  12. microarray/io/_read.py +127 -0
  13. microarray/plotting/__init__.py +28 -0
  14. microarray/plotting/_base.py +253 -0
  15. microarray/plotting/_cel.py +75 -0
  16. microarray/plotting/_de_plots.py +239 -0
  17. microarray/plotting/_diagnostic_plots.py +268 -0
  18. microarray/plotting/_heatmap.py +279 -0
  19. microarray/plotting/_ma_plots.py +136 -0
  20. microarray/plotting/_pca.py +320 -0
  21. microarray/plotting/_qc_plots.py +335 -0
  22. microarray/plotting/_score.py +38 -0
  23. microarray/plotting/_top_table_heatmap.py +98 -0
  24. microarray/plotting/_utils.py +280 -0
  25. microarray/preprocessing/__init__.py +39 -0
  26. microarray/preprocessing/_background.py +862 -0
  27. microarray/preprocessing/_log2.py +77 -0
  28. microarray/preprocessing/_normalize.py +1292 -0
  29. microarray/preprocessing/_rma.py +243 -0
  30. microarray/preprocessing/_robust.py +170 -0
  31. microarray/preprocessing/_summarize.py +318 -0
  32. microarray/py.typed +0 -0
  33. microarray/tools/__init__.py +26 -0
  34. microarray/tools/_biomart.py +416 -0
  35. microarray/tools/_empirical_bayes.py +401 -0
  36. microarray/tools/_fdist.py +171 -0
  37. microarray/tools/_linear_models.py +387 -0
  38. microarray/tools/_mds.py +101 -0
  39. microarray/tools/_pca.py +88 -0
  40. microarray/tools/_score.py +86 -0
  41. microarray/tools/_toptable.py +360 -0
  42. microarray-0.1.0.dist-info/METADATA +75 -0
  43. microarray-0.1.0.dist-info/RECORD +44 -0
  44. microarray-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,243 @@
1
+ """RMA (Robust Multi-array Average) normalization pipeline."""
2
+
3
+ import warnings
4
+
5
+ import numpy as np
6
+ from anndata import AnnData
7
+
8
+ from microarray.preprocessing._background import rma_background_correct
9
+ from microarray.preprocessing._log2 import _is_log_transformed, log2
10
+ from microarray.preprocessing._normalize import normalize_quantile
11
+ from microarray.preprocessing._summarize import summarize_probesets
12
+
13
+
14
def rma(
    adata: AnnData,
    background_correct: bool = True,
    normalize: bool = True,
    log_transform: bool = True,
    summarize: bool = True,
    output_level: str = "gene",
    copy: bool = True,
) -> AnnData | None:
    """Apply RMA (Robust Multi-array Average) normalization to microarray data.

    RMA is a widely-used preprocessing method for Affymetrix microarrays that
    combines background correction, quantile normalization, log transformation,
    and robust summarization.

    The standard RMA pipeline consists of:
    1. Background correction using a convolution model
    2. Quantile normalization across samples
    3. Log2 transformation
    4. Probeset summarization using median polish

    Parameters
    ----------
    adata
        AnnData object with shape (n_samples, n_probes).
        Should contain raw intensity values from multiple CEL files.
        Use `microarray.io.cel_batch_to_anndata()` to load data.
    background_correct
        If True, apply RMA background correction.
    normalize
        If True, apply quantile normalization across samples.
        Requires n_samples > 1; skipped with warning for single sample.
    log_transform
        If True, apply log2 transformation to intensities.
        Skipped with a warning if the data already look log-transformed.
    summarize
        If True, apply median polish summarization to probesets.
        Requires 'gene_id' column in `.var`; skipped with a warning otherwise.
    output_level
        Output granularity when summarize=True:
        - 'gene': Return gene-level data (n_samples, n_genes)
        - 'probe': Keep probe-level data with effects in `.layers`
        Ignored if summarize=False.
    copy
        If True, work on a copy and return it. If False, the steps are applied
        to `adata` in place (except gene-level summarization, which always
        produces a new object).

    Returns:
    -------
    AnnData or None
        The processed AnnData object whenever the result is a different object
        from the input: always when `copy=True`, and also when `copy=False`
        but gene-level summarization produced a new object.
        None when `copy=False` and everything was applied to `adata` in place.
        Full RMA parameters and history are stored in `.uns['rma']` of the
        result.

    Raises:
    ------
    ValueError
        If input validation fails (e.g., empty data, non-finite values).

    Examples:
    --------
    >>> # Standard RMA with gene-level output
    >>> import microarray as ma
    >>> adata = ma.io.cel_batch_to_anndata(
    ...     cel_paths=["sample1.CEL", "sample2.CEL"], cdf_path="chip.cdf", annotation_db_path="annotations.db"
    ... )
    >>> adata_rma = ma.pp.rma(adata)
    >>> print(adata_rma.shape)  # (2, n_genes)

    >>> # Partial RMA: every step except probeset summarization
    >>> adata_norm = ma.pp.rma(adata, background_correct=True, normalize=True, log_transform=True, summarize=False)
    >>> print(adata_norm.shape)  # (2, n_probes) - still probe-level

    >>> # RMA with probe-level output
    >>> adata_probes = ma.pp.rma(adata, output_level="probe")

    Notes:
    -----
    RMA was developed for Affymetrix oligonucleotide arrays and is one of
    the most widely-used preprocessing methods due to its robustness and
    performance in reducing technical variation.

    The method assumes:
    - Multiple samples are being processed together (for normalization)
    - Data are from PM (perfect match) probes
    - Probes are grouped into probesets representing genes/transcripts

    References:
    ----------
    .. [1] Irizarry, R.A., Hobbs, B., Collin, F., et al. (2003).
       Exploration, normalization, and summaries of high density
       oligonucleotide array probe level data.
       Biostatistics, 4(2), 249-264.

    .. [2] Bolstad, B.M., Irizarry, R.A., Astrand, M., Speed, T.P. (2003).
       A comparison of normalization methods for high density
       oligonucleotide array data based on variance and bias.
       Bioinformatics, 19(2), 185-193.
    """
    # Validate input
    _validate_rma_input(adata, summarize)

    # Warn (but do not abort) when the input looks like it was processed before
    if _is_likely_processed(adata):
        warnings.warn(
            "Input data appears to already be processed (values in log scale or "
            "normalized). Applying RMA to already-processed data may give "
            "incorrect results.",
            UserWarning,
            stacklevel=2,
        )

    # Record the input dimensions for the history entry written at the end
    n_samples_input = adata.n_obs
    n_probes_input = adata.n_vars

    # With copy=False every in-place step below mutates the caller's object
    adata_result = adata.copy() if copy else adata

    # Track which steps were actually applied (as opposed to requested)
    steps_applied = []
    summarized = False

    # Step 1: Background correction
    if background_correct:
        rma_background_correct(adata_result, copy=False)
        steps_applied.append("background_correction")

    # Step 2: Quantile normalization
    if normalize:
        if adata_result.n_obs == 1:
            warnings.warn(
                "Skipping quantile normalization: only 1 sample provided. Normalization requires multiple samples.",
                UserWarning,
                stacklevel=2,
            )
        else:
            normalize_quantile(adata_result, copy=False)
            steps_applied.append("quantile_normalization")

    # Step 3: Log transformation
    if log_transform:
        if _is_log_transformed(adata_result):
            warnings.warn(
                "Data appears to already be log-transformed. Skipping log transformation.",
                UserWarning,
                stacklevel=2,
            )
        else:
            log2(adata_result, copy=False)
            steps_applied.append("log2_transform")

    # Step 4: Probeset summarization
    if summarize:
        if "gene_id" not in adata_result.var.columns:
            warnings.warn(
                "Cannot summarize probesets: 'gene_id' column missing from .var. "
                "Load data with annotation_db_path to enable summarization. "
                "Skipping summarization step.",
                UserWarning,
                stacklevel=2,
            )
        else:
            # Gene-level output is requested as a new AnnData; probe-level
            # output is applied in place, in which case the call returns None.
            result_or_none = summarize_probesets(
                adata_result,
                method="medpolish",
                output_level=output_level,
                copy=output_level != "probe",
            )
            if result_or_none is not None:
                adata_result = result_or_none
            steps_applied.append("median_polish_summarization")
            summarized = True

    # Store RMA parameters and history
    adata_result.uns["rma"] = {
        "background_correct": background_correct,
        "normalize": normalize,
        "log_transform": log_transform,
        "summarize": summarize,
        # Report the level of the data actually produced: it stays at probe
        # level whenever summarization was disabled or had to be skipped.
        "output_level": output_level if summarized else "probe",
        "steps_applied": steps_applied,
        "n_samples": n_samples_input,
        "n_probes_input": n_probes_input,
        "n_features_output": adata_result.n_vars,
    }

    # Hand back the result whenever it is not the caller's own object.
    # Returning None for copy=False with gene-level output would silently
    # discard the summarized data, which lives in a new AnnData.
    return adata_result if adata_result is not adata else None
198
+
199
+
200
def _validate_rma_input(adata: AnnData, summarize: bool) -> None:
    """Sanity-check ``adata`` before the RMA pipeline runs.

    Parameters
    ----------
    adata
        Object to check; its ``X``, ``n_obs`` and ``n_vars`` are inspected.
    summarize
        Accepted as part of the signature; none of the checks here consult it.

    Raises:
    ------
    ValueError
        If ``adata`` or ``adata.X`` is None, if there are no samples or no
        probes, or if ``X`` contains NaN or infinite values.

    Warns:
    -----
    UserWarning
        If ``X`` contains negative values.
    """
    if adata is None:
        raise ValueError("adata cannot be None")

    intensities = adata.X
    if intensities is None:
        raise ValueError("AnnData.X must contain intensity values")

    # Attributes are read lazily, one at a time, so the first failing check wins.
    size_checks = (
        ("n_obs", "AnnData must contain at least one sample"),
        ("n_vars", "AnnData must contain at least one probe"),
    )
    for attribute, message in size_checks:
        if getattr(adata, attribute) < 1:
            raise ValueError(message)

    # Negative intensities are only flagged, not rejected.
    has_negative = np.any(intensities < 0)
    if has_negative:
        warnings.warn(
            "Input data contains negative values. RMA expects raw positive intensities.",
            UserWarning,
            stacklevel=2,
        )

    # NaN / inf values are a hard error.
    if not np.all(np.isfinite(intensities)):
        raise ValueError("Input data contains NaN or infinite values. RMA requires finite intensity values.")
225
+
226
+
227
def _is_likely_processed(adata: AnnData) -> bool:
    """Heuristically decide whether ``adata`` has been preprocessed before.

    Returns True when ``_is_log_transformed`` reports log-scale values, or when
    ``.uns`` already holds a "normalization" or "rma" entry; False otherwise.
    """
    if _is_log_transformed(adata):
        return True

    # Metadata keys that mark a previous preprocessing run.
    return any(marker in adata.uns for marker in ("normalization", "rma"))
@@ -0,0 +1,170 @@
1
+ """Robust statistical methods for microarray preprocessing.
2
+
3
+ This module provides robust statistical methods used in expression estimation,
4
+ particularly in the MAS5 algorithm.
5
+ """
6
+
7
+ import numpy as np
8
+ from numpy.typing import NDArray
9
+
10
+
11
def tukey_biweight(
    x: NDArray[np.floating],
    c: float = 5.0,
    epsilon: float = 0.0001,
    max_iter: int = 50,
    tol: float = 1e-7,
) -> float:
    """Compute Tukey's biweight (bisquare) robust mean.

    This function calculates a robust estimate of the mean using a single pass
    of Tukey's biweight algorithm. It downweights values based on their
    distance from the median, providing a more robust estimate than the
    arithmetic mean.

    The weights are based on standardized residuals:
    - w_i = (1 - u_i^2)^2  for |u_i| <= 1
    - w_i = 0              for |u_i| > 1

    where u_i = (x_i - m) / (c * s + epsilon), m is the median, and s is the
    median absolute deviation (MAD). The result is sum(w_i * x_i) / sum(w_i).

    Parameters
    ----------
    x : NDArray[np.floating]
        Input array of values, 1-dimensional.
    c : float, default=5.0
        Tuning constant that controls the downweighting of outliers.
        Larger values assign more weight to outliers. Common values:
        - c=5: standard for expression summarization (MAS5)
        - c=6: more permissive
        - c=4.685: asymptotically 95% efficiency for normal data
    epsilon : float, default=0.0001
        Small constant added to the scale so that the residuals stay defined
        when the MAD is zero.
    max_iter : int, default=50
        Unused; reserved for an iterative version (this one is single-pass).
    tol : float, default=1e-7
        Unused; reserved for an iterative version.

    Returns:
    -------
    float
        The Tukey biweight estimate of the mean. Falls back to the median when
        the scale ``c * s + epsilon`` is zero or when no value receives a
        positive weight.

    Raises:
    ------
    ValueError
        If ``x`` is not 1-dimensional or is empty.

    Notes:
    -----
    This implementation follows the single-pass formula of the Affymetrix MAS5
    algorithm as implemented in the affy R package (tukey.biweight.R): the
    epsilon term alone handles a zero MAD, so low-spread inputs still get a
    weighted mean instead of being collapsed to the median.

    For data with no variability (all values identical), the function returns
    the common value.

    Only values with |u_i| <= 1 enter the sums, so an infinite outlier is
    ignored rather than turning the result into NaN.

    References:
    ----------
    .. [1] Mosteller, F., and Tukey, J. W. (1977), Data Analysis and Regression:
       A Second Course in Statistics. Addison-Wesley.
    .. [2] Affymetrix (2002). Statistical Algorithms Description Document.

    Examples:
    --------
    >>> import numpy as np
    >>> from microarray.preprocessing import tukey_biweight
    >>> x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 100.0])  # one outlier
    >>> tukey_biweight(x)
    3.0720...
    >>> np.mean(x)  # regular mean is heavily influenced
    19.166...
    """
    x = np.asarray(x, dtype=np.float64)

    if x.ndim != 1:
        raise ValueError(f"Input must be 1-dimensional, got shape {x.shape}")

    if x.size == 0:
        raise ValueError("Input array is empty")

    # Location and scale: median and median absolute deviation
    m = np.median(x)
    s = np.median(np.abs(x - m))

    # Only a zero scale is degenerate (MAD == 0 with epsilon == 0); any
    # positive epsilon keeps the residuals well defined.
    scale = c * s + epsilon
    if scale == 0:
        return float(m)

    # Standardized residuals
    u = (x - m) / scale

    # Values with |u| > 1 have weight zero; drop them from the sums entirely so
    # that 0 * inf cannot produce NaN.
    inside = np.abs(u) <= 1
    w = (1 - u[inside] ** 2) ** 2

    total_weight = np.sum(w)
    if total_weight == 0:
        # No value carries weight (or the input contains NaN): use the median
        return float(m)

    return float(np.sum(w * x[inside]) / total_weight)
109
+
110
+
111
def tukey_biweight_summary(
    x: NDArray[np.floating],
    c: float = 5.0,
    epsilon: float = 0.0001,
) -> tuple[NDArray[np.floating], NDArray[np.floating]]:
    """Summarize a probes-by-samples matrix with one Tukey biweight per sample.

    Each column of ``x`` is treated as one sample and each row as one probe;
    ``tukey_biweight`` is evaluated on every column independently.

    Parameters
    ----------
    x : NDArray[np.floating]
        Input array of shape (n_probes, n_samples).
    c : float, default=5.0
        Tuning constant forwarded to ``tukey_biweight``.
    epsilon : float, default=0.0001
        Small constant forwarded to ``tukey_biweight``.

    Returns:
    -------
    exprs : NDArray[np.floating]
        Array of shape (n_samples,) with the biweight estimate of each column.
    se_exprs : NDArray[np.floating]
        Array of shape (n_samples,) filled with NaN; standard errors are not
        computed in this implementation (following the affy R package).

    Raises:
    ------
    ValueError
        If ``x`` is not 2-dimensional.

    Notes:
    -----
    This function matches the behavior of tukeybiweight() in the affy R package,
    which returns NA for standard errors.

    Examples:
    --------
    >>> import numpy as np
    >>> from microarray.preprocessing import tukey_biweight_summary
    >>> # 5 probes, 3 samples
    >>> x = np.array(
    ...     [[1.0, 10.0, 100.0], [2.0, 11.0, 101.0], [3.0, 12.0, 102.0], [4.0, 13.0, 103.0], [5.0, 14.0, 104.0]]
    ... )
    >>> exprs, se = tukey_biweight_summary(x)
    >>> exprs.shape
    (3,)
    """
    matrix = np.asarray(x, dtype=np.float64)

    if matrix.ndim != 2:
        raise ValueError(f"Input must be 2-dimensional, got shape {matrix.shape}")

    # Iterating the transpose walks the columns, i.e. one sample at a time.
    per_sample = (tukey_biweight(column, c=c, epsilon=epsilon) for column in matrix.T)
    exprs = np.fromiter(per_sample, dtype=np.float64, count=matrix.shape[1])

    # Standard errors are deliberately left as NaN placeholders.
    se_exprs = np.full_like(exprs, np.nan)

    return exprs, se_exprs