microarray 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- microarray/__init__.py +15 -0
- microarray/_version.py +3 -0
- microarray/datasets/__init__.py +3 -0
- microarray/datasets/_arrayexpress.py +1 -0
- microarray/datasets/_cdf_files.py +35 -0
- microarray/datasets/_geo.py +1 -0
- microarray/datasets/_utils.py +143 -0
- microarray/io/__init__.py +17 -0
- microarray/io/_anndata_converter.py +198 -0
- microarray/io/_cdf.py +575 -0
- microarray/io/_cel.py +591 -0
- microarray/io/_read.py +127 -0
- microarray/plotting/__init__.py +28 -0
- microarray/plotting/_base.py +253 -0
- microarray/plotting/_cel.py +75 -0
- microarray/plotting/_de_plots.py +239 -0
- microarray/plotting/_diagnostic_plots.py +268 -0
- microarray/plotting/_heatmap.py +279 -0
- microarray/plotting/_ma_plots.py +136 -0
- microarray/plotting/_pca.py +320 -0
- microarray/plotting/_qc_plots.py +335 -0
- microarray/plotting/_score.py +38 -0
- microarray/plotting/_top_table_heatmap.py +98 -0
- microarray/plotting/_utils.py +280 -0
- microarray/preprocessing/__init__.py +39 -0
- microarray/preprocessing/_background.py +862 -0
- microarray/preprocessing/_log2.py +77 -0
- microarray/preprocessing/_normalize.py +1292 -0
- microarray/preprocessing/_rma.py +243 -0
- microarray/preprocessing/_robust.py +170 -0
- microarray/preprocessing/_summarize.py +318 -0
- microarray/py.typed +0 -0
- microarray/tools/__init__.py +26 -0
- microarray/tools/_biomart.py +416 -0
- microarray/tools/_empirical_bayes.py +401 -0
- microarray/tools/_fdist.py +171 -0
- microarray/tools/_linear_models.py +387 -0
- microarray/tools/_mds.py +101 -0
- microarray/tools/_pca.py +88 -0
- microarray/tools/_score.py +86 -0
- microarray/tools/_toptable.py +360 -0
- microarray-0.1.0.dist-info/METADATA +75 -0
- microarray-0.1.0.dist-info/RECORD +44 -0
- microarray-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
"""Probeset summarization methods including median polish."""
|
|
2
|
+
|
|
3
|
+
import warnings
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from anndata import AnnData
|
|
8
|
+
|
|
9
|
+
from ._robust import tukey_biweight_summary
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def median_polish(
    X: np.ndarray,
    eps: float = 0.01,
    max_iter: int = 10,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, float, bool]:
    """Apply Tukey's median polish algorithm to a matrix.

    Median polish is a robust method for additive decomposition of a matrix:
        X = global_effect + row_effects + col_effects + residuals

    Each iteration:
        1. Subtracts row medians from each row and adds them to the row effects
        2. Re-centres the column effects on their median (moved to the global effect)
        3. Subtracts column medians from each column and adds them to the column effects
        4. Re-centres the row effects on their median (moved to the global effect)

    The decomposition identity above holds exactly after every step, so the
    result is a valid decomposition even when the algorithm did not converge.

    Parameters
    ----------
    X
        Input matrix of shape (n_rows, n_cols) to decompose. Any array-like
        convertible to a 2-D float array is accepted; the input is never modified.
    eps
        Convergence threshold. Algorithm stops when the relative change in
        the sum of absolute residuals is less than eps.
    max_iter
        Maximum number of iterations.

    Returns:
    -------
    residuals
        Residual matrix after removing effects, shape (n_rows, n_cols).
    row_effects
        Effect for each row, shape (n_rows,).
    col_effects
        Effect for each column, shape (n_cols,).
    global_effect
        Global effect (scalar).
    converged
        True if algorithm converged within max_iter iterations.

    Raises:
    ------
    ValueError
        If X is not 2-dimensional.

    Examples:
    --------
    >>> X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    >>> res, row_eff, col_eff, global_eff, converged = median_polish(X)

    Notes:
    -----
    For RMA summarization, the column effects + global effect represent
    the expression value for each sample.

    NaN values are not handled specially: they propagate through the medians
    and prevent convergence from being detected.

    Reference:
        Tukey, J.W. (1977). Exploratory Data Analysis. Addison-Wesley.
    """
    # A single float copy: protects the caller's data from the in-place
    # sweeps below and lets lists / integer arrays be passed directly.
    residuals = np.array(X, dtype=float)
    if residuals.ndim != 2:
        raise ValueError(f"X must be a 2-dimensional matrix, got {residuals.ndim} dimension(s)")

    n_rows, n_cols = residuals.shape
    row_effects = np.zeros(n_rows)
    col_effects = np.zeros(n_cols)
    global_effect = 0.0

    # Nothing to polish: medians of empty slices would only yield NaN and
    # RuntimeWarnings, so return the trivial (all-zero) decomposition.
    if residuals.size == 0:
        return residuals, row_effects, col_effects, global_effect, True

    # Sum of absolute residuals (SAR) drives the stopping rule.
    prev_sar = np.sum(np.abs(residuals))
    converged = False

    for _ in range(max_iter):
        # Row sweep: move each row's median out of the residuals.
        row_medians = np.median(residuals, axis=1)
        residuals -= row_medians[:, np.newaxis]
        row_effects += row_medians

        # Keep column effects centred; their median belongs to the global effect.
        col_shift = np.median(col_effects)
        col_effects -= col_shift
        global_effect += col_shift

        # Column sweep: move each column's median out of the residuals.
        col_medians = np.median(residuals, axis=0)
        residuals -= col_medians
        col_effects += col_medians

        # Keep row effects centred; their median belongs to the global effect.
        row_shift = np.median(row_effects)
        row_effects -= row_shift
        global_effect += row_shift

        current_sar = np.sum(np.abs(residuals))

        # Perfect fit, or relative SAR change below eps. Written without a
        # division so the zero-SAR case needs no separate guard.
        if current_sar == 0 or abs(current_sar - prev_sar) < eps * current_sar:
            converged = True
            break

        prev_sar = current_sar

    return residuals, row_effects, col_effects, float(global_effect), converged
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def summarize_probesets(
    adata: AnnData,
    method: str = "medpolish",
    output_level: str = "gene",
    copy: bool = True,
) -> AnnData | None:
    """Summarize probe-level data to gene-level using median polish or simple median.

    Groups probes by gene (using `.var['gene_id']`) and applies summarization
    to reduce multiple probes per gene to a single expression value per gene.

    Parameters
    ----------
    adata
        AnnData object with shape (n_samples, n_probes).
        Must contain 'gene_id' column in `.var`.
    method
        Summarization method:
        - 'medpolish': Tukey's median polish (default, used in RMA)
        - 'median': Simple median across probes
        - 'mas': Tukey biweight (used in MAS5)
    output_level
        Output granularity:
        - 'gene': Return gene-level AnnData (n_samples, n_genes)
        - 'probe': Keep probe-level, store effects in `.layers`
    copy
        If True, work on (and return) a copy of `adata`. `copy=False` is only
        valid with `output_level='probe'`; gene-level output always has a
        different shape and therefore must be a new object.

    Returns:
    -------
    AnnData or None
        If `output_level='gene'`, returns gene-level AnnData object whose
        `.var` holds 'gene_id', 'n_probes' and (median polish only) 'converged'.
        If `output_level='probe'` and `copy=True`, returns probe-level AnnData
        with summarization stored in `.layers['medpolish_effects']`
        (written only when `method='medpolish'`).
        If `output_level='probe'` and `copy=False`, modifies in place and returns None.
        Summarization metadata is stored in `.uns['summarization']`.

    Raises:
    ------
    ValueError
        If 'gene_id' column is missing from `.var`.
        If `output_level` or `method` is not one of the supported values.
        If `output_level='gene'` and `copy=False` (cannot modify shape in place).
        If no probe carries a usable gene ID.

    Examples:
    --------
    >>> # Summarize to gene level
    >>> adata_genes = ma.pp.summarize_probesets(adata, output_level="gene")
    >>> print(adata_genes.shape)  # (n_samples, n_genes)

    >>> # Keep probe level but store median polish effects
    >>> adata_probes = ma.pp.summarize_probesets(adata, output_level="probe")

    Notes:
    -----
    For median polish:
    - Each probeset forms a (n_probes_in_set, n_samples) matrix
    - Median polish decomposes: X = global + row_eff + col_eff + residuals
    - Gene expression = col_eff + global (one value per sample)
    - A UserWarning is emitted for every gene whose polish did not converge

    Probes without gene annotation (gene_id is missing or an empty string) are
    excluded from gene-level output but retained in probe-level output, where
    their 'medpolish_effects' entries stay at 0.
    """
    if "gene_id" not in adata.var.columns:
        raise ValueError(
            "AnnData.var must contain 'gene_id' column for probeset summarization. "
            "Load data with annotation_db_path to get gene annotations."
        )

    if output_level not in ("gene", "probe"):
        raise ValueError(f"output_level must be 'gene' or 'probe', got '{output_level}'")

    if output_level == "gene" and not copy:
        raise ValueError(
            "Cannot modify AnnData in place when changing from probe to gene level. "
            "Set copy=True for gene-level output."
        )

    if method not in ("medpolish", "median", "mas"):
        raise ValueError(f"method must be 'medpolish', 'median', or 'mas', got '{method}'")

    adata_work = adata.copy() if copy else adata

    # Group probe columns by gene once, instead of rebuilding a boolean mask
    # over all probes for every gene.
    unique_genes, probe_groups = _group_probes_by_gene(adata_work.var["gene_id"].values)

    n_samples = adata_work.n_obs
    n_genes = len(unique_genes)

    if n_genes == 0:
        raise ValueError("No valid gene IDs found in .var['gene_id']")

    # One flag per processed gene; used for the convergence rate in `.uns`.
    convergence_info = []

    if output_level == "gene":
        X_genes = np.zeros((n_samples, n_genes))

        for i, (gene_id, probe_indices) in enumerate(zip(unique_genes, probe_groups)):
            # Probeset intensities: (n_probes_in_set, n_samples)
            probeset_data = adata_work.X[:, probe_indices].T

            if method == "medpolish":
                gene_expr, converged = _medpolish_expression(probeset_data)
                if not converged:
                    warnings.warn(
                        f"Median polish did not converge for gene '{gene_id}' ({len(probe_indices)} probes)",
                        UserWarning,
                        stacklevel=2,
                    )
            elif method == "median":
                # Simple median across probes
                gene_expr = np.median(probeset_data, axis=0)
                converged = True
            else:  # method == "mas"
                # Tukey biweight summarization
                gene_expr, _ = tukey_biweight_summary(probeset_data)
                converged = True

            X_genes[:, i] = gene_expr
            convergence_info.append(converged)

        var_data = {
            "gene_id": list(unique_genes),
            "n_probes": [len(indices) for indices in probe_groups],
        }
        # Convergence is only meaningful for the iterative method.
        if method == "medpolish":
            var_data["converged"] = convergence_info
        var_df = pd.DataFrame(var_data)
        # AnnData requires string var names; convert explicitly rather than
        # relying on its implicit (warning-emitting) conversion. The resulting
        # names ("0", "1", ...) are the same.
        var_df.index = var_df.index.astype(str)

        adata_out = AnnData(
            X=X_genes,
            obs=adata_work.obs.copy(),
            var=var_df,
            uns=adata_work.uns.copy(),
        )

        adata_out.uns["summarization"] = {
            "method": method,
            "output_level": output_level,
            "n_genes": n_genes,
            "n_probes_original": adata_work.n_vars,
            "convergence_rate": sum(convergence_info) / len(convergence_info) if convergence_info else 1.0,
        }

        return adata_out

    # output_level == "probe": keep the probe-level matrix untouched and, for
    # median polish, store the per-gene expression on every probe of that gene.
    summary = {
        "method": method,
        "output_level": output_level,
        "n_genes": n_genes,
    }

    if method == "medpolish":
        # Effects are real-valued; never inherit an integer dtype from X,
        # which would silently truncate them.
        x_dtype = adata_work.X.dtype
        effects_dtype = x_dtype if np.issubdtype(x_dtype, np.floating) else np.float64
        effects = np.zeros(adata_work.shape, dtype=effects_dtype)

        for gene_id, probe_indices in zip(unique_genes, probe_groups):
            probeset_data = adata_work.X[:, probe_indices].T
            gene_expr, converged = _medpolish_expression(probeset_data)
            if not converged:
                warnings.warn(
                    f"Median polish did not converge for gene '{gene_id}' ({len(probe_indices)} probes)",
                    UserWarning,
                    stacklevel=2,
                )
            convergence_info.append(converged)
            # Broadcast the per-sample expression to all probes of the gene.
            effects[:, probe_indices] = np.asarray(gene_expr)[:, np.newaxis]

        adata_work.layers["medpolish_effects"] = effects
        summary["convergence_rate"] = sum(convergence_info) / len(convergence_info)

    adata_work.uns["summarization"] = summary

    return adata_work if copy else None


def _group_probes_by_gene(gene_ids) -> tuple[list, list[np.ndarray]]:
    """Return annotated gene IDs (first-appearance order) and, per gene, its ascending probe indices."""
    ids = pd.Series(np.asarray(gene_ids, dtype=object))
    # Empty strings count as "no annotation", exactly like NaN/None.
    is_blank = ids.map(lambda value: isinstance(value, str) and value == "").astype(bool)
    # factorize assigns code -1 to missing values and numbers the remaining
    # IDs in order of first appearance.
    codes, uniques = pd.factorize(ids.mask(is_blank))
    if len(uniques) == 0:
        return [], []

    # A stable sort keeps probe indices ascending within each gene; the
    # unannotated probes (code -1) sort first and are dropped.
    order = np.argsort(codes, kind="stable")
    order = order[np.count_nonzero(codes < 0):]
    counts = np.bincount(codes[codes >= 0], minlength=len(uniques))
    return list(uniques), np.split(order, np.cumsum(counts)[:-1])


def _medpolish_expression(probeset_data: np.ndarray) -> tuple[np.ndarray, bool]:
    """Return (per-sample expression, converged) for one (n_probes, n_samples) probeset."""
    if probeset_data.shape[0] == 1:
        # Single probe: no summarization needed.
        return probeset_data[0, :], True
    _, _, col_eff, global_eff, converged = median_polish(probeset_data, eps=0.01, max_iter=10)
    # Gene expression = column effects + global effect
    return col_eff + global_eff, converged
|
microarray/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Tools for microarray data analysis.
|
|
2
|
+
|
|
3
|
+
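This module provides analytical tools for microarray data, including:
- Dimension reduction (MDS, PCA)
- Biomart-based feature annotation
- Linear model fitting and empirical Bayes utilities (``lm_fit``, ``ebayes``, ``squeeze_var``)
- Scoring and top-table helpers (``score``, ``top_table``)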
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from microarray.tools._biomart import BiomartDataset, annotate
|
|
9
|
+
from microarray.tools._empirical_bayes import ebayes, squeeze_var
|
|
10
|
+
from microarray.tools._linear_models import lm_fit
|
|
11
|
+
from microarray.tools._mds import mds
|
|
12
|
+
from microarray.tools._pca import pca
|
|
13
|
+
from microarray.tools._score import score
|
|
14
|
+
from microarray.tools._toptable import top_table
|
|
15
|
+
|
|
16
|
+
# Public API of the tools package: every name listed here is imported above
# from one of the private ``microarray.tools._*`` submodules.
__all__ = [
    "mds",
    "pca",
    "BiomartDataset",
    "annotate",
    "ebayes",
    "squeeze_var",
    "lm_fit",
    "score",
    "top_table",
]
|