microarray 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. microarray/__init__.py +15 -0
  2. microarray/_version.py +3 -0
  3. microarray/datasets/__init__.py +3 -0
  4. microarray/datasets/_arrayexpress.py +1 -0
  5. microarray/datasets/_cdf_files.py +35 -0
  6. microarray/datasets/_geo.py +1 -0
  7. microarray/datasets/_utils.py +143 -0
  8. microarray/io/__init__.py +17 -0
  9. microarray/io/_anndata_converter.py +198 -0
  10. microarray/io/_cdf.py +575 -0
  11. microarray/io/_cel.py +591 -0
  12. microarray/io/_read.py +127 -0
  13. microarray/plotting/__init__.py +28 -0
  14. microarray/plotting/_base.py +253 -0
  15. microarray/plotting/_cel.py +75 -0
  16. microarray/plotting/_de_plots.py +239 -0
  17. microarray/plotting/_diagnostic_plots.py +268 -0
  18. microarray/plotting/_heatmap.py +279 -0
  19. microarray/plotting/_ma_plots.py +136 -0
  20. microarray/plotting/_pca.py +320 -0
  21. microarray/plotting/_qc_plots.py +335 -0
  22. microarray/plotting/_score.py +38 -0
  23. microarray/plotting/_top_table_heatmap.py +98 -0
  24. microarray/plotting/_utils.py +280 -0
  25. microarray/preprocessing/__init__.py +39 -0
  26. microarray/preprocessing/_background.py +862 -0
  27. microarray/preprocessing/_log2.py +77 -0
  28. microarray/preprocessing/_normalize.py +1292 -0
  29. microarray/preprocessing/_rma.py +243 -0
  30. microarray/preprocessing/_robust.py +170 -0
  31. microarray/preprocessing/_summarize.py +318 -0
  32. microarray/py.typed +0 -0
  33. microarray/tools/__init__.py +26 -0
  34. microarray/tools/_biomart.py +416 -0
  35. microarray/tools/_empirical_bayes.py +401 -0
  36. microarray/tools/_fdist.py +171 -0
  37. microarray/tools/_linear_models.py +387 -0
  38. microarray/tools/_mds.py +101 -0
  39. microarray/tools/_pca.py +88 -0
  40. microarray/tools/_score.py +86 -0
  41. microarray/tools/_toptable.py +360 -0
  42. microarray-0.1.0.dist-info/METADATA +75 -0
  43. microarray-0.1.0.dist-info/RECORD +44 -0
  44. microarray-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,318 @@
1
+ """Probeset summarization methods including median polish."""
2
+
3
+ import warnings
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from anndata import AnnData
8
+
9
+ from ._robust import tukey_biweight_summary
10
+
11
+
12
def median_polish(
    X: np.ndarray,
    eps: float = 0.01,
    max_iter: int = 10,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, float, bool]:
    """Apply Tukey's median polish algorithm to a matrix.

    Median polish is a robust method for additive decomposition of a matrix:
        X = global_effect + row_effects + col_effects + residuals

    Each iteration:
    1. Subtracts row medians from each row and adds them to the row effects
    2. Moves the median of the column effects into the global effect
    3. Subtracts column medians from each column and adds them to the
       column effects
    4. Moves the median of the row effects into the global effect
    5. Stops if the sum of absolute residuals has converged

    The additive identity above holds after every step, whether or not the
    algorithm converges.

    Parameters
    ----------
    X
        Input matrix of shape (n_rows, n_cols) to decompose. Any 2-D
        array-like that converts to a float array is accepted; the input is
        never modified.
    eps
        Convergence threshold. Algorithm stops when relative change in
        sum of absolute residuals is less than eps. The first iteration is
        compared against the sum of absolute values of `X` itself.
    max_iter
        Maximum number of iterations.

    Returns:
    -------
    residuals
        Residual matrix after removing effects, shape (n_rows, n_cols).
    row_effects
        Effect for each row, shape (n_rows,).
    col_effects
        Effect for each column, shape (n_cols,).
    global_effect
        Global effect (scalar).
    converged
        True if the sum of absolute residuals reached exactly zero or its
        relative change dropped below `eps` within `max_iter` iterations.

    Raises:
    ------
    ValueError
        If `X` is not two-dimensional.

    Examples:
    --------
    >>> X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    >>> res, row_eff, col_eff, global_eff, converged = median_polish(X)

    Notes:
    -----
    For RMA summarization, the column effects + global effect represent
    the expression value for each sample.

    Reference:
        Tukey, J.W. (1977). Exploratory Data Analysis. Addison-Wesley.
    """
    # np.array always copies, so the caller's data is left untouched, and it
    # converts nested sequences as well as ndarrays in a single pass.
    residuals = np.array(X, dtype=float)
    if residuals.ndim != 2:
        raise ValueError(f"X must be a 2-D matrix, got {residuals.ndim}-D input")

    n_rows, n_cols = residuals.shape
    row_effects = np.zeros(n_rows)
    col_effects = np.zeros(n_cols)
    global_effect = 0.0

    # Track convergence via the sum of absolute residuals (SAR)
    prev_sar = np.sum(np.abs(residuals))
    converged = False

    for _iteration in range(max_iter):
        # Row sweep: subtract row medians
        row_medians = np.median(residuals, axis=1)
        residuals -= row_medians[:, np.newaxis]
        row_effects += row_medians

        # Re-centre column effects; their median belongs to the global effect
        col_median = np.median(col_effects)
        col_effects -= col_median
        global_effect += col_median

        # Column sweep: subtract column medians
        col_medians = np.median(residuals, axis=0)
        residuals -= col_medians
        col_effects += col_medians

        # Re-centre row effects; their median belongs to the global effect
        row_median = np.median(row_effects)
        row_effects -= row_median
        global_effect += row_median

        current_sar = np.sum(np.abs(residuals))

        # A perfect fit is converged; checked first to avoid dividing by zero
        if current_sar == 0:
            converged = True
            break

        relative_change = abs(current_sar - prev_sar) / current_sar
        if relative_change < eps:
            converged = True
            break

        prev_sar = current_sar

    return residuals, row_effects, col_effects, global_effect, converged
111
+
112
+
113
def _group_probes_by_gene(gene_ids) -> list[tuple[object, np.ndarray]]:
    """Pair every annotated gene ID with the column positions of its probes.

    Genes come back in order of first appearance and each index array is
    ascending. Probes whose gene ID is missing (None/NaN) or an empty string
    are left out. A single factorize-and-sort pass replaces one full boolean
    scan of `gene_ids` per gene.
    """
    codes, uniques = pd.factorize(gene_ids)
    # A stable sort groups equal codes while keeping probe order; the
    # missing-value sentinel (-1) sorts first and is sliced away.
    order = np.argsort(codes, kind="stable")
    order = order[np.count_nonzero(codes < 0):]
    counts = np.bincount(codes[codes >= 0], minlength=len(uniques))
    index_blocks = np.split(order, np.cumsum(counts)[:-1])
    return [
        (gene_id, probe_indices)
        for gene_id, probe_indices in zip(uniques, index_blocks)
        if not (isinstance(gene_id, str) and gene_id == "")
    ]


def summarize_probesets(
    adata: AnnData,
    method: str = "medpolish",
    output_level: str = "gene",
    copy: bool = True,
) -> AnnData | None:
    """Summarize probe-level data to gene-level using median polish or simple median.

    Groups probes by gene (using `.var['gene_id']`) and applies summarization
    to reduce multiple probes per gene to a single expression value per gene.

    Parameters
    ----------
    adata
        AnnData object with shape (n_samples, n_probes).
        Must contain 'gene_id' column in `.var`.
    method
        Summarization method:
        - 'medpolish': Tukey's median polish (default, used in RMA)
        - 'median': Simple median across probes
        - 'mas': Tukey biweight (used in MAS5)
    output_level
        Output granularity:
        - 'gene': Return gene-level AnnData (n_samples, n_genes)
        - 'probe': Keep probe-level, store effects in `.layers`
    copy
        If True, work on (and return) a copy of `adata`. `copy=False` is only
        valid with `output_level='probe'`.

    Returns:
    -------
    AnnData or None
        If `output_level='gene'`, returns a new gene-level AnnData object whose
        `.var` holds `gene_id`, `n_probes` and, for 'medpolish', `converged`.
        If `output_level='probe'` and `copy=True`, returns probe-level AnnData.
        If `output_level='probe'` and `copy=False`, modifies in place and returns None.
        At probe level, `.layers['medpolish_effects']` is written only when
        `method='medpolish'`; other methods only record metadata.
        Summarization metadata is stored in `.uns['summarization']`.

    Raises:
    ------
    ValueError
        If 'gene_id' column is missing from `.var`.
        If `output_level` or `method` is not one of the supported values.
        If `output_level='gene'` and `copy=False` (cannot modify shape in place).
        If no probe carries a usable gene ID.

    Warns:
    -----
    UserWarning
        If median polish does not converge for a probeset.

    Examples:
    --------
    >>> # Summarize to gene level
    >>> adata_genes = ma.pp.summarize_probesets(adata, output_level="gene")
    >>> print(adata_genes.shape)  # (n_samples, n_genes)

    >>> # Keep probe level but store median polish effects
    >>> adata_probes = ma.pp.summarize_probesets(adata, output_level="probe")

    Notes:
    -----
    For median polish:
    - Each probeset forms a (n_probes_in_set, n_samples) matrix
    - Median polish decomposes: X = global + row_eff + col_eff + residuals
    - Gene expression = col_eff + global (one value per sample)
    - A probeset with a single probe is passed through unchanged

    Probes without gene annotation (gene_id is None, NaN or an empty string)
    are excluded from gene-level output. In probe-level output they are
    retained, with their `medpolish_effects` entries left at zero.
    """
    if "gene_id" not in adata.var.columns:
        raise ValueError(
            "AnnData.var must contain 'gene_id' column for probeset summarization. "
            "Load data with annotation_db_path to get gene annotations."
        )

    if output_level not in ["gene", "probe"]:
        raise ValueError(f"output_level must be 'gene' or 'probe', got '{output_level}'")

    if output_level == "gene" and not copy:
        raise ValueError(
            "Cannot modify AnnData in place when changing from probe to gene level. "
            "Set copy=True for gene-level output."
        )

    if method not in ["medpolish", "median", "mas"]:
        raise ValueError(f"method must be 'medpolish', 'median', or 'mas', got '{method}'")

    adata_work = adata.copy() if copy else adata

    # Probe-to-gene mapping: [(gene_id, probe column indices), ...]
    probesets = _group_probes_by_gene(adata_work.var["gene_id"].values)
    n_genes = len(probesets)

    if n_genes == 0:
        raise ValueError("No valid gene IDs found in .var['gene_id']")

    if output_level == "gene":
        X_genes = np.zeros((adata_work.n_obs, n_genes))
        n_probes_per_gene = []
        convergence_info = []

        for i, (gene_id, probe_indices) in enumerate(probesets):
            n_probes_in_set = len(probe_indices)
            # Probeset intensities, transposed to (n_probes, n_samples)
            probeset_data = adata_work.X[:, probe_indices].T
            # Only median polish is iterative; everything else always "converges"
            converged = True

            if method == "medpolish":
                if n_probes_in_set == 1:
                    # Single probe: no summarization needed
                    gene_expr = probeset_data[0, :]
                else:
                    _, _, col_eff, global_eff, converged = median_polish(
                        probeset_data, eps=0.01, max_iter=10
                    )
                    # Gene expression = column effects + global effect
                    gene_expr = col_eff + global_eff

                if not converged:
                    warnings.warn(
                        f"Median polish did not converge for gene '{gene_id}' ({n_probes_in_set} probes)",
                        UserWarning,
                        stacklevel=2,
                    )
            elif method == "median":
                gene_expr = np.median(probeset_data, axis=0)
            else:  # method == "mas": Tukey biweight summarization
                gene_expr, _ = tukey_biweight_summary(probeset_data)

            X_genes[:, i] = gene_expr
            n_probes_per_gene.append(n_probes_in_set)
            convergence_info.append(converged)

        var_data = {
            "gene_id": [gene_id for gene_id, _ in probesets],
            "n_probes": n_probes_per_gene,
        }
        # The convergence column is only meaningful for the iterative method
        if method == "medpolish":
            var_data["converged"] = convergence_info

        # NOTE(review): `var` keeps pandas' default integer index here, so the
        # output's var_names are presumably positional rather than gene IDs -
        # confirm whether that is intended.
        adata_out = AnnData(
            X=X_genes,
            obs=adata_work.obs.copy(),
            var=pd.DataFrame(var_data),
            uns=adata_work.uns.copy(),
        )

        adata_out.uns["summarization"] = {
            "method": method,
            "output_level": output_level,
            "n_genes": n_genes,
            "n_probes_original": adata_work.n_vars,
            "convergence_rate": sum(convergence_info) / len(convergence_info) if convergence_info else 1.0,
        }

        return adata_out

    # output_level == 'probe': keep the probe axis, optionally store effects
    if method == "medpolish":
        # Always float: inheriting an integer dtype from X would silently
        # truncate the fitted effects.
        effects = np.zeros(adata_work.X.shape, dtype=float)

        for gene_id, probe_indices in probesets:
            n_probes_in_set = len(probe_indices)
            probeset_data = adata_work.X[:, probe_indices].T

            if n_probes_in_set == 1:
                # Single probe: effect = original value
                effects[:, probe_indices[0]] = probeset_data[0, :]
                continue

            _, _, col_eff, global_eff, converged = median_polish(
                probeset_data, eps=0.01, max_iter=10
            )
            if not converged:
                warnings.warn(
                    f"Median polish did not converge for gene '{gene_id}' ({n_probes_in_set} probes)",
                    UserWarning,
                    stacklevel=2,
                )
            # Every probe of the set receives the same per-sample value
            effects[:, probe_indices] = (col_eff + global_eff)[:, np.newaxis]

        adata_work.layers["medpolish_effects"] = effects

    adata_work.uns["summarization"] = {
        "method": method,
        "output_level": output_level,
        "n_genes": n_genes,
    }

    return adata_work if copy else None
microarray/py.typed ADDED
File without changes
@@ -0,0 +1,26 @@
"""Tools for microarray data analysis.

This module re-exports the package's analytical tools, including:
- Dimension reduction (`mds`, `pca`)
- Biomart-based feature annotation (`BiomartDataset`, `annotate`)
- Linear model fitting (`lm_fit`)
- Empirical Bayes utilities (`ebayes`, `squeeze_var`)
- Scoring (`score`) and top-table generation (`top_table`)
"""

from microarray.tools._biomart import BiomartDataset, annotate
from microarray.tools._empirical_bayes import ebayes, squeeze_var
from microarray.tools._linear_models import lm_fit
from microarray.tools._mds import mds
from microarray.tools._pca import pca
from microarray.tools._score import score
from microarray.tools._toptable import top_table

# Names exported by `from microarray.tools import *`.
__all__ = [
    "mds",
    "pca",
    "BiomartDataset",
    "annotate",
    "ebayes",
    "squeeze_var",
    "lm_fit",
    "score",
    "top_table",
]