sonata-learn 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonata/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ """
2
+ Sonata: a non-negative matrix factorization toolkit for signature analysis
3
+ ==========================================================================
4
+ """
5
+
6
+ from . import models
7
+ from . import plot as pl
8
+ from . import tools as tl
9
+
10
+ __version__ = "0.1.0"
11
+
12
+ pl.set_sonata_style()
13
+
14
+ __all__ = [
15
+ "__version__",
16
+ "models",
17
+ "pl",
18
+ "tl",
19
+ ]
sonata/consts.py ADDED
@@ -0,0 +1,95 @@
1
+ from matplotlib.colors import LinearSegmentedColormap
2
+
3
# The four DNA bases, in the order used for the flanking context.
NUCLEOTIDES = list("ACGT")

# The six single base substitution classes.
SBS_TYPES_6 = ["C>A", "C>G", "C>T", "T>A", "T>C", "T>G"]

# All 96 trinucleotide mutation types, e.g. "A[C>A]A": the substitution
# class varies slowest, then the 5' base, then the 3' base.
SBS_TYPES_96 = [
    f"{five_prime}[{substitution}]{three_prime}"
    for substitution in SBS_TYPES_6
    for five_prime in NUCLEOTIDES
    for three_prime in NUCLEOTIDES
]
12
+
13
# The 83 indel mutation types, grouped by category.
# fmt: off
INDEL_TYPES_83 = [
    # 1bp deletions of C / T
    "DEL.C.1.1", "DEL.C.1.2", "DEL.C.1.3",
    "DEL.C.1.4", "DEL.C.1.5", "DEL.C.1.6+",
    "DEL.T.1.1", "DEL.T.1.2", "DEL.T.1.3",
    "DEL.T.1.4", "DEL.T.1.5", "DEL.T.1.6+",
    # 1bp insertions of C / T
    "INS.C.1.0", "INS.C.1.1", "INS.C.1.2",
    "INS.C.1.3", "INS.C.1.4", "INS.C.1.5+",
    "INS.T.1.0", "INS.T.1.1", "INS.T.1.2",
    "INS.T.1.3", "INS.T.1.4", "INS.T.1.5+",
    # deletions at repeats (2bp, 3bp, 4bp, 5+bp)
    "DEL.repeats.2.1", "DEL.repeats.2.2", "DEL.repeats.2.3", "DEL.repeats.2.4", "DEL.repeats.2.5", "DEL.repeats.2.6+",
    "DEL.repeats.3.1", "DEL.repeats.3.2", "DEL.repeats.3.3", "DEL.repeats.3.4", "DEL.repeats.3.5", "DEL.repeats.3.6+",
    "DEL.repeats.4.1", "DEL.repeats.4.2", "DEL.repeats.4.3", "DEL.repeats.4.4", "DEL.repeats.4.5", "DEL.repeats.4.6+",
    "DEL.repeats.5+.1", "DEL.repeats.5+.2", "DEL.repeats.5+.3", "DEL.repeats.5+.4", "DEL.repeats.5+.5", "DEL.repeats.5+.6+",
    # insertions at repeats (2bp, 3bp, 4bp, 5+bp)
    "INS.repeats.2.0", "INS.repeats.2.1", "INS.repeats.2.2", "INS.repeats.2.3", "INS.repeats.2.4", "INS.repeats.2.5+",
    "INS.repeats.3.0", "INS.repeats.3.1", "INS.repeats.3.2", "INS.repeats.3.3", "INS.repeats.3.4", "INS.repeats.3.5+",
    "INS.repeats.4.0", "INS.repeats.4.1", "INS.repeats.4.2", "INS.repeats.4.3", "INS.repeats.4.4", "INS.repeats.4.5+",
    "INS.repeats.5+.0", "INS.repeats.5+.1", "INS.repeats.5+.2", "INS.repeats.5+.3", "INS.repeats.5+.4", "INS.repeats.5+.5+",
    # deletions with microhomology (2bp, 3bp, 4bp, 5+bp)
    "DEL.MH.2.1",
    "DEL.MH.3.1", "DEL.MH.3.2",
    "DEL.MH.4.1", "DEL.MH.4.2", "DEL.MH.4.3",
    "DEL.MH.5+.1", "DEL.MH.5+.2", "DEL.MH.5+.3", "DEL.MH.5+.4", "DEL.MH.5+.5+",
]
# fmt: on
41
+
42
# General purpose palette of 10 RGB colors.
COLORS_MATHEMATICA = [
    (0.368417, 0.506779, 0.709798),
    (0.880722, 0.611041, 0.142051),
    (0.560181, 0.691569, 0.194885),
    (0.922526, 0.385626, 0.209179),
    (0.528288, 0.470624, 0.701351),
    (0.772079, 0.431554, 0.102387),
    (0.363898, 0.618501, 0.782349),
    (1.0, 0.75, 0.0),
    (0.280264, 0.715, 0.429209),
    (0.0, 0.0, 0.0),
]

# One color per substitution class of the 96 dimensional mutation
# spectrum (6 entries, mixing hex strings and RGB tuples).
COLORS_TRINUCLEOTIDES = [
    "#427aa1ff",
    (0.0, 0.0, 0.0),
    "#d1664aff",
    (0.78, 0.78, 0.78),
    "#64b3aaff",
    (0.89, 0.67, 0.72),
]

# Each of the 6 colors is repeated for 16 consecutive mutation types.
COLORS_SBS96 = [color for color in COLORS_TRINUCLEOTIDES for _ in range(16)]
67
+
68
# Three-stop diverging colormap: blue -> near-white -> orange-red.
DIVERGING_PALETTE = LinearSegmentedColormap.from_list(
    name="sonata_diverging",
    colors=["#427aa1ff", "#FAFAFA", "#e07a5fff"],
)
72
+
73
# One color per indel category (16 categories).
COLORS_INDEL = [
    "#FCBD6F",  # 1bp Del C
    "#FD8001",  # 1bp Del T
    "#B0DC8B",  # 1bp Ins C
    "#35A02E",  # 1bp Ins T
    "#FCC9B4",  # 2bp Del Repeats
    "#FC896B",  # 3bp Del Repeats
    "#F04432",  # 4bp Del Repeats
    "#BC1A1A",  # 5+ bp Del Repeats
    "#CFE0F0",  # 2bp Ins Repeats
    "#94C3DF",  # 3bp Ins Repeats
    "#4A98C8",  # 4bp Ins Repeats
    "#1665AA",  # 5+ bp Ins Repeats
    "#E1E0ED",  # 2bp Del MH
    "#B5B5D8",  # 3bp Del MH
    "#8683BC",  # 4bp Del MH
    "#624099",  # 5+bp Del MH
]

# Number of mutation types per category: 12 * 6 + (1 + 2 + 3 + 5) = 83.
n_times = 12 * [6] + [1, 2, 3, 5]

# Repeat each category color once per mutation type of that category.
COLORS_INDEL83 = [
    color for count, color in zip(n_times, COLORS_INDEL) for _ in range(count)
]
@@ -0,0 +1 @@
1
+ """"""
@@ -0,0 +1,382 @@
1
+ """
2
+ Initialization methods for non-negative matrix factorization (NMF) models.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any
8
+
9
+ import anndata as ad
10
+ import numpy as np
11
+
12
+ from ..utils import (
13
+ dict_checker,
14
+ normalize_WH,
15
+ shape_checker,
16
+ type_checker,
17
+ value_checker,
18
+ )
19
+ from .methods import (
20
+ _INIT_METHODS,
21
+ _Init_methods,
22
+ init_custom,
23
+ init_flat,
24
+ init_nndsvd,
25
+ init_random,
26
+ init_separableNMF,
27
+ )
28
+
29
# Lower bound applied to the initialized matrices (float32 machine epsilon).
EPSILON = np.finfo(np.float32).eps

# Keys accepted in the 'given_parameters' dictionaries.
GIVEN_PARAMETERS_STANDARD_NMF = ["asignatures"]
GIVEN_PARAMETERS_CORNET = [
    *GIVEN_PARAMETERS_STANDARD_NMF,
    "signature_offsets",
    "sample_offsets",
    "signature_embeddings",
    "sample_embeddings",
    "variance",
]
41
+
42
+
43
def initialize_mat(
    data_mat: np.ndarray,
    n_signatures: int,
    method: _Init_methods = "nndsvd",
    given_signatures_mat: np.ndarray | None = None,
    **kwargs,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Initialize the signature and exposure matrices.

    Inputs
    ------
    data_mat : np.ndarray
        shape (n_samples, n_features)

    n_signatures : int

    method : str
        initialization method. One of 'custom', 'flat',
        'nndsvd', 'nndsvda', 'nndsvdar', 'random', 'separableNMF'.

    given_signatures_mat : np.ndarray, optional
        At most 'n_signatures' many signatures can be provided to
        overwrite the first rows of the initialized signature matrix.
        The matrix is not modified, and neither are the matrices passed
        in via 'kwargs' for the 'custom' method.

    kwargs : dict
        Any keyword arguments to be passed to the initialization method.
        This includes, for example, a possible 'seed' keyword argument
        for all stochastic methods.

    Returns
    -------
    signatures_mat : np.ndarray
        shape (n_signatures, n_features)

    exposures_mat : np.ndarray
        shape (n_samples, n_signatures)

    Raises
    ------
    ValueError
        If 'given_signatures_mat' is not two-dimensional, has a different
        number of features than 'data_mat', or contains more than
        'n_signatures' signatures.
    """
    value_checker("method", method, _INIT_METHODS)

    if method == "custom":
        matrices = init_custom(data_mat, n_signatures, **kwargs)
    elif method == "flat":
        matrices = init_flat(data_mat, n_signatures)
    elif method in ["nndsvd", "nndsvda", "nndsvdar"]:
        # mypy does not recognize that 'method' is compatible
        # with Literal["nndsvd", "nndsvda", "nndsvdar"]
        matrices = init_nndsvd(
            data_mat, n_signatures, method=method, **kwargs  # type: ignore[arg-type] # noqa: E501
        )
    elif method == "random":
        matrices = init_random(data_mat, n_signatures, **kwargs)
    else:
        matrices = init_separableNMF(data_mat, n_signatures, **kwargs)

    signatures_mat, exposures_mat = matrices

    if given_signatures_mat is not None:
        type_checker("given_signatures_mat", given_signatures_mat, np.ndarray)

        # Fail with a clear message instead of a tuple-unpacking error.
        if given_signatures_mat.ndim != 2:
            raise ValueError("The given signature matrix has to be two-dimensional.")

        given_n_signatures, given_n_features = given_signatures_mat.shape

        if given_n_features != data_mat.shape[1]:
            raise ValueError(
                "The given signature matrix has a different number of features "
                "than the data."
            )
        if given_n_signatures > n_signatures:
            raise ValueError("The given signature matrix contains too many signatures.")

        # For method='custom' the initialized matrix is the caller's own
        # array, so overwrite rows of a float copy: this neither mutates the
        # caller's data nor truncates the given signatures to integers.
        signatures_mat = signatures_mat.astype(float)
        signatures_mat[:given_n_signatures, :] = given_signatures_mat

    # normalize_WH works on the transposed (features x signatures) layout.
    W, H = normalize_WH(signatures_mat.T, exposures_mat.T)
    # Bound all entries from below by EPSILON.
    W, H = W.clip(EPSILON), H.clip(EPSILON)
    signatures_mat, exposures_mat = W.T, H.T
    return signatures_mat, exposures_mat
119
+
120
+
121
def check_given_asignatures(
    given_asignatures: ad.AnnData, adata: ad.AnnData, n_signatures: int
) -> None:
    """
    Validate user-provided signatures against the input data.

    The given signatures must be annotated on exactly the same features
    (same number, same names, same order) as the data, and there may be
    at most 'n_signatures' of them.

    Inputs
    ------
    given_asignatures: AnnData
        Known signatures that should be fixed by the algorithm.

    adata: ad.AnnData
        Input data.

    n_signatures: int
        The number of signatures to initialize.

    Raises
    ------
    ValueError
        If the feature counts differ, the feature names differ, or more
        than 'n_signatures' signatures are given.
    """
    type_checker("given_asignatures", given_asignatures, ad.AnnData)

    same_n_features = given_asignatures.n_vars == adata.n_vars
    if not same_n_features:
        raise ValueError(
            "The given signatures have a different number of features than the data."
        )

    # Elementwise comparison of the feature names; lengths match at this point.
    names_match = given_asignatures.var_names == adata.var_names
    if not names_match.all():
        raise ValueError(
            "The features of the given signatures and the data are not identical."
        )

    if n_signatures < given_asignatures.n_obs:
        raise ValueError(
            "The number of given signatures exceeds "
            "the number of signatures to initialize."
        )
155
+
156
+
157
def initialize_base(
    adata: ad.AnnData,
    n_signatures: int,
    method: _Init_methods = "nndsvd",
    given_asignatures: ad.AnnData | None = None,
    **kwargs,
) -> tuple[ad.AnnData, np.ndarray]:
    """
    Build the initial signature anndata object and exposure matrix.
    'adata' itself is only read; the exposures are returned rather than
    stored on it.

    Inputs
    ------
    adata : ad.AnnData

    n_signatures : int

    method : str
        initialization method. One of 'custom', 'flat',
        'nndsvd', 'nndsvda', 'nndsvdar', 'random', 'separableNMF'.

    given_asignatures : ad.AnnData, optional
        At most 'n_signatures' many signatures can be provided to
        overwrite some of the initialized signatures. This does not
        change the initialized exposures.

    kwargs : dict
        Any keyword arguments to be passed to the initialization method.
        This includes, for example, a possible 'seed' keyword argument
        for all stochastic methods.

    Returns
    -------
    asignatures : ad.AnnData
        Annotated signature matrix of shape (n_signatures, n_features)

    exposures_mat : np.ndarray
        shape (n_samples, n_signatures)
    """
    given_signatures_mat = None
    if given_asignatures is not None:
        check_given_asignatures(given_asignatures, adata, n_signatures)
        given_signatures_mat = given_asignatures.X

    signatures_mat, exposures_mat = initialize_mat(
        adata.X, n_signatures, method, given_signatures_mat, **kwargs
    )

    asignatures = ad.AnnData(signatures_mat)
    asignatures.var_names = adata.var_names
    asignatures.obs_names = [f"Sig{k+1}" for k in range(n_signatures)]

    if given_asignatures is None:
        return asignatures, exposures_mat

    # Keep the annotations of the given signatures: they replace the first
    # rows. Rolling the default names makes the remaining rows start at
    # "Sig1" again.
    n_given = given_asignatures.n_obs
    asignatures.obs_names = np.roll(asignatures.obs_names, n_given)
    remaining_asignatures = asignatures[n_given:, :]
    asignatures = ad.concat([given_asignatures, remaining_asignatures], join="outer")

    return asignatures, exposures_mat
218
+
219
+
220
def check_given_parameters_standard_nmf(
    adata: ad.AnnData,
    n_signatures: int,
    given_parameters: dict[str, Any],
) -> None:
    """
    Validate the user-provided parameters of a standard NMF model.

    The dictionary is checked against GIVEN_PARAMETERS_STANDARD_NMF via
    'dict_checker'; given signatures, if present, are checked for
    compatibility with the data and the number of signatures.
    """
    dict_checker("given_parameters", given_parameters, GIVEN_PARAMETERS_STANDARD_NMF)

    if "asignatures" not in given_parameters:
        return

    check_given_asignatures(given_parameters["asignatures"], adata, n_signatures)
229
+
230
+
231
def initialize_standard_nmf(
    adata: ad.AnnData,
    n_signatures: int,
    method: _Init_methods = "nndsvd",
    given_parameters: dict[str, Any] | None = None,
    **kwargs,
) -> ad.AnnData:
    """
    Initialize the parameters of a standard NMF model.

    The initial exposures are written to adata.obsm["exposures"] and the
    initial signatures are returned as an anndata object.

    Inputs
    ------
    adata : ad.AnnData

    n_signatures : int

    method : str
        initialization method, forwarded to 'initialize_base'.

    given_parameters : dict, optional
        May contain the key 'asignatures' with signatures to be used
        instead of some of the initialized ones.

    kwargs : dict
        Any keyword arguments to be passed to the initialization method.

    Returns
    -------
    asignatures : ad.AnnData
    """
    # Work on a shallow copy so that the caller's dictionary is left alone.
    parameters = {} if given_parameters is None else given_parameters.copy()
    check_given_parameters_standard_nmf(adata, n_signatures, parameters)

    given_asignatures = (
        parameters["asignatures"] if "asignatures" in parameters else None
    )

    asignatures, exposures_mat = initialize_base(
        adata, n_signatures, method, given_asignatures, **kwargs
    )
    adata.obsm["exposures"] = exposures_mat
    return asignatures
255
+
256
+
257
def check_given_offsets_cornet(
    given_offsets: np.ndarray, n_offsets_expected: int, name: str
) -> None:
    """
    Check that the given sample or signature offsets are a numpy array
    of shape (n_offsets_expected,). 'name' is the label handed to the
    checkers.
    """
    expected_shape = (n_offsets_expected,)
    type_checker(name, given_offsets, np.ndarray)
    shape_checker(name, given_offsets, expected_shape)
265
+
266
+
267
def check_given_embeddings_cornet(
    given_embeddings: np.ndarray,
    n_embeddings_expected: int,
    dim_embeddings_expected: int,
    name: str,
) -> None:
    """
    Check that the given sample or signature embeddings are a numpy array
    of shape (n_embeddings_expected, dim_embeddings_expected). 'name' is
    the label handed to the checkers.
    """
    expected_shape = (n_embeddings_expected, dim_embeddings_expected)
    type_checker(name, given_embeddings, np.ndarray)
    shape_checker(name, given_embeddings, expected_shape)
277
+
278
+
279
def check_given_parameters_cornet(
    adata: ad.AnnData,
    n_signatures: int,
    dim_embeddings: int,
    given_parameters: dict[str, Any],
) -> None:
    """
    Validate the user-provided parameters of a correlated NMF model.

    The dictionary is checked against GIVEN_PARAMETERS_CORNET via
    'dict_checker'. Every parameter that is present is then checked in
    the order: signatures, signature offsets, sample offsets, signature
    embeddings, sample embeddings, variance.

    Raises
    ------
    ValueError
        If the given variance is not strictly positive. The individual
        checkers are expected to raise on type or shape mismatches.
    """
    dict_checker("given_parameters", given_parameters, GIVEN_PARAMETERS_CORNET)

    if "asignatures" in given_parameters:
        check_given_asignatures(given_parameters["asignatures"], adata, n_signatures)

    # (key, expected number of entries, label used in error messages)
    offset_checks = (
        ("signature_offsets", n_signatures, "given_signature_offsets"),
        ("sample_offsets", adata.n_obs, "given_sample_offsets"),
    )
    for key, n_expected, label in offset_checks:
        if key in given_parameters:
            check_given_offsets_cornet(given_parameters[key], n_expected, label)

    embedding_checks = (
        ("signature_embeddings", n_signatures, "given_signature_embeddings"),
        ("sample_embeddings", adata.n_obs, "given_sample_embeddings"),
    )
    for key, n_expected, label in embedding_checks:
        if key in given_parameters:
            check_given_embeddings_cornet(
                given_parameters[key], n_expected, dim_embeddings, label
            )

    if "variance" not in given_parameters:
        return

    given_variance = given_parameters["variance"]
    type_checker("given_variance", given_variance, [float, int])
    if given_variance <= 0.0:
        raise ValueError("The variance has to be a positive real number.")
319
+
320
+
321
def initialize_cornet(
    adata: ad.AnnData,
    n_signatures: int,
    dim_embeddings: int,
    method: _Init_methods = "nndsvd",
    given_parameters: dict[str, Any] | None = None,
    **kwargs,
) -> tuple[ad.AnnData, float]:
    """
    Initialize the parameters of a correlated NMF model.

    Signatures are initialized via 'initialize_base'. Offsets default to
    zeros, embeddings default to draws from a standard multivariate
    normal distribution (numpy's global random state) and the variance
    defaults to 1.0. Any of these can be overridden via
    'given_parameters'.

    Sample offsets and sample embeddings are written to
    adata.obs["offsets"] and adata.obsm["embeddings"]; the signature
    counterparts are stored on the returned anndata object.

    Inputs
    ------
    adata : ad.AnnData

    n_signatures : int

    dim_embeddings : int
        Dimension of the signature and sample embeddings.

    method : str
        initialization method of the signatures. 'custom' is not
        supported.

    given_parameters : dict, optional
        Allowed keys are listed in GIVEN_PARAMETERS_CORNET.

    kwargs : dict
        Any keyword arguments to be passed to the initialization method.

    Returns
    -------
    asignatures : ad.AnnData

    variance : float

    Raises
    ------
    ValueError
        If method is 'custom'.
    """
    if method == "custom":
        raise ValueError(
            "Custom parameter initializations are currently not supported "
            "for correlated NMF."
        )

    # Work on a shallow copy so that the caller's dictionary is left alone.
    parameters = {} if given_parameters is None else given_parameters.copy()
    check_given_parameters_cornet(adata, n_signatures, dim_embeddings, parameters)

    given_asignatures = (
        parameters["asignatures"] if "asignatures" in parameters else None
    )
    asignatures, _ = initialize_base(
        adata, n_signatures, method, given_asignatures, **kwargs
    )

    def draw_embeddings(n_rows):
        # One standard normal vector of length dim_embeddings per row.
        return np.random.multivariate_normal(
            np.zeros(dim_embeddings), np.identity(dim_embeddings), size=n_rows
        )

    asignatures.obs["offsets"] = (
        parameters["signature_offsets"]
        if "signature_offsets" in parameters
        else np.zeros(n_signatures)
    )
    adata.obs["offsets"] = (
        parameters["sample_offsets"]
        if "sample_offsets" in parameters
        else np.zeros(adata.n_obs)
    )

    # Signature embeddings are drawn before sample embeddings; this order
    # determines the result for a fixed random state.
    asignatures.obsm["embeddings"] = (
        parameters["signature_embeddings"]
        if "signature_embeddings" in parameters
        else draw_embeddings(n_signatures)
    )
    adata.obsm["embeddings"] = (
        parameters["sample_embeddings"]
        if "sample_embeddings" in parameters
        else draw_embeddings(adata.n_obs)
    )

    variance = float(parameters["variance"]) if "variance" in parameters else 1.0

    return asignatures, variance
@@ -0,0 +1,135 @@
1
+ """
2
+ Initialization methods for non-negative matrix factorization (NMF)
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Literal, get_args
8
+
9
+ import numpy as np
10
+ from sklearn.decomposition import _nmf as sknmf
11
+
12
+ from ..utils import shape_checker, type_checker
13
+
14
# float32 machine epsilon
EPSILON = np.finfo(np.float32).eps

# Names of the supported initialization methods, as a type ...
_Init_methods = Literal[
    "custom", "flat", "nndsvd", "nndsvda", "nndsvdar", "random", "separableNMF"
]
# ... and as a tuple of strings for runtime validation.
_INIT_METHODS = get_args(_Init_methods)
25
+
26
+
27
def init_custom(
    data_mat: np.ndarray,
    n_signatures: int,
    signatures_mat: np.ndarray,
    exposures_mat: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Validate custom initializations of the signature and exposure
    matrices and return them unchanged.

    Inputs:
    -------
    data_mat: np.ndarray
        shape (n_samples, n_features)

    n_signatures: int

    signatures_mat: np.ndarray
        shape (n_signatures, n_features)

    exposures_mat: np.ndarray
        shape (n_samples, n_signatures)

    Returns:
    --------
    The very same 'signatures_mat' and 'exposures_mat' objects (no copy).
    """
    # Types first, for both matrices, then the shapes.
    custom_mats = (
        ("signatures_mat", signatures_mat),
        ("exposures_mat", exposures_mat),
    )
    for label, mat in custom_mats:
        type_checker(label, mat, np.ndarray)

    n_samples, n_features = data_mat.shape
    expected_shapes = ((n_signatures, n_features), (n_samples, n_signatures))
    for (label, mat), expected_shape in zip(custom_mats, expected_shapes):
        shape_checker(label, mat, expected_shape)

    return signatures_mat, exposures_mat
56
+
57
+
58
def init_flat(data_mat: np.ndarray, n_signatures: int) -> tuple[np.ndarray, np.ndarray]:
    """
    Flat initialization: every signature is the uniform distribution
    over the features, and the total count of every sample is split
    evenly among the signatures.

    Returns the signature matrix of shape (n_signatures, n_features)
    and the exposure matrix of shape (n_samples, n_signatures).
    """
    n_features = data_mat.shape[1]
    uniform_weight = 1 / n_features
    signatures_mat = np.full((n_signatures, n_features), uniform_weight)

    # Per-sample exposure to each single signature.
    share_per_signature = np.sum(data_mat, axis=1) / n_signatures
    stacked = np.repeat(share_per_signature[np.newaxis, :], n_signatures, axis=0)
    exposures_mat = stacked.T
    return signatures_mat, exposures_mat
67
+
68
+
69
def init_nndsvd(
    data_mat: np.ndarray,
    n_signatures: int,
    method: Literal["nndsvd", "nndsvda", "nndsvdar"] = "nndsvd",
    seed: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Initialize via the non-negative double singular value decomposition
    (NNDSVD) variants "nndsvd", "nndsvda" and "nndsvdar" implemented in
    scikit-learn.

    If a seed is given, numpy's global random state is seeded with it
    before scikit-learn is called.

    Returns the signature matrix followed by the exposure matrix, i.e.
    in the reverse order of what scikit-learn returns.
    """
    if seed is not None:
        np.random.seed(seed)

    # pylint: disable-next=W0212
    sklearn_factors = sknmf._initialize_nmf(data_mat, n_signatures, init=method)
    exposures_mat, signatures_mat = sklearn_factors
    return signatures_mat, exposures_mat
87
+
88
+
89
def init_random(
    data_mat: np.ndarray, n_signatures: int, seed: int | None = None
) -> tuple[np.ndarray, np.ndarray]:
    """
    Random initialization.

    Each signature is drawn from the uniform distribution on the simplex
    over the features. The exposures of each sample are drawn from the
    uniform distribution on the simplex over the signatures and scaled
    by the total count of that sample (the row sum of 'data_mat').

    If a seed is given, numpy's global random state is seeded with it.
    The signatures are drawn before the exposures.
    """
    if seed is not None:
        np.random.seed(seed)

    n_samples, n_features = data_mat.shape

    # A Dirichlet distribution with all-ones concentration is uniform
    # on the simplex.
    flat_over_features = np.ones(n_features)
    signatures_mat = np.random.dirichlet(flat_over_features, size=n_signatures)

    total_counts = np.sum(data_mat, axis=1)
    flat_over_signatures = np.ones(n_signatures)
    exposure_fractions = np.random.dirichlet(flat_over_signatures, size=n_samples)
    exposures_mat = total_counts[:, np.newaxis] * exposure_fractions

    return signatures_mat, exposures_mat
110
+
111
+
112
def init_separableNMF(
    data_mat: np.ndarray, n_signatures: int, seed: int | None = None
) -> tuple[np.ndarray, np.ndarray]:
    r"""
    This code is following Algorithm 1 from "Fast and Robust Recursive
    Algorithms for Separable Nonnegative Matrix Factorization"
    (Gillis and Vavasis, 2013), with the canonical choice of
    f(x) = \| x \|_2^2 as the strongly convex function f satisfying
    Assumption 2 from the paper.

    The normalized samples are the columns of the residual matrix. In
    every step the column with the largest squared norm is selected and
    projected out of the residual. The selected samples, normalized to
    sum to one, are the initial signatures. The exposures are
    initialized randomly via 'init_random'.

    Samples without any counts are kept as all-zero columns, so they
    cannot turn the residual into NaNs and are only ever selected once
    the residual has vanished completely.

    Inputs
    ------
    data_mat : np.ndarray
        shape (n_samples, n_features)

    n_signatures : int

    seed : int, optional
        Only affects the random initialization of the exposures.

    Returns
    -------
    signatures_mat : np.ndarray
        shape (n_signatures, n_features)

    exposures_mat : np.ndarray
        shape (n_samples, n_signatures)
    """
    signature_indices = np.empty(n_signatures, dtype=int)

    # Normalize every sample; a plain division would produce 0 / 0 = NaN
    # for samples with a total count of zero.
    sample_totals = np.sum(data_mat, axis=1)
    R = np.divide(
        data_mat.T,
        sample_totals,
        out=np.zeros(data_mat.T.shape, dtype=float),
        where=sample_totals != 0,
    )

    identity = np.identity(R.shape[0])

    for k in range(n_signatures):
        column_norms = np.sum(R**2, axis=0)
        kstar = np.argmax(column_norms)
        signature_indices[k] = kstar

        max_norm = column_norms[kstar]
        # A vanished residual has nothing left to project out; dividing
        # by the zero norm would only introduce NaNs.
        if max_norm > 0:
            u = R[:, kstar]
            R = (identity - np.outer(u, u) / max_norm) @ R

    signatures_mat = data_mat[signature_indices, :].astype(float)
    signatures_mat /= signatures_mat.sum(axis=1)[:, np.newaxis]
    _, exposures_mat = init_random(data_mat, n_signatures, seed=seed)
    return signatures_mat, exposures_mat
@@ -0,0 +1,13 @@
1
+ """
2
+ A collection of NMF algorithms
3
+ """
4
+
5
+ from .cornet import Cornet
6
+ from .mvnmf import MvNMF
7
+ from .nmf import NMF
8
+
9
+ __all__ = [
10
+ "Cornet",
11
+ "NMF",
12
+ "MvNMF",
13
+ ]