sonata-learn 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonata/__init__.py +19 -0
- sonata/consts.py +95 -0
- sonata/initialization/__init__.py +1 -0
- sonata/initialization/initialize.py +382 -0
- sonata/initialization/methods.py +135 -0
- sonata/models/__init__.py +13 -0
- sonata/models/_utils_cornet.py +395 -0
- sonata/models/_utils_nmf.py +361 -0
- sonata/models/cornet.py +288 -0
- sonata/models/mvnmf.py +218 -0
- sonata/models/nmf.py +153 -0
- sonata/models/signature_nmf.py +306 -0
- sonata/plot.py +847 -0
- sonata/tools.py +176 -0
- sonata/utils.py +191 -0
- sonata_learn-0.1.0.dist-info/METADATA +114 -0
- sonata_learn-0.1.0.dist-info/RECORD +19 -0
- sonata_learn-0.1.0.dist-info/WHEEL +4 -0
- sonata_learn-0.1.0.dist-info/licenses/LICENSE +21 -0
sonata/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sonata: a non-negative matrix factorization toolkit for signature analysis
|
|
3
|
+
==========================================================================
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from . import models
|
|
7
|
+
from . import plot as pl
|
|
8
|
+
from . import tools as tl
|
|
9
|
+
|
|
10
|
+
# Version string of the installed package.
__version__ = "0.1.0"

# Executed on import, i.e. importing the package has this side effect.
# NOTE(review): presumably configures the default plotting style - see
# 'set_sonata_style' in the plot module.
pl.set_sonata_style()

# Names exported by 'from sonata import *'.
__all__ = ["__version__", "models", "pl", "tl"]
|
sonata/consts.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
from matplotlib.colors import LinearSegmentedColormap
|
|
2
|
+
|
|
3
|
+
# The four bases used as flanking context on either side of a substitution.
NUCLEOTIDES = ["A", "C", "G", "T"]

# The six substitution classes.
SBS_TYPES_6 = ["C>A", "C>G", "C>T", "T>A", "T>C", "T>G"]

# All 6 * 4 * 4 = 96 labels of the form "X[R>S]Y". The substitution class
# varies slowest, then the left base X, then the right base Y.
SBS_TYPES_96 = [
    f"{left}[{substitution}]{right}"
    for substitution in SBS_TYPES_6
    for left in NUCLEOTIDES
    for right in NUCLEOTIDES
]
|
|
12
|
+
|
|
13
|
+
# Labels of the 83 indel channels. The order matters: COLORS_INDEL83 assigns
# colors to consecutive runs of this list.
# fmt: off
INDEL_TYPES_83 = [
    # 1bp deletions of C and T
    "DEL.C.1.1", "DEL.C.1.2", "DEL.C.1.3", "DEL.C.1.4", "DEL.C.1.5", "DEL.C.1.6+",
    "DEL.T.1.1", "DEL.T.1.2", "DEL.T.1.3", "DEL.T.1.4", "DEL.T.1.5", "DEL.T.1.6+",
    # 1bp insertions of C and T
    "INS.C.1.0", "INS.C.1.1", "INS.C.1.2", "INS.C.1.3", "INS.C.1.4", "INS.C.1.5+",
    "INS.T.1.0", "INS.T.1.1", "INS.T.1.2", "INS.T.1.3", "INS.T.1.4", "INS.T.1.5+",
    # deletions at repeats (2bp, 3bp, 4bp, 5+bp)
    "DEL.repeats.2.1", "DEL.repeats.2.2", "DEL.repeats.2.3",
    "DEL.repeats.2.4", "DEL.repeats.2.5", "DEL.repeats.2.6+",
    "DEL.repeats.3.1", "DEL.repeats.3.2", "DEL.repeats.3.3",
    "DEL.repeats.3.4", "DEL.repeats.3.5", "DEL.repeats.3.6+",
    "DEL.repeats.4.1", "DEL.repeats.4.2", "DEL.repeats.4.3",
    "DEL.repeats.4.4", "DEL.repeats.4.5", "DEL.repeats.4.6+",
    "DEL.repeats.5+.1", "DEL.repeats.5+.2", "DEL.repeats.5+.3",
    "DEL.repeats.5+.4", "DEL.repeats.5+.5", "DEL.repeats.5+.6+",
    # insertions at repeats (2bp, 3bp, 4bp, 5+bp)
    "INS.repeats.2.0", "INS.repeats.2.1", "INS.repeats.2.2",
    "INS.repeats.2.3", "INS.repeats.2.4", "INS.repeats.2.5+",
    "INS.repeats.3.0", "INS.repeats.3.1", "INS.repeats.3.2",
    "INS.repeats.3.3", "INS.repeats.3.4", "INS.repeats.3.5+",
    "INS.repeats.4.0", "INS.repeats.4.1", "INS.repeats.4.2",
    "INS.repeats.4.3", "INS.repeats.4.4", "INS.repeats.4.5+",
    "INS.repeats.5+.0", "INS.repeats.5+.1", "INS.repeats.5+.2",
    "INS.repeats.5+.3", "INS.repeats.5+.4", "INS.repeats.5+.5+",
    # deletions with microhomology (2bp, 3bp, 4bp, 5+bp)
    "DEL.MH.2.1",
    "DEL.MH.3.1", "DEL.MH.3.2",
    "DEL.MH.4.1", "DEL.MH.4.2", "DEL.MH.4.3",
    "DEL.MH.5+.1", "DEL.MH.5+.2", "DEL.MH.5+.3", "DEL.MH.5+.4", "DEL.MH.5+.5+",
]
# fmt: on
|
|
41
|
+
|
|
42
|
+
# General purpose palette of 10 RGB triples.
COLORS_MATHEMATICA = [
    (0.368417, 0.506779, 0.709798),
    (0.880722, 0.611041, 0.142051),
    (0.560181, 0.691569, 0.194885),
    (0.922526, 0.385626, 0.209179),
    (0.528288, 0.470624, 0.701351),
    (0.772079, 0.431554, 0.102387),
    (0.363898, 0.618501, 0.782349),
    (1.0, 0.75, 0.0),
    (0.280264, 0.715, 0.429209),
    (0.0, 0.0, 0.0),
]

# Six colors for the 96 dimensional mutation spectrum, one per block of
# 16 consecutive channels.
COLORS_TRINUCLEOTIDES = [
    "#427aa1ff",
    (0.0, 0.0, 0.0),
    "#d1664aff",
    (0.78, 0.78, 0.78),
    "#64b3aaff",
    (0.89, 0.67, 0.72),
]

# Per-channel colors: every trinucleotide color repeated 16 times in a row.
COLORS_SBS96 = [color for color in COLORS_TRINUCLEOTIDES for _ in range(16)]
|
|
67
|
+
|
|
68
|
+
# Colormap interpolating "#427aa1ff" -> "#FAFAFA" -> "#e07a5fff",
# registered under the name "sonata_diverging".
DIVERGING_PALETTE = LinearSegmentedColormap.from_list(
    name="sonata_diverging",
    colors=["#427aa1ff", "#FAFAFA", "#e07a5fff"],
)
|
|
72
|
+
|
|
73
|
+
# One color per indel category (16 categories).
COLORS_INDEL = [
    "#FCBD6F",  # 1bp Del C
    "#FD8001",  # 1bp Del T
    "#B0DC8B",  # 1bp Ins C
    "#35A02E",  # 1bp Ins T
    "#FCC9B4",  # 2bp Del Repeats
    "#FC896B",  # 3bp Del Repeats
    "#F04432",  # 4bp Del Repeats
    "#BC1A1A",  # 5+ bp Del Repeats
    "#CFE0F0",  # 2bp Ins Repeats
    "#94C3DF",  # 3bp Ins Repeats
    "#4A98C8",  # 4bp Ins Repeats
    "#1665AA",  # 5+ bp Ins Repeats
    "#E1E0ED",  # 2bp Del MH
    "#B5B5D8",  # 3bp Del MH
    "#8683BC",  # 4bp Del MH
    "#624099",  # 5+bp Del MH
]

# Number of consecutive channels per category:
# 12 * 6 + (1 + 2 + 3 + 5) = 83 colors in total.
n_times = [6] * 12 + [1, 2, 3, 5]

# Per-channel colors: each category color repeated for all of its channels.
COLORS_INDEL83 = [
    color
    for n_channels, color in zip(n_times, COLORS_INDEL)
    for _ in range(n_channels)
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
""""""
|
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Initialization methods for non-negative matrix factorization (NMF) models.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import anndata as ad
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
from ..utils import (
|
|
13
|
+
dict_checker,
|
|
14
|
+
normalize_WH,
|
|
15
|
+
shape_checker,
|
|
16
|
+
type_checker,
|
|
17
|
+
value_checker,
|
|
18
|
+
)
|
|
19
|
+
from .methods import (
|
|
20
|
+
_INIT_METHODS,
|
|
21
|
+
_Init_methods,
|
|
22
|
+
init_custom,
|
|
23
|
+
init_flat,
|
|
24
|
+
init_nndsvd,
|
|
25
|
+
init_random,
|
|
26
|
+
init_separableNMF,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
# float32 machine epsilon; used as the lower clipping bound of the
# initialized matrices.
EPSILON = np.finfo(np.float32).eps

# Keys that may appear in the 'given_parameters' dictionary of each model.
GIVEN_PARAMETERS_STANDARD_NMF = ["asignatures"]
GIVEN_PARAMETERS_CORNET = [
    *GIVEN_PARAMETERS_STANDARD_NMF,
    "signature_offsets",
    "sample_offsets",
    "signature_embeddings",
    "sample_embeddings",
    "variance",
]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def initialize_mat(
    data_mat: np.ndarray,
    n_signatures: int,
    method: _Init_methods = "nndsvd",
    given_signatures_mat: np.ndarray | None = None,
    **kwargs,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Initialize the signature and exposure matrices.

    Inputs
    ------
    data_mat : np.ndarray
        shape (n_samples, n_features)

    n_signatures : int

    method : str
        initialization method. One of 'custom', 'flat',
        'nndsvd', 'nndsvda', 'nndsvdar', 'random', 'separableNMF'.

    given_signatures_mat : np.ndarray, optional
        At most 'n_signatures' many signatures can be provided to
        overwrite the first initialized signatures. This does not
        change the initialized exposures.

    kwargs : dict
        Any keyword arguments to be passed to the initialization method.
        This includes, for example, a possible 'seed' keyword argument
        for all stochastic methods.

    Returns
    -------
    signatures_mat : np.ndarray
        shape (n_signatures, n_features)

    exposures_mat : np.ndarray
        shape (n_samples, n_signatures)

    Raises
    ------
    ValueError
        If 'given_signatures_mat' is not two-dimensional, has a different
        number of features than the data, or contains more than
        'n_signatures' signatures.
    """
    value_checker("method", method, _INIT_METHODS)

    if method == "custom":
        matrices = init_custom(data_mat, n_signatures, **kwargs)
    elif method == "flat":
        matrices = init_flat(data_mat, n_signatures)
    elif method in ["nndsvd", "nndsvda", "nndsvdar"]:
        # mypy does not recognize that 'method' is compatible
        # with Literal["nndsvd", "nndsvda", "nndsvdar"]
        matrices = init_nndsvd(
            data_mat, n_signatures, method=method, **kwargs  # type: ignore[arg-type] # noqa: E501
        )
    elif method == "random":
        matrices = init_random(data_mat, n_signatures, **kwargs)
    else:
        matrices = init_separableNMF(data_mat, n_signatures, **kwargs)

    signatures_mat, exposures_mat = matrices

    if given_signatures_mat is not None:
        type_checker("given_signatures_mat", given_signatures_mat, np.ndarray)

        if given_signatures_mat.ndim != 2:
            raise ValueError("The given signature matrix has to be two-dimensional.")

        given_n_signatures, given_n_features = given_signatures_mat.shape

        if given_n_features != data_mat.shape[1]:
            raise ValueError(
                "The given signature matrix has a different number of features "
                "than the data."
            )
        if given_n_signatures > n_signatures:
            raise ValueError("The given signature matrix contains too many signatures.")

        # Overwrite rows of a float copy. With method 'custom' the initialized
        # matrix is the very array the caller passed in, so writing into it
        # directly would modify the caller's data; an integer array would
        # additionally truncate the given signatures on assignment.
        signatures_mat = np.array(signatures_mat, dtype=float)
        signatures_mat[:given_n_signatures, :] = given_signatures_mat

    # 'normalize_WH' works on the transposed layout, W of shape
    # (n_features, n_signatures) and H of shape (n_signatures, n_samples).
    W, H = normalize_WH(signatures_mat.T, exposures_mat.T)
    # Bound all entries from below by EPSILON.
    W, H = W.clip(EPSILON), H.clip(EPSILON)
    signatures_mat, exposures_mat = W.T, H.T
    return signatures_mat, exposures_mat
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def check_given_asignatures(
    given_asignatures: ad.AnnData, adata: ad.AnnData, n_signatures: int
) -> None:
    """
    Validate user-provided signatures against the input data and the
    requested number of signatures.

    Fewer signatures than 'n_signatures' may be given, but not more.

    Inputs
    ------
    given_asignatures: AnnData
        Known signatures that should be fixed by the algorithm.

    adata: ad.AnnData
        Input data.

    n_signatures: int
        The number of signatures to initialize.

    Raises
    ------
    ValueError
        If the number of features differs, the feature names differ,
        or more than 'n_signatures' signatures are given.
    """
    type_checker("given_asignatures", given_asignatures, ad.AnnData)

    same_n_features = given_asignatures.n_vars == adata.n_vars
    if not same_n_features:
        raise ValueError(
            "The given signatures have a different number of features than the data."
        )

    # Element-wise comparison: names have to agree position by position.
    names_agree = given_asignatures.var_names == adata.var_names
    if not names_agree.all():
        raise ValueError(
            "The features of the given signatures and the data are not identical."
        )

    if n_signatures < given_asignatures.n_obs:
        raise ValueError(
            "The number of given signatures exceeds "
            "the number of signatures to initialize."
        )
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def initialize_base(
    adata: ad.AnnData,
    n_signatures: int,
    method: _Init_methods = "nndsvd",
    given_asignatures: ad.AnnData | None = None,
    **kwargs,
) -> tuple[ad.AnnData, np.ndarray]:
    """
    Build the initial signature AnnData object and exposure matrix.

    The input 'adata' is not modified; the exposures are returned
    rather than stored on it.

    Inputs
    ------
    adata : ad.AnnData

    n_signatures : int

    method : str
        initialization method. One of 'custom', 'flat',
        'nndsvd', 'nndsvda', 'nndsvdar', 'random', 'separableNMF'.

    given_asignatures : ad.AnnData, optional
        At most 'n_signatures' many signatures can be provided to
        overwrite some of the initialized signatures. This does not
        change the initialized exposures.

    kwargs : dict
        Any keyword arguments to be passed to the initialization method.
        This includes, for example, a possible 'seed' keyword argument
        for all stochastic methods.

    Returns
    -------
    asignatures : ad.AnnData
        Annotated signature matrix of shape (n_signatures, n_features)

    exposures_mat : np.ndarray
        shape (n_samples, n_signatures)
    """
    given_signatures_mat = None
    if given_asignatures is not None:
        check_given_asignatures(given_asignatures, adata, n_signatures)
        given_signatures_mat = given_asignatures.X

    signatures_mat, exposures_mat = initialize_mat(
        adata.X, n_signatures, method, given_signatures_mat, **kwargs
    )

    asignatures = ad.AnnData(signatures_mat)
    asignatures.var_names = adata.var_names
    asignatures.obs_names = [f"Sig{idx}" for idx in range(1, n_signatures + 1)]

    if given_asignatures is None:
        return asignatures, exposures_mat

    # keep signature annotations
    n_given_signatures = given_asignatures.n_obs
    # Shift the default names so that the rows following the given ones are
    # called Sig1, Sig2, ...
    asignatures.obs_names = np.roll(asignatures.obs_names, n_given_signatures)
    # The leading rows are taken from 'given_asignatures' itself (including
    # its annotations); only the remaining rows come from the initialization.
    remaining = asignatures[n_given_signatures:, :]
    asignatures = ad.concat([given_asignatures, remaining], join="outer")

    return asignatures, exposures_mat
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def check_given_parameters_standard_nmf(
    adata: ad.AnnData,
    n_signatures: int,
    given_parameters: dict[str, Any],
) -> None:
    """
    Validate the 'given_parameters' dictionary of a standard NMF model.

    Only the keys in GIVEN_PARAMETERS_STANDARD_NMF are accepted. Given
    signatures are additionally checked against the data and 'n_signatures'.
    """
    dict_checker("given_parameters", given_parameters, GIVEN_PARAMETERS_STANDARD_NMF)

    if "asignatures" not in given_parameters:
        return

    check_given_asignatures(given_parameters["asignatures"], adata, n_signatures)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def initialize_standard_nmf(
    adata: ad.AnnData,
    n_signatures: int,
    method: _Init_methods = "nndsvd",
    given_parameters: dict[str, Any] | None = None,
    **kwargs,
) -> ad.AnnData:
    """
    Initialize the parameters of a standard NMF model.

    The initial exposures are written to adata.obsm["exposures"];
    the initial signatures are returned.

    Inputs
    ------
    adata : ad.AnnData

    n_signatures : int

    method : str
        initialization method, see 'initialize_base'.

    given_parameters : dict, optional
        May contain the key 'asignatures' with signatures to be kept.
        The dictionary is shallow-copied and not modified.

    kwargs : dict
        Any keyword arguments to be passed to the initialization method.

    Returns
    -------
    asignatures : ad.AnnData
    """
    parameters = {} if given_parameters is None else given_parameters.copy()
    check_given_parameters_standard_nmf(adata, n_signatures, parameters)

    asignatures, exposures_mat = initialize_base(
        adata,
        n_signatures,
        method,
        parameters.get("asignatures"),
        **kwargs,
    )
    adata.obsm["exposures"] = exposures_mat
    return asignatures
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def check_given_offsets_cornet(
    given_offsets: np.ndarray, n_offsets_expected: int, name: str
) -> None:
    """
    Validate given sample or signature offsets: they have to be a
    one-dimensional numpy array with 'n_offsets_expected' entries.

    'name' is passed on to the checkers to identify the argument.
    """
    expected_shape = (n_offsets_expected,)
    type_checker(name, given_offsets, np.ndarray)
    shape_checker(name, given_offsets, expected_shape)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def check_given_embeddings_cornet(
    given_embeddings: np.ndarray,
    n_embeddings_expected: int,
    dim_embeddings_expected: int,
    name: str,
) -> None:
    """
    Validate given sample or signature embeddings: they have to be a numpy
    array of shape (n_embeddings_expected, dim_embeddings_expected).

    'name' is passed on to the checkers to identify the argument.
    """
    expected_shape = (n_embeddings_expected, dim_embeddings_expected)
    type_checker(name, given_embeddings, np.ndarray)
    shape_checker(name, given_embeddings, expected_shape)
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def check_given_parameters_cornet(
    adata: ad.AnnData,
    n_signatures: int,
    dim_embeddings: int,
    given_parameters: dict[str, Any],
) -> None:
    """
    Validate the 'given_parameters' dictionary of a correlated NMF model.

    Only the keys in GIVEN_PARAMETERS_CORNET are accepted. Every parameter
    that is present is checked: signatures against the data, offsets and
    embeddings for their type and shape, and the variance for being a
    positive number.

    Raises
    ------
    ValueError
        If the given variance is not positive.
    """
    dict_checker("given_parameters", given_parameters, GIVEN_PARAMETERS_CORNET)

    if "asignatures" in given_parameters:
        check_given_asignatures(given_parameters["asignatures"], adata, n_signatures)

    # Offsets are checked before embeddings, signatures before samples.
    for kind in ("offsets", "embeddings"):
        for owner in ("signature", "sample"):
            key = f"{owner}_{kind}"
            if key not in given_parameters:
                continue

            # One entry per signature or per sample, respectively.
            n_expected = n_signatures if owner == "signature" else adata.n_obs
            if kind == "offsets":
                check_given_offsets_cornet(
                    given_parameters[key], n_expected, f"given_{key}"
                )
            else:
                check_given_embeddings_cornet(
                    given_parameters[key], n_expected, dim_embeddings, f"given_{key}"
                )

    if "variance" in given_parameters:
        given_variance = given_parameters["variance"]
        type_checker("given_variance", given_variance, [float, int])
        if given_variance <= 0.0:
            raise ValueError("The variance has to be a positive real number.")
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def initialize_cornet(
    adata: ad.AnnData,
    n_signatures: int,
    dim_embeddings: int,
    method: _Init_methods = "nndsvd",
    given_parameters: dict[str, Any] | None = None,
    **kwargs,
) -> tuple[ad.AnnData, float]:
    """
    Initialize the parameters of a correlated NMF model.

    Sample offsets and embeddings are stored on 'adata' (obs["offsets"],
    obsm["embeddings"]), signature offsets and embeddings on the returned
    signature object. Parameters not provided via 'given_parameters'
    default to zero offsets, standard normal embeddings drawn from the
    global numpy random state, and a variance of 1.0.

    Inputs
    ------
    adata : ad.AnnData

    n_signatures : int

    dim_embeddings : int

    method : str
        initialization method of the signatures; 'custom' is not supported.

    given_parameters : dict, optional
        Any of the keys in GIVEN_PARAMETERS_CORNET. The dictionary is
        shallow-copied and not modified.

    kwargs : dict
        Any keyword arguments to be passed to the initialization method.

    Returns
    -------
    asignatures : ad.AnnData

    variance : float

    Raises
    ------
    ValueError
        If 'method' is 'custom'.
    """
    if method == "custom":
        raise ValueError(
            "Custom parameter initializations are currently not supported "
            "for correlated NMF."
        )

    parameters = {} if given_parameters is None else given_parameters.copy()
    check_given_parameters_cornet(adata, n_signatures, dim_embeddings, parameters)

    def draw_embeddings(n_rows: int) -> np.ndarray:
        # Standard normal embeddings, one row per signature or sample.
        return np.random.multivariate_normal(
            np.zeros(dim_embeddings), np.identity(dim_embeddings), size=n_rows
        )

    # Only the signatures are used; the initialized exposures are discarded.
    asignatures, _ = initialize_base(
        adata,
        n_signatures,
        method,
        parameters.get("asignatures"),
        **kwargs,
    )

    asignatures.obs["offsets"] = (
        parameters["signature_offsets"]
        if "signature_offsets" in parameters
        else np.zeros(n_signatures)
    )
    adata.obs["offsets"] = (
        parameters["sample_offsets"]
        if "sample_offsets" in parameters
        else np.zeros(adata.n_obs)
    )

    # Signature embeddings are drawn before sample embeddings, which fixes
    # the results for a given state of the random number generator.
    asignatures.obsm["embeddings"] = (
        parameters["signature_embeddings"]
        if "signature_embeddings" in parameters
        else draw_embeddings(n_signatures)
    )
    adata.obsm["embeddings"] = (
        parameters["sample_embeddings"]
        if "sample_embeddings" in parameters
        else draw_embeddings(adata.n_obs)
    )

    variance = float(parameters["variance"]) if "variance" in parameters else 1.0

    return asignatures, variance
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Initialization methods for non-negative matrix factorization (NMF)
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Literal, get_args
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
from sklearn.decomposition import _nmf as sknmf
|
|
11
|
+
|
|
12
|
+
from ..utils import shape_checker, type_checker
|
|
13
|
+
|
|
14
|
+
# float32 machine epsilon
EPSILON = np.finfo(np.float32).eps

# Supported initialization methods as a static type ...
_Init_methods = Literal[
    "custom", "flat", "nndsvd", "nndsvda", "nndsvdar", "random", "separableNMF"
]
# ... and as a tuple of strings for validation at runtime.
_INIT_METHODS = get_args(_Init_methods)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def init_custom(
    data_mat: np.ndarray,
    n_signatures: int,
    signatures_mat: np.ndarray,
    exposures_mat: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Validate user-provided initial signature and exposure matrices and
    return them unchanged (the same objects, no copies).

    Inputs:
    -------
    data_mat: np.ndarray
        shape (n_samples, n_features)

    n_signatures: int

    signatures_mat: np.ndarray
        shape (n_signatures, n_features)

    exposures_mat: np.ndarray
        shape (n_samples, n_signatures)
    """
    # Both types are checked before any shape is looked at.
    for name, matrix in (
        ("signatures_mat", signatures_mat),
        ("exposures_mat", exposures_mat),
    ):
        type_checker(name, matrix, np.ndarray)

    n_samples, n_features = data_mat.shape
    expected_shapes = (
        ("signatures_mat", signatures_mat, (n_signatures, n_features)),
        ("exposures_mat", exposures_mat, (n_samples, n_signatures)),
    )
    for name, matrix, shape in expected_shapes:
        shape_checker(name, matrix, shape)

    return signatures_mat, exposures_mat
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def init_flat(data_mat: np.ndarray, n_signatures: int) -> tuple[np.ndarray, np.ndarray]:
    """
    Initialize all signatures as the uniform distribution over the features
    and split the total count of every sample evenly among the signatures.

    Returns the signature matrix of shape (n_signatures, n_features) and
    the exposure matrix of shape (n_samples, n_signatures).
    """
    n_features = data_mat.shape[1]
    uniform_weight = 1 / n_features
    signatures_mat = np.full((n_signatures, n_features), uniform_weight)

    # Equal share of the sample total for every signature.
    share_per_signature = np.sum(data_mat, axis=1) / n_signatures
    exposures_mat = np.tile(share_per_signature[:, np.newaxis], (1, n_signatures))
    return signatures_mat, exposures_mat
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def init_nndsvd(
    data_mat: np.ndarray,
    n_signatures: int,
    method: Literal["nndsvd", "nndsvda", "nndsvdar"] = "nndsvd",
    seed: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Initialize via the non-negative double singular value decomposition
    (NNDSVD) variants "nndsvd", "nndsvda" and "nndsvdar" of scikit-learn.

    If a seed is given, the global numpy random state is re-seeded with it
    before scikit-learn is called.

    Returns the signature matrix followed by the exposure matrix.
    """
    if seed is not None:
        np.random.seed(seed)

    # pylint: disable-next=W0212
    factors = sknmf._initialize_nmf(data_mat, n_signatures, init=method)

    # scikit-learn returns the factor belonging to the samples first.
    exposures_mat, signatures_mat = factors
    return signatures_mat, exposures_mat
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def init_random(
    data_mat: np.ndarray, n_signatures: int, seed: int | None = None
) -> tuple[np.ndarray, np.ndarray]:
    """
    Draw every signature from the uniform distribution on the simplex
    (a Dirichlet distribution with all parameters equal to one).

    The exposures of a sample are a draw from the uniform distribution on
    the simplex as well, scaled by the total count of that sample, i.e. the
    corresponding row sum of the count matrix.

    If a seed is given, the global numpy random state is re-seeded with it.
    Signatures are drawn before exposures.
    """
    if seed is not None:
        np.random.seed(seed)

    n_samples, n_features = data_mat.shape

    signature_concentration = np.ones(n_features)
    signatures_mat = np.random.dirichlet(signature_concentration, size=n_signatures)

    sample_totals = np.sum(data_mat, axis=1)
    exposure_concentration = np.ones(n_signatures)
    exposure_fractions = np.random.dirichlet(exposure_concentration, size=n_samples)
    exposures_mat = sample_totals[:, np.newaxis] * exposure_fractions

    return signatures_mat, exposures_mat
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def init_separableNMF(
    data_mat: np.ndarray, n_signatures: int, seed: int | None = None
) -> tuple[np.ndarray, np.ndarray]:
    r"""
    This code is following Algorithm 1 from "Fast and Robust Recursive
    Algorithms for Separable Nonnegative Matrix Factorization"
    (Gillis and Vavasis, 2013), with the canonical choice of
    f(x) = \| x \|_2^2 as the strongly convex function f satisfying
    Assumption 2 from the paper.

    The selected samples, normalized to sum to one, are used as signatures.
    The exposures are initialized with 'init_random'; the seed only
    affects this step.

    Samples without any counts are handled gracefully: their normalized
    profile is all-zero, so they are never preferred over a sample with
    counts. If a selected sample has a total count of zero (only possible
    once no residual is left), the flat signature is used for it.

    Inputs
    ------
    data_mat : np.ndarray
        shape (n_samples, n_features)

    n_signatures : int

    seed : int, optional
        Passed on to 'init_random'.

    Returns
    -------
    signatures_mat : np.ndarray
        shape (n_signatures, n_features)

    exposures_mat : np.ndarray
        shape (n_samples, n_signatures)
    """
    n_features = data_mat.shape[1]
    signature_indices = np.empty(n_signatures, dtype=int)

    # Columns of R are the samples, normalized to sum to one. Dividing only
    # where the total is nonzero avoids 0 / 0 = NaN columns, which argmax
    # would select first and which would turn the whole residual into NaN.
    data_T = data_mat.T
    sample_totals = np.sum(data_T, axis=0)
    R = np.divide(
        data_T,
        sample_totals,
        out=np.zeros(data_T.shape, dtype=float),
        where=sample_totals != 0,
    )

    for k in range(n_signatures):
        column_norms = np.sum(R**2, axis=0)
        kstar = np.argmax(column_norms)
        signature_indices[k] = kstar

        # Project all columns onto the orthogonal complement of the selected
        # one. With a zero norm the residual is already zero and there is
        # nothing to project (dividing by the norm would yield NaN).
        if column_norms[kstar] > 0:
            u = R[:, kstar]
            R = (np.identity(R.shape[0]) - np.outer(u, u) / column_norms[kstar]) @ R

    signatures_mat = data_mat[signature_indices, :].astype(float)
    signature_totals = signatures_mat.sum(axis=1)[:, np.newaxis]
    # Rows with a zero total keep the flat signature stored in 'out'.
    signatures_mat = np.divide(
        signatures_mat,
        signature_totals,
        out=np.full(signatures_mat.shape, 1 / n_features),
        where=signature_totals != 0,
    )
    _, exposures_mat = init_random(data_mat, n_signatures, seed=seed)
    return signatures_mat, exposures_mat
|