rawk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rawk/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ from .rawk_sample import RawkSample
2
+
3
+ from .rawk import Rawk
4
+ from .rawk import RawkTest
5
+
6
+ from .multisample_rawk import MultiSampleRawk
7
+ from .multisample_rawk import MultiSampleRawkTest
8
+
9
+ from .input_prep import get_met_net_dfs
10
+ from .input_prep import transform_gene_prop
11
+ from .input_prep import qn_transform
12
+ from .input_prep import get_mrn_gp_df
13
+
14
+ from .plot import plot_nw_stats
15
+ from .plot import plot_elbow
16
+ from .plot import plot_graph
17
+ from .plot import hist
18
+ from .plot import plot_pw_neighborhood
19
+ from .plot import plot_rawk_sample_mtx
20
+
21
+ import importlib.metadata
22
+
23
+
24
+
25
+ __version__ = importlib.metadata.version("rawk")
rawk/fastrp.py ADDED
@@ -0,0 +1,73 @@
1
+ # This file is adapted from the GTmac/FastRP GitHub repository at commit 3a6a71c
2
+
3
+ import numpy as np
4
+
5
+
6
+ from sklearn import random_projection
7
+ from sklearn.preprocessing import normalize, scale
8
+ from scipy.sparse import coo_matrix, csr_matrix, csc_matrix, spdiags
9
+
10
+
11
+ # projection method: choose from Gaussian and Sparse
12
+ # input matrix: choose from adjacency and transition matrix
13
+ # alpha adjusts the weighting of nodes according to their degree
14
def fastrp_projection(A, q=3, dim=128, projection_method='gaussian', input_matrix='adj', alpha=None):
    """
    Randomly project a graph matrix and propagate the projection

    Parameters
    ----------
    A : sparse matrix
        Adjacency matrix of the graph. Row sums of A are used as node
        degrees, and an N x N diagonal matrix is built from them, where
        N = A.shape[0].
    q : int
        Number of matrices to return. The first one is the random
        projection of M; each following one is M @ (previous one). At
        least one matrix is always returned.
    dim : int
        Number of components of the random projection.
    projection_method : str
        'gaussian' or 'sparse'; selects the scikit-learn random projection
        class. The projection always uses random_state=42.
    input_matrix : str
        'adj' to use A itself as M, or 'trans' to use the row-normalized
        matrix D^-1 @ A, where D is the diagonal matrix of row sums.
    alpha : float or None
        If not None, the fitted projection components are right-multiplied
        by diag(degree ** alpha) before projecting.

    Returns
    -------
    list
        [U_1, ..., U_q] with U_1 = projection of M and U_k = M @ U_(k-1).

    Raises
    ------
    ValueError
        If input_matrix or projection_method is not a supported value.
    """
    # Validate with explicit exceptions: asserts disappear under `python -O`.
    if input_matrix not in ('adj', 'trans'):
        raise ValueError(
            "input_matrix must be 'adj' or 'trans', got %r" % (input_matrix,))
    if projection_method not in ('gaussian', 'sparse'):
        raise ValueError(
            "projection_method must be 'gaussian' or 'sparse', got %r"
            % (projection_method,))

    N = A.shape[0]

    # Node degrees as a flat float vector; only needed for the transition
    # matrix and for the alpha weighting.
    degrees = None
    if input_matrix == 'trans' or alpha is not None:
        degrees = np.asarray(A.sum(axis=1), dtype=float).ravel()

    if input_matrix == 'adj':
        M = A
    else:
        # Isolated nodes have degree 0; give them a zero inverse so the
        # normalizer never contains inf and no divide-by-zero is triggered.
        inv_degrees = np.zeros_like(degrees)
        np.divide(1.0, degrees, out=inv_degrees, where=degrees != 0)
        M = spdiags(inv_degrees.reshape(1, -1), 0, N, N) @ A

    if projection_method == 'gaussian':
        transformer = random_projection.GaussianRandomProjection(
            n_components=dim, random_state=42)
    else:
        transformer = random_projection.SparseRandomProjection(
            n_components=dim, random_state=42)
    transformer.fit(M)

    # Re-weight the projection matrix by degree ** alpha
    if alpha is not None:
        degree_weights = spdiags(
            np.power(degrees, alpha).reshape(1, -1), 0, N, N)
        transformer.components_ = transformer.components_ @ degree_weights

    cur_U = transformer.transform(M)
    U_list = [cur_U]
    # Each further matrix is one more multiplication by M
    for _ in range(2, q + 1):
        cur_U = M @ cur_U
        U_list.append(cur_U)
    return U_list
43
+
44
+
45
+ # When weights is None, concatenate the embeddings from different powers of A instead of linearly combining them
46
def fastrp_merge(U_list, weights, normalization=False):
    """
    Merge the matrices produced by fastrp_projection into one embedding

    Parameters
    ----------
    U_list : list
        Matrices of identical shape, either dense arrays or scipy sparse
        matrices. Sparse matrices of any format are converted to dense
        arrays first.
    weights : sequence of float, or None
        If None, the matrices are concatenated column-wise. Otherwise the
        matrices are combined as sum(weight * U) and the result is passed
        through sklearn.preprocessing.scale.
    normalization : bool
        If True, L2-normalize the rows of every matrix before merging.

    Returns
    -------
    numpy.ndarray
        The merged dense embedding matrix.
    """
    # Local import: issparse is not among this module's top-level imports.
    from scipy.sparse import issparse

    # Densify every input regardless of its sparse format, so the merge
    # below always operates on plain ndarrays.
    dense_U_list = [
        np.asarray(_U.todense()) if issparse(_U) else np.asarray(_U)
        for _U in U_list
    ]
    if normalization:
        dense_U_list = [
            normalize(_U, norm='l2', axis=1) for _U in dense_U_list
        ]

    if weights is None:
        return np.concatenate(dense_U_list, axis=1)

    # Accumulate in float so that float weights never hit an integer buffer
    U = np.zeros(dense_U_list[0].shape, dtype=float)
    for cur_U, weight in zip(dense_U_list, weights):
        U += cur_U * weight
    # U is already a dense ndarray here, so it is scaled directly.
    return scale(U)
60
+
61
+
62
+ # A is always the adjacency matrix
63
+ # the choice between adj matrix and trans matrix is decided in the conf
64
def fastrp_wrapper(A, conf):
    """
    Run fastrp_projection followed by fastrp_merge using one config dict

    Parameters
    ----------
    A : sparse matrix
        Adjacency matrix, passed unchanged to fastrp_projection.
    conf : dict
        Must provide the keys 'weights', 'dim', 'projection_method',
        'input_matrix', 'alpha' and 'normalization'. The number of
        projection matrices (q) is the length of conf['weights'].

    Returns
    -------
    The merged embedding returned by fastrp_merge.
    """
    merge_weights = conf['weights']
    projections = fastrp_projection(
        A,
        q=len(merge_weights),
        dim=conf['dim'],
        projection_method=conf['projection_method'],
        input_matrix=conf['input_matrix'],
        alpha=conf['alpha'],
    )
    return fastrp_merge(projections, merge_weights, conf['normalization'])
rawk/input_prep.py ADDED
@@ -0,0 +1,369 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import networkx as nx
4
+ from sklearn.preprocessing import QuantileTransformer
5
+
6
+
7
+
8
def transform_gene_prop(gene_prop_df, transform):
    """
    Apply transform function to each property column

    Parameters
    ----------
    gene_prop_df : dataframe
        A dataframe of gene properties, such as log fold changes, normalized
        read counts, and z-scores. The 'gene' column of the dataframe is a
        list of gene symbols. Other columns of the dataframe are the gene
        properties of samples.
    transform : function or None
        A function to transform each property column. It is called once
        per property column with that column as a Series indexed by gene.
        If None, no transformation is applied.

    Returns
    -------
    dataframe after transformation. The input dataframe is not modified;
    when transform is None an unmodified copy is returned.

    Raises
    ------
    ValueError
        If the 'gene' column contains duplicated genes.
    """
    if gene_prop_df['gene'].duplicated().any():
        raise ValueError("gene_prop_df contains duplicated genes.")

    # Nothing to apply: hand back a copy so callers always get a dataframe
    # that is independent of their input.
    if transform is None:
        return gene_prop_df.copy()

    return (
        gene_prop_df
        .copy()
        .set_index('gene')
        .apply(transform, axis=0)
        .reset_index(names='gene')
    )
38
+
39
+
40
+
41
def qn_transform(s, sigma=0.367879, log1p=False, collapse_0s=False,
                 center=False, seed=42):
    """
    Quantile transform gene properties to normal distribution

    Parameters
    ----------
    s : series
        A pandas.Series of gene properties, such as log fold changes,
        normalized read counts, and z-scores.
    sigma : float
        Output normal distribution sigma. Default to
        np.round(1 / np.e, 6), so np.e ** (sigma * (3 - (-3))) ~= 9.
    log1p : bool
        Apply log1p transform on the properties or not, before
        quantile normalization.
    collapse_0s : bool
        If True, all 0s will be collapsed into one 0 when fitting the
        quantile transformer, and that 0's quantile normalized value
        will be assigned to all 0s. Zeros are identified after the
        optional log1p step.
    center : bool
        If True, the transformed values will be centered at the
        transformed value of an input 0.
    seed : int
        Random state.

    Returns
    -------
    Series after transformation, with a copy of the input index
    """
    # Work on a float copy: with an integer input the transformed values
    # written back below would otherwise be truncated to integers.
    x = np.array(s.values, dtype=float)
    if log1p:
        x = np.log1p(x)

    assert x.ndim == 1
    n = len(x)

    qt = QuantileTransformer(
        output_distribution='normal',
        random_state=seed)

    if collapse_0s:
        # Fit on a single representative 0 followed by all non-zero values
        non0_idc = x != 0
        n_non0 = int(np.count_nonzero(non0_idc))

        c0_x = np.concatenate((np.array([0.0]), x[non0_idc]))
        c0_x = c0_x.reshape(-1, 1)
        assert c0_x[0] == 0
        assert c0_x[1] != 0
        assert c0_x.shape == (n_non0 + 1, 1)

        c0_x = qt.fit_transform(c0_x)
        assert c0_x.shape == (n_non0 + 1, 1)

        c0_x = c0_x.flatten()
        assert c0_x.shape == (n_non0 + 1,)

        # Every original 0 receives the transformed value of the single 0;
        # non-zero entries receive their own transformed values in order.
        x[np.logical_not(non0_idc)] = c0_x[0]
        x[non0_idc] = c0_x[1:]
    else:
        x = qt.fit_transform(x.reshape(-1, 1))
        assert x.shape == (n, 1)
        x = x.flatten()

    assert x.ndim == 1

    if center:
        # Shift so that an input value of 0 maps to 0 in the output
        t0 = qt.transform([[0.0]])[0][0]
        x = x - t0

    x = x * sigma

    return pd.Series(x, index=s.index.copy())
120
+
121
+
122
+
123
def get_mrn_gp_df(gene_prop_df, rxn_gene_df,
                  fill_missing_gene_prop=None,
                  transform_gene_prop_func=None):
    """
    Prepare metabolic reaction network gene property dataframe

    Parameters
    ----------
    gene_prop_df : dataframe
        A dataframe of gene properties, such as log fold changes, normalized
        read counts, and z-scores. The 'gene' column of the dataframe is a
        list of gene symbols. Other columns of the dataframe are the gene
        properties of samples.
    rxn_gene_df : dataframe
        A dataframe of reactions and their associated genes. Only its
        'gene' column is read by this function.
    fill_missing_gene_prop : int, or float, or None
        If int or float, genes that appear in rxn_gene_df but not in
        gene_prop_df are added with this value for every property.
        If None, no genes are added and gene_prop_df keeps its own genes.
    transform_gene_prop_func : function
        A function to transform each property column. If None, no
        transformation will be applied.

    Returns
    -------
    dataframe

    Raises
    ------
    ValueError
        If the gene property dataframe contains NA/NaN values, either
        after filling or after the transformation.
    """
    if fill_missing_gene_prop is not None:
        # Genes of gene_prop_df first, then genes only known to rxn_gene_df
        all_genes = pd.concat(
            [gene_prop_df["gene"], rxn_gene_df["gene"]]
        ).drop_duplicates().tolist()

        gene_prop_df = (
            gene_prop_df
            .copy()
            .set_index("gene")
            .reindex(
                all_genes,
                fill_value=fill_missing_gene_prop)
            .reset_index()
        )

    # Explicit exceptions instead of asserts, which vanish under `python -O`
    if gene_prop_df.isnull().values.any():
        raise ValueError("gene_prop_df contains one or more NA/NaN... values")

    if transform_gene_prop_func is not None:
        gene_prop_df = transform_gene_prop(
            gene_prop_df, transform_gene_prop_func)

        if gene_prop_df.isnull().values.any():
            raise ValueError(
                "transform_gene_prop_func produced one or more "
                "NA/NaN... values")

    return gene_prop_df
177
+
178
+
179
+
180
def get_met_net_dfs(rxn_gene_df, rxn_edge_df, gene_prop_df, mn_weight_cutoff,
                    fill_missing_gene_prop=0,
                    transform_gene_prop_func=None,
                    rxn_gene_prop_agg_func=None):
    """
    Prepare input node and edge dataframes for Rawk

    Parameters
    ----------
    rxn_gene_df : dataframe
        A dataframe of reactions and their associated genes. Following are the
        required columns: 'rxn' (reaction ID), 'rxn_name' (reaction name),
        'equation' (reaction equation), 'pathway' (reaction pathway), and
        'gene' (gene symbol). If a reaction is associated with multiple genes,
        one row lists one associated gene.
    rxn_edge_df : dataframe
        A dataframe of reactions edges. Following are the required columns:
        'src' (source node reaction ID), 'dest' (destination node reaction ID),
        'mn_weight' (metabolic network edge weight). The edges are undirected,
        with the src <= dest in alphabetical order.
    gene_prop_df : dataframe
        A dataframe of gene properties, such as log fold changes, normalized
        read counts, and z-scores. The 'gene' column of the dataframe is a
        list of gene symbols. Other columns of the dataframe are the gene
        properties of samples.
    mn_weight_cutoff : float
        The cutoff of metabolic network weights. Keep only edges with weights >
        mn_weight_cutoff.
    fill_missing_gene_prop : int, or float, or None
        If int or float, replace missing gene properties with this value.
        If None, drop genes with missing properties.
    transform_gene_prop_func : function
        A function to transform each property column. If None, no
        transformation is applied.
    rxn_gene_prop_agg_func : function or None
        The function used to aggregate the property values of multiple genes
        that are associated with each reaction. If None, use
        lambda x: x.mean() to aggregate.

    Returns
    -------
    (dataframe, dataframe)
        A 2-tuple of a reaction node property dataframe and a reaction edge
        dataframe. Both contain exactly the same set of reactions: every
        returned reaction has at least one returned edge, and every edge
        endpoint has a row in the node property dataframe.

    Raises
    ------
    ValueError
        If an input dataframe contains NA/NaN values or duplicated keys,
        if an edge violates src <= dest, or if a reaction is mapped to
        more than one pathway.
    """
    _check_met_net_inputs(rxn_gene_df, rxn_edge_df, gene_prop_df)

    gene_prop_df = get_mrn_gp_df(
        gene_prop_df, rxn_gene_df,
        fill_missing_gene_prop=fill_missing_gene_prop,
        transform_gene_prop_func=transform_gene_prop_func)

    # Map every reaction to its single pathway
    rxn_pw_dict = {}
    rxn_pw_sets = rxn_gene_df.groupby("rxn")["pathway"].apply(set)
    for rxn, pathways in rxn_pw_sets.items():
        if len(pathways) != 1:
            raise ValueError(
                "rxn_gene_df contains one reaction to "
                "multiple pathways mappings.")
        rxn_pw_dict[rxn] = next(iter(pathways))

    # Keep only edges above the weight cutoff
    rxn_edge_df = rxn_edge_df.loc[
        rxn_edge_df["mn_weight"] > mn_weight_cutoff, :].copy()

    edge_rxn_set = set(rxn_edge_df["src"]) | set(rxn_edge_df["dest"])

    # One row per (reaction, gene) with the gene's properties attached
    rxn_gene_prop_df = (
        rxn_gene_df
        .loc[rxn_gene_df["rxn"].isin(edge_rxn_set), :]
        .merge(
            gene_prop_df, how="left", on="gene",
            validate="many_to_one")
    )

    if fill_missing_gene_prop is None:
        rxn_gene_prop_df = rxn_gene_prop_df.dropna()
    else:
        # Sanity check: filling should have left no gene without properties
        assert not np.any(rxn_gene_prop_df.isnull().values)

    # weight filtered; property exists
    wf_pe_rxn_set = set(rxn_gene_prop_df["rxn"].tolist())
    rxn_edge_df = rxn_edge_df.loc[
        rxn_edge_df["src"].isin(wf_pe_rxn_set)
        & rxn_edge_df["dest"].isin(wf_pe_rxn_set),
        :].copy()

    # If a rxn in a pathway cannot reach other pathways, remove the rxn.
    # Such rxns will have all random walk steps within their own pathways
    # regardless of the property values.
    f_graph = nx.from_pandas_edgelist(
        rxn_edge_df,
        source="src",
        target="dest")
    assert not f_graph.is_directed()
    rm_rxns = _single_pathway_component_rxns(f_graph, rxn_pw_dict)

    # If a reaction in a pathway can reach other pathways,
    # the reaction cannot reach any reaction that cannot.
    rxn_edge_df = rxn_edge_df.loc[
        ~(rxn_edge_df["src"].isin(rm_rxns)
          | rxn_edge_df["dest"].isin(rm_rxns)),
        :].copy()

    def unique_one(x):
        x_set = set(x.tolist())
        assert len(x_set) == 1, str(x_set)
        return list(x_set)[0]

    agg_func_dict = {
        "rxn_name": unique_one,
        "equation": unique_one,
        "pathway": unique_one,
        "gene": (
            lambda x: ";;;".join(sorted(x.tolist()))
        )
    }

    if rxn_gene_prop_agg_func is None:
        rxn_gene_prop_agg_func = lambda x: x.mean()

    # Every remaining column is a sample property column
    for col in rxn_gene_prop_df.columns:
        if col != "rxn" and col not in agg_func_dict:
            agg_func_dict[col] = rxn_gene_prop_agg_func

    rxn_prop_df = (
        rxn_gene_prop_df
        .groupby("rxn")
        .agg(agg_func_dict)
        .reset_index(names=["rxn"])
    )
    assert rxn_prop_df.isnull().values.sum() == 0

    # Reactions that have properties AND take part in at least one edge.
    # An edge endpoint may appear in only one of the src/dest columns
    # (with src <= dest the smallest reaction of a component is never a
    # dest), so the endpoints are the UNION of both columns.
    edge_endpoints = set(rxn_edge_df["src"]) | set(rxn_edge_df["dest"])
    common_rxns = set(rxn_prop_df["rxn"]) & edge_endpoints

    rxn_edge_df = rxn_edge_df.loc[
        rxn_edge_df["src"].isin(common_rxns)
        & rxn_edge_df["dest"].isin(common_rxns),
        :].reset_index(drop=True).copy()

    rxn_prop_df = rxn_prop_df.loc[
        rxn_prop_df["rxn"].isin(common_rxns),
        :].reset_index(drop=True).copy()

    return rxn_prop_df, rxn_edge_df


def _check_met_net_inputs(rxn_gene_df, rxn_edge_df, gene_prop_df):
    """Raise ValueError if an input dataframe has NA/NaN values, duplicated
    keys, or an edge with src > dest."""
    if np.any(gene_prop_df.isnull().values):
        raise ValueError("gene_prop_df contains one or more NA/NaN... values")
    if np.any(rxn_edge_df.isnull().values):
        raise ValueError("rxn_edge_df contains one or more NA/NaN... values")
    if np.any(rxn_gene_df.isnull().values):
        raise ValueError("rxn_gene_df contains one or more NA/NaN... values")

    if rxn_edge_df[["src", "dest"]].duplicated().any():
        raise ValueError("rxn_edge_df contains duplicated (src, dest) pairs")

    if not (rxn_edge_df["src"] <= rxn_edge_df["dest"]).all():
        raise ValueError(
            "rxn_edge_df requires src <= dest in alphabetical order")

    if rxn_gene_df[["rxn", "gene"]].duplicated().any():
        raise ValueError("rxn_gene_df contains duplicated (rxn, gene) pairs")

    if gene_prop_df["gene"].duplicated().any():
        raise ValueError("gene_prop_df contains duplicated genes.")


def _single_pathway_component_rxns(graph, rxn_pw_dict):
    """Return the reactions of every connected component of graph whose
    reactions all belong to one pathway."""
    confined = set()
    # One pass over the components instead of one component lookup per node
    for component in nx.connected_components(graph):
        if len({rxn_pw_dict[rxn] for rxn in component}) <= 1:
            confined.update(component)
    return confined