rawk 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rawk-0.1.0/LICENSE.txt +31 -0
- rawk-0.1.0/PKG-INFO +64 -0
- rawk-0.1.0/README.md +42 -0
- rawk-0.1.0/pyproject.toml +40 -0
- rawk-0.1.0/setup.cfg +4 -0
- rawk-0.1.0/src/rawk/__init__.py +25 -0
- rawk-0.1.0/src/rawk/fastrp.py +73 -0
- rawk-0.1.0/src/rawk/input_prep.py +369 -0
- rawk-0.1.0/src/rawk/multisample_rawk.py +412 -0
- rawk-0.1.0/src/rawk/n2v.py +165 -0
- rawk-0.1.0/src/rawk/plot.py +1135 -0
- rawk-0.1.0/src/rawk/rawk.py +578 -0
- rawk-0.1.0/src/rawk/rawk_sample.py +939 -0
- rawk-0.1.0/src/rawk.egg-info/PKG-INFO +64 -0
- rawk-0.1.0/src/rawk.egg-info/SOURCES.txt +16 -0
- rawk-0.1.0/src/rawk.egg-info/dependency_links.txt +1 -0
- rawk-0.1.0/src/rawk.egg-info/requires.txt +10 -0
- rawk-0.1.0/src/rawk.egg-info/top_level.txt +1 -0
rawk-0.1.0/LICENSE.txt
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
|
|
3
|
+
Non-Commercial Research License
|
|
4
|
+
|
|
5
|
+
Copyright © 2025 The Children’s Hospital of Philadelphia. All Rights Reserved.
|
|
6
|
+
|
|
7
|
+
Permission is hereby granted, free of charge, to any person or organization to use, copy, modify, and distribute this software and associated documentation files (the “Software”) for academic, research, or educational purposes only, subject to the following conditions:
|
|
8
|
+
|
|
9
|
+
1. Attribution
|
|
10
|
+
|
|
11
|
+
Appropriate credit must be given to the authors in any use, publication, or derivative work of the Software.
|
|
12
|
+
|
|
13
|
+
2. Non-Commercial Use Only
|
|
14
|
+
|
|
15
|
+
The Software may not be used, in whole or in part, for commercial purposes, including but not limited to:
|
|
16
|
+
|
|
17
|
+
- Use in a product for sale,
|
|
18
|
+
|
|
19
|
+
- Use in a for-profit company’s operations,
|
|
20
|
+
|
|
21
|
+
- Use in services provided to customers for a fee,
|
|
22
|
+
|
|
23
|
+
- Use in sponsored research.
|
|
24
|
+
|
|
25
|
+
3. Commercial Licensing
|
|
26
|
+
|
|
27
|
+
For commercial use, a separate license must be obtained from the copyright holder. Please contact: licensing@chop.edu
|
|
28
|
+
|
|
29
|
+
4. Warranty Disclaimer
|
|
30
|
+
|
|
31
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
rawk-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rawk
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Metabolic pathway local enrichment analysis via random walks on metabolic reaction network
|
|
5
|
+
Author: Taylor Lab
|
|
6
|
+
Keywords: metabolism,network,enrichment
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Requires-Python: ~=3.11
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE.txt
|
|
11
|
+
Requires-Dist: pecanpy~=2.0.9
|
|
12
|
+
Requires-Dist: pandas~=2.2.0
|
|
13
|
+
Requires-Dist: networkx~=3.1
|
|
14
|
+
Requires-Dist: matplotlib~=3.8.3
|
|
15
|
+
Requires-Dist: numpy~=1.26.4
|
|
16
|
+
Requires-Dist: scipy~=1.12.0
|
|
17
|
+
Requires-Dist: scikit-learn~=1.4.1
|
|
18
|
+
Requires-Dist: adjustText~=1.1.1
|
|
19
|
+
Requires-Dist: joblib~=1.4.2
|
|
20
|
+
Requires-Dist: cobra~=0.29.0
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
|
|
23
|
+
# Rawk
|
|
24
|
+
|
|
25
|
+
A python package for metabolic pathway local enrichment analysis via random
|
|
26
|
+
walks on metabolic reaction network.
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
**Note** that Rawk currently can only be installed from the source files in
|
|
31
|
+
this repository.
|
|
32
|
+
|
|
33
|
+
Rawk will be available on PyPI soon.
|
|
34
|
+
|
|
35
|
+
## Documentation
|
|
36
|
+
|
|
37
|
+
### Tutorials
|
|
38
|
+
|
|
39
|
+
The tutorials of Rawk are in the `docs/tutorials` folder,
|
|
40
|
+
which contains the following tutorials:
|
|
41
|
+
|
|
42
|
+
- `construct_recon3d_mrn.md`: Construct a metabolic reaction network from a
|
|
43
|
+
genome scale metabolic model.
|
|
44
|
+
- `example_mouse_data_analysis.md`: Run Rawk standard analysis workflow on an
|
|
45
|
+
example mouse dataset.
|
|
46
|
+
- `example_human_data_analysis.md`: Run Rawk standard analysis workflow on an
|
|
47
|
+
example human dataset.
|
|
48
|
+
|
|
49
|
+
### API reference
|
|
50
|
+
|
|
51
|
+
The API reference files of Rawk are in
|
|
52
|
+
`docs/api_reference`. The API reference files were
|
|
53
|
+
generated from the package docstrings. The docstrings can also be accessed
|
|
54
|
+
using `help` in python interpreter. For example, `help(rawk.Rawk)` shows the
|
|
55
|
+
documentation of the `Rawk` class.
|
|
56
|
+
|
|
57
|
+
## Troubleshooting
|
|
58
|
+
|
|
59
|
+
If you encounter any error related to `tkinter` multi-threading, try rerunning
|
|
60
|
+
with parameters set to use only one CPU core.
|
|
61
|
+
|
|
62
|
+
## Notice about license
|
|
63
|
+
|
|
64
|
+
This project is released under a Non-Commercial Research License. For commercial use, please contact licensing@chop.edu for licensing terms.
|
rawk-0.1.0/README.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Rawk
|
|
2
|
+
|
|
3
|
+
A Python package for metabolic pathway local enrichment analysis via random
|
|
4
|
+
walks on a metabolic reaction network.
|
|
5
|
+
|
|
6
|
+
## Install
|
|
7
|
+
|
|
8
|
+
**Note** that Rawk currently can only be installed from the source files in
|
|
9
|
+
this repository.
|
|
10
|
+
|
|
11
|
+
Rawk will be available on PyPI soon.
|
|
12
|
+
|
|
13
|
+
## Documentation
|
|
14
|
+
|
|
15
|
+
### Tutorials
|
|
16
|
+
|
|
17
|
+
The tutorials of Rawk are in the `docs/tutorials` folder,
|
|
18
|
+
which contains the following tutorials:
|
|
19
|
+
|
|
20
|
+
- `construct_recon3d_mrn.md`: Construct a metabolic reaction network from a
|
|
21
|
+
genome scale metabolic model.
|
|
22
|
+
- `example_mouse_data_analysis.md`: Run Rawk standard analysis workflow on an
|
|
23
|
+
example mouse dataset.
|
|
24
|
+
- `example_human_data_analysis.md`: Run Rawk standard analysis workflow on an
|
|
25
|
+
example human dataset.
|
|
26
|
+
|
|
27
|
+
### API reference
|
|
28
|
+
|
|
29
|
+
The API reference files of Rawk are in
|
|
30
|
+
`docs/api_reference`. The API reference files were
|
|
31
|
+
generated from the package docstrings. The docstrings can also be accessed
|
|
32
|
+
using `help` in the Python interpreter. For example, `help(rawk.Rawk)` shows the
|
|
33
|
+
documentation of the `Rawk` class.
|
|
34
|
+
|
|
35
|
+
## Troubleshooting
|
|
36
|
+
|
|
37
|
+
If you encounter any error related to `tkinter` multi-threading, try rerunning
|
|
38
|
+
with parameters set to use only one CPU core.
|
|
39
|
+
|
|
40
|
+
## Notice about license
|
|
41
|
+
|
|
42
|
+
This project is released under a Non-Commercial Research License. For commercial use, please contact licensing@chop.edu for licensing terms.
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
[project]
|
|
7
|
+
name = "rawk"
|
|
8
|
+
|
|
9
|
+
version = "0.1.0"
|
|
10
|
+
|
|
11
|
+
description = "Metabolic pathway local enrichment analysis via random walks on metabolic reaction network"
|
|
12
|
+
|
|
13
|
+
readme = "README.md"
|
|
14
|
+
|
|
15
|
+
requires-python = "~=3.11"
|
|
16
|
+
|
|
17
|
+
license-files = ["LICENSE.txt"]
|
|
18
|
+
|
|
19
|
+
keywords = ["metabolism", "network", "enrichment"]
|
|
20
|
+
|
|
21
|
+
authors = [
|
|
22
|
+
{name = "Taylor Lab"}
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
classifiers = [
|
|
26
|
+
"Development Status :: 3 - Alpha"
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
dependencies = [
|
|
30
|
+
"pecanpy~=2.0.9",
|
|
31
|
+
"pandas~=2.2.0",
|
|
32
|
+
"networkx~=3.1",
|
|
33
|
+
"matplotlib~=3.8.3",
|
|
34
|
+
"numpy~=1.26.4",
|
|
35
|
+
"scipy~=1.12.0",
|
|
36
|
+
"scikit-learn~=1.4.1",
|
|
37
|
+
"adjustText~=1.1.1",
|
|
38
|
+
"joblib~=1.4.2",
|
|
39
|
+
"cobra~=0.29.0",
|
|
40
|
+
]
|
rawk-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from .rawk_sample import RawkSample
|
|
2
|
+
|
|
3
|
+
from .rawk import Rawk
|
|
4
|
+
from .rawk import RawkTest
|
|
5
|
+
|
|
6
|
+
from .multisample_rawk import MultiSampleRawk
|
|
7
|
+
from .multisample_rawk import MultiSampleRawkTest
|
|
8
|
+
|
|
9
|
+
from .input_prep import get_met_net_dfs
|
|
10
|
+
from .input_prep import transform_gene_prop
|
|
11
|
+
from .input_prep import qn_transform
|
|
12
|
+
from .input_prep import get_mrn_gp_df
|
|
13
|
+
|
|
14
|
+
from .plot import plot_nw_stats
|
|
15
|
+
from .plot import plot_elbow
|
|
16
|
+
from .plot import plot_graph
|
|
17
|
+
from .plot import hist
|
|
18
|
+
from .plot import plot_pw_neighborhood
|
|
19
|
+
from .plot import plot_rawk_sample_mtx
|
|
20
|
+
|
|
21
|
+
import importlib.metadata
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Version string of the installed "rawk" distribution, read from its package
# metadata so it does not have to be duplicated in the source.
__version__ = importlib.metadata.version("rawk")
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# This file is adapted from the GTmac/FastRP GitHub repository at commit 3a6a71c
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from sklearn import random_projection
|
|
7
|
+
from sklearn.preprocessing import normalize, scale
|
|
8
|
+
from scipy.sparse import coo_matrix, csr_matrix, csc_matrix, spdiags
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# projection method: choose from Gaussian and Sparse
|
|
12
|
+
# input matrix: choose from adjacency and transition matrix
|
|
13
|
+
# alpha adjusts the weighting of nodes according to their degree
|
|
14
|
+
def fastrp_projection(A, q=3, dim=128, projection_method='gaussian', input_matrix='adj', alpha=None):
    """
    Randomly project a graph matrix and its successive powers

    Parameters
    ----------
    A : sparse matrix
        Square adjacency matrix of the graph.
    q : int
        Number of embeddings to return; embedding k is M applied k - 1
        more times to the first projection.
    dim : int
        Number of components of the random projection.
    projection_method : str
        'gaussian' or 'sparse'.
    input_matrix : str
        'adj' to use A itself, 'trans' to use A with each row divided by
        its row sum.
    alpha : float or None
        If not None, the fitted projection components are right-multiplied
        by diag(row_sum(A) ** alpha), re-weighting nodes by degree.

    Returns
    -------
    list of the q projected matrices, lowest power first
    """
    assert input_matrix in ('adj', 'trans')
    assert projection_method in ('gaussian', 'sparse')

    N = A.shape[0]
    if input_matrix == 'trans':
        # Row-normalise A: diag(1 / row_sum) @ A.
        inv_row_sum = np.squeeze(1.0 / csc_matrix.sum(A, axis=1))
        M = spdiags(inv_row_sum, 0, N, N) @ A
    else:
        M = A

    if projection_method == 'gaussian':
        projector_cls = random_projection.GaussianRandomProjection
    else:
        projector_cls = random_projection.SparseRandomProjection
    # Fixed random_state keeps the projection reproducible across calls.
    transformer = projector_cls(n_components=dim, random_state=42)
    Y = transformer.fit(M)

    if alpha is not None:
        # Y is the fitted transformer itself, so this rescales the components
        # that transformer.transform() uses below.
        degree_weights = np.squeeze(np.power(csc_matrix.sum(A, axis=1), alpha))
        Y.components_ = Y.components_ @ spdiags(degree_weights, 0, N, N)

    U_list = [transformer.transform(M)]
    for _ in range(q - 1):
        U_list.append(M @ U_list[-1])
    return U_list
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# When weights is None, concatenate the embeddings from different powers of A instead of linearly combining them
|
|
46
|
+
def fastrp_merge(U_list, weights, normalization=False):
    """
    Merge the embeddings obtained from different powers of the input matrix

    Parameters
    ----------
    U_list : list
        Embedding matrices of identical shape. Each may be a NumPy array or
        a scipy sparse matrix of any format.
    weights : sequence of float, or None
        One weight per embedding. If None, the embeddings are concatenated
        column-wise instead of being linearly combined.
    normalization : bool
        If True, L2-normalise the rows of every embedding before merging.

    Returns
    -------
    numpy array. The column-wise concatenation when weights is None,
    otherwise the weighted sum passed through sklearn's scale().
    """
    # Local import: the module only imports specific sparse classes.
    from scipy.sparse import issparse

    # Densify every sparse format, not only csc_matrix; the previous exact
    # type check let e.g. CSR matrices reach np.concatenate / np.zeros_like.
    dense_U_list = [
        _U.toarray() if issparse(_U) else np.asarray(_U)
        for _U in U_list
    ]
    if normalization:
        dense_U_list = [
            normalize(_U, norm='l2', axis=1) for _U in dense_U_list
        ]

    if weights is None:
        return np.concatenate(dense_U_list, axis=1)

    # Accumulate in float so integer inputs cannot break the in-place add.
    U = np.zeros(dense_U_list[0].shape, dtype=float)
    for cur_U, weight in zip(dense_U_list, weights):
        U += cur_U * weight
    # U is already a dense array; calling .todense() on it raised
    # AttributeError in the weighted path.
    return scale(U)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# A is always the adjacency matrix
|
|
63
|
+
# the choice between adj matrix and trans matrix is decided in the conf
|
|
64
|
+
def fastrp_wrapper(A, conf):
    """
    Run the FastRP projection and merge steps from one configuration dict

    Parameters
    ----------
    A : sparse matrix
        Adjacency matrix; whether it is used as-is or row-normalised is
        selected by conf['input_matrix'].
    conf : dict
        Must provide the keys 'weights', 'dim', 'projection_method',
        'input_matrix', 'alpha' and 'normalization'. The number of
        projected powers equals len(conf['weights']).

    Returns
    -------
    merged embedding matrix returned by fastrp_merge
    """
    projection_kwargs = {
        'q': len(conf['weights']),
        'dim': conf['dim'],
        'projection_method': conf['projection_method'],
        'input_matrix': conf['input_matrix'],
        'alpha': conf['alpha'],
    }
    per_power_embeddings = fastrp_projection(A, **projection_kwargs)
    return fastrp_merge(
        per_power_embeddings, conf['weights'], conf['normalization'])
|
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
import networkx as nx
|
|
4
|
+
from sklearn.preprocessing import QuantileTransformer
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def transform_gene_prop(gene_prop_df, transform):
    """
    Apply transform function to each property column

    Parameters
    ----------
    gene_prop_df : dataframe
        A dataframe of gene properties, such as log fold changes, normalized
        read counts, and z-scores. The 'gene' column of the dataframe is a
        list of gene symbols. Other columns of the dataframe are the gene
        properties of samples. Gene symbols must be unique.
    transform : function or None
        A function to transform each property column. It is called once per
        property column, with that column as a series indexed by gene. If
        None, no transformation is applied.

    Returns
    -------
    dataframe after transformation. The input dataframe is never modified;
    an unmodified copy is returned when transform is None.
    """
    assert not gene_prop_df['gene'].duplicated().any()

    if transform is None:
        # Previously this path fell through to an unbound local variable
        # and raised UnboundLocalError.
        return gene_prop_df.copy()

    tdf = (
        gene_prop_df
        .copy()
        .set_index('gene')
        .apply(transform, axis=0)
        .reset_index(names='gene')
    )

    return tdf
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def qn_transform(s, sigma=0.367879, log1p=False, collapse_0s=False,
                 center=False, seed=42):
    """
    Quantile transform gene properties to normal distribution

    Parameters
    ----------
    s : series
        A pandas.Series of gene properties, such as log fold changes,
        normalized read counts, and z-scores.
    sigma : float
        Output normal distribution sigma. Default to
        np.round(1 / np.e, 6), so np.e ** (sigma * (3 - (-3))) ~= 9.
    log1p : bool
        Apply log1p transform on the properties or not, before
        quantile normalization.
    collapse_0s : bool
        If True, all 0s will be collapsed into one 0 before fitting, and
        that 0's quantile normalized value will be assigned to all 0s.
    center : bool
        If True, the transformed values will be centered at the
        transformed value of an input 0.
    seed : int
        Random state.

    Returns
    -------
    Series after transformation, with the same index as s
    """
    # Work on a float copy. With an integer input the collapse_0s branch
    # writes float results back into this array, which would otherwise be
    # silently truncated to integers.
    x = s.values.astype(float)
    if log1p:
        x = np.log1p(x)

    assert len(x.shape) == 1
    n = len(x)

    qt = QuantileTransformer(
        output_distribution='normal',
        random_state=seed)

    if collapse_0s:
        # collapse all 0s when transform
        non0_idc = x != 0
        n_non0 = int(non0_idc.sum())
        non0_x = x[non0_idc].copy()

        # A single representative 0 goes first, then all non-zero values.
        c0_x = np.concatenate((np.array([0.0]), non0_x))

        c0_x = c0_x.reshape(-1, 1)
        assert c0_x[0] == 0
        assert c0_x[1] != 0
        assert c0_x.shape == (n_non0 + 1, 1)

        c0_x = qt.fit_transform(c0_x)
        assert c0_x.shape == (n_non0 + 1, 1)

        c0_x = c0_x.flatten()
        assert c0_x.shape == (n_non0 + 1,)

        # Every original 0 receives the value of the representative 0.
        x[np.logical_not(non0_idc)] = c0_x[0]

        x[non0_idc] = c0_x[1:]

    else:
        x = x.reshape(-1, 1)
        assert x.shape == (n, 1)

        x = qt.fit_transform(x)
        assert x.shape == (n, 1)

        x = x.flatten()

    assert len(x.shape) == 1

    if center:
        # Shift so that an input value of 0 maps to an output of 0.
        t0 = qt.transform([[0.0]])[0][0]
        x = x - t0

    x = x * sigma

    return pd.Series(x, index=s.index.copy())
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def get_mrn_gp_df(gene_prop_df, rxn_gene_df,
                  fill_missing_gene_prop=None,
                  transform_gene_prop_func=None):
    """
    Prepare metabolic reaction network gene property dataframe

    Parameters
    ----------
    gene_prop_df : dataframe
        A dataframe of gene properties, such as log fold changes, normalized
        read counts, and z-scores. The 'gene' column of the dataframe is a
        list of gene symbols. Other columns of the dataframe are the gene
        properties of samples.
    rxn_gene_df : dataframe
        A dataframe of reactions and their associated genes. Only its
        'gene' column is read here.
    fill_missing_gene_prop : int, or float, or None
        If int or float, genes that occur in rxn_gene_df but not in
        gene_prop_df are added with this value in every property column.
        If None, no genes are added.
    transform_gene_prop_func : function
        A function to transform each property column. If None, no
        transformation will be applied.

    Returns
    -------
    dataframe
    """
    if fill_missing_gene_prop is not None:
        # Genes of both inputs, first occurrence first.
        gene_universe = (
            pd.concat([gene_prop_df["gene"], rxn_gene_df["gene"]])
            .drop_duplicates()
            .tolist()
        )
        by_gene = gene_prop_df.copy().set_index("gene")
        expanded = by_gene.reindex(
            gene_universe, fill_value=fill_missing_gene_prop)
        gene_prop_df = expanded.reset_index()

    assert not np.any(gene_prop_df.isnull().values)

    if transform_gene_prop_func is not None:
        gene_prop_df = transform_gene_prop(
            gene_prop_df, transform_gene_prop_func)

    assert not np.any(gene_prop_df.isnull().values)

    return gene_prop_df
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def get_met_net_dfs(rxn_gene_df, rxn_edge_df, gene_prop_df, mn_weight_cutoff,
                    fill_missing_gene_prop=0,
                    transform_gene_prop_func=None,
                    rxn_gene_prop_agg_func=None):
    """
    Prepare input node and edge dataframes for Rawk

    Parameters
    ----------
    rxn_gene_df : dataframe
        A dataframe of reactions and their associated genes. Following are the
        required columns: 'rxn' (reaction ID), 'rxn_name' (reaction name),
        'equation' (reaction equation), 'pathway' (reaction pathway), and
        'gene' (gene symbol). If a reaction is associated with multiple genes,
        one row lists one associated gene. Each reaction must map to exactly
        one pathway.
    rxn_edge_df : dataframe
        A dataframe of reactions edges. Following are the required columns:
        'src' (source node reaction ID), 'dest' (destination node reaction ID),
        'mn_weight' (metabolic network edge weight). The edges are undirected,
        with the src <= dest in alphabetical order.
    gene_prop_df : dataframe
        A dataframe of gene properties, such as log fold changes, normalized
        read counts, and z-scores. The 'gene' column of the dataframe is a
        list of gene symbols. Other columns of the dataframe are the gene
        properties of samples.
    mn_weight_cutoff : float
        The cutoff of metabolic network weights. Keep only edges with weights >
        mn_weight_cutoff.
    fill_missing_gene_prop : int, or float, or None
        If int or float, replace missing gene properties with this value.
        If None, drop genes with missing properties.
    transform_gene_prop_func : function
        A function to transform each property column.
    rxn_gene_prop_agg_func : function or None
        The function used to aggregate the property values of multiple genes
        that are associated with each reaction. If None, use
        lambda x: x.mean() to aggregate.

    Returns
    -------
    (dataframe, dataframe)
        A 2-tuple of a reaction node property dataframe and a reaction edge
        dataframe

    Raises
    ------
    ValueError
        If any input contains NA/NaN values, if rxn_edge_df has duplicated
        (src, dest) pairs or a row with src > dest, if rxn_gene_df has
        duplicated (rxn, gene) pairs or maps one reaction to several
        pathways, or if gene_prop_df has duplicated genes.
    """
    if np.any(gene_prop_df.isnull().values):
        raise ValueError("gene_prop_df contains one or more NA/NaN... values")
    if np.any(rxn_edge_df.isnull().values):
        raise ValueError("rxn_edge_df contains one or more NA/NaN... values")
    if np.any(rxn_gene_df.isnull().values):
        raise ValueError("rxn_gene_df contains one or more NA/NaN... values")

    if rxn_edge_df[["src", "dest"]].duplicated().any():
        raise ValueError("rxn_edge_df contains duplicated (src, dest) pairs")

    if not (rxn_edge_df["src"] <= rxn_edge_df["dest"]).all():
        raise ValueError(
            "rxn_edge_df requires src <= dest in alphabetical order")

    if rxn_gene_df[["rxn", "gene"]].duplicated().any():
        raise ValueError("rxn_gene_df contains duplicated (rxn, gene) pairs")

    if gene_prop_df["gene"].duplicated().any():
        raise ValueError("gene_prop_df contains duplicated genes.")

    gene_prop_df = get_mrn_gp_df(
        gene_prop_df, rxn_gene_df,
        fill_missing_gene_prop=fill_missing_gene_prop,
        transform_gene_prop_func=transform_gene_prop_func)

    # rxn -> its single pathway
    rxn_pw_set_dict = (
        rxn_gene_df
        .groupby("rxn")["pathway"]
        .apply(set)
        .to_dict()
    )
    rxn_pw_dict = {}
    for k, v in rxn_pw_set_dict.items():
        if len(v) != 1:
            raise ValueError(
                "rxn_gene_df contains one reaction to "
                "multiple pathways mappings.")
        rxn_pw_dict[k] = list(v)[0]

    # pathway -> set of its rxns
    pw_rxn_set_dict = (
        rxn_gene_df
        .groupby("pathway")["rxn"]
        .apply(set)
        .to_dict()
    )

    rxn_edge_df = rxn_edge_df.loc[
        rxn_edge_df["mn_weight"] > mn_weight_cutoff, :].copy()

    edge_rxn_set = set(
        rxn_edge_df["src"].tolist() + rxn_edge_df["dest"].tolist())

    rxn_gene_prop_df = (
        rxn_gene_df
        .loc[rxn_gene_df["rxn"].isin(edge_rxn_set), :]
        .merge(
            gene_prop_df, how="left", on="gene",
            validate="many_to_one")
    )

    if fill_missing_gene_prop is None:
        # Genes absent from gene_prop_df got NaN from the left merge.
        rxn_gene_prop_df = rxn_gene_prop_df.dropna()
    else:
        assert not np.any(rxn_gene_prop_df.isnull().values)

    # weight filtered; property exists
    wf_pe_rxn_set = set(rxn_gene_prop_df["rxn"].tolist())
    rxn_edge_df = rxn_edge_df.loc[
        rxn_edge_df["src"].isin(wf_pe_rxn_set)
        & rxn_edge_df["dest"].isin(wf_pe_rxn_set),
        :].copy()

    # If a rxn in a pathway cannot reach other pathways, remove the rxn.
    # Such rxns will have all random walk steps within their own pathways
    # regardless of the property values.
    f_graph = nx.from_pandas_edgelist(
        rxn_edge_df,
        source="src",
        target="dest")
    assert not f_graph.is_directed()

    # Every rxn maps to exactly one pathway, so the test "the component lies
    # entirely inside this rxn's pathway" gives the same answer for every
    # member of a connected component. Evaluate it once per component
    # instead of recomputing the component for every node.
    rm_rxns = set()
    for component in nx.connected_components(f_graph):
        representative = next(iter(component))
        if component <= pw_rxn_set_dict[rxn_pw_dict[representative]]:
            rm_rxns.update(component)

    # If a reaction in a pathway can reach other pathways,
    # the reaction cannot reach any reaction that cannot.
    rxn_edge_df = rxn_edge_df.loc[
        ~(rxn_edge_df["src"].isin(rm_rxns)
          | rxn_edge_df["dest"].isin(rm_rxns)),
        :].copy()

    def unique_one(x):
        # Collapse a per-reaction column that must hold a single value.
        x_set = set(x.tolist())
        assert len(x_set) == 1, str(x_set)
        return list(x_set)[0]

    agg_func_dict = {
        "rxn_name": unique_one,
        "equation": unique_one,
        "pathway": unique_one,
        "gene": (
            lambda x: ";;;".join(sorted(x.tolist()))
        )
    }

    if rxn_gene_prop_agg_func is None:
        rxn_gene_prop_agg_func = lambda x: x.mean()

    # Every remaining column is treated as a property column.
    for i in rxn_gene_prop_df.columns:
        if i != "rxn" and i not in agg_func_dict:
            agg_func_dict[i] = rxn_gene_prop_agg_func

    rxn_prop_df = (
        rxn_gene_prop_df
        .groupby("rxn")
        .agg(agg_func_dict)
        .reset_index(names=["rxn"])
    )
    assert rxn_prop_df.isnull().values.sum() == 0

    # NOTE(review): this keeps only rxns that occur in the 'src' column AND
    # in the 'dest' column. With src <= dest, a rxn that is on only one side
    # of all its edges is dropped together with its edges. Confirm this is
    # intended rather than a union of the two columns.
    common_rxns = (
        set(rxn_prop_df["rxn"].tolist())
        .intersection(set(rxn_edge_df["src"].tolist()))
        .intersection(set(rxn_edge_df["dest"].tolist()))
    )

    rxn_edge_df = rxn_edge_df.loc[
        rxn_edge_df["src"].isin(common_rxns)
        & rxn_edge_df["dest"].isin(common_rxns),
        :].reset_index(drop=True).copy()

    rxn_prop_df = rxn_prop_df.loc[
        rxn_prop_df["rxn"].isin(common_rxns),
        :].reset_index(drop=True).copy()

    return rxn_prop_df, rxn_edge_df
|