rawk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rawk-0.1.0/LICENSE.txt ADDED
@@ -0,0 +1,31 @@
1
+ LICENSE
2
+
3
+ Non-Commercial Research License
4
+
5
+ Copyright © 2025 The Children’s Hospital of Philadelphia. All Rights Reserved.
6
+
7
+ Permission is hereby granted, free of charge, to any person or organization to use, copy, modify, and distribute this software and associated documentation files (the “Software”) for academic, research, or educational purposes only, subject to the following conditions:
8
+
9
+ 1. Attribution
10
+
11
+ Appropriate credit must be given to the authors in any use, publication, or derivative work of the Software.
12
+
13
+ 2. Non-Commercial Use Only
14
+
15
+ The Software may not be used, in whole or in part, for commercial purposes, including but not limited to:
16
+
17
+ - Use in a product for sale,
18
+
19
+ - Use in a for-profit company’s operations,
20
+
21
+ - Use in services provided to customers for a fee,
22
+
23
+ - Use in sponsored research.
24
+
25
+ 3. Commercial Licensing
26
+
27
+ For commercial use, a separate license must be obtained from the copyright holder. Please contact: licensing@chop.edu
28
+
29
+ 4. Warranty Disclaimer
30
+
31
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
rawk-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,64 @@
1
+ Metadata-Version: 2.4
2
+ Name: rawk
3
+ Version: 0.1.0
4
+ Summary: Metabolic pathway local enrichment analysis via random walks on metabolic reaction network
5
+ Author: Taylor Lab
6
+ Keywords: metabolism,network,enrichment
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Requires-Python: ~=3.11
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE.txt
11
+ Requires-Dist: pecanpy~=2.0.9
12
+ Requires-Dist: pandas~=2.2.0
13
+ Requires-Dist: networkx~=3.1
14
+ Requires-Dist: matplotlib~=3.8.3
15
+ Requires-Dist: numpy~=1.26.4
16
+ Requires-Dist: scipy~=1.12.0
17
+ Requires-Dist: scikit-learn~=1.4.1
18
+ Requires-Dist: adjustText~=1.1.1
19
+ Requires-Dist: joblib~=1.4.2
20
+ Requires-Dist: cobra~=0.29.0
21
+ Dynamic: license-file
22
+
23
+ # Rawk
24
+
25
+ A Python package for metabolic pathway local enrichment analysis via random
26
+ walks on metabolic reaction network.
27
+
28
+ ## Install
29
+
30
+ **Note** that Rawk currently can only be installed from the source files in
31
+ this repository.
32
+
33
+ Rawk will be available on PyPI soon.
34
+
35
+ ## Documentation
36
+
37
+ ### Tutorials
38
+
39
+ The tutorials of Rawk are in the `docs/tutorials` folder,
40
+ which contains the following tutorials:
41
+
42
+ - `construct_recon3d_mrn.md`: Construct a metabolic reaction network from a
43
+ genome scale metabolic model.
44
+ - `example_mouse_data_analysis.md`: Run Rawk standard analysis workflow on an
45
+ example mouse dataset.
46
+ - `example_human_data_analysis.md`: Run Rawk standard analysis workflow on an
47
+ example human dataset.
48
+
49
+ ### API reference
50
+
51
+ The API reference files of Rawk are in
52
+ `docs/api_reference`. The API reference files were
53
+ generated from the package docstrings. The docstrings can also be accessed
54
+ using `help` in the Python interpreter. For example, `help(rawk.Rawk)` shows the
55
+ documentation of the `Rawk` class.
56
+
57
+ ## Troubleshooting
58
+
59
+ If you encounter any error related to `tkinter` multi-threading, try rerunning
60
+ with parameters set to use only one CPU core.
61
+
62
+ ## Notice about license
63
+
64
+ This project is released under a Non-Commercial Research License. For commercial use, please contact licensing@chop.edu for licensing terms.
rawk-0.1.0/README.md ADDED
@@ -0,0 +1,42 @@
1
+ # Rawk
2
+
3
+ A Python package for metabolic pathway local enrichment analysis via random
4
+ walks on metabolic reaction network.
5
+
6
+ ## Install
7
+
8
+ **Note** that Rawk currently can only be installed from the source files in
9
+ this repository.
10
+
11
+ Rawk will be available on PyPI soon.
12
+
13
+ ## Documentation
14
+
15
+ ### Tutorials
16
+
17
+ The tutorials of Rawk are in the `docs/tutorials` folder,
18
+ which contains the following tutorials:
19
+
20
+ - `construct_recon3d_mrn.md`: Construct a metabolic reaction network from a
21
+ genome scale metabolic model.
22
+ - `example_mouse_data_analysis.md`: Run Rawk standard analysis workflow on an
23
+ example mouse dataset.
24
+ - `example_human_data_analysis.md`: Run Rawk standard analysis workflow on an
25
+ example human dataset.
26
+
27
+ ### API reference
28
+
29
+ The API reference files of Rawk are in
30
+ `docs/api_reference`. The API reference files were
31
+ generated from the package docstrings. The docstrings can also be accessed
32
+ using `help` in the Python interpreter. For example, `help(rawk.Rawk)` shows the
33
+ documentation of the `Rawk` class.
34
+
35
+ ## Troubleshooting
36
+
37
+ If you encounter any error related to `tkinter` multi-threading, try rerunning
38
+ with parameters set to use only one CPU core.
39
+
40
+ ## Notice about license
41
+
42
+ This project is released under a Non-Commercial Research License. For commercial use, please contact licensing@chop.edu for licensing terms.
@@ -0,0 +1,40 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+
6
+ [project]
7
+ name = "rawk"
8
+
9
+ version = "0.1.0"
10
+
11
+ description = "Metabolic pathway local enrichment analysis via random walks on metabolic reaction network"
12
+
13
+ readme = "README.md"
14
+
15
+ requires-python = "~=3.11"
16
+
17
+ license-files = ["LICENSE.txt"]
18
+
19
+ keywords = ["metabolism", "network", "enrichment"]
20
+
21
+ authors = [
22
+ {name = "Taylor Lab"}
23
+ ]
24
+
25
+ classifiers = [
26
+ "Development Status :: 3 - Alpha"
27
+ ]
28
+
29
+ dependencies = [
30
+ "pecanpy~=2.0.9",
31
+ "pandas~=2.2.0",
32
+ "networkx~=3.1",
33
+ "matplotlib~=3.8.3",
34
+ "numpy~=1.26.4",
35
+ "scipy~=1.12.0",
36
+ "scikit-learn~=1.4.1",
37
+ "adjustText~=1.1.1",
38
+ "joblib~=1.4.2",
39
+ "cobra~=0.29.0",
40
+ ]
rawk-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,25 @@
1
+ from .rawk_sample import RawkSample
2
+
3
+ from .rawk import Rawk
4
+ from .rawk import RawkTest
5
+
6
+ from .multisample_rawk import MultiSampleRawk
7
+ from .multisample_rawk import MultiSampleRawkTest
8
+
9
+ from .input_prep import get_met_net_dfs
10
+ from .input_prep import transform_gene_prop
11
+ from .input_prep import qn_transform
12
+ from .input_prep import get_mrn_gp_df
13
+
14
+ from .plot import plot_nw_stats
15
+ from .plot import plot_elbow
16
+ from .plot import plot_graph
17
+ from .plot import hist
18
+ from .plot import plot_pw_neighborhood
19
+ from .plot import plot_rawk_sample_mtx
20
+
21
+ import importlib.metadata
22
+
23
+
24
+
25
+ __version__ = importlib.metadata.version("rawk")
@@ -0,0 +1,73 @@
1
+ # This file is adapted from the GTmac/FastRP GitHub repository at commit 3a6a71c
2
+
3
+ import numpy as np
4
+
5
+
6
+ from sklearn import random_projection
7
+ from sklearn.preprocessing import normalize, scale
8
+ from scipy.sparse import coo_matrix, csr_matrix, csc_matrix, spdiags
9
+
10
+
11
+ # projection method: choose from Gaussian and Sparse
12
+ # input matrix: choose from adjacency and transition matrix
13
+ # alpha adjusts the weighting of nodes according to their degree
14
def fastrp_projection(A, q=3, dim=128, projection_method='gaussian', input_matrix='adj', alpha=None):
    """
    Randomly project a matrix and propagate the projection over its powers

    Parameters
    ----------
    A : sparse matrix
        Square input matrix; row sums are taken with ``csc_matrix.sum``.
    q : int
        Number of embeddings to return (powers 1 .. q of the input matrix
        applied to the random projection).
    dim : int
        Number of components of the random projection.
    projection_method : str
        'gaussian' or 'sparse'.
    input_matrix : str
        'adj' to use ``A`` as is, or 'trans' to first divide each row of
        ``A`` by its row sum.
    alpha : float or None
        If not None, the projection components are right-multiplied by a
        diagonal matrix of row sums of ``A`` raised to ``alpha``.

    Returns
    -------
    list
        The q embeddings, in order of increasing power.
    """
    assert input_matrix in ('adj', 'trans')
    assert projection_method in ('gaussian', 'sparse')

    n_nodes = A.shape[0]
    if input_matrix == 'trans':
        # Row-normalise A: diag(1 / row_sum) @ A
        inv_row_sums = np.squeeze(1.0 / csc_matrix.sum(A, axis=1))
        M = spdiags(inv_row_sums, 0, n_nodes, n_nodes) @ A
    else:
        M = A

    # The random state is fixed so repeated calls give the same projection.
    if projection_method == 'gaussian':
        projector_cls = random_projection.GaussianRandomProjection
    else:
        projector_cls = random_projection.SparseRandomProjection
    transformer = projector_cls(n_components=dim, random_state=42)

    fitted = transformer.fit(M)
    if alpha is not None:
        # Re-weight the projection columns by (row sum of A) ** alpha
        row_sum_weights = np.squeeze(
            np.power(csc_matrix.sum(A, axis=1), alpha))
        fitted.components_ = fitted.components_ @ spdiags(
            row_sum_weights, 0, n_nodes, n_nodes)

    embeddings = [transformer.transform(M)]
    # Each further embedding is M times the previous one.
    for _ in range(q - 1):
        embeddings.append(M @ embeddings[-1])
    return embeddings
43
+
44
+
45
+ # When weights is None, concatenate the embeddings from different powers of A instead of linearly combining them
46
def fastrp_merge(U_list, weights, normalization=False):
    """
    Merge the embeddings from different powers of the input matrix

    Parameters
    ----------
    U_list : list
        Embeddings, each either a dense array or a scipy sparse matrix.
        Sparse embeddings of any format are converted to dense arrays.
    weights : sequence of float, or None
        One weight per embedding. If None, the embeddings are concatenated
        column-wise instead of being linearly combined.
    normalization : bool
        If True, L2-normalize the rows of every embedding before merging.

    Returns
    -------
    numpy.ndarray
        The concatenated embeddings if ``weights`` is None; otherwise the
        weighted sum of the embeddings passed through ``scale``.
    """
    # Densify anything exposing ``todense`` (not only csc_matrix) and always
    # hand plain ndarrays downstream: ndarrays have no ``todense`` method and
    # np.matrix is not a valid input for the sklearn helpers used below.
    dense_U_list = [
        np.asarray(_U.todense()) if hasattr(_U, 'todense') else np.asarray(_U)
        for _U in U_list
    ]
    if normalization:
        dense_U_list = [
            normalize(_U, norm='l2', axis=1) for _U in dense_U_list
        ]

    if weights is None:
        return np.concatenate(dense_U_list, axis=1)

    U = np.zeros_like(dense_U_list[0])
    for cur_U, weight in zip(dense_U_list, weights):
        U += cur_U * weight
    return scale(U)
60
+
61
+
62
+ # A is always the adjacency matrix
63
+ # the choice between adj matrix and trans matrix is decided in the conf
64
def fastrp_wrapper(A, conf):
    """
    Run the FastRP projection and merge steps from one configuration dict

    Parameters
    ----------
    A : sparse matrix
        Input matrix handed to ``fastrp_projection``.
    conf : dict
        Configuration with the keys 'weights', 'dim', 'projection_method',
        'input_matrix', 'alpha' and 'normalization'. The number of weights
        determines how many powers are projected.

    Returns
    -------
    The merged embedding returned by ``fastrp_merge``.
    """
    projection_kwargs = {
        'q': len(conf['weights']),
        'dim': conf['dim'],
        'projection_method': conf['projection_method'],
        'input_matrix': conf['input_matrix'],
        'alpha': conf['alpha'],
    }
    return fastrp_merge(
        fastrp_projection(A, **projection_kwargs),
        conf['weights'],
        conf['normalization'],
    )
@@ -0,0 +1,369 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import networkx as nx
4
+ from sklearn.preprocessing import QuantileTransformer
5
+
6
+
7
+
8
def transform_gene_prop(gene_prop_df, transform):
    """
    Apply transform function to each property column

    Parameters
    ----------
    gene_prop_df : dataframe
        A dataframe of gene properties, such as log fold changes, normalized
        read counts, and z-scores. The 'gene' column of the dataframe is a
        list of gene symbols, which must not contain duplicates. Other
        columns of the dataframe are the gene properties of samples.
    transform : function or None
        A function to transform each property column. It is called once per
        property column with the column indexed by gene. If None, no
        transformation is applied.

    Returns
    -------
    dataframe after transformation; an untransformed copy of gene_prop_df
    if transform is None
    """
    assert not gene_prop_df['gene'].duplicated().any()

    # Without a transform there is nothing to apply; return a copy so the
    # caller always receives a new dataframe.
    if transform is None:
        return gene_prop_df.copy()

    return (
        gene_prop_df
        .copy()
        .set_index('gene')
        .apply(transform, axis=0)
        .reset_index(names='gene')
    )
38
+
39
+
40
+
41
def qn_transform(s, sigma=0.367879, log1p=False, collapse_0s=False,
                 center=False, seed=42):
    """
    Quantile transform gene properties to normal distribution

    Parameters
    ----------
    s : series
        A pandas.Series of gene properties, such as log fold changes,
        normalized read counts, and z-scores.
    sigma : float
        Output normal distribution sigma. Default to
        np.round(1 / np.e, 6), so np.e ** (sigma * (3 - (-3))) ~= 9.
    log1p : bool
        Apply log1p transform on the properties or not, before
        quantile normalization.
    collapse_0s : bool
        If True, all 0s will be collapsed into one 0, and the 0's
        quantile normalized value will be assigned to all 0s.
    center : bool
        If True, the transformed values will be centered at the input 0s.
    seed : int
        Random state.

    Returns
    -------
    Series after transformation
    """
    x = s.values.copy()
    if log1p:
        x = np.log1p(x)

    # The collapse_0s branch writes the transformed floats back into ``x``.
    # With an integer (or bool) input that write would silently truncate
    # them, so work on a float array.
    if x.dtype.kind != 'f':
        x = x.astype(float)

    assert len(x.shape) == 1
    n = len(x)

    qt = QuantileTransformer(
        output_distribution='normal',
        random_state=seed)

    if collapse_0s:
        # Fit on a single 0 followed by all non-zero values, so the number
        # of 0s does not influence the quantiles.
        non0_idc = x != 0
        n_non0 = int(np.count_nonzero(non0_idc))

        c0_x = np.concatenate((np.array([0]), x[non0_idc])).reshape(-1, 1)
        assert c0_x[0] == 0
        assert c0_x[1] != 0
        assert c0_x.shape == (n_non0 + 1, 1)

        c0_x = qt.fit_transform(c0_x)
        assert c0_x.shape == (n_non0 + 1, 1)

        c0_x = c0_x.flatten()
        assert c0_x.shape == (n_non0 + 1,)

        # Every input 0 receives the transformed value of the single 0.
        x[np.logical_not(non0_idc)] = c0_x[0]
        x[non0_idc] = c0_x[1:]
    else:
        x = x.reshape(-1, 1)
        assert x.shape == (n, 1)

        x = qt.fit_transform(x)
        assert x.shape == (n, 1)

        x = x.flatten()

    assert len(x.shape) == 1

    if center:
        # Shift so that an input value of 0 maps to 0.
        t0 = qt.transform([[0.0]])[0][0]
        x = x - t0

    x = x * sigma

    return pd.Series(x, index=s.index.copy())
120
+
121
+
122
+
123
def get_mrn_gp_df(gene_prop_df, rxn_gene_df,
                  fill_missing_gene_prop=None,
                  transform_gene_prop_func=None):
    """
    Prepare metabolic reaction network gene property dataframe

    Parameters
    ----------
    gene_prop_df : dataframe
        A dataframe of gene properties, such as log fold changes, normalized
        read counts, and z-scores. The 'gene' column of the dataframe is a
        list of gene symbols. Other columns of the dataframe are the gene
        properties of samples. Must not contain missing values.
    rxn_gene_df : dataframe
        A dataframe of reactions and their associated genes. Only its 'gene'
        column (gene symbol) is read here.
    fill_missing_gene_prop : int, or float, or None
        If int or float, genes that appear in rxn_gene_df but not in
        gene_prop_df are added with this value for every property.
        If None, no genes are added.
    transform_gene_prop_func : function
        A function to transform each property column. If None, no
        transformation will be applied.

    Returns
    -------
    dataframe
    """
    def _assert_no_missing(df):
        assert not df.isnull().values.any()

    if fill_missing_gene_prop is not None:
        # Genes of both tables, in order of first appearance.
        gene_universe = (
            pd.concat([gene_prop_df["gene"], rxn_gene_df["gene"]])
            .drop_duplicates()
            .tolist()
        )
        by_gene = gene_prop_df.copy().set_index("gene")
        gene_prop_df = by_gene.reindex(
            gene_universe, fill_value=fill_missing_gene_prop
        ).reset_index()

    _assert_no_missing(gene_prop_df)

    if transform_gene_prop_func is not None:
        gene_prop_df = transform_gene_prop(
            gene_prop_df, transform_gene_prop_func)

    # The transform must not introduce missing values either.
    _assert_no_missing(gene_prop_df)

    return gene_prop_df
177
+
178
+
179
+
180
def get_met_net_dfs(rxn_gene_df, rxn_edge_df, gene_prop_df, mn_weight_cutoff,
                    fill_missing_gene_prop=0,
                    transform_gene_prop_func=None,
                    rxn_gene_prop_agg_func=None):
    """
    Prepare input node and edge dataframes for Rawk

    Parameters
    ----------
    rxn_gene_df : dataframe
        A dataframe of reactions and their associated genes. Following are the
        required columns: 'rxn' (reaction ID), 'rxn_name' (reaction name),
        'equation' (reaction equation), 'pathway' (reaction pathway), and
        'gene' (gene symbol). If a reaction is associated with multiple genes,
        one row lists one associated gene. Each reaction must map to exactly
        one pathway.
    rxn_edge_df : dataframe
        A dataframe of reactions edges. Following are the required columns:
        'src' (source node reaction ID), 'dest' (destination node reaction ID),
        'mn_weight' (metabolic network edge weight). The edges are undirected,
        with the src <= dest in alphabetical order.
    gene_prop_df : dataframe
        A dataframe of gene properties, such as log fold changes, normalized
        read counts, and z-scores. The 'gene' column of the dataframe is a
        list of gene symbols. Other columns of the dataframe are the gene
        properties of samples.
    mn_weight_cutoff : float
        The cutoff of metabolic network weights. Keep only edges with weights >
        mn_weight_cutoff.
    fill_missing_gene_prop : int, or float, or None
        If int or float, replace missing gene properties with this value.
        If None, drop genes with missing properties.
    transform_gene_prop_func : function
        A function to transform each property column. If None, no
        transformation is applied.
    rxn_gene_prop_agg_func : function or None
        The function used to aggregate the property values of multiple genes
        that are associated with each reaction. If None, use
        lambda x: x.mean() to aggregate.

    Returns
    -------
    (dataframe, dataframe)
        A 2-tuple of a reaction node property dataframe and a reaction edge
        dataframe

    Raises
    ------
    ValueError
        If any input dataframe contains missing values, if rxn_edge_df has
        duplicated (src, dest) pairs or a row with src > dest, if rxn_gene_df
        has duplicated (rxn, gene) pairs or maps one reaction to multiple
        pathways, or if gene_prop_df has duplicated genes.
    """
    if np.any(gene_prop_df.isnull().values):
        raise ValueError("gene_prop_df contains one or more NA/NaN... values")
    if np.any(rxn_edge_df.isnull().values):
        raise ValueError("rxn_edge_df contains one or more NA/NaN... values")
    if np.any(rxn_gene_df.isnull().values):
        raise ValueError("rxn_gene_df contains one or more NA/NaN... values")

    if rxn_edge_df[["src", "dest"]].duplicated().any():
        raise ValueError("rxn_edge_df contains duplicated (src, dest) pairs")

    if not (rxn_edge_df["src"] <= rxn_edge_df["dest"]).all():
        raise ValueError(
            "rxn_edge_df requires src <= dest in alphabetical order")

    if rxn_gene_df[["rxn", "gene"]].duplicated().any():
        raise ValueError("rxn_gene_df contains duplicated (rxn, gene) pairs")

    if gene_prop_df["gene"].duplicated().any():
        raise ValueError("gene_prop_df contains duplicated genes.")

    gene_prop_df = get_mrn_gp_df(
        gene_prop_df, rxn_gene_df,
        fill_missing_gene_prop=fill_missing_gene_prop,
        transform_gene_prop_func=transform_gene_prop_func)

    # reaction -> its single pathway
    rxn_pw_set_dict = (
        rxn_gene_df
        .groupby("rxn")["pathway"]
        .apply(set)
        .to_dict()
    )
    rxn_pw_dict = {}
    for k, v in rxn_pw_set_dict.items():
        if len(v) != 1:
            raise ValueError(
                "rxn_gene_df contains one reaction to "
                "multiple pathways mappings.")
        rxn_pw_dict[k] = next(iter(v))

    # pathway -> set of its reactions
    pw_rxn_set_dict = (
        rxn_gene_df
        .groupby("pathway")["rxn"]
        .apply(set)
        .to_dict()
    )

    rxn_edge_df = rxn_edge_df.loc[
        rxn_edge_df["mn_weight"] > mn_weight_cutoff, :].copy()

    edge_rxn_set = set(
        rxn_edge_df["src"].tolist() + rxn_edge_df["dest"].tolist())

    rxn_gene_prop_df = (
        rxn_gene_df
        .loc[rxn_gene_df["rxn"].isin(edge_rxn_set), :]
        .merge(
            gene_prop_df, how="left", on="gene",
            validate="many_to_one")
    )

    if fill_missing_gene_prop is None:
        rxn_gene_prop_df = rxn_gene_prop_df.dropna()
    else:
        assert not np.any(rxn_gene_prop_df.isnull().values)

    # weight filtered; property exists
    wf_pe_rxn_set = set(rxn_gene_prop_df["rxn"].tolist())
    rxn_edge_df = rxn_edge_df.loc[
        np.logical_and(
            rxn_edge_df["src"].isin(wf_pe_rxn_set),
            rxn_edge_df["dest"].isin(wf_pe_rxn_set)),
        :].copy()

    # If a rxn in a pathway cannot reach other pathways, remove the rxn.
    # Such rxns will have all random walk steps within their own pathways
    # regardless of the property values.
    f_graph = nx.from_pandas_edgelist(
        rxn_edge_df,
        source="src",
        target="dest")
    assert not f_graph.is_directed()

    # Walk each connected component once instead of recomputing the
    # component of every node: all nodes of a component reach exactly the
    # same set of reactions.
    rm_rxns = set()
    for reachable_rxns in nx.connected_components(f_graph):
        for rxn in reachable_rxns:
            # Everything reachable lies inside the rxn's own pathway.
            if reachable_rxns <= pw_rxn_set_dict[rxn_pw_dict[rxn]]:
                rm_rxns.add(rxn)

    # If a reaction in a pathway can reach other pathways,
    # the reaction cannot reach any reaction that cannot.
    rxn_edge_df = rxn_edge_df.loc[
        np.logical_not(np.logical_or(
            rxn_edge_df["src"].isin(rm_rxns),
            rxn_edge_df["dest"].isin(rm_rxns))),
        :].copy()

    def unique_one(x):
        # Reaction-level annotations must agree across a reaction's gene rows.
        x_set = set(x.tolist())
        assert len(x_set) == 1, str(x_set)
        return next(iter(x_set))

    agg_func_dict = {
        "rxn_name": unique_one,
        "equation": unique_one,
        "pathway": unique_one,
        "gene": (
            lambda x: ";;;".join(sorted(x.tolist()))
        )
    }

    if rxn_gene_prop_agg_func is None:
        rxn_gene_prop_agg_func = lambda x: x.mean()

    # Every remaining column is a property column to aggregate per reaction.
    for i in rxn_gene_prop_df.columns:
        if i != "rxn" and i not in agg_func_dict:
            agg_func_dict[i] = rxn_gene_prop_agg_func

    rxn_prop_df = (
        rxn_gene_prop_df
        .groupby("rxn")
        .agg(agg_func_dict)
        .reset_index(names=["rxn"])
    )
    assert rxn_prop_df.isnull().values.sum() == 0

    # Keep only reactions that have properties and occur both as a source
    # and as a destination of the remaining edges.
    common_rxns = (
        set(rxn_prop_df["rxn"].tolist())
        .intersection(set(rxn_edge_df["src"].tolist()))
        .intersection(set(rxn_edge_df["dest"].tolist()))
    )

    rxn_edge_df = rxn_edge_df.loc[
        np.logical_and(
            rxn_edge_df["src"].isin(common_rxns),
            rxn_edge_df["dest"].isin(common_rxns)),
        :].reset_index(drop=True).copy()

    rxn_prop_df = rxn_prop_df.loc[
        rxn_prop_df["rxn"].isin(common_rxns),
        :].reset_index(drop=True).copy()

    return rxn_prop_df, rxn_edge_df