napistu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
napistu/network/neighborhoods.py
@@ -0,0 +1,1594 @@
from __future__ import annotations

import logging
import math
import os
import pickle
import shutil
import textwrap
import warnings
from collections import ChainMap
from typing import Any

import igraph as ig
import numpy as np
import pandas as pd
from napistu import sbml_dfs_core
from napistu import utils
from napistu.network import net_utils
from napistu.network import paths

from napistu.constants import SBML_DFS
from napistu.constants import MINI_SBO_NAME_TO_POLARITY
from napistu.constants import MINI_SBO_TO_NAME

from napistu.network.constants import CPR_GRAPH_TYPES
from napistu.network.constants import NEIGHBORHOOD_NETWORK_TYPES
from napistu.network.constants import VALID_NEIGHBORHOOD_NETWORK_TYPES

logger = logging.getLogger(__name__)


def find_and_prune_neighborhoods(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    cpr_graph: ig.Graph,
    compartmentalized_species: str | list[str],
    precomputed_distances: pd.DataFrame | None = None,
    network_type: str = NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
    order: int = 3,
    verbose: bool = True,
    top_n: int = 10,
) -> dict[str, Any]:
    """
    Find and Prune Neighborhoods

    Wrapper which combines find_neighborhoods() and prune_neighborhoods()

    Parameters
    ----------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic molecular model
    cpr_graph : igraph.Graph
        A bipartite network connecting molecular species and reactions
    compartmentalized_species : [str] or str
        Compartmentalized species IDs for neighborhood centers
    precomputed_distances : pd.DataFrame or None
        If provided, an edgelist of origin->destination path weights and lengths
    network_type: str
        If the network is directed should neighbors be located "downstream",
        or "upstream" of each compartmentalized species. The "hourglass" option
        locates both upstream and downstream species.
    order: int
        Max steps away from center node
    verbose: bool
        Extra reporting
    top_n: int
        How many neighboring molecular species should be retained?
        If the neighborhood includes both upstream and downstream connections
        (i.e., hourglass), this filter will be applied to both sets separately.

    Returns:
    ----------
    A dict containing the neighborhood of each compartmentalized species.
    Each entry in the dict is a dict of the subgraph, vertices, and edges.
    """

    if not isinstance(network_type, str):
        raise TypeError(f"network_type was a {type(network_type)} and must be an str")

    if not isinstance(order, int):
        raise TypeError(f"order was a {type(order)} and must be an int")

    if not isinstance(top_n, int):
        raise TypeError(f"top_n was a {type(top_n)} and must be an int")

    if isinstance(compartmentalized_species, str):
        compartmentalized_species = [compartmentalized_species]
    assert isinstance(compartmentalized_species, list)

    if isinstance(precomputed_distances, pd.DataFrame):
        logger.info("Pre-computed neighbors based on precomputed_distances")

        precomputed_neighbors = _precompute_neighbors(
            compartmentalized_species,
            precomputed_distances=precomputed_distances,
            sbml_dfs=sbml_dfs,
            network_type=network_type,
            order=order,
            top_n=math.ceil(top_n * 1.1),  # ties when using head()?
        )
    else:
        precomputed_neighbors = None

    neighborhoods = find_neighborhoods(
        sbml_dfs=sbml_dfs,
        cpr_graph=cpr_graph,
        compartmentalized_species=compartmentalized_species,
        network_type=network_type,
        order=order,
        verbose=verbose,
        precomputed_neighbors=precomputed_neighbors,
    )

    pruned_neighborhoods = prune_neighborhoods(neighborhoods, top_n=top_n)

    return pruned_neighborhoods
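
# Illustrative usage sketch (not part of the released file; "SC00001" is a
# hypothetical compartmentalized species id). Assuming a loaded SBML_dfs model
# and its associated igraph network, the wrapper above could be called as:
#
#     neighborhoods = find_and_prune_neighborhoods(
#         sbml_dfs,
#         cpr_graph,
#         compartmentalized_species="SC00001",
#         network_type="hourglass",
#         order=3,
#         top_n=10,
#     )
#     entry = neighborhoods["SC00001"]  # dict with "graph", "vertices", "edges", "edge_sources"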


def load_neighborhoods(
    s_ids: list[str],
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    cpr_graph: ig.Graph,
    output_dir: str,
    network_type: str,
    order: int,
    top_n: int,
    overwrite: bool = False,
    verbose: bool = False,
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """
    Load Neighborhoods

    Load existing neighborhoods if they exist
    (and overwrite = False) and otherwise construct
    neighborhoods using the provided settings

    Parameters
    ----------
    s_ids: list(str)
        create a neighborhood around each species
    sbml_dfs: sbml_dfs_core.SBML_dfs
        network model
    cpr_graph: igraph.Graph
        network associated with sbml_dfs
    output_dir: str
        path to existing output directory
    network_type: str
        downstream, upstream or hourglass (i.e., downstream and upstream)
    order: int
        maximum number of steps from the focal node
    top_n: int
        target number of upstream and downstream species to retain
    overwrite: bool
        ignore cached files and regenerate neighborhoods
    verbose: bool
        extra reporting

    Returns
    -------
    all_neighborhoods_df: pd.DataFrame
        A table containing all species in each query s_ids neighborhood
    neighborhoods_dict: dict
        Outputs from find_and_prune_neighborhoods for each s_id

    """

    if not os.path.isdir(output_dir):
        raise FileNotFoundError(f"{output_dir} does not exist")

    neighborhood_prefix = create_neighborhood_prefix(network_type, order, top_n)
    vertices_path = os.path.join(output_dir, f"{neighborhood_prefix}_vertices.tsv")
    networks_path = os.path.join(output_dir, f"{neighborhood_prefix}_networks.pkl")
    neighborhood_paths = [vertices_path, networks_path]

    if all([os.path.isfile(x) for x in neighborhood_paths]) and overwrite is False:
        print(f"loading existing neighborhoods for {neighborhood_prefix}")

        all_neighborhoods_df = pd.read_csv(vertices_path, sep="\t")
        with open(networks_path, "rb") as in_file:
            neighborhoods_dict = pickle.load(in_file)

    else:
        print(f"creating neighborhoods based on {neighborhood_prefix}")

        all_neighborhoods_df, neighborhoods_dict = create_neighborhoods(
            s_ids=s_ids,
            sbml_dfs=sbml_dfs,
            cpr_graph=cpr_graph,
            network_type=network_type,
            order=order,
            top_n=top_n,
            verbose=verbose,
        )

        # save df
        all_neighborhoods_df.to_csv(vertices_path, sep="\t", index=False)

        # pickle neighborhoods
        with open(networks_path, "wb") as fh:
            pickle.dump(neighborhoods_dict, fh)

    return all_neighborhoods_df, neighborhoods_dict


def create_neighborhoods(
    s_ids: list[str],
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    cpr_graph: ig.Graph,
    network_type: str,
    order: int,
    top_n: int,
    verbose: bool = False,
) -> tuple[pd.DataFrame, dict]:
    """
    Create Neighborhoods

    Create neighborhoods for a set of species and return them

    Parameters
    ----------
    s_ids: list(str)
        create a neighborhood around each species
    sbml_dfs: sbml_dfs_core.SBML_dfs
        network model
    cpr_graph: igraph.Graph
        network associated with sbml_dfs
    network_type: str
        downstream, upstream or hourglass (i.e., downstream and upstream)
    order: int
        maximum number of steps from the focal node
    top_n: int
        target number of upstream and downstream species to retain
    verbose: bool
        extra reporting

    Returns
    -------
    all_neighborhoods_df: pd.DataFrame
        A table containing all species in each query s_ids neighborhood
    neighborhoods_dict: dict
        Outputs from find_and_prune_neighborhoods for each s_id
    """

    if not isinstance(s_ids, list):
        raise TypeError(f"s_ids was a {type(s_ids)} and must be an list")

    for s_id in s_ids:
        if not isinstance(s_id, str):
            raise TypeError(f"s_id was a {type(s_id)} and must be an str")

    if not isinstance(network_type, str):
        raise TypeError(f"network_type was a {type(network_type)} and must be an str")

    if not isinstance(order, int):
        raise TypeError(f"order was a {type(order)} and must be an int")

    if not isinstance(top_n, int):
        raise TypeError(f"top_n was a {type(top_n)} and must be an int")

    neighborhoods_list = list()
    neighborhoods_dict = dict()
    for s_id in s_ids:
        query_sc_species = net_utils.compartmentalize_species(sbml_dfs, s_id)

        compartmentalized_species = query_sc_species[SBML_DFS.SC_ID].tolist()

        neighborhoods = find_and_prune_neighborhoods(
            sbml_dfs,
            cpr_graph,
            compartmentalized_species=compartmentalized_species,
            network_type=network_type,
            order=order,
            top_n=top_n,
            verbose=verbose,
        )

        # combine multiple neighborhoods

        neighborhood_entities = pd.concat(
            [
                neighborhoods[sc_id]["vertices"].assign(focal_sc_id=sc_id)
                for sc_id in neighborhoods.keys()
            ]
        ).assign(focal_s_id=s_id)

        neighborhood_species = neighborhood_entities.merge(
            sbml_dfs.compartmentalized_species[SBML_DFS.S_ID],
            left_on="name",
            right_index=True,
        )

        neighborhoods_list.append(neighborhood_species)
        neighborhoods_dict[s_id] = neighborhoods

    all_neighborhoods_df = pd.concat(neighborhoods_list).reset_index(drop=True)

    return all_neighborhoods_df, neighborhoods_dict


def create_neighborhood_prefix(network_type: str, order: int, top_n: int) -> str:
    if not isinstance(network_type, str):
        raise TypeError(f"network_type was a {type(network_type)} and must be a str")

    if network_type not in VALID_NEIGHBORHOOD_NETWORK_TYPES:
        raise ValueError(
            f"network_type was {network_type} and must be one of {', '.join(VALID_NEIGHBORHOOD_NETWORK_TYPES)}"
        )
    if not isinstance(order, int):
        raise ValueError("order must be an int")
    if not isinstance(top_n, int):
        raise ValueError("top_n must be an int")

    return f"{network_type[0]}{order}s{top_n}n"
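
# Illustrative note (not part of the released file): the prefix concatenates the
# first letter of network_type, the order, and top_n, e.g.
# create_neighborhood_prefix("hourglass", 12, 100) -> "h12s100n", which matches
# the "h12s100n_vertices.tsv" / "h12s100n_networks.pkl" filenames expected by
# read_paritioned_neighborhoods() below.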


def load_neighborhoods_by_partition(
    selected_partition: int,
    neighborhood_outdir: str,
    graph_type: str = CPR_GRAPH_TYPES.REGULATORY,
) -> None:
    """
    Load Neighborhoods By Partition

    Call load_neighborhoods for a subset of species ids defined by a partition.
    This function is set up to be called in a slurm job.

    Params
    ------
    selected_partition: int
        A partition of sids to search
    neighborhood_outdir: str
        Output directory

    Returns
    -------
    None, used for side-effects

    """

    consensus_root = "/group/cpr/consensus"
    consensus_name = "reactome"
    consensus_outdir = os.path.join(consensus_root, consensus_name)

    if not os.path.isdir(neighborhood_outdir):
        raise FileNotFoundError(f"{neighborhood_outdir} does not exist")

    partition_output = os.path.join(
        neighborhood_outdir, f"partition_{selected_partition}"
    )
    # initialize an empty output
    if os.path.isdir(partition_output):
        print(f"removing existing directory: {partition_output}")
        shutil.rmtree(partition_output)
    os.makedirs(partition_output)

    # format partition s_ids

    sids_to_partition = pd.read_csv(os.path.join(neighborhood_outdir, "partitions.csv"))
    parition_sids_df = sids_to_partition[
        sids_to_partition["partition"] == selected_partition
    ]

    if parition_sids_df.shape[0] == 0:
        raise ValueError(f"No s_ids associated with partition {selected_partition}")

    parition_sids = parition_sids_df["s_id"].tolist()

    # read pathway and network data

    # read model containing Calico curations. this is primarily to support search programs
    # to not use these switch to refined.pkl
    refined_model_pkl_path = os.path.join(consensus_outdir, "curated.pkl")
    with open(refined_model_pkl_path, "rb") as in_file:
        refined_model = pickle.load(in_file)
    refined_model.validate()

    # load the graph
    cpr_graph = net_utils.read_network_pkl(
        model_prefix="curated",
        network_dir=consensus_outdir,
        directed=True,
        graph_type=graph_type,
    )

    all_neighborhoods_df, neighborhoods_dict = load_neighborhoods(
        s_ids=parition_sids,
        sbml_dfs=refined_model,
        cpr_graph=cpr_graph,
        output_dir=partition_output,
        network_type="hourglass",
        order=12,
        top_n=100,
        overwrite=True,
        verbose=True,
    )

    return None


def read_paritioned_neighborhoods(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    cpr_graph: ig.Graph,
    partitions_path: str,
    n_partitions: int = 200,
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """
    Read Partitioned Neighborhoods

    Import a set of neighborhoods produced by the find_neighborhoods_batch.sh slurm job

    Params
    ------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        network model
    cpr_graph: igraph.Graph
        network associated with sbml_dfs
    partitions_path: str
        Path to a directory containing folders for each partition's results
    n_partitions: int
        Number of partitions that exist

    Returns
    -------
    all_neighborhoods_df: pd.DataFrame
        A table containing all species in each query s_ids neighborhood
    neighborhoods_dict: dict
        Outputs from find_and_prune_neighborhoods for each s_id

    """

    # check for partition directories
    expected_partition_dirs = ["partition_" + str(p) for p in range(0, n_partitions)]
    missing_partition_dirs = set(expected_partition_dirs).difference(
        set(os.listdir(partitions_path))
    )
    if len(missing_partition_dirs) != 0:
        raise FileNotFoundError(
            f"{len(missing_partition_dirs)} neighborhood partition directories were not found:"
            f" {', '.join(missing_partition_dirs)}"
        )

    # check for required files
    expected_files = ["h12s100n_vertices.tsv", "h12s100n_networks.pkl"]
    expected_paths_df = pd.DataFrame(
        [
            {"partition": p, "file": f}
            for p in expected_partition_dirs
            for f in expected_files
        ]
    )
    expected_paths_df["path"] = [
        os.path.join(partitions_path, p, f)
        for p, f in zip(expected_paths_df["partition"], expected_paths_df["file"])
    ]
    expected_paths_df["exists"] = [os.path.isfile(p) for p in expected_paths_df["path"]]
    missing_expected_paths_df = expected_paths_df[~expected_paths_df["exists"]]

    if missing_expected_paths_df.shape[0] > 0:
        styled_df = utils.style_df(
            missing_expected_paths_df.drop(["exists"], axis=1), headers="keys"
        )
        logger.warning(styled_df)

        raise FileNotFoundError(
            f"missing {missing_expected_paths_df.shape[0]} required files"
        )

    neighborhood_paths_list = list()
    path_dict_list = list()

    for p in expected_partition_dirs:
        partition_paths, partition_dict = load_neighborhoods(
            s_ids=["stub"],
            sbml_dfs=sbml_dfs,
            cpr_graph=cpr_graph,
            output_dir=os.path.join(partitions_path, p),
            # these settings define the neighborhood string so they must
            # match the settings at the time of network generation
            network_type="hourglass",
            order=12,
            top_n=100,
            overwrite=False,
            verbose=False,
        )

        neighborhood_paths_list.append(partition_paths)
        path_dict_list.append(partition_dict)

    # combine all partitions' dfs and dicts
    all_neighborhoods_df = pd.concat(neighborhood_paths_list).reset_index(drop=True)
    neighborhoods_dict = dict(ChainMap(*path_dict_list))

    # TO DO - remove s_id duplication (these are present in the vertices table in the partition outputs)
    if not all(all_neighborhoods_df["s_id_x"] == all_neighborhoods_df["s_id_y"]):
        raise ValueError("The patch won't hold")
    all_neighborhoods_df = all_neighborhoods_df.drop(["s_id_y"], axis=1).rename(
        {"s_id_x": "s_id"}, axis=1
    )

    return all_neighborhoods_df, neighborhoods_dict


def find_neighborhoods(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    cpr_graph: ig.Graph,
    compartmentalized_species: list[str],
    network_type: str = "downstream",
    order: int = 3,
    verbose: bool = True,
    precomputed_neighbors: pd.DataFrame | None = None,
) -> dict:
    """
    Find Neighborhood

    Create a network composed of all species and reactions within N steps of
    each of a set of compartmentalized species.

    Parameters
    ----------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic molecular model
    cpr_graph : igraph.Graph
        A network connecting molecular species and reactions
    compartmentalized_species : [str]
        Compartmentalized species IDs for neighborhood centers
    network_type: str
        If the network is directed should neighbors be located "downstream",
        or "upstream" of each compartmentalized species. The "hourglass" option
        locates both upstream and downstream species.
    order: int
        Max steps away from center node
    verbose: bool
        Extra reporting
    precomputed_neighbors: pd.DataFrame or None
        If provided, a pre-filtered table of nodes nearby the compartmentalized species
        which will be used to skip on-the-fly neighborhood generation.

    Returns:
    ----------
    A dict containing the neighborhood of each compartmentalized species.
    Each entry in the dict is a dict of the subgraph, vertices, and edges.
    """

    if not isinstance(network_type, str):
        raise TypeError(f"network_type was a {type(network_type)} and must be a str")

    valid_network_types = ["downstream", "upstream", "hourglass"]
    if network_type not in valid_network_types:
        raise ValueError(
            f"network_type must be one of {', '.join(valid_network_types)}"
        )

    if not isinstance(order, int):
        raise TypeError(f"order was a {type(order)} and must be an int")

    # create a table which includes cspecies and reactions nearby each of the
    # focal compartmentalized_species
    neighborhood_df = _build_raw_neighborhood_df(
        cpr_graph=cpr_graph,
        compartmentalized_species=compartmentalized_species,
        network_type=network_type,
        order=order,
        precomputed_neighbors=precomputed_neighbors,
    )

    # format the vertices and edges in each compartmentalized species' network
    neighborhood_dict = {
        sc_id: create_neighborhood_dict_entry(
            sc_id, neighborhood_df, sbml_dfs, cpr_graph, verbose=verbose
        )
        for sc_id in compartmentalized_species
    }

    return neighborhood_dict
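
# Illustrative note (not part of the released file): each value in the returned
# dict comes from create_neighborhood_dict_entry() below, so a single
# neighborhood can be unpacked as, for example:
#
#     entry = neighborhood_dict["SC00001"]  # hypothetical sc_id
#     entry["graph"]     # igraph.Graph subgraph around the focal node
#     entry["vertices"]  # pd.DataFrame of nodes with path weights and orientations
#     entry["edges"]     # pd.DataFrame of edges annotated with net_polarity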


def create_neighborhood_dict_entry(
    sc_id: str,
    neighborhood_df: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    cpr_graph: ig.Graph,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Create Neighborhood Dict Entry

    Generate a summary of a compartmentalized species' neighborhood

    Parameters
    ----------
    sc_id: str
        A compartmentalized species id
    neighborhood_df: pd.DataFrame
        A table of upstream and/or downstream neighbors of all compartmentalized species
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic molecular model
    cpr_graph: igraph.Graph
        A network connecting molecular species and reactions
    verbose: bool
        Extra reporting?

    Returns
    -------
    dict containing:
        graph: igraph.Graph
            subgraph of sc_id's neighborhood,
        vertices: pd.DataFrame
            nodes in the neighborhood
        edges: pd.DataFrame
            edges in the neighborhood
        edge_sources: pd.DataFrame
            models that edges were derived from
        neighborhood_path_entities: dict
            upstream and downstream dicts representing entities in paths.
            If the keys are to be included in a neighborhood, the
            values should be as well in order to maintain connection to the
            focal node.
    """

    one_neighborhood_df = neighborhood_df[neighborhood_df["sc_id"] == sc_id]

    if verbose:
        _create_neighborhood_dict_entry_logging(sc_id, one_neighborhood_df, sbml_dfs)

    if not one_neighborhood_df["name"].eq(sc_id).any():
        raise ValueError(
            f"The focal node sc_id = {sc_id} was not in 'one_neighborhood_df'.\
            By convention it should be part of its neighborhood"
        )

    # create the subgraph formed by filtering to neighborhoods
    neighborhood_graph = cpr_graph.subgraph(
        cpr_graph.vs[one_neighborhood_df["neighbor"]], implementation="auto"
    )

    vertices = pd.DataFrame([v.attributes() for v in neighborhood_graph.vs])
    edges = pd.DataFrame([e.attributes() for e in neighborhood_graph.es])

    # add edge polarity: whether edges are activating, inhibiting or unknown
    if edges.shape[0] > 0:
        edges["link_polarity"] = (
            edges["sbo_term"].map(MINI_SBO_TO_NAME).map(MINI_SBO_NAME_TO_POLARITY)
        )

    try:
        edge_sources = net_utils.get_minimal_sources_edges(
            vertices.rename(columns={"name": "node"}), sbml_dfs
        )
    except Exception:
        edge_sources = None

    # to add weights to the network solve the shortest path problem
    # from the focal node to each neighbor
    # solve this problem separately whether a given neighbor is an
    # ancestor or descendant

    # focal node -> descendants

    one_descendants_df = one_neighborhood_df[
        one_neighborhood_df["relationship"] == "descendants"
    ]
    descendants_list = list(set(one_descendants_df["name"].tolist()).union({sc_id}))

    # hide warnings which are mostly just Dijkstra complaining about not finding neighbors
    with warnings.catch_warnings():
        # igraph throws warnings for each pair of unconnected species
        warnings.simplefilter("ignore")

        neighborhood_paths = neighborhood_graph.get_shortest_paths(
            # focal node
            v=sc_id,
            to=descendants_list,
            weights="weights",
            mode="out",
            output="epath",
        )

    downstream_path_attrs, downstream_entity_dict = _calculate_path_attrs(
        neighborhood_paths, edges, vertices=descendants_list, weight_var="weights"
    )
    downstream_path_attrs = downstream_path_attrs.assign(node_orientation="downstream")

    # ancestors -> focal_node

    one_ancestors_df = one_neighborhood_df[
        one_neighborhood_df["relationship"] == "ancestors"
    ]
    ancestors_list = list(set(one_ancestors_df["name"].tolist()).union({sc_id}))

    with warnings.catch_warnings():
        # igraph throws warnings for each pair of unconnected species
        warnings.simplefilter("ignore")

        neighborhood_paths = neighborhood_graph.get_shortest_paths(
            v=sc_id,
            to=ancestors_list,
            weights="upstream_weights",
            mode="in",
            output="epath",
        )

    upstream_path_attrs, upstream_entity_dict = _calculate_path_attrs(
        neighborhood_paths,
        edges,
        vertices=ancestors_list,
        weight_var="upstream_weights",
    )
    upstream_path_attrs = upstream_path_attrs.assign(node_orientation="upstream")

    # combine upstream and downstream shortest paths
    # in cases a node is upstream and downstream of the focal node
    # by taking the lowest path weight
    vertex_neighborhood_attrs = (
        pd.concat([downstream_path_attrs, upstream_path_attrs])
        .sort_values("path_weight")
        .groupby("neighbor")
        .first()
    )
    # label the focal node
    vertex_neighborhood_attrs.loc[sc_id, "node_orientation"] = "focal"

    # if the precomputed distances, graph and/or sbml_dfs are inconsistent
    # then the shortest paths search may just return empty lists
    # throw a clearer error message in this case.
    EXPECTED_VERTEX_ATTRS = {"final_from", "final_to", "net_polarity"}
    missing_vertex_attrs = EXPECTED_VERTEX_ATTRS.difference(
        set(vertex_neighborhood_attrs.columns.tolist())
    )

    if len(missing_vertex_attrs) > 0:
        raise ValueError(
            f"vertex_neighborhood_attrs did not contain the expected columns: {EXPECTED_VERTEX_ATTRS}."
            "This is likely because of inconsistencies between the precomputed distances, graph and/or sbml_dfs."
            "Please try net_utils.validate_assets() to check for consistency."
        )

    # add net_polarity to edges in addition to nodes
    edges = edges.merge(
        vertex_neighborhood_attrs.reset_index()[
            ["final_from", "final_to", "net_polarity"]
        ].dropna(),
        left_on=["from", "to"],
        right_on=["final_from", "final_to"],
        how="left",
    )

    vertices = vertices.merge(
        vertex_neighborhood_attrs, left_on="name", right_index=True
    )

    # drop nodes with a path length / weight of zero
    # which are NOT the focal node
    # these were cases where no path to/from the focal node to the query node was found
    disconnected_neighbors = vertices.query(
        "(not node_orientation == 'focal') and path_weight == 0"
    )
    vertices = vertices[~vertices.index.isin(disconnected_neighbors.index.tolist())]

    # add reference urls
    vertices = add_vertices_uri_urls(vertices, sbml_dfs)

    neighborhood_path_entities = {
        "downstream": downstream_entity_dict,
        "upstream": upstream_entity_dict,
    }

    # update graph with additional vertex and edge attributes
    updated_cpr_graph = ig.Graph.DictList(
        vertices=vertices.to_dict("records"),
        edges=edges.to_dict("records"),
        directed=cpr_graph.is_directed(),
        vertex_name_attr="name",
        edge_foreign_keys=("from", "to"),
    )

    outdict = {
        "graph": updated_cpr_graph,
        "vertices": vertices,
        "edges": edges,
        "edge_sources": edge_sources,
        "neighborhood_path_entities": neighborhood_path_entities,
    }

    return outdict


def _create_neighborhood_dict_entry_logging(
    sc_id: str, one_neighborhood_df: pd.DataFrame, sbml_dfs: sbml_dfs_core.SBML_dfs
):
    df_summary = one_neighborhood_df.copy()
    df_summary["node_type"] = [
        "species" if x else "reactions"
        for x in df_summary["name"].isin(sbml_dfs.compartmentalized_species.index)
    ]
    relationship_counts = df_summary.value_counts(
        ["relationship", "node_type"]
    ).sort_index()

    relation_strings = list()
    for relation in relationship_counts.index.get_level_values(0).unique():
        relation_str = " and ".join(
            [
                f"{relationship_counts[relation][i]} {i}"
                for i in relationship_counts[relation].index
            ]
        )
        relation_strings.append(f"{relation}: {relation_str}")

    msg = f"{sc_id} neighborhood: {'; '.join(relation_strings)}"
    logger.info(msg)


def add_vertices_uri_urls(
    vertices: pd.DataFrame, sbml_dfs: sbml_dfs_core.SBML_dfs
) -> pd.DataFrame:
    """
    Add Vertices URI URLs

    Add a url variable to the neighborhood vertices pd.DataFrame

    Parameters
    ----------
    vertices: pd.DataFrame
        table of neighborhood vertices
    sbml_dfs: sbml_dfs_core.SBML_dfs
        consensus network model

    Returns
    -------
    vertices: pd.DataFrame
        input table with a url field

    """

    assert isinstance(vertices, pd.DataFrame)
    assert vertices.shape[0] > 0

    # add uri urls for each node

    # add s_ids
    neighborhood_species = vertices[vertices["node_type"] == "species"].merge(
        sbml_dfs.compartmentalized_species["s_id"],
        left_on="name",
        right_index=True,
        how="left",
    )

    # add a standard reference identifier
    neighborhood_species_aug = neighborhood_species.merge(
        sbml_dfs.get_uri_urls("species", neighborhood_species["s_id"]),
        left_on="s_id",
        right_index=True,
        how="left",
        # add pharos ids where available
    ).merge(
        sbml_dfs.get_uri_urls(
            "species", neighborhood_species["s_id"], required_ontology="pharos"
        ).rename("pharos"),
        left_on="s_id",
        right_index=True,
        how="left",
    )

    if sum(vertices["node_type"] == "reaction") > 0:
        neighborhood_reactions = vertices[vertices["node_type"] == "reaction"].merge(
            sbml_dfs.get_uri_urls(
                "reactions", vertices[vertices["node_type"] == "reaction"]["name"]
            ),
            left_on="name",
            right_index=True,
            how="left",
        )
    else:
        neighborhood_reactions = None

    if neighborhood_reactions is None:
        updated_vertices = neighborhood_species_aug.fillna("")
    else:
        updated_vertices = pd.concat(
            [neighborhood_species_aug, neighborhood_reactions]
        ).fillna("")

    assert isinstance(updated_vertices, pd.DataFrame)
    if vertices.shape[0] != updated_vertices.shape[0]:
        raise ValueError("output vertices rows did not match input")

    return updated_vertices


def prune_neighborhoods(neighborhoods: dict, top_n: int = 100) -> dict:
    """
    Prune Neighborhoods

    Take a possibly very large neighborhood around a set of focal nodes
    and prune to the most highly weighted nodes. Node weights are
    constructed as the sum of path weights from the focal node to each
    neighbor so each pruned neighborhood will still be a single subnetwork.

    Parameters
    ----------
    neighborhoods: dict
        A dictionary of sc_id neighborhoods as produced by find_neighborhoods()
    top_n: int
        How many neighbors should be retained? If the neighborhood includes
        both upstream and downstream connections (i.e., hourglass), this filter
        will be applied to both sets separately

    Returns
    -------
    neighborhoods: dict
        Same structure as neighborhoods input
    """

    if not isinstance(top_n, int):
        raise TypeError(f"top_n was a {type(top_n)} and must be an int")

    pruned_neighborhoods_dict = dict()

    for an_sc_id in neighborhoods.keys():
        one_neighborhood = neighborhoods[an_sc_id]

        # filter to the desired number of vertices w/ lowest path_weight (from focal node)
        # filter neighborhood to high-weight vertices
        pruned_vertices = _prune_vertex_set(one_neighborhood, top_n=top_n)

        # reduce neighborhood to this set of high-weight vertices
        all_neighbors = pd.DataFrame({"name": one_neighborhood["graph"].vs["name"]})
        pruned_vertices_indices = all_neighbors[
            all_neighbors["name"].isin(pruned_vertices["name"])
        ].index.tolist()

        pruned_neighborhood = one_neighborhood["graph"].subgraph(
            one_neighborhood["graph"].vs[pruned_vertices_indices],
            implementation="auto",
        )

        pruned_edges = pd.DataFrame([e.attributes() for e in pruned_neighborhood.es])

        pruned_reactions = pruned_vertices[pruned_vertices["node_type"] == "reaction"][
            "name"
        ]

        if pruned_reactions.shape[0] != 0:
            if one_neighborhood["edge_sources"] is None:
                # allow for missing source information since this is currently optional
                pruned_edge_sources = one_neighborhood["edge_sources"]
            else:
                pruned_edge_sources = one_neighborhood["edge_sources"][
                    one_neighborhood["edge_sources"]["r_id"].isin(pruned_reactions)
                ]
        else:
            pruned_edge_sources = one_neighborhood["edge_sources"]

        pruned_neighborhoods_dict[an_sc_id] = {
            "graph": pruned_neighborhood,
            "vertices": pruned_vertices,
            "edges": pruned_edges,
            "edge_sources": pruned_edge_sources,
        }

    return pruned_neighborhoods_dict
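
# Illustrative usage sketch (not part of the released file; "SC00001" is a
# hypothetical id): find_neighborhoods() and prune_neighborhoods() compose the
# same way the find_and_prune_neighborhoods() wrapper does, e.g.:
#
#     raw = find_neighborhoods(sbml_dfs, cpr_graph, ["SC00001"], order=3)
#     pruned = prune_neighborhoods(raw, top_n=25)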


def plot_neighborhood(
    neighborhood_graph: ig.Graph,
    name_nodes: bool = False,
    plot_size: int = 1000,
    network_layout: str = "drl",
) -> ig.plot:
    """
    Plot Neighborhood

    Parameters:
    ----------
    neighborhood_graph: igraph.Graph
        An igraph network
    name_nodes: bool
        Should nodes be named
    plot_size: int
        Plot width/height in pixels
    network_layout: str
        Igraph network layout method

    Returns:
    ----------
    An igraph plot
    """

    neighborhood_graph_layout = neighborhood_graph.layout(network_layout)

    if "net_polarity" not in neighborhood_graph.es.attributes():
        logger.warning(
            "net_polarity was not defined as an edge attribute so edges will not be colored"
        )
        neighborhood_graph.es.set_attribute_values("net_polarity", np.nan)

    color_dict = {
        "focal disease": "lime",
        "disease": "aquamarine",
        "focal": "lightcoral",
        "species": "firebrick",
        "reaction": "dodgerblue",
    }

    edge_polarity_colors = {
        "ambiguous": "dimgray",
        "activation": "gold",
        "inhibition": "royalblue",
        "ambiguous activation": "palegoldenrod",
        "ambiguous inhibition": "powderblue",
        np.nan: "dimgray",
    }

    visual_style = {}  # type: dict[str,Any]
    visual_style["background"] = "black"
    visual_style["vertex_size"] = 10
    if name_nodes:
        visual_style["vertex_label"] = [
            textwrap.fill(x, 15) for x in neighborhood_graph.vs["node_name"]
        ]
    visual_style["vertex_label_color"] = "white"
    visual_style["vertex_label_size"] = 8
    visual_style["vertex_label_angle"] = 90
    visual_style["vertex_label_dist"] = 3
    visual_style["vertex_color"] = [
        color_dict[x] for x in neighborhood_graph.vs["node_type"]
    ]
    visual_style["edge_color"] = [
        edge_polarity_colors[x] for x in neighborhood_graph.es["net_polarity"]
    ]
    visual_style["layout"] = neighborhood_graph_layout
    visual_style["bbox"] = (plot_size, plot_size)
    visual_style["margin"] = 50
    visual_style["title"] = "foobar"

    return ig.plot(neighborhood_graph, **visual_style)
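
# Illustrative usage sketch (not part of the released file; "SC00001" is a
# hypothetical id): a pruned neighborhood's graph can be passed straight to the
# plotting helper, assuming an igraph plotting backend (e.g., cairo) is installed:
#
#     fig = plot_neighborhood(pruned["SC00001"]["graph"], name_nodes=True)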


def _precompute_neighbors(
    compartmentalized_species: list[str],
    precomputed_distances: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    network_type: str = NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
    order: int = 3,
    top_n: int = 10,
) -> pd.DataFrame:
    """
    Precompute Neighbors

    Identify compartmentalized_species' most tightly connected neighbors using parameters
    shared by the on-the-fly methods (order for identifying neighbors within N steps;
    top_n for identifying the lowest weight network paths between the focal node
    and each possible neighbor). This precomputation will greatly speed up the neighborhood
    generation for highly connected species or densely connected networks. In those situations
    naively creating a neighborhood in N steps could contain thousands of neighbors.

    """

    # check that compartmentalized_species are included in precomputed_distances
    all_cspecies = {
        *precomputed_distances["sc_id_origin"].tolist(),
        *precomputed_distances["sc_id_dest"].tolist(),
    }
    missing_cspecies = set(compartmentalized_species).difference(all_cspecies)
    if len(missing_cspecies) > 0:
        logged_specs = ", ".join(list(missing_cspecies)[0:10])
        logger.warning(
            f"{len(missing_cspecies)} cspecies were missing from precomputed_distances including {logged_specs}"
        )

    # filter precomputed_distances to those which originate or end with one of the compartmentalized_species
    # if we are looking for downstream species then we want relationships where a cspecies is the origin
    if network_type in [
        NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
        NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
    ]:
        valid_origin = precomputed_distances["sc_id_origin"].isin(
            compartmentalized_species
        )
    if network_type in [
        NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM,
        NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
    ]:
        valid_dest = precomputed_distances["sc_id_dest"].isin(compartmentalized_species)

    if network_type == NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS:
        cspecies_subset_precomputed_distances = precomputed_distances[
            [True if (x or y) else False for (x, y) in zip(valid_origin, valid_dest)]
        ]
    elif network_type == NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM:
        cspecies_subset_precomputed_distances = precomputed_distances.loc[valid_origin]
    elif network_type == NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM:
        cspecies_subset_precomputed_distances = precomputed_distances.loc[valid_dest]
    else:
        raise ValueError(
            f"network_type was {network_type} and must by one of 'hourglass', 'downstream', 'upstream'"
        )

    logger.debug(
        f"Pre-filtered neighbors {cspecies_subset_precomputed_distances.shape[0]}"
    )

    # filter by distance
    close_cspecies_subset_precomputed_distances = cspecies_subset_precomputed_distances[
        cspecies_subset_precomputed_distances["path_length"] <= order
    ]

    # filter to retain top_n
    if network_type in [
        NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
        NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
    ]:
        top_descendants = (
            close_cspecies_subset_precomputed_distances[
                close_cspecies_subset_precomputed_distances["sc_id_origin"].isin(
                    compartmentalized_species
                )
            ]
            # sort by path_weight so we can retain the lowest weight neighbors
            .sort_values("path_weights")
            .groupby("sc_id_origin")
            .head(top_n)
        )

        logger.debug(f"N top_descendants {top_descendants.shape[0]}")

    if network_type in [
        NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM,
        NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
    ]:
        top_ancestors = (
            close_cspecies_subset_precomputed_distances[
                close_cspecies_subset_precomputed_distances["sc_id_dest"].isin(
                    compartmentalized_species
                )
            ]
            # sort by path_upstream_weights so we can retain the lowest weight neighbors
            # we allow for upstream weights to differ from downstream weights
            # when creating a network in process_cpr_graph.
            #
            # the default network weighting penalizes an edge from a node
            # based on the number of children it has. this captures the idea
            # that if there are many children we might expect that each
            # of them is less likely to transduce an effect.
            # the logic is flipped if we are looking for ancestors where
            # we penalize based on the number of parents of a node when
            # we use it (i.e., the default upstream_weights).
            .sort_values("path_upstream_weights")
            .groupby("sc_id_dest")
            .head(top_n)
        )

        logger.debug(f"N top_ancestors {top_ancestors.shape[0]}")

    # add reactions

    if network_type in [
        NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
        NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
    ]:
        downstream_reactions = _find_reactions_by_relationship(
            precomputed_neighbors=top_descendants,
            compartmentalized_species=compartmentalized_species,
            sbml_dfs=sbml_dfs,
            relationship="descendants",
        )

        if downstream_reactions is not None:
            logger.debug(f"N downstream reactions {downstream_reactions.shape[0]}")

    if network_type in [
        NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM,
        NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
    ]:
        upstream_reactions = _find_reactions_by_relationship(
            precomputed_neighbors=top_ancestors,
            compartmentalized_species=compartmentalized_species,
            sbml_dfs=sbml_dfs,
            relationship="ancestors",
        )

        if upstream_reactions is not None:
            logger.debug(f"N upstream reactions {upstream_reactions.shape[0]}")

    # add the self links since sc_id_dest will be used to define
    # an sc_id_origin-specific subgraph
    identity_df = pd.DataFrame(
        {
            "sc_id_origin": compartmentalized_species,
            "sc_id_dest": compartmentalized_species,
        }
    )

    # combine all ancestor-descendant edges into the precomputed_neighbors edgelist
    if network_type == NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS:
        precomputed_neighbors = pd.concat(
            [
                top_ancestors,
                top_descendants,
                upstream_reactions,  # type: ignore
                downstream_reactions,  # type: ignore
                identity_df,
            ]
        )[["sc_id_origin", "sc_id_dest"]].drop_duplicates()
    elif network_type == NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM:
        precomputed_neighbors = pd.concat([top_descendants, downstream_reactions, identity_df])[  # type: ignore
            ["sc_id_origin", "sc_id_dest"]
        ].drop_duplicates()
    elif network_type == NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM:
        precomputed_neighbors = pd.concat([top_ancestors, upstream_reactions, identity_df])[  # type: ignore
            ["sc_id_origin", "sc_id_dest"]
        ].drop_duplicates()
    else:
        raise ValueError("This error shouldn't happen")

    return precomputed_neighbors


def _build_raw_neighborhood_df(
    cpr_graph: ig.Graph,
    compartmentalized_species: list[str],
    network_type: str,
    order: int,
    precomputed_neighbors: pd.DataFrame | None = None,
) -> pd.DataFrame:
    # report if network_type is not the default and will be ignored due to the network
    # being undirected
    is_directed = cpr_graph.is_directed()
    if not is_directed and network_type != NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM:
        logger.warning(
            "Network is undirected; network_type will be treated as 'downstream'"
        )
        network_type = NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM

    # create the "out-network" of descendant nodes
    if network_type in [
        NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
        NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
    ]:
        descendants_df = _find_neighbors(
            cpr_graph=cpr_graph,
            compartmentalized_species=compartmentalized_species,
            relationship="descendants",
            order=order,
            precomputed_neighbors=precomputed_neighbors,
        )

    # create the "in-network" of ancestor nodes
    if network_type in [
        NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM,
        NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
    ]:
        ancestors_df = _find_neighbors(
            cpr_graph=cpr_graph,
            compartmentalized_species=compartmentalized_species,
            relationship="ancestors",
            order=order,
            precomputed_neighbors=precomputed_neighbors,
        )

    if network_type == NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS:
        # merge descendants and ancestors
        neighborhood_df = pd.concat([ancestors_df, descendants_df])
    elif network_type == NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM:
        neighborhood_df = descendants_df
    elif network_type == NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM:
        neighborhood_df = ancestors_df
    else:
        raise NotImplementedError("invalid network_type")

    # add name since this is an easy way to lookup igraph vertices
    neighborhood_df["name"] = [
        x["name"] for x in cpr_graph.vs[neighborhood_df["neighbor"]]
    ]

    return neighborhood_df


def _find_neighbors(
    cpr_graph: ig.Graph,
    compartmentalized_species: list[str],
    relationship: str,
    order: int = 3,
    precomputed_neighbors: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """
    Find Neighbors

    Identify the neighbors nearby each of the requested compartmentalized_species

    If 'precomputed_neighbors' are provided, neighbors will be summarized by reformatting
    this table. Otherwise, neighbors will be found on-the-fly using the igraph.neighborhood() method.

    """

    if isinstance(precomputed_neighbors, pd.DataFrame):
        # add graph indices to neighbors
        nodes_to_names = (
            pd.DataFrame({"name": cpr_graph.vs["name"]})
            .reset_index()
            .rename({"index": "neighbor"}, axis=1)
        )

        if relationship == "descendants":
            bait_id = "sc_id_origin"
            target_id = "sc_id_dest"
        elif relationship == "ancestors":
            bait_id = "sc_id_dest"
            target_id = "sc_id_origin"
        else:
            raise ValueError(
                f"relationship must be 'descendants' or 'ancestors' but was {relationship}"
            )

        neighbors_df = (
            precomputed_neighbors[
                precomputed_neighbors[bait_id].isin(compartmentalized_species)
            ]
            .merge(nodes_to_names.rename({"name": target_id}, axis=1))
            .rename({bait_id: "sc_id"}, axis=1)
            .drop([target_id], axis=1)
            .assign(relationship=relationship)
        )
    else:
        if relationship == "descendants":
            mode_type = "out"
        elif relationship == "ancestors":
            mode_type = "in"
        else:
            raise ValueError(
                f"relationship must be 'descendants' or 'ancestors' but was {relationship}"
            )

        neighbors = cpr_graph.neighborhood(
            # mode = out queries outgoing edges and is ignored if the network is undirected
            vertices=compartmentalized_species,
            order=order,
            mode=mode_type,
        )

        neighbors_df = pd.concat(
            [
                pd.DataFrame({"sc_id": c, "neighbor": x}, index=range(0, len(x)))
                for c, x in zip(compartmentalized_species, neighbors)
            ]
        ).assign(relationship=relationship)

    return neighbors_df
|
1349
|
+
|
1350
|
+
|
1351
|
+
def _find_reactions_by_relationship(
    precomputed_neighbors,
    compartmentalized_species: list,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    relationship: str,
) -> pd.DataFrame | None:
    """
    Find Reactions by Relationship

    Based on an ancestor-descendant edgelist of compartmentalized species, find all
    reactions which involve 2+ members.

    Since we primarily care about paths between species, and reactions are more of a
    means-to-an-end for connecting pairs of species, precomputed_distances are generated
    between just pairs of species. This also keeps the problem feasible: the number of
    species is upper-bounded at <100K, while the number of reactions is unbounded.
    Having a bound ensures that we can calculate the precomputed_distances efficiently
    using matrix operations whose memory footprint scales as O(N^2).
    """

    # if there are no neighboring cspecies then there will be no reactions
    if precomputed_neighbors.shape[0] == 0:
        return None

    if relationship == "descendants":
        bait_id = "sc_id_origin"
        target_id = "sc_id_dest"
    elif relationship == "ancestors":
        bait_id = "sc_id_dest"
        target_id = "sc_id_origin"
    else:
        raise ValueError(
            f"relationship must be 'descendants' or 'ancestors' but was {relationship}"
        )

    # index by the bait id to create a series with all relatives of the specified relationship
    indexed_relatives = (
        precomputed_neighbors[
            precomputed_neighbors[bait_id].isin(compartmentalized_species)
        ]
        .set_index(bait_id)
        .sort_index()
    )

    reaction_relatives = list()

    # loop through compartmentalized species in precomputed_neighbors
    for uq in indexed_relatives.index.unique():
        relatives = indexed_relatives.loc[uq, target_id]
        if isinstance(relatives, str):
            relatives = [relatives]
        elif isinstance(relatives, pd.Series):
            relatives = relatives.tolist()
        else:
            raise ValueError("relatives is an unexpected type")

        # add the focal node to the set of relatives
        relatives_cspecies = {*relatives, *[uq]}
        # count the number of relative cspecies participating in each reaction
        rxn_species_counts = sbml_dfs.reaction_species[
            sbml_dfs.reaction_species["sc_id"].isin(relatives_cspecies)
        ].value_counts("r_id")

        # retain reactions involving 2+ cspecies.
        # some of these reactions will be irrelevant and will be excluded when
        # calculating the shortest paths from/to the focal node from each neighbor
        # in prune_neighborhoods()
        neighboring_reactions = rxn_species_counts[
            rxn_species_counts >= 2
        ].index.tolist()

        # create new entries for reaction relatives
        kws = {bait_id: uq}
        new_entries = pd.DataFrame({target_id: neighboring_reactions}).assign(**kws)

        reaction_relatives.append(new_entries)

    reactions_df = pd.concat(reaction_relatives)

    return reactions_df


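# A toy sketch (not part of the original module) of the membership filter used above:
# reactions are retained only when 2+ of the relative compartmentalized species
# participate in them. The reaction and species ids below are invented for illustration.
def _demo_reaction_membership_filter() -> list:
    toy_reaction_species = pd.DataFrame(
        {
            "r_id": ["R1", "R1", "R2", "R2", "R3"],
            "sc_id": ["SC1", "SC2", "SC2", "SC9", "SC1"],
        }
    )
    relatives_cspecies = {"SC1", "SC2"}

    # count how many qualifying species appear in each reaction
    rxn_species_counts = toy_reaction_species[
        toy_reaction_species["sc_id"].isin(relatives_cspecies)
    ].value_counts("r_id")

    # only R1 has two qualifying members; R2 and R3 each have one
    return rxn_species_counts[rxn_species_counts >= 2].index.tolist()

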
def _prune_vertex_set(one_neighborhood: dict, top_n: int) -> pd.DataFrame:
    """
    Prune Vertex Set

    Filter a neighborhood to the lowest-weight neighbors connected to the focal node.
    During this process upstream and downstream nodes are treated separately.

    Parameters
    ----------
    one_neighborhood: dict
        The neighborhood around a single compartmentalized species - one of the values
        in the dict created by find_neighborhoods().
    top_n: int
        How many neighboring molecular species should be retained?
        If the neighborhood includes both upstream and downstream connections
        (i.e., hourglass), this filter will be applied to both sets separately.

    Returns
    -------
    vertices: pd.DataFrame
        The vertices in one_neighborhood with high-weight neighbors removed.

    """

    neighborhood_vertices = one_neighborhood["vertices"]

    indexed_neighborhood_species = neighborhood_vertices[
        neighborhood_vertices["node_type"] == "species"
    ].set_index("node_orientation")

    pruned_oriented_neighbors = list()
    for a_node_orientation in indexed_neighborhood_species.index.unique().tolist():
        vertex_subset = indexed_neighborhood_species.loc[a_node_orientation]
        if type(vertex_subset) is pd.Series:
            # handle cases where only one entry exists, so DataFrame -> Series coercion occurs
            vertex_subset = vertex_subset.to_frame().T

        sorted_vertex_set = vertex_subset.sort_values("path_weight")
        weight_cutoff = sorted_vertex_set["path_weight"].iloc[
            min(top_n - 1, sorted_vertex_set.shape[0] - 1)
        ]

        top_neighbors = sorted_vertex_set[
            sorted_vertex_set["path_weight"] <= weight_cutoff
        ]["name"].tolist()

        # include reactions and other species necessary to reach the top neighbors
        # by pulling in the past solutions to weighted shortest-path problems
        if a_node_orientation in one_neighborhood["neighborhood_path_entities"].keys():
            # path to/from the focal node to each species
            neighborhood_path_entities = one_neighborhood["neighborhood_path_entities"][
                a_node_orientation
            ]

            top_neighbors = set().union(
                *[neighborhood_path_entities[p] for p in top_neighbors]
            )

        pruned_oriented_neighbors.append(top_neighbors)

    # combine all neighbors
    pruned_neighbors = set().union(*pruned_oriented_neighbors)
    pruned_vertices = neighborhood_vertices[
        neighborhood_vertices["name"].isin(pruned_neighbors)
    ].reset_index(drop=True)

    return pruned_vertices


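# A small sketch (not part of the original module) of the path_weight cutoff applied
# above: neighbors are ranked by path_weight and everything at or below the weight of
# the top_n-th entry is kept, so ties at the cutoff survive. Values are invented for
# illustration.
def _demo_path_weight_cutoff(top_n: int = 2) -> list:
    toy_vertices = pd.DataFrame(
        {
            "name": ["N1", "N2", "N3", "N4"],
            "path_weight": [0.5, 1.0, 1.0, 3.0],
        }
    )

    sorted_vertex_set = toy_vertices.sort_values("path_weight")
    weight_cutoff = sorted_vertex_set["path_weight"].iloc[
        min(top_n - 1, sorted_vertex_set.shape[0] - 1)
    ]

    # with top_n = 2 the cutoff is 1.0, so N1, N2, and N3 are all retained
    return sorted_vertex_set[
        sorted_vertex_set["path_weight"] <= weight_cutoff
    ]["name"].tolist()

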
def _calculate_path_attrs(
    neighborhood_paths: list[list],
    edges: pd.DataFrame,
    vertices: list,
    weight_var: str = "weights",
) -> tuple[pd.DataFrame, dict[Any, set]]:
    """
    Calculate Path Attributes

    Return the vertices and path weights (sum of edge weights) for a list of paths.

    Parameters
    ----------
    neighborhood_paths: list
        List of lists of edge indices
    edges: pd.DataFrame
        Edges with rows corresponding to entries in neighborhood_paths inner lists
    vertices: list
        List of vertices corresponding to the ordering of neighborhood_paths
    weight_var: str
        Variable in edges to use for scoring path weights

    Returns
    -------
    path_attributes_df: pd.DataFrame
        A table containing attributes summarizing the path to each neighbor
    neighborhood_path_entities: dict
        Dict mapping from each neighbor to the entities connecting it to the focal node

    """

    if not isinstance(neighborhood_paths, list):
        raise TypeError("neighborhood_paths should be a list of lists of edge indices")
    if not isinstance(vertices, list):
        raise TypeError("vertices should be a list of vertices")
    assert len(vertices) > 0  # control for length-zero vertices upstream
    if len(neighborhood_paths) != len(vertices):
        raise ValueError("vertices and neighborhood_paths were not the same length")

    if any([len(x) > 0 for x in neighborhood_paths]):
        all_path_edges = (
            # create a table of edges traversed to reach each neighbor
            pd.concat(
                [
                    edges.iloc[neighborhood_paths[i]].assign(neighbor=vertices[i])
                    for i in range(0, len(neighborhood_paths))
                ]
            ).groupby("neighbor")
        )

        path_attributes_df = pd.concat(
            [
                all_path_edges[weight_var].agg("sum").rename("path_weight"),
                all_path_edges.agg("size").rename("path_length"),
                all_path_edges["link_polarity"]
                .agg(paths._terminal_net_polarity)
                .rename("net_polarity"),
                # add the final edge since this can be used to add path attributes to edges
                # i.e., apply net_polarity to an edge
                all_path_edges["from"].agg("last").rename("final_from"),
                all_path_edges["to"].agg("last").rename("final_to"),
            ],
            axis=1,
        ).reset_index()

        # create a dict mapping from a neighbor to all mediating nodes
        neighborhood_path_entities = {
            group_name: set().union(*[dat["from"], dat["to"]])
            for group_name, dat in all_path_edges
        }

    else:
        # catch the case where there are no paths
        path_attributes_df = pd.DataFrame()
        neighborhood_path_entities = dict()

    # add entries with no edges
    edgeless_nodes = [
        vertices[i]
        for i in range(0, len(neighborhood_paths))
        if len(neighborhood_paths[i]) == 0
    ]
    edgeless_nodes_df = pd.DataFrame({"neighbor": edgeless_nodes}).assign(
        path_length=0, path_weight=0, net_polarity=None
    )

    # add edgeless entries as entries in the two outputs
    path_attributes_df = pd.concat([path_attributes_df, edgeless_nodes_df])
    neighborhood_path_entities.update({x: {x} for x in edgeless_nodes})

    assert path_attributes_df.shape[0] == len(neighborhood_paths)
    assert len(neighborhood_path_entities) == len(neighborhood_paths)

    return path_attributes_df, neighborhood_path_entities
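

# A minimal sketch (not part of the original module) of the per-neighbor aggregation
# used above: the edges traversed to reach each neighbor are grouped and reduced to a
# path weight (sum) and path length (count). Column values are invented and the
# polarity aggregation from napistu.network.paths is omitted for brevity.
def _demo_path_aggregation() -> pd.DataFrame:
    toy_path_edges = pd.DataFrame(
        {
            "neighbor": ["N1", "N1", "N2"],
            "weights": [0.5, 0.25, 1.0],
            "from": ["focal", "X", "focal"],
            "to": ["X", "N1", "N2"],
        }
    )

    grouped = toy_path_edges.groupby("neighbor")

    # N1 -> path_weight 0.75 over 2 edges; N2 -> path_weight 1.0 over 1 edge
    return pd.concat(
        [
            grouped["weights"].agg("sum").rename("path_weight"),
            grouped.size().rename("path_length"),
            grouped["to"].agg("last").rename("final_to"),
        ],
        axis=1,
    ).reset_index()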