napistu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
@@ -0,0 +1,106 @@
|
|
1
|
+
"""
|
2
|
+
Module with helper functions to deal with edgelists
|
3
|
+
|
4
|
+
Edgelists are assumed to be DataFrames whose first two columns represent an Edge relation, eg From, To
|
5
|
+
"""
|
6
|
+
|
7
|
+
from __future__ import annotations
|
8
|
+
|
9
|
+
import logging
|
10
|
+
|
11
|
+
import pandas as pd
|
12
|
+
|
13
|
+
logger = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
def remove_reciprocal_interactions(
    edgelist: pd.DataFrame, extra_defining_vars: list | None = None
) -> pd.DataFrame:
    """Remove reciprocal edges from an edgelist (i.e., if B-A always exists for every A-B then remove B-A)

    Args:
        edgelist (pd.DataFrame): edgelist where the first two
            columns are assumed to be the edge vertices
        extra_defining_vars (list | None): optional list of variables which define
            a unique interaction beyond the vertices (defaults to no extra variables)

    Returns:
        indegenerate_edgelist (pd.DataFrame): edgelist with B-A edges removed and A-B retained

    Raises:
        ValueError: if not every edge is present as a reciprocal pair, in which
            case the ordering-based de-duplication below would drop real edges
    """
    # a mutable default argument (list()) is shared across calls; use None as
    # the sentinel instead
    if extra_defining_vars is None:
        extra_defining_vars = []

    # the first two columns are treated as the edge's vertices
    edgelist_vars = edgelist.columns.tolist()[0:2]
    logger.info(
        "Removing reciprocal interactions treating "
        f"{edgelist_vars[0]} and {edgelist_vars[1]} as vertices"
    )

    reciprocal_interaction_fraction = count_fraction_of_reciprocal_interactions(
        edgelist, extra_defining_vars
    )
    if reciprocal_interaction_fraction != 1:
        raise ValueError(
            f"Only {reciprocal_interaction_fraction} of edges are present as reciprocal edges;"
            " this method of removing reciprocal edges will be unreliable"
        )

    # every pair appears in both orientations, so keeping only the orientation
    # where vertex0 sorts before vertex1 retains exactly one copy per pair
    indegenerate_edgelist = edgelist.loc[
        edgelist[edgelist_vars[0]] < edgelist[edgelist_vars[1]]
    ]

    return indegenerate_edgelist
|
52
|
+
|
53
|
+
|
54
|
+
def count_fraction_of_reciprocal_interactions(
    edgelist: pd.DataFrame, extra_defining_vars: list | None = None
) -> float:
    """Count the fraction of A-B edges which also show up as B-A edges

    Args:
        edgelist (pd.DataFrame): edgelist where the first two
            columns are assumed to be the edge vertices
        extra_defining_vars (list | None): optional list of variables which define
            a unique interaction beyond the vertices (defaults to no extra variables)

    Returns:
        fraction (float): fraction of A-B edges which are also included as B-A edges

    Raises:
        ValueError: if any entry of extra_defining_vars is not a column of edgelist
    """
    # a mutable default argument (list()) is shared across calls; use None as
    # the sentinel instead
    if extra_defining_vars is None:
        extra_defining_vars = []

    # first two variables are assumed to be vertices of edgelist
    edgelist_vars = edgelist.columns.tolist()[0:2]
    logger.info(
        "Counting the fraction of reciprocal interactions treating "
        f"{edgelist_vars[0]} and {edgelist_vars[1]} as vertices"
    )

    # extra defining variables must exist
    missing_extra_defining_vars = set(extra_defining_vars).difference(
        set(edgelist.columns)
    )
    if len(missing_extra_defining_vars) > 0:
        raise ValueError(
            f"{', '.join(missing_extra_defining_vars)} are \"extra_defining_vars\" "
            "but were missing from the edgelist"
        )

    extended_edgelist_vars = [*edgelist_vars, *extra_defining_vars]
    logger.info(
        f"{', '.join(extra_defining_vars)} will be used as \"extra_defining_vars\" "
        "which must match across reciprocal edges for the edge to be identical"
    )

    # swap the two vertex columns; inner-merging this reversed copy back onto
    # the original edgelist keeps only edges whose B-A counterpart exists
    possible_reciprocal_interactions = (
        edgelist[extended_edgelist_vars]
        .rename(
            {edgelist_vars[0]: edgelist_vars[1], edgelist_vars[1]: edgelist_vars[0]},
            axis=1,
        )
        .assign(reciprocal_exists=True)
    )

    reciprocal_interaction_test = edgelist[extended_edgelist_vars].merge(
        possible_reciprocal_interactions
    )

    return reciprocal_interaction_test.shape[0] / edgelist.shape[0]
|
@@ -0,0 +1,148 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
import re
|
5
|
+
|
6
|
+
import pandas as pd
|
7
|
+
import requests
|
8
|
+
from napistu.ingestion.constants import IDENTIFIERS_ETL_SBO_URL
|
9
|
+
from napistu.ingestion.constants import IDENTIFIERS_ETL_YEAST_FIELDS
|
10
|
+
from napistu.ingestion.constants import IDENTIFIERS_ETL_YEAST_URL
|
11
|
+
from napistu.ingestion.constants import IDENTIFIERS_ETL_YEAST_HEADER_REGEX
|
12
|
+
|
13
|
+
|
14
|
+
def read_yeast_identifiers(url: str = IDENTIFIERS_ETL_YEAST_URL):
    """Read Yeast Identifiers

    Generate a pd.DataFrame which maps between yeast identifiers including
    common and systematic (OLN) names, as well as Swiss-Prot and SGD identifiers.

    Params:
        url (str): url to the identifier file
    Returns:
        pd.DataFrame with one row per gene

    Raises:
        ValueError: if a data line cannot be normalized into the expected fields
    """
    response = requests.get(url).text

    yeast_id_list = list()
    break_line_hit = 0
    for line in response.splitlines():
        if re.match(IDENTIFIERS_ETL_YEAST_HEADER_REGEX, line):
            # find start and end of header indicated by a line of underscores
            break_line_hit += 1
            continue

        if break_line_hit >= 2:
            if line == "":
                # reached the end
                break

            # split each line into a list of fields, the only optional field is 3d
            # all white spaces are space
            line = re.sub(" +", " ", line)
            line = re.sub("; ", ";", line)
            # remove pol and gag designations from transposons since they are an
            # unnecessary extra field. the alternation must be grouped: the former
            # pattern "(-[0-9]) (GAG)|(POL)" made "(POL)" a free-standing
            # alternative, deleting "POL" anywhere it appeared (e.g. inside the
            # real gene names POL1/POL30)
            line = re.sub("(-[0-9]) (GAG|POL)", "\\1", line)

            line = line.split()

            if line[6] != "(3)":
                # if no 3D field is present then create one
                line.insert(6, "none")

            # split common fields into a separate list
            common_list = line[0].split(";")
            line[0] = common_list[0]
            line.insert(1, common_list)

            # 8 raw fields plus the inserted common-name list = 9 entries
            if len(line) != 9:
                raise ValueError(
                    "the yeast id file could not be read; all entries should have 8 fields"
                )

            yeast_id_list.append(dict(zip(IDENTIFIERS_ETL_YEAST_FIELDS, line)))

    return pd.DataFrame(yeast_id_list)
|
65
|
+
|
66
|
+
|
67
|
+
def read_sbo_ontology(
    url: str = IDENTIFIERS_ETL_SBO_URL, verbose: bool = False
) -> pd.DataFrame:
    """Read SBO Ontology
    Read the Systems Biology Ontology (SBO) identifiers and reformat the obo results into a pd.DataFrame.

    Params:
        url (str): url to the obo specification file
        verbose (bool): throw warnings when attributes are overwritten
    Returns:
        pd.DataFrame with one row per SBO term: sbo_term, name, comment, is_a, is_obsolete
    """

    # save the obo file locally; a context manager guarantees the handle is
    # flushed and closed before the file is reopened for reading below
    # (the previous open(...).write(...) form leaked the handle)
    tmp_file = os.path.join("/tmp", "sbo.obo")
    r = requests.get(url, allow_redirects=True)
    with open(tmp_file, "wb") as obo_out:
        obo_out.write(r.content)

    with open(tmp_file) as sbo:
        # keys are SBO ids, values are dicts of term attributes
        sbo_dict = dict()
        current_id = None
        in_header = True
        for line in sbo:
            # skip the header
            if line == "[Term]\n":
                in_header = False
                continue
            if in_header:
                continue

            # "attribute: value"; maxsplit=1 keeps colons inside the value intact
            line_entries = line.split(":", 1)

            if len(line_entries) == 2:
                entry_type = line_entries[0]
                entry_value = line_entries[1].strip()

                # drop type defs (ids which do not start with "SBO")
                if (
                    (current_id is not None)
                    and (entry_type != "id")
                    and (re.match("SBO", current_id) is None)
                ):
                    continue

                # clean-up definitions: keep only the "SBO:NNNNNNN" part of an
                # is_a line, dropping the trailing "! name" annotation
                if entry_type == "is_a":
                    entry_value = re.match("SBO:[0-9]+", entry_value)[0]

                # if a new id has been reached then initilize a new dict and
                # update current id

                if entry_type == "id":
                    current_id = entry_value
                    if re.match("SBO", current_id) is not None:
                        sbo_dict[current_id] = {"is_a": []}
                    continue

                if entry_type == "is_a":
                    # terms may have multiple parents, so is_a accumulates a list
                    sbo_dict[current_id]["is_a"].append(entry_value)
                else:
                    # add a new entry; later duplicates overwrite earlier ones
                    if (entry_type in sbo_dict[current_id].keys()) and verbose:
                        print(
                            f"2+ {entry_type} entries were found for {current_id}, only one value should be present "
                        )
                    sbo_dict[current_id][entry_type] = entry_value

    sbo_df = pd.DataFrame(sbo_dict).T

    # a term is obsolete if its own name is flagged, or if it descends from an
    # obsolete term via is_a
    obsolete_terms = set(
        sbo_df["name"][sbo_df["name"].str.match("obsolete")].index.tolist()
    )
    sbo_df["is_obsolete"] = [
        (x in obsolete_terms) | (len(set(y).intersection(obsolete_terms)) > 0)
        for x, y in zip(sbo_df.index, sbo_df["is_a"])
    ]

    sbo_df = sbo_df[["name", "comment", "is_a", "is_obsolete"]]
    sbo_df.index.name = "sbo_term"
    sbo_df = sbo_df.reset_index()

    return sbo_df
|
napistu/ingestion/obo.py
ADDED
@@ -0,0 +1,268 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import collections
|
4
|
+
import os
|
5
|
+
from itertools import chain
|
6
|
+
from typing import Any
|
7
|
+
|
8
|
+
import igraph as ig
|
9
|
+
import pandas as pd
|
10
|
+
from napistu import utils
|
11
|
+
from napistu.ingestion.constants import OBO_GO_BASIC_LOCAL_TMP
|
12
|
+
from napistu.ingestion.constants import OBO_GO_BASIC_URL
|
13
|
+
|
14
|
+
|
15
|
+
def create_go_parents_df(go_basic_obo_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create the GO Parents Table

    Reformat a table with GO attributes into a table with child-parent relationships

    Args:
        go_basic_obo_df (pd.DataFrame): Table generated from parsing go-basic.obo with
            obo.format_obo_dict_as_df

    Returns:
        go_parents_df (pd.DataFrame): a table with:
        - parent_id: GO ID of parent (from an is-a entry)
        - parent_name: common name of parent (from an is-a entry)
        - child_id: GO ID from the index

    Raises:
        ValueError: if term or relationship counts fall outside the expected range,
            indicating a parsing problem or a major ontology change
    """
    # filter to CC ontology and look at a series
    # where the index is GO IDs and values is a list of parent "is-a" relations
    cc_parents = go_basic_obo_df.query("namespace == 'cellular_component'")["is_a"]

    # sanity check: currently at 4496 rows - this is expected to slowly increase.
    # raise rather than assert (asserts are stripped under python -O) so a bad
    # parse fails loudly
    if not (4496 <= cc_parents.shape[0] < 5000):
        raise ValueError(
            f"{cc_parents.shape[0]} cellular_component terms were found; "
            "expected between 4496 and 5000 - check the obo parse"
        )

    # convert from a list of strings to a list of dicts then expand so each
    # dict is its own row
    parent_entries = cc_parents.map(_isa_str_list_to_dict_list).explode()
    # drop orphans which will be NaN's after the explosion
    parent_entries = parent_entries[~parent_entries.isnull()]

    # convert to a DF which just has string variables
    go_parents_df = pd.DataFrame(parent_entries.tolist())
    go_parents_df["child_id"] = parent_entries.index

    # currently at 4688 rows - this may increase or decrease but will do so slowly
    if not (4600 < go_parents_df.shape[0] < 5000):
        raise ValueError(
            f"{go_parents_df.shape[0]} parent-child relationships were found; "
            "expected between 4600 and 5000 - check the obo parse"
        )

    return go_parents_df
|
55
|
+
|
56
|
+
|
57
|
+
def read_obo_as_dict(local_obo_path: str) -> dict:
    """
    Read OBO as Dictionary

    The Open Biological and Biomedical Ontologies (OBO) format is a standard format
    for representing ontologies. Many parsers exist for obo but since we are not
    relying extensively on it and we are trying to minimize dependencies here we provide a
    few functions for parsing standard obo formats.

    Args:
        local_obo_path (str): path to a local obo file.

    Returns
        term_dict (dict): dictionary where keys are ids and values are tuples
            containing (attribute, value) pairs
    """
    # create a dict where keys are term IDs and values are lists of tuples
    term_dict = dict()  # type: dict[str, Any]
    # term_is_next flags that the previous line was "[Term]", so the next
    # parsed line must be the term's "id" entry
    term_is_next = False
    # active_term is the id of the term whose attributes are being collected;
    # None while outside any term stanza
    active_term = None

    with open(local_obo_path) as file:
        for line in file:
            line_strip = line.rstrip()

            # reset the active term using the break between term definitions
            if line_strip == "":
                active_term = None

            # (attribute, value) tuple, or None for lines without ": "
            line_as_tuple = _format_entry_tuple(line_strip)

            # catch new term definitions
            if term_is_next:
                # NOTE(review): assumes the line directly after "[Term]" always
                # parses as "id: ..."; if it were blank, line_as_tuple would be
                # None and this unpack would raise TypeError - TODO confirm
                # upstream obo files guarantee this layout
                attrib, value = line_as_tuple
                if attrib != "id":
                    raise ValueError(
                        f'{line_strip} was expected to be an "id" but it was not recongized as one'
                    )

                active_term = value
                term_dict[active_term] = list()
                term_is_next = False
                continue

            if line_strip == "[Term]":
                term_is_next = True
                continue
            else:
                term_is_next = False

            # accumulate attribute tuples for the current term; lines outside a
            # stanza (active_term is None) are ignored
            if active_term is not None:
                term_dict[active_term].append(line_as_tuple)

    return term_dict
|
111
|
+
|
112
|
+
|
113
|
+
def format_obo_dict_as_df(obo_term_dict: dict) -> pd.DataFrame:
    """
    Format an OBO Dict as a DataFrame

    Reorganize a dictionary of tuples into a DataFrame

    Args:
        obo_term_dict (dict): dictionary where keys are ids and values are tuples
            containing (attribute, value) pairs

    Returns
        obo_df (pd.DataFrame): A pd.DataFrame with one row per identifier and one columns for unique attribute
    """
    # attributes that occur more than once for any term become list-valued
    # columns; everything else stays a plain string column
    degenerate_attribs = set(
        chain.from_iterable(
            _find_obo_attrib_dups(entries) for entries in obo_term_dict.values()
        )
    )

    # one record per term, tagged with its id so it can become the index
    records = []
    for term_id, entries in obo_term_dict.items():
        record = _reformat_obo_entry_as_dict(entries, degenerate_attribs)
        record["id"] = term_id
        records.append(record)

    return pd.DataFrame(records).set_index("id")
|
141
|
+
|
142
|
+
|
143
|
+
def _reformat_obo_entry_as_dict(one_term, degenerate_attribs) -> dict:
    """Fold (attribute, value) tuples into a dict; attributes named in
    degenerate_attribs accumulate their values in a list (empty if unseen),
    while all other attributes keep the last value encountered."""
    reformatted = {attrib: [] for attrib in degenerate_attribs}

    for attrib, value in one_term:
        if attrib in degenerate_attribs:
            reformatted[attrib].append(value)
        else:
            reformatted[attrib] = value

    return reformatted
|
155
|
+
|
156
|
+
|
157
|
+
def create_parent_child_graph(go_parents_df: pd.DataFrame) -> ig.Graph:
    """
    Create Parent:Child Graph

    Format the Simple GO CC Ontology as a Directed Acyclic Graph (DAG).

    Args:
        go_parents_df (pd.DataFrame): a table with:
        - parent_id: GO ID of parent (from an is-a entry)
        - parent_name: common name of parent (from an is-a entry)
        - child_id: GO ID from the index

    Returns:
        parent_child_graph (ig.Graph): a DAG formed from parent-child relationships.

    Raises:
        ValueError: if the resulting graph is not a weakly-connected DAG

    """
    # vertices = every GO id appearing as either a parent or a child
    valid_go_ids = {
        *go_parents_df["parent_id"].tolist(),
        *go_parents_df["child_id"].tolist(),
    }
    valid_go_ids_df = pd.DataFrame(valid_go_ids)
    valid_go_ids_df.columns = ["go_id"]  # type: ignore

    # format edgelist as an igraph network; edges point child -> parent
    parent_child_graph = ig.Graph.DictList(
        vertices=valid_go_ids_df.to_dict("records"),
        edges=go_parents_df[["child_id", "parent_id"]].to_dict("records"),
        directed=True,
        vertex_name_attr="go_id",
        edge_foreign_keys=("child_id", "parent_id"),
    )

    # is it a fully connected DAG as expected? raise rather than assert
    # (asserts are stripped under python -O) so malformed input fails loudly
    if not parent_child_graph.is_dag():
        raise ValueError(
            "the parent-child graph contains cycles; a DAG was expected"
        )
    if not parent_child_graph.is_connected("weak"):
        raise ValueError(
            "the parent-child graph is not weakly connected; a single component was expected"
        )

    return parent_child_graph
|
194
|
+
|
195
|
+
|
196
|
+
def create_go_ancestors_df(parent_child_graph: ig.Graph) -> pd.DataFrame:
    """
    Create GO Ancestors DataFrame

    Args:
        parent_child_graph (ig.Graph): a DAG formed from parent-child relationships.

    Returns:
        go_ancestors_df (pd.DataFrame): a table with:
        - go_id: GO ID of a CC GO term of interest
        - ancestor_id: An ancestor (parent, parent of parent, ...)'s GO CC ID
    """
    # for each vertex, collect the go_ids of everything reachable by following
    # child -> parent edges (i.e., the vertex itself plus all its ancestors)
    ancestor_records = []
    for vertex in parent_child_graph.vs:
        reachable = parent_child_graph.subcomponent(vertex, mode=ig.OUT)
        ancestor_ids = parent_child_graph.vs(reachable).get_attribute_values("go_id")
        ancestor_records.append(
            {"go_id": vertex["go_id"], "ancestor_id": ancestor_ids}
        )

    # one row per (term, ancestor) pair
    go_ancestors_df = pd.DataFrame(ancestor_records).explode("ancestor_id")

    # drop self edges (each vertex is trivially reachable from itself)
    keep = go_ancestors_df["go_id"] != go_ancestors_df["ancestor_id"]
    return go_ancestors_df[keep]
|
226
|
+
|
227
|
+
|
228
|
+
def _download_go_basic_obo(local_obo_path: str = OBO_GO_BASIC_LOCAL_TMP) -> None:
    """Download an OBO file containing GO categories and their relations (but not
    the genes in each category), raising FileNotFoundError if the download did
    not produce the expected local file."""

    utils.download_wget(OBO_GO_BASIC_URL, local_obo_path)

    # guard clause: succeed silently when the file landed where expected
    if os.path.isfile(local_obo_path):
        return

    raise FileNotFoundError(
        f"{local_obo_path} was not found after trying to download from {OBO_GO_BASIC_URL}"
    )
|
237
|
+
|
238
|
+
|
239
|
+
def _isa_str_list_to_dict_list(isa_list: list) -> list[dict[str, Any]]:
    """Split parent-child relationships from individual strings to dictionaries
    where parent and child are separated.

    Args:
        isa_list (list): "is_a" strings of the form "<parent_id> ! <parent_name>"

    Returns:
        isa_dict_list (list[dict]): one dict per entry with parent_id and parent_name keys

    Raises:
        ValueError: if an entry does not contain exactly one " ! " separator
            (raised explicitly rather than asserted, since asserts are stripped
            under python -O)
    """
    isa_dict_list = []
    for val in isa_list:
        split_val = tuple(val.split(" ! "))
        if len(split_val) != 2:
            raise ValueError(
                f"is_a entry {val!r} did not match the expected '<parent_id> ! <parent_name>' format"
            )

        isa_dict_list.append({"parent_id": split_val[0], "parent_name": split_val[1]})

    return isa_dict_list
|
251
|
+
|
252
|
+
|
253
|
+
def _format_entry_tuple(line_str: str) -> tuple | None:
    """Parse an "attribute: value" line into an (attribute, value) tuple,
    returning None when the line has no ": " separator."""

    attrib, sep, value = line_str.partition(": ")
    return (attrib, value) if sep else None
|
260
|
+
|
261
|
+
|
262
|
+
def _find_obo_attrib_dups(one_term) -> list:
    """Identify attributes which are present multiple times among a term's
    (attribute, value) tuples."""

    attrib_count = collections.Counter(attrib for attrib, _ in one_term)
    return [attrib for attrib, n in attrib_count.items() if n > 1]