napistu-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
napistu/ingestion/cpr_edgelist.py
@@ -0,0 +1,106 @@
+ """
+ Module with helper functions for working with edgelists.
+
+ Edgelists are assumed to be DataFrames whose first two columns represent an edge relation, e.g., From, To
+ """
+
+ from __future__ import annotations
+
+ import logging
+
+ import pandas as pd
+
+ logger = logging.getLogger(__name__)
+
+
+ def remove_reciprocal_interactions(
+     edgelist: pd.DataFrame, extra_defining_vars: list = list()
+ ) -> pd.DataFrame:
+     """Remove reciprocal edges from an edgelist (i.e., if B-A exists for every A-B, then remove B-A)
+
+     Args:
+         edgelist (pd.DataFrame): edgelist where the first two
+             columns are assumed to be the edge vertices
+         extra_defining_vars (list): list (which can be empty) of variables which define
+             a unique interaction beyond the vertices
+
+     Returns:
+         indegenerate_edgelist (pd.DataFrame): edgelist with B-A edges removed and A-B retained
+
+     """
+
+     edgelist_vars = edgelist.columns.tolist()[0:2]
+     logger.info(
+         "Removing reciprocal interactions treating "
+         f"{edgelist_vars[0]} and {edgelist_vars[1]} as vertices"
+     )
+
+     reciprocal_interaction_fraction = count_fraction_of_reciprocal_interactions(
+         edgelist, extra_defining_vars
+     )
+     if reciprocal_interaction_fraction != 1:
+         raise ValueError(
+             f"Only {reciprocal_interaction_fraction} of edges are present as reciprocal edges;"
+             " this method of removing reciprocal edges will be unreliable"
+         )
+
+     indegenerate_edgelist = edgelist.loc[
+         edgelist[edgelist_vars[0]] < edgelist[edgelist_vars[1]]
+     ]
+
+     return indegenerate_edgelist
+
+
+ def count_fraction_of_reciprocal_interactions(
+     edgelist: pd.DataFrame, extra_defining_vars: list = list()
+ ) -> float:
+     """Count the fraction of A-B edges which also show up as B-A edges
+
+     Args:
+         edgelist (pd.DataFrame): edgelist where the first two
+             columns are assumed to be the edge vertices
+         extra_defining_vars (list): list (which can be empty) of variables which define
+             a unique interaction beyond the vertices
+
+     Returns:
+         fraction (float): fraction of A-B edges which are also included as B-A edges
+
+     """
+
+     # the first two variables are assumed to be the vertices of the edgelist
+     edgelist_vars = edgelist.columns.tolist()[0:2]
+     logger.info(
+         "Counting the fraction of reciprocal interactions treating "
+         f"{edgelist_vars[0]} and {edgelist_vars[1]} as vertices"
+     )
+
+     # extra defining variables must exist
+     missing_extra_defining_vars = set(extra_defining_vars).difference(
+         set(edgelist.columns)
+     )
+     if len(missing_extra_defining_vars) > 0:
+         raise ValueError(
+             f"{', '.join(missing_extra_defining_vars)} are \"extra_defining_vars\" "
+             "but were missing from the edgelist"
+         )
+
+     extended_edgelist_vars = [*edgelist_vars, *extra_defining_vars]
+     logger.info(
+         f"{', '.join(extra_defining_vars)} will be used as \"extra_defining_vars\" "
+         "which must match across reciprocal edges for the edges to be identical"
+     )
+
+     possible_reciprocal_interactions = (
+         edgelist[extended_edgelist_vars]
+         .rename(
+             {edgelist_vars[0]: edgelist_vars[1], edgelist_vars[1]: edgelist_vars[0]},
+             axis=1,
+         )
+         .assign(reciprocal_exists=True)
+     )
+
+     reciprocal_interaction_test = edgelist[extended_edgelist_vars].merge(
+         possible_reciprocal_interactions
+     )
+
+     return reciprocal_interaction_test.shape[0] / edgelist.shape[0]
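
As a quick illustration of how these two helpers compose, here is a minimal, hypothetical usage sketch (not part of the package): the reciprocal-fraction check guards the removal step, which then keeps only the lexicographically ordered direction of each edge. The column names "from" and "to" are illustrative; any first two columns work.

import pandas as pd

from napistu.ingestion.cpr_edgelist import (
    count_fraction_of_reciprocal_interactions,
    remove_reciprocal_interactions,
)

# a fully symmetric edgelist: every A-B edge also appears as B-A
edges = pd.DataFrame({"from": ["A", "B", "B", "C"], "to": ["B", "A", "C", "B"]})

print(count_fraction_of_reciprocal_interactions(edges))  # 1.0
print(remove_reciprocal_interactions(edges))  # retains A-B and B-C only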
napistu/ingestion/identifiers_etl.py
@@ -0,0 +1,148 @@
+ from __future__ import annotations
+
+ import os
+ import re
+
+ import pandas as pd
+ import requests
+ from napistu.ingestion.constants import IDENTIFIERS_ETL_SBO_URL
+ from napistu.ingestion.constants import IDENTIFIERS_ETL_YEAST_FIELDS
+ from napistu.ingestion.constants import IDENTIFIERS_ETL_YEAST_URL
+ from napistu.ingestion.constants import IDENTIFIERS_ETL_YEAST_HEADER_REGEX
+
+
+ def read_yeast_identifiers(url: str = IDENTIFIERS_ETL_YEAST_URL):
+     """Read Yeast Identifiers
+
+     Generate a pd.DataFrame which maps between yeast identifiers, including
+     common and systematic (OLN) names, as well as Swiss-Prot and SGD identifiers.
+
+     Params:
+         url (str): url to the identifier file
+     Returns:
+         pd.DataFrame with one row per gene
+     """
+     response = requests.get(url).text
+
+     yeast_id_list = list()
+     break_line_hit = 0
+     for line in response.splitlines():
+         if re.match(IDENTIFIERS_ETL_YEAST_HEADER_REGEX, line):
+             # find the start and end of the header, indicated by lines of underscores
+             break_line_hit += 1
+             continue
+
+         if break_line_hit >= 2:
+             if line == "":
+                 # reached the end
+                 break
+
+             # split each line into a list of fields; the only optional field is 3D.
+             # normalize all runs of whitespace to a single space
+             line = re.sub(" +", " ", line)
+             line = re.sub("; ", ";", line)
+             # remove POL and GAG designations from transposons since they are an unnecessary extra field
+             line = re.sub("(-[0-9]) (GAG|POL)", "\\1", line)
+
+             line = line.split()
+
+             if line[6] != "(3)":
+                 # if no 3D field is present then create one
+                 line.insert(6, "none")
+
+             # split common names into a separate list
+             common_list = line[0].split(";")
+             line[0] = common_list[0]
+             line.insert(1, common_list)
+
+             if len(line) != 9:
+                 raise ValueError(
+                     "the yeast id file could not be read; all entries should have "
+                     "9 fields once the common-name list has been added"
+                 )
+
+             yeast_id_list.append(dict(zip(IDENTIFIERS_ETL_YEAST_FIELDS, line)))
+
+     return pd.DataFrame(yeast_id_list)
+
+
+ def read_sbo_ontology(
+     url: str = IDENTIFIERS_ETL_SBO_URL, verbose: bool = False
+ ) -> pd.DataFrame:
+     """Read SBO Ontology
+
+     Read the Systems Biology Ontology (SBO) identifiers and reformat the obo results into a pd.DataFrame.
+
+     Params:
+         url (str): url to the obo specification file
+         verbose (bool): warn when attributes are overwritten
+     Returns:
+         pd.DataFrame
+     """
+
+     # save the obo file locally
+     tmp_file = os.path.join("/tmp", "sbo.obo")
+     r = requests.get(url, allow_redirects=True)
+     with open(tmp_file, "wb") as f:
+         f.write(r.content)
+
+     with open(tmp_file) as sbo:
+         sbo_dict = dict()
+         current_id = None
+         in_header = True
+         for line in sbo:
+             # skip the header
+             if line == "[Term]\n":
+                 in_header = False
+                 continue
+             if in_header:
+                 continue
+
+             line_entries = line.split(":", 1)
+
+             if len(line_entries) == 2:
+                 entry_type = line_entries[0]
+                 entry_value = line_entries[1].strip()
+
+                 # drop typedefs
+                 if (
+                     (current_id is not None)
+                     and (entry_type != "id")
+                     and (re.match("SBO", current_id) is None)
+                 ):
+                     continue
+
+                 # clean up definitions
+                 if entry_type == "is_a":
+                     entry_value = re.match("SBO:[0-9]+", entry_value)[0]
+
+                 # if a new id has been reached then initialize a new dict and
+                 # update the current id
+                 if entry_type == "id":
+                     current_id = entry_value
+                     if re.match("SBO", current_id) is not None:
+                         sbo_dict[current_id] = {"is_a": []}
+                     continue
+
+                 if entry_type == "is_a":
+                     sbo_dict[current_id]["is_a"].append(entry_value)
+                 else:
+                     # add a new entry
+                     if (entry_type in sbo_dict[current_id].keys()) and verbose:
+                         print(
+                             f"2+ {entry_type} entries were found for {current_id}; only one value should be present"
+                         )
+                     sbo_dict[current_id][entry_type] = entry_value
+
+     sbo_df = pd.DataFrame(sbo_dict).T
+
+     obsolete_terms = set(
+         sbo_df["name"][sbo_df["name"].str.match("obsolete")].index.tolist()
+     )
+     sbo_df["is_obsolete"] = [
+         (x in obsolete_terms) | (len(set(y).intersection(obsolete_terms)) > 0)
+         for x, y in zip(sbo_df.index, sbo_df["is_a"])
+     ]
+
+     sbo_df = sbo_df[["name", "comment", "is_a", "is_obsolete"]]
+     sbo_df.index.name = "sbo_term"
+     sbo_df = sbo_df.reset_index()
+
+     return sbo_df
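
Both readers above fetch remote files and reshape them into tidy tables. A brief, hypothetical usage sketch (not part of the package) follows; network access is assumed, and row counts will drift as the upstream sources are updated.

from napistu.ingestion.identifiers_etl import (
    read_sbo_ontology,
    read_yeast_identifiers,
)

# one row per gene, mapping common/systematic names to Swiss-Prot and SGD ids
yeast_ids = read_yeast_identifiers()
print(yeast_ids.head())

# one row per SBO term; obsolete terms and their descendants are flagged
sbo_df = read_sbo_ontology(verbose=True)
print(sbo_df.loc[~sbo_df["is_obsolete"]].head())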
napistu/ingestion/obo.py
@@ -0,0 +1,268 @@
+ from __future__ import annotations
+
+ import collections
+ import os
+ from itertools import chain
+ from typing import Any
+
+ import igraph as ig
+ import pandas as pd
+ from napistu import utils
+ from napistu.ingestion.constants import OBO_GO_BASIC_LOCAL_TMP
+ from napistu.ingestion.constants import OBO_GO_BASIC_URL
+
+
+ def create_go_parents_df(go_basic_obo_df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Create the GO Parents Table
+
+     Reformat a table with GO attributes into a table of child-parent relationships.
+
+     Args:
+         go_basic_obo_df (pd.DataFrame): Table generated from parsing go-basic.obo with
+             obo.format_obo_dict_as_df
+
+     Returns:
+         go_parents_df (pd.DataFrame): a table with:
+         - parent_id: GO ID of parent (from an is-a entry)
+         - parent_name: common name of parent (from an is-a entry)
+         - child_id: GO ID from the index
+
+     """
+     # filter to the CC ontology and look at a series
+     # where the index is GO IDs and values are lists of parent "is-a" relations
+     cc_parents = go_basic_obo_df.query("namespace == 'cellular_component'")["is_a"]
+
+     # this is currently at 4496 rows - this is expected to slowly increase
+     assert cc_parents.shape[0] >= 4496
+     assert cc_parents.shape[0] < 5000
+
+     # convert from a list of strings to a list of dicts then expand so each
+     # dict is its own row
+     parent_entries = cc_parents.map(_isa_str_list_to_dict_list).explode()
+     # drop orphans, which will be NaNs after the explosion
+     parent_entries = parent_entries[~parent_entries.isnull()]
+
+     # convert to a DataFrame which just has string variables
+     go_parents_df = pd.DataFrame(parent_entries.tolist())
+     go_parents_df["child_id"] = parent_entries.index
+
+     # currently at 4688 rows - this may increase or decrease but will do so slowly
+     assert go_parents_df.shape[0] > 4600
+     assert go_parents_df.shape[0] < 5000
+
+     return go_parents_df
+
+
+ def read_obo_as_dict(local_obo_path: str) -> dict:
+     """
+     Read OBO as Dictionary
+
+     The Open Biological and Biomedical Ontologies (OBO) format is a standard format
+     for representing ontologies. Many parsers exist for obo, but since we do not rely
+     on it extensively and want to minimize dependencies, we provide a few functions
+     here for parsing the standard obo format.
+
+     Args:
+         local_obo_path (str): path to a local obo file.
+
+     Returns:
+         term_dict (dict): dictionary where keys are ids and values are lists of tuples
+             containing (attribute, value) pairs
+     """
+     # create a dict where keys are term IDs and values are lists of tuples
+     term_dict = dict()  # type: dict[str, Any]
+     term_is_next = False
+     active_term = None
+
+     with open(local_obo_path) as file:
+         for line in file:
+             line_strip = line.rstrip()
+
+             # reset the active term using the break between term definitions
+             if line_strip == "":
+                 active_term = None
+
+             line_as_tuple = _format_entry_tuple(line_strip)
+
+             # catch new term definitions
+             if term_is_next:
+                 if line_as_tuple is None or line_as_tuple[0] != "id":
+                     raise ValueError(
+                         f'{line_strip} was expected to be an "id" but it was not recognized as one'
+                     )
+
+                 active_term = line_as_tuple[1]
+                 term_dict[active_term] = list()
+                 term_is_next = False
+                 continue
+
+             if line_strip == "[Term]":
+                 term_is_next = True
+                 continue
+             else:
+                 term_is_next = False
+
+             if active_term is not None:
+                 term_dict[active_term].append(line_as_tuple)
+
+     return term_dict
+
+
+ def format_obo_dict_as_df(obo_term_dict: dict) -> pd.DataFrame:
+     """
+     Format an OBO Dict as a DataFrame
+
+     Reorganize a dictionary of tuples into a DataFrame.
+
+     Args:
+         obo_term_dict (dict): dictionary where keys are ids and values are lists of tuples
+             containing (attribute, value) pairs
+
+     Returns:
+         obo_df (pd.DataFrame): a pd.DataFrame with one row per identifier and one column per unique attribute
+     """
+     # find attributes which can occur multiple times. These will be represented as lists within the
+     # pandas DataFrame. The remaining attributes will just be strings.
+     dups = [_find_obo_attrib_dups(obo_term_dict[k]) for k in obo_term_dict.keys()]
+     degenerate_attribs = set(chain(*dups))
+
+     # reorganize each term as a dict to set up creation of the pd.DataFrame
+     term_dicts = list()
+     for k, v in obo_term_dict.items():
+         term_dict = _reformat_obo_entry_as_dict(v, degenerate_attribs)
+         term_dict["id"] = k
+         term_dicts.append(term_dict)
+
+     obo_df = pd.DataFrame(term_dicts).set_index("id")
+
+     return obo_df
+
+
+ def _reformat_obo_entry_as_dict(one_term, degenerate_attribs) -> dict:
+     term_dict = dict()
+     for attrib in degenerate_attribs:
+         term_dict[attrib] = list()
+
+     for attrib, value in one_term:
+         if attrib in degenerate_attribs:
+             term_dict[attrib].append(value)
+         else:
+             term_dict[attrib] = value
+
+     return term_dict
+
+
+ def create_parent_child_graph(go_parents_df: pd.DataFrame) -> ig.Graph:
+     """
+     Create Parent:Child Graph
+
+     Format the simple GO CC ontology as a Directed Acyclic Graph (DAG).
+
+     Args:
+         go_parents_df (pd.DataFrame): a table with:
+         - parent_id: GO ID of parent (from an is-a entry)
+         - parent_name: common name of parent (from an is-a entry)
+         - child_id: GO ID from the index
+
+     Returns:
+         parent_child_graph (ig.Graph): a DAG formed from parent-child relationships.
+
+     """
+     valid_go_ids = {
+         *go_parents_df["parent_id"].tolist(),
+         *go_parents_df["child_id"].tolist(),
+     }
+     valid_go_ids_df = pd.DataFrame(valid_go_ids)
+     valid_go_ids_df.columns = ["go_id"]  # type: ignore
+
+     # format the edgelist as an igraph network
+     parent_child_graph = ig.Graph.DictList(
+         vertices=valid_go_ids_df.to_dict("records"),
+         edges=go_parents_df[["child_id", "parent_id"]].to_dict("records"),
+         directed=True,
+         vertex_name_attr="go_id",
+         edge_foreign_keys=("child_id", "parent_id"),
+     )
+
+     # is it a fully connected DAG as expected?
+     assert parent_child_graph.is_dag()
+     assert parent_child_graph.is_connected("weak")
+
+     return parent_child_graph
+
+
+ def create_go_ancestors_df(parent_child_graph: ig.Graph) -> pd.DataFrame:
+     """
+     Create GO Ancestors DataFrame
+
+     Args:
+         parent_child_graph (ig.Graph): a DAG formed from parent-child relationships.
+
+     Returns:
+         go_ancestors_df (pd.DataFrame): a table with:
+         - go_id: GO ID of a CC GO term of interest
+         - ancestor_id: an ancestor (parent, parent of parent, ...)'s GO CC ID
+     """
+     # find the ancestors of each vertex
+     ancestor_dict = [
+         {
+             "go_id": v["go_id"],
+             "ancestor_id": parent_child_graph.vs(
+                 parent_child_graph.subcomponent(v, mode=ig.OUT)
+             ).get_attribute_values("go_id"),
+         }
+         for v in parent_child_graph.vs
+     ]
+
+     go_ancestors_df = pd.DataFrame(ancestor_dict).explode("ancestor_id")
+     # drop self edges
+     go_ancestors_df = go_ancestors_df[
+         go_ancestors_df["go_id"] != go_ancestors_df["ancestor_id"]
+     ]
+
+     return go_ancestors_df
+
+
+ def _download_go_basic_obo(local_obo_path: str = OBO_GO_BASIC_LOCAL_TMP) -> None:
+     """Download an OBO file containing GO categories and their relations (but not the genes in each category)."""
+
+     utils.download_wget(OBO_GO_BASIC_URL, local_obo_path)
+
+     if not os.path.isfile(local_obo_path):
+         raise FileNotFoundError(
+             f"{local_obo_path} was not found after trying to download from {OBO_GO_BASIC_URL}"
+         )
+
+
+ def _isa_str_list_to_dict_list(isa_list: list) -> list[dict[str, Any]]:
+     """Split parent-child relationships from individual strings into dictionaries where the parent id and name are separated."""
+
+     split_vals = [tuple(val.split(" ! ")) for val in isa_list]
+
+     isa_dict_list = list()
+     for split_val in split_vals:
+         assert len(split_val) == 2
+
+         isa_dict_list.append({"parent_id": split_val[0], "parent_name": split_val[1]})
+
+     return isa_dict_list
+
+
+ def _format_entry_tuple(line_str: str) -> tuple | None:
+     """Split a colon-separated line into an (attribute, value) tuple, or return None."""
+
+     entry = line_str.split(": ", maxsplit=1)
+     if len(entry) == 2:
+         return tuple(entry)
+     return None
+
+
+ def _find_obo_attrib_dups(one_term) -> list:
+     """Identify attributes which are present multiple times within a term."""
+
+     attrib_count = collections.Counter([v[0] for v in one_term])
+     duplicated_attributes = [item for item, count in attrib_count.items() if count > 1]
+
+     return duplicated_attributes
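
Taken together, these functions form a small pipeline from a raw go-basic.obo file to a GO CC ancestor table. Below is a hypothetical end-to-end sketch (not part of the package), assuming network access and a writable OBO_GO_BASIC_LOCAL_TMP path.

from napistu.ingestion import obo
from napistu.ingestion.constants import OBO_GO_BASIC_LOCAL_TMP

obo._download_go_basic_obo()  # fetch go-basic.obo to the local tmp path
go_terms = obo.format_obo_dict_as_df(obo.read_obo_as_dict(OBO_GO_BASIC_LOCAL_TMP))

parents = obo.create_go_parents_df(go_terms)  # child-parent "is-a" edges (GO CC)
dag = obo.create_parent_child_graph(parents)  # directed igraph DAG
ancestors = obo.create_go_ancestors_df(dag)   # go_id mapped to every ancestor_id
print(ancestors.head())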